├── .gitignore ├── COPYRIGHT ├── README.md ├── docs ├── Generic.SIMD.Library.WPMVP2014.pdf ├── Makefile ├── apiguide │ ├── Makefile │ ├── README.md │ ├── apidata.js │ ├── apiguide.html │ └── vendor │ │ ├── filter.js │ │ ├── jquery-1.10.1.min.js │ │ ├── jquery-ui.css │ │ └── jquery-ui.js ├── developer_guide.md ├── doxygen_main.txt ├── faq.md ├── getting_started.md ├── history.md ├── img │ ├── intel2power_apps.jpg │ ├── intel_apps.jpg │ ├── power2intel_apps.jpg │ └── power_apps.jpg ├── intrinsics.doxyfile ├── performance.md └── programming_guide.md ├── examples ├── HelloSIMD │ ├── HelloSIMD.cpp │ └── Makefile ├── RGB2Gray │ ├── Makefile │ ├── RGB2Gray.cpp │ └── RGB2Gray_tune.cpp ├── RGB2YUV │ ├── Makefile │ └── RGB2YUV.cpp ├── common.mk └── mandelbrot │ ├── Makefile │ └── mandelbrot.cpp ├── include ├── README.md ├── generic.h ├── generic4.h ├── generic8.h ├── gsimd.h ├── gsimd_utility.h ├── perfmeasure.h ├── platform_intrinsics.h ├── power7_intrinsics.h ├── power8_intrinsics.h ├── power_vsx4.h ├── sse4.h ├── svec-vsx.h └── timing.h ├── tests ├── Makefile ├── README ├── codegen.cpp ├── test_lanes4.cpp ├── test_lanes8.cpp ├── test_svec.cpp └── test_utility.h └── tools ├── allgroupspower7.sh ├── groupnamepower7.sh ├── grouppower7.sh └── p7.sh /.gitignore: -------------------------------------------------------------------------------- 1 | .cproject 2 | .project 3 | docs/html/* 4 | docs/gh-pages.github/* 5 | tests/gtest-1.6.0/* 6 | .settings/* 7 | -------------------------------------------------------------------------------- /COPYRIGHT: -------------------------------------------------------------------------------- 1 | Copyright 2012 the Generic SIMD Intrinsic Library project authors. All rights reserved. 2 | 3 | Copyright IBM Corp. 2013, 2013. All rights reserved. 
4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are 7 | met: 8 | 9 | * Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | * Neither the name of IBM Corp. nor the names of its contributors may be 16 | used to endorse or promote products derived from this software 17 | without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | The original source code covered by the above license above has been 32 | modified significantly by IBM Corp. 33 | Copyright 2013 the Generic SIMD Intrinsic Library project authors. All rights reserved. 34 | 35 | Copyright (c) 2010-2012, Intel Corporation 36 | All rights reserved. 
37 | 38 | Redistribution and use in source and binary forms, with or without 39 | modification, are permitted provided that the following conditions are 40 | met: 41 | 42 | * Redistributions of source code must retain the above copyright 43 | notice, this list of conditions and the following disclaimer. 44 | 45 | * Redistributions in binary form must reproduce the above copyright 46 | notice, this list of conditions and the following disclaimer in the 47 | documentation and/or other materials provided with the distribution. 48 | 49 | * Neither the name of Intel Corporation nor the names of its 50 | contributors may be used to endorse or promote products derived from 51 | this software without specific prior written permission. 52 | 53 | 54 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 55 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 56 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 57 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 58 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 59 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 60 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 61 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 62 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 63 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 64 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #Generic SIMD Library 2 | 3 | The Generic SIMD Library allows users to write C++ SIMD codes that are portable across different SIMD ISAs. 
4 | 5 | ##Running examples 6 | ```c++ 7 | //HelloSIMD.cpp 8 | #include 9 | #include 10 | 11 | int main (int argc, char* argv[]) 12 | { 13 | svec<4,float> v1(1.1, 2.2, 3.3, 4.4); 14 | svec<4,float> v2 = v1 * 2; 15 | std::cout << "Hello World: " << v2 << std::endl; 16 | return 0; 17 | } 18 | ``` 19 | 20 | Let's use the example above to illustrate some of the basic features of the library: 21 | - The entire generic SIMD library is included from the header file . 22 | - Using proper platform-specific compiler flags, the code can be compiled by standard G++ into binaries for different target SIMD architectures. 23 | - In this example, svec<4,float> is the SIMD vector abstraction provided by the library. It represents a vector of 4 floating-point values. 24 | - Most operations on SIMD vectors use standard C++ operators such as "*" and "<<". 25 | 26 | ##Key features 27 | 28 | The library provides: 29 | - Fixed-lane SIMD vectors. Our SIMD vectors are defined based on the number of elements per vector (fixed-lane) instead of the byte-length of a vector (fixed-width). This is the key difference between our vector types and the ones defined in platform-specific intrinsics. 30 | 31 | We choose fixed-lane vector because it is more natural to SIMDize parallel loops that involve data of different length such as int and double. 32 | 33 | We intend to support vectors with arbitrary power-of-two lanes, but currently only 4-element vectors are supported. Vectors of 2 and 8 elements are under development. 34 | 35 | - Portable SIMD programming. The programming interface of the library is completely platform neutral. The library provides mapping from the interface to target SIMD platforms. The current release supports the following target platforms: 36 | + SSE4.2 37 | + VSX for P7 38 | + Scalar emulation for non-SIMD platforms 39 | 40 | - Overloaded C++ semantics on SIMD vectors. 
We define SIMD vector operations based on semantics of C++ operators instead of platform-specific ISA semantics. This is because the semantics of C++ operators are platform independent. Secondly, C++ operators provide a slightly higher semantics than platform-specific intrinsics and are more natural to program since most users understand C++ operators well. 41 | 42 | ##More Information 43 | - [Generic SIMD Intrinsics Library API](http://genericsimd.github.io/generic_simd/index.html) 44 | - [Generic SIMD API Guide](http://genericsimd.github.io/generic_simd/apiguide/apiguide.html) 45 | - [Getting Started](docs/getting_started.md) 46 | - [Programming Guide](docs/programming_guide.md) 47 | - [Developer Guide](docs/developer_guide.md) 48 | - [WPMVP2014 Paper] (https://github.com/genericsimd/generic_simd/raw/master/docs/Generic.SIMD.Library.WPMVP2014.pdf) 49 | - [FAQ & Trouble Shooting](docs/faq.md) 50 | - [Performance Data](docs/performance.md) 51 | - [History](docs/history.md) 52 | 53 | -------------------------------------------------------------------------------- /docs/Generic.SIMD.Library.WPMVP2014.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genericsimd/generic_simd/3a92a1983b195a790742b3dce93a8bc3d0679dff/docs/Generic.SIMD.Library.WPMVP2014.pdf -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Make file to build the files 2 | 3 | PUBLISH_ROOT=gsimd_html 4 | GH_PAGES_ROOT=gh-pages.github 5 | default: html 6 | 7 | .PHONY: html 8 | 9 | html: intrinsics.doxyfile ../include/power_vsx4.h 10 | make clean 11 | doxygen $< 12 | 13 | gitpub: html 14 | cp -fR html/* ${GH_PAGES_ROOT} 15 | 16 | copy: html 17 | cp -R html/* ${PUBLISH_ROOT} 18 | chmod -R g+wxr ${PUBLISH_ROOT}/* 19 | chmod -R a+xr ${PUBLISH_ROOT}/* 20 | 21 | clean: 22 | @rm -fR html 23 | 
-------------------------------------------------------------------------------- /docs/apiguide/Makefile: -------------------------------------------------------------------------------- 1 | # Make file to build the files 2 | 3 | GH_PAGES_ROOT=../gh-pages.github 4 | 5 | gitpub: 6 | mkdir -p ${GH_PAGES_ROOT}/apiguide/ 7 | cp -rf * ${GH_PAGES_ROOT}/apiguide/ 8 | -------------------------------------------------------------------------------- /docs/apiguide/README.md: -------------------------------------------------------------------------------- 1 | # Generic SIMD API Guide 2 | 3 | The Generic SIMD API Guide is a simple tool to search the data types and APIs of the Generic SIMD library. 4 | 5 | The tool is a pure static html tool based on [filter.js framework](https://github.com/jiren/filter.js). 6 | 7 | The tool uses one search box and three checkbox filters 8 | - Lane: filter API json object's Lane attribute 9 | - Type: filter API json object's Type attribute 10 | - Category: filter API json object's Category attribute 11 | 12 | The search box is a full text search of all the json object's text. So if you search "add", you should get the result containing "address". 13 | 14 | ## API data 15 | The api data is defined in apidata.js as a json object. 16 | 17 | Each API json object has five attributes 18 | - *name*: String, the API's name. 19 | - *Lane*: Integer, could be only 4 or 8 right now. 
20 | - *Type*: String, the API's base(scalar) type, could be one of the following types 21 | + bool 22 | + int8_t 23 | + uint8_t 24 | + int16_t 25 | + uint16_t 26 | + int32_t 27 | + uint32_t 28 | + int64_t 29 | + uint64_t 30 | + float 31 | + double 32 | - *Category*: String, could be 33 | + datatype: data type or constructor 34 | + math: arithmetic operations 35 | + bitop: bit operations 36 | + cmp: compare 37 | + load: load operation 38 | + store: store operation 39 | + cast: cast operation 40 | + other: other operations 41 | - *Description*: String, detail description 42 | - *Example*: String. Optional. Example code. 43 | 44 | Example 45 | ```json 46 | { name: "svec< 4, bool >", 47 | Lane:4, 48 | Type: "bool", 49 | Category: "datatype", 50 | Description: "Data representation and operations on a vector of 4 boolean values. This is used in predicated vector operations. Specifically the ith value of svec<4,bool> indicates whether the ith lane of a predicated vector operation is enabled or not", 51 | Example: "Sample code. Use
for line break" 52 | } 53 | ``` 54 | -------------------------------------------------------------------------------- /docs/apiguide/apidata.js: -------------------------------------------------------------------------------- 1 | apidata = [ 2 | { name: "svec< 4, bool >", 3 | Lane:4, 4 | Type: "bool", 5 | Category: "datatype", 6 | Description: "Data representation and operations on a vector of 4 boolean values. This is used in predicated vector operations. Specifically the ith value of svec<4,bool> indicates whether the ith lane of a predicated vector operation is enabled or not." 7 | }, 8 | { name: "svec< 4, bool >::svec()", 9 | Lane:4, 10 | Type: "bool", 11 | Category: "datatype", 12 | Description: "Default constructor.
Return a vector of 4 undefined values" 13 | }, 14 | { name: "svec< 4, bool >::svec(uint32_t a)", 15 | Lane:4, 16 | Type: "bool", 17 | Category: "datatype", 18 | Description: "Constructor.
Return a vector of 4 mask/booleans: {a,a,a,a}.
Note:a must be either 0 or -1." 19 | }, 20 | { name: "svec< 4, bool >::svec(uint32_t a, uint32_t b, uint32_t c, uint32_t d)", 21 | Lane:4, 22 | Type: "bool", 23 | Category: "datatype", 24 | Description: "Constructor.
Return a vector of 4 mask/booleans: {a,b,c,d}.
Note:a,b,c,d must be either 0 or -1." 25 | }, 26 | { name: "svec< 4, bool >::operator[](int index)", 27 | Lane:4, 28 | Type: "bool", 29 | Category: "other", 30 | Description: "Set or get the vector element specified by index.", 31 | Example: "svec<4,bool> mask(0,-1,-1,0);
bool a = mask[0];//a is false
mask[2] = 0; //mask is now{0,-1,0,0}" 32 | }, 33 | { name: "svec< 4, bool >::operator==(svec<4,bool> a)", 34 | Lane:4, 35 | Type: "bool", 36 | Category: "cmp", 37 | Description: "Element-wise compare equal. Return a bool vector.", 38 | Example: "a == b" 39 | }, 40 | { name: "svec< 4, bool >::operator!=(svec<4,bool> a)", 41 | Lane:4, 42 | Type: "bool", 43 | Category: "cmp", 44 | Description: "Element-wise compare not equal. Return a bool vector", 45 | Example: "a != b" 46 | }, 47 | { name: "svec< 4, bool >::store (svec< 4, bool > *p)", 48 | Lane:4, 49 | Type: "bool", 50 | Category: "store", 51 | Description: "Store the vector to address p. p does not have to be aligned. Each svec< 4, bool > requires 16 bytes", 52 | Example: "svec< 4, bool > mask(0,-1,-1,0);
void* dst=...;
mask.store((svec< 4, bool > *)dst);" 53 | }, 54 | { name: "static svec< 4, bool >::load (svec< 4, bool > *p)", 55 | Lane:4, 56 | Type: "bool", 57 | Category: "load", 58 | Description: "Class method, load the vector from the pointer p, and return a new svec< 4, bool > vector. p does not have to be aligned. Each svec< 4, bool > requires 16 bytes", 59 | Example: "void* src=...;
svec< 4, bool > mask = svec< 4, bool >::load((svec< 4, bool >*)src);" 60 | }, 61 | { name: "svec< 4, bool >::any_true()", 62 | Lane:4, 63 | Type: "bool", 64 | Category: "other", 65 | Description: "Check if any element in the mask vector is true. Return true if at least one element in the mask vector is true, otherwise false. This is a reduction operation that returns a scalar value.", 66 | Example: "" 67 | }, 68 | { name: "svec< 4, bool >::all_true()", 69 | Lane:4, 70 | Type: "bool", 71 | Category: "other", 72 | Description: "Check if all the elements in the mask vector is true. Return true if all the elements in the mask vector are true, otherwise false. This is a reduction operation that returns a scalar value.", 73 | Example: "" 74 | }, 75 | { name: "svec< 4, bool >::none_true()", 76 | Lane:4, 77 | Type: "bool", 78 | Category: "other", 79 | Description: "Check all the elements in the mask vector is false. Return true if all the elements in the mask vector are false, otherwise false. This is a reduction operation that returns a scalar value.", 80 | Example: "" 81 | }, 82 | ]; -------------------------------------------------------------------------------- /docs/apiguide/apiguide.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | Generic SIMD API Guide 14 | 66 | 67 | 68 | 71 | 72 |
73 |
74 | Search API: 75 |
76 | 77 |
78 |

Lanes

79 |
    80 |
  • All
  • 81 |
  • LANE4
  • 82 |
  • LANE8
  • 83 |
84 |
85 |
86 |

Types

87 |
    88 |
  • All
  • 89 |
  • bool
  • 90 |
  • int8_t
  • 91 |
  • uint8_t
  • 92 |
  • int16_t
  • 93 |
  • uint16_t
  • 94 |
  • int32_t
  • 95 |
  • uint32_t
  • 96 |
  • int64_t
  • 97 |
  • uint64_t
  • 98 |
  • float
  • 99 |
  • double
  • 100 |
101 |
102 |
103 |

Categories

104 |
    105 |
  • All
  • 106 |
  • Datatype/Constructor
  • 107 |
  • Arithmetic
  • 108 |
  • Bit Manipulation
  • 109 |
  • Compare
  • 110 |
  • Load
  • 111 |
  • Store
  • 112 |
  • Cast
  • 113 |
  • Other
  • 114 |
115 |
116 | 117 |
118 |
119 |

API List

120 |
121 | 122 |
123 | 124 | 179 | 180 | 181 | -------------------------------------------------------------------------------- /docs/apiguide/vendor/filter.js: -------------------------------------------------------------------------------- 1 | /* 2 | * Filter.js 3 | * version: 1.5.1 (22/4/2013) 4 | * 5 | * Licensed under the MIT: 6 | * http://www.opensource.org/licenses/mit-license.php 7 | * 8 | * Copyright 2013 Jiren Patel[ joshsoftware.com ] 9 | * 10 | * Dependency: 11 | * jQuery(v1.8 >=) 12 | */ 13 | 14 | (function(window) { 15 | 16 | 'use strict'; 17 | 18 | var FilterJS = function(data, container, view, options) { 19 | return new _FilterJS(data, container, view, options); 20 | }; 21 | 22 | FilterJS.VERSION = '1.5.1'; 23 | 24 | $.fn.filterjs = function(data, view, options) { 25 | var $this = $(this); 26 | if ($this.data('fjs')) return; 27 | $this.data('fjs', new _FilterJS(data, $this, view, options)); 28 | }; 29 | 30 | window.FilterJS = FilterJS; 31 | 32 | var _FilterJS = function(data, container, view, options) { 33 | var property_count = 0; 34 | 35 | this.data = data; 36 | this.view = view; 37 | this.container = container; 38 | this.options = options || {}; 39 | this.categories_map = {} 40 | this.record_ids = []; 41 | 42 | if (this.data.constructor != Array) this.data = [this.data]; 43 | 44 | for (name in this.data[0]){ 45 | this.root = name; 46 | property_count += 1; 47 | } 48 | 49 | if (property_count == 1){ 50 | this.getRecord = function(i, d){ return d[i][this.root]; } 51 | }else{ 52 | this.getRecord = function(i, d){ return d[i]; } 53 | this.root = 'fjs'; 54 | } 55 | 56 | this.id_field = this.options.id_field || 'id'; 57 | this.render(this.data); 58 | this.parseOptions(); 59 | this.buildCategoryMap(this.data); 60 | this.bindEvents(); 61 | 62 | this.options.callbacks = this.options.callbacks || {}; 63 | this.execCallBack('after_init', this.record_ids); 64 | this.execCallBack('after_add', this.data); 65 | this.options.filter_types = this.options.filter_types || {}; 66 | 
67 | if (!this.options.filter_types['range']) 68 | this.options.filter_types['range'] = this.rangeFilter; 69 | 70 | this.options.streaming = this.options.streaming || {}; 71 | if (this.options.streaming.data_url){ 72 | this.options.streaming.stream_after = (this.options.streaming.stream_after || 2)*1000; 73 | this.options.streaming.batch_size = this.options.streaming.batch_size || false; 74 | this.streamData(this.options.streaming.stream_after); 75 | } 76 | 77 | return this; 78 | }; 79 | 80 | _FilterJS.prototype = { 81 | 82 | //Render Html using JSON data 83 | render: function(data, offset) { 84 | var $container = $(this.container), record, el; 85 | 86 | if (!data) return; 87 | 88 | for (var i = 0, l = data.length; i < l; i++){ 89 | record = this.getRecord(i, data); 90 | el = $(this.view(record)); 91 | el.attr({id: this.root + '_' + record[this.id_field], 'data-fjs': true}); 92 | el = $container.append(el); 93 | } 94 | }, 95 | 96 | //Bind Events to filter html elements 97 | bindEvents: function() { 98 | var self = this, s = this.options.selectors, i = 0, l = s.length; 99 | 100 | for (i; i < l; i++){ 101 | this.bindSelectorEvent(s[i], self); 102 | } 103 | 104 | if (this.options.search){ 105 | $(this.options.search.input).on('keyup', function(e){ 106 | self.filter(); 107 | }); 108 | } 109 | }, 110 | 111 | bindSelectorEvent: function(selector, context) { 112 | $(selector.element).on(selector.events, function(e) { 113 | context.filter(); 114 | }); 115 | }, 116 | 117 | //Unbind fileter events 118 | clear: function() { 119 | var s = this.options.selectors, i = 0, l = s.length; 120 | 121 | for (i; i < l; i++) 122 | $(s[i].element).off(s[i].events); 123 | 124 | if (this.options.search) $(this.options.search.input).off('keyup'); 125 | 126 | this.category_map = null; 127 | this.record_ids = null; 128 | }, 129 | 130 | //Find elements accroding to selection criteria. 
131 | filter: function(){ 132 | var result, s, selected_vals, records, selected_none = false, i = 0, l = this.options.selectors.length; 133 | 134 | for (i; i < l; i++){ 135 | s = this.options.selectors[i]; 136 | selected_vals = $(s.element).filter(s.select).map(function() { 137 | return $(this).val(); 138 | }); 139 | 140 | if (selected_vals.length) { 141 | records = this.findObjects(selected_vals, this.categories_map[s.name], this.options.filter_types[s.type]); 142 | 143 | result = $.grep((result || this.record_ids), function(v) { 144 | return (records.indexOf(v) != -1); 145 | }); 146 | }else{ 147 | selected_none = true; 148 | } 149 | } 150 | 151 | if (selected_none && this.options.and_filter_on) result = []; 152 | 153 | if (this.options.search) result = this.search(this.options.search, result); 154 | 155 | this.hideShow(result); 156 | 157 | this.execCallBack('after_filter', result); 158 | }, 159 | 160 | //Compare and collect objects 161 | findObjects: function(category_vals, category_map, filter_type_func) { 162 | var r = [], ids, category_val, i = 0, l = category_vals.length; 163 | 164 | for (i; i < l; i++){ 165 | category_val = category_vals[i]; 166 | 167 | if (filter_type_func){ 168 | ids = $.map(category_map, function(n,v){ 169 | if (filter_type_func(category_val, v)) return n; 170 | }); 171 | } else { 172 | ids = category_map.constructor == Array ? category_map : category_map[category_val]; 173 | } 174 | 175 | if (ids) r = r.concat(ids); 176 | } 177 | 178 | return r; 179 | }, 180 | 181 | //Make eval expresssion to collect object from the json data. 
182 | buildEvalString: function(field_map) { 183 | var fields = field_map.split('.ARRAY.'), eval_str, i = 1, l = fields.length; 184 | 185 | eval_str = fields[0]; 186 | 187 | for (i; i < l; i++) { 188 | eval_str += ".filter_collect('" + fields[i] + "')"; 189 | } 190 | 191 | return eval_str; 192 | }, 193 | 194 | addFilterCriteria: function(name, criteria, ids_or_mapping) { 195 | this.categories_map[name] = {}; 196 | 197 | var selector = this.parseSelectorOptions({name: name}, [criteria]); 198 | ids_or_mapping = ids_or_mapping || $(selector.element).data('ids') || []; 199 | 200 | this.options.selectors.push(selector); 201 | this.categories_map[name] = ids_or_mapping; 202 | 203 | this.bindSelectorEvent(selector, this); 204 | }, 205 | 206 | //Create map accroding to selection criteria. 207 | parseOptions: function() { 208 | var filter_criteria = this.options.filter_criteria, selector, criteria, ele, ele_type; 209 | this.options.selectors = []; 210 | 211 | for (name in filter_criteria) { 212 | 213 | criteria = filter_criteria[name]; 214 | selector = this.parseSelectorOptions({name: name}, criteria); 215 | 216 | this.options.selectors.push(selector); 217 | 218 | criteria.push(this.buildEvalString(criteria[1])); 219 | this.categories_map[name] = {}; 220 | } 221 | }, 222 | 223 | parseSelectorOptions: function(selector, criteria) { 224 | selector.element = criteria[0].split(/.EVENT.|.SELECT.|.TYPE./)[0]; 225 | selector.events = (criteria[0].match(/.EVENT.(\S*)/) || [])[1]; 226 | selector.select = (criteria[0].match(/.SELECT.(\S*)/) || [])[1]; 227 | selector.type = (criteria[0].match(/.TYPE.(\S*)/) || [])[1]; 228 | 229 | var ele = $(selector.element), 230 | ele_type = ele.attr('type'); 231 | 232 | if (!selector.select){ 233 | if (ele.get(0).tagName == 'INPUT'){ 234 | if (ele_type == 'checkbox' || ele_type == 'radio'){ 235 | selector.select = ':checked'; 236 | }else if (ele_type == 'hidden'){ 237 | selector.select = ':input'; 238 | } 239 | }else if (ele.get(0).tagName == 
'SELECT'){ 240 | selector.select = 'select'; 241 | } 242 | } 243 | 244 | if (!selector.events){ 245 | if (ele_type == 'checkbox' ||ele_type == 'radio'){ 246 | selector.events = 'click'; 247 | }else if (ele_type == 'hidden' || ele.get(0).tagName == 'SELECT'){ 248 | selector.events = 'change'; 249 | } 250 | } 251 | 252 | return selector; 253 | }, 254 | 255 | buildCategoryMap: function(data) { 256 | var filter_criteria = this.options.filter_criteria, record, categories, obj, x; 257 | 258 | for (var i = 0, l = data.length; i < l; i++){ 259 | record = this.getRecord(i, data); 260 | this.record_ids.push(record[this.id_field]); 261 | 262 | for (name in filter_criteria) { 263 | categories = eval('record.' + filter_criteria[name][2]); 264 | obj = this.categories_map[name]; 265 | 266 | if (categories && categories.constructor == Array) { 267 | for (var j = 0, lj = categories.length; j < lj; j++){ 268 | x = categories[j]; 269 | obj[x] ? obj[x].push(record[this.id_field]) : obj[x] = [record[this.id_field]]; 270 | } 271 | } else { 272 | obj[categories] ? obj[categories].push(record[this.id_field]) : obj[categories] = [record[this.id_field]]; 273 | } 274 | } 275 | } 276 | }, 277 | 278 | hideShow: function(ids) { 279 | var e_id = '#' + this.root + '_', i = 0, l = ids.length; 280 | 281 | $(this.container + ' > *[data-fjs]').hide(); 282 | 283 | for (i; i < l; i++) 284 | $(e_id + ids[i]).show(); 285 | }, 286 | 287 | search: function (search_config, filter_result) { 288 | var val = $.trim($(search_config.input).val()); 289 | var search_in = search_config.search_in; 290 | var min_length = $.isNumeric(search_config.min_length) ? 
search_config.min_length : 1; 291 | 292 | if (val.length < min_length) return filter_result; 293 | 294 | var id_prefix = '#' + this.root + '_'; 295 | val = val.toUpperCase(); 296 | 297 | return $.map(filter_result, function (id) { 298 | var $ele = $(id_prefix + id); 299 | 300 | if (search_in) $ele = $ele.find(search_in); 301 | 302 | if ($ele.text().toUpperCase().indexOf(val) >= 0) return id; 303 | }); 304 | }, 305 | 306 | execCallBack: function(type, result){ 307 | if(this.options.callbacks[type]) 308 | this.options.callbacks[type].call(this, result) 309 | }, 310 | 311 | rangeFilter: function(category_value, v){ 312 | var range = category_value.split('-'); 313 | 314 | if (range.length == 2){ 315 | if (range[0] == 'below') range[0] = -Infinity; 316 | if (range[1] == 'above') range[1] = Infinity; 317 | if (Number(v) >= range[0] && Number(v) <= range[1]){ 318 | return true; 319 | } 320 | } 321 | }, 322 | 323 | //Collect Records by id array 324 | getRecordsByIds: function(ids){ 325 | var records = [], r, i = 0, l = this.data.length; 326 | 327 | for (i; i < l; i++){ 328 | r = this.getRecord(i, this.data); 329 | if (ids.indexOf(r[this.id_field]) != -1) records.push(r) 330 | } 331 | 332 | return records; 333 | }, 334 | 335 | addData: function(data){ 336 | if (data == undefined || data.length == 0 ) return; 337 | 338 | var i = 0, l = data.length, r, uniq_data = [], e_id = '#' + this.root + '_'; 339 | 340 | this.execCallBack('before_add', data) 341 | 342 | //for (i, l; i < l; i++){ 343 | // r = this.getRecord(i, data); 344 | // if ($(e_id + r.id).length == 0) uniq_data.push(data[i]); 345 | //} 346 | 347 | this.data = this.data.concat(data); 348 | this.render(data); 349 | this.buildCategoryMap(data); 350 | this.execCallBack('after_add', data) 351 | this.filter(); 352 | }, 353 | 354 | setStreamingTimer: function(){ 355 | var self = this, 356 | timer_func = this.options.streaming.batch_size ? 
setInterval : setTimeout; 357 | 358 | return timer_func(function(){ 359 | self.streamData(); 360 | }, this.options.streaming.stream_after); 361 | }, 362 | 363 | clearStreamingTimer: function(){ 364 | if (this.timer) clearTimeout(this.timer); 365 | }, 366 | 367 | fetchData: function(){ 368 | var self = this, 369 | params = this.options.params || {}, 370 | opts = this.options.streaming; 371 | 372 | params['offset'] = this.data.length; 373 | 374 | if (opts.batch_size) params['limit'] = opts.batch_size; 375 | if (this.options.search) params['q'] = $.trim($(this.options.search.input).val()); 376 | 377 | $.getJSON(opts.data_url, params).done(function(data){ 378 | 379 | if (params.limit != null && (!data || !data.length)){ 380 | self.stopStreaming(); 381 | }else{ 382 | self.setStreamInterval(); 383 | self.addData(data); 384 | } 385 | 386 | }).fail(function(e){ 387 | self.stopStreaming(); 388 | }); 389 | }, 390 | 391 | setStreamInterval: function(){ 392 | var self = this; 393 | if(self.options.streaming.stop_streaming == true) return; 394 | 395 | self.timer = setTimeout(function(){ 396 | self.fetchData(); 397 | }, self.options.streaming.stream_after); 398 | }, 399 | 400 | stopStreaming: function(){ 401 | this.options.streaming.stop_streaming = true; 402 | if (this.timer) clearTimeout(this.timer); 403 | }, 404 | 405 | resumeStreaming: function(){ 406 | this.options.streaming.stop_streaming = false; 407 | this.streamData(this.options.streaming.stream_after); 408 | }, 409 | 410 | streamData: function(time){ 411 | this.setStreamInterval(); 412 | if(!this.options.streaming.batch_size) this.stopStreaming(); 413 | } 414 | 415 | } 416 | 417 | 418 | })(this); 419 | 420 | /** 421 | * Recursive method to collect object from json object. 422 | * i.e. 
test = [ {"deal": {"id": 1 }}, {"deal": {"id": 2}}] 423 | * - to collect id from the json data 424 | * test.filter_collect('deal').filter_collect('id') 425 | * this will return [1,2] 426 | */ 427 | Array.prototype.filter_collect = function(field, arr) { 428 | var arr = arr || []; 429 | for (var i = 0, l = this.length; i < l; i++){ 430 | var obj = this[i]; 431 | if (obj.constructor == Array){ 432 | obj.filter_collect(field, arr); 433 | } 434 | else { 435 | arr.push(obj[field]); 436 | } 437 | } 438 | 439 | return arr; 440 | }; 441 | 442 | //In IE indexOf method not define. 443 | if (!Array.prototype.indexOf) { 444 | Array.prototype.indexOf = function(obj, start) { 445 | for (var i = (start || 0), j = this.length; i < j; i++) { 446 | if (this[i] === obj) { return i; } 447 | } 448 | return -1; 449 | } 450 | } 451 | -------------------------------------------------------------------------------- /docs/developer_guide.md: -------------------------------------------------------------------------------- 1 | #Developer Guide 2 | 3 | 4 | ##Source code structure 5 | 6 | The package contains the following structures: 7 | - docs Generate html docs through doxygen 8 | - examples Examples using the library 9 | - include The library source code 10 | - tests Unit tests, test library implementation 11 | 12 | When adding a new target platform, you need to add .h to include/ and may need to modify makefiles so that the new platform can be tested and will run with existing examples. 13 | 14 | ##Run unit tests 15 | 16 | Unit tests are in tests directory. The current unit test covers power_vsx4.h, generic4.h, generic8.h and sse4.h 17 | 18 | Please download googletest framework first from https://code.google.com/p/googletest/, and unzip it into "tests/gtest-1.6.0" directory. 19 | Or you can unzip it to where you want, and modify the "GTEST_DIR" value in tests/Makefile. 
20 | 21 | Then you can run the test 22 | ```bash 23 | $ cd tests 24 | $ make clean 25 | $ make {vsx4|sse4|generic4|generic8} # build/run unit tests for target SIMD ISA 26 | ``` 27 | The test app will test vsx4, generic4 and sse4 interfaces, and generate the report. 28 | 29 | 30 | ##Generate the documentation 31 | 32 | We use doxygen to generate documentation. The input files for doxygen are under /docs/. To update the documentation, either modify the *.txt files or doxygen annotations in the library source code. 33 | 34 | To publish new documentation, you need to go through the following steps: 35 | 36 | 1. Make sure you have doxygen installed 37 | 38 | 2. Checkout the gh-pages branch of your project to docs/gh-pages.github 39 | ```bash 40 | $ add docs/gh-pages.github to .gitignore 41 | $ cd docs 42 | # create a branch gh-pages from the github web interface 43 | # clone the project repo to docs/gh-pages.github 44 | $ git clone -b gh-pages https://github.com/genericsimd/generic_simd.git gh-pages.github 45 | $ cd gh-pages.github 46 | # remove all trunk files from gh-pages branch 47 | ``` 48 | 49 | 3. Generate new doxygen pages and copy into gh-pages.github 50 | ```bash 51 | $ cd docs 52 | $ make # generate documentation into docs/html 53 | $ make gitpub # copy docs/html into docs/gh-pages.github 54 | $ cd gh-pages.github 55 | $ git add -A # add everything under the directory 56 | $ git commit -a # checkin new documentation to github 57 | $ git push # push to github 58 | ``` 59 | Note: it may take 10 minutes before the new pages appear on http://genericsimd.github.io/generic_simd 60 | -------------------------------------------------------------------------------- /docs/doxygen_main.txt: -------------------------------------------------------------------------------- 1 | /** 2 | \mainpage 3 | This documentation describes the API of Generic SIMD Intrinsic Library. 4 | 5 | For usage instructions, please see the document https://github.com/genericsimd/generic_simd. 
6 | 7 | */ -------------------------------------------------------------------------------- /docs/faq.md: -------------------------------------------------------------------------------- 1 | #Frequently Asked Questions 2 | 3 | ## Frequently asked questions 4 | 5 | 1. What target SIMD platforms does the library support? 6 | 7 | Currently we support two SIMD platforms, SSE4.2 and VSX. We also 8 | support a generic implementation of the library using scalar code. 9 | 10 | 2. Failed to build unit tests under tests/ 11 | 12 | ```bash 13 | -bash-4.1$ make 14 | g++ -Igtest-1.6.0/include -Igtest-1.6.0 -c gtest-1.6.0/src/gtest-all.cc 15 | g++: error: gtest-1.6.0/src/gtest-all.cc: No such file or directory 16 | g++: fatal error: no input files 17 | ``` 18 | 19 | Our unit test engine uses the Google Test framework. Due to open-source 20 | license issues, googletest is not included in our source tree. Please 21 | download googletest from [here](https://code.google.com/p/googletest/) 22 | and unzip it into "tests/gtest-1.6.0/". Or you can unzip it to 23 | where you want, and modify the "GTEST_DIR" value in tests/Makefile. 24 | 25 | 3. Could I get slightly different results using svec_madd and 26 | svec_msub on different platforms? 27 | 28 | The vsx::svec_madd() and vsx::svec_msub() functions are mapped into madd and 29 | msub intrinsics directly, while generic::svec_madd() and 30 | generic::svec_msub() are implemented by scalar code. On rare occasions, 31 | a fused operation by one hardware instruction provides higher precision 32 | in float operations. So it's possible that vsx and generic provide 33 | slightly different results. 
34 | 35 | ## Known Bugs 36 | 37 | -------------------------------------------------------------------------------- /docs/getting_started.md: -------------------------------------------------------------------------------- 1 | #Getting Started 2 | 3 | ##Getting the source 4 | 5 | Clone the library from github 6 | ```bash 7 | $ git clone https://github.com/genericsimd/generic_simd.git generic_simd 8 | ``` 9 | The package contains the following directories: 10 | 11 | - docs/ input to doxygen and makefile to generate documents 12 | - examples/ Examples using the library 13 | - include/ The library source code 14 | - tests/ Unit tests, test library implementation 15 | 16 | ##Using the library 17 | 18 | The library is implemented completely inside header files, all of which are under include/. To use the library, follow these steps: 19 | 20 | 1. Include the library header into your source code 21 | 2. Programming according to library API 22 | 3. Build the binary w/ standard g++ like this: 23 | ```bash 24 | g++ -I /include -m{vsx|sse4.2} -Wno-int-to-pointer-cast -flax-vector-conversions ... 
25 | ``` 26 | - -mvsx: standard g++ option to generate VSX instructions 27 | - -msse4.2: standard g++ option to generate SSE4.2 instructions 28 | - if no -mvsx or -msse4.2 is specified: generate scalar codes emulating generic SIMD intrinsics 29 | - -Wno-int-to-pointer-cast -flax-vector-conversions: ignore some warnings and enable vector casts 30 | 31 | Consider the hello-world example: 32 | ```cpp 33 | //HelloSIMD.cpp 34 | #include <gsimd.h> 35 | #include <iostream> 36 | 37 | int main (int argc, char* argv[]) 38 | { 39 | svec<4,float> v1(1.1, 2.2, 3.3, 4.4); 40 | svec<4,float> v2 = v1 * 2; 41 | std::cout << "Hello World: " << v2 << std::endl; 42 | return 0; 43 | } 44 | ``` 45 | 46 | Example#1: how to build for VSX 47 | ```bash 48 | $ g++ -I../../include HelloSIMD.cpp -mvsx -flax-vector-conversions -o HelloSIMD -Wno-int-to-pointer-cast 49 | $ ./HelloSIMD 50 | Hello World: svec4_f[2.2, 4.4, 6.6, 8.8] 51 | ``` 52 | 53 | Example#2: how to build for SSE4.2 54 | ``` 55 | $ g++ -I../../include HelloSIMD.cpp -msse4.2 -o HelloSIMD -Wno-int-to-pointer-cast 56 | $ ./HelloSIMD 57 | Hello World: svec4_f[2.2, 4.4, 6.6, 8.8] 58 | ``` 59 | 60 | ##Running examples 61 | 62 | We provide a few examples under examples/, including: 63 | 64 | - HelloSIMD hello-world example 65 | - mandelbrot mandelbrot algorithm 66 | - RGB2Gray RGB to gray conversion 67 | 68 | To try out these examples, simply 69 | ```bash 70 | $ cd examples/RGB2Gray 71 | $ make 72 | $ make run 73 | ``` 74 | -------------------------------------------------------------------------------- /docs/history.md: -------------------------------------------------------------------------------- 1 | #History 2 | 3 | ##Version 0.2 4 | 5 | - Add Intel SSE4.2 LANES=4 implementation. 6 | - Add gather_stride() and scatter_stride() interface. 7 | 8 | ##Version 0.1 9 | 10 | - Initial implementation for LANES=4. Including power vsx and generic support. 
11 | -------------------------------------------------------------------------------- /docs/img/intel2power_apps.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genericsimd/generic_simd/3a92a1983b195a790742b3dce93a8bc3d0679dff/docs/img/intel2power_apps.jpg -------------------------------------------------------------------------------- /docs/img/intel_apps.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genericsimd/generic_simd/3a92a1983b195a790742b3dce93a8bc3d0679dff/docs/img/intel_apps.jpg -------------------------------------------------------------------------------- /docs/img/power2intel_apps.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genericsimd/generic_simd/3a92a1983b195a790742b3dce93a8bc3d0679dff/docs/img/power2intel_apps.jpg -------------------------------------------------------------------------------- /docs/img/power_apps.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genericsimd/generic_simd/3a92a1983b195a790742b3dce93a8bc3d0679dff/docs/img/power_apps.jpg -------------------------------------------------------------------------------- /docs/performance.md: -------------------------------------------------------------------------------- 1 | #Performance Data 2 | 3 | 4 | ##Performance Evaluation 5 | 6 | The performance goal of the generic SIMD intrinsic library is to match up with the performance of the same code written with platform-specific intrinsics. We have ported some examples from an intrinsics implementation to a Generic SIMD implementation, and evaluated their performance. 
7 | 8 | Applications include 9 | 10 | - RGB2Gray, float 11 | - Mandelbrot, float 12 | - SPSS-sweep, double 13 | - Dgemm, double 14 | - CVA-Mean, float 15 | 16 | Some of them only have a Power Platform intrinsics implementation, and some of them only have an Intel intrinsics implementation. We compared the generic SIMD's performance with the intrinsics implementation by measuring their speedup over scalar code. 17 | 18 | We also evaluated the portability of our Generic SIMD intrinsics library by running the generic SIMD version of the application on the platform without an intrinsics implementation. 19 | 20 | Below are the results. 21 | 22 | The following figures show the performance speedup versus the scalar code on different platforms. 23 | 24 | Higher is better. 25 | 26 | __App on Power Platform__
![App on Power Platform](img/power_apps.jpg) 27 | 28 | __App on Intel Platform__
![App on Intel Platform](img/intel_apps.jpg) 29 | 30 | __App with Power Intrinsics only__
![App with Power Intrinsics only](img/power2intel_apps.jpg) 31 | 32 | __App with Intel Intrinsics only__
![App with Intel Intrinsics only](img/intel2power_apps.jpg) 33 | -------------------------------------------------------------------------------- /docs/programming_guide.md: -------------------------------------------------------------------------------- 1 | #Programming Guide 2 | 3 | For detailed interface specification, refer to [Generic SIMD intrinsics library API] (http://genericsimd.github.io/generic_simd/index.html) 4 | 5 | ##Data Types 6 | 7 | The library supports templaterized SIMD vector types, *svec*, 8 | where *N* specifies elements per vector and has to be power of two. 9 | *STYPE* specifies scalar type of vector element: *bool*, *char*, "unsigned 10 | char*, *short*, *unsigned short*, *int*, *unsigned int*, *long long*, 11 | *unsigned long long*, *float*, and *double*. 12 | 13 | Currently the library supports only N = 4 14 | 15 | - *svec<4,bool>*: vector of 4 boolean 16 | - *svec<4,int8_t>, svec<4,uint8_t>: vector of 4 signed/unsigned 8-bit int 17 | - *svec<4,int16_t>*, *svec<4,uint16_t>*: vector of 4 signed/unsigned 16-bit int 18 | - *svec<4,int32_t>*, *svec<4,int32_t>*: vector of 4 signed/unsigned 32-bit int 19 | - *svec<4,int64_t>*, *svec<4,uint64_t>*: vector 4 signed/unsigned 32-bit int 20 | - *svec<4,float>*: vector of 4 float 21 | - *svec<4,double>*: vector of 4 double 22 | - *svec<4,void*>*: vector of 4 pointers 23 | 24 | In the rest of the document we use VTYPE to indicate SIMD vector types. 25 | 26 | ##Operations 27 | 28 | ###Constructor 29 | 30 | - Default constructor returns a vector with undefined value. e.g. "svec<4,int32_t> v;" 31 | You can modify it's elements by "[]" operator. 32 | - Construct a SIMD vector with four scalar values. e.g. "svec<4,int32_t> v(1,2,3,4)" 33 | - Construct a SIMD vector with one scalar value. e.g. "svec<4,int32_t> v(100)". 34 | 35 | All the four values in the SIMD vector is 100. 36 | 37 | 38 | ###Extract/insert single vector element 39 | 40 | "[]" operator is used to get and set the elements. 
41 | ```c++ 42 | svec<4,int32_t> v(1,2,3,4); 43 | int a = v[2]; // extracts the 3rd element of the vector (i.e., element index starts from 0), a is 3 now 44 | v[3] = 10; // assigns 10 to the 3rd element of the vector, v is [1,2,3,10] now 45 | ``` 46 | 47 | Due to the current limitation, bool vector's setter must use "-1" as true in the right hand side. 48 | ```c++ 49 | svec<4,bool> m(0); // construct a vector of boolean with all elements initialized to false 50 | m[0] = -1; // after assignment, 1st element of m is true. 51 | ``` 52 | 53 | ###Load and Store 54 | 55 | Store a vector to location p through instance method store(VTYPE *). 56 | 57 | Load a vector from location p through class static method VTYPE::(VTYPE *). 58 | e.g. "svec<4,int32_t>::load(an_address)" will return a new svec<4,int32_t> vector. 59 | 60 | Load a scalar value from an address and splat it into the whole vector could be done through class static method VTYPE::load_and_splat(STYPE *) 61 | 62 | There is another method called VTYPE::load_const(STYPE*), which has similar semantics. 63 | 64 | ###Compare Operations 65 | 66 | Compare two vectors, and return a svec<4,bool> vector. 67 | 68 | Operators: == != for all types 69 | 70 | Operators: >, >=, <, <= for all types except svec<4,bool>. 71 | 72 | ###Bit operations 73 | 74 | svec<4,bool> has operator ~ to reverse the boolean value. 75 | 76 | Binary bit operators &, |, ^ are available for all integer vector types. 77 | 78 | Logical operators !, &&, || are available for svec<4,bool> type. 79 | 80 | ###Math operations 81 | 82 | Support all types except svec<4,bool>. 83 | 84 | Unary operator "-" is used to get the neg value for non-boolean vectors 85 | 86 | Binary operators +, -, *, / can support VTYPE op VTYPE, VTYPE op STYPE, STYPE op VTYPE. 87 | 88 | Binary operators >>, <<, % can support VTYPE op VTYPE, VTYPE op STYPE over all integer types. 89 | 90 | \>> and << for shift, and % for remainder. 
91 | 92 | Please note shift by a vector can only has unsigned integer vector in the right hand. 93 | 94 | ###Instance methods operations 95 | 96 | broadcast(), rotate(), shuffle() support all types exclude svec<4,bool>(). 97 | 98 | round(), floor(), ceil(), sqrt(), rcp(), rsqrt(), exp(), log(), pow(VTYPE) support svec<4,float>, and svec<4,double>. 99 | 100 | All above will return a new vector. 101 | 102 | reduce_add(), reduce_max(), reduce_min() do a vector scope's reduction, and return a scalar value. 103 | 104 | any_true(), all_true(), none_true() do a svec<4,bool> vector's reduction, and return a boolean scalar value. 105 | 106 | ###Gather and Scatter 107 | 108 | Please refer the detail document for how to use gather and scatter. 109 | E.g. svec<4,int32_t> type 110 | 111 | - svec<4,int32_t>::gather() 112 | - svec<4,int32_t>::scatter() 113 | - svec<4,int32_T>::gather_base_offsets() 114 | - svec<4,int32_t>::scatter_base_offsets() 115 | - svec<4,int32_t>::gather_stride() 116 | - svec<4,int32_t>::scatter_stride() 117 | 118 | **Note** The current power processor has no gather/scatter instructions. The software based implementation is slow right now, especially the gather_base_offsets() and scatter_base_offsets(). 119 | 120 | In case of regular stride style gather/scatter, it's better to use gather_stride() and scatter_stride(). 121 | 122 | ###Multiply-Add and Multiply-Sub 123 | 124 | VTYPE svec_madd(VTYPE a, VTYPE b, VTYPE c) returns a * b + c; 125 | 126 | VTYPE svec_msub(VTYPE a, VTYPE b, VTYPE c) returns a * b - c; 127 | 128 | VTYPE svec_nmsub(VTYPE a, VTYPE b, VTYPE c) returns -(a * b - c); 129 | 130 | ###Select operation 131 | 132 | The prototype is svec_select(svec<4,bool> mask, VTYPE a, VTYPE b), and return a new vector whose elements are selected from _a_ or _b_ based on the mask. True from _a_ and false from _b_. 133 | 134 | There is another select svec_select(bool cond, VTYPE a, VTYPE b), which is the same as "cond ? a : b". 
135 | 136 | ###Type cast operation 137 | 138 | The prototype is svec_cast(FROM_VTYPE). It supports all combinations of type cast. Each element's cast semantics is the same as scalar cast. 139 | 140 | ###Operation with mask 141 | 142 | load, store, gatter, scatter, compare operations have a masked version. 143 | Please refer the detail document for detail. 144 | -------------------------------------------------------------------------------- /examples/HelloSIMD/HelloSIMD.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright 2012 the Generic SIMD Intrinsic Library project authors. All rights reserved. 3 | 4 | Copyright IBM Corp. 2013, 2013. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following 14 | disclaimer in the documentation and/or other materials provided 15 | with the distribution. 16 | * Neither the name of IBM Corp. nor the names of its contributors may be 17 | used to endorse or promote products derived from this software 18 | without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | */ 32 | 33 | 34 | /* 35 | * g++ -I../../include HelloSIMD.cpp -mvsx -flax-vector-conversions -Wno-int-to-pointer-cast -o HelloSIMD 36 | * */ 37 | 38 | #include 39 | #include 40 | 41 | int main (int argc, char* argv[]) 42 | { 43 | svec<4,float> v1(1.1, 2.2, 3.3, 4.4); 44 | svec<4,float> v2 = v1 * 2; 45 | std::cout << "Hello World: " << v2 << std::endl; 46 | return 0; 47 | } 48 | 49 | -------------------------------------------------------------------------------- /examples/HelloSIMD/Makefile: -------------------------------------------------------------------------------- 1 | 2 | 3 | EXAMPLE=HelloSIMD 4 | RUN_ARGS= 5 | 6 | include ../common.mk -------------------------------------------------------------------------------- /examples/RGB2Gray/Makefile: -------------------------------------------------------------------------------- 1 | 2 | 3 | EXAMPLE=RGB2Gray 4 | RUN_ARGS= 5 | 6 | include ../common.mk -------------------------------------------------------------------------------- /examples/RGB2Gray/RGB2Gray.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright 2012 the Generic SIMD Intrinsic Library project authors. All rights reserved. 3 | 4 | Copyright IBM Corp. 2013, 2013. All rights reserved. 
5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following 14 | disclaimer in the documentation and/or other materials provided 15 | with the distribution. 16 | * Neither the name of IBM Corp. nor the names of its contributors may be 17 | used to endorse or promote products derived from this software 18 | without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
31 | */ 32 | 33 | /** 34 | * RGB2Gray.cpp 35 | * 36 | * Created on: Jun 12, 2013 37 | * @author: Haichuan Wang (haichuan@us.ibm.com, hwang154@illinois.edu) 38 | 39 | */ 40 | 41 | 42 | /* 43 | * g++ -I../../include RGB2Gray.cpp -mvsx -flax-vector-conversions -Wno-int-to-pointer-cast -g -O2 -o RGB2Gray 44 | * */ 45 | 46 | #include 47 | #include 48 | #include 49 | #include 50 | #include 51 | #include 52 | #include 53 | 54 | #define N (1048576) 55 | //#define N (1000000) 56 | 57 | //Doesn't work 58 | //__attribute__((optimize("no-tree-vectorize"))) 59 | 60 | void 61 | #ifdef __SSE4_2__ 62 | __attribute__((target("no-sse"))) 63 | #endif 64 | serial_rgb2gray(float* ra, float* ga, float* ba, float* gray) { 65 | for(int i = 0; i < N; i++) { 66 | gray[i] = 0.3f * ra[i] + 0.59f * ga[i] + 0.11f * ba[i]; 67 | } 68 | } 69 | 70 | typedef svec<4,float> vfloat; 71 | 72 | void svec4_rgb2gray(float* ra, float* ga, float* ba, float* gray ) { 73 | 74 | for(int i = 0; i < N; i+=4) { 75 | vfloat a = vfloat::load((vfloat*)(ra+i)); 76 | vfloat b = vfloat::load((vfloat*)(ga+i)); 77 | vfloat c = vfloat::load((vfloat*)(ba+i)); 78 | vfloat out = 0.3f * a + 0.59f * b + 0.11f * c ; 79 | out.store((vfloat*)(gray+i)); 80 | } 81 | } 82 | 83 | void svec4_rgb2gray_ptr(float* ra, float* ga, float* ba, float* gray ) { 84 | 85 | for(int i = 0; i < N; i+=4) { 86 | vfloat a = *(vfloat*)(ra+i); 87 | vfloat b = *(vfloat*)(ga+i); 88 | vfloat c = *(vfloat*)(ba+i); 89 | vfloat out = 0.3f * a + 0.59f * b + 0.11f * c ; 90 | *(vfloat*)(gray+i) = out; 91 | } 92 | } 93 | 94 | #ifdef __ALTIVEC__ 95 | void intrinsics_rgb2gray(float* ra, float* ga, float* ba, float* gray ) { 96 | __vector float c1 = vec_splats(0.3f); 97 | __vector float c2 = vec_splats(0.59f); 98 | __vector float c3 = vec_splats(0.11f); 99 | 100 | for(int i = 0; i < N; i+=4) { 101 | __vector float a = vec_vsx_ld(0, ra+i); 102 | __vector float b = vec_vsx_ld(0, ga+i); 103 | __vector float c = vec_vsx_ld(0, ba+i); 104 | __vector float out = c1 * a + c2 
* b + c3 * c ; 105 | vec_vsx_st(out, 0, gray+i); 106 | } 107 | } 108 | #endif 109 | 110 | #ifdef __SSE4_2__ 111 | 112 | void sse_rgb2gray(float* ra, float* ga, float* ba, float* gray) { 113 | __m128 c1 = _mm_set1_ps(0.3f); 114 | __m128 c2 = _mm_set1_ps(0.59f); 115 | __m128 c3 = _mm_set1_ps(0.11f); 116 | 117 | for(int i = 0; i < N; i+=4) { 118 | __m128 a = _mm_loadu_ps(ra+i); 119 | __m128 b = _mm_loadu_ps(ga+i); 120 | __m128 c = _mm_loadu_ps(ba+i); 121 | __m128 ab = _mm_add_ps(_mm_mul_ps(c1, a), _mm_mul_ps(c2, b)); 122 | __m128 out = _mm_add_ps(ab, _mm_mul_ps(c3, c)); 123 | _mm_storeu_ps(gray+i, out); 124 | } 125 | } 126 | 127 | #endif 128 | 129 | void svec4_rgb2gray_fma(float* ra, float* ga, float* ba, float* gray) { 130 | for(int i = 0; i < N; i+=4) { 131 | vfloat a = vfloat::load((vfloat*)(ra+i)); 132 | vfloat b = vfloat::load((vfloat*)(ga+i)); 133 | vfloat c = vfloat::load((vfloat*)(ba+i)); 134 | vfloat out = 0.3 * a; 135 | out = svec_madd(vfloat(0.59), b, out); 136 | out = svec_madd(vfloat(0.11), c, out); 137 | out.store((vfloat*)(gray+i)); 138 | } 139 | } 140 | 141 | 142 | float r[N+10000] POST_ALIGN(16); 143 | float g[N+20000] POST_ALIGN(16); 144 | float b[N+30000] POST_ALIGN(16); 145 | float gray[N+40000] POST_ALIGN(16); 146 | 147 | #define ITERATIONS 1000 148 | int main (int argc, char* argv[]) 149 | { 150 | for(int i = 0; i < N; i++) { 151 | r[N] = random() % 256; 152 | g[N] = random() % 256; 153 | b[N] = random() % 256; 154 | } 155 | std::cout<< "Convert " << N << " pixels RGB to gray." 
<< std::endl; 156 | 157 | reset_and_start_stimer(); 158 | for(int i = 0; i < ITERATIONS; i++) { serial_rgb2gray(r, g, b, gray);} 159 | double dt = get_elapsed_seconds(); 160 | std::cout<< "serial version: " << dt << " seconds" << std::endl; 161 | 162 | reset_and_start_stimer(); 163 | for(int i = 0; i < ITERATIONS; i++) { svec4_rgb2gray(r, g, b, gray);} 164 | double dt2 = get_elapsed_seconds(); 165 | std::cout<< "svec4 version: " << dt2 << " seconds" << std::endl; 166 | 167 | reset_and_start_stimer(); 168 | for(int i = 0; i < ITERATIONS; i++) { svec4_rgb2gray_ptr(r, g, b, gray); } 169 | double dt3 = get_elapsed_seconds(); 170 | std::cout<< "svec4 ptr ld/st version: " << dt3 << " seconds" << std::endl; 171 | 172 | #ifdef __ALTIVEC__ 173 | reset_and_start_stimer(); 174 | for(int i = 0; i < ITERATIONS; i++) { intrinsics_rgb2gray(r, g, b, gray);} 175 | double dt5 = get_elapsed_seconds(); 176 | std::cout<< "Intrinsics version: " << dt5 << " seconds" << std::endl; 177 | #endif 178 | 179 | #ifdef __SSE4_2__ 180 | reset_and_start_stimer(); 181 | for(int i = 0; i < ITERATIONS; i++) { sse_rgb2gray(r, g, b, gray);} 182 | double dt6 = get_elapsed_seconds(); 183 | std::cout<< "SSE version: " << dt6 << " seconds" << std::endl; 184 | #endif 185 | 186 | reset_and_start_stimer(); 187 | for(int i = 0; i < ITERATIONS; i++) { svec4_rgb2gray_fma(r, g, b, gray); } 188 | double dt4 = get_elapsed_seconds(); 189 | std::cout<< "svec4 fma version: " << dt4 << " seconds" << std::endl; 190 | 191 | 192 | return 0; 193 | } 194 | 195 | -------------------------------------------------------------------------------- /examples/RGB2Gray/RGB2Gray_tune.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright 2012 the Generic SIMD Intrinsic Library project authors. All rights reserved. 3 | 4 | Copyright IBM Corp. 2013, 2013. All rights reserved. 
5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following 14 | disclaimer in the documentation and/or other materials provided 15 | with the distribution. 16 | * Neither the name of IBM Corp. nor the names of its contributors may be 17 | used to endorse or promote products derived from this software 18 | without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
31 | */ 32 | 33 | /* 34 | * RGB2Gray_tune.cpp 35 | * 36 | * Created on: Jun 12, 2013 37 | * Author: haichuan 38 | */ 39 | 40 | 41 | /* 42 | * g++ -I../../include RGB2Gray.cpp -mvsx -flax-vector-conversions -Wno-int-to-pointer-cast -g -O2 -o RGB2Gray 43 | * */ 44 | 45 | #include 46 | #include 47 | #include 48 | #include 49 | #include 50 | #include 51 | #include 52 | #ifdef __ALTIVEC__ 53 | #include 54 | using namespace vsx; 55 | #else 56 | #ifdef __SSE4_2__ 57 | #include 58 | using namespace sse; 59 | #else 60 | #include 61 | using namespace generic; 62 | #endif //__SSE4_2__ 63 | #endif //__ALTIVEC__ 64 | 65 | 66 | 67 | //#define N (16000) 68 | //#define N 1000000 69 | #define N (1048576) 70 | #ifdef __SSE4_2__ 71 | __attribute__((target("no-sse"))) 72 | #endif 73 | void serial_rgb2gray(float* ra, float* ga, float* ba, float* gray) { 74 | for(int i = 0; i < N; i++) { 75 | gray[i] = 0.3f * ra[i] + 0.59f * ga[i] + 0.11f * ba[i]; 76 | } 77 | } 78 | 79 | void 80 | __attribute__((optimize("no-tree-vectorize"))) 81 | svec4_rgb2gray(float* ra, float* ga, float* ba, float* gray ) { 82 | 83 | for(int i = 0; i < N; i+=4) { 84 | svec4_f a = svec4_f::load((svec4_f*)(ra+i)); 85 | svec4_f b = svec4_f::load((svec4_f*)(ga+i)); 86 | svec4_f c = svec4_f::load((svec4_f*)(ba+i)); 87 | svec4_f out = 0.3f * a + 0.59f * b + 0.11f * c ; 88 | out.store((svec4_f*)(gray+i)); 89 | } 90 | } 91 | 92 | void 93 | __attribute__((optimize("no-tree-vectorize"))) 94 | svec4_rgb2gray_ptr(float* ra, float* ga, float* ba, float* gray ) { 95 | 96 | for(int i = 0; i < N; i+=4) { 97 | svec4_f a = *(svec4_f*)(ra+i); 98 | svec4_f b = *(svec4_f*)(ga+i); 99 | svec4_f c = *(svec4_f*)(ba+i); 100 | svec4_f out = 0.3f * a + 0.59f * b + 0.11f * c ; 101 | *(svec4_f*)(gray+i) = out; 102 | } 103 | } 104 | 105 | 106 | void 107 | __attribute__((optimize("no-tree-vectorize"))) 108 | svec4_rgb2gray_fma(float* ra, float* ga, float* ba, float* gray) { 109 | for(int i = 0; i < N; i+=4) { 110 | svec4_f a = 
svec4_f::load((svec4_f*)(ra+i)); 111 | svec4_f b = svec4_f::load((svec4_f*)(ga+i)); 112 | svec4_f c = svec4_f::load((svec4_f*)(ba+i)); 113 | svec4_f out = 0.3 * a; 114 | out = svec_madd(svec4_f(0.59), b, out); 115 | out = svec_madd(svec4_f(0.11), c, out); 116 | out.store((svec4_f*)(gray+i)); 117 | } 118 | } 119 | 120 | #ifdef __ALTIVEC__ 121 | void intrinsics_rgb2gray(float* ra, float* ga, float* ba, float* gray ) { 122 | __vector float c1 = vec_splats(0.3f); 123 | __vector float c2 = vec_splats(0.59f); 124 | __vector float c3 = vec_splats(0.11f); 125 | 126 | for(int i = 0; i < N; i+=4) { 127 | __vector float a = vec_vsx_ld(0, ra+i); 128 | __vector float b = vec_vsx_ld(0, ga+i); 129 | __vector float c = vec_vsx_ld(0, ba+i); 130 | __vector float out = c1 * a + c2 * b + c3 * c ; 131 | vec_vsx_st(out, 0, gray+i); 132 | } 133 | } 134 | #endif 135 | 136 | 137 | #ifdef __SSE4_2__ 138 | 139 | void sse_rgb2gray(float* ra, float* ga, float* ba, float* gray) { 140 | __m128 c1 = _mm_set1_ps(0.3f); 141 | __m128 c2 = _mm_set1_ps(0.59f); 142 | __m128 c3 = _mm_set1_ps(0.11f); 143 | 144 | for(int i = 0; i < N; i+=4) { 145 | __m128 a = _mm_loadu_ps(ra+i); 146 | __m128 b = _mm_loadu_ps(ga+i); 147 | __m128 c = _mm_loadu_ps(ba+i); 148 | __m128 ab = _mm_add_ps(_mm_mul_ps(c1, a), _mm_mul_ps(c2, b)); 149 | __m128 out = _mm_add_ps(ab, _mm_mul_ps(c3, c)); 150 | _mm_storeu_ps(gray+i, out); 151 | } 152 | } 153 | #endif 154 | 155 | 156 | #ifdef __AVX__ 157 | #include "immintrin.h" 158 | void avx_rgb2gray(float* ra, float* ga, float* ba, float* gray) { 159 | __m256 c1 = _mm256_set1_ps(0.3f); 160 | __m256 c2 = _mm256_set1_ps(0.59f); 161 | __m256 c3 = _mm256_set1_ps(0.11f); 162 | 163 | for(int i = 0; i < N; i+=8) { 164 | __m256 a = _mm256_loadu_ps(ra+i); 165 | __m256 b = _mm256_loadu_ps(ga+i); 166 | __m256 c = _mm256_loadu_ps(ba+i); 167 | __m256 ab = _mm256_add_ps(_mm256_mul_ps(c1, a), _mm256_mul_ps(c2, b)); 168 | __m256 out = _mm256_add_ps(ab, _mm256_mul_ps(c3, c)); 169 | 
_mm256_storeu_ps(gray+i, out); 170 | } 171 | } 172 | 173 | #endif 174 | 175 | float r[N+10000] POST_ALIGN(16); 176 | float g[N+20000] POST_ALIGN(16); 177 | float b[N+30000] POST_ALIGN(16); 178 | float gray[N+40000] POST_ALIGN(16); 179 | 180 | #define ITERATIONS 1000 181 | int main (int argc, char* argv[]) 182 | { 183 | 184 | for(int i = 0; i < N; i++) { 185 | r[N] = random() % 256; 186 | g[N] = random() % 256; 187 | b[N] = random() % 256; 188 | } 189 | std::cout<< "Convert " << N << " pixels RGB to gray." << std::endl; 190 | 191 | HPM_PERF_CREATE; 192 | 193 | HPM_PERF_START; 194 | reset_and_start_stimer(); 195 | for(int i = 0; i < ITERATIONS; i++) { serial_rgb2gray(r, g, b, gray);} 196 | double dt = get_elapsed_seconds(); 197 | HPM_PERF_STOP; 198 | std::cout<< "serial version: " << dt << " seconds" << std::endl; 199 | 200 | HPM_PERF_START; 201 | reset_and_start_stimer(); 202 | for(int i = 0; i < ITERATIONS; i++) { svec4_rgb2gray(r, g, b, gray);} 203 | double dt2 = get_elapsed_seconds(); 204 | HPM_PERF_STOP; 205 | std::cout<< "svec4 version: " << dt2 << " seconds" << std::endl; 206 | 207 | HPM_PERF_START; 208 | reset_and_start_stimer(); 209 | for(int i = 0; i < ITERATIONS; i++) { svec4_rgb2gray_ptr(r, g, b, gray); } 210 | double dt3 = get_elapsed_seconds(); 211 | HPM_PERF_STOP; 212 | std::cout<< "svec4 ptr ld/st version: " << dt3 << " seconds" << std::endl; 213 | 214 | HPM_PERF_START; 215 | reset_and_start_stimer(); 216 | for(int i = 0; i < ITERATIONS; i++) { svec4_rgb2gray_fma(r, g, b, gray); } 217 | double dt4 = get_elapsed_seconds(); 218 | HPM_PERF_STOP; 219 | std::cout<< "svec4 fma version: " << dt4 << " seconds" << std::endl; 220 | 221 | #ifdef __ALTIVEC__ 222 | HPM_PERF_START; 223 | reset_and_start_stimer(); 224 | for(int i = 0; i < ITERATIONS; i++) { intrinsics_rgb2gray(r, g, b, gray);} 225 | double dt5 = get_elapsed_seconds(); 226 | HPM_PERF_STOP; 227 | std::cout<< "Intrinsics version: " << dt5 << " seconds" << std::endl; 228 | #endif 229 | 230 | #ifdef 
__SSE4_2__ 231 | HPM_PERF_START; 232 | reset_and_start_stimer(); 233 | for(int i = 0; i < ITERATIONS; i++) { sse_rgb2gray(r, g, b, gray);} 234 | double dt6 = get_elapsed_seconds(); 235 | HPM_PERF_STOP; 236 | std::cout<< "SSE version: " << dt6 << " seconds" << std::endl; 237 | #endif 238 | 239 | #ifdef __AVX__ 240 | HPM_PERF_START; 241 | reset_and_start_stimer(); 242 | for(int i = 0; i < ITERATIONS; i++) { avx_rgb2gray(r, g, b, gray);} 243 | double dt7 = get_elapsed_seconds(); 244 | HPM_PERF_STOP; 245 | std::cout<< "AVX version: " << dt7 << " seconds" << std::endl; 246 | #endif 247 | 248 | HPM_PERF_CLOSE; 249 | return 0; 250 | } 251 | 252 | -------------------------------------------------------------------------------- /examples/RGB2YUV/Makefile: -------------------------------------------------------------------------------- 1 | 2 | 3 | EXAMPLE=RGB2YUV 4 | RUN_ARGS= 5 | 6 | include ../common.mk -------------------------------------------------------------------------------- /examples/RGB2YUV/RGB2YUV.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright 2012 the Generic SIMD Intrinsic Library project authors. All rights reserved. 3 | 4 | Copyright IBM Corp. 2013, 2013. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following 14 | disclaimer in the documentation and/or other materials provided 15 | with the distribution. 16 | * Neither the name of IBM Corp. nor the names of its contributors may be 17 | used to endorse or promote products derived from this software 18 | without specific prior written permission. 
19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | */ 32 | 33 | /** 34 | * RGB2YUV.cpp 35 | * 36 | * Created on: Jun 12, 2013 37 | * @author: Haichuan Wang (hwang154@illinois.edu) 38 | * 39 | * Parameters are from http://en.wikipedia.org/wiki/YUV 40 | * Storages are based on SoA 41 | */ 42 | 43 | 44 | /* 45 | * IBM Power compiling 46 | * g++ -I../../include RGB2YUV.cpp -mvsx -flax-vector-conversions -Wno-int-to-pointer-cast -g -O2 -o RGB2YUV 47 | * */ 48 | 49 | #include 50 | #include 51 | #include 52 | #include 53 | #include 54 | #include 55 | #include 56 | 57 | //#define N (1048576) 58 | #define N (512*512) 59 | 60 | //Doesn't work 61 | //__attribute__((optimize("no-tree-vectorize"))) 62 | 63 | void 64 | #ifdef __SSE4_2__ 65 | __attribute__((target("no-sse"))) 66 | #endif 67 | serial_rgb2gray(float* ra, float* ga, float* ba, float* ya, float* ua, float* va) { 68 | for(int i = 0; i < N; i++) { 69 | ya[i] = 0.299f * ra[i] + 0.584f * ga[i] + 0.114f * ba[i]; 70 | ua[i] = -0.14713f * ra[i] -0.28886f * ga[i] + 0.436f * ba[i]; 71 | va[i] = 0.615f * ra[i] - 0.51499f * ga[i] - 0.10001f * ba[i]; 72 | } 73 | } 74 | 75 | typedef svec<4,float> vfloat; 76 | 77 | void svec4_rgb2gray(float* ra, float* ga, float* ba, float* 
ya, float* ua, float* va) { 78 | 79 | for(int i = 0; i < N; i+=4) { 80 | vfloat a = vfloat::load((vfloat*)(ra+i)); 81 | vfloat b = vfloat::load((vfloat*)(ga+i)); 82 | vfloat c = vfloat::load((vfloat*)(ba+i)); 83 | vfloat y = 0.299f * a + 0.584f * b + 0.114f * c ; 84 | y.store((vfloat*)(ya+i)); 85 | vfloat u = -0.14713f * a - 0.28886f * b + 0.436f * c; 86 | u.store((vfloat*)(ua+i)); 87 | vfloat v = 0.615f * a - 0.51499f * b - 0.10001f * c; 88 | v.store((vfloat*)(va+i)); 89 | } 90 | } 91 | 92 | void svec4_rgb2gray_ptr(float* ra, float* ga, float* ba, float* ya, float* ua, float* va) { 93 | 94 | for(int i = 0; i < N; i+=4) { 95 | vfloat a = *(vfloat*)(ra+i); 96 | vfloat b = *(vfloat*)(ga+i); 97 | vfloat c = *(vfloat*)(ba+i); 98 | vfloat y = 0.299f * a + 0.584f * b + 0.114f * c ; 99 | *(vfloat*)(ya+i) = y; 100 | vfloat u = -0.14713f * a - 0.28886f * b + 0.436f * c; 101 | *(vfloat*)(ua+i) = u; 102 | vfloat v = 0.615f * a - 0.51499f * b - 0.10001f * c; 103 | *(vfloat*)(va+i) = v; 104 | } 105 | } 106 | 107 | #ifdef __ALTIVEC__ 108 | void intrinsics_rgb2gray(float* ra, float* ga, float* ba, float* ya, float* ua, float* va) { 109 | __vector float c11 = vec_splats(0.299f); 110 | __vector float c12 = vec_splats(0.584f); 111 | __vector float c13 = vec_splats(0.114f); 112 | __vector float c21 = vec_splats(-0.1471f); 113 | __vector float c22 = vec_splats(-0.28886f); 114 | __vector float c23 = vec_splats(0.436f); 115 | __vector float c31 = vec_splats(0.615f); 116 | __vector float c32 = vec_splats(-0.51499f); 117 | __vector float c33 = vec_splats(-0.10001f); 118 | 119 | for(int i = 0; i < N; i+=4) { 120 | __vector float a = vec_vsx_ld(0, ra+i); 121 | __vector float b = vec_vsx_ld(0, ga+i); 122 | __vector float c = vec_vsx_ld(0, ba+i); 123 | __vector float y = c11 * a + c12 * b + c13 * c ; 124 | vec_vsx_st(y, 0, ya+i); 125 | __vector float u = c21 * a + c22 * b + c23 * c ; 126 | vec_vsx_st(u, 0, ua+i); 127 | __vector float v = c31 * a + c32 * b + c33 * c ; 128 | vec_vsx_st(v, 0, 
va+i); 129 | } 130 | } 131 | #endif 132 | 133 | #ifdef __SSE4_2__ 134 | 135 | /* SSE implementation of the RGB->YUV conversion (4 floats per iteration). */ 135 | void sse_rgb2gray(float* ra, float* ga, float* ba, float* ya, float* ua, float* va) { 136 | __m128 c11 = _mm_set1_ps(0.299f); 137 | __m128 c12 = _mm_set1_ps(0.584f); 138 | __m128 c13 = _mm_set1_ps(0.114f); 139 | __m128 c21 = _mm_set1_ps(-0.14713f); /* consistency fix: was -0.1471f, serial reference uses -0.14713f */ 140 | __m128 c22 = _mm_set1_ps(-0.28886f); 141 | __m128 c23 = _mm_set1_ps(0.436f); 142 | __m128 c31 = _mm_set1_ps(0.615f); 143 | __m128 c32 = _mm_set1_ps(-0.51499f); 144 | __m128 c33 = _mm_set1_ps(-0.10001f); 145 | 146 | 147 | for(int i = 0; i < N; i+=4) { 148 | __m128 a = _mm_loadu_ps(ra+i); 149 | __m128 b = _mm_loadu_ps(ga+i); 150 | __m128 c = _mm_loadu_ps(ba+i); 151 | __m128 y = _mm_add_ps(_mm_add_ps(_mm_mul_ps(c11, a), _mm_mul_ps(c12, b)), _mm_mul_ps(c13, c)); 152 | _mm_storeu_ps(ya+i, y); 153 | __m128 u = _mm_add_ps(_mm_add_ps(_mm_mul_ps(c21, a), _mm_mul_ps(c22, b)), _mm_mul_ps(c23, c)); 154 | _mm_storeu_ps(ua+i, u); 155 | __m128 v = _mm_add_ps(_mm_add_ps(_mm_mul_ps(c31, a), _mm_mul_ps(c32, b)), _mm_mul_ps(c33, c)); 156 | _mm_storeu_ps(va+i, v); 157 | } 158 | } 159 | 160 | #endif 161 | 162 | 163 | //the strange 100,200,300,... offset is used to reduce the effect of "address conflicts" 164 | float r[N+100] POST_ALIGN(16); 165 | float g[N+200] POST_ALIGN(16); 166 | float b[N+300] POST_ALIGN(16); 167 | float y[N+400] POST_ALIGN(16); 168 | float u[N+500] POST_ALIGN(16); 169 | float v[N+600] POST_ALIGN(16); 170 | 171 | #define ITERATIONS 1000 172 | int main (int argc, char* argv[]) 173 | { 174 | for(int i = 0; i < N; i++) { 175 | r[i] = random() % 256; /* bug fix: was r[N] — out-of-range write left the inputs uninitialized */ 176 | g[i] = random() % 256; /* bug fix: was g[N] */ 177 | b[i] = random() % 256; /* bug fix: was b[N] */ 178 | } 179 | std::cout<< "Convert " << N << " pixels RGB to YUV."
<< std::endl; 180 | 181 | reset_and_start_stimer(); 182 | for(int i = 0; i < ITERATIONS; i++) { serial_rgb2gray(r, g, b, y, u, v);} 183 | double dt = get_elapsed_seconds(); 184 | std::cout<< "serial version: " << dt << " seconds" << std::endl; 185 | 186 | reset_and_start_stimer(); 187 | for(int i = 0; i < ITERATIONS; i++) { svec4_rgb2gray(r, g, b, y, u, v);} 188 | double dt2 = get_elapsed_seconds(); 189 | std::cout<< "svec4 version: " << dt2 << " seconds" << std::endl; 190 | 191 | reset_and_start_stimer(); 192 | for(int i = 0; i < ITERATIONS; i++) { svec4_rgb2gray_ptr(r, g, b, y, u, v); } 193 | double dt3 = get_elapsed_seconds(); 194 | std::cout<< "svec4 ptr ld/st version: " << dt3 << " seconds" << std::endl; 195 | 196 | #ifdef __ALTIVEC__ 197 | reset_and_start_stimer(); 198 | for(int i = 0; i < ITERATIONS; i++) { intrinsics_rgb2gray(r, g, b, y, u, v);} 199 | double dt5 = get_elapsed_seconds(); 200 | std::cout<< "Power VSX version: " << dt5 << " seconds" << std::endl; 201 | #endif 202 | 203 | #ifdef __SSE4_2__ 204 | reset_and_start_stimer(); 205 | for(int i = 0; i < ITERATIONS; i++) { sse_rgb2gray(r, g, b, y, u, v);} 206 | double dt6 = get_elapsed_seconds(); 207 | std::cout<< "SSE version: " << dt6 << " seconds" << std::endl; 208 | #endif 209 | 210 | return 0; 211 | } 212 | 213 | -------------------------------------------------------------------------------- /examples/common.mk: -------------------------------------------------------------------------------- 1 | .PHONY: %.perfexe %.perfcollect %.perfbr %.perfhw 2 | 3 | 4 | ifeq (${BASE_DIR},) 5 | BASE_DIR=../..
6 | endif 7 | 8 | 9 | BITS=64 10 | CC=gcc 11 | CXX=g++ 12 | CCFLAGS+= -I${BASE_DIR}/include -O2 -m$(BITS) 13 | CXXFLAGS+= -I${BASE_DIR}/include -O2 -m$(BITS) 14 | 15 | CCFLAGS+= -Wno-int-to-pointer-cast 16 | CXXFLAGS+= -Wno-int-to-pointer-cast 17 | ########################################################## 18 | # Platform specific options 19 | # ppc64 or intel 20 | ########################################################## 21 | 22 | MACHINE=$(shell uname -m) 23 | ifeq ($(firstword $(filter ppc64,$(MACHINE))),ppc64) 24 | CXXFLAGS += -mno-vrsave -mvsx -flax-vector-conversions -mcpu=power7 25 | CCFLAGS += -mno-vrsave -mvsx -flax-vector-conversions -mcpu=power7 26 | ifeq (${PPC_ISA}, P8) 27 | PLATFORM = ppc64_P8 28 | CCFLAGS += -D__POWER8 29 | CXXFLAGS += -D__POWER8 30 | else 31 | PLATFORM = ppc64_P7 32 | endif 33 | else 34 | PLATFORM=x86-64 35 | CCFLAGS += -msse4.2 36 | CXXFLAGS += -msse4.2 37 | endif 38 | 39 | 40 | default: ${EXAMPLE} 41 | 42 | 43 | ${EXAMPLE}: ${EXAMPLE}.cpp 44 | ${CXX} ${CXXFLAGS} $< -o $@ 45 | 46 | 47 | run: ${EXAMPLE} 48 | ./$< ${RUN_ARGS} 49 | 50 | ${EXAMPLE}_tune: ${EXAMPLE}_tune.cpp 51 | ${CXX} ${CXXFLAGS} $< -o $@ 52 | 53 | tune: ${EXAMPLE}_tune 54 | ./$< ${RUN_ARGS} 55 | 56 | TMP=__perf.tmp 57 | 58 | #special for collecting all perf data 59 | %.perf: %.perfbr %.perficache %.perfdcache %.perfllc 60 | @echo "end" 61 | @rm -f ${TMP} 62 | 63 | 64 | 65 | 66 | %.perfhw: CXXFLAGS+= -DPERF_HW 67 | %.perfhw: %.cpp 68 | ${CXX} ${CXXFLAGS} $< -o $@ 69 | ./$@ | tee ${TMP} 70 | @grep "HPM Event" ${TMP} | tail -1 71 | @grep "HPM Values" ${TMP} 72 | 73 | 74 | %.perfbr: CXXFLAGS+= -DPERF_BR 75 | %.perfbr: %.cpp 76 | ${CXX} ${CXXFLAGS} $< -o $@ 77 | ./$@ | tee ${TMP} 78 | @grep "HPM Event" ${TMP} | tail -1 79 | @grep "HPM Values" ${TMP} 80 | 81 | %.perficache: CXXFLAGS+= -DPERF_ICACHE 82 | %.perficache: %.cpp 83 | ${CXX} ${CXXFLAGS} $< -o $@ 84 | ./$@ | tee ${TMP} 85 | @grep "HPM Event" ${TMP} | tail -1 86 | @grep "HPM Values" ${TMP} 87 | 88 | 89 | 
%.perfdcache: CXXFLAGS+= -DPERF_DCACHE 90 | %.perfdcache: %.cpp 91 | ${CXX} ${CXXFLAGS} $< -o $@ 92 | ./$@ | tee ${TMP} 93 | @grep "HPM Event" ${TMP} | tail -1 94 | @grep "HPM Values" ${TMP} 95 | 96 | %.perfllc: CXXFLAGS+= -DPERF_LLC 97 | %.perfllc: %.cpp 98 | ${CXX} ${CXXFLAGS} $< -o $@ 99 | ./$@ | tee ${TMP} 100 | @grep "HPM Event" ${TMP} | tail -1 101 | @grep "HPM Values" ${TMP} 102 | 103 | 104 | clean: 105 | rm -f ${EXAMPLE} ${EXAMPLE}_tune 106 | 107 | -------------------------------------------------------------------------------- /examples/mandelbrot/Makefile: -------------------------------------------------------------------------------- 1 | 2 | 3 | EXAMPLE=mandelbrot 4 | RUN_ARGS= 5 | 6 | include ../common.mk -------------------------------------------------------------------------------- /examples/mandelbrot/mandelbrot.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright 2012 the Generic SIMD Intrinsic Library project authors. All rights reserved. 3 | 4 | Copyright IBM Corp. 2013, 2013. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following 14 | disclaimer in the documentation and/or other materials provided 15 | with the distribution. 16 | * Neither the name of IBM Corp. nor the names of its contributors may be 17 | used to endorse or promote products derived from this software 18 | without specific prior written permission. 
19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | */ 32 | 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | 42 | /* 43 | g++ -I../../include mandelbrot.cc -mvsx -flax-vector-conversions -Wno-int-to-pointer-cast -O3 -o mandelbrot 44 | */ 45 | 46 | /* 47 | Scalar version of mandelbrot 48 | */ 49 | static int mandel(float c_re, float c_im, int count) { 50 | float z_re = c_re, z_im = c_im; 51 | int cci=0; 52 | for (cci = 0; cci < count; ++cci) { 53 | if (z_re * z_re + z_im * z_im > 4.f) 54 | break; 55 | 56 | float new_re = z_re*z_re - z_im*z_im; 57 | float new_im = 2.f * z_re * z_im; 58 | z_re = c_re + new_re; 59 | z_im = c_im + new_im; 60 | } 61 | return cci; 62 | } 63 | 64 | void mandelbrot_serial(float x0, float y0, float x1, float y1, 65 | int width, int height, int maxIterations, 66 | int output[]) 67 | { 68 | float dx = (x1 - x0) / width; 69 | float dy = (y1 - y0) / height; 70 | 71 | for (int j = 0; j < height; j++) { 72 | for (int i = 0; i < width; ++i) { 73 | float x = x0 + i * dx; 74 | float y = y0 + j * dy; 75 | 76 | int index = (j * width + i); 77 | output[index] = mandel(x, y, maxIterations); 78 | } 79 | } 80 | } 81 | 82 | 83 | /* 84 | 
Generic Intrinsics 85 | */ 86 | void mandelbrot_generic(float x0, float y0, float x1, float y1, 87 | int width, int height, int maxIterations, 88 | int output[]) 89 | { 90 | typedef svec<4,float> vfloat; 91 | typedef svec<4,int> vint; 92 | typedef svec<4,unsigned int> vuint; 93 | typedef svec<4,short> vshort; 94 | typedef svec<4,bool> vbool; 95 | 96 | float dx = (x1 - x0) / width; 97 | float dy = (y1 - y0) / height; 98 | 99 | vfloat v_x0(x0); 100 | vfloat v_y0(y0); 101 | vfloat v_x1(x1); 102 | vfloat v_y1(y1); 103 | vint vci_4(4); 104 | 105 | vfloat v_w((float)width); 106 | vfloat v_h((float)height); 107 | 108 | vfloat v_dx = (v_x1 - v_x0) / v_w; 109 | vfloat v_dy = (v_y1 - v_y0) / v_h; 110 | 111 | for (int j = 0; j < height; j++) { 112 | vint v_j(j); 113 | vint v_i(0,1,2,3); 114 | vfloat v_i_f(0.0,1.0,2.0,3.0); 115 | vfloat v_j_f((float)j); 116 | 117 | //this is the 'parallel loop' 118 | for (int i = 0; i < width; i+=4) { 119 | //float x = x0 + i * dx; 120 | //float y = y0 + j * dy; 121 | vfloat v_x = v_x0 + (v_i_f * v_dx); 122 | vfloat v_y = v_y0 + (v_j_f * v_dy); 123 | 124 | //int index = (j * width + i); 125 | //vint v_index = svec_add(svec_mulo((vshort)v_j,(vshort)v_w),v_i); 126 | vint v_index = v_j * svec_cast(v_w) + v_i; 127 | 128 | // //output[index] = mandel(x, y, maxIterations); 129 | 130 | //float z_re = x, z_im = y; 131 | vfloat v_z_re = v_x; 132 | vfloat v_z_im = v_y; 133 | 134 | int ci=0; 135 | //float ct_4=4.f; 136 | //float ct_2=2.f; 137 | vint vci(0); 138 | vbool vzero(0); 139 | vint vci_1(1); 140 | vfloat v_ct_4(4.f); 141 | vfloat v_ct_2(2.f); 142 | 143 | vbool v_mask(0xffff); 144 | 145 | //next stay the same 146 | for (ci = 0; ci < maxIterations; ++ci) { 147 | //if (z_re * z_re + z_im * z_im > ct_4) 148 | // break; 149 | vfloat v_m = v_z_re*v_z_re + v_z_im*v_z_im; 150 | vbool v_cmp = v_m > v_ct_4; 151 | 152 | //v_mask = vec_andc(v_mask, v_cmp); 153 | v_mask = v_mask & (~v_cmp); 154 | 155 | int allexit = svec_all_true(v_cmp); 156 | 157 | if( 
allexit ) break; 158 | 159 | //here some threads will stop; how do we implement that 160 | 161 | //float new_re = z_re*z_re - z_im*z_im; 162 | vfloat v_new_re = v_z_re*v_z_re - v_z_im*v_z_im; 163 | //float new_im = ct_2 * z_re * z_im; 164 | vfloat v_new_im = v_ct_2 * (v_z_re*v_z_im); 165 | 166 | //z_re = x + new_re; 167 | v_z_re = v_x+v_new_re; 168 | //z_im = y + new_im; 169 | v_z_im = v_y + v_new_im; 170 | 171 | vint vnci = vci + vci_1; 172 | vci = svec_select(v_mask, vnci, vci); 173 | } 174 | //store vci 175 | //output[index] = ci; 176 | 177 | int index = (j * width + i); 178 | vci.store((vint*)(output+index)); 179 | 180 | //increment vector i 181 | v_i = v_i + vci_4; 182 | v_i_f = v_i_f + v_ct_4; 183 | } 184 | } 185 | } 186 | 187 | 188 | #ifdef __ALTIVEC__ 189 | void mandelbrot_intrinsics(float x0, float y0, float x1, float y1, 190 | int width, int height, int maxIterations, 191 | int output[]) 192 | { 193 | typedef __vector float vfloat; 194 | typedef __vector signed int vint; 195 | typedef __vector unsigned int vuint; 196 | typedef __vector signed short vshort; 197 | typedef __vector bool int vbool; 198 | 199 | float dx = (x1 - x0) / width; 200 | float dy = (y1 - y0) / height; 201 | 202 | vfloat v_x0={x0,x0,x0,x0}; 203 | vfloat v_y0={y0,y0,y0,y0}; 204 | vfloat v_x1={x1,x1,x1,x1}; 205 | vfloat v_y1={y1,y1,y1,y1}; 206 | vint vci_4 = {4,4,4,4}; 207 | 208 | vfloat v_w={(float)width,(float)width,(float)width,(float)width}; 209 | vfloat v_h={(float)height,(float)height,(float)height,(float)height}; 210 | 211 | 212 | vfloat v_dx = vec_div( vec_sub(v_x1,v_x0), v_w); 213 | vfloat v_dy = vec_div( vec_sub(v_y1,v_y0), v_h); 214 | 215 | for (int j = 0; j < height; j++) { 216 | vint v_j = {j,j,j,j}; 217 | vint v_i = {0,1,2,3}; 218 | vfloat v_i_f = {0.0,1.0,2.0,3.0}; 219 | vfloat v_j_f = {(float)j,(float)j,(float)j,(float)j}; 220 | 221 | 222 | //this is the 'parallel loop' 223 | for (int i = 0; i < width; i+=4) { 224 | //float x = x0 + i * dx; 225 | //float y = y0 + j * dy; 
226 | vfloat v_x = vec_add(v_x0,vec_mul(v_i_f,v_dx)); 227 | vfloat v_y = vec_add(v_y0,vec_mul(v_j_f,v_dy)); 228 | 229 | //int index = (j * width + i); 230 | vint v_index = vec_add(vec_mulo((vshort)v_j,(vshort)v_w),v_i); 231 | 232 | // //output[index] = mandel(x, y, maxIterations); 233 | 234 | //float z_re = x, z_im = y; 235 | vfloat v_z_re = v_x; 236 | vfloat v_z_im = v_y; 237 | 238 | int ci=0; 239 | //float ct_4=4.f; 240 | //float ct_2=2.f; 241 | vint vci = {0,0,0,0}; 242 | vbool vzero={0,0,0,0}; 243 | vint vci_1 = {1,1,1,1}; 244 | vfloat v_ct_4 = {4.f,4.f,4.f,4.f}; 245 | vfloat v_ct_2 = {2.f,2.f,2.f,2.f}; 246 | 247 | vbool v_mask = {0xffff,0xffff,0xffff,0xffff}; 248 | 249 | //next stay the same 250 | 251 | for (ci = 0; ci < maxIterations; ++ci) { 252 | //if (z_re * z_re + z_im * z_im > ct_4) 253 | // break; 254 | vfloat v_m = vec_add(vec_mul(v_z_re,v_z_re),vec_mul(v_z_im,v_z_im)); 255 | vbool v_cmp = vec_cmpgt(v_m,v_ct_4); 256 | 257 | //v_mask = v_mask & !v_cmp 258 | v_mask = vec_andc(v_mask, v_cmp); 259 | 260 | int allexit = vec_all_ne(v_cmp, vzero); 261 | 262 | if( allexit ) break; 263 | 264 | //here some threads will stop; how do we implement that 265 | 266 | //float new_re = z_re*z_re - z_im*z_im; 267 | vfloat v_new_re = vec_sub((vec_mul(v_z_re,v_z_re)),(vec_mul(v_z_im,v_z_im))); 268 | //float new_im = ct_2 * z_re * z_im; 269 | vfloat v_new_im = vec_mul(v_ct_2,vec_mul(v_z_re,v_z_im)); 270 | 271 | //z_re = x + new_re; 272 | v_z_re = vec_add(v_x,v_new_re); 273 | //z_im = y + new_im; 274 | v_z_im = vec_add(v_y, v_new_im); 275 | 276 | vint vnci = vec_add(vci,vci_1); 277 | vci = vec_sel(vci, vnci, v_mask); 278 | } 279 | //store vci 280 | //output[index] = ci; 281 | 282 | int index = (j * width + i); 283 | vec_st(vci, 0, output+index); 284 | 285 | //increment vector i 286 | v_i = vec_add(v_i, vci_4); 287 | v_i_f = vec_add(v_i_f, v_ct_4); 288 | } 289 | 290 | } 291 | } 292 | #endif 293 | 294 | 295 | /* Write a PPM image file with the image of the Mandelbrot set */ 
296 | static void 297 | writePPM(int *buf, int width, int height, const char *fn) { 298 | FILE *fp = fopen(fn, "wb"); 299 | fprintf(fp, "P6\n"); 300 | fprintf(fp, "%d %d\n", width, height); 301 | fprintf(fp, "255\n"); 302 | for (int i = 0; i < width*height; ++i) { 303 | // Map the iteration count to colors by just alternating between 304 | // two greys. 305 | char c = (buf[i] & 0x1) ? 240 : 20; 306 | for (int j = 0; j < 3; ++j) 307 | fputc(c, fp); 308 | } 309 | fclose(fp); 310 | printf("Wrote image file %s\n", fn); 311 | } 312 | 313 | 314 | static void 315 | writePPM_d(int *buf, int width, int height, const char *fn) { 316 | for (int i = 0; i < width; ++i) { 317 | for (int j = 0; j < height; ++j) { 318 | int index = i*width+j; 319 | printf("%4d ", buf[index]); 320 | } 321 | printf("\n"); 322 | } 323 | printf("Wrote image file %s\n", fn); 324 | } 325 | 326 | 327 | int main() { 328 | unsigned int width = 768; 329 | unsigned int height = 512; 330 | 331 | //unsigned int width = 1024; 332 | //unsigned int height = 1024; 333 | 334 | float x0 = -2; 335 | float x1 = 1; 336 | float y0 = -1; 337 | float y1 = 1; 338 | 339 | int maxIterations = 10; 340 | int *buf = new int[width*height]; 341 | 342 | // 343 | // Compute the image using the scalar and generic intrinsics implementations; report the minimum 344 | // time of three runs. 
345 | // 346 | 347 | double minSerial = 1e30; 348 | for (int i = 0; i < 3; ++i) { 349 | reset_and_start_stimer(); 350 | mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf); 351 | double dt = get_elapsed_seconds(); 352 | minSerial = std::min(minSerial, dt); 353 | } 354 | printf("[mandelbrot serial]:\t\t[%.4f] seconds\n", minSerial); 355 | writePPM(buf, width, height, "mandelbrot-serial.ppm"); 356 | 357 | 358 | double minGeneric = 1e30; 359 | for (int i = 0; i < 3; ++i) { 360 | reset_and_start_stimer(); 361 | mandelbrot_generic(x0, y0, x1, y1, width, height, maxIterations, buf); 362 | double dt = get_elapsed_seconds(); 363 | minGeneric = std::min(minGeneric, dt); 364 | } 365 | printf("[mandelbrot generic simd]:\t\t[%.4f] seconds\n", minGeneric); 366 | writePPM(buf, width, height, "mandelbrot-generic.ppm"); 367 | printf("[mandelbrot generic speedup]:\t\t%.2fx from GENERIC\n", minSerial/minGeneric); 368 | 369 | #ifdef __ALTIVEC__ 370 | double minIntrinsics = 1e30; 371 | for (int i = 0; i < 3; ++i) { 372 | reset_and_start_stimer(); 373 | mandelbrot_intrinsics(x0, y0, x1, y1, width, height, maxIterations, buf); 374 | double dt = get_elapsed_seconds(); 375 | minIntrinsics = std::min(minIntrinsics, dt); 376 | } 377 | printf("[mandelbrot intrinsics simd]:\t\t[%.4f] seconds\n", minIntrinsics); 378 | writePPM(buf, width, height, "mandelbrot-intrinsics.ppm"); 379 | printf("[mandelbrot intrinsics speedup]:\t%.2fx from INTRINSICS\n", minSerial/minIntrinsics); 380 | #endif 381 | return 0; 382 | } 383 | -------------------------------------------------------------------------------- /include/README.md: -------------------------------------------------------------------------------- 1 | # Header Files Organization 2 | 3 | The key header files is gsimd.h, which is the only header file that user codes include. 4 | 5 | The below structure is the header file organization. 
6 | ``` 7 | gsimd.h 8 | | 9 | + generic.h 10 | | | 11 | | + generic4.h: Generic implementation of LANES=4 12 | | + generic8.h: Generic implementation of LANES=8 13 | | 14 | + sse4.h: Intel SSE4.2 intrinsics implementaiton of LANES=4 15 | | 16 | +-power_vsx4.h: IBM Power VSX intrinsics implementation of LANES=4 17 | | 18 | + power7_intrinsics.h Intrinsics only available on IBM Power7 Platform 19 | + power8_intrinsics.h Intrinsics only available on IBM Power8 Platform 20 | 21 | gsimd_utility.h: Common macros definitions 22 | ``` 23 | -------------------------------------------------------------------------------- /include/generic.h: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright 2012 the Generic SIMD Intrinsic Library project authors. All rights reserved. 3 | 4 | Copyright IBM Corp. 2013, 2013. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following 14 | disclaimer in the documentation and/or other materials provided 15 | with the distribution. 16 | * Neither the name of IBM Corp. nor the names of its contributors may be 17 | used to endorse or promote products derived from this software 18 | without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | */ 32 | 33 | /** 34 | * generic.h 35 | * 36 | * Created on: Oct.7, 2013 37 | * @author: Haichuan Wang (hwang154@illinois.edu) 38 | * @brief: Generic SIMD Library header configuration file for generic implementation 39 | * The common defintions for all generic implementations 40 | */ 41 | 42 | #ifndef GENERIC_H_ 43 | #define GENERIC_H_ 44 | 45 | #include 46 | #include 47 | #include 48 | #include 49 | 50 | #include "gsimd_utility.h" 51 | 52 | namespace generic { 53 | //simple trick to generate a compiler error if invalid template 54 | //arguments are used 55 | 56 | template 57 | struct svec : public invalid_template_arguments::type { 58 | //here we need to add the static assert 59 | }; 60 | 61 | } //generic namespace 62 | 63 | #include 64 | #include 65 | 66 | 67 | #endif /* GENERIC_H_ */ 68 | -------------------------------------------------------------------------------- /include/gsimd.h: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright 2012 the Generic SIMD Intrinsic Library project authors. All rights reserved. 3 | 4 | Copyright IBM Corp. 2013, 2013. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 
12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following 14 | disclaimer in the documentation and/or other materials provided 15 | with the distribution. 16 | * Neither the name of IBM Corp. nor the names of its contributors may be 17 | used to endorse or promote products derived from this software 18 | without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
31 | */ 32 | 33 | /** 34 | * gsimd.h 35 | * 36 | * Created on: Aug 14, 2013 37 | * @author: Haichuan Wang (haichuan@us.ibm.com hwang154@illinois.edu) 38 | * @brief: Generic SIMD Library header configuration file 39 | */ 40 | 41 | #ifndef GSIMD_H_ 42 | #define GSIMD_H_ 43 | 44 | //a macro to for GSIMD 45 | #define __GSIMD__ 46 | #ifdef __ALTIVEC__ 47 | #include 48 | using namespace vsx; 49 | #else 50 | #ifdef __SSE4_2__ 51 | #include 52 | using namespace sse; 53 | #else 54 | #include 55 | using namespace generic; 56 | #endif //__SSE4_2__ 57 | #endif //__ALTIVEC__ 58 | 59 | 60 | 61 | 62 | #endif /* GSIMD_H_ */ 63 | -------------------------------------------------------------------------------- /include/perfmeasure.h: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright 2012 the Generic SIMD Intrinsic Library project authors. All rights reserved. 3 | 4 | Copyright IBM Corp. 2013, 2013. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following 14 | disclaimer in the documentation and/or other materials provided 15 | with the distribution. 16 | * Neither the name of IBM Corp. nor the names of its contributors may be 17 | used to endorse or promote products derived from this software 18 | without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | */ 32 | 33 | /** 34 | * perfmeasure.h 35 | * 36 | * Created on: Jun 3, 2013 37 | * author: Haichuan Wang (haichuan@us.ibm.com, hwang154@illinois.edu) 38 | * 39 | * Header file for call linux perf tool to measure HPM 40 | * Reference: http://web.eece.maine.edu/~vweaver/projects/perf_events/perf_event_open.html 41 | */ 42 | 43 | #ifndef PERFMEASURE_H_ 44 | #define PERFMEASURE_H_ 45 | 46 | #include 47 | #include 48 | #include 49 | #include 50 | #include 51 | #include 52 | #include 53 | 54 | long 55 | perf_event_open(struct perf_event_attr *hw_event, pid_t pid, 56 | int cpu, int group_fd, unsigned long flags) 57 | { 58 | int ret; 59 | 60 | ret = syscall(__NR_perf_event_open, hw_event, pid, cpu, 61 | group_fd, flags); 62 | return ret; 63 | } 64 | 65 | typedef struct hpm_fds_t { 66 | int fd[6]; //max 6 events 67 | } hpm_fds_t; 68 | 69 | typedef struct hpm_group_t { 70 | __u32 type;//type of events 71 | __u32 size;//4 or 6 72 | __u64 event[6];//max 6 events 73 | const char* event_name[6];//each event's name 74 | } hpm_group_t; 75 | 76 | static hpm_group_t hw_group = { PERF_TYPE_HARDWARE, 77 | 4, 78 | {PERF_COUNT_HW_INSTRUCTIONS, 79 | PERF_COUNT_HW_CPU_CYCLES, 80 | PERF_COUNT_HW_CACHE_REFERENCES, 81 | PERF_COUNT_HW_CACHE_MISSES, 82 | 0, 83 | 0}, 84 | { 85 | "Instr","Cycles", "Cache Ref", "Cache Miss", "", "" 86 | } 87 | }; 88 | 89 | static hpm_group_t br_group = { PERF_TYPE_HARDWARE, 90 | 4, 91 | {PERF_COUNT_HW_INSTRUCTIONS, 92 | 
PERF_COUNT_HW_CPU_CYCLES, 93 | PERF_COUNT_HW_BRANCH_INSTRUCTIONS, 94 | PERF_COUNT_HW_BRANCH_MISSES, 95 | 0, 96 | 0}, 97 | { 98 | "Instr","Cycles", "Branch Instr", "Branch Miss", "", "" 99 | } 100 | }; 101 | 102 | 103 | static hpm_group_t sw_group = { PERF_TYPE_SOFTWARE, 104 | 5, //not support by power 105 | {PERF_COUNT_SW_CPU_CLOCK, 106 | PERF_COUNT_SW_TASK_CLOCK, 107 | PERF_COUNT_SW_PAGE_FAULTS, 108 | PERF_COUNT_SW_CONTEXT_SWITCHES, 109 | PERF_COUNT_SW_CPU_MIGRATIONS, 110 | 0, /*PERF_COUNT_SW_ALIGNMENT_FAULTS*/}, 111 | { 112 | "CPU clock","Task clock", "Page fault", "Context switch", "Migration", "Aligment fault" 113 | } 114 | }; 115 | 116 | #define CACHE_READ(name) (name | (PERF_COUNT_HW_CACHE_OP_READ << 8) | ( PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)) 117 | #define CACHE_READ_MISS(name) (name | (PERF_COUNT_HW_CACHE_OP_READ << 8) | ( PERF_COUNT_HW_CACHE_RESULT_MISS << 16)) 118 | #define CACHE_WRITE(name) (name | (PERF_COUNT_HW_CACHE_OP_WRITE << 8) | ( PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)) 119 | #define CACHE_WRITE_MISS(name) (name | (PERF_COUNT_HW_CACHE_OP_WRITE << 8) | ( PERF_COUNT_HW_CACHE_RESULT_MISS << 16)) 120 | 121 | static hpm_group_t icache_group = { PERF_TYPE_HW_CACHE, 122 | 2, //not support by power 123 | { 124 | CACHE_READ_MISS(PERF_COUNT_HW_CACHE_L1I), 125 | CACHE_READ_MISS(PERF_COUNT_HW_CACHE_ITLB), 126 | CACHE_READ(PERF_COUNT_HW_CACHE_L1I), 127 | CACHE_READ(PERF_COUNT_HW_CACHE_ITLB), 128 | 0, 129 | 0, /*PERF_COUNT_SW_ALIGNMENT_FAULTS*/}, 130 | { 131 | "L1I Miss", "ITLB Miss", "L1I Read", "ITLB Read", "", "" 132 | } 133 | }; 134 | 135 | static hpm_group_t dcache_group = { PERF_TYPE_HW_CACHE, 136 | 3, //not support by power 137 | { 138 | CACHE_READ(PERF_COUNT_HW_CACHE_L1D), 139 | CACHE_READ_MISS(PERF_COUNT_HW_CACHE_L1D), 140 | CACHE_WRITE_MISS(PERF_COUNT_HW_CACHE_L1D), 141 | CACHE_WRITE(PERF_COUNT_HW_CACHE_L1D), 142 | CACHE_READ(PERF_COUNT_HW_CACHE_DTLB), 143 | CACHE_READ_MISS(PERF_COUNT_HW_CACHE_DTLB), 144 | }, 145 | { 146 | "L1D Read", "L1D Read 
Miss", "L1D Write Miss", "L1D Write", "L1D Write", "DTLB Ref" 147 | } 148 | }; 149 | 150 | static hpm_group_t llc_group = { PERF_TYPE_HW_CACHE, 151 | 4, //not support by power 152 | { 153 | CACHE_READ(PERF_COUNT_HW_CACHE_LL), 154 | CACHE_READ_MISS(PERF_COUNT_HW_CACHE_LL), 155 | CACHE_WRITE(PERF_COUNT_HW_CACHE_LL), 156 | CACHE_WRITE_MISS(PERF_COUNT_HW_CACHE_LL), 157 | CACHE_WRITE(PERF_COUNT_HW_CACHE_L1D), 158 | CACHE_READ(PERF_COUNT_HW_CACHE_DTLB), 159 | }, 160 | { 161 | "LLC Read", "LLC Read Miss", "LLC Write", "LLC Write Miss", "L1D Write", "DTLB Ref" 162 | } 163 | }; 164 | 165 | 166 | void perf_events_create(hpm_fds_t* fds, hpm_group_t* egroup) { 167 | int i; 168 | int size = egroup->size; 169 | struct perf_event_attr pe; 170 | memset(&pe, 0, sizeof(struct perf_event_attr)); 171 | pe.type = egroup->type; 172 | pe.size = sizeof(struct perf_event_attr); 173 | pe.disabled = 1; 174 | //pe.exclude_kernel = 1; 175 | //pe.exclude_idle = 1; 176 | pe.exclude_hv = 1; 177 | 178 | for(i = 0; i < size; ++i) { 179 | pe.config = egroup->event[i]; 180 | fds->fd[i] = perf_event_open(&pe, 0, -1, -1, 0); 181 | if (fds->fd[i] == -1) { 182 | fprintf(stderr, "Error opening leader %llx, %d, %s\n", pe.config, i, egroup->event_name[i]); 183 | exit(EXIT_FAILURE); 184 | } 185 | } 186 | } 187 | 188 | //void perf_events_create2(hpm_fds_t* fds) { 189 | // int i; 190 | // struct perf_event_attr pe; 191 | // memset(&pe, 0, sizeof(struct perf_event_attr)); 192 | // pe.type = PERF_TYPE_HARDWARE; 193 | // pe.size = sizeof(struct perf_event_attr); 194 | // pe.disabled = 1; 195 | // //pe.exclude_kernel = 1; 196 | // //pe.exclude_idle = 1; 197 | // pe.exclude_hv = 1; 198 | // 199 | // //instrs 200 | // pe.config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS; 201 | // fds->fd0 = perf_event_open(&pe, 0, -1, -1, 0); 202 | // if (fds->fd0 == -1) { 203 | // fprintf(stderr, "Error opening leader %llx\n", pe.config); 204 | // exit(EXIT_FAILURE); 205 | // } 206 | // //cycles 207 | // pe.config = 
PERF_COUNT_HW_BRANCH_MISSES; 208 | // fds->fd1 = perf_event_open(&pe, 0, -1, fds->fd0, 0); 209 | // if (fds->fd1 == -1) { 210 | // fprintf(stderr, "Error opening event %llx\n", pe.config); 211 | // exit(EXIT_FAILURE); 212 | // } 213 | // //cache ref 214 | // pe.config = PERF_COUNT_HW_INSTRUCTIONS; //PERF_COUNT_HW_STALLED_CYCLES_FRONTEND ; 215 | // fds->fd2 = perf_event_open(&pe, 0, -1, fds->fd0, 0); 216 | // if (fds->fd2 == -1) { 217 | // fprintf(stderr, "Error opening event %llx\n", pe.config); 218 | // exit(EXIT_FAILURE); 219 | // } 220 | // 221 | // //cache ref 222 | // pe.config = PERF_COUNT_HW_CPU_CYCLES; //PERF_COUNT_HW_STALLED_CYCLES_BACKEND ; 223 | // fds->fd3 = perf_event_open(&pe, 0, -1, fds->fd0, 0); 224 | // if (fds->fd3 == -1) { 225 | // fprintf(stderr, "Error opening event %llx\n", pe.config); 226 | // exit(EXIT_FAILURE); 227 | // } 228 | //} 229 | 230 | void perf_events_start(hpm_fds_t* fds, hpm_group_t* egroup) { 231 | int i; 232 | int size = egroup->size; 233 | for(i = 0; i < size; ++i) { 234 | ioctl(fds->fd[i], PERF_EVENT_IOC_RESET, 0); 235 | } 236 | for(int i = 0; i < egroup->size; ++i) { 237 | ioctl(fds->fd[i], PERF_EVENT_IOC_ENABLE, 0); 238 | } 239 | } 240 | 241 | 242 | void perf_events_stop_report(hpm_fds_t* fds, hpm_group_t* egroup) { 243 | int size = egroup->size; 244 | int i; 245 | for(i = 0; i < size; ++i) { 246 | ioctl(fds->fd[i], PERF_EVENT_IOC_DISABLE, 0); 247 | } 248 | printf("[HPM Event]"); 249 | //title 250 | for(i = 0; i < size; ++i) { 251 | printf("%s,", egroup->event_name[i]); 252 | } 253 | printf("\n[HPM Values]"); 254 | long long c; 255 | for(i = 0; i < size; ++i) { 256 | read(fds->fd[i], &c, sizeof(long long)); 257 | printf("%lld,", c); 258 | } 259 | printf("\n"); 260 | } 261 | 262 | 263 | //void perf_events_stop_report2(hpm_fds_t* fds) { 264 | // long long c0,c1,c2,c3; 265 | // ioctl(fds->fd0, PERF_EVENT_IOC_DISABLE, 0); 266 | // read(fds->fd0, &c0, sizeof(long long)); 267 | // ioctl(fds->fd1, PERF_EVENT_IOC_DISABLE, 0); 268 | 
// read(fds->fd1, &c1, sizeof(long long)); 269 | // ioctl(fds->fd2, PERF_EVENT_IOC_DISABLE, 0); 270 | // read(fds->fd2, &c2, sizeof(long long)); 271 | // ioctl(fds->fd3, PERF_EVENT_IOC_DISABLE, 0); 272 | // read(fds->fd3, &c3, sizeof(long long)); 273 | // 274 | //// printf("[HPM Perf]Branch instrs:%lld; Misbranch instrs:%lld; Frontend Stall:%lld; Backend Stall:%lld\n", 275 | //// c0, c1, c2,c3); 276 | // printf("[HPM Perf]Branch instrs:%lld; Misbranch instrs:%lld; Instrs:%lld; Cycles:%lld\n", 277 | // c0, c1, c2,c3); 278 | //} 279 | 280 | void perf_events_close(hpm_fds_t* fds, hpm_group_t* egroup) { 281 | int i; 282 | int size = egroup->size; 283 | for(i = 0; i < size; ++i) { 284 | close(fds->fd[i]); 285 | } 286 | } 287 | 288 | /***** Macro Definition *****/ 289 | #if (defined PERF_HW) || (defined PERF_BR) || (defined PERF_SW) || (defined PERF_ICACHE) || (defined PERF_DCACHE) || (defined PERF_LLC) 290 | 291 | #ifdef PERF_HW 292 | #define GNAME hw_group 293 | #endif 294 | 295 | #ifdef PERF_BR 296 | #define GNAME br_group 297 | #endif 298 | 299 | #ifdef PERF_SW 300 | #define GNAME sw_group 301 | #endif 302 | 303 | #ifdef PERF_ICACHE 304 | #define GNAME icache_group 305 | #endif 306 | 307 | #ifdef PERF_DCACHE 308 | #define GNAME dcache_group 309 | #endif 310 | 311 | #ifdef PERF_LLC 312 | #define GNAME llc_group 313 | #endif 314 | 315 | #define HPM_PERF_CREATE hpm_fds_t __hpm_fds; perf_events_create(&__hpm_fds, &GNAME) 316 | #define HPM_PERF_START perf_events_start(&__hpm_fds, &GNAME) 317 | #define HPM_PERF_STOP perf_events_stop_report(&__hpm_fds, &GNAME) 318 | #define HPM_PERF_CLOSE perf_events_close(&__hpm_fds, &GNAME); 319 | #else 320 | #define HPM_PERF_CREATE 321 | #define HPM_PERF_START 322 | #define HPM_PERF_STOP 323 | #define HPM_PERF_CLOSE 324 | #endif 325 | 326 | #endif /* PERFMEASURE_H_ */ 327 | -------------------------------------------------------------------------------- /include/platform_intrinsics.h: 
-------------------------------------------------------------------------------- 1 | /** 2 | Copyright 2012 the Generic SIMD Intrinsic Library project authors. All rights reserved. 3 | 4 | Copyright IBM Corp. 2013, 2013. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following 14 | disclaimer in the documentation and/or other materials provided 15 | with the distribution. 16 | * Neither the name of IBM Corp. nor the names of its contributors may be 17 | used to endorse or promote products derived from this software 18 | without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
31 | */ 32 | 33 | #include "power7_intrinsics.h" 34 | 35 | #ifdef __POWER8 36 | #include "power8_intrinsics.h" 37 | #endif 38 | -------------------------------------------------------------------------------- /include/power7_intrinsics.h: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright 2012 the Generic SIMD Intrinsic Library project authors. All rights reserved. 3 | 4 | Copyright IBM Corp. 2013, 2013. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following 14 | disclaimer in the documentation and/or other materials provided 15 | with the distribution. 16 | * Neither the name of IBM Corp. nor the names of its contributors may be 17 | used to endorse or promote products derived from this software 18 | without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

// GCC inline-asm operand constraints for VSX registers:
// VSXW = write-only output, VSXR = read input, VSXWC = early-clobber output.
#define VSXW "=wa"
#define VSXR "wa"
#define VSXWC "=&wa"

/// already implemented in POWER7
//

// Broadcast lane `v` of `a` into all four lanes. When `v` is a compile-time
// constant in [0,4) a single xxspltw is emitted; otherwise it falls back to a
// scalar extract plus vector-literal splat.
static FORCEINLINE __vector float vec_splat_p7(__vector float a, const int v){
  if(__builtin_constant_p(v) && v >= 0 && v < 4) {
    __vector float register r;
    asm ("xxspltw %x[xt], %x[xa],%[im] " : [xt] VSXW(r) : [xa] VSXR(a), [im] "i"(v) );
    return r;
  } else {
    float f = vec_extract(a, v);
    __vector float r = {f,f,f,f};
    return r;
  }
}

// Integer variants reuse the float splat via bit-preserving vector casts.
static FORCEINLINE __vector signed int vec_splat_p7(__vector signed int a, const int v){
  return (__vector signed int)vec_splat_p7((__vector float)a, v);
}

static FORCEINLINE __vector unsigned int vec_splat_p7(__vector unsigned int a, const int v){
  return (__vector unsigned int)vec_splat_p7((__vector float)a, v);
}

/**
 * @brief use xxpermdi
 */
// Broadcast doubleword lane `v` (0 or 1) into both lanes. perm_v selects
// {hi,hi} (0) or {lo,lo} (3) for xxpermdi's 2-bit permute immediate.
static FORCEINLINE __vector double vec_splat_p7(__vector double a, const int v){
  if(__builtin_constant_p(v) && v >= 0 && v < 2) {
    __vector double register r;
    const int perm_v = (v == 0 ? 0 : 3);
    asm ("xxpermdi %x[xt], %x[xa], %x[xb], %[im] " : [xt] VSXW(r) : [xa] VSXR(a), [xb] VSXR(a), [im] "i"(perm_v) );
    return r;
  } else {
    double d = vec_extract(a, v);
    __vector double r = {d,d};
    return r;
  }
}

/**
 * @brief use xxpermdi
 */
static FORCEINLINE __vector long long vec_splat_p7(__vector long long a, const int v){
  return (__vector long long)vec_splat_p7((__vector double )a, v);
}

/**
 * @brief use xxpermdi
 */
static FORCEINLINE __vector unsigned long long vec_splat_p7(__vector unsigned long long a, const int v){
  return (__vector unsigned long long)vec_splat_p7((__vector double )a, v);
}


// Broadcast a scalar double (in an FPR, constraint "f") to both lanes.
static FORCEINLINE __vector double vec_smear_p7(double a){
  __vector double register r;
  asm ("xxspltd %x[xt], %x[xa], 0" : [xt] VSXW(r) : [xa] "f"(a) );
  return r;
}

// All-zero float vector via vspltisw 0 (no memory load needed).
static FORCEINLINE __vector float vec_zero_p7(){
  __vector float register r;
  asm ("vspltisw %[xt], 0": [xt] "=v"(r) );
  return r;
}

// Load-and-splat a 64-bit value from memory (lxvdsx duplicates the
// doubleword into both lanes in one instruction).
static FORCEINLINE __vector unsigned long long vec_smear_i64_p7(unsigned long long *ptr) {
  __vector unsigned long long r;
  asm ("lxvdsx %x[xt],%y1" : [xt] VSXW(r) : "Z"(*ptr) );
  return r;
}

static FORCEINLINE __vector long long vec_smear_i64_p7(long long *ptr) {
  __vector long long r;
  asm ("lxvdsx %x[xt],%y1" : [xt] VSXW(r) : "Z"(*ptr) );
  return r;
}

static FORCEINLINE __vector double vec_smear_double_p7(double *ptr) {
  __vector double r;
  asm ("lxvdsx %x[xt],%y1" : [xt] VSXW(r) : "Z"(*ptr) );
  return r;
}

// const-pointer variant; uses explicit base-register addressing ("b").
// NOTE(review): unlike the "Z" form above, this asm has no memory input
// operand, so the compiler cannot see the load's dependence — confirm a
// "memory" clobber is not needed at the call sites.
static FORCEINLINE __vector double vec_smear_const_double_p7(const double *ptr) {
  __vector double r;
  //asm ("lxvdsx %x[xt],%y1" : [xt] VSXW(r) : "Z"(*ptr) );
  asm ("lxvdsx %x[xt], 0, %[xb]" : [xt] VSXW(r) : [xb] "b"(ptr) );
  return r;
}

// NOTE(review): takes `const long long *` but returns an unsigned vector —
// presumably intentional bit-level smear; confirm against callers.
static FORCEINLINE __vector unsigned long long vec_smear_const_i64_p7(const long long *ptr) {
  __vector unsigned long long r;
  //asm ("lxvdsx %x[xt],%y1" : [xt] VSXW(r) : "Z"(*ptr) );
  asm ("lxvdsx %x[xt], 0, %[xb]" : [xt] VSXW(r) : [xb] "b"(ptr) );
  return r;
}

/**
 *\brief This one is not really a smear constant. Need fix it.
 */
// Loads a full vector then splats word 0; the pointer is to a whole vector,
// not a scalar, hence the original author's caveat above.
static FORCEINLINE __vector float vec_smear_const_float_p7(const __vector float *ptr) {
  __vector float r, r1;
  asm ("lxvw4x %x[xt], 0, %[xb]" : [xt] VSXW(r) : [xb] "b"(ptr) );
  asm ("vspltw %x[xt], %x[xa], %[im]" : [xt] VSXW(r1) : [xa] VSXR(r) , [im] "i"(0));;
  return r1;
}

// Lane-wise single-precision negate.
static FORCEINLINE __vector float vec_neg_p7(__vector float a) {
  __vector float register r;
  asm ("xvnegsp %x[xt], %x[xa]" : [xt] VSXW(r) : [xa] VSXR(a));
  return r;
}


--------------------------------------------------------------------------------
/include/power8_intrinsics.h:
--------------------------------------------------------------------------------
/**
 * IBM Confidential
 */

/**
Copyright 2012 the Generic SIMD Intrinsic Library project authors. All rights reserved.

Copyright IBM Corp. 2013, 2013. All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

  * Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.
  * Redistributions in binary form must reproduce the above
    copyright notice, this list of conditions and the following
    disclaimer in the documentation and/or other materials provided
    with the distribution.
  * Neither the name of IBM Corp.
nor the names of its contributors may be
    used to endorse or promote products derived from this software
    without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

// VSX register constraints (same convention as power7_intrinsics.h).
#define VSXW "=wa"
#define VSXR "wa"
#define VSXWC "=&wa"

//
//// POWER8 intrinsics
//

#ifdef __POWER8

////////////////////////////////////////////////
// int 64 math/logic operations

// Lane-wise 64-bit add (vaddudm, new in POWER8).
static FORCEINLINE __vector signed long long vec_add_p8(__vector signed long long a,
                                                        __vector signed long long b){
  __vector signed long long register r;
  asm ("vaddudm %[xt], %[xa], %[xb]" : [xt] "=v"(r) : [xa] "v"(a), [xb] "v"(b) );
  return r;
}

// Lane-wise 64-bit subtract (vsubudm).
static FORCEINLINE __vector signed long long vec_sub_p8(__vector signed long long a,
                                                        __vector signed long long b){
  __vector signed long long register r;
  asm ("vsubudm %[xt], %[xa], %[xb]" : [xt] "=v"(r) : [xa] "v"(a), [xb] "v"(b) );
  return r;
}


// Lane-wise 64-bit logical shift left; shift counts come from b (vsld).
static FORCEINLINE __vector signed long long vec_sld_p8(__vector signed long long a,
                                                        __vector signed long long b){
  __vector signed long long register r;
  asm ("vsld %[xt], %[xa], %[xb]" : [xt] "=v"(r) : [xa] "v"(a), [xb] "v"(b) );
  return r;
}


// NOTE(review): vpkuhum packs halfwords to bytes; packing two doubleword
// vectors into one word vector — which the signature suggests — would be
// vpkudum (as used by GATHER_WORD_OFF32_P8 below). Confirm the opcode.
static FORCEINLINE __vector unsigned int vec_pack_p8(__vector signed long long a,
                                                     __vector signed long long b){
  __vector unsigned int register r;
  asm ("vpkuhum %[xt], %[xa], %[xb]" : [xt] "=v"(r) : [xa] "v"(a), [xb] "v"(b) );
  return r;
}

// NOTE(review): vupkhsw/vupklsw sign-extend words to doublewords although
// the return type is unsigned — presumably a bit-pattern widen; confirm.
static FORCEINLINE __vector unsigned long long vec_unpackh_p8(__vector unsigned int a){
  __vector unsigned long long register r;
  asm ("vupkhsw %[xt], %[xa]" : [xt] "=v"(r) : [xa] "v"(a) );
  return r;
}

static FORCEINLINE __vector unsigned long long vec_unpackl_p8(__vector unsigned int a){
  __vector unsigned long long register r;
  asm ("vupklsw %[xt], %[xa]" : [xt] "=v"(r) : [xa] "v"(a) );
  return r;
}

// Lane-wise 64-bit compare-equal; each lane becomes all-ones or all-zeros.
static FORCEINLINE __vector signed long long vec_cmpeq_p8(__vector signed long long a,
                                                          __vector signed long long b){
  __vector signed long long register r;
  asm ("vcmpequd %[xt], %[xa], %[xb]": [xt] "=v"(r) : [xa] "v"(a), [xb] "v"(b) );
  return r;
}

// Bit-select: result bit = mask bit ? b bit : a bit.
static FORCEINLINE __vector unsigned long long vec_sel_p8(__vector unsigned long long a,
                                                          __vector unsigned long long b,
                                                          __vector unsigned long long m /*mask*/){
  __vector unsigned long long register r;
  asm ("vsel %[xt], %[xa], %[xb], %[xc]": [xt] "=v"(r) : [xa] "v"(a), [xb] "v"(b), [xc] "v"(m));
  return r;
}

static FORCEINLINE __vector double vec_sel_p8(__vector double a,__vector double b, __vector double m){
  __vector double register r;
  asm ("vsel %[xt], %[xa], %[xb], %[xc]": [xt] "=v"(r) : [xa] "v"(a), [xb] "v"(b), [xc] "v"(m));
  return r;
}

// GPR -> vector broadcast: mtvsrd/mtvsrwz (direct moves, new in POWER8)
// place the scalar in the VSR, then xxsplt* replicates it to all lanes.
static FORCEINLINE __vector signed int vec_smear_p8(signed int a){
  __vector signed int register r, t;
  asm ("mtvsrwz %x[xt], %[xa]" : [xt] VSXW(t) : [xa] "r"(a) );
  asm ("xxspltw %x[xt], %x[xa], 1" : [xt] VSXW(r) : [xa] VSXR(t) );
  return r;
}

static FORCEINLINE __vector unsigned int vec_smear_p8(unsigned int a){
  __vector unsigned int register r, t;
  asm ("mtvsrwz %x[xt], %[xa]" : [xt] VSXW(t) : [xa] "r"(a) );
  asm ("xxspltw %x[xt], %x[xa], 1" : [xt] VSXW(r) : [xa] VSXR(t) );
  return r;
}

// float broadcast: xscvdpspn converts to single-precision in the VSR
// without touching memory, then xxspltw replicates word 0.
static FORCEINLINE __vector float vec_smear_p8(float a){
  __vector float register r, t;
  asm ("xscvdpspn %x[xt], %[xa]" : [xt] VSXW(t) : [xa] "f"(a) );
  asm ("xxspltw %x[xt], %x[xa], 0" : [xt] VSXW(r) : [xa] VSXR(t) );
  return r;
}

static FORCEINLINE __vector unsigned long long vec_smear_i64_p8(long long a){
  __vector unsigned long long register r, t;
  asm ("mtvsrd %x[xt], %[xa]" : [xt] VSXW(t) : [xa] "r"(a) );
  asm ("xxspltd %x[xt], %x[xa], 0" : [xt] VSXW(r) : [xa] VSXR(t) );
  return r;
}

// Load-and-splat a 32-bit value from memory (lxsiwzx + xxspltw).
static FORCEINLINE __vector float vec_smear_float_p8(float *ptr){
  __vector float register r, t;
  //asm ("lxsiwzx %x[xt], 0, %[xb]" : [xt] VSXW(t) : [xb] "b"(ptr): "memory");
  asm ("lxsiwzx %x[xt],%y1" : [xt] VSXW(t) : "Z"(*ptr));
  asm ("xxspltw %x[xt], %x[xa], 1" : [xt] VSXW(r) : [xa] VSXR(t) );
  return r;
}

static FORCEINLINE __vector unsigned int vec_smear_i32_p8(unsigned int *ptr){
  __vector unsigned int register r, t;
  //asm ("lxsiwzx %x[xt], 0, %[xb]" : [xt] VSXW(t) : [xb] "b"(ptr): "memory");
  asm ("lxsiwzx %x[xt],%y1" : [xt] VSXW(t) : "Z"(*ptr));
  asm ("xxspltw %x[xt], %x[xa], 1" : [xt] VSXW(r) : [xa] VSXR(t) );
  return r;
}


// Lane-wise 32-bit multiply, low word kept (vmuluwm, POWER8).
static FORCEINLINE __vector unsigned int vec_mul_p8(__vector unsigned int a,__vector unsigned int b) {
  __vector unsigned int register r;
  asm ("vmuluwm %[xt], %[xa], %[xb]": [xt] "=v"(r) : [xa] "v"(a), [xb] "v"(b) );
  return r;
}

//P8 specific extract
// Move the high doubleword of the vector to a GPR (mfvsrd).
static FORCEINLINE uint64_t vec_extract_l(__vector int a) {
  uint64_t register r;
  asm ("mfvsrd %[ra], %x[xs]": [ra] "=r"(r) : [xs] VSXR(a) );
  return r;
}

// Rotate the low doubleword into the high position (vsldoi by 8 bytes),
// then move it to a GPR.
static FORCEINLINE uint64_t vec_extract_r(__vector int a) {
  uint64_t register r;
  __vector int register tmp;
  asm ("vsldoi %[xt], %[xa], %[xb], %[im]": [xt] "=v"(tmp) : [xa] "v"(a), [xb] "v"(a),[im] "i"(8) );
  asm ("mfvsrd %[ra], %x[xs]": [ra] "=r"(r) : [xs] VSXR(tmp) );
  return r;
}

static FORCEINLINE uint64_t vec_extract_l(__vector float a) {
  uint64_t register r;
  asm ("mfvsrd %[ra], %x[xs]": [ra] "=r"(r) : [xs] VSXR(a) );
  return r;
}

static FORCEINLINE uint64_t vec_extract_r(__vector float a) {
  uint64_t register r;
  __vector int register tmp;
  asm ("vsldoi %[xt], %[xa], %[xb], %[im]": [xt] "=v"(tmp) : [xa] "v"(a), [xb] "v"(a),[im] "i"(8) );
  asm ("mfvsrd %[ra], %x[xs]": [ra] "=r"(r) : [xs] VSXR(tmp) );
  return r;
}

// Gather four 32-bit elements from arbitrary addresses: two lxsiwzx loads
// are merged per pair (xxmrghd), then both pairs are packed to words
// (vpkudum).
#define GATHER_WORD_OFF32_P8(TYPE)                                                                   \
static FORCEINLINE __vector TYPE vec_gather_p8(TYPE *ptr0,                                           \
                                               TYPE *ptr1,                                           \
                                               TYPE *ptr2,                                           \
                                               TYPE *ptr3){                                          \
  __vector TYPE register r0,r1,r2,r3;                                                                \
  __vector TYPE register t0,t1;                                                                      \
  __vector TYPE register r;                                                                          \
  asm ("lxsiwzx %x[xt],%y1" : [xt] VSXW(r0) : "Z"(*ptr0));                                           \
  asm ("lxsiwzx %x[xt],%y1" : [xt] VSXW(r1) : "Z"(*ptr1));                                           \
  asm ("xxmrghd %x[xt], %x[xa], %x[xb]" : [xt] VSXW(t0) : [xa] VSXR(r0), [xb] VSXR(r1) );            \
  asm ("lxsiwzx %x[xt],%y1" : [xt] VSXW(r2) : "Z"(*ptr2));                                           \
  asm ("lxsiwzx %x[xt],%y1" : [xt] VSXW(r3) : "Z"(*ptr3));                                           \
  asm ("xxmrghd %x[xt], %x[xa], %x[xb]" : [xt] VSXW(t1) : [xa] VSXR(r2), [xb] VSXR(r3) );            \
  asm ("vpkudum %[xt], %[xa], %[xb]" : [xt] "=v"(r) : [xa] "v"(t0), [xb] "v"(t1) );                  \
  return r;                                                                                          \
}

GATHER_WORD_OFF32_P8(float)
GATHER_WORD_OFF32_P8(signed int)
GATHER_WORD_OFF32_P8(unsigned int)



// Gather two 64-bit elements: lxvdsx splats each source, xxmrghd merges
// the high doublewords.
#define GATHER_D_WORD_OFF32_P8(TYPE)                                                                 \
static FORCEINLINE __vector TYPE vec_gather_p8(TYPE *ptr0,                                           \
                                               TYPE *ptr1){                                          \
  __vector TYPE register r0,r1;                                                                      \
  __vector TYPE register r;                                                                          \
  asm ("lxvdsx %x[xt],%y1" : [xt] VSXW(r0) : "Z"(*ptr0));                                            \
  asm ("lxvdsx %x[xt],%y1" : [xt] VSXW(r1) : "Z"(*ptr1));                                            \
  asm ("xxmrghd %x[xt], %x[xa], %x[xb]" : [xt] VSXW(r) : [xa] VSXR(r0), [xb] VSXR(r1) );             \
  return r;                                                                                          \
}

GATHER_D_WORD_OFF32_P8(double)
GATHER_D_WORD_OFF32_P8(signed long)

//POWER 8 Scatter Intrinsics

// Store one 32-bit lane to memory: rotate the wanted lane to the front
// (vsldoi by IMM bytes), then stxsiwx stores word 1 of the VSR.
// NOTE(review): these expand to non-static, non-inline definitions in a
// header — including this header from two translation units would violate
// the one-definition rule; confirm whether static/FORCEINLINE was intended.
#define SCATTER_WORD_OFF32_P8(TYPE,IMM)                                                              \
void vec_scatter_step_##IMM(TYPE* ptr0, __vector TYPE val){                                          \
  __vector TYPE register tmp;                                                                        \
  asm ("vsldoi %[xt], %[xa], %[xb], %[im]": [xt] "=v"(tmp) : [xa] "v"(val), [xb] "v"(val),[im] "i"(IMM) ); \
  asm ("stxsiwx %x[xt],%y1" : : [xt] VSXR(tmp), "Z"(*ptr0));                                         \
}

// Lane 0 needs no rotate.
#define SCATTER_WORD_OFF32_Z_P8(TYPE)                                                                \
void vec_scatter_step_0(TYPE* ptr0, __vector TYPE val){                                              \
  asm ("stxsiwx %x[xt],%y1" : : [xt] VSXR(val), "Z"(*ptr0));                                         \
}

SCATTER_WORD_OFF32_Z_P8(float)
SCATTER_WORD_OFF32_P8(float,4)
SCATTER_WORD_OFF32_P8(float,8)
SCATTER_WORD_OFF32_P8(float,12)

SCATTER_WORD_OFF32_Z_P8(signed int)
SCATTER_WORD_OFF32_P8(signed int,4)
SCATTER_WORD_OFF32_P8(signed int,8)
SCATTER_WORD_OFF32_P8(signed int,12)

SCATTER_WORD_OFF32_Z_P8(unsigned int)
SCATTER_WORD_OFF32_P8(unsigned int,4)
SCATTER_WORD_OFF32_P8(unsigned int,8)
SCATTER_WORD_OFF32_P8(unsigned int,12)

// 64-bit scatter: rotate the low doubleword up, then stxsdx.
#define SCATTER_D_WORD_OFF32_P8(TYPE)                                                                \
void vec_scatter_step_8(TYPE* ptr0, __vector TYPE val){                                              \
  __vector TYPE tmp;                                                                                 \
  asm ("vsldoi %[xt], %[xa], %[xb], %[im]": [xt] "=v"(tmp) : [xa] "v"(val), [xb] "v"(val),[im] "i"(8) ); \
  asm ("stxsdx %x[xt],%y1" : : [xt] VSXR(tmp), "Z"(*ptr0));                                          \
}

#define SCATTER_D_WORD_OFF32_Z_P8(TYPE)                                                              \
void vec_scatter_step_0(TYPE* ptr0, __vector TYPE val){                                              \
  asm ("stxsdx %x[xt],%y1" : : [xt] VSXR(val), "Z"(*ptr0));                                          \
}

SCATTER_D_WORD_OFF32_P8(double)
SCATTER_D_WORD_OFF32_P8(signed long)
SCATTER_D_WORD_OFF32_Z_P8(double)
SCATTER_D_WORD_OFF32_Z_P8(signed long)
#endif
--------------------------------------------------------------------------------
/include/svec-vsx.h:
--------------------------------------------------------------------------------
/**
Copyright 2012 the Generic SIMD Intrinsic Library project authors. All rights reserved.

Copyright IBM Corp. 2013, 2013. All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

  * Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.
  * Redistributions in binary form must reproduce the above
    copyright notice, this list of conditions and the following
    disclaimer in the documentation and/or other materials provided
    with the distribution.
  * Neither the name of IBM Corp. nor the names of its contributors may be
    used to endorse or promote products derived from this software
    without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | */ 32 | /* 33 | * svec-vsx.h 34 | * 35 | * Created on: Jul 7, 2013 36 | * Author: Haichuan Wang (haichuan@us.ibm.com, hwang154@illinois.edu) 37 | */ 38 | 39 | #ifndef SVEC_VSX_H_ 40 | #define SVEC_VSX_H_ 41 | 42 | #include 43 | #include 44 | #include 45 | #include 46 | #include 47 | 48 | #include "gsimd_utility.h" 49 | #include "platform_intrinsics.h" 50 | 51 | std::ostream& operator<< (std::ostream &out, uint8_t &v) { 52 | out << uint16_t(v); 53 | return out; 54 | } 55 | 56 | std::ostream& operator<< (std::ostream &out, int8_t &v) { 57 | out << int16_t(v); 58 | return out; 59 | } 60 | 61 | namespace vsx { 62 | 63 | template 64 | class svec_internal { 65 | protected: 66 | FORCEINLINE int lanes_per_reg() { return sizeof(REGTYPE)/sizeof(STYPE);} 67 | FORCEINLINE int regs() { return N/lanes_per_reg();} 68 | 69 | 70 | FORCEINLINE svec_internal() {} 71 | 72 | FORCEINLINE svec_internal(const REGTYPE vva[]) { 73 | for(int i=0; i < regs() ; i++) { 74 | va[i] = vva[i]; 75 | } 76 | } 77 | 78 | FORCEINLINE svec_internal(const STYPE v) { 79 | REGTYPE t; 80 | switch(lanes_per_reg()) { 81 | case 2: {//uint64_t, int64_t, double for 128bit{ 82 | t = REGTYPE(v,v); 83 | } 84 | break; 85 | case 4: {// 86 | t = REGTYPE(v,v,v,v); 87 | } 88 | break; 89 | case 8: { 90 | t = REGTYPE(v,v,v,v,v,v,v,v); 91 | } 92 | break; 93 | case 16: { // 94 | t = REGTYPE(v,v,v,v,v,v,v,v,v,v,v,v,v,v,v,v); 95 | } 96 | break; 97 | case 32: {//suppose 256bit SIMD 
for 8 bit 98 | t = REGTYPE(v,v,v,v,v,v,v,v,v,v,v,v,v,v,v,v, 99 | v,v,v,v,v,v,v,v,v,v,v,v,v,v,v,v); 100 | } 101 | break; 102 | } //switch 103 | for(int i=0; i < N; i+=lanes_per_reg()) { 104 | va[i/lanes_per_reg()] = t; 105 | } //for 106 | } 107 | 108 | 109 | FORCEINLINE svec_internal(const STYPE v[]) { 110 | for(int i=0; i < N; i+=lanes_per_reg()) { 111 | REGTYPE t; 112 | switch(lanes_per_reg()) { 113 | case 2: {//uint64_t, int64_t, double for 128bit{ 114 | t = REGTYPE(v[i], v[i+1]); 115 | } 116 | break; 117 | case 4: {// 118 | t = REGTYPE(v[i], v[i+1], v[i+2], v[i+3]); 119 | } 120 | break; 121 | case 8: { 122 | t = REGTYPE(v[i], v[i+1], v[i+2], v[i+3], v[i+4], v[i+5], v[i+6], v[i+7]); 123 | } 124 | break; 125 | case 16: { // 126 | t = REGTYPE(v[i], v[i+1], v[i+2], v[i+3], v[i+4], v[i+5], v[i+6], v[i+7], 127 | v[i+8], v[i+9], v[i+10], v[i+11], v[i+12], v[i+13], v[i+14], v[i+15]); 128 | } 129 | break; 130 | case 32: {//suppose 256bit SIMD for 8 bit 131 | t = REGTYPE(v[i], v[i+1], v[i+2], v[i+3], v[i+4], v[i+5], v[i+6], v[i+7], 132 | v[i+8], v[i+9], v[i+10], v[i+11], v[i+12], v[i+13], v[i+14], v[i+15], 133 | v[i+16], v[i+17], v[i+18], v[i+19], v[i+20], v[i+21], v[i+22], v[i+23], 134 | v[i+24], v[i+25], v[i+26], v[i+27], v[i+28], v[i+29], v[i+30], v[i+31]); 135 | } 136 | break; 137 | } //switch 138 | va[i/lanes_per_reg()] = t; 139 | } //for 140 | } 141 | 142 | public: 143 | /** 144 | * @brief Internal use for get the storage register 145 | */ 146 | FORCEINLINE REGTYPE & reg(int index) { return va[index];} 147 | FORCEINLINE const REGTYPE & reg(int index) const { return va[index];} 148 | 149 | FORCEINLINE STYPE& operator[](int index) {return ((STYPE *)va)[index];} 150 | FORCEINLINE const STYPE& operator[](int index) const {return ((STYPE *)va)[index]; } 151 | 152 | friend std::ostream& operator<< (std::ostream &out, const svec_internal &v) { 153 | out << "svec_"<< iu_get_type_name() << "<" << N << ">["; 154 | stdout_scalar(out, v[0]); 155 | for(int i = 1; i < N ; i++) { 
156 | out << ", "; 157 | stdout_scalar(out, v[i]); 158 | } 159 | out << "]"; 160 | return out; 161 | } 162 | 163 | REGTYPE va[N/(sizeof(REGTYPE)/sizeof(STYPE))]; 164 | }; 165 | 166 | 167 | template 168 | class svec_bool: public svec_internal { 169 | 170 | public: 171 | FORCEINLINE svec_bool() { } 172 | FORCEINLINE svec_bool(const __vector uint32_t vva[]) : svec_internal(vva) { } 173 | 174 | /** 175 | * @brief bool type's initial function need set each element full bits, either 0 or 0xFFFFFFFF. 176 | * @param v an array of bool values. 177 | * @return a svec_bool type object 178 | */ 179 | FORCEINLINE svec_bool(const bool v[]) { 180 | for(int i=0; i < N; i+=4) { 181 | __vector uint32_t t = { v[i] ? -1 : 0, v[i+1] ? -1 : 0, 182 | v[i+2] ? -1 : 0, v[i+3] ? -1 : 0 }; 183 | this->va[i>>2] = t; 184 | } 185 | } 186 | 187 | FORCEINLINE svec_bool(const bool &v0, const bool &v1, const bool &v2, const bool &v3) { 188 | __vector uint32_t t = {v0 ? -1 : 0, v1 ? -1 : 0, v2 ? -1 : 0, v3 ? -1 : 0}; 189 | this->va[0] = t; 190 | } 191 | FORCEINLINE svec_bool(const bool &v0, const bool &v1, const bool &v2, const bool &v3, 192 | const bool &v4, const bool &v5, const bool &v6, const bool &v7) { 193 | __vector uint32_t t0 = {v0 ? -1 : 0, v1 ? -1 : 0, v2 ? -1 : 0, v3 ? -1 : 0}; 194 | this->va[0] = t0; 195 | __vector uint32_t t1 = {v4 ? -1 : 0, v5 ? -1 : 0, v6 ? -1 : 0, v7 ? -1 : 0}; 196 | this->va[1] = t1; 197 | } 198 | FORCEINLINE svec_bool(const bool& v) { 199 | if(__builtin_constant_p(v)){ 200 | __vector uint32_t t = (v) ? 
vec_splat_s32(-1) : vec_splat_s32(0); 201 | for(int i=0; i < N; i+=this->lanes_per_reg()) { 202 | this->va[i/this->lanes_per_reg()] = t; 203 | } //fo 204 | } else { 205 | svec_internal(v); 206 | } 207 | } 208 | }; 209 | 210 | 211 | template 212 | class svec_i8: public svec_internal { 213 | 214 | public: 215 | FORCEINLINE svec_i8() { } 216 | FORCEINLINE svec_i8(const __vector int8_t vva[]) : svec_internal(vva) {} 217 | FORCEINLINE svec_i8(const int8_t v[]) : svec_internal(v) {} 218 | FORCEINLINE svec_i8(const int8_t& v0, const int8_t& v1, const int8_t& v2, const int8_t& v3) { 219 | __vector int8_t t = {v0, v1, v2, v3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 220 | this->va[0] = t; 221 | } 222 | FORCEINLINE svec_i8(const int8_t& v0, const int8_t& v1, const int8_t& v2, const int8_t& v3, 223 | const int8_t& v4, const int8_t& v5, const int8_t& v6, const int8_t& v7) { 224 | __vector int8_t t = {v0, v1, v2, v3, v4, v5, v6, v7, 0, 0, 0, 0, 0, 0, 0, 0}; 225 | this->va[0] = t; 226 | } 227 | FORCEINLINE svec_i8(const int8_t& v) { 228 | if(__builtin_constant_p(v) && (v <= 15) && (v >= -16)){ 229 | __vector int8_t t = vec_splat_s8(v); //will gen one instr.vspltisb 230 | for(int i=0; i < N; i+=this->lanes_per_reg()) { 231 | this->va[i/this->lanes_per_reg()] = t; 232 | } //fo 233 | } else { 234 | svec_internal(v); 235 | } 236 | } 237 | 238 | }; 239 | 240 | template 241 | class svec_u8: public svec_internal { 242 | 243 | public: 244 | FORCEINLINE svec_u8() { } 245 | FORCEINLINE svec_u8(const __vector uint8_t vva[]) : svec_internal(vva) {} 246 | FORCEINLINE svec_u8(const uint8_t v[]) : svec_internal(v) {} 247 | FORCEINLINE svec_u8(const uint8_t& v0, const uint8_t& v1, const uint8_t& v2, const uint8_t& v3) { 248 | __vector uint8_t t = {v0, v1, v2, v3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 249 | this->va[0] = t; 250 | } 251 | FORCEINLINE svec_u8(const uint8_t& v0, const uint8_t& v1, const uint8_t& v2, const uint8_t& v3, 252 | const uint8_t& v4, const uint8_t& v5, const uint8_t& v6, const 
uint8_t& v7) { 253 | __vector uint8_t t = {v0, v1, v2, v3, v4, v5, v6, v7, 0, 0, 0, 0, 0, 0, 0, 0}; 254 | this->va[0] = t; 255 | } 256 | FORCEINLINE svec_u8(const uint8_t& v) { 257 | if(__builtin_constant_p(v) && (v <= 15)){ 258 | __vector uint8_t t = vec_splat_u8(v); //will gen one instr.vspltisb 259 | for(int i=0; i < N; i+=this->lanes_per_reg()) { 260 | this->va[i/this->lanes_per_reg()] = t; 261 | } //fo 262 | } else { 263 | svec_internal(v); 264 | } 265 | } 266 | }; 267 | 268 | template 269 | class svec_i16: public svec_internal { 270 | 271 | public: 272 | FORCEINLINE svec_i16() { } 273 | FORCEINLINE svec_i16(const __vector int16_t vva[]) : svec_internal(vva) {} 274 | FORCEINLINE svec_i16(const int16_t v[]) : svec_internal(v) {} 275 | FORCEINLINE svec_i16(const int16_t& v0, const int16_t& v1, const int16_t& v2, const int16_t& v3) { 276 | __vector int16_t t = {v0, v1, v2, v3, 0, 0, 0, 0}; 277 | this->va[0] = t; 278 | } 279 | FORCEINLINE svec_i16(const int16_t& v0, const int16_t& v1, const int16_t& v2, const int16_t& v3, 280 | const int16_t& v4, const int16_t& v5, const int16_t& v6, const int16_t& v7) { 281 | __vector int16_t t = {v0, v1, v2, v3, v4, v5, v6, v7}; 282 | this->va[0] = t; 283 | } 284 | FORCEINLINE svec_i16(const int16_t& v) { 285 | if(__builtin_constant_p(v) && (v <= 15) && (v >= -16)){ 286 | __vector int16_t t = vec_splat_s16(v); //will gen one instr.vspltisb 287 | for(int i=0; i < N; i+=this->lanes_per_reg()) { 288 | this->va[i/this->lanes_per_reg()] = t; 289 | } //fo 290 | } else { 291 | svec_internal(v); 292 | } 293 | } 294 | }; 295 | 296 | template 297 | class svec_u16: public svec_internal { 298 | 299 | public: 300 | FORCEINLINE svec_u16() { } 301 | FORCEINLINE svec_u16(const __vector uint16_t vva[]) : svec_internal(vva) {} 302 | FORCEINLINE svec_u16(const uint16_t v[]) : svec_internal(v) {} 303 | FORCEINLINE svec_u16(const uint16_t& v0, const uint16_t& v1, const uint16_t& v2, const uint16_t& v3) { 304 | __vector uint16_t t = {v0, v1, v2, v3, 
0, 0, 0, 0}; 305 | this->va[0] = t; 306 | } 307 | FORCEINLINE svec_u16(const uint16_t& v0, const uint16_t& v1, const uint16_t& v2, const uint16_t& v3, 308 | const uint16_t& v4, const uint16_t& v5, const uint16_t& v6, const uint16_t& v7) { 309 | __vector uint16_t t = {v0, v1, v2, v3, v4, v5, v6, v7}; 310 | this->va[0] = t; 311 | } 312 | FORCEINLINE svec_u16(const uint16_t& v) { 313 | if(__builtin_constant_p(v) && (v <= 15)){ 314 | __vector uint16_t t = vec_splat_u16(v); //will gen one instr.vspltisb 315 | for(int i=0; i < N; i+=this->lanes_per_reg()) { 316 | this->va[i/this->lanes_per_reg()] = t; 317 | } //fo 318 | } else { 319 | svec_internal(v); 320 | } 321 | } 322 | }; 323 | 324 | template 325 | class svec_i32: public svec_internal { 326 | 327 | public: 328 | FORCEINLINE svec_i32() { } 329 | FORCEINLINE svec_i32(const __vector int32_t vva[]) : svec_internal(vva) {} 330 | FORCEINLINE svec_i32(const int32_t v[]) : svec_internal(v) {} 331 | FORCEINLINE svec_i32(const int32_t& v0, const int32_t& v1, const int32_t& v2, const int32_t& v3) { 332 | __vector int32_t t = {v0, v1, v2, v3}; 333 | this->va[0] = t; 334 | } 335 | FORCEINLINE svec_i32(const int32_t& v0, const int32_t& v1, const int32_t& v2, const int32_t& v3, 336 | const int32_t& v4, const int32_t& v5, const int32_t& v6, const int32_t& v7) { 337 | __vector int32_t t0 = {v0, v1, v2, v3}; 338 | this->va[0] = t0; 339 | __vector int32_t t1 = {v4, v5, v6, v7}; 340 | this->va[1] = t1; 341 | } 342 | FORCEINLINE svec_i32(const int32_t& v) { 343 | if(__builtin_constant_p(v) && (v <= 15) && (v >= -16)){ 344 | __vector int32_t t = vec_splat_s32(v); //will gen one instr.vspltisb 345 | for(int i=0; i < N; i+=this->lanes_per_reg()) { 346 | this->va[i/this->lanes_per_reg()] = t; 347 | } //fo 348 | } else { 349 | svec_internal(v); 350 | } 351 | } 352 | }; 353 | 354 | template 355 | class svec_u32: public svec_internal { 356 | 357 | public: 358 | FORCEINLINE svec_u32() { } 359 | FORCEINLINE svec_u32(const __vector uint32_t 
vva[]) : svec_internal(vva) {} 360 | FORCEINLINE svec_u32(const uint32_t v[]) : svec_internal(v) {} 361 | FORCEINLINE svec_u32(const uint32_t& v0, const uint32_t& v1, const uint32_t& v2, const uint32_t& v3) { 362 | __vector uint32_t t = {v0, v1, v2, v3}; 363 | this->va[0] = t; 364 | } 365 | FORCEINLINE svec_u32(const uint32_t& v0, const uint32_t& v1, const uint32_t& v2, const uint32_t& v3, 366 | const uint32_t& v4, const uint32_t& v5, const uint32_t& v6, const uint32_t& v7) { 367 | __vector uint32_t t0 = {v0, v1, v2, v3}; 368 | this->va[0] = t0; 369 | __vector uint32_t t1 = {v4, v5, v6, v7}; 370 | this->va[1] = t1; 371 | } 372 | FORCEINLINE svec_u32(const uint32_t& v) { 373 | if(__builtin_constant_p(v) && (v <= 15)){ 374 | __vector uint32_t t = vec_splat_u32(v); //BUGFIX: was vec_splat_u8, which splats an 8-bit immediate into each byte (0x0v0v0v0v per word); vec_splat_u32 gens one instr. vspltisw 375 | for(int i=0; i < N; i+=this->lanes_per_reg()) { 376 | this->va[i/this->lanes_per_reg()] = t; 377 | } //for 378 | } else { 379 | svec_internal(v); 380 | } 381 | } 382 | }; 383 | 384 | 385 | template 386 | class svec_i64: public svec_internal { 387 | 388 | public: 389 | FORCEINLINE svec_i64() { } 390 | FORCEINLINE svec_i64(const __vector int64_t vva[]) : svec_internal(vva) {} 391 | FORCEINLINE svec_i64(const int64_t v[]) : svec_internal(v) {} 392 | FORCEINLINE svec_i64(const int64_t& v0, const int64_t& v1, const int64_t& v2, const int64_t& v3) { 393 | __vector int64_t t0 = {v0, v1}; 394 | this->va[0] = t0; 395 | __vector int64_t t1 = {v2, v3}; 396 | this->va[1] = t1; 397 | } 398 | FORCEINLINE svec_i64(const int64_t& v0, const int64_t& v1, const int64_t& v2, const int64_t& v3, 399 | const int64_t& v4, const int64_t& v5, const int64_t& v6, const int64_t& v7) { 400 | __vector int64_t t0 = {v0, v1}; 401 | this->va[0] = t0; 402 | __vector int64_t t1 = {v2, v3}; 403 | this->va[1] = t1; 404 | __vector int64_t t2 = {v4, v5}; 405 | this->va[2] = t2; 406 | __vector int64_t t3 = {v6, v7}; 407 | this->va[3] = t3; 408 | } 409 | FORCEINLINE svec_i64(const int64_t& v) { 410 | 
if(__builtin_constant_p(v)){ 411 | __vector int64_t t; 412 | #ifdef __POWER8 413 | if ((v >= -16l) && (v <= 15l)) { 414 | const int iv = (int)v; 415 | __vector signed int x = {iv,iv,iv,iv}; 416 | t = vec_unpackh_p8(x); 417 | } else 418 | #endif 419 | t = (__vector int64_t)(v,v); 420 | 421 | for(int i=0; i < N; i+=this->lanes_per_reg()) { 422 | this->va[i/this->lanes_per_reg()] = t; 423 | } //for 424 | } else { 425 | svec_internal(v); 426 | } 427 | } 428 | }; 429 | 430 | template 431 | class svec_u64: public svec_internal { 432 | 433 | public: 434 | FORCEINLINE svec_u64() { } 435 | FORCEINLINE svec_u64(const __vector uint64_t vva[]) : svec_internal(vva) {} 436 | FORCEINLINE svec_u64(const uint64_t v[]) : svec_internal(v) {} 437 | FORCEINLINE svec_u64(const uint64_t& v0, const uint64_t& v1, const uint64_t& v2, const uint64_t& v3) { 438 | __vector uint64_t t0 = {v0, v1}; 439 | this->va[0] = t0; 440 | __vector uint64_t t1 = {v2, v3}; 441 | this->va[1] = t1; 442 | } 443 | FORCEINLINE svec_u64(const uint64_t& v0, const uint64_t& v1, const uint64_t& v2, const uint64_t& v3, 444 | const uint64_t& v4, const uint64_t& v5, const uint64_t& v6, const uint64_t& v7) { 445 | __vector uint64_t t0 = {v0, v1}; 446 | this->va[0] = t0; 447 | __vector uint64_t t1 = {v2, v3}; 448 | this->va[1] = t1; 449 | __vector uint64_t t2 = {v4, v5}; 450 | this->va[2] = t2; 451 | __vector uint64_t t3 = {v6, v7}; 452 | this->va[3] = t3; 453 | } 454 | FORCEINLINE svec_u64(const uint64_t& v) { 455 | if(__builtin_constant_p(v)){ 456 | __vector uint64_t t; 457 | #ifdef __POWER8 458 | if ((v >= 0ul) && (v <= 31ul)) { 459 | const int iv = (int)v; 460 | __vector signed int x = {iv,iv,iv,iv}; 461 | t = vec_unpackh_p8(x); 462 | } else 463 | #endif 464 | t = (__vector uint64_t)(v,v); 465 | 466 | for(int i=0; i < N; i+=this->lanes_per_reg()) { 467 | this->va[i/this->lanes_per_reg()] = t; 468 | } //for 469 | } else { 470 | svec_internal(v); 471 | } 472 | } 473 | }; 474 | 475 | 476 | template 477 | class svec_f: 
public svec_internal { 478 | 479 | public: 480 | FORCEINLINE svec_f() { } 481 | FORCEINLINE svec_f(const __vector float vva[]) : svec_internal(vva) {} 482 | FORCEINLINE svec_f(const float v[]) : svec_internal(v) {} 483 | FORCEINLINE svec_f(const float& v0, const float& v1, const float& v2, const float& v3) { 484 | __vector float t = {v0, v1, v2, v3}; 485 | this->va[0] = t; 486 | } 487 | FORCEINLINE svec_f(const float& v0, const float& v1, const float& v2, const float& v3, 488 | const float& v4, const float& v5, const float& v6, const float& v7) { 489 | __vector float t0 = {v0, v1, v2, v3}; 490 | this->va[0] = t0; 491 | __vector float t1 = {v4, v5, v6, v7}; 492 | this->va[1] = t1; 493 | } 494 | FORCEINLINE svec_f(const float& v) { 495 | if(__builtin_constant_p(v)){ 496 | __vector float t; 497 | float p; int iv; 498 | p = 1.0; iv = (int)(p*v); 499 | if (( (((float)iv)/p) == v ) && (iv >= -16) && (iv <= 15)) { //BUGFIX: was (iv >= -32); vec_splat_s32/vspltisw takes a 5-bit signed immediate, valid range -16..15 (matches the checks in the p=2.0 and p=4.0 branches below) 500 | t = vec_ctf(vec_splat_s32(iv),0); 501 | } else { 502 | p = 2.0; iv = (int)(p*v); 503 | if (( (((float)iv)/p) == v ) && (iv >= -16) && (iv <= 15)) { 504 | t = vec_ctf(vec_splat_s32(iv),1); 505 | } else { 506 | p = 4.0; iv = (int)(p*v); 507 | if (( (((float)iv)/p) == v ) && (iv >= -16) && (iv <= 15)) { 508 | t = vec_ctf(vec_splat_s32(iv),2); 509 | } else { 510 | t = (__vector float)(v, v, v, v); 511 | } 512 | } 513 | } 514 | for(int i=0; i < N; i+=this->lanes_per_reg()) { 515 | this->va[i/this->lanes_per_reg()] = t; 516 | } //for 517 | } else { //use built-in constructor 518 | svec_internal(v); 519 | } 520 | } 521 | 522 | }; 523 | 524 | template 525 | class svec_d: public svec_internal { 526 | 527 | public: 528 | FORCEINLINE svec_d() { } 529 | FORCEINLINE svec_d(const __vector double vva[]) : svec_internal(vva) {} 530 | FORCEINLINE svec_d(const double v[]) : svec_internal(v) {} 531 | FORCEINLINE svec_d(const double& v0, const double& v1, const double& v2, const double& v3) { 532 | __vector double t0 = {v0, v1}; 533 | this->va[0] = t0; 534 | __vector double 
t1 = {v2, v3}; 535 | this->va[1] = t1; 536 | } 537 | FORCEINLINE svec_d(const double& v0, const double& v1, const double& v2, const double& v3, 538 | const double& v4, const double& v5, const double& v6, const double& v7) { 539 | __vector double t0 = {v0, v1}; 540 | this->va[0] = t0; 541 | __vector double t1 = {v2, v3}; 542 | this->va[1] = t1; 543 | __vector double t2 = {v4, v5}; 544 | this->va[2] = t2; 545 | __vector double t3 = {v6, v7}; 546 | this->va[3] = t3; 547 | } 548 | FORCEINLINE svec_d(const double& v) { 549 | __vector double t = vec_smear_p7(v); 550 | for(int i=0; i < N; i+=this->lanes_per_reg()) { 551 | this->va[i/this->lanes_per_reg()] = t; 552 | } //for 553 | } 554 | }; 555 | 556 | 557 | ////////////Section of class member functions 558 | 559 | 560 | 561 | 562 | } //namespace vsx 563 | 564 | 565 | #endif /* SVEC_VSX_H_ */ 566 | -------------------------------------------------------------------------------- /include/timing.h: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright 2012 the Generic SIMD Intrinsic Library project authors. All rights reserved. 3 | 4 | Copyright IBM Corp. 2013, 2013. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following 14 | disclaimer in the documentation and/or other materials provided 15 | with the distribution. 16 | * Neither the name of IBM Corp. nor the names of its contributors may be 17 | used to endorse or promote products derived from this software 18 | without specific prior written permission. 
19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | 32 | The original source code covered by the above license above has been 33 | modified significantly by IBM Corp. 34 | Copyright 2013 the Generic SIMD Intrinsic Library project authors. All rights reserved. 35 | 36 | Copyright (c) 2010-2012, Intel Corporation 37 | All rights reserved. 38 | 39 | Redistribution and use in source and binary forms, with or without 40 | modification, are permitted provided that the following conditions are 41 | met: 42 | 43 | * Redistributions of source code must retain the above copyright 44 | notice, this list of conditions and the following disclaimer. 45 | 46 | * Redistributions in binary form must reproduce the above copyright 47 | notice, this list of conditions and the following disclaimer in the 48 | documentation and/or other materials provided with the distribution. 49 | 50 | * Neither the name of Intel Corporation nor the names of its 51 | contributors may be used to endorse or promote products derived from 52 | this software without specific prior written permission. 
53 | 54 | 55 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 56 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 57 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 58 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 59 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 60 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 61 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 62 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 63 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 64 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 65 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 66 | */ 67 | 68 | #include 69 | 70 | 71 | #ifdef WIN32 72 | #include 73 | #define rdtsc __rdtsc 74 | #else 75 | #ifdef __cplusplus 76 | extern "C" { 77 | #endif /* __cplusplus */ 78 | 79 | #include 80 | 81 | __inline__ uint64_t rdtsc() { 82 | 83 | #ifdef __PPC__ 84 | uint32_t tbl, tbu0, tbu1; 85 | do { 86 | __asm__ __volatile__ ("mftbu %0" : "=r"(tbu0)); 87 | __asm__ __volatile__ ("mftb %0" : "=r"(tbl)); 88 | __asm__ __volatile__ ("mftbu %0" : "=r"(tbu1)); 89 | } while (tbu0 != tbu1); 90 | 91 | return (((uint64_t)tbu0) << 32) | tbl; 92 | #else 93 | 94 | uint32_t low, high; 95 | #ifdef __x86_64 96 | __asm__ __volatile__ ( 97 | "xorl %%eax,%%eax \n cpuid" 98 | ::: "%rax", "%rbx", "%rcx", "%rdx" ); 99 | #else 100 | __asm__ __volatile__ ( 101 | "xorl %%eax,%%eax \n cpuid" 102 | ::: "%eax", "%ebx", "%ecx", "%edx" ); 103 | #endif 104 | __asm__ __volatile__ ( 105 | "rdtsc" : "=a" (low), "=d" (high)); 106 | return (uint64_t)high << 32 | low; 107 | #endif 108 | } 109 | #ifdef __cplusplus 110 | } 111 | #endif /* __cplusplus */ 112 | #endif 113 | 114 | //rdtsc based, used in simulator or must set thread affinity first 115 | static uint64_t start, end; 116 | 117 | static 
inline void reset_and_start_timer() 118 | { 119 | start = rdtsc(); 120 | } 121 | 122 | /* Returns the number of millions of elapsed processor cycles since the 123 | last reset_and_start_timer() call. */ 124 | static inline double get_elapsed_mcycles() 125 | { 126 | end = rdtsc(); 127 | return (end-start) / (1024. * 1024.); 128 | } 129 | 130 | //timeofday based, used in real hardware 131 | static double start_s; 132 | 133 | static inline double get_usec() 134 | { 135 | struct timeval tim; 136 | gettimeofday(&tim, 0); 137 | return tim.tv_sec+(tim.tv_usec/1000000.0); 138 | } 139 | 140 | /* 141 | * Start timer 142 | * Simulator (__POWER8 defined): use rdtsc register 143 | * Real machine: use gettimeofday() 144 | */ 145 | static inline void reset_and_start_stimer() 146 | { 147 | #ifdef __POWER8 148 | reset_and_start_timer(); 149 | #else 150 | start_s = get_usec(); 151 | #endif 152 | } 153 | 154 | /* 155 | * End timer and report 156 | * Simulator (__POWER8 defined): use rdtsc register 157 | * Real machine: use gettimeofday() 158 | */ 159 | static inline double get_elapsed_seconds() 160 | { 161 | #ifdef __POWER8 162 | return get_elapsed_mcycles(); 163 | #else 164 | return get_usec() - start_s; 165 | #endif 166 | } 167 | -------------------------------------------------------------------------------- /tests/Makefile: -------------------------------------------------------------------------------- 1 | GXX = g++ 2 | #GXX = /opt/at5.0/bin/g++ 3 | ECHO=echo 4 | 5 | GTEST_DIR=gtest-1.6.0 6 | SIMD_TARGETS=vsx4 generic4 sse4 7 | 8 | CXXFLAGS=-I../include -Wno-int-to-pointer-cast -flax-vector-conversions -g 9 | #disable assert failure 10 | CXXFLAGS+= -DNDEBUG 11 | ########### The below section is used for new test on gtest framework 12 | 13 | help: 14 | @-$(ECHO) "Usage: 'make ' to build/run unit-tests for a target SIMD platform" 15 | @-$(ECHO) " where are: ${SIMD_TARGETS}" 16 | @-$(ECHO) " e.g., use 'make clean; make vsx4' to build/run vsx4 unit tests" 17 | 18 | # Need build the 
libgtest.a 19 | check_googletest: 20 | @if [ ! -d ${GTEST_DIR} ]; then \ 21 | $(ECHO) "${GTEST_DIR} does not exist, please refer to README to install googletest."; \ 22 | exit 2; \ 23 | fi 24 | 25 | libgtest.a: check_googletest 26 | ${GXX} -I${GTEST_DIR}/include -I${GTEST_DIR} -c ${GTEST_DIR}/src/gtest-all.cc 27 | ar -rv $@ gtest-all.o 28 | 29 | vsx4: test_lanes4.cpp libgtest.a ../include/power_vsx4.h 30 | ${GXX} -I${GTEST_DIR}/include -l pthread -mvsx ${CXXFLAGS} $^ -o $@ 31 | ./$@ 32 | 33 | generic4: test_lanes4.cpp libgtest.a 34 | ${GXX} -I${GTEST_DIR}/include -l pthread ${CXXFLAGS} $^ -o $@ 35 | ./$@ 36 | 37 | generic8: test_lanes8.cpp libgtest.a 38 | ${GXX} -I${GTEST_DIR}/include -l pthread ${CXXFLAGS} $^ -o $@ 39 | ./$@ 40 | 41 | sse4: test_lanes4.cpp libgtest.a ../include/sse4.h 42 | ${GXX} -I${GTEST_DIR}/include -l pthread -msse4.2 ${CXXFLAGS} $^ -o $@ 43 | ./$@ 44 | 45 | #test_svec: test_svec.cpp libgtest.a ../include/svec-vsx.h 46 | # ${GXX} -I${GTEST_DIR}/include -l pthread -mvsx ${CXXFLAGS} $^ -o $@ 47 | 48 | 49 | #codegen: codegen.cpp 50 | # ${GXX} -mno-vrsave -mvsx ${CXXFLAGS} $< -O2 -S 51 | # ${GXX} -mno-vrsave -mvsx ${CXXFLAGS} $< -O2 -c -g -Wa,-a,-ad > $@.cs 52 | 53 | clean: 54 | rm -f *.o *.exe core *~ ${TARGETS} *.output *.a 55 | -------------------------------------------------------------------------------- /tests/README: -------------------------------------------------------------------------------- 1 | The tests are dependent on google test framework. Due to the 2 | opensource license issue, we don't include it in our source tree. 3 | 4 | Please download googletest framework first from 5 | https://code.google.com/p/googletest/ 6 | and unzip it into "tests/gtest-1.6.0" directory. 7 | Alternatively one can modify the "GTEST_DIR" value in 8 | tests/Makefile. 
9 | 10 | -------------------------------------------------------------------------------- /tests/codegen.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright 2012 the Generic SIMD Intrinsic Library project authors. All rights reserved. 3 | 4 | Copyright IBM Corp. 2013, 2013. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following 14 | disclaimer in the documentation and/or other materials provided 15 | with the distribution. 16 | * Neither the name of IBM Corp. nor the names of its contributors may be 17 | used to endorse or promote products derived from this software 18 | without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
31 | */ 32 | 33 | /** 34 | * @brief Test the intrinsics' code mapping with our own code mapping 35 | * codegen.cpp 36 | * 37 | * Created on: Jul 12, 2013 38 | * Author: Haichuan 39 | */ 40 | 41 | #include 42 | using namespace vsx; 43 | 44 | static char mem[128] POST_ALIGN(16); 45 | static svec4_i1* p_vi1 = (svec4_i1*)mem; 46 | static svec4_i32* p_vi32 = (svec4_i32*)mem; 47 | static svec4_i64* p_vi64 = (svec4_i64*)mem; 48 | static svec4_f* p_vf = (svec4_f*)mem; 49 | static svec4_d* p_vd = (svec4_d*)mem; 50 | 51 | const svec4_i32 base_off(0,1,2,3); 52 | 53 | FORCEINLINE svec4_d gather(double* base, svec4_i32 off) { 54 | int* off_addr = (int*)(&(off.v)); 55 | 56 | double d0 = *(base + svec_extract(off, 0)); 57 | double d1 = *(base + svec_extract(off, 1)); 58 | double d2 = *(base + svec_extract(off, 2)); 59 | double d3 = *(base + svec_extract(off, 3)); 60 | return svec4_d(d0, d1, d2, d3); 61 | } 62 | 63 | svec4_d test_gather(int scale) { 64 | svec4_i32 off_ip = scale * base_off; 65 | return gather((double*)mem, off_ip); 66 | } 67 | 68 | FORCEINLINE svec4_d gather_opt(double* base, svec4_i32 off) { 69 | int* off_addr = (int*)(&(off.v)); 70 | 71 | double d0 = *(base + off_addr[0]); 72 | double d1 = *(base + off_addr[1]); 73 | double d2 = *(base + off_addr[2]); 74 | double d3 = *(base + off_addr[3]); 75 | return svec4_d(d0, d1, d2, d3); 76 | } 77 | 78 | svec4_d test_gather_opt(int scale) { 79 | svec4_i32 off_ip = scale * base_off; 80 | return gather_opt((double*)mem, off_ip); 81 | } 82 | 83 | 84 | FORCEINLINE svec4_d gather_stride(double* base, int off0, int off1, int off2, int off3) { 85 | double d0 = *(base + off0); 86 | double d1 = *(base + off1); 87 | double d2 = *(base + off2); 88 | double d3 = *(base + off3); 89 | return svec4_d(d0, d1, d2, d3); 90 | } 91 | 92 | svec4_d test_gather_stride(int scale) { 93 | return gather_stride((double*)mem, 0, scale*1, scale*2, scale*3); 94 | } 95 | 96 | FORCEINLINE svec4_d gather_stride2(double* base, long long off, long long 
stride) { 97 | long long stride2 = stride * 2; 98 | double d0 = *(base + off); 99 | double d1 = *(base + off+stride); 100 | double d2 = *(base + off+stride2); 101 | double d3 = *(base + off+stride2+stride); 102 | return svec4_d(d0, d1, d2, d3); 103 | } 104 | 105 | svec4_d test_gather_stride2(int scale) { 106 | return gather_stride2((double*)mem, scale, (long long)scale); 107 | } 108 | 109 | 110 | FORCEINLINE svec4_d gather_stride3(double* base, long long stride) { 111 | double d0 = *base; 112 | base += stride; 113 | double d1 = *base; 114 | base += stride; 115 | double d2 = *base; 116 | base += stride; 117 | double d3 = *base; 118 | return svec4_d(d0, d1, d2, d3); 119 | } 120 | 121 | svec4_d test_gather_stride3(int scale) { 122 | return gather_stride3((double*)(mem+scale), (long long)scale); 123 | } 124 | 125 | FORCEINLINE svec4_d gather_stride4(double* base, long long off, long long stride) { 126 | base += off; 127 | double d0 = *base; 128 | base += stride; 129 | double d1 = *base; 130 | base += stride; 131 | double d2 = *base; 132 | base += stride; 133 | double d3 = *base; 134 | return svec4_d(d0, d1, d2, d3); 135 | } 136 | 137 | svec4_d test_gather_stride4(int scale) { 138 | return gather_stride4((double*)mem, scale, (long long)scale); 139 | } 140 | 141 | FORCEINLINE svec4_d gather_stride5(double* base, long long stride) { 142 | long long stride2 = stride * 2; 143 | double d0 = *(base); 144 | double d1 = *(base + stride); 145 | double d2 = *(base + stride2); 146 | double d3 = *(base + stride2+stride); 147 | return svec4_d(d0, d1, d2, d3); 148 | } 149 | 150 | svec4_d test_gather_stride5(int scale) { 151 | return gather_stride5((double*)(mem+scale), (long long)scale); 152 | } 153 | 154 | 155 | int test_access(svec4_i1 v) { 156 | // li 0,48 157 | // addi 9,1,-64 158 | // stxvw4x 34,9,0 159 | //.LBE21: 160 | // .loc 1 20 0 161 | // lwa 3,-4(1) 162 | // blr 163 | 164 | // int i = v_i32[3]; 165 | 166 | 167 | // li 0,48 168 | // addi 9,1,-64 169 | // stxvw4x 34,9,0 170 
| //.LBE18: 171 | // .loc 1 30 0 172 | // lwa 3,-4(1) 173 | // blr 174 | // int i = vec_extract(v_i32.v, 3); 175 | 176 | 177 | // li 0,100 178 | // ld 11,.LC1@toc(2) 179 | // stw 0,-16(1) 180 | // addi 10,1,-64 181 | //.LBB28: 182 | //.LBB29: 183 | //.LBB30: 184 | // .file 2 "../include/power_vsx4.h" 185 | // .loc 2 1065 0 186 | // ld 9,.LC2@toc(2) 187 | //.LBE30: 188 | //.LBE29: 189 | //.LBE28: 190 | // .loc 1 39 0 191 | // li 0,48 192 | // lxvw4x 33,0,11 193 | // lvewx 0,10,0 194 | // .loc 1 57 0 195 | // li 3,0 196 | // .loc 1 39 0 197 | // vperm 0,2,0,1 198 | //.LVL1: 199 | //.LBB33: 200 | //.LBB32: 201 | //.LBB31: 202 | // .loc 2 1065 0 203 | // stxvw4x 32,0,9 204 | //.LBE31: 205 | //.LBE32: 206 | //.LBE33: 207 | // .loc 1 57 0 208 | // blr 209 | 210 | v[3] = 15; 211 | 212 | int r = v[2]; 213 | 214 | 215 | // .loc 1 43 0 216 | // li 3,0 217 | //.LBB31: 218 | //.LBB30: 219 | //.LBB29: 220 | // .loc 2 1065 0 221 | // stxvw4x 34,0,9 222 | //.LBE29: 223 | //.LBE30: 224 | //.LBE31: 225 | // .loc 1 43 0 226 | // blr 227 | 228 | // vec_insert(100, v_i32.v, 3); 229 | v.store(p_vi1); 230 | 231 | return r; 232 | } 233 | 234 | void test_broadcasts_64(svec4_d v_d) { 235 | // v_d[0] = 1.1; 236 | 237 | // li 0,16 238 | // .loc 2 1107 0 239 | // lfd 13,-16(1) 240 | // xxpermdi 0,13,13,0 241 | // stxvd2x 0,0,9 242 | // .loc 2 1108 0 243 | // stxvd2x 0,9,0 244 | // __vector double splat_d = vec_splat_p7(v_d.v[0], 0); 245 | // svec4_d nvd(splat_d, splat_d); 246 | 247 | 248 | // ld 9,.LC3@toc(2) 249 | //.LBE118: 250 | //.LBE117: 251 | //.LBE116: 252 | //.LBE115: 253 | // .loc 1 97 0 254 | // lxvd2x 0,11,0 255 | //.LVL3: 256 | //.LBB130: 257 | //.LBB123: 258 | //.LBB121: 259 | //.LBB119: 260 | // .loc 2 1174 0 261 | // li 0,16 262 | //.LVL4: 263 | //.LBE119: 264 | //.LBE121: 265 | //.LBE123: 266 | //.LBB124: 267 | //.LBB125: 268 | //.LBB126: 269 | //.LBB127: 270 | //.LBB128: 271 | // .file 3 "../include/power9_intrinsics.h" 272 | // .loc 3 746 0 273 | //#APP 274 | //# 746 
"../include/power9_intrinsics.h" 1 275 | // xxpermdi 0, 0, 0, 0 276 | //# 0 "" 2 277 | //.LVL5: 278 | //#NO_APP 279 | //.LBE128: 280 | //.LBE127: 281 | //.LBE126: 282 | //.LBE125: 283 | //.LBE124: 284 | //.LBB129: 285 | //.LBB122: 286 | //.LBB120: 287 | // .loc 2 1173 0 288 | // stxvd2x 0,0,9 289 | // .loc 2 1174 0 290 | // stxvd2x 0,9,0 291 | svec4_d nvd = v_d.broadcast(0); 292 | 293 | nvd.store(p_vd); 294 | // DUMP(nvd); 295 | 296 | } 297 | 298 | 299 | void test_broadcasts_32(svec4_i32 v_i32) { 300 | // li 0,48 301 | // addi 9,1,-80 302 | // stxvw4x 34,9,0 303 | // lwz 0,-24(1) 304 | //.LVL3: 305 | //.LBB66: 306 | //.LBB67: 307 | //.LBB68: 308 | //.LBB69: 309 | // .loc 2 1065 0 310 | // addi 11,1,-80 311 | // ld 9,.LC3@toc(2) 312 | // stw 0,-16(1) 313 | // stw 0,-12(1) 314 | // stw 0,-8(1) 315 | // stw 0,-4(1) 316 | // li 0,64 317 | //.LVL4: 318 | // lxvw4x 32,11,0 319 | // stxvw4x 32,0,9 320 | //.LBE69: 321 | //.LBE68: 322 | //.LBE67: 323 | //.LBE66: 324 | // .loc 1 99 0 325 | // blr 326 | 327 | // svec4_i32 vi = v_i32.broadcast(2); 328 | 329 | 330 | 331 | // li 0,48 332 | // addi 9,1,-80 333 | // stxvw4x 34,9,0 334 | // lwz 0,-24(1) 335 | //.LVL3: 336 | //.LBB46: 337 | //.LBB47: 338 | //.LBB48: 339 | // .loc 2 1065 0 340 | // addi 11,1,-80 341 | // ld 9,.LC3@toc(2) 342 | // stw 0,-16(1) 343 | // stw 0,-12(1) 344 | // stw 0,-8(1) 345 | // stw 0,-4(1) 346 | // li 0,64 347 | //.LVL4: 348 | // lxvw4x 32,11,0 349 | // stxvw4x 32,0,9 350 | // v_i32[2] = 100; 351 | // svec4_i32 vi = svec4_i32(vec_splats(vec_extract(v_i32.v, 2))); 352 | 353 | 354 | // .loc 2 1065 0 355 | // ld 9,.LC3@toc(2) 356 | //.LBE53: 357 | //.LBE52: 358 | //.LBE51: 359 | //.LBB56: 360 | //.LBB57: 361 | // .file 3 "../include/power9_intrinsics.h" 362 | // .loc 3 734 0 363 | //#APP 364 | //# 734 "../include/power9_intrinsics.h" 1 365 | // xxspltw 34, 34,2 366 | //# 0 "" 2 367 | //.LVL3: 368 | //#NO_APP 369 | //.LBE57: 370 | //.LBE56: 371 | //.LBB58: 372 | //.LBB55: 373 | //.LBB54: 374 | // .loc 2 
1065 0 375 | // stxvw4x 34,0,9 376 | //.LBE54: 377 | //.LBE55: 378 | //.LBE58: 379 | //.LBE50: 380 | // .loc 1 156 0 381 | // blr 382 | 383 | svec4_i32 vi = svec4_i32(vec_splat_p7(v_i32.v, 2)); 384 | 385 | // DUMP(vi); 386 | 387 | vi.store(p_vi32); 388 | } 389 | 390 | 391 | void test_splats(int i) { 392 | //integer 393 | // svec4_i32 i0(i+1); 394 | // i0.store(p_vi32); 395 | 396 | // //float 397 | // svec4_f f0(0.25f); 398 | // f0.store(p_vf+2); 399 | // 400 | // //integer 401 | 402 | // .loc 2 1107 0 403 | // std 3,-16(1) 404 | // .loc 2 1108 0 405 | // li 0,16 406 | // .loc 2 1107 0 407 | // lfd 13,-16(1) 408 | // xxpermdi 0,13,13,0 409 | // stxvd2x 0,0,9 410 | // .loc 2 1108 0 411 | svec4_i64 i1(vec_splats((signed long long)(i+2)), vec_splats((signed long long)(i+2))); 412 | i1.store(p_vi64); 413 | 414 | // 415 | // //float 416 | // svec4_f f1(vec_splats(0.5f)); 417 | // f1.store(p_vf+3); 418 | } 419 | 420 | 421 | 422 | 423 | int main(int argc, char* argv[]) 424 | { 425 | int j = 0; 426 | svec4_i32 v_i32 = * p_vi32; 427 | svec4_d v_d = * p_vd; 428 | //test_splats(argc); 429 | svec4_i1 v_i1 = * p_vi1; 430 | test_access(v_i1); 431 | // test_broadcasts_32(v_i32); 432 | // test_broadcasts_64(v_d); 433 | 434 | DUMP(test_gather(argc+1)); 435 | DUMP(test_gather_opt(argc+1)); 436 | DUMP(test_gather_stride2(argc+1)); 437 | DUMP(test_gather_stride3(argc+1)); 438 | DUMP(test_gather_stride4(argc+1)); 439 | DUMP(test_gather_stride5(argc+1)); 440 | 441 | return 0; 442 | } 443 | 444 | 445 | -------------------------------------------------------------------------------- /tests/test_svec.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright 2012 the Generic SIMD Intrinsic Library project authors. All rights reserved. 3 | 4 | Copyright IBM Corp. 2013, 2013. All rights reserved. 
5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following 14 | disclaimer in the documentation and/or other materials provided 15 | with the distribution. 16 | * Neither the name of IBM Corp. nor the names of its contributors may be 17 | used to endorse or promote products derived from this software 18 | without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
31 | */ 32 | 33 | /* 34 | * test_svec-vsx.cpp 35 | * 36 | * Created on: Jul 7, 2013 37 | * Author: Haichuan Wang (haichuan@us.ibm.com, hwang154@illinois.edu) 38 | */ 39 | 40 | #include <gtest/gtest.h> 41 | #include <svec-vsx.h> 42 | 43 | using namespace vsx; 44 | 45 | #define EXPECT_VEC_EQ(v1, v2) EXPECT_TRUE(vec_all_eq(v1, v2)) 46 | #define DUMP(v) std::cout << #v << ":" << (v) << std::endl 47 | 48 | 49 | 50 | TEST(svec_bool, ConstructorByScalars) 51 | { 52 | 53 | __vector unsigned int t = { -1, 0, -1, 0}; 54 | svec_bool<4> v1(1, 0, 1, 0); 55 | EXPECT_VEC_EQ(v1.reg(0), t); 56 | 57 | svec_bool<8> v2(1, 0, 1, 0, 1, 0, 1, 0); 58 | EXPECT_VEC_EQ(v2.reg(0), t); 59 | EXPECT_VEC_EQ(v2.reg(1), t); 60 | 61 | bool a[] = {1, 0, 1, 0}; 62 | svec_bool<4> v3(a); 63 | EXPECT_VEC_EQ(v3.reg(0), t); 64 | 65 | __vector uint32_t va[] = { t, t }; 66 | svec_bool<8> v4(va); 67 | EXPECT_VEC_EQ(v4.reg(0), t); 68 | EXPECT_VEC_EQ(v4.reg(1), t); 69 | } 70 | 71 | TEST(svec_8, ConstructorByScalars) 72 | { 73 | 74 | svec_i8<4> v1(100,0,-50,1); 75 | __vector int8_t t = { 100, 0, -50, 1, 0,0,0,0, 0,0,0,0, 0,0,0,0}; 76 | DUMP(v1); 77 | EXPECT_VEC_EQ(v1.reg(0), t); 78 | 79 | } 80 | 81 | 82 | int main(int argc, char* argv[]) 83 | { 84 | testing::InitGoogleTest(&argc, argv); 85 | return RUN_ALL_TESTS(); 86 | } 87 | -------------------------------------------------------------------------------- /tests/test_utility.h: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright 2012 the Generic SIMD Intrinsic Library project authors. All rights reserved. 3 | 4 | Copyright IBM Corp. 2013, 2013. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 
12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following 14 | disclaimer in the documentation and/or other materials provided 15 | with the distribution. 16 | * Neither the name of IBM Corp. nor the names of its contributors may be 17 | used to endorse or promote products derived from this software 18 | without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | */ 32 | 33 | /** 34 | * test_utility.h 35 | * 36 | * Created on: Aug 16, 2013 37 | * @author: Haichuan Wang (hwang154@illinois.edu) 38 | * @brief: common functions for test different lanes. 
39 | */ 40 | 41 | #ifndef TEST_UTILITY_H_ 42 | #define TEST_UTILITY_H_ 43 | 44 | #define EXPECT_SVEC_EQ(v1, v2) EXPECT_TRUE(((v1) == (v2)).all_true()) 45 | #define EXPECT_SVEC_MASKED_EQ(v1, v2, mask) EXPECT_TRUE((svec_masked_equal((v1), (v2), (mask)) == mask).all_true()) 46 | 47 | /** 48 | * @brief macros for check float equal 49 | */ 50 | #define EXPECT_SVEC_FEQ(v1, v2) EXPECT_TRUE( \ 51 | (v1 - v2).abs().reduce_add() < 0.005 * LANES) 52 | 53 | 54 | #define DUMP(v) std::cout << #v << ":" << (v) << std::endl 55 | 56 | template <typename VTYPE, typename STYPE, int LANES> 57 | VTYPE random_vec(int maxValue) { 58 | VTYPE vec; 59 | for (int i=0; i 68 | VTYPE random_vec() { 69 | return random_vec<VTYPE, STYPE, LANES>(-1); 70 | } 71 | 72 | template <typename VTYPE, typename VTYPE2, int LANES> 73 | VTYPE ref_shr(VTYPE val, VTYPE2 s) { 74 | VTYPE ret; 75 | for(int i = 0; i < LANES; i++) { 76 | ret[i] = val[i] >> s[i]; 77 | } 78 | return ret; 79 | } 80 | 81 | template <typename VTYPE, int LANES> 82 | VTYPE ref_shr(VTYPE val, int s) { 83 | VTYPE ret; 84 | for(int i = 0; i < LANES; i++) { 85 | ret[i] = val[i] >> s; 86 | } 87 | return ret; 88 | } 89 | 90 | 91 | template <typename VTYPE, typename VTYPE2, int LANES> 92 | VTYPE ref_shl(VTYPE val, VTYPE2 s) { 93 | VTYPE ret; 94 | for(int i = 0; i < LANES; i++) { 95 | ret[i] = val[i] << s[i]; 96 | } 97 | return ret; 98 | } 99 | 100 | template <typename VTYPE, int LANES> 101 | VTYPE ref_shl(VTYPE val, int s) { 102 | VTYPE ret; 103 | for(int i = 0; i < LANES; i++) { 104 | ret[i] = val[i] << s; 105 | } 106 | return ret; 107 | } 108 | 109 | 110 | 111 | template <typename TO, typename FROM, typename STO, int LANES> 112 | TO ref_cast(FROM val) { 113 | TO ret; 114 | for(int i = 0; i < LANES; i++) { 115 | ret[i] = (STO)val[i]; 116 | } 117 | return ret; 118 | } 119 | 120 | 121 | #endif /* TEST_UTILITY_H_ */ 122 | -------------------------------------------------------------------------------- /tools/allgroupspower7.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | LOG=$1_pmc.log 3 | echo "run $1 to get pmc" | tee $LOG 4 | for group in {0..260} 5 | do 6 | ./grouppower7.sh $group $1 2>&1 | tee -a $LOG 7 | done 8 | 
-------------------------------------------------------------------------------- /tools/grouppower7.sh: -------------------------------------------------------------------------------- 1 | #!/bin/ksh 2 | g=$1 3 | shift 4 | echo "************************* group $g" 1>&2 5 | case $g in 6 | 0) perf stat -e r1001E,r200F4,r300F2,r40002,r500fa,r600f4 $*;; 7 | 1) perf stat -e r140A0,r240A2,r340A4,r440AE,r500fa,r600f4 $*;; 8 | 2) perf stat -e r1409C,r240A8,r340A0,r440A2,r500fa,r600f4 $*;; 9 | 3) perf stat -e r10068,r20004,r3409C,r400F6,r500fa,r600f4 $*;; 10 | 4) perf stat -e r140AC,r2409E,r340AE,r440A4,r500fa,r600f4 $*;; 11 | 5) perf stat -e r148AA,r248AE,r3409C,r440A8,r500fa,r600f4 $*;; 12 | 6) perf stat -e r140A0,r240A2,r340A8,r440AA,r500fa,r600f4 $*;; 13 | 7) perf stat -e r140AC,r240A8,r340A0,r440A2,r500fa,r600f4 $*;; 14 | 8) perf stat -e r140AE,r240A8,r340A0,r440A2,r500fa,r600f4 $*;; 15 | 9) perf stat -e r140A4,r240A8,r340A0,r440A2,r500fa,r600f4 $*;; 16 | 10) perf stat -e r100F6,r2D090,r3D092,r4D890,r500fa,r600f4 $*;; 17 | 11) perf stat -e r15088,r20066,r300FC,r400FC,r500fa,r600f4 $*;; 18 | 12) perf stat -e r1C05E,r2C05E,r3C05E,r4C05E,r500fa,r600f4 $*;; 19 | 13) perf stat -e r1C05C,r2C05C,r3C05C,r4C05C,r500fa,r600f4 $*;; 20 | 14) perf stat -e r10002,r2C05C,r3C05C,r4C05C,r500fa,r600f4 $*;; 21 | 15) perf stat -e r1D090,r200FE,r3C05A,r400F0,r500fa,r600f4 $*;; 22 | 16) perf stat -e r1001E,r2C058,r3C05A,r400FA,r500fa,r600f4 $*;; 23 | 17) perf stat -e r1001E,r2C058,r3C05A,r4C058,r500fa,r600f4 $*;; 24 | 18) perf stat -e r1D090,r24048,r30002,r400FA,r500fa,r600f4 $*;; 25 | 19) perf stat -e r100F6,r2D090,r3D092,r40002,r500fa,r600f4 $*;; 26 | 20) perf stat -e r1C050,r2E050,r3C056,r4E054,r500fa,r600f4 $*;; 27 | 21) perf stat -e r1E050,r2E054,r3E054,r4C054,r500fa,r600f4 $*;; 28 | 22) perf stat -e r1C054,r2C058,r3E052,r4C052,r500fa,r600f4 $*;; 29 | 23) perf stat -e r1E052,r2C052,r3C052,r4C052,r500fa,r600f4 $*;; 30 | 24) perf stat -e r1C052,r2C056,r3C054,r4C056,r500fa,r600f4 $*;; 31 | 
25) perf stat -e r1E054,r2E052,r3E056,r4E052,r500fa,r600f4 $*;; 32 | 26) perf stat -e r1E054,r2E056,r3E056,r4E056,r500fa,r600f4 $*;; 33 | 27) perf stat -e r1E050,r2E058,r3E052,r4E058,r500fa,r600f4 $*;; 34 | 28) perf stat -e r1C050,r2C050,r3C052,r4C058,r500fa,r600f4 $*;; 35 | 29) perf stat -e r1C050,r2C050,r30002,r4001E,r500fa,r600f4 $*;; 36 | 30) perf stat -e r1C052,r2C054,r30002,r4C054,r500fa,r600f4 $*;; 37 | 31) perf stat -e r10002,r2C052,r3C052,r4C052,r500fa,r600f4 $*;; 38 | 32) perf stat -e r1006E,r20006,r3000C,r4000C,r500fa,r600f4 $*;; 39 | 33) perf stat -e r1006E,r20006,r30006,r4000C,r500fa,r600f4 $*;; 40 | 34) perf stat -e r1C880,r2C080,r3C082,r4D0A6,r500fa,r600f4 $*;; 41 | 35) perf stat -e r12088,r2208A,r3208C,r400F8,r500fa,r600f4 $*;; 42 | 36) perf stat -e r12086,r22082,r3208E,r4C0AA,r500fa,r600f4 $*;; 43 | 37) perf stat -e r12082,r2001E,r30012,r400F8,r500fa,r600f4 $*;; 44 | 38) perf stat -e r1C8B0,r2C8B4,r3C8B8,r4C8BC,r500fa,r600f4 $*;; 45 | 39) perf stat -e r1C8B0,r2C0B0,r3C0B2,r400F8,r500fa,r600f4 $*;; 46 | 40) perf stat -e r1C8B4,r2C0B4,r3C0B6,r400F8,r500fa,r600f4 $*;; 47 | 41) perf stat -e r1C8B8,r2C0B8,r3C0BA,r400F8,r500fa,r600f4 $*;; 48 | 42) perf stat -e r1C8BC,r2C0BC,r3C0BE,r400F8,r500fa,r600f4 $*;; 49 | 43) perf stat -e r10018,r2408A,r34096,r4408E,r500fa,r600f4 $*;; 50 | 44) perf stat -e r100FA,r2000C,r300F4,r40060,r500fa,r600f4 $*;; 51 | 45) perf stat -e r10012,r2000C,r300F4,r440B0,r500fa,r600f4 $*;; 52 | 46) perf stat -e r10062,r20060,r30060,r440B4,r500fa,r600f4 $*;; 53 | 47) perf stat -e r140B2,r20062,r30062,r40062,r500fa,r600f4 $*;; 54 | 48) perf stat -e r140B0,r240B2,r340B4,r440B6,r500fa,r600f4 $*;; 55 | 49) perf stat -e r10060,r2000C,r300F4,r40060,r500fa,r600f4 $*;; 56 | 50) perf stat -e r1000E,r2000E,r3000E,r4000E,r500fa,r600f4 $*;; 57 | 51) perf stat -e r10004,r200F4,r30002,r40004,r500fa,r600f4 $*;; 58 | 52) perf stat -e r1001E,r2000E,r3000E,r4000E,r500fa,r600f4 $*;; 59 | 53) perf stat -e r1000E,r2000E,r3001E,r40002,r500fa,r600f4 $*;; 60 
| 54) perf stat -e r16280,r26280,r36280,r46282,r500fa,r600f4 $*;; 61 | 55) perf stat -e r16382,r2001E,r36380,r40002,r500fa,r600f4 $*;; 62 | 56) perf stat -e r16280,r26280,r36282,r46280,r500fa,r600f4 $*;; 63 | 57) perf stat -e r16082,r26080,r30002,r4001E,r500fa,r600f4 $*;; 64 | 58) perf stat -e r10002,r2001E,r36182,r46182,r500fa,r600f4 $*;; 65 | 59) perf stat -e r10002,r2001E,r36180,r46180,r500fa,r600f4 $*;; 66 | 60) perf stat -e r16282,r26282,r30002,r4001E,r500fa,r600f4 $*;; 67 | 61) perf stat -e r10081,r20081,r30081,r40081,r500fa,r600f4 $*;; 68 | 62) perf stat -e r10083,r20083,r30083,r40083,r500fa,r600f4 $*;; 69 | 63) perf stat -e r10881,r20881,r30881,r40881,r500fa,r600f4 $*;; 70 | 64) perf stat -e r10883,r20883,r30883,r40883,r500fa,r600f4 $*;; 71 | 65) perf stat -e r14098,r2409A,r34088,r44082,r500fa,r600f4 $*;; 72 | 66) perf stat -e r1C040,r200F2,r300F6,r400F2,r500fa,r600f4 $*;; 73 | 67) perf stat -e r1C048,r2001E,r300F6,r40002,r500fa,r600f4 $*;; 74 | 68) perf stat -e r1C042,r2C044,r300F6,r40002,r500fa,r600f4 $*;; 75 | 69) perf stat -e r10064,r2C0AC,r3C0AE,r4C8AC,r500fa,r600f4 $*;; 76 | 70) perf stat -e r10064,r20064,r3C8A8,r40008,r500fa,r600f4 $*;; 77 | 71) perf stat -e r1C8A8,r2C0A8,r3001E,r40002,r500fa,r600f4 $*;; 78 | 72) perf stat -e r1C8A4,r2C0A4,r3C0A6,r40002,r500fa,r600f4 $*;; 79 | 73) perf stat -e r1C88C,r2C08C,r3C08E,r40002,r500fa,r600f4 $*;; 80 | 74) perf stat -e r100F8,r20008,r34086,r4001E,r500fa,r600f4 $*;; 81 | 75) perf stat -e r1209C,r2209E,r320A0,r420A2,r500fa,r600f4 $*;; 82 | 76) perf stat -e r16180,r26182,r30002,r4001E,r500fa,r600f4 $*;; 83 | 77) perf stat -e r16182,r26180,r30002,r4001E,r500fa,r600f4 $*;; 84 | 78) perf stat -e r10006,r20006,r30006,r400F2,r500fa,r600f4 $*;; 85 | 79) perf stat -e r10016,r20006,r30006,r40006,r500fa,r600f4 $*;; 86 | 80) perf stat -e r12092,r22094,r32096,r42098,r500fa,r600f4 $*;; 87 | 81) perf stat -e r1006E,r2006E,r3006E,r4006E,r500fa,r600f4 $*;; 88 | 82) perf stat -e r100F2,r200F2,r3000A,r400F2,r500fa,r600f4 $*;; 
89 | 83) perf stat -e r100F2,r2001E,r30002,r400F2,r500fa,r600f4 $*;; 90 | 84) perf stat -e r14888,r2488C,r34890,r44898,r500fa,r600f4 $*;; 91 | 85) perf stat -e r14090,r24092,r34094,r44890,r500fa,r600f4 $*;; 92 | 86) perf stat -e r100F6,r200FC,r30002,r4001E,r500fa,r600f4 $*;; 93 | 87) perf stat -e r1C040,r20016,r300F6,r40018,r500fa,r600f4 $*;; 94 | 88) perf stat -e r1000E,r20014,r30004,r40014,r500fa,r600f4 $*;; 95 | 89) perf stat -e r10026,r20012,r3001A,r40016,r500fa,r600f4 $*;; 96 | 90) perf stat -e r100F4,r20018,r3003E,r40012,r500fa,r600f4 $*;; 97 | 91) perf stat -e r10028,r2001C,r3003F,r4000A,r500fa,r600f4 $*;; 98 | 92) perf stat -e r1001C,r2003C,r30002,r4001C,r500fa,r600f4 $*;; 99 | 93) perf stat -e r100F8,r2001A,r30014,r4001A,r500fa,r600f4 $*;; 100 | 94) perf stat -e r1C040,r2C040,r3C042,r4C042,r500fa,r600f4 $*;; 101 | 95) perf stat -e r1C048,r2C046,r3C04A,r4C048,r500fa,r600f4 $*;; 102 | 96) perf stat -e r1C04A,r2C048,r3C046,r4C048,r500fa,r600f4 $*;; 103 | 97) perf stat -e r1C044,r2C044,r3C04C,r4C044,r500fa,r600f4 $*;; 104 | 98) perf stat -e r1C04E,r2C042,r3C044,r4C046,r500fa,r600f4 $*;; 105 | 99) perf stat -e r1C042,r2C044,r3C04E,r4C048,r500fa,r600f4 $*;; 106 | 100) perf stat -e r1C04C,r2C048,r3C04C,r4C044,r500fa,r600f4 $*;; 107 | 101) perf stat -e r10002,r2C040,r300FE,r4C042,r500fa,r600f4 $*;; 108 | 102) perf stat -e r1C040,r200FE,r300F6,r400F0,r500fa,r600f4 $*;; 109 | 103) perf stat -e r1C042,r2C044,r3C044,r4C044,r500fa,r600f4 $*;; 110 | 104) perf stat -e r1C040,r200FE,r300FE,r400FA,r500fa,r600f4 $*;; 111 | 105) perf stat -e r1C042,r2C042,r3C042,r4C042,r500fa,r600f4 $*;; 112 | 106) perf stat -e r1C05C,r20002,r3C044,r4C044,r500fa,r600f4 $*;; 113 | 107) perf stat -e r1C04A,r20002,r3C042,r4C042,r500fa,r600f4 $*;; 114 | 108) perf stat -e r1C04A,r20002,r300F6,r4C042,r500fa,r600f4 $*;; 115 | 109) perf stat -e r14040,r24040,r3404A,r44048,r500fa,r600f4 $*;; 116 | 110) perf stat -e r14048,r24042,r3404C,r44042,r500fa,r600f4 $*;; 117 | 111) perf stat -e 
r1404A,r24048,r34044,r44044,r500fa,r600f4 $*;; 118 | 112) perf stat -e r14044,r24046,r34046,r44046,r500fa,r600f4 $*;; 119 | 113) perf stat -e r1404E,r24044,r3404E,r44048,r500fa,r600f4 $*;; 120 | 114) perf stat -e r14046,r24048,r3404A,r44048,r500fa,r600f4 $*;; 121 | 115) perf stat -e r14042,r24044,r34044,r44044,r500fa,r600f4 $*;; 122 | 116) perf stat -e r1404C,r24048,r3404A,r44048,r500fa,r600f4 $*;; 123 | 117) perf stat -e r14046,r24042,r34042,r44042,r500fa,r600f4 $*;; 124 | 118) perf stat -e r14040,r24040,r30002,r4001E,r500fa,r600f4 $*;; 125 | 119) perf stat -e r14042,r24044,r3404A,r40002,r500fa,r600f4 $*;; 126 | 120) perf stat -e r1001E,r20002,r34044,r44044,r500fa,r600f4 $*;; 127 | 121) perf stat -e r1404A,r20002,r34042,r44042,r500fa,r600f4 $*;; 128 | 122) perf stat -e r1D8A8,r2D8AC,r3D8B4,r4D8B8,r500fa,r600f4 $*;; 129 | 123) perf stat -e r1D8BC,r2C880,r30066,r400F0,r500fa,r600f4 $*;; 130 | 124) perf stat -e r1A080,r2A082,r3A098,r4A09A,r500fa,r600f4 $*;; 131 | 125) perf stat -e r1A09C,r2A09E,r3A0A0,r4A0A2,r500fa,r600f4 $*;; 132 | 126) perf stat -e r1A898,r2A88C,r3A08C,r4A08E,r500fa,r600f4 $*;; 133 | 127) perf stat -e r1A084,r2A086,r3A884,r40002,r500fa,r600f4 $*;; 134 | 128) perf stat -e r1A090,r2A092,r3A890,r40002,r500fa,r600f4 $*;; 135 | 129) perf stat -e r1B880,r2B080,r3B082,r40002,r500fa,r600f4 $*;; 136 | 130) perf stat -e r1A8AC,r2A0AC,r3A0AE,r40002,r500fa,r600f4 $*;; 137 | 131) perf stat -e r1A8BC,r2A0BC,r3A0BE,r40002,r500fa,r600f4 $*;; 138 | 132) perf stat -e r1B88C,r2B08C,r3B08E,r40002,r500fa,r600f4 $*;; 139 | 133) perf stat -e r1A8A8,r2A0A8,r3A0AA,r4A0A4,r500fa,r600f4 $*;; 140 | 134) perf stat -e r1A888,r2A088,r3A08A,r40002,r500fa,r600f4 $*;; 141 | 135) perf stat -e r1A894,r2A094,r3A096,r40002,r500fa,r600f4 $*;; 142 | 136) perf stat -e r1B888,r2B088,r3B08A,r40002,r500fa,r600f4 $*;; 143 | 137) perf stat -e r1B884,r2B084,r3B086,r40002,r500fa,r600f4 $*;; 144 | 138) perf stat -e r1A880,r2A89C,r3A8A0,r4A898,r500fa,r600f4 $*;; 145 | 139) perf stat -e 
r1B890,r2B090,r3B09C,r40002,r500fa,r600f4 $*;; 146 | 140) perf stat -e r1B894,r2B094,r3B096,r4B0A0,r500fa,r600f4 $*;; 147 | 141) perf stat -e r1B098,r2B09A,r3B092,r4B09E,r500fa,r600f4 $*;; 148 | 142) perf stat -e r1A8B0,r2A0B0,r3A0B2,r40002,r500fa,r600f4 $*;; 149 | 143) perf stat -e r1A8B4,r2A0B4,r3A0B6,r40002,r500fa,r600f4 $*;; 150 | 144) perf stat -e r1A8B8,r2A0B8,r3A0BA,r40002,r500fa,r600f4 $*;; 151 | 145) perf stat -e r10068,r200F4,r30002,r4A8BC,r500fa,r600f4 $*;; 152 | 146) perf stat -e r1C884,r2B88C,r3A884,r4A880,r500fa,r600f4 $*;; 153 | 147) perf stat -e r1A888,r2A8BC,r3A884,r4A880,r500fa,r600f4 $*;; 154 | 148) perf stat -e r100F4,r2A8BC,r3A8B8,r4A880,r500fa,r600f4 $*;; 155 | 149) perf stat -e r1B88C,r2A8BC,r3A8B4,r4A8B0,r500fa,r600f4 $*;; 156 | 150) perf stat -e r1D0A4,r2003E,r3001C,r40008,r500fa,r600f4 $*;; 157 | 151) perf stat -e r10066,r2C090,r30066,r4208E,r500fa,r600f4 $*;; 158 | 152) perf stat -e r1D098,r2D09A,r3D0A0,r4D0A4,r500fa,r600f4 $*;; 159 | 153) perf stat -e r1C8A0,r2C0A0,r3C0A2,r40002,r500fa,r600f4 $*;; 160 | 154) perf stat -e r1D096,r2D097,r3D09C,r40002,r500fa,r600f4 $*;; 161 | 155) perf stat -e r1D09C,r2D09E,r3D0A0,r40002,r500fa,r600f4 $*;; 162 | 156) perf stat -e r1D0A1,r2D09F,r3D09D,r40002,r500fa,r600f4 $*;; 163 | 157) perf stat -e r1D8B8,r2D0B8,r3D0BA,r40002,r500fa,r600f4 $*;; 164 | 158) perf stat -e r16480,r26480,r3001E,r40002,r500fa,r600f4 $*;; 165 | 159) perf stat -e r16482,r26482,r3001E,r40002,r500fa,r600f4 $*;; 166 | 160) perf stat -e r100F0,r24080,r30016,r40002,r500fa,r600f4 $*;; 167 | 161) perf stat -e r12080,r200F8,r300F8,r4001E,r500fa,r600f4 $*;; 168 | 162) perf stat -e r100F2,r2000A,r300F2,r400F2,r500fa,r600f4 $*;; 169 | 163) perf stat -e r1000C,r2001A,r3001E,r4001C,r500fa,r600f4 $*;; 170 | 164) perf stat -e r1000A,r248AE,r340A4,r400F6,r500fa,r600f4 $*;; 171 | 165) perf stat -e r1408C,r2408E,r3488C,r40002,r500fa,r600f4 $*;; 172 | 166) perf stat -e r10038,r2000A,r3001E,r40066,r500fa,r600f4 $*;; 173 | 167) perf stat -e 
r140A6,r200F8,r300F6,r400F6,r500fa,r600f4 $*;; 174 | 168) perf stat -e r12084,r22086,r3C0A8,r400F6,r500fa,r600f4 $*;; 175 | 169) perf stat -e r1001A,r2D8A8,r3D8B8,r44084,r500fa,r600f4 $*;; 176 | 170) perf stat -e r100F4,r2001E,r30004,r40002,r500fa,r600f4 $*;; 177 | 171) perf stat -e r10002,r200F0,r300F8,r400F8,r500fa,r600f4 $*;; 178 | 172) perf stat -e r100F8,r200F0,r300FC,r400F6,r500fa,r600f4 $*;; 179 | 173) perf stat -e r1001E,r2001E,r30002,r40066,r500fa,r600f4 $*;; 180 | 174) perf stat -e r1D0A2,r2004A,r300F6,r4004A,r500fa,r600f4 $*;; 181 | 175) perf stat -e r10028,r2C09C,r3C09E,r4004C,r500fa,r600f4 $*;; 182 | 176) perf stat -e r10068,r200F0,r3D054,r4004E,r500fa,r600f4 $*;; 183 | 177) perf stat -e r10000,r2001E,r3D094,r40002,r500fa,r600f4 $*;; 184 | 178) perf stat -e r10014,r2001E,r30014,r40002,r500fa,r600f4 $*;; 185 | 179) perf stat -e r1D094,r2001E,r3209A,r40002,r500fa,r600f4 $*;; 186 | 180) perf stat -e r1001E,r228A4,r320A4,r420A6,r500fa,r600f4 $*;; 187 | 181) perf stat -e r1F080,r2F080,r3F080,r4F080,r500fa,r600f4 $*;; 188 | 182) perf stat -e r15080,r25082,r35084,r45086,r500fa,r600f4 $*;; 189 | 183) perf stat -e r1D0AC,r2D0AE,r3D8AC,r4D8B8,r500fa,r600f4 $*;; 190 | 184) perf stat -e r1F082,r2F082,r3F082,r4F082,r500fa,r600f4 $*;; 191 | 185) perf stat -e r1001E,r2D8B4,r3D0B4,r4D0B6,r500fa,r600f4 $*;; 192 | 186) perf stat -e r1001E,r2D8BC,r3D0BC,r4D0BE,r500fa,r600f4 $*;; 193 | 187) perf stat -e r1D0B0,r2D8A8,r3D0A8,r4D0AA,r500fa,r600f4 $*;; 194 | 188) perf stat -e r1C094,r2C096,r3001E,r4C894,r500fa,r600f4 $*;; 195 | 189) perf stat -e r1001E,r2C884,r3C084,r4C086,r500fa,r600f4 $*;; 196 | 190) perf stat -e r1001E,r2C888,r3C088,r4C08A,r500fa,r600f4 $*;; 197 | 191) perf stat -e r16080,r26082,r3F080,r4001E,r500fa,r600f4 $*;; 198 | 192) perf stat -e r1C894,r2C8AC,r3C098,r4C09A,r500fa,r600f4 $*;; 199 | 193) perf stat -e r1508A,r25088,r3C098,r4C09A,r500fa,r600f4 $*;; 200 | 194) perf stat -e r140B8,r240BA,r3001E,r40002,r500fa,r600f4 $*;; 201 | 195) perf stat -e 
r100F0,r200F2,r30016,r40002,r500fa,r600f4 $*;; 202 | 196) perf stat -e r16880,r26880,r36082,r46080,r500fa,r600f4 $*;; 203 | 197) perf stat -e r10002,r2001E,r36080,r46080,r500fa,r600f4 $*;; 204 | 198) perf stat -e r10002,r2001E,r36482,r400FA,r500fa,r600f4 $*;; 205 | 199) perf stat -e r10002,r2001E,r36382,r46382,r500fa,r600f4 $*;; 206 | 200) perf stat -e r10002,r2001E,r36480,r400FA,r500fa,r600f4 $*;; 207 | 201) perf stat -e r10002,r200F4,r3001E,r46380,r500fa,r600f4 $*;; 208 | 202) perf stat -e r100F6,r240BC,r340BE,r40002,r500fa,r600f4 $*;; 209 | 203) perf stat -e r12090,r220A8,r3001E,r40002,r500fa,r600f4 $*;; 210 | 204) perf stat -e r1001E,r20006,r30008,r40002,r500fa,r600f4 $*;; 211 | 205) perf stat -e r10008,r200F4,r3001E,r400F4,r500fa,r600f4 $*;; 212 | 206) perf stat -e r10010,r20010,r30010,r40010,r500fa,r600f4 $*;; 213 | 207) perf stat -e r10024,r20010,r30024,r40010,r500fa,r600f4 $*;; 214 | 208) perf stat -e r10020,r200F4,r30020,r40002,r500fa,r600f4 $*;; 215 | 209) perf stat -e r10022,r200F4,r30022,r40002,r500fa,r600f4 $*;; 216 | 210) perf stat -e r1208A,r22096,r3D0B2,r40002,r500fa,r600f4 $*;; 217 | 211) perf stat -e r100F6,r200FC,r300F0,r400F0,r500fa,r600f4 $*;; 218 | 212) perf stat -e r1001E,r200F6,r300FC,r400FC,r500fa,r600f4 $*;; 219 | 213) perf stat -e r100FA,r200F4,r3001E,r400F4,r500fa,r600f4 $*;; 220 | 214) perf stat -e r100F4,r200F4,r3001E,r400FA,r500fa,r600f4 $*;; 221 | 215) perf stat -e r100F2,r200F4,r300F2,r400F2,r500fa,r600f4 $*;; 222 | 216) perf stat -e r10002,r200F0,r300F0,r400F0,r500fa,r600f4 $*;; 223 | 217) perf stat -e r10002,r200FE,r300F6,r400F0,r500fa,r600f4 $*;; 224 | 218) perf stat -e r100F6,r200FC,r30002,r400FC,r500fa,r600f4 $*;; 225 | 219) perf stat -e r10000,r20000,r30000,r40000,r500fa,r600f4 $*;; 226 | 220) perf stat -e r10002,r200F8,r300F8,r4001E,r500fa,r600f4 $*;; 227 | 221) perf stat -e r100F0,r200F2,r300F4,r400F8,r500fa,r600f4 $*;; 228 | 222) perf stat -e r100F8,r200F2,r3001E,r400F6,r500fa,r600f4 $*;; 229 | 223) perf stat -e 
r10036,r20036,r30036,r40002,r500fa,r600f4 $*;; 230 | 224) perf stat -e r1D04A,r2002E,r30002,r4D048,r500fa,r600f4 $*;; 231 | 225) perf stat -e r1003E,r20002,r3D046,r40024,r500fa,r600f4 $*;; 232 | 226) perf stat -e r1D048,r2D048,r30002,r40020,r500fa,r600f4 $*;; 233 | 227) perf stat -e r10002,r2002C,r3D04A,r4C042,r500fa,r600f4 $*;; 234 | 228) perf stat -e r1D044,r20002,r30030,r40026,r500fa,r600f4 $*;; 235 | 229) perf stat -e r1003F,r20024,r3D04E,r40002,r500fa,r600f4 $*;; 236 | 230) perf stat -e r1D040,r20020,r30002,r4D048,r500fa,r600f4 $*;; 237 | 231) perf stat -e r1D042,r2D048,r30002,r40028,r500fa,r600f4 $*;; 238 | 232) perf stat -e r10002,r2002A,r3D044,r4D048,r500fa,r600f4 $*;; 239 | 233) perf stat -e r1D04C,r20028,r3C042,r40002,r500fa,r600f4 $*;; 240 | 234) perf stat -e r1003E,r20002,r3D042,r4002C,r500fa,r600f4 $*;; 241 | 235) perf stat -e r1D04E,r20026,r30030,r40002,r500fa,r600f4 $*;; 242 | 236) perf stat -e r1003F,r20002,r3D04C,r4002A,r500fa,r600f4 $*;; 243 | 237) perf stat -e r1D084,r2D086,r30002,r4001E,r500fa,r600f4 $*;; 244 | 238) perf stat -e r10002,r2001E,r3D088,r4D08A,r500fa,r600f4 $*;; 245 | 239) perf stat -e r1D082,r2D08C,r30002,r40064,r500fa,r600f4 $*;; 246 | 240) perf stat -e r10032,r20030,r30030,r40002,r500fa,r600f4 $*;; 247 | 241) perf stat -e r10034,r20034,r30034,r40002,r500fa,r600f4 $*;; 248 | 242) perf stat -e r10002,r2D05E,r3D05E,r4D05E,r500fa,r600f4 $*;; 249 | 243) perf stat -e r1D05E,r2D05E,r3D05E,r40002,r500fa,r600f4 $*;; 250 | 244) perf stat -e r10002,r2D05C,r3D05C,r4D05C,r500fa,r600f4 $*;; 251 | 245) perf stat -e r1D05C,r2D05C,r3D05C,r40002,r500fa,r600f4 $*;; 252 | 246) perf stat -e r1003E,r20002,r3D05A,r4003E,r500fa,r600f4 $*;; 253 | 247) perf stat -e r10002,r2D052,r3D056,r4D056,r500fa,r600f4 $*;; 254 | 248) perf stat -e r1D050,r2D054,r3D052,r40002,r500fa,r600f4 $*;; 255 | 249) perf stat -e r10002,r2D056,r3D056,r4D054,r500fa,r600f4 $*;; 256 | 250) perf stat -e r1D054,r2D050,r30002,r4D058,r500fa,r600f4 $*;; 257 | 251) perf stat -e 
r1D052,r2D058,r30002,r4D052,r500fa,r600f4 $*;; 258 | 252) perf stat -e r1D08E,r20002,r3003A,r40034,r500fa,r600f4 $*;; 259 | 253) perf stat -e r10002,r20038,r3003A,r40032,r500fa,r600f4 $*;; 260 | 254) perf stat -e r10002,r2003A,r3D080,r40032,r500fa,r600f4 $*;; 261 | 255) perf stat -e r1003C,r20002,r30032,r40038,r500fa,r600f4 $*;; 262 | 256) perf stat -e r1003D,r20032,r3003F,r40002,r500fa,r600f4 $*;; 263 | 257) perf stat -e r10030,r200F4,r30002,r40030,r500fa,r600f4 $*;; 264 | 258) perf stat -e r1D082,r20002,r30064,r40064,r500fa,r600f4 $*;; 265 | 259) perf stat -e r1001E,r2001E,r30002,r40032,r500fa,r600f4 $*;; 266 | 260) perf stat -e r1D040,r20020,r3D0A2,r4000A,r500fa,r600f4 $*;; 267 | *) echo "GROUP NOT FOUND $g";; 268 | esac 269 | 270 | $(dirname $0)/groupnamepower7.sh $g 271 | -------------------------------------------------------------------------------- /tools/p7.sh: -------------------------------------------------------------------------------- 1 | #!/bin/ksh 2 | g=$1 3 | shift 4 | p=$1 5 | shift 6 | echo "************************* group $g" 1>&2 7 | case $g in 8 | 0) perf stat -p $p -e r1001E,r200F4,r300F2,r40002,r500fa,r600f4 $*;; 9 | 1) perf stat -p $p -e r140A0,r240A2,r340A4,r440AE,r500fa,r600f4 $*;; 10 | 2) perf stat -p $p -e r1409C,r240A8,r340A0,r440A2,r500fa,r600f4 $*;; 11 | 3) perf stat -p $p -e r10068,r20004,r3409C,r400F6,r500fa,r600f4 $*;; 12 | 4) perf stat -p $p -e r140AC,r2409E,r340AE,r440A4,r500fa,r600f4 $*;; 13 | 5) perf stat -p $p -e r148AA,r248AE,r3409C,r440A8,r500fa,r600f4 $*;; 14 | 6) perf stat -p $p -e r140A0,r240A2,r340A8,r440AA,r500fa,r600f4 $*;; 15 | 7) perf stat -p $p -e r140AC,r240A8,r340A0,r440A2,r500fa,r600f4 $*;; 16 | 8) perf stat -p $p -e r140AE,r240A8,r340A0,r440A2,r500fa,r600f4 $*;; 17 | 9) perf stat -p $p -e r140A4,r240A8,r340A0,r440A2,r500fa,r600f4 $*;; 18 | 10) perf stat -p $p -e r100F6,r2D090,r3D092,r4D890,r500fa,r600f4 $*;; 19 | 11) perf stat -p $p -e 
r15088,r20066,r300FC,r400FC,r500fa,r600f4 $*;; 20 | 12) perf stat -p $p -e r1C05E,r2C05E,r3C05E,r4C05E,r500fa,r600f4 $*;; 21 | 13) perf stat -p $p -e r1C05C,r2C05C,r3C05C,r4C05C,r500fa,r600f4 $*;; 22 | 14) perf stat -p $p -e r10002,r2C05C,r3C05C,r4C05C,r500fa,r600f4 $*;; 23 | 15) perf stat -p $p -e r1D090,r200FE,r3C05A,r400F0,r500fa,r600f4 $*;; 24 | 16) perf stat -p $p -e r1001E,r2C058,r3C05A,r400FA,r500fa,r600f4 $*;; 25 | 17) perf stat -p $p -e r1001E,r2C058,r3C05A,r4C058,r500fa,r600f4 $*;; 26 | 18) perf stat -p $p -e r1D090,r24048,r30002,r400FA,r500fa,r600f4 $*;; 27 | 19) perf stat -p $p -e r100F6,r2D090,r3D092,r40002,r500fa,r600f4 $*;; 28 | 20) perf stat -p $p -e r1C050,r2E050,r3C056,r4E054,r500fa,r600f4 $*;; 29 | 21) perf stat -p $p -e r1E050,r2E054,r3E054,r4C054,r500fa,r600f4 $*;; 30 | 22) perf stat -p $p -e r1C054,r2C058,r3E052,r4C052,r500fa,r600f4 $*;; 31 | 23) perf stat -p $p -e r1E052,r2C052,r3C052,r4C052,r500fa,r600f4 $*;; 32 | 24) perf stat -p $p -e r1C052,r2C056,r3C054,r4C056,r500fa,r600f4 $*;; 33 | 25) perf stat -p $p -e r1E054,r2E052,r3E056,r4E052,r500fa,r600f4 $*;; 34 | 26) perf stat -p $p -e r1E054,r2E056,r3E056,r4E056,r500fa,r600f4 $*;; 35 | 27) perf stat -p $p -e r1E050,r2E058,r3E052,r4E058,r500fa,r600f4 $*;; 36 | 28) perf stat -p $p -e r1C050,r2C050,r3C052,r4C058,r500fa,r600f4 $*;; 37 | 29) perf stat -p $p -e r1C050,r2C050,r30002,r4001E,r500fa,r600f4 $*;; 38 | 30) perf stat -p $p -e r1C052,r2C054,r30002,r4C054,r500fa,r600f4 $*;; 39 | 31) perf stat -p $p -e r10002,r2C052,r3C052,r4C052,r500fa,r600f4 $*;; 40 | 32) perf stat -p $p -e r1006E,r20006,r3000C,r4000C,r500fa,r600f4 $*;; 41 | 33) perf stat -p $p -e r1006E,r20006,r30006,r4000C,r500fa,r600f4 $*;; 42 | 34) perf stat -p $p -e r1C880,r2C080,r3C082,r4D0A6,r500fa,r600f4 $*;; 43 | 35) perf stat -p $p -e r12088,r2208A,r3208C,r400F8,r500fa,r600f4 $*;; 44 | 36) perf stat -p $p -e r12086,r22082,r3208E,r4C0AA,r500fa,r600f4 $*;; 45 | 37) perf stat -p $p -e r12082,r2001E,r30012,r400F8,r500fa,r600f4 $*;; 46 
| 38) perf stat -p $p -e r1C8B0,r2C8B4,r3C8B8,r4C8BC,r500fa,r600f4 $*;; 47 | 39) perf stat -p $p -e r1C8B0,r2C0B0,r3C0B2,r400F8,r500fa,r600f4 $*;; 48 | 40) perf stat -p $p -e r1C8B4,r2C0B4,r3C0B6,r400F8,r500fa,r600f4 $*;; 49 | 41) perf stat -p $p -e r1C8B8,r2C0B8,r3C0BA,r400F8,r500fa,r600f4 $*;; 50 | 42) perf stat -p $p -e r1C8BC,r2C0BC,r3C0BE,r400F8,r500fa,r600f4 $*;; 51 | 43) perf stat -p $p -e r10018,r2408A,r34096,r4408E,r500fa,r600f4 $*;; 52 | 44) perf stat -p $p -e r100FA,r2000C,r300F4,r40060,r500fa,r600f4 $*;; 53 | 45) perf stat -p $p -e r10012,r2000C,r300F4,r440B0,r500fa,r600f4 $*;; 54 | 46) perf stat -p $p -e r10062,r20060,r30060,r440B4,r500fa,r600f4 $*;; 55 | 47) perf stat -p $p -e r140B2,r20062,r30062,r40062,r500fa,r600f4 $*;; 56 | 48) perf stat -p $p -e r140B0,r240B2,r340B4,r440B6,r500fa,r600f4 $*;; 57 | 49) perf stat -p $p -e r10060,r2000C,r300F4,r40060,r500fa,r600f4 $*;; 58 | 50) perf stat -p $p -e r1000E,r2000E,r3000E,r4000E,r500fa,r600f4 $*;; 59 | 51) perf stat -p $p -e r10004,r200F4,r30002,r40004,r500fa,r600f4 $*;; 60 | 52) perf stat -p $p -e r1001E,r2000E,r3000E,r4000E,r500fa,r600f4 $*;; 61 | 53) perf stat -p $p -e r1000E,r2000E,r3001E,r40002,r500fa,r600f4 $*;; 62 | 54) perf stat -p $p -e r16280,r26280,r36280,r46282,r500fa,r600f4 $*;; 63 | 55) perf stat -p $p -e r16382,r2001E,r36380,r40002,r500fa,r600f4 $*;; 64 | 56) perf stat -p $p -e r16280,r26280,r36282,r46280,r500fa,r600f4 $*;; 65 | 57) perf stat -p $p -e r16082,r26080,r30002,r4001E,r500fa,r600f4 $*;; 66 | 58) perf stat -p $p -e r10002,r2001E,r36182,r46182,r500fa,r600f4 $*;; 67 | 59) perf stat -p $p -e r10002,r2001E,r36180,r46180,r500fa,r600f4 $*;; 68 | 60) perf stat -p $p -e r16282,r26282,r30002,r4001E,r500fa,r600f4 $*;; 69 | 61) perf stat -p $p -e r10081,r20081,r30081,r40081,r500fa,r600f4 $*;; 70 | 62) perf stat -p $p -e r10083,r20083,r30083,r40083,r500fa,r600f4 $*;; 71 | 63) perf stat -p $p -e r10881,r20881,r30881,r40881,r500fa,r600f4 $*;; 72 | 64) perf stat -p $p -e 
r10883,r20883,r30883,r40883,r500fa,r600f4 $*;; 73 | 65) perf stat -p $p -e r14098,r2409A,r34088,r44082,r500fa,r600f4 $*;; 74 | 66) perf stat -p $p -e r1C040,r200F2,r300F6,r400F2,r500fa,r600f4 $*;; 75 | 67) perf stat -p $p -e r1C048,r2001E,r300F6,r40002,r500fa,r600f4 $*;; 76 | 68) perf stat -p $p -e r1C042,r2C044,r300F6,r40002,r500fa,r600f4 $*;; 77 | 69) perf stat -p $p -e r10064,r2C0AC,r3C0AE,r4C8AC,r500fa,r600f4 $*;; 78 | 70) perf stat -p $p -e r10064,r20064,r3C8A8,r40008,r500fa,r600f4 $*;; 79 | 71) perf stat -p $p -e r1C8A8,r2C0A8,r3001E,r40002,r500fa,r600f4 $*;; 80 | 72) perf stat -p $p -e r1C8A4,r2C0A4,r3C0A6,r40002,r500fa,r600f4 $*;; 81 | 73) perf stat -p $p -e r1C88C,r2C08C,r3C08E,r40002,r500fa,r600f4 $*;; 82 | 74) perf stat -p $p -e r100F8,r20008,r34086,r4001E,r500fa,r600f4 $*;; 83 | 75) perf stat -p $p -e r1209C,r2209E,r320A0,r420A2,r500fa,r600f4 $*;; 84 | 76) perf stat -p $p -e r16180,r26182,r30002,r4001E,r500fa,r600f4 $*;; 85 | 77) perf stat -p $p -e r16182,r26180,r30002,r4001E,r500fa,r600f4 $*;; 86 | 78) perf stat -p $p -e r10006,r20006,r30006,r400F2,r500fa,r600f4 $*;; 87 | 79) perf stat -p $p -e r10016,r20006,r30006,r40006,r500fa,r600f4 $*;; 88 | 80) perf stat -p $p -e r12092,r22094,r32096,r42098,r500fa,r600f4 $*;; 89 | 81) perf stat -p $p -e r1006E,r2006E,r3006E,r4006E,r500fa,r600f4 $*;; 90 | 82) perf stat -p $p -e r100F2,r200F2,r3000A,r400F2,r500fa,r600f4 $*;; 91 | 83) perf stat -p $p -e r100F2,r2001E,r30002,r400F2,r500fa,r600f4 $*;; 92 | 84) perf stat -p $p -e r14888,r2488C,r34890,r44898,r500fa,r600f4 $*;; 93 | 85) perf stat -p $p -e r14090,r24092,r34094,r44890,r500fa,r600f4 $*;; 94 | 86) perf stat -p $p -e r100F6,r200FC,r30002,r4001E,r500fa,r600f4 $*;; 95 | 87) perf stat -p $p -e r1C040,r20016,r300F6,r40018,r500fa,r600f4 $*;; 96 | 88) perf stat -p $p -e r1000E,r20014,r30004,r40014,r500fa,r600f4 $*;; 97 | 89) perf stat -p $p -e r10026,r20012,r3001A,r40016,r500fa,r600f4 $*;; 98 | 90) perf stat -p $p -e r100F4,r20018,r3003E,r40012,r500fa,r600f4 $*;; 99 
| 91) perf stat -p $p -e r10028,r2001C,r3003F,r4000A,r500fa,r600f4 $*;; 100 | 92) perf stat -p $p -e r1001C,r2003C,r30002,r4001C,r500fa,r600f4 $*;; 101 | 93) perf stat -p $p -e r100F8,r2001A,r30014,r4001A,r500fa,r600f4 $*;; 102 | 94) perf stat -p $p -e r1C040,r2C040,r3C042,r4C042,r500fa,r600f4 $*;; 103 | 95) perf stat -p $p -e r1C048,r2C046,r3C04A,r4C048,r500fa,r600f4 $*;; 104 | 96) perf stat -p $p -e r1C04A,r2C048,r3C046,r4C048,r500fa,r600f4 $*;; 105 | 97) perf stat -p $p -e r1C044,r2C044,r3C04C,r4C044,r500fa,r600f4 $*;; 106 | 98) perf stat -p $p -e r1C04E,r2C042,r3C044,r4C046,r500fa,r600f4 $*;; 107 | 99) perf stat -p $p -e r1C042,r2C044,r3C04E,r4C048,r500fa,r600f4 $*;; 108 | 100) perf stat -p $p -e r1C04C,r2C048,r3C04C,r4C044,r500fa,r600f4 $*;; 109 | 101) perf stat -p $p -e r10002,r2C040,r300FE,r4C042,r500fa,r600f4 $*;; 110 | 102) perf stat -p $p -e r1C040,r200FE,r300F6,r400F0,r500fa,r600f4 $*;; 111 | 103) perf stat -p $p -e r1C042,r2C044,r3C044,r4C044,r500fa,r600f4 $*;; 112 | 104) perf stat -p $p -e r1C040,r200FE,r300FE,r400FA,r500fa,r600f4 $*;; 113 | 105) perf stat -p $p -e r1C042,r2C042,r3C042,r4C042,r500fa,r600f4 $*;; 114 | 106) perf stat -p $p -e r1C05C,r20002,r3C044,r4C044,r500fa,r600f4 $*;; 115 | 107) perf stat -p $p -e r1C04A,r20002,r3C042,r4C042,r500fa,r600f4 $*;; 116 | 108) perf stat -p $p -e r1C04A,r20002,r300F6,r4C042,r500fa,r600f4 $*;; 117 | 109) perf stat -p $p -e r14040,r24040,r3404A,r44048,r500fa,r600f4 $*;; 118 | 110) perf stat -p $p -e r14048,r24042,r3404C,r44042,r500fa,r600f4 $*;; 119 | 111) perf stat -p $p -e r1404A,r24048,r34044,r44044,r500fa,r600f4 $*;; 120 | 112) perf stat -p $p -e r14044,r24046,r34046,r44046,r500fa,r600f4 $*;; 121 | 113) perf stat -p $p -e r1404E,r24044,r3404E,r44048,r500fa,r600f4 $*;; 122 | 114) perf stat -p $p -e r14046,r24048,r3404A,r44048,r500fa,r600f4 $*;; 123 | 115) perf stat -p $p -e r14042,r24044,r34044,r44044,r500fa,r600f4 $*;; 124 | 116) perf stat -p $p -e r1404C,r24048,r3404A,r44048,r500fa,r600f4 $*;; 125 | 117) 
perf stat -p $p -e r14046,r24042,r34042,r44042,r500fa,r600f4 $*;; 126 | 118) perf stat -p $p -e r14040,r24040,r30002,r4001E,r500fa,r600f4 $*;; 127 | 119) perf stat -p $p -e r14042,r24044,r3404A,r40002,r500fa,r600f4 $*;; 128 | 120) perf stat -p $p -e r1001E,r20002,r34044,r44044,r500fa,r600f4 $*;; 129 | 121) perf stat -p $p -e r1404A,r20002,r34042,r44042,r500fa,r600f4 $*;; 130 | 122) perf stat -p $p -e r1D8A8,r2D8AC,r3D8B4,r4D8B8,r500fa,r600f4 $*;; 131 | 123) perf stat -p $p -e r1D8BC,r2C880,r30066,r400F0,r500fa,r600f4 $*;; 132 | 124) perf stat -p $p -e r1A080,r2A082,r3A098,r4A09A,r500fa,r600f4 $*;; 133 | 125) perf stat -p $p -e r1A09C,r2A09E,r3A0A0,r4A0A2,r500fa,r600f4 $*;; 134 | 126) perf stat -p $p -e r1A898,r2A88C,r3A08C,r4A08E,r500fa,r600f4 $*;; 135 | 127) perf stat -p $p -e r1A084,r2A086,r3A884,r40002,r500fa,r600f4 $*;; 136 | 128) perf stat -p $p -e r1A090,r2A092,r3A890,r40002,r500fa,r600f4 $*;; 137 | 129) perf stat -p $p -e r1B880,r2B080,r3B082,r40002,r500fa,r600f4 $*;; 138 | 130) perf stat -p $p -e r1A8AC,r2A0AC,r3A0AE,r40002,r500fa,r600f4 $*;; 139 | 131) perf stat -p $p -e r1A8BC,r2A0BC,r3A0BE,r40002,r500fa,r600f4 $*;; 140 | 132) perf stat -p $p -e r1B88C,r2B08C,r3B08E,r40002,r500fa,r600f4 $*;; 141 | 133) perf stat -p $p -e r1A8A8,r2A0A8,r3A0AA,r4A0A4,r500fa,r600f4 $*;; 142 | 134) perf stat -p $p -e r1A888,r2A088,r3A08A,r40002,r500fa,r600f4 $*;; 143 | 135) perf stat -p $p -e r1A894,r2A094,r3A096,r40002,r500fa,r600f4 $*;; 144 | 136) perf stat -p $p -e r1B888,r2B088,r3B08A,r40002,r500fa,r600f4 $*;; 145 | 137) perf stat -p $p -e r1B884,r2B084,r3B086,r40002,r500fa,r600f4 $*;; 146 | 138) perf stat -p $p -e r1A880,r2A89C,r3A8A0,r4A898,r500fa,r600f4 $*;; 147 | 139) perf stat -p $p -e r1B890,r2B090,r3B09C,r40002,r500fa,r600f4 $*;; 148 | 140) perf stat -p $p -e r1B894,r2B094,r3B096,r4B0A0,r500fa,r600f4 $*;; 149 | 141) perf stat -p $p -e r1B098,r2B09A,r3B092,r4B09E,r500fa,r600f4 $*;; 150 | 142) perf stat -p $p -e r1A8B0,r2A0B0,r3A0B2,r40002,r500fa,r600f4 $*;; 151 | 
143) perf stat -p $p -e r1A8B4,r2A0B4,r3A0B6,r40002,r500fa,r600f4 $*;; 152 | 144) perf stat -p $p -e r1A8B8,r2A0B8,r3A0BA,r40002,r500fa,r600f4 $*;; 153 | 145) perf stat -p $p -e r10068,r200F4,r30002,r4A8BC,r500fa,r600f4 $*;; 154 | 146) perf stat -p $p -e r1C884,r2B88C,r3A884,r4A880,r500fa,r600f4 $*;; 155 | 147) perf stat -p $p -e r1A888,r2A8BC,r3A884,r4A880,r500fa,r600f4 $*;; 156 | 148) perf stat -p $p -e r100F4,r2A8BC,r3A8B8,r4A880,r500fa,r600f4 $*;; 157 | 149) perf stat -p $p -e r1B88C,r2A8BC,r3A8B4,r4A8B0,r500fa,r600f4 $*;; 158 | 150) perf stat -p $p -e r1D0A4,r2003E,r3001C,r40008,r500fa,r600f4 $*;; 159 | 151) perf stat -p $p -e r10066,r2C090,r30066,r4208E,r500fa,r600f4 $*;; 160 | 152) perf stat -p $p -e r1D098,r2D09A,r3D0A0,r4D0A4,r500fa,r600f4 $*;; 161 | 153) perf stat -p $p -e r1C8A0,r2C0A0,r3C0A2,r40002,r500fa,r600f4 $*;; 162 | 154) perf stat -p $p -e r1D096,r2D097,r3D09C,r40002,r500fa,r600f4 $*;; 163 | 155) perf stat -p $p -e r1D09C,r2D09E,r3D0A0,r40002,r500fa,r600f4 $*;; 164 | 156) perf stat -p $p -e r1D0A1,r2D09F,r3D09D,r40002,r500fa,r600f4 $*;; 165 | 157) perf stat -p $p -e r1D8B8,r2D0B8,r3D0BA,r40002,r500fa,r600f4 $*;; 166 | 158) perf stat -p $p -e r16480,r26480,r3001E,r40002,r500fa,r600f4 $*;; 167 | 159) perf stat -p $p -e r16482,r26482,r3001E,r40002,r500fa,r600f4 $*;; 168 | 160) perf stat -p $p -e r100F0,r24080,r30016,r40002,r500fa,r600f4 $*;; 169 | 161) perf stat -p $p -e r12080,r200F8,r300F8,r4001E,r500fa,r600f4 $*;; 170 | 162) perf stat -p $p -e r100F2,r2000A,r300F2,r400F2,r500fa,r600f4 $*;; 171 | 163) perf stat -p $p -e r1000C,r2001A,r3001E,r4001C,r500fa,r600f4 $*;; 172 | 164) perf stat -p $p -e r1000A,r248AE,r340A4,r400F6,r500fa,r600f4 $*;; 173 | 165) perf stat -p $p -e r1408C,r2408E,r3488C,r40002,r500fa,r600f4 $*;; 174 | 166) perf stat -p $p -e r10038,r2000A,r3001E,r40066,r500fa,r600f4 $*;; 175 | 167) perf stat -p $p -e r140A6,r200F8,r300F6,r400F6,r500fa,r600f4 $*;; 176 | 168) perf stat -p $p -e r12084,r22086,r3C0A8,r400F6,r500fa,r600f4 $*;; 177 
| 169) perf stat -p $p -e r1001A,r2D8A8,r3D8B8,r44084,r500fa,r600f4 $*;; 178 | 170) perf stat -p $p -e r100F4,r2001E,r30004,r40002,r500fa,r600f4 $*;; 179 | 171) perf stat -p $p -e r10002,r200F0,r300F8,r400F8,r500fa,r600f4 $*;; 180 | 172) perf stat -p $p -e r100F8,r200F0,r300FC,r400F6,r500fa,r600f4 $*;; 181 | 173) perf stat -p $p -e r1001E,r2001E,r30002,r40066,r500fa,r600f4 $*;; 182 | 174) perf stat -p $p -e r1D0A2,r2004A,r300F6,r4004A,r500fa,r600f4 $*;; 183 | 175) perf stat -p $p -e r10028,r2C09C,r3C09E,r4004C,r500fa,r600f4 $*;; 184 | 176) perf stat -p $p -e r10068,r200F0,r3D054,r4004E,r500fa,r600f4 $*;; 185 | 177) perf stat -p $p -e r10000,r2001E,r3D094,r40002,r500fa,r600f4 $*;; 186 | 178) perf stat -p $p -e r10014,r2001E,r30014,r40002,r500fa,r600f4 $*;; 187 | 179) perf stat -p $p -e r1D094,r2001E,r3209A,r40002,r500fa,r600f4 $*;; 188 | 180) perf stat -p $p -e r1001E,r228A4,r320A4,r420A6,r500fa,r600f4 $*;; 189 | 181) perf stat -p $p -e r1F080,r2F080,r3F080,r4F080,r500fa,r600f4 $*;; 190 | 182) perf stat -p $p -e r15080,r25082,r35084,r45086,r500fa,r600f4 $*;; 191 | 183) perf stat -p $p -e r1D0AC,r2D0AE,r3D8AC,r4D8B8,r500fa,r600f4 $*;; 192 | 184) perf stat -p $p -e r1F082,r2F082,r3F082,r4F082,r500fa,r600f4 $*;; 193 | 185) perf stat -p $p -e r1001E,r2D8B4,r3D0B4,r4D0B6,r500fa,r600f4 $*;; 194 | 186) perf stat -p $p -e r1001E,r2D8BC,r3D0BC,r4D0BE,r500fa,r600f4 $*;; 195 | 187) perf stat -p $p -e r1D0B0,r2D8A8,r3D0A8,r4D0AA,r500fa,r600f4 $*;; 196 | 188) perf stat -p $p -e r1C094,r2C096,r3001E,r4C894,r500fa,r600f4 $*;; 197 | 189) perf stat -p $p -e r1001E,r2C884,r3C084,r4C086,r500fa,r600f4 $*;; 198 | 190) perf stat -p $p -e r1001E,r2C888,r3C088,r4C08A,r500fa,r600f4 $*;; 199 | 191) perf stat -p $p -e r16080,r26082,r3F080,r4001E,r500fa,r600f4 $*;; 200 | 192) perf stat -p $p -e r1C894,r2C8AC,r3C098,r4C09A,r500fa,r600f4 $*;; 201 | 193) perf stat -p $p -e r1508A,r25088,r3C098,r4C09A,r500fa,r600f4 $*;; 202 | 194) perf stat -p $p -e r140B8,r240BA,r3001E,r40002,r500fa,r600f4 $*;; 
203 | 195) perf stat -p $p -e r100F0,r200F2,r30016,r40002,r500fa,r600f4 $*;; 204 | 196) perf stat -p $p -e r16880,r26880,r36082,r46080,r500fa,r600f4 $*;; 205 | 197) perf stat -p $p -e r10002,r2001E,r36080,r46080,r500fa,r600f4 $*;; 206 | 198) perf stat -p $p -e r10002,r2001E,r36482,r400FA,r500fa,r600f4 $*;; 207 | 199) perf stat -p $p -e r10002,r2001E,r36382,r46382,r500fa,r600f4 $*;; 208 | 200) perf stat -p $p -e r10002,r2001E,r36480,r400FA,r500fa,r600f4 $*;; 209 | 201) perf stat -p $p -e r10002,r200F4,r3001E,r46380,r500fa,r600f4 $*;; 210 | 202) perf stat -p $p -e r100F6,r240BC,r340BE,r40002,r500fa,r600f4 $*;; 211 | 203) perf stat -p $p -e r12090,r220A8,r3001E,r40002,r500fa,r600f4 $*;; 212 | 204) perf stat -p $p -e r1001E,r20006,r30008,r40002,r500fa,r600f4 $*;; 213 | 205) perf stat -p $p -e r10008,r200F4,r3001E,r400F4,r500fa,r600f4 $*;; 214 | 206) perf stat -p $p -e r10010,r20010,r30010,r40010,r500fa,r600f4 $*;; 215 | 207) perf stat -p $p -e r10024,r20010,r30024,r40010,r500fa,r600f4 $*;; 216 | 208) perf stat -p $p -e r10020,r200F4,r30020,r40002,r500fa,r600f4 $*;; 217 | 209) perf stat -p $p -e r10022,r200F4,r30022,r40002,r500fa,r600f4 $*;; 218 | 210) perf stat -p $p -e r1208A,r22096,r3D0B2,r40002,r500fa,r600f4 $*;; 219 | 211) perf stat -p $p -e r100F6,r200FC,r300F0,r400F0,r500fa,r600f4 $*;; 220 | 212) perf stat -p $p -e r1001E,r200F6,r300FC,r400FC,r500fa,r600f4 $*;; 221 | 213) perf stat -p $p -e r100FA,r200F4,r3001E,r400F4,r500fa,r600f4 $*;; 222 | 214) perf stat -p $p -e r100F4,r200F4,r3001E,r400FA,r500fa,r600f4 $*;; 223 | 215) perf stat -p $p -e r100F2,r200F4,r300F2,r400F2,r500fa,r600f4 $*;; 224 | 216) perf stat -p $p -e r10002,r200F0,r300F0,r400F0,r500fa,r600f4 $*;; 225 | 217) perf stat -p $p -e r10002,r200FE,r300F6,r400F0,r500fa,r600f4 $*;; 226 | 218) perf stat -p $p -e r100F6,r200FC,r30002,r400FC,r500fa,r600f4 $*;; 227 | 219) perf stat -p $p -e r10000,r20000,r30000,r40000,r500fa,r600f4 $*;; 228 | 220) perf stat -p $p -e r10002,r200F8,r300F8,r4001E,r500fa,r600f4 
$*;; 229 | 221) perf stat -p $p -e r100F0,r200F2,r300F4,r400F8,r500fa,r600f4 $*;; 230 | 222) perf stat -p $p -e r100F8,r200F2,r3001E,r400F6,r500fa,r600f4 $*;; 231 | 223) perf stat -p $p -e r10036,r20036,r30036,r40002,r500fa,r600f4 $*;; 232 | 224) perf stat -p $p -e r1D04A,r2002E,r30002,r4D048,r500fa,r600f4 $*;; 233 | 225) perf stat -p $p -e r1003E,r20002,r3D046,r40024,r500fa,r600f4 $*;; 234 | 226) perf stat -p $p -e r1D048,r2D048,r30002,r40020,r500fa,r600f4 $*;; 235 | 227) perf stat -p $p -e r10002,r2002C,r3D04A,r4C042,r500fa,r600f4 $*;; 236 | 228) perf stat -p $p -e r1D044,r20002,r30030,r40026,r500fa,r600f4 $*;; 237 | 229) perf stat -p $p -e r1003F,r20024,r3D04E,r40002,r500fa,r600f4 $*;; 238 | 230) perf stat -p $p -e r1D040,r20020,r30002,r4D048,r500fa,r600f4 $*;; 239 | 231) perf stat -p $p -e r1D042,r2D048,r30002,r40028,r500fa,r600f4 $*;; 240 | 232) perf stat -p $p -e r10002,r2002A,r3D044,r4D048,r500fa,r600f4 $*;; 241 | 233) perf stat -p $p -e r1D04C,r20028,r3C042,r40002,r500fa,r600f4 $*;; 242 | 234) perf stat -p $p -e r1003E,r20002,r3D042,r4002C,r500fa,r600f4 $*;; 243 | 235) perf stat -p $p -e r1D04E,r20026,r30030,r40002,r500fa,r600f4 $*;; 244 | 236) perf stat -p $p -e r1003F,r20002,r3D04C,r4002A,r500fa,r600f4 $*;; 245 | 237) perf stat -p $p -e r1D084,r2D086,r30002,r4001E,r500fa,r600f4 $*;; 246 | 238) perf stat -p $p -e r10002,r2001E,r3D088,r4D08A,r500fa,r600f4 $*;; 247 | 239) perf stat -p $p -e r1D082,r2D08C,r30002,r40064,r500fa,r600f4 $*;; 248 | 240) perf stat -p $p -e r10032,r20030,r30030,r40002,r500fa,r600f4 $*;; 249 | 241) perf stat -p $p -e r10034,r20034,r30034,r40002,r500fa,r600f4 $*;; 250 | 242) perf stat -p $p -e r10002,r2D05E,r3D05E,r4D05E,r500fa,r600f4 $*;; 251 | 243) perf stat -p $p -e r1D05E,r2D05E,r3D05E,r40002,r500fa,r600f4 $*;; 252 | 244) perf stat -p $p -e r10002,r2D05C,r3D05C,r4D05C,r500fa,r600f4 $*;; 253 | 245) perf stat -p $p -e r1D05C,r2D05C,r3D05C,r40002,r500fa,r600f4 $*;; 254 | 246) perf stat -p $p -e 
r1003E,r20002,r3D05A,r4003E,r500fa,r600f4 $*;; 255 | 247) perf stat -p $p -e r10002,r2D052,r3D056,r4D056,r500fa,r600f4 $*;; 256 | 248) perf stat -p $p -e r1D050,r2D054,r3D052,r40002,r500fa,r600f4 $*;; 257 | 249) perf stat -p $p -e r10002,r2D056,r3D056,r4D054,r500fa,r600f4 $*;; 258 | 250) perf stat -p $p -e r1D054,r2D050,r30002,r4D058,r500fa,r600f4 $*;; 259 | 251) perf stat -p $p -e r1D052,r2D058,r30002,r4D052,r500fa,r600f4 $*;; 260 | 252) perf stat -p $p -e r1D08E,r20002,r3003A,r40034,r500fa,r600f4 $*;; 261 | 253) perf stat -p $p -e r10002,r20038,r3003A,r40032,r500fa,r600f4 $*;; 262 | 254) perf stat -p $p -e r10002,r2003A,r3D080,r40032,r500fa,r600f4 $*;; 263 | 255) perf stat -p $p -e r1003C,r20002,r30032,r40038,r500fa,r600f4 $*;; 264 | 256) perf stat -p $p -e r1003D,r20032,r3003F,r40002,r500fa,r600f4 $*;; 265 | 257) perf stat -p $p -e r10030,r200F4,r30002,r40030,r500fa,r600f4 $*;; 266 | 258) perf stat -p $p -e r1D082,r20002,r30064,r40064,r500fa,r600f4 $*;; 267 | 259) perf stat -p $p -e r1001E,r2001E,r30002,r40032,r500fa,r600f4 $*;; 268 | 260) perf stat -p $p -e r1D040,r20020,r3D0A2,r4000A,r500fa,r600f4 $*;; 269 | *) echo "GROUP NOT FOUND $g";; 270 | esac 271 | 272 | /gsa/yktgsa/home/h/a/haichuan/workspace/gsimd/examples/RGB2Gray/groupnamepower7.sh $g 273 | --------------------------------------------------------------------------------