├── LICENSE ├── README ├── cl-simd.asd ├── cl-simd.texinfo ├── ecl-sse-core.lisp ├── ecl-sse-utils.lisp ├── sbcl-arrays.lisp ├── sbcl-core.lisp ├── sbcl-functions.lisp ├── sse-array-defs.lisp ├── sse-intrinsics.lisp ├── sse-package.lisp ├── sse-utils.lisp └── test-sfmt.lisp /LICENSE: -------------------------------------------------------------------------------- 1 | (This is the MIT / X Consortium license as taken from 2 | http://www.opensource.org/licenses/mit-license.html on or about 3 | Monday; July 13, 2009) 4 | 5 | Copyright (c) 2010 by Alexander Gavrilov 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining 8 | a copy of this software and associated documentation files (the 9 | "Software"), to deal in the Software without restriction, including 10 | without limitation the rights to use, copy, modify, merge, publish, 11 | distribute, sublicense, and/or sell copies of the Software, and to 12 | permit persons to whom the Software is furnished to do so, subject to 13 | the following conditions: 14 | 15 | The above copyright notice and this permission notice shall be 16 | included in all copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 22 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 23 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 24 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 25 | 26 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | This module implements SSE intrinsic functions for ECL and SBCL. 2 | 3 | NOTE: CURRENTLY THIS SHOULD BE CONSIDERED EXPERIMENTAL, AND 4 | SUBJECT TO INCOMPATIBLE CHANGES IN A FUTURE RELEASE. 5 | 6 | Since the implementation is closely tied to the internals of 7 | the compiler, it should normally be obtained exclusively via 8 | the bundled contrib mechanism of the above implementations. 9 | -------------------------------------------------------------------------------- /cl-simd.asd: -------------------------------------------------------------------------------- 1 | ;;; -*- mode: Lisp; indent-tabs-mode: nil; -*- 2 | ;;; 3 | ;;; Copyright (C) 2010, Alexander Gavrilov (angavrilov@gmail.com) 4 | ;;; 5 | ;;; This file defines the cl-simd ASDF system. 6 | ;;; 7 | ;;; Note that a completely independent definition 8 | ;;; is used to build the system as an ECL contrib. 9 | 10 | (defsystem :cl-simd 11 | :version "1.0" 12 | #+sb-building-contrib :pathname 13 | #+sb-building-contrib #p"SYS:CONTRIB;CL-SIMD;" 14 | :components 15 | #+(and sbcl sb-simd-pack) 16 | ((:file "sse-package") 17 | (:file "sbcl-core" :depends-on ("sse-package")) 18 | (:file "sse-intrinsics" :depends-on ("sbcl-core")) 19 | (:file "sbcl-functions" :depends-on ("sse-intrinsics")) 20 | (:file "sbcl-arrays" :depends-on ("sbcl-functions")) 21 | (:file "sse-array-defs" :depends-on ("sbcl-arrays")) 22 | (:file "sse-utils" :depends-on ("sse-array-defs"))) 23 | #+(and ecl sse2) 24 | ((:file "sse-package") 25 | (:file "ecl-sse-core" :depends-on ("sse-package")) 26 | (:file "sse-intrinsics" :depends-on ("ecl-sse-core")) 27 | (:file "sse-array-defs" :depends-on ("sse-intrinsics")) 28 | (:file "ecl-sse-utils" :depends-on ("sse-intrinsics")) 29 | (:file "sse-utils" :depends-on ("ecl-sse-utils"))) 30 | #-(or (and sbcl sb-simd-pack) 31 | (and ecl sse2)) 32 | ()) 33 | 34 | #+(or (and sbcl sb-simd-pack) 35 | (and ecl sse2)) 36 | (defmethod perform :after ((o load-op) (c (eql (find-system :cl-simd)))) 37 | (provide :cl-simd)) 38 | 39 | (defmethod perform ((o test-op) (c (eql (find-system :cl-simd)))) 40 | #+(or (and sbcl sb-simd-pack) 41 | (and ecl sse2)) 42 | (or (load (compile-file "test-sfmt.lisp")) 43 | (error "test-sfmt failed"))) 44 | 45 | -------------------------------------------------------------------------------- /cl-simd.texinfo: -------------------------------------------------------------------------------- 1 | @node cl-simd 2 | @section cl-simd 3 | @cindex SSE2 Intrinsics 4 | @cindex Intrinsics, SSE2 5 | 6 | The @code{cl-simd} module provides access to SSE2 instructions 7 | (which are nowadays supported by any CPU compatible with x86-64) 8 | in the form of @emph{intrinsic functions}, similar to the way 9 | adopted by modern C compilers. It also provides some lisp-specific 10 | functionality, like setf-able intrinsics for accessing lisp arrays. 11 | 12 | When this module is loaded, it defines an @code{:sse2} feature, 13 | which can be subsequently used for conditional compilation of 14 | code that depends on it. Intrinsic functions are available from 15 | the @code{sse} package. 16 | 17 | This API, with minor technical differences, is supported by both 18 | ECL and SBCL (x86-64 only). 19 | 20 | @menu 21 | * SSE pack types:: 22 | * SSE array type:: 23 | * Differences from C intrinsics:: 24 | * Comparisons and NaN handling:: 25 | * Simple extensions:: 26 | * Lisp array accessors:: 27 | * Example:: 28 | @end menu 29 | 30 | @node SSE pack types 31 | @subsection SSE pack types 32 | 33 | The package defines and/or exports the following types to 34 | represent 128-bit SSE register contents: 35 | 36 | @anchor{Type sse:sse-pack} 37 | @deftp {Type} @somepkg{sse-pack,sse} @&optional item-type 38 | The generic SSE pack type. 39 | @end deftp 40 | 41 | @anchor{Type sse:int-sse-pack} 42 | @deftp {Type} @somepkg{int-sse-pack,sse} 43 | Same as @code{(sse-pack integer)}. 44 | @end deftp 45 | 46 | @anchor{Type sse:float-sse-pack} 47 | @deftp {Type} @somepkg{float-sse-pack,sse} 48 | Same as @code{(sse-pack single-float)}. 49 | @end deftp 50 | 51 | @anchor{Type sse:double-sse-pack} 52 | @deftp {Type} @somepkg{double-sse-pack,sse} 53 | Same as @code{(sse-pack double-float)}. 54 | @end deftp 55 | 56 | Declaring variable types using the subtype appropriate 57 | for your data is likely to lead to more efficient code 58 | (especially on ECL). However, the compiler implicitly 59 | casts between any subtypes of sse-pack when needed. 60 | 61 | Printed representation of SSE packs can be controlled 62 | by binding @code{*sse-pack-print-mode*}: 63 | 64 | @anchor{Variable sse:*sse-pack-print-mode*} 65 | @defvr {Variable} @somepkg{@earmuffs{sse-pack-print-mode},sse} 66 | When set to one of @code{:int}, @code{:float} or 67 | @code{:double}, specifies the way SSE packs are 68 | printed. A @code{NIL} value (default) instructs 69 | the implementation to make its best effort to 70 | guess from the data and context. 71 | @end defvr 72 | 73 | @node SSE array type 74 | @subsection SSE array type 75 | 76 | @anchor{Type sse:sse-array} 77 | @deftp {Type} @somepkg{sse-array,sse} element-type @&optional dimensions 78 | Expands to a lisp array type that is efficiently 79 | supported by AREF-like accessors. 80 | It should be assumed to be a subtype of @code{SIMPLE-ARRAY}. 81 | The type expander signals warnings or errors if it detects 82 | that the element-type argument value is inappropriate or unsafe. 83 | @end deftp 84 | 85 | @anchor{Function sse:make-sse-array} 86 | @deffn {Function} @somepkg{make-sse-array,sse} dimensions @&key element-type initial-element displaced-to displaced-index-offset 87 | Creates an object of type @code{sse-array}, or signals an error. 88 | In non-displaced case ensures alignment of the beginning of data to 89 | the 16-byte boundary. 90 | Unlike @code{make-array}, the element type defaults to (unsigned-byte 8). 91 | @end deffn 92 | 93 | On ECL this function supports full-featured displacement. 94 | On SBCL it has to simulate it by sharing the underlying 95 | data vector, and does not support nonzero index offset. 96 | 97 | @node Differences from C intrinsics 98 | @subsection Differences from C intrinsics 99 | 100 | Intel Compiler, GCC and 101 | @url{http://msdn.microsoft.com/en-us/library/y0dh78ez%28VS.80%29.aspx,MSVC} 102 | all support the same set 103 | of SSE intrinsics, originally designed by Intel. This 104 | package generally follows the naming scheme of the C 105 | version, with the following exceptions: 106 | 107 | @itemize 108 | @item 109 | Underscores are replaced with dashes, and the @code{_mm_} 110 | prefix is removed in favor of packages. 111 | 112 | @item 113 | The 'e' from @code{epi} is dropped because MMX is obsolete 114 | and won't be supported. 115 | 116 | @item 117 | @code{_si128} functions are renamed to @code{-pi} for uniformity 118 | and brevity. The author has personally found this discrepancy 119 | in the original C intrinsics naming highly jarring. 120 | 121 | @item 122 | Comparisons are named using graphic characters, e.g. @code{<=-ps} 123 | for @code{cmpleps}, or @code{/>-ps} for @code{cmpngtps}. In some 124 | places the set of comparison functions is extended to cover the 125 | full possible range. 126 | 127 | @item 128 | Scalar comparison predicates are named like @code{..-ss?} for 129 | @code{comiss}, and @code{..-ssu?} for @code{ucomiss} wrappers. 130 | 131 | @item 132 | Conversion functions are renamed to @code{convert-*-to-*} and 133 | @code{truncate-*-to-*}. 134 | 135 | @item 136 | A few functions are completely renamed: @code{cpu-mxcsr} (setf-able), 137 | @code{cpu-pause}, @code{cpu-load-fence}, @code{cpu-store-fence}, 138 | @code{cpu-memory-fence}, @code{cpu-clflush}, @code{cpu-prefetch-*}. 139 | @end itemize 140 | 141 | In addition, foreign pointer access intrinsics have an additional 142 | optional integer offset parameter to allow more efficient coding 143 | of pointer deference, and the most common ones have been renamed 144 | and made SETF-able: 145 | 146 | @itemize 147 | @item 148 | @code{mem-ref-ss}, @code{mem-ref-ps}, @code{mem-ref-aps} 149 | 150 | @item 151 | @code{mem-ref-sd}, @code{mem-ref-pd}, @code{mem-ref-apd} 152 | 153 | @item 154 | @code{mem-ref-pi}, @code{mem-ref-api}, @code{mem-ref-si64} 155 | @end itemize 156 | 157 | (The @code{-ap*} version requires alignment.) 158 | 159 | @node Comparisons and NaN handling 160 | @subsection Comparisons and NaN handling 161 | 162 | Floating-point arithmetic intrinsics have trivial IEEE semantics 163 | when given QNaN and SNaN arguments. Comparisons have more complex 164 | behavior, detailed in the following table: 165 | 166 | @multitable { @code{/>=-ss, />=-ps} } { @code{/>=-sd, />=-pd} } { Not greater or equal } { Result for NaN } { QNaN traps } 167 | @item Single-float @tab Double-float @tab Condition @tab Result for NaN @tab QNaN traps 168 | @item @code{=-ss}, @code{=-ps} @tab @code{=-sd}, @code{=-pd} @tab Equal @tab False @tab No 169 | @item @code{<-ss}, @code{<-ps} @tab @code{<-sd}, @code{<-pd} @tab Less @tab False @tab Yes 170 | @item @code{<=-ss}, @code{<=-ps} @tab @code{<=-sd}, @code{<=-pd} @tab Less or equal @tab False @tab Yes 171 | @item @code{>-ss}, @code{>-ps} @tab @code{>-sd}, @code{>-pd} @tab Greater @tab False @tab Yes 172 | @item @code{>=-ss}, @code{>=-ps} @tab @code{>=-sd}, @code{>=-pd} @tab Greater or equal @tab False @tab Yes 173 | @item @code{/=-ss}, @code{/=-ps} @tab @code{/=-sd}, @code{/=-pd} @tab Not equal @tab True @tab No 174 | @item @code{/<-ss}, @code{/<-ps} @tab @code{/<-sd}, @code{/<-pd} @tab Not less @tab True @tab Yes 175 | @item @code{/<=-ss}, @code{/<=-ps} @tab @code{/<=-sd}, @code{/<=-pd} @tab Not less or equal @tab True @tab Yes 176 | @item @code{/>-ss}, @code{/>-ps} @tab @code{/>-sd}, @code{/>-pd} @tab Not greater @tab True @tab Yes 177 | @item @code{/>=-ss}, @code{/>=-ps} @tab @code{/>=-sd}, @code{/>=-pd} @tab Not greater or equal @tab True @tab Yes 178 | @item @code{cmpord-ss}, @code{cmpord-ps} @tab @code{cmpord-sd}, @code{cmpord-pd} 179 | @tab Ordered, i.e. no NaN args @tab False @tab No 180 | @item @code{cmpunord-ss}, @code{cmpunord-ps} @tab @code{cmpunord-sd}, @code{cmpunord-pd} 181 | @tab Unordered, i.e. with NaN args @tab True @tab No 182 | @end multitable 183 | 184 | Likewise for scalar comparison predicates, i.e. functions that return the 185 | result of the comparison as a Lisp boolean instead of a bitmask sse-pack: 186 | 187 | @multitable { Single-float } { Double-float } { Not greater or equal } { Result for NaN } { QNaN traps } 188 | @item Single-float @tab Double-float @tab Condition @tab Result for NaN @tab QNaN traps 189 | @item @code{=-ss?} @tab @code{=-sd?} @tab Equal @tab True @tab Yes 190 | @item @code{=-ssu?} @tab @code{=-sdu?} @tab Equal @tab True @tab No 191 | @item @code{<-ss?} @tab @code{<-sd?} @tab Less @tab True @tab Yes 192 | @item @code{<-ssu?} @tab @code{<-sdu?} @tab Less @tab True @tab No 193 | @item @code{<=-ss?} @tab @code{<=-sd?} @tab Less or equal @tab True @tab Yes 194 | @item @code{<=-ssu?} @tab @code{<=-sdu?} @tab Less or equal @tab True @tab No 195 | @item @code{>-ss?} @tab @code{>-sd?} @tab Greater @tab False @tab Yes 196 | @item @code{>-ssu?} @tab @code{>-sdu?} @tab Greater @tab False @tab No 197 | @item @code{>=-ss?} @tab @code{>=-sd?} @tab Greater or equal @tab False @tab Yes 198 | @item @code{>=-ssu?} @tab @code{>=-sdu?} @tab Greater or equal @tab False @tab No 199 | @item @code{/=-ss?} @tab @code{/=-sd?} @tab Not equal @tab False @tab Yes 200 | @item @code{/=-ssu?} @tab @code{/=-sdu?} @tab Not equal @tab False @tab No 201 | @end multitable 202 | 203 | Note that MSDN specifies different return values for the C counterparts of some 204 | of these functions when called with NaN arguments, but that seems to disagree 205 | with the actually generated code. 206 | 207 | @node Simple extensions 208 | @subsection Simple extensions 209 | 210 | This module extends the set of basic intrinsics with the following 211 | simple compound functions: 212 | 213 | @itemize 214 | @item 215 | @code{neg-ss}, @code{neg-ps}, @code{neg-sd}, @code{neg-pd}, 216 | @code{neg-pi8}, @code{neg-pi16}, @code{neg-pi32}, @code{neg-pi64}: 217 | 218 | implement numeric negation of the corresponding data type. 219 | 220 | @item 221 | @code{not-ps}, @code{not-pd}, @code{not-pi}: 222 | 223 | implement bitwise logical inversion. 224 | 225 | @item 226 | @code{if-ps}, @code{if-pd}, @code{if-pi}: 227 | 228 | perform element-wise combining of two values based on a boolean 229 | condition vector produced as a combination of comparison function 230 | results through bitwise logical functions. 231 | 232 | The condition value must use all-zero bitmask for false, and 233 | all-one bitmask for true as a value for each logical vector 234 | element. The result is undefined if any other bit pattern is used. 235 | 236 | N.B.: these are @emph{functions}, so both branches of the 237 | conditional are always evaluated. 238 | @end itemize 239 | 240 | The module also provides symbol macros that expand into expressions 241 | producing certain constants in the most efficient way: 242 | 243 | @itemize 244 | @item 245 | 0.0-ps 0.0-pd 0-pi for zero 246 | 247 | @item 248 | true-ps true-pd true-pi for all 1 bitmask 249 | 250 | @item 251 | false-ps false-pd false-pi for all 0 bitmask (same as zero) 252 | @end itemize 253 | 254 | @node Lisp array accessors 255 | @subsection Lisp array accessors 256 | 257 | In order to provide better integration with ordinary lisp code, 258 | this module implements a set of AREF-like memory accessors: 259 | 260 | @itemize 261 | @item 262 | @code{(ROW-MAJOR-)?AREF-PREFETCH-(T0|T1|T2|NTA)} for cache prefetch. 263 | 264 | @item 265 | @code{(ROW-MAJOR-)?AREF-CLFLUSH} for cache flush. 266 | 267 | @item 268 | @code{(ROW-MAJOR-)?AREF-[AS]?P[SDI]} for whole-pack read & write. 269 | 270 | @item 271 | @code{(ROW-MAJOR-)?AREF-S(S|D|I64)} for scalar read & write. 272 | @end itemize 273 | 274 | (Where A = aligned; S = aligned streamed write.) 275 | 276 | These accessors can be used with any non-bit specialized 277 | array or vector, without restriction on the precise element 278 | type (although it should be declared at compile time to 279 | ensure generation of the fastest code). 280 | 281 | Additional index bound checking is done to ensure that enough 282 | bytes of memory are accessible after the specified index. 283 | 284 | As an exception, ROW-MAJOR-AREF-PREFETCH-* does not do any 285 | range checks at all, because the prefetch instructions 286 | are officially safe to use with bad addresses. The 287 | AREF-PREFETCH-* and *-CLFLUSH functions do only ordinary 288 | index checks without the usual 16-byte extension. 289 | 290 | @node Example 291 | @subsection Example 292 | 293 | This code processes several single-float arrays, storing 294 | either the value of a*b, or c/3.5 into result, depending 295 | on the sign of mode: 296 | 297 | @example 298 | (loop for i from 0 below 128 by 4 299 | do (setf (aref-ps result i) 300 | (if-ps (<-ps (aref-ps mode i) 0.0-ps) 301 | (mul-ps (aref-ps a i) (aref-ps b i)) 302 | (div-ps (aref-ps c i) (set1-ps 3.5))))) 303 | @end example 304 | 305 | As already noted above, both branches of the if are always 306 | evaluated. 307 | -------------------------------------------------------------------------------- /ecl-sse-core.lisp: -------------------------------------------------------------------------------- 1 | ;;; -*- mode: Lisp; indent-tabs-mode: nil; -*- 2 | ;;; 3 | ;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com) 4 | ;;; 5 | ;;; This file defines macros for wrapping C-level SSE intrinsics. 6 | ;;; 7 | 8 | (in-package #:SSE) 9 | 10 | ;;; The compound SSE pack type 11 | 12 | (deftype sse-pack (&optional item) 13 | (ecase item 14 | (* 'ext:sse-pack) 15 | ((single-float float) 'float-sse-pack) 16 | (double-float 'double-sse-pack) 17 | (integer 'int-sse-pack))) 18 | 19 | ;;; Helper macros and functions 20 | 21 | (defmacro typename-case (value &body clauses) 22 | "Syntax: (case value &body clauses)" 23 | `(cond ,@(mapcar (lambda (clause) 24 | `((subtypep ,value ',(first clause)) 25 | ,@(rest clause))) 26 | clauses) 27 | (t (error "Unsupported type name: ~S" ,value)))) 28 | 29 | (defun foreign-type-of (lt) 30 | (typename-case lt 31 | (nil :object) 32 | (int-sse-pack :int-sse-pack) 33 | (float-sse-pack :float-sse-pack) 34 | (double-sse-pack :double-sse-pack) 35 | (boolean :bool) 36 | (single-float :float) 37 | (double-float :double) 38 | (fixnum :fixnum) 39 | #+uint32-t 40 | (ext:integer32 :int32-t) 41 | #+uint32-t 42 | (ext:byte32 :uint32-t) 43 | #+uint64-t 44 | (ext:integer64 :int64-t) 45 | #+uint64-t 46 | (ext:byte64 :uint64-t) 47 | (integer :fixnum))) 48 | 49 | (defun pointer-c-type-of (lt) 50 | (typename-case lt 51 | (nil "void") 52 | (int-sse-pack "__m128i") 53 | (float-sse-pack "float") 54 | (double-sse-pack "double"))) 55 | 56 | ;; Accept any real values for floating-point arguments: 57 | (defun declaim-arg-type-of (lt) 58 | (typename-case lt 59 | ((or single-float double-float) 'real) 60 | (ext:sse-pack 'ext:sse-pack) 61 | (fixnum 'fixnum) 62 | (t lt))) 63 | 64 | (defun inline-arg-type-of (lt) 65 | (typename-case lt 66 | ((or single-float double-float) 'c::fixnum-float) 67 | (fixnum 'fixnum) 68 | (t lt))) 69 | 70 | ;; Constant expansion 71 | (defun expand-constant (form env &optional chgp) 72 | (let* ((mform (macroexpand form env)) 73 | (cform (cond ((and (symbolp mform) (constantp mform)) 74 | (symbol-value mform)) 75 | (t mform)))) 76 | (values cform (or chgp (not (eql cform form)))))) 77 | 78 | ;; Macro helpers 79 | (defun make-arg-name (index) 80 | (intern (format nil "ARG~A" index))) 81 | 82 | (defun make-arg-nums (lst) 83 | (loop for i from 0 below (length lst) collect i)) 84 | 85 | (defun wrap-ret-arg (core ret-type &optional ret-arg) 86 | (cond ((eq ret-type nil) 87 | (format nil "(~A,Cnil)" core)) 88 | (ret-arg 89 | (format nil "@~36R;(~A,#~36R)" ret-arg core ret-arg)) 90 | (t core))) 91 | 92 | ;; Constant generation 93 | (defun make-pack-of-bin (bin-value &key (as 'int-sse-pack)) 94 | (let* ((all (loop for i from 0 to 15 95 | for v = bin-value then (ash v -8) 96 | collect (logand v 255))) 97 | (pack (ext:vector-to-sse-pack 98 | (make-array 16 :element-type '(unsigned-byte 8) :initial-contents all)))) 99 | (if (eq as 'int-sse-pack) 100 | pack 101 | `(the ,as ,(ext:sse-pack-as-elt-type 102 | pack (ecase as 103 | (EXT:FLOAT-SSE-PACK 'single-float) 104 | (EXT:DOUBLE-SSE-PACK 'double-float))))))) 105 | 106 | (defmacro def-inline (name mode arg-types ret-type call-str &rest flags) 107 | `(eval-when (:compile-toplevel :load-toplevel) 108 | (c::def-inline ',name ',mode ',arg-types ',ret-type ,call-str ,@flags))) 109 | 110 | (defmacro def-intrinsic (name arg-types ret-type c-name 111 | &key (export t) ret-arg reorder-args immediate-args defun-body) 112 | "Defines and exports an SSE intrinsic function with matching open-coding rules." 113 | (let* ((anums (make-arg-nums arg-types)) 114 | (asyms (mapcar #'make-arg-name anums)) 115 | (aftypes (mapcar #'foreign-type-of arg-types)) 116 | (rftype (foreign-type-of ret-type)) 117 | (call-anums (if reorder-args (reverse anums) anums)) 118 | (call-str (wrap-ret-arg (format nil "~A(~{#~36R~^,~})" c-name call-anums) ret-type ret-arg))) 119 | `(progn 120 | ,(if export `(export ',name)) 121 | ,@(if immediate-args ; Generate a constantness verifier macro 122 | `((define-compiler-macro ,name (&whole whole &environment env ,@asyms &aux chgp) 123 | ,@(loop for (arg type) in immediate-args 124 | collect `(let ((oldv ,arg)) 125 | (multiple-value-setq (,arg chgp) (expand-constant oldv env chgp)) 126 | (unless (typep ,arg ',type) 127 | (c::cmperr "In call to ~A: Argument ~S~@[ = ~S~] is not a constant of type ~A" 128 | ',name oldv (unless (eql oldv ,arg) ,arg) ',type)))) 129 | (if chgp (list ',name ,@asyms) whole)))) 130 | (proclaim '(ftype (function ,(mapcar #'declaim-arg-type-of arg-types) ,(or ret-type 'null)) ,name)) 131 | ,@(if (null immediate-args) 132 | `((defun ,name ,asyms 133 | (declare (optimize (speed 0) (debug 0) (safety 1))) 134 | (ffi:c-inline ,asyms ,aftypes ,rftype ,(or defun-body call-str) :one-liner t)))) 135 | (def-inline ,name :always ,(mapcar #'inline-arg-type-of arg-types) ,rftype 136 | ,call-str :inline-or-warn t)))) 137 | 138 | (defmacro def-unary-intrinsic (name ret-type insn cost c-name 139 | &key (arg-type ret-type) partial result-size immediate-arg) 140 | (declare (ignore insn cost partial result-size)) 141 | `(def-intrinsic ,name (,arg-type ,@(if immediate-arg (list immediate-arg))) 142 | ,ret-type ,c-name :immediate-args ,(if immediate-arg `((arg1 ,immediate-arg))))) 143 | 144 | (defmacro def-cvt-to-int32-intrinsic (name ret-type insn cost c-name 145 | &key (arg-type ret-type) partial immediate-arg) 146 | (declare (ignore insn cost partial)) 147 | (assert (subtypep ret-type '(signed-byte 32))) 148 | `(def-intrinsic ,name (,arg-type ,@(if immediate-arg (list immediate-arg))) 149 | ,ret-type ,c-name :immediate-args ,(if immediate-arg `((arg1 ,immediate-arg))))) 150 | 151 | (defmacro def-binary-intrinsic (name ret-type insn cost c-name 152 | &key (x-type ret-type) (y-type ret-type) 153 | commutative tags immediate-arg) 154 | (declare (ignore insn cost commutative tags)) 155 | `(def-intrinsic ,name (,x-type ,y-type ,@(if immediate-arg (list immediate-arg))) 156 | ,ret-type ,c-name :immediate-args ,(if immediate-arg `((arg2 ,immediate-arg))))) 157 | 158 | (defmacro def-sse-int-intrinsic (name int-type ret-type insn cost c-name 159 | &key (arg-type ret-type) immediate-arg make-temporary defun-body) 160 | (declare (ignore insn cost make-temporary)) 161 | `(def-intrinsic ,name (,arg-type ,int-type ,@(if immediate-arg (list immediate-arg))) 162 | ,ret-type ,c-name :immediate-args ,(if immediate-arg `((arg2 ,immediate-arg))) 163 | :defun-body ,defun-body)) 164 | 165 | (defmacro def-comparison-intrinsic (name arg-type insn cost c-name &key commutative tags) 166 | (declare (ignore insn cost commutative tags)) 167 | `(def-intrinsic ,name (,arg-type ,arg-type) boolean ,c-name)) 168 | 169 | (defmacro %def-aref-intrinsic (tag val-type c-type reader writer &key (aux-args "") (bsize 16)) 170 | "Defines and exports macros and functios that implement vectorized array access." 171 | (let* ((rftype (foreign-type-of val-type)) 172 | (aref-name (intern (format nil "AREF-~A" tag) *package*)) 173 | (rm-aref-name (intern (format nil "ROW-MAJOR-AREF-~A" tag) *package*)) 174 | (rm-aset-name (intern (format nil "ROW-MAJOR-ASET-~A" tag) *package*)) 175 | (known-elt-types '((single-float "sf") 176 | (double-float "df") 177 | (ext:byte8 "b8") 178 | (ext:integer8 "i8") 179 | #+uint16-t (ext:byte16 "b16") 180 | #+uint16-t (ext:integer16 "i16") 181 | #+uint32-t (ext:byte32 "b32") 182 | #+uint32-t (ext:integer32 "i32") 183 | #+uint64-t (ext:byte64 "b64") 184 | #+uint64-t (ext:integer64 "i64")))) 185 | (flet ((fmtr (ptr-fmt &rest ptr-args) 186 | (wrap-ret-arg (format nil "~A((~A*)~?~A)" 187 | reader c-type ptr-fmt ptr-args aux-args) 188 | val-type)) 189 | (fmtw (ptr-fmt &rest ptr-args) 190 | (wrap-ret-arg (format nil "~A((~A*)~?,#2)" 191 | writer c-type ptr-fmt ptr-args) 192 | val-type 2))) 193 | `(progn 194 | (export ',aref-name) 195 | (export ',rm-aref-name) 196 | (defmacro ,aref-name (array &rest indexes) 197 | (let ((varr (gensym "ARR"))) 198 | `(let ((,varr ,array)) 199 | (declare (:read-only ,varr)) 200 | (,',rm-aref-name ,varr (array-row-major-index ,varr ,@indexes))))) 201 | (proclaim '(ftype (function (array fixnum) ,(or val-type 'null)) ,rm-aref-name)) 202 | (defun ,rm-aref-name (array index) 203 | (declare (optimize (speed 0) (debug 0) (safety 2))) 204 | (ffi:c-inline (array index) (:object :int) ,rftype 205 | ,(fmtr "ecl_row_major_ptr(#0,#1,~A)" bsize) 206 | :one-liner t)) 207 | ;; AREF 208 | (def-inline ,rm-aref-name :always (t t) ,rftype 209 | ,(fmtr "ecl_row_major_ptr(#0,fixint(#1),~A)" bsize) 210 | :inline-or-warn t) 211 | (def-inline ,rm-aref-name :always (t fixnum) ,rftype 212 | ,(fmtr "ecl_row_major_ptr(#0,#1,~A)" bsize)) 213 | ;; AREF unsafe 214 | ,@(mapcar (lambda (spec) 215 | `(def-inline ,rm-aref-name :unsafe ((array ,(first spec)) fixnum) ,rftype 216 | ,(fmtr "(&(#0)->array.self.~A[#1])" (second spec)))) 217 | known-elt-types) 218 | ,@(if writer 219 | `((define-setf-expander ,aref-name (array &rest indexes) 220 | (let ((varr (gensym)) (vidx (gensym)) (vval (gensym))) 221 | (values (list varr vidx) 222 | (list array `(array-row-major-index ,varr ,@indexes)) 223 | (list vval) 224 | `(,',rm-aset-name ,varr ,vidx ,vval) `(,',rm-aref-name ,varr ,vidx)))) 225 | (proclaim '(ftype (function (array fixnum ,(declaim-arg-type-of val-type)) ,val-type) ,rm-aset-name)) 226 | (defun ,rm-aset-name (array index value) 227 | (declare (optimize (speed 0) (debug 0) (safety 2))) 228 | (prog1 value 229 | (ffi:c-inline (array index value) (:object :int ,rftype) :void 230 | ,(fmtw "ecl_row_major_ptr(#0,#1,~A)" bsize) 231 | :one-liner t))) 232 | (defsetf ,rm-aref-name ,rm-aset-name) 233 | ;; ASET 234 | (def-inline ,rm-aset-name :always (t t ,val-type) ,rftype 235 | ,(fmtw "ecl_row_major_ptr(#0,fixint(#1),~A)" bsize) 236 | :inline-or-warn t) 237 | (def-inline ,rm-aset-name :always (t fixnum ,val-type) ,rftype 238 | ,(fmtw "ecl_row_major_ptr(#0,#1,~A)" bsize)) 239 | ;; ASET unsafe 240 | ,@(mapcar (lambda (spec) 241 | `(def-inline ,rm-aset-name :unsafe ((array ,(first spec)) fixnum ,val-type) ,rftype 242 | ,(fmtw "(&(#0)->array.self.~A[#1])" (second spec)))) 243 | known-elt-types))))))) 244 | 245 | (defmacro def-aref-intrinsic (tag val-type reader-fun writer-fun &key (ref-size 16) side-effect?) 246 | (declare (ignore side-effect?)) 247 | `(%def-aref-intrinsic ,tag ,val-type ,(pointer-c-type-of val-type) 248 | ,(get reader-fun 'c-function-name) ,(get writer-fun 'c-function-name) 249 | :bsize ,ref-size 250 | :aux-args ,(get reader-fun 'c-call-aux-args))) 251 | 252 | (defmacro def-mem-intrinsic (name c-type ret-type c-name &key (public t) 253 | prefix-args (prefix-fmt "~@{#~36R,~}") 254 | postfix-args (postfix-fmt "~@{,#~36R~}" pf-p) ret-arg) 255 | "Defines and exports an SSE memory access intrinsic function with matching open-coding rules." 256 | (let* ((anums (make-arg-nums (append prefix-args postfix-args))) 257 | (asyms (mapcar #'make-arg-name anums)) 258 | (prefix-nums (subseq anums 0 (length prefix-args))) 259 | (postfix-nums (mapcar #'1+ (subseq anums (length prefix-args)))) 260 | (prefix-syms (subseq asyms 0 (length prefix-args))) 261 | (postfix-syms (subseq asyms (length prefix-args))) 262 | (prefix-itypes (mapcar #'inline-arg-type-of prefix-args)) 263 | (postfix-itypes (mapcar #'inline-arg-type-of postfix-args)) 264 | (rftype (foreign-type-of ret-type)) 265 | (ptr-idx (length prefix-args)) 266 | (offset-idx (+ ptr-idx 1 (length postfix-args)))) 267 | (flet ((fmt (ptr-text) 268 | (wrap-ret-arg (format nil "~A(~?(~A*)~?~?)" 269 | c-name prefix-fmt prefix-nums 270 | c-type ptr-text (list ptr-idx offset-idx) 271 | postfix-fmt postfix-nums) 272 | ret-type ret-arg))) 273 | `(progn 274 | ,(when public `(export ',name)) 275 | (eval-when (:compile-toplevel :load-toplevel :execute) 276 | (setf (get ',name 'c-function-name) ,c-name) 277 | ,(if (and pf-p (null postfix-args)) 278 | `(setf (get ',name 'c-call-aux-args) ,postfix-fmt))) 279 | (proclaim '(ftype (function (,@(mapcar #'declaim-arg-type-of prefix-args) si:foreign-data 280 | ,@(mapcar #'declaim-arg-type-of postfix-args) &optional fixnum) ,ret-type) ,name)) 281 | (defun ,name (,@prefix-syms ptr ,@postfix-syms &optional (offset 0)) 282 | (declare (optimize (speed 0) (debug 0) (safety 1))) 283 | (ffi:c-inline (,@prefix-syms ptr ,@postfix-syms offset) 284 | (,@(mapcar #'foreign-type-of prefix-args) :pointer-void 285 | ,@(mapcar #'foreign-type-of postfix-args) :int) ,rftype 286 | ,(fmt "(((char*)#~A) + #~A)") 287 | :one-liner t)) 288 | (def-inline ,name :always (,@prefix-itypes t ,@postfix-itypes) ,rftype 289 | ,(fmt "ecl_to_pointer(#~A)") 290 | :inline-or-warn t) 291 | (def-inline ,name :always (,@prefix-itypes t ,@postfix-itypes t) ,rftype 292 | ,(fmt "(((char*)ecl_to_pointer(#~A)) + fixint(#~A))")) 293 | (def-inline ,name :always (,@prefix-itypes t ,@postfix-itypes fixnum) ,rftype 294 | ,(fmt "(((char*)ecl_to_pointer(#~A)) + #~A)")) 295 | (def-inline ,name :unsafe (,@prefix-itypes si:foreign-data ,@postfix-itypes) ,rftype 296 | ,(fmt "(#~A)->foreign.data")) 297 | (def-inline ,name :unsafe (,@prefix-itypes si:foreign-data ,@postfix-itypes t) ,rftype 298 | ,(fmt "(((char*)(#~A)->foreign.data) + fix(#~A))")) 299 | (def-inline ,name :unsafe (,@prefix-itypes si:foreign-data ,@postfix-itypes fixnum) ,rftype 300 | ,(fmt "(((char*)(#~A)->foreign.data) + #~A)")))))) 301 | 302 | (defmacro def-load-intrinsic (name ret-type insn c-name &key register-arg tags size postfix-fmt side-effect?) 303 | (declare (ignore insn tags size side-effect?)) 304 | `(def-mem-intrinsic ,name ,(pointer-c-type-of ret-type) ,ret-type ,c-name 305 | :prefix-args ,(if register-arg (list ret-type)) 306 | :postfix-fmt ,(or postfix-fmt ""))) 307 | 308 | (defmacro def-store-intrinsic (name ret-type insn c-name &key setf-name) 309 | (declare (ignore insn)) 310 | `(progn 311 | (def-mem-intrinsic ,name ,(pointer-c-type-of ret-type) ,ret-type ,c-name 312 | :public ,(not setf-name) :postfix-args (,ret-type) :ret-arg 1) 313 | ,(if setf-name 314 | `(defsetf ,setf-name (pointer &optional (offset 0)) (value) 315 | `(,',name ,pointer ,value ,offset))))) 316 | 317 | -------------------------------------------------------------------------------- /ecl-sse-utils.lisp: -------------------------------------------------------------------------------- 1 | ;;; -*- mode: Lisp; indent-tabs-mode: nil; -*- 2 | ;;; 3 | ;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com) 4 | ;;; 5 | ;;; This file defines some extensions to the base intrinsic set, 6 | ;;; and other utility functions. 7 | ;;; 8 | 9 | (in-package #:SSE) 10 | 11 | ;;; Helper macros and functions 12 | (eval-when (:compile-toplevel :load-toplevel :execute) 13 | ;; Try using a matching inverse function name 14 | (defun lookup-flip (arg pairs &key no-reverse) 15 | (and (consp arg) 16 | (let ((fix (or (cdr (assoc (first arg) pairs)) 17 | (unless no-reverse 18 | (car (rassoc (first arg) pairs)))))) 19 | (cond ((eq fix :identity) 20 | (assert (null (cddr arg))) 21 | (second arg)) 22 | (fix 23 | `(,fix ,@(rest arg))) 24 | (t nil))))) 25 | ;; Macroexpand, plus compiler expand some specific names 26 | (defun expand-condition (form env) 27 | (setq form (macroexpand form env)) 28 | (loop while (and (consp form) 29 | (symbolp (first form)) 30 | (get (first form) 'expand-in-condition)) 31 | do (setq form (c::cmp-expand-macro (compiler-macro-function (first form)) 32 | form env))) 33 | form) 34 | ;; Checks if the form is an unary call 35 | (defun is-unary? (form op) 36 | (and (consp form) 37 | (eq (first form) op) 38 | (null (cddr form)))) 39 | ;; IF-style function expander 40 | (defun expand-if-macro (condition then-value else-value env if-f not-f or-f and-f andnot-f type-name zero-val &key flip) 41 | (let* ((condition (expand-condition condition env)) 42 | (then-value (macroexpand then-value env)) 43 | (else-value (macroexpand else-value env)) 44 | (then-zero? (equal then-value zero-val)) 45 | (else-zero? (equal else-value zero-val))) 46 | (cond ((is-unary? condition not-f) 47 | (expand-if-macro (second condition) else-value then-value 48 | env if-f not-f or-f and-f andnot-f type-name zero-val 49 | :flip (not flip))) 50 | ((and then-zero? else-zero?) 51 | zero-val) 52 | (then-zero? 53 | `(,andnot-f ,condition ,else-value)) 54 | (else-zero? 55 | `(,and-f ,condition ,then-value)) 56 | (t 57 | (let* ((csym (gensym)) 58 | (args `((,and-f ,csym ,then-value) 59 | (,andnot-f ,csym ,else-value)))) 60 | `(let ((,csym ,condition)) 61 | (declare (type ,type-name ,csym) 62 | (:read-only ,csym)) 63 | (,or-f ,@(if flip (reverse args) args))))))))) 64 | 65 | (defmacro def-utility (name arg-types ret-type expansion &key expand-args expand-in-condition) 66 | "Defines and exports a function & compiler macro with the specified expansion." 67 | (let* ((anames (mapcar #'make-arg-name (make-arg-nums arg-types)))) 68 | `(progn 69 | (export ',name) 70 | (eval-when (:compile-toplevel :load-toplevel) 71 | ,@(if expand-in-condition 72 | `((setf (get ',name 'expand-in-condition) t))) 73 | (define-compiler-macro ,name (&environment env ,@anames) 74 | (declare (ignorable env)) 75 | ,@(loop for arg in (if (eq expand-args t) anames expand-args) 76 | collect `(setq ,arg (macroexpand ,arg env))) 77 | ,expansion)) 78 | (proclaim '(ftype (function ,(mapcar #'declaim-arg-type-of arg-types) ,ret-type) ,name)) 79 | (defun ,name ,anames 80 | (declare (optimize (speed 0) (debug 0) (safety 1))) 81 | (let ,(mapcar #'list anames anames) 82 | (declare ,@(loop for an in anames and at in arg-types 83 | collect `(type ,at ,an))) 84 | ;; Depends on the compiler macro being expanded: 85 | (,name ,@anames)))))) 86 | 87 | (defmacro def-if-function (name type-name postfix) 88 | `(def-utility ,name (,type-name ,type-name ,type-name) ,type-name 89 | (expand-if-macro arg0 arg1 arg2 env 90 | ',name 91 | ',(intern (format nil "NOT-~A" postfix)) 92 | ',(intern (format nil "OR-~A" postfix)) 93 | ',(intern (format nil "AND-~A" postfix)) 94 | ',(intern (format nil "ANDNOT-~A" postfix)) 95 | ',type-name 96 | '(,(intern (format nil "SETZERO-~A" postfix)))))) 97 | 98 | ;;; Aligned array allocation 99 | 100 | (deftype sse-array (elt-type &optional dims) 101 | "Type of arrays efficiently accessed by SSE aref intrinsics and returned by make-sse-array. 102 | Should be assumed to be SIMPLE-ARRAY, except that displacing with MAKE-SSE-ARRAY is allowed." 103 | (when (eq elt-type '*) 104 | (c::cmperr "SSE-ARRAY must have a specific element type.")) 105 | (let ((upgraded (upgraded-array-element-type elt-type))) 106 | (when (member upgraded '(t bit)) 107 | (c::cmperr "Invalid SSE-ARRAY element type: ~S" elt-type)) 108 | (unless (subtypep upgraded elt-type) 109 | (c::cmpwarn "SSE-ARRAY element type ~S has been upgraded to ~S" elt-type upgraded)) 110 | `(array ,upgraded ,dims))) 111 | 112 | (defun make-sse-array (dimensions &rest args &key (element-type '(unsigned-byte 8)) displaced-to &allow-other-keys) 113 | "Allocates an SSE-ARRAY aligned to the 16-byte boundary. May flatten displacement chains for performance reasons." 114 | (if displaced-to 115 | (apply #'make-array dimensions args) 116 | (multiple-value-bind (elt-size adj-type) 117 | (array-element-type-byte-size element-type) 118 | (when (eq adj-type t) 119 | (error "Cannot use element type T with SSE.")) 120 | (sys::remf args :element-type) 121 | (let* ((full-size (if (numberp dimensions) 122 | dimensions 123 | (reduce #'* dimensions))) 124 | (padded-size (+ full-size (ceiling 15 elt-size))) 125 | (array (apply #'make-array padded-size :element-type adj-type args)) 126 | (misalign (ffi:c-inline (array) (:object) :int 127 | "(((unsigned long)(#0)->array.self.b8) & 15)" 128 | :one-liner t)) 129 | (offset (/ (if (> misalign 0) (- 16 misalign) 0) elt-size))) 130 | (make-array dimensions :element-type element-type 131 | :displaced-to array :displaced-index-offset offset))))) 132 | 133 | ;;; Single-float tools 134 | 135 | ;; Constants 136 | 137 | (defmacro set-true-ss () 138 | (load-time-value (make-pack-of-bin #xFFFFFFFF :as 'float-sse-pack))) 139 | 140 | (defmacro set-true-ps () 141 | (load-time-value (make-pack-of-bin -1 :as 'float-sse-pack))) 142 | 143 | (eval-when (:compile-toplevel :load-toplevel) 144 | (define-symbol-macro 0.0-ps (setzero-ps)) 145 | 146 | (define-symbol-macro true-ss (set-true-ss)) 147 | (define-symbol-macro false-ss (setzero-ps)) 148 | 149 | (define-symbol-macro true-ps (set-true-ps)) 150 | (define-symbol-macro false-ps (setzero-ps))) 151 | 152 | ;; Bitwise if 153 | 154 | (def-if-function if-ps float-sse-pack #:ps) 155 | 156 | ;; Arithmetic negation (xor with negative zero) 157 | 158 | (def-utility neg-ss (float-sse-pack) float-sse-pack 159 | `(xor-ps ,arg0 ,(load-time-value (make-pack-of-bin #x80000000 :as 'float-sse-pack)))) 160 | 161 | (def-utility neg-ps (float-sse-pack) float-sse-pack 162 | `(xor-ps ,arg0 ,(load-time-value 163 | (make-pack-of-bin #x80000000800000008000000080000000 :as 'float-sse-pack)))) 164 | 165 | ;; Logical inversion 166 | 167 | (def-utility not-ps (float-sse-pack) float-sse-pack 168 | (or (lookup-flip arg0 '((=-ps . /=-ps) 169 | (<-ps . /<-ps) 170 | (<=-ps . /<=-ps) 171 | (>-ps . />-ps) 172 | (>=-ps . />=-ps) 173 | (cmpord-ps . cmpunord-ps) 174 | (not-ps . :identity))) 175 | `(xor-ps ,arg0 true-ps)) 176 | :expand-args t) 177 | 178 | ;; Shuffle 179 | 180 | (defun shuffle-ps (x y mask) 181 | (declare (optimize (speed 0) (debug 0) (safety 1)) 182 | (type t x y mask)) 183 | (check-type x sse-pack) 184 | (check-type y sse-pack) 185 | (check-type mask (unsigned-byte 8)) 186 | (ffi:c-inline (x y mask) (:object :object :int) :float-sse-pack 187 | "_mm_setr_ps( 188 | (#0)->sse.data.sf[(#2)&3], 189 | (#0)->sse.data.sf[((#2)>>2)&3], 190 | (#1)->sse.data.sf[((#2)>>4)&3], 191 | (#1)->sse.data.sf[((#2)>>6)&3] 192 | )" :one-liner t)) 193 | 194 | ;;; Double-float tools 195 | 196 | ;; Constants 197 | 198 | (defmacro set-true-sd () 199 | (load-time-value (make-pack-of-bin #xFFFFFFFFFFFFFFFF :as 'double-sse-pack))) 200 | 201 | (defmacro set-true-pd () 202 | (load-time-value (make-pack-of-bin -1 :as 'double-sse-pack))) 203 | 204 | (eval-when (:compile-toplevel :load-toplevel) 205 | (define-symbol-macro 0.0-pd (setzero-pd)) 206 | 207 | (define-symbol-macro true-sd (set-true-sd)) 208 | (define-symbol-macro false-sd (setzero-pd)) 209 | 210 | (define-symbol-macro true-pd (set-true-pd)) 211 | (define-symbol-macro false-pd (setzero-pd))) 212 | 213 | ;; Bitwise if 214 | 215 | (def-if-function if-pd double-sse-pack #:pd) 216 | 217 | ;; Arithmetic negation (xor with negative zero) 218 | 219 | (def-utility neg-sd (double-sse-pack) double-sse-pack 220 | `(xor-pd ,arg0 221 | ,(load-time-value 222 | (make-pack-of-bin #x8000000000000000 :as 'double-sse-pack)))) 223 | 224 | (def-utility neg-pd (double-sse-pack) double-sse-pack 225 | `(xor-pd ,arg0 226 | ,(load-time-value 227 | (make-pack-of-bin #x80000000000000008000000000000000 :as 'double-sse-pack)))) 228 | 229 | ;; Logical inversion 230 | 231 | (def-utility not-pd (double-sse-pack) double-sse-pack 232 | (or (lookup-flip arg0 '((=-pd . /=-pd) 233 | (<-pd . /<-pd) 234 | (<=-pd . /<=-pd) 235 | (>-pd . />-pd) 236 | (>=-pd . />=-pd) 237 | (cmpord-pd . cmpunord-pd) 238 | (not-pd . :identity))) 239 | `(xor-pd ,arg0 true-pd)) 240 | :expand-args t) 241 | 242 | ;; Shuffle 243 | 244 | (defun shuffle-pd (x y mask) 245 | (declare (optimize (speed 0) (debug 0) (safety 1)) 246 | (type t x y mask)) 247 | (check-type x sse-pack) 248 | (check-type y sse-pack) 249 | (check-type mask (unsigned-byte 2)) 250 | (ffi:c-inline (x y mask) (:object :object :int) :double-sse-pack 251 | "_mm_setr_pd( 252 | (#0)->sse.data.df[(#2)&1], 253 | (#1)->sse.data.df[((#2)>>1)&1] 254 | )" :one-liner t)) 255 | 256 | ;;; Integer tools 257 | 258 | ;; Constants 259 | 260 | (defmacro set-true-pi () 261 | (load-time-value (make-pack-of-bin -1 :as 'int-sse-pack))) 262 | 263 | (eval-when (:compile-toplevel :load-toplevel) 264 | (define-symbol-macro 0-pi (setzero-pi)) 265 | 266 | (define-symbol-macro true-pi (set-true-pi)) 267 | (define-symbol-macro false-pi (setzero-pi))) 268 | 269 | ;; Bitwise if 270 | 271 | (def-if-function if-pi float-sse-pack #:pi) 272 | 273 | ;; Arithmetic negation (subtract from 0) 274 | 275 | (macrolet ((frob (name subf) 276 | `(def-utility ,name (int-sse-pack) int-sse-pack 277 | `(,',subf (setzero-pi) ,arg0)))) 278 | (frob neg-pi8 sub-pi8) 279 | (frob neg-pi16 sub-pi16) 280 | (frob neg-pi32 sub-pi32) 281 | (frob neg-pi64 sub-pi64)) 282 | 283 | ;; Logical inversion 284 | 285 | (def-utility not-pi (int-sse-pack) int-sse-pack 286 | (or (lookup-flip arg0 '((<=-pi8 . >-pi8) 287 | (<=-pi16 . >-pi16) 288 | (<=-pi32 . >-pi32) 289 | (>=-pi8 . <-pi8) 290 | (>=-pi16 . <-pi16) 291 | (>=-pi32 . <-pi32) 292 | (/=-pi8 . =-pi8) 293 | (/=-pi16 . =-pi16) 294 | (/=-pi32 . =-pi32) 295 | (not-pi . :identity)) 296 | :no-reverse t) 297 | `(xor-pi ,arg0 true-pi)) 298 | :expand-args t) 299 | 300 | (macrolet ((frob (name code) 301 | `(def-utility ,name (int-sse-pack int-sse-pack) int-sse-pack 302 | ,code 303 | :expand-in-condition t))) 304 | 305 | (frob <=-pi8 `(not-pi (>-pi8 ,arg0 ,arg1))) 306 | (frob <=-pi16 `(not-pi (>-pi16 ,arg0 ,arg1))) 307 | (frob <=-pi32 `(not-pi (>-pi32 ,arg0 ,arg1))) 308 | 309 | (frob >=-pi8 `(not-pi (<-pi8 ,arg0 ,arg1))) 310 | (frob >=-pi16 `(not-pi (<-pi16 ,arg0 ,arg1))) 311 | (frob >=-pi32 `(not-pi (<-pi32 ,arg0 ,arg1))) 312 | 313 | (frob /=-pi8 `(not-pi (=-pi8 ,arg0 ,arg1))) 314 | (frob /=-pi16 `(not-pi (=-pi16 ,arg0 ,arg1))) 315 | (frob /=-pi32 `(not-pi (=-pi32 ,arg0 ,arg1)))) 316 | 317 | ;; Shifts 318 | 319 | (defun slli-pi (x shift) 320 | (declare (optimize (speed 0) (debug 0) (safety 1)) 321 | (type t x shift)) 322 | (check-type x sse-pack) 323 | (check-type shift (unsigned-byte 8)) 324 | (ffi:c-inline (x shift) (:object :int) :object 325 | "cl_object rv = ecl_make_int_sse_pack(_mm_setzero_si128()); 326 | unsigned bshift=(#1), i; 327 | for (i = 0; i + bshift < 16; i++) 328 | rv->sse.data.b8[i+bshift] = (#0)->sse.data.b8[i]; 329 | @(return) = rv;")) 330 | 331 | (defun srli-pi (x shift) 332 | (declare (optimize (speed 0) (debug 0) (safety 1)) 333 | (type t x shift)) 334 | (check-type x sse-pack) 335 | (check-type shift (unsigned-byte 8)) 336 | (ffi:c-inline (x shift) (:object :int) :object 337 | "cl_object rv = ecl_make_int_sse_pack(_mm_setzero_si128()); 338 | int bshift=(#1), i; 339 | for (i = 16 - bshift - 1; i >= 0; i--) 340 | rv->sse.data.b8[i] = (#0)->sse.data.b8[i+bshift]; 341 | @(return) = rv;")) 342 | 343 | ;; Extract & insert 344 | 345 | (defun extract-pi16 (x index) 346 | (declare (optimize (speed 0) (debug 0) (safety 1)) 347 | (type t x index)) 348 | (check-type x sse-pack) 349 | (check-type index (unsigned-byte 8)) 350 | (ffi:c-inline (x index) (:object :int) :fixnum 351 | "*((unsigned short*)&(#0)->sse.data.b8[((#1)&3)*2])" 352 | :one-liner t)) 353 | 354 | (defun insert-pi16 (x ival index) 355 | (declare (optimize (speed 0) (debug 0) (safety 1)) 356 | (type t x ival index)) 357 | (check-type x sse-pack) 358 | (check-type index (unsigned-byte 8)) 359 | (ffi:c-inline (x ival index) (:int-sse-pack :int :int) :object 360 | "cl_object rv = ecl_make_int_sse_pack(#0); 361 | *((unsigned short*)&rv->sse.data.b8[((#2)&3)*2]) = (unsigned short)(#1); 362 | @(return) = rv;")) 363 | 364 | ;; Shuffles 365 | 366 | (defun shuffle-pi32 (x mask) 367 | (declare (optimize (speed 0) (debug 0) (safety 1)) 368 | (type t x mask)) 369 | (check-type x sse-pack) 370 | (check-type mask (unsigned-byte 8)) 371 | (ffi:c-inline (x mask) (:object :int) :int-sse-pack 372 | "unsigned *pd = (unsigned*)(#0)->sse.data.b8; 373 | @(return) = _mm_setr_epi32(pd[(#1)&3],pd[((#1)>>2)&3],pd[((#1)>>4)&3],pd[((#1)>>6)&3]);")) 374 | 375 | (defun shufflelo-pi16 (x mask) 376 | (declare (optimize (speed 0) (debug 0) (safety 1)) 377 | (type t x mask)) 378 | (check-type x sse-pack) 379 | (check-type mask (unsigned-byte 8)) 380 | (ffi:c-inline (x mask) (:object :int) :int-sse-pack 381 | "unsigned short *pd = (unsigned short*)(#0)->sse.data.b8; 382 | @(return) = _mm_setr_epi16( 383 | pd[(#1)&3],pd[((#1)>>2)&3],pd[((#1)>>4)&3],pd[(((#1)>>6)&3)], 384 | pd[4], pd[5], pd[6], pd[7] 385 | );")) 386 | 387 | (defun shufflehi-pi16 (x mask) 388 | (declare (optimize (speed 0) (debug 0) (safety 1)) 389 | (type t x mask)) 390 | (check-type x sse-pack) 391 | (check-type mask (unsigned-byte 8)) 392 | (ffi:c-inline (x mask) (:object :int) :int-sse-pack 393 | "unsigned short *pb = (unsigned short*)(#0)->sse.data.b8, *pd = pb+4; 394 | @(return) = _mm_setr_epi16( 395 | pb[0], pb[1], pb[2], pb[3], 396 | pd[(#1)&3],pd[((#1)>>2)&3],pd[((#1)>>4)&3],pd[(((#1)>>6)&3)] 397 | );")) 398 | 399 | -------------------------------------------------------------------------------- /sbcl-arrays.lisp: -------------------------------------------------------------------------------- 1 | ;;; -*- mode: Lisp; indent-tabs-mode: nil; -*- 2 | ;;; 3 | ;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com) 4 | ;;; 5 | ;;; This file contains the groundwork for vectorized 6 | ;;; array access intrinsics. 7 | ;;; 8 | 9 | (in-package #:SSE) 10 | 11 | #|--------------------------------------| 12 | | SSE ARRAY ELEMENT SIZE CALCULATION | 13 | |--------------------------------------|# 14 | 15 | (eval-when (:compile-toplevel :load-toplevel :execute) 16 | (defun sse-elt-shift-from-saetp (info) 17 | (and info 18 | (subtypep (saetp-specifier info) 'number) 19 | (not (saetp-fixnum-p info)) 20 | (case (saetp-n-bits info) 21 | (8 0) (16 1) (32 2) (64 3) (128 4))))) 22 | 23 | (defglobal %%size-shift-table%% 24 | (let ((arr (make-array (1+ widetag-mask) :initial-element nil))) 25 | (loop 26 | for info across *specialized-array-element-type-properties* 27 | for shift = (sse-elt-shift-from-saetp info) 28 | when shift 29 | do (setf (svref arr (saetp-typecode info)) shift)) 30 | arr) 31 | "A table of element size shifts for supported SSE array types.") 32 | 33 | (declaim (inline sse-elt-shift-of) 34 | (ftype (function (t) (integer 0 4)) sse-elt-shift-of)) 35 | 36 | (defun sse-elt-shift-of (obj) 37 | "Returns the SSE element size shift for the given object, or fails if it is not a valid SSE vector." 38 | (declare (optimize (safety 0))) 39 | (the (integer 0 4) 40 | (or (svref %%size-shift-table%% 41 | (if (sb-vm::%other-pointer-p obj) 42 | (%other-pointer-widetag obj) 43 | 0)) 44 | (error 'type-error 45 | :datum obj 46 | :expected-type 'sse-array)))) 47 | 48 | #|--------------------------------------| 49 | | SSE-ARRAY TYPE AND ALLOCATION | 50 | |--------------------------------------|# 51 | 52 | (deftype sse-array (&optional (elt-type '* et-p) dims) 53 | "Type of arrays efficiently accessed by SSE aref intrinsics and returned by make-sse-array. 54 | Should be assumed to be SIMPLE-ARRAY, except that displacing with MAKE-SSE-ARRAY is allowed." 55 | (if (eq elt-type '*) 56 | (progn 57 | (when et-p 58 | (error "SSE-ARRAY must have a specific element type.")) 59 | `(simple-array * ,dims)) 60 | (let* ((upgraded (upgraded-array-element-type elt-type)) 61 | (shift (sse-elt-shift-from-saetp (find-saetp upgraded)))) 62 | (when (null shift) 63 | (error "Invalid SSE-ARRAY element type: ~S" elt-type)) 64 | (unless (subtypep upgraded elt-type) 65 | (warn "SSE-ARRAY element type ~S has been upgraded to ~S" elt-type upgraded)) 66 | `(simple-array ,upgraded ,dims)))) 67 | 68 | (defun make-sse-array (dimensions &key (element-type '(unsigned-byte 8)) (initial-element nil ie-p) displaced-to (displaced-index-offset 0)) 69 | "Allocates an SSE-ARRAY aligned to the 16-byte boundary. Flattens displacement chains for performance reasons." 70 | (let* ((upgraded (upgraded-array-element-type element-type)) 71 | (shift (sse-elt-shift-from-saetp (find-saetp upgraded)))) 72 | (when (null shift) 73 | (error "Invalid SSE-ARRAY element type: ~S" element-type)) 74 | (if displaced-to 75 | ;; Fake displacement by allocating a simple-array header 76 | (let* ((dimensions (if (listp dimensions) dimensions (list dimensions))) 77 | (rank (length dimensions)) 78 | (count (reduce #'* dimensions))) 79 | (unless (subtypep element-type (array-element-type displaced-to)) 80 | (error "can't displace an array of type ~S into another of type ~S" 81 | element-type (array-element-type displaced-to))) 82 | (with-array-data ((data displaced-to) 83 | (start displaced-index-offset) 84 | (end)) 85 | (unless (= start 0) 86 | (error "SSE-ARRAY does not support displaced index offset.")) 87 | (unless (<= count end) 88 | (array-bounding-indices-bad-error data start count)) 89 | (if (= rank 1) 90 | (progn 91 | (when (< count end) 92 | (warn "SSE-ARRAY displaced size extended to the full length of the vector.")) 93 | data) 94 | (let ((new-array (make-array-header simple-array-widetag rank))) 95 | (set-array-header new-array data count nil 0 dimensions nil t))))) 96 | ;; X86-64 vectors are already aligned to 16 bytes 97 | (apply #'make-array dimensions :element-type upgraded 98 | (if ie-p (list :initial-element initial-element)))))) 99 | 100 | #|--------------------------------------| 101 | | AREF INTRINSIC DEFINITION HELPERS | 102 | |--------------------------------------|# 103 | 104 | (defconstant +vector-data-fixup+ 105 | (- (* vector-data-offset n-word-bytes) other-pointer-lowtag) 106 | "Offset from a tagged vector pointer to its data") 107 | 108 | (defmacro array-data-expr (array-var &optional is-vector) 109 | (ecase is-vector 110 | (:yes array-var) 111 | (:no `(%array-data-vector ,array-var)) 112 | ((nil) 113 | `(if (array-header-p ,array-var) 114 | (%array-data-vector ,array-var) 115 | ,array-var)))) 116 | 117 | ;; Depends on the vector-length field being in the same place 118 | ;; as the array fill pointer, which for simple-array is equal 119 | ;; to the total size. 120 | ;; The integer constant argument is the number of elements that 121 | ;; should be deducted from the size to account for SIMD access. 122 | (defknown %sse-array-size (simple-array fixnum) array-total-size (flushable always-translatable dx-safe)) 123 | 124 | (define-vop (%sse-array-size/0) 125 | (:translate %sse-array-size) 126 | (:args (array :scs (descriptor-reg))) 127 | (:arg-types * (:constant (integer 0 0))) 128 | (:info gap) 129 | (:ignore gap) 130 | (:policy :fast-safe) 131 | (:results (result :scs (any-reg))) 132 | (:result-types tagged-num) 133 | (:generator 3 134 | (loadw result array vector-length-slot other-pointer-lowtag))) 135 | 136 | (define-vop (%sse-array-size %sse-array-size/0) 137 | (:arg-types * (:constant (integer 1 16))) 138 | (:ignore) 139 | (:temporary (:sc any-reg) tmp) 140 | (:generator 8 141 | (loadw result array vector-length-slot other-pointer-lowtag) 142 | (inst mov tmp (fixnumize gap)) 143 | (inst cmp result tmp) 144 | (inst cmov :ng tmp result) 145 | (inst sub result tmp))) 146 | 147 | (defmacro with-sse-data (((sap-var data-var array) (offset-var index)) &body code) 148 | ;; Compute a SAP and offset for the specified array and index. Check bounds. 149 | (with-unique-names (data-index data-end elt-shift access-size) 150 | (once-only ((array array) 151 | (index index)) 152 | `(locally 153 | (declare (optimize (insert-array-bounds-checks 0))) 154 | (with-array-data ((,data-var ,array) 155 | (,data-index ,index) 156 | (,data-end)) 157 | (let* ((,sap-var (int-sap (get-lisp-obj-address ,data-var))) 158 | (,elt-shift (sse-elt-shift-of ,data-var)) 159 | (,access-size (ash 16 (- ,elt-shift))) 160 | (,offset-var (+ (ash ,data-index ,elt-shift) +vector-data-fixup+))) 161 | (declare (type system-area-pointer ,sap-var) 162 | (type fixnum ,offset-var)) 163 | (unless (<= 0 ,data-index (+ ,data-index ,access-size) ,data-end) 164 | (array-bounding-indices-bad-error ,array ,index (+ ,index ,access-size))) 165 | ,@code)))))) 166 | 167 | (defun sse-array-type-info-or-give-up (type) 168 | (cond ((and (array-type-p type) 169 | (not (array-type-complexp type))) 170 | (let* ((etype (array-type-specialized-element-type type)) 171 | (shift (sse-elt-shift-from-saetp 172 | (if (eq etype *wild-type*) nil 173 | (find-saetp-by-ctype etype))))) 174 | (unless shift 175 | (give-up-ir1-transform "not a known SSE-compatible array element type: ~S" 176 | (type-specifier etype))) 177 | (values shift 178 | (and (listp (array-type-dimensions type)) 179 | (if (null (cdr (array-type-dimensions type))) :yes :no))))) 180 | ((union-type-p type) 181 | ;; Support unions of array types with the same elt size 182 | (let (nonfirst rshift rdims) 183 | (dolist (subtype (union-type-types type)) 184 | (multiple-value-bind (shift dims) 185 | (sse-array-type-info-or-give-up subtype) 186 | (unless nonfirst 187 | (setf nonfirst t 188 | rshift shift 189 | rdims dims)) 190 | (unless (= rshift shift) 191 | (give-up-ir1-transform 192 | "union member types have different element sizes")) 193 | (unless (eq rdims dims) 194 | (setf rdims nil)))) 195 | (values rshift rdims))) 196 | (t 197 | (give-up-ir1-transform "not a simple array type")))) 198 | 199 | (defun sse-array-info-or-give-up (lvar ref-size) 200 | ;; Look up the SSE element size and check if it is definitely a 201 | ;; vector 202 | (multiple-value-bind (shift dim-info) 203 | (sse-array-type-info-or-give-up (lvar-type lvar)) 204 | (values (ash 1 shift) ; step 205 | (ash (1- ref-size) (- shift)) ; gap (size of SIMD overreach) 206 | dim-info))) 207 | 208 | (defmacro def-aref-intrinsic (postfix rtype reader writer &key (ref-size 16) side-effect?) 209 | (let* ((rm-aref (symbolicate "ROW-MAJOR-AREF-" postfix)) 210 | (rm-aset (if writer (symbolicate "ROW-MAJOR-ASET-" postfix))) 211 | (aref (symbolicate "AREF-" postfix)) 212 | (aset (if writer (symbolicate "%ASET-" postfix))) 213 | (reader-vop (symbolicate "%" reader)) 214 | (reader/ix-vop (symbolicate "%" reader "/IX")) 215 | (writer-vop (if writer (symbolicate "%" writer))) 216 | (writer/ix-vop (if writer (symbolicate "%" writer "/IX"))) 217 | (rtype (or rtype '(values))) 218 | (known-flags (if side-effect? 219 | '(dx-safe) 220 | '(foldable flushable dx-safe))) 221 | (index-expression 222 | (if (= ref-size 0) 223 | ``(the signed-word index) 224 | ``(the signed-word (%check-bound array (%sse-array-size array ,gap) index))))) 225 | `(progn 226 | ;; ROW-MAJOR-AREF 227 | (export ',rm-aref) 228 | (defknown ,rm-aref (array index) ,rtype ,known-flags) 229 | (defun ,rm-aref (array index) 230 | (with-sse-data ((sap data array) 231 | (offset index)) 232 | (,reader-vop sap offset 1 0))) 233 | ;; 234 | (deftransform ,rm-aref ((array index) (simple-array t) * :important t) 235 | ,(format nil "open-code ~A" rm-aref) 236 | (multiple-value-bind (step gap is-vector) 237 | (sse-array-info-or-give-up array ,ref-size) 238 | (declare (ignorable gap)) 239 | `(,',reader/ix-vop (array-data-expr array ,is-vector) 240 | ,,index-expression 241 | ,step ,+vector-data-fixup+))) 242 | ;; AREF 243 | (export ',aref) 244 | (defknown ,aref (array &rest index) ,rtype ,known-flags) 245 | (defun ,aref (array &rest indices) 246 | (declare (truly-dynamic-extent indices)) 247 | (with-sse-data ((sap data array) 248 | (offset (%array-row-major-index array indices))) 249 | (,reader-vop sap offset 1 0))) 250 | ;; 251 | (defoptimizer (,aref derive-type) ((array &rest indices) node) 252 | (assert-array-rank array (length indices)) 253 | (values-specifier-type ',rtype)) 254 | (deftransform ,aref ((array &rest indices) (simple-array &rest t) * :important t) 255 | ,(format nil "open-code ~A" aref) 256 | (multiple-value-bind (step gap is-vector) 257 | (sse-array-info-or-give-up array ,ref-size) 258 | (declare (ignorable gap)) 259 | (let ((syms (make-gensym-list (length indices)))) 260 | `(lambda (array ,@syms) 261 | (let ((index ,(if (eq is-vector :yes) (first syms) 262 | `(array-row-major-index array ,@syms)))) 263 | (,',reader/ix-vop (array-data-expr array ,is-vector) 264 | ,,index-expression 265 | ,step ,+vector-data-fixup+)))))) 266 | ,@(if writer 267 | `(;; ROW-MAJOR-ASET 268 | (defknown ,rm-aset (array index sse-pack) ,rtype ()) 269 | (defsetf ,rm-aref ,rm-aset) 270 | (defun ,rm-aset (array index new-value) 271 | (with-sse-data ((sap data array) 272 | (offset index)) 273 | (,writer-vop sap offset 1 0 (the ,rtype new-value)) 274 | new-value)) 275 | ;; 276 | (deftransform ,rm-aset ((array index value) (simple-array t t) * :important t) 277 | ,(format nil "open-code ~A" rm-aset) 278 | (multiple-value-bind (step gap is-vector) 279 | (sse-array-info-or-give-up array ,ref-size) 280 | (declare (ignorable gap)) 281 | `(progn 282 | (,',writer/ix-vop (array-data-expr array ,is-vector) 283 | ,,index-expression 284 | ,step ,+vector-data-fixup+ 285 | (the sse-pack value)) 286 | value))) 287 | ;; %ASET 288 | (defknown ,aset (array &rest t) ,rtype ()) 289 | (defsetf ,aref ,aset) 290 | (defun ,aset (array &rest stuff) 291 | (let ((new-value (car (last stuff)))) 292 | (with-sse-data ((sap data array) 293 | (offset (%array-row-major-index array (nbutlast stuff)))) 294 | (,writer-vop sap offset 1 0 (the ,rtype new-value)) 295 | new-value))) 296 | ;; 297 | (defoptimizer (,aset derive-type) ((array &rest stuff) node) 298 | (assert-array-rank array (1- (length stuff))) 299 | (assert-lvar-type (car (last stuff)) (specifier-type 'sse-pack) 300 | (lexenv-policy (node-lexenv node))) 301 | (specifier-type ',rtype)) 302 | (deftransform ,aset ((array &rest stuff) (simple-array &rest t) * :important t) 303 | ,(format nil "open-code ~A" aset) 304 | (multiple-value-bind (step gap is-vector) 305 | (sse-array-info-or-give-up array ,ref-size) 306 | (declare (ignorable gap)) 307 | (let ((syms (make-gensym-list (length stuff)))) 308 | `(lambda (array ,@syms) 309 | (let ((index ,(if (eq is-vector :yes) (first syms) 310 | `(array-row-major-index array ,@(butlast syms))))) 311 | (,',writer/ix-vop (array-data-expr array ,is-vector) 312 | ,,index-expression 313 | ,step ,+vector-data-fixup+ 314 | (the sse-pack ,(car (last syms))))) 315 | ,(car (last syms))))))))))) 316 | 317 | -------------------------------------------------------------------------------- /sbcl-core.lisp: -------------------------------------------------------------------------------- 1 | ;;; -*- mode: Lisp; indent-tabs-mode: nil; -*- 2 | ;;; 3 | ;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com) 4 | ;;; 5 | ;;; This file contains definitions of abstract VOPs, macros 6 | ;;; and utility functions used to implement the intrinsics. 7 | ;;; 8 | 9 | (in-package #:SSE) 10 | 11 | (eval-when (:load-toplevel :compile-toplevel :execute) 12 | (define-symbol-macro +any-sse-reg+ 13 | '(int-sse-reg single-sse-reg double-sse-reg)) 14 | (define-symbol-macro +any-sse-reg/immediate+ 15 | '(int-sse-reg single-sse-reg double-sse-reg 16 | int-sse-immediate single-sse-immediate double-sse-immediate))) 17 | 18 | #|---------------------------------| 19 | | SPECIFIC PACK TYPES | 20 | |---------------------------------|# 21 | 22 | (deftype sse-pack (&optional (arg nil arg-p)) 23 | (if arg-p `(simd-pack ,arg) 'simd-pack)) 24 | 25 | (deftype int-sse-pack () '(sse-pack integer)) 26 | (deftype float-sse-pack () '(sse-pack single-float)) 27 | (deftype double-sse-pack () '(sse-pack double-float)) 28 | 29 | (declaim (inline sse-pack-p)) 30 | (defun sse-pack-p (arg) (simd-pack-p arg)) 31 | 32 | ;; Has no effect; just to stop any errors 33 | (defvar *sse-pack-print-mode* nil) 34 | 35 | #|---------------------------------| 36 | | HELPER FUNCTIONS & MACROS | 37 | |---------------------------------|# 38 | 39 | (defconstant +uint32-mask+ #xFFFFFFFF) 40 | (defconstant +uint64-mask+ #xFFFFFFFFFFFFFFFF) 41 | (defconstant +min-int32+ (- (ash 1 31))) 42 | (defconstant +max-int32+ (1- (ash 1 31))) 43 | 44 | (defun type-name-to-primitive (lt) 45 | (primitive-type-name (primitive-type (specifier-type lt)))) 46 | 47 | (defun type-name-to-reg (lt) 48 | (ecase lt 49 | (int-sse-pack 'int-sse-reg) 50 | (float-sse-pack 'single-sse-reg) 51 | (double-sse-pack 'double-sse-reg))) 52 | 53 | (defun type-name-to-tag (lt) 54 | (let ((elt (ecase lt 55 | (int-sse-pack 'integer) 56 | (float-sse-pack 'single-float) 57 | (double-sse-pack 'double-float)))) 58 | (or (position elt *simd-pack-element-types*) 59 | (error "Unknown tag for type ~A" lt)))) 60 | 61 | (defmacro %mkpack (type lo hi) 62 | `(%make-simd-pack ,(type-name-to-tag type) ,lo ,hi)) 63 | 64 | (defun move-cmd-for-type (lt) 65 | ;; Select a move instruction that matches the type name best 66 | (ecase lt 67 | (int-sse-pack 'movdqa) 68 | ((float-sse-pack double-sse-pack) 'movaps))) 69 | 70 | (defun ensure-reg-or-mem (tn) 71 | ;; Spill immediate constants to inline memory 72 | (sc-case tn 73 | ((int-sse-immediate single-sse-immediate double-sse-immediate) 74 | (register-inline-constant (tn-value tn))) 75 | ((immediate) 76 | (register-inline-constant :dword (tn-value tn))) 77 | (t tn))) 78 | 79 | (defmacro ensure-load (type tgt src) 80 | ;; Ensure src gets to tgt, possibly from memory or immediate 81 | `(unless (location= ,tgt ,src) 82 | (inst ,(move-cmd-for-type type) ,tgt (ensure-reg-or-mem ,src)))) 83 | 84 | (defmacro ensure-move (type tgt src) 85 | ;; Ensure src gets to tgt; src should be a register 86 | `(unless (location= ,tgt ,src) 87 | (inst ,(move-cmd-for-type type) ,tgt ,src))) 88 | 89 | (defmacro save-intrinsic-spec (name info) 90 | ;; Save forms in info for later processing (function generation) 91 | `(eval-when (:compile-toplevel :load-toplevel :execute) 92 | (setf (get ',name 'intrinsic-spec) ',info))) 93 | 94 | (defmacro def-splice-transform (name args &body code) 95 | "Define a transform unpacking a superposition of function calls. Args can contain nested call specs." 96 | (let* ((direct-args (mapcar (lambda (x) (if (consp x) (gensym) x)) args)) 97 | (flat-args (mapcan (lambda (x) (if (consp x) (copy-list (rest x)) (list x))) args))) 98 | `(deftransform ,name ((,@direct-args) * *) 99 | ,(format nil "Simplify combination ~A" (cons name args)) 100 | ,@(loop for spec in args and name in direct-args 101 | when (consp spec) 102 | collect `(splice-fun-args ,name ',(first spec) ,(1- (length spec)))) 103 | (list* 'lambda ',flat-args ',code)))) 104 | 105 | #|---------------------------------| 106 | | INDEX-OFFSET SPLICING | 107 | |---------------------------------|# 108 | 109 | (defun skip-casts (lvar) 110 | ;; In unchecked mode, skip cast nodes and return their argument 111 | (let ((inside (lvar-uses lvar))) 112 | (if (and (cast-p inside) 113 | (policy inside (= sb-c::type-check 0))) 114 | (skip-casts (cast-value inside)) 115 | lvar))) 116 | 117 | (defun delete-casts (lvar) 118 | ;; Delete outer cast nodes from the lvar 119 | (loop for inside = (lvar-uses lvar) 120 | while (cast-p inside) 121 | do (delete-filter inside lvar (cast-value inside)))) 122 | 123 | (defun fold-index-addressing (fun-name index scale offset &key prefix-args postfix-args) 124 | "Generic index expression folding transform; unpacks index into base + index*scale + offset." 125 | ;; Peek into the index argument: 126 | (multiple-value-bind (func index-args) 127 | (extract-fun-args (skip-casts index) '(+ - * ash) 2) 128 | ;; Found an arithmetic op in index... 129 | (destructuring-bind (x constant) index-args 130 | (declare (ignorable x)) 131 | (unless (constant-lvar-p constant) 132 | (give-up-ir1-transform)) 133 | ;; It has one constant argument... 134 | (let ((value (lvar-value constant)) 135 | (scale-value (lvar-value scale)) 136 | (offset-value (lvar-value offset))) 137 | (unless (integerp value) 138 | (give-up-ir1-transform)) 139 | ;; Compute new scale and offset constants: 140 | (multiple-value-bind (new-scale new-offset) 141 | (ecase func 142 | (+ (values scale-value (+ offset-value (* value scale-value)))) 143 | (- (values scale-value (- offset-value (* value scale-value)))) 144 | (* (values (* scale-value value) offset-value)) 145 | (ash (unless (>= value 0) 146 | (give-up-ir1-transform "negative index shift")) 147 | (values (ash scale-value value) offset-value))) 148 | ;; Verify that the constants didn't overflow: 149 | (unless (and (typep new-scale '(signed-byte 32)) 150 | (typep new-offset 'signed-word)) 151 | (give-up-ir1-transform "constant is too large for inlining")) 152 | ;; OK, actually apply the splice: 153 | (delete-casts index) 154 | (splice-fun-args index func 2) 155 | `(lambda (,@prefix-args thing index const scale offset ,@postfix-args) 156 | (declare (ignore const scale offset)) 157 | (,fun-name ,@prefix-args 158 | thing (the signed-word index) ,new-scale ,new-offset 159 | ,@postfix-args))))))) 160 | 161 | (deftransform fold-ref-index-addressing ((thing index scale offset) * * :defun-only t :node node) 162 | (fold-index-addressing (lvar-fun-name (basic-combination-fun node)) index scale offset)) 163 | 164 | (deftransform fold-xmm-ref-index-addressing ((value thing index scale offset) * * :defun-only t :node node) 165 | (fold-index-addressing (lvar-fun-name (basic-combination-fun node)) index scale offset :prefix-args '(value))) 166 | 167 | (deftransform fold-set-index-addressing ((thing index scale offset value) * * :defun-only t :node node) 168 | (fold-index-addressing (lvar-fun-name (basic-combination-fun node)) index scale offset :postfix-args '(value))) 169 | 170 | #|---------------------------------| 171 | | INDEX-OFFSET ADDRESSING | 172 | |---------------------------------|# 173 | 174 | (defun is-tagged-load-scale (value) 175 | ;; The scale factor can be adjusted for a tagged fixnum index 176 | (not (logtest value (1- (ash 1 n-fixnum-tag-bits))))) 177 | 178 | (deftype tagged-load-scale () 179 | '(and fixnum (satisfies is-tagged-load-scale))) 180 | 181 | (defun find-lea-scale (scale) 182 | ;; Split the scale into a LEA-compatible part and the rest 183 | (cond ((not (logtest scale 7)) (values (/ scale 8) 8)) 184 | ((not (logtest scale 3)) (values (/ scale 4) 4)) 185 | ((not (logtest scale 1)) (values (/ scale 2) 2)) 186 | (t (values scale 1)))) 187 | 188 | (defun reduce-offset (ioffset scale offset) 189 | "Redistribute value from ioffset to offset, while keeping offset int32." 190 | (let* ((istep (if (< ioffset 0) -1 1)) 191 | (icount (max 0 192 | (if (< ioffset 0) 193 | (- (1+ +min-int32+) ioffset) ; = (- +max-int32+) 194 | (- ioffset +max-int32+)))) 195 | (ostep (* istep scale)) 196 | (ocount (truncate (- (if (> ostep 0) +max-int32+ +min-int32+) offset) 197 | ostep)) 198 | (count (min ocount icount))) 199 | (values (- ioffset (* count istep)) 200 | (+ offset (* count ostep))))) 201 | 202 | (defun split-offset (offset scale) 203 | ;; Optimally split the offset into a scaled and unscaled part 204 | (if (typep offset '(signed-byte 32)) 205 | (values 0 offset) 206 | (multiple-value-bind (div rem) (floor offset scale) 207 | (assert (typep rem '(signed-byte 32))) 208 | (if (typep div '(signed-byte 32)) 209 | (values div rem) 210 | (reduce-offset div scale rem))))) 211 | 212 | (defun power-of-2? (scale) 213 | (and (> scale 0) (not (logtest scale (1- scale))))) 214 | 215 | (defun find-power-of-2 (scale) 216 | (assert (power-of-2? scale)) 217 | (loop for i from 0 and sv = scale then (ash sv -1) 218 | when (<= sv 1) return i)) 219 | 220 | (defun make-scaled-ea (size sap index scale offset tmp &key fixnum-index) 221 | "Returns an ea representing the given sap + index*scale + offset formula. 222 | May emit additional instructions using the temporary register." 223 | (assemble () 224 | ;; Check if the index is immediate too 225 | (if (or (sc-is index immediate) (= scale 0)) 226 | ;; Fully constant offset: 227 | (let ((value (if (= scale 0) offset 228 | (+ (* (tn-value index) scale) offset)))) 229 | (assert (typep value '(signed-byte 64))) 230 | ;; Represent the offset as an immediate, or a loaded constant 231 | (if (typep value '(signed-byte 32)) 232 | (make-ea size :base sap :disp value) 233 | (progn 234 | (inst mov tmp (register-inline-constant :qword value)) 235 | (make-ea size :base sap :index tmp)))) 236 | ;; Otherwise, indexing required 237 | (progn 238 | ;; If the index is tagged, adjust the scale factor: 239 | (when (sc-is index any-reg) 240 | (assert (and fixnum-index (is-tagged-load-scale scale))) 241 | (setf scale (ash scale (- n-fixnum-tag-bits)))) 242 | ;; Split the scale factor for LEA 243 | (multiple-value-bind (rscale outer-scale) (find-lea-scale scale) 244 | ;; One-instruction case? 245 | (if (and (= rscale 1) (typep offset '(signed-byte 32))) 246 | ;; Return an EA representing the whole computation 247 | (make-ea size :base sap :index index :scale scale :disp offset) 248 | ;; Otherwise, temporary needed; so split the offset. 249 | ;; outer-offset is guaranteed to be signed-byte 32 250 | (multiple-value-bind (roffset outer-offset) (split-offset offset outer-scale) 251 | ;; Helpers: 252 | (labels ((negate-when-<0 (register scale) 253 | (when (< scale 0) 254 | (inst neg register))) 255 | (emit-shift-mul (register scale) 256 | (inst shl register (find-power-of-2 (abs scale))) 257 | (negate-when-<0 register scale)) 258 | (try-use-lea (try-scale &optional base) 259 | ;; Try to compute tmp via one LEA 260 | (multiple-value-bind (rrscale in-scale) (find-lea-scale try-scale) 261 | (when (and (= (abs rrscale) 1) ; signed 1 262 | (typep (* rrscale roffset) '(signed-byte 32))) 263 | ;; Would work: 264 | (when (and (= roffset 0) (null base)) ; minimize outer-offset 265 | (multiple-value-setq (roffset outer-offset) (floor offset outer-scale))) 266 | (let ((xoffset (* rrscale roffset))) 267 | (inst lea tmp 268 | (if (and (= in-scale 1) (null base)) 269 | (make-ea :byte :base index :disp xoffset) 270 | (make-ea :byte :base base :index index 271 | :scale in-scale :disp xoffset)))) 272 | (negate-when-<0 tmp rrscale) 273 | :success)))) 274 | (declare (inline negate-when-<0 emit-shift-mul)) 275 | ;; Select the best way to compute the temporary: 276 | (cond 277 | ;; same register shift? 278 | ((and (= roffset 0) (location= tmp index) (power-of-2? (abs rscale))) 279 | (emit-shift-mul tmp rscale)) 280 | ;; one LEA? 281 | ((try-use-lea rscale)) 282 | ((try-use-lea (1- rscale) index)) 283 | ;; Generic case, use mul/shl and add 284 | (t 285 | (if (power-of-2? (abs rscale)) 286 | (progn 287 | (move tmp index) 288 | (emit-shift-mul tmp rscale)) 289 | (inst imul tmp index rscale)) 290 | (unless (= roffset 0) 291 | ;; Make outer-offset as small as possible 292 | (multiple-value-setq (roffset outer-offset) (floor offset outer-scale)) 293 | ;; Emit ADD for the offset 294 | (if (typep roffset '(signed-byte 32)) 295 | (inst add tmp roffset) 296 | (inst add tmp (register-inline-constant :qword roffset)))))) 297 | ;; Return the final EA definition: 298 | (make-ea size :base sap :index tmp :scale outer-scale :disp outer-offset))))))))) 299 | 300 | #|---------------------------------| 301 | | INITIALIZATION INTRINSICS | 302 | |---------------------------------|# 303 | 304 | (defmacro def-float-set-intrinsic (&whole whole pubname fname atype aregtype rtype move-insn) 305 | (declare (ignore pubname)) 306 | `(progn 307 | (save-intrinsic-spec ,fname ,whole) 308 | (defknown ,fname (,atype) ,rtype (foldable flushable dx-safe)) 309 | ;; 310 | (define-vop (,fname) 311 | (:translate ,fname) 312 | (:args (arg :scs (,aregtype) :target dst)) 313 | (:arg-types ,atype) 314 | (:results (dst :scs (,(type-name-to-reg rtype)))) 315 | (:result-types ,(type-name-to-primitive rtype)) 316 | (:policy :fast-safe) 317 | (:generator 1 318 | (unless (location= dst arg) 319 | (inst ,move-insn dst arg)))))) 320 | 321 | #|---------------------------------| 322 | | UNARY OPERATION INTRINSICS | 323 | |---------------------------------|# 324 | 325 | (define-vop (sse-unary-op) 326 | ;; no immediate because expecting to be folded 327 | (:args (x :scs #.+any-sse-reg+)) 328 | (:arg-types simd-pack) 329 | (:policy :fast-safe) 330 | (:note "inline SSE unary operation") 331 | (:vop-var vop) 332 | (:save-p :compute-only)) 333 | 334 | (defmacro def-unary-intrinsic (&whole whole 335 | name rtype insn cost c-name 336 | &key 337 | partial ; instruction modifies only part of the destination 338 | immediate-arg result-size arg-type) 339 | (declare (ignore c-name)) 340 | (let* ((imm (if immediate-arg '(imm))) 341 | (immt (if immediate-arg (list immediate-arg))) 342 | (rprimtype (type-name-to-primitive rtype)) 343 | (rregtype (cond ((subtypep rtype 'unsigned-byte) 344 | 'unsigned-reg) 345 | ((subtypep rtype 'integer) 346 | 'signed-reg) 347 | (t (type-name-to-reg rtype)))) 348 | (aregtype (type-name-to-reg (or arg-type rtype))) 349 | (target (if (eq rregtype aregtype) '(:target r) '()))) 350 | (assert (or (not partial) (not (subtypep rtype 'integer)))) 351 | `(progn 352 | (export ',name) 353 | (save-intrinsic-spec ,name ,whole) 354 | (defknown ,name (sse-pack ,@immt) ,rtype (foldable flushable dx-safe)) 355 | ;; 356 | (define-vop (,name sse-unary-op) 357 | (:translate ,name) 358 | (:args (x :scs ,+any-sse-reg+ ,@target)) 359 | ,@(if immediate-arg 360 | `((:arg-types simd-pack (:constant ,immediate-arg)) 361 | (:info imm))) 362 | (:results (r :scs (,rregtype))) 363 | (:result-types ,rprimtype) 364 | (:generator ,cost 365 | ,@(ecase partial 366 | (:one-arg `((ensure-move ,rtype r x) 367 | (inst ,insn r ,@imm))) 368 | (t `((ensure-move ,rtype r x) 369 | (inst ,insn r r ,@imm))) 370 | ((nil) `((inst ,insn 371 | ,(if result-size `(reg-in-size r ,result-size) 'r) 372 | x ,@imm))))))))) 373 | 374 | #|---------------------------------| 375 | | UNARY TO INT32 & SIGN-EXTEND | 376 | |---------------------------------|# 377 | 378 | (define-vop (sse-cvt-to-int32-op sse-unary-op) 379 | (:temporary (:sc signed-reg :offset rax-offset :target r :to :result) rax) 380 | (:results (r :scs (signed-reg)))) 381 | 382 | (defmacro def-cvt-to-int32-intrinsic (name rtype insn cost c-name &key arg-type) 383 | (declare (ignore arg-type)) 384 | `(progn 385 | (export ',name) 386 | (save-intrinsic-spec ,name (def-unary-intrinsic ,name ,rtype ,insn ,cost ,c-name)) 387 | (defknown ,name (sse-pack) (signed-byte 32) (foldable flushable dx-safe)) 388 | ;; 389 | (define-vop (,name sse-cvt-to-int32-op) 390 | (:translate ,name) 391 | (:result-types ,(type-name-to-primitive rtype)) 392 | (:generator ,cost 393 | (inst ,insn (reg-in-size rax :dword) x) 394 | (inst cdqe) 395 | (move r rax))))) 396 | 397 | #|---------------------------------| 398 | | BITWISE NOT INTRINSICS | 399 | |---------------------------------|# 400 | 401 | (define-vop (sse-not-op sse-unary-op) 402 | (:temporary (:sc sse-reg) tmp)) 403 | 404 | (defmacro def-not-intrinsic (name rtype insn) 405 | `(progn 406 | (export ',name) 407 | (save-intrinsic-spec ,name (def-unary-intrinsic ,name ,rtype ,insn 3 nil)) 408 | (defknown ,name (sse-pack) ,rtype (foldable flushable dx-safe)) 409 | ;; 410 | (define-vop (,name sse-not-op) 411 | (:translate ,name) 412 | (:args (x :scs ,+any-sse-reg+ :target r)) 413 | (:results (r :scs (,(type-name-to-reg rtype)))) 414 | (:result-types ,(type-name-to-primitive rtype)) 415 | (:generator 3 416 | (if (location= x r) 417 | (progn 418 | (inst pcmpeqd tmp tmp) 419 | (inst ,insn r tmp)) 420 | (progn 421 | (inst pcmpeqd r r) 422 | (inst ,insn r x))))))) 423 | 424 | #|---------------------------------| 425 | | BINARY OPERATION INTRINSICS | 426 | |---------------------------------|# 427 | 428 | (define-vop (sse-binary-base-op) 429 | (:args (x :scs #.+any-sse-reg/immediate+ :target r) 430 | (y :scs #.+any-sse-reg/immediate+)) 431 | (:results (r :scs (int-sse-reg))) 432 | (:arg-types simd-pack simd-pack) 433 | (:policy :fast-safe) 434 | (:note "inline SSE binary operation") 435 | (:vop-var vop) 436 | (:save-p :compute-only)) 437 | 438 | (define-vop (sse-binary-op sse-binary-base-op) 439 | (:temporary (:sc sse-reg) tmp)) 440 | 441 | (define-vop (sse-binary-comm-op sse-binary-base-op) 442 | (:args (x :scs #.+any-sse-reg/immediate+ :target r) 443 | (y :scs #.+any-sse-reg/immediate+ :target r))) 444 | 445 | (defmacro def-binary-intrinsic (&whole whole 446 | name rtype insn cost c-name 447 | &key commutative tags immediate-arg x-type y-type) 448 | (declare (ignore c-name x-type y-type)) 449 | (let* ((imm (if immediate-arg '(imm))) 450 | (immt (if immediate-arg (list immediate-arg)))) 451 | `(progn 452 | (export ',name) 453 | (save-intrinsic-spec ,name ,whole) 454 | (defknown ,name (sse-pack sse-pack ,@immt) ,rtype (foldable flushable dx-safe)) 455 | ;; 456 | (define-vop (,name ,(if commutative 'sse-binary-comm-op 'sse-binary-op)) 457 | (:translate ,name) 458 | (:results (r :scs (,(type-name-to-reg rtype)))) 459 | (:result-types ,(type-name-to-primitive rtype)) 460 | ,@(if immediate-arg 461 | `((:arg-types simd-pack simd-pack (:constant ,immediate-arg)) 462 | (:info imm))) 463 | (:generator ,cost 464 | ,@(if commutative 465 | `((when (location= y r) 466 | (rotatef x y)) 467 | (ensure-load ,rtype r x) 468 | (inst ,insn ,@tags r (ensure-reg-or-mem y) ,@imm)) 469 | ;; Noncommutative may require usage of a temporary: 470 | `((unless (location= y r) 471 | (setf tmp r)) 472 | (ensure-load ,rtype tmp x) 473 | (inst ,insn ,@tags tmp (ensure-reg-or-mem y) ,@imm) 474 | (ensure-move ,rtype r tmp)))))))) 475 | 476 | #|---------------------------------| 477 | | XMM/INTEGER BINARY INTRINSICS | 478 | |---------------------------------|# 479 | 480 | (define-vop (sse-int-base-op) 481 | (:results (r :scs (int-sse-reg))) 482 | (:policy :fast-safe) 483 | (:note "inline SSE/integer operation") 484 | (:vop-var vop) 485 | (:save-p :compute-only)) 486 | 487 | (define-vop (sse-int-op sse-int-base-op) 488 | (:args (x :scs #.+any-sse-reg/immediate+ :target r) 489 | (iv :scs (signed-reg signed-stack immediate))) 490 | (:arg-types simd-pack signed-num)) 491 | 492 | (define-vop (sse-uint-op sse-int-base-op) 493 | (:args (x :scs #.+any-sse-reg/immediate+ :target r) 494 | (iv :scs (unsigned-reg unsigned-stack immediate))) 495 | (:arg-types simd-pack unsigned-num)) 496 | 497 | (defmacro def-sse-int-intrinsic (&whole whole 498 | name itype rtype insn cost c-name 499 | &key make-temporary immediate-arg defun-body) 500 | (declare (ignore c-name defun-body)) 501 | (let* ((imm (if immediate-arg '(imm))) 502 | (immt (if immediate-arg (list immediate-arg))) 503 | (unsigned? (subtypep itype 'unsigned-byte))) 504 | `(progn 505 | (export ',name) 506 | (save-intrinsic-spec ,name ,whole) 507 | (defknown ,name (sse-pack ,itype ,@immt) ,rtype (foldable flushable dx-safe)) 508 | ;; 509 | (define-vop (,name ,(if unsigned? 'sse-uint-op 'sse-int-op)) 510 | (:translate ,name) 511 | (:results (r :scs (,(type-name-to-reg rtype)))) 512 | (:result-types ,(type-name-to-primitive rtype)) 513 | ,@(if immediate-arg 514 | `((:arg-types simd-pack 515 | ,(if unsigned? 'unsigned-num 'signed-num) 516 | (:constant ,immediate-arg)) 517 | (:info imm))) 518 | ,@(if make-temporary 519 | `((:temporary (:sc sse-reg) tmp))) 520 | (:generator ,cost 521 | (ensure-load ,rtype r x) 522 | ,@(if (eq make-temporary t) 523 | '((inst movd tmp (ensure-reg-or-mem iv))) 524 | make-temporary) 525 | (inst ,insn r ,(if make-temporary 'tmp '(ensure-reg-or-mem iv)) ,@imm)))))) 526 | 527 | #|---------------------------------| 528 | | COMPARISON PREDICATE INTRINSICS | 529 | |---------------------------------|# 530 | 531 | (define-vop (sse-comparison-op) 532 | (:args (x :scs #.+any-sse-reg+) 533 | (y :scs #.+any-sse-reg/immediate+)) 534 | (:arg-types simd-pack simd-pack) 535 | (:policy :fast-safe) 536 | (:note "inline SSE binary comparison predicate") 537 | (:vop-var vop) 538 | (:save-p :compute-only)) 539 | 540 | (define-vop (sse-comparison-comm-op sse-comparison-op) 541 | (:args (x :scs #.+any-sse-reg+ 542 | :load-if (not (and (sc-is x int-sse-immediate single-sse-immediate double-sse-immediate) 543 | (sc-is y int-sse-reg single-sse-reg double-sse-reg)))) 544 | (y :scs #.+any-sse-reg/immediate+))) 545 | 546 | (defmacro def-comparison-intrinsic (&whole whole 547 | name arg-type insn cost c-name 548 | &key commutative tags) 549 | (declare (ignore arg-type c-name)) 550 | (let* () 551 | `(progn 552 | (export ',name) 553 | (save-intrinsic-spec ,name ,whole) 554 | (defknown ,name (sse-pack sse-pack) boolean (foldable flushable dx-safe)) 555 | (define-vop (,name ,(if commutative 'sse-comparison-comm-op 'sse-comparison-op)) 556 | (:translate ,name) 557 | (:conditional ,@tags) 558 | (:generator ,cost 559 | ,(if commutative 560 | `(if (sc-is x int-sse-reg single-sse-reg double-sse-reg) 561 | (inst ,insn x y) 562 | (inst ,insn y x)) 563 | `(inst ,insn x y))))))) 564 | 565 | #|---------------------------------| 566 | | MEMORY LOAD INTRINSICS | 567 | |---------------------------------|# 568 | 569 | (define-vop (sse-load-base-op) 570 | (:results (r :scs (int-sse-reg))) 571 | (:policy :fast-safe) 572 | (:note "inline SSE load operation")) 573 | 574 | (define-vop (sse-load-op sse-load-base-op) 575 | (:args (sap :scs (sap-reg) :to :eval) 576 | (index :scs (signed-reg immediate) :target tmp)) 577 | (:arg-types system-area-pointer signed-num 578 | (:constant fixnum) (:constant signed-word)) 579 | (:temporary (:sc signed-reg :from (:argument 1)) tmp) 580 | (:info scale offset)) 581 | 582 | (define-vop (sse-load-op/tag sse-load-base-op) 583 | (:args (sap :scs (sap-reg) :to :eval) 584 | (index :scs (any-reg signed-reg immediate) :target tmp)) 585 | (:arg-types system-area-pointer tagged-num 586 | (:constant tagged-load-scale) (:constant signed-word)) 587 | (:temporary (:sc any-reg :from (:argument 1)) tmp) 588 | (:info scale offset)) 589 | 590 | (define-vop (sse-xmm-load-op sse-load-base-op) 591 | (:args (value :scs #.+any-sse-reg/immediate+ :target r) 592 | (sap :scs (sap-reg) :to :eval) 593 | (index :scs (signed-reg immediate) :target tmp)) 594 | (:arg-types simd-pack system-area-pointer signed-num 595 | (:constant fixnum) (:constant signed-word)) 596 | (:temporary (:sc signed-reg :from (:argument 2)) tmp) 597 | (:info scale offset)) 598 | 599 | (define-vop (sse-xmm-load-op/tag sse-load-base-op) 600 | (:args (value :scs #.+any-sse-reg/immediate+ :target r) 601 | (sap :scs (sap-reg) :to :eval) 602 | (index :scs (any-reg signed-reg immediate) :target tmp)) 603 | (:arg-types simd-pack system-area-pointer tagged-num 604 | (:constant tagged-load-scale) (:constant signed-word)) 605 | (:temporary (:sc any-reg :from (:argument 2)) tmp) 606 | (:info scale offset)) 607 | 608 | (define-vop (sse-load-ix-op sse-load-base-op) 609 | (:args (sap :scs (descriptor-reg) :to :eval) 610 | (index :scs (signed-reg immediate) :target tmp)) 611 | (:arg-types * signed-num 612 | (:constant fixnum) (:constant signed-word)) 613 | (:temporary (:sc signed-reg :from (:argument 1)) tmp) 614 | (:info scale offset)) 615 | 616 | (define-vop (sse-load-ix-op/tag sse-load-base-op) 617 | (:args (sap :scs (descriptor-reg) :to :eval) 618 | (index :scs (any-reg signed-reg immediate) :target tmp)) 619 | (:arg-types * tagged-num 620 | (:constant tagged-load-scale) (:constant signed-word)) 621 | (:temporary (:sc any-reg :from (:argument 1)) tmp) 622 | (:info scale offset)) 623 | 624 | (defmacro def-load-intrinsic (&whole whole 625 | name rtype insn c-name 626 | &key register-arg tags postfix-fmt (size :qword) side-effect?) 627 | (declare (ignore c-name postfix-fmt)) 628 | (let* ((vop (symbolicate "%" name)) 629 | (ix-vop (symbolicate vop "/IX")) 630 | (valtype (if register-arg '(sse-pack))) 631 | (r-arg (if rtype '(r))) 632 | (rtypes (if rtype 633 | `((:results (r :scs (,(type-name-to-reg rtype)))) 634 | (:result-types ,(type-name-to-primitive rtype))) 635 | `((:results)))) 636 | (known-flags (if side-effect? 637 | '(always-translatable dx-safe) 638 | '(flushable always-translatable dx-safe)))) 639 | (assert (or rtype (not register-arg))) 640 | `(progn 641 | (export ',name) 642 | (save-intrinsic-spec ,name ,whole) 643 | (defknown ,vop (,@valtype system-area-pointer signed-word fixnum signed-word) 644 | ,(or rtype '(values)) ,known-flags) 645 | ;; 646 | (define-vop (,vop ,(if register-arg 'sse-xmm-load-op 'sse-load-op)) 647 | (:translate ,vop) 648 | ,@rtypes 649 | (:generator 5 650 | ,(if register-arg `(ensure-load ,rtype r value)) 651 | (inst ,insn ,@tags ,@r-arg (make-scaled-ea ,size sap index scale offset tmp)))) 652 | (define-vop (,(symbolicate vop "/TAG") ,(if register-arg 'sse-xmm-load-op/tag 'sse-load-op/tag)) 653 | (:translate ,vop) 654 | ,@rtypes 655 | (:generator 4 656 | ,(if register-arg `(ensure-load ,rtype r value)) 657 | (inst ,insn ,@tags ,@r-arg (make-scaled-ea ,size sap index scale offset tmp :fixnum-index t)))) 658 | ;; 659 | (%deftransform ',vop '(function * *) 660 | #',(if register-arg 'fold-xmm-ref-index-addressing 'fold-ref-index-addressing) 661 | "fold semi-constant offset expressions") 662 | ;; If the operation doesn't have a separate XMM argument: 663 | ,@(if (null register-arg) 664 | `(;; Lisp vector indexing version 665 | (defknown ,ix-vop (simple-array signed-word fixnum signed-word) ,(or rtype '(values)) 666 | ,known-flags) 667 | ;; 668 | (define-vop (,ix-vop sse-load-ix-op) 669 | (:translate ,ix-vop) 670 | ,@rtypes 671 | (:generator 4 672 | (inst ,insn ,@tags ,@r-arg (make-scaled-ea ,size sap index scale offset tmp)))) 673 | (define-vop (,(symbolicate ix-vop "/TAG") sse-load-ix-op/tag) 674 | (:translate ,ix-vop) 675 | ,@rtypes 676 | (:generator 3 677 | (inst ,insn ,@tags ,@r-arg (make-scaled-ea ,size sap index scale offset tmp :fixnum-index t)))) 678 | ;; 679 | (%deftransform ',ix-vop '(function * *) #'fold-ref-index-addressing 680 | "fold semi-constant index expressions")))))) 681 | 682 | #|---------------------------------| 683 | | MEMORY STORE INTRINSICS | 684 | |---------------------------------|# 685 | 686 | (define-vop (sse-store-base-op) 687 | (:policy :fast-safe) 688 | (:note "inline SSE store operation")) 689 | 690 | (define-vop (sse-store-op sse-store-base-op) 691 | (:args (sap :scs (sap-reg) :to :eval) 692 | (index :scs (signed-reg immediate) :target tmp) 693 | (value :scs #.+any-sse-reg+)) 694 | (:arg-types system-area-pointer signed-num 695 | (:constant fixnum) (:constant signed-word) simd-pack) 696 | (:temporary (:sc signed-reg :from (:argument 1)) tmp) 697 | (:info scale offset)) 698 | 699 | (define-vop (sse-store-op/tag sse-store-base-op) 700 | (:args (sap :scs (sap-reg) :to :eval) 701 | (index :scs (any-reg signed-reg immediate) :target tmp) 702 | (value :scs #.+any-sse-reg+)) 703 | (:arg-types system-area-pointer tagged-num 704 | (:constant tagged-load-scale) (:constant signed-word) simd-pack) 705 | (:temporary (:sc any-reg :from (:argument 1)) tmp) 706 | (:info scale offset)) 707 | 708 | (define-vop (sse-store-ix-op sse-store-base-op) 709 | (:args (sap :scs (descriptor-reg) :to :eval) 710 | (index :scs (signed-reg immediate) :target tmp) 711 | (value :scs #.+any-sse-reg+)) 712 | (:arg-types * signed-num 713 | (:constant fixnum) (:constant signed-word) simd-pack) 714 | (:temporary (:sc signed-reg :from (:argument 1)) tmp) 715 | (:info scale offset)) 716 | 717 | (define-vop (sse-store-ix-op/tag sse-store-base-op) 718 | (:args (sap :scs (descriptor-reg) :to :eval) 719 | (index :scs (any-reg signed-reg immediate) :target tmp) 720 | (value :scs #.+any-sse-reg+)) 721 | (:arg-types * tagged-num 722 | (:constant tagged-load-scale) (:constant signed-word) simd-pack) 723 | (:temporary (:sc any-reg :from (:argument 1)) tmp) 724 | (:info scale offset)) 725 | 726 | (defmacro def-store-intrinsic (&whole whole 727 | name rtype insn c-name 728 | &key setf-name) 729 | (declare (ignore rtype c-name)) 730 | (let* ((vop (symbolicate "%" name)) 731 | (ix-vop (symbolicate vop "/IX"))) 732 | `(progn 733 | ,(unless setf-name `(export ',name)) 734 | (save-intrinsic-spec ,name ,whole) 735 | (defknown ,vop (system-area-pointer signed-word fixnum signed-word sse-pack) (values) 736 | (always-translatable)) 737 | ;; 738 | (define-vop (,vop sse-store-op) 739 | (:translate ,vop) 740 | (:generator 5 741 | (inst ,insn (make-scaled-ea :qword sap index scale offset tmp) value))) 742 | (define-vop (,(symbolicate vop "/TAG") sse-store-op/tag) 743 | (:translate ,vop) 744 | (:generator 4 745 | (inst ,insn (make-scaled-ea :qword sap index scale offset tmp :fixnum-index t) value))) 746 | ;; 747 | (%deftransform ',vop '(function * *) #'fold-set-index-addressing 748 | "fold semi-constant offset expressions") 749 | ;; 750 | ;; Lisp vector indexing version 751 | (defknown ,ix-vop (simple-array signed-word fixnum signed-word sse-pack) (values) 752 | (always-translatable)) 753 | ;; 754 | (define-vop (,ix-vop sse-store-ix-op) 755 | (:translate ,ix-vop) 756 | (:generator 4 757 | (inst ,insn (make-scaled-ea :qword sap index scale offset tmp) value))) 758 | (define-vop (,(symbolicate ix-vop "/TAG") sse-store-ix-op/tag) 759 | (:translate ,ix-vop) 760 | (:generator 3 761 | (inst ,insn (make-scaled-ea :qword sap index scale offset tmp :fixnum-index t) value))) 762 | ;; 763 | (%deftransform ',ix-vop '(function * *) #'fold-set-index-addressing 764 | "fold semi-constant index expressions")))) 765 | 766 | -------------------------------------------------------------------------------- /sbcl-functions.lisp: -------------------------------------------------------------------------------- 1 | ;;; -*- mode: Lisp; indent-tabs-mode: nil; -*- 2 | ;;; 3 | ;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com) 4 | ;;; 5 | ;;; This file implements VOP-wrapping functions and non-primitive 6 | ;;; extensions to the core intrinsic set. 7 | ;;; 8 | 9 | (in-package #:SSE) 10 | 11 | ;;; Materialize the intrinsic functions. 12 | 13 | ;; Since VOPs are activated only on load, actual functions that 14 | ;; wrap them have to be defined in a different file. This is a 15 | ;; hack to generate the functions from the same macro invocations 16 | ;; as the VOPS. 17 | 18 | (macrolet ((def-float-set-intrinsic (pubname fname atype aregtype rtype move) 19 | (declare (ignore aregtype move)) 20 | `(progn 21 | (defun ,fname (arg) 22 | (declare (type ,atype arg)) 23 | (truly-the ,rtype (%primitive ,fname arg))) 24 | ;; Public function - includes coercion 25 | (export ',pubname) 26 | (declaim (ftype (function (real) ,rtype) ,pubname) 27 | (inline ,pubname)) 28 | (defun ,pubname (arg) (,fname (coerce arg ',atype))))) 29 | ;; 30 | (def-unary-intrinsic (name rtype insn cost c-name &key immediate-arg &allow-other-keys) 31 | (declare (ignore insn cost c-name)) 32 | (unless immediate-arg 33 | `(defun ,name (x) 34 | (declare (type sse-pack x)) 35 | (truly-the ,rtype (%primitive ,name x))))) 36 | ;; 37 | (def-binary-intrinsic (name rtype insn cost c-name &key immediate-arg &allow-other-keys) 38 | (declare (ignore insn cost c-name)) 39 | (unless immediate-arg 40 | `(defun ,name (x y ,@(if immediate-arg '(imm))) 41 | (declare (type sse-pack x y)) 42 | (truly-the ,rtype (%primitive ,name x y))))) 43 | ;; 44 | (def-sse-int-intrinsic (name itype rtype insn cost c-name &key immediate-arg &allow-other-keys) 45 | (declare (ignore insn cost c-name)) 46 | (unless immediate-arg 47 | `(defun ,name (x iv) 48 | (declare (type sse-pack x) 49 | (type ,itype iv)) 50 | (truly-the ,rtype (%primitive ,name x iv))))) 51 | ;; 52 | (def-comparison-intrinsic (name arg-type insn cost c-name &key &allow-other-keys) 53 | (declare (ignore insn cost c-name arg-type)) 54 | `(defun ,name (x y) 55 | (declare (type sse-pack x y)) 56 | (truly-the boolean (,name x y)))) 57 | ;; 58 | (def-load-intrinsic (name rtype insn c-name &key register-arg &allow-other-keys) 59 | (declare (ignore insn c-name)) 60 | (let* ((vop (symbolicate "%" name)) 61 | (valarg (if register-arg '(value)))) 62 | `(progn 63 | (declaim (inline ,name)) 64 | (defun ,name (,@valarg pointer &optional (offset 0)) 65 | (declare ,@(if register-arg '((type sse-pack value))) 66 | (type system-area-pointer pointer) 67 | (type signed-word offset)) 68 | ,(if rtype 69 | `(truly-the ,rtype (,vop ,@valarg pointer offset 1 0)) 70 | `(,vop ,@valarg pointer offset 1 0)))))) 71 | ;; 72 | (def-store-intrinsic (name rtype insn c-name &key setf-name &allow-other-keys) 73 | (declare (ignore insn c-name)) 74 | (let* ((vop (symbolicate "%" name))) 75 | `(progn 76 | (declaim (inline ,name)) 77 | (defun ,name (pointer value &optional (offset 0)) 78 | (declare (type system-area-pointer pointer) 79 | (type sse-pack value) 80 | (type signed-word offset)) 81 | (,vop pointer offset 1 0 value) 82 | (truly-the ,rtype value)) 83 | ,(if setf-name 84 | `(defsetf ,setf-name (pointer &optional (offset 0)) (value) 85 | `(,',name ,pointer ,value ,offset))))))) 86 | ;; Load the definition list 87 | #.(loop for name being each present-symbol 88 | when (get name 'intrinsic-spec) 89 | collect it into specs 90 | finally (return `(progn ,@specs)))) 91 | 92 | #|---------------------------------| 93 | | HELPER FUNCTIONS & MACROS | 94 | |---------------------------------|# 95 | 96 | (defmacro def-utility (name args rtype &body code) 97 | `(progn 98 | (export ',name) 99 | (declaim (ftype (function ,(mapcar (constantly 'sse-pack) args) ,rtype) ,name) 100 | (inline ,name)) 101 | (defun ,name ,args ,@code))) 102 | 103 | (defmacro def-if-function (name rtype postfix) 104 | (let* ((not-x (symbolicate "NOT-" postfix)) 105 | (or-x (symbolicate "OR-" postfix)) 106 | (and-x (symbolicate "AND-" postfix)) 107 | (andn-x (symbolicate "ANDNOT-" postfix)) 108 | (xor-x (symbolicate "XOR-" postfix)) 109 | (tag (type-name-to-tag rtype)) 110 | (true (%make-simd-pack tag #xFFFFFFFFFFFFFFFF #xFFFFFFFFFFFFFFFF)) 111 | (false (%make-simd-pack tag 0 0))) 112 | `(progn 113 | (export ',name) 114 | (defknown ,name (sse-pack sse-pack sse-pack) ,rtype (foldable flushable)) 115 | (defun ,name (condition true-val false-val) 116 | (,or-x (,and-x condition true-val) 117 | (,andn-x condition false-val))) 118 | ;; Instead of inlining, use a transform so that the splice 119 | ;; rule has a chance to apply. This depends on transform 120 | ;; definitions behaving like a LIFO: 121 | (deftransform ,name ((condition true-val false-val) * *) 122 | "Expand the conditional." 123 | '(,or-x (,and-x condition true-val) (,andn-x condition false-val))) 124 | (def-splice-transform ,name ((,not-x cond) tv fv) (,name cond fv tv)) 125 | ;; NOT elimination and partial constant folding for bitwise ops: 126 | (def-splice-transform ,not-x ((,not-x arg1)) arg1) 127 | (def-splice-transform ,and-x (arg1 (,not-x arg2)) (,andn-x arg2 arg1)) 128 | (def-splice-transform ,and-x ((,not-x arg1) arg2) (,andn-x arg1 arg2)) 129 | (def-splice-transform ,andn-x ((,not-x arg1) arg2) (,and-x arg1 arg2)) 130 | (%deftransform ',or-x '(function * *) #'commutative-arg-swap "place constant arg last") 131 | (%deftransform ',and-x '(function * *) #'commutative-arg-swap "place constant arg last") 132 | (%deftransform ',xor-x '(function * *) #'commutative-arg-swap "place constant arg last") 133 | (deftransform ,or-x ((arg1 arg2) (* (constant-arg (member ,true))) *) ,true) 134 | (deftransform ,or-x ((arg1 arg2) (* (constant-arg (member ,false))) *) 'arg1) 135 | (deftransform ,and-x ((arg1 arg2) (* (constant-arg (member ,true))) *) 'arg1) 136 | (deftransform ,and-x ((arg1 arg2) (* (constant-arg (member ,false))) *) ,false) 137 | (deftransform ,xor-x ((arg1 arg2) (* (constant-arg (member ,false))) *) 'arg1) 138 | (deftransform ,andn-x ((arg1 arg2) (* (constant-arg (member ,true))) *) 'arg1) 139 | (deftransform ,andn-x ((arg1 arg2) (* (constant-arg (member ,false))) *) ,false) 140 | (deftransform ,andn-x ((arg1 arg2) ((constant-arg (member ,true)) *) *) ,false) 141 | (deftransform ,andn-x ((arg1 arg2) ((constant-arg (member ,false)) *) *) 'arg2)))) 142 | 143 | (defmacro def-not-cmp-pairs (not-fun &rest pairs) 144 | `(progn 145 | ,@(loop for (a b) on pairs by #'cddr 146 | collect `(def-splice-transform ,not-fun ((,a arg1 arg2)) (,b arg1 arg2)) 147 | collect `(def-splice-transform ,not-fun ((,b arg1 arg2)) (,a arg1 arg2))))) 148 | 149 | #|---------------------------------| 150 | | CPU CONTROL | 151 | |---------------------------------|# 152 | 153 | (defun cpu-mxcsr () 154 | (cpu-mxcsr)) 155 | 156 | (defun %set-cpu-mxcsr (x) 157 | (declare (type (unsigned-byte 32) x)) 158 | (%set-cpu-mxcsr x)) 159 | 160 | (defsetf cpu-mxcsr %set-cpu-mxcsr) 161 | 162 | (defun cpu-load-fence () (cpu-load-fence)) 163 | (defun cpu-store-fence () (cpu-store-fence)) 164 | (defun cpu-memory-fence () (cpu-memory-fence)) 165 | 166 | (defun cpu-pause () (cpu-pause)) 167 | 168 | #|---------------------------------| 169 | | SINGLE-FLOAT SUPPORT | 170 | |---------------------------------|# 171 | 172 | ;; Constants 173 | 174 | (define-symbol-macro 0.0-ps (truly-the float-sse-pack #.(%mkpack float-sse-pack 0 0))) 175 | 176 | (define-symbol-macro true-ss (truly-the float-sse-pack #.(%mkpack float-sse-pack #xFFFFFFFF 0))) 177 | (define-symbol-macro true-ps (truly-the float-sse-pack #.(%mkpack float-sse-pack #xFFFFFFFFFFFFFFFF #xFFFFFFFFFFFFFFFF))) 178 | 179 | (define-symbol-macro false-ss (truly-the float-sse-pack #.(%mkpack float-sse-pack 0 0))) 180 | (define-symbol-macro false-ps (truly-the float-sse-pack #.(%mkpack float-sse-pack 0 0))) 181 | 182 | ;; Initialization 183 | 184 | (declaim (inline set1-ps set-ps setr-ps setzero-ps)) 185 | 186 | (defun set1-ps (val) 187 | (let ((valv (set-ss val))) 188 | (shuffle-ps valv valv 0))) 189 | 190 | (defun set-ps (x3 x2 x1 x0) 191 | (movelh-ps (unpacklo-ps (set-ss x0) (set-ss x1)) 192 | (unpacklo-ps (set-ss x2) (set-ss x3)))) 193 | 194 | (defun setr-ps (x0 x1 x2 x3) 195 | (movelh-ps (unpacklo-ps (set-ss x0) (set-ss x1)) 196 | (unpacklo-ps (set-ss x2) (set-ss x3)))) 197 | 198 | (defun setzero-ps () 0.0-ps) 199 | 200 | ;; Arithmetic negation 201 | 202 | (def-utility neg-ss (arg) float-sse-pack 203 | (xor-ps arg #.(%mkpack float-sse-pack #x80000000 0))) 204 | 205 | (def-utility neg-ps (arg) float-sse-pack 206 | (xor-ps arg #.(%mkpack float-sse-pack #x8000000080000000 #x8000000080000000))) 207 | 208 | ;; Bitwise operations 209 | 210 | (def-if-function if-ps float-sse-pack #:ps) 211 | 212 | ;; Comparisons 213 | 214 | (def-utility >-ss (x y) float-sse-pack (<-ss y x)) 215 | (def-utility >-ps (x y) float-sse-pack (<-ps y x)) 216 | (def-utility >=-ss (x y) float-sse-pack (<=-ss y x)) 217 | (def-utility >=-ps (x y) float-sse-pack (<=-ps y x)) 218 | (def-utility />-ss (x y) float-sse-pack (/<-ss y x)) 219 | (def-utility />-ps (x y) float-sse-pack (/<-ps y x)) 220 | (def-utility />=-ss (x y) float-sse-pack (/<=-ss y x)) 221 | (def-utility />=-ps (x y) float-sse-pack (/<=-ps y x)) 222 | 223 | (def-not-cmp-pairs not-ps 224 | =-ps /=-ps <-ps /<-ps <=-ps /<=-ps >-ps />-ps >=-ps />=-ps cmpord-ps cmpunord-ps) 225 | 226 | ;; Shuffle 227 | 228 | (declaim (inline %sse-pack-to-int %shuffle-subints)) 229 | 230 | (defun %sse-pack-to-int (pack) 231 | (logior (%simd-pack-low pack) (ash (%simd-pack-high pack) 64))) 232 | 233 | (defmacro %int-to-sse-pack (type val) 234 | `(let ((val ,val) 235 | (mask #xFFFFFFFFFFFFFFFF)) 236 | (truly-the ,type (%mkpack ,type (logand val mask) (logand (ash val -64) mask))))) 237 | 238 | (defun %shuffle-subints (xval yval imm bit-cnt &aux (mask (1- (ash 1 bit-cnt)))) 239 | (flet ((bits (idx) 240 | (logand 3 (ash imm (* -2 idx)))) 241 | (val (src idx) 242 | (logand mask (ash src (* (- bit-cnt) idx))))) 243 | (logior (val xval (bits 0)) 244 | (ash (val xval (bits 1)) bit-cnt) 245 | (ash (val yval (bits 2)) (* 2 bit-cnt)) 246 | (ash (val yval (bits 3)) (* 3 bit-cnt))))) 247 | 248 | (defun shuffle-ps (x y imm) 249 | (declare (type sse-pack x y)) 250 | (let* ((xval (%sse-pack-to-int x)) 251 | (yval (%sse-pack-to-int y))) 252 | (%int-to-sse-pack float-sse-pack (%shuffle-subints xval yval imm 32)))) 253 | 254 | #|---------------------------------| 255 | | DOUBLE-FLOAT SUPPORT | 256 | |---------------------------------|# 257 | 258 | ;; Constants 259 | 260 | (define-symbol-macro 0.0-pd (truly-the double-sse-pack #.(%mkpack double-sse-pack 0 0))) 261 | 262 | (define-symbol-macro true-sd (truly-the double-sse-pack #.(%mkpack double-sse-pack #xFFFFFFFFFFFFFFFF 0))) 263 | (define-symbol-macro true-pd (truly-the double-sse-pack #.(%mkpack double-sse-pack #xFFFFFFFFFFFFFFFF #xFFFFFFFFFFFFFFFF))) 264 | 265 | (define-symbol-macro false-sd (truly-the double-sse-pack #.(%mkpack double-sse-pack 0 0))) 266 | (define-symbol-macro false-pd (truly-the double-sse-pack #.(%mkpack double-sse-pack 0 0))) 267 | 268 | ;; Initialization 269 | 270 | (declaim (inline set1-pd set-pd setr-pd setzero-pd)) 271 | 272 | (defun set1-pd (val) 273 | (let ((valv (set-sd val))) 274 | (shuffle-pd valv valv 0))) 275 | 276 | (defun set-pd (x1 x0) 277 | (unpacklo-pd (set-sd x0) (set-sd x1))) 278 | 279 | (defun setr-pd (x0 x1) 280 | (unpacklo-pd (set-sd x0) (set-sd x1))) 281 | 282 | (defun setzero-pd () 0.0-pd) 283 | 284 | ;; Arithmetic negation 285 | 286 | (def-utility neg-sd (arg) double-sse-pack 287 | (xor-pd arg #.(%mkpack double-sse-pack #x8000000000000000 0))) 288 | 289 | (def-utility neg-pd (arg) double-sse-pack 290 | (xor-pd arg #.(%mkpack double-sse-pack #x8000000000000000 #x8000000000000000))) 291 | 292 | ;; Bitwise operations 293 | 294 | (def-if-function if-pd double-sse-pack #:pd) 295 | 296 | ;; Comparisons 297 | 298 | (def-utility >-sd (x y) double-sse-pack (<-sd y x)) 299 | (def-utility >-pd (x y) double-sse-pack (<-pd y x)) 300 | (def-utility >=-sd (x y) double-sse-pack (<=-sd y x)) 301 | (def-utility >=-pd (x y) double-sse-pack (<=-pd y x)) 302 | (def-utility />-sd (x y) double-sse-pack (/<-sd y x)) 303 | (def-utility />-pd (x y) double-sse-pack (/<-pd y x)) 304 | (def-utility />=-sd (x y) double-sse-pack (/<=-sd y x)) 305 | (def-utility />=-pd (x y) double-sse-pack (/<=-pd y x)) 306 | 307 | (def-not-cmp-pairs not-pd 308 | =-pd /=-pd <-pd /<-pd <=-pd /<=-pd >-pd />-pd >=-pd />=-pd cmpord-pd cmpunord-pd) 309 | 310 | ;; Shuffle 311 | 312 | (defun shuffle-pd (x y imm) 313 | (declare (type sse-pack x y)) 314 | (truly-the double-sse-pack 315 | (%mkpack double-sse-pack 316 | (if (logtest imm 1) (%simd-pack-high x) (%simd-pack-low x)) 317 | (if (logtest imm 2) (%simd-pack-high y) (%simd-pack-low y))))) 318 | 319 | #|---------------------------------| 320 | | INTEGER SUPPORT | 321 | |---------------------------------|# 322 | 323 | ;; Constants 324 | 325 | (define-symbol-macro 0-pi (truly-the int-sse-pack #.(%mkpack int-sse-pack 0 0))) 326 | 327 | (define-symbol-macro true-pi (truly-the int-sse-pack #.(%mkpack int-sse-pack #xFFFFFFFFFFFFFFFF #xFFFFFFFFFFFFFFFF))) 328 | 329 | (define-symbol-macro false-pi (truly-the int-sse-pack #.(%mkpack int-sse-pack 0 0))) 330 | 331 | ;; Initialization 332 | 333 | (macrolet ((defset (name type) 334 | `(defun ,name (x) 335 | (declare (type ,type x)) 336 | (,name x)))) 337 | (defset %set-int (signed-byte 64)) 338 | (defset %set-uint (unsigned-byte 64)) 339 | (defset convert-si32-to-pi (signed-byte 32)) 340 | (defset convert-su32-to-pi (unsigned-byte 32)) 341 | (defset convert-si64-to-pi (signed-byte 64)) 342 | (defset convert-su64-to-pi (unsigned-byte 64))) 343 | 344 | (macrolet ((defset1 (name setter type shuffle &rest expands) 345 | `(progn 346 | (export ',name) 347 | (declaim (inline ,name)) 348 | (defun ,name (arg) 349 | (let ((val (,setter (the ,type arg)))) 350 | (declare (type int-sse-pack val)) 351 | ,@(loop for x in expands collect `(setq val (,x val val))) 352 | (shuffle-pi32 val ,shuffle)))))) 353 | (defset1 set1-pi8 %set-int fixnum #4r0000 unpacklo-pi8 unpacklo-pi16) 354 | (defset1 set1-pi16 %set-int fixnum #4r0000 unpacklo-pi16) 355 | (defset1 set1-pi32 %set-int (signed-byte 32) #4r0000) 356 | (defset1 set1-pu32 %set-uint (unsigned-byte 32) #4r0000) 357 | (defset1 set1-pi64 %set-int (signed-byte 64) #4r1010) 358 | (defset1 set1-pu64 %set-uint (unsigned-byte 64) #4r1010)) 359 | 360 | (macrolet ((defset (name rname setter type depth) 361 | (let* ((names (loop for i from 0 below (ash 1 depth) 362 | collect (symbolicate (format nil "X~A" i)))) 363 | (funcs #(unpacklo-pi64 unpacklo-pi32 unpacklo-pi16 unpacklo-pi8)) 364 | (body (loop for i downfrom depth to 0 365 | for bv = (mapcar (lambda (x) `(,setter (the ,type ,x))) names) 366 | then (loop for (a b) on bv by #'cddr 367 | collect `(,(svref funcs i) ,a ,b)) 368 | finally (return (first bv))))) 369 | `(progn 370 | (export ',name) 371 | (export ',rname) 372 | (declaim (inline ,name ,rname)) 373 | (defun ,name (,@(reverse names)) ,body) 374 | (defun ,rname (,@names) ,body))))) 375 | (defset set-pi8 setr-pi8 %set-int fixnum 4) 376 | (defset set-pi16 setr-pi16 %set-int fixnum 3) 377 | (defset set-pi32 setr-pi32 %set-int (signed-byte 32) 2) 378 | (defset set-pu32 setr-pu32 %set-uint (unsigned-byte 32) 2) 379 | (defset set-pi64 setr-pi64 %set-int (signed-byte 64) 1) 380 | (defset set-pu64 setr-pu64 %set-uint (unsigned-byte 64) 1)) 381 | 382 | (declaim (inline setzero-pi)) 383 | (defun setzero-pi () 0-pi) 384 | 385 | ;; Masked move 386 | 387 | (export 'maskmoveu-pi) 388 | 389 | (declaim (inline maskmoveu-pi)) 390 | 391 | (defun maskmoveu-pi (value mask pointer &optional (offset 0)) 392 | (declare (type sse-pack value mask) 393 | (type system-area-pointer pointer) 394 | (type fixnum offset)) 395 | (%maskmoveu-pi value mask pointer offset)) 396 | 397 | ;; Arithmetic negation (subtract from 0) 398 | 399 | (macrolet ((frob (name subf) 400 | `(def-utility ,name (arg) int-sse-pack (,subf 0-pi arg)))) 401 | (frob neg-pi8 sub-pi8) 402 | (frob neg-pi16 sub-pi16) 403 | (frob neg-pi32 sub-pi32) 404 | (frob neg-pi64 sub-pi64)) 405 | 406 | ;; Bitwise operations 407 | 408 | (def-if-function if-pi int-sse-pack #:pi) 409 | 410 | ;; Comparisons 411 | 412 | (def-utility <-pi8 (x y) int-sse-pack (>-pi8 y x)) 413 | (def-utility <-pi16 (x y) int-sse-pack (>-pi16 y x)) 414 | (def-utility <-pi32 (x y) int-sse-pack (>-pi32 y x)) 415 | 416 | (def-utility <=-pi8 (x y) int-sse-pack (not-pi (>-pi8 x y))) 417 | (def-utility <=-pi16 (x y) int-sse-pack (not-pi (>-pi16 x y))) 418 | (def-utility <=-pi32 (x y) int-sse-pack (not-pi (>-pi32 x y))) 419 | 420 | (def-utility >=-pi8 (x y) int-sse-pack (not-pi (>-pi8 y x))) 421 | (def-utility >=-pi16 (x y) int-sse-pack (not-pi (>-pi16 y x))) 422 | (def-utility >=-pi32 (x y) int-sse-pack (not-pi (>-pi32 y x))) 423 | 424 | (def-utility /=-pi8 (x y) int-sse-pack (not-pi (=-pi8 x y))) 425 | (def-utility /=-pi16 (x y) int-sse-pack (not-pi (=-pi16 x y))) 426 | (def-utility /=-pi32 (x y) int-sse-pack (not-pi (=-pi32 x y))) 427 | 428 | ;; Shifts 429 | 430 | (defun slli-pi (x imm) 431 | (declare (type sse-pack x)) 432 | (if (> imm 15) 433 | 0-pi 434 | (%int-to-sse-pack int-sse-pack (ash (%sse-pack-to-int x) (* 8 imm))))) 435 | 436 | (defun srli-pi (x imm) 437 | (declare (type sse-pack x)) 438 | (if (> imm 15) 439 | 0-pi 440 | (%int-to-sse-pack int-sse-pack (ash (%sse-pack-to-int x) (* -8 imm))))) 441 | 442 | ;; Extract & insert 443 | 444 | (defun extract-pi16 (x imm) 445 | (declare (type sse-pack x)) 446 | (logand #xFFFF 447 | (ash (%sse-pack-to-int x) 448 | (- (* 16 (logand imm 7)))))) 449 | 450 | (defun insert-pi16 (x intv imm) 451 | (declare (type sse-pack x)) 452 | (let ((shift (* 16 (logand imm 7)))) 453 | (%int-to-sse-pack int-sse-pack 454 | (logior (logand (%sse-pack-to-int x) 455 | (lognot (ash #xFFFF shift))) 456 | (ash (logand intv #xFFFF) shift))))) 457 | 458 | ;; Shuffle 459 | 460 | (defun shuffle-pi32 (x imm) 461 | (declare (type sse-pack x)) 462 | (let* ((xval (%sse-pack-to-int x))) 463 | (%int-to-sse-pack int-sse-pack (%shuffle-subints xval xval imm 32)))) 464 | 465 | (defun shufflelo-pi16 (x imm) 466 | (declare (type sse-pack x)) 467 | (let* ((xval (%simd-pack-low x))) 468 | (truly-the int-sse-pack (%mkpack int-sse-pack 469 | (%shuffle-subints xval xval imm 16) 470 | (%simd-pack-high x))))) 471 | 472 | (defun shufflehi-pi16 (x imm) 473 | (declare (type sse-pack x)) 474 | (let* ((xval (%simd-pack-high x))) 475 | (truly-the int-sse-pack (%mkpack int-sse-pack 476 | (%simd-pack-low x) 477 | (%shuffle-subints xval xval imm 16))))) 478 | 479 | -------------------------------------------------------------------------------- /sse-array-defs.lisp: -------------------------------------------------------------------------------- 1 | ;;; -*- mode: Lisp; indent-tabs-mode: nil; -*- 2 | ;;; 3 | ;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com) 4 | ;;; 5 | ;;; This file contains definitions for vectorized access 6 | ;;; to specialized lisp arrays. 7 | ;;; 8 | 9 | (in-package #:SSE) 10 | 11 | ;;; Prefetch: AREF-PREFETCH-*, ROW-MAJOR-AREF-PREFETCH-* 12 | 13 | (def-aref-intrinsic #:PREFETCH-T0 nil cpu-prefetch-t0 nil :ref-size 0 :side-effect? t) 14 | (def-aref-intrinsic #:PREFETCH-T1 nil cpu-prefetch-t1 nil :ref-size 0 :side-effect? t) 15 | (def-aref-intrinsic #:PREFETCH-T2 nil cpu-prefetch-t2 nil :ref-size 0 :side-effect? t) 16 | (def-aref-intrinsic #:PREFETCH-NTA nil cpu-prefetch-nta nil :ref-size 0 :side-effect? t) 17 | 18 | (def-aref-intrinsic #:CLFLUSH nil cpu-clflush nil :ref-size 1 :side-effect? t) 19 | 20 | ;;; Single-float 21 | 22 | ;; AREF-SS, ROW-MAJOR-AREF-SS 23 | 24 | (def-aref-intrinsic #:SS float-sse-pack mem-ref-ss mem-set-ss :ref-size 4) 25 | 26 | ;; AREF-PS, ROW-MAJOR-AREF-PS 27 | 28 | (def-aref-intrinsic #:PS float-sse-pack mem-ref-ps mem-set-ps) 29 | 30 | ;; AREF-APS, ROW-MAJOR-AREF-APS (requires alignment) 31 | 32 | (def-aref-intrinsic #:APS float-sse-pack mem-ref-aps mem-set-aps) 33 | 34 | ;; AREF-SPS, ROW-MAJOR-AREF-SPS (requires alignment; no write cache) 35 | 36 | (def-aref-intrinsic #:SPS float-sse-pack mem-ref-aps stream-ps) 37 | 38 | ;;; Double-float 39 | 40 | ;; AREF-SD, ROW-MAJOR-AREF-SD 41 | 42 | (def-aref-intrinsic #:SD double-sse-pack mem-ref-sd mem-set-sd :ref-size 8) 43 | 44 | ;; AREF-PD, ROW-MAJOR-AREF-PD 45 | 46 | (def-aref-intrinsic #:PD double-sse-pack mem-ref-pd mem-set-pd) 47 | 48 | ;; AREF-APD, ROW-MAJOR-AREF-APD (requires alignment) 49 | 50 | (def-aref-intrinsic #:APD double-sse-pack mem-ref-apd mem-set-apd) 51 | 52 | ;; AREF-SPD, ROW-MAJOR-AREF-SPD (requires alignment; no write cache) 53 | 54 | (def-aref-intrinsic #:SPD double-sse-pack mem-ref-apd stream-pd) 55 | 56 | ;;; Integer 57 | 58 | ;; AREF-SI64, ROW-MAJOR-AREF-SI64 59 | 60 | (def-aref-intrinsic #:SI64 int-sse-pack mem-ref-si64 mem-set-si64 :ref-size 8) 61 | 62 | ;; AREF-PI, ROW-MAJOR-AREF-PI 63 | 64 | (def-aref-intrinsic #:PI int-sse-pack mem-ref-pi mem-set-pi) 65 | 66 | ;; AREF-API, ROW-MAJOR-AREF-API (requires alignment) 67 | 68 | (def-aref-intrinsic #:API int-sse-pack mem-ref-api mem-set-api) 69 | 70 | ;; AREF-SPI, ROW-MAJOR-AREF-SPI (requires alignment; no write cache) 71 | 72 | (def-aref-intrinsic #:SPI int-sse-pack mem-ref-api stream-pi) 73 | 74 | -------------------------------------------------------------------------------- /sse-intrinsics.lisp: -------------------------------------------------------------------------------- 1 | ;;; -*- mode: Lisp; indent-tabs-mode: nil; -*- 2 | ;;; 3 | ;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com) 4 | ;;; 5 | ;;; This file contains definitions for all SSE intrinsics. 6 | ;;; 7 | ;;; The macros are defined in the *-core.lisp files. 8 | ;;; On SBCL wrapping functions are defined by sbcl-functions.lisp. 9 | ;;; 10 | 11 | (in-package #:SSE) 12 | 13 | #+(and ecl (or ecl_min stage1 cross)) 14 | (eval-when (:compile-toplevel) 15 | ;; During the initial bootstrap sequence when the contribs are 16 | ;; compiled, the system does not load fasls after building them. 17 | ;; (For all it knows, it might be cross-compiling to another architecture.) 18 | ;; Work around by loading the macro definition file into the interpreter: 19 | (load (merge-pathnames #P"ecl-sse-core.lisp" *compile-file-truename*))) 20 | 21 | #|---------------------------------| 22 | | MISC INTRINSICS DEFINITIONS | 23 | |---------------------------------|# 24 | 25 | ;;; Prefetch 26 | 27 | (def-load-intrinsic cpu-prefetch-t0 nil prefetch "_mm_prefetch" :tags (:t0) :size :byte :postfix-fmt ",_MM_HINT_T0" :side-effect? t) 28 | (def-load-intrinsic cpu-prefetch-t1 nil prefetch "_mm_prefetch" :tags (:t1) :size :byte :postfix-fmt ",_MM_HINT_T1" :side-effect? t) 29 | (def-load-intrinsic cpu-prefetch-t2 nil prefetch "_mm_prefetch" :tags (:t2) :size :byte :postfix-fmt ",_MM_HINT_T2" :side-effect? t) 30 | (def-load-intrinsic cpu-prefetch-nta nil prefetch "_mm_prefetch" :tags (:nta) :size :byte :postfix-fmt ",_MM_HINT_NTA" :side-effect? t) 31 | 32 | (def-load-intrinsic cpu-clflush nil clflush "_mm_clflush" :size :byte :side-effect? t) 33 | 34 | ;;; CPU control 35 | 36 | #+sbcl 37 | (progn 38 | (defknown cpu-mxcsr () (unsigned-byte 32) (flushable)) 39 | 40 | (define-vop (cpu-mxcsr) 41 | (:translate cpu-mxcsr) 42 | (:args) (:arg-types) 43 | (:results (result :scs (unsigned-reg))) 44 | (:result-types unsigned-num) 45 | (:temporary (:sc unsigned-stack) tmp) 46 | (:policy :fast-safe) 47 | (:generator 3 48 | (let ((ea (make-ea :dword :base rbp-tn 49 | :disp (frame-byte-offset (tn-offset tmp))))) 50 | (inst stmxcsr ea) 51 | (inst mov (reg-in-size result :dword) ea)))) 52 | 53 | (defknown %set-cpu-mxcsr ((unsigned-byte 32)) (unsigned-byte 32) ()) 54 | 55 | (define-vop (%set-cpu-mxcsr) 56 | (:translate %set-cpu-mxcsr) 57 | (:args (value :scs (unsigned-reg unsigned-stack) :target result)) 58 | (:arg-types unsigned-num) 59 | (:results (result :scs (unsigned-reg) 60 | :load-if (not (and (sc-is result unsigned-stack) 61 | (or (sc-is value unsigned-reg) 62 | (location= value result)))))) 63 | (:result-types unsigned-num) 64 | (:temporary (:sc unsigned-stack) tmp) 65 | (:policy :fast-safe) 66 | (:generator 3 67 | (cond ((sc-is value unsigned-stack) 68 | (setf tmp value)) 69 | ((sc-is result unsigned-stack) 70 | (setf tmp result))) 71 | (move tmp value) 72 | (unless (location= result tmp) 73 | (move result value)) 74 | (let ((ea (make-ea :dword :base rbp-tn 75 | :disp (frame-byte-offset (tn-offset tmp))))) 76 | (inst ldmxcsr ea)))) 77 | 78 | (macrolet ((defvoid (name insn) 79 | `(progn 80 | (export ',name) 81 | (defknown ,name () (values) ()) 82 | (define-vop (,name) 83 | (:translate ,name) 84 | (:policy :fast-safe) 85 | (:generator 1 86 | (inst ,insn)))))) 87 | (defvoid cpu-load-fence lfence) 88 | (defvoid cpu-store-fence sfence) 89 | (defvoid cpu-memory-fence mfence) 90 | (defvoid cpu-pause pause))) 91 | 92 | #+ecl 93 | (progn 94 | (def-intrinsic cpu-mxcsr () fixnum "_mm_getcsr") 95 | (def-intrinsic %set-cpu-mxcsr (fixnum) fixnum "_mm_setcsr" :export nil :ret-arg 0) 96 | 97 | (defsetf cpu-mxcsr %set-cpu-mxcsr) 98 | 99 | (def-intrinsic cpu-load-fence () nil "_mm_lfence") 100 | (def-intrinsic cpu-store-fence () nil "_mm_sfence") 101 | (def-intrinsic cpu-memory-fence () nil "_mm_mfence") 102 | 103 | (def-intrinsic cpu-pause () nil "_mm_pause")) 104 | 105 | #|-----------------------------------------| 106 | | SINGLE-FLOAT INTRINSICS DEFINITIONS | 107 | |-----------------------------------------|# 108 | 109 | ;; Initialization 110 | 111 | #+sbcl 112 | (def-float-set-intrinsic set-ss %set-ss single-float single-reg float-sse-pack movaps) 113 | 114 | #+ecl 115 | (progn 116 | (def-intrinsic set-ss (single-float) float-sse-pack "_mm_set_ss") 117 | (def-intrinsic set1-ps (single-float) float-sse-pack "_mm_set1_ps") 118 | 119 | (def-intrinsic set-ps (single-float single-float single-float single-float) float-sse-pack "_mm_set_ps") 120 | (def-intrinsic setr-ps (single-float single-float single-float single-float) float-sse-pack "_mm_setr_ps") 121 | 122 | (def-intrinsic setzero-ps () float-sse-pack "_mm_setzero_ps")) 123 | 124 | ;; Memory 125 | 126 | (def-load-intrinsic mem-ref-ss float-sse-pack movss "_mm_load_ss") 127 | 128 | (def-load-intrinsic mem-ref-ps float-sse-pack movups "_mm_loadu_ps") 129 | (def-load-intrinsic mem-ref-aps float-sse-pack movaps "_mm_load_ps") 130 | 131 | (def-store-intrinsic mem-set-ss float-sse-pack movss "_mm_store_ss" :setf-name mem-ref-ss) 132 | 133 | (def-store-intrinsic mem-set-ps float-sse-pack movups "_mm_storeu_ps" :setf-name mem-ref-ps) 134 | (def-store-intrinsic mem-set-aps float-sse-pack movaps "_mm_store_ps" :setf-name mem-ref-aps) 135 | 136 | (def-store-intrinsic stream-ps float-sse-pack movntps "_mm_stream_ps") 137 | 138 | ;; Arithmetics 139 | 140 | (def-binary-intrinsic add-ss float-sse-pack addss 3 "_mm_add_ss") 141 | (def-binary-intrinsic add-ps float-sse-pack addps 3 "_mm_add_ps" :commutative t) 142 | (def-binary-intrinsic sub-ss float-sse-pack subss 3 "_mm_sub_ss") 143 | (def-binary-intrinsic sub-ps float-sse-pack subps 3 "_mm_sub_ps") 144 | (def-binary-intrinsic mul-ss float-sse-pack mulss 5 "_mm_mul_ss") 145 | (def-binary-intrinsic mul-ps float-sse-pack mulps 5 "_mm_mul_ps" :commutative t) 146 | (def-binary-intrinsic div-ss float-sse-pack divss 13 "_mm_div_ss") 147 | (def-binary-intrinsic div-ps float-sse-pack divps 13 "_mm_div_ps") 148 | (def-binary-intrinsic min-ss float-sse-pack minss 3 "_mm_min_ss") 149 | (def-binary-intrinsic min-ps float-sse-pack minps 3 "_mm_min_ps":commutative t) 150 | (def-binary-intrinsic max-ss float-sse-pack maxss 3 "_mm_max_ss") 151 | (def-binary-intrinsic max-ps float-sse-pack maxps 3 "_mm_max_ps" :commutative t) 152 | 153 | (def-unary-intrinsic sqrt-ss float-sse-pack sqrtss 20 "_mm_sqrt_ss" :partial t) 154 | (def-unary-intrinsic sqrt-ps float-sse-pack sqrtps 20 "_mm_sqrt_ps") 155 | (def-unary-intrinsic rsqrt-ss float-sse-pack rsqrtss 20 "_mm_rsqrt_ss" :partial t) 156 | (def-unary-intrinsic rsqrt-ps float-sse-pack rsqrtps 20 "_mm_rsqrt_ps") 157 | (def-unary-intrinsic rcp-ss float-sse-pack rcpss 13 "_mm_rcp_ss" :partial t) 158 | (def-unary-intrinsic rcp-ps float-sse-pack rcpps 13 "_mm_rcp_ps") 159 | 160 | ;; Bitwise logic 161 | 162 | #+sbcl 163 | (def-not-intrinsic not-ps float-sse-pack xorps) 164 | 165 | (def-binary-intrinsic and-ps float-sse-pack andps 1 "_mm_and_ps" :commutative t) 166 | (def-binary-intrinsic andnot-ps float-sse-pack andnps 1 "_mm_andnot_ps") 167 | (def-binary-intrinsic or-ps float-sse-pack orps 1 "_mm_or_ps" :commutative t) 168 | (def-binary-intrinsic xor-ps float-sse-pack xorps 1 "_mm_xor_ps" :commutative t) 169 | 170 | ;; Comparisons 171 | 172 | (def-binary-intrinsic =-ss float-sse-pack cmpss 3 "_mm_cmpeq_ss" :tags (:eq)) 173 | (def-binary-intrinsic =-ps float-sse-pack cmpps 3 "_mm_cmpeq_ps" :tags (:eq) :commutative t) 174 | (def-binary-intrinsic <-ss float-sse-pack cmpss 3 "_mm_cmplt_ss" :tags (:lt)) 175 | (def-binary-intrinsic <-ps float-sse-pack cmpps 3 "_mm_cmplt_ps" :tags (:lt)) 176 | (def-binary-intrinsic <=-ss float-sse-pack cmpss 3 "_mm_cmple_ss" :tags (:le)) 177 | (def-binary-intrinsic <=-ps float-sse-pack cmpps 3 "_mm_cmple_ps" :tags (:le)) 178 | #+ecl 179 | (def-binary-intrinsic >-ss float-sse-pack nil nil "_mm_cmpgt_ss") 180 | #+ecl 181 | (def-binary-intrinsic >-ps float-sse-pack nil nil "_mm_cmpgt_ps") 182 | #+ecl 183 | (def-binary-intrinsic >=-ss float-sse-pack nil nil "_mm_cmpge_ss") 184 | #+ecl 185 | (def-binary-intrinsic >=-ps float-sse-pack nil nil "_mm_cmpge_ps") 186 | 187 | (def-binary-intrinsic /=-ss float-sse-pack cmpss 3 "_mm_cmpneq_ss" :tags (:neq)) 188 | (def-binary-intrinsic /=-ps float-sse-pack cmpps 3 "_mm_cmpneq_ps" :tags (:neq) :commutative t) 189 | (def-binary-intrinsic /<-ss float-sse-pack cmpss 3 "_mm_cmpnlt_ss" :tags (:nlt)) 190 | (def-binary-intrinsic /<-ps float-sse-pack cmpps 3 "_mm_cmpnlt_ps" :tags (:nlt)) 191 | (def-binary-intrinsic /<=-ss float-sse-pack cmpss 3 "_mm_cmpnle_ss" :tags (:nle)) 192 | (def-binary-intrinsic /<=-ps float-sse-pack cmpps 3 "_mm_cmpnle_ps" :tags (:nle)) 193 | #+ecl 194 | (def-binary-intrinsic />-ss float-sse-pack nil nil "_mm_cmpngt_ss") 195 | #+ecl 196 | (def-binary-intrinsic />-ps float-sse-pack nil nil "_mm_cmpngt_ps") 197 | #+ecl 198 | (def-binary-intrinsic />=-ss float-sse-pack nil nil "_mm_cmpnge_ss") 199 | #+ecl 200 | (def-binary-intrinsic />=-ps float-sse-pack nil nil "_mm_cmpnge_ps") 201 | 202 | (def-binary-intrinsic cmpord-ss float-sse-pack cmpss 3 "_mm_cmpord_ss" :tags (:ord)) ; neither is NaN 203 | (def-binary-intrinsic cmpord-ps float-sse-pack cmpps 3 "_mm_cmpord_ps" :tags (:ord) :commutative t) 204 | (def-binary-intrinsic cmpunord-ss float-sse-pack cmpss 3 "_mm_cmpunord_ss" :tags (:unord)) 205 | (def-binary-intrinsic cmpunord-ps float-sse-pack cmpps 3 "_mm_cmpunord_ps" :tags (:unord) :commutative t) 206 | 207 | (def-comparison-intrinsic =-ss? float-sse-pack comiss 3 "_mm_comieq_ss" :commutative t :tags (:e)) 208 | (def-comparison-intrinsic =-ssu? float-sse-pack ucomiss 3 "_mm_ucomieq_ss" :commutative t :tags (:e)) 209 | (def-comparison-intrinsic <-ss? float-sse-pack comiss 3 "_mm_comilt_ss" :tags (:b)) 210 | (def-comparison-intrinsic <-ssu? float-sse-pack ucomiss 3 "_mm_ucomilt_ss" :tags (:b)) 211 | (def-comparison-intrinsic <=-ss? float-sse-pack comiss 3 "_mm_comile_ss" :tags (:be)) 212 | (def-comparison-intrinsic <=-ssu? float-sse-pack ucomiss 3 "_mm_ucomile_ss" :tags (:be)) 213 | (def-comparison-intrinsic >-ss? float-sse-pack comiss 3 "_mm_comigt_ss" :tags (:a)) 214 | (def-comparison-intrinsic >-ssu? float-sse-pack ucomiss 3 "_mm_ucomigt_ss" :tags (:a)) 215 | (def-comparison-intrinsic >=-ss? float-sse-pack comiss 3 "_mm_comige_ss" :tags (:ae)) 216 | (def-comparison-intrinsic >=-ssu? float-sse-pack ucomiss 3 "_mm_ucomige_ss" :tags (:ae)) 217 | (def-comparison-intrinsic /=-ss? float-sse-pack comiss 3 "_mm_comineq_ss" :commutative t :tags (:ne)) 218 | (def-comparison-intrinsic /=-ssu? float-sse-pack ucomiss 3 "_mm_ucomineq_ss" :commutative t :tags (:ne)) 219 | 220 | ;; Misc 221 | 222 | (def-binary-intrinsic unpackhi-ps float-sse-pack unpckhps 1 "_mm_unpackhi_ps") 223 | (def-binary-intrinsic unpacklo-ps float-sse-pack unpcklps 1 "_mm_unpacklo_ps") 224 | 225 | (def-binary-intrinsic move-ss float-sse-pack movss 1 "_mm_move_ss") 226 | 227 | (def-binary-intrinsic movehl-ps float-sse-pack movhlps 1 "_mm_movehl_ps") 228 | (def-binary-intrinsic movelh-ps float-sse-pack movlhps 1 "_mm_movelh_ps") 229 | 230 | (def-unary-intrinsic movemask-ps (unsigned-byte 4) movmskps 1 "_mm_movemask_ps" :arg-type float-sse-pack) 231 | 232 | ;; Shuffle 233 | 234 | (def-binary-intrinsic shuffle-ps float-sse-pack shufps 1 "_mm_shuffle_ps" :immediate-arg (unsigned-byte 8)) 235 | 236 | ;; Conversion 237 | 238 | (def-unary-intrinsic convert-pi32-to-ps float-sse-pack cvtdq2ps 3 "_mm_cvtepi32_ps" :arg-type int-sse-pack) 239 | (def-unary-intrinsic convert-ps-to-pi32 int-sse-pack cvtps2dq 3 "_mm_cvtps_epi32" :arg-type float-sse-pack) 240 | (def-unary-intrinsic truncate-ps-to-pi32 int-sse-pack cvttps2dq 3 "_mm_cvttps_epi32" :arg-type float-sse-pack) 241 | 242 | (def-sse-int-intrinsic convert-si32-to-ss (signed-byte 32) float-sse-pack cvtsi2ss 3 "_mm_cvtsi32_ss") 243 | (def-cvt-to-int32-intrinsic convert-ss-to-si32 (signed-byte 32) cvtss2si 3 "_mm_cvtss_si32" :arg-type float-sse-pack) 244 | (def-cvt-to-int32-intrinsic truncate-ss-to-si32 (signed-byte 32) cvttss2si 3 "_mm_cvttss_si32" :arg-type float-sse-pack) 245 | 246 | #+(or x86_64 x86-64) 247 | (def-sse-int-intrinsic convert-si64-to-ss (signed-byte 64) float-sse-pack cvtsi2ss 3 248 | #-msvc "_mm_cvtsi64_ss" #+msvc "_mm_cvtsi64x_ss") 249 | #+(or x86_64 x86-64) 250 | (def-unary-intrinsic convert-ss-to-si64 (signed-byte 64) cvtss2si 3 251 | #-msvc "_mm_cvtss_si64" #+msvc "_mm_cvtss_si64x" :arg-type float-sse-pack) 252 | #+(or x86_64 x86-64) 253 | (def-unary-intrinsic truncate-ss-to-si64 (signed-byte 64) cvttss2si 3 254 | #-msvc "_mm_cvttss_si64" #+msvc "_mm_cvttss_si64x" :arg-type float-sse-pack) 255 | 256 | #|-----------------------------------------| 257 | | DOUBLE-FLOAT INTRINSICS DEFINITIONS | 258 | |-----------------------------------------|# 259 | 260 | ;; Initialization 261 | 262 | #+sbcl 263 | (def-float-set-intrinsic set-sd %set-sd double-float double-reg double-sse-pack movapd) 264 | 265 | #+ecl 266 | (progn 267 | (def-intrinsic set-sd (double-float) double-sse-pack "_mm_set_sd") 268 | (def-intrinsic set1-pd (double-float) double-sse-pack "_mm_set1_pd") 269 | 270 | (def-intrinsic set-pd (double-float double-float) double-sse-pack "_mm_set_pd") 271 | (def-intrinsic setr-pd (double-float double-float) double-sse-pack "_mm_setr_pd") 272 | 273 | (def-intrinsic setzero-pd () double-sse-pack "_mm_setzero_pd")) 274 | 275 | ;; Memory 276 | 277 | (def-load-intrinsic mem-ref-sd double-sse-pack movsd "_mm_load_sd") 278 | 279 | (def-load-intrinsic mem-ref-pd double-sse-pack movupd "_mm_loadu_pd") 280 | (def-load-intrinsic mem-ref-apd double-sse-pack movapd "_mm_load_pd") 281 | 282 | (def-load-intrinsic loadh-pd double-sse-pack movhpd "_mm_loadh_pd" :register-arg t) 283 | (def-load-intrinsic loadl-pd double-sse-pack movlpd "_mm_loadl_pd" :register-arg t) 284 | 285 | (def-store-intrinsic mem-set-sd double-sse-pack movsd "_mm_store_sd" :setf-name mem-ref-sd) 286 | 287 | (def-store-intrinsic mem-set-pd double-sse-pack movupd "_mm_storeu_pd" :setf-name mem-ref-pd) 288 | (def-store-intrinsic mem-set-apd double-sse-pack movapd "_mm_store_pd" :setf-name mem-ref-apd) 289 | 290 | (def-store-intrinsic storeh-pd double-sse-pack movhpd "_mm_storeh_pd") 291 | (def-store-intrinsic storel-pd double-sse-pack movlpd "_mm_storel_pd") 292 | 293 | (def-store-intrinsic stream-pd double-sse-pack movntpd "_mm_stream_pd") 294 | 295 | ;; Arithmetics 296 | 297 | (def-binary-intrinsic add-sd double-sse-pack addsd 3 "_mm_add_sd") 298 | (def-binary-intrinsic add-pd double-sse-pack addpd 3 "_mm_add_pd" :commutative t) 299 | (def-binary-intrinsic sub-sd double-sse-pack subsd 3 "_mm_sub_sd") 300 | (def-binary-intrinsic sub-pd double-sse-pack subpd 3 "_mm_sub_pd") 301 | (def-binary-intrinsic mul-sd double-sse-pack mulsd 5 "_mm_mul_sd") 302 | (def-binary-intrinsic mul-pd double-sse-pack mulpd 5 "_mm_mul_pd" :commutative t) 303 | (def-binary-intrinsic div-sd double-sse-pack divsd 13 "_mm_div_sd") 304 | (def-binary-intrinsic div-pd double-sse-pack divpd 13 "_mm_div_pd") 305 | (def-binary-intrinsic min-sd double-sse-pack minsd 3 "_mm_min_sd") 306 | (def-binary-intrinsic min-pd double-sse-pack minpd 3 "_mm_min_pd" :commutative t) 307 | (def-binary-intrinsic max-sd double-sse-pack maxsd 3 "_mm_max_sd") 308 | (def-binary-intrinsic max-pd double-sse-pack maxpd 3 "_mm_max_pd" :commutative t) 309 | 310 | (def-binary-intrinsic sqrt-sd double-sse-pack sqrtsd 20 "_mm_sqrt_sd") 311 | (def-unary-intrinsic sqrt-pd double-sse-pack sqrtpd 20 "_mm_sqrt_pd") 312 | 313 | ;; Bitwise logic 314 | 315 | #+sbcl 316 | (def-not-intrinsic not-pd double-sse-pack xorpd) 317 | 318 | (def-binary-intrinsic and-pd double-sse-pack andpd 1 "_mm_and_pd" :commutative t) 319 | (def-binary-intrinsic andnot-pd double-sse-pack andnpd 1 "_mm_andnot_pd") 320 | (def-binary-intrinsic or-pd double-sse-pack orpd 1 "_mm_or_pd" :commutative t) 321 | (def-binary-intrinsic xor-pd double-sse-pack xorpd 1 "_mm_xor_pd" :commutative t) 322 | 323 | ;; Comparisons 324 | 325 | (def-binary-intrinsic =-sd double-sse-pack cmpsd 3 "_mm_cmpeq_sd" :tags (:eq)) 326 | (def-binary-intrinsic =-pd double-sse-pack cmppd 3 "_mm_cmpeq_pd" :tags (:eq) :commutative t) 327 | (def-binary-intrinsic <-sd double-sse-pack cmpsd 3 "_mm_cmplt_sd" :tags (:lt)) 328 | (def-binary-intrinsic <-pd double-sse-pack cmppd 3 "_mm_cmplt_pd" :tags (:lt)) 329 | (def-binary-intrinsic <=-sd double-sse-pack cmpsd 3 "_mm_cmple_sd" :tags (:le)) 330 | (def-binary-intrinsic <=-pd double-sse-pack cmppd 3 "_mm_cmple_pd" :tags (:le)) 331 | #+ecl 332 | (def-binary-intrinsic >-sd double-sse-pack nil nil "_mm_cmpgt_sd") 333 | #+ecl 334 | (def-binary-intrinsic >-pd double-sse-pack nil nil "_mm_cmpgt_pd") 335 | #+ecl 336 | (def-binary-intrinsic >=-sd double-sse-pack nil nil "_mm_cmpge_sd") 337 | #+ecl 338 | (def-binary-intrinsic >=-pd double-sse-pack nil nil "_mm_cmpge_pd") 339 | 340 | (def-binary-intrinsic /=-sd double-sse-pack cmpsd 3 "_mm_cmpneq_sd" :tags (:neq)) 341 | (def-binary-intrinsic /=-pd double-sse-pack cmppd 3 "_mm_cmpneq_pd" :tags (:neq) :commutative t) 342 | (def-binary-intrinsic /<-sd double-sse-pack cmpsd 3 "_mm_cmpnlt_sd" :tags (:nlt)) 343 | (def-binary-intrinsic /<-pd double-sse-pack cmppd 3 "_mm_cmpnlt_pd" :tags (:nlt)) 344 | (def-binary-intrinsic /<=-sd double-sse-pack cmpsd 3 "_mm_cmpnle_sd" :tags (:nle)) 345 | (def-binary-intrinsic /<=-pd double-sse-pack cmppd 3 "_mm_cmpnle_pd" :tags (:nle)) 346 | #+ecl 347 | (def-binary-intrinsic />-sd double-sse-pack nil nil "_mm_cmpngt_sd") 348 | #+ecl 349 | (def-binary-intrinsic />-pd double-sse-pack nil nil "_mm_cmpngt_pd") 350 | #+ecl 351 | (def-binary-intrinsic />=-sd double-sse-pack nil nil "_mm_cmpnge_sd") 352 | #+ecl 353 | (def-binary-intrinsic />=-pd double-sse-pack nil nil "_mm_cmpnge_pd") 354 | 355 | (def-binary-intrinsic cmpord-sd double-sse-pack cmpsd 3 "_mm_cmpord_sd" :tags (:ord)) ; neither is NaN 356 | (def-binary-intrinsic cmpord-pd double-sse-pack cmppd 3 "_mm_cmpord_pd" :tags (:ord) :commutative t) 357 | (def-binary-intrinsic cmpunord-sd double-sse-pack cmpsd 3 "_mm_cmpunord_sd" :tags (:unord)) 358 | (def-binary-intrinsic cmpunord-pd double-sse-pack cmppd 3 "_mm_cmpunord_pd" :tags (:unord) :commutative t) 359 | 360 | (def-comparison-intrinsic =-sd? double-sse-pack comisd 3 "_mm_comieq_sd" :commutative t :tags (:e)) 361 | (def-comparison-intrinsic =-sdu? double-sse-pack ucomisd 3 "_mm_ucomieq_sd" :commutative t :tags (:e)) 362 | (def-comparison-intrinsic <-sd? double-sse-pack comisd 3 "_mm_comilt_sd" :tags (:b)) 363 | (def-comparison-intrinsic <-sdu? double-sse-pack ucomisd 3 "_mm_ucomilt_sd" :tags (:b)) 364 | (def-comparison-intrinsic <=-sd? double-sse-pack comisd 3 "_mm_comile_sd" :tags (:be)) 365 | (def-comparison-intrinsic <=-sdu? double-sse-pack ucomisd 3 "_mm_ucomile_sd" :tags (:be)) 366 | (def-comparison-intrinsic >-sd? double-sse-pack comisd 3 "_mm_comigt_sd" :tags (:a)) 367 | (def-comparison-intrinsic >-sdu? double-sse-pack ucomisd 3 "_mm_ucomigt_sd" :tags (:a)) 368 | (def-comparison-intrinsic >=-sd? double-sse-pack comisd 3 "_mm_comige_sd" :tags (:ae)) 369 | (def-comparison-intrinsic >=-sdu? double-sse-pack ucomisd 3 "_mm_ucomige_sd" :tags (:ae)) 370 | (def-comparison-intrinsic /=-sd? double-sse-pack comisd 3 "_mm_comineq_sd" :commutative t :tags (:ne)) 371 | (def-comparison-intrinsic /=-sdu? double-sse-pack ucomisd 3 "_mm_ucomineq_sd" :commutative t :tags (:ne)) 372 | 373 | ;; Misc 374 | 375 | (def-binary-intrinsic unpackhi-pd double-sse-pack unpckhpd 1 "_mm_unpackhi_pd") 376 | (def-binary-intrinsic unpacklo-pd double-sse-pack unpcklpd 1 "_mm_unpacklo_pd") 377 | 378 | (def-binary-intrinsic move-sd double-sse-pack movsd 1 "_mm_move_sd") 379 | 380 | (def-unary-intrinsic movemask-pd (unsigned-byte 2) movmskpd 1 "_mm_movemask_pd" :arg-type double-sse-pack) 381 | 382 | ;; Shuffle 383 | 384 | (def-binary-intrinsic shuffle-pd double-sse-pack shufpd 1 "_mm_shuffle_pd" :immediate-arg (unsigned-byte 2)) 385 | 386 | ;; Conversion 387 | 388 | (def-unary-intrinsic convert-ps-to-pd double-sse-pack cvtps2pd 3 "_mm_cvtps_pd" :arg-type float-sse-pack) 389 | (def-unary-intrinsic convert-pd-to-ps float-sse-pack cvtpd2ps 3 "_mm_cvtpd_ps" :arg-type double-sse-pack) 390 | 391 | (def-binary-intrinsic convert-ss-to-sd double-sse-pack cvtss2sd 3 "_mm_cvtss_sd" :y-type float-sse-pack) 392 | (def-binary-intrinsic convert-sd-to-ss float-sse-pack cvtsd2ss 3 "_mm_cvtsd_ss" :y-type double-sse-pack) 393 | 394 | (def-unary-intrinsic convert-pi32-to-pd double-sse-pack cvtdq2pd 3 "_mm_cvtepi32_pd" :arg-type int-sse-pack) 395 | (def-unary-intrinsic convert-pd-to-pi32 int-sse-pack cvtpd2dq 3 "_mm_cvtpd_epi32" :arg-type double-sse-pack) 396 | (def-unary-intrinsic truncate-pd-to-pi32 int-sse-pack cvttpd2dq 3 "_mm_cvttpd_epi32" :arg-type double-sse-pack) 397 | 398 | (def-sse-int-intrinsic convert-si32-to-sd (signed-byte 32) double-sse-pack cvtsi2ss 3 "_mm_cvtsi32_sd") 399 | (def-cvt-to-int32-intrinsic convert-sd-to-si32 (signed-byte 32) cvtsd2si 3 "_mm_cvtsd_si32" :arg-type double-sse-pack) 400 | (def-cvt-to-int32-intrinsic truncate-sd-to-si32 (signed-byte 32) cvttsd2si 3 "_mm_cvttsd_si32" :arg-type double-sse-pack) 401 | 402 | #+(or x86_64 x86-64) 403 | (def-sse-int-intrinsic convert-si64-to-sd (signed-byte 64) double-sse-pack cvtsi2ss 3 404 | #-msvc "_mm_cvtsi64_sd" #+msvc "_mm_cvtsi64x_sd") 405 | #+(or x86_64 x86-64) 406 | (def-unary-intrinsic convert-sd-to-si64 (signed-byte 64) cvtsd2si 3 407 | #-msvc "_mm_cvtsd_si64" #+msvc "_mm_cvtsd_si64x" :arg-type double-sse-pack) 408 | #+(or x86_64 x86-64) 409 | (def-unary-intrinsic truncate-sd-to-si64 (signed-byte 64) cvttsd2si 3 410 | #-msvc "_mm_cvttsd_si64" #+msvc "_mm_cvttsd_si64x" :arg-type double-sse-pack) 411 | 412 | #|-----------------------------------------| 413 | | INTEGER INTRINSICS DEFINITIONS | 414 | |-----------------------------------------|# 415 | 416 | ;; Initialization 417 | 418 | #+ecl 419 | (progn 420 | (def-intrinsic set1-pi8 (fixnum) int-sse-pack "_mm_set1_epi8") 421 | (def-intrinsic set1-pi16 (fixnum) int-sse-pack "_mm_set1_epi16") 422 | (def-intrinsic set1-pi32 (ext:integer32) int-sse-pack "_mm_set1_epi32") 423 | #+x86_64 424 | (def-intrinsic set1-pi64 (ext:integer64) int-sse-pack "_mm_set1_epi64x") 425 | 426 | (def-intrinsic set1-pu32 (ext:byte32) int-sse-pack "_mm_set1_epi32") 427 | #+x86_64 428 | (def-intrinsic set1-pu64 (ext:byte64) int-sse-pack "_mm_set1_epi64x") 429 | 430 | ;;----- 431 | (def-intrinsic set-pi8 (fixnum fixnum fixnum fixnum 432 | fixnum fixnum fixnum fixnum 433 | fixnum fixnum fixnum fixnum 434 | fixnum fixnum fixnum fixnum) int-sse-pack "_mm_set_epi8") 435 | (def-intrinsic set-pi16 (fixnum fixnum fixnum fixnum 436 | fixnum fixnum fixnum fixnum) int-sse-pack "_mm_set_epi16") 437 | (def-intrinsic set-pi32 (ext:integer32 ext:integer32 ext:integer32 ext:integer32) int-sse-pack "_mm_set_epi32") 438 | #+x86_64 439 | (def-intrinsic set-pi64 (ext:integer64 ext:integer64) int-sse-pack "_mm_set_epi64x") 440 | 441 | (def-intrinsic set-pu32 (ext:byte32 ext:byte32 ext:byte32 ext:byte32) int-sse-pack "_mm_set_epi32") 442 | #+x86_64 443 | (def-intrinsic set-pu64 (ext:byte64 ext:byte64) int-sse-pack "_mm_set_epi64x") 444 | 445 | ;;----- 446 | (def-intrinsic setr-pi8 (fixnum fixnum fixnum fixnum 447 | fixnum fixnum fixnum fixnum 448 | fixnum fixnum fixnum fixnum 449 | fixnum fixnum fixnum fixnum) int-sse-pack "_mm_setr_epi8") 450 | (def-intrinsic setr-pi16 (fixnum fixnum fixnum fixnum 451 | fixnum fixnum fixnum fixnum) int-sse-pack "_mm_setr_epi16") 452 | (def-intrinsic setr-pi32 (ext:integer32 ext:integer32 ext:integer32 ext:integer32) int-sse-pack "_mm_setr_epi32") 453 | #+x86_64 454 | (def-intrinsic setr-pi64 (ext:integer64 ext:integer64) int-sse-pack "_mm_set_epi64x" :reorder-args t) 455 | 456 | (def-intrinsic setr-pu32 (ext:byte32 ext:byte32 ext:byte32 ext:byte32) int-sse-pack "_mm_setr_epi32") 457 | #+x86_64 458 | (def-intrinsic setr-pu64 (ext:byte64 ext:byte64) int-sse-pack "_mm_set_epi64x" :reorder-args t) 459 | 460 | ;;----- 461 | (def-intrinsic setzero-pi () int-sse-pack "_mm_setzero_si128")) 462 | 463 | ;; Memory 464 | 465 | (def-load-intrinsic mem-ref-pi int-sse-pack movdqu "_mm_loadu_si128") 466 | (def-load-intrinsic mem-ref-api int-sse-pack movdqa "_mm_load_si128") 467 | 468 | (def-load-intrinsic mem-ref-si64 int-sse-pack movd "_mm_loadl_epi64") 469 | 470 | (def-store-intrinsic mem-set-pi int-sse-pack movdqu "_mm_storeu_si128" :setf-name mem-ref-pi) 471 | (def-store-intrinsic mem-set-api int-sse-pack movdqa "_mm_store_si128" :setf-name mem-ref-api) 472 | 473 | (def-store-intrinsic mem-set-si64 int-sse-pack movd "_mm_storel_epi64" :setf-name mem-ref-si64) 474 | 475 | (def-store-intrinsic stream-pi int-sse-pack movntdq "_mm_stream_si128") 476 | 477 | ;; Masked move 478 | 479 | #+ecl 480 | (def-mem-intrinsic maskmoveu-pi "char" nil "_mm_maskmoveu_si128" :prefix-args (int-sse-pack int-sse-pack)) 481 | 482 | #+sbcl 483 | (progn 484 | (defknown %maskmoveu-pi (sse-pack sse-pack system-area-pointer fixnum) (values) ()) 485 | 486 | (define-vop (%maskmoveu-pi) 487 | (:translate %maskmoveu-pi) 488 | (:args (value :scs #.+any-sse-reg+) 489 | (mask :scs #.+any-sse-reg+) 490 | (sap :scs (sap-reg) :target rdi) 491 | (offset :scs (signed-reg))) 492 | (:arg-types simd-pack simd-pack system-area-pointer signed-num) 493 | (:temporary (:sc sap-reg :offset rdi-offset :from :eval) rdi) 494 | (:policy :fast-safe) 495 | (:note "inline MASKMOVEU operation") 496 | (:generator 5 497 | (if (location= sap rdi) 498 | (inst add rdi offset) 499 | (inst lea rdi (make-ea :qword :base sap :index offset))) 500 | (inst maskmovdqu value mask))) 501 | 502 | (define-vop (%maskmoveu-pi-c) 503 | (:translate %maskmoveu-pi) 504 | (:args (value :scs #.+any-sse-reg+) 505 | (mask :scs #.+any-sse-reg+) 506 | (sap :scs (sap-reg) :target rdi)) 507 | (:arg-types simd-pack simd-pack system-area-pointer (:constant (signed-byte 32))) 508 | (:info offset) 509 | (:temporary (:sc sap-reg :offset rdi-offset :from :eval) rdi) 510 | (:policy :fast-safe) 511 | (:note "inline MASKMOVEU operation") 512 | (:generator 4 513 | (if (location= sap rdi) 514 | (unless (= offset 0) 515 | (inst add rdi offset)) 516 | (if (= offset 0) 517 | (inst mov rdi sap) 518 | (inst lea rdi (make-ea :qword :base sap :disp offset)))) 519 | (inst maskmovdqu value mask))) 520 | 521 | (def-splice-transform %maskmoveu-pi (value mask (sap+ sap offset1) offset2) 522 | (%maskmoveu-pi value mask sap (+ offset1 offset2)))) 523 | 524 | ;; Arithmetics 525 | 526 | (def-binary-intrinsic add-pi8 int-sse-pack paddb 1 "_mm_add_epi8" :commutative t) 527 | (def-binary-intrinsic add-pi16 int-sse-pack paddw 1 "_mm_add_epi16" :commutative t) 528 | (def-binary-intrinsic add-pi32 int-sse-pack paddd 1 "_mm_add_epi32" :commutative t) 529 | (def-binary-intrinsic add-pi64 int-sse-pack paddq 1 "_mm_add_epi64" :commutative t) 530 | 531 | (def-binary-intrinsic adds-pi8 int-sse-pack paddsb 1 "_mm_adds_epi8" :commutative t) 532 | (def-binary-intrinsic adds-pi16 int-sse-pack paddsw 1 "_mm_adds_epi16" :commutative t) 533 | (def-binary-intrinsic adds-pu8 int-sse-pack paddusb 1 "_mm_adds_epu8" :commutative t) 534 | (def-binary-intrinsic adds-pu16 int-sse-pack paddusw 1 "_mm_adds_epu16" :commutative t) 535 | 536 | (def-binary-intrinsic avg-pu8 int-sse-pack pavgb 1 "_mm_avg_epu8" :commutative t) 537 | (def-binary-intrinsic avg-pu16 int-sse-pack pavgw 1 "_mm_avg_epu16" :commutative t) 538 | 539 | (def-binary-intrinsic madd-pi16 int-sse-pack pmaddwd 1 "_mm_madd_epi16" :commutative t) 540 | 541 | (def-binary-intrinsic max-pu8 int-sse-pack pmaxub 1 "_mm_max_epu8" :commutative t) 542 | (def-binary-intrinsic max-pi16 int-sse-pack pmaxsw 1 "_mm_max_epi16" :commutative t) 543 | (def-binary-intrinsic min-pu8 int-sse-pack pminub 1 "_mm_min_epu8" :commutative t) 544 | (def-binary-intrinsic min-pi16 int-sse-pack pminsw 1 "_mm_min_epi16" :commutative t) 545 | 546 | (def-binary-intrinsic mulhi-pi16 int-sse-pack pmulhw 3 "_mm_mulhi_epi16" :commutative t) 547 | (def-binary-intrinsic mulhi-pu16 int-sse-pack pmulhuw 3 "_mm_mulhi_epu16" :commutative t) 548 | (def-binary-intrinsic mullo-pi16 int-sse-pack pmullw 3 "_mm_mullo_epi16" :commutative t) 549 | 550 | (def-binary-intrinsic mul-pu32 int-sse-pack pmuludq 3 "_mm_mul_epu32" :commutative t) 551 | 552 | (def-binary-intrinsic sad-pu8 int-sse-pack psadbw 1 "_mm_sad_epu8" :commutative t) 553 | 554 | (def-binary-intrinsic sub-pi8 int-sse-pack psubb 1 "_mm_sub_epi8") 555 | (def-binary-intrinsic sub-pi16 int-sse-pack psubw 1 "_mm_sub_epi16") 556 | (def-binary-intrinsic sub-pi32 int-sse-pack psubd 1 "_mm_sub_epi32") 557 | (def-binary-intrinsic sub-pi64 int-sse-pack psubq 1 "_mm_sub_epi64") 558 | 559 | (def-binary-intrinsic subs-pi8 int-sse-pack psubsb 1 "_mm_subs_epi8") 560 | (def-binary-intrinsic subs-pi16 int-sse-pack psubsw 1 "_mm_subs_epi16") 561 | (def-binary-intrinsic subs-pu8 int-sse-pack psubusb 1 "_mm_subs_epu8") 562 | (def-binary-intrinsic subs-pu16 int-sse-pack psubusw 1 "_mm_subs_epu16") 563 | 564 | ;; Bitwise logic 565 | 566 | #+sbcl 567 | (def-not-intrinsic not-pi int-sse-pack pxor) 568 | 569 | (def-binary-intrinsic and-pi int-sse-pack pand 1 "_mm_and_si128" :commutative t) 570 | (def-binary-intrinsic andnot-pi int-sse-pack pandn 1 "_mm_andnot_si128") 571 | (def-binary-intrinsic or-pi int-sse-pack por 1 "_mm_or_si128" :commutative t) 572 | (def-binary-intrinsic xor-pi int-sse-pack pxor 1 "_mm_xor_si128" :commutative t) 573 | 574 | ;; Shifts 575 | 576 | (def-unary-intrinsic slli-pi int-sse-pack pslldq 1 "_mm_slli_si128" :partial :one-arg :immediate-arg (unsigned-byte 8)) 577 | 578 | (def-sse-int-intrinsic slli-pi16 fixnum int-sse-pack psllw 3 "_mm_slli_epi16" :make-temporary t 579 | :defun-body "_mm_sll_epi16(#0,_mm_cvtsi32_si128(#1))") 580 | (def-sse-int-intrinsic slli-pi32 fixnum int-sse-pack pslld 3 "_mm_slli_epi32" :make-temporary t 581 | :defun-body "_mm_sll_epi32(#0,_mm_cvtsi32_si128(#1))") 582 | (def-sse-int-intrinsic slli-pi64 fixnum int-sse-pack psllq 3 "_mm_slli_epi64" :make-temporary t 583 | :defun-body "_mm_sll_epi64(#0,_mm_cvtsi32_si128(#1))") 584 | (def-binary-intrinsic sll-pi16 int-sse-pack psllw 1 "_mm_sll_epi16") 585 | (def-binary-intrinsic sll-pi32 int-sse-pack pslld 1 "_mm_sll_epi32") 586 | (def-binary-intrinsic sll-pi64 int-sse-pack psllq 1 "_mm_sll_epi64") 587 | 588 | (def-sse-int-intrinsic srai-pi16 fixnum int-sse-pack psraw 3 "_mm_srai_epi16" :make-temporary t 589 | :defun-body "_mm_sra_epi16(#0,_mm_cvtsi32_si128(#1))") 590 | (def-sse-int-intrinsic srai-pi32 fixnum int-sse-pack psrad 3 "_mm_srai_epi32" :make-temporary t 591 | :defun-body "_mm_sra_epi32(#0,_mm_cvtsi32_si128(#1))") 592 | (def-binary-intrinsic sra-pi16 int-sse-pack psraw 1 "_mm_sra_epi16") 593 | (def-binary-intrinsic sra-pi32 int-sse-pack psrad 1 "_mm_sra_epi32") 594 | 595 | (def-unary-intrinsic srli-pi int-sse-pack psrldq 1 "_mm_srli_si128" :partial :one-arg :immediate-arg (unsigned-byte 8)) 596 | 597 | (def-sse-int-intrinsic srli-pi16 fixnum int-sse-pack psrlw 3 "_mm_srli_epi16" :make-temporary t 598 | :defun-body "_mm_srl_epi16(#0,_mm_cvtsi32_si128(#1))") 599 | (def-sse-int-intrinsic srli-pi32 fixnum int-sse-pack psrld 3 "_mm_srli_epi32" :make-temporary t 600 | :defun-body "_mm_srl_epi32(#0,_mm_cvtsi32_si128(#1))") 601 | (def-sse-int-intrinsic srli-pi64 fixnum int-sse-pack psrlq 3 "_mm_srli_epi64" :make-temporary t 602 | :defun-body "_mm_srl_epi64(#0,_mm_cvtsi32_si128(#1))") 603 | (def-binary-intrinsic srl-pi16 int-sse-pack psrlw 1 "_mm_srl_epi16") 604 | (def-binary-intrinsic srl-pi32 int-sse-pack psrld 1 "_mm_srl_epi32") 605 | (def-binary-intrinsic srl-pi64 int-sse-pack psrlq 1 "_mm_srl_epi64") 606 | 607 | #+sbcl 608 | (macrolet ((defimm (name insn bits &key arithmetic) 609 | `(define-vop (,(symbolicate "%" name "-IMM") sse-int-base-op) 610 | (:translate ,name) 611 | (:args (x :scs #.+any-sse-reg+ :target r)) 612 | (:arg-types simd-pack (:constant fixnum)) 613 | (:result-types sb-kernel:simd-pack-int) 614 | (:info immv) 615 | (:generator 1 616 | ,@(let ((core `(progn 617 | (ensure-move int-sse-pack r x) 618 | (unless (= immv 0) 619 | (inst ,insn r immv))))) 620 | (if arithmetic 621 | `((when (or (< immv 0) (>= immv ,bits)) 622 | (setf immv ,bits)) 623 | ,core) 624 | `((if (or (< immv 0) (>= immv ,bits)) 625 | (inst pxor r r) 626 | ,core)))))))) 627 | (defimm slli-pi16 psllw-imm 16) 628 | (defimm slli-pi32 pslld-imm 32) 629 | (defimm slli-pi64 psllq-imm 64) 630 | (defimm srai-pi16 psraw-imm 16 :arithmetic t) 631 | (defimm srai-pi32 psrad-imm 32 :arithmetic t) 632 | (defimm srli-pi16 psrlw-imm 16) 633 | (defimm srli-pi32 psrld-imm 32) 634 | (defimm srli-pi64 psrlq-imm 64)) 635 | 636 | ;; Comparisons 637 | 638 | (def-binary-intrinsic =-pi8 int-sse-pack pcmpeqb 1 "_mm_cmpeq_epi8") 639 | (def-binary-intrinsic =-pi16 int-sse-pack pcmpeqw 1 "_mm_cmpeq_epi16") 640 | (def-binary-intrinsic =-pi32 int-sse-pack pcmpeqd 1 "_mm_cmpeq_epi32") 641 | 642 | #+ecl 643 | (def-binary-intrinsic <-pi8 int-sse-pack nil nil "_mm_cmplt_epi8") 644 | #+ecl 645 | (def-binary-intrinsic <-pi16 int-sse-pack nil nil "_mm_cmplt_epi16") 646 | #+ecl 647 | (def-binary-intrinsic <-pi32 int-sse-pack nil nil "_mm_cmplt_epi32") 648 | 649 | (def-binary-intrinsic >-pi8 int-sse-pack pcmpgtb 1 "_mm_cmpgt_epi8") 650 | (def-binary-intrinsic >-pi16 int-sse-pack pcmpgtw 1 "_mm_cmpgt_epi16") 651 | (def-binary-intrinsic >-pi32 int-sse-pack pcmpgtd 1 "_mm_cmpgt_epi32") 652 | 653 | ;; Misc 654 | 655 | (def-binary-intrinsic packs-pi16 int-sse-pack packsswb 1 "_mm_packs_epi16") 656 | (def-binary-intrinsic packs-pi32 int-sse-pack packssdw 1 "_mm_packs_epi32") 657 | (def-binary-intrinsic packus-pi16 int-sse-pack packuswb 1 "_mm_packus_epi16") 658 | 659 | (def-unary-intrinsic extract-pi16 (unsigned-byte 16) pextrw 1 "_mm_extract_epi16" 660 | :immediate-arg (unsigned-byte 8) :arg-type int-sse-pack) 661 | (def-sse-int-intrinsic insert-pi16 fixnum int-sse-pack pinsrw 1 "_mm_insert_epi16" 662 | :immediate-arg (unsigned-byte 8)) 663 | 664 | (def-unary-intrinsic movemask-pi8 (unsigned-byte 16) pmovmskb 1 "_mm_movemask_epi8" :arg-type int-sse-pack) 665 | 666 | (def-binary-intrinsic unpackhi-pi8 int-sse-pack punpckhbw 1 "_mm_unpackhi_epi8") 667 | (def-binary-intrinsic unpackhi-pi16 int-sse-pack punpckhwd 1 "_mm_unpackhi_epi16") 668 | (def-binary-intrinsic unpackhi-pi32 int-sse-pack punpckhdq 1 "_mm_unpackhi_epi32") 669 | (def-binary-intrinsic unpackhi-pi64 int-sse-pack punpckhqdq 1 "_mm_unpackhi_epi64") 670 | 671 | (def-binary-intrinsic unpacklo-pi8 int-sse-pack punpcklbw 1 "_mm_unpacklo_epi8") 672 | (def-binary-intrinsic unpacklo-pi16 int-sse-pack punpcklwd 1 "_mm_unpacklo_epi16") 673 | (def-binary-intrinsic unpacklo-pi32 int-sse-pack punpckldq 1 "_mm_unpacklo_epi32") 674 | (def-binary-intrinsic unpacklo-pi64 int-sse-pack punpcklqdq 1 "_mm_unpacklo_epi64") 675 | 676 | (def-unary-intrinsic move-pi64 int-sse-pack movq 1 "_mm_move_epi64") 677 | 678 | ;; Shuffle 679 | 680 | (def-unary-intrinsic shuffle-pi32 int-sse-pack pshufd 1 "_mm_shuffle_epi32" :immediate-arg (unsigned-byte 8)) 681 | (def-unary-intrinsic shufflelo-pi16 int-sse-pack pshuflw 1 "_mm_shufflelo_epi16" :immediate-arg (unsigned-byte 8)) 682 | (def-unary-intrinsic shufflehi-pi16 int-sse-pack pshufhw 1 "_mm_shufflehi_epi16" :immediate-arg (unsigned-byte 8)) 683 | 684 | ;; Conversion 685 | 686 | #+sbcl 687 | (progn 688 | (export 'convert-si32-to-pi) 689 | (defknown convert-si32-to-pi ((signed-byte 32)) int-sse-pack (foldable flushable dx-safe)) 690 | (export 'convert-su32-to-pi) 691 | (defknown convert-su32-to-pi ((unsigned-byte 32)) int-sse-pack (foldable flushable dx-safe)) 692 | (export 'convert-si64-to-pi) 693 | (defknown convert-si64-to-pi ((signed-byte 64)) int-sse-pack (foldable flushable dx-safe)) 694 | (export 'convert-su64-to-pi) 695 | (defknown convert-su64-to-pi ((unsigned-byte 64)) int-sse-pack (foldable flushable dx-safe)) 696 | (defknown %set-int ((signed-byte 64)) int-sse-pack (foldable flushable always-translatable dx-safe)) 697 | (defknown %set-uint ((unsigned-byte 64)) int-sse-pack (foldable flushable always-translatable dx-safe)) 698 | 699 | (define-vop (%set-int) 700 | (:translate %set-int %set-uint 701 | convert-si32-to-pi convert-su32-to-pi 702 | convert-si64-to-pi convert-su64-to-pi) 703 | (:args (arg :scs (signed-reg unsigned-reg signed-stack unsigned-stack))) 704 | (:arg-types untagged-num) 705 | (:results (dst :scs (int-sse-reg))) 706 | (:result-types sb-kernel:simd-pack-int) 707 | (:policy :fast-safe) 708 | (:generator 1 709 | (inst movd dst arg)))) 710 | 711 | #+ecl 712 | (progn 713 | (def-intrinsic convert-si32-to-pi (ext:integer32) int-sse-pack "_mm_cvtsi32_si128") 714 | (def-intrinsic convert-su32-to-pi (ext:byte32) int-sse-pack "_mm_cvtsi32_si128") 715 | #+x86_64 716 | (def-intrinsic convert-si64-to-pi (ext:integer64) int-sse-pack #-msvc "_mm_cvtsi64_si128" #+msvc "_mm_cvtsi64x_si128") 717 | #+x86_64 718 | (def-intrinsic convert-su64-to-pi (ext:byte64) int-sse-pack #-msvc "_mm_cvtsi64_si128" #+msvc "_mm_cvtsi64x_si128")) 719 | 720 | (def-cvt-to-int32-intrinsic convert-pi-to-si32 (signed-byte 32) movd 1 "_mm_cvtsi128_si32" 721 | :arg-type int-sse-pack) 722 | (def-unary-intrinsic convert-pi-to-su32 (unsigned-byte 32) movd 1 "_mm_cvtsi128_si32" 723 | :result-size :dword :arg-type int-sse-pack) 724 | 725 | #+(or x86_64 x86-64) 726 | (def-unary-intrinsic convert-pi-to-si64 (signed-byte 64) movd 1 727 | #-msvc "_mm_cvtsi128_si64" #+msvc "_mm_cvtsi128_si64x" :arg-type int-sse-pack) 728 | #+(or x86_64 x86-64) 729 | (def-unary-intrinsic convert-pi-to-su64 (unsigned-byte 64) movd 1 730 | #-msvc "_mm_cvtsi128_si64" #+msvc "_mm_cvtsi128_si64x" :arg-type int-sse-pack) 731 | 732 | -------------------------------------------------------------------------------- /sse-package.lisp: -------------------------------------------------------------------------------- 1 | ;;; -*- mode: Lisp; indent-tabs-mode: nil; -*- 2 | ;;; 3 | ;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com) 4 | ;;; 5 | ;;; This file defines a package for all SSE intrinsics. 6 | ;;; 7 | 8 | #+ecl 9 | (eval-when (:load-toplevel) 10 | (require 'cmp)) 11 | 12 | #+sbcl 13 | (pushnew :SSE2 *features*) 14 | 15 | (defpackage #:SSE 16 | #+sbcl 17 | (:use #:COMMON-LISP #:SB-C #:SB-VM #:SB-INT #:SB-KERNEL #:SB-ASSEM #:SB-EXT #:SB-SYS) 18 | #+sbcl 19 | (:import-from #:SB-VM 20 | #:SINGLE-REG #:DOUBLE-REG 21 | #:INT-SSE-REG #:SINGLE-SSE-REG #:DOUBLE-SSE-REG #:SSE-REG 22 | #:INT-SSE-STACK #:SINGLE-SSE-STACK #:DOUBLE-SSE-STACK 23 | #:INT-SSE-IMMEDIATE #:SINGLE-SSE-IMMEDIATE #:DOUBLE-SSE-IMMEDIATE 24 | #:SIGNED-REG #:SIGNED-STACK #:UNSIGNED-REG #:UNSIGNED-STACK 25 | #:SIGNED-NUM #:UNSIGNED-NUM #:UNTAGGED-NUM #:IMMEDIATE 26 | #:SAP-REG #:DESCRIPTOR-REG #:ANY-REG #:TAGGED-NUM 27 | #:RAX-OFFSET #:RDI-OFFSET #:RBP-TN #:FRAME-BYTE-OFFSET 28 | #:MAKE-EA #:REG-IN-SIZE #:LOADW) 29 | #+sbcl 30 | (:import-from #:SB-C 31 | #:SPLICE-FUN-ARGS #:EXTRACT-FUN-ARGS 32 | #:%DEFTRANSFORM #:COMMUTATIVE-ARG-SWAP #:DX-SAFE 33 | #:GIVE-UP-IR1-TRANSFORM #:ABORT-IR1-TRANSFORM 34 | #:INSERT-ARRAY-BOUNDS-CHECKS #:VECTOR-LENGTH 35 | #:ASSERT-ARRAY-RANK #:ASSERT-LVAR-TYPE 36 | #:CONSTANT-LVAR-P #:LVAR-VALUE #:LVAR-TYPE #:LVAR-USES 37 | #:LVAR-FUN-NAME #:BASIC-COMBINATION-FUN 38 | #:LEXENV-POLICY #:NODE-LEXENV #:POLICY 39 | #:CAST-P #:CAST-VALUE #:DELETE-FILTER 40 | #:FIND-SAETP #:FIND-SAETP-BY-CTYPE) 41 | #+sbcl 42 | (:import-from #:SB-IMPL 43 | #:%ARRAY-ROW-MAJOR-INDEX) 44 | #+sbcl 45 | (:shadow #:INT-SSE-PACK #:FLOAT-SSE-PACK #:DOUBLE-SSE-PACK) 46 | #+ecl 47 | (:use #:COMMON-LISP #:FFI) 48 | #+ecl 49 | (:import-from #:EXT 50 | #:INT-SSE-PACK #:FLOAT-SSE-PACK #:DOUBLE-SSE-PACK 51 | #:SSE-PACK-P #:ARRAY-ELEMENT-TYPE-BYTE-SIZE 52 | #:*SSE-PACK-PRINT-MODE*) 53 | #+ecl 54 | (:shadow #:SSE-PACK) 55 | ;; Common exports: 56 | (:export #:SSE-PACK #:SSE-PACK-P 57 | #:INT-SSE-PACK #:FLOAT-SSE-PACK #:DOUBLE-SSE-PACK 58 | #:*SSE-PACK-PRINT-MODE* 59 | #:SSE-ARRAY #:MAKE-SSE-ARRAY 60 | #:0.0-PS #:TRUE-SS #:FALSE-SS #:TRUE-PS #:FALSE-PS 61 | #:SET1-PS #:SET-PS #:SETR-PS #:SETZERO-PS 62 | #:0.0-PD #:TRUE-SD #:FALSE-SD #:TRUE-PD #:FALSE-PD 63 | #:SET1-PD #:SET-PD #:SETR-PD #:SETZERO-PD 64 | #:0-PI #:TRUE-PI #:FALSE-PI #:SETZERO-PI 65 | #:CPU-MXCSR #:CPU-MXCSR-BITS #:WITH-SAVED-MXCSR #:CPU-CONFIGURE-ROUNDING)) 66 | 67 | -------------------------------------------------------------------------------- /sse-utils.lisp: -------------------------------------------------------------------------------- 1 | ;;; -*- mode: Lisp; indent-tabs-mode: nil; -*- 2 | ;;; 3 | ;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com) 4 | ;;; 5 | ;;; This file implements some common utility functions. 6 | ;;; 7 | 8 | (in-package #:SSE) 9 | 10 | ;;; CPU control 11 | 12 | (eval-when (:compile-toplevel :load-toplevel :execute) 13 | (declaim (ftype (function (&rest t) (unsigned-byte 32)) cpu-mxcsr-bits)) 14 | (defun cpu-mxcsr-bits (&rest tags) 15 | (loop with mask = 0 16 | for tag in tags 17 | for bit = (if (listp tag) 18 | (apply #'cpu-mxcsr-bits tag) 19 | (ecase tag 20 | (:except-invalid #x1) 21 | (:except-denormal #x2) 22 | (:except-divide-zero #x4) 23 | (:except-overflow #x8) 24 | (:except-underflow #x10) 25 | (:except-precision #x20) 26 | (:except-all #x3F) 27 | (:denormals-are-zero #x40) 28 | (:mask-invalid #x80) 29 | (:mask-denormal #x100) 30 | (:mask-divide-zero #x200) 31 | (:mask-overflow #x400) 32 | (:mask-underflow #x800) 33 | (:mask-precision #x1000) 34 | (:mask-all #x1f80) 35 | (:round-nearest 0) 36 | (:round-negative #x2000) 37 | (:round-positive #x4000) 38 | (:round-zero #x6000) 39 | (:round-bits #x6000) 40 | (:flush-to-zero #x8000))) 41 | do (setf mask (logior mask bit)) 42 | finally (return mask))) 43 | (defun expand-cpu-mxcsr-bits (tags on-fail) 44 | (loop for tag in tags 45 | when (keywordp tag) collect tag into kwds 46 | else collect tag into rest 47 | finally 48 | (return 49 | (cond ((and kwds rest) 50 | `(logior ,(apply #'cpu-mxcsr-bits kwds) 51 | (cpu-mxcsr-bits ,@rest))) 52 | (kwds 53 | (apply #'cpu-mxcsr-bits kwds)) 54 | (t on-fail)))))) 55 | 56 | (define-compiler-macro cpu-mxcsr-bits (&whole whole &rest tags) 57 | (expand-cpu-mxcsr-bits tags whole)) 58 | 59 | (defmacro with-saved-mxcsr (&body code) 60 | (let ((v (gensym "CSR"))) 61 | `(let ((,v (cpu-mxcsr))) 62 | (declare (type (unsigned-byte 32) ,v) 63 | #+ecl (:read-only ,v)) 64 | (unwind-protect (progn ,@code) 65 | (%set-cpu-mxcsr ,v))))) 66 | 67 | #+nil 68 | (defun cpu-check-exceptions (&rest tags) 69 | (let ((mask (logand (cpu-mxcsr-bits (or tags :except-all)) 70 | (cpu-mxcsr-bits :except-all))) 71 | (csr (get-cpu-mxcsr))) 72 | (declare (optimize (safety 0) (speed 3) (debug 0)) 73 | (type fixnum csr mask)) 74 | (not (zerop (logand mask csr))))) 75 | 76 | #+nil 77 | (define-compiler-macro cpu-check-exceptions (&whole whole &rest tags) 78 | (let ((bits (expand-cpu-mxcsr-bits (or tags '(except-all)) nil))) 79 | (if (integerp bits) 80 | `(locally (declare (optimize (speed 3) (safety 0) (debug 0))) 81 | (not (zerop (logand (cpu-get-mxcsr) 82 | ,(logand bits (cpu-mxcsr-bits :except-all)))))) 83 | whole))) 84 | 85 | #+nil 86 | (macrolet ((foo (&rest names) 87 | (let* ((kwds (mapcar (lambda (x) (intern (format nil "MASK-~A" x) :keyword)) names)) 88 | (pvars (mapcar (lambda (x) (intern (format nil "~A-P" x))) names))) 89 | `(defun cpu-mask-exceptions (&key 90 | ,@(mapcar (lambda (n p) `(,n nil ,p)) names pvars) 91 | (other nil rest-p)) 92 | (let ((set-bits (logior ,@(mapcar (lambda (n k) `(if ,n (cpu-mxcsr-bits ,k) 0)) names kwds))) 93 | (arg-bits (logior ,@(mapcar (lambda (p k) `(if ,p (cpu-mxcsr-bits ,k) 0)) pvars kwds)))) 94 | (%set-cpu-mxcsr 95 | (the fixnum 96 | (if (not rest-p) 97 | (logior set-bits (logand (get-cpu-mxcsr) (lognot arg-bits))) 98 | (logior set-bits 99 | (if other (logand (cpu-mxcsr-bits :mask-all) (lognot arg-bits)) 0) 100 | (logiand (get-cpu-mxcsr) (lognot (cpu-mxcsr-bits :mask-all))))))) 101 | nil))))) 102 | (foo invalid denormal divide-zero overflow underflow precision)) 103 | 104 | (defun cpu-configure-rounding (&key round-to 105 | (denormals-are-zero nil daz-p) 106 | (flush-to-zero nil ftz-p)) 107 | (let ((set 0) 108 | (mask 0)) 109 | (when round-to 110 | (setf mask (cpu-mxcsr-bits :round-bits) 111 | set (ecase round-to 112 | (:zero (cpu-mxcsr-bits :round-zero)) 113 | (:negative (cpu-mxcsr-bits :round-negative)) 114 | (:positive (cpu-mxcsr-bits :round-positive)) 115 | (:nearest (cpu-mxcsr-bits :round-nearest))))) 116 | (when daz-p 117 | (setf mask (logior mask (cpu-mxcsr-bits :denormals-are-zero))) 118 | (when denormals-are-zero 119 | (setf set (logior set (cpu-mxcsr-bits :denormals-are-zero))))) 120 | (when ftz-p 121 | (setf mask (logior mask (cpu-mxcsr-bits :flush-to-zero))) 122 | (when flush-to-zero 123 | (setf set (logior set (cpu-mxcsr-bits :flush-to-zero))))) 124 | (setf (cpu-mxcsr) 125 | (the (unsigned-byte 32) 126 | (logior set (logand (cpu-mxcsr) (lognot mask))))) 127 | nil)) 128 | 129 | -------------------------------------------------------------------------------- /test-sfmt.lisp: -------------------------------------------------------------------------------- 1 | ;;; -*- mode: Lisp; indent-tabs-mode: nil; -*- 2 | ;;; 3 | ;;; Dumbly translated from C code at: http://github.com/jj1bdx/sfmt-extstate 4 | 5 | (eval-when (:compile-toplevel :load-toplevel :execute) 6 | (require :cl-simd)) 7 | 8 | (defpackage #:sfmt-test 9 | (:use #:common-lisp #:sse)) 10 | 11 | (in-package #:sfmt-test) 12 | 13 | (deftype uint32 () '(unsigned-byte 32)) 14 | (deftype uint32-vector () '(sse-array uint32 (*))) 15 | 16 | (defconstant +mexp+ 19937) 17 | (defconstant +n+ (1+ (floor +mexp+ 128))) 18 | (defconstant +pos1+ 122) 19 | (defconstant +sl1+ 18) 20 | (defconstant +sl2+ 1) 21 | (defconstant +sr1+ 11) 22 | (defconstant +sr2+ 1) 23 | 24 | (defconstant +msk1+ #xdfffffef) 25 | (defconstant +msk2+ #xddfecb7f) 26 | (defconstant +msk3+ #xbffaffff) 27 | (defconstant +msk4+ #xbffffff6) 28 | 29 | (defconstant +parity1+ #x00000001) 30 | (defconstant +parity2+ #x00000000) 31 | (defconstant +parity3+ #x00000000) 32 | (defconstant +parity4+ #x13c9e684) 33 | 34 | (defconstant +uint32-mask+ #xFFFFFFFF) 35 | 36 | (defvar *work-buffer* (make-sse-array (* +n+ 4) :element-type 'uint32)) 37 | 38 | (defun period-certification (buffer) 39 | (declare (type uint32-vector buffer)) 40 | (let ((inner (logxor (logand (aref buffer 0) +parity1+) 41 | (logand (aref buffer 1) +parity2+) 42 | (logand (aref buffer 2) +parity3+) 43 | (logand (aref buffer 3) +parity4+)))) 44 | (loop for i = 16 then (ash i -1) while (> i 0) 45 | do (setf inner (logxor inner (ash inner (- i))))) 46 | (when (logtest inner 1) 47 | (return-from period-certification))) 48 | (loop 49 | for i from 0 to 3 50 | for parity in (load-time-value (list +parity1+ +parity2+ +parity3+ +parity4+)) 51 | do (loop 52 | for work = 1 then (ash work 1) 53 | for j from 0 below 32 54 | when (/= 0 (logand work parity)) 55 | do (progn 56 | (setf (aref buffer i) 57 | (logxor (aref buffer i) work)) 58 | (return-from period-certification))))) 59 | 60 | (defun init-gen-rand (seed buffer) 61 | (declare (type uint32 seed) 62 | (type uint32-vector buffer)) 63 | (setf (aref buffer 0) seed) 64 | (loop for i from 1 below (array-total-size buffer) 65 | do (setf (aref buffer i) 66 | (logand +uint32-mask+ 67 | (+ i 68 | (* 1812433253 (logxor (aref buffer (1- i)) 69 | (ash (aref buffer (1- i)) -30))))))) 70 | (period-certification buffer)) 71 | 72 | ;; Should be an inline function, but it's broken in ECL 73 | (defmacro recursion (a b c d mask) 74 | `(let ((x ,a) 75 | (y (srli-pi32 ,b +sr1+)) 76 | (z (srli-pi ,c +sr2+)) 77 | (v (slli-pi32 ,d +sl1+)) 78 | (m ,mask)) 79 | (xor-pi (xor-pi (xor-pi z x) v) 80 | (xor-pi (slli-pi x +sl2+) 81 | (and-pi y m))))) 82 | 83 | (defmacro sfmt-aref (buf idx) 84 | `(row-major-aref-api ,buf (the fixnum (* 4 (the fixnum ,idx))))) 85 | 86 | (defun gen-rand-all (buffer) 87 | (declare (optimize (speed 3) #+ecl (safety 0) (debug 0) 88 | #+sbcl (sb-c::insert-array-bounds-checks 0)) 89 | (type uint32-vector buffer)) 90 | #+ecl (check-type buffer uint32-vector) 91 | (assert (= (array-total-size buffer) (* +n+ 4))) 92 | (let ((mask (set-pu32 +msk4+ +msk3+ +msk2+ +msk1+)) 93 | (r1 (sfmt-aref buffer (- +n+ 2))) 94 | (r2 (sfmt-aref buffer (- +n+ 1)))) 95 | (declare (type int-sse-pack mask r1 r2)) 96 | (macrolet ((twist (delta) 97 | `(psetq r1 r2 98 | r2 (setf (sfmt-aref buffer i) 99 | (recursion (sfmt-aref buffer i) 100 | (sfmt-aref buffer (+ i (the fixnum ,delta))) 101 | r1 r2 mask))))) 102 | (loop for i fixnum from 0 below (- +n+ +pos1+) 103 | do (twist +pos1+)) 104 | (loop for i fixnum from (- +n+ +pos1+) below +n+ 105 | do (twist (- +pos1+ +n+)))))) 106 | 107 | (defun gen-rand-array (output buffer) 108 | (declare (optimize (speed 3) #+ecl (safety 0) (debug 0) 109 | #+sbcl (sb-c::insert-array-bounds-checks 0)) 110 | (type uint32-vector buffer output)) 111 | #+ecl (check-type buffer uint32-vector) 112 | #+ecl (check-type output uint32-vector) 113 | (assert (= (array-total-size buffer) (* +n+ 4))) 114 | (let ((mask (set-pu32 +msk4+ +msk3+ +msk2+ +msk1+)) 115 | (size (floor (array-total-size output) 4)) 116 | (r1 (sfmt-aref buffer (- +n+ 2))) 117 | (r2 (sfmt-aref buffer (- +n+ 1)))) 118 | (declare (type int-sse-pack mask r1 r2) 119 | (type fixnum size)) 120 | (assert (> size (* +n+ 2))) 121 | (macrolet ((twist (tgt src1 delta1 src2 delta2) 122 | `(psetq r1 r2 123 | r2 (setf (sfmt-aref ,tgt i) 124 | (recursion (sfmt-aref ,src1 (- i (the fixnum ,delta1))) 125 | (sfmt-aref ,src2 (+ i (the fixnum ,delta2))) 126 | r1 r2 mask))))) 127 | (loop for i fixnum from 0 below (- +n+ +pos1+) 128 | do (twist output buffer 0 buffer +pos1+)) 129 | (loop for i fixnum from (- +n+ +pos1+) below +n+ 130 | do (twist output buffer 0 output (- +pos1+ +n+))) 131 | (loop for i fixnum from +n+ below (- size +n+) 132 | do (twist output output +n+ output (- +pos1+ +n+))) 133 | #+ () 134 | (loop for j fixnum from 0 below (- (* 2 +n+) size) 135 | do (setf (sfmt-aref buffer j) 136 | (sfmt-aref output (+ j (the fixnum (- size +n+)))))) 137 | (loop 138 | for i fixnum from (- size +n+) below size 139 | for j fixnum from 0 below +n+ ;(max 0 (- (* 2 +n+) size)) 140 | do (twist output output +n+ output (- +pos1+ +n+)) 141 | do (setf (sfmt-aref buffer j) r2)) 142 | output))) 143 | 144 | (defun test () 145 | (let ((out (make-sse-array 10000 :element-type 'uint32))) 146 | (init-gen-rand 1234 *work-buffer*) 147 | (gen-rand-array out *work-buffer*) 148 | (assert (equal (coerce (subseq out 995 1000) 'list) 149 | '(2499610950 3057240914 1662679783 461224431 1168395933))) 150 | (gen-rand-array out *work-buffer*) 151 | (assert (equal (coerce (subseq out 995 1000) 'list) 152 | '(648219337 458306832 3674950976 4030368244 2918117049))))) 153 | 154 | (dotimes (i 10) 155 | (test)) 156 | 157 | --------------------------------------------------------------------------------