├── LICENSE
├── README.md
├── arch.bqn
├── archutil.bqn
├── data
│   ├── armv8_ext.txt
│   ├── iintrinsic.bqn
│   ├── rv_ext.txt
│   ├── x86_ext.txt
│   └── x86_strict_ext.txt
├── doc
│   ├── compiler.md
│   ├── interpreter.md
│   ├── minfilter.md
│   └── permutations.md
├── emit_c.bqn
├── float2.bqn
├── include
│   ├── README.md
│   ├── arch
│   │   ├── c.singeli
│   │   ├── iintrinsic
│   │   │   ├── basic.singeli
│   │   │   ├── basic_impl.singeli
│   │   │   ├── basic_strict.singeli
│   │   │   ├── misc.singeli
│   │   │   └── select.singeli
│   │   └── neon_intrin
│   │       ├── basic.singeli
│   │       └── select.singeli
│   ├── clib
│   │   └── malloc.singeli
│   ├── debug
│   │   └── printf.singeli
│   ├── skin
│   │   ├── c.singeli
│   │   ├── cext.singeli
│   │   ├── cmut.singeli
│   │   └── cop.singeli
│   └── util
│       ├── for.singeli
│       ├── functionize.singeli
│       ├── kind.singeli
│       ├── perv.singeli
│       └── tup.singeli
├── ir.bqn
├── singeli
├── singeli.bqn
└── test
    ├── README.md
    ├── alias.c
    ├── alias.in
    ├── alias.ir
    ├── anon.in
    ├── anon.ir
    ├── apply.in
    ├── apply.ir
    ├── arch
    │   ├── .gitignore
    │   ├── base.singeli
    │   ├── general
    │   │   ├── broadcast-sel.singeli
    │   │   ├── imm-shuffle-select.singeli
    │   │   └── makefile
    │   ├── simd.singeli
    │   └── to-c-args.bqn
    ├── blockmut.in
    ├── blockmut.ir
    ├── call.c
    ├── call.in
    ├── cond.in
    ├── cond.ir
    ├── const.in
    ├── const.ir
    ├── destruct.in
    ├── destruct.ir
    ├── each.in
    ├── each.ir
    ├── else.in
    ├── else.ir
    ├── excon.c
    ├── excon.in
    ├── export.in
    ├── export.ir
    ├── fnarr.c
    ├── fnarr.in
    ├── fnrec.in
    ├── fnrec.ir
    ├── fntup.in
    ├── fntup.ir
    ├── for.in
    ├── for.ir
    ├── forin.in
    ├── forin.ir
    ├── fortup.in
    ├── fortup.ir
    ├── fun.in
    ├── fun.ir
    ├── genext.in
    ├── genext.ir
    ├── goto.c
    ├── goto.in
    ├── hello.c
    ├── hello.in
    ├── ifconst.in
    ├── ifconst.ir
    ├── local.in
    ├── local.ir
    ├── logic.in
    ├── logic.ir
    ├── match.in
    ├── match.ir
    ├── mfor.in
    ├── mfor.ir
    ├── mut.in
    ├── mut.ir
    ├── oper.in
    ├── oper.ir
    ├── oppar.in
    ├── oppar.ir
    ├── partial.in
    ├── partial.ir
    ├── proto.c
    ├── proto.in
    ├── qual.in
    ├── qual.ir
    ├── run
    ├── spread.in
    ├── spread.ir
    ├── tup.in
    ├── tup.ir
    ├── uload.in
    ├── uload.ir
    ├── undefined.c
    ├── undefined.in
    ├── varpar.in
    ├── varpar.ir
    ├── voidfn.in
    ├── voidfn.ir
    ├── vtype.c
    └── vtype.in
/LICENSE: -------------------------------------------------------------------------------- 1 | ISC License 2 | 3 | Copyright (c) 2021, Marshall Lochbaum 4 | 5 | Permission to use, copy, modify, and/or distribute this software for any 6 | purpose with or without fee is hereby granted, provided that the above 7 | copyright notice and this permission notice appear in all copies. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
16 | -------------------------------------------------------------------------------- /arch.bqn: -------------------------------------------------------------------------------- 1 | # CPU architecture namespace, based on provided feature set argfeats 2 | argfeats‿infer ← ⋈⟜0⍟(0=≠) •args 3 | 4 | arches‿ReadDeps‿ReadNative‿ToUpper ← •Import "archutil.bqn" 5 | 6 | # Process feature flags and detect architecture family 7 | allfeats ← ∾ archfeats ← {𝕩.feats}¨ arches 8 | AllNat ← ∊⟜allfeats⊸/ ReadNative 9 | MaybeNat ← "NATIVE"⊸≡¨ (¬⊸/∾AllNat)⍟((0=≠)⊸(∨´)⊣) ⊢ 10 | arg ← ⟨"NONE"⟩⊸≢◶⟨⟩‿MaybeNat ToUpper argfeats 11 | all ← "ALL"⊸≡¨ arg 12 | !∘(∾"Unknown features:"<⊸∾' '∾¨/⟜arg)⍟(∨´) ¬all∨arg∊allfeats 13 | "Incompatible features" ! ∨´ supp ← (all¬⊸/arg)⊸(∧´∊)¨ archfeats 14 | archDesc ← ⊑ supp / arches 15 | width‿VecType‿header ⇐ ⟨aname⇐name⟩ ← archDesc 16 | 17 | # Parse dependencies for the chosen architecture 18 | ⟨feats, mat⟩ ⇐ { 19 | TSort ← {{𝕊⍟(𝕩<○≠⊢)⟜(𝕩∾·/𝕨⊸<)𝕨∨∧´∘⊏⟜𝕨¨p}⟜/0¨p←𝕩} # Topological sort 20 | # d is a list of extension dependency chains 21 | u ← ⍷∾ d ← ReadDeps archDesc.ExtFile infer 22 | u ⊏˜↩ TSort ∾¨ (⊢⊔○∾(¯1↓↑)¨) ("=⌜⊢)⊸⊔𝕩 13 | d←+`tt←1-(+˜⊸+´'/'=0‿¯1⊸⊏)¨tags # Tag type: ¯1 close, 0 void, 1 open 14 | tp←(⍋⊏⟜d)⊸⊏∘/˘ 1‿¯1=⌜tt # Tag pairs 15 | ! (∧`' '⊸≠)⊸/¨⊸≡⟜(1⊸↓¨)˝tp⊏tags # Tag matching 16 | oi←(0·≠`'"'⊸=)⊸⊔𝕩 23 | ⟨name, >(E˝·∨`"="""=⌜⊢)⊸⊔¨a⟩ 24 | } 25 | 26 | #⌜ 27 | # Now process the contents 28 | isaList ← "SSE"‿"SSE2"‿"SSE3"‿"SSSE3"‿"SSE4.1"‿"SSE4.2"‿"AVX"‿"AVX2"‿"FMA" 29 | 30 | names‿GetCont‿GetVoid‿svml ← { 31 | parent‿open‿cont ← ParseXml xml 32 | findOpen ← {(⍷𝕩)⊸⊐⊏(⊔⊐𝕩)˙} (∧`' '⊸≠)⊸/¨ open 33 | _on_ ← {𝔽○((∾FindOpen𝕘)⊸⊏)} 34 | child ← ⊔ parent 35 | intr ⇐ cont ∊⟜isaList⊸/_on_⟨"CPUID"⟩ parent 36 | IG ← intr⊏⊔ 37 | GetCont ⇐ { parent IG _on_𝕩 cont } 38 | GetVoid ⇐ { parent IG⟜((¯1⊑·ParseAttr ¯1⊸↓)¨)_on_𝕩 open } 39 | at ← (1⊑ParseAttr)¨ intr⊏open 40 | names ⇐ ≡¨⟜(<"name")⊸(⊑∘/)˝∘⍉¨ at 41 | svml ⇐ (⊑"tech"‿"SVML"∊⊢)¨ at 42 | } 43 | 44 | ProcType ← { 45 | IsDig ← 1=0‿10⍋-⟜'0' 46 | Nat ← 10⊸×⊸+˜´∘⌽ -⟜'0' 47 | Num ← {𝕊⁼:•Repr𝕩; (0<≠)◶1‿Nat IsDig⊸/𝕩} 48 | t‿n‿e ← ((1⊏˘𝕩)∾<"")⊏˜(⊏˘𝕩)⊐"type"‿"varname"‿"etype" 49 | pre ← "" 50 | t ↩ " const" {c←𝕨≡(-≠𝕨)↑𝕩⋄pre∾↩c⊏"&*"⋄(-c×≠𝕨)↓𝕩}⟜((-1+' '=¯2⊸⊑)⊸↓)⍟('*'≡¯1⊸⊑) t 51 | {𝕤⋄pre‿t↩"IMM"‿""}⍟("IMM"⊸≡) e 52 | EP ← (∾·((⥊¨"uifbm")⊑˜"UI"‿"SI"‿"FP"‿"M"‿"MASK"⊸⊐⌾<)⌾⊑IsDig⊸⊔) e˙ 53 | tp‿act ← <˘⍉∘‿2⥊⟨ 54 | "void" , ⊢ 55 | "int" , "i32" 56 | "float" , "f32" 57 | "double" , "f64" 58 | "__m" , (∊⟜"bm"⌾<⊑∘⊢)◶⟨("["∾"]"∾˜÷⌾Num)∾⊢, ⊏∘⊢∾·IsDig⊸/⊣⟩⟜EP 59 | "" , ⊢ 60 | ⟩ 61 | act ∾↩ ⟨EP⟩ # Various integer types 62 | ⟨n, pre∾(tp⊸⊐⌾<(∧`∘¬IsDig)⊸/)◶act t⟩ 63 | } 64 | 65 | proto ← (¯1↓⍟(""‿"void"≡⊑)ProcType¨)¨ GetVoid "return"‿"parameter" 66 | cpuid‿cat ← GetCont∘(⥊<)¨ "CPUID"‿"category" 67 | cpuid ↩ isaList ⊐ ⊑¨cpuid 68 | #instrs ← 0‿1⊸⊑¨¨ GetVoid ⟨"instruction"⟩ # x86 instruction name 69 | 70 | #⌜ 71 | # Singeli-specific adjustments; try to fix wrong signedness 72 | _seg ← {(𝔽·+`'_'⊸=)⊸/} 73 | GetSname ← 3⊸=_seg⊸(∾˜)⍟("_mask"⊸≡)⟜(2⊸=_seg) 74 | # Exclude intrinsics covered by basic.singeli 75 | excl ← E˜∘=⟜' '⊸⊔"setr set set1 loadu load storeu store extract insert and or xor andnot add sub adds subs min max mullo mul slli srai srli sll sra srl sllv srav srlv cmpeq cmpgt cmp cmpneq cmpge cmplt cmple div sqrt floor ceil round abs sign avg shuffle shufflehi shufflelo permute permutevar permute2f128 permute2x128 permute4x64 permute8x32 permutevar8x32 unpacklo unpackhi bslli bsrli alignr blend blendv" 76 | incl ← E˜∘=⟜' '⊸⊔"_mm_cmp_pd _mm_cmp_ps _mm256_set_m128 
_mm256_set_m128d _mm256_set_m128i _mm256_setr_m128 _mm256_setr_m128d _mm256_setr_m128i _mm_mul_epu32 _mm_mul_epi32 _mm256_mul_epi32 _mm256_mul_epu32" 77 | filter ← ∧´ ⟨ 78 | (names∊incl) ∨ ¬((1↓GetSname)¨ names)∊excl 79 | ¬ ∨˝"_ss"‿"_sd"‿"1"(⊣≡-∘≠⊸↑)⌜names 80 | svml < cpuid<≠isaList 81 | ¬ (∨´·("&b"∧´∘∊⊢)¨1⊑¨⊢)¨ proto 82 | (⊑'b'∊1⊑⊑)◶⟨1,⊑·(⊏∊1⊸↓)1⊑¨⊢⟩¨ proto 83 | ⟩ 84 | names‿proto‿cpuid‿cat filter⊸/¨↩ 85 | 86 | sname ← GetSname¨ names 87 | 88 | pi ← ("pi"≡¯2↑·(¬·∨`'0'⊸≤∧'9'⊸≥)⊸/(∧`⌾⌽'_'⊸≠)⊸/)¨names 89 | pi ∧↩ ¬sname∊"_mullo"‿"_srl"‿"_srli"‿"_srlv"‿"_abs" 90 | proto ((-´"iu")×(<<0‿1)×'u'=⊢)⊸+⌾(pi⊸/)↩ 91 | 92 | id ← ⊐ sname ≍˘ 1↓¨proto 93 | Disamb ← { U←{¬∧´∊𝕩} ⋄ 𝕩 2⊸<_seg¨⊸(∾¨˜⍟U˜)⍟U ("_"∾·¬∘∊⟜"[]"⊸/1⊑⊑)¨𝕨 } 94 | sname ∾¨↩ proto (1<≠∘⊢)◶⟨""¨,Disamb⟩¨⌾(id⊸⊔) names 95 | 96 | #⌜ 97 | # Format as Singeli definition 98 | MakeDef ← {instr 𝕊 prot: 99 | # Utilities 100 | Br←"{"∾∾⟜"}" ⋄ A←∾⟜Br 101 | Int←⥊<⊸(≍˘) ⋄ J←∾1↓Int 102 | # Type handling and formatting 103 | pn‿pt ← <˘⍉> prot 104 | an←1↓pn ⋄ ! ∊⟜""‿"dst"‿"k"⌾< ⊑pn 105 | pp ← "*&"∊˜⊑¨pt 106 | rt‿at ← (⊑⋈1⊸↓) pp↓¨pt 107 | ri ← "void"⊸≢◶⟨≠at, at⊸⊐⌾<⟩ rt 108 | iv ← "bm"∊˜⊑¨at ⋄ vf ← "IMM"⊸≢¨ at 109 | c‿i ← iv(<⋈∧)vf∧∊at # c for type constant, i for intvec 110 | uf ← vf ∧ (ri=↕∘≠)⊸∨ iv ∨ ¬(∊∧∊⌾⌽)at # Which types are named 111 | tn ← uf⥊¨'T'- 1-˜1↓⊐0∾uf×1+⊐at 112 | vt ← at (0<≠∘⊣)◶⟨"("∾")"∾˜⊢,∾⟜"=="⊸∾⟩¨˜⌾(c/⊢) tn 113 | par ← an ∾¨ vf":"⊸∾⍟⊣¨ (1↓pp)"*"⊸∾⍟⊣¨ vt 114 | conds ← (tn {"intvec"A∾⟨1↓𝕩,",",𝕨⟩}¨○(i/⊢) at) ∾ "num"⊸A¨ vf¬⊸/an 115 | rt ↩ "__pnt"⊸A⍟(⊑pp) (≠at)⊸=◶⟨⊑⟜tn, rt⟩ ri 116 | ∾⟨ 117 | Br ∾ 1↓ (", " Int par) ∾ " if "⌾⊑⍟(0<≠) " and " Int conds 118 | " = emit" 119 | Br ", "J ⟨rt, "'"(∾∾⊣)instr⟩ ∾ an 120 | ⟩ 121 | } 122 | defs ← cpuid ⊔○((⍋cat)⊸⊏) sname {∾"def _"‿𝕨‿𝕩}¨ names MakeDef¨ proto 123 | •Out 1⌽" 124 | local { 125 | def intvec{w,T} = 0 126 | def intvec{(width{V}),V=[_]T if isint{T}} = 1 127 | def num{T} = is{'number',kind{T}} 128 | }" 129 | •Out¨ ∾ 1↓⥊(<2⥊<⟨⟩) ∾˘ ("#"⊸∾¨isaList) <⊸≍˘ defs 130 | -------------------------------------------------------------------------------- /data/rv_ext.txt: -------------------------------------------------------------------------------- 1 | ZAAMO RVA 2 | ZALRSC RVA 3 | ZBA RVB 4 | ZBB RVB 5 | ZBS RVB 6 | RVF RVD 7 | RVA RVG 8 | RVD RVG 9 | RVI RVG 10 | RVM RVG 11 | ZICNTR RVG 12 | ZIFENCEI RVG 13 | RVS RVH 14 | RVD RVQ 15 | RVU 16 | RVV 17 | ZA128RS 18 | ZA64RS 19 | ZAAMO ZABHA 20 | ZAAMO ZACAS 21 | ZALASR 22 | ZAMA16B 23 | ZAWRS 24 | ZBC 25 | RVD ZCD 26 | ZCA ZCE 27 | ZCB ZCE 28 | ZCMP ZCE 29 | ZCMT ZCE 30 | RVC ZCMOP 31 | ZCA ZCMT 32 | ZCB ZCMT 33 | ZCMP ZCMT 34 | ZCMT ZCMT 35 | RVF ZFA 36 | RVF ZFBFMIN 37 | ZFH ZFBFMIN 38 | RVF ZFHMIN 39 | ZHINX 40 | ZIC64B 41 | ZICBOM 42 | ZICBOP 43 | ZICBOZ 44 | ZICCAMOA 45 | ZICCAMOC 46 | ZICCIF 47 | ZICCLSM 48 | ZICCRSE 49 | ZICFILP 50 | ZICFISS 51 | ZICSR ZICNTR 52 | ZICOND 53 | ZIHINTNTL 54 | ZIHINTPAUSE 55 | SMHPM ZIHPM 56 | ZIMOP 57 | ZKN ZK 58 | ZKR ZK 59 | ZKT ZK 60 | ZBKB ZKN 61 | ZBKC ZKN 62 | ZBKX ZKN 63 | ZKND ZKN 64 | ZKNE ZKN 65 | ZKNH ZKN 66 | ZBKB ZKS 67 | ZBKC ZKS 68 | ZBKX ZKS 69 | ZKND ZKS 70 | ZKNE ZKS 71 | ZKNH ZKS 72 | ZKSED 73 | ZKSH 74 | ZMMUL 75 | ZVKB ZVBB 76 | ZFBFMIN ZVFBFWMA 77 | ZVFBFMIN ZVFBFWMA 78 | ZFHMIN ZVFH 79 | ZVE32F ZVFH 80 | ZVE32F ZVFHMIN 81 | ZVKB ZVKN 82 | ZVKNED ZVKN 83 | ZVKNHB ZVKN 84 | ZVKT ZVKN 85 | ZVBC ZVKNC 86 | ZVKN ZVKNC 87 | ZVKG ZVKNG 88 | ZVKN ZVKNG 89 | ZVKNHA ZVKNHB 90 | ZVKB ZVKS 91 | ZVKSED ZVKS 92 | ZVKSH ZVKS 93 | ZVKT ZVKS 94 | ZVBC ZVKSC 95 | ZVKS ZVKSC 96 | ZVKG ZVKSG 97 | ZVKS ZVKSG 98 | 
-------------------------------------------------------------------------------- /data/x86_ext.txt: -------------------------------------------------------------------------------- 1 | SSE2 X86_64 2 | MMX SSE SSE2 SSE3 POPCNT SSE4A FMA4 3 | SSE3 SSSE3 SSE4.1 SSE4.2 4 | POPCNT SSE4.2 PCLMUL AVX FMA FMA4 5 | PCLMUL GFNI AVX512VBMI2 AVX512BITALG AVX512VP2INTERSECT 6 | AVX LZCNT BMI AVX2 BMI2 AVX512F 7 | FMA AVX512F AVX512CD AVX512VL AVX512BW AVX512DQ AVX512VBMI AVX512IFMA AVX512VBMI2 8 | FMA VPCLMULQDQ AVX512VBMI2 9 | BMI2 VPCLMULQDQ 10 | AVX512CD AVX512ER AVX512PF AVX5124VNNIW AVX5124FMAPS 11 | AVX512CD AVX512VPOPCNTDQ AVX5124VNNIW 12 | AVX512DQ AVX512VNNI AVX512VBMI2 13 | AVX512VPOPCNTDQ AVX512VBMI2 14 | -------------------------------------------------------------------------------- /data/x86_strict_ext.txt: -------------------------------------------------------------------------------- 1 | SSE2 X86_64 2 | MMX SSE SSE2 SSE3 SSSE3 SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512VL 3 | SSE2 PCLMUL VPCLMULQDQ 4 | SSE2 GFNI 5 | SSE3 SSE4A FMA4 6 | AVX FMA4 7 | AVX FMA AVX512F AVX512BW AVX512VBMI 8 | AVX VPCLMULQDQ 9 | AVX512F AVX512DQ 10 | AVX512F AVX512CD 11 | AVX512F AVX512ER 12 | AVX512F AVX512PF 13 | AVX512F AVX512IFMA 14 | AVX512F AVX512VPOPCNTDQ 15 | AVX512F AVX512VNNI 16 | AVX512F AVX512VP2INTERSECT 17 | AVX512BW AVX512VBMI2 18 | AVX512BW AVX512BITALG 19 | POPCNT 20 | LZCNT 21 | BMI 22 | BMI2 23 | AVX5124VNNIW 24 | AVX5124FMAPS 25 | -------------------------------------------------------------------------------- /doc/compiler.md: -------------------------------------------------------------------------------- 1 | # Singeli is a macro-oriented compiler 2 | 3 | Using Singeli for your next project? Yeah, it's all right. The Rustacean numbskulls haven't gotten to it yet, so it's C-like and actually useful. Well, there's some functional mumbo-jumbo buried in there, but it stays out of the way most of the time. 4 | 5 | include 'debug/printf' 6 | main : void { 7 | lprintf{'Hello, World!'} # Print with newline 8 | } 9 | 10 | There's the classic to start off with. You compile to C with `singeli hello.singeli -o hello.c`, then compile and run that. Ugly but at least gcc/clang generate decent code, next best thing to doing the registers by hand. Now, `main` here is special syntax and not a function, and you usually integrate Singeli with an existing codebase, so here's how you get a function you can call from C: 11 | 12 | include 'debug/printf' 13 | 14 | fn hello() : void = { 15 | lprintf{'Hello, World!'} 16 | } 17 | 18 | export{'hello', hello} 19 | 20 | You can figure out the C stub that calls `hello()` to test it I'm sure. And `singeli -h` for compilation options, I won't bore you with the details. 21 | 22 | You'll notice that `lprintf{}` doesn't use parentheses like a function call. You do call functions with parens, but `lprintf` isn't a function. We'll get to that. And you'll notice that printing is classified as a debugging tool where it belongs. Singeli is for programming: data goes in, data comes out. Text processing? Sure, text is data. But the things in quotes aren't strings: they're called symbols and they're used for name-ish stuff that only exists at compile time. `lprintf{}` takes them because it's convenient. 23 | 24 | The curly brace calls. They're an eyesore but there's a reason. A function has a defined type signature, and printing just takes whatever pile of junk you give it. Even in C it's some kind of special function. Basically, the braces tell you that `lprintf{}` is a macro. 
It'll generate some code but you don't know what. So it's officially called a generator. I just say macro. Oh, code's in [include/debug/printf.singeli](../include/debug/printf.singeli) for your gut-viewing pleasure. 25 | 26 | Let's at least pretend to get some work done. Reverse a string—Pascal-style, you've got to admit C's made some mistakes. 27 | 28 | include 'skin/c' 29 | include 'arch/c' 30 | 31 | fn reverse(str:*u8, len:u64) : void = { 32 | i:u64 = 0 33 | while (i < len) { 34 | --len 35 | c := load{str, i} 36 | store{str, i, load{str, len}} 37 | store{str, len, c} 38 | ++i 39 | } 40 | } 41 | 42 | Now we are getting somewhere, `load{}` and `store{}` aside. You need skin/c and arch/c to do anything: in a fit of overengineering the authors have decided maybe you'd want something other than C operators and backend. `*u8` is a pointer. You can load and store at any index, and cast it to other pointer types. Better compile C with `-fno-strict-aliasing`, by the way. And Singeli only has prefix operators so you have `--len` to decrement but no `len--`. 43 | 44 | Singeli's string handling sucks, so to test this out we're going to call C functions directly. This is how the libraries like debug/printf and arch/c are implemented. `emit{}` is a built-in that takes a result type and function name (or operator, with `'op +'` or similar) and calls the C function directly. It outputs symbols verbatim, which is what lets me jam a string in with `'"%s\n"'`, but it might not work that way forever. And `require{}` gets a C header. Since it's a macro you can call it anywhere, like inside `main` or another macro you run. Requiring the same header many times is fine; it'll only generate one `#include`. 45 | 46 | require{'stdio.h', 'string.h'} 47 | main(argc, argv) { 48 | arg := load{argv, 1} 49 | reverse(arg, emit{u64, 'strlen', arg}) 50 | emit{void, 'printf', '"%s\n"', arg} 51 | } 52 | 53 | Call this with `./a.out sometext` and it prints out the reversed text. And now to deal with this `load` and `store` junk. As you may have guessed, Singeli doesn't support `array[index]` syntax, and it doesn't really have a concept of lvalues either. But there's a library [skin/cext](../include/skin/cext.singeli) that defines some extra non-C operators, mainly for dealing with pointers and casting (which we'll see later; Singeli's anal about types). Now the syntax is `array->index` to load, same as a C struct pointer, and `array <-{index} value` to store, where the `{index}` part is optional. So this is kind of tolerable. 54 | 55 | include 'skin/c' 56 | include 'skin/cext' 57 | include 'arch/c' 58 | 59 | fn reverse(str:*u8, len:u64) : void = { 60 | i:u64 = 0 61 | while (i < len) { 62 | --len 63 | c := str->i 64 | str <-{i} str->len 65 | str <-{len} c 66 | ++i 67 | } 68 | } 69 | 70 | require{'stdio.h', 'string.h'} 71 | main(argc, argv) { 72 | arg := load{argv, 1} 73 | reverse(arg, emit{u64, 'strlen', arg}) 74 | emit{void, 'printf', '"%s\n"', arg} 75 | } 76 | 77 | I'm smarter than a Gopher and don't like to spend all my time writing reverse functions, so if I have a codebase with multiple types I want my `reverse` to be generic. Generated, even. So I add a type parameter `{T}`, and call this with `reverse{u8}(str, len)`. Whenever `reverse` is called on a type it hasn't seen before, it generates a new function for that type (has to be a type, because of the `*T` in a type signature). Then it reuses that function if the same type comes up again—this is a special feature for generic functions and not other macros. 
78 | 79 | fn reverse{T}(vec:*T, len:u64) : void = { 80 | i:u64 = 0 81 | while (i < len) { 82 | --len 83 | c := vec->i 84 | vec <-{i} vec->len 85 | vec <-{len} c 86 | ++i 87 | } 88 | } 89 | 90 | Of course, if I'd used `u8` inside the function, I'd need to replace those with `T` too. One reason I don't need to do this is that the declaration `c := vec->i` gets the type from the expression, so it's the same as `c:T = vec->i`. 91 | 92 | ## SIMD 93 | 94 | You're probably here for the vector processing stuff. Is this going to save me from `__m256d v = _mm256_fmsubadd_pd...` on every line of the program? The convenience of the C++ packages without Bjarne's head games? Ha, as a programmer you'd better learn to accept the head games, but these ones can largely be shuffled off to an `include` file. Vectorizing that reverse function will take us through the basics. 95 | 96 | Real built-in vector support would apparently harsh Singeli's minimalist vibe, so all you get out of the box are vector types, written like `[16]i8`. Which is better than C's `__m128` for every integer because now `a+b` has a clear meaning. Oh, and it knows which vector extensions exist, so you can test whether your target architecture supports one with `hasarch{'SSSE3'}`. Here I'm going to use x86 with vector extensions up to SSSE3 (released in 2006, yes you have it, unless you're on ARM). By default, Singeli picks up architecture flags from the current CPU to compile for native execution. Or you can specify with `-a SSSE3`, although if you're not on x86 of course you've got no way to run the output C code. 97 | 98 | To make use of my `[16]i8`s instead of leaving them to sit around and look pretty I need some definitions, which will compile to C intrinsics. There are two libraries for these right now. [arch/iintrinsic/basic](../include/README.md#simd-basics) is a curated set of "nice" operations like load, store, and arithmetic, and arch/iintrinsic/misc is a dump of the rest (iintrinsic is "intel intrinsics", which is the target the same way C is for arch/c). I only need one macro from misc, so I'm just going to copy it over. 99 | 100 | include 'arch/iintrinsic/basic' 101 | def shuffle{a:T==[16]i8, b:T} = emit{T, '_mm_shuffle_epi8', a, b} 102 | 103 | EDIT: That was good to build character and all, but now there's a [usable wrapper](../include/README.md#simd-selection) for shuffling (blending too, eh, let's ignore it because the instructions weren't added until after SSSE3), so I'll just patch this in: 104 | 105 | include 'arch/iintrinsic/select' 106 | def shuffle{a:T==[16]i8, b:T} = vec_shuffle{a, b} 107 | 108 | Looks like `#define`, but these `def` macros are smart: you can check compile-time conditions to decide whether it applies. If not it'll try the previous definition if any, meaning it's an overload. `shuffle` doesn't overload anything, so just errors if `a` and `b` don't have type `T` which is `[16]i8`. On the other hand, there's something we do want to overload: 109 | 110 | fn reverse{T==i8 if hasarch{'SSSE3'}}(arr:*T, len:u64) : void = { 111 | def V = [16]T 112 | r := vec_make{V, 15 - range{16}} 113 | av := *V~~arr 114 | av <- shuffle{av->0, r} 115 | } 116 | 117 | Not a full implementation—for now it's ignoring `len` and reversing 16 elements. But there are a few new things, besides the conditions added to `reverse`. The macro `def V = [16]T` is basically a typedef. Macros are scoped so that it only applies inside `reverse`. 
The casting operator `~~` is defined by skin/cext as `reinterpret`, which converts between types of the same width. `range{16}` gives the integers from 0 to 15 inclusive, and I subtract from 15 to reverse the order. All this happens at compile time, and `-` working on a list is showing some APL influence. Somehow we got one of the good bits here. 118 | 119 | And it makes sense that `-` should be able to act on multiple numbers at compile time because (with arch/iintrinsic/basic) it applies to vectors at runtime. Instead of `vec_make{V, 15 - range{16}}` it could be `vec_broadcast{V, 15} - vec_make{V, range{16}}`. These two vector-building macros come from arch/iintrinsic/basic, and if you haven't heard of it, "broadcasting" is one name for spreading a single value to all elements of a vector. A better use of vector arithmetic is to extend `reverse` to deal with 16 elements or less: 120 | 121 | fn reverse{T==i8 if hasarch{'SSSE3'}}(arr:*T, len:u64) : void = { 122 | def V = [16]T 123 | f := vec_make{V, range{16}} # forward 124 | r := vec_make{V, 15 - range{16}} # reverse 125 | l := vec_broadcast{V, T<~len} 126 | m := V~~(f < l) 127 | s := ((l - f - vec_broadcast{V, 1}) & m) | andnot{f, m} 128 | av := *V~~arr 129 | av <- shuffle{av->0, s} 130 | } 131 | 132 | The basic idea is to read an entire vector regardless of length, reverse only the first `len` elements, and put it back. So this reads from and writes to memory beyond the actual vector argument. Obviously you need to know you have access to that memory, but that's easy to ensure if you control the allocations. But C and other compilers can't figure it out so it's one way writing your own SIMD is better. 133 | 134 | The specific idea is to blend a vector that starts at `len-1` and goes down with the identity vector `f`. We choose the descending vector for the first `len` elements, using the mask `f < l`. The result of an SSE comparison is all 0 bits or all 1, and it has an unsigned type but `V` is signed, so slap on `V~~`. Next section I'll show a blend utility that keeps this mess out of sight. 135 | 136 | And another cast `<~` in there. The three casts skin/cext defines are `~~` for reinterpret, `^~` for promoting from a type to a superset, and `<~`. At the moment this one just always does a C cast, but the idea is to use it for a narrowing integer conversion. Get familiar with these because Singeli requires a lot of casting. Or at least the standard definitions do, nothing preventing you from extending those. 137 | 138 | So now we can put together a function that works on any length. Language-wise there's nothing new here unless you consider an `if` statement to be a surprise. But there's a trick for handling when the two vector pointers meet in the middle. If there's one vector or less between them, we have the code for that. If there are two vectors or less, we could reverse one full and one partial vector, but that's ugly. Instead we're going to reverse two overlapping full vectors. This actually doesn't take any changes other than the loop bound. The main loop was going to read the two vectors and then write two reversed ones anyway, so the writes don't interfere with the reads. 
139 | 140 | include 'arch/iintrinsic/basic' 141 | include 'arch/iintrinsic/select' 142 | def shuffle{a:T==[16]i8, b:T} = vec_shuffle{a, b} 143 | fn reverse{T==i8 if hasarch{'SSSE3'}}(arr:*T, len:u64) : void = { 144 | def V = [16]T 145 | f := vec_make{V, range{16}} 146 | r := vec_make{V, 15 - range{16}} 147 | av := *V~~arr # beginning of part not yet reversed 148 | bv := *V~~(arr+len) # just after the end of that part 149 | while (av+1 < bv) { 150 | --bv 151 | c := shuffle{av->0, r} 152 | av <- shuffle{bv->0, r} 153 | bv <- c 154 | ++av 155 | } 156 | if (av < bv) { 157 | rem := *T~~bv - *T~~av 158 | l := vec_broadcast{V, T<~rem} 159 | m := V~~(f < l) 160 | s := ((l - f - vec_broadcast{V, 1}) & m) | andnot{f, m} 161 | av <- shuffle{av->0, s} 162 | } 163 | } 164 | 165 | There you have it, reversing bytes at SSE speed. AVX2 ought to be twice as fast but it's got this ridiculous design where it only shuffles within 16-byte lanes—it's not that much overhead but it's more of a headache than I'm willing to put up with right now. 166 | 167 | ## Generics 168 | 169 | I already said I don't like repeating myself. Instead of copy-pasting, I'll make this vector reverse work on multiple types, which will take a little more macro usage. First some cleanup. 170 | 171 | oper &~ andnot infix none 35 172 | 173 | This defines the and-not operator so that `a &~ b` is `a & ~b`. The C backend could probably work the second one out, but it's nice to know you're generating one `andnot` intrinsic. And even if an `&~` operator isn't defined, `&~` with no space won't split into `&` and `~` for consistency. Or maybe because developers are scared of working on the lexer, take your pick. The `infix none 35` thing is the parsing information, which I just copied from `&` in cop.singeli. 174 | 175 | def blend{m:M, t:T, f:T} = (t & T~~m) | (f &~ T~~m) 176 | 177 | And this is a macro for blend, the vector equivalent of `if (m) t else f`. Again we've got the smart macro, where the inputs all have to be typed and `t` and `f` have to have the same type. What it does is to get all their types and then check that the ones with the same name are consistent. Another thing, we use `m` twice, which should have a C programmer twitching. But it's safe: `blend` isn't operating on source tokens, but instead saying what to do with values. Which is also how it can check types, because by the time the macro gets processed its inputs have been handled by the compiler and their types are known. And the story is the same at runtime: all macro inputs are evaluated, and then the code in the macro runs. 178 | 179 | Now the hard part, which is to make this work on other types. For a lot of simpler vector algorithms you mostly just have to change the vector type, so you'd write something like `def V = [128/width{T}]T` to make a 128-bit vector and you're done. Here that doesn't work because SSSE3 only has this one shuffle instruction, which works on 1-byte units. So we're going to define `V` as `[16]i8`. Then it's bit-bashing time to reverse the `T`-width units in those vectors. Here, I'll dump it all out so you can see what I'm talking about. 
180 | 181 | include 'arch/iintrinsic/basic' 182 | include 'arch/iintrinsic/select' 183 | oper &~ andnot infix none 35 184 | def blend{m:M, t:T, f:T} = (t & T~~m) | (f &~ T~~m) 185 | def shuffle{a:T==[16]i8, b:T} = vec_shuffle{a, b} 186 | 187 | fn reverse{T if hasarch{'SSSE3'}}(arr:*T, len:u64) : void = { 188 | def b = width{T} / 8 # width of T in bytes 189 | def vb = 16 190 | def vi = range{vb} 191 | def V = [vb]i8 192 | def scal{x} = vec_broadcast{V, x} 193 | f := vec_make{V, vi} 194 | r := vec_make{V, vb-b - vi + 2*(vi%b)} 195 | av := *V~~arr 196 | bv := *V~~(arr+len) 197 | while (av+1 < bv) { 198 | --bv 199 | c := shuffle{av->0, r} 200 | av <- shuffle{bv->0, r} 201 | bv <- c 202 | ++av 203 | } 204 | if (av < bv) { 205 | rem := *T~~bv - *T~~av 206 | l := scal{i8<~(b*rem)} 207 | m := V~~(f < l) 208 | s := blend{m, r + l - scal{vb}, f} 209 | av <- shuffle{av->0, s} 210 | } 211 | } 212 | 213 | The main loop always does the same permutation, analogous to `vec_make{V, 15 - range{16}}` from before but with more arithmetic. I've defined `vi = range{vb}` to make this a little simpler—if you haven't noticed, just about anything can go in a `def`. Still, `r` is a real head-scratcher. But it's a compile-time head scratcher, and that means I can stick `show` calls all over the place before compiling to see what's going on. `show` just returns its input so it doesn't affect the compiler output, but it also prints that input. See below. These are for the `i32` case, and since I don't actually call it I just added a line `reverse{i32}` which is enough to make sure the function is compiled. 214 | 215 | r := vec_make{V, show{vb-b} - vi + 2*(vi%b)} 216 | # 12 217 | r := vec_make{V, show{vb-b - vi} + 2*(vi%b)} 218 | # tup{12,11,10,9,8,7,6,5,4,3,2,1,0,-1,-2,-3} 219 | r := vec_make{V, show{vb-b - vi + show{2*(vi%b)}}} 220 | # tup{0,2,4,6,0,2,4,6,0,2,4,6,0,2,4,6} 221 | # tup{12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3} 222 | 223 | First line shows `vb-b`, which is the first byte after reversing, or the start of the last element before. And the elements go down from there so I subtract `vi`. But this means bytes go down within an element when I want them going up, so I add twice the byte index `vi%b` within each element. 224 | 225 | And then the last vector is a minor variation on what we did before. Work it out yourself if you really care. Can't get reverse by subtracting the forward vector from a constant any more, so I added the reverse one to a different constant. This arithmetic all happens at runtime, so you won't get anything useful out of `show`, but `lprintf` does handle vectors. 226 | 227 | What about AVX2, or other architectures? It's all possible. NEON support is going to be pretty easy here since it has just about the same instructions: use `hasarch{'SSSE3'} or hasarch{'AARCH64'}` for the condition, qualify the `shuffle` we have here with `hasarch{'SSSE3'}`, and add a NEON one too (EDIT: now arch/neon\_intrin/basic has you covered, load conditionally with `if_inline`). Then as `reverse` is compiled it'll check the architecture when it calls `shuffle` and use the right one. For AVX2 you have a few options. First thing I'd try is to change `def vb = 16` to `def vb = if (hasarch{'AVX2'}) 32 else 16`, and then make other things check `vb` as necessary. Have fun dealing with that within-lane shuffle. 
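Since I brought it up, here's roughly how the NEON port would start. Strictly a sketch: I'm assuming `if_inline` can wrap `include` lines the way that EDIT implies, and the `...` stands for the unchanged function body, not real syntax.

    if_inline (hasarch{'SSSE3'}) {
      include 'arch/iintrinsic/basic'
      include 'arch/iintrinsic/select'
    }
    if_inline (hasarch{'AARCH64'}) {
      include 'arch/neon_intrin/basic'
      include 'arch/neon_intrin/select'
    }
    def shuffle{a:T==[16]i8, b:T} = vec_shuffle{a, b}

    fn reverse{T if hasarch{'SSSE3'} or hasarch{'AARCH64'}}(arr:*T, len:u64) : void = {
      ...  # body unchanged from the previous section
    }

Both select includes define `vec_shuffle`, so the one `shuffle` definition covers either target.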
228 | -------------------------------------------------------------------------------- /emit_c.bqn: -------------------------------------------------------------------------------- 1 | cpp‿cpu‿prefix‿⟨·,ErrOut,ErrExit,·⟩ ← ∾⟜"_"⌾(2⊸⊑) •args 2 | 3 | Asrt ← {𝕨𝕊1:𝕩; ·𝕊𝕩:𝕊𝕨; 4 | ErrOut "Invalid IR, likely implementation bug!" 5 | ErrOut⍟(0⊸≢) 𝕩 ⋄ ErrExit@ 6 | } 7 | 8 | types ← { 9 | ub←uv←um←0 ⋄ req←⟨⟩ ⋄ init ⇐ {ub↩uv↩um↩0⋄req↩⟨⟩} 10 | Require ⇐ {𝕊: req∾↩<𝕩} 11 | Headers ⇐ {𝕊: ⍷ req ∾˜ 1‿ub‿um‿(uv>0)/⟨"stdint.h","stdbool.h","math.h",cpu.header⟩} 12 | Type ⇐ { 𝕨 𝕊 bType‿w‿am‿ptrs‿disp: 13 | Err ← {ErrOut ∾⟨"Unhandled type `",disp,"`: ",𝕩⟩ ⋄ ErrExit@} 14 | u‿f ← bType = "uf" 15 | (" "∾𝕨) ⊢⊘(∾˜) (ptrs/"*") ∾˜ { 16 | 0<≠am ? uu‿t ← Err∘⊑⍟(0==) cpu.VecType w‿am‿u‿f ⋄ uv⌈↩uu ⋄ t ; 17 | f ? "float"‿"double"⊑˜32‿64⊸⊐⌾")⍟('"'≠⊑))¨ types.Headers@ 49 | ∾ (𝕨⊣prelude) ((⊣⋈(⊣×1+⊢)○(0<≠)⥊lf˙)∾⊢) (decl.Emit@)∾l 50 | } 51 | 52 | StartLine ← { 𝕊 str: 53 | Assert ← { Asrt ∾𝕨‿": `"‿str‿"`" }⍟(1≢⊢) 54 | tok ← ' ' ((⊢-˜+`׬)∘=⊔⊢) str 55 | i ← ¯1 56 | Next ⇐ {𝕤 57 | i +↩ 1 58 | "Unfinished line" Assert i < ≠tok 59 | i ⊑ tok 60 | } 61 | All ⇐ {𝕤 62 | r←(i+1)↓tok ⋄ i↩1-˜≠tok ⋄ r 63 | } 64 | Finish ⇐ {𝕤 65 | IsWS ← ∊⟜(" "∾@+9) 66 | "Excessive IR line" Assert tok ≠⊸≤◶⟨(∨´ IsWS ∨ ·∨`'#'⊸=)⊑˜, 1⟩ i+1 67 | } 68 | } 69 | 70 | Nat ← 10⊸×⊸+˜´∘⌽ -⟜'0' 71 | Nest ← +`·-˝=⌜ 72 | as ← { 73 | Sym ⇐ (⊢-128×(' '+128)⊸=) 1↓¯1↓⊢ 74 | Name ⇐ Sym⍟('''=⊑) 75 | Rename ⇐ prefix∾1⊸↓⍟('$'=⊑) 76 | I32 ⇐ ('-'=⊑)⊸(⊣-∘⊢⍟⊣Nat∘↓) 77 | Lit ⇐ (⊑"'$!"⊐⊏)◶Sym‿({decl.Call𝕩⋄𝕩}⍟('f'=⊑∘⊣)⟜Rename 1⊸↓)‿{ 78 | Bl ← 0<"{}"⊸Nest ⋄ br ← Bl𝕩 79 | v‿t ← (1-˜+`׬)∘(1⌾⊑br<':'⊸=)⊸⊔ 𝕩 80 | {¬∨´br? 81 | v ∾↩ { 82 | 'f': h←⊑'x'∊v ⋄ v types.SpecialFloat⍟(e←⊑'/'∊v)↩ 83 | {𝕩∾(h/"p0")∾e¬⊸/"f"}⍟("f32"≡t) "."/˜¬e∨h∨´"e."∊v ; 84 | (('u'=𝕩)/"u")∾"ll" 85 | }⊑t 86 | m ← "-0x8000000000000000"≡v # For floats: negate after cast 87 | ∾⟨"(",m↑v,"(",Type t,")",m↓v,")"⟩ 88 | ; 89 | "Invalid IR literal" Asrt "tup{"≡4↑v 90 | "{"∾"}"∾˜∾1↓⥊(<",")≍˘ Lit¨ ((⊢-˜+`׬)∘(Bl<','⊸=)⊔⊢)○(¯1↓4↓⊢) v 91 | } 92 | }‿⊢ 93 | Type ⇐ { 𝕨𝕊s: # Singeli type to native 94 | s ↓˜↩ ptrs ← +´∧`'*'=s 95 | s ↩ ⊢´ v ← ((1-˜¬∘∨×1+`⊢)˝·⊑⊸∧"[]"=⌜⊢)⊸⊔ s 96 | am ← Nat¨ ¯1↓v 97 | ParseFn ← { 98 | s ← (','=𝕩)∧n←(1⊸=∧·∧`0⊸<)"()"Nest𝕩 99 | a ← Type¨ ((1-˜+`׬)s∨»⊸≠⊸≥n)⊔𝕩 100 | "Invalid IR type" Asrt ")->"≡3↑r←n¬⊸/𝕩 101 | ⟨Type 3↓r, a⟩ 102 | } 103 | 𝕨 types.Type ⟨𝕩⟩ ∾˜ { 104 | "void": 'v'‿0‿⟨⟩‿ptrs; 105 | '('=⊑𝕩? ⟨"fn",ParseFn 𝕩,ptrs⟩; 106 | ⟨⊑𝕩, Nat 1↓𝕩, am, ptrs⟩ 107 | } s 108 | } 109 | } 110 | 111 | Name‿Rename‿Type‿Lit‿I32 ← {𝕏{𝔽∘𝔾⊘(𝔽⟜𝔾)}{𝕩.Next@}}¨ ⟨as.Name,as.Rename,as.Type,as.Lit,as.I32⟩ 112 | All ← {𝕩.All@} 113 | 114 | decl ← { 115 | FromName ← Nat · (∧`'_'⊸≠)⊸/ (1+≠prefix)⊸↓ # si_f 116 | c←o←@ ⋄ Init⇐{𝕤⋄c↩↕0⋄o↩⟨⟩} 117 | BeginFn ⇐ { n←FromName𝕨 ⋄ n<≠c ? n⊑c ? o∾↩<𝕩 ; @} 118 | Call ⇐ { i←FromName𝕩 ⋄ c↑˜↩(≠c)⌈1+i ⋄ c 1⌾(i⊸⊑)↩ } 119 | Emit ⇐ {𝕤⋄ ∾⟜⟨⥊lf⟩⍟(0<≠) (¯2⊸↓∾(";"∾lf)˙)¨ o } 120 | } 121 | 122 | Join ← {∾1↓⥊(<𝕨)≍˘𝕩} 123 | List ← ", "⊸Join 124 | 125 | BeginFn ← {𝕤 126 | FmtExt ← { 127 | e ← ((-´"aA")×'A'⊸≤)⊸+ 1↓𝕩 128 | ∾"__attribute__ ((__target__ ("""‿e‿""")))"‿lf 129 | } 130 | n ← as.Rename nn ← Name 𝕩 131 | ret ← n Type 𝕩 132 | argc ← I32 𝕩 133 | param ← Name⊸Type∘𝕩¨ ↕argc 134 | exts ← All 𝕩 ⋄ Asrt 1≥≠exts ⋄ Asrt ∧´('+'=⊑)¨exts 135 | fexts ← FmtExt¨exts 136 | { "main"≢nn 137 | ? 
n⊸decl.BeginFn⊸⊢ ∾fexts∾⟨"static ",ret,"(",List param,") {"⟩ 138 | ; pc ← (⊢∾" = ("∾(∧`' '⊸≠)⊸/∾")argv;"˙)¨1↓param 139 | ∾fexts∾⟨"int main(",List "char** argv"¨⌾(1⊸↓)param,") {"⟩∾pc 140 | } 141 | } 142 | Export ← { 143 | exp ← "const "∾Name 𝕩 144 | ret ← exp Type 𝕩 145 | val ← Lit 𝕩 146 | ∾⟨ret," = ",val,";"⟩ 147 | } 148 | Define ← {st 𝕊 ty‿id‿val: 149 | arr ← {'*'=⊑ty? ∨´"{?"=c←⊑1↑⊑val? # Define array, then pointer cover 150 | v←val ⋄ val↩i←id∾"_" ⋄ id∾⍟st˜↩"const " 151 | {'?'=c? UndefArr ⟨ty,i,(∧`':'⊸≠)⊸/1↓⊑v⟩ ; 152 | cpp>st? DefCppArr ⟨ty,i,∾v⟩ ; ⟨st Define ⟨1↓ty,i∾"[]",v⟩, "; "⟩} 153 | ; ⟨⟩} 154 | eq ← {cpp>st? ⟨"; ",id," = "⟩ ; ⟨" = "⟩} 155 | ∾∾⟨ 156 | arr, (1=st)/⟨"static "⟩ 157 | ⟨id as.Type ty⟩, (('?'≠·⊑1↑⊑) / eq⊸∾) val 158 | ⟩ 159 | } 160 | UndefArr ← {𝕊 ty‿id‿n: (0 Define ⟨1↓ty,∾id‿"["‿n‿"]",⟨"?"⟩⟩) ∾ "; "} 161 | # For C++, initialize array by creating a second in a temp scope and copying 162 | DefCppArr ← {𝕊 ty‿i‿v: 163 | it←i∾"t" 164 | n←•Repr("{}"≢v)+´(','⊸=∧1="{}"⊸Nest)v 165 | ∾⟨ 166 | UndefArr ty‿i‿n, "{ " 167 | 2 Define ⟨1↓ty,it∾"[]",v⟩, "; " 168 | "for (unsigned i=0; i<",n,"; i++) ",i,"[i] = ",it,"[i]; } " 169 | ⟩ 170 | } 171 | Constant ← { 172 | id ← Rename 𝕩 173 | type ← Name 𝕩 # Define turns to type 174 | val ← Lit 𝕩 175 | (1 Define type‿id‿val)∾";" 176 | } 177 | Require ← { types.Require Name 𝕩 ⋄ "" } 178 | ⟨New, Mut⟩ ← { 179 | Cast ← {∾"("‿(as.Type 𝕨)‿")"‿𝕩} 180 | Special ← { 181 | "^promote"𝕊⟨t,v⟩ : t Cast v ; 182 | "^bitcast"𝕊⟨s,t,v⟩: { s∧○(⊑⊏∊"iu*"˙)t ? s Cast v ; 183 | s ≡○as.Type t ? v ; "^bitcast"‿t‿v } ; 184 | "^load" 𝕊⟨p,i ⟩: ∾p‿"["‿i‿"]" ; 185 | "^store" 𝕊⟨p,i,v⟩: p‿"["‿i‿"] = "‿v 186 | } 187 | Call ← 1⊸↑⊸≡⟜"^"◶⟨{𝕨‿"("‿𝕩‿")"}⟜List, Special⟩ 188 | code‿op ← <˘⍉>⟨ 189 | "val" ‿(⋈ Lit) 190 | "call"‿{ fn ← Lit 𝕩 ⋄ fn Call (Lit𝕩˙)¨ ↕I32 𝕩 } 191 | "emit"‿(Name ("op "≡3↑⊣)◶⟨ 192 | Call⟜(as.Lit¨All) 193 | { o←3↓𝕨 ⋄ ⟨Lit 𝕩," ",o," ",Lit 𝕩⟩ } 194 | ⟩ ⊢) 195 | "array"‿{"{"∾"}"∾˜∾1↓⥊(<",")≍˘ as.Lit¨ All 𝕩} 196 | ⟩ 197 | Memcpy ← {id𝕊"^bitcast"‿t‿v: 198 | types.Require "string.h" 199 | m ← id∾"_" 200 | ∾⟨"{",0 Define ⟨t,m,v⟩,"; memcpy(&",id,", &",m,", sizeof(",as.Type t,"));}"⟩ 201 | ;𝕊:@} 202 | New ⇐ { 203 | id ← Name 𝕩 204 | kind ← Name 𝕩 205 | ty ← Name 𝕩 # Define turns to type 206 | c ← code⊸⊐⌾< kind 207 | (∾"Unknown new: `"‿kind‿"`") Asrt c<≠code 208 | val ← (c⊑op) {𝕎𝕩} 𝕩 209 | { 210 | "void"≡ty?∾val ; 211 | @≢m←id Memcpy val?∾⟨id as.Type ty,"; ",m⟩ ; 212 | 0 Define ty‿id‿val 213 | } 214 | } 215 | Mut ⇐ { 216 | id ← Name 𝕩 217 | kind ← Name 𝕩 218 | c ← code⊸⊐⌾< kind 219 | val ← { 220 | c=≠code ? ⟨as.Lit kind⟩ ; 221 | Name 𝕩 ⋄ (c⊑op) {𝕎𝕩} 𝕩 # Discard type 222 | } 𝕩 223 | {@≢m←id Memcpy val? 
m ; ∾id‿" = "∾val} 224 | } 225 | } 226 | 227 | code‿op ← (⊑¨ ⋈ 1⊸↓¨) ⟨ 228 | "export" ‿ 0‿0‿⟨Export⟩ 229 | "constant"‿ 0‿0‿⟨Constant⟩ 230 | "require" ‿ 0‿0‿⟨Require⟩ 231 | "beginFn" ‿ 1‿0‿⟨BeginFn⟩ 232 | "lbl" ‿ 0‿1‿⟨Name,":"⟩ 233 | "ret" ‿ 0‿1‿⟨" ","return","void"⊸≢◶⟨""," "⊸∾⟩ Lit⟩ 234 | "gotoF" ‿ 0‿1‿⟨" ","if (!(",Lit,")) ","goto ",Name⟩ 235 | "gotoT" ‿ 0‿1‿⟨" ","if (",Lit,") ","goto ",Name⟩ 236 | "goto" ‿ 0‿1‿⟨" ","goto ",Name⟩ 237 | "new" ‿ 0‿1‿⟨" ",New⟩ 238 | "mut" ‿ 0‿1‿⟨" ",Mut⟩ 239 | "endFn" ‿¯1‿1‿⟨"}"⟩ 240 | ⟩ 241 | 242 | Generate 243 | -------------------------------------------------------------------------------- /float2.bqn: -------------------------------------------------------------------------------- 1 | # High-precision numbers as pairs representing unevaluated sums 2 | # Format is ⟨high,low⟩ 3 | 4 | To ⇐ ⋈⟜0 5 | From ⇐ +´ 6 | 7 | Add12 ← {a𝕊b: 8 | s ← a + b 9 | {¬∞>|s ? s‿0 ; 10 | av← s - bv← s - a 11 | ⟨s, +´a‿b-av‿bv⟩ } 12 | } 13 | Add ⇐ {a𝕊b: 14 | r ← a +○⊑ b 15 | {¬∞>|r ? r‿0 ; 16 | s ← (-r) +´ ¯1⌽⌽⍟(a<○(|⊑)b) b⌽⊸∾a 17 | r Add12 s } 18 | } 19 | 20 | Neg ⇐ - 21 | Abs ⇐ -⍟(0>⊑) 22 | Floor‿Ceil ⇐ {𝕏∘⊑⊸(⊣⋈·𝕏-⊸(+´)⟜⌽)}¨ ⌊‿⌈ 23 | Sub ⇐ Add⟜Neg 24 | 25 | Cmp ⇐ (=˜∘⊢≤≤){𝔽˜-𝔽}{=○⊑◶⟨𝔽○⊑, 𝔽○(⊢´)⟩} 26 | 27 | Split ← 53‿1024{p‿e _𝕣: # Double-precision float 28 | sp← 1+2⋆se←⌈p÷2 29 | m ← 2⋆e-1+se ⋄ f ← 2⋆-p # Adjustments to avoid hitting ∞ 30 | {¬m>|𝕩? 𝕊⌾(f⊸×) 𝕩; 𝕊a: 31 | c ← sp × a 32 | al← a - ah← c - c - a 33 | ⟨al, ah⟩ # Backwards for convenient reduction 34 | } 35 | } 36 | Mul12 ← {a𝕊b: 37 | h ← a × b 38 | {∞>|h? ⟨h, (-h) +´ ⥊ b ×⌜○Split a⟩ ; h‿0} 39 | } 40 | Mul ⇐ {a𝕊b: 41 | ph‿pl ← a Mul12○⊑ b 42 | {∞>|⊑ph? ph Add12 pl + +´ a × ⌽b ; ph‿0} 43 | } 44 | 45 | Div ⇐ {b𝕊a: 46 | yn ← (⊑b) × xn ← ÷⊑a 47 | {¬∞>|yn? yn‿0 ; 48 | diff ← ⊑ b Sub a Mul yn‿0 49 | yn‿0 Add xn Mul12 diff } 50 | } 51 | 52 | Mod ⇐ { 53 | b𝕊a‿0: a>0 ? h←a÷2 ⋄ Add12´ a⊸+⌾⊑⍟(<⟜-´) (-⟜(a×h<⊢)a|⊢)⍟(h<|)¨b ; 54 | # Not correctly rounded but probably okay 55 | b𝕊a: a Sub b Mul (Floor a Div b) 56 | } 57 | 58 | # Decimal parsing 59 | # For one double, max digits is 15 and max power of 10 is 1e22 60 | Exp10 ← { 22≥𝕩? To 10⋆𝕩; 308<𝕩? To ∞; (⊣´Mul⊢´)𝕊¨⍷⌊2÷˜𝕩+↕2 } # Could save results 61 | N1 ← •ParseFloat 62 | Nat ← ≠⊸{ 63 | 15≥𝕨 ? To N1 𝕩 ; 64 | 20≥𝕨 ? ¯15 ((1e15×N1∘↓) Add12 N1∘↑) 𝕩 ; # Exact 65 | 35≥𝕨 ? ¯20 ((1e20 Mul12 •ParseFloat∘↓) Add Nat∘↑) 𝕩 ; # Sum of exacts so it's correctly rounded 66 | (Exp10 𝕨-35) Mul Nat 35↑𝕩 # Imprecise 67 | } 68 | ParseDec ⇐ { # 𝕨 is base-10 exponent; 𝕩 is digit string 69 | 0≤𝕨 ? 𝕨 Exp10⊸Mul⍟(0<⊣) Nat 𝕩 ; 70 | (𝕨↓𝕩) Nat⊸Add⍟(0<≠∘⊣) (Exp10-𝕨) Div˜ Nat (𝕨⌈-≠𝕩)↑𝕩 71 | } 72 | 73 | _repr ⇐ { len‿b _𝕣: 74 | ! ⌊⊸= 2⋆⁼b # Need division by b to be exact 75 | {c←0 ⋄ {𝕩+↩c⋄c↩⌊𝕩÷b⋄b|𝕩}¨𝕩} ·+´ b|⌊∘÷⟜b⍟(↕len)¨ 76 | } 77 | Bits ⇐ { 78 | 𝕊⁼𝕩: (2⋆48)⊸×⊸Add12˜○(2⊸×⊸+˜´)˝ 2‿∘⥊𝕩 ; 79 | ∧´𝕩=⟜1⊸∨⌾⊑𝕩=0 ? 96↑⊏𝕩 ; 80 | "Bitwise operation: arguments must be integers" ! ⌊⊸≡◶⟨0,>⟜-´⟩ 𝕩 81 | "Bitwise operation: arguments can't exceed 2^96" ! 0<(2⋆96)-˜´⌽𝕩 82 | 96‿2 _repr 𝕩 83 | } 84 | -------------------------------------------------------------------------------- /include/README.md: -------------------------------------------------------------------------------- 1 | # Singeli standard includes 2 | 3 | Standard includes are those built into the compiler. Each can be included with a line like `include 'arch/c'`, which uses a path relative to this directory (include/ in the Singeli sources). 
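As an illustrative sketch (my own, not a required preamble), a file that wants C-like operators, the plain C backend, and loop macros from the lists below might begin with:

    include 'skin/c'    # C-like operators
    include 'arch/c'    # platform-independent C operations
    include 'util/for'  # @for loops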
4 | 5 | - `skin/` Operator definitions 6 | - [`skin/c`](skin/c.singeli) C-like operators (with some tweaks) 7 | - [`skin/cop`](skin/cop.singeli) Non-mutating operators 8 | - [`skin/cmut`](skin/cmut.singeli) Mutating operators such as `*=` and `++` 9 | - [`skin/cext`](skin/cext.singeli) Extensions to C-like operators 10 | - `arch/` Operation generation 11 | - [`arch/c`](arch/c.singeli) Platform-independent C 12 | - `arch/iintrinsic/` for x86 extensions or `arch/neon_intrin/` for NEON vector intrinsics (ARM) 13 | - [`arch/*/basic`](#simd-basics) Basic vector support and arithmetic 14 | - [`arch/*/select`](#simd-selection) Rearranging elements without changing type 15 | - `clib/` Bindings for C libraries 16 | - [`clib/malloc`](clib/malloc.singeli) malloc (as `alloc{}`) and free 17 | - `util/` Utilities 18 | - [`util/for`](#utilfor) Typical @for loops 19 | - [`util/tup`](#utiltup) Programming with tuples 20 | - [`util/kind`](util/kind.singeli) Short generators to test a value's kind 21 | - [`util/perv`](util/perv.singeli) Generator pervasion 22 | - [`util/functionize`](util/functionize.singeli) Make function from generator 23 | - `debug/` Debugging utilities 24 | - [`debug/printf`](debug/printf.singeli) Print at runtime 25 | 26 | ## util/for 27 | 28 | File [util/for.singeli](util/for.singeli). 29 | 30 | Each loop handles the indices `i` satisfying `from <= i < to`. 31 | 32 | | Loop | Description 33 | |--------------------|------------ 34 | | `@for` | Standard forward loop 35 | | `@for_backwards` | Same indices in the reverse order 36 | | `@for_const` | Compile-time loop, requiring constant bounds 37 | | `@for_unroll{unr}` | Loop unrolled by a factor of `unr` 38 | 39 | The unrolled loop creates two sub-loops, one that evaluates `unr` copies of the given body and the other that evaluates only one. It runs the first as many times as possible starting at `from` (no adjustments are made for alignment), then the second until `to` is reached. 40 | 41 | ## util/tup 42 | 43 | File [util/tup.singeli](util/tup.singeli). 44 | 45 | | Syntax | Description 46 | |--------------------------|------------ 47 | | `empty{tup}` | Tuple is empty 48 | | `@collect` | Compile-time evaluation returning a list 49 | | `iota{num}` | Alias for `range` 50 | | `inds{tup}` | Tuple of all indices into tuple 51 | | `copy{num, any}` | Tuple of `num` copies of `any` 52 | | `join{tups}` | Merge a tuple of tuples 53 | | `shiftright{l, r}` | Shift tuple `l` into `r`, retaining length of `r` 54 | | `shiftleft{l, r}` | Shift tuple `r` into `l`, retaining length of `l` 55 | | `reverse{tup}` | Elements in reverse order 56 | | `cycle{num, tup}` | Repeat tuple cyclically to the given length 57 | | `split{num, tup}` | Split tuple into groups of the given length or less 58 | | `flip{tups}` | Transpose tuple of same-length tuples 59 | | `table{f, ...tups}` | Function table mapping over all combinations 60 | | `flat_table{f, ...tups}` | Function table flattened into a single list 61 | | `fold{gen, any?, tup+}` | Left fold, with or without initial element 62 | | `scan{gen, any?, tup+}` | Inclusive left scan 63 | | `replicate{r, tup}` | Tuple with each input element copied the given number of times 64 | | `indices{tup}` | Indices of elements of `tup`, repeated that many times 65 | 66 | Additional notes: 67 | 68 | - `split{n, tup}`: `n` may be a number, indicating that all groups have that length except that the last may be short. 
It may also be a list of numbers, which is expected to sum to the length of the tuple and indicates the sequence of group lengths. 69 | - `fold{gen, any?, tup+}` and `scan{gen, any?, tup+}`: if the initial `any` is given, `tup` indicates any number of tuple arguments, and `gen` will always be called with one parameter from each one. 70 | - `replicate{r, tup}`: `r` may be a tuple, where each element indicates the number of times to include the corresponding element of `tup` (for example, if it's boolean the elements in the same position as a 1 are kept and those with a 0 are filtered out). It may also be a plain number, so that every element is copied the same number of times, or a generator `f`, so that element `e` is copied `f{e}` times. 71 | 72 | ## SIMD basics 73 | 74 | Includes `arch/iintrinsic/basic` and `arch/neon_intrin/basic` are "basic" architecture includes that define arithmetic and a few essential vector operations. Because of x86's haphazard instruction support, the default `arch/iintrinsic/basic` includes multi-instruction implementations of many operations such as comparisons, min, and max. Use `arch/iintrinsic/basic_strict` to define only cases that are supported by a single instruction. 75 | 76 | All [builtin arithmetic](../README.md#arithmetic) operations are supported when available (`__mod` is the only one that's never provided), in addition to the following (architecture indicated if only one supports it): 77 | 78 | | Syntax | Arch | Result 79 | |----------------------------|------|-------- 80 | | `__adds{x, y}` | | Saturating add 81 | | `__subs{x, y}` | | Saturating subtract 82 | | `__sqrt{x}` | | Square root 83 | | `__round{x}` | x86 | Round to nearest 84 | | `andnot{x, y}` | | `x & ~y` 85 | | `ornot{x, y}` | ARM | `x \| ~y` 86 | | `andnz{x, y}` | ARM | `(x & y) != 0` 87 | | `copy_sign{x, y}` | x86 | Absolute value of `x` with sign of `y` 88 | | `average_int{x, y}` | x86 | `(x + y + 1) >> 1` 89 | | `shl_uniform{v, s:[2]u64}` | x86 | Shift each element left by element 0 of `s` 90 | | `shr_uniform{v, s:[2]u64}` | x86 | Shift each element right by element 0 of `s` 91 | 92 | The following non-arithmetic definitions are also defined when possible. 93 | 94 | | Syntax | Result 95 | |------------------------|-------- 96 | | `vec_make{V, ...x}` | A vector of the values `x` 97 | | `vec_make{V, x}` | Same, with a tuple parameter 98 | | `vec_broadcast{V, x}` | A vector of copies of the value `x` 99 | | `extract{v:V, ind}` | The element at position `ind` of vector `v` 100 | | `insert{v:V, x, ind}` | Insert `x` to position `ind` of `v`, returning a new vector 101 | | `load{ptr,ind}` | Same as builtin 102 | | `store{ptr,ind,val}` | Same as builtin 103 | 104 | x86 also includes `load_aligned` and `store_aligned` for accesses that assume the pointer has vector alignment. 105 | 106 | ### x86 SIMD arithmetic support 107 | 108 | The following table shows when arithmetic support was added to x86 for various vector types. For integers, only signed types (`i16`) are shown but unsigned equivalents (`u16`) are supported at the same time. AVX-512F does have the ability to create and perform conversions on 8-bit and 16-bit types, but doesn't support any arithmetic specific to them. 
109 | 110 | | Extension | `u`/`i8` | `u`/`i16` | `u`/`i32` | `u`/`i64` | `f32` | `f64` | 111 | |-----------|---------:|----------:|----------:|----------:|----------:|---------:| 112 | | SSE | | | | | `[4]f32` | | 113 | | SSE2 | `[16]i8` | `[8]i16` | `[4]i32` | `[2]i64` | | `[2]f64` | 114 | | AVX | | | | | `[8]f32` | `[4]f64` | 115 | | AVX2 | `[32]i8` | `[16]i16` | `[8]i32` | `[4]i64` | | | 116 | | AVX-512F | | | `[16]i32` | `[8]i64` | `[16]f32` | `[8]f64` | 117 | | AVX-512BW | `[64]i8` | `[32]i16` | | | | | 118 | 119 | The next table shows integer instruction availability in x86. Each entry shows the first extension to include the instructions on a given element type. Multi-instruction fills are not shown. Instructions introduced by SSE extensions are all available in AVX2, except `extract`, and those in AVX2 are all in AVX-512F or AVX-512BW (depending on type support as shown above), except `copy_sign`. AVX2 instructions are also supported on 128-bit vectors, and AVX-512 instructions are supported on 128-bit and 256-bit vectors if AVX-512VL is available. But `arch/iintrinsic/basic` doesn't correctly support these extensions right now. 120 | 121 | | Functions | `i8` | `i16` | `i32` | `i64` | `u8` | `u16` | `u32` | `u64` 122 | |-------------------------------|--------|--------|--------|---------|--------|--------|--------|------- 123 | | `&` `\|` `^` `andnot` `+` `-` | SSE2 | SSE2 | SSE2 | SSE2 | SSE2 | SSE2 | SSE2 | SSE2 124 | | `__min` `__max` | SSE4.1 | SSE2 | SSE4.1 | A512F | SSE2 | SSE4.1 | SSE4.1 | A512F 125 | | `==` | SSE2 | SSE2 | SSE2 | SSE4.1 | SSE2 | SSE2 | SSE2 | SSE4.1 126 | | `>` `<` | SSE2 | SSE2 | SSE2 | SSE4.2 | | | | 127 | | `__adds` `__subs` | SSE2 | SSE2 | | | SSE2 | SSE2 | | 128 | | `<<` `shl_uniform` | | SSE2 | SSE2 | SSE2 | | SSE2 | SSE2 | SSE2 129 | | `>>` `shr_uniform` | | SSE2 | SSE2 | A512F | | SSE2 | SSE2 | SSE2 130 | | `<<` (element-wise) | | A512BW | AVX2 | AVX2 | | A512BW | AVX2 | AVX2 131 | | `>>` (element-wise) | | A512BW | AVX2 | A512F | | A512BW | AVX2 | AVX2 132 | | `*` | | SSE2 | SSE4.1 | A512DQ | | SSE2 | SSE4.1 | A512DQ 133 | | `__abs` | SSSE3 | SSSE3 | SSSE3 | A512F | | | | 134 | | `copy_sign` (no 512-bit) | SSSE3 | SSSE3 | SSSE3 | | | | | 135 | | `average_int` | | | | | SSE2 | SSE2 | | 136 | | `extract` (no ≥256-bit) | SSE4.1 | SSE2 | SSE4.1 | SSE4.1 | SSE4.1 | SSE2 | SSE4.1 | SSE4.1 137 | 138 | Floating-point instruction availability is much simpler: all instructions are available on supported types, with the exception of `__floor`, `__ceil`, and `__round`, which weren't added until SSE4.1. 139 | 140 | | Functions | `f32` | `f64` 141 | |--------------------------------------------------------------------------------------------|--------|------- 142 | | `&` `\|` `^` `andnot` `+` `-` `*` `__min` `__max` `==` `>` `<` `!=` `>=` `<=` `/` `__sqrt` | SSE | SSE2 143 | | `__floor` `__ceil` `__round` | SSE4.1 | SSE4.1 144 | 145 | ## SIMD selection 146 | 147 | Includes `arch/iintrinsic/select` and `arch/neon_intrin/select` define operations that rearrange elements from one or more vectors. An operation is supported only when it can be implemented with a single instruction and possibly a constant vector register. In each case there are some values to be manipulated (`val`, `v0`, `v1`, `a`, `b` below), which must all share an element type and also determine the type of the result—although `spec` may indicate a different temporary element type to be used internal to the computation. 
Vectors here are treated strictly as lists of values, and in particular **left and right shifts go in the opposite direction to arithmetic shl and shr**! Operations `vec_shuffle`, `reverse_units`, and `blend_units` work on sub-units of the vectors, which must have a length that divides the number of elements, that is, a power of two. Operations ending in `128` work on 128-bit lanes, as this is all that AVX instructions support, but the same names without the `_128` or `128` suffix are defined to be the same on 128-bit vectors and error on larger sizes. AVX-512 is not yet supported. 148 | 149 | | Syntax | Arch | Description 150 | |--------------------------------------|------|------------ 151 | | `vec_select {spec?, val, ...?ind}` | | Vector version of `select{val, ind}` 152 | | `vec_shuffle{spec?, val, ...?ind}` | | Select within sub-units, possibly repeating the indices 153 | | `broadcast_sel{val, i}` | | Vector with all elements equal to element `i` of `val` 154 | | `reverse_units{s, val}` | | Reverse each length-`s` group of elements in `val` 155 | | `vec_shift_left_128 {val, n}` | | Move element `i` of `val` to index `i-n`, shifting in zeros 156 | | `vec_shift_right_128{val, n}` | | Move element `i` of `val` to index `i+n`, shifting in zeros 157 | | `vec_merge_shift_left_128 {a, b, n}` | | Left shift of combined lane placing `a` before `b` 158 | | `vec_merge_shift_right_128{a, b, n}` | | Right shift from end of combined lane placing `a` before `b` 159 | | `zip128{a, b, half}` | | Alternate elements from first (`half=0`) or last (`half=1`) halves of `a` and `b` 160 | | `blend{v0, v1, ...?bools}` | | Element-wise choice where `0` in `bools` takes from `v0` and `1` from `v1` 161 | | `blend_units{v0, v1, ...?bools}` | | Same, but tuple `bools` is repeated to the full length if short 162 | | `blend_top{v0, v1, mask}` | x86 | Choose using the top bit of each element of vector `mask` 163 | | `blend_bit{v0, v1, mask}` | ARM | Choose bitwise, `(~mask & v0) \| (mask & v1)` 164 | | `blend_hom{v0, v1, mask}` | | Choose `v0` when an element of `mask` is all 0, and `v1` when all 1 165 | 166 | Two types of selection by indices are defined: `vec_select`, which is more like NEON `tbl` instructions, and `vec_shuffle`, which selects on sub-units, matching x86 `shuffle` and `permute` better. These have many settings so they get [their own section](#vector-select-and-shuffle) below. `reverse_units` is a special case, and is implemented as a call to `vec_shuffle` on x86 but is supported by dedicated instructions on ARM. 167 | 168 | `vec_shift_left_128`, `vec_shift_right_128`, `vec_merge_shift_left_128`, and `vec_merge_shift_right_128` shift elements within lanes and are equivalent to `vec_shift_left`, `vec_shift_right`, `vec_merge_shift_left`, and `vec_merge_shift_right` when a vector is a single lane long. 169 | 170 | `zip` and `zip128` interleave elements of their arguments in the sense of `zip(abcd, 0123) = a0b1c2d3`; on tuples this might be written `merge{...each{tup,a,b}}`. Because the full result wouldn't fit in a single vector, the `half` parameter specifies half 0 or 1 of each lane of the result, or equivalently zipping only half 0 or 1 of each argument lane. More formally, element `2*i` of a result lane is element `i` of the relevant half-lane of `a`, and element `2*i + 1` is element `i` from a half-lane of `b`. The complete result as a list of vectors is `each{zip128{a,b,.}, range{2}}`. 
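As a worked example (the element names are made up for illustration), take two vectors `a` and `b` of eight elements each, forming a single 128-bit lane:

    # a = a0 a1 a2 a3 a4 a5 a6 a7    b = b0 b1 b2 b3 b4 b5 b6 b7
    # zip128{a, b, 0} = a0 b0 a1 b1 a2 b2 a3 b3   (zips half 0)
    # zip128{a, b, 1} = a4 b4 a5 b5 a6 b6 a7 b7   (zips half 1)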
171 | 172 | Arguments to blend functions are two vectors `v0` and `v1` of the same type, and a selector which is conceptually a list of booleans. For `blend` and `blend_units`, the selector `bools` is in fact a tuple of compile-time booleans (each is constant 0 or 1; these may also be passed as separate arguments). For `blend_hom`, `blend_top`, and `blend_bit`, the selector `mask` is another vector with the same number of elements and element width as the others. In a blend, the result value at index `i` is element `i` of either `v0` or `v1`: if element `i` of the selector is 0, `v0`, and if it's 1, `v1`. For `blend_top`, the selector is the top (sign) bit of each element of `mask`, and for `blend_bit`, all inputs are considered to be lists of bits so that the selector is simply the bits of `mask`. For `blend_hom` (short for "homogeneous"), result element `i` is defined only if element `i` of `mask` has all bits set to 0 or all set to 1. It's implemented as `blend_bit` on ARM and `blend_top`, possibly with a smaller element type than the arguments, on x86. 173 | 174 | ### Vector select and shuffle 175 | 176 | Both selection functions `vec_select` and `vec_shuffle` take three inputs: 177 | - `spec` is optional. It can describe the element type and width, and for `vec_shuffle`, sub-unit size. 178 | - `val` are the values for selection. It may be a tuple of vectors, which has a different meaning for select versus shuffle. 179 | - `ind` is the indices of the wanted values, either a vector or a tuple of constant integers (in which case they can also be passed as separate arguments). A constant index must be less than the selection length, and any negative indicates a zero result. For variables, out-of-bounds indices are not defined and will be interpreted according to the specific instruction called. `ind` is never cast, so if it's a vector its elements must be integers of the appropriate width. 180 | 181 | For `vec_select`, `spec` may be the element width as a number, or an element type. The width `128`, supported by AVX's `permute2x128` and `permute2f128` intrinsics, can only be specified by number. If multiple arguments are passed, they are treated as a single list of elements, so that indices into the first vector are normal, those into the second are increased by the width of a vector, and so on. 182 | 183 | `vec_shuffle` performs multiple independent selections: it corresponds to a single selection by adding an appropriate base index to each of these, although it's often the case on x86 that only some sub-unit size smaller than the entire vector is supported. If constant indices are used, they are repeated as needed to match the number of values. To run, `vec_shuffle` needs to determine both the element type and the number of elements in a sub-unit. `spec` may be a vector type like `[4]f32` to specify both, or a number like `4` to specify sub-unit length only, or an element type like `f32`. If the element type is unspecified, then the type's width comes from the indices if they're typed and the values if they're constant, and its quality (float or integer) comes from the values to be selected unless a floating-point type of the required width doesn't exist. The sub-unit size may be any divisor of the number of provided indices; if unspecified it's taken to be that number. An additional option is that `ind` may be a tuple of tuples, each having the length of a sub-unit (this specifies the sub-unit length if it would be taken from `ind`). 
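For instance, assuming a variable `v:[16]i8` on SSSE3 (a sketch applying the rules above, not output from the library):

    # Sub-unit length 4 with constant indices: the four indices repeat
    # to cover all 16 elements, reversing each aligned 4-byte group
    r := vec_shuffle{4, v, 3, 2, 1, 0}
    # Same result as the dedicated generator reverse_units{4, v}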
184 | 185 | The definition of `vec_shuffle` where `val` is a tuple is chosen to accommodate x86's rather esoteric `shuffle_ps` and `shuffle_pd` intrinsics. In this case each selection unit is divided equally into one part for each vector of values, and the indices for a part pertain to the current selection unit of the corresponding vector. 186 | 187 | Three extra definitions are included in iintrinsic/select to expose x86 shuffle instructions that don't fit `vec_select` or `vec_shuffle`. `vec_shuffle16_lo` and `vec_shuffle16_hi` shuffle the low and high halves of each lane of a vector with 16-bit elements, leaving the other half unchanged. `vec_shuffle_64_scaled` implements lane-wise `vec_shuffle` on `f64` elements and an index vector, except that the expected indices are 0 and 2 instead of 0 and 1: intrinsic `permutevar_pd` uses the second bit from the bottom of each index instead of the bottom bit as in `permutevar_ps`. 188 | -------------------------------------------------------------------------------- /include/arch/c.singeli: -------------------------------------------------------------------------------- 1 | local { 2 | local def extend promote{arith} = { 3 | def arith{a:T,b if is{'number',kind{b}} and is{'primitive',typekind{T}}} = arith{a , cast{T,b}} 4 | def arith{a,b:T if is{'number',kind{a}} and is{'primitive',typekind{T}}} = arith{cast{T,a} , b} 5 | } 6 | def arith{op} = { def extend _{arith} = { 7 | def arith{a:T,b:T if is{'primitive',typekind{T}}} = emit{T, op, a, b} 8 | extend promote{arith} 9 | }} 10 | def arith1{op} = { def extend _{arith} = { 11 | def arith{a:T if is{'primitive',typekind{T}}} = emit{T, op, a} 12 | arith 13 | }} 14 | def sh{op} = { def extend _{arith} = { 15 | def arith{a:T,b:I if isint{T} and isint{I}} = emit{T, op, a, b} 16 | extend promote{arith} 17 | }} 18 | def pk{T} = { 19 | def k=typekind{T} 20 | is{'primitive',k} or is{'pointer',k} 21 | } 22 | def compare{op} = { def extend _{arith} = { 23 | def arith{a:T,b:T if pk{T}} = emit{u1, op, a, b} 24 | extend promote{arith} 25 | }} 26 | def logic = arith 27 | def logic1 = arith1 28 | } 29 | 30 | extend (arith1{'-'}){__neg} 31 | 32 | extend (arith{'op +'}){__add} 33 | extend (arith{'op -'}){__sub} 34 | extend (arith{'op *'}){__mul} 35 | extend (arith{'op /'}){__div} 36 | extend (arith{'op %'}){__mod} 37 | 38 | local { 39 | def isptr{T} = is{'pointer',typekind{T}} 40 | def ptrwidth = width{__pnt{void}} 41 | def isize = primtype{'i',ptrwidth} 42 | def ptrdiff{a} = cast{isize, a} 43 | def ptrdiff{a:T} = promote{ptrwidth, a} 44 | def anynum{a} = is{'number',kind{a}}; def anynum{a:T} = is{'primitive',typekind{T}} 45 | } 46 | def __pnt{a:T if isptr{T}} = load{a,0} 47 | 48 | def __add{a ,b:P if isptr{P} and anynum{a}} = emit{P, 'op +', ptrdiff{a}, b} 49 | def __add{a:P,b if isptr{P} and anynum{b}} = emit{P, 'op +', a, ptrdiff{b}} 50 | def __sub{a:P,b if isptr{P} and anynum{b}} = emit{P, 'op -', a, ptrdiff{b}} 51 | def __sub{a:P,b:P if isptr{P}} = emit{isize, 'op -', a, b} 52 | 53 | extend (compare{'op =='}){__eq} 54 | extend (compare{'op !='}){__ne} 55 | extend (compare{'op >' }){__gt} 56 | extend (compare{'op >='}){__ge} 57 | extend (compare{'op <' }){__lt} 58 | extend (compare{'op <='}){__le} 59 | 60 | extend (logic{'op &'}){__and} 61 | extend (logic{'op |'}){__or } 62 | extend (logic{'op ^'}){__xor} 63 | 64 | extend (logic1{'~'}){__not} 65 | def __not{a:(u1)} = emit{u1, '!', a} 66 | 67 | extend (sh{'op <<'}){__shl} 68 | extend (sh{'op >>'}){__shr} 69 | 70 | def load{p:*T, i if anynum{i} and not is{T,void}} = 
emit{T, '^load', p, i} 71 | def store{p:*T, i, v:T if anynum{i} and not is{T,void}} = { emit{void, '^store', p, i, v}; v } 72 | def store{p:*T, i, v if is{'number',kind{v}} and anynum{i} and not is{T,void}} = store{p, i, cast{T, v}} 73 | 74 | def cast_i{T, x} = emit{T, '', x} 75 | -------------------------------------------------------------------------------- /include/arch/iintrinsic/basic.singeli: -------------------------------------------------------------------------------- 1 | def _iintrinsic_use_fill = 1 2 | include './basic_impl' 3 | -------------------------------------------------------------------------------- /include/arch/iintrinsic/basic_impl.singeli: -------------------------------------------------------------------------------- 1 | # This file should not be included directly: 2 | # instead use arch/iintrinsic/basic or arch/iintrinsic/basic_strict 3 | # which define whether fills should be used 4 | local { 5 | include 'skin/c' 6 | oper ~~ reinterpret infix right 55 7 | oper ** vec_broadcast infix right 55 8 | def num{x} = is{'number',kind{x}} 9 | 10 | def fmt_p{T, ...s} = { 11 | if (isfloat{T}) { 12 | if (width{T}==32) 'ps' else 'pd' 13 | } else { 14 | def sgn = match (s) { {{e}} => e; {_} => issigned{T} } 15 | merge{'ep', if (sgn) 'i' else 'u', fmtnat{width{T}}} 16 | } 17 | } 18 | def fmt_p{T, w if isint{T} and w>1} = merge{'si', fmtnat{w}} 19 | def intrin{name, V=[_]T, ...s} = { 20 | def w = width{V} 21 | def fw = if (w<=128) '' else fmtnat{w} 22 | merge{'_mm', fw, '_', name, '_', fmt_p{T, ...s}} 23 | } 24 | def intrin_b{name, V} = intrin{name, V, width{V}} 25 | def set_intrin_post{V=[_]T} = if (T==i64 and not av5{V}) 'x' else '' 26 | def vec_ptr{p:*V=[_]T} = if (isfloat{T}) *T~~p else p 27 | 28 | def sse{V} = 128==width{V} 29 | def avx{V} = 256==width{V} 30 | def av5{V} = 512==width{V} 31 | def sse_avx{V} = __or{...tup{128,256}==width{V}} 32 | def ew{[_]T} = width {T} 33 | def ef{[_]T} = isfloat{T} 34 | def ei{[_]T} = isint {T} 35 | def eu{[_]T} = 'u'==quality{T} 36 | def es{[_]T} = 'i'==quality{T} 37 | 38 | def change_qual{[k]T,q} = [k]primtype{q, width{T}} 39 | def uns = change_qual{.,'u'} 40 | def sgn = change_qual{.,'i'} 41 | 42 | def go = match { {[_]T} => T!=u1; {_} => 0 } 43 | } 44 | 45 | # Multi-instruction fills; slowest ones go first 46 | local def fill = _iintrinsic_use_fill 47 | def __not{a:V if fill and go{V} and has_arith{V}} = a ^ (V ** ~cast{eltype{V},0}) 48 | def __neg{a:V if fill and go{V} and has_arith{V}} = V**0 - a 49 | def __min{a:V, b:V if fill and go{V} and has_gt{V}} = { c:=V~~(ab); (a&c) | andnot{b,c} } 51 | def __max{a:V, b:V if fill and go{V} and has_satur{V} and eu{V}} = __subs{a,b}+b 52 | def __min{a:V, b:V if fill and go{V} and has_satur{V} and eu{V}} = a-__subs{a,b} 53 | def __lt{a:V, b:V if fill and go{V}} = b>a 54 | def __ge{a:V, b:V if fill and go{V}} = b<=a 55 | def __le{a:V, b:V if fill and go{V}} = ~(a>b) 56 | def __ne{a:V, b:V if fill and go{V}} = ~(b==a) 57 | def __gt{a:V, b:V if fill and go{V} and eu{V} and has_gt{sgn{V}}} = { 58 | t:= V**(1<<(ew{V}-1)) 59 | def I = sgn{V}; def s{v} = I~~(t^v) 60 | s{a} > s{b} 61 | } 62 | def __le{a:V, b:V if fill and go{V} and has_minmax{V}} = a==__min{a,b} 63 | def __gt{a:V, b:V if fill and go{V} and ~has_gt{V} and has_minmax{V}} = ~(a<=b) 64 | def __eq{a:V, b:V if fill and go{V} and ei{V} and ew{V}==64} = { def H=[4]u32; t := H~~a == H~~b; V~~(t & emit{H, '_mm_shuffle_epi32', t, 4b2301}) } 65 | def __abs{a:V if fill and go{V} and es{V} and has_rsh{V}} = { s:=a>>31; (s^a) - s } 66 | def 
__abs{a:V if fill and go{V} and es{V} and has_minmax{uns{V}}} = { u:=uns{V}~~a; V~~__min{u, -u} } 67 | def __abs{a:V if fill and go{V} and es{V} and has_minmax{ V }} = __max{a, -a} 68 | def __abs{a:V if fill and go{V} and has_arith{V} and ef{V}} = a & V~~uns{V}**(1<<(ew{V}-1)-1) 69 | def __shl{a:V, b:S if fill and go{V} and S<=u64 and has_shift{V}} = shl_uniform{a, vec_make{[2]u64, promote{u64,b}, 0}} 70 | def __shr{a:V, b:S if fill and go{V} and S<=u64 and has_rsh {V}} = shr_uniform{a, vec_make{[2]u64, promote{u64,b}, 0}} 71 | 72 | # Building vectors from scalars 73 | local { 74 | def can_elt = match { {[_]T, x:T} => 1; {_,x} => num{x} } 75 | def can_make_sub = can_elt 76 | def can_make_sub{V=[k]_, {...x}} = { 77 | def all{t} = is{t, 0 <= t} 78 | k==length{x} and all{each{can_elt{V,.}, x}} 79 | } 80 | def can_make{V,x} = go{V} and has_make{V} and can_make_sub{V,x} 81 | def mv_sub{m, V=[k]T, x} = { 82 | if ('u'!=quality{T}) { 83 | m{V, each{cast{T,.}, x}} 84 | } else { 85 | def w = width{T} 86 | def I = primtype{'i', w} 87 | def smax = 1<<(w-1) # Convert compile-time numbers in Singeli to avoid lots of cast instructions 88 | def ic{a} = I~~(if (num{a}) a - (a>=smax)<=32 193 | def has_extract{V if hasarch{'AVX2' } and avx{V} and fill} = 1 194 | def has_extract0{_} = 0 195 | def has_extract0{V if hasarch{'SSE2'} and sse{V} and ew{V}>=32} = 1 196 | 197 | def has_arith{_} = 0 # add, subtract, and, or, xor, andnot 198 | def has_arith{V==[4]f32 if hasarch{'SSE'}} = 1 199 | def has_arith{V if hasarch{'SSE2'} and sse{V}} = 1 200 | def has_arith{V if hasarch{'AVX'} and avx{V} and ef{V}} = 1 201 | def has_arith{V if hasarch{'AVX2'} and avx{V}} = 1 202 | def has_avx512{V} = ew{V}>=32 or hasarch{'AVX512BW'} 203 | def has_arith{V if hasarch{'AVX512F'} and av5{V}} = has_avx512{V} 204 | 205 | def has_satur{_} = 0 # saturating add/subtract 206 | def has_satur{V if hasarch{'SSE2'} and sse{V}} = ew{V}<=16 207 | def has_satur{V if hasarch{'AVX2'} and avx{V}} = ew{V}<=16 208 | def has_satur{V if hasarch{'AVX512BW'} and av5{V}} = ew{V}<=16 209 | 210 | def has_minmax{_} = 0 # min, max 211 | def has_minmax{V==[ 4]f32 if hasarch{'SSE' }} = 1 212 | def has_minmax{V==[ 2]f64 if hasarch{'SSE2'}} = 1 213 | def has_minmax{V==[ 8]i16 if hasarch{'SSE2'}} = 1 214 | def has_minmax{V==[16]u8 if hasarch{'SSE2'}} = 1 215 | def avx_minmax{[_]T} = isfloat{T} or width{T}<=32 216 | def has_minmax{V if hasarch{'SSE4.1'} and sse{V}} = avx_minmax{V} 217 | def has_minmax{V if hasarch{'AVX'} and avx{V} and ef{V}} = 1 218 | def has_minmax{V if hasarch{'AVX2'} and avx{V}} = avx_minmax{V} 219 | def has_minmax{V if hasarch{'AVX512F'} and av5{V}} = has_avx512{V} 220 | 221 | def has_mul{_} = 0 # same-width multiply (mullo for ints) 222 | def has_mul{V==[4]f32 if hasarch{'SSE'}} = 1 223 | def has_mul{V if hasarch{'SSE2' } and sse{V} and ef{V}} = 1 224 | def has_mul{V if hasarch{'SSE2' } and sse{V} and ew{V}==16} = 1 225 | def has_mul{V if hasarch{'SSE4.1'} and sse{V} and ew{V}==32} = 1 226 | def has_mul{V if hasarch{'AVX' } and avx{V} and ef{V}} = 1 227 | def has_mul{V if hasarch{'AVX2' } and avx{V} and ew{V}<=32 and ew{V}>=16} = 1 228 | def has_mul{V if hasarch{'AVX512F'} and av5{V}} = if (ef{V}) 1 else match (ew{V}) { 229 | {(32)}=>1; {(16)}=>hasarch{'AVX512BW'}; {(64)}=>hasarch{'AVX512DQ'} 230 | } 231 | 232 | def has_shift{_} = 0 # shift by scalar 233 | def has_shift{V if hasarch{'SSE2'} and sse{V} and ei{V} and ew{V}>=16} = 1 234 | def has_shift{V if hasarch{'AVX2'} and avx{V} and ei{V} and ew{V}>=16} = 1 235 | def has_shift{V if 
hasarch{'AVX512F'} and av5{V} and ei{V} and ew{V}>=16} = has_avx512{V} 236 | def has_rsh{V} = (eltype{V}!=i64 or av5{V}) and has_shift{V} 237 | def has_vshift{_} = 0 # shift by vector 238 | def has_vshift{V if hasarch{'AVX2'} and sse_avx{V} and ei{V} and ew{V}>=32} = 1 239 | def has_vshift{V if hasarch{'AVX512F'} and av5{V} and ei{V} and ew{V}>=16} = has_avx512{V} 240 | def has_vrsh{V} = (eltype{V}!=i64 or av5{V}) and has_vshift{V} 241 | 242 | def has_eq{_} = 0 # equals, integer only 243 | def has_eq{V if hasarch{'SSE2' } and sse{V}} = ew{V}<=32 244 | def has_eq{V if hasarch{'SSE4.1'} and sse{V}} = 1 245 | def has_eq{V if hasarch{'AVX2' } and avx{V}} = 1 246 | def has_gt{V} = es{V} and has_eq{V} 247 | def has_gt{([2]i64)} = hasarch{'SSE4.2'} 248 | 249 | # float comparisons, div, square root 250 | def has_float{V} = ef{V} and has_arith{V} 251 | def has_cmp_flt{V} = sse_avx{V} and has_float{V} 252 | # floor, ceiling, round 253 | def has_round{V} = hasarch{'SSE4.1'} and has_float{V} 254 | 255 | # abs, sign, avg 256 | def has_int_op{V=[_]T, arch_s, q, w, w512} = { 257 | if (q!=quality{T}) 0 258 | else if (sse_avx{V} and hasarch{if (sse{V}) arch_s else 'AVX2'}) width{T}<=w 259 | else hasarch{'AVX512F'} and av5{V} and has_avx512{V} and width{T}<=w512 260 | } 261 | } 262 | -------------------------------------------------------------------------------- /include/arch/iintrinsic/basic_strict.singeli: -------------------------------------------------------------------------------- 1 | def _iintrinsic_use_fill = 0 2 | include './basic_impl' 3 | -------------------------------------------------------------------------------- /include/arch/iintrinsic/misc.singeli: -------------------------------------------------------------------------------- 1 | local { 2 | def intvec{w,T} = 0 3 | def intvec{(width{V}),V=[_]T if isint{T}} = 1 4 | def num{T} = is{'number',kind{T}} 5 | } 6 | 7 | #SSE 8 | def __mulhi{a:T==[4]u16, b:T} = emit{T, '_mm_mulhi_pu16', a, b} 9 | def __pmulhuw{a:T==[4]u16, b:T} = emit{T, '_m_pmulhuw', a, b} 10 | def __cmpnlt{a:T==[4]f32, b:T} = emit{T, '_mm_cmpnlt_ps', a, b} 11 | def __cmpnle{a:T==[4]f32, b:T} = emit{T, '_mm_cmpnle_ps', a, b} 12 | def __cmpngt{a:T==[4]f32, b:T} = emit{T, '_mm_cmpngt_ps', a, b} 13 | def __cmpnge{a:T==[4]f32, b:T} = emit{T, '_mm_cmpnge_ps', a, b} 14 | def __cmpord{a:T==[4]f32, b:T} = emit{T, '_mm_cmpord_ps', a, b} 15 | def __cmpunord{a:T==[4]f32, b:T} = emit{T, '_mm_cmpunord_ps', a, b} 16 | def __cvt{a:T==[4]f32, b:(i32)} = emit{T, '_mm_cvt_si2ss', a, b} 17 | def __cvtpi32{a:T==[4]f32, b:([2]i32)} = emit{T, '_mm_cvtpi32_ps', a, b} 18 | def __cvt{a:T==[4]f32, b:([2]i32)} = emit{T, '_mm_cvt_pi2ps', a, b} 19 | def __cvtpi16{a:([4]i16)} = emit{[4]f32, '_mm_cvtpi16_ps', a} 20 | def __cvtpu16{a:([4]u16)} = emit{[4]f32, '_mm_cvtpu16_ps', a} 21 | def __cvtpi8{a:([8]i8)} = emit{[4]f32, '_mm_cvtpi8_ps', a} 22 | def __cvtpu8{a:([8]u8)} = emit{[4]f32, '_mm_cvtpu8_ps', a} 23 | def __cvtpi32x2{a:T==[2]i32, b:T} = emit{[4]f32, '_mm_cvtpi32x2_ps', a, b} 24 | def __cvtss_i32{a:([4]f32)} = emit{i32, '_mm_cvtss_si32', a} 25 | def __cvt_i32{a:([4]f32)} = emit{i32, '_mm_cvt_ss2si', a} 26 | def __cvtss_u64{a:([4]f32)} = emit{u64, '_mm_cvtss_si64', a} 27 | def __cvtss_f32{a:([4]f32)} = emit{f32, '_mm_cvtss_f32', a} 28 | def __cvtps_2f32{a:([4]f32)} = emit{[2]f32, '_mm_cvtps_pi32', a} 29 | def __cvt_2f32{a:([4]f32)} = emit{[2]f32, '_mm_cvt_ps2pi', a} 30 | def __cvttss_i32{a:([4]f32)} = emit{i32, '_mm_cvttss_si32', a} 31 | def __cvtt_i32{a:([4]f32)} = emit{i32, '_mm_cvtt_ss2si', a} 32 | 
def __cvttss_u64{a:([4]f32)} = emit{u64, '_mm_cvttss_si64', a} 33 | def __cvttps_2f32{a:([4]f32)} = emit{[2]f32, '_mm_cvttps_pi32', a} 34 | def __cvtt_2f32{a:([4]f32)} = emit{[2]f32, '_mm_cvtt_ps2pi', a} 35 | def __cvtps_4i16{a:([4]f32)} = emit{[4]i16, '_mm_cvtps_pi16', a} 36 | def __cvtps_8i8{a:([4]f32)} = emit{[8]i8, '_mm_cvtps_pi8', a} 37 | def __rcp{a:T==[4]f32} = emit{T, '_mm_rcp_ps', a} 38 | def __rsqrt{a:T==[4]f32} = emit{T, '_mm_rsqrt_ps', a} 39 | def __getcsr{} = emit{u32, '_mm_getcsr'} 40 | def __setcsr{a:(u32)} = emit{void, '_mm_setcsr', a} 41 | def __GET_EXCEPTION_STATE{} = emit{u32, '_MM_GET_EXCEPTION_STATE'} 42 | def __SET_EXCEPTION_STATE{a:(u32)} = emit{void, '_MM_SET_EXCEPTION_STATE', a} 43 | def __GET_EXCEPTION_MASK{} = emit{u32, '_MM_GET_EXCEPTION_MASK'} 44 | def __SET_EXCEPTION_MASK{a:(u32)} = emit{void, '_MM_SET_EXCEPTION_MASK', a} 45 | def __GET_ROUNDING_MODE{} = emit{u32, '_MM_GET_ROUNDING_MODE'} 46 | def __SET_ROUNDING_MODE{a:(u32)} = emit{void, '_MM_SET_ROUNDING_MODE', a} 47 | def __GET_FLUSH_ZERO_MODE{} = emit{u32, '_MM_GET_FLUSH_ZERO_MODE'} 48 | def __SET_FLUSH_ZERO_MODE{a:(u32)} = emit{void, '_MM_SET_FLUSH_ZERO_MODE', a} 49 | def __prefetch{p:*(u8), i if num{i}} = emit{void, '_mm_prefetch', p, i} 50 | def __sfence{} = emit{void, '_mm_sfence'} 51 | def __malloc{size:T==u64, align:T} = emit{__pnt{void}, '_mm_malloc', size, align} 52 | def __free{mem_addr:*(void)} = emit{void, '_mm_free', mem_addr} 53 | def __undefined_4f32{} = emit{[4]f32, '_mm_undefined_ps'} 54 | def __loadh{a:T==[4]f32, mem_addr:*([2]f32)} = emit{T, '_mm_loadh_pi', a, mem_addr} 55 | def __loadl{a:T==[4]f32, mem_addr:*([2]f32)} = emit{T, '_mm_loadl_pi', a, mem_addr} 56 | def __load1{mem_addr:*(f32)} = emit{[4]f32, '_mm_load1_ps', mem_addr} 57 | def __loadr{mem_addr:*(f32)} = emit{[4]f32, '_mm_loadr_ps', mem_addr} 58 | def __movemask{a:([8]i8)} = emit{i32, '_mm_movemask_pi8', a} 59 | def __pmovmskb{a:([8]u8)} = emit{i32, '_m_pmovmskb', a} 60 | def __movemask{a:([4]f32)} = emit{i32, '_mm_movemask_ps', a} 61 | def __sad{a:T==[8]u8, b:T} = emit{[4]u16, '_mm_sad_pu8', a, b} 62 | def __psadbw{a:T==[8]u8, b:T} = emit{[4]u16, '_m_psadbw', a, b} 63 | def __movehl{a:T==[4]f32, b:T} = emit{T, '_mm_movehl_ps', a, b} 64 | def __movelh{a:T==[4]f32, b:T} = emit{T, '_mm_movelh_ps', a, b} 65 | def __pavgb{a:T==[8]u8, b:T} = emit{T, '_m_pavgb', a, b} 66 | def __pavgw{a:T==[4]u16, b:T} = emit{T, '_m_pavgw', a, b} 67 | def __setzero_4f32{} = emit{[4]f32, '_mm_setzero_ps'} 68 | def __pmaxsw{a:T==[4]i16, b:T} = emit{T, '_m_pmaxsw', a, b} 69 | def __pmaxub{a:T==[8]u8, b:T} = emit{T, '_m_pmaxub', a, b} 70 | def __pminsw{a:T==[4]i16, b:T} = emit{T, '_m_pminsw', a, b} 71 | def __pminub{a:T==[8]u8, b:T} = emit{T, '_m_pminub', a, b} 72 | def __stream{mem_addr:*(void), a:([1]i64)} = emit{void, '_mm_stream_pi', mem_addr, a} 73 | def __maskmove{a:T==[8]u8, mask:T, mem_addr:*(u8)} = emit{void, '_mm_maskmove_si64', a, mask, mem_addr} 74 | def __maskmovq{a:T==[8]u8, mask:T, mem_addr:*(u8)} = emit{void, '_m_maskmovq', a, mask, mem_addr} 75 | def __stream{mem_addr:*(void), a:([4]f32)} = emit{void, '_mm_stream_ps', mem_addr, a} 76 | def __storeh{mem_addr:*([2]f32), a:([4]f32)} = emit{void, '_mm_storeh_pi', mem_addr, a} 77 | def __storel{mem_addr:*([2]f32), a:([4]f32)} = emit{void, '_mm_storel_pi', mem_addr, a} 78 | def __store1{mem_addr:*(f32), a:([4]f32)} = emit{void, '_mm_store1_ps', mem_addr, a} 79 | def __storer{mem_addr:*(f32), a:([4]f32)} = emit{void, '_mm_storer_ps', mem_addr, a} 80 | def __TRANSPOSE4{row0:T==[4]f32, row1:T, 
row2:T, row3:T} = emit{void, '_MM_TRANSPOSE4_PS', row0, row1, row2, row3} 81 | def __pextrw{a:([4]u16), imm8 if num{imm8}} = emit{i32, '_m_pextrw', a, imm8} 82 | def __pinsrw{a:T==[4]u16, i:(i32), imm8 if num{imm8}} = emit{T, '_m_pinsrw', a, i, imm8} 83 | def __pshufw{a:T==[4]u16, imm8 if num{imm8}} = emit{T, '_m_pshufw', a, imm8} 84 | 85 | 86 | #SSE2 87 | def __madd{a:T==[8]i16, b:T} = emit{[4]i32, '_mm_madd_epi16', a, b} 88 | def __mulhi{a:T==[8]i16, b:T} = emit{T, '_mm_mulhi_epi16', a, b} 89 | def __mulhi{a:T==[8]u16, b:T} = emit{T, '_mm_mulhi_epu16', a, b} 90 | def __mul{a:T==[4]u32, b:T} = emit{[2]u64, '_mm_mul_epu32', a, b} 91 | def __castpd{a:([2]f64)} = emit{[4]f32, '_mm_castpd_ps', a} 92 | def __castps{a:([4]f32)} = emit{[2]f64, '_mm_castps_pd', a} 93 | def __castsi128{a:([2]u64)} = emit{[2]f64, '_mm_castsi128_pd', a} 94 | def __castsi128{a:([4]u32)} = emit{[4]f32, '_mm_castsi128_ps', a} 95 | def __cmpord{a:T==[2]f64, b:T} = emit{T, '_mm_cmpord_pd', a, b} 96 | def __cmpunord{a:T==[2]f64, b:T} = emit{T, '_mm_cmpunord_pd', a, b} 97 | def __cmpnlt{a:T==[2]f64, b:T} = emit{T, '_mm_cmpnlt_pd', a, b} 98 | def __cmpnle{a:T==[2]f64, b:T} = emit{T, '_mm_cmpnle_pd', a, b} 99 | def __cmpngt{a:T==[2]f64, b:T} = emit{T, '_mm_cmpngt_pd', a, b} 100 | def __cmpnge{a:T==[2]f64, b:T} = emit{T, '_mm_cmpnge_pd', a, b} 101 | def __cvtepi32_2f64{a:([4]i32)} = emit{[2]f64, '_mm_cvtepi32_pd', a} 102 | def __cvtepi32_4f32{a:([4]i32)} = emit{[4]f32, '_mm_cvtepi32_ps', a} 103 | def __cvtpi32{a:([2]i32)} = emit{[2]f64, '_mm_cvtpi32_pd', a} 104 | def __cvtsi32{a:(i32)} = emit{[4]u32, '_mm_cvtsi32_si128', a} 105 | def __cvtsi64{a:(u64)} = emit{[2]u64, '_mm_cvtsi64_si128', a} 106 | def __cvtsi64x{a:(u64)} = emit{[2]u64, '_mm_cvtsi64x_si128', a} 107 | def __cvtsi128{a:([4]u32)} = emit{i32, '_mm_cvtsi128_si32', a} 108 | def __cvtsi128_si64{a:([2]u64)} = emit{u64, '_mm_cvtsi128_si64', a} 109 | def __cvtsi128_si64x{a:([2]u64)} = emit{u64, '_mm_cvtsi128_si64x', a} 110 | def __cvtpd_4f32{a:([2]f64)} = emit{[4]f32, '_mm_cvtpd_ps', a} 111 | def __cvtps_2f64{a:([4]f32)} = emit{[2]f64, '_mm_cvtps_pd', a} 112 | def __cvtpd_4i32{a:([2]f64)} = emit{[4]i32, '_mm_cvtpd_epi32', a} 113 | def __cvtsd_si32{a:([2]f64)} = emit{i32, '_mm_cvtsd_si32', a} 114 | def __cvtsd_si64{a:([2]f64)} = emit{u64, '_mm_cvtsd_si64', a} 115 | def __cvtsd_si64x{a:([2]f64)} = emit{u64, '_mm_cvtsd_si64x', a} 116 | def __cvtsd_f64{a:([2]f64)} = emit{f64, '_mm_cvtsd_f64', a} 117 | def __cvttpd_4i32{a:([2]f64)} = emit{[4]i32, '_mm_cvttpd_epi32', a} 118 | def __cvttsd_si32{a:([2]f64)} = emit{i32, '_mm_cvttsd_si32', a} 119 | def __cvttsd_si64{a:([2]f64)} = emit{u64, '_mm_cvttsd_si64', a} 120 | def __cvttsd_si64x{a:([2]f64)} = emit{u64, '_mm_cvttsd_si64x', a} 121 | def __cvtps_4i32{a:([4]f32)} = emit{[4]i32, '_mm_cvtps_epi32', a} 122 | def __cvttps_4i32{a:([4]f32)} = emit{[4]i32, '_mm_cvttps_epi32', a} 123 | def __cvtpd_2f32{a:([2]f64)} = emit{[2]f32, '_mm_cvtpd_pi32', a} 124 | def __cvttpd_2f32{a:([2]f64)} = emit{[2]f32, '_mm_cvttpd_pi32', a} 125 | def __undefined_2f64{} = emit{[2]f64, '_mm_undefined_pd'} 126 | def __pause{} = emit{void, '_mm_pause'} 127 | def __clflush{p:*(void)} = emit{void, '_mm_clflush', p} 128 | def __lfence{} = emit{void, '_mm_lfence'} 129 | def __mfence{} = emit{void, '_mm_mfence'} 130 | def __loadl{mem_addr:*T==[2]i64} = emit{T, '_mm_loadl_epi64', mem_addr} 131 | def __load1{mem_addr:*(f64)} = emit{[2]f64, '_mm_load1_pd', mem_addr} 132 | def __loadr{mem_addr:*(f64)} = emit{[2]f64, '_mm_loadr_pd', mem_addr} 133 | def 
__loadh{a:T==[2]f64, mem_addr:*(f64)} = emit{T, '_mm_loadh_pd', a, mem_addr} 134 | def __loadl{a:T==[2]f64, mem_addr:*(f64)} = emit{T, '_mm_loadl_pd', a, mem_addr} 135 | def __movepi64{a:([2]i64)} = emit{[2]f32, '_mm_movepi64_pi64', a} 136 | def __packs{a:T==[8]i16, b:T} = emit{[16]i8, '_mm_packs_epi16', a, b} 137 | def __packs{a:T==[4]i32, b:T} = emit{[8]i16, '_mm_packs_epi32', a, b} 138 | def __packus{a:T==[8]i16, b:T} = emit{[16]i8, '_mm_packus_epi16', a, b} 139 | def __movemask{a:([16]i8)} = emit{i32, '_mm_movemask_epi8', a} 140 | def __movemask{a:([2]f64)} = emit{i32, '_mm_movemask_pd', a} 141 | def __sad{a:T==[16]u8, b:T} = emit{[8]u16, '_mm_sad_epu8', a, b} 142 | def __movpi64{a:([1]i64)} = emit{[2]i64, '_mm_movpi64_epi64', a} 143 | def __move{a:T==[2]i64} = emit{T, '_mm_move_epi64', a} 144 | def __setzero_2f64{} = emit{[2]f64, '_mm_setzero_pd'} 145 | def __maskmoveu{a:T==[16]u8, mask:T, mem_addr:*(u8)} = emit{void, '_mm_maskmoveu_si128', a, mask, mem_addr} 146 | def __storel{mem_addr:*T==[2]i64, a:T} = emit{void, '_mm_storel_epi64', mem_addr, a} 147 | def __stream{mem_addr:*(void), a:T if intvec{128,T}} = emit{void, '_mm_stream_si128', mem_addr, a} 148 | def __stream{mem_addr:*(void), a:(i32)} = emit{void, '_mm_stream_si32', mem_addr, a} 149 | def __stream{mem_addr:*(void), a:(u64)} = emit{void, '_mm_stream_si64', mem_addr, a} 150 | def __stream{mem_addr:*(void), a:([2]f64)} = emit{void, '_mm_stream_pd', mem_addr, a} 151 | def __store1{mem_addr:*(f64), a:([2]f64)} = emit{void, '_mm_store1_pd', mem_addr, a} 152 | def __storer{mem_addr:*(f64), a:([2]f64)} = emit{void, '_mm_storer_pd', mem_addr, a} 153 | def __storeh{mem_addr:*(f64), a:([2]f64)} = emit{void, '_mm_storeh_pd', mem_addr, a} 154 | def __storel{mem_addr:*(f64), a:([2]f64)} = emit{void, '_mm_storel_pd', mem_addr, a} 155 | 156 | 157 | #SSE3 158 | def __addsub{a:T==[4]f32, b:T} = emit{T, '_mm_addsub_ps', a, b} 159 | def __addsub{a:T==[2]f64, b:T} = emit{T, '_mm_addsub_pd', a, b} 160 | def __hadd{a:T==[2]f64, b:T} = emit{T, '_mm_hadd_pd', a, b} 161 | def __hadd{a:T==[4]f32, b:T} = emit{T, '_mm_hadd_ps', a, b} 162 | def __hsub{a:T==[2]f64, b:T} = emit{T, '_mm_hsub_pd', a, b} 163 | def __hsub{a:T==[4]f32, b:T} = emit{T, '_mm_hsub_ps', a, b} 164 | def __loaddup{mem_addr:*(f64)} = emit{[2]f64, '_mm_loaddup_pd', mem_addr} 165 | def __movedup{a:T==[2]f64} = emit{T, '_mm_movedup_pd', a} 166 | def __movehdup{a:T==[4]f32} = emit{T, '_mm_movehdup_ps', a} 167 | def __moveldup{a:T==[4]f32} = emit{T, '_mm_moveldup_ps', a} 168 | 169 | 170 | #SSSE3 171 | def __hadd{a:T==[8]i16, b:T} = emit{T, '_mm_hadd_epi16', a, b} 172 | def __hadds{a:T==[8]i16, b:T} = emit{T, '_mm_hadds_epi16', a, b} 173 | def __hadd{a:T==[4]i32, b:T} = emit{T, '_mm_hadd_epi32', a, b} 174 | def __hadd{a:T==[4]i16, b:T} = emit{T, '_mm_hadd_pi16', a, b} 175 | def __hadd{a:T==[2]i32, b:T} = emit{T, '_mm_hadd_pi32', a, b} 176 | def __hadds{a:T==[4]i16, b:T} = emit{T, '_mm_hadds_pi16', a, b} 177 | def __hsub{a:T==[8]i16, b:T} = emit{T, '_mm_hsub_epi16', a, b} 178 | def __hsubs{a:T==[8]i16, b:T} = emit{T, '_mm_hsubs_epi16', a, b} 179 | def __hsub{a:T==[4]i32, b:T} = emit{T, '_mm_hsub_epi32', a, b} 180 | def __hsub{a:T==[4]i16, b:T} = emit{T, '_mm_hsub_pi16', a, b} 181 | def __hsub{a:T==[2]i32, b:T} = emit{T, '_mm_hsub_pi32', a, b} 182 | def __hsubs{a:T==[4]i16, b:T} = emit{T, '_mm_hsubs_pi16', a, b} 183 | def __maddubs{a:T==[16]i8, b:T} = emit{[8]i16, '_mm_maddubs_epi16', a, b} 184 | def __maddubs{a:T==[8]i8, b:T} = emit{[4]i16, '_mm_maddubs_pi16', a, b} 185 | def 
__mulhrs{a:T==[8]i16, b:T} = emit{T, '_mm_mulhrs_epi16', a, b} 186 | def __mulhrs{a:T==[4]i16, b:T} = emit{T, '_mm_mulhrs_pi16', a, b} 187 | 188 | 189 | #SSE4.1 190 | def __dp{a:T==[2]f64, b:T, imm8 if num{imm8}} = emit{T, '_mm_dp_pd', a, b, imm8} 191 | def __dp{a:T==[4]f32, b:T, imm8 if num{imm8}} = emit{T, '_mm_dp_ps', a, b, imm8} 192 | def __mul{a:T==[4]i32, b:T} = emit{[2]i64, '_mm_mul_epi32', a, b} 193 | def __cvtepi8_8i16{a:([16]i8)} = emit{[8]i16, '_mm_cvtepi8_epi16', a} 194 | def __cvtepi8_4i32{a:([16]i8)} = emit{[4]i32, '_mm_cvtepi8_epi32', a} 195 | def __cvtepi8_2i64{a:([16]i8)} = emit{[2]i64, '_mm_cvtepi8_epi64', a} 196 | def __cvtepi16_4i32{a:([8]i16)} = emit{[4]i32, '_mm_cvtepi16_epi32', a} 197 | def __cvtepi16_2i64{a:([8]i16)} = emit{[2]i64, '_mm_cvtepi16_epi64', a} 198 | def __cvtepi32_2i64{a:([4]i32)} = emit{[2]i64, '_mm_cvtepi32_epi64', a} 199 | def __cvtepu8_8i16{a:([16]i8)} = emit{[8]i16, '_mm_cvtepu8_epi16', a} 200 | def __cvtepu8_4i32{a:([16]i8)} = emit{[4]i32, '_mm_cvtepu8_epi32', a} 201 | def __cvtepu8_2i64{a:([16]i8)} = emit{[2]i64, '_mm_cvtepu8_epi64', a} 202 | def __cvtepu16_4i32{a:([8]i16)} = emit{[4]i32, '_mm_cvtepu16_epi32', a} 203 | def __cvtepu16_2i64{a:([8]i16)} = emit{[2]i64, '_mm_cvtepu16_epi64', a} 204 | def __cvtepu32_2i64{a:([4]i32)} = emit{[2]i64, '_mm_cvtepu32_epi64', a} 205 | def __testz{a:T, b:T if intvec{128,T}} = emit{i32, '_mm_testz_si128', a, b} 206 | def __testc{a:T, b:T if intvec{128,T}} = emit{i32, '_mm_testc_si128', a, b} 207 | def __testnzc{a:T, b:T if intvec{128,T}} = emit{i32, '_mm_testnzc_si128', a, b} 208 | def __test_all_zeros{mask:T, a:T if intvec{128,T}} = emit{i32, '_mm_test_all_zeros', mask, a} 209 | def __test_mix_ones_zeros{mask:T, a:T if intvec{128,T}} = emit{i32, '_mm_test_mix_ones_zeros', mask, a} 210 | def __test{a:T if intvec{128,T}} = emit{i32, '_mm_test_all_ones', a} 211 | def __minpos{a:T==[8]u16} = emit{T, '_mm_minpos_epu16', a} 212 | def __mpsadbw{a:T==[16]u8, b:T, imm8 if num{imm8}} = emit{T, '_mm_mpsadbw_epu8', a, b, imm8} 213 | def __packus{a:T==[4]i32, b:T} = emit{[8]i16, '_mm_packus_epi32', a, b} 214 | 215 | 216 | #SSE4.2 217 | def __crc32{crc:T==u32, v:(u8)} = emit{T, '_mm_crc32_u8', crc, v} 218 | def __crc32{crc:T==u32, v:(u16)} = emit{T, '_mm_crc32_u16', crc, v} 219 | def __crc32{crc:T==u32, v:T} = emit{T, '_mm_crc32_u32', crc, v} 220 | def __crc32{crc:T==u64, v:T} = emit{T, '_mm_crc32_u64', crc, v} 221 | def __cmpistrm{a:T, b:T, imm8 if intvec{128,T} and num{imm8}} = emit{T, '_mm_cmpistrm', a, b, imm8} 222 | def __cmpistri{a:T, b:T, imm8 if intvec{128,T} and num{imm8}} = emit{i32, '_mm_cmpistri', a, b, imm8} 223 | def __cmpistrz{a:T, b:T, imm8 if intvec{128,T} and num{imm8}} = emit{i32, '_mm_cmpistrz', a, b, imm8} 224 | def __cmpistrc{a:T, b:T, imm8 if intvec{128,T} and num{imm8}} = emit{i32, '_mm_cmpistrc', a, b, imm8} 225 | def __cmpistrs{a:T, b:T, imm8 if intvec{128,T} and num{imm8}} = emit{i32, '_mm_cmpistrs', a, b, imm8} 226 | def __cmpistro{a:T, b:T, imm8 if intvec{128,T} and num{imm8}} = emit{i32, '_mm_cmpistro', a, b, imm8} 227 | def __cmpistra{a:T, b:T, imm8 if intvec{128,T} and num{imm8}} = emit{i32, '_mm_cmpistra', a, b, imm8} 228 | def __cmpestrm{a:T, la:S==i32, b:T, lb:S, imm8 if intvec{128,T} and num{imm8}} = emit{T, '_mm_cmpestrm', a, la, b, lb, imm8} 229 | def __cmpestri{a:T, la:S==i32, b:T, lb:S, imm8 if intvec{128,T} and num{imm8}} = emit{S, '_mm_cmpestri', a, la, b, lb, imm8} 230 | def __cmpestrz{a:T, la:S==i32, b:T, lb:S, imm8 if intvec{128,T} and num{imm8}} = emit{S, '_mm_cmpestrz', a, la, 
b, lb, imm8} 231 | def __cmpestrc{a:T, la:S==i32, b:T, lb:S, imm8 if intvec{128,T} and num{imm8}} = emit{S, '_mm_cmpestrc', a, la, b, lb, imm8} 232 | def __cmpestrs{a:T, la:S==i32, b:T, lb:S, imm8 if intvec{128,T} and num{imm8}} = emit{S, '_mm_cmpestrs', a, la, b, lb, imm8} 233 | def __cmpestro{a:T, la:S==i32, b:T, lb:S, imm8 if intvec{128,T} and num{imm8}} = emit{S, '_mm_cmpestro', a, la, b, lb, imm8} 234 | def __cmpestra{a:T, la:S==i32, b:T, lb:S, imm8 if intvec{128,T} and num{imm8}} = emit{S, '_mm_cmpestra', a, la, b, lb, imm8} 235 | 236 | 237 | #AVX 238 | def __addsub{a:T==[4]f64, b:T} = emit{T, '_mm256_addsub_pd', a, b} 239 | def __addsub{a:T==[8]f32, b:T} = emit{T, '_mm256_addsub_ps', a, b} 240 | def __dp{a:T==[8]f32, b:T, imm8 if num{imm8}} = emit{T, '_mm256_dp_ps', a, b, imm8} 241 | def __hadd{a:T==[4]f64, b:T} = emit{T, '_mm256_hadd_pd', a, b} 242 | def __hadd{a:T==[8]f32, b:T} = emit{T, '_mm256_hadd_ps', a, b} 243 | def __hsub{a:T==[4]f64, b:T} = emit{T, '_mm256_hsub_pd', a, b} 244 | def __hsub{a:T==[8]f32, b:T} = emit{T, '_mm256_hsub_ps', a, b} 245 | def __castpd{a:([4]f64)} = emit{[8]f32, '_mm256_castpd_ps', a} 246 | def __castps{a:([8]f32)} = emit{[4]f64, '_mm256_castps_pd', a} 247 | def __castsi256{a:([8]u32)} = emit{[8]f32, '_mm256_castsi256_ps', a} 248 | def __castsi256{a:([4]u64)} = emit{[4]f64, '_mm256_castsi256_pd', a} 249 | def __castps256{a:([8]f32)} = emit{[4]f32, '_mm256_castps256_ps128', a} 250 | def __castpd256{a:([4]f64)} = emit{[2]f64, '_mm256_castpd256_pd128', a} 251 | def __castps128{a:([4]f32)} = emit{[8]f32, '_mm256_castps128_ps256', a} 252 | def __castpd128{a:([2]f64)} = emit{[4]f64, '_mm256_castpd128_pd256', a} 253 | def __zextps128{a:([4]f32)} = emit{[8]f32, '_mm256_zextps128_ps256', a} 254 | def __zextpd128{a:([2]f64)} = emit{[4]f64, '_mm256_zextpd128_pd256', a} 255 | def __cmp{a:T==[2]f64, b:T, imm8 if num{imm8}} = emit{T, '_mm_cmp_pd', a, b, imm8} 256 | def __cmp{a:T==[4]f32, b:T, imm8 if num{imm8}} = emit{T, '_mm_cmp_ps', a, b, imm8} 257 | def __cvtepi32_4f64{a:([4]i32)} = emit{[4]f64, '_mm256_cvtepi32_pd', a} 258 | def __cvtepi32{a:([8]i32)} = emit{[8]f32, '_mm256_cvtepi32_ps', a} 259 | def __cvtpd_4f32{a:([4]f64)} = emit{[4]f32, '_mm256_cvtpd_ps', a} 260 | def __cvtps{a:([8]f32)} = emit{[8]i32, '_mm256_cvtps_epi32', a} 261 | def __cvtps_4f64{a:([4]f32)} = emit{[4]f64, '_mm256_cvtps_pd', a} 262 | def __cvttpd{a:([4]f64)} = emit{[4]i32, '_mm256_cvttpd_epi32', a} 263 | def __cvtpd_4i32{a:([4]f64)} = emit{[4]i32, '_mm256_cvtpd_epi32', a} 264 | def __cvttps{a:([8]f32)} = emit{[8]i32, '_mm256_cvttps_epi32', a} 265 | def __cvtss{a:([8]f32)} = emit{f32, '_mm256_cvtss_f32', a} 266 | def __cvtsd{a:([4]f64)} = emit{f64, '_mm256_cvtsd_f64', a} 267 | def __cvtsi256{a:([8]u32)} = emit{i32, '_mm256_cvtsi256_si32', a} 268 | def __rcp{a:T==[8]f32} = emit{T, '_mm256_rcp_ps', a} 269 | def __rsqrt{a:T==[8]f32} = emit{T, '_mm256_rsqrt_ps', a} 270 | def __zeroall{} = emit{void, '_mm256_zeroall'} 271 | def __zeroupper{} = emit{void, '_mm256_zeroupper'} 272 | def __undefined_8f32{} = emit{[8]f32, '_mm256_undefined_ps'} 273 | def __undefined_4f64{} = emit{[4]f64, '_mm256_undefined_pd'} 274 | def __maskload{mem_addr:*(f64), mask:T if intvec{256,T}} = emit{[4]f64, '_mm256_maskload_pd', mem_addr, mask} 275 | def __maskload{mem_addr:*(f64), mask:T if intvec{128,T}} = emit{[2]f64, '_mm_maskload_pd', mem_addr, mask} 276 | def __maskload{mem_addr:*(f32), mask:T if intvec{256,T}} = emit{[8]f32, '_mm256_maskload_ps', mem_addr, mask} 277 | def __maskload{mem_addr:*(f32), mask:T if 
intvec{128,T}} = emit{[4]f32, '_mm_maskload_ps', mem_addr, mask} 278 | def __loadu2{hiaddr:*T==f32, loaddr:*T} = emit{[8]f32, '_mm256_loadu2_m128', hiaddr, loaddr} 279 | def __loadu2{hiaddr:*T==f64, loaddr:*T} = emit{[4]f64, '_mm256_loadu2_m128d', hiaddr, loaddr} 280 | def __testz{a:T, b:T if intvec{256,T}} = emit{i32, '_mm256_testz_si256', a, b} 281 | def __testc{a:T, b:T if intvec{256,T}} = emit{i32, '_mm256_testc_si256', a, b} 282 | def __testnzc{a:T, b:T if intvec{256,T}} = emit{i32, '_mm256_testnzc_si256', a, b} 283 | def __testz{a:T==[4]f64, b:T} = emit{i32, '_mm256_testz_pd', a, b} 284 | def __testc{a:T==[4]f64, b:T} = emit{i32, '_mm256_testc_pd', a, b} 285 | def __testnzc{a:T==[4]f64, b:T} = emit{i32, '_mm256_testnzc_pd', a, b} 286 | def __testz{a:T==[2]f64, b:T} = emit{i32, '_mm_testz_pd', a, b} 287 | def __testc{a:T==[2]f64, b:T} = emit{i32, '_mm_testc_pd', a, b} 288 | def __testnzc{a:T==[2]f64, b:T} = emit{i32, '_mm_testnzc_pd', a, b} 289 | def __testz{a:T==[8]f32, b:T} = emit{i32, '_mm256_testz_ps', a, b} 290 | def __testc{a:T==[8]f32, b:T} = emit{i32, '_mm256_testc_ps', a, b} 291 | def __testnzc{a:T==[8]f32, b:T} = emit{i32, '_mm256_testnzc_ps', a, b} 292 | def __testz{a:T==[4]f32, b:T} = emit{i32, '_mm_testz_ps', a, b} 293 | def __testc{a:T==[4]f32, b:T} = emit{i32, '_mm_testc_ps', a, b} 294 | def __testnzc{a:T==[4]f32, b:T} = emit{i32, '_mm_testnzc_ps', a, b} 295 | def __movemask{a:([4]f64)} = emit{i32, '_mm256_movemask_pd', a} 296 | def __movemask{a:([8]f32)} = emit{i32, '_mm256_movemask_ps', a} 297 | def __movehdup{a:T==[8]f32} = emit{T, '_mm256_movehdup_ps', a} 298 | def __moveldup{a:T==[8]f32} = emit{T, '_mm256_moveldup_ps', a} 299 | def __movedup{a:T==[4]f64} = emit{T, '_mm256_movedup_pd', a} 300 | def __setzero_4f64{} = emit{[4]f64, '_mm256_setzero_pd'} 301 | def __setzero_8f32{} = emit{[8]f32, '_mm256_setzero_ps'} 302 | def __set{hi:T==[4]f32, lo:T} = emit{[8]f32, '_mm256_set_m128', hi, lo} 303 | def __set{hi:T==[2]f64, lo:T} = emit{[4]f64, '_mm256_set_m128d', hi, lo} 304 | def __setr{lo:T==[4]f32, hi:T} = emit{[8]f32, '_mm256_setr_m128', lo, hi} 305 | def __setr{lo:T==[2]f64, hi:T} = emit{[4]f64, '_mm256_setr_m128d', lo, hi} 306 | def __maskstore{mem_addr:*(f64), mask:T, a:([4]f64) if intvec{256,T}} = emit{void, '_mm256_maskstore_pd', mem_addr, mask, a} 307 | def __maskstore{mem_addr:*(f64), mask:T, a:([2]f64) if intvec{128,T}} = emit{void, '_mm_maskstore_pd', mem_addr, mask, a} 308 | def __maskstore{mem_addr:*(f32), mask:T, a:([8]f32) if intvec{256,T}} = emit{void, '_mm256_maskstore_ps', mem_addr, mask, a} 309 | def __maskstore{mem_addr:*(f32), mask:T, a:([4]f32) if intvec{128,T}} = emit{void, '_mm_maskstore_ps', mem_addr, mask, a} 310 | def __stream{mem_addr:*(void), a:T if intvec{256,T}} = emit{void, '_mm256_stream_si256', mem_addr, a} 311 | def __stream{mem_addr:*(void), a:([4]f64)} = emit{void, '_mm256_stream_pd', mem_addr, a} 312 | def __stream{mem_addr:*(void), a:([8]f32)} = emit{void, '_mm256_stream_ps', mem_addr, a} 313 | def __storeu2{hiaddr:*T==f32, loaddr:*T, a:([8]f32)} = emit{void, '_mm256_storeu2_m128', hiaddr, loaddr, a} 314 | def __storeu2{hiaddr:*T==f64, loaddr:*T, a:([4]f64)} = emit{void, '_mm256_storeu2_m128d', hiaddr, loaddr, a} 315 | def __extractf128{a:([8]f32), imm8 if num{imm8}} = emit{[4]f32, '_mm256_extractf128_ps', a, imm8} 316 | def __extractf128{a:([4]f64), imm8 if num{imm8}} = emit{[2]f64, '_mm256_extractf128_pd', a, imm8} 317 | def __insertf128{a:T==[8]f32, b:([4]f32), imm8 if num{imm8}} = emit{T, '_mm256_insertf128_ps', a, b, imm8} 
318 | def __insertf128{a:T==[4]f64, b:([2]f64), imm8 if num{imm8}} = emit{T, '_mm256_insertf128_pd', a, b, imm8} 319 | def __insertf128{a:T, b:S, imm8 if intvec{256,T} and intvec{128,S} and num{imm8}} = emit{T, '_mm256_insertf128_si256', a, b, imm8} 320 | def __broadcast{mem_addr:*([4]f32)} = emit{[8]f32, '_mm256_broadcast_ps', mem_addr} 321 | def __broadcast{mem_addr:*([2]f64)} = emit{[4]f64, '_mm256_broadcast_pd', mem_addr} 322 | 323 | 324 | #AVX2 325 | def __hadd{a:T==[16]i16, b:T} = emit{T, '_mm256_hadd_epi16', a, b} 326 | def __hadd{a:T==[8]i32, b:T} = emit{T, '_mm256_hadd_epi32', a, b} 327 | def __hadds{a:T==[16]i16, b:T} = emit{T, '_mm256_hadds_epi16', a, b} 328 | def __hsub{a:T==[16]i16, b:T} = emit{T, '_mm256_hsub_epi16', a, b} 329 | def __hsub{a:T==[8]i32, b:T} = emit{T, '_mm256_hsub_epi32', a, b} 330 | def __hsubs{a:T==[16]i16, b:T} = emit{T, '_mm256_hsubs_epi16', a, b} 331 | def __madd{a:T==[16]i16, b:T} = emit{[8]i32, '_mm256_madd_epi16', a, b} 332 | def __maddubs{a:T==[32]i8, b:T} = emit{[16]i16, '_mm256_maddubs_epi16', a, b} 333 | def __mul{a:T==[8]i32, b:T} = emit{[4]i64, '_mm256_mul_epi32', a, b} 334 | def __mul{a:T==[8]u32, b:T} = emit{[4]u64, '_mm256_mul_epu32', a, b} 335 | def __mulhi{a:T==[16]i16, b:T} = emit{T, '_mm256_mulhi_epi16', a, b} 336 | def __mulhi{a:T==[16]u16, b:T} = emit{T, '_mm256_mulhi_epu16', a, b} 337 | def __mulhrs{a:T==[16]i16, b:T} = emit{T, '_mm256_mulhrs_epi16', a, b} 338 | def __sad{a:T==[32]u8, b:T} = emit{[16]u16, '_mm256_sad_epu8', a, b} 339 | def __cvtepi16_8i32{a:([8]i16)} = emit{[8]i32, '_mm256_cvtepi16_epi32', a} 340 | def __cvtepi16_4i64{a:([8]i16)} = emit{[4]i64, '_mm256_cvtepi16_epi64', a} 341 | def __cvtepi32_4i64{a:([4]i32)} = emit{[4]i64, '_mm256_cvtepi32_epi64', a} 342 | def __cvtepi8_16i16{a:([16]i8)} = emit{[16]i16, '_mm256_cvtepi8_epi16', a} 343 | def __cvtepi8_8i32{a:([16]i8)} = emit{[8]i32, '_mm256_cvtepi8_epi32', a} 344 | def __cvtepi8_4i64{a:([16]i8)} = emit{[4]i64, '_mm256_cvtepi8_epi64', a} 345 | def __cvtepu16_8i32{a:([8]i16)} = emit{[8]i32, '_mm256_cvtepu16_epi32', a} 346 | def __cvtepu16_4i64{a:([8]i16)} = emit{[4]i64, '_mm256_cvtepu16_epi64', a} 347 | def __cvtepu32_4i64{a:([4]i32)} = emit{[4]i64, '_mm256_cvtepu32_epi64', a} 348 | def __cvtepu8_16i16{a:([16]i8)} = emit{[16]i16, '_mm256_cvtepu8_epi16', a} 349 | def __cvtepu8_8i32{a:([16]i8)} = emit{[8]i32, '_mm256_cvtepu8_epi32', a} 350 | def __cvtepu8_4i64{a:([16]i8)} = emit{[4]i64, '_mm256_cvtepu8_epi64', a} 351 | def __i32gather_2f64{base_addr:*(f64), vindex:([4]i32), scale if num{scale}} = emit{[2]f64, '_mm_i32gather_pd', base_addr, vindex, scale} 352 | def __i32gather_4f64{base_addr:*(f64), vindex:([4]i32), scale if num{scale}} = emit{[4]f64, '_mm256_i32gather_pd', base_addr, vindex, scale} 353 | def __i32gather{base_addr:*(f32), vindex:([4]i32), scale if num{scale}} = emit{[4]f32, '_mm_i32gather_ps', base_addr, vindex, scale} 354 | def __i32gather{base_addr:*(f32), vindex:([8]i32), scale if num{scale}} = emit{[8]f32, '_mm256_i32gather_ps', base_addr, vindex, scale} 355 | def __i32gather{base_addr:*(i32), vindex:T==[4]i32, scale if num{scale}} = emit{T, '_mm_i32gather_epi32', base_addr, vindex, scale} 356 | def __i32gather{base_addr:*(i32), vindex:T==[8]i32, scale if num{scale}} = emit{T, '_mm256_i32gather_epi32', base_addr, vindex, scale} 357 | def __i32gather_2i64{base_addr:*(i64), vindex:([4]i32), scale if num{scale}} = emit{[2]i64, '_mm_i32gather_epi64', base_addr, vindex, scale} 358 | def __i32gather_4i64{base_addr:*(i64), vindex:([4]i32), scale if num{scale}} = 
emit{[4]i64, '_mm256_i32gather_epi64', base_addr, vindex, scale} 359 | def __i64gather{base_addr:*(f64), vindex:([2]i64), scale if num{scale}} = emit{[2]f64, '_mm_i64gather_pd', base_addr, vindex, scale} 360 | def __i64gather{base_addr:*(f64), vindex:([4]i64), scale if num{scale}} = emit{[4]f64, '_mm256_i64gather_pd', base_addr, vindex, scale} 361 | def __i64gather{base_addr:*(f32), vindex:([2]i64), scale if num{scale}} = emit{[4]f32, '_mm_i64gather_ps', base_addr, vindex, scale} 362 | def __i64gather{base_addr:*(f32), vindex:([4]i64), scale if num{scale}} = emit{[4]f32, '_mm256_i64gather_ps', base_addr, vindex, scale} 363 | def __i64gather{base_addr:*(i32), vindex:([2]i64), scale if num{scale}} = emit{[4]i32, '_mm_i64gather_epi32', base_addr, vindex, scale} 364 | def __i64gather{base_addr:*(i32), vindex:([4]i64), scale if num{scale}} = emit{[4]i32, '_mm256_i64gather_epi32', base_addr, vindex, scale} 365 | def __i64gather{base_addr:*(i64), vindex:T==[2]i64, scale if num{scale}} = emit{T, '_mm_i64gather_epi64', base_addr, vindex, scale} 366 | def __i64gather{base_addr:*(i64), vindex:T==[4]i64, scale if num{scale}} = emit{T, '_mm256_i64gather_epi64', base_addr, vindex, scale} 367 | def __mask_i32gather{src:T==[2]f64, base_addr:*(f64), vindex:([4]i32), mask:S, scale if intvec{128,S} and num{scale}} = emit{T, '_mm_mask_i32gather_pd', src, base_addr, vindex, mask, scale} 368 | def __mask_i32gather{src:T==[4]f64, base_addr:*(f64), vindex:([4]i32), mask:S, scale if intvec{256,S} and num{scale}} = emit{T, '_mm256_mask_i32gather_pd', src, base_addr, vindex, mask, scale} 369 | def __mask_i32gather{src:T==[4]f32, base_addr:*(f32), vindex:([4]i32), mask:S, scale if intvec{128,S} and num{scale}} = emit{T, '_mm_mask_i32gather_ps', src, base_addr, vindex, mask, scale} 370 | def __mask_i32gather{src:T==[8]f32, base_addr:*(f32), vindex:([8]i32), mask:S, scale if intvec{256,S} and num{scale}} = emit{T, '_mm256_mask_i32gather_ps', src, base_addr, vindex, mask, scale} 371 | def __mask_i32gather{src:T==[4]i32, base_addr:*(i32), vindex:T, mask:S, scale if intvec{128,S} and num{scale}} = emit{T, '_mm_mask_i32gather_epi32', src, base_addr, vindex, mask, scale} 372 | def __mask_i32gather{src:T==[8]i32, base_addr:*(i32), vindex:T, mask:S, scale if intvec{256,S} and num{scale}} = emit{T, '_mm256_mask_i32gather_epi32', src, base_addr, vindex, mask, scale} 373 | def __mask_i32gather{src:T==[2]i64, base_addr:*(i64), vindex:([4]i32), mask:S, scale if intvec{128,S} and num{scale}} = emit{T, '_mm_mask_i32gather_epi64', src, base_addr, vindex, mask, scale} 374 | def __mask_i32gather{src:T==[4]i64, base_addr:*(i64), vindex:([4]i32), mask:S, scale if intvec{256,S} and num{scale}} = emit{T, '_mm256_mask_i32gather_epi64', src, base_addr, vindex, mask, scale} 375 | def __mask_i64gather{src:T==[2]f64, base_addr:*(f64), vindex:([2]i64), mask:S, scale if intvec{128,S} and num{scale}} = emit{T, '_mm_mask_i64gather_pd', src, base_addr, vindex, mask, scale} 376 | def __mask_i64gather{src:T==[4]f64, base_addr:*(f64), vindex:([4]i64), mask:S, scale if intvec{256,S} and num{scale}} = emit{T, '_mm256_mask_i64gather_pd', src, base_addr, vindex, mask, scale} 377 | def __mask_i64gather{src:T==[4]f32, base_addr:*(f32), vindex:([2]i64), mask:S, scale if intvec{128,S} and num{scale}} = emit{T, '_mm_mask_i64gather_ps', src, base_addr, vindex, mask, scale} 378 | def __mask_i64gather{src:T==[4]f32, base_addr:*(f32), vindex:([4]i64), mask:S, scale if intvec{128,S} and num{scale}} = emit{T, '_mm256_mask_i64gather_ps', src, base_addr, vindex, mask, 
scale} 379 | def __mask_i64gather{src:T==[4]i32, base_addr:*(i32), vindex:([2]i64), mask:S, scale if intvec{128,S} and num{scale}} = emit{T, '_mm_mask_i64gather_epi32', src, base_addr, vindex, mask, scale} 380 | def __mask_i64gather{src:T==[4]i32, base_addr:*(i32), vindex:([4]i64), mask:S, scale if intvec{128,S} and num{scale}} = emit{T, '_mm256_mask_i64gather_epi32', src, base_addr, vindex, mask, scale} 381 | def __mask_i64gather{src:T==[2]i64, base_addr:*(i64), vindex:T, mask:S, scale if intvec{128,S} and num{scale}} = emit{T, '_mm_mask_i64gather_epi64', src, base_addr, vindex, mask, scale} 382 | def __mask_i64gather{src:T==[4]i64, base_addr:*(i64), vindex:T, mask:S, scale if intvec{256,S} and num{scale}} = emit{T, '_mm256_mask_i64gather_epi64', src, base_addr, vindex, mask, scale} 383 | def __maskload{mem_addr:*(i32), mask:T if intvec{128,T}} = emit{[4]i32, '_mm_maskload_epi32', mem_addr, mask} 384 | def __maskload{mem_addr:*(i32), mask:T if intvec{256,T}} = emit{[8]i32, '_mm256_maskload_epi32', mem_addr, mask} 385 | def __maskload{mem_addr:*(i64), mask:T if intvec{128,T}} = emit{[2]i64, '_mm_maskload_epi64', mem_addr, mask} 386 | def __maskload{mem_addr:*(i64), mask:T if intvec{256,T}} = emit{[4]i64, '_mm256_maskload_epi64', mem_addr, mask} 387 | def __movemask{a:([32]i8)} = emit{i32, '_mm256_movemask_epi8', a} 388 | def __mpsadbw{a:T==[32]u8, b:T, imm8 if num{imm8}} = emit{T, '_mm256_mpsadbw_epu8', a, b, imm8} 389 | def __packs{a:T==[16]i16, b:T} = emit{[32]i8, '_mm256_packs_epi16', a, b} 390 | def __packs{a:T==[8]i32, b:T} = emit{[16]i16, '_mm256_packs_epi32', a, b} 391 | def __packus{a:T==[16]i16, b:T} = emit{[32]i8, '_mm256_packus_epi16', a, b} 392 | def __packus{a:T==[8]i32, b:T} = emit{[16]i16, '_mm256_packus_epi32', a, b} 393 | def __maskstore{mem_addr:*(i32), mask:T, a:([4]i32) if intvec{128,T}} = emit{void, '_mm_maskstore_epi32', mem_addr, mask, a} 394 | def __maskstore{mem_addr:*(i32), mask:T, a:([8]i32) if intvec{256,T}} = emit{void, '_mm256_maskstore_epi32', mem_addr, mask, a} 395 | def __maskstore{mem_addr:*(i64), mask:T, a:([2]i64) if intvec{128,T}} = emit{void, '_mm_maskstore_epi64', mem_addr, mask, a} 396 | def __maskstore{mem_addr:*(i64), mask:T, a:([4]i64) if intvec{256,T}} = emit{void, '_mm256_maskstore_epi64', mem_addr, mask, a} 397 | def __broadcastb_16i8{a:T==[16]i8} = emit{T, '_mm_broadcastb_epi8', a} 398 | def __broadcastb_32i8{a:([16]i8)} = emit{[32]i8, '_mm256_broadcastb_epi8', a} 399 | def __broadcastd_4i32{a:T==[4]i32} = emit{T, '_mm_broadcastd_epi32', a} 400 | def __broadcastd_8i32{a:([4]i32)} = emit{[8]i32, '_mm256_broadcastd_epi32', a} 401 | def __broadcastq_2i64{a:T==[2]i64} = emit{T, '_mm_broadcastq_epi64', a} 402 | def __broadcastq_4i64{a:([2]i64)} = emit{[4]i64, '_mm256_broadcastq_epi64', a} 403 | def __broadcastsd_2f64{a:T==[2]f64} = emit{T, '_mm_broadcastsd_pd', a} 404 | def __broadcastsd_4f64{a:([2]f64)} = emit{[4]f64, '_mm256_broadcastsd_pd', a} 405 | def __broadcastss_4f32{a:T==[4]f32} = emit{T, '_mm_broadcastss_ps', a} 406 | def __broadcastss_8f32{a:([4]f32)} = emit{[8]f32, '_mm256_broadcastss_ps', a} 407 | def __broadcastw_8i16{a:T==[8]i16} = emit{T, '_mm_broadcastw_epi16', a} 408 | def __broadcastw_16i16{a:([8]i16)} = emit{[16]i16, '_mm256_broadcastw_epi16', a} 409 | def __inserti128{a:T, b:S, imm8 if intvec{256,T} and intvec{128,S} and num{imm8}} = emit{T, '_mm256_inserti128_si256', a, b, imm8} 410 | 411 | 412 | #FMA 413 | def __fmadd{a:T==[2]f64, b:T, c:T} = emit{T, '_mm_fmadd_pd', a, b, c} 414 | def __fmadd{a:T==[4]f64, b:T, c:T} = emit{T, 
'_mm256_fmadd_pd', a, b, c} 415 | def __fmadd{a:T==[4]f32, b:T, c:T} = emit{T, '_mm_fmadd_ps', a, b, c} 416 | def __fmadd{a:T==[8]f32, b:T, c:T} = emit{T, '_mm256_fmadd_ps', a, b, c} 417 | def __fmaddsub{a:T==[2]f64, b:T, c:T} = emit{T, '_mm_fmaddsub_pd', a, b, c} 418 | def __fmaddsub{a:T==[4]f64, b:T, c:T} = emit{T, '_mm256_fmaddsub_pd', a, b, c} 419 | def __fmaddsub{a:T==[4]f32, b:T, c:T} = emit{T, '_mm_fmaddsub_ps', a, b, c} 420 | def __fmaddsub{a:T==[8]f32, b:T, c:T} = emit{T, '_mm256_fmaddsub_ps', a, b, c} 421 | def __fmsub{a:T==[2]f64, b:T, c:T} = emit{T, '_mm_fmsub_pd', a, b, c} 422 | def __fmsub{a:T==[4]f64, b:T, c:T} = emit{T, '_mm256_fmsub_pd', a, b, c} 423 | def __fmsub{a:T==[4]f32, b:T, c:T} = emit{T, '_mm_fmsub_ps', a, b, c} 424 | def __fmsub{a:T==[8]f32, b:T, c:T} = emit{T, '_mm256_fmsub_ps', a, b, c} 425 | def __fmsubadd{a:T==[2]f64, b:T, c:T} = emit{T, '_mm_fmsubadd_pd', a, b, c} 426 | def __fmsubadd{a:T==[4]f64, b:T, c:T} = emit{T, '_mm256_fmsubadd_pd', a, b, c} 427 | def __fmsubadd{a:T==[4]f32, b:T, c:T} = emit{T, '_mm_fmsubadd_ps', a, b, c} 428 | def __fmsubadd{a:T==[8]f32, b:T, c:T} = emit{T, '_mm256_fmsubadd_ps', a, b, c} 429 | def __fnmadd{a:T==[2]f64, b:T, c:T} = emit{T, '_mm_fnmadd_pd', a, b, c} 430 | def __fnmadd{a:T==[4]f64, b:T, c:T} = emit{T, '_mm256_fnmadd_pd', a, b, c} 431 | def __fnmadd{a:T==[4]f32, b:T, c:T} = emit{T, '_mm_fnmadd_ps', a, b, c} 432 | def __fnmadd{a:T==[8]f32, b:T, c:T} = emit{T, '_mm256_fnmadd_ps', a, b, c} 433 | def __fnmsub{a:T==[2]f64, b:T, c:T} = emit{T, '_mm_fnmsub_pd', a, b, c} 434 | def __fnmsub{a:T==[4]f64, b:T, c:T} = emit{T, '_mm256_fnmsub_pd', a, b, c} 435 | def __fnmsub{a:T==[4]f32, b:T, c:T} = emit{T, '_mm_fnmsub_ps', a, b, c} 436 | def __fnmsub{a:T==[8]f32, b:T, c:T} = emit{T, '_mm256_fnmsub_ps', a, b, c} 437 | -------------------------------------------------------------------------------- /include/arch/iintrinsic/select.singeli: -------------------------------------------------------------------------------- 1 | local { 2 | include 'skin/cop' 3 | include 'util/kind' 4 | oper ~~ reinterpret infix right 55 5 | def base{b,{}} = 0; def base{b,{h,...t}} = h + b*base{b,t} 6 | def all{t} = is{t, 0 <= t} 7 | def copy{n, v} = each{{_}=>v, range{n}} 8 | 9 | def fmtwidth{V} = fmtnat{width{V}} 10 | def fmt_p{T} = { 11 | if (isfloat{T}) { 12 | if (width{T}==32) 'ps' else 'pd' 13 | } else { 14 | merge{'epi', fmtwidth{T}} # No unsigned instructions here 15 | } 16 | } 17 | def intrin_g{get_typ}{name, V} = { 18 | def w = width{V} 19 | def fw = if (w<=128) '' else fmtnat{w} 20 | merge{'_mm', fw, '_', name, '_', get_typ{V}} 21 | } 22 | def intrin = intrin_g{{[_]T} => fmt_p{T}} 23 | def intrin_b = intrin_g{{V=[_]T} => { 24 | if (isint{T}) merge{'si',fmtwidth{V}} else fmt_p{T} 25 | }} 26 | 27 | # Only need to cast if element type class changes 28 | def class{T} = if (isfloat{T}) T else i8 # i8 represents any int 29 | def call_cast_sub{do_uncast, gen, T, V=[_]E, ...vs} = { 30 | def uncast{v} = if (do_uncast) V~~v else v 31 | if (class{E} == class{T}) gen{...vs} 32 | else uncast{gen{...each{~~{[width{V}/width{T}]T, .}, vs}}} 33 | } 34 | def call_cast = call_cast_sub{1, ...} 35 | } 36 | 37 | # 16-bit shuffles on half-words don't quite fit vec_shuffle 38 | local def vec_shuffle16_impl{suff, vec:V, ind} = { 39 | emit{V, intrin{merge{'shuffle', suff}, [width{V}/16]i16}, vec, base{4, ind}} 40 | } 41 | local def has_sh16{w} = (w==128 and hasarch{'SSE2'}) or (w==256 and hasarch{'AVX2'}) 42 | local def is_bool = match { {0}=>1; {1}=>1; {_}=>0 } 43 | def 
vec_shuffle16_half{half, vec:V=[_]_, {...ind} if is_bool{half} and has_sh16{width{V}} and length{ind}==4 and all{(ind>=0) & (ind<4)}} = { 44 | def part = select{tup{'lo','hi'}, half} 45 | call_cast{vec_shuffle16_impl{part, ., ind}, i16, V, vec} 46 | } 47 | def vec_shuffle16_lo{...} = vec_shuffle16_half{0, ...} 48 | def vec_shuffle16_hi{...} = vec_shuffle16_half{1, ...} 49 | 50 | # General implementation for constant indices 51 | local def vec_shuffle_impl{T, sel_n, val={_:V, ..._}, ind} = { 52 | def pos = all{ind >= 0} 53 | def w = width{T} 54 | def wv= width{V} 55 | def sel_width = sel_n * w 56 | def ind_width = length{ind} * w 57 | def 0 = wv % ind_width 58 | # Index expansion and instruction calls 59 | def get_ind{sel_n, n, ind} = { 60 | def exp{i} = if (n <= length{i}) i else exp{merge{i, i}} 61 | def off{i} = i + (range{length{i}} & -sel_n) * (i >= 0) 62 | off{exp{ind}} 63 | } 64 | def shuf_sub{I, name, ind, val} = { 65 | emit{V, intrin{name, [wv/width{I}]I}, ...val, ind} 66 | } 67 | def shuf{name, n, fmt} = shuf_sub{T, name, fmt{get_ind{sel_n, n, ind}}, val} 68 | def shuf{name, E, n, fmt} = { 69 | def e = w / width{E} 70 | def wi = if (e == 1) ind else merge{...each{+{.,range{e}}, e*ind}} 71 | shuf_sub{E, name, fmt{get_ind{e*sel_n, n, wi}}, val} 72 | } 73 | def shuf_base{...a, n } = shuf{...a, n, base{n, .}} 74 | def shuf_make{...a, n, T} = shuf{...a, n, vec_make{[n]T, .}} 75 | # shuffle_ps / shuffle_pd 76 | def fshuf{b, v={_,_}} = { 77 | def i = get_ind{sel_n, __min{wv / w, 4}, ind} 78 | shuf_sub{T, 'shuffle', base{b, i % b}, v} 79 | } 80 | # Arch-specific cases 81 | if (sel_n == 1 and 1 == length{val}) { 82 | select{val, 0} 83 | } else if (2 == length{val}) { 84 | def 1 = isfloat{T} and wv >= 128 and ind_width <= 128 and pos 85 | fshuf{128 / w, val} 86 | } else if (wv < 128) { 87 | def 1 = wv == 64 and isint{T} and w >= 16 and pos 88 | shuf_base{'shuffle', 4} 89 | } else if (sel_width == 256) { 90 | def 1 = hasarch{'AVX2'} and pos 91 | match (w) { 92 | {32} => shuf_make{'permutevar8x32', 8, u32} 93 | {64} => shuf_base{'permute4x64', 4} 94 | } 95 | } else if (isfloat{T}) { 96 | def 1 = pos # Zeroing unsupported for float selection 97 | if (ind_width == 256) { 98 | def 1 = hasarch{'AVX'} 99 | if (sel_width == 128 and w == 64) { 100 | shuf{'permute', 2, {t} => base{2, t%2}} 101 | } else { 102 | def n = 256 / w 103 | def U = [n]primtype{'u', w} 104 | shuf{'permutevar', n, {i} => vec_make{U, (w/32)*i}} 105 | } 106 | } else { 107 | def n = 128 / w 108 | if (hasarch{'AVX'}) shuf{'permute', n, {t} => base{n, if (n==2 and wv>128) merge{t,t} else t}} 109 | else fshuf{n, merge{val,val}} 110 | } 111 | } else { # isint{T} 112 | def 1 = hasarch{'SSE2'} 113 | match () { 114 | {if w >= 32 and pos} => shuf_base{'shuffle', i32, 4} 115 | {if hasarch{'SSSE3'}} => shuf_make{'shuffle', i8, wv/8, i8} 116 | {if w >= 16 and sel_width <= 64 and pos} => { 117 | def io = get_ind{w/16 * sel_n, 4, ind} 118 | def i = io - (range{length{io}} & -4) # Avoid passing offset indices to shufflehi 119 | def fi{v, s, ...bnd} = vec_shuffle16_impl{s, v, slice{i, ...bnd}} 120 | fi{fi{...val, 'lo',0,4}, 'hi',-4} 121 | } 122 | } 123 | } 124 | } 125 | 126 | # Process parameters 127 | local def vec_shuffle_proc{...spec, V=[_]vT, vals, {...inds}} = { 128 | # inds doesn't have the element type so it comes from spec or val 129 | def T = match (...spec) { 130 | {[_]T} => T; {T if ktyp{T}} => T 131 | {n if knum{n}} => vT; {} => vT 132 | } 133 | # sel_n is the number of indices in a selection unit 134 | # It can be specified by spec 
as a number or length of vector type 135 | # and/or by the length of each list in nested inds 136 | def spec_n = match (...spec) { 137 | {[k]_} => tup{k}; {k if knum{k}} => tup{k}; {..._} => tup{} 138 | } 139 | def {ind_n, ind} = match (inds) { 140 | {{{...t}, ..._}} => { 141 | def l = length{t} 142 | def 1 = all{l == each{length, inds}} 143 | tup{tup{l}, merge{...inds}} 144 | } 145 | {_} => { 146 | each{{s} => { def 0 = length{inds} % s }, spec_n} 147 | tup{tup{}, inds} 148 | } 149 | } 150 | def sel_n = match(...spec_n, ...ind_n) { # Shuffle unit from spec and nested indices must match 151 | {n,n} => n; {n} => n; {} => length{ind} 152 | } 153 | def 1 = all{ind < sel_n} 154 | def shuf{...v} = vec_shuffle_impl{T, sel_n, v, ind} 155 | call_cast{shuf, T, V, ...vals} 156 | } 157 | 158 | # For convenience, allow indices to be written directly without tup 159 | def vec_select {val, ...inds if 1 < length{inds}} = vec_select {val, inds} 160 | def vec_shuffle{val, ...inds if 1 < length{inds}} = vec_shuffle{val, inds} 161 | def vec_select {spec if knum{spec} or ktyp{spec}, val, ...inds if 1 < length{inds}} = vec_select {spec, val, inds} 162 | def vec_shuffle{spec if knum{spec} or ktyp{spec}, val, ...inds if 1 < length{inds}} = vec_shuffle{spec, val, inds} 163 | # Main definitions 164 | def vec_shuffle{...spec, val: V=[_]_ , {...inds}} = vec_shuffle_proc{...spec, V, tup{val}, inds} 165 | def vec_shuffle{...spec, vals={_:V=[_]_, _:V}, {...inds}} = vec_shuffle_proc{...spec, V, vals, inds} 166 | 167 | # Variable indices: not many cases to support 168 | def vec_shuffle{...spec, val:V=[vk]vT, ind:[ik]I if isint{I}} = { 169 | # Selection type and size can be set by spec, or come from ind 170 | # But element quality comes from val if possible 171 | def wi = width{I} 172 | def ty{} = if (wi < 32) I else primtype{quality{vT}, wi} 173 | def {sel_n, T} = match (...spec) { 174 | {[n]T} => tup{ n,T} 175 | {T if ktyp{T}} => tup{ik,T} 176 | {n if knum{n}} => tup{ n,ty{}} 177 | {} => tup{ik,ty{}} 178 | } 179 | def 1 = wi == width{T} 180 | def 0 = ik % sel_n 181 | def name = match (sel_n, width{T}) { 182 | {16, 8 if hasarch{'SSSE3'}} => 'shuffle' 183 | {4, 32 if hasarch{'AVX'} and isfloat{T}} => 'permutevar' 184 | # no 2, 64: permutevar_pd scales indices! 
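# (vec_shuffle_64_scaled below exposes that instruction with pre-scaled indices)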
185 | {8, 32 if hasarch{'AVX2'}} => 'permutevar8x32' 186 | } 187 | def S = [width{V}/width{T}]T 188 | def shuf = emit{., intrin{name, S}, ., ind} 189 | if (class{T} == class{vT}) shuf{V, val} 190 | else V ~~ shuf{S, S ~~ val} 191 | } 192 | 193 | # Next-to-last index bit is used, so index has to be multiplied by 2 194 | def vec_shuffle_64_scaled{val:V=[k](f64), ind:[k]I if isint{I} and width{I}==64 and hasarch{'AVX'}} = { 195 | emit{V, intrin{'permutevar_pd', V}, val, ind} 196 | } 197 | def vec_shuffle_64_scaled{val:V=[_]T, ind if T != f64} = { 198 | V ~~ vec_shuffle_64_scaled{[width{V}/64]f64~~val, ind} 199 | } 200 | 201 | def vec_select{...spec, v:V=[_]vT, ind} = { 202 | def T = match (...spec) { 203 | {} => vT; {T if ktyp{T}} => T 204 | {w if knum{w} and w<=64} => primtype{if (w<32 and isfloat{vT}) 'u' else quality{vT}, w} 205 | } 206 | def w = width{T} 207 | def k = width{V} / w 208 | def 1 = match (ind) { {{...t}} => length{t}==k; {i:[(k)]I} => isint{I} } 209 | vec_shuffle{T, v, ind} 210 | } 211 | def vec_select{n, v:V=[_]vT, {...ind} if knum{n} and n>64} = { 212 | def e = n / 64 213 | vec_select{64, v, merge{...each{+{.,range{e}}, e*ind}}} 214 | } 215 | def vec_select{(width{V}), x:V=[_]_, i if is{i,0} or is{i,tup{0}}} = x 216 | # Selects as 4 unified lanes 217 | def vec_select{128, v={a:V=[_]T, b:V}, ind={_,_} if 256==width{V} and hasarch{'AVX'} and all{(ind>=0) & (ind<4)}} = { 218 | def q = if (isint{T} and hasarch{'AVX2'}) 'x' else 'f' 219 | def name = merge{'permute2', q, '128'} 220 | emit{V, intrin_b{name, V}, a, b, base{16, ind}} 221 | } 222 | 223 | # Reverse-units for compatibility with NEON 224 | def reverse_units{n, x:[l]_ if knum{n} and n>1 and l%n == 0} = { 225 | vec_shuffle{x, n-1 - range{n}} 226 | } 227 | 228 | 229 | # Zip 230 | local def has_zip{V=[_]T} = { 231 | hasarch{match (width{V}) { 232 | {128} => if (T==f32) 'SSE' else 'SSE2' 233 | {256} => if (isfloat{T}) 'AVX' else 'AVX2' 234 | {512} => if (width{T}>=32) 'AVX512F' else 'AVX512BW' 235 | }} 236 | } 237 | def zip128{a:V=[_]_, b:V, half if has_zip{V}} = { 238 | def name = merge{'unpack', match (half) { {0}=>'lo'; {1}=>'hi' }} 239 | emit{V, intrin{name, V}, a, b} 240 | } 241 | def zip{a:V=[_]_, b:V, half if width{V}==128} = zip128{a, b, half} 242 | 243 | 244 | # Shift/align as a list of elements 245 | # Directions left and right are opposite to instruction l and r! 
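# For example, vec_shift_left{x, 1} moves element i+1 into slot i (zero-filling the top), which x86 spells as a byte-wise right shift (bsrli)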
246 | local { 247 | def has_sh{w, sse} = hasarch{match (w) { {128}=>sse; {256}=>'AVX2'; {512}=>'AVX512BW' }} 248 | def has_sh{V=[_]T, sse, n} = isint{T} and has_sh{width{V}, sse} and knum{n} 249 | # Shift left or right based on sign 250 | def vec_shift{w128, dir}{x:V=[_]T, n if has_sh{V,'SSE2',n} and (w128 or width{V}==128)} = { 251 | def wv = width{V}; def S = [width{V}/8]u8 252 | def name = merge{'bs', if (dir*n < 0) 'l' else 'r', 'li'} 253 | def suff = if (wv == 128) 'si128' else 'epi128' 254 | def nb = __min{16, __abs{n}*(width{T}/8)} # Shift in bytes 255 | def sh = emit{V, intrin_g{{_} => suff}{name, S}, ..., nb} 256 | call_cast{sh, T, S, x} 257 | } 258 | # Merge-shift, with mod_n to pick direction 259 | def vec_merge_shift{w128, mod_n}{a:V=[k]T, b:V, n if has_sh{V,'SSSE3',n} and (w128 or width{V}==128)} = { 260 | def S = [width{V}/8]u8 261 | def sh = emit{V, intrin{'alignr', S}, ..., mod_n{n*(width{T}/8)}} 262 | call_cast{sh, T, S, b, a} 263 | } 264 | } 265 | def vec_shift_left_128 {...} = vec_shift{1, 1} 266 | def vec_shift_left {...} = vec_shift{0, 1} 267 | def vec_shift_right_128{...} = vec_shift{1, -1} 268 | def vec_shift_right {...} = vec_shift{0, -1} 269 | 270 | def vec_merge_shift_left_128 {...} = vec_merge_shift{1, {n} => n} 271 | def vec_merge_shift_left {...} = vec_merge_shift{0, {n} => n} 272 | def vec_merge_shift_right_128{...} = vec_merge_shift{1, {n} => 16 - n} 273 | def vec_merge_shift_right {...} = vec_merge_shift{0, {n} => 16 - n} 274 | 275 | 276 | # Blend 277 | # x86 vector blends use the top bit of the mask 278 | # But blend_hom is more general so blend_top is defined as a restriction 279 | local { 280 | def has_blend{V=[_]T} = match (width{V}) { 281 | {128} => hasarch{'SSE4.1'} 282 | {256} => hasarch{if (isfloat{T}) 'AVX' else 'AVX2'} 283 | {_} => 0 284 | } 285 | def has_blendv{V, M} = width{V}==width{M} and has_blend{V} 286 | def blend_instr{hom}{f:V=[_]T, t, m:M if has_blendv{V, M} and (hom or isfloat{T} or width{T}==8)} = { 287 | def name = intrin{'blendv', if (isfloat{T}) V else [width{V}/8]i8} 288 | call_cast_sub{0, emit{V, name, f, t, ...}, T, M, m} 289 | } 290 | } 291 | def blend_hom{...} = blend_instr{1} 292 | def blend_top{...} = blend_instr{0} 293 | 294 | # Blend with immediate 295 | local def all_bool{m} = all{each{knum, m}} and all{(m==0) | (m==1)} 296 | def blend_units{f, t, ...m if all_bool{m}} = blend_units{f, t, m} 297 | def blend_units{f:V=[k]T, t:V, {...m} if has_blend{V} and width{T}>=16 and 0 == __min{8,k} % length{m} and all_bool{m}} = { 298 | def E = { 299 | if (isfloat{T}) T 300 | else if (width{T}>=32 and hasarch{'AVX2'}) i32 301 | else i16 302 | } 303 | def l = __min{8,k} 304 | def exp{i} = if (l <= length{i}) i else exp{merge{i, i}} 305 | def c = width{T} / width{E} # Copy each bit of m c times 306 | def b = 1 << c 307 | emit{V, intrin{'blend', [k*c]E}, f, t, base{b, exp{m}*(b-1)}} 308 | } 309 | def blend{f, t, ...m if all_bool{m}} = blend{f, t, m} 310 | def blend{f:V=[k]T, t:V, {...m} if has_blend{V} and k<=8 and length{m}==k} = { 311 | blend_units{f, t, m} 312 | } 313 | 314 | 315 | # Broadcast 316 | local def int_below{i, end} = knum{i} and __floor{i} == i and i < end 317 | local def has_shuf8{k} = hasarch{ 318 | match (k) { {16}=>'SSSE3'; {32}=>'AVX2'; {64}=>'AVX512BW' } 319 | } 320 | def broadcast_sel{x:V=[k]E, i if width{E}==8 and int_below{i, k} and has_shuf8{k}} = { 321 | if (k==32 and i<16) { 322 | def a = { 323 | if (i<8) V ~~ ([4]u64~~x >> (i*8)) 324 | else if (i==8) vec_shuffle{[2]u64, x, 1,0} 325 | else vec_shift_left_128{x, 
i} 326 | } 327 | broadcast_sel{a, 0} # Handled by special case for 0 below 328 | } else { 329 | def a = vec_shuffle{x, copy{16, i&15}} 330 | vec_select{128, a, copy{k>>4, i>>4}} 331 | } 332 | } 333 | def broadcast_sel{x:[k]E, i if width{E}==16 and k<=16 and int_below{i, k}} = { 334 | def a = vec_shuffle16_half{(i&4)!=0, x, copy{4, i&3}} 335 | vec_shuffle{u64, a, copy{k>>2, i>>2}} 336 | } 337 | 338 | local def has_full_shuf = match { # element width 339 | {8}=>hasarch{'AVX512VBMI'}; {16}=>hasarch{'AVX512BW'}; {_}=>1 340 | } 341 | def broadcast_sel{x:[k]E, i if int_below{i, k} and has_full_shuf{width{E}}} = { 342 | vec_shuffle{x, copy{k, i}} 343 | } 344 | 345 | def broadcast_sel{x:V=[k]E, 0 if hasarch{'AVX2'} and (width{V}<512 or width{E}>=32 or hasarch{'AVX512BW'})} = { 346 | def w = width{E} 347 | def char = { 348 | if (isfloat{E}) match (w) { {32}=>'ss'; {64}=>'sd' } 349 | else match (w) { {8}=>'b'; {16}=>'w'; {32}=>'d'; {64}=>'q' } 350 | } 351 | def lane0 = if (width{V} == 128) x else { 352 | def t = match (E) { {(f32)}=>'ps'; {(f64)}=>'pd'; {_}=>'si' } 353 | def f = fmtwidth{V} 354 | emit{[128/w]E, merge{'_mm',f,'_cast',t,f,'_',t,'128'}, x} 355 | } 356 | emit{V, intrin{merge{'broadcast', char}, V}, lane0} 357 | } 358 | -------------------------------------------------------------------------------- /include/arch/neon_intrin/basic.singeli: -------------------------------------------------------------------------------- 1 | local { 2 | include 'skin/c' 3 | oper ~~ reinterpret infix right 55 4 | def num{x} = 'number'==kind{x} 5 | 6 | def ew{V} = width{eltype{V}} 7 | def va{V} = 'vector'==typekind{V} and __or{...tup{64,128}==width{V}} 8 | def vi{V} = va{V} and isint{eltype{V}} 9 | def vs{V} = va{V} and __or{...tup{'i','f'}==quality{eltype{V}}} 10 | def vu{V} = va{V} and 'u'==quality{eltype{V}} 11 | def vf{V} = va{V} and isfloat{eltype{V}} 12 | 13 | def change_qual{[k]T,q} = [k]primtype{q, width{T}} 14 | def uns = change_qual{.,'u'} 15 | def sgn = change_qual{.,'i'} 16 | 17 | def intrin{name, ...s, V=[_]T} = { 18 | def q = quality{T} 19 | def w = if (128==width{V}) 'q' else '' 20 | merge{name, w, ...s, '_', if (q=='i') 's' else q, fmtnat{width{T}}} 21 | } 22 | } 23 | 24 | # Building vectors from scalars 25 | local { 26 | def can_elt = match { {[_]T, x:T} => 1; {_,x} => num{x} } 27 | def can_make_sub = can_elt 28 | def can_make_sub{V=[k]_, {...x}} = { 29 | def all{t} = is{t, 0 <= t} 30 | k==length{x} and all{each{can_elt{V,.}, x}} 31 | } 32 | def can_make{V,x} = va{V} and can_make_sub{V,x} 33 | def mv_sub{V=[_]T, x} = { 34 | tmp:*T = each{cast{T,.}, x} 35 | load{*V~~tmp, 0} 36 | } 37 | } 38 | def vec_make{V, ...x if can_make{V,x}} = mv_sub{V, x} 39 | def vec_make{V, {...x} if can_make{V,x}} = mv_sub{V, x} 40 | def vec_broadcast{V=[_]T, x if can_make{V,x}} = { 41 | emit{V, intrin{'vdup', '_n', V}, cast{T,x}} 42 | } 43 | 44 | def load {a:*V=[_]E, n if va{V}} = emit{V , intrin{'vld1', V}, *E ~~ (a+n)} 45 | def store{a:*V=[_]E, n, v:V if va{V}} = emit{void, intrin{'vst1', V}, *E ~~ (a+n), v} 46 | 47 | def extract{x:V,n if va{V} and num{n}} = emit{eltype{V}, intrin{'vget', '_lane', V}, x, n} 48 | def insert{a:V, x, i if va{V} and num{i} and can_elt{V,x}} = { 49 | emit{V, intrin{'vset', '_lane', V}, cast{eltype{V}, x}, a, i} 50 | } 51 | 52 | local def n8{[_]E} = isfloat{E} or width{E}<64 53 | def __or{a:V,b:V if vf{V}} = {def U = uns{V}; V~~ __or{U~~a, U~~b} } 54 | def __and{a:V,b:V if vf{V}} = {def U = uns{V}; V~~__and{U~~a, U~~b} } 55 | def __xor{a:V,b:V if vf{V}} = {def U = uns{V}; 
V~~__xor{U~~a, U~~b} } 56 | def __add{a:V,b:V if va{V}} = emit{V, intrin{'vadd', V}, a, b} 57 | def __sub{a:V,b:V if va{V}} = emit{V, intrin{'vsub', V}, a, b} 58 | def __mul{a:V,b:V if va{V} and n8{V}} = emit{V, intrin{'vmul', V}, a, b} 59 | def __div{a:V,b:V if vf{V}} = emit{V, intrin{'vdiv', V}, a, b} 60 | def __and{a:V,b:V if vi{V}} = emit{V, intrin{'vand', V}, a, b} 61 | def __or{a:V,b:V if vi{V}} = emit{V, intrin{'vorr', V}, a, b} 62 | def __xor{a:V,b:V if vi{V}} = emit{V, intrin{'veor', V}, a, b} 63 | def andnot{a:V,b:V if vi{V}} = emit{V, intrin{'vbic', V}, a, b} 64 | def ornot{a:V,b:V if vi{V}} = emit{V, intrin{'vorn', V}, a, b} 65 | def andnz{a:V,b:V if vi{V}} = emit{V, intrin{'vtst', V}, a, b} 66 | def __min{a:V,b:V if va{V} and n8{V}} = emit{V, intrin{'vmin', V}, a, b} 67 | def __max{a:V,b:V if va{V} and n8{V}} = emit{V, intrin{'vmax', V}, a, b} 68 | def __shl{a:V,b:S if vi{V} and S==uns{V}} = emit{V, intrin{'vshl', V}, a, sgn{S}~~b} 69 | def __adds{a:V,b:V if vi{V}} = emit{V, intrin{'vqadd', V}, a, b} 70 | def __subs{a:V,b:V if vi{V}} = emit{V, intrin{'vqsub', V}, a, b} 71 | 72 | def addp{a:V,b:V if va{V}} = emit{V, intrin{'vpadd', V}, a, b} 73 | def addpw{a:V=[k]E if vi{V} and width{E}<=32} = emit{[k/2](primtype{quality{E}, width{E}*2}), intrin{'vpaddl', V}, a} 74 | 75 | def __shl{a:V, s if vi{V} and num{s} and s>0 and s<ew{V}} = emit{V, intrin{'vshl', '_n', V}, a, s} 76 | def __shr{a:V, s if vi{V} and num{s} and s>0 and s<ew{V}} = emit{V, intrin{'vshr', '_n', V}, a, s} -------------------------------------------------------------------------------- /include/arch/neon_intrin/select.singeli: -------------------------------------------------------------------------------- 7 | def emit_intrin{V=[_]T, name, ...args} = { 8 | def intrin = merge{ 9 | name, 10 | match (width{V}) { {128}=>'q'; {64}=>'' }, 11 | '_', 12 | match (quality{T}) { {'i'}=>'s'; {q}=>q }, 13 | fmtnat{width{T}} 14 | } 15 | emit{V, intrin, ...args} 16 | } 17 | } 18 | 19 | local def vqtbl{...TS, vals={v0:V, ..._}, ind} = { 20 | def S = match (...TS) { 21 | {T if width{T}==8} => [16]T 22 | {} => if (V == [16]i8) V else [16]u8 23 | } 24 | def l = length{vals} 25 | def name = merge{if (l>1) 'unpacked_' else '', 'vqtbl', fmtnat{l}} 26 | def shuf = emit_intrin{S, name, ..., [16]u8~~ind} 27 | if (V == S) shuf{...vals} else V~~shuf{...each{~~{S,.}, vals}} 28 | } 29 | 30 | # Start with constant-index cases 31 | local def widen_norm_ind{ind, e} = { 32 | def wi = if (e == 1) ind else merge{...each{+{.,range{e}}, e*ind}} 33 | __max{wi,-1} % 256 34 | } 35 | 36 | # For convenience, allow indices to be written directly without tup 37 | def vec_select {val, ...inds if 1 < length{inds}} = vec_select {val, inds} 38 | def vec_shuffle{val, ...inds if 1 < length{inds}} = vec_shuffle{val, inds} 39 | def vec_select {spec if knum{spec} or ktyp{spec}, val, ...inds if 1 < length{inds}} = vec_select {spec, val, inds} 40 | def vec_shuffle{spec if knum{spec} or ktyp{spec}, val, ...inds if 1 < length{inds}} = vec_shuffle{spec, val, inds} 41 | 42 | def vec_select{...spec, val:V=[_]vT, {...ind} if all{each{knum,ind}}} = { 43 | def w = match (...spec) { 44 | {n if knum{n}} => n; {T if ktyp{T}} => width{T}; {} => width{vT} 45 | } 46 | def 1 = w >= 8 47 | def {n,n} = tup{width{V}/w, length{ind}} 48 | def 1 = all{ind < n} 49 | vqtbl{tup{val}, vec_make{[16]u8, widen_norm_ind{ind, w / 8}}} 50 | } 51 | 52 | def vec_shuffle{...spec, val:V=[_]vT, {...inds}} = { 53 | # inds doesn't have the element type so it comes from spec or val 54 | def T = match (...spec) { 55 | {[_]T} => T; {T if ktyp{T}} => T 56 | {n if knum{n}} => vT; {} => vT 57 | } 58 | # sel_n is the number of indices in a selection unit 59 | # It can be specified by spec as a number or length of vector type 60 | # and/or by the length of each list in nested inds 61 | def spec_n = match (...spec) { 62 | {[k]_} => tup{k}; {k if knum{k}} => tup{k}; {..._} => tup{} 63 | } 64 | def {ind_n, ind} = match (inds) { 65 | {{{...t}, ..._}} => { 66 | def l = 
length{t} 67 | def 1 = all{l == each{length, inds}} 68 | tup{tup{l}, merge{...inds}} 69 | } 70 | {_} => { 71 | each{{s} => { def 0 = length{inds} % s }, spec_n} 72 | tup{tup{}, inds} 73 | } 74 | } 75 | def sel_n = match(...spec_n, ...ind_n) { # Shuffle unit from spec and nested indices must match 76 | {n,n} => n; {n} => n; {} => length{ind} 77 | } 78 | def 1 = all{ind < sel_n} 79 | 80 | def e = width{T} / 8 81 | def exp{i} = if (16 <= length{i}) i else exp{merge{i, i}} 82 | def off{i} = i + (range{length{i}} & -e*sel_n) 83 | def vind = vec_make{[16]u8, off{exp{widen_norm_ind{ind, e}}}} 84 | vqtbl{tup{val}, vind} 85 | } 86 | 87 | # Variable indices 88 | def check_select{spec, vals, V} = { 89 | def l = length{vals} 90 | def v = l==1 or (l<=4 and all{V == each{type,vals}}) 91 | v and (match (...spec) { {}=>1; {8}=>1; {T} => ktyp{T} }) 92 | } 93 | def vec_select{...spec, vals={v0:V=[_]vT, ..._}, ind:[16]I if check_select{spec, vals, V}} = { 94 | def T = match (...spec) { {T if ktyp{T}} => T; {..._} => vT } 95 | vqtbl{T, vals, ind} 96 | } 97 | def vec_select{...spec, val:V=[_]_, ind:[16]I} = vec_select{...spec, tup{val}, ind} 98 | 99 | def vec_shuffle{...spec, val:V=[_]vT, ind:[16]I} = { 100 | def T = match (...spec) { 101 | {[16]T} => T; {T if ktyp{T}} => T; {16} => vT; {} => vT 102 | } 103 | vqtbl{T, tup{val}, ind} 104 | } 105 | 106 | 107 | # Shifts 108 | def vec_merge_shift_left {a:V=[_]_, b:V, n if knum{n}} = emit_intrin{V, 'vext', b, a, n} 109 | def vec_merge_shift_right{a:V=[k]_, b:V, n} = vec_merge_shift_left{b, a, k - n} 110 | def vec_shift_left {x:V=[_]_, n} = vec_merge_shift_left {x, vec_broadcast{V,0}, n} 111 | def vec_shift_right{x:V=[_]_, n} = vec_merge_shift_right{vec_broadcast{V,0}, x, n} 112 | 113 | def zip{a:V=[_]_, b:V, half} = { 114 | def name = merge{'vzip', match (half) { {0}=>'1'; {1}=>'2' }} 115 | emit_intrin{V, name, a, b} 116 | } 117 | def reverse_units{n, x:V=[l]T if knum{n} and 1=0 and i if (b) tup{' ', a} else tup{a}, 21 | vs0, nsym & shiftright{0, nsym} 22 | }} 23 | 24 | def lit{s} = tup{s, tup{}} 25 | def lit1{s} = tup{lit{s}} 26 | 27 | def listfmt{hex, open, close, vs} = { 28 | def f = each{runfmt{hex, .}, vs} 29 | def d = each{lit1, tup{open, ', ', close}} 30 | merge{...each{merge, select{d, 0 != inds{f}}, f}, select{d,2}} 31 | } 32 | def listfmt{hex, open, close, {}} = lit{merge{open, close}} 33 | 34 | def runfmt{hex, x} = match(x) { 35 | {_ if ksym{x}} => lit1{x} 36 | {{'x0', y}} => runfmt{2, y} 37 | {{'x', y}} => runfmt{1, y} 38 | {{}} => lit1{'{}'} 39 | {{...vs}} => listfmt{hex, '{', '}', vs} 40 | {_:*E} => tup{tup{'%p', tup{x}}} 41 | 42 | {_:V=[l]E} => { 43 | tmp:*V = undefined{V, 1} 44 | tmp <- x 45 | listfmt{hex, '[', ']', each{load{*E~~tmp,.}, range{l}}} 46 | } 47 | 48 | {_:T if typekind{T}=='primitive'} => { 49 | def q = quality{T} 50 | def w = width{T} 51 | def u = w>1 and q=='u' 52 | def spec = { 53 | if (q=='f') (if (w==32) '%.8g' else '%.17g') 54 | else merge{ 55 | if (hex!=0) '0x' else '', 56 | '%', 57 | if (hex==2) merge{'0', fmtnat{w/4}} else '', 58 | if (w==64) '"SCN' else '', 59 | if (hex!=0) 'x' else if (u) 'u' else 'd', 60 | if (w==64) '64"' else '' 61 | } 62 | } 63 | tup{tup{spec, tup{if (q=='i' and hex!=0) reinterpret{primtype{'u', w}, x} else x}}} 64 | } 65 | 66 | {_ if knum{x}} => { 67 | if ((x>>0) == x) { 68 | if (x >= -(1<<63) and x < 1<<63) runfmt{hex, i64~~x} 69 | else if (x >= 0 and x < 1<<64) runfmt{hex, u64~~x} 70 | else lit1{fmtnat{x}} 71 | } else runfmt{hex, f64~~x} 72 | } 73 | 74 | {T if kind{T}=='type'} => match(typekind{T}) { 
75 | {'primitive'} => lit1{merge{quality{T}, fmtnat{width{T}}}} 76 | {'vector'} => join{flip{tup{lit1{merge{'[', fmtnat{vcount{T}}, ']'}}, runfmt{hex, eltype{T}}}}} 77 | {_} => lit1{merge{'(unhandled type typekind: ', typekind{x}, ')'}} 78 | } 79 | 80 | {_:T} => lit1{merge{'(unhandled value typekind: ', typekind{x}, ')'}} 81 | {_} => lit1{merge{'(unhandled kind: ', kind{x}, ')'}} 82 | } 83 | 84 | def fs = flip{join{each{runfmt{0,.}, vs}}} 85 | def {strs, args} = each{join, fs} 86 | 87 | emit{void, 'printf', merge{'"', strs, '"'}, ...args} 88 | match(vs0) { {{r, _}} => r; {_} => {} } 89 | } 90 | -------------------------------------------------------------------------------- /include/skin/c.singeli: -------------------------------------------------------------------------------- 1 | include 'skin/cop' # Ordinary operators + - % etc. 2 | include 'skin/cmut' # Mutating operators -- &= etc. 3 | -------------------------------------------------------------------------------- /include/skin/cext.singeli: -------------------------------------------------------------------------------- 1 | # Additional operators for use with skin/c 2 | 3 | oper === (is) infix none 0 4 | 5 | oper ~~ reinterpret infix right 55 6 | oper ^~ promote infix right 55 7 | oper <~ cast_i infix right 55 # Requires arch/c 8 | oper $ __vec prefix 50 9 | 10 | local def __store{ind}{ptr, val} = store{ptr, ind, val} 11 | local def __store{ptr, val} = __store{0}{ptr, val} 12 | 13 | oper -> load infix right 50 14 | oper <- (__store) infix right 5 15 | -------------------------------------------------------------------------------- /include/skin/cmut.singeli: -------------------------------------------------------------------------------- 1 | # C operators that modify variables/registers 2 | oper += __incr infix right 5 3 | oper -= __decr infix right 5 4 | oper ++ __incr prefix 60 5 | oper -- __decr prefix 60 6 | def __incr{a,b} = { a = __add{a,b} } 7 | def __decr{a,b} = { a = __sub{a,b} } 8 | def __incr{a} = __incr{a,1} 9 | def __decr{a} = __decr{a,1} 10 | 11 | oper *= ({a,b} => a = __mul{a,b}) infix right 5 12 | oper /= ({a,b} => a = __div{a,b}) infix right 5 13 | oper %= ({a,b} => a = __mod{a,b}) infix right 5 14 | oper <<= ({a,b} => a = __shl{a,b}) infix right 5 15 | oper >>= ({a,b} => a = __shr{a,b}) infix right 5 16 | oper &= ({a,b} => a = __and{a,b}) infix right 5 17 | oper ^= ({a,b} => a = __xor{a,b}) infix right 5 18 | oper |= ({a,b} => a = __or {a,b}) infix right 5 19 | -------------------------------------------------------------------------------- /include/skin/cop.singeli: -------------------------------------------------------------------------------- 1 | oper - __neg prefix 30 2 | oper * __pnt prefix 60 3 | 4 | oper == __eq infix none 20 5 | oper != __ne infix none 20 6 | oper < __lt infix none 20 7 | oper > __gt infix none 20 8 | oper <= __le infix none 20 9 | oper >= __ge infix none 20 10 | 11 | oper + __add infix left 30 12 | oper - __sub infix left 30 13 | oper * __mul infix left 40 14 | oper / __div infix left 40 15 | oper % __mod infix left 40 16 | 17 | # Nobody likes low-precedence & | 18 | oper & __and infix none 35 19 | oper | __or infix none 35 20 | oper ^ __xor infix none 35 21 | oper ~ __not prefix 50 22 | 23 | # Shifts are like multiply/divide and should have the same precedence 24 | oper << __shl infix left 40 25 | oper >> __shr infix left 40 26 | -------------------------------------------------------------------------------- /include/util/for.singeli: 
-------------------------------------------------------------------------------- 1 | local { 2 | include 'skin/cop' 3 | def ux = primtype{'u', width{*void}} 4 | def num{n} = is{'number',kind{n}} 5 | def loop_var{a, b} = { 6 | if (num{a}) { cast{if (num{b}) ux else type{b}, a} } 7 | else { 8 | def ta=type{a} 9 | match (b) { {_:tb if ta<tb} => promote{tb,a}; {_}=>a } 10 | } 11 | } 12 | } 13 | 14 | def for{vars,begin,end,iter} = { 15 | i := loop_var{begin, end} 16 | while (i < end) { 17 | iter{i, vars} 18 | i = i + 1 19 | } 20 | } 21 | 22 | def for_backwards{vars,begin,end,iter} = { 23 | i := loop_var{end, begin} 24 | while (i > begin) { 25 | i = i - 1 26 | iter{i, vars} 27 | } 28 | } 29 | 30 | def for_const{vars,begin,end,iter if num{begin} and num{end}} = { 31 | each{iter{., vars}, begin + range{end-begin}} 32 | } 33 | 34 | def for_unroll{unr if num{unr}}{vars,begin,end,iter} = { 35 | i := loop_var{begin, end} 36 | while (i + unr <= end) { 37 | each{{j}=>iter{i+j, vars}, range{unr}} 38 | i = i + unr 39 | } 40 | while (i < end) { 41 | iter{i, vars} 42 | i = i + 1 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /include/util/functionize.singeli: -------------------------------------------------------------------------------- 1 | # If gen is a generator that applies to typed parameters, 2 | # functionize{gen} is equivalent to gen but works through a function call 3 | # So multiple calls to functionize{gen} on the same types share code 4 | # instead of inlining and duplicating it. 5 | 6 | fn asfunc{gen, ...type}(a:type) = gen{...a} 7 | def functionize{gen}{...args} = asfunc{gen, ...each{type, args}}(args) 8 | -------------------------------------------------------------------------------- /include/util/kind.singeli: -------------------------------------------------------------------------------- 1 | # Generators that check kind 2 | local def ki{k}{x} = is{k, kind{x}} 3 | def knum = ki{'number'} 4 | def ksym = ki{'symbol'} 5 | def ktup = ki{'tuple'} 6 | def kgen = ki{'generator'} 7 | def ktyp = ki{'type'} 8 | def kcon = ki{'constant'} 9 | def kreg = ki{'register'} 10 | def kfun = ki{'function'} 11 | def klab = ki{'label'} 12 | -------------------------------------------------------------------------------- /include/util/perv.singeli: -------------------------------------------------------------------------------- 1 | # Pervasion 2 | 3 | def extend perv1{op} = { 4 | def op{{...a}} = each{op, a} 5 | } 6 | 7 | def extend perv2{op} = { 8 | def op{ a , {...b}} = each{op{a,.}, b} 9 | def op{{...a}, b } = each{op{.,b}, a} 10 | def op{{...a}, {...b}} = each{op, a, b} 11 | } 12 | 13 | local def anytup = match { 14 | {{..._}, ..._} => 1; {_, ...r} => anytup{...r}; {} => 0 15 | } 16 | 17 | def perv{n if is{'number',kind{n}}} = { def extend _{op} = { 18 | def op{...t if is{n,length{t}} and anytup{...t}} = { 19 | def ftup = match { {{{...e}, ..._}} => e; {_, ...r} => ftup{r} } 20 | def l = ftup{t} 21 | def r = match { {{...s}} => s; {k} => each{{_}=>k,l} } 22 | each{op, ...each{r,t}} 23 | } 24 | }} 25 | def perv{1} = perv1 26 | def perv{2} = perv2 27 | -------------------------------------------------------------------------------- /include/util/tup.singeli: -------------------------------------------------------------------------------- 1 | # Tuple utilities 2 | 3 | local { 4 | include 'skin/cop' 5 | oper $ length prefix 30 6 | 7 | include 'util/kind' 8 | def sl{l, start, len} = slice{l, start, start + len} 9 | } 10 | 11 | # Tuple is empty 12 | def empty{tup} = 0 
== $tup 13 | 14 | # Constant-time evaluation returning a list 15 | def collect{vars,begin,end,exec if begin<=end} = { 16 | def inds = begin + range{end-begin} 17 | each{exec{., vars}, inds} 18 | } 19 | 20 | # Integers [0,n) 21 | def iota{n if knum{n}} = range{n} 22 | 23 | # All indices into tuple t 24 | def inds{t} = range{$t} 25 | 26 | # Tuple of n copies of v 27 | def copy{n, v if knum{n}} = each{{_}=>v, range{n}} 28 | 29 | # Merge a tuple of tuples 30 | def join{l} = merge{...l} 31 | 32 | # Shift l into r, retaining length of r, or vice-versa 33 | def shiftright{l, r} = slice{merge{l, r}, 0, $r} 34 | def shiftleft {l, r} = slice{merge{l, r}, - $l} 35 | 36 | # Reversed tuple 37 | def reverse{t} = select{t, ($t-1) - inds{t}} 38 | 39 | # Tuple of length n made from t repeated cyclically 40 | def cycle{n, t if knum{n}} = { 41 | def l = $t 42 | def m = n % l; def e = slice{t, 0, m} 43 | if (m == n) e 44 | else merge{...copy{(n-m)/l, t}, e} 45 | } 46 | 47 | # Split into groups of length n, possibly less for the last 48 | def split{n, list if knum{n}} = { 49 | def d = __ceil{($list) / n} 50 | each{sl{list, ., n}, n*range{d}} 51 | } 52 | def split{{...n}, list} = { 53 | def start = shiftright{0, scan{+,n}} 54 | each{sl{list,...}, start, n} 55 | } 56 | 57 | # Transpose tuple of same-length tuples 58 | def flip{tab} = each{tup, ...tab} 59 | 60 | # Function table mapping over all combinations 61 | def table = match { 62 | {f} => f{} 63 | {f, t} => each{f, t} 64 | {f, t, ...ts} => each{{e} => table{f{e,...}, ...ts}, t} 65 | } 66 | # Flattened into a single list 67 | def flat_table = match { 68 | {f} => tup{f{}} 69 | {f, t} => each{f, t} 70 | {f, t, ...ts} => join{each{{e} => flat_table{f{e,...}, ...ts}, t}} 71 | } 72 | 73 | # Left fold, with or without initial element 74 | def fold = match { 75 | {f, init, {}} => init 76 | {f, init, {x, ...rest}} => fold{f, f{init, x}, rest} 77 | {f, {init, ...rest}} => fold{f, init, rest} 78 | } 79 | 80 | # Low-stack inclusive+exclusive scan implementation 81 | local def scan_full = match { 82 | {f, init, {}} => tup{init} 83 | {f, init, {x}} => tup{init, f{init, x}} 84 | {f, init, list} => { 85 | def m = length{list} >> 1 86 | def l = scan_full{f, init, slice{list, 0, m}} 87 | merge{l, scan{f, select{l, -1}, slice{list, m}}} 88 | } 89 | } 90 | # Inclusive left scan 91 | def scan{f, init, list} = slice{scan_full{f, init, list}, 1} 92 | def scan{f, {}} = tup{} 93 | def scan{f, {h, ...t}} = scan_full{f, h, t} 94 | 95 | # Extend to multiple list inputs, if initialized 96 | def fold{f, i, ...ls={_, _, ..._}} = fold{{a, t} => f{a, ...t}, i, flip{ls}} 97 | def scan{f, i, ...ls={_, _, ..._}} = scan{{a, t} => f{a, ...t}, i, flip{ls}} 98 | 99 | # Copy list elements based on list, constant, or generator (like filter) 100 | def replicate{reps, list} = join{each{copy, reps, list}} 101 | def replicate{r, list if knum{r}} = join{each{copy{r,.}, list}} 102 | def replicate{f, list if kgen{f}} = replicate{each{f,list}, list} 103 | 104 | # For boolean i, return indices of 1s 105 | def indices{i} = replicate{i, inds{i}} 106 | 107 | # Search functions that return a single number 108 | local def proc_find{out, i, f} = each{out, findmatches{i, f}} 109 | # Index of only match, erroring if there are multiple 110 | # If there are none, return the default if given and error otherwise 111 | def find_index{sin, sfor, ...default if $default <= 1} = proc_find{ 112 | {is} => match (is, default) { {{i}, _} => i; {{}, {d}} => d }, 113 | sin, sfor 114 | } 115 | # Index of first match 116 | def 
index_of{sin, sfor} = { 117 | def n = $sin 118 | proc_find{match { {{i, ..._}} => i; {_} => n }, sin, sfor} 119 | } 120 | # Whether each element is found; how many times it's found 121 | def contained_in = proc_find{{i} => 0 < $i, ...} 122 | def count_matches = proc_find{length, ...} 123 | 124 | # Grouping: gather indices or data values based on how a grouping 125 | # argument matches the domain 126 | # For group, domain can be a list of keys, a length, or omitted to infer length 127 | # For key, the domain is the unique elements of the grouping argument in order 128 | # group_inds: gather inds{values} 129 | def group_inds = findmatches 130 | def group_inds{values, len if knum{len}} = findmatches{values, range{len}} 131 | def group_inds{{...vs} if fold{&, each{knum, vs}}} = { 132 | group_inds{vs, 1 + fold{__max, vs}} 133 | } 134 | # group: gather data 135 | def group{{...vs}, ...g, {...data} if $vs == $data} = { 136 | select{data, group_inds{vs, ...g}} 137 | } 138 | # key: gather indices or data 139 | def key{{...keys}} = { 140 | def i = findmatches{keys, keys} 141 | replicate{inds{i} == each{select{., 0}, i}, i} 142 | } 143 | def key{{...keys}, {...values} if $keys == $values} = select{key{keys}, values} 144 | # Add a generator for the first argument to apply to each result 145 | def extend resgen{gr} = { 146 | def gr{gen, ...args if kgen{gen}} = each{gen, gr{...args}} 147 | } 148 | extend resgen{group_inds}; extend resgen{group}; extend resgen{key} 149 | 150 | # Self-search 151 | local def index_self{list} = proc_find{select{., 0}, list, list} 152 | local def umask_from_ind{i} = i == inds{i} 153 | local def cls_from_umask_ind{u, i} = select{scan{+, -1, u}, i} 154 | def unique_mask{list} = umask_from_ind{index_self{list}} 155 | def unique{list} = replicate{unique_mask{list}, list} 156 | def classify{list} = { 157 | def i = index_self{list} 158 | cls_from_umask_ind{umask_from_ind{i}, i} 159 | } 160 | def unique_classify{list} = { 161 | def i = index_self{list} 162 | def u = umask_from_ind{i} 163 | tup{replicate{u, list}, cls_from_umask_ind{u, i}} 164 | } 165 | def occurrence_count{list} = { 166 | def g = key{list} 167 | def c = join{each{inds, g}} 168 | group{{{i}}=>i, join{g}, c} 169 | } 170 | -------------------------------------------------------------------------------- /ir.bqn: -------------------------------------------------------------------------------- 1 | # IR passes 2 | 3 | # Apply transformation to each function 4 | _onFns ← { 5 | [o,c] ← +`˘ "beginFn"‿"endFn" (⊣≡≠⊸↑)⌜ 𝕩 6 | o »↩ 7 | f ← o-c 8 | r ← 𝔽¨⌾(1⊸↓) (o×f)⊔𝕩 9 | ((f¬⊸/o)∾/≠¨1↓r) ⍋⊸⊏ ∾r 10 | } 11 | 12 | # Attempt to replace goto{,T,F} with two kinds of structure: 13 | # 14 | # - beginBlock/endBlock/break{,T,F} (do {...} while (0)) 15 | # - beginLoop/endLoop/continue{,T,F} (while (1) {...;break;}) 16 | # 17 | # Loops and blocks are properly nested, and have named labels. 18 | # Jumps only occur on break, which goes to the end of its block, and 19 | # continue, which goes to the beginning of its loop. The loop exits when 20 | # endLoop is reached. 21 | Restructure ⇐ { 22 | [lm,am] ← ∨` "lbl "‿"goto" ≡⌜ 4↑¨𝕩 23 | ai ← /am 24 | lb ← ai ⊏ lm # Which statements are lbl (not goto) 25 | i ← ⊐id ← (∧`⌾⌽' '⊸≠)⊸/¨ ai ⊏ 𝕩 # Label ID 26 | f ← ∊i ⋄ l ← ∊⌾⌽i # First and last use of label 27 | IM ← {(𝕩⊐○(/⟜i)𝕨) ⊏ /𝕩} # /𝕩 ordered by matching 𝕨 (requires 𝕨≡○(∧/⟜i)𝕩) 28 | ff ← (fl ← f○⊑∧≡○(⊢´))´˘ ∘‿2⥊ ((⍋+`-0⊸<)⊑¨)⊸⊏ add ? 
# Abort if not nested 45 | # Change goto to break/continue and insert begin/end 46 | alm ← ¬lb⌾(am⊸/)am 47 | br ← "break"‿"continue"⊏˜lb¬⊸/(i⊏i⍋⊸⊏○(lb⊸/)⊢)⊸≤⊒i 48 | (ni∾ ⟨ 8 | "h" ‿"help" ‿0‿1‿"Print this message and exit" 9 | "o" ‿"out" ‿1‿0‿"Output file (print to stdout by default)" 10 | "oe"‿"errout"‿1‿0‿"Error to: stderr (default), stdout, none, file=path, bqn" 11 | "os"‿"show" ‿1‿0‿"show{} to: stdout (default), stderr, none, file=path" 12 | # "?" ‿"stdin" ‿0‿0‿"Use stdin as input, after any argument files" 13 | "r" ‿"run" ‿1‿1‿"Use this argument as source code" 14 | "t" ‿"target"‿1‿0‿"Output type: c (default), cpp, ir" 15 | "a" ‿"arch" ‿1‿2‿"Architecture features: list, or none, native (default), all" 16 | "i" ‿"infer" ‿1‿0‿"Type of architecture inference: strict, or loose (default)" 17 | "l" ‿"lib" ‿1‿2‿"Library paths: lib=path to try path/x for include 'lib/x'" 18 | "c" ‿"config"‿1‿2‿"Configuration: name=value to set config name to value" 19 | "p" ‿"pre" ‿1‿0‿"Preamble placed before C output" 20 | "n" ‿"name" ‿1‿0‿"Prefix for names in C output" 21 | "d" ‿"deplog"‿1‿0‿"Output file for log of included dependencies" 22 | ⟩ 23 | short‿long ∾˜¨⟜<¨↩ "-"‿"--" 24 | args ∾↩ 0 ⋄ dup ∾↩ 1 25 | 26 | Spl ← (⊢-˜+`׬)∘=⊔⊢ 27 | 28 | c ← ≠short 29 | op ← (short⊸⊐ ⌊ long⊸⊐) •args 30 | op ⌈↩ c ׬ <`⊸= op⊏args 31 | opts ← ((1+c)∾˜f/op) ⊔ ((op=c)(1-˜×⟜(+`))○(∾⟜1)f←¬0»op⊏args) ⊔ •args 32 | "Option can't be duplicated" ! ∧´ (1≤dup) ≥ 1<≠¨opts 33 | olist ← (2=dup) (∾','⊸Spl¨)⍟⊣¨ (1⌾(¯1⊸⊑)args) ⊣◶⟨0<≠∘⊢,⊑¨⊢⟩¨ opts 34 | help‿out‿oe‿os‿run‿target‿feats‿inf‿lib‿config‿pre‿namepre‿deplog‿files ← olist 35 | 36 | { help ? 37 | opt_help ← ∾¨´ ⟨desc⟩ ∾˜ (1+·⌈´≠¨)⊸(↑¨)¨ short‿long ∾¨¨ ",:" 38 | •Out ∾∾⟜(@+10)¨ ⟨help_pre,""⟩ ∾ opt_help 39 | •Exit@ 40 | ;@} 41 | 42 | _choices ← {∧´𝕨∊𝕩? (⊑𝕩)⊣´𝕨; !∾⟨"Unknown ",𝕗," option: ",∾𝕨," (options are",1↓∾", "⊸∾¨𝕩,")"⟩} 43 | target "target" _choices↩ "c"‿"cpp"‿"ir" 44 | inf "inference"_choices↩ "loose"‿"strict" 45 | 46 | Rel ← •wdpath⊸•file.At 47 | files Rel¨↩ 48 | 49 | SplitEq ← (»⊸(⊣-<)·∨`'='⊸=)⊸⊔ 50 | libpaths ← (Rel⌾(1⊸⊑) ¯2 ↑ SplitEq)¨ lib 51 | configs ← (2 ↑ SplitEq)¨ config 52 | OutBuf ← {𝕊: e←⟨⟩ ⋄ Save⇐{e∾↩<𝕩⋄𝕩} ⋄ Get⇐{𝕊:e}} 53 | _getShows ← {name _𝕣 𝕩: 54 | Save‿Get ← OutBuf@ 55 | Out‿Write ← ⊢‿⊢ »˜ { 56 | "stderr": •term.ErrRaw•ToUTF8∾(@+10)˙ ; "stdout":•Out ; "none":⊢ ; 57 | "bqn":"error"≡name? ⊢ ; 58 | (p←"file=")(⊣≡≠⊸↑)𝕩? f←Rel p≠⊸↓𝕩 ⋄ ⊢‿{f •file.Chars ∾∾⟜(@+10)¨𝕩 ⋄ 𝕩} ; 59 | !"Unknown "∾name∾" output option: "∾𝕩 60 | }𝕩 61 | ⟨Out∘Save, Write∘Get⟩ 62 | } 63 | ⟨ShowOut,ShowWrite⟩ ← "show{}" _getShows "stdout" ⊣´ os 64 | ⟨ErrOut, ErrWrite⟩ ← "error" _getShows oe ↩ "stderr" ⊣´ oe 65 | Writes ← ShowWrite ⋈ {⋈∾∾⟜(@+10)¨𝕩}⍟(0<≠)∘ErrWrite 66 | ⟨ErrExit,_withErr⟩ ← { "bqn"≡oe ? ⟨!, {𝔽⎊@}⟩ ; ⟨•Exit∘1 ⊣ Writes, {𝔽}⟩ } 67 | DepOut‿DepWrite ← { 68 | wr ← {⟨⟩:⊢; ⟨p⟩: (Rel p)⊸•file.Lines} deplog 69 | Save‿Get ← OutBuf@ ⋄ DepOut⇐Save ⋄ DepWrite⇐Wr∘Get 70 | } 71 | 72 | arch ← ⟨feats,"strict"≢inf⟩ •Import "arch.bqn" 73 | outputs ← ShowOut‿ErrOut‿ErrExit‿DepOut 74 | frontend ← arch‿libpaths‿configs‿outputs •Import "singeli.bqn" 75 | backend ← { 76 | "ir"≡target ? ⊢ ; 77 | par ← ⟨"cpp"≡target,arch,"si"⊣´namepre,outputs⟩ 78 | pre ⊑⊸{𝕨⊸𝕏}⍟(0<≠∘⊣) par •Import "emit_c.bqn" 79 | } 80 | Output ← { 81 | ≠out ? 
(Rel⊑out) •file.Chars ⊢ ; 82 | •Out⍟(0<≠) ¯1⊸↓ 83 | }⊸⊢ 84 | Result ← {show‿errout‿deplog‿out⇐𝕩} Writes ∾ DepWrite ⋈ ⊢ 85 | 86 | Result {Output Backend ∾ Frontend¨ 𝕩}_withErr (<¨run) ∾ files 87 | -------------------------------------------------------------------------------- /test/README.md: -------------------------------------------------------------------------------- 1 | # Singeli tests 2 | 3 | Singeli testing is not terribly comprehensive. We're relying in part on testing with existing codebases as well as the relative simplicity of the language to make sure things work. 4 | 5 | Compiler tests: `test/run` (like `singeli`, run as an executable if `bqn` is installed, or call with a BQN interpreter). 6 | 7 | Most includes are not yet tested. For arch/ includes, run `make` from the test/arch/general directory, or from the base: 8 | 9 | $ make -C test/arch/general ARCH=feats 10 | 11 | where the feature list `feats` is the same as Singeli's `-a` argument. If making changes to Singeli, run `make clean` between tests to force a new build. 12 | -------------------------------------------------------------------------------- /test/alias.c: -------------------------------------------------------------------------------- 1 | static int32_t si_f0_f(int32_t v0_x) { 2 | int32_t v1 = change(v0_x); 3 | int32_t v2_a = v1; 4 | int32_t v3_b = v0_x; 5 | v2_a = ((int32_t)4ll); 6 | v3_b = v1; 7 | v0_x = v2_a; 8 | return v3_b; 9 | } 10 | 11 | static int32_t* si_f1_g() { 12 | int32_t v0_b = ((int32_t)5ll); 13 | int32_t v1_c = v0_b; 14 | v1_c = ((int32_t)6ll); 15 | int32_t v2_a_[] = {v0_b,v0_b}; int32_t* v2_a = v2_a_; 16 | return v2_a; 17 | } 18 | 19 | -------------------------------------------------------------------------------- /test/alias.in: -------------------------------------------------------------------------------- 1 | fn f(x:i32) = { 2 | def d = emit{i32, 'change', x} 3 | a := d # New handle 4 | b := x # New handle 5 | def c = b # Aliases b 6 | a = 4 7 | c = d 8 | x = a 9 | b 10 | } 11 | 12 | fn g() : __pnt{i32} = { 13 | b:i32 = 5 14 | c := b 15 | c = 6 16 | a:__pnt{i32} = tup{b,b} 17 | ac := a 18 | # a = cast{__pnt{i32},tup{b,c}} # Problems with C output for this 19 | ac 20 | } 21 | -------------------------------------------------------------------------------- /test/alias.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_f i32 1 v0_x i32 2 | new v1 emit i32 'change' v0_x 3 | new v2_a val i32 v1 4 | new v3_b val i32 v0_x 5 | mut v2_a !4:i32 6 | mut v3_b v1 7 | mut v0_x v2_a 8 | ret v3_b 9 | endFn 10 | 11 | beginFn f1_g *i32 0 12 | new v0_b val i32 !5:i32 13 | new v1_c val i32 v0_b 14 | mut v1_c !6:i32 15 | new v2_a array *i32 v0_b v0_b 16 | ret v2_a 17 | endFn 18 | 19 | -------------------------------------------------------------------------------- /test/anon.in: -------------------------------------------------------------------------------- 1 | fn fun(arg:i64) : i64 = ({x}=>emit{i64,'anon',x,x}){arg} 2 | export{'efn', fun} 3 | -------------------------------------------------------------------------------- /test/anon.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_fun i64 1 v0_arg i64 2 | new v1 emit i64 'anon' v0_arg v0_arg 3 | ret v1 4 | endFn 5 | 6 | export 'efn' (i64)->i64 $f0_fun 7 | -------------------------------------------------------------------------------- /test/apply.in: -------------------------------------------------------------------------------- 1 | def divmod{a,b} = 
tup{emit{u32,'div',a,b},emit{u32,'mod',a,b}} 2 | fn divplusmod(a:u32, b:u32) : u32 = { 3 | apply{bind{emit, u32, 'add'}, divmod{a,b}} 4 | } 5 | -------------------------------------------------------------------------------- /test/apply.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_divplusmod u32 2 v0_a u32 v1_b u32 2 | new v2 emit u32 'div' v0_a v1_b 3 | new v3 emit u32 'mod' v0_a v1_b 4 | new v4 emit u32 'add' v2 v3 5 | ret v4 6 | endFn 7 | 8 | -------------------------------------------------------------------------------- /test/arch/.gitignore: -------------------------------------------------------------------------------- 1 | *.c 2 | *.bin 3 | -------------------------------------------------------------------------------- /test/arch/base.singeli: -------------------------------------------------------------------------------- 1 | include 'skin/c' 2 | include 'arch/c' 3 | include 'debug/printf' 4 | 5 | def comptime_fail{...msg} = { 6 | show{...msg} 7 | 0{} # comptime_fail 8 | } 9 | 10 | require{'stdlib.h'} 11 | def exit{code} = emit{void, 'exit', code} 12 | require{'string.h'} 13 | def memeq{a, b, bytes} = 0 == emit{i32, 'memcmp', a, b, bytes} 14 | 15 | local fn expect_eq_fn{V}(exp:V, got:V) : void = { 16 | buf:*V = tup{exp, got} 17 | if (not memeq{buf, buf+1, width{V}/8}) { 18 | lprintf{'Expected: ', tup{'x0', exp}, ' (type = ', V,')'} 19 | lprintf{'Got: ', tup{'x0', got}} 20 | exit{1} 21 | } 22 | } 23 | 24 | def expect_eq{exp:V, got:V} = expect_eq_fn{V}(exp, got) 25 | 26 | def test_with_match{G, args, on_fail, on_res} = { 27 | def err # Detect when G doesn't match 28 | def got = match (...args) { (G); {..._} => err } 29 | if (is{got, err}) on_fail{}; else on_res{got} 30 | } 31 | def test_exp{exp, G}{...args} = test_with_match{G, args, 32 | {} => comptime_fail{'No case matched for ', G, args}, 33 | expect_eq{exp, .} 34 | } 35 | def test_no_case{G}{...args} = test_with_match{G, args, 36 | {} => {}, # No match, success 37 | {_} => comptime_fail{'Expected no case to match for ', G, args} 38 | } 39 | 40 | def for_tup{vars,0,'!',iter} = { 41 | def n = length{select{vars, 0}} 42 | each{{i, ...args} => iter{i, args, {a,_} => a}, range{n}, ...vars} 43 | } 44 | 45 | def example_elts{V=[k]E} = { 46 | def ew = width{E} 47 | def mul = match (if (issigned{E}) primtype{'u',ew} else E) { {(u8)}=>3; {(u16)}=>100; {(u32)}=>1e8; {(u64)}=>1e17; {(f32)}=>1.0001; {(f64)}=>1.000100010001 } 48 | def e = (range{k}+1) * mul 49 | if (isint{E}) e%(1<<(ew - issigned{E}) - 1) else e 50 | } 51 | def primtypes = tup{u8,i8,u16,i16,u32,i32,u64,i64,f32,f64} 52 | def supported_widths{accept_avx2_256} = { 53 | if (hasarch{'AVX512F'}) tup{128, 256, 512} 54 | else if (hasarch{if (accept_avx2_256) 'AVX' else 'AVX2'}) tup{128, 256} 55 | else tup{128} 56 | } 57 | -------------------------------------------------------------------------------- /test/arch/general/broadcast-sel.singeli: -------------------------------------------------------------------------------- 1 | include '../base' 2 | include '../simd' 3 | 4 | main() : void = { 5 | fn test{V=[k]E}() : void = { 6 | def elts = example_elts{V} 7 | def src = vec_make{V, elts} 8 | 9 | @for_tup(i in range{k}, el in elts over '!') { 10 | if (hasarch{'X86_64'}) { 11 | if (width{E}==8 and not hasarch{'SSSE3'}) { 12 | test_no_case{broadcast_sel}{src, i} 13 | } else if (width{V}==512 and width{E}<=16 and not hasarch{'AVX512BW'}) { 14 | test_no_case{broadcast_sel}{src, i} 15 | } else { 16 | test_exp{vec_broadcast{V, el}, 
broadcast_sel}{src, i} 17 | } 18 | } else { 19 | test_exp{vec_broadcast{V, el}, broadcast_sel}{src, i} 20 | } 21 | } 22 | test_no_case{broadcast_sel}{src, k} 23 | test_no_case{broadcast_sel}{src, 0.5} 24 | } 25 | @for_tup(E in primtypes over '!') { 26 | @for_tup(k in supported_widths{0}/width{E} over '!') { 27 | test{[k]E}() 28 | } 29 | } 30 | lprintf{'pass'} 31 | } 32 | -------------------------------------------------------------------------------- /test/arch/general/imm-shuffle-select.singeli: -------------------------------------------------------------------------------- 1 | include '../base' 2 | include 'util/tup' 3 | if_inline (hasarch{'X86_64'}) { 4 | include 'arch/iintrinsic/basic' 5 | include 'arch/iintrinsic/select' 6 | } else if_inline (hasarch{'AARCH64'}) { 7 | include 'arch/neon_intrin/basic' 8 | include 'arch/neon_intrin/select' 9 | } else { 10 | def {vec_shuffle,vec_select,vec_make} 11 | } 12 | 13 | def expand{e, t} = replicate{e, t}*e + cycle{e*length{t}, range{e}} 14 | def gen_idxs{n} = (3 * (1-range{n})) % n 15 | 16 | def widths{min, max} = tup{min, ...widths{min*2, max}} 17 | def widths{min, max if min>max} = tup{} 18 | def widths{v, v} = tup{v} 19 | 20 | main() : void = { 21 | fn test{sw, V}(counts:*u64) : void = { 22 | def vw = width{V} 23 | 24 | def inc_ok{ok} = store{counts, ok, load{counts, ok}+1} 25 | def elts = example_elts{V} 26 | def vec = vec_make{V, elts} 27 | 28 | def has_int{lw} = not (lw==256 and not hasarch{'AVX2'}) 29 | def qualities{lw} = { 30 | def i = has_int{lw} 31 | replicate{tup{i,i,sw>=32}, tup{'u','i','f'}} 32 | } 33 | 34 | if (sw<=64) @for_tup(lw in widths{sw, vw} over '!') { 35 | # vec_shuffle{[n]vw (width=lw), v:V (width=vw)}; n = elements 36 | def n = lw/sw 37 | def idxs = gen_idxs{n} 38 | def ok = match() { 39 | {if sw==16 and vw==128 and lw<=64} => 1 40 | {if sw== 8 and hasarch{'AVX512VBMI'}} => 1 41 | {if sw==16 and hasarch{'AVX512BW'}} => 1 42 | {if sw<=16 and vw==256} => hasarch{'AVX2'} and lw<=128 43 | {if sw<=16 and vw==128} => hasarch{'SSSE3'} 44 | {if sw<=16} => 0 45 | {if lw>=256 and not hasarch{'AVX2'}} => 0 46 | {} => 1 47 | } 48 | inc_ok{ok} 49 | if (ok) { 50 | @for_tup(quality in qualities{lw} over '!') { 51 | def spec = [n]primtype{quality, sw} 52 | def scale = sw/width{eltype{V}} 53 | def e1 = expand{scale, idxs} 54 | def e2 = join{each{{i} => e1+i*n*scale, range{vw/lw}}} 55 | # show{'V=',V, ' lw=',lw, ' n=',n, ' spec=',spec} 56 | # lprintf{'V=',V, ' lw=',lw, ' n=',n, ' spec=',spec} 57 | def exp = vec_make{V, select{elts, e2}} 58 | test_exp{exp, vec_shuffle}{spec, vec, idxs} 59 | } 60 | } # else show{'missed V=',V, ' lw=',lw, ' n=',n, ' spec=',[n]primtype{'u', sw}} 61 | } 62 | 63 | def select_ok = match() { 64 | {if sw<=16 and not hasarch{'SSSE3'}} => 0 65 | {if sw<=16 and vw>=256 and not hasarch{if (sw==8) 'AVX512VBMI' else 'AVX512BW'}} => 0 66 | {if sw<=16 and vw==256 and not hasarch{'AVX2'}} => 0 67 | {if vw==256 and not hasarch{'AVX2'}} => 0 68 | {..._} => 1 69 | } 70 | inc_ok{select_ok} 71 | if (select_ok) { 72 | def n = vw/sw 73 | def idxs = gen_idxs{n} 74 | 75 | def exp = { 76 | def scale = sw/width{eltype{V}} 77 | vec_make{V, select{elts, expand{scale, idxs}}} 78 | } 79 | 80 | @for_tup(spec in merge{ 81 | copy{has_int{vw} or quality{eltype{V}}=='f', sw}, 82 | each{{q}=>primtype{q,sw}, if (sw<=64) qualities{vw} else tup{}} 83 | } over '!') { 84 | # show{'spec=',spec, ' V=',V} 85 | test_exp{exp, vec_select}{spec, vec, idxs} 86 | test_exp{exp, vec_select}{spec, vec, ...idxs} 87 | } 88 | } 89 | } 90 | 91 | counts:*u64 = 
tup{0,0} 92 | if (hasarch{'X86_64'}) { 93 | @for_tup(sw in tup{8,16,32,64,128} over '!') { 94 | @for_tup(E in primtypes over '!') { 95 | if (sw>=width{E}) @for_tup(w in supported_widths{1} over '!') { 96 | def V = [w/width{E}]E 97 | test{sw, V}(counts) 98 | } 99 | } 100 | } 101 | } else { 102 | lprintf{'no tests defined for this arch'} 103 | exit{1} 104 | } 105 | lprintf{'pass; untestable: ', load{counts,0}, '/', load{counts,0}+load{counts,1}} 106 | } 107 | -------------------------------------------------------------------------------- /test/arch/general/makefile: -------------------------------------------------------------------------------- 1 | SHELL=/usr/bin/env bash -o pipefail 2 | ARCH=native 3 | SINGELI=../../../singeli 4 | SINGELI_FLAGS=-a $(ARCH) 5 | CCFLAGS=-g $(shell bqn ../to-c-args.bqn $(ARCH)) 6 | RUN= 7 | 8 | ALL_TESTS= 9 | ALL_TESTS+=run-imm-shuffle-select 10 | ALL_TESTS+=run-broadcast-sel 11 | 12 | default: $(ALL_TESTS) 13 | .SECONDARY: 14 | 15 | %-${ARCH}.bin: %.singeli 16 | $(SINGELI) -os stderr $(SINGELI_FLAGS) $< > $@.c 17 | $(CC) $(CCFLAGS) $@.c -o $@ 18 | 19 | run-%: %-${ARCH}.bin 20 | $(RUN) ./$< 21 | 22 | clean: 23 | rm *.bin* 24 | -------------------------------------------------------------------------------- /test/arch/simd.singeli: -------------------------------------------------------------------------------- 1 | if_inline (hasarch{'X86_64'}) { 2 | include 'arch/iintrinsic/basic' 3 | include 'arch/iintrinsic/select' 4 | } else if_inline (hasarch{'AARCH64'}) { 5 | include 'arch/neon_intrin/basic' 6 | include 'arch/neon_intrin/select' 7 | } 8 | -------------------------------------------------------------------------------- /test/arch/to-c-args.bqn: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bqn 2 | feats ← ∾ ',' ((⊢-˜+`׬)∘=⊔⊢)¨ •args 3 | •Out 1↓ ∾' '⊸∾¨ (⟨feats, 1⟩ •Import "../../arch.bqn").GetCFlags @ 4 | -------------------------------------------------------------------------------- /test/blockmut.in: -------------------------------------------------------------------------------- 1 | include 'skin/c' 2 | include 'arch/c' 3 | 4 | def for{vars,begin,end,block} = { 5 | i:u64 = begin 6 | while (i < end) { 7 | block{i, vars} 8 | i = i+1 9 | } 10 | } 11 | 12 | fn fun() : void = { 13 | b:i32 = 0 14 | @for(i from 0 to 4) { 15 | b = b+2 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /test/blockmut.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_fun void 0 2 | new v0_b val i32 !0:i32 3 | new v1_i val u64 !0:u64 4 | lbl l0 5 | new v2 emit u1 'op <' v1_i !4:u64 6 | gotoF v2 l1 7 | mut v0_b emit i32 'op +' v0_b !2:i32 8 | mut v1_i emit u64 'op +' v1_i !1:u64 9 | goto l0 10 | lbl l1 11 | endFn 12 | 13 | -------------------------------------------------------------------------------- /test/call.c: -------------------------------------------------------------------------------- 1 | static int32_t si_f0_mid(int32_t v0_a, int32_t v1_b) { 2 | int32_t v2 = add(v0_a, v1_b); 3 | return v2; 4 | } 5 | 6 | static int32_t si_f1_fun(int32_t v0_a) { 7 | int32_t v1 = si_f0_mid(v0_a, v0_a); 8 | return v1; 9 | } 10 | 11 | int32_t (*const fn)(int32_t) = si_f1_fun; 12 | 13 | -------------------------------------------------------------------------------- /test/call.in: -------------------------------------------------------------------------------- 1 | fn mid(a:i32, b:i32) = emit{i32,'add',a,b} 2 | 3 | fn fun(a:i32) : i32 = { 4 
| mid(a, a) 5 | } 6 | export{'fn', fun} 7 | -------------------------------------------------------------------------------- /test/cond.in: -------------------------------------------------------------------------------- 1 | def g{x,y} = emit{f64, 'g2', x, y} 2 | def g{g,'sym'} = emit{f64, 'g2s', g} 3 | def g{a, a} = g{a} 4 | def g{b==tup{a}, a} = g{__add{10,a}} 5 | def g{x} = emit{f64, 'g1', x} 6 | def g{w:T, x:T} = emit{T, 'g2T', w, x} 7 | def g{x:T if __le{T,i32}} = emit{T, 'g1i', x} 8 | 9 | fn fun(a:i16, b:u1, c:f32) : u8 = { 10 | emit{u8, 'out', g{a,a}, g{a,c}, g{3,3}, g{tup{4},4}, g{5,'sym'}, g{a}} 11 | } 12 | -------------------------------------------------------------------------------- /test/cond.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_fun u8 3 v0_a i16 v1_b u1 v2_c f32 2 | new v3 emit i16 'g2T' v0_a v0_a 3 | new v4 emit f64 'g2' v0_a v2_c 4 | new v5 emit f64 'g1' 3 5 | new v6 emit f64 'g1' 14 6 | new v7 emit f64 'g2s' 5 7 | new v8 emit i16 'g1i' v0_a 8 | new v9 emit u8 'out' v3 v4 v5 v6 v7 v8 9 | ret v9 10 | endFn 11 | 12 | -------------------------------------------------------------------------------- /test/const.in: -------------------------------------------------------------------------------- 1 | c:u64 = 4 2 | 3 | fn fun() = c 4 | -------------------------------------------------------------------------------- /test/const.ir: -------------------------------------------------------------------------------- 1 | constant $c0_c u64 !4:u64 2 | 3 | beginFn f0_fun u64 0 4 | ret $c0_c 5 | endFn 6 | 7 | -------------------------------------------------------------------------------- /test/destruct.in: -------------------------------------------------------------------------------- 1 | fn f(x:u1, y:u8, z:u16) = { 2 | {a,b:(u1)}:tup{u1,u1} = tup{x,1} 3 | def {yc, ...c if is{length{c},3}, yc==y} = tup{y,b,y,b,y} 4 | {...d,e,f,g} := c 5 | emit{u32,'all',a,f,g} 6 | } 7 | 8 | def g 9 | def g{...x} = emit{f32, 'g0', ...x} 10 | def g{{a,b}} = emit{f32, 'g1', a, b} 11 | def g{{a,b},c,{d,e}} = emit{f32, 'g2', a, b, c, d, e} 12 | def g{{a,b},a,{b,a}} = emit{f32, 'g3', a, b} 13 | def g{x:T,...{y,T}} = emit{T, 'g4', x, y} 14 | fn nest() = { 15 | emit{f64, 'out', 16 | g{0,1,2}, 17 | g{tup{3,4}}, 18 | g{tup{5,4},3,tup{2,1}}, 19 | g{tup{7,6},7,tup{6,7}}, 20 | g{reinterpret{i16,8},9,i16} 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /test/destruct.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_f u32 3 v0_x u1 v1_y u8 v2_z u16 2 | new v3_b val u1 !1:u1 3 | new v4 emit u32 'all' v0_x v1_y v3_b 4 | ret v4 5 | endFn 6 | 7 | beginFn f1_nest f64 0 8 | new v0 emit f32 'g0' 0 1 2 9 | new v1 emit f32 'g1' 3 4 10 | new v2 emit f32 'g2' 5 4 3 2 1 11 | new v3 emit f32 'g3' 7 6 12 | new v4 emit i16 'g4' !8:i16 9 13 | new v5 emit f64 'out' v0 v1 v2 v3 v4 14 | ret v5 15 | endFn 16 | 17 | -------------------------------------------------------------------------------- /test/each.in: -------------------------------------------------------------------------------- 1 | fn divplusmod() : u32 = { 2 | def t = tup{0,1,2,3,4} 3 | def u = each{{a}=>__mul{3,a}, t} 4 | def v = each{__add, u, tup{1,0,1,0,1}} 5 | apply{bind{emit, u32, 'list'}, v} 6 | } 7 | -------------------------------------------------------------------------------- /test/each.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_divplusmod u32 0 2 
| new v0 emit u32 'list' 1 3 7 9 13 3 | ret v0 4 | endFn 5 | 6 | -------------------------------------------------------------------------------- /test/else.in: -------------------------------------------------------------------------------- 1 | fn fun(i:i64) : i64 = { 2 | if (emit{u1, 'test', i}) { return{1} } 3 | else { return{2} } 4 | 3 5 | } 6 | export{'efn', fun} 7 | -------------------------------------------------------------------------------- /test/else.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_fun i64 1 v0_i i64 2 | new v1 emit u1 'test' v0_i 3 | gotoF v1 l0 4 | ret !1:i64 5 | goto l1 6 | lbl l0 7 | ret !2:i64 8 | lbl l1 9 | ret !3:i64 10 | endFn 11 | 12 | export 'efn' (i64)->i64 $f0_fun 13 | -------------------------------------------------------------------------------- /test/excon.c: -------------------------------------------------------------------------------- 1 | static int32_t si_c0_c_[] = {((int32_t)11ll),((int32_t)10ll),((int32_t)9ll)}; static int32_t* const si_c0_c = si_c0_c_; 2 | 3 | int16_t const num = ((int16_t)12ll); 4 | 5 | int32_t* const arr = si_c0_c; 6 | 7 | -------------------------------------------------------------------------------- /test/excon.in: -------------------------------------------------------------------------------- 1 | export{'num', cast{i16, 12}} 2 | c:__pnt{i32} = tup{11,10,9} 3 | export{'arr', c} 4 | -------------------------------------------------------------------------------- /test/export.in: -------------------------------------------------------------------------------- 1 | fn fun{T}(arg:T) : T = arg 2 | export{'e0', fun{i8}} 3 | export{tup{'e1','e2'}, fun{i16}} 4 | export{'e3', fun{i32}} 5 | -------------------------------------------------------------------------------- /test/export.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_fun_i8 i8 1 v0_arg i8 2 | ret v0_arg 3 | endFn 4 | 5 | beginFn f1_fun_i16 i16 1 v0_arg i16 6 | ret v0_arg 7 | endFn 8 | 9 | beginFn f2_fun_i32 i32 1 v0_arg i32 10 | ret v0_arg 11 | endFn 12 | 13 | export 'e0' (i8)->i8 $f0_fun_i8 14 | export 'e1' (i16)->i16 $f1_fun_i16 15 | export 'e2' (i16)->i16 $f1_fun_i16 16 | export 'e3' (i32)->i32 $f2_fun_i32 17 | -------------------------------------------------------------------------------- /test/fnarr.c: -------------------------------------------------------------------------------- 1 | static uint32_t si_f0_fun_0(uint32_t v0_a, uint32_t v1_b); 2 | static uint32_t si_f1_fun_1(uint32_t v0_a, uint32_t v1_b); 3 | 4 | static uint32_t (*si_c0_fns_[])(uint32_t,uint32_t) = {si_f0_fun_0,si_f1_fun_1}; static uint32_t (**const si_c0_fns)(uint32_t,uint32_t) = si_c0_fns_; 5 | 6 | static uint32_t si_f0_fun_0(uint32_t v0_a, uint32_t v1_b) { 7 | return v0_a; 8 | } 9 | 10 | static uint32_t si_f1_fun_1(uint32_t v0_a, uint32_t v1_b) { 11 | return v1_b; 12 | } 13 | 14 | static uint32_t si_f2_sfn(bool v0_i, uint32_t v1_a, uint32_t v2_b) { 15 | uint32_t (*v3)(uint32_t,uint32_t) = si_c0_fns[v0_i]; 16 | uint32_t v4 = v3(v1_a, v2_b); 17 | return v4; 18 | } 19 | 20 | uint32_t (**const fn_arr)(uint32_t,uint32_t) = si_c0_fns; 21 | 22 | -------------------------------------------------------------------------------- /test/fnarr.in: -------------------------------------------------------------------------------- 1 | fn fun{x}(a:u32, b:u32) = select{tup{a,b},x} 2 | fns:__pnt{fntype{u32, u32, u32}} = tup{fun{0},fun{1}} 3 | 4 | export{'fn_arr', fns} 5 | 6 | include 'arch/c' 7 | fn 
sfn(i:u1, a:u32, b:u32) = load{fns,i}(a,b) 8 | -------------------------------------------------------------------------------- /test/fnrec.in: -------------------------------------------------------------------------------- 1 | include 'arch/c' 2 | include 'skin/c' 3 | 4 | fn fact(x:u8) : u64 = { 5 | if (x <= 1) return{1} 6 | promote{u64,x} * fact(x - 1) 7 | } 8 | -------------------------------------------------------------------------------- /test/fnrec.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_fact u64 1 v0_x u8 2 | new v1 emit u1 'op <=' v0_x !1:u8 3 | gotoF v1 l0 4 | ret !1:u64 5 | lbl l0 6 | new v2 emit u64 '^promote' u64 v0_x 7 | new v3 emit u8 'op -' v0_x !1:u8 8 | new v4 call u64 $f0_fact 1 v3 9 | new v5 emit u64 'op *' v2 v4 10 | ret v5 11 | endFn 12 | 13 | -------------------------------------------------------------------------------- /test/fntup.in: -------------------------------------------------------------------------------- 1 | fn gen{T}(a:T) : u8 = select{a,0} 2 | 3 | fn fun(none:tup{}) = { 4 | x:tup{u8,i32} = tup{4, 1} 5 | y := gen{tup{u8,i32}}(x) 6 | z := gen{tup{u8,type{x}}}(tup{y,x}) 7 | gen{tup{u8}}(tup{z}) 8 | } 9 | -------------------------------------------------------------------------------- /test/fntup.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_fun u8 0 2 | new v0_x val u8 !4:u8 3 | new v1_x val i32 !1:i32 4 | new v2_y call u8 $f1_gen_tupu8i32 2 v0_x v1_x 5 | new v3_z call u8 $f2_gen_tupu8u8i32 3 v2_y v0_x v1_x 6 | new v4 call u8 $f3_gen_tupu8 1 v3_z 7 | ret v4 8 | endFn 9 | 10 | beginFn f1_gen_tupu8i32 u8 2 v0_a u8 v1_a i32 11 | ret v0_a 12 | endFn 13 | 14 | beginFn f2_gen_tupu8u8i32 u8 3 v0_a u8 v1_a u8 v2_a i32 15 | ret v0_a 16 | endFn 17 | 18 | beginFn f3_gen_tupu8 u8 1 v0_a u8 19 | ret v0_a 20 | endFn 21 | 22 | -------------------------------------------------------------------------------- /test/for.in: -------------------------------------------------------------------------------- 1 | include 'skin/c' 2 | include 'arch/c' 3 | 4 | def Size = u64 5 | 6 | def for{vars,begin,end,block} = { 7 | i:Size = begin 8 | while (ii64 $f0_fun 6 | -------------------------------------------------------------------------------- /test/genext.in: -------------------------------------------------------------------------------- 1 | def n{...} = __neg 2 | def a = n 3 | def a{'b0'} = 'fail' 4 | 5 | def b = ({'b0'} => 1) 6 | def b{'b1'} = 2 7 | 8 | def c{...} = b 9 | def a{...} = b 10 | 11 | fn ta() = emit{u8, 'out', ...each{a, tup{3, 'b0', 'b1'}}} 12 | fn tc() = emit{u8, 'out', ...each{c, tup{ 'b0', 'b1'}}} 13 | -------------------------------------------------------------------------------- /test/genext.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_ta u8 0 2 | new v0 emit u8 'out' -3 1 2 3 | ret v0 4 | endFn 5 | 6 | beginFn f1_tc u8 0 7 | new v0 emit u8 'out' 1 2 8 | ret v0 9 | endFn 10 | 11 | -------------------------------------------------------------------------------- /test/goto.c: -------------------------------------------------------------------------------- 1 | static uint8_t si_f0_fun(bool v0_a) { 2 | l0:; 3 | if (!(v0_a)) goto l2; 4 | goto l1; 5 | l2:; 6 | return ((uint8_t)5ull); 7 | goto l0; 8 | if (!(v0_a)) goto l3; 9 | goto l_sym; 10 | l3:; 11 | l_sym:; 12 | l1:; 13 | return ((uint8_t)6ull); 14 | } 15 | 16 | -------------------------------------------------------------------------------- 
/test/goto.in: -------------------------------------------------------------------------------- 1 | fn fun(a:u1) : u8 = { 2 | def w = setlabel{} 3 | def l = makelabel{} 4 | if (a) goto{l} 5 | return{5} 6 | goto{w} 7 | if (a) goto{'sym'} 8 | setlabel{'sym'} 9 | setlabel{l} 10 | 6 11 | } 12 | -------------------------------------------------------------------------------- /test/hello.c: -------------------------------------------------------------------------------- 1 | int main() { 2 | printf("Hello, World!\n"); 3 | } 4 | 5 | -------------------------------------------------------------------------------- /test/hello.in: -------------------------------------------------------------------------------- 1 | include 'debug/printf' 2 | main : void { 3 | lprintf{'Hello, World!'} 4 | } 5 | -------------------------------------------------------------------------------- /test/ifconst.in: -------------------------------------------------------------------------------- 1 | include 'skin/c' 2 | include 'arch/c' 3 | 4 | fn fun(i:i64) : i64 = { 5 | if (1) { i = 2 * i } 6 | if (0) { i = 3 * i } 7 | if (1) { i = i - 1 } 8 | else { i = i - 2 } 9 | if (0) { i = i + 1 } 10 | else { i = i + 2 } 11 | } 12 | -------------------------------------------------------------------------------- /test/ifconst.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_fun i64 1 v0_i i64 2 | mut v0_i emit i64 'op *' !2:i64 v0_i 3 | mut v0_i emit i64 'op -' v0_i !1:i64 4 | mut v0_i emit i64 'op +' v0_i !2:i64 5 | ret v0_i 6 | endFn 7 | 8 | -------------------------------------------------------------------------------- /test/local.in: -------------------------------------------------------------------------------- 1 | c:u8 = 2 2 | def g{a} = 'outer' 3 | local { 4 | local c:u8 = 3 5 | def g{a if is{a,'ext'}} = 'inner' 6 | local def g{a if is{a,'test'}} = 'fail!' 
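# Scoping exercised by this test: the g{'ext'} case extends g for the
# whole file, while the extra local on the g{'test'} case above should
# keep it block-only, so fo at the bottom falls back to 'outer' (compare
# f2_fo in local.ir); the block-local c likewise shadows the outer c only here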
7 | fn fi(x:u8) = { 8 | emit{u1, g{'default'}, x} 9 | emit{u1, g{'ext'}, c} 10 | } 11 | local fn fi(x:u8) : u8 = 0 12 | local export{'locfi', fi} 13 | } 14 | fn fo(y:u8) = { 15 | emit{u1, g{'ext'}, y} 16 | emit{u1, g{'test'}, y} 17 | fi(c) 18 | } 19 | -------------------------------------------------------------------------------- /test/local.ir: -------------------------------------------------------------------------------- 1 | constant $c0_c u8 !2:u8 2 | constant $c1_c u8 !3:u8 3 | 4 | beginFn f0_fi u1 1 v0_x u8 5 | new v1 emit u1 'outer' v0_x 6 | new v2 emit u1 'inner' $c1_c 7 | ret v2 8 | endFn 9 | 10 | beginFn f1_fi u8 1 v0_x u8 11 | ret !0:u8 12 | endFn 13 | 14 | beginFn f2_fo u1 1 v0_y u8 15 | new v1 emit u1 'inner' v0_y 16 | new v2 emit u1 'outer' v0_y 17 | new v3 call u1 $f0_fi 1 $c0_c 18 | ret v3 19 | endFn 20 | 21 | export 'locfi' (u8)->u8 $f1_fi 22 | -------------------------------------------------------------------------------- /test/logic.in: -------------------------------------------------------------------------------- 1 | include 'arch/c' 2 | include 'skin/c' 3 | 4 | fn fun(x:i32) : i32 = { 5 | if ((x>=4 or x==2) and not x>6) return{2*x} 6 | while (x<10 and x!=5) ++x 7 | do --x while ((1 and x>2) or (not (0 or 1))) 8 | x 9 | } 10 | -------------------------------------------------------------------------------- /test/logic.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_fun i32 1 v0_x i32 2 | new v1 emit u1 'op >=' v0_x !4:i32 3 | gotoT v1 l0 4 | new v2 emit u1 'op ==' v0_x !2:i32 5 | gotoF v2 l1 6 | lbl l0 7 | new v3 emit u1 'op >' v0_x !6:i32 8 | gotoT v3 l1 9 | new v4 emit i32 'op *' !2:i32 v0_x 10 | ret v4 11 | lbl l1 12 | lbl l2 13 | new v5 emit u1 'op <' v0_x !10:i32 14 | gotoF v5 l3 15 | new v6 emit u1 'op !=' v0_x !5:i32 16 | gotoF v6 l3 17 | mut v0_x emit i32 'op +' v0_x !1:i32 18 | goto l2 19 | lbl l3 20 | lbl l4 21 | mut v0_x emit i32 'op -' v0_x !1:i32 22 | new v7 emit u1 'op >' v0_x !2:i32 23 | gotoT v7 l4 24 | ret v0_x 25 | endFn 26 | 27 | -------------------------------------------------------------------------------- /test/match.in: -------------------------------------------------------------------------------- 1 | def g = match { 2 | {a,b} => emit{u8, 'g2', a, b} 3 | {...a} => emit{u8, 'g', ...a}; {...b} => emit{u8, '!', ...b}; 4 | } 5 | fn test() = { 6 | emit{f64,'out', 7 | g{0}, 8 | (match{{...any}=>g{...any}}){1,2}, 9 | match (3,1) { {a,b if __lt{a,b}} => emit{u16,'fail',a,b}; 10 | {a,b} => emit{u16,'pass',a,b} } 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /test/match.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_test f64 0 2 | new v0 emit u8 'g' 0 3 | new v1 emit u8 'g2' 1 2 4 | new v2 emit u16 'pass' 3 1 5 | new v3 emit f64 'out' v0 v1 v2 6 | ret v3 7 | endFn 8 | 9 | -------------------------------------------------------------------------------- /test/mfor.in: -------------------------------------------------------------------------------- 1 | include 'arch/c' 2 | 3 | fn fun(x:__pnt{i32}, y:__pnt{u8}) : void = { 4 | def for{vars,begin,end,block} = { 5 | block{0, vars} 6 | } 7 | def istup{t} = is{'tuple', kind{t}} 8 | def load{p, i if istup{p}} = each{{p}=>load{p,i}, p} 9 | def store{p, i, v if istup{p}} = each{{p,v}=>store{p,i,v}, p,v} 10 | @for (a in tup{x,y} over 3) select{a,1} = 2 11 | } 12 | -------------------------------------------------------------------------------- 
/test/mfor.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_fun void 2 v0_x *i32 v1_y *u8 2 | new v2_a emit i32 '^load' v0_x 0 3 | new v3_a emit u8 '^load' v1_y 0 4 | mut v3_a !2:u8 5 | new v4 emit void '^store' v1_y 0 v3_a 6 | endFn 7 | 8 | -------------------------------------------------------------------------------- /test/mut.in: -------------------------------------------------------------------------------- 1 | fn fun(i:i64) : i64 = { 2 | i = emit{i64, 'newvar', i} 3 | if (emit{u1, 'test', i}) { i = emit{i64, 'mutvar', i} } 4 | i 5 | } 6 | export{'efn', fun} 7 | -------------------------------------------------------------------------------- /test/mut.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_fun i64 1 v0_i i64 2 | mut v0_i emit i64 'newvar' v0_i 3 | new v1 emit u1 'test' v0_i 4 | gotoF v1 l0 5 | mut v0_i emit i64 'mutvar' v0_i 6 | lbl l0 7 | ret v0_i 8 | endFn 9 | 10 | export 'efn' (i64)->i64 $f0_fun 11 | -------------------------------------------------------------------------------- /test/oper.in: -------------------------------------------------------------------------------- 1 | def a{b} = 0 2 | oper % a prefix 10 3 | oper & (a) prefix 10 4 | 5 | def outer = tup{%4, &4} 6 | local { 7 | def a{b} = 1 8 | def inner = tup{%4, &4} 9 | } 10 | 11 | c:__pnt{u8} = merge{outer, inner} 12 | -------------------------------------------------------------------------------- /test/oper.ir: -------------------------------------------------------------------------------- 1 | constant $c0_c *u8 !tup{!0:u8,!0:u8,!1:u8,!0:u8}:*u8 2 | 3 | -------------------------------------------------------------------------------- /test/oppar.in: -------------------------------------------------------------------------------- 1 | oper $ gen prefix 50.1 2 | oper $ gen infix none 50.1 3 | 4 | def gen{a}{b,c} = __mul{a,__sub{b,c}} 5 | def gen{a,b}{c} = __add{a,__mul{b,c}} 6 | 7 | fn fun() : f64 = { 20 ${3} ${4,5} 2 } 8 | 9 | fn fi() : f64 = { 10 | def o = ${4,5} 11 | (${3}){20, o{2}} 12 | } 13 | -------------------------------------------------------------------------------- /test/oppar.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_fun f64 0 2 | ret !18:f64 3 | endFn 4 | 5 | beginFn f1_fi f64 0 6 | ret !18:f64 7 | endFn 8 | 9 | -------------------------------------------------------------------------------- /test/partial.in: -------------------------------------------------------------------------------- 1 | fn f0() = emit{u32, 'out', 0, ..., 4, .}{..., 2, ., .}{1, ., 5}{3, ...}{} 2 | fn f1() = emit{u32, 'out', ., ...tup{1,2,3}, 4, ...}{0, 5, 6} 3 | -------------------------------------------------------------------------------- /test/partial.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_f0 u32 0 2 | new v0 emit u32 'out' 0 1 2 3 4 5 3 | ret v0 4 | endFn 5 | 6 | beginFn f1_f1 u32 0 7 | new v0 emit u32 'out' 0 1 2 3 4 5 6 8 | ret v0 9 | endFn 10 | 11 | -------------------------------------------------------------------------------- /test/proto.c: -------------------------------------------------------------------------------- 1 | static int32_t si_f1_fun_1(int32_t v0_x); 2 | 3 | static int32_t si_f0_x(int32_t v0_a) { 4 | int32_t v1 = si_f1_fun_1(v0_a); 5 | return v1; 6 | } 7 | 8 | static int32_t si_f1_fun_1(int32_t v0_x) { 9 | int32_t v1 = oper(v0_x, 1); 10 | return v1; 11 | } 12 | 13 | int32_t (*const 
x)(int32_t) = si_f0_x; 14 | 15 | -------------------------------------------------------------------------------- /test/proto.in: -------------------------------------------------------------------------------- 1 | fn fun{o}(x:i32) = { 2 | emit{i32, 'oper', x,o} 3 | } 4 | 5 | fn x(a:i32) : i32 = { 6 | call{fun{1},a} 7 | } 8 | export{'x', x} 9 | -------------------------------------------------------------------------------- /test/qual.in: -------------------------------------------------------------------------------- 1 | def g{x if isfloat {x}} = 0 2 | def g{x if isint {x}} = 1 3 | def g{x if issigned{x}} = 2 4 | 5 | fn fun(a:i64, b:u1, c:f32) : u8 = { 6 | emit{u8, 'out', g{a}, g{b}, g{c}} 7 | } 8 | -------------------------------------------------------------------------------- /test/qual.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_fun u8 3 v0_a i64 v1_b u1 v2_c f32 2 | new v3 emit u8 'out' 2 1 0 3 | ret v3 4 | endFn 5 | 6 | -------------------------------------------------------------------------------- /test/run: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bqn 2 | 3 | cpu ← •Import "../arch.bqn" 4 | out ← ⟨•Out,•term.ErrRaw•ToUTF8∾(@+10)˙,!∘0,⊢⟩ 5 | par ← ⟨cpu‿⟨⟩‿⟨⟩‿out, 0‿cpu‿"si"‿out⟩ 6 | steps ← {""⊸𝕏}⌾(1⊸⊑) par •Import¨ "../singeli.bqn"‿"../emit_c.bqn" 7 | 8 | n ← ≠types ← "in"‿"ir"‿"c" 9 | files ← •file.List "." 10 | type‿name ← types⊸⊐⌾⊑ <˘⍉> (2∾˜·∨`⌾⌽'.'⊸=)⊸⊔¨ files 11 | files‿type‿name (⍋name≍˘type)⊸⊏¨↩ 12 | 13 | Test ← { 14 | l ← 1-˜≠ t ← 𝕨 (0<⊣)◶⟨•file.At⊢,•file.Chars⊢⟩¨ 𝕩 15 | m ← (1↓t) ≡¨ t {𝕎⎊@𝕩}´⟜⌽¨○(l⊸↑) (¯1+`n↑/⁼𝕨) ⊔ steps 16 | (¬m) / 2↕𝕩 17 | } 18 | gr ← ⊐∘⊣⌾((type [dump truncated here: the rest of run, plus the spread, tup, uload, undefined, varpar, and voidfn tests listed in the file tree, is missing; only the tail of a final export line survives] void $f0_fun 6 | -------------------------------------------------------------------------------- /test/vtype.c: -------------------------------------------------------------------------------- 1 | static bool si_f0_fun(__m64 v0_a, __m128i v1_b, __m256i v2_c, __m128d v3_d, __m256 v4_e) { 2 | return ((bool)0ull); 3 | } 4 | 5 | -------------------------------------------------------------------------------- /test/vtype.in: -------------------------------------------------------------------------------- 1 | fn fun(a:[2]i32, b:[4]u32, c:[256]u1, d:[2]f64, e:[8]f32) : u1 = { 0 } 2 | --------------------------------------------------------------------------------
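A closing note on the vtype pair above: it pins down how emit_c chooses x86 vector types, namely by total bit width and element class rather than by element count, so [2]i32 (64 bits) emits as __m64, [4]u32 as __m128i, [256]u1 (a 256-bit boolean vector carried in an integer register) as __m256i, [2]f64 as __m128d, and [8]f32 as __m256. If that reading is right, any two integer vectors of equal total width should emit as the same C type. The following is a hypothetical extension of the test, not part of the suite:

    # Hypothetical: [16]u8 and [8]u16 are both 128-bit integer vectors,
    # so both parameters would be expected to emit as __m128i.
    fn pair(a:[16]u8, b:[8]u16) : u1 = { 0 }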