├── LICENSE
├── README.md
├── arch.bqn
├── archutil.bqn
├── data
│   ├── armv8_ext.txt
│   ├── iintrinsic.bqn
│   ├── rv_ext.txt
│   ├── x86_ext.txt
│   └── x86_strict_ext.txt
├── doc
│   ├── compiler.md
│   ├── interpreter.md
│   ├── minfilter.md
│   └── permutations.md
├── emit_c.bqn
├── float2.bqn
├── include
│   ├── README.md
│   ├── arch
│   │   ├── c.singeli
│   │   ├── iintrinsic
│   │   │   ├── basic.singeli
│   │   │   ├── basic_impl.singeli
│   │   │   ├── basic_strict.singeli
│   │   │   ├── misc.singeli
│   │   │   └── select.singeli
│   │   └── neon_intrin
│   │       ├── basic.singeli
│   │       └── select.singeli
│   ├── clib
│   │   └── malloc.singeli
│   ├── debug
│   │   └── printf.singeli
│   ├── skin
│   │   ├── c.singeli
│   │   ├── cext.singeli
│   │   ├── cmut.singeli
│   │   └── cop.singeli
│   └── util
│       ├── for.singeli
│       ├── functionize.singeli
│       ├── kind.singeli
│       ├── perv.singeli
│       └── tup.singeli
├── ir.bqn
├── singeli
├── singeli.bqn
└── test
    ├── README.md
    ├── alias.c
    ├── alias.in
    ├── alias.ir
    ├── anon.in
    ├── anon.ir
    ├── apply.in
    ├── apply.ir
    ├── arch
    │   ├── .gitignore
    │   ├── base.singeli
    │   ├── general
    │   │   ├── broadcast-sel.singeli
    │   │   ├── imm-shuffle-select.singeli
    │   │   └── makefile
    │   ├── simd.singeli
    │   └── to-c-args.bqn
    ├── blockmut.in
    ├── blockmut.ir
    ├── call.c
    ├── call.in
    ├── cond.in
    ├── cond.ir
    ├── const.in
    ├── const.ir
    ├── destruct.in
    ├── destruct.ir
    ├── each.in
    ├── each.ir
    ├── else.in
    ├── else.ir
    ├── excon.c
    ├── excon.in
    ├── export.in
    ├── export.ir
    ├── fnarr.c
    ├── fnarr.in
    ├── fnrec.in
    ├── fnrec.ir
    ├── fntup.in
    ├── fntup.ir
    ├── for.in
    ├── for.ir
    ├── forin.in
    ├── forin.ir
    ├── fortup.in
    ├── fortup.ir
    ├── fun.in
    ├── fun.ir
    ├── genext.in
    ├── genext.ir
    ├── goto.c
    ├── goto.in
    ├── hello.c
    ├── hello.in
    ├── ifconst.in
    ├── ifconst.ir
    ├── local.in
    ├── local.ir
    ├── logic.in
    ├── logic.ir
    ├── match.in
    ├── match.ir
    ├── mfor.in
    ├── mfor.ir
    ├── mut.in
    ├── mut.ir
    ├── oper.in
    ├── oper.ir
    ├── oppar.in
    ├── oppar.ir
    ├── partial.in
    ├── partial.ir
    ├── proto.c
    ├── proto.in
    ├── qual.in
    ├── qual.ir
    ├── run
    ├── spread.in
    ├── spread.ir
    ├── tup.in
    ├── tup.ir
    ├── uload.in
    ├── uload.ir
    ├── undefined.c
    ├── undefined.in
    ├── varpar.in
    ├── varpar.ir
    ├── voidfn.in
    ├── voidfn.ir
    ├── vtype.c
    └── vtype.in
/LICENSE: -------------------------------------------------------------------------------- 1 | ISC License 2 | 3 | Copyright (c) 2021, Marshall Lochbaum 4 | 5 | Permission to use, copy, modify, and/or distribute this software for any 6 | purpose with or without fee is hereby granted, provided that the above 7 | copyright notice and this permission notice appear in all copies. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 | MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
16 | -------------------------------------------------------------------------------- /arch.bqn: -------------------------------------------------------------------------------- 1 | # CPU architecture namespace, based on provided feature set argfeats 2 | argfeats‿infer ← ⋈⟜0⍟(0=≠) •args 3 | 4 | arches‿ReadDeps‿ReadNative‿ToUpper ← •Import "archutil.bqn" 5 | 6 | # Process feature flags and detect architecture family 7 | allfeats ← ∾ archfeats ← {𝕩.feats}¨ arches 8 | AllNat ← ∊⟜allfeats⊸/ ReadNative 9 | MaybeNat ← "NATIVE"⊸≡¨ (¬⊸/∾AllNat)⍟((0=≠)⊸(∨´)⊣) ⊢ 10 | arg ← ⟨"NONE"⟩⊸≢◶⟨⟩‿MaybeNat ToUpper argfeats 11 | all ← "ALL"⊸≡¨ arg 12 | !∘(∾"Unknown features:"<⊸∾' '∾¨/⟜arg)⍟(∨´) ¬all∨arg∊allfeats 13 | "Incompatible features" ! ∨´ supp ← (all¬⊸/arg)⊸(∧´∊)¨ archfeats 14 | archDesc ← ⊑ supp / arches 15 | width‿VecType‿header ⇐ ⟨aname⇐name⟩ ← archDesc 16 | 17 | # Parse dependencies for the chosen architecture 18 | ⟨feats, mat⟩ ⇐ { 19 | TSort ← {{𝕊⍟(𝕩<○≠⊢)⟜(𝕩∾·/𝕨⊸<)𝕨∨∧´∘⊏⟜𝕨¨p}⟜/0¨p←𝕩} # Topological sort 20 | # d is a list of extension dependency chains 21 | u ← ⍷∾ d ← ReadDeps archDesc.ExtFile infer 22 | u ⊏˜↩ TSort ∾¨ (⊢⊔○∾(¯1↓↑)¨) ("=⌜⊢)⊸⊔𝕩 13 | d←+`tt←1-(+˜⊸+´'/'=0‿¯1⊸⊏)¨tags # Tag type: ¯1 close, 0 void, 1 open 14 | tp←(⍋⊏⟜d)⊸⊏∘/˘ 1‿¯1=⌜tt # Tag pairs 15 | ! (∧`' '⊸≠)⊸/¨⊸≡⟜(1⊸↓¨)˝tp⊏tags # Tag matching 16 | oi←(0·≠`'"'⊸=)⊸⊔𝕩 23 | ⟨name, >(E˝·∨`"="""=⌜⊢)⊸⊔¨a⟩ 24 | } 25 | 26 | #⌜ 27 | # Now process the contents 28 | isaList ← "SSE"‿"SSE2"‿"SSE3"‿"SSSE3"‿"SSE4.1"‿"SSE4.2"‿"AVX"‿"AVX2"‿"FMA" 29 | 30 | names‿GetCont‿GetVoid‿svml ← { 31 | parent‿open‿cont ← ParseXml xml 32 | findOpen ← {(⍷𝕩)⊸⊐⊏(⊔⊐𝕩)˙} (∧`' '⊸≠)⊸/¨ open 33 | _on_ ← {𝔽○((∾FindOpen𝕘)⊸⊏)} 34 | child ← ⊔ parent 35 | intr ⇐ cont ∊⟜isaList⊸/_on_⟨"CPUID"⟩ parent 36 | IG ← intr⊏⊔ 37 | GetCont ⇐ { parent IG _on_𝕩 cont } 38 | GetVoid ⇐ { parent IG⟜((¯1⊑·ParseAttr ¯1⊸↓)¨)_on_𝕩 open } 39 | at ← (1⊑ParseAttr)¨ intr⊏open 40 | names ⇐ ≡¨⟜(<"name")⊸(⊑∘/)˝∘⍉¨ at 41 | svml ⇐ (⊑"tech"‿"SVML"∊⊢)¨ at 42 | } 43 | 44 | ProcType ← { 45 | IsDig ← 1=0‿10⍋-⟜'0' 46 | Nat ← 10⊸×⊸+˜´∘⌽ -⟜'0' 47 | Num ← {𝕊⁼:•Repr𝕩; (0<≠)◶1‿Nat IsDig⊸/𝕩} 48 | t‿n‿e ← ((1⊏˘𝕩)∾<"")⊏˜(⊏˘𝕩)⊐"type"‿"varname"‿"etype" 49 | pre ← "" 50 | t ↩ " const" {c←𝕨≡(-≠𝕨)↑𝕩⋄pre∾↩c⊏"&*"⋄(-c×≠𝕨)↓𝕩}⟜((-1+' '=¯2⊸⊑)⊸↓)⍟('*'≡¯1⊸⊑) t 51 | {𝕤⋄pre‿t↩"IMM"‿""}⍟("IMM"⊸≡) e 52 | EP ← (∾·((⥊¨"uifbm")⊑˜"UI"‿"SI"‿"FP"‿"M"‿"MASK"⊸⊐⌾<)⌾⊑IsDig⊸⊔) e˙ 53 | tp‿act ← <˘⍉∘‿2⥊⟨ 54 | "void" , ⊢ 55 | "int" , "i32" 56 | "float" , "f32" 57 | "double" , "f64" 58 | "__m" , (∊⟜"bm"⌾<⊑∘⊢)◶⟨("["∾"]"∾˜÷⌾Num)∾⊢, ⊏∘⊢∾·IsDig⊸/⊣⟩⟜EP 59 | "" , ⊢ 60 | ⟩ 61 | act ∾↩ ⟨EP⟩ # Various integer types 62 | ⟨n, pre∾(tp⊸⊐⌾<(∧`∘¬IsDig)⊸/)◶act t⟩ 63 | } 64 | 65 | proto ← (¯1↓⍟(""‿"void"≡⊑)ProcType¨)¨ GetVoid "return"‿"parameter" 66 | cpuid‿cat ← GetCont∘(⥊<)¨ "CPUID"‿"category" 67 | cpuid ↩ isaList ⊐ ⊑¨cpuid 68 | #instrs ← 0‿1⊸⊑¨¨ GetVoid ⟨"instruction"⟩ # x86 instruction name 69 | 70 | #⌜ 71 | # Singeli-specific adjustments; try to fix wrong signedness 72 | _seg ← {(𝔽·+`'_'⊸=)⊸/} 73 | GetSname ← 3⊸=_seg⊸(∾˜)⍟("_mask"⊸≡)⟜(2⊸=_seg) 74 | # Exclude intrinsics covered by basic.singeli 75 | excl ← E˜∘=⟜' '⊸⊔"setr set set1 loadu load storeu store extract insert and or xor andnot add sub adds subs min max mullo mul slli srai srli sll sra srl sllv srav srlv cmpeq cmpgt cmp cmpneq cmpge cmplt cmple div sqrt floor ceil round abs sign avg shuffle shufflehi shufflelo permute permutevar permute2f128 permute2x128 permute4x64 permute8x32 permutevar8x32 unpacklo unpackhi bslli bsrli alignr blend blendv" 76 | incl ← E˜∘=⟜' '⊸⊔"_mm_cmp_pd _mm_cmp_ps _mm256_set_m128 
_mm256_set_m128d _mm256_set_m128i _mm256_setr_m128 _mm256_setr_m128d _mm256_setr_m128i _mm_mul_epu32 _mm_mul_epi32 _mm256_mul_epi32 _mm256_mul_epu32" 77 | filter ← ∧´ ⟨ 78 | (names∊incl) ∨ ¬((1↓GetSname)¨ names)∊excl 79 | ¬ ∨˝"_ss"‿"_sd"‿"1"(⊣≡-∘≠⊸↑)⌜names 80 | svml < cpuid<≠isaList 81 | ¬ (∨´·("&b"∧´∘∊⊢)¨1⊑¨⊢)¨ proto 82 | (⊑'b'∊1⊑⊑)◶⟨1,⊑·(⊏∊1⊸↓)1⊑¨⊢⟩¨ proto 83 | ⟩ 84 | names‿proto‿cpuid‿cat filter⊸/¨↩ 85 | 86 | sname ← GetSname¨ names 87 | 88 | pi ← ("pi"≡¯2↑·(¬·∨`'0'⊸≤∧'9'⊸≥)⊸/(∧`⌾⌽'_'⊸≠)⊸/)¨names 89 | pi ∧↩ ¬sname∊"_mullo"‿"_srl"‿"_srli"‿"_srlv"‿"_abs" 90 | proto ((-´"iu")×(<<0‿1)×'u'=⊢)⊸+⌾(pi⊸/)↩ 91 | 92 | id ← ⊐ sname ≍˘ 1↓¨proto 93 | Disamb ← { U←{¬∧´∊𝕩} ⋄ 𝕩 2⊸<_seg¨⊸(∾¨˜⍟U˜)⍟U ("_"∾·¬∘∊⟜"[]"⊸/1⊑⊑)¨𝕨 } 94 | sname ∾¨↩ proto (1<≠∘⊢)◶⟨""¨,Disamb⟩¨⌾(id⊸⊔) names 95 | 96 | #⌜ 97 | # Format as Singeli definition 98 | MakeDef ← {instr 𝕊 prot: 99 | # Utilities 100 | Br←"{"∾∾⟜"}" ⋄ A←∾⟜Br 101 | Int←⥊<⊸(≍˘) ⋄ J←∾1↓Int 102 | # Type handling and formatting 103 | pn‿pt ← <˘⍉> prot 104 | an←1↓pn ⋄ ! ∊⟜""‿"dst"‿"k"⌾< ⊑pn 105 | pp ← "*&"∊˜⊑¨pt 106 | rt‿at ← (⊑⋈1⊸↓) pp↓¨pt 107 | ri ← "void"⊸≢◶⟨≠at, at⊸⊐⌾<⟩ rt 108 | iv ← "bm"∊˜⊑¨at ⋄ vf ← "IMM"⊸≢¨ at 109 | c‿i ← iv(<⋈∧)vf∧∊at # c for type constant, i for intvec 110 | uf ← vf ∧ (ri=↕∘≠)⊸∨ iv ∨ ¬(∊∧∊⌾⌽)at # Which types are named 111 | tn ← uf⥊¨'T'- 1-˜1↓⊐0∾uf×1+⊐at 112 | vt ← at (0<≠∘⊣)◶⟨"("∾")"∾˜⊢,∾⟜"=="⊸∾⟩¨˜⌾(c/⊢) tn 113 | par ← an ∾¨ vf":"⊸∾⍟⊣¨ (1↓pp)"*"⊸∾⍟⊣¨ vt 114 | conds ← (tn {"intvec"A∾⟨1↓𝕩,",",𝕨⟩}¨○(i/⊢) at) ∾ "num"⊸A¨ vf¬⊸/an 115 | rt ↩ "__pnt"⊸A⍟(⊑pp) (≠at)⊸=◶⟨⊑⟜tn, rt⟩ ri 116 | ∾⟨ 117 | Br ∾ 1↓ (", " Int par) ∾ " if "⌾⊑⍟(0<≠) " and " Int conds 118 | " = emit" 119 | Br ", "J ⟨rt, "'"(∾∾⊣)instr⟩ ∾ an 120 | ⟩ 121 | } 122 | defs ← cpuid ⊔○((⍋cat)⊸⊏) sname {∾"def _"‿𝕨‿𝕩}¨ names MakeDef¨ proto 123 | •Out 1⌽" 124 | local { 125 | def intvec{w,T} = 0 126 | def intvec{(width{V}),V=[_]T if isint{T}} = 1 127 | def num{T} = is{'number',kind{T}} 128 | }" 129 | •Out¨ ∾ 1↓⥊(<2⥊<⟨⟩) ∾˘ ("#"⊸∾¨isaList) <⊸≍˘ defs 130 | -------------------------------------------------------------------------------- /data/rv_ext.txt: -------------------------------------------------------------------------------- 1 | ZAAMO RVA 2 | ZALRSC RVA 3 | ZBA RVB 4 | ZBB RVB 5 | ZBS RVB 6 | RVF RVD 7 | RVA RVG 8 | RVD RVG 9 | RVI RVG 10 | RVM RVG 11 | ZICNTR RVG 12 | ZIFENCEI RVG 13 | RVS RVH 14 | RVD RVQ 15 | RVU 16 | RVV 17 | ZA128RS 18 | ZA64RS 19 | ZAAMO ZABHA 20 | ZAAMO ZACAS 21 | ZALASR 22 | ZAMA16B 23 | ZAWRS 24 | ZBC 25 | RVD ZCD 26 | ZCA ZCE 27 | ZCB ZCE 28 | ZCMP ZCE 29 | ZCMT ZCE 30 | RVC ZCMOP 31 | ZCA ZCMT 32 | ZCB ZCMT 33 | ZCMP ZCMT 34 | ZCMT ZCMT 35 | RVF ZFA 36 | RVF ZFBFMIN 37 | ZFH ZFBFMIN 38 | RVF ZFHMIN 39 | ZHINX 40 | ZIC64B 41 | ZICBOM 42 | ZICBOP 43 | ZICBOZ 44 | ZICCAMOA 45 | ZICCAMOC 46 | ZICCIF 47 | ZICCLSM 48 | ZICCRSE 49 | ZICFILP 50 | ZICFISS 51 | ZICSR ZICNTR 52 | ZICOND 53 | ZIHINTNTL 54 | ZIHINTPAUSE 55 | SMHPM ZIHPM 56 | ZIMOP 57 | ZKN ZK 58 | ZKR ZK 59 | ZKT ZK 60 | ZBKB ZKN 61 | ZBKC ZKN 62 | ZBKX ZKN 63 | ZKND ZKN 64 | ZKNE ZKN 65 | ZKNH ZKN 66 | ZBKB ZKS 67 | ZBKC ZKS 68 | ZBKX ZKS 69 | ZKND ZKS 70 | ZKNE ZKS 71 | ZKNH ZKS 72 | ZKSED 73 | ZKSH 74 | ZMMUL 75 | ZVKB ZVBB 76 | ZFBFMIN ZVFBFWMA 77 | ZVFBFMIN ZVFBFWMA 78 | ZFHMIN ZVFH 79 | ZVE32F ZVFH 80 | ZVE32F ZVFHMIN 81 | ZVKB ZVKN 82 | ZVKNED ZVKN 83 | ZVKNHB ZVKN 84 | ZVKT ZVKN 85 | ZVBC ZVKNC 86 | ZVKN ZVKNC 87 | ZVKG ZVKNG 88 | ZVKN ZVKNG 89 | ZVKNHA ZVKNHB 90 | ZVKB ZVKS 91 | ZVKSED ZVKS 92 | ZVKSH ZVKS 93 | ZVKT ZVKS 94 | ZVBC ZVKSC 95 | ZVKS ZVKSC 96 | ZVKG ZVKSG 97 | ZVKS ZVKSG 98 | 
-------------------------------------------------------------------------------- /data/x86_ext.txt: -------------------------------------------------------------------------------- 1 | SSE2 X86_64 2 | MMX SSE SSE2 SSE3 POPCNT SSE4A FMA4 3 | SSE3 SSSE3 SSE4.1 SSE4.2 4 | POPCNT SSE4.2 PCLMUL AVX FMA FMA4 5 | PCLMUL GFNI AVX512VBMI2 AVX512BITALG AVX512VP2INTERSECT 6 | AVX LZCNT BMI AVX2 BMI2 AVX512F 7 | FMA AVX512F AVX512CD AVX512VL AVX512BW AVX512DQ AVX512VBMI AVX512IFMA AVX512VBMI2 8 | FMA VPCLMULQDQ AVX512VBMI2 9 | BMI2 VPCLMULQDQ 10 | AVX512CD AVX512ER AVX512PF AVX5124VNNIW AVX5124FMAPS 11 | AVX512CD AVX512VPOPCNTDQ AVX5124VNNIW 12 | AVX512DQ AVX512VNNI AVX512VBMI2 13 | AVX512VPOPCNTDQ AVX512VBMI2 14 | -------------------------------------------------------------------------------- /data/x86_strict_ext.txt: -------------------------------------------------------------------------------- 1 | SSE2 X86_64 2 | MMX SSE SSE2 SSE3 SSSE3 SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512VL 3 | SSE2 PCLMUL VPCLMULQDQ 4 | SSE2 GFNI 5 | SSE3 SSE4A FMA4 6 | AVX FMA4 7 | AVX FMA AVX512F AVX512BW AVX512VBMI 8 | AVX VPCLMULQDQ 9 | AVX512F AVX512DQ 10 | AVX512F AVX512CD 11 | AVX512F AVX512ER 12 | AVX512F AVX512PF 13 | AVX512F AVX512IFMA 14 | AVX512F AVX512VPOPCNTDQ 15 | AVX512F AVX512VNNI 16 | AVX512F AVX512VP2INTERSECT 17 | AVX512BW AVX512VBMI2 18 | AVX512BW AVX512BITALG 19 | POPCNT 20 | LZCNT 21 | BMI 22 | BMI2 23 | AVX5124VNNIW 24 | AVX5124FMAPS 25 | -------------------------------------------------------------------------------- /doc/compiler.md: -------------------------------------------------------------------------------- 1 | # Singeli is a macro-oriented compiler 2 | 3 | Using Singeli for your next project? Yeah, it's all right. The Rustacean numbskulls haven't gotten to it yet, so it's C-like and actually useful. Well, there's some functional mumbo-jumbo buried in there, but it stays out of the way most of the time. 4 | 5 | include 'debug/printf' 6 | main : void { 7 | lprintf{'Hello, World!'} # Print with newline 8 | } 9 | 10 | There's the classic to start off with. You compile to C with `singeli hello.singeli -o hello.c`, then compile and run that. Ugly but at least gcc/clang generate decent code, next best thing to doing the registers by hand. Now, `main` here is special syntax and not a function, and you usually integrate Singeli with an existing codebase, so here's how you get a function you can call from C: 11 | 12 | include 'debug/printf' 13 | 14 | fn hello() : void = { 15 | lprintf{'Hello, World!'} 16 | } 17 | 18 | export{'hello', hello} 19 | 20 | You can figure out the C stub that calls `hello()` to test it I'm sure. And `singeli -h` for compilation options, I won't bore you with the details. 21 | 22 | You'll notice that `lprintf{}` doesn't use parentheses like a function call. You do call functions with parens, but `lprintf` isn't a function. We'll get to that. And you'll notice that printing is classified as a debugging tool where it belongs. Singeli is for programming: data goes in, data comes out. Text processing? Sure, text is data. But the things in quotes aren't strings: they're called symbols and they're used for name-ish stuff that only exists at compile time. `lprintf{}` takes them because it's convenient. 23 | 24 | The curly brace calls. They're an eyesore but there's a reason. A function has a defined type signature, and printing just takes whatever pile of junk you give it. Even in C it's some kind of special function. Basically, the braces tell you that `lprintf{}` is a macro. 
It'll generate some code but you don't know what. So it's officially called a generator. I just say macro. Oh, code's in [include/debug/printf.singeli](../include/debug/printf.singeli) for your gut-viewing pleasure. 25 | 26 | Let's at least pretend to get some work done. Reverse a string—Pascal-style, you've got to admit C's made some mistakes. 27 | 28 | include 'skin/c' 29 | include 'arch/c' 30 | 31 | fn reverse(str:*u8, len:u64) : void = { 32 | i:u64 = 0 33 | while (i < len) { 34 | --len 35 | c := load{str, i} 36 | store{str, i, load{str, len}} 37 | store{str, len, c} 38 | ++i 39 | } 40 | } 41 | 42 | Now we are getting somewhere, `load{}` and `store{}` aside. You need skin/c and arch/c to do anything: in a fit of overengineering the authors have decided maybe you'd want something other than C operators and backend. `*u8` is a pointer. You can load and store at any index, and cast it to other pointer types. Better compile C with `-fno-strict-aliasing`, by the way. And Singeli only has prefix operators so you have `--len` to decrement but no `len--`. 43 | 44 | Singeli's string handling sucks, so to test this out we're going to call C functions directly. This is how the libraries like debug/printf and arch/c are implemented. `emit{}` is a built-in that takes a result type and function name (or operator, with `'op +'` or similar) and calls the C function directly. It outputs symbols verbatim, which is what lets me jam a string in with `'"%s\n"'`, but it might not work that way forever. And `require{}` gets a C header. Since it's a macro you can call it anywhere, like inside `main` or another macro you run. Requiring the same header many times is fine; it'll only generate one `#include`. 45 | 46 | require{'stdio.h', 'string.h'} 47 | main(argc, argv) { 48 | arg := load{argv, 1} 49 | reverse(arg, emit{u64, 'strlen', arg}) 50 | emit{void, 'printf', '"%s\n"', arg} 51 | } 52 | 53 | Call this with `./a.out sometext` and it prints out the reversed text. And now to deal with this `load` and `store` junk. As you may have guessed, Singeli doesn't support `array[index]` syntax, and it doesn't really have a concept of lvalues either. But there's a library [skin/cext](../include/skin/cext.singeli) that defines some extra non-C operators, mainly for dealing with pointers and casting (which we'll see later; Singeli's anal about types). Now the syntax is `array->index` to load, same as a C struct pointer, and `array <-{index} value` to store, where the `{index}` part is optional. So this is kind of tolerable. 54 | 55 | include 'skin/c' 56 | include 'skin/cext' 57 | include 'arch/c' 58 | 59 | fn reverse(str:*u8, len:u64) : void = { 60 | i:u64 = 0 61 | while (i < len) { 62 | --len 63 | c := str->i 64 | str <-{i} str->len 65 | str <-{len} c 66 | ++i 67 | } 68 | } 69 | 70 | require{'stdio.h', 'string.h'} 71 | main(argc, argv) { 72 | arg := load{argv, 1} 73 | reverse(arg, emit{u64, 'strlen', arg}) 74 | emit{void, 'printf', '"%s\n"', arg} 75 | } 76 | 77 | I'm smarter than a Gopher and don't like to spend all my time writing reverse functions, so if I have a codebase with multiple types I want my `reverse` to be generic. Generated, even. So I add a type parameter `{T}`, and call this with `reverse{u8}(str, len)`. Whenever `reverse` is called on a type it hasn't seen before, it generates a new function for that type (has to be a type, because of the `*T` in a type signature). Then it reuses that function if the same type comes up again—this is a special feature for generic functions and not other macros. 
78 | 79 | fn reverse{T}(vec:*T, len:u64) : void = { 80 | i:u64 = 0 81 | while (i < len) { 82 | --len 83 | c := vec->i 84 | vec <-{i} vec->len 85 | vec <-{len} c 86 | ++i 87 | } 88 | } 89 | 90 | Of course, if I'd used `u8` inside the function, I'd need to replace those with `T` too. One reason I don't need to do this is that the declaration `c := vec->i` gets the type from the expression, so it's the same as `c:T = vec->i`. 91 | 92 | ## SIMD 93 | 94 | You're probably here for the vector processing stuff. Is this going to save me from `__m256d v = _mm256_fmsubadd_pd...` on every line of the program? The convenience of the C++ packages without Bjarne's head games? Ha, as a programmer you'd better learn to accept the head games, but these ones can largely be shuffled off to an `include` file. Vectorizing that reverse function will take us through the basics. 95 | 96 | Real built-in vector support would apparently harsh Singeli's minimalist vibe, so all you get out of the box are vector types, written like `[16]i8`. Which is better than C's `__m128` for every integer because now `a+b` has a clear meaning. Oh, and it knows which vector extensions exist, so you can test whether your target architecture supports one with `hasarch{'SSSE3'}`. Here I'm going to use x86 with vector extensions up to SSSE3 (released in 2006, yes you have it, unless you're on ARM). By default, Singeli picks up architecture flags from the current CPU to compile for native execution. Or you can specify with `-a SSSE3`, although if you're not on x86 of course you've got no way to run the output C code. 97 | 98 | To make use of my `[16]i8`s instead of leaving them to sit around and look pretty I need some definitions, which will compile to C intrinsics. There are two libraries for these right now. [arch/iintrinsic/basic](../include/README.md#simd-basics) is a curated set of "nice" operations like load, store, and arithmetic, and arch/iintrinsic/misc is a dump of the rest (iintrinsic is "intel intrinsics", which is the target the same way C is for arch/c). I only need one macro from misc, so I'm just going to copy it over. 99 | 100 | include 'arch/iintrinsic/basic' 101 | def shuffle{a:T==[16]i8, b:T} = emit{T, '_mm_shuffle_epi8', a, b} 102 | 103 | EDIT: That was good to build character and all, but now there's a [usable wrapper](../include/README.md#simd-selection) for shuffling (blending too, eh, let's ignore it because the instructions weren't added until after SSSE3), so I'll just patch this in: 104 | 105 | include 'arch/iintrinsic/select' 106 | def shuffle{a:T==[16]i8, b:T} = vec_shuffle{a, b} 107 | 108 | Looks like `#define`, but these `def` macros are smart: you can check compile-time conditions to decide whether it applies. If not it'll try the previous definition if any, meaning it's an overload. `shuffle` doesn't overload anything, so just errors if `a` and `b` don't have type `T` which is `[16]i8`. On the other hand, there's something we do want to overload: 109 | 110 | fn reverse{T==i8 if hasarch{'SSSE3'}}(arr:*T, len:u64) : void = { 111 | def V = [16]T 112 | r := vec_make{V, 15 - range{16}} 113 | av := *V~~arr 114 | av <- shuffle{av->0, r} 115 | } 116 | 117 | Not a full implementation—for now it's ignoring `len` and reversing 16 elements. But there are a few new things, besides the conditions added to `reverse`. The macro `def V = [16]T` is basically a typedef. Macros are scoped so that it only applies inside `reverse`. 
The casting operator `~~` is defined by skin/cext as `reinterpret`, which converts between types of the same width. `range{16}` gives the integers from 0 to 15 inclusive, and I subtract from 15 to reverse the order. All this happens at compile time, and `-` working on a list is showing some APL influence. Somehow we got one of the good bits here. 118 | 119 | And it makes sense that `-` should be able to act on multiple numbers at compile time because (with arch/iintrinsic/basic) it applies to vectors at runtime. Instead of `vec_make{V, 15 - range{16}}` it could be `vec_broadcast{V, 15} - vec_make{V, range{16}}`. These two vector-building macros come from arch/iintrinsic/basic, and if you haven't heard of it, "broadcasting" is one name for spreading a single value to all elements of a vector. A better use of vector arithmetic is to extend `reverse` to deal with 16 elements or less: 120 | 121 | fn reverse{T==i8 if hasarch{'SSSE3'}}(arr:*T, len:u64) : void = { 122 | def V = [16]T 123 | f := vec_make{V, range{16}} # forward 124 | r := vec_make{V, 15 - range{16}} # reverse 125 | l := vec_broadcast{V, T<~len} 126 | m := V~~(f < l) 127 | s := ((l - f - vec_broadcast{V, 1}) & m) | andnot{f, m} 128 | av := *V~~arr 129 | av <- shuffle{av->0, s} 130 | } 131 | 132 | The basic idea is to read an entire vector regardless of length, reverse only the first `len` elements, and put it back. So this reads from and writes to memory beyond the actual vector argument. Obviously you need to know you have access to that memory, but that's easy to ensure if you control the allocations. But C and other compilers can't figure it out so it's one way writing your own SIMD is better. 133 | 134 | The specific idea is to blend a vector that starts at `len-1` and goes down with the identity vector `f`. We choose the descending vector for the first `len` elements, using the mask `f < l`. The result of an SSE comparison is all 0 bits or all 1, and it has an unsigned type but `V` is signed, so slap on `V~~`. Next section I'll show a blend utility that keeps this mess out of sight. 135 | 136 | And another cast `<~` in there. The three casts skin/cext defines are `~~` for reinterpret, `^~` for promoting from a type to a superset, and `<~`. At the moment this one just always does a C cast, but the idea is to use it for a narrowing integer conversion. Get familiar with these because Singeli requires a lot of casting. Or at least the standard definitions do, nothing preventing you from extending those. 137 | 138 | So now we can put together a function that works on any length. Language-wise there's nothing new here unless you consider an `if` statement to be a surprise. But there's a trick for handling when the two vector pointers meet in the middle. If there's one vector or less between them, we have the code for that. If there are two vectors or less, we could reverse one full and one partial vector, but that's ugly. Instead we're going to reverse two overlapping full vectors. This actually doesn't take any changes other than the loop bound. The main loop was going to read the two vectors and then write two reversed ones anyway, so the writes don't interfere with the reads. 
139 | 140 | include 'arch/iintrinsic/basic' 141 | include 'arch/iintrinsic/select' 142 | def shuffle{a:T==[16]i8, b:T} = vec_shuffle{a, b} 143 | fn reverse{T==i8 if hasarch{'SSSE3'}}(arr:*T, len:u64) : void = { 144 | def V = [16]T 145 | f := vec_make{V, range{16}} 146 | r := vec_make{V, 15 - range{16}} 147 | av := *V~~arr # beginning of part not yet reversed 148 | bv := *V~~(arr+len) # just after the end of that part 149 | while (av+1 < bv) { 150 | --bv 151 | c := shuffle{av->0, r} 152 | av <- shuffle{bv->0, r} 153 | bv <- c 154 | ++av 155 | } 156 | if (av < bv) { 157 | rem := *T~~bv - *T~~av 158 | l := vec_broadcast{V, T<~rem} 159 | m := V~~(f < l) 160 | s := ((l - f - vec_broadcast{V, 1}) & m) | andnot{f, m} 161 | av <- shuffle{av->0, s} 162 | } 163 | } 164 | 165 | There you have it, reversing bytes at SSE speed. AVX2 ought to be twice as fast but it's got this ridiculous design where it only shuffles within 16-byte lanes—it's not that much overhead but it's more of a headache than I'm willing to put up with right now. 166 | 167 | ## Generics 168 | 169 | I already said I don't like repeating myself. Instead of copy-pasting, I'll make this vector reverse work on multiple types, which will take a little more macro usage. First some cleanup. 170 | 171 | oper &~ andnot infix none 35 172 | 173 | This defines the and-not operator so that `a &~ b` is `a & ~b`. The C backend could probably work the second one out, but it's nice to know you're generating one `andnot` intrinsic. And even if an `&~` operator isn't defined, `&~` with no space won't split into `&` and `~` for consistency. Or maybe because developers are scared of working on the lexer, take your pick. The `infix none 35` thing is the parsing information, which I just copied from `&` in cop.singeli. 174 | 175 | def blend{m:M, t:T, f:T} = (t & T~~m) | (f &~ T~~m) 176 | 177 | And this is a macro for blend, the vector equivalent of `if (m) t else f`. Again we've got the smart macro, where the inputs all have to be typed and `t` and `f` have to have the same type. What it does is to get all their types and then check that the ones with the same name are consistent. Another thing, we use `m` twice, which should have a C programmer twitching. But it's safe: `blend` isn't operating on source tokens, but instead saying what to do with values. Which is also how it can check types, because by the time the macro gets processed its inputs have been handled by the compiler and their types are known. And the story is the same at runtime: all macro inputs are evaluated, and then the code in the macro runs. 178 | 179 | Now the hard part, which is to make this work on other types. For a lot of simpler vector algorithms you mostly just have to change the vector type, so you'd write something like `def V = [128/width{T}]T` to make a 128-bit vector and you're done. Here that doesn't work because SSSE3 only has this one shuffle instruction, which works on 1-byte units. So we're going to define `V` as `[16]i8`. Then it's bit-bashing time to reverse the `T`-width units in those vectors. Here, I'll dump it all out so you can see what I'm talking about. 
180 | 181 | include 'arch/iintrinsic/basic' 182 | include 'arch/iintrinsic/select' 183 | oper &~ andnot infix none 35 184 | def blend{m:M, t:T, f:T} = (t & T~~m) | (f &~ T~~m) 185 | def shuffle{a:T==[16]i8, b:T} = vec_shuffle{a, b} 186 | 187 | fn reverse{T if hasarch{'SSSE3'}}(arr:*T, len:u64) : void = { 188 | def b = width{T} / 8 # width of T in bytes 189 | def vb = 16 190 | def vi = range{vb} 191 | def V = [vb]i8 192 | def scal{x} = vec_broadcast{V, x} 193 | f := vec_make{V, vi} 194 | r := vec_make{V, vb-b - vi + 2*(vi%b)} 195 | av := *V~~arr 196 | bv := *V~~(arr+len) 197 | while (av+1 < bv) { 198 | --bv 199 | c := shuffle{av->0, r} 200 | av <- shuffle{bv->0, r} 201 | bv <- c 202 | ++av 203 | } 204 | if (av < bv) { 205 | rem := *T~~bv - *T~~av 206 | l := scal{i8<~(b*rem)} 207 | m := V~~(f < l) 208 | s := blend{m, r + l - scal{vb}, f} 209 | av <- shuffle{av->0, s} 210 | } 211 | } 212 | 213 | The main loop always does the same permutation, analogous to `vec_make{V, 15 - range{16}}` from before but with more arithmetic. I've defined `vi = range{vb}` to make this a little simpler—if you haven't noticed, just about anything can go in a `def`. Still, `r` is a real head-scratcher. But it's a compile-time head scratcher, and that means I can stick `show` calls all over the place before compiling to see what's going on. `show` just returns its input so it doesn't affect the compiler output, but it also prints that input. See below. These are for the `i32` case, and since I don't actually call it I just added a line `reverse{i32}` which is enough to make sure the function is compiled. 214 | 215 | r := vec_make{V, show{vb-b} - vi + 2*(vi%b)} 216 | # 12 217 | r := vec_make{V, show{vb-b - vi} + 2*(vi%b)} 218 | # tup{12,11,10,9,8,7,6,5,4,3,2,1,0,-1,-2,-3} 219 | r := vec_make{V, show{vb-b - vi + show{2*(vi%b)}}} 220 | # tup{0,2,4,6,0,2,4,6,0,2,4,6,0,2,4,6} 221 | # tup{12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3} 222 | 223 | First line shows `vb-b`, which is the first byte after reversing, or the start of the last element before. And the elements go down from there so I subtract `vi`. But this means bytes go down within an element when I want them going up, so I add twice the byte index `vi%b` within each element. 224 | 225 | And then the last vector is a minor variation on what we did before. Work it out yourself if you really care. Can't get reverse by subtracting the forward vector from a constant any more, so I added the reverse one to a different constant. This arithmetic all happens at runtime, so you won't get anything useful out of `show`, but `lprintf` does handle vectors. 226 | 227 | What about AVX2, or other architectures? It's all possible. NEON support is going to be pretty easy here since it has just about the same instructions: use `hasarch{'SSSE3'} or hasarch{'AARCH64'}` for the condition, qualify the `shuffle` we have here with `hasarch{'SSSE3'}`, and add a NEON one too (EDIT: now arch/neon\_intrin/basic has you covered, load conditionally with `if_inline`). Then as `reverse` is compiled it'll check the architecture when it calls `shuffle` and use the right one. For AVX2 you have a few options. First thing I'd try is to change `def vb = 16` to `def vb = if (hasarch{'AVX2'}) 32 else 16`, and then make other things check `vb` as necessary. Have fun dealing with that within-lane shuffle. 
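Since I brought it up, here's roughly how the NEON port would start. Strictly a sketch: I'm assuming `if_inline` can wrap `include` lines the way that EDIT implies, and the `...` stands for the unchanged function body, not real syntax.

    if_inline (hasarch{'SSSE3'}) {
      include 'arch/iintrinsic/basic'
      include 'arch/iintrinsic/select'
    }
    if_inline (hasarch{'AARCH64'}) {
      include 'arch/neon_intrin/basic'
      include 'arch/neon_intrin/select'
    }
    def shuffle{a:T==[16]i8, b:T} = vec_shuffle{a, b}

    fn reverse{T if hasarch{'SSSE3'} or hasarch{'AARCH64'}}(arr:*T, len:u64) : void = {
      ...  # body unchanged from the previous section
    }

Both select includes define `vec_shuffle`, so the one `shuffle` definition covers either target.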
228 | -------------------------------------------------------------------------------- /emit_c.bqn: -------------------------------------------------------------------------------- 1 | cpp‿cpu‿prefix‿⟨·,ErrOut,ErrExit,·⟩ ← ∾⟜"_"⌾(2⊸⊑) •args 2 | 3 | Asrt ← {𝕨𝕊1:𝕩; ·𝕊𝕩:𝕊𝕨; 4 | ErrOut "Invalid IR, likely implementation bug!" 5 | ErrOut⍟(0⊸≢) 𝕩 ⋄ ErrExit@ 6 | } 7 | 8 | types ← { 9 | ub←uv←um←0 ⋄ req←⟨⟩ ⋄ init ⇐ {ub↩uv↩um↩0⋄req↩⟨⟩} 10 | Require ⇐ {𝕊: req∾↩<𝕩} 11 | Headers ⇐ {𝕊: ⍷ req ∾˜ 1‿ub‿um‿(uv>0)/⟨"stdint.h","stdbool.h","math.h",cpu.header⟩} 12 | Type ⇐ { 𝕨 𝕊 bType‿w‿am‿ptrs‿disp: 13 | Err ← {ErrOut ∾⟨"Unhandled type `",disp,"`: ",𝕩⟩ ⋄ ErrExit@} 14 | u‿f ← bType = "uf" 15 | (" "∾𝕨) ⊢⊘(∾˜) (ptrs/"*") ∾˜ { 16 | 0<≠am ? uu‿t ← Err∘⊑⍟(0==) cpu.VecType w‿am‿u‿f ⋄ uv⌈↩uu ⋄ t ; 17 | f ? "float"‿"double"⊑˜32‿64⊸⊐⌾")⍟('"'≠⊑))¨ types.Headers@ 49 | ∾ (𝕨⊣prelude) ((⊣⋈(⊣×1+⊢)○(0<≠)⥊lf˙)∾⊢) (decl.Emit@)∾l 50 | } 51 | 52 | StartLine ← { 𝕊 str: 53 | Assert ← { Asrt ∾𝕨‿": `"‿str‿"`" }⍟(1≢⊢) 54 | tok ← ' ' ((⊢-˜+`׬)∘=⊔⊢) str 55 | i ← ¯1 56 | Next ⇐ {𝕤 57 | i +↩ 1 58 | "Unfinished line" Assert i < ≠tok 59 | i ⊑ tok 60 | } 61 | All ⇐ {𝕤 62 | r←(i+1)↓tok ⋄ i↩1-˜≠tok ⋄ r 63 | } 64 | Finish ⇐ {𝕤 65 | IsWS ← ∊⟜(" "∾@+9) 66 | "Excessive IR line" Assert tok ≠⊸≤◶⟨(∨´ IsWS ∨ ·∨`'#'⊸=)⊑˜, 1⟩ i+1 67 | } 68 | } 69 | 70 | Nat ← 10⊸×⊸+˜´∘⌽ -⟜'0' 71 | Nest ← +`·-˝=⌜ 72 | as ← { 73 | Sym ⇐ (⊢-128×(' '+128)⊸=) 1↓¯1↓⊢ 74 | Name ⇐ Sym⍟('''=⊑) 75 | Rename ⇐ prefix∾1⊸↓⍟('$'=⊑) 76 | I32 ⇐ ('-'=⊑)⊸(⊣-∘⊢⍟⊣Nat∘↓) 77 | Lit ⇐ (⊑"'$!"⊐⊏)◶Sym‿({decl.Call𝕩⋄𝕩}⍟('f'=⊑∘⊣)⟜Rename 1⊸↓)‿{ 78 | Bl ← 0<"{}"⊸Nest ⋄ br ← Bl𝕩 79 | v‿t ← (1-˜+`׬)∘(1⌾⊑br<':'⊸=)⊸⊔ 𝕩 80 | {¬∨´br? 81 | v ∾↩ { 82 | 'f': h←⊑'x'∊v ⋄ v types.SpecialFloat⍟(e←⊑'/'∊v)↩ 83 | {𝕩∾(h/"p0")∾e¬⊸/"f"}⍟("f32"≡t) "."/˜¬e∨h∨´"e."∊v ; 84 | (('u'=𝕩)/"u")∾"ll" 85 | }⊑t 86 | m ← "-0x8000000000000000"≡v # For floats: negate after cast 87 | ∾⟨"(",m↑v,"(",Type t,")",m↓v,")"⟩ 88 | ; 89 | "Invalid IR literal" Asrt "tup{"≡4↑v 90 | "{"∾"}"∾˜∾1↓⥊(<",")≍˘ Lit¨ ((⊢-˜+`׬)∘(Bl<','⊸=)⊔⊢)○(¯1↓4↓⊢) v 91 | } 92 | }‿⊢ 93 | Type ⇐ { 𝕨𝕊s: # Singeli type to native 94 | s ↓˜↩ ptrs ← +´∧`'*'=s 95 | s ↩ ⊢´ v ← ((1-˜¬∘∨×1+`⊢)˝·⊑⊸∧"[]"=⌜⊢)⊸⊔ s 96 | am ← Nat¨ ¯1↓v 97 | ParseFn ← { 98 | s ← (','=𝕩)∧n←(1⊸=∧·∧`0⊸<)"()"Nest𝕩 99 | a ← Type¨ ((1-˜+`׬)s∨»⊸≠⊸≥n)⊔𝕩 100 | "Invalid IR type" Asrt ")->"≡3↑r←n¬⊸/𝕩 101 | ⟨Type 3↓r, a⟩ 102 | } 103 | 𝕨 types.Type ⟨𝕩⟩ ∾˜ { 104 | "void": 'v'‿0‿⟨⟩‿ptrs; 105 | '('=⊑𝕩? ⟨"fn",ParseFn 𝕩,ptrs⟩; 106 | ⟨⊑𝕩, Nat 1↓𝕩, am, ptrs⟩ 107 | } s 108 | } 109 | } 110 | 111 | Name‿Rename‿Type‿Lit‿I32 ← {𝕏{𝔽∘𝔾⊘(𝔽⟜𝔾)}{𝕩.Next@}}¨ ⟨as.Name,as.Rename,as.Type,as.Lit,as.I32⟩ 112 | All ← {𝕩.All@} 113 | 114 | decl ← { 115 | FromName ← Nat · (∧`'_'⊸≠)⊸/ (1+≠prefix)⊸↓ # si_f 116 | c←o←@ ⋄ Init⇐{𝕤⋄c↩↕0⋄o↩⟨⟩} 117 | BeginFn ⇐ { n←FromName𝕨 ⋄ n<≠c ? n⊑c ? o∾↩<𝕩 ; @} 118 | Call ⇐ { i←FromName𝕩 ⋄ c↑˜↩(≠c)⌈1+i ⋄ c 1⌾(i⊸⊑)↩ } 119 | Emit ⇐ {𝕤⋄ ∾⟜⟨⥊lf⟩⍟(0<≠) (¯2⊸↓∾(";"∾lf)˙)¨ o } 120 | } 121 | 122 | Join ← {∾1↓⥊(<𝕨)≍˘𝕩} 123 | List ← ", "⊸Join 124 | 125 | BeginFn ← {𝕤 126 | FmtExt ← { 127 | e ← ((-´"aA")×'A'⊸≤)⊸+ 1↓𝕩 128 | ∾"__attribute__ ((__target__ ("""‿e‿""")))"‿lf 129 | } 130 | n ← as.Rename nn ← Name 𝕩 131 | ret ← n Type 𝕩 132 | argc ← I32 𝕩 133 | param ← Name⊸Type∘𝕩¨ ↕argc 134 | exts ← All 𝕩 ⋄ Asrt 1≥≠exts ⋄ Asrt ∧´('+'=⊑)¨exts 135 | fexts ← FmtExt¨exts 136 | { "main"≢nn 137 | ? 
n⊸decl.BeginFn⊸⊢ ∾fexts∾⟨"static ",ret,"(",List param,") {"⟩ 138 | ; pc ← (⊢∾" = ("∾(∧`' '⊸≠)⊸/∾")argv;"˙)¨1↓param 139 | ∾fexts∾⟨"int main(",List "char** argv"¨⌾(1⊸↓)param,") {"⟩∾pc 140 | } 141 | } 142 | Export ← { 143 | exp ← "const "∾Name 𝕩 144 | ret ← exp Type 𝕩 145 | val ← Lit 𝕩 146 | ∾⟨ret," = ",val,";"⟩ 147 | } 148 | Define ← {st 𝕊 ty‿id‿val: 149 | arr ← {'*'=⊑ty? ∨´"{?"=c←⊑1↑⊑val? # Define array, then pointer cover 150 | v←val ⋄ val↩i←id∾"_" ⋄ id∾⍟st˜↩"const " 151 | {'?'=c? UndefArr ⟨ty,i,(∧`':'⊸≠)⊸/1↓⊑v⟩ ; 152 | cpp>st? DefCppArr ⟨ty,i,∾v⟩ ; ⟨st Define ⟨1↓ty,i∾"[]",v⟩, "; "⟩} 153 | ; ⟨⟩} 154 | eq ← {cpp>st? ⟨"; ",id," = "⟩ ; ⟨" = "⟩} 155 | ∾∾⟨ 156 | arr, (1=st)/⟨"static "⟩ 157 | ⟨id as.Type ty⟩, (('?'≠·⊑1↑⊑) / eq⊸∾) val 158 | ⟩ 159 | } 160 | UndefArr ← {𝕊 ty‿id‿n: (0 Define ⟨1↓ty,∾id‿"["‿n‿"]",⟨"?"⟩⟩) ∾ "; "} 161 | # For C++, initialize array by creating a second in a temp scope and copying 162 | DefCppArr ← {𝕊 ty‿i‿v: 163 | it←i∾"t" 164 | n←•Repr("{}"≢v)+´(','⊸=∧1="{}"⊸Nest)v 165 | ∾⟨ 166 | UndefArr ty‿i‿n, "{ " 167 | 2 Define ⟨1↓ty,it∾"[]",v⟩, "; " 168 | "for (unsigned i=0; i<",n,"; i++) ",i,"[i] = ",it,"[i]; } " 169 | ⟩ 170 | } 171 | Constant ← { 172 | id ← Rename 𝕩 173 | type ← Name 𝕩 # Define turns to type 174 | val ← Lit 𝕩 175 | (1 Define type‿id‿val)∾";" 176 | } 177 | Require ← { types.Require Name 𝕩 ⋄ "" } 178 | ⟨New, Mut⟩ ← { 179 | Cast ← {∾"("‿(as.Type 𝕨)‿")"‿𝕩} 180 | Special ← { 181 | "^promote"𝕊⟨t,v⟩ : t Cast v ; 182 | "^bitcast"𝕊⟨s,t,v⟩: { s∧○(⊑⊏∊"iu*"˙)t ? s Cast v ; 183 | s ≡○as.Type t ? v ; "^bitcast"‿t‿v } ; 184 | "^load" 𝕊⟨p,i ⟩: ∾p‿"["‿i‿"]" ; 185 | "^store" 𝕊⟨p,i,v⟩: p‿"["‿i‿"] = "‿v 186 | } 187 | Call ← 1⊸↑⊸≡⟜"^"◶⟨{𝕨‿"("‿𝕩‿")"}⟜List, Special⟩ 188 | code‿op ← <˘⍉>⟨ 189 | "val" ‿(⋈ Lit) 190 | "call"‿{ fn ← Lit 𝕩 ⋄ fn Call (Lit𝕩˙)¨ ↕I32 𝕩 } 191 | "emit"‿(Name ("op "≡3↑⊣)◶⟨ 192 | Call⟜(as.Lit¨All) 193 | { o←3↓𝕨 ⋄ ⟨Lit 𝕩," ",o," ",Lit 𝕩⟩ } 194 | ⟩ ⊢) 195 | "array"‿{"{"∾"}"∾˜∾1↓⥊(<",")≍˘ as.Lit¨ All 𝕩} 196 | ⟩ 197 | Memcpy ← {id𝕊"^bitcast"‿t‿v: 198 | types.Require "string.h" 199 | m ← id∾"_" 200 | ∾⟨"{",0 Define ⟨t,m,v⟩,"; memcpy(&",id,", &",m,", sizeof(",as.Type t,"));}"⟩ 201 | ;𝕊:@} 202 | New ⇐ { 203 | id ← Name 𝕩 204 | kind ← Name 𝕩 205 | ty ← Name 𝕩 # Define turns to type 206 | c ← code⊸⊐⌾< kind 207 | (∾"Unknown new: `"‿kind‿"`") Asrt c<≠code 208 | val ← (c⊑op) {𝕎𝕩} 𝕩 209 | { 210 | "void"≡ty?∾val ; 211 | @≢m←id Memcpy val?∾⟨id as.Type ty,"; ",m⟩ ; 212 | 0 Define ty‿id‿val 213 | } 214 | } 215 | Mut ⇐ { 216 | id ← Name 𝕩 217 | kind ← Name 𝕩 218 | c ← code⊸⊐⌾< kind 219 | val ← { 220 | c=≠code ? ⟨as.Lit kind⟩ ; 221 | Name 𝕩 ⋄ (c⊑op) {𝕎𝕩} 𝕩 # Discard type 222 | } 𝕩 223 | {@≢m←id Memcpy val? 
m ; ∾id‿" = "∾val} 224 | } 225 | } 226 | 227 | code‿op ← (⊑¨ ⋈ 1⊸↓¨) ⟨ 228 | "export" ‿ 0‿0‿⟨Export⟩ 229 | "constant"‿ 0‿0‿⟨Constant⟩ 230 | "require" ‿ 0‿0‿⟨Require⟩ 231 | "beginFn" ‿ 1‿0‿⟨BeginFn⟩ 232 | "lbl" ‿ 0‿1‿⟨Name,":"⟩ 233 | "ret" ‿ 0‿1‿⟨" ","return","void"⊸≢◶⟨""," "⊸∾⟩ Lit⟩ 234 | "gotoF" ‿ 0‿1‿⟨" ","if (!(",Lit,")) ","goto ",Name⟩ 235 | "gotoT" ‿ 0‿1‿⟨" ","if (",Lit,") ","goto ",Name⟩ 236 | "goto" ‿ 0‿1‿⟨" ","goto ",Name⟩ 237 | "new" ‿ 0‿1‿⟨" ",New⟩ 238 | "mut" ‿ 0‿1‿⟨" ",Mut⟩ 239 | "endFn" ‿¯1‿1‿⟨"}"⟩ 240 | ⟩ 241 | 242 | Generate 243 | -------------------------------------------------------------------------------- /float2.bqn: -------------------------------------------------------------------------------- 1 | # High-precision numbers as pairs representing unevaluated sums 2 | # Format is ⟨high,low⟩ 3 | 4 | To ⇐ ⋈⟜0 5 | From ⇐ +´ 6 | 7 | Add12 ← {a𝕊b: 8 | s ← a + b 9 | {¬∞>|s ? s‿0 ; 10 | av← s - bv← s - a 11 | ⟨s, +´a‿b-av‿bv⟩ } 12 | } 13 | Add ⇐ {a𝕊b: 14 | r ← a +○⊑ b 15 | {¬∞>|r ? r‿0 ; 16 | s ← (-r) +´ ¯1⌽⌽⍟(a<○(|⊑)b) b⌽⊸∾a 17 | r Add12 s } 18 | } 19 | 20 | Neg ⇐ - 21 | Abs ⇐ -⍟(0>⊑) 22 | Floor‿Ceil ⇐ {𝕏∘⊑⊸(⊣⋈·𝕏-⊸(+´)⟜⌽)}¨ ⌊‿⌈ 23 | Sub ⇐ Add⟜Neg 24 | 25 | Cmp ⇐ (=˜∘⊢≤≤){𝔽˜-𝔽}{=○⊑◶⟨𝔽○⊑, 𝔽○(⊢´)⟩} 26 | 27 | Split ← 53‿1024{p‿e _𝕣: # Double-precision float 28 | sp← 1+2⋆se←⌈p÷2 29 | m ← 2⋆e-1+se ⋄ f ← 2⋆-p # Adjustments to avoid hitting ∞ 30 | {¬m>|𝕩? 𝕊⌾(f⊸×) 𝕩; 𝕊a: 31 | c ← sp × a 32 | al← a - ah← c - c - a 33 | ⟨al, ah⟩ # Backwards for convenient reduction 34 | } 35 | } 36 | Mul12 ← {a𝕊b: 37 | h ← a × b 38 | {∞>|h? ⟨h, (-h) +´ ⥊ b ×⌜○Split a⟩ ; h‿0} 39 | } 40 | Mul ⇐ {a𝕊b: 41 | ph‿pl ← a Mul12○⊑ b 42 | {∞>|⊑ph? ph Add12 pl + +´ a × ⌽b ; ph‿0} 43 | } 44 | 45 | Div ⇐ {b𝕊a: 46 | yn ← (⊑b) × xn ← ÷⊑a 47 | {¬∞>|yn? yn‿0 ; 48 | diff ← ⊑ b Sub a Mul yn‿0 49 | yn‿0 Add xn Mul12 diff } 50 | } 51 | 52 | Mod ⇐ { 53 | b𝕊a‿0: a>0 ? h←a÷2 ⋄ Add12´ a⊸+⌾⊑⍟(<⟜-´) (-⟜(a×h<⊢)a|⊢)⍟(h<|)¨b ; 54 | # Not correctly rounded but probably okay 55 | b𝕊a: a Sub b Mul (Floor a Div b) 56 | } 57 | 58 | # Decimal parsing 59 | # For one double, max digits is 15 and max power of 10 is 1e22 60 | Exp10 ← { 22≥𝕩? To 10⋆𝕩; 308<𝕩? To ∞; (⊣´Mul⊢´)𝕊¨⍷⌊2÷˜𝕩+↕2 } # Could save results 61 | N1 ← •ParseFloat 62 | Nat ← ≠⊸{ 63 | 15≥𝕨 ? To N1 𝕩 ; 64 | 20≥𝕨 ? ¯15 ((1e15×N1∘↓) Add12 N1∘↑) 𝕩 ; # Exact 65 | 35≥𝕨 ? ¯20 ((1e20 Mul12 •ParseFloat∘↓) Add Nat∘↑) 𝕩 ; # Sum of exacts so it's correctly rounded 66 | (Exp10 𝕨-35) Mul Nat 35↑𝕩 # Imprecise 67 | } 68 | ParseDec ⇐ { # 𝕨 is base-10 exponent; 𝕩 is digit string 69 | 0≤𝕨 ? 𝕨 Exp10⊸Mul⍟(0<⊣) Nat 𝕩 ; 70 | (𝕨↓𝕩) Nat⊸Add⍟(0<≠∘⊣) (Exp10-𝕨) Div˜ Nat (𝕨⌈-≠𝕩)↑𝕩 71 | } 72 | 73 | _repr ⇐ { len‿b _𝕣: 74 | ! ⌊⊸= 2⋆⁼b # Need division by b to be exact 75 | {c←0 ⋄ {𝕩+↩c⋄c↩⌊𝕩÷b⋄b|𝕩}¨𝕩} ·+´ b|⌊∘÷⟜b⍟(↕len)¨ 76 | } 77 | Bits ⇐ { 78 | 𝕊⁼𝕩: (2⋆48)⊸×⊸Add12˜○(2⊸×⊸+˜´)˝ 2‿∘⥊𝕩 ; 79 | ∧´𝕩=⟜1⊸∨⌾⊑𝕩=0 ? 96↑⊏𝕩 ; 80 | "Bitwise operation: arguments must be integers" ! ⌊⊸≡◶⟨0,>⟜-´⟩ 𝕩 81 | "Bitwise operation: arguments can't exceed 2^96" ! 0<(2⋆96)-˜´⌽𝕩 82 | 96‿2 _repr 𝕩 83 | } 84 | -------------------------------------------------------------------------------- /include/README.md: -------------------------------------------------------------------------------- 1 | # Singeli standard includes 2 | 3 | Standard includes are those built into the compiler. Each can be included with a line like `include 'arch/c'`, which uses a path relative to this directory (include/ in the Singeli sources). 
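As an illustrative sketch (my own, not a required preamble), a file that wants C-like operators, the plain C backend, and loop macros from the lists below might begin with:

    include 'skin/c'    # C-like operators
    include 'arch/c'    # platform-independent C operations
    include 'util/for'  # @for loops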
4 | 5 | - `skin/` Operator definitions 6 | - [`skin/c`](skin/c.singeli) C-like operators (with some tweaks) 7 | - [`skin/cop`](skin/cop.singeli) Non-mutating operators 8 | - [`skin/cmut`](skin/cmut.singeli) Mutating operators such as `*=` and `++` 9 | - [`skin/cext`](skin/cext.singeli) Extensions to C-like operators 10 | - `arch/` Operation generation 11 | - [`arch/c`](arch/c.singeli) Platform-independent C 12 | - `arch/iintrinsic/` for x86 extensions or `arch/neon_intrin/` for NEON vector intrinsics (ARM) 13 | - [`arch/*/basic`](#simd-basics) Basic vector support and arithmetic 14 | - [`arch/*/select`](#simd-selection) Rearranging elements without changing type 15 | - `clib/` Bindings for C libraries 16 | - [`clib/malloc`](clib/malloc.singeli) malloc (as `alloc{}`) and free 17 | - `util/` Utilities 18 | - [`util/for`](#utilfor) Typical @for loops 19 | - [`util/tup`](#utiltup) Programming with tuples 20 | - [`util/kind`](util/kind.singeli) Short generators to test a value's kind 21 | - [`util/perv`](util/perv.singeli) Generator pervasion 22 | - [`util/functionize`](util/functionize.singeli) Make function from generator 23 | - `debug/` Debugging utilities 24 | - [`debug/printf`](debug/printf.singeli) Print at runtime 25 | 26 | ## util/for 27 | 28 | File [util/for.singeli](util/for.singeli). 29 | 30 | Each loop handles the indices `i` satisfying `from <= i < to`. 31 | 32 | | Loop | Description 33 | |--------------------|------------ 34 | | `@for` | Standard forward loop 35 | | `@for_backwards` | Same indices in the reverse order 36 | | `@for_const` | Compile-time loop, requiring constant bounds 37 | | `@for_unroll{unr}` | Loop unrolled by a factor of `unr` 38 | 39 | The unrolled loop creates two sub-loops, one that evaluates `unr` copies of the given body and the other that evaluates only one. It runs the first as many times as possible starting at `from` (no adjustments are made for alignment), then the second until `to` is reached. 40 | 41 | ## util/tup 42 | 43 | File [util/tup.singeli](util/tup.singeli). 44 | 45 | | Syntax | Description 46 | |--------------------------|------------ 47 | | `empty{tup}` | Tuple is empty 48 | | `@collect` | Compile-time evaluation returning a list 49 | | `iota{num}` | Alias for `range` 50 | | `inds{tup}` | Tuple of all indices into tuple 51 | | `copy{num, any}` | Tuple of `num` copies of `any` 52 | | `join{tups}` | Merge a tuple of tuples 53 | | `shiftright{l, r}` | Shift tuple `l` into `r`, retaining length of `r` 54 | | `shiftleft{l, r}` | Shift tuple `r` into `l`, retaining length of `l` 55 | | `reverse{tup}` | Elements in reverse order 56 | | `cycle{num, tup}` | Repeat tuple cyclically to the given length 57 | | `split{num, tup}` | Split tuple into groups of the given length or less 58 | | `flip{tups}` | Transpose tuple of same-length tuples 59 | | `table{f, ...tups}` | Function table mapping over all combinations 60 | | `flat_table{f, ...tups}` | Function table flattened into a single list 61 | | `fold{gen, any?, tup+}` | Left fold, with or without initial element 62 | | `scan{gen, any?, tup+}` | Inclusive left scan 63 | | `replicate{r, tup}` | Tuple with each input element copied the given number of times 64 | | `indices{tup}` | Indices of elements of `tup`, repeated that many times 65 | 66 | Additional notes: 67 | 68 | - `split{n, tup}`: `n` may be a number, indicating that all groups have that length except that the last may be short. 
It may also be a list of numbers, which is expected to sum to the length of the tuple and indicates the sequence of group lengths. 69 | - `fold{gen, any?, tup+}` and `scan{gen, any?, tup+}`: if the initial `any` is given, `tup` indicates any number of tuple arguments, and `gen` will always be called with one parameter from each one. 70 | - `replicate{r, tup}`: `r` may be a tuple, where each element indicates the number of times to include the corresponding element of `tup` (for example, if it's boolean the elements in the same position as a 1 are kept and those with a 0 are filtered out). It may also be a plain number, so that every element is copied the same number of times, or a generator `f`, so that element `e` is copied `f{e}` times. 71 | 72 | ## SIMD basics 73 | 74 | Includes `arch/iintrinsic/basic` and `arch/neon_intrin/basic` are "basic" architecture includes that define arithmetic and a few essential vector operations. Because of x86's haphazard instruction support, the default `arch/iintrinsic/basic` includes multi-instruction implementations of many operations such as comparisons, min, and max. Use `arch/iintrinsic/basic_strict` to define only cases that are supported by a single instruction. 75 | 76 | All [builtin arithmetic](../README.md#arithmetic) operations are supported when available (`__mod` is the only one that's never provided), in addition to the following (architecture indicated if only one supports it): 77 | 78 | | Syntax | Arch | Result 79 | |----------------------------|------|-------- 80 | | `__adds{x, y}` | | Saturating add 81 | | `__subs{x, y}` | | Saturating subtract 82 | | `__sqrt{x}` | | Square root 83 | | `__round{x}` | x86 | Round to nearest 84 | | `andnot{x, y}` | | `x & ~y` 85 | | `ornot{x, y}` | ARM | `x \| ~y` 86 | | `andnz{x, y}` | ARM | `(x & y) != 0` 87 | | `copy_sign{x, y}` | x86 | Absolute value of `x` with sign of `y` 88 | | `average_int{x, y}` | x86 | `(x + y + 1) >> 1` 89 | | `shl_uniform{v, s:[2]u64}` | x86 | Shift each element left by element 0 of `s` 90 | | `shr_uniform{v, s:[2]u64}` | x86 | Shift each element right by element 0 of `s` 91 | 92 | The following non-arithmetic definitions are also defined when possible. 93 | 94 | | Syntax | Result 95 | |------------------------|-------- 96 | | `vec_make{V, ...x}` | A vector of the values `x` 97 | | `vec_make{V, x}` | Same, with a tuple parameter 98 | | `vec_broadcast{V, x}` | A vector of copies of the value `x` 99 | | `extract{v:V, ind}` | The element at position `ind` of vector `v` 100 | | `insert{v:V, x, ind}` | Insert `x` to position `ind` of `v`, returning a new vector 101 | | `load{ptr,ind}` | Same as builtin 102 | | `store{ptr,ind,val}` | Same as builtin 103 | 104 | x86 also includes `load_aligned` and `store_aligned` for accesses that assume the pointer has vector alignment. 105 | 106 | ### x86 SIMD arithmetic support 107 | 108 | The following table shows when arithmetic support was added to x86 for various vector types. For integers, only signed types (`i16`) are shown but unsigned equivalents (`u16`) are supported at the same time. AVX-512F does have the ability to create and perform conversions on 8-bit and 16-bit types, but doesn't support any arithmetic specific to them. 
109 | 110 | | Extension | `u`/`i8` | `u`/`i16` | `u`/`i32` | `u`/`i64` | `f32` | `f64` | 111 | |-----------|---------:|----------:|----------:|----------:|----------:|---------:| 112 | | SSE | | | | | `[4]f32` | | 113 | | SSE2 | `[16]i8` | `[8]i16` | `[4]i32` | `[2]i64` | | `[2]f64` | 114 | | AVX | | | | | `[8]f32` | `[4]f64` | 115 | | AVX2 | `[32]i8` | `[16]i16` | `[8]i32` | `[4]i64` | | | 116 | | AVX-512F | | | `[16]i32` | `[8]i64` | `[16]f32` | `[8]f64` | 117 | | AVX-512BW | `[64]i8` | `[32]i16` | | | | | 118 | 119 | The next table shows integer instruction availability in x86. Each entry shows the first extension to include the instructions on a given element type. Multi-instruction fills are not shown. Instructions introduced by SSE extensions are all available in AVX2, except `extract`, and those in AVX2 are all in AVX-512F or AVX-512BW (depending on type support as shown above), except `copy_sign`. AVX2 instructions are also supported on 128-bit vectors, and AVX-512 instructions are supported on 128-bit and 256-bit vectors if AVX-512VL is available. But `arch/iintrinsic/basic` doesn't correctly support these extensions right now. 120 | 121 | | Functions | `i8` | `i16` | `i32` | `i64` | `u8` | `u16` | `u32` | `u64` 122 | |-------------------------------|--------|--------|--------|---------|--------|--------|--------|------- 123 | | `&` `\|` `^` `andnot` `+` `-` | SSE2 | SSE2 | SSE2 | SSE2 | SSE2 | SSE2 | SSE2 | SSE2 124 | | `__min` `__max` | SSE4.1 | SSE2 | SSE4.1 | A512F | SSE2 | SSE4.1 | SSE4.1 | A512F 125 | | `==` | SSE2 | SSE2 | SSE2 | SSE4.1 | SSE2 | SSE2 | SSE2 | SSE4.1 126 | | `>` `<` | SSE2 | SSE2 | SSE2 | SSE4.2 | | | | 127 | | `__adds` `__subs` | SSE2 | SSE2 | | | SSE2 | SSE2 | | 128 | | `<<` `shl_uniform` | | SSE2 | SSE2 | SSE2 | | SSE2 | SSE2 | SSE2 129 | | `>>` `shr_uniform` | | SSE2 | SSE2 | A512F | | SSE2 | SSE2 | SSE2 130 | | `<<` (element-wise) | | A512BW | AVX2 | AVX2 | | A512BW | AVX2 | AVX2 131 | | `>>` (element-wise) | | A512BW | AVX2 | A512F | | A512BW | AVX2 | AVX2 132 | | `*` | | SSE2 | SSE4.1 | A512DQ | | SSE2 | SSE4.1 | A512DQ 133 | | `__abs` | SSSE3 | SSSE3 | SSSE3 | A512F | | | | 134 | | `copy_sign` (no 512-bit) | SSSE3 | SSSE3 | SSSE3 | | | | | 135 | | `average_int` | | | | | SSE2 | SSE2 | | 136 | | `extract` (no ≥256-bit) | SSE4.1 | SSE2 | SSE4.1 | SSE4.1 | SSE4.1 | SSE2 | SSE4.1 | SSE4.1 137 | 138 | Floating-point instruction availability is much simpler: all instructions are available on supported types, with the exception of `__floor`, `__ceil`, and `__round`, which weren't added until SSE4.1. 139 | 140 | | Functions | `f32` | `f64` 141 | |--------------------------------------------------------------------------------------------|--------|------- 142 | | `&` `\|` `^` `andnot` `+` `-` `*` `__min` `__max` `==` `>` `<` `!=` `>=` `<=` `/` `__sqrt` | SSE | SSE2 143 | | `__floor` `__ceil` `__round` | SSE4.1 | SSE4.1 144 | 145 | ## SIMD selection 146 | 147 | Includes `arch/iintrinsic/select` and `arch/neon_intrin/select` define operations that rearrange elements from one or more vectors. An operation is supported only when it can be implemented with a single instruction and possibly a constant vector register. In each case there are some values to be manipulated (`val`, `v0`, `v1`, `a`, `b` below), which must all share an element type and also determine the type of the result—although `spec` may indicate a different temporary element type to be used internal to the computation. 
Vectors here are treated strictly as lists of values, and in particular **left and right shifts go in the opposite direction to arithmetic shl and shr**! Operations `vec_shuffle`, `reverse_units`, and `blend_units` work on sub-units of the vectors, which must have a length that divides the number of elements, that is, a power of two. Operations ending in `128` work on 128-bit lanes, as this is all that AVX instructions support, but the same names without the `_128` or `128` suffix are defined to be the same on 128-bit vectors and error on larger sizes. AVX-512 is not yet supported. 148 | 149 | | Syntax | Arch | Description 150 | |--------------------------------------|------|------------ 151 | | `vec_select {spec?, val, ...?ind}` | | Vector version of `select{val, ind}` 152 | | `vec_shuffle{spec?, val, ...?ind}` | | Select within sub-units, possibly repeating the indices 153 | | `broadcast_sel{val, i}` | | Vector with all elements equal to element `i` of `val` 154 | | `reverse_units{s, val}` | | Reverse each length-`s` group of elements in `val` 155 | | `vec_shift_left_128 {val, n}` | | Move element `i` of `val` to index `i-n`, shifting in zeros 156 | | `vec_shift_right_128{val, n}` | | Move element `i` of `val` to index `i+n`, shifting in zeros 157 | | `vec_merge_shift_left_128 {a, b, n}` | | Left shift of combined lane placing `a` before `b` 158 | | `vec_merge_shift_right_128{a, b, n}` | | Right shift from end of combined lane placing `a` before `b` 159 | | `zip128{a, b, half}` | | Alternate elements from first (`half=0`) or last (`half=1`) halves of `a` and `b` 160 | | `blend{v0, v1, ...?bools}` | | Element-wise choice where `0` in `bools` takes from `v0` and `1` from `v1` 161 | | `blend_units{v0, v1, ...?bools}` | | Same, but tuple `bools` is repeated to the full length if short 162 | | `blend_top{v0, v1, mask}` | x86 | Choose using the top bit of each element of vector `mask` 163 | | `blend_bit{v0, v1, mask}` | ARM | Choose bitwise, `(~mask & v0) \| (mask & v1)` 164 | | `blend_hom{v0, v1, mask}` | | Choose `v0` when an element of `mask` is all 0, and `v1` when all 1 165 | 166 | Two types of selection by indices are defined: `vec_select`, which is more like NEON `tbl` instructions, and `vec_shuffle`, which selects on sub-units, matching x86 `shuffle` and `permute` better. These have many settings so they get [their own section](#vector-select-and-shuffle) below. `reverse_units` is a special case, and is implemented as a call to `vec_shuffle` on x86 but is supported by dedicated instructions on ARM. 167 | 168 | `vec_shift_left_128`, `vec_shift_right_128`, `vec_merge_shift_left_128`, and `vec_merge_shift_right_128` shift elements within lanes and are equivalent to `vec_shift_left`, `vec_shift_right`, `vec_merge_shift_left`, and `vec_merge_shift_right` when a vector is a single lane long. 169 | 170 | `zip` and `zip128` interleave elements of their arguments in the sense of `zip(abcd, 0123) = a0b1c2d3`; on tuples this might be written `merge{...each{tup,a,b}}`. Because the full result wouldn't fit in a single vector, the `half` parameter specifies half 0 or 1 of each lane of the result, or equivalently zipping only half 0 or 1 of each argument lane. More formally, element `2*i` of a result lane is element `i` of the relevant half-lane of `a`, and element `2*i + 1` is element `i` from a half-lane of `b`. The complete result as a list of vectors is `each{zip128{a,b,.}, range{2}}`. 
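As a worked example (the element names are made up for illustration), take two vectors `a` and `b` of eight elements each, forming a single 128-bit lane:

    # a = a0 a1 a2 a3 a4 a5 a6 a7    b = b0 b1 b2 b3 b4 b5 b6 b7
    # zip128{a, b, 0} = a0 b0 a1 b1 a2 b2 a3 b3   (zips half 0)
    # zip128{a, b, 1} = a4 b4 a5 b5 a6 b6 a7 b7   (zips half 1)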
171 | 172 | Arguments to blend functions are two vectors `v0` and `v1` of the same type, and a selector which is conceptually a list of booleans. For `blend` and `blend_units`, the selector `bools` is in fact a tuple of compile-time booleans (each is constant 0 or 1; these may also be passed as separate arguments). For `blend_hom`, `blend_top`, and `blend_bit`, the selector `mask` is another vector with the same number of elements and element width as the others. In a blend, the result value at index `i` is element `i` of either `v0` or `v1`: if element `i` of the selector is 0, `v0`, and if it's 1, `v1`. For `blend_top`, the selector is the top (sign) bit of each element of `mask`, and for `blend_bit`, all inputs are considered to be lists of bits so that the selector is simply the bits of `mask`. For `blend_hom` (short for "homogeneous"), result element `i` is defined only if element `i` of `mask` has all bits set to 0 or all set to 1. It's implemented as `blend_bit` on ARM and `blend_top`, possibly with a smaller element type than the arguments, on x86. 173 | 174 | ### Vector select and shuffle 175 | 176 | Both selection functions `vec_select` and `vec_shuffle` take three inputs: 177 | - `spec` is optional. It can describe the element type and width, and for `vec_shuffle`, sub-unit size. 178 | - `val` are the values for selection. It may be a tuple of vectors, which has a different meaning for select versus shuffle. 179 | - `ind` is the indices of the wanted values, either a vector or a tuple of constant integers (in which case they can also be passed as separate arguments). A constant index must be less than the selection length, and any negative indicates a zero result. For variables, out-of-bounds indices are not defined and will be interpreted according to the specific instruction called. `ind` is never cast, so if it's a vector its elements must be integers of the appropriate width. 180 | 181 | For `vec_select`, `spec` may be the element width as a number, or an element type. The width `128`, supported by AVX's `permute2x128` and `permute2f128` intrinsics, can only be specified by number. If multiple arguments are passed, they are treated as a single list of elements, so that indices into the first vector are normal, those into the second are increased by the width of a vector, and so on. 182 | 183 | `vec_shuffle` performs multiple independent selections: it corresponds to a single selection by adding an appropriate base index to each of these, although it's often the case on x86 that only some sub-unit size smaller than the entire vector is supported. If constant indices are used, they are repeated as needed to match the number of values. To run, `vec_shuffle` needs to determine both the element type and the number of elements in a sub-unit. `spec` may be a vector type like `[4]f32` to specify both, or a number like `4` to specify sub-unit length only, or an element type like `f32`. If the element type is unspecified, then the type's width comes from the indices if they're typed and the values if they're constant, and its quality (float or integer) comes from the values to be selected unless a floating-point type of the required width doesn't exist. The sub-unit size may be any divisor of the number of provided indices; if unspecified it's taken to be that number. An additional option is that `ind` may be a tuple of tuples, each having the length of a sub-unit (this specifies the sub-unit length if it would be taken from `ind`). 
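For instance, assuming a variable `v:[16]i8` on SSSE3 (a sketch applying the rules above, not output from the library):

    # Sub-unit length 4 with constant indices: the four indices repeat
    # to cover all 16 elements, reversing each aligned 4-byte group
    r := vec_shuffle{4, v, 3, 2, 1, 0}
    # Same result as the dedicated generator reverse_units{4, v}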
184 | 185 | The definition of `vec_shuffle` where `val` is a tuple is chosen to accommodate x86's rather esoteric `shuffle_ps` and `shuffle_pd` intrinsics. In this case each selection unit is divided equally into one part for each vector of values, and the indices for a part pertain to the current selection unit of the corresponding vector. 186 | 187 | Three extra definitions are included in iintrinsic/select to expose x86 shuffle instructions that don't fit `vec_select` or `vec_shuffle`. `vec_shuffle16_lo` and `vec_shuffle16_hi` shuffle the low and high halves of each lane of a vector with 16-bit elements, leaving the other half unchanged. `vec_shuffle_64_scaled` implements lane-wise `vec_shuffle` on `f64` elements and an index vector, except that the expected indices are 0 and 2 instead of 0 and 1: intrinsic `permutevar_pd` uses the second bit from the bottom of each index instead of the bottom bit as in `permutevar_ps`. 188 | -------------------------------------------------------------------------------- /include/arch/c.singeli: -------------------------------------------------------------------------------- 1 | local { 2 | local def extend promote{arith} = { 3 | def arith{a:T,b if is{'number',kind{b}} and is{'primitive',typekind{T}}} = arith{a , cast{T,b}} 4 | def arith{a,b:T if is{'number',kind{a}} and is{'primitive',typekind{T}}} = arith{cast{T,a} , b} 5 | } 6 | def arith{op} = { def extend _{arith} = { 7 | def arith{a:T,b:T if is{'primitive',typekind{T}}} = emit{T, op, a, b} 8 | extend promote{arith} 9 | }} 10 | def arith1{op} = { def extend _{arith} = { 11 | def arith{a:T if is{'primitive',typekind{T}}} = emit{T, op, a} 12 | arith 13 | }} 14 | def sh{op} = { def extend _{arith} = { 15 | def arith{a:T,b:I if isint{T} and isint{I}} = emit{T, op, a, b} 16 | extend promote{arith} 17 | }} 18 | def pk{T} = { 19 | def k=typekind{T} 20 | is{'primitive',k} or is{'pointer',k} 21 | } 22 | def compare{op} = { def extend _{arith} = { 23 | def arith{a:T,b:T if pk{T}} = emit{u1, op, a, b} 24 | extend promote{arith} 25 | }} 26 | def logic = arith 27 | def logic1 = arith1 28 | } 29 | 30 | extend (arith1{'-'}){__neg} 31 | 32 | extend (arith{'op +'}){__add} 33 | extend (arith{'op -'}){__sub} 34 | extend (arith{'op *'}){__mul} 35 | extend (arith{'op /'}){__div} 36 | extend (arith{'op %'}){__mod} 37 | 38 | local { 39 | def isptr{T} = is{'pointer',typekind{T}} 40 | def ptrwidth = width{__pnt{void}} 41 | def isize = primtype{'i',ptrwidth} 42 | def ptrdiff{a} = cast{isize, a} 43 | def ptrdiff{a:T} = promote{ptrwidth, a} 44 | def anynum{a} = is{'number',kind{a}}; def anynum{a:T} = is{'primitive',typekind{T}} 45 | } 46 | def __pnt{a:T if isptr{T}} = load{a,0} 47 | 48 | def __add{a ,b:P if isptr{P} and anynum{a}} = emit{P, 'op +', ptrdiff{a}, b} 49 | def __add{a:P,b if isptr{P} and anynum{b}} = emit{P, 'op +', a, ptrdiff{b}} 50 | def __sub{a:P,b if isptr{P} and anynum{b}} = emit{P, 'op -', a, ptrdiff{b}} 51 | def __sub{a:P,b:P if isptr{P}} = emit{isize, 'op -', a, b} 52 | 53 | extend (compare{'op =='}){__eq} 54 | extend (compare{'op !='}){__ne} 55 | extend (compare{'op >' }){__gt} 56 | extend (compare{'op >='}){__ge} 57 | extend (compare{'op <' }){__lt} 58 | extend (compare{'op <='}){__le} 59 | 60 | extend (logic{'op &'}){__and} 61 | extend (logic{'op |'}){__or } 62 | extend (logic{'op ^'}){__xor} 63 | 64 | extend (logic1{'~'}){__not} 65 | def __not{a:(u1)} = emit{u1, '!', a} 66 | 67 | extend (sh{'op <<'}){__shl} 68 | extend (sh{'op >>'}){__shr} 69 | 70 | def load{p:*T, i if anynum{i} and not is{T,void}} = 
emit{T, '^load', p, i} 71 | def store{p:*T, i, v:T if anynum{i} and not is{T,void}} = { emit{void, '^store', p, i, v}; v } 72 | def store{p:*T, i, v if is{'number',kind{v}} and anynum{i} and not is{T,void}} = store{p, i, cast{T, v}} 73 | 74 | def cast_i{T, x} = emit{T, '', x} 75 | -------------------------------------------------------------------------------- /include/arch/iintrinsic/basic.singeli: -------------------------------------------------------------------------------- 1 | def _iintrinsic_use_fill = 1 2 | include './basic_impl' 3 | -------------------------------------------------------------------------------- /include/arch/iintrinsic/basic_impl.singeli: -------------------------------------------------------------------------------- 1 | # This file should not be included directly: 2 | # instead use arch/iintrinsic/basic or arch/iintrinsic/basic_strict 3 | # which define whether fills should be used 4 | local { 5 | include 'skin/c' 6 | oper ~~ reinterpret infix right 55 7 | oper ** vec_broadcast infix right 55 8 | def num{x} = is{'number',kind{x}} 9 | 10 | def fmt_p{T, ...s} = { 11 | if (isfloat{T}) { 12 | if (width{T}==32) 'ps' else 'pd' 13 | } else { 14 | def sgn = match (s) { {{e}} => e; {_} => issigned{T} } 15 | merge{'ep', if (sgn) 'i' else 'u', fmtnat{width{T}}} 16 | } 17 | } 18 | def fmt_p{T, w if isint{T} and w>1} = merge{'si', fmtnat{w}} 19 | def intrin{name, V=[_]T, ...s} = { 20 | def w = width{V} 21 | def fw = if (w<=128) '' else fmtnat{w} 22 | merge{'_mm', fw, '_', name, '_', fmt_p{T, ...s}} 23 | } 24 | def intrin_b{name, V} = intrin{name, V, width{V}} 25 | def set_intrin_post{V=[_]T} = if (T==i64 and not av5{V}) 'x' else '' 26 | def vec_ptr{p:*V=[_]T} = if (isfloat{T}) *T~~p else p 27 | 28 | def sse{V} = 128==width{V} 29 | def avx{V} = 256==width{V} 30 | def av5{V} = 512==width{V} 31 | def sse_avx{V} = __or{...tup{128,256}==width{V}} 32 | def ew{[_]T} = width {T} 33 | def ef{[_]T} = isfloat{T} 34 | def ei{[_]T} = isint {T} 35 | def eu{[_]T} = 'u'==quality{T} 36 | def es{[_]T} = 'i'==quality{T} 37 | 38 | def change_qual{[k]T,q} = [k]primtype{q, width{T}} 39 | def uns = change_qual{.,'u'} 40 | def sgn = change_qual{.,'i'} 41 | 42 | def go = match { {[_]T} => T!=u1; {_} => 0 } 43 | } 44 | 45 | # Multi-instruction fills; slowest ones go first 46 | local def fill = _iintrinsic_use_fill 47 | def __not{a:V if fill and go{V} and has_arith{V}} = a ^ (V ** ~cast{eltype{V},0}) 48 | def __neg{a:V if fill and go{V} and has_arith{V}} = V**0 - a 49 | def __min{a:V, b:V if fill and go{V} and has_gt{V}} = { c:=V~~(ab); (a&c) | andnot{b,c} } 51 | def __max{a:V, b:V if fill and go{V} and has_satur{V} and eu{V}} = __subs{a,b}+b 52 | def __min{a:V, b:V if fill and go{V} and has_satur{V} and eu{V}} = a-__subs{a,b} 53 | def __lt{a:V, b:V if fill and go{V}} = b>a 54 | def __ge{a:V, b:V if fill and go{V}} = b<=a 55 | def __le{a:V, b:V if fill and go{V}} = ~(a>b) 56 | def __ne{a:V, b:V if fill and go{V}} = ~(b==a) 57 | def __gt{a:V, b:V if fill and go{V} and eu{V} and has_gt{sgn{V}}} = { 58 | t:= V**(1<<(ew{V}-1)) 59 | def I = sgn{V}; def s{v} = I~~(t^v) 60 | s{a} > s{b} 61 | } 62 | def __le{a:V, b:V if fill and go{V} and has_minmax{V}} = a==__min{a,b} 63 | def __gt{a:V, b:V if fill and go{V} and ~has_gt{V} and has_minmax{V}} = ~(a<=b) 64 | def __eq{a:V, b:V if fill and go{V} and ei{V} and ew{V}==64} = { def H=[4]u32; t := H~~a == H~~b; V~~(t & emit{H, '_mm_shuffle_epi32', t, 4b2301}) } 65 | def __abs{a:V if fill and go{V} and es{V} and has_rsh{V}} = { s:=a>>31; (s^a) - s } 66 | def 
__abs{a:V if fill and go{V} and es{V} and has_minmax{uns{V}}} = { u:=uns{V}~~a; V~~__min{u, -u} } 67 | def __abs{a:V if fill and go{V} and es{V} and has_minmax{ V }} = __max{a, -a} 68 | def __abs{a:V if fill and go{V} and has_arith{V} and ef{V}} = a & V~~uns{V}**(1<<(ew{V}-1)-1) 69 | def __shl{a:V, b:S if fill and go{V} and S<=u64 and has_shift{V}} = shl_uniform{a, vec_make{[2]u64, promote{u64,b}, 0}} 70 | def __shr{a:V, b:S if fill and go{V} and S<=u64 and has_rsh {V}} = shr_uniform{a, vec_make{[2]u64, promote{u64,b}, 0}} 71 | 72 | # Building vectors from scalars 73 | local { 74 | def can_elt = match { {[_]T, x:T} => 1; {_,x} => num{x} } 75 | def can_make_sub = can_elt 76 | def can_make_sub{V=[k]_, {...x}} = { 77 | def all{t} = is{t, 0 <= t} 78 | k==length{x} and all{each{can_elt{V,.}, x}} 79 | } 80 | def can_make{V,x} = go{V} and has_make{V} and can_make_sub{V,x} 81 | def mv_sub{m, V=[k]T, x} = { 82 | if ('u'!=quality{T}) { 83 | m{V, each{cast{T,.}, x}} 84 | } else { 85 | def w = width{T} 86 | def I = primtype{'i', w} 87 | def smax = 1<<(w-1) # Convert compile-time numbers in Singeli to avoid lots of cast instructions 88 | def ic{a} = I~~(if (num{a}) a - (a>=smax)<=32 193 | def has_extract{V if hasarch{'AVX2' } and avx{V} and fill} = 1 194 | def has_extract0{_} = 0 195 | def has_extract0{V if hasarch{'SSE2'} and sse{V} and ew{V}>=32} = 1 196 | 197 | def has_arith{_} = 0 # add, subtract, and, or, xor, andnot 198 | def has_arith{V==[4]f32 if hasarch{'SSE'}} = 1 199 | def has_arith{V if hasarch{'SSE2'} and sse{V}} = 1 200 | def has_arith{V if hasarch{'AVX'} and avx{V} and ef{V}} = 1 201 | def has_arith{V if hasarch{'AVX2'} and avx{V}} = 1 202 | def has_avx512{V} = ew{V}>=32 or hasarch{'AVX512BW'} 203 | def has_arith{V if hasarch{'AVX512F'} and av5{V}} = has_avx512{V} 204 | 205 | def has_satur{_} = 0 # saturating add/subtract 206 | def has_satur{V if hasarch{'SSE2'} and sse{V}} = ew{V}<=16 207 | def has_satur{V if hasarch{'AVX2'} and avx{V}} = ew{V}<=16 208 | def has_satur{V if hasarch{'AVX512BW'} and av5{V}} = ew{V}<=16 209 | 210 | def has_minmax{_} = 0 # min, max 211 | def has_minmax{V==[ 4]f32 if hasarch{'SSE' }} = 1 212 | def has_minmax{V==[ 2]f64 if hasarch{'SSE2'}} = 1 213 | def has_minmax{V==[ 8]i16 if hasarch{'SSE2'}} = 1 214 | def has_minmax{V==[16]u8 if hasarch{'SSE2'}} = 1 215 | def avx_minmax{[_]T} = isfloat{T} or width{T}<=32 216 | def has_minmax{V if hasarch{'SSE4.1'} and sse{V}} = avx_minmax{V} 217 | def has_minmax{V if hasarch{'AVX'} and avx{V} and ef{V}} = 1 218 | def has_minmax{V if hasarch{'AVX2'} and avx{V}} = avx_minmax{V} 219 | def has_minmax{V if hasarch{'AVX512F'} and av5{V}} = has_avx512{V} 220 | 221 | def has_mul{_} = 0 # same-width multiply (mullo for ints) 222 | def has_mul{V==[4]f32 if hasarch{'SSE'}} = 1 223 | def has_mul{V if hasarch{'SSE2' } and sse{V} and ef{V}} = 1 224 | def has_mul{V if hasarch{'SSE2' } and sse{V} and ew{V}==16} = 1 225 | def has_mul{V if hasarch{'SSE4.1'} and sse{V} and ew{V}==32} = 1 226 | def has_mul{V if hasarch{'AVX' } and avx{V} and ef{V}} = 1 227 | def has_mul{V if hasarch{'AVX2' } and avx{V} and ew{V}<=32 and ew{V}>=16} = 1 228 | def has_mul{V if hasarch{'AVX512F'} and av5{V}} = if (ef{V}) 1 else match (ew{V}) { 229 | {(32)}=>1; {(16)}=>hasarch{'AVX512BW'}; {(64)}=>hasarch{'AVX512DQ'} 230 | } 231 | 232 | def has_shift{_} = 0 # shift by scalar 233 | def has_shift{V if hasarch{'SSE2'} and sse{V} and ei{V} and ew{V}>=16} = 1 234 | def has_shift{V if hasarch{'AVX2'} and avx{V} and ei{V} and ew{V}>=16} = 1 235 | def has_shift{V if 
hasarch{'AVX512F'} and av5{V} and ei{V} and ew{V}>=16} = has_avx512{V} 236 | def has_rsh{V} = (eltype{V}!=i64 or av5{V}) and has_shift{V} 237 | def has_vshift{_} = 0 # shift by vector 238 | def has_vshift{V if hasarch{'AVX2'} and sse_avx{V} and ei{V} and ew{V}>=32} = 1 239 | def has_vshift{V if hasarch{'AVX512F'} and av5{V} and ei{V} and ew{V}>=16} = has_avx512{V} 240 | def has_vrsh{V} = (eltype{V}!=i64 or av5{V}) and has_vshift{V} 241 | 242 | def has_eq{_} = 0 # equals, integer only 243 | def has_eq{V if hasarch{'SSE2' } and sse{V}} = ew{V}<=32 244 | def has_eq{V if hasarch{'SSE4.1'} and sse{V}} = 1 245 | def has_eq{V if hasarch{'AVX2' } and avx{V}} = 1 246 | def has_gt{V} = es{V} and has_eq{V} 247 | def has_gt{([2]i64)} = hasarch{'SSE4.2'} 248 | 249 | # float comparisons, div, square root 250 | def has_float{V} = ef{V} and has_arith{V} 251 | def has_cmp_flt{V} = sse_avx{V} and has_float{V} 252 | # floor, ceiling, round 253 | def has_round{V} = hasarch{'SSE4.1'} and has_float{V} 254 | 255 | # abs, sign, avg 256 | def has_int_op{V=[_]T, arch_s, q, w, w512} = { 257 | if (q!=quality{T}) 0 258 | else if (sse_avx{V} and hasarch{if (sse{V}) arch_s else 'AVX2'}) width{T}<=w 259 | else hasarch{'AVX512F'} and av5{V} and has_avx512{V} and width{T}<=w512 260 | } 261 | } 262 | -------------------------------------------------------------------------------- /include/arch/iintrinsic/basic_strict.singeli: -------------------------------------------------------------------------------- 1 | def _iintrinsic_use_fill = 0 2 | include './basic_impl' 3 | -------------------------------------------------------------------------------- /include/arch/iintrinsic/misc.singeli: -------------------------------------------------------------------------------- 1 | local { 2 | def intvec{w,T} = 0 3 | def intvec{(width{V}),V=[_]T if isint{T}} = 1 4 | def num{T} = is{'number',kind{T}} 5 | } 6 | 7 | #SSE 8 | def __mulhi{a:T==[4]u16, b:T} = emit{T, '_mm_mulhi_pu16', a, b} 9 | def __pmulhuw{a:T==[4]u16, b:T} = emit{T, '_m_pmulhuw', a, b} 10 | def __cmpnlt{a:T==[4]f32, b:T} = emit{T, '_mm_cmpnlt_ps', a, b} 11 | def __cmpnle{a:T==[4]f32, b:T} = emit{T, '_mm_cmpnle_ps', a, b} 12 | def __cmpngt{a:T==[4]f32, b:T} = emit{T, '_mm_cmpngt_ps', a, b} 13 | def __cmpnge{a:T==[4]f32, b:T} = emit{T, '_mm_cmpnge_ps', a, b} 14 | def __cmpord{a:T==[4]f32, b:T} = emit{T, '_mm_cmpord_ps', a, b} 15 | def __cmpunord{a:T==[4]f32, b:T} = emit{T, '_mm_cmpunord_ps', a, b} 16 | def __cvt{a:T==[4]f32, b:(i32)} = emit{T, '_mm_cvt_si2ss', a, b} 17 | def __cvtpi32{a:T==[4]f32, b:([2]i32)} = emit{T, '_mm_cvtpi32_ps', a, b} 18 | def __cvt{a:T==[4]f32, b:([2]i32)} = emit{T, '_mm_cvt_pi2ps', a, b} 19 | def __cvtpi16{a:([4]i16)} = emit{[4]f32, '_mm_cvtpi16_ps', a} 20 | def __cvtpu16{a:([4]u16)} = emit{[4]f32, '_mm_cvtpu16_ps', a} 21 | def __cvtpi8{a:([8]i8)} = emit{[4]f32, '_mm_cvtpi8_ps', a} 22 | def __cvtpu8{a:([8]u8)} = emit{[4]f32, '_mm_cvtpu8_ps', a} 23 | def __cvtpi32x2{a:T==[2]i32, b:T} = emit{[4]f32, '_mm_cvtpi32x2_ps', a, b} 24 | def __cvtss_i32{a:([4]f32)} = emit{i32, '_mm_cvtss_si32', a} 25 | def __cvt_i32{a:([4]f32)} = emit{i32, '_mm_cvt_ss2si', a} 26 | def __cvtss_u64{a:([4]f32)} = emit{u64, '_mm_cvtss_si64', a} 27 | def __cvtss_f32{a:([4]f32)} = emit{f32, '_mm_cvtss_f32', a} 28 | def __cvtps_2f32{a:([4]f32)} = emit{[2]f32, '_mm_cvtps_pi32', a} 29 | def __cvt_2f32{a:([4]f32)} = emit{[2]f32, '_mm_cvt_ps2pi', a} 30 | def __cvttss_i32{a:([4]f32)} = emit{i32, '_mm_cvttss_si32', a} 31 | def __cvtt_i32{a:([4]f32)} = emit{i32, '_mm_cvtt_ss2si', a} 32 | 
def __cvttss_u64{a:([4]f32)} = emit{u64, '_mm_cvttss_si64', a} 33 | def __cvttps_2f32{a:([4]f32)} = emit{[2]f32, '_mm_cvttps_pi32', a} 34 | def __cvtt_2f32{a:([4]f32)} = emit{[2]f32, '_mm_cvtt_ps2pi', a} 35 | def __cvtps_4i16{a:([4]f32)} = emit{[4]i16, '_mm_cvtps_pi16', a} 36 | def __cvtps_8i8{a:([4]f32)} = emit{[8]i8, '_mm_cvtps_pi8', a} 37 | def __rcp{a:T==[4]f32} = emit{T, '_mm_rcp_ps', a} 38 | def __rsqrt{a:T==[4]f32} = emit{T, '_mm_rsqrt_ps', a} 39 | def __getcsr{} = emit{u32, '_mm_getcsr'} 40 | def __setcsr{a:(u32)} = emit{void, '_mm_setcsr', a} 41 | def __GET_EXCEPTION_STATE{} = emit{u32, '_MM_GET_EXCEPTION_STATE'} 42 | def __SET_EXCEPTION_STATE{a:(u32)} = emit{void, '_MM_SET_EXCEPTION_STATE', a} 43 | def __GET_EXCEPTION_MASK{} = emit{u32, '_MM_GET_EXCEPTION_MASK'} 44 | def __SET_EXCEPTION_MASK{a:(u32)} = emit{void, '_MM_SET_EXCEPTION_MASK', a} 45 | def __GET_ROUNDING_MODE{} = emit{u32, '_MM_GET_ROUNDING_MODE'} 46 | def __SET_ROUNDING_MODE{a:(u32)} = emit{void, '_MM_SET_ROUNDING_MODE', a} 47 | def __GET_FLUSH_ZERO_MODE{} = emit{u32, '_MM_GET_FLUSH_ZERO_MODE'} 48 | def __SET_FLUSH_ZERO_MODE{a:(u32)} = emit{void, '_MM_SET_FLUSH_ZERO_MODE', a} 49 | def __prefetch{p:*(u8), i if num{i}} = emit{void, '_mm_prefetch', p, i} 50 | def __sfence{} = emit{void, '_mm_sfence'} 51 | def __malloc{size:T==u64, align:T} = emit{__pnt{void}, '_mm_malloc', size, align} 52 | def __free{mem_addr:*(void)} = emit{void, '_mm_free', mem_addr} 53 | def __undefined_4f32{} = emit{[4]f32, '_mm_undefined_ps'} 54 | def __loadh{a:T==[4]f32, mem_addr:*([2]f32)} = emit{T, '_mm_loadh_pi', a, mem_addr} 55 | def __loadl{a:T==[4]f32, mem_addr:*([2]f32)} = emit{T, '_mm_loadl_pi', a, mem_addr} 56 | def __load1{mem_addr:*(f32)} = emit{[4]f32, '_mm_load1_ps', mem_addr} 57 | def __loadr{mem_addr:*(f32)} = emit{[4]f32, '_mm_loadr_ps', mem_addr} 58 | def __movemask{a:([8]i8)} = emit{i32, '_mm_movemask_pi8', a} 59 | def __pmovmskb{a:([8]u8)} = emit{i32, '_m_pmovmskb', a} 60 | def __movemask{a:([4]f32)} = emit{i32, '_mm_movemask_ps', a} 61 | def __sad{a:T==[8]u8, b:T} = emit{[4]u16, '_mm_sad_pu8', a, b} 62 | def __psadbw{a:T==[8]u8, b:T} = emit{[4]u16, '_m_psadbw', a, b} 63 | def __movehl{a:T==[4]f32, b:T} = emit{T, '_mm_movehl_ps', a, b} 64 | def __movelh{a:T==[4]f32, b:T} = emit{T, '_mm_movelh_ps', a, b} 65 | def __pavgb{a:T==[8]u8, b:T} = emit{T, '_m_pavgb', a, b} 66 | def __pavgw{a:T==[4]u16, b:T} = emit{T, '_m_pavgw', a, b} 67 | def __setzero_4f32{} = emit{[4]f32, '_mm_setzero_ps'} 68 | def __pmaxsw{a:T==[4]i16, b:T} = emit{T, '_m_pmaxsw', a, b} 69 | def __pmaxub{a:T==[8]u8, b:T} = emit{T, '_m_pmaxub', a, b} 70 | def __pminsw{a:T==[4]i16, b:T} = emit{T, '_m_pminsw', a, b} 71 | def __pminub{a:T==[8]u8, b:T} = emit{T, '_m_pminub', a, b} 72 | def __stream{mem_addr:*(void), a:([1]i64)} = emit{void, '_mm_stream_pi', mem_addr, a} 73 | def __maskmove{a:T==[8]u8, mask:T, mem_addr:*(u8)} = emit{void, '_mm_maskmove_si64', a, mask, mem_addr} 74 | def __maskmovq{a:T==[8]u8, mask:T, mem_addr:*(u8)} = emit{void, '_m_maskmovq', a, mask, mem_addr} 75 | def __stream{mem_addr:*(void), a:([4]f32)} = emit{void, '_mm_stream_ps', mem_addr, a} 76 | def __storeh{mem_addr:*([2]f32), a:([4]f32)} = emit{void, '_mm_storeh_pi', mem_addr, a} 77 | def __storel{mem_addr:*([2]f32), a:([4]f32)} = emit{void, '_mm_storel_pi', mem_addr, a} 78 | def __store1{mem_addr:*(f32), a:([4]f32)} = emit{void, '_mm_store1_ps', mem_addr, a} 79 | def __storer{mem_addr:*(f32), a:([4]f32)} = emit{void, '_mm_storer_ps', mem_addr, a} 80 | def __TRANSPOSE4{row0:T==[4]f32, row1:T, 
row2:T, row3:T} = emit{void, '_MM_TRANSPOSE4_PS', row0, row1, row2, row3} 81 | def __pextrw{a:([4]u16), imm8 if num{imm8}} = emit{i32, '_m_pextrw', a, imm8} 82 | def __pinsrw{a:T==[4]u16, i:(i32), imm8 if num{imm8}} = emit{T, '_m_pinsrw', a, i, imm8} 83 | def __pshufw{a:T==[4]u16, imm8 if num{imm8}} = emit{T, '_m_pshufw', a, imm8} 84 | 85 | 86 | #SSE2 87 | def __madd{a:T==[8]i16, b:T} = emit{[4]i32, '_mm_madd_epi16', a, b} 88 | def __mulhi{a:T==[8]i16, b:T} = emit{T, '_mm_mulhi_epi16', a, b} 89 | def __mulhi{a:T==[8]u16, b:T} = emit{T, '_mm_mulhi_epu16', a, b} 90 | def __mul{a:T==[4]u32, b:T} = emit{[2]u64, '_mm_mul_epu32', a, b} 91 | def __castpd{a:([2]f64)} = emit{[4]f32, '_mm_castpd_ps', a} 92 | def __castps{a:([4]f32)} = emit{[2]f64, '_mm_castps_pd', a} 93 | def __castsi128{a:([2]u64)} = emit{[2]f64, '_mm_castsi128_pd', a} 94 | def __castsi128{a:([4]u32)} = emit{[4]f32, '_mm_castsi128_ps', a} 95 | def __cmpord{a:T==[2]f64, b:T} = emit{T, '_mm_cmpord_pd', a, b} 96 | def __cmpunord{a:T==[2]f64, b:T} = emit{T, '_mm_cmpunord_pd', a, b} 97 | def __cmpnlt{a:T==[2]f64, b:T} = emit{T, '_mm_cmpnlt_pd', a, b} 98 | def __cmpnle{a:T==[2]f64, b:T} = emit{T, '_mm_cmpnle_pd', a, b} 99 | def __cmpngt{a:T==[2]f64, b:T} = emit{T, '_mm_cmpngt_pd', a, b} 100 | def __cmpnge{a:T==[2]f64, b:T} = emit{T, '_mm_cmpnge_pd', a, b} 101 | def __cvtepi32_2f64{a:([4]i32)} = emit{[2]f64, '_mm_cvtepi32_pd', a} 102 | def __cvtepi32_4f32{a:([4]i32)} = emit{[4]f32, '_mm_cvtepi32_ps', a} 103 | def __cvtpi32{a:([2]i32)} = emit{[2]f64, '_mm_cvtpi32_pd', a} 104 | def __cvtsi32{a:(i32)} = emit{[4]u32, '_mm_cvtsi32_si128', a} 105 | def __cvtsi64{a:(u64)} = emit{[2]u64, '_mm_cvtsi64_si128', a} 106 | def __cvtsi64x{a:(u64)} = emit{[2]u64, '_mm_cvtsi64x_si128', a} 107 | def __cvtsi128{a:([4]u32)} = emit{i32, '_mm_cvtsi128_si32', a} 108 | def __cvtsi128_si64{a:([2]u64)} = emit{u64, '_mm_cvtsi128_si64', a} 109 | def __cvtsi128_si64x{a:([2]u64)} = emit{u64, '_mm_cvtsi128_si64x', a} 110 | def __cvtpd_4f32{a:([2]f64)} = emit{[4]f32, '_mm_cvtpd_ps', a} 111 | def __cvtps_2f64{a:([4]f32)} = emit{[2]f64, '_mm_cvtps_pd', a} 112 | def __cvtpd_4i32{a:([2]f64)} = emit{[4]i32, '_mm_cvtpd_epi32', a} 113 | def __cvtsd_si32{a:([2]f64)} = emit{i32, '_mm_cvtsd_si32', a} 114 | def __cvtsd_si64{a:([2]f64)} = emit{u64, '_mm_cvtsd_si64', a} 115 | def __cvtsd_si64x{a:([2]f64)} = emit{u64, '_mm_cvtsd_si64x', a} 116 | def __cvtsd_f64{a:([2]f64)} = emit{f64, '_mm_cvtsd_f64', a} 117 | def __cvttpd_4i32{a:([2]f64)} = emit{[4]i32, '_mm_cvttpd_epi32', a} 118 | def __cvttsd_si32{a:([2]f64)} = emit{i32, '_mm_cvttsd_si32', a} 119 | def __cvttsd_si64{a:([2]f64)} = emit{u64, '_mm_cvttsd_si64', a} 120 | def __cvttsd_si64x{a:([2]f64)} = emit{u64, '_mm_cvttsd_si64x', a} 121 | def __cvtps_4i32{a:([4]f32)} = emit{[4]i32, '_mm_cvtps_epi32', a} 122 | def __cvttps_4i32{a:([4]f32)} = emit{[4]i32, '_mm_cvttps_epi32', a} 123 | def __cvtpd_2f32{a:([2]f64)} = emit{[2]f32, '_mm_cvtpd_pi32', a} 124 | def __cvttpd_2f32{a:([2]f64)} = emit{[2]f32, '_mm_cvttpd_pi32', a} 125 | def __undefined_2f64{} = emit{[2]f64, '_mm_undefined_pd'} 126 | def __pause{} = emit{void, '_mm_pause'} 127 | def __clflush{p:*(void)} = emit{void, '_mm_clflush', p} 128 | def __lfence{} = emit{void, '_mm_lfence'} 129 | def __mfence{} = emit{void, '_mm_mfence'} 130 | def __loadl{mem_addr:*T==[2]i64} = emit{T, '_mm_loadl_epi64', mem_addr} 131 | def __load1{mem_addr:*(f64)} = emit{[2]f64, '_mm_load1_pd', mem_addr} 132 | def __loadr{mem_addr:*(f64)} = emit{[2]f64, '_mm_loadr_pd', mem_addr} 133 | def 
__loadh{a:T==[2]f64, mem_addr:*(f64)} = emit{T, '_mm_loadh_pd', a, mem_addr} 134 | def __loadl{a:T==[2]f64, mem_addr:*(f64)} = emit{T, '_mm_loadl_pd', a, mem_addr} 135 | def __movepi64{a:([2]i64)} = emit{[2]f32, '_mm_movepi64_pi64', a} 136 | def __packs{a:T==[8]i16, b:T} = emit{[16]i8, '_mm_packs_epi16', a, b} 137 | def __packs{a:T==[4]i32, b:T} = emit{[8]i16, '_mm_packs_epi32', a, b} 138 | def __packus{a:T==[8]i16, b:T} = emit{[16]i8, '_mm_packus_epi16', a, b} 139 | def __movemask{a:([16]i8)} = emit{i32, '_mm_movemask_epi8', a} 140 | def __movemask{a:([2]f64)} = emit{i32, '_mm_movemask_pd', a} 141 | def __sad{a:T==[16]u8, b:T} = emit{[8]u16, '_mm_sad_epu8', a, b} 142 | def __movpi64{a:([1]i64)} = emit{[2]i64, '_mm_movpi64_epi64', a} 143 | def __move{a:T==[2]i64} = emit{T, '_mm_move_epi64', a} 144 | def __setzero_2f64{} = emit{[2]f64, '_mm_setzero_pd'} 145 | def __maskmoveu{a:T==[16]u8, mask:T, mem_addr:*(u8)} = emit{void, '_mm_maskmoveu_si128', a, mask, mem_addr} 146 | def __storel{mem_addr:*T==[2]i64, a:T} = emit{void, '_mm_storel_epi64', mem_addr, a} 147 | def __stream{mem_addr:*(void), a:T if intvec{128,T}} = emit{void, '_mm_stream_si128', mem_addr, a} 148 | def __stream{mem_addr:*(void), a:(i32)} = emit{void, '_mm_stream_si32', mem_addr, a} 149 | def __stream{mem_addr:*(void), a:(u64)} = emit{void, '_mm_stream_si64', mem_addr, a} 150 | def __stream{mem_addr:*(void), a:([2]f64)} = emit{void, '_mm_stream_pd', mem_addr, a} 151 | def __store1{mem_addr:*(f64), a:([2]f64)} = emit{void, '_mm_store1_pd', mem_addr, a} 152 | def __storer{mem_addr:*(f64), a:([2]f64)} = emit{void, '_mm_storer_pd', mem_addr, a} 153 | def __storeh{mem_addr:*(f64), a:([2]f64)} = emit{void, '_mm_storeh_pd', mem_addr, a} 154 | def __storel{mem_addr:*(f64), a:([2]f64)} = emit{void, '_mm_storel_pd', mem_addr, a} 155 | 156 | 157 | #SSE3 158 | def __addsub{a:T==[4]f32, b:T} = emit{T, '_mm_addsub_ps', a, b} 159 | def __addsub{a:T==[2]f64, b:T} = emit{T, '_mm_addsub_pd', a, b} 160 | def __hadd{a:T==[2]f64, b:T} = emit{T, '_mm_hadd_pd', a, b} 161 | def __hadd{a:T==[4]f32, b:T} = emit{T, '_mm_hadd_ps', a, b} 162 | def __hsub{a:T==[2]f64, b:T} = emit{T, '_mm_hsub_pd', a, b} 163 | def __hsub{a:T==[4]f32, b:T} = emit{T, '_mm_hsub_ps', a, b} 164 | def __loaddup{mem_addr:*(f64)} = emit{[2]f64, '_mm_loaddup_pd', mem_addr} 165 | def __movedup{a:T==[2]f64} = emit{T, '_mm_movedup_pd', a} 166 | def __movehdup{a:T==[4]f32} = emit{T, '_mm_movehdup_ps', a} 167 | def __moveldup{a:T==[4]f32} = emit{T, '_mm_moveldup_ps', a} 168 | 169 | 170 | #SSSE3 171 | def __hadd{a:T==[8]i16, b:T} = emit{T, '_mm_hadd_epi16', a, b} 172 | def __hadds{a:T==[8]i16, b:T} = emit{T, '_mm_hadds_epi16', a, b} 173 | def __hadd{a:T==[4]i32, b:T} = emit{T, '_mm_hadd_epi32', a, b} 174 | def __hadd{a:T==[4]i16, b:T} = emit{T, '_mm_hadd_pi16', a, b} 175 | def __hadd{a:T==[2]i32, b:T} = emit{T, '_mm_hadd_pi32', a, b} 176 | def __hadds{a:T==[4]i16, b:T} = emit{T, '_mm_hadds_pi16', a, b} 177 | def __hsub{a:T==[8]i16, b:T} = emit{T, '_mm_hsub_epi16', a, b} 178 | def __hsubs{a:T==[8]i16, b:T} = emit{T, '_mm_hsubs_epi16', a, b} 179 | def __hsub{a:T==[4]i32, b:T} = emit{T, '_mm_hsub_epi32', a, b} 180 | def __hsub{a:T==[4]i16, b:T} = emit{T, '_mm_hsub_pi16', a, b} 181 | def __hsub{a:T==[2]i32, b:T} = emit{T, '_mm_hsub_pi32', a, b} 182 | def __hsubs{a:T==[4]i16, b:T} = emit{T, '_mm_hsubs_pi16', a, b} 183 | def __maddubs{a:T==[16]i8, b:T} = emit{[8]i16, '_mm_maddubs_epi16', a, b} 184 | def __maddubs{a:T==[8]i8, b:T} = emit{[4]i16, '_mm_maddubs_pi16', a, b} 185 | def 
__mulhrs{a:T==[8]i16, b:T} = emit{T, '_mm_mulhrs_epi16', a, b} 186 | def __mulhrs{a:T==[4]i16, b:T} = emit{T, '_mm_mulhrs_pi16', a, b} 187 | 188 | 189 | #SSE4.1 190 | def __dp{a:T==[2]f64, b:T, imm8 if num{imm8}} = emit{T, '_mm_dp_pd', a, b, imm8} 191 | def __dp{a:T==[4]f32, b:T, imm8 if num{imm8}} = emit{T, '_mm_dp_ps', a, b, imm8} 192 | def __mul{a:T==[4]i32, b:T} = emit{[2]i64, '_mm_mul_epi32', a, b} 193 | def __cvtepi8_8i16{a:([16]i8)} = emit{[8]i16, '_mm_cvtepi8_epi16', a} 194 | def __cvtepi8_4i32{a:([16]i8)} = emit{[4]i32, '_mm_cvtepi8_epi32', a} 195 | def __cvtepi8_2i64{a:([16]i8)} = emit{[2]i64, '_mm_cvtepi8_epi64', a} 196 | def __cvtepi16_4i32{a:([8]i16)} = emit{[4]i32, '_mm_cvtepi16_epi32', a} 197 | def __cvtepi16_2i64{a:([8]i16)} = emit{[2]i64, '_mm_cvtepi16_epi64', a} 198 | def __cvtepi32_2i64{a:([4]i32)} = emit{[2]i64, '_mm_cvtepi32_epi64', a} 199 | def __cvtepu8_8i16{a:([16]i8)} = emit{[8]i16, '_mm_cvtepu8_epi16', a} 200 | def __cvtepu8_4i32{a:([16]i8)} = emit{[4]i32, '_mm_cvtepu8_epi32', a} 201 | def __cvtepu8_2i64{a:([16]i8)} = emit{[2]i64, '_mm_cvtepu8_epi64', a} 202 | def __cvtepu16_4i32{a:([8]i16)} = emit{[4]i32, '_mm_cvtepu16_epi32', a} 203 | def __cvtepu16_2i64{a:([8]i16)} = emit{[2]i64, '_mm_cvtepu16_epi64', a} 204 | def __cvtepu32_2i64{a:([4]i32)} = emit{[2]i64, '_mm_cvtepu32_epi64', a} 205 | def __testz{a:T, b:T if intvec{128,T}} = emit{i32, '_mm_testz_si128', a, b} 206 | def __testc{a:T, b:T if intvec{128,T}} = emit{i32, '_mm_testc_si128', a, b} 207 | def __testnzc{a:T, b:T if intvec{128,T}} = emit{i32, '_mm_testnzc_si128', a, b} 208 | def __test_all_zeros{mask:T, a:T if intvec{128,T}} = emit{i32, '_mm_test_all_zeros', mask, a} 209 | def __test_mix_ones_zeros{mask:T, a:T if intvec{128,T}} = emit{i32, '_mm_test_mix_ones_zeros', mask, a} 210 | def __test{a:T if intvec{128,T}} = emit{i32, '_mm_test_all_ones', a} 211 | def __minpos{a:T==[8]u16} = emit{T, '_mm_minpos_epu16', a} 212 | def __mpsadbw{a:T==[16]u8, b:T, imm8 if num{imm8}} = emit{T, '_mm_mpsadbw_epu8', a, b, imm8} 213 | def __packus{a:T==[4]i32, b:T} = emit{[8]i16, '_mm_packus_epi32', a, b} 214 | 215 | 216 | #SSE4.2 217 | def __crc32{crc:T==u32, v:(u8)} = emit{T, '_mm_crc32_u8', crc, v} 218 | def __crc32{crc:T==u32, v:(u16)} = emit{T, '_mm_crc32_u16', crc, v} 219 | def __crc32{crc:T==u32, v:T} = emit{T, '_mm_crc32_u32', crc, v} 220 | def __crc32{crc:T==u64, v:T} = emit{T, '_mm_crc32_u64', crc, v} 221 | def __cmpistrm{a:T, b:T, imm8 if intvec{128,T} and num{imm8}} = emit{T, '_mm_cmpistrm', a, b, imm8} 222 | def __cmpistri{a:T, b:T, imm8 if intvec{128,T} and num{imm8}} = emit{i32, '_mm_cmpistri', a, b, imm8} 223 | def __cmpistrz{a:T, b:T, imm8 if intvec{128,T} and num{imm8}} = emit{i32, '_mm_cmpistrz', a, b, imm8} 224 | def __cmpistrc{a:T, b:T, imm8 if intvec{128,T} and num{imm8}} = emit{i32, '_mm_cmpistrc', a, b, imm8} 225 | def __cmpistrs{a:T, b:T, imm8 if intvec{128,T} and num{imm8}} = emit{i32, '_mm_cmpistrs', a, b, imm8} 226 | def __cmpistro{a:T, b:T, imm8 if intvec{128,T} and num{imm8}} = emit{i32, '_mm_cmpistro', a, b, imm8} 227 | def __cmpistra{a:T, b:T, imm8 if intvec{128,T} and num{imm8}} = emit{i32, '_mm_cmpistra', a, b, imm8} 228 | def __cmpestrm{a:T, la:S==i32, b:T, lb:S, imm8 if intvec{128,T} and num{imm8}} = emit{T, '_mm_cmpestrm', a, la, b, lb, imm8} 229 | def __cmpestri{a:T, la:S==i32, b:T, lb:S, imm8 if intvec{128,T} and num{imm8}} = emit{S, '_mm_cmpestri', a, la, b, lb, imm8} 230 | def __cmpestrz{a:T, la:S==i32, b:T, lb:S, imm8 if intvec{128,T} and num{imm8}} = emit{S, '_mm_cmpestrz', a, la, 
b, lb, imm8} 231 | def __cmpestrc{a:T, la:S==i32, b:T, lb:S, imm8 if intvec{128,T} and num{imm8}} = emit{S, '_mm_cmpestrc', a, la, b, lb, imm8} 232 | def __cmpestrs{a:T, la:S==i32, b:T, lb:S, imm8 if intvec{128,T} and num{imm8}} = emit{S, '_mm_cmpestrs', a, la, b, lb, imm8} 233 | def __cmpestro{a:T, la:S==i32, b:T, lb:S, imm8 if intvec{128,T} and num{imm8}} = emit{S, '_mm_cmpestro', a, la, b, lb, imm8} 234 | def __cmpestra{a:T, la:S==i32, b:T, lb:S, imm8 if intvec{128,T} and num{imm8}} = emit{S, '_mm_cmpestra', a, la, b, lb, imm8} 235 | 236 | 237 | #AVX 238 | def __addsub{a:T==[4]f64, b:T} = emit{T, '_mm256_addsub_pd', a, b} 239 | def __addsub{a:T==[8]f32, b:T} = emit{T, '_mm256_addsub_ps', a, b} 240 | def __dp{a:T==[8]f32, b:T, imm8 if num{imm8}} = emit{T, '_mm256_dp_ps', a, b, imm8} 241 | def __hadd{a:T==[4]f64, b:T} = emit{T, '_mm256_hadd_pd', a, b} 242 | def __hadd{a:T==[8]f32, b:T} = emit{T, '_mm256_hadd_ps', a, b} 243 | def __hsub{a:T==[4]f64, b:T} = emit{T, '_mm256_hsub_pd', a, b} 244 | def __hsub{a:T==[8]f32, b:T} = emit{T, '_mm256_hsub_ps', a, b} 245 | def __castpd{a:([4]f64)} = emit{[8]f32, '_mm256_castpd_ps', a} 246 | def __castps{a:([8]f32)} = emit{[4]f64, '_mm256_castps_pd', a} 247 | def __castsi256{a:([8]u32)} = emit{[8]f32, '_mm256_castsi256_ps', a} 248 | def __castsi256{a:([4]u64)} = emit{[4]f64, '_mm256_castsi256_pd', a} 249 | def __castps256{a:([8]f32)} = emit{[4]f32, '_mm256_castps256_ps128', a} 250 | def __castpd256{a:([4]f64)} = emit{[2]f64, '_mm256_castpd256_pd128', a} 251 | def __castps128{a:([4]f32)} = emit{[8]f32, '_mm256_castps128_ps256', a} 252 | def __castpd128{a:([2]f64)} = emit{[4]f64, '_mm256_castpd128_pd256', a} 253 | def __zextps128{a:([4]f32)} = emit{[8]f32, '_mm256_zextps128_ps256', a} 254 | def __zextpd128{a:([2]f64)} = emit{[4]f64, '_mm256_zextpd128_pd256', a} 255 | def __cmp{a:T==[2]f64, b:T, imm8 if num{imm8}} = emit{T, '_mm_cmp_pd', a, b, imm8} 256 | def __cmp{a:T==[4]f32, b:T, imm8 if num{imm8}} = emit{T, '_mm_cmp_ps', a, b, imm8} 257 | def __cvtepi32_4f64{a:([4]i32)} = emit{[4]f64, '_mm256_cvtepi32_pd', a} 258 | def __cvtepi32{a:([8]i32)} = emit{[8]f32, '_mm256_cvtepi32_ps', a} 259 | def __cvtpd_4f32{a:([4]f64)} = emit{[4]f32, '_mm256_cvtpd_ps', a} 260 | def __cvtps{a:([8]f32)} = emit{[8]i32, '_mm256_cvtps_epi32', a} 261 | def __cvtps_4f64{a:([4]f32)} = emit{[4]f64, '_mm256_cvtps_pd', a} 262 | def __cvttpd{a:([4]f64)} = emit{[4]i32, '_mm256_cvttpd_epi32', a} 263 | def __cvtpd_4i32{a:([4]f64)} = emit{[4]i32, '_mm256_cvtpd_epi32', a} 264 | def __cvttps{a:([8]f32)} = emit{[8]i32, '_mm256_cvttps_epi32', a} 265 | def __cvtss{a:([8]f32)} = emit{f32, '_mm256_cvtss_f32', a} 266 | def __cvtsd{a:([4]f64)} = emit{f64, '_mm256_cvtsd_f64', a} 267 | def __cvtsi256{a:([8]u32)} = emit{i32, '_mm256_cvtsi256_si32', a} 268 | def __rcp{a:T==[8]f32} = emit{T, '_mm256_rcp_ps', a} 269 | def __rsqrt{a:T==[8]f32} = emit{T, '_mm256_rsqrt_ps', a} 270 | def __zeroall{} = emit{void, '_mm256_zeroall'} 271 | def __zeroupper{} = emit{void, '_mm256_zeroupper'} 272 | def __undefined_8f32{} = emit{[8]f32, '_mm256_undefined_ps'} 273 | def __undefined_4f64{} = emit{[4]f64, '_mm256_undefined_pd'} 274 | def __maskload{mem_addr:*(f64), mask:T if intvec{256,T}} = emit{[4]f64, '_mm256_maskload_pd', mem_addr, mask} 275 | def __maskload{mem_addr:*(f64), mask:T if intvec{128,T}} = emit{[2]f64, '_mm_maskload_pd', mem_addr, mask} 276 | def __maskload{mem_addr:*(f32), mask:T if intvec{256,T}} = emit{[8]f32, '_mm256_maskload_ps', mem_addr, mask} 277 | def __maskload{mem_addr:*(f32), mask:T if 
intvec{128,T}} = emit{[4]f32, '_mm_maskload_ps', mem_addr, mask} 278 | def __loadu2{hiaddr:*T==f32, loaddr:*T} = emit{[8]f32, '_mm256_loadu2_m128', hiaddr, loaddr} 279 | def __loadu2{hiaddr:*T==f64, loaddr:*T} = emit{[4]f64, '_mm256_loadu2_m128d', hiaddr, loaddr} 280 | def __testz{a:T, b:T if intvec{256,T}} = emit{i32, '_mm256_testz_si256', a, b} 281 | def __testc{a:T, b:T if intvec{256,T}} = emit{i32, '_mm256_testc_si256', a, b} 282 | def __testnzc{a:T, b:T if intvec{256,T}} = emit{i32, '_mm256_testnzc_si256', a, b} 283 | def __testz{a:T==[4]f64, b:T} = emit{i32, '_mm256_testz_pd', a, b} 284 | def __testc{a:T==[4]f64, b:T} = emit{i32, '_mm256_testc_pd', a, b} 285 | def __testnzc{a:T==[4]f64, b:T} = emit{i32, '_mm256_testnzc_pd', a, b} 286 | def __testz{a:T==[2]f64, b:T} = emit{i32, '_mm_testz_pd', a, b} 287 | def __testc{a:T==[2]f64, b:T} = emit{i32, '_mm_testc_pd', a, b} 288 | def __testnzc{a:T==[2]f64, b:T} = emit{i32, '_mm_testnzc_pd', a, b} 289 | def __testz{a:T==[8]f32, b:T} = emit{i32, '_mm256_testz_ps', a, b} 290 | def __testc{a:T==[8]f32, b:T} = emit{i32, '_mm256_testc_ps', a, b} 291 | def __testnzc{a:T==[8]f32, b:T} = emit{i32, '_mm256_testnzc_ps', a, b} 292 | def __testz{a:T==[4]f32, b:T} = emit{i32, '_mm_testz_ps', a, b} 293 | def __testc{a:T==[4]f32, b:T} = emit{i32, '_mm_testc_ps', a, b} 294 | def __testnzc{a:T==[4]f32, b:T} = emit{i32, '_mm_testnzc_ps', a, b} 295 | def __movemask{a:([4]f64)} = emit{i32, '_mm256_movemask_pd', a} 296 | def __movemask{a:([8]f32)} = emit{i32, '_mm256_movemask_ps', a} 297 | def __movehdup{a:T==[8]f32} = emit{T, '_mm256_movehdup_ps', a} 298 | def __moveldup{a:T==[8]f32} = emit{T, '_mm256_moveldup_ps', a} 299 | def __movedup{a:T==[4]f64} = emit{T, '_mm256_movedup_pd', a} 300 | def __setzero_4f64{} = emit{[4]f64, '_mm256_setzero_pd'} 301 | def __setzero_8f32{} = emit{[8]f32, '_mm256_setzero_ps'} 302 | def __set{hi:T==[4]f32, lo:T} = emit{[8]f32, '_mm256_set_m128', hi, lo} 303 | def __set{hi:T==[2]f64, lo:T} = emit{[4]f64, '_mm256_set_m128d', hi, lo} 304 | def __setr{lo:T==[4]f32, hi:T} = emit{[8]f32, '_mm256_setr_m128', lo, hi} 305 | def __setr{lo:T==[2]f64, hi:T} = emit{[4]f64, '_mm256_setr_m128d', lo, hi} 306 | def __maskstore{mem_addr:*(f64), mask:T, a:([4]f64) if intvec{256,T}} = emit{void, '_mm256_maskstore_pd', mem_addr, mask, a} 307 | def __maskstore{mem_addr:*(f64), mask:T, a:([2]f64) if intvec{128,T}} = emit{void, '_mm_maskstore_pd', mem_addr, mask, a} 308 | def __maskstore{mem_addr:*(f32), mask:T, a:([8]f32) if intvec{256,T}} = emit{void, '_mm256_maskstore_ps', mem_addr, mask, a} 309 | def __maskstore{mem_addr:*(f32), mask:T, a:([4]f32) if intvec{128,T}} = emit{void, '_mm_maskstore_ps', mem_addr, mask, a} 310 | def __stream{mem_addr:*(void), a:T if intvec{256,T}} = emit{void, '_mm256_stream_si256', mem_addr, a} 311 | def __stream{mem_addr:*(void), a:([4]f64)} = emit{void, '_mm256_stream_pd', mem_addr, a} 312 | def __stream{mem_addr:*(void), a:([8]f32)} = emit{void, '_mm256_stream_ps', mem_addr, a} 313 | def __storeu2{hiaddr:*T==f32, loaddr:*T, a:([8]f32)} = emit{void, '_mm256_storeu2_m128', hiaddr, loaddr, a} 314 | def __storeu2{hiaddr:*T==f64, loaddr:*T, a:([4]f64)} = emit{void, '_mm256_storeu2_m128d', hiaddr, loaddr, a} 315 | def __extractf128{a:([8]f32), imm8 if num{imm8}} = emit{[4]f32, '_mm256_extractf128_ps', a, imm8} 316 | def __extractf128{a:([4]f64), imm8 if num{imm8}} = emit{[2]f64, '_mm256_extractf128_pd', a, imm8} 317 | def __insertf128{a:T==[8]f32, b:([4]f32), imm8 if num{imm8}} = emit{T, '_mm256_insertf128_ps', a, b, imm8} 
318 | def __insertf128{a:T==[4]f64, b:([2]f64), imm8 if num{imm8}} = emit{T, '_mm256_insertf128_pd', a, b, imm8} 319 | def __insertf128{a:T, b:S, imm8 if intvec{256,T} and intvec{128,S} and num{imm8}} = emit{T, '_mm256_insertf128_si256', a, b, imm8} 320 | def __broadcast{mem_addr:*([4]f32)} = emit{[8]f32, '_mm256_broadcast_ps', mem_addr} 321 | def __broadcast{mem_addr:*([2]f64)} = emit{[4]f64, '_mm256_broadcast_pd', mem_addr} 322 | 323 | 324 | #AVX2 325 | def __hadd{a:T==[16]i16, b:T} = emit{T, '_mm256_hadd_epi16', a, b} 326 | def __hadd{a:T==[8]i32, b:T} = emit{T, '_mm256_hadd_epi32', a, b} 327 | def __hadds{a:T==[16]i16, b:T} = emit{T, '_mm256_hadds_epi16', a, b} 328 | def __hsub{a:T==[16]i16, b:T} = emit{T, '_mm256_hsub_epi16', a, b} 329 | def __hsub{a:T==[8]i32, b:T} = emit{T, '_mm256_hsub_epi32', a, b} 330 | def __hsubs{a:T==[16]i16, b:T} = emit{T, '_mm256_hsubs_epi16', a, b} 331 | def __madd{a:T==[16]i16, b:T} = emit{[8]i32, '_mm256_madd_epi16', a, b} 332 | def __maddubs{a:T==[32]i8, b:T} = emit{[16]i16, '_mm256_maddubs_epi16', a, b} 333 | def __mul{a:T==[8]i32, b:T} = emit{[4]i64, '_mm256_mul_epi32', a, b} 334 | def __mul{a:T==[8]u32, b:T} = emit{[4]u64, '_mm256_mul_epu32', a, b} 335 | def __mulhi{a:T==[16]i16, b:T} = emit{T, '_mm256_mulhi_epi16', a, b} 336 | def __mulhi{a:T==[16]u16, b:T} = emit{T, '_mm256_mulhi_epu16', a, b} 337 | def __mulhrs{a:T==[16]i16, b:T} = emit{T, '_mm256_mulhrs_epi16', a, b} 338 | def __sad{a:T==[32]u8, b:T} = emit{[16]u16, '_mm256_sad_epu8', a, b} 339 | def __cvtepi16_8i32{a:([8]i16)} = emit{[8]i32, '_mm256_cvtepi16_epi32', a} 340 | def __cvtepi16_4i64{a:([8]i16)} = emit{[4]i64, '_mm256_cvtepi16_epi64', a} 341 | def __cvtepi32_4i64{a:([4]i32)} = emit{[4]i64, '_mm256_cvtepi32_epi64', a} 342 | def __cvtepi8_16i16{a:([16]i8)} = emit{[16]i16, '_mm256_cvtepi8_epi16', a} 343 | def __cvtepi8_8i32{a:([16]i8)} = emit{[8]i32, '_mm256_cvtepi8_epi32', a} 344 | def __cvtepi8_4i64{a:([16]i8)} = emit{[4]i64, '_mm256_cvtepi8_epi64', a} 345 | def __cvtepu16_8i32{a:([8]i16)} = emit{[8]i32, '_mm256_cvtepu16_epi32', a} 346 | def __cvtepu16_4i64{a:([8]i16)} = emit{[4]i64, '_mm256_cvtepu16_epi64', a} 347 | def __cvtepu32_4i64{a:([4]i32)} = emit{[4]i64, '_mm256_cvtepu32_epi64', a} 348 | def __cvtepu8_16i16{a:([16]i8)} = emit{[16]i16, '_mm256_cvtepu8_epi16', a} 349 | def __cvtepu8_8i32{a:([16]i8)} = emit{[8]i32, '_mm256_cvtepu8_epi32', a} 350 | def __cvtepu8_4i64{a:([16]i8)} = emit{[4]i64, '_mm256_cvtepu8_epi64', a} 351 | def __i32gather_2f64{base_addr:*(f64), vindex:([4]i32), scale if num{scale}} = emit{[2]f64, '_mm_i32gather_pd', base_addr, vindex, scale} 352 | def __i32gather_4f64{base_addr:*(f64), vindex:([4]i32), scale if num{scale}} = emit{[4]f64, '_mm256_i32gather_pd', base_addr, vindex, scale} 353 | def __i32gather{base_addr:*(f32), vindex:([4]i32), scale if num{scale}} = emit{[4]f32, '_mm_i32gather_ps', base_addr, vindex, scale} 354 | def __i32gather{base_addr:*(f32), vindex:([8]i32), scale if num{scale}} = emit{[8]f32, '_mm256_i32gather_ps', base_addr, vindex, scale} 355 | def __i32gather{base_addr:*(i32), vindex:T==[4]i32, scale if num{scale}} = emit{T, '_mm_i32gather_epi32', base_addr, vindex, scale} 356 | def __i32gather{base_addr:*(i32), vindex:T==[8]i32, scale if num{scale}} = emit{T, '_mm256_i32gather_epi32', base_addr, vindex, scale} 357 | def __i32gather_2i64{base_addr:*(i64), vindex:([4]i32), scale if num{scale}} = emit{[2]i64, '_mm_i32gather_epi64', base_addr, vindex, scale} 358 | def __i32gather_4i64{base_addr:*(i64), vindex:([4]i32), scale if num{scale}} = 
emit{[4]i64, '_mm256_i32gather_epi64', base_addr, vindex, scale} 359 | def __i64gather{base_addr:*(f64), vindex:([2]i64), scale if num{scale}} = emit{[2]f64, '_mm_i64gather_pd', base_addr, vindex, scale} 360 | def __i64gather{base_addr:*(f64), vindex:([4]i64), scale if num{scale}} = emit{[4]f64, '_mm256_i64gather_pd', base_addr, vindex, scale} 361 | def __i64gather{base_addr:*(f32), vindex:([2]i64), scale if num{scale}} = emit{[4]f32, '_mm_i64gather_ps', base_addr, vindex, scale} 362 | def __i64gather{base_addr:*(f32), vindex:([4]i64), scale if num{scale}} = emit{[4]f32, '_mm256_i64gather_ps', base_addr, vindex, scale} 363 | def __i64gather{base_addr:*(i32), vindex:([2]i64), scale if num{scale}} = emit{[4]i32, '_mm_i64gather_epi32', base_addr, vindex, scale} 364 | def __i64gather{base_addr:*(i32), vindex:([4]i64), scale if num{scale}} = emit{[4]i32, '_mm256_i64gather_epi32', base_addr, vindex, scale} 365 | def __i64gather{base_addr:*(i64), vindex:T==[2]i64, scale if num{scale}} = emit{T, '_mm_i64gather_epi64', base_addr, vindex, scale} 366 | def __i64gather{base_addr:*(i64), vindex:T==[4]i64, scale if num{scale}} = emit{T, '_mm256_i64gather_epi64', base_addr, vindex, scale} 367 | def __mask_i32gather{src:T==[2]f64, base_addr:*(f64), vindex:([4]i32), mask:S, scale if intvec{128,S} and num{scale}} = emit{T, '_mm_mask_i32gather_pd', src, base_addr, vindex, mask, scale} 368 | def __mask_i32gather{src:T==[4]f64, base_addr:*(f64), vindex:([4]i32), mask:S, scale if intvec{256,S} and num{scale}} = emit{T, '_mm256_mask_i32gather_pd', src, base_addr, vindex, mask, scale} 369 | def __mask_i32gather{src:T==[4]f32, base_addr:*(f32), vindex:([4]i32), mask:S, scale if intvec{128,S} and num{scale}} = emit{T, '_mm_mask_i32gather_ps', src, base_addr, vindex, mask, scale} 370 | def __mask_i32gather{src:T==[8]f32, base_addr:*(f32), vindex:([8]i32), mask:S, scale if intvec{256,S} and num{scale}} = emit{T, '_mm256_mask_i32gather_ps', src, base_addr, vindex, mask, scale} 371 | def __mask_i32gather{src:T==[4]i32, base_addr:*(i32), vindex:T, mask:S, scale if intvec{128,S} and num{scale}} = emit{T, '_mm_mask_i32gather_epi32', src, base_addr, vindex, mask, scale} 372 | def __mask_i32gather{src:T==[8]i32, base_addr:*(i32), vindex:T, mask:S, scale if intvec{256,S} and num{scale}} = emit{T, '_mm256_mask_i32gather_epi32', src, base_addr, vindex, mask, scale} 373 | def __mask_i32gather{src:T==[2]i64, base_addr:*(i64), vindex:([4]i32), mask:S, scale if intvec{128,S} and num{scale}} = emit{T, '_mm_mask_i32gather_epi64', src, base_addr, vindex, mask, scale} 374 | def __mask_i32gather{src:T==[4]i64, base_addr:*(i64), vindex:([4]i32), mask:S, scale if intvec{256,S} and num{scale}} = emit{T, '_mm256_mask_i32gather_epi64', src, base_addr, vindex, mask, scale} 375 | def __mask_i64gather{src:T==[2]f64, base_addr:*(f64), vindex:([2]i64), mask:S, scale if intvec{128,S} and num{scale}} = emit{T, '_mm_mask_i64gather_pd', src, base_addr, vindex, mask, scale} 376 | def __mask_i64gather{src:T==[4]f64, base_addr:*(f64), vindex:([4]i64), mask:S, scale if intvec{256,S} and num{scale}} = emit{T, '_mm256_mask_i64gather_pd', src, base_addr, vindex, mask, scale} 377 | def __mask_i64gather{src:T==[4]f32, base_addr:*(f32), vindex:([2]i64), mask:S, scale if intvec{128,S} and num{scale}} = emit{T, '_mm_mask_i64gather_ps', src, base_addr, vindex, mask, scale} 378 | def __mask_i64gather{src:T==[4]f32, base_addr:*(f32), vindex:([4]i64), mask:S, scale if intvec{128,S} and num{scale}} = emit{T, '_mm256_mask_i64gather_ps', src, base_addr, vindex, mask, 
scale} 379 | def __mask_i64gather{src:T==[4]i32, base_addr:*(i32), vindex:([2]i64), mask:S, scale if intvec{128,S} and num{scale}} = emit{T, '_mm_mask_i64gather_epi32', src, base_addr, vindex, mask, scale} 380 | def __mask_i64gather{src:T==[4]i32, base_addr:*(i32), vindex:([4]i64), mask:S, scale if intvec{128,S} and num{scale}} = emit{T, '_mm256_mask_i64gather_epi32', src, base_addr, vindex, mask, scale} 381 | def __mask_i64gather{src:T==[2]i64, base_addr:*(i64), vindex:T, mask:S, scale if intvec{128,S} and num{scale}} = emit{T, '_mm_mask_i64gather_epi64', src, base_addr, vindex, mask, scale} 382 | def __mask_i64gather{src:T==[4]i64, base_addr:*(i64), vindex:T, mask:S, scale if intvec{256,S} and num{scale}} = emit{T, '_mm256_mask_i64gather_epi64', src, base_addr, vindex, mask, scale} 383 | def __maskload{mem_addr:*(i32), mask:T if intvec{128,T}} = emit{[4]i32, '_mm_maskload_epi32', mem_addr, mask} 384 | def __maskload{mem_addr:*(i32), mask:T if intvec{256,T}} = emit{[8]i32, '_mm256_maskload_epi32', mem_addr, mask} 385 | def __maskload{mem_addr:*(i64), mask:T if intvec{128,T}} = emit{[2]i64, '_mm_maskload_epi64', mem_addr, mask} 386 | def __maskload{mem_addr:*(i64), mask:T if intvec{256,T}} = emit{[4]i64, '_mm256_maskload_epi64', mem_addr, mask} 387 | def __movemask{a:([32]i8)} = emit{i32, '_mm256_movemask_epi8', a} 388 | def __mpsadbw{a:T==[32]u8, b:T, imm8 if num{imm8}} = emit{T, '_mm256_mpsadbw_epu8', a, b, imm8} 389 | def __packs{a:T==[16]i16, b:T} = emit{[32]i8, '_mm256_packs_epi16', a, b} 390 | def __packs{a:T==[8]i32, b:T} = emit{[16]i16, '_mm256_packs_epi32', a, b} 391 | def __packus{a:T==[16]i16, b:T} = emit{[32]i8, '_mm256_packus_epi16', a, b} 392 | def __packus{a:T==[8]i32, b:T} = emit{[16]i16, '_mm256_packus_epi32', a, b} 393 | def __maskstore{mem_addr:*(i32), mask:T, a:([4]i32) if intvec{128,T}} = emit{void, '_mm_maskstore_epi32', mem_addr, mask, a} 394 | def __maskstore{mem_addr:*(i32), mask:T, a:([8]i32) if intvec{256,T}} = emit{void, '_mm256_maskstore_epi32', mem_addr, mask, a} 395 | def __maskstore{mem_addr:*(i64), mask:T, a:([2]i64) if intvec{128,T}} = emit{void, '_mm_maskstore_epi64', mem_addr, mask, a} 396 | def __maskstore{mem_addr:*(i64), mask:T, a:([4]i64) if intvec{256,T}} = emit{void, '_mm256_maskstore_epi64', mem_addr, mask, a} 397 | def __broadcastb_16i8{a:T==[16]i8} = emit{T, '_mm_broadcastb_epi8', a} 398 | def __broadcastb_32i8{a:([16]i8)} = emit{[32]i8, '_mm256_broadcastb_epi8', a} 399 | def __broadcastd_4i32{a:T==[4]i32} = emit{T, '_mm_broadcastd_epi32', a} 400 | def __broadcastd_8i32{a:([4]i32)} = emit{[8]i32, '_mm256_broadcastd_epi32', a} 401 | def __broadcastq_2i64{a:T==[2]i64} = emit{T, '_mm_broadcastq_epi64', a} 402 | def __broadcastq_4i64{a:([2]i64)} = emit{[4]i64, '_mm256_broadcastq_epi64', a} 403 | def __broadcastsd_2f64{a:T==[2]f64} = emit{T, '_mm_broadcastsd_pd', a} 404 | def __broadcastsd_4f64{a:([2]f64)} = emit{[4]f64, '_mm256_broadcastsd_pd', a} 405 | def __broadcastss_4f32{a:T==[4]f32} = emit{T, '_mm_broadcastss_ps', a} 406 | def __broadcastss_8f32{a:([4]f32)} = emit{[8]f32, '_mm256_broadcastss_ps', a} 407 | def __broadcastw_8i16{a:T==[8]i16} = emit{T, '_mm_broadcastw_epi16', a} 408 | def __broadcastw_16i16{a:([8]i16)} = emit{[16]i16, '_mm256_broadcastw_epi16', a} 409 | def __inserti128{a:T, b:S, imm8 if intvec{256,T} and intvec{128,S} and num{imm8}} = emit{T, '_mm256_inserti128_si256', a, b, imm8} 410 | 411 | 412 | #FMA 413 | def __fmadd{a:T==[2]f64, b:T, c:T} = emit{T, '_mm_fmadd_pd', a, b, c} 414 | def __fmadd{a:T==[4]f64, b:T, c:T} = emit{T, 
'_mm256_fmadd_pd', a, b, c} 415 | def __fmadd{a:T==[4]f32, b:T, c:T} = emit{T, '_mm_fmadd_ps', a, b, c} 416 | def __fmadd{a:T==[8]f32, b:T, c:T} = emit{T, '_mm256_fmadd_ps', a, b, c} 417 | def __fmaddsub{a:T==[2]f64, b:T, c:T} = emit{T, '_mm_fmaddsub_pd', a, b, c} 418 | def __fmaddsub{a:T==[4]f64, b:T, c:T} = emit{T, '_mm256_fmaddsub_pd', a, b, c} 419 | def __fmaddsub{a:T==[4]f32, b:T, c:T} = emit{T, '_mm_fmaddsub_ps', a, b, c} 420 | def __fmaddsub{a:T==[8]f32, b:T, c:T} = emit{T, '_mm256_fmaddsub_ps', a, b, c} 421 | def __fmsub{a:T==[2]f64, b:T, c:T} = emit{T, '_mm_fmsub_pd', a, b, c} 422 | def __fmsub{a:T==[4]f64, b:T, c:T} = emit{T, '_mm256_fmsub_pd', a, b, c} 423 | def __fmsub{a:T==[4]f32, b:T, c:T} = emit{T, '_mm_fmsub_ps', a, b, c} 424 | def __fmsub{a:T==[8]f32, b:T, c:T} = emit{T, '_mm256_fmsub_ps', a, b, c} 425 | def __fmsubadd{a:T==[2]f64, b:T, c:T} = emit{T, '_mm_fmsubadd_pd', a, b, c} 426 | def __fmsubadd{a:T==[4]f64, b:T, c:T} = emit{T, '_mm256_fmsubadd_pd', a, b, c} 427 | def __fmsubadd{a:T==[4]f32, b:T, c:T} = emit{T, '_mm_fmsubadd_ps', a, b, c} 428 | def __fmsubadd{a:T==[8]f32, b:T, c:T} = emit{T, '_mm256_fmsubadd_ps', a, b, c} 429 | def __fnmadd{a:T==[2]f64, b:T, c:T} = emit{T, '_mm_fnmadd_pd', a, b, c} 430 | def __fnmadd{a:T==[4]f64, b:T, c:T} = emit{T, '_mm256_fnmadd_pd', a, b, c} 431 | def __fnmadd{a:T==[4]f32, b:T, c:T} = emit{T, '_mm_fnmadd_ps', a, b, c} 432 | def __fnmadd{a:T==[8]f32, b:T, c:T} = emit{T, '_mm256_fnmadd_ps', a, b, c} 433 | def __fnmsub{a:T==[2]f64, b:T, c:T} = emit{T, '_mm_fnmsub_pd', a, b, c} 434 | def __fnmsub{a:T==[4]f64, b:T, c:T} = emit{T, '_mm256_fnmsub_pd', a, b, c} 435 | def __fnmsub{a:T==[4]f32, b:T, c:T} = emit{T, '_mm_fnmsub_ps', a, b, c} 436 | def __fnmsub{a:T==[8]f32, b:T, c:T} = emit{T, '_mm256_fnmsub_ps', a, b, c} 437 | -------------------------------------------------------------------------------- /include/arch/iintrinsic/select.singeli: -------------------------------------------------------------------------------- 1 | local { 2 | include 'skin/cop' 3 | include 'util/kind' 4 | oper ~~ reinterpret infix right 55 5 | def base{b,{}} = 0; def base{b,{h,...t}} = h + b*base{b,t} 6 | def all{t} = is{t, 0 <= t} 7 | def copy{n, v} = each{{_}=>v, range{n}} 8 | 9 | def fmtwidth{V} = fmtnat{width{V}} 10 | def fmt_p{T} = { 11 | if (isfloat{T}) { 12 | if (width{T}==32) 'ps' else 'pd' 13 | } else { 14 | merge{'epi', fmtwidth{T}} # No unsigned instructions here 15 | } 16 | } 17 | def intrin_g{get_typ}{name, V} = { 18 | def w = width{V} 19 | def fw = if (w<=128) '' else fmtnat{w} 20 | merge{'_mm', fw, '_', name, '_', get_typ{V}} 21 | } 22 | def intrin = intrin_g{{[_]T} => fmt_p{T}} 23 | def intrin_b = intrin_g{{V=[_]T} => { 24 | if (isint{T}) merge{'si',fmtwidth{V}} else fmt_p{T} 25 | }} 26 | 27 | # Only need to cast if element type class changes 28 | def class{T} = if (isfloat{T}) T else i8 # i8 represents any int 29 | def call_cast_sub{do_uncast, gen, T, V=[_]E, ...vs} = { 30 | def uncast{v} = if (do_uncast) V~~v else v 31 | if (class{E} == class{T}) gen{...vs} 32 | else uncast{gen{...each{~~{[width{V}/width{T}]T, .}, vs}}} 33 | } 34 | def call_cast = call_cast_sub{1, ...} 35 | } 36 | 37 | # 16-bit shuffles on half-words don't quite fit vec_shuffle 38 | local def vec_shuffle16_impl{suff, vec:V, ind} = { 39 | emit{V, intrin{merge{'shuffle', suff}, [width{V}/16]i16}, vec, base{4, ind}} 40 | } 41 | local def has_sh16{w} = (w==128 and hasarch{'SSE2'}) or (w==256 and hasarch{'AVX2'}) 42 | local def is_bool = match { {0}=>1; {1}=>1; {_}=>0 } 43 | def 
vec_shuffle16_half{half, vec:V=[_]_, {...ind} if is_bool{half} and has_sh16{width{V}} and length{ind}==4 and all{(ind>=0) & (ind<4)}} = { 44 | def part = select{tup{'lo','hi'}, half} 45 | call_cast{vec_shuffle16_impl{part, ., ind}, i16, V, vec} 46 | } 47 | def vec_shuffle16_lo{...} = vec_shuffle16_half{0, ...} 48 | def vec_shuffle16_hi{...} = vec_shuffle16_half{1, ...} 49 | 50 | # General implementation for constant indices 51 | local def vec_shuffle_impl{T, sel_n, val={_:V, ..._}, ind} = { 52 | def pos = all{ind >= 0} 53 | def w = width{T} 54 | def wv= width{V} 55 | def sel_width = sel_n * w 56 | def ind_width = length{ind} * w 57 | def 0 = wv % ind_width 58 | # Index expansion and instruction calls 59 | def get_ind{sel_n, n, ind} = { 60 | def exp{i} = if (n <= length{i}) i else exp{merge{i, i}} 61 | def off{i} = i + (range{length{i}} & -sel_n) * (i >= 0) 62 | off{exp{ind}} 63 | } 64 | def shuf_sub{I, name, ind, val} = { 65 | emit{V, intrin{name, [wv/width{I}]I}, ...val, ind} 66 | } 67 | def shuf{name, n, fmt} = shuf_sub{T, name, fmt{get_ind{sel_n, n, ind}}, val} 68 | def shuf{name, E, n, fmt} = { 69 | def e = w / width{E} 70 | def wi = if (e == 1) ind else merge{...each{+{.,range{e}}, e*ind}} 71 | shuf_sub{E, name, fmt{get_ind{e*sel_n, n, wi}}, val} 72 | } 73 | def shuf_base{...a, n } = shuf{...a, n, base{n, .}} 74 | def shuf_make{...a, n, T} = shuf{...a, n, vec_make{[n]T, .}} 75 | # shuffle_ps / shuffle_pd 76 | def fshuf{b, v={_,_}} = { 77 | def i = get_ind{sel_n, __min{wv / w, 4}, ind} 78 | shuf_sub{T, 'shuffle', base{b, i % b}, v} 79 | } 80 | # Arch-specific cases 81 | if (sel_n == 1 and 1 == length{val}) { 82 | select{val, 0} 83 | } else if (2 == length{val}) { 84 | def 1 = isfloat{T} and wv >= 128 and ind_width <= 128 and pos 85 | fshuf{128 / w, val} 86 | } else if (wv < 128) { 87 | def 1 = wv == 64 and isint{T} and w >= 16 and pos 88 | shuf_base{'shuffle', 4} 89 | } else if (sel_width == 256) { 90 | def 1 = hasarch{'AVX2'} and pos 91 | match (w) { 92 | {32} => shuf_make{'permutevar8x32', 8, u32} 93 | {64} => shuf_base{'permute4x64', 4} 94 | } 95 | } else if (isfloat{T}) { 96 | def 1 = pos # Zeroing unsupported for float selection 97 | if (ind_width == 256) { 98 | def 1 = hasarch{'AVX'} 99 | if (sel_width == 128 and w == 64) { 100 | shuf{'permute', 2, {t} => base{2, t%2}} 101 | } else { 102 | def n = 256 / w 103 | def U = [n]primtype{'u', w} 104 | shuf{'permutevar', n, {i} => vec_make{U, (w/32)*i}} 105 | } 106 | } else { 107 | def n = 128 / w 108 | if (hasarch{'AVX'}) shuf{'permute', n, {t} => base{n, if (n==2 and wv>128) merge{t,t} else t}} 109 | else fshuf{n, merge{val,val}} 110 | } 111 | } else { # isint{T} 112 | def 1 = hasarch{'SSE2'} 113 | match () { 114 | {if w >= 32 and pos} => shuf_base{'shuffle', i32, 4} 115 | {if hasarch{'SSSE3'}} => shuf_make{'shuffle', i8, wv/8, i8} 116 | {if w >= 16 and sel_width <= 64 and pos} => { 117 | def io = get_ind{w/16 * sel_n, 4, ind} 118 | def i = io - (range{length{io}} & -4) # Avoid passing offset indices to shufflehi 119 | def fi{v, s, ...bnd} = vec_shuffle16_impl{s, v, slice{i, ...bnd}} 120 | fi{fi{...val, 'lo',0,4}, 'hi',-4} 121 | } 122 | } 123 | } 124 | } 125 | 126 | # Process parameters 127 | local def vec_shuffle_proc{...spec, V=[_]vT, vals, {...inds}} = { 128 | # inds doesn't have the element type so it comes from spec or val 129 | def T = match (...spec) { 130 | {[_]T} => T; {T if ktyp{T}} => T 131 | {n if knum{n}} => vT; {} => vT 132 | } 133 | # sel_n is the number of indices in a selection unit 134 | # It can be specified by spec 
as a number or length of vector type 135 | # and/or by the length of each list in nested inds 136 | def spec_n = match (...spec) { 137 | {[k]_} => tup{k}; {k if knum{k}} => tup{k}; {..._} => tup{} 138 | } 139 | def {ind_n, ind} = match (inds) { 140 | {{{...t}, ..._}} => { 141 | def l = length{t} 142 | def 1 = all{l == each{length, inds}} 143 | tup{tup{l}, merge{...inds}} 144 | } 145 | {_} => { 146 | each{{s} => { def 0 = length{inds} % s }, spec_n} 147 | tup{tup{}, inds} 148 | } 149 | } 150 | def sel_n = match(...spec_n, ...ind_n) { # Shuffle unit from spec and nested indices must match 151 | {n,n} => n; {n} => n; {} => length{ind} 152 | } 153 | def 1 = all{ind < sel_n} 154 | def shuf{...v} = vec_shuffle_impl{T, sel_n, v, ind} 155 | call_cast{shuf, T, V, ...vals} 156 | } 157 | 158 | # For convenience, allow indices to be written directly without tup 159 | def vec_select {val, ...inds if 1 < length{inds}} = vec_select {val, inds} 160 | def vec_shuffle{val, ...inds if 1 < length{inds}} = vec_shuffle{val, inds} 161 | def vec_select {spec if knum{spec} or ktyp{spec}, val, ...inds if 1 < length{inds}} = vec_select {spec, val, inds} 162 | def vec_shuffle{spec if knum{spec} or ktyp{spec}, val, ...inds if 1 < length{inds}} = vec_shuffle{spec, val, inds} 163 | # Main definitions 164 | def vec_shuffle{...spec, val: V=[_]_ , {...inds}} = vec_shuffle_proc{...spec, V, tup{val}, inds} 165 | def vec_shuffle{...spec, vals={_:V=[_]_, _:V}, {...inds}} = vec_shuffle_proc{...spec, V, vals, inds} 166 | 167 | # Variable indices: not many cases to support 168 | def vec_shuffle{...spec, val:V=[vk]vT, ind:[ik]I if isint{I}} = { 169 | # Selection type and size can be set by spec, or come from ind 170 | # But element quality comes from val if possible 171 | def wi = width{I} 172 | def ty{} = if (wi < 32) I else primtype{quality{vT}, wi} 173 | def {sel_n, T} = match (...spec) { 174 | {[n]T} => tup{ n,T} 175 | {T if ktyp{T}} => tup{ik,T} 176 | {n if knum{n}} => tup{ n,ty{}} 177 | {} => tup{ik,ty{}} 178 | } 179 | def 1 = wi == width{T} 180 | def 0 = ik % sel_n 181 | def name = match (sel_n, width{T}) { 182 | {16, 8 if hasarch{'SSSE3'}} => 'shuffle' 183 | {4, 32 if hasarch{'AVX'} and isfloat{T}} => 'permutevar' 184 | # no 2, 64: permutevar_pd scales indices! 
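# (vec_shuffle_64_scaled below exposes that instruction with pre-scaled indices)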
185 | {8, 32 if hasarch{'AVX2'}} => 'permutevar8x32' 186 | } 187 | def S = [width{V}/width{T}]T 188 | def shuf = emit{., intrin{name, S}, ., ind} 189 | if (class{T} == class{vT}) shuf{V, val} 190 | else V ~~ shuf{S, S ~~ val} 191 | } 192 | 193 | # Next-to-last index bit is used, so index has to be multiplied by 2 194 | def vec_shuffle_64_scaled{val:V=[k](f64), ind:[k]I if isint{I} and width{I}==64 and hasarch{'AVX'}} = { 195 | emit{V, intrin{'permutevar_pd', V}, val, ind} 196 | } 197 | def vec_shuffle_64_scaled{val:V=[_]T, ind if T != f64} = { 198 | V ~~ vec_shuffle_64_scaled{[width{V}/64]f64~~val, ind} 199 | } 200 | 201 | def vec_select{...spec, v:V=[_]vT, ind} = { 202 | def T = match (...spec) { 203 | {} => vT; {T if ktyp{T}} => T 204 | {w if knum{w} and w<=64} => primtype{if (w<32 and isfloat{vT}) 'u' else quality{vT}, w} 205 | } 206 | def w = width{T} 207 | def k = width{V} / w 208 | def 1 = match (ind) { {{...t}} => length{t}==k; {i:[(k)]I} => isint{I} } 209 | vec_shuffle{T, v, ind} 210 | } 211 | def vec_select{n, v:V=[_]vT, {...ind} if knum{n} and n>64} = { 212 | def e = n / 64 213 | vec_select{64, v, merge{...each{+{.,range{e}}, e*ind}}} 214 | } 215 | def vec_select{(width{V}), x:V=[_]_, i if is{i,0} or is{i,tup{0}}} = x 216 | # Selects as 4 unified lanes 217 | def vec_select{128, v={a:V=[_]T, b:V}, ind={_,_} if 256==width{V} and hasarch{'AVX'} and all{(ind>=0) & (ind<4)}} = { 218 | def q = if (isint{T} and hasarch{'AVX2'}) 'x' else 'f' 219 | def name = merge{'permute2', q, '128'} 220 | emit{V, intrin_b{name, V}, a, b, base{16, ind}} 221 | } 222 | 223 | # Reverse-units for compatibility with NEON 224 | def reverse_units{n, x:[l]_ if knum{n} and n>1 and l%n == 0} = { 225 | vec_shuffle{x, n-1 - range{n}} 226 | } 227 | 228 | 229 | # Zip 230 | local def has_zip{V=[_]T} = { 231 | hasarch{match (width{V}) { 232 | {128} => if (T==f32) 'SSE' else 'SSE2' 233 | {256} => if (isfloat{T}) 'AVX' else 'AVX2' 234 | {512} => if (width{T}>=32) 'AVX512F' else 'AVX512BW' 235 | }} 236 | } 237 | def zip128{a:V=[_]_, b:V, half if has_zip{V}} = { 238 | def name = merge{'unpack', match (half) { {0}=>'lo'; {1}=>'hi' }} 239 | emit{V, intrin{name, V}, a, b} 240 | } 241 | def zip{a:V=[_]_, b:V, half if width{V}==128} = zip128{a, b, half} 242 | 243 | 244 | # Shift/align as a list of elements 245 | # Directions left and right are opposite to instruction l and r! 
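# For example, vec_shift_left{x, 1} moves element i+1 into slot i (zero-filling the top), which x86 spells as a byte-wise right shift (bsrli)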
246 | local { 247 | def has_sh{w, sse} = hasarch{match (w) { {128}=>sse; {256}=>'AVX2'; {512}=>'AVX512BW' }} 248 | def has_sh{V=[_]T, sse, n} = isint{T} and has_sh{width{V}, sse} and knum{n} 249 | # Shift left or right based on sign 250 | def vec_shift{w128, dir}{x:V=[_]T, n if has_sh{V,'SSE2',n} and (w128 or width{V}==128)} = { 251 | def wv = width{V}; def S = [width{V}/8]u8 252 | def name = merge{'bs', if (dir*n < 0) 'l' else 'r', 'li'} 253 | def suff = if (wv == 128) 'si128' else 'epi128' 254 | def nb = __min{16, __abs{n}*(width{T}/8)} # Shift in bytes 255 | def sh = emit{V, intrin_g{{_} => suff}{name, S}, ..., nb} 256 | call_cast{sh, T, S, x} 257 | } 258 | # Merge-shift, with mod_n to pick direction 259 | def vec_merge_shift{w128, mod_n}{a:V=[k]T, b:V, n if has_sh{V,'SSSE3',n} and (w128 or width{V}==128)} = { 260 | def S = [width{V}/8]u8 261 | def sh = emit{V, intrin{'alignr', S}, ..., mod_n{n*(width{T}/8)}} 262 | call_cast{sh, T, S, b, a} 263 | } 264 | } 265 | def vec_shift_left_128 {...} = vec_shift{1, 1} 266 | def vec_shift_left {...} = vec_shift{0, 1} 267 | def vec_shift_right_128{...} = vec_shift{1, -1} 268 | def vec_shift_right {...} = vec_shift{0, -1} 269 | 270 | def vec_merge_shift_left_128 {...} = vec_merge_shift{1, {n} => n} 271 | def vec_merge_shift_left {...} = vec_merge_shift{0, {n} => n} 272 | def vec_merge_shift_right_128{...} = vec_merge_shift{1, {n} => 16 - n} 273 | def vec_merge_shift_right {...} = vec_merge_shift{0, {n} => 16 - n} 274 | 275 | 276 | # Blend 277 | # x86 vector blends use the top bit of the mask 278 | # But blend_hom is more general so blend_top is defined as a restriction 279 | local { 280 | def has_blend{V=[_]T} = match (width{V}) { 281 | {128} => hasarch{'SSE4.1'} 282 | {256} => hasarch{if (isfloat{T}) 'AVX' else 'AVX2'} 283 | {_} => 0 284 | } 285 | def has_blendv{V, M} = width{V}==width{M} and has_blend{V} 286 | def blend_instr{hom}{f:V=[_]T, t, m:M if has_blendv{V, M} and (hom or isfloat{T} or width{T}==8)} = { 287 | def name = intrin{'blendv', if (isfloat{T}) V else [width{V}/8]i8} 288 | call_cast_sub{0, emit{V, name, f, t, ...}, T, M, m} 289 | } 290 | } 291 | def blend_hom{...} = blend_instr{1} 292 | def blend_top{...} = blend_instr{0} 293 | 294 | # Blend with immediate 295 | local def all_bool{m} = all{each{knum, m}} and all{(m==0) | (m==1)} 296 | def blend_units{f, t, ...m if all_bool{m}} = blend_units{f, t, m} 297 | def blend_units{f:V=[k]T, t:V, {...m} if has_blend{V} and width{T}>=16 and 0 == __min{8,k} % length{m} and all_bool{m}} = { 298 | def E = { 299 | if (isfloat{T}) T 300 | else if (width{T}>=32 and hasarch{'AVX2'}) i32 301 | else i16 302 | } 303 | def l = __min{8,k} 304 | def exp{i} = if (l <= length{i}) i else exp{merge{i, i}} 305 | def c = width{T} / width{E} # Copy each bit of m c times 306 | def b = 1 << c 307 | emit{V, intrin{'blend', [k*c]E}, f, t, base{b, exp{m}*(b-1)}} 308 | } 309 | def blend{f, t, ...m if all_bool{m}} = blend{f, t, m} 310 | def blend{f:V=[k]T, t:V, {...m} if has_blend{V} and k<=8 and length{m}==k} = { 311 | blend_units{f, t, m} 312 | } 313 | 314 | 315 | # Broadcast 316 | local def int_below{i, end} = knum{i} and __floor{i} == i and i < end 317 | local def has_shuf8{k} = hasarch{ 318 | match (k) { {16}=>'SSSE3'; {32}=>'AVX2'; {64}=>'AVX512BW' } 319 | } 320 | def broadcast_sel{x:V=[k]E, i if width{E}==8 and int_below{i, k} and has_shuf8{k}} = { 321 | if (k==32 and i<16) { 322 | def a = { 323 | if (i<8) V ~~ ([4]u64~~x >> (i*8)) 324 | else if (i==8) vec_shuffle{[2]u64, x, 1,0} 325 | else vec_shift_left_128{x, 
i} 326 | } 327 | broadcast_sel{a, 0} # Handled by special case for 0 below 328 | } else { 329 | def a = vec_shuffle{x, copy{16, i&15}} 330 | vec_select{128, a, copy{k>>4, i>>4}} 331 | } 332 | } 333 | def broadcast_sel{x:[k]E, i if width{E}==16 and k<=16 and int_below{i, k}} = { 334 | def a = vec_shuffle16_half{(i&4)!=0, x, copy{4, i&3}} 335 | vec_shuffle{u64, a, copy{k>>2, i>>2}} 336 | } 337 | 338 | local def has_full_shuf = match { # element width 339 | {8}=>hasarch{'AVX512VBMI'}; {16}=>hasarch{'AVX512BW'}; {_}=>1 340 | } 341 | def broadcast_sel{x:[k]E, i if int_below{i, k} and has_full_shuf{width{E}}} = { 342 | vec_shuffle{x, copy{k, i}} 343 | } 344 | 345 | def broadcast_sel{x:V=[k]E, 0 if hasarch{'AVX2'} and (width{V}<512 or width{E}>=32 or hasarch{'AVX512BW'})} = { 346 | def w = width{E} 347 | def char = { 348 | if (isfloat{E}) match (w) { {32}=>'ss'; {64}=>'sd' } 349 | else match (w) { {8}=>'b'; {16}=>'w'; {32}=>'d'; {64}=>'q' } 350 | } 351 | def lane0 = if (width{V} == 128) x else { 352 | def t = match (E) { {(f32)}=>'ps'; {(f64)}=>'pd'; {_}=>'si' } 353 | def f = fmtwidth{V} 354 | emit{[128/w]E, merge{'_mm',f,'_cast',t,f,'_',t,'128'}, x} 355 | } 356 | emit{V, intrin{merge{'broadcast', char}, V}, lane0} 357 | } 358 | -------------------------------------------------------------------------------- /include/arch/neon_intrin/basic.singeli: -------------------------------------------------------------------------------- 1 | local { 2 | include 'skin/c' 3 | oper ~~ reinterpret infix right 55 4 | def num{x} = 'number'==kind{x} 5 | 6 | def ew{V} = width{eltype{V}} 7 | def va{V} = 'vector'==typekind{V} and __or{...tup{64,128}==width{V}} 8 | def vi{V} = va{V} and isint{eltype{V}} 9 | def vs{V} = va{V} and __or{...tup{'i','f'}==quality{eltype{V}}} 10 | def vu{V} = va{V} and 'u'==quality{eltype{V}} 11 | def vf{V} = va{V} and isfloat{eltype{V}} 12 | 13 | def change_qual{[k]T,q} = [k]primtype{q, width{T}} 14 | def uns = change_qual{.,'u'} 15 | def sgn = change_qual{.,'i'} 16 | 17 | def intrin{name, ...s, V=[_]T} = { 18 | def q = quality{T} 19 | def w = if (128==width{V}) 'q' else '' 20 | merge{name, w, ...s, '_', if (q=='i') 's' else q, fmtnat{width{T}}} 21 | } 22 | } 23 | 24 | # Building vectors from scalars 25 | local { 26 | def can_elt = match { {[_]T, x:T} => 1; {_,x} => num{x} } 27 | def can_make_sub = can_elt 28 | def can_make_sub{V=[k]_, {...x}} = { 29 | def all{t} = is{t, 0 <= t} 30 | k==length{x} and all{each{can_elt{V,.}, x}} 31 | } 32 | def can_make{V,x} = va{V} and can_make_sub{V,x} 33 | def mv_sub{V=[_]T, x} = { 34 | tmp:*T = each{cast{T,.}, x} 35 | load{*V~~tmp, 0} 36 | } 37 | } 38 | def vec_make{V, ...x if can_make{V,x}} = mv_sub{V, x} 39 | def vec_make{V, {...x} if can_make{V,x}} = mv_sub{V, x} 40 | def vec_broadcast{V=[_]T, x if can_make{V,x}} = { 41 | emit{V, intrin{'vdup', '_n', V}, cast{T,x}} 42 | } 43 | 44 | def load {a:*V=[_]E, n if va{V}} = emit{V , intrin{'vld1', V}, *E ~~ (a+n)} 45 | def store{a:*V=[_]E, n, v:V if va{V}} = emit{void, intrin{'vst1', V}, *E ~~ (a+n), v} 46 | 47 | def extract{x:V,n if va{V} and num{n}} = emit{eltype{V}, intrin{'vget', '_lane', V}, x, n} 48 | def insert{a:V, x, i if va{V} and num{i} and can_elt{V,x}} = { 49 | emit{V, intrin{'vset', '_lane', V}, cast{eltype{V}, x}, a, i} 50 | } 51 | 52 | local def n8{[_]E} = isfloat{E} or width{E}<64 53 | def __or{a:V,b:V if vf{V}} = {def U = uns{V}; V~~ __or{U~~a, U~~b} } 54 | def __and{a:V,b:V if vf{V}} = {def U = uns{V}; V~~__and{U~~a, U~~b} } 55 | def __xor{a:V,b:V if vf{V}} = {def U = uns{V}; 
V~~__xor{U~~a, U~~b} } 56 | def __add{a:V,b:V if va{V}} = emit{V, intrin{'vadd', V}, a, b} 57 | def __sub{a:V,b:V if va{V}} = emit{V, intrin{'vsub', V}, a, b} 58 | def __mul{a:V,b:V if va{V} and n8{V}} = emit{V, intrin{'vmul', V}, a, b} 59 | def __div{a:V,b:V if vf{V}} = emit{V, intrin{'vdiv', V}, a, b} 60 | def __and{a:V,b:V if vi{V}} = emit{V, intrin{'vand', V}, a, b} 61 | def __or{a:V,b:V if vi{V}} = emit{V, intrin{'vorr', V}, a, b} 62 | def __xor{a:V,b:V if vi{V}} = emit{V, intrin{'veor', V}, a, b} 63 | def andnot{a:V,b:V if vi{V}} = emit{V, intrin{'vbic', V}, a, b} 64 | def ornot{a:V,b:V if vi{V}} = emit{V, intrin{'vorn', V}, a, b} 65 | def andnz{a:V,b:V if vi{V}} = emit{V, intrin{'vtst', V}, a, b} 66 | def __min{a:V,b:V if va{V} and n8{V}} = emit{V, intrin{'vmin', V}, a, b} 67 | def __max{a:V,b:V if va{V} and n8{V}} = emit{V, intrin{'vmax', V}, a, b} 68 | def __shl{a:V,b:S if vi{V} and S==uns{V}} = emit{V, intrin{'vshl', V}, a, sgn{S}~~b} 69 | def __adds{a:V,b:V if vi{V}} = emit{V, intrin{'vqadd', V}, a, b} 70 | def __subs{a:V,b:V if vi{V}} = emit{V, intrin{'vqsub', V}, a, b} 71 | 72 | def addp{a:V,b:V if va{V}} = emit{V, intrin{'vpadd', V}, a, b} 73 | def addpw{a:V=[k]E if vi{V} and width{E}<=32} = emit{[k/2](primtype{quality{E}, width{E}*2}), intrin{'vpaddl', V}, a} 74 | 75 | def __shl{a:V, s if vi{V} and num{s} and s>0 and s<ew{V}} = emit{V, intrin{'vshl', '_n', V}, a, s} 76 | def __shr{a:V, s if vi{V} and num{s} and s>0 and s<ew{V}} = emit{V, intrin{'vshr', '_n', V}, a, s} -------------------------------------------------------------------------------- /include/arch/neon_intrin/select.singeli: -------------------------------------------------------------------------------- 7 | def emit_intrin{V=[_]T, name, ...args} = { 8 | def intrin = merge{ 9 | name, 10 | match (width{V}) { {128}=>'q'; {64}=>'' }, 11 | '_', 12 | match (quality{T}) { {'i'}=>'s'; {q}=>q }, 13 | fmtnat{width{T}} 14 | } 15 | emit{V, intrin, ...args} 16 | } 17 | } 18 | 19 | local def vqtbl{...TS, vals={v0:V, ..._}, ind} = { 20 | def S = match (...TS) { 21 | {T if width{T}==8} => [16]T 22 | {} => if (V == [16]i8) V else [16]u8 23 | } 24 | def l = length{vals} 25 | def name = merge{if (l>1) 'unpacked_' else '', 'vqtbl', fmtnat{l}} 26 | def shuf = emit_intrin{S, name, ..., [16]u8~~ind} 27 | if (V == S) shuf{...vals} else V~~shuf{...each{~~{S,.}, vals}} 28 | } 29 | 30 | # Start with constant-index cases 31 | local def widen_norm_ind{ind, e} = { 32 | def wi = if (e == 1) ind else merge{...each{+{.,range{e}}, e*ind}} 33 | __max{wi,-1} % 256 34 | } 35 | 36 | # For convenience, allow indices to be written directly without tup 37 | def vec_select {val, ...inds if 1 < length{inds}} = vec_select {val, inds} 38 | def vec_shuffle{val, ...inds if 1 < length{inds}} = vec_shuffle{val, inds} 39 | def vec_select {spec if knum{spec} or ktyp{spec}, val, ...inds if 1 < length{inds}} = vec_select {spec, val, inds} 40 | def vec_shuffle{spec if knum{spec} or ktyp{spec}, val, ...inds if 1 < length{inds}} = vec_shuffle{spec, val, inds} 41 | 42 | def vec_select{...spec, val:V=[_]vT, {...ind} if all{each{knum,ind}}} = { 43 | def w = match (...spec) { 44 | {n if knum{n}} => n; {T if ktyp{T}} => width{T}; {} => width{vT} 45 | } 46 | def 1 = w >= 8 47 | def {n,n} = tup{width{V}/w, length{ind}} 48 | def 1 = all{ind < n} 49 | vqtbl{tup{val}, vec_make{[16]u8, widen_norm_ind{ind, w / 8}}} 50 | } 51 | 52 | def vec_shuffle{...spec, val:V=[_]vT, {...inds}} = { 53 | # inds doesn't have the element type so it comes from spec or val 54 | def T = match (...spec) { 55 | {[_]T} => T; {T if ktyp{T}} => T 56 | {n if knum{n}} => vT; {} => vT 57 | } 58 | # sel_n is the number of indices in a selection unit 59 | # It can be specified by spec as a number or length of vector type 60 | # and/or by the length of each list in nested inds 61 | def spec_n = match (...spec) { 62 | {[k]_} => tup{k}; {k if knum{k}} => tup{k}; {..._} => tup{} 63 | } 64 | def {ind_n, ind} = match (inds) { 65 | {{{...t}, ..._}} => { 66 | def l = 
length{t} 67 | def 1 = all{l == each{length, inds}} 68 | tup{tup{l}, merge{...inds}} 69 | } 70 | {_} => { 71 | each{{s} => { def 0 = length{inds} % s }, spec_n} 72 | tup{tup{}, inds} 73 | } 74 | } 75 | def sel_n = match(...spec_n, ...ind_n) { # Shuffle unit from spec and nested indices must match 76 | {n,n} => n; {n} => n; {} => length{ind} 77 | } 78 | def 1 = all{ind < sel_n} 79 | 80 | def e = width{T} / 8 81 | def exp{i} = if (16 <= length{i}) i else exp{merge{i, i}} 82 | def off{i} = i + (range{length{i}} & -e*sel_n) 83 | def vind = vec_make{[16]u8, off{exp{widen_norm_ind{ind, e}}}} 84 | vqtbl{tup{val}, vind} 85 | } 86 | 87 | # Variable indices 88 | def check_select{spec, vals, V} = { 89 | def l = length{vals} 90 | def v = l==1 or (l<=4 and all{V == each{type,vals}}) 91 | v and (match (...spec) { {}=>1; {8}=>1; {T} => ktyp{T} }) 92 | } 93 | def vec_select{...spec, vals={v0:V=[_]vT, ..._}, ind:[16]I if check_select{spec, vals, V}} = { 94 | def T = match (...spec) { {T if ktyp{T}} => T; {..._} => vT } 95 | vqtbl{T, vals, ind} 96 | } 97 | def vec_select{...spec, val:V=[_]_, ind:[16]I} = vec_select{...spec, tup{val}, ind} 98 | 99 | def vec_shuffle{...spec, val:V=[_]vT, ind:[16]I} = { 100 | def T = match (...spec) { 101 | {[16]T} => T; {T if ktyp{T}} => T; {16} => vT; {} => vT 102 | } 103 | vqtbl{T, tup{val}, ind} 104 | } 105 | 106 | 107 | # Shifts 108 | def vec_merge_shift_left {a:V=[_]_, b:V, n if knum{n}} = emit_intrin{V, 'vext', b, a, n} 109 | def vec_merge_shift_right{a:V=[k]_, b:V, n} = vec_merge_shift_left{b, a, k - n} 110 | def vec_shift_left {x:V=[_]_, n} = vec_merge_shift_left {x, vec_broadcast{V,0}, n} 111 | def vec_shift_right{x:V=[_]_, n} = vec_merge_shift_right{vec_broadcast{V,0}, x, n} 112 | 113 | def zip{a:V=[_]_, b:V, half} = { 114 | def name = merge{'vzip', match (half) { {0}=>'1'; {1}=>'2' }} 115 | emit_intrin{V, name, a, b} 116 | } 117 | def reverse_units{n, x:V=[l]T if knum{n} and 1=0 and i if (b) tup{' ', a} else tup{a}, 21 | vs0, nsym & shiftright{0, nsym} 22 | }} 23 | 24 | def lit{s} = tup{s, tup{}} 25 | def lit1{s} = tup{lit{s}} 26 | 27 | def listfmt{hex, open, close, vs} = { 28 | def f = each{runfmt{hex, .}, vs} 29 | def d = each{lit1, tup{open, ', ', close}} 30 | merge{...each{merge, select{d, 0 != inds{f}}, f}, select{d,2}} 31 | } 32 | def listfmt{hex, open, close, {}} = lit{merge{open, close}} 33 | 34 | def runfmt{hex, x} = match(x) { 35 | {_ if ksym{x}} => lit1{x} 36 | {{'x0', y}} => runfmt{2, y} 37 | {{'x', y}} => runfmt{1, y} 38 | {{}} => lit1{'{}'} 39 | {{...vs}} => listfmt{hex, '{', '}', vs} 40 | {_:*E} => tup{tup{'%p', tup{x}}} 41 | 42 | {_:V=[l]E} => { 43 | tmp:*V = undefined{V, 1} 44 | tmp <- x 45 | listfmt{hex, '[', ']', each{load{*E~~tmp,.}, range{l}}} 46 | } 47 | 48 | {_:T if typekind{T}=='primitive'} => { 49 | def q = quality{T} 50 | def w = width{T} 51 | def u = w>1 and q=='u' 52 | def spec = { 53 | if (q=='f') (if (w==32) '%.8g' else '%.17g') 54 | else merge{ 55 | if (hex!=0) '0x' else '', 56 | '%', 57 | if (hex==2) merge{'0', fmtnat{w/4}} else '', 58 | if (w==64) '"SCN' else '', 59 | if (hex!=0) 'x' else if (u) 'u' else 'd', 60 | if (w==64) '64"' else '' 61 | } 62 | } 63 | tup{tup{spec, tup{if (q=='i' and hex!=0) reinterpret{primtype{'u', w}, x} else x}}} 64 | } 65 | 66 | {_ if knum{x}} => { 67 | if ((x>>0) == x) { 68 | if (x >= -(1<<63) and x < 1<<63) runfmt{hex, i64~~x} 69 | else if (x >= 0 and x < 1<<64) runfmt{hex, u64~~x} 70 | else lit1{fmtnat{x}} 71 | } else runfmt{hex, f64~~x} 72 | } 73 | 74 | {T if kind{T}=='type'} => match(typekind{T}) { 
75 | {'primitive'} => lit1{merge{quality{T}, fmtnat{width{T}}}} 76 | {'vector'} => join{flip{tup{lit1{merge{'[', fmtnat{vcount{T}}, ']'}}, runfmt{hex, eltype{T}}}}} 77 | {_} => lit1{merge{'(unhandled type typekind: ', typekind{x}, ')'}} 78 | } 79 | 80 | {_:T} => lit1{merge{'(unhandled value typekind: ', typekind{x}, ')'}} 81 | {_} => lit1{merge{'(unhandled kind: ', kind{x}, ')'}} 82 | } 83 | 84 | def fs = flip{join{each{runfmt{0,.}, vs}}} 85 | def {strs, args} = each{join, fs} 86 | 87 | emit{void, 'printf', merge{'"', strs, '"'}, ...args} 88 | match(vs0) { {{r, _}} => r; {_} => {} } 89 | } 90 | -------------------------------------------------------------------------------- /include/skin/c.singeli: -------------------------------------------------------------------------------- 1 | include 'skin/cop' # Ordinary operators + - % etc. 2 | include 'skin/cmut' # Mutating operators -- &= etc. 3 | -------------------------------------------------------------------------------- /include/skin/cext.singeli: -------------------------------------------------------------------------------- 1 | # Additional operators for use with skin/c 2 | 3 | oper === (is) infix none 0 4 | 5 | oper ~~ reinterpret infix right 55 6 | oper ^~ promote infix right 55 7 | oper <~ cast_i infix right 55 # Requires arch/c 8 | oper $ __vec prefix 50 9 | 10 | local def __store{ind}{ptr, val} = store{ptr, ind, val} 11 | local def __store{ptr, val} = __store{0}{ptr, val} 12 | 13 | oper -> load infix right 50 14 | oper <- (__store) infix right 5 15 | -------------------------------------------------------------------------------- /include/skin/cmut.singeli: -------------------------------------------------------------------------------- 1 | # C operators that modify variables/registers 2 | oper += __incr infix right 5 3 | oper -= __decr infix right 5 4 | oper ++ __incr prefix 60 5 | oper -- __decr prefix 60 6 | def __incr{a,b} = { a = __add{a,b} } 7 | def __decr{a,b} = { a = __sub{a,b} } 8 | def __incr{a} = __incr{a,1} 9 | def __decr{a} = __decr{a,1} 10 | 11 | oper *= ({a,b} => a = __mul{a,b}) infix right 5 12 | oper /= ({a,b} => a = __div{a,b}) infix right 5 13 | oper %= ({a,b} => a = __mod{a,b}) infix right 5 14 | oper <<= ({a,b} => a = __shl{a,b}) infix right 5 15 | oper >>= ({a,b} => a = __shr{a,b}) infix right 5 16 | oper &= ({a,b} => a = __and{a,b}) infix right 5 17 | oper ^= ({a,b} => a = __xor{a,b}) infix right 5 18 | oper |= ({a,b} => a = __or {a,b}) infix right 5 19 | -------------------------------------------------------------------------------- /include/skin/cop.singeli: -------------------------------------------------------------------------------- 1 | oper - __neg prefix 30 2 | oper * __pnt prefix 60 3 | 4 | oper == __eq infix none 20 5 | oper != __ne infix none 20 6 | oper < __lt infix none 20 7 | oper > __gt infix none 20 8 | oper <= __le infix none 20 9 | oper >= __ge infix none 20 10 | 11 | oper + __add infix left 30 12 | oper - __sub infix left 30 13 | oper * __mul infix left 40 14 | oper / __div infix left 40 15 | oper % __mod infix left 40 16 | 17 | # Nobody likes low-precedence & | 18 | oper & __and infix none 35 19 | oper | __or infix none 35 20 | oper ^ __xor infix none 35 21 | oper ~ __not prefix 50 22 | 23 | # Shifts are like multiply/divide and should have the same precedence 24 | oper << __shl infix left 40 25 | oper >> __shr infix left 40 26 | -------------------------------------------------------------------------------- /include/util/for.singeli: 
-------------------------------------------------------------------------------- 1 | local { 2 | include 'skin/cop' 3 | def ux = primtype{'u', width{*void}} 4 | def num{n} = is{'number',kind{n}} 5 | def loop_var{a, b} = { 6 | if (num{a}) { cast{if (num{b}) ux else type{b}, a} } 7 | else { 8 | def ta=type{a} 9 | match (b) { {_:tb if ta<tb} => promote{tb,a}; {_}=>a } 10 | } 11 | } 12 | } 13 | 14 | def for{vars,begin,end,iter} = { 15 | i := loop_var{begin, end} 16 | while (i < end) { 17 | iter{i, vars} 18 | i = i + 1 19 | } 20 | } 21 | 22 | def for_backwards{vars,begin,end,iter} = { 23 | i := loop_var{end, begin} 24 | while (i > begin) { 25 | i = i - 1 26 | iter{i, vars} 27 | } 28 | } 29 | 30 | def for_const{vars,begin,end,iter if num{begin} and num{end}} = { 31 | each{iter{., vars}, begin + range{end-begin}} 32 | } 33 | 34 | def for_unroll{unr if num{unr}}{vars,begin,end,iter} = { 35 | i := loop_var{begin, end} 36 | while (i + unr <= end) { 37 | each{{j}=>iter{i+j, vars}, range{unr}} 38 | i = i + unr 39 | } 40 | while (i < end) { 41 | iter{i, vars} 42 | i = i + 1 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /include/util/functionize.singeli: -------------------------------------------------------------------------------- 1 | # If gen is a generator that applies to typed parameters, 2 | # functionize{gen} is equivalent to gen but works through a function call 3 | # So multiple calls to functionize{gen} on the same types share code 4 | # instead of inlining and duplicating it. 5 | 6 | fn asfunc{gen, ...type}(a:type) = gen{...a} 7 | def functionize{gen}{...args} = asfunc{gen, ...each{type, args}}(args) 8 | -------------------------------------------------------------------------------- /include/util/kind.singeli: -------------------------------------------------------------------------------- 1 | # Generators that check kind 2 | local def ki{k}{x} = is{k, kind{x}} 3 | def knum = ki{'number'} 4 | def ksym = ki{'symbol'} 5 | def ktup = ki{'tuple'} 6 | def kgen = ki{'generator'} 7 | def ktyp = ki{'type'} 8 | def kcon = ki{'constant'} 9 | def kreg = ki{'register'} 10 | def kfun = ki{'function'} 11 | def klab = ki{'label'} 12 | -------------------------------------------------------------------------------- /include/util/perv.singeli: -------------------------------------------------------------------------------- 1 | # Pervasion 2 | 3 | def extend perv1{op} = { 4 | def op{{...a}} = each{op, a} 5 | } 6 | 7 | def extend perv2{op} = { 8 | def op{ a , {...b}} = each{op{a,.}, b} 9 | def op{{...a}, b } = each{op{.,b}, a} 10 | def op{{...a}, {...b}} = each{op, a, b} 11 | } 12 | 13 | local def anytup = match { 14 | {{..._}, ..._} => 1; {_, ...r} => anytup{...r}; {} => 0 15 | } 16 | 17 | def perv{n if is{'number',kind{n}}} = { def extend _{op} = { 18 | def op{...t if is{n,length{t}} and anytup{...t}} = { 19 | def ftup = match { {{{...e}, ..._}} => e; {_, ...r} => ftup{r} } 20 | def l = ftup{t} 21 | def r = match { {{...s}} => s; {k} => each{{_}=>k,l} } 22 | each{op, ...each{r,t}} 23 | } 24 | }} 25 | def perv{1} = perv1 26 | def perv{2} = perv2 27 | -------------------------------------------------------------------------------- /include/util/tup.singeli: -------------------------------------------------------------------------------- 1 | # Tuple utilities 2 | 3 | local { 4 | include 'skin/cop' 5 | oper $ length prefix 30 6 | 7 | include 'util/kind' 8 | def sl{l, start, len} = slice{l, start, start + len} 9 | } 10 | 11 | # Tuple is empty 12 | def empty{tup} = 0 
== $tup 13 | 14 | # Constant-time evaluation returning a list 15 | def collect{vars,begin,end,exec if begin<=end} = { 16 | def inds = begin + range{end-begin} 17 | each{exec{., vars}, inds} 18 | } 19 | 20 | # Integers [0,n) 21 | def iota{n if knum{n}} = range{n} 22 | 23 | # All indices into tuple t 24 | def inds{t} = range{$t} 25 | 26 | # Tuple of n copies of v 27 | def copy{n, v if knum{n}} = each{{_}=>v, range{n}} 28 | 29 | # Merge a tuple of tuples 30 | def join{l} = merge{...l} 31 | 32 | # Shift l into r, retaining length of r, or vice-versa 33 | def shiftright{l, r} = slice{merge{l, r}, 0, $r} 34 | def shiftleft {l, r} = slice{merge{l, r}, - $l} 35 | 36 | # Reversed tuple 37 | def reverse{t} = select{t, ($t-1) - inds{t}} 38 | 39 | # Tuple of length n made from t repeated cyclically 40 | def cycle{n, t if knum{n}} = { 41 | def l = $t 42 | def m = n % l; def e = slice{t, 0, m} 43 | if (m == n) e 44 | else merge{...copy{(n-m)/l, t}, e} 45 | } 46 | 47 | # Split into groups of length n, possibly less for the last 48 | def split{n, list if knum{n}} = { 49 | def d = __ceil{($list) / n} 50 | each{sl{list, ., n}, n*range{d}} 51 | } 52 | def split{{...n}, list} = { 53 | def start = shiftright{0, scan{+,n}} 54 | each{sl{list,...}, start, n} 55 | } 56 | 57 | # Transpose tuple of same-length tuples 58 | def flip{tab} = each{tup, ...tab} 59 | 60 | # Function table mapping over all combinations 61 | def table = match { 62 | {f} => f{} 63 | {f, t} => each{f, t} 64 | {f, t, ...ts} => each{{e} => table{f{e,...}, ...ts}, t} 65 | } 66 | # Flattened into a single list 67 | def flat_table = match { 68 | {f} => tup{f{}} 69 | {f, t} => each{f, t} 70 | {f, t, ...ts} => join{each{{e} => flat_table{f{e,...}, ...ts}, t}} 71 | } 72 | 73 | # Left fold, with or without initial element 74 | def fold = match { 75 | {f, init, {}} => init 76 | {f, init, {x, ...rest}} => fold{f, f{init, x}, rest} 77 | {f, {init, ...rest}} => fold{f, init, rest} 78 | } 79 | 80 | # Low-stack inclusive+exclusive scan implementation 81 | local def scan_full = match { 82 | {f, init, {}} => tup{init} 83 | {f, init, {x}} => tup{init, f{init, x}} 84 | {f, init, list} => { 85 | def m = length{list} >> 1 86 | def l = scan_full{f, init, slice{list, 0, m}} 87 | merge{l, scan{f, select{l, -1}, slice{list, m}}} 88 | } 89 | } 90 | # Inclusive left scan 91 | def scan{f, init, list} = slice{scan_full{f, init, list}, 1} 92 | def scan{f, {}} = tup{} 93 | def scan{f, {h, ...t}} = scan_full{f, h, t} 94 | 95 | # Extend to multiple list inputs, if initialized 96 | def fold{f, i, ...ls={_, _, ..._}} = fold{{a, t} => f{a, ...t}, i, flip{ls}} 97 | def scan{f, i, ...ls={_, _, ..._}} = scan{{a, t} => f{a, ...t}, i, flip{ls}} 98 | 99 | # Copy list elements based on list, constant, or generator (like filter) 100 | def replicate{reps, list} = join{each{copy, reps, list}} 101 | def replicate{r, list if knum{r}} = join{each{copy{r,.}, list}} 102 | def replicate{f, list if kgen{f}} = replicate{each{f,list}, list} 103 | 104 | # For boolean i, return indices of 1s 105 | def indices{i} = replicate{i, inds{i}} 106 | 107 | # Search functions that return a single number 108 | local def proc_find{out, i, f} = each{out, findmatches{i, f}} 109 | # Index of only match, erroring if there are multiple 110 | # If there are none, return the default if given and error otherwise 111 | def find_index{sin, sfor, ...default if $default <= 1} = proc_find{ 112 | {is} => match (is, default) { {{i}, _} => i; {{}, {d}} => d }, 113 | sin, sfor 114 | } 115 | # Index of first match 116 | def 
index_of{sin, sfor} = { 117 | def n = $sin 118 | proc_find{match { {{i, ..._}} => i; {_} => n }, sin, sfor} 119 | } 120 | # Whether each element is found; how many times it's found 121 | def contained_in = proc_find{{i} => 0 < $i, ...} 122 | def count_matches = proc_find{length, ...} 123 | 124 | # Grouping: gather indices or data values based on how a grouping 125 | # argument matches the domain 126 | # For group, domain can be a list of keys, a length, or omitted to infer length 127 | # For key, the domain is the unique elements of the grouping argument in order 128 | # group_inds: gather inds{values} 129 | def group_inds = findmatches 130 | def group_inds{values, len if knum{len}} = findmatches{values, range{len}} 131 | def group_inds{{...vs} if fold{&, each{knum, vs}}} = { 132 | group_inds{vs, 1 + fold{__max, vs}} 133 | } 134 | # group: gather data 135 | def group{{...vs}, ...g, {...data} if $vs == $data} = { 136 | select{data, group_inds{vs, ...g}} 137 | } 138 | # key: gather indices or data 139 | def key{{...keys}} = { 140 | def i = findmatches{keys, keys} 141 | replicate{inds{i} == each{select{., 0}, i}, i} 142 | } 143 | def key{{...keys}, {...values} if $keys == $values} = select{key{keys}, values} 144 | # Add a generator for the first argument to apply to each result 145 | def extend resgen{gr} = { 146 | def gr{gen, ...args if kgen{gen}} = each{gen, gr{...args}} 147 | } 148 | extend resgen{group_inds}; extend resgen{group}; extend resgen{key} 149 | 150 | # Self-search 151 | local def index_self{list} = proc_find{select{., 0}, list, list} 152 | local def umask_from_ind{i} = i == inds{i} 153 | local def cls_from_umask_ind{u, i} = select{scan{+, -1, u}, i} 154 | def unique_mask{list} = umask_from_ind{index_self{list}} 155 | def unique{list} = replicate{unique_mask{list}, list} 156 | def classify{list} = { 157 | def i = index_self{list} 158 | cls_from_umask_ind{umask_from_ind{i}, i} 159 | } 160 | def unique_classify{list} = { 161 | def i = index_self{list} 162 | def u = umask_from_ind{i} 163 | tup{replicate{u, list}, cls_from_umask_ind{u, i}} 164 | } 165 | def occurrence_count{list} = { 166 | def g = key{list} 167 | def c = join{each{inds, g}} 168 | group{{{i}}=>i, join{g}, c} 169 | } 170 | -------------------------------------------------------------------------------- /ir.bqn: -------------------------------------------------------------------------------- 1 | # IR passes 2 | 3 | # Apply transformation to each function 4 | _onFns ← { 5 | [o,c] ← +`˘ "beginFn"‿"endFn" (⊣≡≠⊸↑)⌜ 𝕩 6 | o »↩ 7 | f ← o-c 8 | r ← 𝔽¨⌾(1⊸↓) (o×f)⊔𝕩 9 | ((f¬⊸/o)∾/≠¨1↓r) ⍋⊸⊏ ∾r 10 | } 11 | 12 | # Attempt to replace goto{,T,F} with two kinds of structure: 13 | # 14 | # - beginBlock/endBlock/break{,T,F} (do {...} while (0)) 15 | # - beginLoop/endLoop/continue{,T,F} (while (1) {...;break;}) 16 | # 17 | # Loops and blocks are properly nested, and have named labels. 18 | # Jumps only occur on break, which goes to the end of its block, and 19 | # continue, which goes to the beginning of its loop. The loop exits when 20 | # endLoop is reached. 21 | Restructure ⇐ { 22 | [lm,am] ← ∨` "lbl "‿"goto" ≡⌜ 4↑¨𝕩 23 | ai ← /am 24 | lb ← ai ⊏ lm # Which statements are lbl (not goto) 25 | i ← ⊐id ← (∧`⌾⌽' '⊸≠)⊸/¨ ai ⊏ 𝕩 # Label ID 26 | f ← ∊i ⋄ l ← ∊⌾⌽i # First and last use of label 27 | IM ← {(𝕩⊐○(/⟜i)𝕨) ⊏ /𝕩} # /𝕩 ordered by matching 𝕨 (requires 𝕨≡○(∧/⟜i)𝕩) 28 | ff ← (fl ← f○⊑∧≡○(⊢´))´˘ ∘‿2⥊ ((⍋+`-0⊸<)⊑¨)⊸⊏ add ? 
# Abort if not nested 45 | # Change goto to break/continue and insert begin/end 46 | alm ← ¬lb⌾(am⊸/)am 47 | br ← "break"‿"continue"⊏˜lb¬⊸/(i⊏i⍋⊸⊏○(lb⊸/)⊢)⊸≤⊒i 48 | (ni∾ ⟨ 8 | "h" ‿"help" ‿0‿1‿"Print this message and exit" 9 | "o" ‿"out" ‿1‿0‿"Output file (print to stdout by default)" 10 | "oe"‿"errout"‿1‿0‿"Error to: stderr (default), stdout, none, file=path, bqn" 11 | "os"‿"show" ‿1‿0‿"show{} to: stdout (default), stderr, none, file=path" 12 | # "?" ‿"stdin" ‿0‿0‿"Use stdin as input, after any argument files" 13 | "r" ‿"run" ‿1‿1‿"Use this argument as source code" 14 | "t" ‿"target"‿1‿0‿"Output type: c (default), cpp, ir" 15 | "a" ‿"arch" ‿1‿2‿"Architecture features: list, or none, native (default), all" 16 | "i" ‿"infer" ‿1‿0‿"Type of architecture inference: strict, or loose (default)" 17 | "l" ‿"lib" ‿1‿2‿"Library paths: lib=path to try path/x for include 'lib/x'" 18 | "c" ‿"config"‿1‿2‿"Configuration: name=value to set config name to value" 19 | "p" ‿"pre" ‿1‿0‿"Preamble placed before C output" 20 | "n" ‿"name" ‿1‿0‿"Prefix for names in C output" 21 | "d" ‿"deplog"‿1‿0‿"Output file for log of included dependencies" 22 | ⟩ 23 | short‿long ∾˜¨⟜<¨↩ "-"‿"--" 24 | args ∾↩ 0 ⋄ dup ∾↩ 1 25 | 26 | Spl ← (⊢-˜+`׬)∘=⊔⊢ 27 | 28 | c ← ≠short 29 | op ← (short⊸⊐ ⌊ long⊸⊐) •args 30 | op ⌈↩ c ׬ <`⊸= op⊏args 31 | opts ← ((1+c)∾˜f/op) ⊔ ((op=c)(1-˜×⟜(+`))○(∾⟜1)f←¬0»op⊏args) ⊔ •args 32 | "Option can't be duplicated" ! ∧´ (1≤dup) ≥ 1<≠¨opts 33 | olist ← (2=dup) (∾','⊸Spl¨)⍟⊣¨ (1⌾(¯1⊸⊑)args) ⊣◶⟨0<≠∘⊢,⊑¨⊢⟩¨ opts 34 | help‿out‿oe‿os‿run‿target‿feats‿inf‿lib‿config‿pre‿namepre‿deplog‿files ← olist 35 | 36 | { help ? 37 | opt_help ← ∾¨´ ⟨desc⟩ ∾˜ (1+·⌈´≠¨)⊸(↑¨)¨ short‿long ∾¨¨ ",:" 38 | •Out ∾∾⟜(@+10)¨ ⟨help_pre,""⟩ ∾ opt_help 39 | •Exit@ 40 | ;@} 41 | 42 | _choices ← {∧´𝕨∊𝕩? (⊑𝕩)⊣´𝕨; !∾⟨"Unknown ",𝕗," option: ",∾𝕨," (options are",1↓∾", "⊸∾¨𝕩,")"⟩} 43 | target "target" _choices↩ "c"‿"cpp"‿"ir" 44 | inf "inference"_choices↩ "loose"‿"strict" 45 | 46 | Rel ← •wdpath⊸•file.At 47 | files Rel¨↩ 48 | 49 | SplitEq ← (»⊸(⊣-<)·∨`'='⊸=)⊸⊔ 50 | libpaths ← (Rel⌾(1⊸⊑) ¯2 ↑ SplitEq)¨ lib 51 | configs ← (2 ↑ SplitEq)¨ config 52 | OutBuf ← {𝕊: e←⟨⟩ ⋄ Save⇐{e∾↩<𝕩⋄𝕩} ⋄ Get⇐{𝕊:e}} 53 | _getShows ← {name _𝕣 𝕩: 54 | Save‿Get ← OutBuf@ 55 | Out‿Write ← ⊢‿⊢ »˜ { 56 | "stderr": •term.ErrRaw•ToUTF8∾(@+10)˙ ; "stdout":•Out ; "none":⊢ ; 57 | "bqn":"error"≡name? ⊢ ; 58 | (p←"file=")(⊣≡≠⊸↑)𝕩? f←Rel p≠⊸↓𝕩 ⋄ ⊢‿{f •file.Chars ∾∾⟜(@+10)¨𝕩 ⋄ 𝕩} ; 59 | !"Unknown "∾name∾" output option: "∾𝕩 60 | }𝕩 61 | ⟨Out∘Save, Write∘Get⟩ 62 | } 63 | ⟨ShowOut,ShowWrite⟩ ← "show{}" _getShows "stdout" ⊣´ os 64 | ⟨ErrOut, ErrWrite⟩ ← "error" _getShows oe ↩ "stderr" ⊣´ oe 65 | Writes ← ShowWrite ⋈ {⋈∾∾⟜(@+10)¨𝕩}⍟(0<≠)∘ErrWrite 66 | ⟨ErrExit,_withErr⟩ ← { "bqn"≡oe ? ⟨!, {𝔽⎊@}⟩ ; ⟨•Exit∘1 ⊣ Writes, {𝔽}⟩ } 67 | DepOut‿DepWrite ← { 68 | wr ← {⟨⟩:⊢; ⟨p⟩: (Rel p)⊸•file.Lines} deplog 69 | Save‿Get ← OutBuf@ ⋄ DepOut⇐Save ⋄ DepWrite⇐Wr∘Get 70 | } 71 | 72 | arch ← ⟨feats,"strict"≢inf⟩ •Import "arch.bqn" 73 | outputs ← ShowOut‿ErrOut‿ErrExit‿DepOut 74 | frontend ← arch‿libpaths‿configs‿outputs •Import "singeli.bqn" 75 | backend ← { 76 | "ir"≡target ? ⊢ ; 77 | par ← ⟨"cpp"≡target,arch,"si"⊣´namepre,outputs⟩ 78 | pre ⊑⊸{𝕨⊸𝕏}⍟(0<≠∘⊣) par •Import "emit_c.bqn" 79 | } 80 | Output ← { 81 | ≠out ? 
(Rel⊑out) •file.Chars ⊢ ; 82 | •Out⍟(0<≠) ¯1⊸↓ 83 | }⊸⊢ 84 | Result ← {show‿errout‿deplog‿out⇐𝕩} Writes ∾ DepWrite ⋈ ⊢ 85 | 86 | Result {Output Backend ∾ Frontend¨ 𝕩}_withErr (<¨run) ∾ files 87 | -------------------------------------------------------------------------------- /test/README.md: -------------------------------------------------------------------------------- 1 | # Singeli tests 2 | 3 | Singeli testing is not terribly comprehensive. We're relying in part on testing with existing codebases as well as the relative simplicity of the language to make sure things work. 4 | 5 | Compiler tests: `test/run` (like `singeli`, run as an executable if `bqn` is installed, or call with a BQN interpreter). 6 | 7 | Most includes are not yet tested. For arch/ includes, run `make` from the test/arch/general directory, or from the base: 8 | 9 | $ make -C test/arch/general ARCH=feats 10 | 11 | where the feature list `feats` is the same as Singeli's `-a` argument. If making changes to Singeli, run `make clean` between tests to force a new build. 12 | -------------------------------------------------------------------------------- /test/alias.c: -------------------------------------------------------------------------------- 1 | static int32_t si_f0_f(int32_t v0_x) { 2 | int32_t v1 = change(v0_x); 3 | int32_t v2_a = v1; 4 | int32_t v3_b = v0_x; 5 | v2_a = ((int32_t)4ll); 6 | v3_b = v1; 7 | v0_x = v2_a; 8 | return v3_b; 9 | } 10 | 11 | static int32_t* si_f1_g() { 12 | int32_t v0_b = ((int32_t)5ll); 13 | int32_t v1_c = v0_b; 14 | v1_c = ((int32_t)6ll); 15 | int32_t v2_a_[] = {v0_b,v0_b}; int32_t* v2_a = v2_a_; 16 | return v2_a; 17 | } 18 | 19 | -------------------------------------------------------------------------------- /test/alias.in: -------------------------------------------------------------------------------- 1 | fn f(x:i32) = { 2 | def d = emit{i32, 'change', x} 3 | a := d # New handle 4 | b := x # New handle 5 | def c = b # Aliases b 6 | a = 4 7 | c = d 8 | x = a 9 | b 10 | } 11 | 12 | fn g() : __pnt{i32} = { 13 | b:i32 = 5 14 | c := b 15 | c = 6 16 | a:__pnt{i32} = tup{b,b} 17 | ac := a 18 | # a = cast{__pnt{i32},tup{b,c}} # Problems with C output for this 19 | ac 20 | } 21 | -------------------------------------------------------------------------------- /test/alias.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_f i32 1 v0_x i32 2 | new v1 emit i32 'change' v0_x 3 | new v2_a val i32 v1 4 | new v3_b val i32 v0_x 5 | mut v2_a !4:i32 6 | mut v3_b v1 7 | mut v0_x v2_a 8 | ret v3_b 9 | endFn 10 | 11 | beginFn f1_g *i32 0 12 | new v0_b val i32 !5:i32 13 | new v1_c val i32 v0_b 14 | mut v1_c !6:i32 15 | new v2_a array *i32 v0_b v0_b 16 | ret v2_a 17 | endFn 18 | 19 | -------------------------------------------------------------------------------- /test/anon.in: -------------------------------------------------------------------------------- 1 | fn fun(arg:i64) : i64 = ({x}=>emit{i64,'anon',x,x}){arg} 2 | export{'efn', fun} 3 | -------------------------------------------------------------------------------- /test/anon.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_fun i64 1 v0_arg i64 2 | new v1 emit i64 'anon' v0_arg v0_arg 3 | ret v1 4 | endFn 5 | 6 | export 'efn' (i64)->i64 $f0_fun 7 | -------------------------------------------------------------------------------- /test/apply.in: -------------------------------------------------------------------------------- 1 | def divmod{a,b} = 
tup{emit{u32,'div',a,b},emit{u32,'mod',a,b}} 2 | fn divplusmod(a:u32, b:u32) : u32 = { 3 | apply{bind{emit, u32, 'add'}, divmod{a,b}} 4 | } 5 | -------------------------------------------------------------------------------- /test/apply.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_divplusmod u32 2 v0_a u32 v1_b u32 2 | new v2 emit u32 'div' v0_a v1_b 3 | new v3 emit u32 'mod' v0_a v1_b 4 | new v4 emit u32 'add' v2 v3 5 | ret v4 6 | endFn 7 | 8 | -------------------------------------------------------------------------------- /test/arch/.gitignore: -------------------------------------------------------------------------------- 1 | *.c 2 | *.bin 3 | -------------------------------------------------------------------------------- /test/arch/base.singeli: -------------------------------------------------------------------------------- 1 | include 'skin/c' 2 | include 'arch/c' 3 | include 'debug/printf' 4 | 5 | def comptime_fail{...msg} = { 6 | show{...msg} 7 | 0{} # comptime_fail 8 | } 9 | 10 | require{'stdlib.h'} 11 | def exit{code} = emit{void, 'exit', code} 12 | require{'string.h'} 13 | def memeq{a, b, bytes} = 0 == emit{i32, 'memcmp', a, b, bytes} 14 | 15 | local fn expect_eq_fn{V}(exp:V, got:V) : void = { 16 | buf:*V = tup{exp, got} 17 | if (not memeq{buf, buf+1, width{V}/8}) { 18 | lprintf{'Expected: ', tup{'x0', exp}, ' (type = ', V,')'} 19 | lprintf{'Got: ', tup{'x0', got}} 20 | exit{1} 21 | } 22 | } 23 | 24 | def expect_eq{exp:V, got:V} = expect_eq_fn{V}(exp, got) 25 | 26 | def test_with_match{G, args, on_fail, on_res} = { 27 | def err # Detect when G doesn't match 28 | def got = match (...args) { (G); {..._} => err } 29 | if (is{got, err}) on_fail{}; else on_res{got} 30 | } 31 | def test_exp{exp, G}{...args} = test_with_match{G, args, 32 | {} => comptime_fail{'No case matched for ', G, args}, 33 | expect_eq{exp, .} 34 | } 35 | def test_no_case{G}{...args} = test_with_match{G, args, 36 | {} => {}, # No match, success 37 | {_} => comptime_fail{'Expected no case to match for ', G, args} 38 | } 39 | 40 | def for_tup{vars,0,'!',iter} = { 41 | def n = length{select{vars, 0}} 42 | each{{i, ...args} => iter{i, args, {a,_} => a}, range{n}, ...vars} 43 | } 44 | 45 | def example_elts{V=[k]E} = { 46 | def ew = width{E} 47 | def mul = match (if (issigned{E}) primtype{'u',ew} else E) { {(u8)}=>3; {(u16)}=>100; {(u32)}=>1e8; {(u64)}=>1e17; {(f32)}=>1.0001; {(f64)}=>1.000100010001 } 48 | def e = (range{k}+1) * mul 49 | if (isint{E}) e%(1<<(ew - issigned{E}) - 1) else e 50 | } 51 | def primtypes = tup{u8,i8,u16,i16,u32,i32,u64,i64,f32,f64} 52 | def supported_widths{accept_avx2_256} = { 53 | if (hasarch{'AVX512F'}) tup{128, 256, 512} 54 | else if (hasarch{if (accept_avx2_256) 'AVX' else 'AVX2'}) tup{128, 256} 55 | else tup{128} 56 | } 57 | -------------------------------------------------------------------------------- /test/arch/general/broadcast-sel.singeli: -------------------------------------------------------------------------------- 1 | include '../base' 2 | include '../simd' 3 | 4 | main() : void = { 5 | fn test{V=[k]E}() : void = { 6 | def elts = example_elts{V} 7 | def src = vec_make{V, elts} 8 | 9 | @for_tup(i in range{k}, el in elts over '!') { 10 | if (hasarch{'X86_64'}) { 11 | if (width{E}==8 and not hasarch{'SSSE3'}) { 12 | test_no_case{broadcast_sel}{src, i} 13 | } else if (width{V}==512 and width{E}<=16 and not hasarch{'AVX512BW'}) { 14 | test_no_case{broadcast_sel}{src, i} 15 | } else { 16 | test_exp{vec_broadcast{V, el}, 
broadcast_sel}{src, i} 17 | } 18 | } else { 19 | test_exp{vec_broadcast{V, el}, broadcast_sel}{src, i} 20 | } 21 | } 22 | test_no_case{broadcast_sel}{src, k} 23 | test_no_case{broadcast_sel}{src, 0.5} 24 | } 25 | @for_tup(E in primtypes over '!') { 26 | @for_tup(k in supported_widths{0}/width{E} over '!') { 27 | test{[k]E}() 28 | } 29 | } 30 | lprintf{'pass'} 31 | } 32 | -------------------------------------------------------------------------------- /test/arch/general/imm-shuffle-select.singeli: -------------------------------------------------------------------------------- 1 | include '../base' 2 | include 'util/tup' 3 | if_inline (hasarch{'X86_64'}) { 4 | include 'arch/iintrinsic/basic' 5 | include 'arch/iintrinsic/select' 6 | } else if_inline (hasarch{'AARCH64'}) { 7 | include 'arch/neon_intrin/basic' 8 | include 'arch/neon_intrin/select' 9 | } else { 10 | def {vec_shuffle,vec_select,vec_make} 11 | } 12 | 13 | def expand{e, t} = replicate{e, t}*e + cycle{e*length{t}, range{e}} 14 | def gen_idxs{n} = (3 * (1-range{n})) % n 15 | 16 | def widths{min, max} = tup{min, ...widths{min*2, max}} 17 | def widths{min, max if min>max} = tup{} 18 | def widths{v, v} = tup{v} 19 | 20 | main() : void = { 21 | fn test{sw, V}(counts:*u64) : void = { 22 | def vw = width{V} 23 | 24 | def inc_ok{ok} = store{counts, ok, load{counts, ok}+1} 25 | def elts = example_elts{V} 26 | def vec = vec_make{V, elts} 27 | 28 | def has_int{lw} = not (lw==256 and not hasarch{'AVX2'}) 29 | def qualities{lw} = { 30 | def i = has_int{lw} 31 | replicate{tup{i,i,sw>=32}, tup{'u','i','f'}} 32 | } 33 | 34 | if (sw<=64) @for_tup(lw in widths{sw, vw} over '!') { 35 | # vec_shuffle{[n]vw (width=lw), v:V (width=vw)}; n = elements 36 | def n = lw/sw 37 | def idxs = gen_idxs{n} 38 | def ok = match() { 39 | {if sw==16 and vw==128 and lw<=64} => 1 40 | {if sw== 8 and hasarch{'AVX512VBMI'}} => 1 41 | {if sw==16 and hasarch{'AVX512BW'}} => 1 42 | {if sw<=16 and vw==256} => hasarch{'AVX2'} and lw<=128 43 | {if sw<=16 and vw==128} => hasarch{'SSSE3'} 44 | {if sw<=16} => 0 45 | {if lw>=256 and not hasarch{'AVX2'}} => 0 46 | {} => 1 47 | } 48 | inc_ok{ok} 49 | if (ok) { 50 | @for_tup(quality in qualities{lw} over '!') { 51 | def spec = [n]primtype{quality, sw} 52 | def scale = sw/width{eltype{V}} 53 | def e1 = expand{scale, idxs} 54 | def e2 = join{each{{i} => e1+i*n*scale, range{vw/lw}}} 55 | # show{'V=',V, ' lw=',lw, ' n=',n, ' spec=',spec} 56 | # lprintf{'V=',V, ' lw=',lw, ' n=',n, ' spec=',spec} 57 | def exp = vec_make{V, select{elts, e2}} 58 | test_exp{exp, vec_shuffle}{spec, vec, idxs} 59 | } 60 | } # else show{'missed V=',V, ' lw=',lw, ' n=',n, ' spec=',[n]primtype{'u', sw}} 61 | } 62 | 63 | def select_ok = match() { 64 | {if sw<=16 and not hasarch{'SSSE3'}} => 0 65 | {if sw<=16 and vw>=256 and not hasarch{if (sw==8) 'AVX512VBMI' else 'AVX512BW'}} => 0 66 | {if sw<=16 and vw==256 and not hasarch{'AVX2'}} => 0 67 | {if vw==256 and not hasarch{'AVX2'}} => 0 68 | {..._} => 1 69 | } 70 | inc_ok{select_ok} 71 | if (select_ok) { 72 | def n = vw/sw 73 | def idxs = gen_idxs{n} 74 | 75 | def exp = { 76 | def scale = sw/width{eltype{V}} 77 | vec_make{V, select{elts, expand{scale, idxs}}} 78 | } 79 | 80 | @for_tup(spec in merge{ 81 | copy{has_int{vw} or quality{eltype{V}}=='f', sw}, 82 | each{{q}=>primtype{q,sw}, if (sw<=64) qualities{vw} else tup{}} 83 | } over '!') { 84 | # show{'spec=',spec, ' V=',V} 85 | test_exp{exp, vec_select}{spec, vec, idxs} 86 | test_exp{exp, vec_select}{spec, vec, ...idxs} 87 | } 88 | } 89 | } 90 | 91 | counts:*u64 = 
tup{0,0} 92 | if (hasarch{'X86_64'}) { 93 | @for_tup(sw in tup{8,16,32,64,128} over '!') { 94 | @for_tup(E in primtypes over '!') { 95 | if (sw>=width{E}) @for_tup(w in supported_widths{1} over '!') { 96 | def V = [w/width{E}]E 97 | test{sw, V}(counts) 98 | } 99 | } 100 | } 101 | } else { 102 | lprintf{'no tests defined for this arch'} 103 | exit{1} 104 | } 105 | lprintf{'pass; untestable: ', load{counts,0}, '/', load{counts,0}+load{counts,1}} 106 | } 107 | -------------------------------------------------------------------------------- /test/arch/general/makefile: -------------------------------------------------------------------------------- 1 | SHELL=/usr/bin/env bash -o pipefail 2 | ARCH=native 3 | SINGELI=../../../singeli 4 | SINGELI_FLAGS=-a $(ARCH) 5 | CCFLAGS=-g $(shell bqn ../to-c-args.bqn $(ARCH)) 6 | RUN= 7 | 8 | ALL_TESTS= 9 | ALL_TESTS+=run-imm-shuffle-select 10 | ALL_TESTS+=run-broadcast-sel 11 | 12 | default: $(ALL_TESTS) 13 | .SECONDARY: 14 | 15 | %-${ARCH}.bin: %.singeli 16 | $(SINGELI) -os stderr $(SINGELI_FLAGS) $< > $@.c 17 | $(CC) $(CCFLAGS) $@.c -o $@ 18 | 19 | run-%: %-${ARCH}.bin 20 | $(RUN) ./$< 21 | 22 | clean: 23 | rm *.bin* 24 | -------------------------------------------------------------------------------- /test/arch/simd.singeli: -------------------------------------------------------------------------------- 1 | if_inline (hasarch{'X86_64'}) { 2 | include 'arch/iintrinsic/basic' 3 | include 'arch/iintrinsic/select' 4 | } else if_inline (hasarch{'AARCH64'}) { 5 | include 'arch/neon_intrin/basic' 6 | include 'arch/neon_intrin/select' 7 | } 8 | -------------------------------------------------------------------------------- /test/arch/to-c-args.bqn: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bqn 2 | feats ← ∾ ',' ((⊢-˜+`׬)∘=⊔⊢)¨ •args 3 | •Out 1↓ ∾' '⊸∾¨ (⟨feats, 1⟩ •Import "../../arch.bqn").GetCFlags @ 4 | -------------------------------------------------------------------------------- /test/blockmut.in: -------------------------------------------------------------------------------- 1 | include 'skin/c' 2 | include 'arch/c' 3 | 4 | def for{vars,begin,end,block} = { 5 | i:u64 = begin 6 | while (i < end) { 7 | block{i, vars} 8 | i = i+1 9 | } 10 | } 11 | 12 | fn fun() : void = { 13 | b:i32 = 0 14 | @for(i from 0 to 4) { 15 | b = b+2 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /test/blockmut.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_fun void 0 2 | new v0_b val i32 !0:i32 3 | new v1_i val u64 !0:u64 4 | lbl l0 5 | new v2 emit u1 'op <' v1_i !4:u64 6 | gotoF v2 l1 7 | mut v0_b emit i32 'op +' v0_b !2:i32 8 | mut v1_i emit u64 'op +' v1_i !1:u64 9 | goto l0 10 | lbl l1 11 | endFn 12 | 13 | -------------------------------------------------------------------------------- /test/call.c: -------------------------------------------------------------------------------- 1 | static int32_t si_f0_mid(int32_t v0_a, int32_t v1_b) { 2 | int32_t v2 = add(v0_a, v1_b); 3 | return v2; 4 | } 5 | 6 | static int32_t si_f1_fun(int32_t v0_a) { 7 | int32_t v1 = si_f0_mid(v0_a, v0_a); 8 | return v1; 9 | } 10 | 11 | int32_t (*const fn)(int32_t) = si_f1_fun; 12 | 13 | -------------------------------------------------------------------------------- /test/call.in: -------------------------------------------------------------------------------- 1 | fn mid(a:i32, b:i32) = emit{i32,'add',a,b} 2 | 3 | fn fun(a:i32) : i32 = { 4 
| mid(a, a) 5 | } 6 | export{'fn', fun} 7 | -------------------------------------------------------------------------------- /test/cond.in: -------------------------------------------------------------------------------- 1 | def g{x,y} = emit{f64, 'g2', x, y} 2 | def g{g,'sym'} = emit{f64, 'g2s', g} 3 | def g{a, a} = g{a} 4 | def g{b==tup{a}, a} = g{__add{10,a}} 5 | def g{x} = emit{f64, 'g1', x} 6 | def g{w:T, x:T} = emit{T, 'g2T', w, x} 7 | def g{x:T if __le{T,i32}} = emit{T, 'g1i', x} 8 | 9 | fn fun(a:i16, b:u1, c:f32) : u8 = { 10 | emit{u8, 'out', g{a,a}, g{a,c}, g{3,3}, g{tup{4},4}, g{5,'sym'}, g{a}} 11 | } 12 | -------------------------------------------------------------------------------- /test/cond.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_fun u8 3 v0_a i16 v1_b u1 v2_c f32 2 | new v3 emit i16 'g2T' v0_a v0_a 3 | new v4 emit f64 'g2' v0_a v2_c 4 | new v5 emit f64 'g1' 3 5 | new v6 emit f64 'g1' 14 6 | new v7 emit f64 'g2s' 5 7 | new v8 emit i16 'g1i' v0_a 8 | new v9 emit u8 'out' v3 v4 v5 v6 v7 v8 9 | ret v9 10 | endFn 11 | 12 | -------------------------------------------------------------------------------- /test/const.in: -------------------------------------------------------------------------------- 1 | c:u64 = 4 2 | 3 | fn fun() = c 4 | -------------------------------------------------------------------------------- /test/const.ir: -------------------------------------------------------------------------------- 1 | constant $c0_c u64 !4:u64 2 | 3 | beginFn f0_fun u64 0 4 | ret $c0_c 5 | endFn 6 | 7 | -------------------------------------------------------------------------------- /test/destruct.in: -------------------------------------------------------------------------------- 1 | fn f(x:u1, y:u8, z:u16) = { 2 | {a,b:(u1)}:tup{u1,u1} = tup{x,1} 3 | def {yc, ...c if is{length{c},3}, yc==y} = tup{y,b,y,b,y} 4 | {...d,e,f,g} := c 5 | emit{u32,'all',a,f,g} 6 | } 7 | 8 | def g 9 | def g{...x} = emit{f32, 'g0', ...x} 10 | def g{{a,b}} = emit{f32, 'g1', a, b} 11 | def g{{a,b},c,{d,e}} = emit{f32, 'g2', a, b, c, d, e} 12 | def g{{a,b},a,{b,a}} = emit{f32, 'g3', a, b} 13 | def g{x:T,...{y,T}} = emit{T, 'g4', x, y} 14 | fn nest() = { 15 | emit{f64, 'out', 16 | g{0,1,2}, 17 | g{tup{3,4}}, 18 | g{tup{5,4},3,tup{2,1}}, 19 | g{tup{7,6},7,tup{6,7}}, 20 | g{reinterpret{i16,8},9,i16} 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /test/destruct.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_f u32 3 v0_x u1 v1_y u8 v2_z u16 2 | new v3_b val u1 !1:u1 3 | new v4 emit u32 'all' v0_x v1_y v3_b 4 | ret v4 5 | endFn 6 | 7 | beginFn f1_nest f64 0 8 | new v0 emit f32 'g0' 0 1 2 9 | new v1 emit f32 'g1' 3 4 10 | new v2 emit f32 'g2' 5 4 3 2 1 11 | new v3 emit f32 'g3' 7 6 12 | new v4 emit i16 'g4' !8:i16 9 13 | new v5 emit f64 'out' v0 v1 v2 v3 v4 14 | ret v5 15 | endFn 16 | 17 | -------------------------------------------------------------------------------- /test/each.in: -------------------------------------------------------------------------------- 1 | fn divplusmod() : u32 = { 2 | def t = tup{0,1,2,3,4} 3 | def u = each{{a}=>__mul{3,a}, t} 4 | def v = each{__add, u, tup{1,0,1,0,1}} 5 | apply{bind{emit, u32, 'list'}, v} 6 | } 7 | -------------------------------------------------------------------------------- /test/each.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_divplusmod u32 0 2 
| new v0 emit u32 'list' 1 3 7 9 13 3 | ret v0 4 | endFn 5 | 6 | -------------------------------------------------------------------------------- /test/else.in: -------------------------------------------------------------------------------- 1 | fn fun(i:i64) : i64 = { 2 | if (emit{u1, 'test', i}) { return{1} } 3 | else { return{2} } 4 | 3 5 | } 6 | export{'efn', fun} 7 | -------------------------------------------------------------------------------- /test/else.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_fun i64 1 v0_i i64 2 | new v1 emit u1 'test' v0_i 3 | gotoF v1 l0 4 | ret !1:i64 5 | goto l1 6 | lbl l0 7 | ret !2:i64 8 | lbl l1 9 | ret !3:i64 10 | endFn 11 | 12 | export 'efn' (i64)->i64 $f0_fun 13 | -------------------------------------------------------------------------------- /test/excon.c: -------------------------------------------------------------------------------- 1 | static int32_t si_c0_c_[] = {((int32_t)11ll),((int32_t)10ll),((int32_t)9ll)}; static int32_t* const si_c0_c = si_c0_c_; 2 | 3 | int16_t const num = ((int16_t)12ll); 4 | 5 | int32_t* const arr = si_c0_c; 6 | 7 | -------------------------------------------------------------------------------- /test/excon.in: -------------------------------------------------------------------------------- 1 | export{'num', cast{i16, 12}} 2 | c:__pnt{i32} = tup{11,10,9} 3 | export{'arr', c} 4 | -------------------------------------------------------------------------------- /test/export.in: -------------------------------------------------------------------------------- 1 | fn fun{T}(arg:T) : T = arg 2 | export{'e0', fun{i8}} 3 | export{tup{'e1','e2'}, fun{i16}} 4 | export{'e3', fun{i32}} 5 | -------------------------------------------------------------------------------- /test/export.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_fun_i8 i8 1 v0_arg i8 2 | ret v0_arg 3 | endFn 4 | 5 | beginFn f1_fun_i16 i16 1 v0_arg i16 6 | ret v0_arg 7 | endFn 8 | 9 | beginFn f2_fun_i32 i32 1 v0_arg i32 10 | ret v0_arg 11 | endFn 12 | 13 | export 'e0' (i8)->i8 $f0_fun_i8 14 | export 'e1' (i16)->i16 $f1_fun_i16 15 | export 'e2' (i16)->i16 $f1_fun_i16 16 | export 'e3' (i32)->i32 $f2_fun_i32 17 | -------------------------------------------------------------------------------- /test/fnarr.c: -------------------------------------------------------------------------------- 1 | static uint32_t si_f0_fun_0(uint32_t v0_a, uint32_t v1_b); 2 | static uint32_t si_f1_fun_1(uint32_t v0_a, uint32_t v1_b); 3 | 4 | static uint32_t (*si_c0_fns_[])(uint32_t,uint32_t) = {si_f0_fun_0,si_f1_fun_1}; static uint32_t (**const si_c0_fns)(uint32_t,uint32_t) = si_c0_fns_; 5 | 6 | static uint32_t si_f0_fun_0(uint32_t v0_a, uint32_t v1_b) { 7 | return v0_a; 8 | } 9 | 10 | static uint32_t si_f1_fun_1(uint32_t v0_a, uint32_t v1_b) { 11 | return v1_b; 12 | } 13 | 14 | static uint32_t si_f2_sfn(bool v0_i, uint32_t v1_a, uint32_t v2_b) { 15 | uint32_t (*v3)(uint32_t,uint32_t) = si_c0_fns[v0_i]; 16 | uint32_t v4 = v3(v1_a, v2_b); 17 | return v4; 18 | } 19 | 20 | uint32_t (**const fn_arr)(uint32_t,uint32_t) = si_c0_fns; 21 | 22 | -------------------------------------------------------------------------------- /test/fnarr.in: -------------------------------------------------------------------------------- 1 | fn fun{x}(a:u32, b:u32) = select{tup{a,b},x} 2 | fns:__pnt{fntype{u32, u32, u32}} = tup{fun{0},fun{1}} 3 | 4 | export{'fn_arr', fns} 5 | 6 | include 'arch/c' 7 | fn 
sfn(i:u1, a:u32, b:u32) = load{fns,i}(a,b) 8 | -------------------------------------------------------------------------------- /test/fnrec.in: -------------------------------------------------------------------------------- 1 | include 'arch/c' 2 | include 'skin/c' 3 | 4 | fn fact(x:u8) : u64 = { 5 | if (x <= 1) return{1} 6 | promote{u64,x} * fact(x - 1) 7 | } 8 | -------------------------------------------------------------------------------- /test/fnrec.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_fact u64 1 v0_x u8 2 | new v1 emit u1 'op <=' v0_x !1:u8 3 | gotoF v1 l0 4 | ret !1:u64 5 | lbl l0 6 | new v2 emit u64 '^promote' u64 v0_x 7 | new v3 emit u8 'op -' v0_x !1:u8 8 | new v4 call u64 $f0_fact 1 v3 9 | new v5 emit u64 'op *' v2 v4 10 | ret v5 11 | endFn 12 | 13 | -------------------------------------------------------------------------------- /test/fntup.in: -------------------------------------------------------------------------------- 1 | fn gen{T}(a:T) : u8 = select{a,0} 2 | 3 | fn fun(none:tup{}) = { 4 | x:tup{u8,i32} = tup{4, 1} 5 | y := gen{tup{u8,i32}}(x) 6 | z := gen{tup{u8,type{x}}}(tup{y,x}) 7 | gen{tup{u8}}(tup{z}) 8 | } 9 | -------------------------------------------------------------------------------- /test/fntup.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_fun u8 0 2 | new v0_x val u8 !4:u8 3 | new v1_x val i32 !1:i32 4 | new v2_y call u8 $f1_gen_tupu8i32 2 v0_x v1_x 5 | new v3_z call u8 $f2_gen_tupu8u8i32 3 v2_y v0_x v1_x 6 | new v4 call u8 $f3_gen_tupu8 1 v3_z 7 | ret v4 8 | endFn 9 | 10 | beginFn f1_gen_tupu8i32 u8 2 v0_a u8 v1_a i32 11 | ret v0_a 12 | endFn 13 | 14 | beginFn f2_gen_tupu8u8i32 u8 3 v0_a u8 v1_a u8 v2_a i32 15 | ret v0_a 16 | endFn 17 | 18 | beginFn f3_gen_tupu8 u8 1 v0_a u8 19 | ret v0_a 20 | endFn 21 | 22 | -------------------------------------------------------------------------------- /test/for.in: -------------------------------------------------------------------------------- 1 | include 'skin/c' 2 | include 'arch/c' 3 | 4 | def Size = u64 5 | 6 | def for{vars,begin,end,block} = { 7 | i:Size = begin 8 | while (ii64 $f0_fun 6 | -------------------------------------------------------------------------------- /test/genext.in: -------------------------------------------------------------------------------- 1 | def n{...} = __neg 2 | def a = n 3 | def a{'b0'} = 'fail' 4 | 5 | def b = ({'b0'} => 1) 6 | def b{'b1'} = 2 7 | 8 | def c{...} = b 9 | def a{...} = b 10 | 11 | fn ta() = emit{u8, 'out', ...each{a, tup{3, 'b0', 'b1'}}} 12 | fn tc() = emit{u8, 'out', ...each{c, tup{ 'b0', 'b1'}}} 13 | -------------------------------------------------------------------------------- /test/genext.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_ta u8 0 2 | new v0 emit u8 'out' -3 1 2 3 | ret v0 4 | endFn 5 | 6 | beginFn f1_tc u8 0 7 | new v0 emit u8 'out' 1 2 8 | ret v0 9 | endFn 10 | 11 | -------------------------------------------------------------------------------- /test/goto.c: -------------------------------------------------------------------------------- 1 | static uint8_t si_f0_fun(bool v0_a) { 2 | l0:; 3 | if (!(v0_a)) goto l2; 4 | goto l1; 5 | l2:; 6 | return ((uint8_t)5ull); 7 | goto l0; 8 | if (!(v0_a)) goto l3; 9 | goto l_sym; 10 | l3:; 11 | l_sym:; 12 | l1:; 13 | return ((uint8_t)6ull); 14 | } 15 | 16 | -------------------------------------------------------------------------------- 
/test/goto.in: -------------------------------------------------------------------------------- 1 | fn fun(a:u1) : u8 = { 2 | def w = setlabel{} 3 | def l = makelabel{} 4 | if (a) goto{l} 5 | return{5} 6 | goto{w} 7 | if (a) goto{'sym'} 8 | setlabel{'sym'} 9 | setlabel{l} 10 | 6 11 | } 12 | -------------------------------------------------------------------------------- /test/hello.c: -------------------------------------------------------------------------------- 1 | int main() { 2 | printf("Hello, World!\n"); 3 | } 4 | 5 | -------------------------------------------------------------------------------- /test/hello.in: -------------------------------------------------------------------------------- 1 | include 'debug/printf' 2 | main : void { 3 | lprintf{'Hello, World!'} 4 | } 5 | -------------------------------------------------------------------------------- /test/ifconst.in: -------------------------------------------------------------------------------- 1 | include 'skin/c' 2 | include 'arch/c' 3 | 4 | fn fun(i:i64) : i64 = { 5 | if (1) { i = 2 * i } 6 | if (0) { i = 3 * i } 7 | if (1) { i = i - 1 } 8 | else { i = i - 2 } 9 | if (0) { i = i + 1 } 10 | else { i = i + 2 } 11 | } 12 | -------------------------------------------------------------------------------- /test/ifconst.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_fun i64 1 v0_i i64 2 | mut v0_i emit i64 'op *' !2:i64 v0_i 3 | mut v0_i emit i64 'op -' v0_i !1:i64 4 | mut v0_i emit i64 'op +' v0_i !2:i64 5 | ret v0_i 6 | endFn 7 | 8 | -------------------------------------------------------------------------------- /test/local.in: -------------------------------------------------------------------------------- 1 | c:u8 = 2 2 | def g{a} = 'outer' 3 | local { 4 | local c:u8 = 3 5 | def g{a if is{a,'ext'}} = 'inner' 6 | local def g{a if is{a,'test'}} = 'fail!' 
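# Scoping exercised by this test: the g{'ext'} case extends g for the
# whole file, while the extra local on the g{'test'} case above should
# keep it block-only, so fo at the bottom falls back to 'outer' (compare
# f2_fo in local.ir); the block-local c likewise shadows the outer c only here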
7 | fn fi(x:u8) = { 8 | emit{u1, g{'default'}, x} 9 | emit{u1, g{'ext'}, c} 10 | } 11 | local fn fi(x:u8) : u8 = 0 12 | local export{'locfi', fi} 13 | } 14 | fn fo(y:u8) = { 15 | emit{u1, g{'ext'}, y} 16 | emit{u1, g{'test'}, y} 17 | fi(c) 18 | } 19 | -------------------------------------------------------------------------------- /test/local.ir: -------------------------------------------------------------------------------- 1 | constant $c0_c u8 !2:u8 2 | constant $c1_c u8 !3:u8 3 | 4 | beginFn f0_fi u1 1 v0_x u8 5 | new v1 emit u1 'outer' v0_x 6 | new v2 emit u1 'inner' $c1_c 7 | ret v2 8 | endFn 9 | 10 | beginFn f1_fi u8 1 v0_x u8 11 | ret !0:u8 12 | endFn 13 | 14 | beginFn f2_fo u1 1 v0_y u8 15 | new v1 emit u1 'inner' v0_y 16 | new v2 emit u1 'outer' v0_y 17 | new v3 call u1 $f0_fi 1 $c0_c 18 | ret v3 19 | endFn 20 | 21 | export 'locfi' (u8)->u8 $f1_fi 22 | -------------------------------------------------------------------------------- /test/logic.in: -------------------------------------------------------------------------------- 1 | include 'arch/c' 2 | include 'skin/c' 3 | 4 | fn fun(x:i32) : i32 = { 5 | if ((x>=4 or x==2) and not x>6) return{2*x} 6 | while (x<10 and x!=5) ++x 7 | do --x while ((1 and x>2) or (not (0 or 1))) 8 | x 9 | } 10 | -------------------------------------------------------------------------------- /test/logic.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_fun i32 1 v0_x i32 2 | new v1 emit u1 'op >=' v0_x !4:i32 3 | gotoT v1 l0 4 | new v2 emit u1 'op ==' v0_x !2:i32 5 | gotoF v2 l1 6 | lbl l0 7 | new v3 emit u1 'op >' v0_x !6:i32 8 | gotoT v3 l1 9 | new v4 emit i32 'op *' !2:i32 v0_x 10 | ret v4 11 | lbl l1 12 | lbl l2 13 | new v5 emit u1 'op <' v0_x !10:i32 14 | gotoF v5 l3 15 | new v6 emit u1 'op !=' v0_x !5:i32 16 | gotoF v6 l3 17 | mut v0_x emit i32 'op +' v0_x !1:i32 18 | goto l2 19 | lbl l3 20 | lbl l4 21 | mut v0_x emit i32 'op -' v0_x !1:i32 22 | new v7 emit u1 'op >' v0_x !2:i32 23 | gotoT v7 l4 24 | ret v0_x 25 | endFn 26 | 27 | -------------------------------------------------------------------------------- /test/match.in: -------------------------------------------------------------------------------- 1 | def g = match { 2 | {a,b} => emit{u8, 'g2', a, b} 3 | {...a} => emit{u8, 'g', ...a}; {...b} => emit{u8, '!', ...b}; 4 | } 5 | fn test() = { 6 | emit{f64,'out', 7 | g{0}, 8 | (match{{...any}=>g{...any}}){1,2}, 9 | match (3,1) { {a,b if __lt{a,b}} => emit{u16,'fail',a,b}; 10 | {a,b} => emit{u16,'pass',a,b} } 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /test/match.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_test f64 0 2 | new v0 emit u8 'g' 0 3 | new v1 emit u8 'g2' 1 2 4 | new v2 emit u16 'pass' 3 1 5 | new v3 emit f64 'out' v0 v1 v2 6 | ret v3 7 | endFn 8 | 9 | -------------------------------------------------------------------------------- /test/mfor.in: -------------------------------------------------------------------------------- 1 | include 'arch/c' 2 | 3 | fn fun(x:__pnt{i32}, y:__pnt{u8}) : void = { 4 | def for{vars,begin,end,block} = { 5 | block{0, vars} 6 | } 7 | def istup{t} = is{'tuple', kind{t}} 8 | def load{p, i if istup{p}} = each{{p}=>load{p,i}, p} 9 | def store{p, i, v if istup{p}} = each{{p,v}=>store{p,i,v}, p,v} 10 | @for (a in tup{x,y} over 3) select{a,1} = 2 11 | } 12 | -------------------------------------------------------------------------------- 
/test/mfor.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_fun void 2 v0_x *i32 v1_y *u8 2 | new v2_a emit i32 '^load' v0_x 0 3 | new v3_a emit u8 '^load' v1_y 0 4 | mut v3_a !2:u8 5 | new v4 emit void '^store' v1_y 0 v3_a 6 | endFn 7 | 8 | -------------------------------------------------------------------------------- /test/mut.in: -------------------------------------------------------------------------------- 1 | fn fun(i:i64) : i64 = { 2 | i = emit{i64, 'newvar', i} 3 | if (emit{u1, 'test', i}) { i = emit{i64, 'mutvar', i} } 4 | i 5 | } 6 | export{'efn', fun} 7 | -------------------------------------------------------------------------------- /test/mut.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_fun i64 1 v0_i i64 2 | mut v0_i emit i64 'newvar' v0_i 3 | new v1 emit u1 'test' v0_i 4 | gotoF v1 l0 5 | mut v0_i emit i64 'mutvar' v0_i 6 | lbl l0 7 | ret v0_i 8 | endFn 9 | 10 | export 'efn' (i64)->i64 $f0_fun 11 | -------------------------------------------------------------------------------- /test/oper.in: -------------------------------------------------------------------------------- 1 | def a{b} = 0 2 | oper % a prefix 10 3 | oper & (a) prefix 10 4 | 5 | def outer = tup{%4, &4} 6 | local { 7 | def a{b} = 1 8 | def inner = tup{%4, &4} 9 | } 10 | 11 | c:__pnt{u8} = merge{outer, inner} 12 | -------------------------------------------------------------------------------- /test/oper.ir: -------------------------------------------------------------------------------- 1 | constant $c0_c *u8 !tup{!0:u8,!0:u8,!1:u8,!0:u8}:*u8 2 | 3 | -------------------------------------------------------------------------------- /test/oppar.in: -------------------------------------------------------------------------------- 1 | oper $ gen prefix 50.1 2 | oper $ gen infix none 50.1 3 | 4 | def gen{a}{b,c} = __mul{a,__sub{b,c}} 5 | def gen{a,b}{c} = __add{a,__mul{b,c}} 6 | 7 | fn fun() : f64 = { 20 ${3} ${4,5} 2 } 8 | 9 | fn fi() : f64 = { 10 | def o = ${4,5} 11 | (${3}){20, o{2}} 12 | } 13 | -------------------------------------------------------------------------------- /test/oppar.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_fun f64 0 2 | ret !18:f64 3 | endFn 4 | 5 | beginFn f1_fi f64 0 6 | ret !18:f64 7 | endFn 8 | 9 | -------------------------------------------------------------------------------- /test/partial.in: -------------------------------------------------------------------------------- 1 | fn f0() = emit{u32, 'out', 0, ..., 4, .}{..., 2, ., .}{1, ., 5}{3, ...}{} 2 | fn f1() = emit{u32, 'out', ., ...tup{1,2,3}, 4, ...}{0, 5, 6} 3 | -------------------------------------------------------------------------------- /test/partial.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_f0 u32 0 2 | new v0 emit u32 'out' 0 1 2 3 4 5 3 | ret v0 4 | endFn 5 | 6 | beginFn f1_f1 u32 0 7 | new v0 emit u32 'out' 0 1 2 3 4 5 6 8 | ret v0 9 | endFn 10 | 11 | -------------------------------------------------------------------------------- /test/proto.c: -------------------------------------------------------------------------------- 1 | static int32_t si_f1_fun_1(int32_t v0_x); 2 | 3 | static int32_t si_f0_x(int32_t v0_a) { 4 | int32_t v1 = si_f1_fun_1(v0_a); 5 | return v1; 6 | } 7 | 8 | static int32_t si_f1_fun_1(int32_t v0_x) { 9 | int32_t v1 = oper(v0_x, 1); 10 | return v1; 11 | } 12 | 13 | int32_t (*const 
x)(int32_t) = si_f0_x; 14 | 15 | -------------------------------------------------------------------------------- /test/proto.in: -------------------------------------------------------------------------------- 1 | fn fun{o}(x:i32) = { 2 | emit{i32, 'oper', x,o} 3 | } 4 | 5 | fn x(a:i32) : i32 = { 6 | call{fun{1},a} 7 | } 8 | export{'x', x} 9 | -------------------------------------------------------------------------------- /test/qual.in: -------------------------------------------------------------------------------- 1 | def g{x if isfloat {x}} = 0 2 | def g{x if isint {x}} = 1 3 | def g{x if issigned{x}} = 2 4 | 5 | fn fun(a:i64, b:u1, c:f32) : u8 = { 6 | emit{u8, 'out', g{a}, g{b}, g{c}} 7 | } 8 | -------------------------------------------------------------------------------- /test/qual.ir: -------------------------------------------------------------------------------- 1 | beginFn f0_fun u8 3 v0_a i64 v1_b u1 v2_c f32 2 | new v3 emit u8 'out' 2 1 0 3 | ret v3 4 | endFn 5 | 6 | -------------------------------------------------------------------------------- /test/run: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bqn 2 | 3 | cpu ← •Import "../arch.bqn" 4 | out ← ⟨•Out,•term.ErrRaw•ToUTF8∾(@+10)˙,!∘0,⊢⟩ 5 | par ← ⟨cpu‿⟨⟩‿⟨⟩‿out, 0‿cpu‿"si"‿out⟩ 6 | steps ← {""⊸𝕏}⌾(1⊸⊑) par •Import¨ "../singeli.bqn"‿"../emit_c.bqn" 7 | 8 | n ← ≠types ← "in"‿"ir"‿"c" 9 | files ← •file.List "." 10 | type‿name ← types⊸⊐⌾⊑ <˘⍉> (2∾˜·∨`⌾⌽'.'⊸=)⊸⊔¨ files 11 | files‿type‿name (⍋name≍˘type)⊸⊏¨↩ 12 | 13 | Test ← { 14 | l ← 1-˜≠ t ← 𝕨 (0<⊣)◶⟨•file.At⊢,•file.Chars⊢⟩¨ 𝕩 15 | m ← (1↓t) ≡¨ t {𝕎⎊@𝕩}´⟜⌽¨○(l⊸↑) (¯1+`n↑/⁼𝕨) ⊔ steps 16 | (¬m) / 2↕𝕩 17 | } 18 | gr ← ⊐∘⊣⌾((type [dump truncated here: the rest of run, plus the spread, tup, uload, undefined, varpar, and voidfn tests listed in the file tree, is missing; only the tail of a final export line survives] void $f0_fun 6 | -------------------------------------------------------------------------------- /test/vtype.c: -------------------------------------------------------------------------------- 1 | static bool si_f0_fun(__m64 v0_a, __m128i v1_b, __m256i v2_c, __m128d v3_d, __m256 v4_e) { 2 | return ((bool)0ull); 3 | } 4 | 5 | -------------------------------------------------------------------------------- /test/vtype.in: -------------------------------------------------------------------------------- 1 | fn fun(a:[2]i32, b:[4]u32, c:[256]u1, d:[2]f64, e:[8]f32) : u1 = { 0 } 2 | --------------------------------------------------------------------------------
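A closing note on the vtype pair above: it pins down how emit_c chooses x86 vector types, namely by total bit width and element class rather than by element count, so [2]i32 (64 bits) emits as __m64, [4]u32 as __m128i, [256]u1 (a 256-bit boolean vector carried in an integer register) as __m256i, [2]f64 as __m128d, and [8]f32 as __m256. If that reading is right, any two integer vectors of equal total width should emit as the same C type. The following is a hypothetical extension of the test, not part of the suite:

    # Hypothetical: [16]u8 and [8]u16 are both 128-bit integer vectors,
    # so both parameters would be expected to emit as __m128i.
    fn pair(a:[16]u8, b:[8]u16) : u1 = { 0 }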