├── tutorials
    ├── sdklayout-05-gemv
    │   ├── commands_wse2.sh
    │   ├── commands_wse3.sh
    │   ├── gemv-4-by-4.png
    │   └── mux.csl
    ├── sdklayout-02-routing
    │   ├── commands_wse2.sh
    │   ├── commands_wse3.sh
    │   ├── README.rst
    │   └── send_receive.csl
    ├── sdklayout-04-h2d-d2h
    │   ├── commands_wse2.sh
    │   ├── commands_wse3.sh
    │   ├── README.rst
    │   └── add2vec.csl
    ├── sdklayout-01-introduction
    │   ├── commands_wse2.sh
    │   ├── commands_wse3.sh
    │   ├── gv.csl
    │   └── README.rst
    ├── sdklayout-03-ports-and-connections
    │   ├── commands_wse2.sh
    │   ├── commands_wse3.sh
    │   ├── README.rst
    │   ├── receiver.csl
    │   ├── sender.csl
    │   └── add2vec.csl
    ├── gemv-03-memcpy
    │   ├── commands_wse2.sh
    │   ├── commands_wse3.sh
    │   ├── README.rst
    │   ├── layout.csl
    │   └── pe_program.csl
    ├── gemv-02-memory-dsds
    │   ├── commands_wse2.sh
    │   ├── commands_wse3.sh
    │   ├── layout.csl
    │   ├── README.rst
    │   └── run.py
    ├── gemv-01-complete-program
    │   ├── commands_wse2.sh
    │   ├── commands_wse3.sh
    │   ├── layout.csl
    │   ├── README.rst
    │   └── pe_program.csl
    ├── gemv-04-params
    │   ├── commands_wse2.sh
    │   ├── commands_wse3.sh
    │   ├── README.rst
    │   ├── layout.csl
    │   └── pe_program.csl
    ├── topic-15-wse3-microthreads
    │   ├── commands_wse3.sh
    │   ├── layout.csl
    │   ├── right_pe.csl
    │   ├── left_pe.csl
    │   ├── run.py
    │   └── README.rst
    ├── gemv-06-routes-1
    │   ├── commands_wse2.sh
    │   ├── commands_wse3.sh
    │   ├── layout.csl
    │   └── README.rst
    ├── gemv-07-routes-2
    │   ├── commands_wse2.sh
    │   ├── commands_wse3.sh
    │   └── layout.csl
    ├── gemv-05-multiple-pes
    │   ├── commands_wse2.sh
    │   ├── commands_wse3.sh
    │   ├── README.rst
    │   ├── layout.csl
    │   └── pe_program.csl
    ├── topic-06-switches
    │   ├── commands_wse2.sh
    │   ├── commands_wse3.sh
    │   ├── empty.csl
    │   └── README.rst
    ├── topic-08-filters
    │   ├── commands_wse2.sh
    │   ├── commands_wse3.sh
    │   ├── README.rst
    │   └── run.py
    ├── topic-01-arrays-and-pointers
    │   ├── commands_wse2.sh
    │   ├── commands_wse3.sh
    │   ├── README.rst
    │   ├── pe_program.csl
    │   ├── layout.csl
    │   └── run.py
    ├── topic-07-switches-entrypt
    │   ├── commands_wse2.sh
    │   ├── commands_wse3.sh
    │   ├── empty.csl
    │   └── README.rst
    ├── gemv-08-routes-3
    │   ├── commands_wse2.sh
    │   └── commands_wse3.sh
    ├── topic-10-map-builtin
    │   ├── commands_wse2.sh
    │   ├── commands_wse3.sh
    │   ├── README.rst
    │   └── layout.csl
    ├── topic-13-simprint
    │   ├── commands_wse2.sh
    │   └── commands_wse3.sh
    ├── topic-11-collectives
    │   ├── commands_wse2.sh
    │   ├── commands_wse3.sh
    │   └── README.rst
    ├── topic-12-debug-library
    │   ├── commands_wse2.sh
    │   ├── commands_wse3.sh
    │   └── README.rst
    ├── topic-02-libraries
    │   ├── commands_wse2.sh
    │   ├── commands_wse3.sh
    │   ├── README.rst
    │   └── layout.csl
    ├── topic-14-color-swap
    │   ├── commands_wse2.sh
    │   ├── README.rst
    │   └── pe_program.csl
    ├── topic-04-sparse-tensors
    │   ├── commands_wse2.sh
    │   ├── commands_wse3.sh
    │   ├── README.rst
    │   └── pe_program.csl
    ├── topic-03-streaming-wavelet-data
    │   ├── commands_wse2.sh
    │   ├── commands_wse3.sh
    │   ├── README.rst
    │   └── pe_program.csl
    ├── pipeline-01-basic
    │   ├── commands_wse2.sh
    │   ├── commands_wse3.sh
    │   ├── layout.csl
    │   ├── README.rst
    │   └── pe_program.csl
    ├── pipeline-02-fifo
    │   ├── commands_wse2.sh
    │   ├── commands_wse3.sh
    │   ├── README.rst
    │   └── layout.csl
    ├── pipeline-03-multiple
    │   ├── commands_wse2.sh
    │   ├── commands_wse3.sh
    │   ├── memcpy_edge
    │   │   ├── east.csl
    │   │   ├── north.csl
    │   │   ├── south.csl
    │   │   ├── west.csl
    │   │   └── d2h.csl
    │   ├── README.rst
    │   └── pe_program.csl
    ├── topic-09-fifos
    │   ├── commands_wse2.sh
    │   ├── commands_wse3.sh
    │   └── README.rst
    ├── gemv-09-streaming
    │   ├── commands_wse2.sh
    │   ├── commands_wse3.sh
    │   └── README.rst
    ├── topic-05-sentinels
    │   ├── commands_wse2.sh
    │   ├── commands_wse3.sh
    │   └── README.rst
    └── gemv-00-basic-syntax
    │   ├── README.rst
    │   └── code.csl
└── benchmarks
    ├── residual
        ├── images
        │   └── residual-memcpy-2-by-2.png
        ├── commands_wse2.sh
        ├── commands_wse3.sh
        ├── nrminf.csl
        ├── README.rst
        └── axpy.csl
    ├── gemv-checkerboard-pattern
        ├── images
        │   └── gemv-4-by-4.png
        ├── commands_wse2.sh
        ├── commands_wse3.sh
        └── README.rst
    ├── gemm-collectives_2d
        ├── commands_wse2.sh
        ├── commands_wse3.sh
        └── README.rst
    ├── cholesky
        ├── commands_wse2.sh
        ├── commands_wse3.sh
        └── launch.csl
    ├── game-of-life
        ├── commands_wse2.sh
        ├── commands_wse3.sh
        └── README.rst
    ├── mandelbrot
        ├── commands_wse2.sh
        ├── commands_wse3.sh
        ├── common.csl
        └── README.rst
    ├── gemv-collectives_2d
        ├── commands_wse2.sh
        ├── commands_wse3.sh
        └── README.rst
    ├── single-tile-matvec
        ├── commands_wse2.sh
        ├── commands_wse3.sh
        ├── compile.appliance.py
        └── src
        │   └── layout_matvec.csl
    ├── wide-multiplication
        ├── commands_wse2.sh
        ├── commands_wse3.sh
        └── README.rst
    ├── 25-pt-stencil
        ├── commands_wse2.sh
        ├── nop.csl
        ├── ic.py
        └── util.csl
    ├── histogram-torus
        └── commands_wse2.sh
    ├── row-col-broadcast
        ├── commands_wse2.sh
        ├── commands_wse3.sh
        ├── README.rst
        └── src
        │   └── sync
        │       └── layout.csl
    ├── fft-1d-2d
        ├── commands_wse2.sh
        ├── commands_wse3.sh
        ├── reshape.csl
        └── ucode_1d.csl
    ├── bandwidth-test
        ├── commands_wse2.sh
        ├── commands_wse3.sh
        ├── README.rst
        └── src
        │   └── sync
        │       └── layout.csl
    ├── fft-3d
        ├── commands_wse3.sh
        ├── layout.csl
        └── README.rst
    ├── 7pt-stencil-spmv
        ├── commands_wse2.sh
        ├── commands_wse3.sh
        └── README.rst
    ├── bicgstab
        ├── commands_wse2.sh
        └── commands_wse3.sh
    ├── power-method
        ├── commands_wse2.sh
        ├── commands_wse3.sh
        └── power_method.py
    ├── conjugate-gradient
        ├── commands_wse2.sh
        └── commands_wse3.sh
    ├── preconditioned-conjugate-gradient
        ├── commands_wse2.sh
        └── commands_wse3.sh
    ├── spmv-hypersparse
        ├── commands_wse2.sh
        ├── data
        │   └── rmat4.4x4.lb.mtx
        ├── src
        │   └── allreduce2R1E
        │   │   └── layout.csl
        └── README.rst
    └── benchmark-libs
        └── allreduce
            └── layout.csl


/tutorials/sdklayout-05-gemv/commands_wse2.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cs_python run.py --arch=wse2
6 | 


--------------------------------------------------------------------------------
/tutorials/sdklayout-05-gemv/commands_wse3.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cs_python run.py --arch=wse3
6 | 


--------------------------------------------------------------------------------
/tutorials/sdklayout-02-routing/commands_wse2.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cs_python run.py --arch=wse2
6 | 


--------------------------------------------------------------------------------
/tutorials/sdklayout-02-routing/commands_wse3.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cs_python run.py --arch=wse3
6 | 


--------------------------------------------------------------------------------
/tutorials/sdklayout-04-h2d-d2h/commands_wse2.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cs_python run.py --arch=wse2
6 | 


--------------------------------------------------------------------------------
/tutorials/sdklayout-04-h2d-d2h/commands_wse3.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cs_python run.py --arch=wse3
6 | 


--------------------------------------------------------------------------------
/tutorials/sdklayout-01-introduction/commands_wse2.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cs_python run.py --arch=wse2
6 | 


--------------------------------------------------------------------------------
/tutorials/sdklayout-01-introduction/commands_wse3.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cs_python run.py --arch=wse3
6 | 


--------------------------------------------------------------------------------
/tutorials/sdklayout-03-ports-and-connections/commands_wse2.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cs_python run.py --arch=wse2
6 | 


--------------------------------------------------------------------------------
/tutorials/sdklayout-03-ports-and-connections/commands_wse3.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cs_python run.py --arch=wse3
6 | 


--------------------------------------------------------------------------------
/tutorials/sdklayout-05-gemv/gemv-4-by-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Cerebras/csl-examples/HEAD/tutorials/sdklayout-05-gemv/gemv-4-by-4.png


--------------------------------------------------------------------------------
/benchmarks/residual/images/residual-memcpy-2-by-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Cerebras/csl-examples/HEAD/benchmarks/residual/images/residual-memcpy-2-by-2.png


--------------------------------------------------------------------------------
/benchmarks/gemv-checkerboard-pattern/images/gemv-4-by-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Cerebras/csl-examples/HEAD/benchmarks/gemv-checkerboard-pattern/images/gemv-4-by-4.png


--------------------------------------------------------------------------------
/tutorials/gemv-03-memcpy/commands_wse2.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse2 ./layout.csl --fabric-dims=8,3 \
6 | --fabric-offsets=4,1 -o out --memcpy --channels 1
7 | cs_python run.py --name out
8 | 


--------------------------------------------------------------------------------
/tutorials/gemv-03-memcpy/commands_wse3.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse3 ./layout.csl --fabric-dims=8,3 \
6 | --fabric-offsets=4,1 -o out --memcpy --channels 1
7 | cs_python run.py --name out
8 | 


--------------------------------------------------------------------------------
/tutorials/gemv-02-memory-dsds/commands_wse2.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse2 ./layout.csl --fabric-dims=8,3 \
6 | --fabric-offsets=4,1 -o out --memcpy --channels 1
7 | cs_python run.py --name out
8 | 


--------------------------------------------------------------------------------
/tutorials/gemv-02-memory-dsds/commands_wse3.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse3 ./layout.csl --fabric-dims=8,3 \
6 | --fabric-offsets=4,1 -o out --memcpy --channels 1
7 | cs_python run.py --name out
8 | 


--------------------------------------------------------------------------------
/tutorials/gemv-01-complete-program/commands_wse2.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse2 ./layout.csl --fabric-dims=8,3 \
6 | --fabric-offsets=4,1 -o out --memcpy --channels 1
7 | cs_python run.py --name out
8 | 


--------------------------------------------------------------------------------
/tutorials/gemv-01-complete-program/commands_wse3.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse3 ./layout.csl --fabric-dims=8,3 \
6 | --fabric-offsets=4,1 -o out --memcpy --channels 1
7 | cs_python run.py --name out
8 | 


--------------------------------------------------------------------------------
/tutorials/gemv-04-params/commands_wse2.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse2 ./layout.csl --fabric-dims=8,3 \
6 | --fabric-offsets=4,1 --params=M:4,N:6 -o out --memcpy --channels 1
7 | cs_python run.py --name out
8 | 


--------------------------------------------------------------------------------
/tutorials/gemv-04-params/commands_wse3.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse3 ./layout.csl --fabric-dims=8,3 \
6 | --fabric-offsets=4,1 --params=M:4,N:6 -o out --memcpy --channels 1
7 | cs_python run.py --name out
8 | 


--------------------------------------------------------------------------------
/tutorials/topic-15-wse3-microthreads/commands_wse3.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse3 ./layout.csl --fabric-dims=11,3 \
6 | --fabric-offsets=4,1 -o out --memcpy --channels 1
7 | cs_python run.py --name out
8 | 


--------------------------------------------------------------------------------
/tutorials/gemv-06-routes-1/commands_wse2.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse2 ./layout.csl --fabric-dims=11,3 \
6 | --fabric-offsets=4,1 --params=M:4,N:6 -o out --memcpy --channels 1
7 | cs_python run.py --name out
8 | 


--------------------------------------------------------------------------------
/tutorials/gemv-06-routes-1/commands_wse3.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse3 ./layout.csl --fabric-dims=11,3 \
6 | --fabric-offsets=4,1 --params=M:4,N:6 -o out --memcpy --channels 1
7 | cs_python run.py --name out
8 | 


--------------------------------------------------------------------------------
/tutorials/gemv-07-routes-2/commands_wse2.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse2 ./layout.csl --fabric-dims=9,4 \
6 | --fabric-offsets=4,1 --params=M:4,N:6 -o out --memcpy --channels 1
7 | cs_python run.py --name out
8 | 


--------------------------------------------------------------------------------
/tutorials/gemv-07-routes-2/commands_wse3.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse3 ./layout.csl --fabric-dims=9,4 \
6 | --fabric-offsets=4,1 --params=M:4,N:6 -o out --memcpy --channels 1
7 | cs_python run.py --name out
8 | 


--------------------------------------------------------------------------------
/tutorials/gemv-05-multiple-pes/commands_wse2.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse2 ./layout.csl --fabric-dims=11,3 \
6 | --fabric-offsets=4,1 --params=M:4,N:6,width:4 -o out --memcpy --channels 1
7 | cs_python run.py --name out
8 | 


--------------------------------------------------------------------------------
/tutorials/gemv-05-multiple-pes/commands_wse3.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse3 ./layout.csl --fabric-dims=11,3 \
6 | --fabric-offsets=4,1 --params=M:4,N:6,width:4 -o out --memcpy --channels 1
7 | cs_python run.py --name out
8 | 


--------------------------------------------------------------------------------
/tutorials/topic-06-switches/commands_wse2.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse2 ./layout.csl --fabric-dims=10,5 --fabric-offsets=4,1 -o out \
6 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
7 | cs_python run.py --name out
8 | 


--------------------------------------------------------------------------------
/tutorials/topic-06-switches/commands_wse3.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse3 ./layout.csl --fabric-dims=10,5 --fabric-offsets=4,1 -o out \
6 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
7 | cs_python run.py --name out
8 | 


--------------------------------------------------------------------------------
/tutorials/topic-08-filters/commands_wse2.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse2 ./layout.csl --fabric-dims=11,3 --fabric-offsets=4,1 -o out \
6 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
7 | cs_python run.py --name out
8 | 


--------------------------------------------------------------------------------
/tutorials/topic-08-filters/commands_wse3.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse3 ./layout.csl --fabric-dims=11,3 --fabric-offsets=4,1 -o out \
6 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
7 | cs_python run.py --name out
8 | 


--------------------------------------------------------------------------------
/benchmarks/gemm-collectives_2d/commands_wse2.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse2 ./layout.csl --fabric-dims=11,6 --fabric-offsets=4,1 \
6 | --params=P:4,Mt:14,Kt:14,Nt:14 \
7 | --memcpy --channels=1 -o out
8 | cs_python run.py --name out
9 | 


--------------------------------------------------------------------------------
/benchmarks/gemm-collectives_2d/commands_wse3.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse3 ./layout.csl --fabric-dims=11,6 --fabric-offsets=4,1 \
6 | --params=P:4,Mt:14,Kt:14,Nt:14 \
7 | --memcpy --channels=1 -o out
8 | cs_python run.py --name out
9 | 


--------------------------------------------------------------------------------
/tutorials/topic-01-arrays-and-pointers/commands_wse2.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse2 ./layout.csl --fabric-dims=8,3 --fabric-offsets=4,1 -o out \
6 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
7 | cs_python run.py --name out
8 | 


--------------------------------------------------------------------------------
/tutorials/topic-01-arrays-and-pointers/commands_wse3.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse3 ./layout.csl --fabric-dims=8,3 --fabric-offsets=4,1 -o out \
6 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
7 | cs_python run.py --name out
8 | 


--------------------------------------------------------------------------------
/tutorials/topic-07-switches-entrypt/commands_wse2.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse2 ./layout.csl --fabric-dims=10,5 --fabric-offsets=4,1 -o out \
6 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
7 | cs_python run.py --name out
8 | 


--------------------------------------------------------------------------------
/tutorials/topic-07-switches-entrypt/commands_wse3.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse3 ./layout.csl --fabric-dims=10,5 --fabric-offsets=4,1 -o out \
6 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
7 | cs_python run.py --name out
8 | 


--------------------------------------------------------------------------------
/tutorials/gemv-08-routes-3/commands_wse2.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse2 ./layout.csl --fabric-dims=11,5 \
6 | --fabric-offsets=4,1 --params=kernel_x_dim:4,kernel_y_dim:3,M:6,N:8 \
7 | -o out --memcpy --channels 1
8 | cs_python run.py --name out
9 | 


--------------------------------------------------------------------------------
/tutorials/gemv-08-routes-3/commands_wse3.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse3 ./layout.csl --fabric-dims=11,5 \
6 | --fabric-offsets=4,1 --params=kernel_x_dim:4,kernel_y_dim:3,M:6,N:8 \
7 | -o out --memcpy --channels 1
8 | cs_python run.py --name out
9 | 


--------------------------------------------------------------------------------
/benchmarks/cholesky/commands_wse2.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse2 ./layout.csl --fabric-dims=17,12 --fabric-offsets=4,1 \
6 | --params=P:10,Nt:4 -o out \
7 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
8 | cs_python run.py --name out
9 | 


--------------------------------------------------------------------------------
/benchmarks/cholesky/commands_wse3.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse3 ./layout.csl --fabric-dims=17,12 --fabric-offsets=4,1 \
6 | --params=P:10,Nt:4 -o out \
7 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
8 | cs_python run.py --name out
9 | 


--------------------------------------------------------------------------------
/benchmarks/game-of-life/commands_wse2.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse2 ./layout.csl --fabric-dims=19,14 --fabric-offsets=4,1 \
6 | --params=x_dim:12,y_dim:12 --memcpy --channels=1 -o out
7 | cs_python run.py --name out --initial-state glider --iters 20
8 | 


--------------------------------------------------------------------------------
/benchmarks/game-of-life/commands_wse3.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse3 ./layout.csl --fabric-dims=19,14 --fabric-offsets=4,1 \
6 | --params=x_dim:12,y_dim:12 --memcpy --channels=1 -o out
7 | cs_python run.py --name out --initial-state glider --iters 20
8 | 


--------------------------------------------------------------------------------
/tutorials/topic-10-map-builtin/commands_wse2.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse2 ./layout.csl \
6 | --fabric-dims=8,3 --fabric-offsets=4,1 --params=size:5 \
7 | -o out --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
8 | cs_python run.py --name out
9 | 


--------------------------------------------------------------------------------
/tutorials/topic-10-map-builtin/commands_wse3.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse3 ./layout.csl \
6 | --fabric-dims=8,3 --fabric-offsets=4,1 --params=size:5 \
7 | -o out --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
8 | cs_python run.py --name out
9 | 


--------------------------------------------------------------------------------
/benchmarks/mandelbrot/commands_wse2.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse2 ./code.csl --fabric-dims=11,6 --fabric-offsets=4,1 -o out \
6 | --params=MEMCPYD2H_DATA_1_ID:1 \
7 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
8 | cs_python run.py --name out
9 | 


--------------------------------------------------------------------------------
/benchmarks/mandelbrot/commands_wse3.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse3 ./code.csl --fabric-dims=11,6 --fabric-offsets=4,1 -o out \
6 | --params=MEMCPYD2H_DATA_1_ID:1 \
7 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
8 | cs_python run.py --name out
9 | 


--------------------------------------------------------------------------------
/benchmarks/gemv-collectives_2d/commands_wse2.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse2 ./layout.csl --fabric-dims=11,6 --fabric-offsets=4,1 \
6 | --params=kernel_rows:4,kernel_cols:4,matrix_rows:32,matrix_cols:16 \
7 | --memcpy --channels=1 -o out
8 | cs_python run.py --name out
9 | 


--------------------------------------------------------------------------------
/benchmarks/gemv-collectives_2d/commands_wse3.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse3 ./layout.csl --fabric-dims=11,6 --fabric-offsets=4,1 \
6 | --params=kernel_rows:4,kernel_cols:4,matrix_rows:32,matrix_cols:16 \
7 | --memcpy --channels=1 -o out
8 | cs_python run.py --name out
9 | 


--------------------------------------------------------------------------------
/tutorials/topic-13-simprint/commands_wse2.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse2 ./layout.csl --fabric-dims=11,3 \
6 | --fabric-offsets=4,1 --params=width:4,num_elems:5 -o out  \
7 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
8 | cs_python run.py --name out
9 | 


--------------------------------------------------------------------------------
/tutorials/topic-13-simprint/commands_wse3.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse3 ./layout.csl --fabric-dims=11,3 \
6 | --fabric-offsets=4,1 --params=width:4,num_elems:5 -o out  \
7 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
8 | cs_python run.py --name out
9 | 


--------------------------------------------------------------------------------
/tutorials/topic-11-collectives/commands_wse2.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse2 ./layout.csl --fabric-dims=22,17 --fabric-offsets=4,1 \
6 | --params=Pw:15,Ph:15,chunk_size:3 -o out \
7 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
8 | cs_python run.py --name out
9 | 


--------------------------------------------------------------------------------
/tutorials/topic-11-collectives/commands_wse3.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse3 ./layout.csl --fabric-dims=22,17 --fabric-offsets=4,1 \
6 | --params=Pw:15,Ph:15,chunk_size:3 -o out \
7 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
8 | cs_python run.py --name out
9 | 


--------------------------------------------------------------------------------
/tutorials/topic-12-debug-library/commands_wse2.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse2 ./layout.csl --fabric-dims=11,3 \
6 | --fabric-offsets=4,1 --params=width:4,num_elems:5 -o out  \
7 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
8 | cs_python run.py --name out
9 | 


--------------------------------------------------------------------------------
/tutorials/topic-12-debug-library/commands_wse3.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse3 ./layout.csl --fabric-dims=11,3 \
6 | --fabric-offsets=4,1 --params=width:4,num_elems:5 -o out  \
7 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
8 | cs_python run.py --name out
9 | 


--------------------------------------------------------------------------------
/tutorials/topic-02-libraries/commands_wse2.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse2 ./layout.csl --fabric-dims=8,3 --fabric-offsets=4,1 \
6 | --params=iterations:200 -o out \
7 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
8 | cs_python run.py --name out --tolerance 0.1
9 | 


--------------------------------------------------------------------------------
/tutorials/topic-02-libraries/commands_wse3.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e
4 | 
5 | cslc --arch=wse3 ./layout.csl --fabric-dims=8,3 --fabric-offsets=4,1 \
6 | --params=iterations:200 -o out \
7 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
8 | cs_python run.py --name out --tolerance 0.1
9 | 


--------------------------------------------------------------------------------
/benchmarks/single-tile-matvec/commands_wse2.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc ./src/layout_matvec.csl --arch wse2 --fabric-dims=9,4 \
 6 | --fabric-offsets=4,1 \
 7 | --params=width:2,height:2,tile_size:25,iters:1 \
 8 | -o out --memcpy --channels=1
 9 | cs_python ./run.py --name out --verify
10 | 


--------------------------------------------------------------------------------
/benchmarks/single-tile-matvec/commands_wse3.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc ./src/layout_matvec.csl --arch wse3 --fabric-dims=9,4 \
 6 | --fabric-offsets=4,1 \
 7 | --params=width:2,height:2,tile_size:25,iters:1 \
 8 | -o out --memcpy --channels=1
 9 | cs_python ./run.py --name out --verify
10 | 


--------------------------------------------------------------------------------
/benchmarks/residual/commands_wse2.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc ./layout.csl --arch=wse2 --fabric-dims=9,4 --fabric-offsets=4,1 \
 6 | --params=width:2,height:2 \
 7 | --params=LOCAL_OUT_SZ:3,LOCAL_IN_SZ:2 -o=out --memcpy --channels=1 \
 8 | --width-west-buf=0 --width-east-buf=0
 9 | cs_python run.py --name out
10 | 


--------------------------------------------------------------------------------
/benchmarks/residual/commands_wse3.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc ./layout.csl --arch=wse3 --fabric-dims=9,4 --fabric-offsets=4,1 \
 6 | --params=width:2,height:2 \
 7 | --params=LOCAL_OUT_SZ:3,LOCAL_IN_SZ:2 -o=out --memcpy --channels=1 \
 8 | --width-west-buf=0 --width-east-buf=0
 9 | cs_python run.py --name out
10 | 


--------------------------------------------------------------------------------
/tutorials/topic-14-color-swap/commands_wse2.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=11,3 \
 6 | --fabric-offsets=4,1 -o out  \
 7 | --params=width:4 \
 8 | --params=MEMCPYH2D_DATA_1_ID:6 \
 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
10 | cs_python run.py --name out
11 | 


--------------------------------------------------------------------------------
/tutorials/topic-04-sparse-tensors/commands_wse2.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=8,3 \
 6 | --fabric-offsets=4,1 -o out \
 7 | --params=MEMCPYH2D_DATA_1_ID:0 \
 8 | --params=MEMCPYD2H_DATA_1_ID:1 \
 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
10 | cs_python run.py --name out
11 | 


--------------------------------------------------------------------------------
/tutorials/topic-04-sparse-tensors/commands_wse3.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=8,3 \
 6 | --fabric-offsets=4,1 -o out \
 7 | --params=MEMCPYH2D_DATA_1_ID:0 \
 8 | --params=MEMCPYD2H_DATA_1_ID:1 \
 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
10 | cs_python run.py --name out
11 | 


--------------------------------------------------------------------------------
/tutorials/topic-03-streaming-wavelet-data/commands_wse2.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=8,3 \
 6 | --fabric-offsets=4,1 -o out \
 7 | --params=MEMCPYH2D_DATA_1_ID:0 \
 8 | --params=MEMCPYD2H_DATA_1_ID:1 \
 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
10 | cs_python run.py --name out
11 | 


--------------------------------------------------------------------------------
/tutorials/topic-03-streaming-wavelet-data/commands_wse3.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=8,3 \
 6 | --fabric-offsets=4,1 -o out \
 7 | --params=MEMCPYH2D_DATA_1_ID:0 \
 8 | --params=MEMCPYD2H_DATA_1_ID:1 \
 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
10 | cs_python run.py --name out
11 | 


--------------------------------------------------------------------------------
/benchmarks/wide-multiplication/commands_wse2.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc --arch=wse2 ./code.csl --fabric-dims=8,3 --fabric-offsets=4,1 -o out \
 6 | --params=num_bits:256 --params=MEMCPYH2D_DATA_1_ID:0 \
 7 | --params=MEMCPYD2H_DATA_1_ID:1 \
 8 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
 9 | cs_python run.py --name out
10 | 


--------------------------------------------------------------------------------
/benchmarks/wide-multiplication/commands_wse3.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc --arch=wse3 ./code.csl --fabric-dims=8,3 --fabric-offsets=4,1 -o out \
 6 | --params=num_bits:256 --params=MEMCPYH2D_DATA_1_ID:0 \
 7 | --params=MEMCPYD2H_DATA_1_ID:1 \
 8 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
 9 | cs_python run.py --name out
10 | 


--------------------------------------------------------------------------------
/tutorials/pipeline-01-basic/commands_wse2.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=8,3 \
 6 | --fabric-offsets=4,1 --params=size:12 -o out \
 7 | --params=MEMCPYH2D_DATA_1_ID:0 \
 8 | --params=MEMCPYD2H_DATA_1_ID:1 \
 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
10 | cs_python run.py --name out
11 | 


--------------------------------------------------------------------------------
/tutorials/pipeline-01-basic/commands_wse3.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=8,3 \
 6 | --fabric-offsets=4,1 --params=size:12 -o out \
 7 | --params=MEMCPYH2D_DATA_1_ID:0 \
 8 | --params=MEMCPYD2H_DATA_1_ID:1 \
 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
10 | cs_python run.py --name out
11 | 


--------------------------------------------------------------------------------
/tutorials/pipeline-02-fifo/commands_wse2.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=8,3 \
 6 | --fabric-offsets=4,1 --params=size:32 -o out \
 7 | --params=MEMCPYH2D_DATA_1_ID:0 \
 8 | --params=MEMCPYD2H_DATA_1_ID:1 \
 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
10 | cs_python run.py --name out
11 | 


--------------------------------------------------------------------------------
/tutorials/pipeline-02-fifo/commands_wse3.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=8,3 \
 6 | --fabric-offsets=4,1 --params=size:32 -o out \
 7 | --params=MEMCPYH2D_DATA_1_ID:0 \
 8 | --params=MEMCPYD2H_DATA_1_ID:1 \
 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
10 | cs_python run.py --name out
11 | 


--------------------------------------------------------------------------------
/tutorials/pipeline-03-multiple/commands_wse2.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=10,3 \
 6 | --fabric-offsets=4,1 --params=size:32 -o out \
 7 | --params=MEMCPYH2D_DATA_1_ID:0 \
 8 | --params=MEMCPYD2H_DATA_1_ID:1 \
 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
10 | cs_python run.py --name out
11 | 


--------------------------------------------------------------------------------
/tutorials/pipeline-03-multiple/commands_wse3.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=10,3 \
 6 | --fabric-offsets=4,1 --params=size:32 -o out \
 7 | --params=MEMCPYH2D_DATA_1_ID:0 \
 8 | --params=MEMCPYD2H_DATA_1_ID:1 \
 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
10 | cs_python run.py --name out
11 | 


--------------------------------------------------------------------------------
/tutorials/gemv-03-memcpy/README.rst:
--------------------------------------------------------------------------------
1 | GEMV 3: H2D and D2H Memcpy
2 | ==========================
3 | 
4 | The memcpy functionality of ``SdkRuntime`` allows the programmer to copy data
5 | between the host and device.
6 | Continuing from the previous example, we now extend it to include
7 | ``memcpy_h2d`` calls which copy data from the host to initialize ``A``, ``x``,
8 | and ``y`` on device.
9 | 


--------------------------------------------------------------------------------
/tutorials/topic-09-fifos/commands_wse2.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc --arch=wse2 ./layout.csl \
 6 | --fabric-dims=8,3 --fabric-offsets=4,1 \
 7 | --params=num_elems_to_process:512 \
 8 | --params=MEMCPYH2D_DATA_1_ID:4 \
 9 | --params=MEMCPYD2H_DATA_1_ID:5 \
10 | -o out \
11 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
12 | cs_python run.py --name out
13 | 


--------------------------------------------------------------------------------
/tutorials/topic-09-fifos/commands_wse3.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc --arch=wse3 ./layout.csl \
 6 | --fabric-dims=8,3 --fabric-offsets=4,1 \
 7 | --params=num_elems_to_process:512 \
 8 | --params=MEMCPYH2D_DATA_1_ID:4 \
 9 | --params=MEMCPYD2H_DATA_1_ID:5 \
10 | -o out \
11 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
12 | cs_python run.py --name out
13 | 


--------------------------------------------------------------------------------
/tutorials/gemv-09-streaming/commands_wse2.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=11,5 \
 6 | --fabric-offsets=4,1 --params=kernel_x_dim:4,kernel_y_dim:3,M:6,N:8 \
 7 | --params=MEMCPYH2D_DATA_1_ID:0 \
 8 | --params=MEMCPYH2D_DATA_2_ID:1 \
 9 | --params=MEMCPYD2H_DATA_1_ID:2 \
10 | -o out --memcpy --channels 1
11 | cs_python run.py --name out
12 | 


--------------------------------------------------------------------------------
/tutorials/gemv-09-streaming/commands_wse3.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=11,5 \
 6 | --fabric-offsets=4,1 --params=kernel_x_dim:4,kernel_y_dim:3,M:6,N:8 \
 7 | --params=MEMCPYH2D_DATA_1_ID:0 \
 8 | --params=MEMCPYH2D_DATA_2_ID:1 \
 9 | --params=MEMCPYD2H_DATA_1_ID:2 \
10 | -o out --memcpy --channels 1
11 | cs_python run.py --name out
12 | 


--------------------------------------------------------------------------------
/tutorials/topic-05-sentinels/commands_wse2.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=11,12 \
 6 | --fabric-offsets=4,1 -o out \
 7 | --params=MEMCPYH2D_DATA_1_ID:2 \
 8 | --params=MEMCPYH2D_DATA_2_ID:3 \
 9 | --params=MEMCPYD2H_DATA_1_ID:4 \
10 | --params=size:4 \
11 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
12 | cs_python run.py --name out
13 | 


--------------------------------------------------------------------------------
/tutorials/topic-05-sentinels/commands_wse3.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=11,12 \
 6 | --fabric-offsets=4,1 -o out \
 7 | --params=MEMCPYH2D_DATA_1_ID:2 \
 8 | --params=MEMCPYH2D_DATA_2_ID:3 \
 9 | --params=MEMCPYD2H_DATA_1_ID:4 \
10 | --params=size:4 \
11 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
12 | cs_python run.py --name out
13 | 


--------------------------------------------------------------------------------
/benchmarks/25-pt-stencil/commands_wse2.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc ./layout.csl --arch=wse2 --fabric-dims=17,12 --fabric-offsets=4,1 \
 6 | -o=out_code --params=width:10,height:10,zDim:10,sourceLength:10,dx:20 \
 7 | --params=srcX:0,srcY:0,srcZ:0 --verbose --memcpy --channels=1 \
 8 | --width-west-buf=0 --width-east-buf=0
 9 | cs_python run.py --name out \
10 | --iterations=10 --dx=20 --skip-compile
11 | 


--------------------------------------------------------------------------------
/benchmarks/histogram-torus/commands_wse2.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc --arch=wse2 ./code.csl \
 6 | --params=INPUT_SIZE:16,HIST_WIDTH:8,HIST_HEIGHT:8,NUM_BUCKETS:4,BUCKET_SIZE:2 \
 7 | --colors=OUT_COLOR:8 \
 8 | --fabric-dims=15,10 --fabric-offsets=4,1 -o out \
 9 | --params=MEMCPYH2D_DATA_1_ID:10 \
10 | --params=MEMCPYD2H_DATA_1_ID:11 \
11 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
12 | cs_python run.py --name out
13 | 


--------------------------------------------------------------------------------
/benchmarks/row-col-broadcast/commands_wse2.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc ./src/layout.csl --arch wse2 --fabric-dims=12,7 --fabric-offsets=4,1 \
 6 | --params=width:5,height:5,pe_length:5 --params=C0_ID:0 \
 7 | --params=C1_ID:1 --params=C2_ID:2 --params=C3_ID:3 --params=C4_ID:4 -o=out \
 8 | --memcpy --channels=2 --width-west-buf=0 --width-east-buf=0
 9 | cs_python ./run.py -m=5 -n=5 -k=5 --latestlink out --is_row_bcast --loop_count=1
10 | 


--------------------------------------------------------------------------------
/benchmarks/row-col-broadcast/commands_wse3.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc ./src/layout.csl --arch wse3 --fabric-dims=12,7 --fabric-offsets=4,1 \
 6 | --params=width:5,height:5,pe_length:5 --params=C0_ID:0 \
 7 | --params=C1_ID:1 --params=C2_ID:2 --params=C3_ID:3 --params=C4_ID:4 -o=out \
 8 | --memcpy --channels=2 --width-west-buf=0 --width-east-buf=0
 9 | cs_python ./run.py -m=5 -n=5 -k=5 --latestlink out --is_row_bcast --loop_count=1
10 | 


--------------------------------------------------------------------------------
/benchmarks/gemv-checkerboard-pattern/commands_wse2.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=11,6 --fabric-offsets=4,1 \
 6 | --colors=x_in:1,ax_out:3,b_in:4 -o out \
 7 | --params=kernel_rows:4,kernel_cols:4,matrix_rows:32,matrix_cols:16 \
 8 | --params=MEMCPYH2D_DATA_1_ID:10 --params=MEMCPYH2D_DATA_2_ID:11 \
 9 | --params=MEMCPYD2H_DATA_1_ID:12 \
10 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
11 | cs_python run.py --name out
12 | 


--------------------------------------------------------------------------------
/benchmarks/gemv-checkerboard-pattern/commands_wse3.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=11,6 --fabric-offsets=4,1 \
 6 | --colors=x_in:1,ax_out:3,b_in:4 -o out \
 7 | --params=kernel_rows:4,kernel_cols:4,matrix_rows:32,matrix_cols:16 \
 8 | --params=MEMCPYH2D_DATA_1_ID:10 --params=MEMCPYH2D_DATA_2_ID:11 \
 9 | --params=MEMCPYD2H_DATA_1_ID:12 \
10 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
11 | cs_python run.py --name out
12 | 


--------------------------------------------------------------------------------
/tutorials/topic-02-libraries/README.rst:
--------------------------------------------------------------------------------
 1 | Topic 2: Libraries
 2 | ==================
 3 | 
 4 | The CSL compiler comes bundled with a few standard libraries, which can be
 5 | imported into the user's program using the ``@import_module()`` builtin.  This
 6 | example shows three such compiler-bundled libraries:
 7 | 
 8 | * the ``random`` library for generating uniform random numbers,
 9 | * the ``timestamp`` library for reading the on-chip timestamp counter, and
10 | * the ``math`` library for square root.
11 | 


--------------------------------------------------------------------------------
/benchmarks/fft-1d-2d/commands_wse2.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=8,3 --fabric-offsets=4,1 \
 6 | --params=DIM:1,Nz:4,FP:2 --memcpy --channels=1 -o out-1D
 7 | cs_python run.py --name out-1D
 8 | cs_python run.py --inverse --name out-1D
 9 | cslc --arch=wse2 ./layout.csl --fabric-dims=11,3 --fabric-offsets=4,1 \
10 | --params=DIM:2,Nz:4,FP:1 --memcpy --channels=1 -o out-2D
11 | cs_python run.py --name out-2D
12 | cs_python run.py --inverse --name out-2D
13 | 


--------------------------------------------------------------------------------
/benchmarks/fft-1d-2d/commands_wse3.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=8,3 --fabric-offsets=4,1 \
 6 | --params=DIM:1,Nz:4,FP:2 --memcpy --channels=1 -o out-1D
 7 | cs_python run.py --name out-1D
 8 | cs_python run.py --inverse --name out-1D
 9 | cslc --arch=wse3 ./layout.csl --fabric-dims=11,3 --fabric-offsets=4,1 \
10 | --params=DIM:2,Nz:4,FP:1 --memcpy --channels=1 -o out-2D
11 | cs_python run.py --name out-2D
12 | cs_python run.py --inverse --name out-2D
13 | 


--------------------------------------------------------------------------------
/benchmarks/bandwidth-test/commands_wse2.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc ./src/bw_sync_layout.csl --arch wse2 --fabric-dims=12,7 --fabric-offsets=4,1 \
 6 | --params=width:5,height:5,pe_length:5 --params=C0_ID:0 \
 7 | --params=C1_ID:1 --params=C2_ID:2 --params=C3_ID:3 --params=C4_ID:4 -o=out \
 8 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
 9 | cs_python ./run.py -m=5 -n=5 -k=5 --latestlink out --channels=1 \
10 | --width-west-buf=0 --width-east-buf=0 --run-only --loop_count=1
11 | 


--------------------------------------------------------------------------------
/benchmarks/bandwidth-test/commands_wse3.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc ./src/bw_sync_layout.csl --arch wse3 --fabric-dims=12,7 --fabric-offsets=4,1 \
 6 | --params=width:5,height:5,pe_length:5 --params=C0_ID:0 \
 7 | --params=C1_ID:1 --params=C2_ID:2 --params=C3_ID:3 --params=C4_ID:4 -o=out \
 8 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
 9 | cs_python ./run.py -m=5 -n=5 -k=5 --latestlink out --channels=1 \
10 | --width-west-buf=0 --width-east-buf=0 --run-only --loop_count=1
11 | 


--------------------------------------------------------------------------------
/benchmarks/fft-3d/commands_wse3.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=11,6 --fabric-offsets=4,1 \
 6 | --params=N:16,NUM_PENCILS_PER_DIM:4,FP:1 --memcpy --channels=1 -o out
 7 | cs_python run.py --name out --real --norm 1
 8 | cs_python run.py --inverse --name out --norm 1
 9 | cslc --arch=wse3 ./layout.csl --fabric-dims=11,6 --fabric-offsets=4,1 \
10 | --params=N:16,NUM_PENCILS_PER_DIM:4,FP:0 --memcpy --channels=1 -o out
11 | cs_python run.py --name out
12 | cs_python run.py --inverse --name out
13 | 


--------------------------------------------------------------------------------
/tutorials/topic-08-filters/README.rst:
--------------------------------------------------------------------------------
 1 | Topic 8: Filters
 2 | ================
 3 | 
 4 | Fabric filters allow a PE to selectively accept incoming wavelets.  This example
 5 | shows the use of so-called range filters, which select the wavelets that are
 6 | forwarded to the CE based on the upper 16 bits of the wavelet contents.
 7 | Specifically, PE #0 sends all 12 wavelets to the other PEs, while each recipient
 8 | PE receives and processes only a quarter of the incoming wavelets.
 9 | See :ref:`language-builtins-filters` for other possible filter configurations.
10 | 


--------------------------------------------------------------------------------
/benchmarks/7pt-stencil-spmv/commands_wse2.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc ./src/layout.csl --arch wse2 --fabric-dims=12,7 --fabric-offsets=4,1 \
 6 | --params=width:5,height:5,MAX_ZDIM:5 --params=BLOCK_SIZE:2 --params=C0_ID:0 \
 7 | --params=C1_ID:1 --params=C2_ID:2 --params=C3_ID:3 --params=C4_ID:4 --params=C5_ID:5 \
 8 | --params=C6_ID:6 --params=C7_ID:7 --params=C8_ID:8 -o=out \
 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
10 | cs_python ./run.py -m=5 -n=5 -k=5 --latestlink out --channels=1 \
11 | --width-west-buf=0 --width-east-buf=0 --zDim=5 --run-only
12 | 


--------------------------------------------------------------------------------
/benchmarks/7pt-stencil-spmv/commands_wse3.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc ./src/layout.csl --arch wse3 --fabric-dims=12,7 --fabric-offsets=4,1 \
 6 | --params=width:5,height:5,MAX_ZDIM:5 --params=BLOCK_SIZE:2 --params=C0_ID:0 \
 7 | --params=C1_ID:1 --params=C2_ID:2 --params=C3_ID:3 --params=C4_ID:4 --params=C5_ID:5 \
 8 | --params=C6_ID:6 --params=C7_ID:7 --params=C8_ID:8 -o=out \
 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
10 | cs_python ./run.py -m=5 -n=5 -k=5 --latestlink out --channels=1 \
11 | --width-west-buf=0 --width-east-buf=0 --zDim=5 --run-only
12 | 


--------------------------------------------------------------------------------
/benchmarks/bicgstab/commands_wse2.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc ./src/layout.csl --arch wse2 --fabric-dims=12,7 --fabric-offsets=4,1 \
 6 | --params=width:5,height:5,MAX_ZDIM:5 --params=BLOCK_SIZE:2 --params=C0_ID:0 \
 7 | --params=C1_ID:1 --params=C2_ID:2 --params=C3_ID:3 --params=C4_ID:4 --params=C5_ID:5 \
 8 | --params=C6_ID:6 --params=C7_ID:7 --params=C8_ID:8 -o=out \
 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
10 | cs_python ./run.py -m=5 -n=5 -k=5 --latestlink out --channels=1 \
11 | --width-west-buf=0 --width-east-buf=0 --zDim=5 --run-only --max-ite=2
12 | 


--------------------------------------------------------------------------------
/benchmarks/bicgstab/commands_wse3.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc ./src/layout.csl --arch wse3 --fabric-dims=12,7 --fabric-offsets=4,1 \
 6 | --params=width:5,height:5,MAX_ZDIM:5 --params=BLOCK_SIZE:2 --params=C0_ID:0 \
 7 | --params=C1_ID:1 --params=C2_ID:2 --params=C3_ID:3 --params=C4_ID:4 --params=C5_ID:5 \
 8 | --params=C6_ID:6 --params=C7_ID:7 --params=C8_ID:8 -o=out \
 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
10 | cs_python ./run.py -m=5 -n=5 -k=5 --latestlink out --channels=1 \
11 | --width-west-buf=0 --width-east-buf=0 --zDim=5 --run-only --max-ite=2
12 | 


--------------------------------------------------------------------------------
/benchmarks/power-method/commands_wse2.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc ./src/layout.csl --arch wse2 --fabric-dims=12,7 --fabric-offsets=4,1 \
 6 | --params=width:5,height:5,MAX_ZDIM:5 --params=BLOCK_SIZE:2 --params=C0_ID:0 \
 7 | --params=C1_ID:1 --params=C2_ID:2 --params=C3_ID:3 --params=C4_ID:4 --params=C5_ID:5 \
 8 | --params=C6_ID:6 --params=C7_ID:7 --params=C8_ID:8 -o=out \
 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
10 | cs_python ./run.py -m=5 -n=5 -k=5 --latestlink out --channels=1 \
11 | --width-west-buf=0 --width-east-buf=0 --zDim=5 --run-only --max-ite=1
12 | 


--------------------------------------------------------------------------------
/benchmarks/power-method/commands_wse3.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc ./src/layout.csl --arch wse3 --fabric-dims=12,7 --fabric-offsets=4,1 \
 6 | --params=width:5,height:5,MAX_ZDIM:5 --params=BLOCK_SIZE:2 --params=C0_ID:0 \
 7 | --params=C1_ID:1 --params=C2_ID:2 --params=C3_ID:3 --params=C4_ID:4 --params=C5_ID:5 \
 8 | --params=C6_ID:6 --params=C7_ID:7 --params=C8_ID:8 -o=out \
 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
10 | cs_python ./run.py -m=5 -n=5 -k=5 --latestlink out --channels=1 \
11 | --width-west-buf=0 --width-east-buf=0 --zDim=5 --run-only --max-ite=1
12 | 


--------------------------------------------------------------------------------
/benchmarks/conjugate-gradient/commands_wse2.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc ./src/layout.csl --arch wse2 --fabric-dims=12,7 --fabric-offsets=4,1 \
 6 | --params=width:5,height:5,MAX_ZDIM:5 --params=BLOCK_SIZE:2 --params=C0_ID:0 \
 7 | --params=C1_ID:1 --params=C2_ID:2 --params=C3_ID:3 --params=C4_ID:4 --params=C5_ID:5 \
 8 | --params=C6_ID:6 --params=C7_ID:7 --params=C8_ID:8 -o=out \
 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
10 | cs_python ./run.py -m=5 -n=5 -k=5 --latestlink out --channels=1 \
11 | --width-west-buf=0 --width-east-buf=0 --zDim=5 --run-only --max-ite=2
12 | 


--------------------------------------------------------------------------------
/benchmarks/conjugate-gradient/commands_wse3.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc ./src/layout.csl --arch wse3 --fabric-dims=12,7 --fabric-offsets=4,1 \
 6 | --params=width:5,height:5,MAX_ZDIM:5 --params=BLOCK_SIZE:2 --params=C0_ID:0 \
 7 | --params=C1_ID:1 --params=C2_ID:2 --params=C3_ID:3 --params=C4_ID:4 --params=C5_ID:5 \
 8 | --params=C6_ID:6 --params=C7_ID:7 --params=C8_ID:8 -o=out \
 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
10 | cs_python ./run.py -m=5 -n=5 -k=5 --latestlink out --channels=1 \
11 | --width-west-buf=0 --width-east-buf=0 --zDim=5 --run-only --max-ite=2
12 | 


--------------------------------------------------------------------------------
/benchmarks/preconditioned-conjugate-gradient/commands_wse2.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc ./src/layout.csl --arch wse2 --fabric-dims=12,7 --fabric-offsets=4,1 \
 6 | --params=width:5,height:5,MAX_ZDIM:5 --params=BLOCK_SIZE:2 --params=C0_ID:0 \
 7 | --params=C1_ID:1 --params=C2_ID:2 --params=C3_ID:3 --params=C4_ID:4 --params=C5_ID:5 \
 8 | --params=C6_ID:6 --params=C7_ID:7 --params=C8_ID:8 -o=out \
 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
10 | cs_python ./run.py -m=5 -n=5 -k=5 --latestlink out --channels=1 \
11 | --width-west-buf=0 --width-east-buf=0 --zDim=5 --run-only --max-ite=2
12 | 


--------------------------------------------------------------------------------
/benchmarks/preconditioned-conjugate-gradient/commands_wse3.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc ./src/layout.csl --arch wse3 --fabric-dims=12,7 --fabric-offsets=4,1 \
 6 | --params=width:5,height:5,MAX_ZDIM:5 --params=BLOCK_SIZE:2 --params=C0_ID:0 \
 7 | --params=C1_ID:1 --params=C2_ID:2 --params=C3_ID:3 --params=C4_ID:4 --params=C5_ID:5 \
 8 | --params=C6_ID:6 --params=C7_ID:7 --params=C8_ID:8 -o=out \
 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
10 | cs_python ./run.py -m=5 -n=5 -k=5 --latestlink out --channels=1 \
11 | --width-west-buf=0 --width-east-buf=0 --zDim=5 --run-only --max-ite=2
12 | 


--------------------------------------------------------------------------------
/benchmarks/spmv-hypersparse/commands_wse2.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -e
 4 | 
 5 | cslc ./src/layout.csl --arch wse2 --fabric-dims=11,6 --fabric-offsets=4,1 \
 6 | --params=ncols:16,nrows:16,pcols:4,prows:4,max_local_nnz:8 \
 7 | --params=max_local_nnz_cols:4,max_local_nnz_rows:4,local_vec_sz:1 \
 8 | --params=local_out_vec_sz:1,y_pad_start_row_idx:4 -o=out \
 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0
10 | cs_python ./run.py --num_pe_cols=4 --num_pe_rows=4  --latestlink out --channels=1 \
11 | --width-west-buf=0 --width-east-buf=0 --is_weight_one --run-only \
12 | --infile_mtx=./data/rmat4.4x4.lb.mtx
13 | 


--------------------------------------------------------------------------------
/tutorials/sdklayout-02-routing/README.rst:
--------------------------------------------------------------------------------
 1 | .. _tutorials-sdklayout-02:
 2 | 
 3 | SdkLayout 2: Basic routing
 4 | ==========================
 5 | 
 6 | This tutorial demonstrates how to define routes between the
 7 | PEs of a code region using symbolic colors.
 8 | 
 9 | The key point here is that the colors that we use for the routes
10 | are symbolic (i.e., without physical values). This means that
11 | the ``SdkLayout`` compiler will assign the values automatically.
12 | 
13 | For debugging purposes, the ``SdkLayout`` compiler will emit
14 | a JSON file called ``colors.json`` that contains the allocated
15 | physical color values.
16 | 


--------------------------------------------------------------------------------
/benchmarks/25-pt-stencil/nop.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | 


--------------------------------------------------------------------------------
/tutorials/topic-01-arrays-and-pointers/README.rst:
--------------------------------------------------------------------------------
 1 | Topic 1: Arrays and Pointers
 2 | ============================
 3 | 
 4 | Arrays can only be passed to or returned from functions used at compile-time.
 5 | For functions used at runtime, pointers should be used instead.  This example
 6 | demonstrates a function ``increment_and_sum()``, which accepts a pointer to an
 7 | array and a pointer to a scalar.  When declaring an array pointer, CSL requires
 8 | that the type specification contain the size of the array.  CSL does not have
 9 | a null pointer.
10 | 
11 | Pointers are dereferenced using the ``.*`` syntax.  Once dereferenced, they can
12 | be used just like non-pointer variables; for example, ``(data_ptr.*)[0]``
13 | accesses the first element of the array.
14 | 


--------------------------------------------------------------------------------
/tutorials/sdklayout-03-ports-and-connections/README.rst:
--------------------------------------------------------------------------------
 1 | .. _tutorials-sdklayout-03:
 2 | 
 3 | SdkLayout 3: Ports and connections
 4 | ==================================
 5 | 
 6 | This tutorial demonstrates how to attach ports to code regions
 7 | and then connect those ports together. It instantiates two
 8 | code regions that send data to a third code region. The receiving
 9 | code region adds the input streams element-wise and then sends
10 | the result out and towards a fourth code region that saves the
11 | result on device memory.
12 | 
13 | There are two kinds of ports: input ports and output ports. It is
14 | only possible to connect an output port to an input port. When
15 | we do that the ``SdkLayout`` compiler will automatically find and
16 | encode a path between them.
17 | 


--------------------------------------------------------------------------------
/tutorials/topic-04-sparse-tensors/README.rst:
--------------------------------------------------------------------------------
 1 | Topic 4: Wavelets for Sparse Tensors
 2 | ====================================
 3 | 
 4 | When tensors are sparse, it is wasteful to send zero values.  Since wavelet
 5 | payloads are 32 bits wide, we can use the lower 16 bits to contain data as
 6 | usual, but we can also use the upper 16 bits to contain the index of the value.
 7 | 
 8 | This example illustrates the latter, where each wavelet of the incoming tensor
 9 | has the index field populated in the upper 16 bits.  Accordingly, the task
10 | definition uses two function arguments: one for the lower 16 bits and
11 | another for the upper 16 bits.
12 | 
13 | Optionally, the programmer may also declare a task with just one argument of
14 | type ``u32`` for receiving 32-bit data.
15 | 


--------------------------------------------------------------------------------
/tutorials/gemv-04-params/README.rst:
--------------------------------------------------------------------------------
 1 | GEMV 4: Parameters
 2 | ==================
 3 | 
 4 | Parameter values are compile-time constants, which implies that the compiler
 5 | is fully aware of their precise value.
 6 | This enables the programmer to not just change the program’s behavior at
 7 | runtime, but it also enables the programmer to change the program’s
 8 | compilation.
 9 | 
10 | Continuing on from the previous example, we add two compile-time parameters
11 | to the ``layout.csl`` file that specify the dimensions ``M`` and ``N`` of our
12 | problem, instead of hardcoding them in ``pe_program.csl``.
13 | When the program is compiled, the user specifies ``M`` and ``N`` in the
14 | compile command. ``layout.csl`` also sets these parameter values in
15 | ``pe_program.csl`` in its ``@set_tile_code`` call.
16 | 


--------------------------------------------------------------------------------
/tutorials/topic-03-streaming-wavelet-data/README.rst:
--------------------------------------------------------------------------------
 1 | Topic 3: Streaming Wavelet Data
 2 | ===============================
 3 | 
 4 | Often, CSL programs contain tasks that are activated in response to the
 5 | arrival of wavelets of specific colors. Such tasks are also called
 6 | Wavelet-Triggered Tasks, or data tasks.
 7 | 
 8 | In this example, the ``comptime`` block binds a data task to a ``data_task_id``
 9 | created from a ``memcpy`` streaming color, which receives data from the host.
10 | The programmer must not define the routing of the color ``MEMCPYH2D_DATA_1``.
11 | The ``memcpy`` module will figure out the routing of ``MEMCPYH2D_DATA_1``.
12 | 
13 | Given the task and color association and the route, when a wavelet of
14 | color ``MEMCPYH2D_DATA_1`` arrives at the router, it is forwarded to the CE,
15 | which then activates ``main_task``.  The wavelet's payload field is received in
16 | the argument to the task, and the code uses the wavelet data to update a global
17 | variable.
18 | 


--------------------------------------------------------------------------------
/tutorials/sdklayout-01-introduction/gv.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | // Code parameter specified by the host using 'set_param_all'.
16 | param value: i16;
17 | export var gv: i16;  // Exported global; 'main' stores 'value' into it.
18 | 
19 | const main_id = @get_local_task_id(8);  // Local task ID 8 identifies 'main'.
20 | task main() void {
21 |   gv = value;  // Copy the parameter into the exported variable.
22 | }
23 | 
24 | comptime {
25 |   @bind_local_task(main, main_id);  // Bind 'main' to its local task ID.
26 |   @activate(main_id);  // Activate the task from the comptime block.
27 | }
28 | 


--------------------------------------------------------------------------------
/tutorials/gemv-02-memory-dsds/layout.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | const memcpy = @import_module("<memcpy/get_params>", .{ .width = 1, .height = 1 });
16 | 
17 | layout {
18 |   @set_rectangle(1, 1);  // The program occupies a 1 x 1 rectangle (a single PE).
19 |   @set_tile_code(0, 0, "pe_program.csl", .{ .memcpy_params = memcpy.get_params(0) });  // PE (0, 0) runs pe_program.csl.
20 | 
21 |   // Export symbol names: the f32 pointer "y" and the function "init_and_compute".
22 |   @export_name("y", [*]f32, false);  // NOTE(review): 'false' is presumably the mutability flag; confirm.
23 |   @export_name("init_and_compute", fn()void);
24 | }
25 | 


--------------------------------------------------------------------------------
/tutorials/topic-06-switches/empty.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | // Every PE needs to import the memcpy module; otherwise the I/O cannot
16 | // propagate the data to the destination.
17 | 
18 | param memcpy_params: comptime_struct;
19 | 
20 | const sys_mod = @import_module("<memcpy/memcpy>", memcpy_params);
21 | 
22 | fn main_fn() void {
23 |   sys_mod.unblock_cmd_stream();  // The only action: unblock the memcpy command stream.
24 | }
25 | 
26 | comptime {
27 |   @export_symbol(main_fn);  // Export main_fn as a symbol.
28 | }
29 | 


--------------------------------------------------------------------------------
/tutorials/topic-07-switches-entrypt/empty.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | // Every PE needs to import the memcpy module; otherwise the I/O cannot
16 | // propagate the data to the destination.
17 | 
18 | param memcpy_params: comptime_struct;
19 | 
20 | const sys_mod = @import_module("<memcpy/memcpy>", memcpy_params);
21 | 
22 | fn main_fn() void {
23 |   sys_mod.unblock_cmd_stream();  // The only action: unblock the memcpy command stream.
24 | }
25 | 
26 | comptime {
27 |   @export_symbol(main_fn);  // Export main_fn as a symbol.
28 | }
29 | 


--------------------------------------------------------------------------------
/tutorials/topic-06-switches/README.rst:
--------------------------------------------------------------------------------
 1 | Topic 6: Switches
 2 | =================
 3 | 
 4 | Fabric switches permit limited runtime control of routes.
 5 | 
 6 | In this example, the ``layout`` block initializes the default route to receive
 7 | wavelets from the ramp and forward them to the PE's north neighbor.  However, it
 8 | also defines routes for switch positions 1, 2, and 3.  The hardware updates the
 9 | route according to the specified switch positions when it receives a so-called
10 | Control Wavelet.
11 | 
12 | For the payload of the control wavelet, the code creates a special wavelet using
13 | the helper function ``encode_single_payload()`` from the ``<control>`` library.
14 | The program then sends out a data wavelet along the newly-switched color.
15 | 
16 | Switches can be helpful not just to change the routing configuration in limited
17 | ways at runtime, but also to save the number of colors used.  For instance, this
18 | same example could be re-written to use four colors and four routes, but by
19 | using fabric switches, this example uses just one color.
20 | 


--------------------------------------------------------------------------------
/tutorials/sdklayout-04-h2d-d2h/README.rst:
--------------------------------------------------------------------------------
 1 | .. _tutorials-sdklayout-04:
 2 | 
 3 | SdkLayout 4: Host-to-device and device-to-host data streaming
 4 | =============================================================
 5 | 
 6 | This tutorial demonstrates how we can connect ports to the
 7 | host to allow us to stream data in and out of the WSE.
 8 | 
 9 | It uses the 'add2vec' code region that was also used in
10 | tutorial :ref:`sdkruntime-sdklayout-03-ports-and-connections` but instead of
11 | using sender/receiver code regions it creates streams directly
12 | to/from the host.
13 | 
14 | Similar to connections between input and output ports (see tutorial
15 | :ref:`sdkruntime-sdklayout-03-ports-and-connections`) paths to/from ports
16 | to/from the edge of the wafer are produced automatically.
17 | 
18 | For now, it is only possible to create input/output streams
19 | to/from single-PE ports. If a port consists of more than one PE then
20 | an adaptor layer must be created explicitly to funnel the data
21 | through a single PE port. The next tutorial shows an example
22 | of such a configuration.
23 | 


--------------------------------------------------------------------------------
/tutorials/topic-07-switches-entrypt/README.rst:
--------------------------------------------------------------------------------
 1 | Topic 7: Switches and Control Entrypoints
 2 | =========================================
 3 | 
 4 | Following on from the last topic example, we can also encode a special
 5 | task ID inside of a control wavelet. When that control wavelet is forwarded
 6 | to the CE of the receiving PE, it will activate a task known as a control
 7 | task which is bound to that ID.
 8 | 
 9 | The lower 16 bits of the control wavelet can be used to store an optional
10 | data payload for that control task. Here, we encode the same values
11 | sent to the PEs as normal data wavelets in the previous example.
12 | 
13 | Note that a PE router will move to a new switch position only after the
14 | control wavelet carrying the switch command passes through that PE.
15 | Therefore all control wavelets will continue to be routed using the current
16 | switch position setting and the new switch position will only affect
17 | subsequent wavelets. Thus, the data payload of a control wavelet is received
18 | by the PE connected by the current switch position, not the new position.
19 | 


--------------------------------------------------------------------------------
/benchmarks/power-method/power_method.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Cerebras Systems.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import numpy as np
16 | from numpy import linalg as LA
17 | 
18 | 
19 | def power_method(A_csr, x0, max_ite):  # Runs max_ite power iterations from x0 and returns the final normalized vector.
20 |   prev_mu = 0  # Previous estimate; only used for the progress printout.
21 |   nrm2_x = LA.norm(x0, 2)
22 |   x = x0 / nrm2_x  # Scale the initial vector to unit 2-norm.
23 |   for i in range(max_ite):
24 |     y = A_csr.dot(x)  # y = A x
25 |     mu = np.dot(x, y)  # Estimate mu = x . (A x), with x of unit 2-norm.
26 |     print(f"i = {i}, mu = {mu}, |prev_mu - mu| = {abs(mu - prev_mu)}")
27 |     nrm2_x = LA.norm(y, 2)
28 |     x = y / nrm2_x  # Renormalize y to obtain the next iterate.
29 |     prev_mu = mu
30 |   return x  # Only the vector is returned; mu is printed but not returned.
31 | 


--------------------------------------------------------------------------------
/tutorials/pipeline-03-multiple/memcpy_edge/east.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | // send data to the "core" (both parameters default to {})
16 | param USER_IN_1 = {};
17 | param USER_IN_2 = {};
18 | 
19 | // receive data from the "core" (defaults to {})
20 | param USER_OUT_1 = {};
21 | 
22 | param memcpy_params: comptime_struct;
23 | 
24 | const edge_mod = @import_module("memcpy_edge.csl", .{  // Forward every parameter to the shared memcpy_edge.csl module.
25 |   .memcpy_params = memcpy_params,
26 |   .USER_IN_1 = USER_IN_1,
27 |   .USER_IN_2 = USER_IN_2,
28 |   .USER_OUT_1 = USER_OUT_1,
29 |   .dir = WEST  // NOTE(review): east.csl passes WEST, presumably the direction toward the core; confirm in memcpy_edge.csl.
30 | });
31 | 


--------------------------------------------------------------------------------
/tutorials/pipeline-03-multiple/memcpy_edge/north.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | // send data to the "core" (both parameters default to {})
16 | param USER_IN_1 = {};
17 | param USER_IN_2 = {};
18 | 
19 | // receive data from the "core" (defaults to {})
20 | param USER_OUT_1 = {};
21 | 
22 | param memcpy_params: comptime_struct;
23 | 
24 | const edge_mod = @import_module("memcpy_edge.csl", .{  // Forward every parameter to the shared memcpy_edge.csl module.
25 |   .memcpy_params = memcpy_params,
26 |   .USER_IN_1 = USER_IN_1,
27 |   .USER_IN_2 = USER_IN_2,
28 |   .USER_OUT_1 = USER_OUT_1,
29 |   .dir = SOUTH  // NOTE(review): north.csl passes SOUTH, presumably the direction toward the core; confirm in memcpy_edge.csl.
30 | });
31 | 


--------------------------------------------------------------------------------
/tutorials/pipeline-03-multiple/memcpy_edge/south.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | // send data to the "core" (both parameters default to {})
16 | param USER_IN_1 = {};
17 | param USER_IN_2 = {};
18 | 
19 | // receive data from the "core" (defaults to {})
20 | param USER_OUT_1 = {};
21 | 
22 | param memcpy_params: comptime_struct;
23 | 
24 | const edge_mod = @import_module("memcpy_edge.csl", .{  // Forward every parameter to the shared memcpy_edge.csl module.
25 |   .memcpy_params = memcpy_params,
26 |   .USER_IN_1 = USER_IN_1,
27 |   .USER_IN_2 = USER_IN_2,
28 |   .USER_OUT_1 = USER_OUT_1,
29 |   .dir = NORTH  // NOTE(review): south.csl passes NORTH, presumably the direction toward the core; confirm in memcpy_edge.csl.
30 | });
31 | 


--------------------------------------------------------------------------------
/tutorials/pipeline-03-multiple/memcpy_edge/west.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | // send data to the "core" (both parameters default to {})
16 | param USER_IN_1 = {};
17 | param USER_IN_2 = {};
18 | 
19 | // receive data from the "core" (defaults to {})
20 | param USER_OUT_1 = {};
21 | 
22 | param memcpy_params: comptime_struct;
23 | 
24 | const edge_mod = @import_module("memcpy_edge.csl", .{  // Forward every parameter to the shared memcpy_edge.csl module.
25 |   .memcpy_params = memcpy_params,
26 |   .USER_IN_1 = USER_IN_1,
27 |   .USER_IN_2 = USER_IN_2,
28 |   .USER_OUT_1 = USER_OUT_1,
29 |   .dir = EAST  // NOTE(review): west.csl passes EAST, presumably the direction toward the core; confirm in memcpy_edge.csl.
30 | });
31 | 


--------------------------------------------------------------------------------
/benchmarks/cholesky/launch.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | param memcpy_params: comptime_struct;
16 | param Nt: u16;
17 | 
18 | var tile = @zeros([Nt*Nt]f32);  // Zero-initialized buffer of Nt*Nt f32 values.
19 | 
20 | var ptr_tile : [*]f32 = &tile;  // Pointer to 'tile'; exported below under the name "tile".
21 | 
22 | const sys_mod = @import_module( "<memcpy/memcpy>", memcpy_params);
23 | 
24 | fn f_chol() void {  // Performs no computation in this file; it only unblocks the command stream.
25 |   // WARNING: the user must unblock cmd color for every PE
26 |   sys_mod.unblock_cmd_stream();
27 | }
28 | 
29 | comptime{
30 |   @export_symbol(ptr_tile, "tile");  // Export the buffer pointer as "tile".
31 | 
32 |   @export_symbol(f_chol);  // Export the function f_chol.
33 | }
34 | 


--------------------------------------------------------------------------------
/tutorials/gemv-03-memcpy/layout.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | const memcpy = @import_module("<memcpy/get_params>", .{ .width = 1, .height = 1 });
16 | 
17 | layout {
18 |   @set_rectangle(1, 1);  // The program occupies a 1 x 1 rectangle (a single PE).
19 |   @set_tile_code(0, 0, "pe_program.csl", .{ .memcpy_params = memcpy.get_params(0) });  // PE (0, 0) runs pe_program.csl.
20 | 
21 |   // Export symbol names: four f32 pointers and the function "init_and_compute".
22 |   @export_name("A", [*]f32, true);  // NOTE(review): the boolean is presumably the mutability flag; confirm.
23 |   @export_name("x", [*]f32, true);
24 |   @export_name("b", [*]f32, true);
25 |   @export_name("y", [*]f32, false);  // Only "y" is exported with 'false'.
26 |   @export_name("init_and_compute", fn()void);
27 | }
28 | 


--------------------------------------------------------------------------------
/tutorials/pipeline-03-multiple/README.rst:
--------------------------------------------------------------------------------
 1 | 
 2 | Pipeline 3: Add an artificial halo
 3 | ==================================
 4 | 
 5 | The disadvantage of the FIFO in the previous example is its resource consumption.
 6 | The FIFO requires two microthreads and a scratch buffer.
 7 | 
 8 | A simple workaround is to move the FIFO outside the kernel. We add another
 9 | halo, which we call an artificial halo, around the kernel (``pe_program.csl``).
10 | The west side is ``west.csl`` and the east side is ``east.csl``.
11 | ``west.csl`` implements a FIFO to receive the data from H2D.
12 | ``east.csl`` implements a FIFO to receive the data from ``pe_program.csl``
13 | and redirect it to D2H.
14 | 
15 | There is no more FIFO in ``pe_program.csl``. Instead, we replace the colors
16 | ``MEMCPYH2D_DATA_1`` by ``Cin`` and ``MEMCPYD2H_DATA_1`` by ``Cout``.
17 | The color ``Cin`` receives data from the west to the ramp.
18 | The color ``Cout`` sends the data from ramp to the east.
19 | 
20 | This example has the same property as ``pipeline-02-fifo``: as long as the
21 | parameter ``size`` does not exceed the capacity of the FIFO in ``west.csl``,
22 | H2D can always finish so the ``@add16`` can progress.
23 | 


--------------------------------------------------------------------------------
/benchmarks/gemm-collectives_2d/README.rst:
--------------------------------------------------------------------------------
 1 | GEMM with Collective Operations
 2 | ===============================
 3 | 
 4 | This program implements the SUMMA matrix multiplication algorithm and serves
 5 | as an example of using the ``collectives_2d`` library together with
 6 | ``SdkRuntime`` and the ``memcpy`` framework.
 7 | 
 8 | The host code first copies tiles of ``A`` and ``B`` onto their corresponding
 9 | PEs. It then uses the remote procedure call (RPC) mechanism to launch the
10 | function ``main``, at which point the GEMM computation begins.
11 | 
12 | We perform GEMM in ``P`` many steps on a grid of ``P x P`` processors.
13 | At each step ``i``, PEs in the ith column broadcast their home tiles of ``A``
14 | to other PEs in their row, and PEs in the ith row broadcast their home
15 | tiles of ``B`` to other PEs in their column. Once both broadcasts are complete
16 | as determined by ``x_done()`` and ``y_done()`` both being activated,
17 | each PE computes ``C_tile += Ap * Bp`` where ``Ap`` and ``Bp`` are pointers to
18 | either the PE's home tile or the tile it received through broadcasts.
19 | 
20 | When computation is complete the host copies back the resulting tiles of
21 | ``C`` from the device.
22 | 


--------------------------------------------------------------------------------
/benchmarks/row-col-broadcast/README.rst:
--------------------------------------------------------------------------------
 1 | Host-to-Device Broadcast Test
 2 | =============================
 3 | 
 4 | This example shows how to use row or column broadcast. For example if the user
 5 | wants to broadcast a column of data [1.0, 2.0, 3.0, 4.0] to a region of interest
 6 | starting from (1,1) with width 3 and height 4, one element per PE, the H2D API
 7 | requires the user to prepare the following 3-by-4 tensor,
 8 | 
 9 | .. code-block::
10 | 
11 |    | 1.0  1.0  1.0 |
12 |    | 2.0  2.0  2.0 |
13 |    | 3.0  3.0  3.0 |
14 |    | 4.0  4.0  4.0 |
15 | 
16 | and use the ``memcpy_h2d()`` API to stream 12 elements into the device. This
17 | operation uses 3x the host bandwidth that is actually needed.
18 | Now the user can use the new API, ``memcpy_h2d_rowbcast()``, to stream 4
19 | elements only.
20 | 
21 | Similarly, for column broadcasting, the user only needs to provide one row of
22 | data and use the ``memcpy_h2d_colbcast()`` API.
23 | 
24 | The new broadcasting scheme only supports H2D, not D2H.
25 | 
26 | The kernel of ``row-col-broadcast`` is the same as ``bandwidth-test``.
27 | The ``run.py`` calculates the bandwidth as well.
28 | The formula of the bandwidth calculation is the same as ``bandwidth-test``,
29 | so the user can see how much time this new API can save.
30 | 


--------------------------------------------------------------------------------
/tutorials/gemv-04-params/layout.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | param M: i16;  // Forwarded unchanged to pe_program.csl below.
16 | param N: i16;  // Forwarded unchanged to pe_program.csl below.
17 | 
18 | const memcpy = @import_module("<memcpy/get_params>", .{ .width = 1, .height = 1 });
19 | 
20 | layout {
21 |   @set_rectangle(1, 1);  // The program occupies a 1 x 1 rectangle (a single PE).
22 |   @set_tile_code(0, 0, "pe_program.csl", .{  // PE (0, 0) runs pe_program.csl with the parameters below.
23 |     .memcpy_params = memcpy.get_params(0),
24 |     .M = M,
25 |     .N = N
26 |   });
27 | 
28 |   // Export symbol names: four f32 pointers and the function "init_and_compute".
29 |   @export_name("A", [*]f32, true);  // NOTE(review): the boolean is presumably the mutability flag; confirm.
30 |   @export_name("x", [*]f32, true);
31 |   @export_name("b", [*]f32, true);
32 |   @export_name("y", [*]f32, false);  // Only "y" is exported with 'false'.
33 |   @export_name("init_and_compute", fn()void);
34 | }
35 | 


--------------------------------------------------------------------------------
/tutorials/gemv-05-multiple-pes/README.rst:
--------------------------------------------------------------------------------
 1 | GEMV 5: Multiple PEs
 2 | ====================
 3 | 
 4 | Continuing on from the previous example, we now extend our program to use
 5 | multiple PEs.
 6 | 
 7 | The number of PEs used in this program is set at compile-time with the ``width``
 8 | parameter.
 9 | Note that ``layout.csl`` uses this parameter to set the size of the program
10 | with the call to ``@set_rectangle``.
11 | The dimensions of a grid of PEs are always specified as width by height (or,
12 | alternatively, number of columns by number of rows), and individual PEs are
13 | indexed by (x, y), or, in other words, (column number, row number).
14 | 
15 | This program involves no communication between PEs; we only duplicate the same
16 | workload on each PE.
17 | In ``run.py``, the ``memcpy_h2d`` calls now specify that data is copied into
18 | ``width x 1`` PEs, beginning at the upper left corner (0, 0) of the program
19 | rectangle.
20 | Because we are copying the same data to each PE, we use ``np.tile`` to repeat
21 | the data in ``A``, ``x``, and ``b`` multiple times.
22 | The ``memcpy_d2h`` call copies back the resulting ``y`` from each PE into
23 | an array of size ``M x width``.
24 | 
25 | The next example will expand this example to demonstrate simple communication
26 | between PEs.
27 | 


--------------------------------------------------------------------------------
/tutorials/gemv-09-streaming/README.rst:
--------------------------------------------------------------------------------
 1 | GEMV 9: Memcpy Streaming Mode
 2 | =============================
 3 | 
 4 | We present an alternative version of the previous example,
 5 | in which we use the ``streaming`` mode of ``memcpy`` to stream ``x`` and ``b``
 6 | onto the device, and stream ``y`` off of the device.
 7 | All of the previous examples used the ``copy`` mode of ``memcpy``.
 8 | This example is meant to simply present the basics of ``streaming`` mode,
 9 | and future tutorials will demonstrate some use cases for this mode.
10 | 
11 | The host code no longer includes an explicit kernel launch.
12 | Instead, computation is started by the wavelet-triggered tasks that receive
13 | elements of ``x`` and ``b`` along the top row and left column of PEs,
14 | respectively.
15 | We finish computation when the kernel streams back the result ``y``
16 | to the host.
17 | 
18 | The colors ``MEMCPYH2D_DATA_1`` and ``MEMCPYH2D_DATA_2`` are used
19 | to stream ``x`` and ``b`` onto the device, respectively,
20 | while ``MEMCPYD2H_DATA_1`` is used to stream ``y`` off the device.
21 | 
22 | Note that, because ``memcpy`` commands are serialized, the order of these
23 | ``streaming`` mode ``memcpy_h2d`` calls in this example is important.
24 | If the ``b`` values were streamed in before ``x``, the program would hang.
25 | 


--------------------------------------------------------------------------------
/benchmarks/single-tile-matvec/compile.appliance.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 Cerebras Systems.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import json
16 | 
17 | from cerebras.sdk.client import SdkCompiler # pylint: disable=import-error,no-name-in-module
18 | 
19 | hash_filename = "hash.json"  # output file for the value returned by compile()
20 | # Run the compile with SdkCompiler used as a context manager.
21 | with SdkCompiler() as compiler:
22 |   # Arguments: source directory, top-level CSL file, then a single option string.
23 |   hashstr = compiler.compile(
24 |       "./src",
25 |       "layout_matvec.csl",
26 |       "--arch wse3 --fabric-dims=9,4 --fabric-offsets=4,1 "  # adjacent literals are joined
27 |       "--params=width:2,height:2,tile_size:25,iters:1 -o latest --memcpy --channels=1",
28 |   )
29 |   # hashstr is what this script reports as the compile artifact.
30 |   print("compile artifact:", hashstr)
31 |   # Persist hashstr as JSON in hash_filename.
32 |   print(f"dump artifact name to file {hash_filename}")
33 |   with open(hash_filename, "w", encoding="utf-8") as write_file:
34 |     json.dump(hashstr, write_file)
35 | 


--------------------------------------------------------------------------------
/tutorials/sdklayout-03-ports-and-connections/receiver.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | param size: u16;  // length of `data` and extent of the fabric input DSD
16 | param rx: color;  // color used by the fabric input DSD and by the input queue
17 | 
18 | const in_q = @get_input_queue(0);
19 | // Exported buffer that the received values are written into.
20 | export var data: [size]u16;
21 | // Memory DSD over data[0 .. size-1].
22 | const data_dsd = @get_dsd(mem1d_dsd, .{.tensor_access = |i|{size} -> data[i]});
23 | // Fabric input DSD: extent `size`, color `rx`, read through `in_q`.
24 | const input = @get_dsd(fabin_dsd, .{.extent = size,
25 |                                     .fabric_color = rx,
26 |                                     .input_queue = in_q});
27 | // Local task with ID 8: starts an asynchronous 16-bit move from the fabric into `data`.
28 | const main_id = @get_local_task_id(8);
29 | task main() void {
30 |   @mov16(data_dsd, input, .{.async = true});
31 | }
32 | 
33 | comptime {
34 |   @bind_local_task(main, main_id);
35 |   @activate(main_id);  // main is activated from the comptime block
36 |   // Initialize the input queue with color `rx` (done unconditionally in this file).
37 |   @initialize_queue(in_q, .{.color = rx});
38 | }
39 | 


--------------------------------------------------------------------------------
/tutorials/topic-11-collectives/README.rst:
--------------------------------------------------------------------------------
 1 | Topic 11: Collective Communications
 2 | ===================================
 3 | 
 4 | The ``<collectives_2d>`` library can be used for communication between PEs in
 5 | the same row or column. It mimics the capabilities provided by
 6 | `message passing interface <https://www.open-mpi.org/>`_ (MPI)
 7 | collective operations found in other programming languages.
 8 | 
 9 | This example showcases each of the currently available communication primitives
10 | while using the library across two independent dimensions. The communication
11 | tasks are executed asynchronously.
12 | 
13 | ``task_x`` uses the ``broadcast`` primitive to transmit data from the first PE
14 | in every row to every other PE in the same row. After the data is received,
15 | ``reduce_fadds`` computes the vector sum of the ``broadcast_recv``. The result
16 | is transmitted back to the first PE in every row.
17 | 
18 | ``task_y`` operates concurrently along every column of PEs. The task first
19 | uses ``scatter`` to distribute ``chunk_size`` slices of ``scatter_data``
20 | across every other PE in the same column. The task uses ``gather`` to collect
21 | ``chunk_size`` slices of data distributed by ``scatter``. Because ``scatter``
22 | is the inverse of ``gather``, we have used collective communications to
23 | transmit the data from ``scatter_data`` to ``gather_recv``.
24 | 


--------------------------------------------------------------------------------
/benchmarks/fft-3d/layout.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | param FP: i16;   // Precision: 0 == float16, 1 == float32
16 | 
17 | param N: u16;   // FFT size in each dimension
18 | param NUM_PENCILS_PER_DIM: u16; // Pencils in each dimension per PE
19 | // NOTE(review): WIDTH is declared i16 while N and NUM_PENCILS_PER_DIM are u16 - confirm this is intended.
20 | // Number of PEs for FFT in both X and Y dimension
21 | param WIDTH: i16 = N / NUM_PENCILS_PER_DIM;
22 | // Element type selected by FP; any value other than 0 selects f32.
23 | const tensor_type: type = if (FP == 0) f16 else f32;
24 | // memcpy parameters for a WIDTH x WIDTH rectangle of PEs.
25 | const memcpy = @import_module("<memcpy/get_params>", .{
26 |   .width = WIDTH,
27 |   .height = WIDTH,
28 | });
29 | // The memcpy parameter module is handed to the FFT layout helper.
30 | const fft_helper = @import_module("<kernels/fft/fft3d_layout>", .{
31 |   .width = WIDTH,
32 |   .memcpy = memcpy,
33 | });
34 | 
35 | layout {
36 |   @set_rectangle(WIDTH, WIDTH);
37 |   fft_helper.FFT_kernel(WIDTH, N, tensor_type);  // defined in the imported library, not in this file
38 | }
39 | 


--------------------------------------------------------------------------------
/tutorials/topic-12-debug-library/README.rst:
--------------------------------------------------------------------------------
 1 | Topic 12: Debug Library
 2 | =======================
 3 | 
 4 | This example shows a program that uses the tracing mechanism of the ``<debug>``
 5 | library to record variable values and compile time strings as well as
 6 | timestamps, for inspection by the host code.
 7 | 
 8 | The program uses a row of four contiguous PEs.
 9 | The first PE sends an array of values to three receiver PEs.
10 | Each PE program contains a global variable named ``global``, initialized to
11 | zero.
12 | When the data task ``recv_task`` on the receiver PE is activated by an incoming
13 | wavelet ``in_data``, ``global`` is incremented by ``2 * in_data``.
14 | 
15 | The programs running on each PE import two instances of the ``<debug>`` library.
16 | On the receiver PEs, each time a task activates, the instance named ``trace``
17 | logs a compile time string noting that the task has begun execution, and the
18 | updated value of ``global``.
19 | The instance named ``times`` logs a timestamp at the beginning of a task, and
20 | at the end of a task.
21 | 
22 | The host code uses the function ``read_trace`` from
23 | ``cerebras.sdk.debug.debug_util`` to read the logged values after execution of
24 | the device code finishes.
25 | Note that the PE coordinates passed to ``read_trace`` start from the northwest
26 | corner of the fabric, not from the northwest corner of the program rectangle.
27 | 


--------------------------------------------------------------------------------
/benchmarks/residual/nrminf.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | // http://www.netlib.org/lapack/explore-html/d6/d12/snrm2_8f90.html
16 | //  SNRMINF returns the maximum absolute value (infinity norm) of a vector
17 | //     SNRMINF = max(|x|)
18 | //
19 | // @param[in] n       number of elements of the vector x
20 | // @param[in] x       array of dimension n
21 | // @param[out] result scalar
22 | //                    result = max(|x|)
23 | 
24 | param sizeX: i16; // size of x, sizeX >= n
25 | // Stores max(|x[i]|) over the first n entries of x into *result.
26 | fn snrminf(n: i16, x: *[sizeX]f32, result: *f32) void {
27 |   // Running maximum; it is still 0.0 if the loop body never executes (e.g. n == 0).
28 |   var nrm_r: f32 = 0.0;
29 | 
30 |   for (@range(i16, n)) |row| {
31 |     var yi: f32 = (x.*)[row];
32 |     if (0.0 > yi) {  // take the absolute value of the entry
33 |       yi = -yi;
34 |     }
35 |     if (nrm_r < yi) {  // keep the largest absolute value seen so far
36 |       nrm_r = yi;
37 |     }
38 |   }
39 |   (result.*) = nrm_r;
40 | }
41 | 


--------------------------------------------------------------------------------
/tutorials/sdklayout-03-ports-and-connections/sender.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | param size: u16;  // number of elements of `data` sent; extent of the fabric output DSD
16 | param tx: color;  // color used by the fabric output DSD and by the output queue
17 | 
18 | const out_q = @get_output_queue(0);
19 | // NOTE(review): `data` has a fixed length of 10 while both DSDs below use `size`; size is presumably expected to be <= 10 - confirm.
20 | export var data = [10]u16{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
21 | // Memory DSD over data[0 .. size-1].
22 | const data_dsd = @get_dsd(mem1d_dsd, .{.tensor_access = |i|{size} -> data[i]});
23 | // Fabric output DSD: extent `size`, color `tx`, written through `out_q`.
24 | const output = @get_dsd(fabout_dsd, .{.extent = size,
25 |                                       .fabric_color = tx,
26 |                                       .output_queue = out_q});
27 | // Local task with ID 8: starts an asynchronous 16-bit move from `data` to the fabric.
28 | const main_id = @get_local_task_id(8);
29 | task main() void {
30 |   @mov16(output, data_dsd, .{.async = true});
31 | }
32 | 
33 | comptime {
34 |   @bind_local_task(main, main_id);
35 |   @activate(main_id);  // main is activated from the comptime block
36 |   // The output queue is initialized only when compiling for WSE-3.
37 |   if (@is_arch("wse3")) {
38 |     @initialize_queue(out_q, .{.color = tx});
39 |   }
40 | }
41 | 


--------------------------------------------------------------------------------
/tutorials/topic-14-color-swap/README.rst:
--------------------------------------------------------------------------------
 1 | Topic 14: Color Swap
 2 | ====================
 3 | 
 4 | This example demonstrates the color swap feature of WSE-2.
 5 | CSL currently does not support color swap on WSE-3, and support
 6 | is in development.
 7 | 
 8 | This program uses a row of four contiguous PEs.
 9 | Two colors, ``red`` (color 0) and ``blue`` (color 1), are used.
10 | On all PEs, the routing associated with these colors receives
11 | from the ``WEST`` and sends down the ``RAMP`` and ``EAST``.
12 | Additionally, for both colors, ``swap_color_x`` is set to ``true``.
13 | Because these colors differ only in their lowest bit, when a
14 | ``red`` wavelet comes into a router from ``WEST``, it leaves the
15 | router to the ``EAST`` as a ``blue`` wavelet, and vice versa.
16 | 
17 | The host code sends four wavelets along the color ``MEMCPYH2D_DATA_1``
18 | into the first PE. The WTT of ``MEMCPYH2D_DATA_1`` forwards this data
19 | to color ``blue``. When a PE receives a ``red`` wavelet, the task
20 | ``red_task`` is activated, and when a PE receives a ``blue`` wavelet,
21 | the task ``blue_task`` is activated.
22 | 
23 | Each PE program contains a global variable named ``sum``,
24 | initialized to zero.
25 | When a ``red_task`` is activated by an incoming wavelet ``in_data``,
26 | ``sum`` is incremented by an amount ``in_data``.
27 | When a ``blue_task`` is activated by an incoming wavelet ``in_data``,
28 | ``sum`` is incremented by an amount ``2 * in_data``.
29 | 


--------------------------------------------------------------------------------
/benchmarks/spmv-hypersparse/data/rmat4.4x4.lb.mtx:
--------------------------------------------------------------------------------
  1 | %%MatrixMarket matrix coordinate real general
  2 | 16 16 108
  3 | 1 2 1
  4 | 1 4 1
  5 | 1 5 1
  6 | 1 6 1
  7 | 1 7 1
  8 | 1 8 1
  9 | 1 9 1
 10 | 1 10 1
 11 | 1 11 1
 12 | 1 13 1
 13 | 1 14 1
 14 | 1 15 1
 15 | 1 16 1
 16 | 2 1 1
 17 | 2 2 1
 18 | 2 3 1
 19 | 2 5 1
 20 | 2 9 1
 21 | 2 10 1
 22 | 2 12 1
 23 | 2 13 1
 24 | 3 1 1
 25 | 3 5 1
 26 | 3 8 1
 27 | 3 10 1
 28 | 3 13 1
 29 | 4 1 1
 30 | 4 5 1
 31 | 5 1 1
 32 | 5 2 1
 33 | 5 6 1
 34 | 5 7 1
 35 | 5 9 1
 36 | 5 10 1
 37 | 5 11 1
 38 | 5 13 1
 39 | 5 14 1
 40 | 5 15 1
 41 | 5 16 1
 42 | 6 1 1
 43 | 6 4 1
 44 | 6 5 1
 45 | 6 6 1
 46 | 6 10 1
 47 | 6 14 1
 48 | 6 15 1
 49 | 7 1 1
 50 | 7 2 1
 51 | 7 5 1
 52 | 7 9 1
 53 | 7 13 1
 54 | 8 1 1
 55 | 8 3 1
 56 | 8 11 1
 57 | 8 13 1
 58 | 9 1 1
 59 | 9 3 1
 60 | 9 4 1
 61 | 9 5 1
 62 | 9 6 1
 63 | 9 7 1
 64 | 9 10 1
 65 | 9 13 1
 66 | 9 14 1
 67 | 9 15 1
 68 | 10 1 1
 69 | 10 2 1
 70 | 10 5 1
 71 | 10 7 1
 72 | 10 9 1
 73 | 10 10 1
 74 | 10 13 1
 75 | 11 8 1
 76 | 11 9 1
 77 | 11 10 1
 78 | 11 14 1
 79 | 12 1 1
 80 | 12 2 1
 81 | 12 7 1
 82 | 12 9 1
 83 | 13 1 1
 84 | 13 5 1
 85 | 13 6 1
 86 | 13 7 1
 87 | 13 8 1
 88 | 13 9 1
 89 | 13 10 1
 90 | 13 11 1
 91 | 13 14 1
 92 | 13 15 1
 93 | 14 1 1
 94 | 14 2 1
 95 | 14 3 1
 96 | 14 5 1
 97 | 14 6 1
 98 | 14 9 1
 99 | 14 11 1
100 | 14 12 1
101 | 14 13 1
102 | 14 14 1
103 | 15 1 1
104 | 15 4 1
105 | 15 5 1
106 | 15 6 1
107 | 15 9 1
108 | 15 13 1
109 | 16 10 1
110 | 16 14 1
111 | 


--------------------------------------------------------------------------------
/tutorials/gemv-05-multiple-pes/layout.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | // matrix dimensions on each PE
16 | param M: i16;
17 | param N: i16;
18 | 
19 | // number of PEs in program
20 | param width: i16;
21 | // memcpy parameters for a width x 1 rectangle of PEs
22 | const memcpy = @import_module("<memcpy/get_params>", .{
23 |   .width = width,
24 |   .height = 1
25 | });
26 | 
27 | layout {
28 |   // PE coordinates are (column, row)
29 |   @set_rectangle(width, 1);
30 |   for (@range(i16, width)) |x| {
31 |     @set_tile_code(x, 0, "pe_program.csl", .{
32 |       .memcpy_params = memcpy.get_params(x),  // looked up per column x
33 |       .M = M,
34 |       .N = N
35 |     });
36 |   }
37 | 
38 |   // export symbol names
39 |   @export_name("A", [*]f32, true);
40 |   @export_name("x", [*]f32, true);
41 |   @export_name("b", [*]f32, true);
42 |   @export_name("y", [*]f32, false);  // last argument is false here, unlike A, x and b
43 |   @export_name("compute", fn()void);
44 | }
45 | 


--------------------------------------------------------------------------------
/tutorials/topic-10-map-builtin/README.rst:
--------------------------------------------------------------------------------
 1 | Topic 10: @map Builtin
 2 | ======================
 3 | 
 4 | The ``@map`` builtin can be used to perform custom operations on the data
 5 | elements of one or more DSDs. In other words, it is a
 6 | *customizable DSD operation* that allows us to go beyond the
 7 | :ref:`fixed list <language-builtins-for-dsd-operations>` of
 8 | natively supported DSD operations.
 9 | 
10 | This example demonstrates three use-cases of the ``@map`` builtin:
11 | 
12 | 1. In the first use-case, ``@map`` is used to compute the square-root of the
13 |    diagonal elements of a 2D tensor.
14 | 2. In the second use-case ``@map`` is used to perform a custom calculation with
15 |    a mix of input DSDs of various kinds (``mem1d_dsd`` and ``fabin_dsd``) and
16 |    scalar values while the result is stored to a ``mem1d_dsd``. It shows how we
17 |    can use arbitrary callbacks combined with a variety of input and output DSDs.
18 | 3. Finally, we demonstrate how ``@map`` can be used to compute a reduction like
19 |    the sum of all elements in a tensor.
20 | 
21 | Without ``@map``, we would have to write explicit loops iterating over each
22 | element involved in these computations. With ``@map`` we can avoid writing such
23 | loops by utilizing the DSD descriptions which specify the loop structure
24 | implicitly. Since DSDs are supported natively by the hardware, using ``@map``
25 | can lead to significant performance gains compared to writing explicit loops.
26 | 


--------------------------------------------------------------------------------
/benchmarks/mandelbrot/common.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | const math = @import_module("<math>");
16 | // Maps column index c linearly onto [x_lo, x_hi]: c == 0 gives x_lo, c == cols - 1 gives x_hi (up to rounding).
17 | fn get_x(c: u32, x_lo: f32, x_hi: f32, cols: u32) f32 {
18 |   return @as(f32, c) * (x_hi - x_lo) / @as(f32, cols - 1)  + x_lo;  // divisor is zero when cols == 1
19 | }
20 | // Maps row index r linearly onto [y_lo, y_hi]: r == 0 gives y_lo, r == rows - 1 gives y_hi (up to rounding).
21 | fn get_y(r: u32, y_lo: f32, y_hi: f32, rows: u32) f32 {
22 |   return @as(f32, r) * (y_hi - y_lo) / @as(f32, rows - 1)  + y_lo;  // divisor is zero when rows == 1
23 | }
24 | // Runs up to max_iters steps of z = z*z + (x + i*y), with z held in (*rp, *ip); adds 1.0 to *iters per completed step.
25 | fn mandelbrot(max_iters: u32, rp: *f32, ip: *f32, iters: *f32, x: f32, y: f32) void {
26 |   // *iters is only incremented here, never reset, so the caller supplies its starting value.
27 |   for (@range(u32, max_iters)) |i| {
28 |     // Snapshot the current point so both updates below use the old values.
29 |     const real = rp.*;
30 |     const imag = ip.*;
31 |     // Stop once the magnitude exceeds 2.0; *rp, *ip and *iters keep their current values.
32 |     if (math.sqrt_f32(real * real + imag * imag) > 2.0) {
33 |       break;
34 |     }
35 |     // z*z: real part re^2 - im^2, imaginary part 2*re*im (written as a sum).
36 |     rp.* = real * real - imag * imag;
37 |     ip.* = real * imag + real * imag;
38 |     // Add the constant (x, y).
39 |     rp.* += x;
40 |     ip.* += y;
41 | 
42 |     iters.* += 1.0;
43 |   }
44 | }
45 | 


--------------------------------------------------------------------------------
/benchmarks/single-tile-matvec/src/layout_matvec.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | param width: u16;      // PE columns of the program rectangle
16 | param height: u16;     // PE rows of the program rectangle
17 | param tile_size: u16;  // forwarded to pe_matvec.csl as `nb`
18 | param iters: u16;      // forwarded to pe_matvec.csl as `iters`
19 | 
20 | const memcpy = @import_module("<memcpy/get_params>", .{
21 |   .width = width,
22 |   .height = height,
23 | });
24 | 
25 | layout {
26 |   @set_rectangle(width, height);
27 |   // Every PE runs pe_matvec.csl; memcpy parameters are looked up once per column px.
28 |   for (@range(u16, width)) |px| {
29 |     const memcpy_params = memcpy.get_params(px);
30 |     for (@range(u16, height)) |py| {
31 |       @set_tile_code(px, py, "pe_matvec.csl", .{ .memcpy_params = memcpy_params,
32 |         .nb = tile_size, .iters = iters});
33 |     }
34 |   }
35 | 
36 |   // export symbol names
37 |   @export_name("A", [*]f32, true);
38 |   @export_name("x", [*]f32, true);
39 |   @export_name("y", [*]f32, true);
40 |   @export_name("maxmin_time", [*]f32, true);
41 |   @export_name("compute", fn()void);
42 | }
43 | 


--------------------------------------------------------------------------------
/tutorials/topic-05-sentinels/README.rst:
--------------------------------------------------------------------------------
 1 | Topic 5: Sentinels
 2 | ==================
 3 | 
 4 | In previous programs, we used so-called routable colors, which are associated
 5 | with a route to direct the flow of wavelets.
 6 | On WSE-2, task IDs which can receive data wavelets are in the range 0 through
 7 | 23, corresponding to the IDs of the colors.
 8 | On WSE-3, task IDs which can receive data wavelets are in the range 0 through
 9 | 7, corresponding to input queues which are bound to a routable color.
10 | We have also used local tasks, which on WSE-2 can be associated with any task
11 | ID from 0 to 30, and on WSE-3 can be associated with any task ID from 8 to 30.
12 | 
13 | This example demonstrates the use of a non-routable control task ID to signal
14 | the end of an input tensor.
15 | We call this use of a control task ID a *sentinel*.
16 | 
17 | In this example, the host sends to a receiving PE (``sentinel.csl``) the number
18 | of wavelets that the receiving PE should expect to receive, followed by the
19 | stream of data.
20 | The receiving PE then sends the data to its neighbor (``pe_program.csl``),
21 | followed by a *control wavelet* which specifies the control task ID that the
22 | neighbor will activate.
23 | 
24 | Since sentinel control task IDs are not routable colors, the programmer does
25 | not specify a route, but does need to bind the control task ID to a control
26 | task, which will be activated upon receipt of the sentinel wavelet.
27 | Here, the sentinel activates the ``send_result`` task, which relays the
28 | result of the sum reduction back to the host.
29 | 


--------------------------------------------------------------------------------
/tutorials/topic-01-arrays-and-pointers/pe_program.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | // Not a complete program; the top-level source file is layout.csl
16 | 
17 | param memcpy_params: comptime_struct;
18 | 
19 | const sys_mod = @import_module("<memcpy/memcpy>", memcpy_params);
20 | // One-element result buffer; result_ptr is the symbol exported under the name "result".
21 | var result: [1]i16;
22 | var result_ptr: [*]i16 = &result;
23 | // Adds 1 to each of the three array elements in place, then stores their sum through result_ptr.
24 | fn increment_and_sum(data_ptr: *[3]i16, result_ptr: *i16) void {  // parameter result_ptr has the same name as the global above
25 |   // Write an updated value to each element of the array
26 |   (data_ptr.*)[0] += 1;
27 |   (data_ptr.*)[1] += 1;
28 |   (data_ptr.*)[2] += 1;
29 | 
30 |   // Read all array values, sum them, and write the result
31 |   result_ptr.* = (data_ptr.*)[0] + (data_ptr.*)[1] + (data_ptr.*)[2];
32 | }
33 | 
34 | fn f_run() void {
35 |   var data = [3]i16 { 1, 2, 3 };
36 |   // With data = {1, 2, 3} the call leaves {2, 3, 4} in data and writes 9 to result[0].
37 |   increment_and_sum(&data, &result[0]);
38 |   // NOTE(review): presumably tells the memcpy library this host-called function is done - confirm in the memcpy docs.
39 |   sys_mod.unblock_cmd_stream();
40 | }
41 | 
42 | comptime {
43 |   @export_symbol(result_ptr, "result");
44 |   @export_symbol(f_run);
45 | }
46 | 


--------------------------------------------------------------------------------
/tutorials/gemv-01-complete-program/layout.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | // Import memcpy layout module for 1 x 1 grid of PEs
16 | // This module defines parameters passed to program on the single PE
17 | const memcpy = @import_module("<memcpy/get_params>", .{ .width = 1, .height = 1 });
18 | 
19 | layout {
20 | 
21 |   // Use just one PE (columns=1, rows=1)
22 |   @set_rectangle(1, 1);
23 | 
24 |   // The lone PE in this program should execute the code in "pe_program.csl"
25 |   // We pass memcpy parameters as a parameter to the program. Note that
26 |   // memcpy parameters are parameterized by the PE's column number.
27 |   @set_tile_code(0, 0, "pe_program.csl", .{ .memcpy_params = memcpy.get_params(0) });
28 | 
29 |   // Export device symbol for array "y"
30 |   // Last argument is mutability: host can read y, but not write to it
31 |   @export_name("y", [*]f32, false);
32 | 
33 |   // Export host-callable device function
34 |   @export_name("init_and_compute", fn()void);
35 | }
36 | 


--------------------------------------------------------------------------------
/tutorials/pipeline-02-fifo/README.rst:
--------------------------------------------------------------------------------
 1 | 
 2 | Pipeline 2: Attach a FIFO to H2D
 3 | ================================
 4 | 
 5 | The previous example stalls if the parameter ``size`` exceeds the capacity of
 6 | the internal queues. The size of the queue is architecture-dependent. From the
 7 | software development point of view, a program should be independent of any
 8 | architecture. One solution is to add a FIFO between H2D and ``@add16``. The FIFO
 9 | receives data from H2D and then forwards the data to ``@add16``. The WSE
10 | provides an efficient design for FIFO. The user just binds two microthreads to
11 | the FIFO: one pushes data into the FIFO, and the other pops the data out. As
12 | long as the parameter ``size`` does not exceed the capacity of the FIFO, H2D can
13 | always push all data into the FIFO even if ``@add16`` cannot process any data.
14 | Once H2D is done, D2H can continue to drain the data out such that ``@add16``
15 | can progress.
16 | 
17 | To create a FIFO, we use a builtin ``@allocate_fifo`` to bind a normal tensor.
18 | We create two fabric DSDs: one pushes data from ``MEMCPYH2D_DATA_1`` to the
19 | FIFO and the other pops data from the FIFO to the color ``C1``. Both DSDs must
20 | use different microthreads.
21 | 
22 | The routing configuration of color ``C1`` is RAMP to RAMP because
23 | 1) the FIFO pops data to the router via ``C1`` and
24 | 2) ``@add16`` receives data from the router via ``C1``.
25 | 
26 | The disadvantage of this approach is the resource consumption. The FIFO
27 | requires two microthreads and a scratch buffer.
28 | 
29 | The next example will fix this issue.
30 | 


--------------------------------------------------------------------------------
/tutorials/gemv-00-basic-syntax/README.rst:
--------------------------------------------------------------------------------
 1 | 
 2 | GEMV 0: Basic CSL Syntax
 3 | ========================
 4 | 
 5 | This example is the first in a series of successive example programs
 6 | demonstrating CSL and the SDK by implementing a general matrix-vector product,
 7 | or GEMV.
 8 | 
 9 | We start by illustrating the syntax of some of CSL's core language constructs.
10 | The code in this example is not a complete program, but it shows
11 | some of the most commonly used CSL features.
12 | 
13 | CSL’s syntax is like that of `Zig <https://ziglang.org>`_.
14 | Despite the similarity, both the purpose and the implementation of the CSL
15 | compiler are different from that of the Zig compiler.
16 | 
17 | Types
18 | -----
19 | 
20 | CSL includes some basic types:
21 | 
22 | 
23 | * ``bool`` for boolean values
24 | * ``i16`` and ``i32`` for 16- and 32-bit signed integers
25 | * ``u16`` and ``u32`` for 16- and 32-bit unsigned integers
26 | * ``f16`` and ``f32`` for 16- and 32-bit IEEE-754 floating point numbers
27 | 
28 | In addition to the above, CSL also supports array types and pointer types.
29 | Their use will be illustrated in subsequent examples.
30 | 
31 | Functions
32 | ---------
33 | 
34 | Functions are declared using the ``fn`` keyword.  The compiler provides special
35 | functions called *Builtins*, whose names start with ``@`` and whose
36 | implementation is provided by the compiler.  All CSL builtins are described in
37 | :ref:`language-builtins`.
38 | 
39 | Conditional Statements and Loops
40 | --------------------------------
41 | 
42 | CSL includes ``if`` statements and ``while`` and ``for`` loops.
43 | These are described in greater detail in the subsequent example programs.
44 | 


--------------------------------------------------------------------------------
/tutorials/topic-15-wse3-microthreads/layout.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | // Colors
16 | const send_color: color = @get_color(0); // Color used to send/recv data between PEs
17 | 
18 | // This example only uses 2 PEs
19 | const memcpy = @import_module("<memcpy/get_params>", .{ .width = 2, .height = 1 });
20 | 
21 | layout {
22 |   // PE coordinates are (column, row)
23 |   @set_rectangle(2, 1);
24 | 
25 |   // Left PE (0, 0)
26 |   @set_tile_code(0, 0, "left_pe.csl", .{
27 |     .memcpy_params = memcpy.get_params(0), .send_color = send_color });
28 | 
29 |   // Left PE sends to the right
30 |   @set_color_config(0, 0, send_color, .{.routes = .{ .rx = .{RAMP}, .tx = .{EAST} }});
31 | 
32 |   // Right PE (1, 0)
33 |   @set_tile_code(1, 0, "right_pe.csl", .{
34 |     .memcpy_params = memcpy.get_params(1), .recv_color = send_color });  // same color, passed under the name recv_color
35 | 
36 |   // Right PE receives from left PE
37 |   @set_color_config(1, 0, send_color, .{.routes = .{ .rx = .{WEST}, .tx = .{RAMP} }});
38 | 
39 |   // export symbol names
40 |   @export_name("y", [*]f32, true);
41 |   @export_name("compute", fn()void);
42 | }
43 | 


--------------------------------------------------------------------------------
/tutorials/sdklayout-01-introduction/README.rst:
--------------------------------------------------------------------------------
 1 | SdkLayout 1: Introduction
 2 | =========================
 3 | 
 4 | This tutorial introduces the ``SdkLayout`` API. ``SdkLayout``
 5 | allows us to define and compile multi-PE WSE programs. Specifically,
 6 | it consists of the following main features:
 7 | 
 8 | * Creation of CSL code regions: rectangular CSL code regions can be
 9 |   instantiated given a CSL source code file path, a name, and the
10 |   width and height dimensions.
11 | * Routing and switching: for a given CSL code region we can specify
12 |   routing and switching information on a single PE within the code
13 |   region, on a rectangular sub-region, or on the entire code region.
14 |   See tutorial :ref:`sdkruntime-sdklayout-02-routing`.
15 | * Automatic color allocation: routing can be done based on symbolic
16 |   colors. The ``SdkLayout`` engine will then allocate physical
17 |   values automatically. See tutorials :ref:`sdkruntime-sdklayout-02-routing`
18 |   and :ref:`sdkruntime-sdklayout-03-ports-and-connections`.
19 | * Automatic routing between code regions: users can create input
20 |   and output ports on code regions and connect them. The ``SdkLayout``
21 |   engine will automatically find optimal routes between them.
22 |   See tutorial :ref:`sdkruntime-sdklayout-03-ports-and-connections`.
23 | * Host-to-device and device-to-host connections: an input or
24 |   output port can be connected to the host to create an input
25 |   or output stream respectively. See tutorial
26 |   :ref:`sdkruntime-sdklayout-04-h2d-d2h`.
27 | 
28 | This tutorial demonstrates the most basic compilation flow,
29 | where a single-PE program with no colors and no routing sets the value
30 | of a global variable in device memory based on the value of
31 | a parameter.
32 | 


--------------------------------------------------------------------------------
/benchmarks/25-pt-stencil/ic.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env cs_python
 2 | 
 3 | # Copyright 2025 Cerebras Systems.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | 
18 | import numpy as np
19 | 
20 | 
21 | def computeGaussianSource(iterations):  # returns (source, sourceLength); source is a float32 array
22 |   tau = np.float32(1.0)
23 |   scale = np.float32(8.0)
24 |   mscale = np.float32(-8.0)  # equals -scale
25 |   _fmax = np.float32(25.0)
26 |   dt = np.float32(0.001610153)  # spacing between consecutive entries of t
27 |   sigma = np.float32(0.6) * _fmax
28 | 
29 |   t = np.arange(0, iterations, 1, dtype=np.float32) * np.float32(dt)  # t[k] = k * dt, k in [0, iterations)
30 |   power = np.power(sigma * t - tau, 2, dtype=np.float32)  # (sigma * t - tau) ** 2
31 |   expf = np.exp(np.multiply(power, np.float32(mscale)))  # exp(mscale * power)
32 |   source = (
33 |       np.float32(-2.0)
34 |       * scale
35 |       * sigma
36 |       * np.multiply(
37 |           sigma - np.float32(2.0) * sigma * scale * power,
38 |           expf,
39 |           dtype=np.float32,
40 |       )
41 |   )
42 |   # Index one past the last nonzero sample; raises IndexError if no sample is nonzero.
43 |   first_zero_idx = np.nonzero(source)[-1][-1] + 1
44 |   if first_zero_idx < source.shape[-1]:
45 |     source = source[:first_zero_idx]  # drop the trailing zeros
46 |     sourceLength = first_zero_idx
47 |   else:
48 |     sourceLength = source.shape[-1]  # last sample is nonzero: keep everything
49 | 
50 |   print(f"sourceLength = {sourceLength}, first_zero_idx={first_zero_idx}")
51 | 
52 |   return source, sourceLength
53 | 


--------------------------------------------------------------------------------
/benchmarks/gemv-checkerboard-pattern/README.rst:
--------------------------------------------------------------------------------
 1 | GEMV with Checkerboard Pattern
 2 | ==============================
 3 | 
 4 | This example shows a CSL program that performs a generalized matrix-vector (GEMV)
 5 | multiplication operation of the form:
 6 | 
 7 | .. code-block:: text
 8 | 
 9 |     y = Ax + b
10 | 
11 | where:
12 | 
13 | - ``A`` is a tensor of shape [M, N] (stored distributed on PE memory).
14 | - ``x`` is a tensor input of shape [N, 1] (streamed in).
15 | - ``b`` is a tensor input of shape [M, 1] (streamed in).
16 | - ``y`` is the tensor output of shape [M, 1] (streamed out).
17 | 
18 | For simplicity, we choose M as a multiple of the
19 | height of the kernel and N as a multiple of the width of the kernel.
20 | In this example, M = 32, N = 16 and we use a PE-rectangle (kernel) of
21 | size 4×4.
22 | 
23 | Below is a visualization of the kernel interface:
24 | 
25 | .. _fig-gemv-4-by-4-checkerboard:
26 | 
27 | .. figure:: ./images/gemv-4-by-4.png
28 |     :align: center
29 |     :width: 980 px
30 | 
31 | Note that this algorithm and the implementation are not optimized for
32 | performance. It is intended to serve as a non-trivial introductory example.
33 | 
34 | All computations are done in FP16 format.
35 | 
36 | The matrix ``A``, of shape [M, N],
37 | is distributed across the PE memories as follows:
38 | 
39 | - The first dimension of ``A``, M rows, is distributed across
40 |   the height of the kernel.
41 | - The second dimension of ``A``, N columns, is distributed across
42 |   the width of the kernel.
43 | 
44 | Since we know that M is 32 and the height of the kernel is 4, each PE will be
45 | assigned 32÷4 = 8 rows of ``A``.
46 | 
47 | Similarly, each PE will get 16÷4 = 4 columns of ``A``. This means each PE is
48 | assigned an 8×4 chunk of the original matrix ``A``.
49 | 


--------------------------------------------------------------------------------
/benchmarks/mandelbrot/README.rst:
--------------------------------------------------------------------------------
 1 | Mandelbrot
 2 | ==========
 3 | 
 4 | This is a simple program that computes a visualization of the Mandelbrot set on
 5 | a 16x16 pixel grid using a 4x4 grid of PEs.
 6 | 
 7 | Files:
 8 | - ``code.csl``: the main file that sets up the 4x4 PE grid and routing
 9 | - ``left.csl``: code for the PEs on the left of the grid
10 | - ``middle.csl``: code for the PEs in the rest of the grid
11 | - ``common.csl``: Mandelbrot code used in all PEs
12 | 
13 | Description:
14 | 
15 | This program adopts a pipeline-parallel approach to generating the Mandelbrot
16 | set. Each row of 4 PEs is responsible for a 4x16 chunk of the grid. The PE on
17 | the left of each row generates elements, performs up to 8 iterations on them,
18 | then passes them to the right. Each subsequent PE in the same row will also
19 | perform up to 8 iterations, then pass the elements right. Eventually, the
20 | element is outputted on the EAST side of the grid after having undergone a
21 | maximum of 32 iterations.
22 | 
23 | When a PE passes "an element", it is actually passing 3 32-bit floats. They are
24 | as follows: { real part, imaginary part, number of iterations so far }
25 | 
26 | Middle PEs calculate the x,y of the values they receive based on the order they
27 | receive them in.
28 | 
29 | An alternative approach would be to assign each PE a 4x4 tile of the 16x16
30 | overall grid and have it compute Mandelbrot for just its tile. Implementing this
31 | version and comparing its performance to this pipeline-parallel program would be
32 | interesting future work.
33 | 
34 | Known problems:
35 | - Load balancing between PEs in the same row is poor
36 | - ``iters`` is stored as an ``f32``. It really should be an integer type,
37 | however, we do not yet have support for sending structs through memory DSDs.
38 | 


--------------------------------------------------------------------------------
/benchmarks/fft-3d/README.rst:
--------------------------------------------------------------------------------
 1 | 3D FFT
 2 | ======
 3 | 
 4 | This example implements a 3D Discrete Fourier Transform by using a pencil
 5 | decomposition, in which the input data is viewed as a 2D array of 1D pencils,
 6 | and each PE stores a small subarray of the 2D array of pencils.
 7 | 
 8 | The algorithm proceeds in steps. First, the 1D FFTs of the pencils on each PE
 9 | are performed. Then, the data is transposed along a coordinate axis among
10 | all PEs. This process happens two more times, resulting in three local
11 | operations in which 1D FFTs are performed independently on each PE, and three
12 | transpose operations in which all PEs communicate to change which axis of
13 | the data is stored in memory.
14 | 
15 | The algorithm used to compute the 1D FFTs is Cooley-Tukey,
16 | Decimation in Time (DIT), radix 2, with the
17 | slight tweak that we use iteration instead of recursion.
18 | 
19 | FFT Compilation Parameters
20 | --------------------------
21 | 
22 | * ``N``: Size of 3D FFT along one dimension. The full problem size is
23 |   ``N x N x N``.
24 | * ``NUM_PENCILS_PER_DIM``: Number of pencils along a given dimension on each PE.
25 |   For instance, ``NUM_PENCILS_PER_DIM == 2`` means that each PE stores
26 |   ``2 x 2`` pencils.
27 | * ``FP``: Floating point precision. Valid values are ``1`` or ``2``, specifying
28 |   IEEE fp16 or fp32, respectively.
29 | 
30 | FFT Runtime Parameters
31 | ----------------------
32 | 
33 | * ``--inverse``: With this flag set, perform an inverse Fourier transform.
34 | * ``--real``: With this flag set, compute Fourier transform with real input
35 |   data. Without this flag, complex Fourier transform is computed.
36 | * ``--norm``: Normalization strategy. Valid values are ``0``, ``1``, or ``2``,
37 |   specifying ``forward``, ``backward``, or ``orthonormal``, respectively.
38 | 


--------------------------------------------------------------------------------
/benchmarks/bandwidth-test/README.rst:
--------------------------------------------------------------------------------
 1 | Bandwidth Test
 2 | ==============
 3 | 
 4 | This example evaluates the bandwidth between the host and the device (WSE). The
 5 | kernel records the ``start`` and ``end`` of H2D or D2H with the tsc counter. This
 6 | is better than a host timer because the runtime may not send the command right
 7 | after the user issues it. The runtime can aggregate multiple nonblocking commands
 8 | together to reduce TCP overhead. In addition, the tsc counters of all PEs are
 9 | not synchronized in the beginning. To avoid the timing variation among those
10 | PEs, we add a sync() to synchronize all PEs and sample the reference clock.
11 | 
12 | The kernel ``bw_sync_kernel.csl`` defines a couple of host-callable functions,
13 | ``f_sync()``, ``f_tic()`` and ``f_toc()`` in order to synchronize the PEs and
14 | record the timing of H2D or D2H.
15 | 
16 | The kernel ``sync/pe.csl`` performs a reduction over the whole rectangle to sync
17 | the PEs, then the top-left PE sends a signal to other PEs to sample the
18 | reference clock.
19 | 
20 | The script ``run.py`` has the following parameters:
21 | 
22 | - ``--loop_count=<int>`` decides how many H2Ds/D2Hs are called.
23 | 
24 | - ``--d2h`` measures the bandwidth of D2H, otherwise H2D is measured.
25 | 
26 | - ``--channels=<int>`` specifies the number of I/O channels, no bigger than 16.
27 | 
28 | The tic() samples "time_start" and toc() samples "time_end". The sync() samples
29 | "time_ref" which is used to adjust "time_start" and "time_end".
30 | The elapsed time (unit: cycles) is measured by
31 | ``cycles_send = max(time_end) - min(time_start)``
32 | 
33 | The overall runtime (us) is computed via the following formula
34 | ``time_send = (cycles_send / 0.85) * 1.e-3 us``
35 | 
36 | The bandwidth is calculated by
37 | ``bandwidth = ((wvlts * 4)/time_send)*loop_count``
38 | 


--------------------------------------------------------------------------------
/benchmarks/spmv-hypersparse/src/allreduce2R1E/layout.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | 
16 | param colors: [2]color;
17 | param entrypoints: [1]local_task_id;
18 | param width: i16 ;   // width of the core
19 | param height: i16 ;  // height of the core
20 | 
21 | 
22 | const C0: color = colors[0];
23 | const C1: color = colors[1];
24 | 
25 | // entrypoints of allreduce module
26 | // LOCK runs only if teardown is received and the operation is done
27 | // LOCK performs the state transition
28 | // teardown handler activates LOCK
29 | // the operation blocks LOCK in the beginning and unblocks it when it finishes
30 | const C_LOCK: local_task_id = entrypoints[0];
31 | 
32 | fn get_params(px:i16, py:i16) comptime_struct {
33 |     // Build the parameter struct for coordinates (px, py); px is checked against `width`, py against `height`.
34 |     var first_py: bool = (0 == py);
35 |     var last_py: bool = ((height-1) == py);
36 | 
37 |     var first_px: bool = (0 == px);
38 |     var last_px: bool = ((width-1) == px);
39 |     // Edge flags, followed by the two colors, the LOCK task ID, and the core dimensions.
40 |     return .{
41 |         .first_px = first_px,
42 |         .last_px = last_px,
43 |         .first_py = first_py,
44 |         .last_py = last_py,
45 |         .C_ROUTE = C0,
46 |         .C_DISPATCH = C1,
47 |         .C_LOCK = C_LOCK,
48 |         .width = width,
49 |         .height = height
50 |     };
51 | }
52 | 


--------------------------------------------------------------------------------
/tutorials/topic-15-wse3-microthreads/right_pe.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | param memcpy_params: comptime_struct;
16 | 
17 | param recv_color: color;
18 | 
19 | const M: i16 = 10;
20 | 
21 | // Task IDs
22 | const exit_task_id: local_task_id = @get_local_task_id(9);
23 | 
24 | // Queue and microthread IDs
25 | const recv_color_iq = @get_input_queue(2);
26 | const recv_color_ut = @get_ut_id(5);
27 | 
28 | const sys_mod = @import_module("<memcpy/memcpy>", memcpy_params);
29 | 
30 | var y: [M]f32;
31 | var y_dsd = @get_dsd(mem1d_dsd, .{ .tensor_access = |i|{M} -> y[i] });
32 | var y_ptr: [*]f32 = &y;
33 | 
34 | fn compute() void {  // exported in the comptime block; starts the receive into y
35 |   const in_dsd = @get_dsd(fabin_dsd, .{  // fabric input: extent M on recv_color via recv_color_iq
36 |                    .fabric_color = recv_color, .extent = M,
37 |                    .input_queue = recv_color_iq
38 |                  });
39 |   @fmovs(y_dsd, in_dsd, .{ .async = true, .ut_id = recv_color_ut,  // asynchronous move from in_dsd into y_dsd
40 |                            .activate = exit_task_id });  // exit_task_id is passed as the task to activate
41 | }
42 | 
43 | task exit_task() void {  // bound to exit_task_id in the comptime block
44 |   sys_mod.unblock_cmd_stream();  // call into the imported <memcpy/memcpy> module
45 | }
46 | 
47 | comptime {
48 |   @bind_local_task(exit_task, exit_task_id);  // bind exit_task to the task ID that compute() passes as `.activate`
49 | 
50 |   @initialize_queue(recv_color_iq, .{ .color = recv_color });  // set up the input queue with recv_color
51 | 
52 |   @export_symbol(y_ptr, "y");  // y_ptr points at y; exported under the name "y"
53 |   @export_symbol(compute);
54 | }
55 | 


--------------------------------------------------------------------------------
/tutorials/topic-15-wse3-microthreads/left_pe.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | param memcpy_params: comptime_struct;
16 | 
17 | param send_color: color;
18 | 
19 | const M: i16 = 10;
20 | 
21 | // Task IDs
22 | const exit_task_id: local_task_id = @get_local_task_id(9);
23 | 
24 | // Queue and microthread IDs
25 | const send_color_oq = @get_output_queue(2);
26 | const send_color_ut = @get_ut_id(4);
27 | 
28 | const sys_mod = @import_module("<memcpy/memcpy>", memcpy_params);
29 | 
30 | var y: [M]f32;
31 | var y_dsd = @get_dsd(mem1d_dsd, .{ .tensor_access = |i|{M} -> y[i] });
32 | var y_ptr: [*]f32 = &y;
33 | 
34 | fn compute() void {  // exported in the comptime block; starts the send of y
35 |   const out_dsd = @get_dsd(fabout_dsd, .{  // fabric output: extent M on send_color via send_color_oq
36 |                     .fabric_color = send_color, .extent = M,
37 |                     .output_queue = send_color_oq
38 |                   });
39 |   @fmovs(out_dsd, y_dsd, .{ .async = true, .ut_id = send_color_ut,  // asynchronous move from y_dsd into out_dsd
40 |                             .activate = exit_task_id });  // exit_task_id is passed as the task to activate
41 | }
42 | 
43 | task exit_task() void {  // bound to exit_task_id in the comptime block
44 |   sys_mod.unblock_cmd_stream();  // call into the imported <memcpy/memcpy> module
45 | }
46 | 
47 | comptime {
48 |   @bind_local_task(exit_task, exit_task_id);  // bind exit_task to the task ID that compute() passes as `.activate`
49 | 
50 |   @initialize_queue(send_color_oq, .{ .color = send_color });  // set up the output queue with send_color
51 | 
52 |   @export_symbol(y_ptr, "y");  // y_ptr points at y; exported under the name "y"
53 |   @export_symbol(compute);
54 | }
55 | 


--------------------------------------------------------------------------------
/benchmarks/fft-1d-2d/reshape.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | 
16 | param SRC_SIZE: u16;
17 | param DST_SIZE: u16;
18 | param N: u16;
19 | param FP: i16;
20 | param tensor_type: type;
21 | param dest: *[DST_SIZE]tensor_type;
22 | param src: *[SRC_SIZE]tensor_type;
23 | 
24 | const srcDSD = @get_dsd(mem1d_dsd, .{ .tensor_access = |i|{N} -> src[i] });
25 | const destDSD = @get_dsd(mem1d_dsd, .{ .tensor_access = |i|{N} -> dest[i] });
26 | 
27 | fn reshape(baseA: *u16, baseB: *u16, subproblems: u16, stride: u16) void {
28 |   // Copy `subproblems`-element windows from `src` (from offset *baseB, stepping by `stride`) to `dest` (from offset *baseA, stepping by `subproblems`) while *baseB < N.
29 |   var sourceDSD = @increment_dsd_offset(srcDSD, @as(i16, baseB.*), tensor_type);
30 |   sourceDSD = @set_dsd_length(sourceDSD, subproblems);
31 | 
32 |   var destinationDSD = @increment_dsd_offset(destDSD, @as(i16, baseA.*), tensor_type);
33 |   destinationDSD = @set_dsd_length(destinationDSD, subproblems);
34 |   // *baseA and *baseB are advanced through the pointers, so the caller observes their final values.
35 |   while ((baseB.*) < N) {
36 |     if (FP==1){
37 |       @fmovh(destinationDSD, sourceDSD);
38 |     } else {
39 |       @fmovs(destinationDSD, sourceDSD);
40 |     }
41 |     destinationDSD = @increment_dsd_offset(destinationDSD, @as(i16, subproblems), tensor_type);
42 |     sourceDSD = @increment_dsd_offset(sourceDSD, @as(i16, stride), tensor_type);
43 |     (baseA.*) += subproblems;
44 |     (baseB.*) += stride;
45 |   }
46 | }
47 | 


--------------------------------------------------------------------------------
/tutorials/topic-09-fifos/README.rst:
--------------------------------------------------------------------------------
 1 | Topic 9: FIFOs
 2 | ==============
 3 | 
 4 | A FIFO DSD is useful to buffer data going into or out of a PE, as a way to
 5 | extend the small hardware queues used for fabric communication. In particular,
 6 | this may prevent stalls in the communication fabric when input or output
 7 | happens in bursts. It is also possible to operate on the values while they flow
 8 | through the FIFO, as this code sample demonstrates.
 9 | 
10 | This example illustrates a typical pattern in the use of FIFOs, where a
11 | receiver receives wavelets from the fabric and forwards them to a task that
12 | performs some computation. Specifically, incoming data from the host is stored
13 | in the FIFO, thus relieving the sender from being blocked until the receiver
14 | has received all wavelets. While the incoming wavelets are being asynchronously
15 | received into the FIFO buffer, we also start a second asynchronous DSD
16 | operation that pulls data from the FIFO and forwards it to a wavelet-triggered
17 | task.
18 | 
19 | This example also illustrates another common pattern, where a PE starts a
20 | wavelet-triggered task using its own wavelets, by sending them to the router
21 | which immediately sends them back to the compute element. In our example, this
22 | wavelet-triggered task simply computes the cube of the wavelet's data, before
23 | sending the result to the host.
24 | 
25 | Note that, to demonstrate the use of FIFOs in this program, we use ``memcpy``
26 | streaming mode to stream data from the host and receive in the PE program's
27 | FIFO, and to stream data out of the PE program back to the host. Because
28 | ``memcpy`` calls are serialized, the ``memcpy_h2d`` must finish before the
29 | ``memcpy_d2h``. This places an artificial restriction on our FIFO: the input
30 | size from the host cannot exceed the FIFO size, or the program will potentially
31 | stall.
32 | 


--------------------------------------------------------------------------------
/tutorials/topic-01-arrays-and-pointers/layout.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | // The core kernel must start at P4.1 so the memcpy infrastructure has enough
16 | // resources to route the data between the host and the device.
17 | 
18 | // Color/ task ID map
19 | //
20 | //  ID var  ID var         ID var                ID var
21 | //   0       9             18                    27 reserved (memcpy)
22 | //   1      10             19                    28 reserved (memcpy)
23 | //   2      11             20                    29 reserved
24 | //   3      12             21 reserved (memcpy)  30 reserved (memcpy)
25 | //   4      13             22 reserved (memcpy)  31 reserved
26 | //   5      14             23 reserved (memcpy)  32
27 | //   6      15             24                    33
28 | //   7      16             25                    34
29 | //   8      17             26                    35
30 | 
31 | const memcpy = @import_module("<memcpy/get_params>", .{
32 |   .width = 1,
33 |   .height = 1,
34 | });
35 | 
36 | layout {
37 |   @set_rectangle(1, 1);  // single-PE program rectangle
38 | 
39 |   @set_tile_code(0, 0, "pe_program.csl", .{ .memcpy_params = memcpy.get_params(0) });  // PE (0, 0) runs pe_program.csl
40 | 
41 |   // export symbol names: one i16 pointer symbol and one function
42 |   @export_name("result", [*]i16, true);
43 |   @export_name("f_run", fn()void);
44 | }
45 | 


--------------------------------------------------------------------------------
/benchmarks/wide-multiplication/README.rst:
--------------------------------------------------------------------------------
 1 | Wide Multiplication
 2 | ===================
 3 | 
 4 | This example shows a CSL program that performs multiplication of wide integers:
 5 | 
 6 | .. code-block:: text
 7 | 
 8 |     result = X x Y
 9 | 
10 | where:
11 | 
12 | - ``X`` and ``Y`` are 128-bit unsigned integers.
13 | - ``result`` is the 256-bit wide result of multiplying X and Y.
14 | 
15 | The simulation script ``run.py`` generates random values for ``X`` and ``Y``.
16 | ``X`` is represented as a NumPy array of 16 elements of type ``uint16`` of the
17 | form
18 | 
19 | .. code-block::
20 | 
21 |     X = [x₀, x₁, ..., x₇, 0, 0, ..., 0]
22 | 
23 | where:
24 | 
25 | - The representation uses little endian.
26 | - x :subscript:`i`, i = 0, 1,..., 7, is the i-th 2-byte word of ``X``.
27 | - The eight trailing zeros are leading zeros to get a full 256-bit
28 |   representation.
29 | 
30 | ``Y`` is represented similarly, and ``X`` and ``Y`` are concatenated and sent to
31 | the fabric as a single 32-element vector of type ``uint16``:
32 | 
33 | .. code-block::
34 | 
35 |      (X, Y) = [x₀, x₁, ..., x₇, 0, 0, ..., 0, y₀, y₁, ... y₇, 0, 0,
36 |                ..., 0]
37 | 
38 | The multiplication is performed by a single PE which receives the input vectors
39 | (``X``, ``Y``) via the streaming H2D on color ``MEMCPYH2D_DATA_1`` and delivers
40 | the ``result`` via streaming D2H on color ``MEMCPYD2H_DATA_1``. A single color
41 | ``MEMCPYH2D_DATA_1`` is used for the delivery of both input vectors ``X`` and
42 | ``Y``. This is made possible by concatenation of ``X`` and ``Y`` into a single
43 | input vector.
44 | 
45 | The multiplication is done at the bit level. In the k'th iteration of the outer
46 | loop, ``Y`` is traversed and multiplied by the bit value at position ``k`` of
47 | ``X``. This partial result is added to an accumulated result, tracking a carry
48 | bit, and ``X`` is then shifted by one position before the next iteration.
49 | 


--------------------------------------------------------------------------------
/benchmarks/fft-1d-2d/ucode_1d.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | param memcpyParams: comptime_struct;
16 | 
17 | // Task IDs
18 | const EXIT: local_task_id = @get_local_task_id(10);
19 | 
20 | const sys_mod = @import_module( "<memcpy/memcpy>", memcpyParams);
21 | 
22 | // Problem size
23 | param N: i16;
24 | const ELEM_SIZE: i16 = 2;
25 | param FP: i16;
26 | param tensor_type: type;
27 | 
28 | // Buffers used by the module `mod`, which is imported from `fft.csl` below
29 | // with its `N`, `ARRAY_LEN`, `X`, `FP` and `tensor_type` parameters instantiated.
30 | 
31 | var X = @zeros([N*ELEM_SIZE]tensor_type);
32 | var f_twiddle = @zeros([N]tensor_type);
33 | 
34 | var ptr_X: [*]tensor_type = &X;
35 | var ptr_f_twiddle: [*]tensor_type = &f_twiddle;
36 | 
37 | const mod = @import_module("fft.csl", .{ .N = N, .ARRAY_LEN = N*ELEM_SIZE, .X=&X, .FP=FP, .tensor_type=tensor_type});
38 | 
39 | fn f_fft() void {  // exported in the comptime block
40 |   mod.fft(&f_twiddle);  // `mod` was instantiated with `.X = &X`; f_twiddle is passed by pointer
41 |   @activate(EXIT);  // EXIT is bound to f_exit
42 | }
43 | 
44 | fn f_ifft() void {  // exported in the comptime block
45 |   mod.ifft(&f_twiddle);  // `mod` was instantiated with `.X = &X`; f_twiddle is passed by pointer
46 |   @activate(EXIT);  // EXIT is bound to f_exit
47 | }
48 | 
49 | task f_exit() void {
50 |   // the user must unblock cmd color for every PE; this task is activated at the end of f_fft and f_ifft
51 |   sys_mod.unblock_cmd_stream();
52 | }
53 | comptime {
54 |   @bind_local_task(f_exit, EXIT);  // f_exit is the task behind the EXIT task ID
55 |   @export_symbol(ptr_X, "X");  // ptr_X points at X; exported under the name "X"
56 |   @export_symbol(ptr_f_twiddle, "f_twiddle");  // ptr_f_twiddle points at f_twiddle
57 |   @export_symbol(f_fft);
58 |   @export_symbol(f_ifft);
59 | }
60 | 


--------------------------------------------------------------------------------
/tutorials/topic-02-libraries/layout.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | // Color/ task ID map
16 | //
17 | //  ID var           ID var     ID var                ID var
18 | //   0                9         18                    27 reserved (memcpy)
19 | //   1               10         19                    28 reserved (memcpy)
20 | //   2               11         20                    29 reserved
21 | //   3               12         21 reserved (memcpy)  30 reserved (memcpy)
22 | //   4               13         22 reserved (memcpy)  31 reserved
23 | //   5               14         23 reserved (memcpy)  32
24 | //   6               15         24                    33
25 | //   7               16         25                    34
26 | //   8               17         26                    35
27 | 
28 | param iterations: u32;
29 | 
30 | const memcpy = @import_module("<memcpy/get_params>", .{
31 |   .width = 1,
32 |   .height = 1,
33 | });
34 | 
35 | layout {
36 |   @set_rectangle(1, 1);  // single-PE program rectangle
37 | 
38 |   @set_tile_code(0, 0, "pe_program.csl", .{  // PE (0, 0) runs pe_program.csl
39 |     .memcpy_params = memcpy.get_params(0),
40 |     .iterations = iterations  // forwards this file's `iterations` param to the PE program
41 |   });
42 | 
43 |   // export symbol names: three pointer symbols and one function
44 |   @export_name("result", [*]f32, true);
45 |   @export_name("start_timestamp", [*]u16, true);
46 |   @export_name("finish_timestamp", [*]u16, true);
47 |   @export_name("f_run", fn()void);
48 | }
49 | 


--------------------------------------------------------------------------------
/tutorials/gemv-02-memory-dsds/README.rst:
--------------------------------------------------------------------------------
 1 | GEMV 2: Memory DSDs
 2 | ===================
 3 | 
 4 | Continuing on from the previous example, we now extend it by introducing
 5 | memory Data Structure Descriptors (DSDs), an efficient mechanism for
 6 | performing operations on entire tensors.
 7 | 
 8 | This program creates three one-dimensional memory DSDs for accessing ``A``,
 9 | ``b``, and ``y``, each of which specifies how to loop over the respective
10 | arrays.
11 | 
12 | ``b_dsd`` and ``y_dsd`` access the ``M`` contiguous elements of ``b`` and ``y``,
13 | respectively.
14 | ``A_dsd`` accesses ``M`` elements of ``A``, but strided by ``N`` elements.
15 | Because ``A`` is stored in row major format, this means that ``A_dsd``
16 | initially accesses the 0th column of ``A``.
17 | 
18 | We demonstrate here two ways of defining DSDs. For ``y_dsd``, we specify the
19 | base memory address (``&y``) and the number of elements accessed (``M``).
20 | For ``A_dsd`` and ``b_dsd``, we demonstrate the use of a ``tensor_access``
21 | expression.
22 | The ``tensor_access`` field specifies an induction variable, a loop bound,
23 | and an affine expression (i.e., a linear function plus a constant) to generate
24 | various addresses at runtime.
25 | 
26 | 
27 | These DSDs are used by the DSD operations ``@fmacs`` and ``@fadds`` to
28 | compute ``Ax + b`` and store it in ``y``.
29 | The ``gemv`` function first loops over ``N``, with the ``@fmacs`` in iteration
30 | ``i`` computing the scalar-vector product of ``x[i]`` with column ``i``
31 | of ``A``, and incrementing ``y`` by that result.
32 | The ``increment_dsd_offset`` operation updates ``A_dsd`` by shifting its
33 | access by one element.
34 | This causes ``A_dsd`` to access the next column of ``A``.
35 | After the loop, ``y`` is incremented by ``b`` with the ``@fadds`` operation,
36 | to complete the computation.
37 | 
38 | Other DSD operations and their associated operand types are described in
39 | :ref:`language-builtins-for-dsd-operations`.
40 | 


--------------------------------------------------------------------------------
/benchmarks/7pt-stencil-spmv/README.rst:
--------------------------------------------------------------------------------
 1 | 3D 7-Point Stencil SpMV
 2 | =======================
 3 | 
 4 | This example evaluates the performance of a 7-point stencil. The kernel records
 5 | the ``start`` and ``end`` of ``spmv`` by tsc counter. In addition the tsc
 6 | counters of all PEs are not synchronized in the beginning. To avoid the timing
 7 | variation among those PEs, ``sync()`` synchronizes all PEs and samples the
 8 | reference clock.
 9 | 
10 | The kernel ``kernel.csl`` defines a couple of host-callable functions,
11 | ``f_sync()``, ``f_tic()`` and ``f_toc()`` in order to synchronize the PEs and
12 | record the timing of ``spmv``.
13 | 
14 | The kernel ``allreduce/pe.csl`` performs a reduction over the whole rectangle
15 | to synchronize the PEs, then the bottom-right PE sends a signal to other PEs
16 | to sample the reference clock.
17 | 
18 | The kernel ``stencil_3d_7pts/pe.csl`` performs a matrix-vector product (spmv)
19 | where the matrix has 7 diagonals corresponding to 7 point stencil. The stencil
20 | coefficients can vary per PE, but must be the same for the local vector. The
21 | user can change the coefficients based on the boundary condition or curvilinear
22 | coordinate transformation.
23 | 
24 | The script ``run.py`` has the following parameters:
25 | 
26 | - ``-k=<int>`` specifies the maximum size of local vector.
27 | 
28 | - ``--zDim=<int>`` specifies how many elements per PE are computed.
29 | 
30 | - ``--channels=<int>`` specifies the number of I/O channels, no bigger than 16.
31 | 
32 | The ``tic()`` samples "time_start" and ``toc()`` samples "time_end". The
33 | ``sync()`` samples "time_ref" which is used to adjust "time_start" and
34 | "time_end". The elapsed time (unit: cycles) is measured by
35 | ``cycles_send = max(time_end) - min(time_start)``
36 | 
37 | The overall runtime (us) is computed via the following formula
38 | ``time_send = (cycles_send / 0.85) * 1.e-3 us``
39 | 
40 | The bandwidth is calculated by
41 | ``bandwidth = ((6*w*h*4)/time_send)``
42 | 


--------------------------------------------------------------------------------
/tutorials/topic-10-map-builtin/layout.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | // Color/ task ID map
16 | //
17 | //  ID var           ID var      ID var                ID var
18 | //   0                9          18                    27 reserved (memcpy)
19 | //   1               10          19                    28 reserved (memcpy)
20 | //   2               11          20                    29 reserved
21 | //   3               12          21 reserved (memcpy)  30 reserved (memcpy)
22 | //   4               13          22 reserved (memcpy)  31 reserved
23 | //   5               14          23 reserved (memcpy)  32
24 | //   6               15          24                    33
25 | //   7               16          25                    34
26 | //   8               17          26                    35
27 | 
28 | param size: i16; // compile-time parameter, forwarded unchanged to pe_program.csl below
29 | 
30 | const memcpy = @import_module( "<memcpy/get_params>", .{
31 |   .width = 1,  // same 1 x 1 extent as the rectangle declared in the layout block
32 |   .height = 1,
33 | });
34 | 
35 | layout {
36 |   @set_rectangle(1, 1); // the whole program occupies a single PE
37 | 
38 |   @set_tile_code(0, 0, "pe_program.csl", .{ // PE (0, 0) runs pe_program.csl with the params below
39 |     .memcpy_params = memcpy.get_params(0),
40 |     .size = size,
41 |   });
42 | 
43 |   // Exported symbol names: four data pointers (three f32, one i32) plus the function f_run
44 |   @export_name("weight", [*]f32, true); // NOTE(review): trailing `true` presumably marks the symbol mutable -- confirm
45 |   @export_name("sqrt_diag_A", [*]f32, true);
46 |   @export_name("A", [*]f32, true);
47 |   @export_name("sum", [*]i32, true);
48 |   @export_name("f_run", fn()void);
49 | }
50 | 


--------------------------------------------------------------------------------
/benchmarks/game-of-life/README.rst:
--------------------------------------------------------------------------------
 1 | Conway's Game of Life
 2 | =====================
 3 | 
 4 | This program implements
 5 | `Conway's Game of Life <https://en.wikipedia.org/wiki/Conway%27s_Game_of_Life>`_
 6 | on the WSE.
 7 | 
 8 | Conway's Game of Life is a cellular automaton which evolves on a 2D grid of
 9 | square cells. Each cell is in one of two possible states, LIVE or DEAD.
10 | Every cell interacts with its neighbors, which are the cells horizontally,
11 | vertically, or diagonally adjacent. At each step in time, the following
12 | transitions occur:
13 | 
14 | - Any LIVE cell with fewer than two LIVE neighbours becomes a DEAD cell.
15 | - Any LIVE cell with two or three LIVE neighbours stays a LIVE cell.
16 | - Any LIVE cell with more than three LIVE neighbours becomes a DEAD cell.
17 | - Any DEAD cell with exactly three LIVE neighbours becomes a LIVE cell.
18 | 
19 | This program implements the Game of Life by assigning one cell to each PE.
20 | Zero boundary conditions are used, and thus the neighbors of a border PE that
21 | fall outside of the program rectangle are treated as always DEAD.
22 | 
23 | In each generation, each PE sends its state to its four N, S, E, and W
24 | neighbors. Each PE receives the state of its four N, S, E, and W neighbors, and
25 | also forwards the received state from its N and S neighbors to its E and W
26 | neighbors. Thus, each PE receives from its E and W links both the state of its
27 | E and W adjacent neighbors, as well as its four diagonal neighbors.
28 | 
29 | The program implements two initial conditions, ``random`` and ``glider``.
30 | ``random`` randomly initializes the state of all cells. ``glider`` generates
31 | several glider objects across the grid. The initial condition can be set with
32 | the ``--initial-state`` flag.
33 | 
34 | The ``--show-ascii-animation`` flag will generate an ASCII animation of the
35 | cellular automaton's evolution when the program is complete.
36 | ``--save-animation`` will save a GIF of the automaton's evolution.
37 | 


--------------------------------------------------------------------------------
/tutorials/pipeline-01-basic/layout.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | // Color/ task ID map
16 | //
17 | //  ID var           ID var  ID var               ID var
18 | //   0 MEMCPYH2D_1    9      18                   27 reserved (memcpy)
19 | //   1 MEMCPYD2H_1   10      19                   28 reserved (memcpy)
20 | //   2               11      20                   29 reserved
21 | //   3               12      21 reserved (memcpy) 30 reserved (memcpy)
22 | //   4               13      22 reserved (memcpy) 31 reserved
23 | //   5               14      23 reserved (memcpy) 32
24 | //   6               15      24                   33
25 | //   7               16      25                   34
26 | //   8 main_task_id  17      26                   35
27 | 
28 | param size: i16; // compile-time parameter, forwarded unchanged to pe_program.csl below
29 | 
30 | param MEMCPYH2D_DATA_1_ID: i16; // numeric color IDs; turned into color values just below
31 | param MEMCPYD2H_DATA_1_ID: i16;
32 | 
33 | const MEMCPYH2D_DATA_1: color = @get_color(MEMCPYH2D_DATA_1_ID);
34 | const MEMCPYD2H_DATA_1: color = @get_color(MEMCPYD2H_DATA_1_ID);
35 | 
36 | const memcpy = @import_module("<memcpy/get_params>", .{
37 |     .width = 1,
38 |     .height = 1,
39 |     .MEMCPYH2D_1 = MEMCPYH2D_DATA_1, // the two colors above are handed to the memcpy module
40 |     .MEMCPYD2H_1 = MEMCPYD2H_DATA_1
41 |     });
42 | 
43 | layout {
44 |   @set_rectangle(1, 1); // the whole program occupies a single PE
45 | 
46 |   @set_tile_code(0, 0, "pe_program.csl", .{ // PE (0, 0) runs pe_program.csl with the params below
47 |     .size = size,
48 |     .memcpy_params = memcpy.get_params(0)
49 |   });
50 | }
51 | 


--------------------------------------------------------------------------------
/tutorials/topic-01-arrays-and-pointers/run.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env cs_python
 2 | 
 3 | # Copyright 2025 Cerebras Systems.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | 
18 | import argparse
19 | import numpy as np
20 | 
21 | from cerebras.sdk.sdk_utils import memcpy_view
22 | from cerebras.sdk.runtime.sdkruntimepybind import SdkRuntime, MemcpyDataType # pylint: disable=no-name-in-module
23 | from cerebras.sdk.runtime.sdkruntimepybind import MemcpyOrder # pylint: disable=no-name-in-module
24 | 
25 | parser = argparse.ArgumentParser()
26 | parser.add_argument('--name', help='the test name')
27 | parser.add_argument("--cmaddr", help="IP:port for CS system")
28 | args = parser.parse_args()
29 | dirname = args.name  # passed to SdkRuntime as its first argument
30 | 
31 | memcpy_dtype = MemcpyDataType.MEMCPY_16BIT  # the device result is read back as 16-bit data
32 | runner = SdkRuntime(dirname, cmaddr=args.cmaddr)
33 | 
34 | result_symbol = runner.get_id('result')  # handle for the symbol named 'result'
35 | 
36 | runner.load()
37 | runner.run()
38 | 
39 | runner.launch("f_run", nonblock=False)  # nonblock=False: presumably returns only after f_run is done -- confirm
40 | 
41 | # The D2H buffer must be of type u32, even though only 16 bits per element carry data
42 | out_tensors_u32 = np.zeros(1, np.uint32)
43 | runner.memcpy_d2h(out_tensors_u32, result_symbol, 0, 0, 1, 1, 1, \
44 |     streaming=False, data_type=memcpy_dtype, order=MemcpyOrder.COL_MAJOR, nonblock=False)
45 | 
46 | # Drop the upper 16 bits of each u32 and reinterpret the remainder as int16
47 | result_tensor = memcpy_view(out_tensors_u32, np.dtype(np.int16))
48 | 
49 | runner.stop()
50 | 
51 | # Ensure that the single value read back equals the expected 9
52 | np.testing.assert_equal(result_tensor, [9])
53 | print("SUCCESS!")
54 | 


--------------------------------------------------------------------------------
/benchmarks/benchmark-libs/allreduce/layout.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | 
16 | param colors: [1]color;               // exactly one color; exposed below as C0
17 | param entrypoints: [4]local_task_id;  // four task IDs; unpacked by index below
18 | param width: i16 ;   // width of the core
19 | param height: i16 ;  // height of the core
20 | 
21 | 
22 | const C0: color = colors[0]; // returned from get_params as .C_ROUTE
23 | 
24 | // entrypoints of allreduce module
25 | const SEND_CTRL: local_task_id = entrypoints[0];
26 | const SEND_DATA: local_task_id = entrypoints[1];
27 | const STATE_ENTRY: local_task_id = entrypoints[2];
28 | // LOCK runs only if teardown is received and the operation is done
29 | // LOCK performs the state transition
30 | // teardown handler activates LOCK
31 | // the operation blocks LOCK in the beginning and unblocks it when it finishes
32 | const C_LOCK: local_task_id = entrypoints[3];
33 | 
34 | fn get_params(px:i16, py:i16) comptime_struct {
35 |     // Boundary flags in x: px is compared against 0 and against width-1.
36 |     const x_is_first: bool = (px == 0);
37 |     const x_is_last: bool = (px == (width-1));
38 |     // Boundary flags in y: py is compared against 0 and against height-1.
39 |     const y_is_first: bool = (py == 0);
40 |     const y_is_last: bool = (py == (height-1));
41 |     // Bundle the flags with the module-level color, task IDs and core size.
42 |     return .{
43 |         .first_px = x_is_first,
44 |         .last_px = x_is_last,
45 |         .first_py = y_is_first,
46 |         .last_py = y_is_last,
47 |         .C_ROUTE = C0,
48 |         .C_SEND_CTRL = SEND_CTRL,
49 |         .C_SEND_DATA = SEND_DATA,
50 |         .C_STATE_ENTRY = STATE_ENTRY,
51 |         .C_LOCK = C_LOCK,
52 |         .width = width,
53 |         .height = height
54 |     };
55 | }
56 | 


--------------------------------------------------------------------------------
/tutorials/gemv-00-basic-syntax/code.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | // Not a complete program; we include it here for illustrating some syntax
16 | 
17 | // Every variable must be declared either "const" or "var"
18 | // Const cannot be modified after declaration, but var can
19 | 
20 | // Constants defining dimensions of our matrix
21 | const M: i16 = 4; // number of rows of A; also the length of b and y
22 | const N: i16 = 6; // number of columns of A; also the length of x
23 | 
24 | // 48 kB of global memory contain A, x, b, y
25 | var A: [M*N]f32; // A is stored in row-major order
26 | var x: [N]f32;
27 | var b: [M]f32;
28 | var y: [M]f32;
29 | 
30 | // Initialize matrix and vectors: A[k] = k, x[j] = 1.0, b[i] = 2.0, y[i] = 0.0
31 | fn initialize() void {
32 |   // for loop with range syntax
33 |   // loops over 0, 1, ...., M*N-1
34 |   // idx stores the loop index
35 |   for (@range(i16, M*N)) |idx| {
36 |     // @as casts idx from i16 to f32
37 |     A[idx] = @as(f32, idx);
38 |   }
39 | 
40 |   for (@range(i16, N)) |j| {
41 |     x[j] = 1.0;
42 |   }
43 | 
44 |   // while loop with iterator syntax: (i += 1) runs after each pass through the body
45 |   var i: i16 = 0;
46 |   while (i < M) : (i += 1) {
47 |     b[i] = 2.0;
48 |     y[i] = 0.0;
49 |   }
50 | }
51 | 
52 | fn gemv() void { // Compute gemv: y = A*x + b, one row of A per outer iteration
53 |   for (@range(i16, M)) |row| {
54 |     const row_start: i16 = row * N; // A is row-major, so this row begins at row*N
55 |     var dot: f32 = 0.0;
56 |     for (@range(i16, N)) |col| {
57 |       dot += A[row_start + col] * x[col];
58 |     }
59 |     y[row] = dot + b[row];
60 |   }
61 | }
62 | 
63 | // Runs initialize() and then gemv(), in that order, so gemv reads initialized data
64 | fn init_and_compute() void {
65 |   initialize();
66 |   gemv();
67 | }
68 | 


--------------------------------------------------------------------------------
/tutorials/gemv-06-routes-1/layout.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | // Matrix dimensions: M is passed to both PEs as-is; N is halved per PE (N_per_PE = N / 2)
16 | param M: i16;
17 | param N: i16;
18 | 
19 | // Colors
20 | const send_color: color = @get_color(0); // Color used to send/recv data between PEs
21 | 
22 | // This example only uses 2 PEs
23 | const memcpy = @import_module("<memcpy/get_params>", .{
24 |   .width = 2,
25 |   .height = 1,
26 | });
27 | 
28 | layout {
29 |   // PE coordinates are (column, row)
30 |   @set_rectangle(2, 1);
31 | 
32 |   // Left PE (0, 0)
33 |   @set_tile_code(0, 0, "pe_program.csl", .{
34 |     .memcpy_params = memcpy.get_params(0), // argument equals this PE's column index
35 |     .M = M,
36 |     .N_per_PE = N / 2,
37 |     .pe_id = 0,
38 |     .send_color = send_color
39 |   });
40 | 
41 |   // Left PE sends its result to the right: taken from RAMP, forwarded EAST
42 |   @set_color_config(0, 0, send_color, .{.routes = .{ .rx = .{RAMP}, .tx = .{EAST} }});
43 | 
44 |   // Right PE (1, 0): same program and sizes as the left PE, only pe_id differs
45 |   @set_tile_code(1, 0, "pe_program.csl", .{
46 |     .memcpy_params = memcpy.get_params(1), // argument equals this PE's column index
47 |     .M = M,
48 |     .N_per_PE = N / 2,
49 |     .pe_id = 1,
50 |     .send_color = send_color
51 |   });
52 | 
53 |   // Right PE receives result of left PE: taken from WEST, delivered to RAMP
54 |   @set_color_config(1, 0, send_color, .{.routes = .{ .rx = .{WEST}, .tx = .{RAMP} }});
55 | 
56 |   // export symbol names: three f32 pointers plus the function "compute"
57 |   @export_name("A", [*]f32, true);
58 |   @export_name("x", [*]f32, true);
59 |   @export_name("y", [*]f32, true);
60 |   @export_name("compute", fn()void);
61 | }
62 | 


--------------------------------------------------------------------------------
/tutorials/topic-08-filters/run.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env cs_python
 2 | 
 3 | # Copyright 2025 Cerebras Systems.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | 
18 | import argparse
19 | import numpy as np
20 | 
21 | from cerebras.sdk.runtime.sdkruntimepybind import SdkRuntime, MemcpyDataType # pylint: disable=no-name-in-module
22 | from cerebras.sdk.runtime.sdkruntimepybind import MemcpyOrder # pylint: disable=no-name-in-module
23 | 
24 | parser = argparse.ArgumentParser()
25 | parser.add_argument('--name', help='the test name')
26 | parser.add_argument("--cmaddr", help="IP:port for CS system")
27 | args = parser.parse_args()
28 | dirname = args.name  # passed to SdkRuntime as its first argument
29 | 
30 | runner = SdkRuntime(dirname, cmaddr=args.cmaddr)
31 | 
32 | result_symbol = runner.get_id('result')  # handle for the symbol named 'result'
33 | 
34 | runner.load()
35 | runner.run()
36 | 
37 | num_recv_pes = 3 # 3 PEs receive from the sender
38 | elems_per_pe = 3 # Each recv PE receives 3 elems after filtering
39 | 
40 | print("step 1: launch function to send data to neighbors")
41 | runner.launch("main_fn", nonblock=False)
42 | 
43 | print("step 2: copy back data from receiving PEs")
44 | result = np.zeros(num_recv_pes * elems_per_pe, np.float32)  # 3 PEs x 3 elems = 9 values
45 | runner.memcpy_d2h(result, result_symbol, 1, 0, num_recv_pes, 1, elems_per_pe, streaming=False, \
46 |    data_type=MemcpyDataType.MEMCPY_32BIT, order=MemcpyOrder.ROW_MAJOR, nonblock=False)
47 | 
48 | runner.stop()
49 | 
50 | oracle = [6.5, 7, 7.5, 8, 8.5, 9, 9.5, 10, 10.5]  # expected contents of `result`, 9 values
51 | np.testing.assert_allclose(result, oracle, atol=0.0001, rtol=0)  # absolute tolerance only
52 | print("SUCCESS!")
53 | 


--------------------------------------------------------------------------------
/benchmarks/25-pt-stencil/util.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | fn min(a: u16, b: u16) u16 { // smaller of the two arguments; b when they are equal
16 |   if (b <= a) {
17 |     return b;
18 |   }
19 |   return a;
20 | }
21 | 
22 | fn computeRelativePeId(peId: u16, peCount: u16, dir: direction) u16 { // maps peId according to dir
23 |   if (dir == EAST or dir == SOUTH) { // these two directions return the index unchanged
24 |     return peId;
25 |   }
26 |   if (dir == WEST or dir == NORTH) { // these two directions mirror it: 0 <-> peCount - 1
27 |     return peCount - peId - 1;
28 |   }
29 |   @comptime_assert(false); // only reached when dir is none of the four directions tested above
30 | }
31 | 
32 | fn computeChunks(zDim: u16) u16 {
33 |   // Returns the number of chunks the Z dimension is split into.  Given the
34 |   // memory consumption of the program on chip, only about 400 Z-dimension
35 |   // values can be allocated at once, so the requested Z dimension is split
36 |   // into multiple chunks if it exceeds 400.  The count is unsigned, like zDim.
37 |   return 1 + zDim / 401;
38 | }
39 | 
40 | fn computeChunkSize(zDim: u16, numChunks: u16) u16 {
41 |   // Start from the truncated quotient; it is already the chunk size whenever
42 |   // the number of chunks divides the Z dimension without remainder.
43 |   const quotient: u16 = zDim / numChunks;
44 |   const leftover: u16 = zDim % numChunks;
45 |   if (leftover == 0) {
46 |     return quotient;
47 |   }
48 |   // Otherwise grow every chunk by one element.  That is preferable to adding
49 |   // one more chunk, because each extra chunk costs a non-trivial additional
50 |   // round of communication with each neighbor.
51 |   return quotient + 1;
52 | }
53 | 


--------------------------------------------------------------------------------
/benchmarks/gemv-collectives_2d/README.rst:
--------------------------------------------------------------------------------
 1 | GEMV with Collective Communications
 2 | ===================================
 3 | 
 4 | This example shows a CSL program that uses collective communications
 5 | to perform a generalized matrix-vector (GEMV)
 6 | multiplication operation of the form:
 7 | 
 8 | .. code-block:: text
 9 | 
10 |     y = Ax + b
11 | 
12 | where:
13 | 
14 | - ``A`` is a tensor of shape [M, N] (stored distributed on PE memory).
15 | - ``x`` is a tensor of shape [N, 1].
16 |   It is placed in the memory of the northwesternmost PE before computation
17 |   begins, and then scattered using collective communications.
18 | - ``b`` is a tensor of shape [M, 1].
19 |   It is placed in the memory of the northwesternmost PE before computation
20 |   begins, and then scattered using collective communications.
21 | - ``y`` is the output tensor of shape [M, 1].
22 |   At the end of computation, it is located in the memory of
23 |   the southeasternmost PE.
24 | 
25 | For simplicity, we choose N as a multiple of the
26 | width of the kernel and M as a multiple of the height of the kernel.
27 | With the default compile parameters for this example,
28 | M = 32, N = 16 and we use a PE rectangle of size 4×4 for the kernel.
29 | The parameters specifying these values can be modified at compile time.
30 | 
31 | Note that this algorithm and the implementation are not optimized for
32 | performance. It is intended to serve as a non-trivial introductory example
33 | of the collectives library.
34 | 
35 | All computations are done in FP32 format.
36 | 
37 | The matrix ``A``, of shape [M, N],
38 | is distributed across the PE memories as follows:
39 | 
40 | - The first dimension of ``A``, M rows, is distributed across
41 |   the height of the kernel.
42 | - The second dimension of ``A``, N columns, is distributed across
43 |   the width of the kernel.
44 | 
45 | Since we know that M is 32 and the height of the kernel is 4, each PE will be
46 | assigned 32÷4 = 8 rows of ``A``.
47 | 
48 | Similarly, each PE will get 16÷4 = 4 columns of ``A``. This means each PE is
49 | assigned an 8×4 chunk of the original matrix ``A``.
50 | 


--------------------------------------------------------------------------------
/benchmarks/residual/README.rst:
--------------------------------------------------------------------------------
 1 | Residual
 2 | ========
 3 | 
 4 | This example shows a CSL program that uses a rectangle of 2-by-2 PEs to compute
 5 | ``|b - A * x|``, i.e., the norm of the residual ``b - A * x``.
 6 | 
 7 | ``A`` is an ``M x N`` matrix. Each PE computes a part of the ``A'*x``
 8 | multiplication, where ``A'`` is an ``M/2 x N/2`` matrix. In other words, each PE
 9 | essentially does "a fourth" of the multiplication. It then does a row reduction,
10 | so that the last column of PEs has the result ``b - A*x``. Finally, the PEs of
11 | the last column compute the norm, ``|b-A*x|``, via a column reduction.
12 | 
13 | The 2-by-2 rectangle is surrounded by memcpy infrastructure which occupies five
14 | columns of PEs, as shown below.
15 | The memcpy routes the input and output data between the host and the device.
16 | 
17 | .. _fig-residual-memcpy-2-by-2:
18 | 
19 | .. figure:: ./images/residual-memcpy-2-by-2.png
20 |     :align: center
21 |     :width: 980 px
22 | 
23 | The matrix ``A``, the input vectors ``x``  and ``b`` and the output scalar (the
24 | computed norm ``|b - A * x|``) are supported by memcpy streaming.
25 | 
26 | - The matrix ``A`` is distributed into the PEs.
27 |   For simplicity, the matrix dimensions ``M x N`` are assumed even.
28 | 
29 | - The vector ``x`` is distributed into the first row PEs.
30 |   The first row receives ``x`` from the memcpy, then
31 |   broadcasts ``x`` into other rows. The incoming vector ``x`` is distributed
32 |   across all N = 4 PEs along the top side of the rectangle.
33 | 
34 | - The vector ``b`` is distributed into rows of the first column.
35 |   The vector ``b`` is distributed across all M = 6 PEs
36 |   along the left side of the rectangle.
37 | 
38 | - The scalar ``nrm_r`` is sent out by the PE with coordinates ``pe_x=1`` and
39 |   ``pe_y=0``.
40 | 
41 | Three functions ``GEMV``, ``AXPY``, and ``NRMINF`` are defined separately, and
42 | are loaded by ``import_module``.  ``GEMV`` computes ``y = A*x``, ``AXPY``
43 | computes ``y = y + alpha*x`` and ``NRMINF`` computes the norm. ``SIMD`` operations
44 | are used in ``GEMV`` and ``AXPY`` to reduce the overhead of address computation.
45 | 


--------------------------------------------------------------------------------
/tutorials/sdklayout-04-h2d-d2h/add2vec.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | param size: u16;   // extent of every DSD below and length of the FIFO buffer
16 | param rx1: color;  // color of the first fabric input
17 | param rx2: color;  // color of the second fabric input
18 | param tx: color;   // color of the fabric output
19 | 
20 | const in_q1 = @get_input_queue(0);
21 | const in_q2 = @get_input_queue(1);
22 | const out_q = @get_output_queue(0);
23 | 
24 | const input1 = @get_dsd(fabin_dsd, .{.extent = size,   // fabric input on rx1 through in_q1
25 |                                      .fabric_color = rx1,
26 |                                      .input_queue = in_q1});
27 | 
28 | const input2 = @get_dsd(fabin_dsd, .{.extent = size,   // fabric input on rx2 through in_q2
29 |                                      .fabric_color = rx2,
30 |                                      .input_queue = in_q2});
31 | 
32 | const output = @get_dsd(fabout_dsd, .{.extent = size,  // fabric output on tx through out_q
33 |                                       .fabric_color = tx,
34 |                                       .output_queue = out_q});
35 | 
36 | // WSE3 does not allow multiple fabric inputs per DSD operation.
37 | // Therefore, we introduce a FIFO for portability between WSE2
38 | // and WSE3.
39 | var buffer: [size]u16; // backing storage for the FIFO
40 | const fifo = @allocate_fifo(buffer);
41 | const main_id = @get_local_task_id(8);
42 | task main() void { // adds the two input streams and sends the sum on the output
43 |   @mov16(fifo, input2, .{.async = true}); // stage input2 in the FIFO so the add has one fabric input
44 |   @add16(output, input1, fifo, .{.async = true}); // operands: input1 and the FIFO; destination: output
45 | }
46 | 
47 | comptime {
48 |   @bind_local_task(main, main_id); // main is bound to local task ID 8
49 |   @activate(main_id); // NOTE(review): presumably makes main run at program start -- confirm
50 | 
51 |   @initialize_queue(in_q1, .{.color = rx1}); // both input queues are initialized on every architecture
52 |   @initialize_queue(in_q2, .{.color = rx2});
53 | 
54 |   if (@is_arch("wse3")) { // the output queue is initialized only when targeting WSE3
55 |     @initialize_queue(out_q, .{.color = tx});
56 |   }
57 | }
58 | 


--------------------------------------------------------------------------------
/tutorials/sdklayout-03-ports-and-connections/add2vec.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | param size: u16;
16 | param rx1: color;
17 | param rx2: color;
18 | param tx: color;
19 | 
20 | const in_q1 = @get_input_queue(0);
21 | const in_q2 = @get_input_queue(1);
22 | const out_q = @get_output_queue(0);
23 | 
24 | const input1 = @get_dsd(fabin_dsd, .{.extent = size,
25 |                                      .fabric_color = rx1,
26 |                                      .input_queue = in_q1});
27 | 
28 | const input2 = @get_dsd(fabin_dsd, .{.extent = size,
29 |                                      .fabric_color = rx2,
30 |                                      .input_queue = in_q2});
31 | 
32 | const output = @get_dsd(fabout_dsd, .{.extent = size,
33 |                                       .fabric_color = tx,
34 |                                       .output_queue = out_q});
35 | 
36 | // WSE3 does not allow multiple fabric inputs per DSD operation.
37 | // Therefore, we introduce a FIFO for portability between WSE2
38 | // and WSE3.
39 | var buffer: [size]u16;
40 | const fifo = @allocate_fifo(buffer);
41 | const main_id = @get_local_task_id(8);
42 | task main() void {  // local task: adds the two incoming fabric streams
43 |   @mov16(fifo, input2, .{.async = true});  // stage the rx2 stream in the FIFO
44 |   @add16(output, input1, fifo, .{.async = true});  // output (tx) = input1 (rx1) + FIFO data
45 | }
46 | 
47 | comptime {
48 |   @bind_local_task(main, main_id);
49 |   @activate(main_id);
50 | 
51 |   @initialize_queue(in_q1, .{.color = rx1});
52 |   @initialize_queue(in_q2, .{.color = rx2});
53 | 
54 |   if (@is_arch("wse3")) {
55 |     @initialize_queue(out_q, .{.color = tx});
56 |   }
57 | }
58 | 


--------------------------------------------------------------------------------
/tutorials/pipeline-03-multiple/pe_program.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | param memcpy_params: comptime_struct;
16 | 
17 | param size: i16;
18 | 
19 | param Cin: color;
20 | param Cout: color;
21 | 
22 | // Queue IDs
23 | const Cin_iq: input_queue = @get_input_queue(2);
24 | const Cout_oq: output_queue = @get_output_queue(3);
25 | 
26 | // Task IDs
27 | const main_task_id: local_task_id = @get_local_task_id(8);
28 | 
29 | const sys_mod = @import_module("<memcpy/memcpy>", memcpy_params);
30 | 
31 | const Cin_in_dsd = @get_dsd(fabin_dsd, .{
32 |   .extent = size,
33 |   .fabric_color = Cin,
34 |   .input_queue = Cin_iq,
35 | });
36 | 
37 | const Cout_out_dsd = @get_dsd(fabout_dsd, .{
38 |   .extent = size,
39 |   .fabric_color = Cout,
40 |   .output_queue = Cout_oq,
41 | });
42 | 
43 | const buf = [1]i16{ 1 };
44 | const one_dsd = @get_dsd(mem1d_dsd, .{ .tensor_access = |i|{size} -> buf[0] });
45 | 
46 | task main_task() void {  // receives `size` values on Cin, adds one_dsd (buf[0] = 1), sends on Cout
47 |   @add16(Cout_out_dsd, Cin_in_dsd, one_dsd, .{ .async = true });
48 | }
49 | 
50 | comptime {
51 |   // activate local task main_task at startup
52 |   @bind_local_task(main_task, main_task_id);
53 |   @activate(main_task_id);
54 | 
55 |   @set_local_color_config(Cin, .{ .routes = .{ .rx = .{ WEST }, .tx = .{ RAMP }}});
56 |   @set_local_color_config(Cout, .{ .routes = .{ .rx = .{ RAMP }, .tx = .{ EAST }}});
57 | 
58 |   // On WSE-3, we must explicitly initialize input and output queues
59 |   if (@is_arch("wse3")) {
60 |     @initialize_queue(Cin_iq, .{ .color = Cin });
61 |     @initialize_queue(Cout_oq, .{ .color = Cout });
62 |   }
63 | }
64 | 


--------------------------------------------------------------------------------
/tutorials/topic-03-streaming-wavelet-data/pe_program.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | // Not a complete program; the top-level source file is layout.csl.
16 | 
17 | param memcpy_params: comptime_struct;
18 | 
19 | const sys_mod = @import_module("<memcpy/memcpy>", memcpy_params);
20 | 
21 | // Queue IDs
22 | const h2d_data_1_iq: input_queue = @get_input_queue(2);
23 | const d2h_data_1_oq: output_queue = @get_output_queue(3);
24 | 
25 | // Data task main_task triggered by wlts along MEMCPYH2D_DATA_1
26 | // On WSE-2, data task IDs are created from colors; on WSE-3, from input queues
27 | const main_task_id: data_task_id =
28 |   if      (@is_arch("wse2")) @get_data_task_id(sys_mod.MEMCPYH2D_1)
29 |   else if (@is_arch("wse3")) @get_data_task_id(h2d_data_1_iq);
30 | 
31 | export var global: i16 = 0;
32 | 
33 | const out_dsd = @get_dsd(fabout_dsd, .{
34 |    .extent = 1,
35 |    .fabric_color = sys_mod.MEMCPYD2H_1,
36 |    .output_queue = d2h_data_1_oq
37 | });
38 | 
39 | task main_task(wavelet_data: i16) void {
40 |   global = wavelet_data;  // keep the received value in the exported variable
41 |   // The non-async operation works here because only one wavelet is sent.
42 |   // It would be better to use an async operation with .{ .async = true }
43 |   @mov16(out_dsd, global);
44 | }
45 | 
46 | comptime {
47 |   @bind_data_task(main_task, main_task_id);
48 | 
49 |   // On WSE-3, we must explicitly initialize input and output queues
50 |   if (@is_arch("wse3")) {
51 |     @initialize_queue(h2d_data_1_iq, .{ .color = sys_mod.MEMCPYH2D_1 });
52 |     @initialize_queue(d2h_data_1_oq, .{ .color = sys_mod.MEMCPYD2H_1 });
53 |   }
54 | }
55 | 


--------------------------------------------------------------------------------
/tutorials/topic-15-wse3-microthreads/run.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env cs_python
 2 | 
 3 | # Copyright 2025 Cerebras Systems.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | 
18 | import argparse
19 | import numpy as np
20 | 
21 | from cerebras.sdk.runtime.sdkruntimepybind import SdkRuntime, MemcpyDataType, MemcpyOrder # pylint: disable=no-name-in-module
22 | 
23 | # Read arguments
24 | parser = argparse.ArgumentParser()
25 | parser.add_argument('--name', help="the test compile output dir")
26 | parser.add_argument('--cmaddr', help="IP:port for CS system")
27 | args = parser.parse_args()
28 | 
29 | M = 10
30 | y = np.arange(M, dtype=np.float32)
31 | y_expected = y
32 | 
33 | # Construct a runner using SdkRuntime
34 | runner = SdkRuntime(args.name, cmaddr=args.cmaddr)
35 | 
36 | # Get symbol for y on device
37 | y_symbol = runner.get_id('y')
38 | 
39 | # Load and run the program
40 | runner.load()
41 | runner.run()
42 | 
43 | 
44 | # Copy y into PE (0, 0)
45 | runner.memcpy_h2d(y_symbol, y, 0, 0, 1, 1, M, streaming=False,
46 |   order=MemcpyOrder.ROW_MAJOR, data_type=MemcpyDataType.MEMCPY_32BIT, nonblock=False)
47 | 
48 | # Launch the compute function on device
49 | runner.launch('compute', nonblock=False)
50 | 
51 | # Copy y back from PE (1, 0)
52 | y_result = np.zeros([M], dtype=np.float32)
53 | runner.memcpy_d2h(y_result, y_symbol, 1, 0, 1, 1, M, streaming=False,
54 |   order=MemcpyOrder.ROW_MAJOR, data_type=MemcpyDataType.MEMCPY_32BIT, nonblock=False)
55 | 
56 | # Stop the program
57 | runner.stop()
58 | 
59 | # Ensure that the result matches our expectation
60 | np.testing.assert_allclose(y_result, y_expected, atol=0.01, rtol=0)
61 | print("SUCCESS!")
62 | 


--------------------------------------------------------------------------------
/tutorials/topic-04-sparse-tensors/pe_program.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | // Not a complete program; the top-level source file is layout.csl.
16 | 
17 | param memcpy_params: comptime_struct;
18 | 
19 | const sys_mod = @import_module("<memcpy/memcpy>", memcpy_params);
20 | 
21 | // Queue IDs
22 | const h2d_data_1_iq: input_queue = @get_input_queue(2);
23 | const d2h_data_1_oq: output_queue = @get_output_queue(3);
24 | 
25 | // Data task main_task triggered by wlts along MEMCPYH2D_DATA_1
26 | // On WSE-2, data task IDs are created from colors; on WSE-3, from input queues
27 | const main_task_id: data_task_id =
28 |   if      (@is_arch("wse2")) @get_data_task_id(sys_mod.MEMCPYH2D_1)
29 |   else if (@is_arch("wse3")) @get_data_task_id(h2d_data_1_iq);
30 | 
31 | var result = [4]i16 { 0, 0, 0, 0 };
32 | 
33 | const out_dsd = @get_dsd(fabout_dsd, .{
34 |    .extent = 1,
35 |    .fabric_color = sys_mod.MEMCPYD2H_1,
36 |    .output_queue = d2h_data_1_oq
37 | });
38 | 
39 | task main_task(wavelet_data: i16, index: i16) void {
40 |   result[index] = wavelet_data;  // NOTE(review): no range check of index against the 4-element result array here
41 |   // The non-async operation works here because only two wavelets are sent.
42 |   // It would be better to use an async operation with .{ .async = true }
43 |   @mov16(out_dsd, wavelet_data);
44 | }
45 | 
46 | comptime {
47 |   @bind_data_task(main_task, main_task_id);
48 | 
49 |   // On WSE-3, we must explicitly initialize input and output queues
50 |   if (@is_arch("wse3")) {
51 |     @initialize_queue(h2d_data_1_iq, .{ .color = sys_mod.MEMCPYH2D_1 });
52 |     @initialize_queue(d2h_data_1_oq, .{ .color = sys_mod.MEMCPYD2H_1 });
53 |   }
54 | }
55 | 


--------------------------------------------------------------------------------
/tutorials/gemv-02-memory-dsds/run.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env cs_python
 2 | 
 3 | # Copyright 2025 Cerebras Systems.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | 
18 | import argparse
19 | import numpy as np
20 | 
21 | from cerebras.sdk.runtime.sdkruntimepybind import SdkRuntime, MemcpyDataType, MemcpyOrder # pylint: disable=no-name-in-module
22 | 
23 | # Read arguments
24 | parser = argparse.ArgumentParser()
25 | parser.add_argument('--name', help="the test compile output dir")
26 | parser.add_argument('--cmaddr', help="IP:port for CS system")
27 | args = parser.parse_args()
28 | 
29 | # Matrix dimensions
30 | M = 4
31 | N = 6
32 | 
33 | # Construct A, x, b
34 | A = np.arange(M*N, dtype=np.float32).reshape(M, N)
35 | x = np.full(shape=N, fill_value=1.0, dtype=np.float32)
36 | b = np.full(shape=M, fill_value=2.0, dtype=np.float32)
37 | 
38 | # Calculate expected y
39 | y_expected = A@x + b
40 | 
41 | # Construct a runner using SdkRuntime
42 | runner = SdkRuntime(args.name, cmaddr=args.cmaddr)
43 | 
44 | # Get symbol for copying y result off device
45 | y_symbol = runner.get_id('y')
46 | 
47 | # Load and run the program
48 | runner.load()
49 | runner.run()
50 | 
51 | # Launch the init_and_compute function on device
52 | runner.launch('init_and_compute', nonblock=False)
53 | 
54 | # Copy y back from device
55 | y_result = np.zeros([1*1*M], dtype=np.float32)
56 | runner.memcpy_d2h(y_result, y_symbol, 0, 0, 1, 1, M, streaming=False,
57 |   order=MemcpyOrder.ROW_MAJOR, data_type=MemcpyDataType.MEMCPY_32BIT, nonblock=False)
58 | 
59 | # Stop the program
60 | runner.stop()
61 | 
62 | # Ensure that the result matches our expectation
63 | np.testing.assert_allclose(y_result, y_expected, atol=0.01, rtol=0)
64 | print("SUCCESS!")
65 | 


--------------------------------------------------------------------------------
/tutorials/topic-15-wse3-microthreads/README.rst:
--------------------------------------------------------------------------------
 1 | Topic 15: WSE-3 Microthreads
 2 | ============================
 3 | 
 4 | Unlike WSE-2, the WSE-3 architecture exposes microthread IDs.
 5 | This example demonstrates the use of explicit microthread IDs
 6 | on the WSE-3 architecture.
 7 | 
 8 | On WSE-2, the queue ID of an input or output fabric DSD corresponds to the
 9 | ID of the microthread in which that operation executes.
10 | On WSE-3, queue IDs and microthreads can be decoupled, so that any
11 | microthread ID 0 to 7 can be used with any of queues 0 to 7.
12 | 
13 | In this example, the left PE sends ``M`` wavelets to the right PE over
14 | the color ``send_color``.
15 | These wavelets are sent in an asynchronous ``@fmovs`` operation which
16 | copies from the ``y`` array via ``y_dsd`` into ``out_dsd``.
17 | ``out_dsd`` is a ``fabout_dsd`` associated with the color ``send_color``,
18 | and the output queue with ID 2.
19 | The ``@fmovs`` operation is launched using microthread ID 4.
20 | 
21 | The right PE receives these ``M`` wavelets on the same color (called
22 | ``right_color`` in ``right_pe.csl``) via ``in_dsd``, which uses input
23 | queue with ID 2.
24 | The asynchronous ``@fmovs`` operation which receives these wavelets
25 | and copies them into ``y`` is launched using microthread ID 5.
26 | 
27 | Decoupling microthread IDs from queue IDs can provide valuable flexibility
28 | in managing program resource usage, and conserve microthreads.
29 | 
30 | By using explicit microthread IDs, we allow CSL's DSR allocator to use fewer
31 | DSRs in situations where fabric DSD operands are not known at compile time.
32 | 
33 | Additionally, on the WSE-3, output queues cannot be re-used with a different
34 | color if they have not yet been drained, and CSL does not yet support a
35 | mechanism for guaranteeing that a given queue is empty.
36 | This may force the programmer to use more output queues than needed, which in
37 | turn can lead to overusing microthread IDs (if they are not explicitly
38 | specified, they default to the respective queue IDs).
39 | By allowing explicit microthread IDs, a programmer can share microthreads
40 | between output queues, and thus conserve microthreads for other operations.
41 | Note, however, that two operations cannot concurrently use the same microthread.
42 | 


--------------------------------------------------------------------------------
/tutorials/sdklayout-05-gemv/mux.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | param size: u16;
16 | param in_color: color;
17 | param out_color: color;
18 | 
19 | const ctrl = @import_module("<control>");
20 | 
21 | const input_q = @get_input_queue(0);
22 | const output_q = @get_output_queue(1);
23 | 
24 | const inDSD = @get_dsd(fabin_dsd, .{.extent = size,
25 |                                     .fabric_color = in_color,
26 |                                     .input_queue = input_q});
27 | 
28 | const outDSD = @get_dsd(fabout_dsd, .{.extent = size,
29 |                                       .fabric_color = out_color,
30 |                                       .output_queue = output_q});
31 | 
32 | const ctrlOurDSD = @get_dsd(fabout_dsd, .{.extent = 1,
33 |                                           .fabric_color = out_color,
34 |                                           .output_queue = output_q,
35 |                                           .control = true});
36 | 
37 | const main_id = @get_local_task_id(8);
38 | task main() void {  // forwards `size` wavelets from in_color to out_color
39 |   @mov32(outDSD, inDSD, .{.async = true, .activate = send_ctrl});  // send_ctrl is activated once the async move completes
40 | }
41 | 
42 | // This task sends a control wavelet to self, in order to
43 | // advance the switch position.
44 | const send_ctrl_id = @get_local_task_id(9);
45 | task send_ctrl() void {
46 |   @mov32(ctrlOurDSD, ctrl.encode_single_payload(ctrl.opcode.SWITCH_ADV, true, {}, 0));
47 | }
48 | 
49 | comptime {
50 |   @bind_local_task(main, main_id);
51 |   @activate(main_id);
52 | 
53 |   @bind_local_task(send_ctrl, send_ctrl_id);
54 | 
55 |   @initialize_queue(input_q, .{.color = in_color});
56 |   if (@is_arch("wse3")) {
57 |     @initialize_queue(output_q, .{.color = out_color});
58 |   }
59 | }
60 | 


--------------------------------------------------------------------------------
/tutorials/pipeline-01-basic/README.rst:
--------------------------------------------------------------------------------
 1 | 
 2 | Pipeline 1: Redirect fabin to fabout
 3 | ====================================
 4 | 
 5 | While wavelet-triggered tasks enable us to receive and operate on one wavelet at
 6 | a time, the programmer may need a way to receive a tensor comprised of multiple
 7 | wavelets using one instruction.  This is enabled by fabric input DSDs.
 8 | Similarly, using fabric output DSDs, the programmer can send multiple wavelets
 9 | using one instruction.
10 | 
11 | This example illustrates two fabric DSDs, one for input and another for output.
12 | Each fabric DSD requires a corresponding color.
13 | 
14 | Crucially, when using a fabric input DSD, it is important that the programmer
15 | blocks the wavelet's color, as this example does for the color
16 | ``MEMCPYH2D_DATA_1``.
17 | Otherwise, wavelets of that color will attempt to activate the (empty) task
18 | associated with the color, which in turn will consume the wavelet before it can
19 | be consumed by the fabric input DSD.
20 | 
21 | This example only has a single PE, which receives data via H2D and sends it out
22 | via D2H in one vector operation. Logically speaking it is NOT valid because H2D
23 | and D2H are serialized. The host triggers D2H only if H2D is done. The hardware
24 | has some internal queues to hold the data for I/O, so H2D finishes when it
25 | pushes all data into the dedicated queues. This example still works if the size
26 | does not exceed the capacity of such queues. Otherwise H2D stalls.
27 | 
28 | The parameter ``size`` controls the number of wavelets of H2D and D2H. The
29 | program stalls when ``size`` exceeds 14.
30 | 
31 | This programming paradigm is called the pipelined approach: the kernel receives
32 | input data without storing it into memory, instead redirecting the result to
33 | the output. The microthread is necessary because the CE (compute engine) must
34 | have some resources to run ``memcpy`` kernel. The kernel stalls if a blocking
35 | instruction ``@add16(outDsd, inDsd, 1)`` is used. The simulation stalls, and
36 | the instruction trace shows ``@add16`` repeatedly querying data from input
37 | queue 1, which is still empty. The router receives the H2D command much later
38 | than running ``@add16``. The CE has no resource to run the H2D command received
39 | by the router, so it stalls.
40 | 


--------------------------------------------------------------------------------
/tutorials/pipeline-02-fifo/layout.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | // Color/ task ID map
16 | //
17 | //  ID var           ID var  ID var                ID var
18 | //   0 MEMCPYH2D_1    9  C1  18                    27 reserved (memcpy)
19 | //   1 MEMCPYD2H_1   10      19                    28 reserved (memcpy)
20 | //   2               11      20                    29 reserved
21 | //   3               12      21 reserved (memcpy)  30 reserved (memcpy)
22 | //   4               13      22 reserved (memcpy)  31 reserved
23 | //   5               14      23 reserved (memcpy)  32
24 | //   6               15      24                    33
25 | //   7               16      25                    34
26 | //   8 main_task_id  17      26                    35
27 | 
28 | // Number of elements sent through core program rectangle
29 | param size: i16;
30 | 
31 | param MEMCPYH2D_DATA_1_ID: i16;
32 | param MEMCPYD2H_DATA_1_ID: i16;
33 | 
34 | const MEMCPYH2D_DATA_1: color = @get_color(MEMCPYH2D_DATA_1_ID);
35 | const MEMCPYD2H_DATA_1: color = @get_color(MEMCPYD2H_DATA_1_ID);
36 | 
37 | const C1: color = @get_color(9);
38 | 
39 | const memcpy = @import_module("<memcpy/get_params>", .{
40 |   .width = 1,
41 |   .height = 1,
42 |   .MEMCPYH2D_1 = MEMCPYH2D_DATA_1,
43 |   .MEMCPYD2H_1 = MEMCPYD2H_DATA_1
44 | });
45 | 
46 | layout {
47 |   @set_rectangle(1, 1);
48 | 
49 |   @set_tile_code(0, 0, "pe_program.csl", .{
50 |     .memcpy_params = memcpy.get_params(0),
51 |     .size = size,
52 |     .C1 = C1
53 |   });
54 | 
55 |   // fifo sends out the data via C1 --> tx = RAMP
56 |   // add16 receives data via C1 --> rx = RAMP
57 |   @set_color_config(0, 0, C1, .{ .routes = .{ .rx = .{ RAMP }, .tx = .{ RAMP }}});
58 | }
59 | 


--------------------------------------------------------------------------------
/benchmarks/residual/axpy.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | // http://www.netlib.org/lapack/explore-html/d8/daf/saxpy_8f.html
16 | // SAXPY constant times a vector plus a vector.
17 | //     y = y + alpha*x
18 | //
19 | // @param[in] n      number of elements of the input vectors
20 | // @param[in] alpha  scalar
21 | // @param[in] x      array of dimension n
22 | //                   x[j] can be NAN or INF if alpha is zero
23 | // @param[in,out] y  array of dimension n
24 | 
25 | param sizeXY: i16;  // size of x and y, sizeXY >= n
26 | 
27 | // To change the base address and the length of a DSD, csl requires a dummy DSD.
28 | // The type here doesn't matter
29 | const dummy = @zeros([1]i16);
30 | // The length doesn't matter either since csl will overwrite it
31 | const dummy_dsd = @get_dsd(mem1d_dsd, .{ .tensor_access = |i|{42} -> dummy[i] });
32 | 
33 | fn saxpy(n: i16, alpha: f32, x: *[sizeXY]f32, y: *[sizeXY]f32) void {
34 |   // fast path: if alpha is zero, y is unchanged, so skip the DSD setup too
35 |   if (alpha == 0.0) {
36 |       return;
37 |   }
38 | 
39 |   // bind the first n elements of vector x to a DSD
40 |   var mem_x_buf_dsd = @set_dsd_base_addr(dummy_dsd, x);
41 |   mem_x_buf_dsd = @set_dsd_length(mem_x_buf_dsd, @as(u16, n));
42 | 
43 |   // bind vector y to a DSD
44 |   // it is based on mem_x_buf_dsd, so no need to set the length again
45 |   var mem_y_buf_dsd = @set_dsd_base_addr(mem_x_buf_dsd, y);
46 | 
47 |   // y[j] = y[j] + x[j]*alpha, j = 0,1,2,...,n-1
48 |   // The SIMD fmacs replaces the following for-loop
49 |   // ========
50 |   // var row : i16 = 0;
51 |   // while(row < n) : (row +=1) {
52 |   //     (y.*)[row] = (y.*)[row] + alpha * (x.*)[row];
53 |   // }
54 |   // ========
55 |   @fmacs(mem_y_buf_dsd, mem_y_buf_dsd, mem_x_buf_dsd, alpha);
56 | }
57 | 


--------------------------------------------------------------------------------
/tutorials/sdklayout-02-routing/send_receive.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | // Select sender (0) or receiver (1)
16 | param select: u16;
17 | param c: color;
18 | 
19 | const in_q = @get_input_queue(0);
20 | const out_q = @get_output_queue(1);
21 | 
22 | const mode = enum(u16) {send = 0, receive = 1};
23 | 
24 | // Buffer to be sent
25 | const size = 5;
26 | const data = [size]u16{1, 2, 3, 4, 5};
27 | 
28 | // Buffer to receive data
29 | export var buffer: [size]u16;
30 | 
31 | const dataDSD = @get_dsd(mem1d_dsd, .{.base_address = &data, .extent = size});
32 | const bufferDSD = @get_dsd(mem1d_dsd, .{.base_address = &buffer, .extent = size});
33 | 
34 | const inDSD = @get_dsd(fabin_dsd, .{.extent = size, .fabric_color = c, .input_queue = in_q});
35 | const outDSD = @get_dsd(fabout_dsd, .{.extent = size, .fabric_color = c, .output_queue = out_q});
36 | 
37 | // Sender task
38 | const send_task_id = @get_local_task_id(8);
39 | task send_task() void {
40 |     @mov16(outDSD, dataDSD, .{.async = true});
41 | }
42 | 
43 | // Receiver task
44 | const receive_task_id = @get_local_task_id(9);
45 | task receive_task() void {
46 |     @mov16(bufferDSD, inDSD, .{.async = true});
47 | }
48 | 
49 | const main_id = @get_local_task_id(10);
50 | task main() void {
51 |   // Select sender or receiver
52 |   switch(@as(mode, select)) {
53 |     mode.send => @activate(send_task_id),
54 |     mode.receive => @activate(receive_task_id)
55 |   }
56 | }
57 | 
58 | comptime {
59 |   @bind_local_task(send_task, send_task_id);
60 |   @bind_local_task(receive_task, receive_task_id);
61 |   @bind_local_task(main, main_id);
62 |   @activate(main_id);
63 | 
64 |   @initialize_queue(in_q, .{.color = c});
65 |   if (@is_arch("wse3")) {
66 |     @initialize_queue(out_q, .{.color = c});
67 |   }
68 | }
69 | 


--------------------------------------------------------------------------------
/tutorials/gemv-05-multiple-pes/pe_program.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | param memcpy_params: comptime_struct;
16 | 
17 | // Matrix dimensions
18 | param M: i16;
19 | param N: i16;
20 | 
21 | // memcpy module provides infrastructure for copying data
22 | // and launching functions from the host
23 | const sys_mod = @import_module("<memcpy/memcpy>", memcpy_params);
24 | 
25 | 
26 | // 48 kB of global memory contain A, x, b, y
27 | var A: [M*N]f32; // A is stored row major
28 | var x: [N]f32;
29 | var b: [M]f32;
30 | var y = @zeros([M]f32); // Initialize y to zero
31 | 
32 | // DSDs for accessing A, b, y
33 | // A_dsd accesses column of A
34 | var A_dsd = @get_dsd(mem1d_dsd, .{ .tensor_access = |i|{M} -> A[i*N] });
35 | var b_dsd = @get_dsd(mem1d_dsd, .{ .base_address = &b, .extent = M });
36 | var y_dsd = @get_dsd(mem1d_dsd, .{ .base_address = &y, .extent = M });
37 | 
38 | // ptrs to A, x, b, y will be advertised as symbols to host
39 | var A_ptr: [*]f32 = &A;
40 | var x_ptr: [*]f32 = &x;
41 | var b_ptr: [*]f32 = &b;
42 | const y_ptr: [*]f32 = &y;
43 | 
44 | // Compute gemv: accumulates A*x + b into y (y is zero-initialized at declaration)
45 | fn gemv() void {
46 |   // Loop over all columns of A
47 |   for (@range(i16, N)) |i| {
48 |     // Calculate contribution to A*x from ith column of A, ith elem of x
49 |     @fmacs(y_dsd, y_dsd, A_dsd, x[i]);
50 |     // Move A_dsd to next column of A. NOTE(review): A_dsd is module-level and never rewound, so gemv() looks single-use; confirm
51 |     A_dsd = @increment_dsd_offset(A_dsd, 1, f32);
52 |   }
53 |   // Add b to A*x
54 |   @fadds(y_dsd, y_dsd, b_dsd);
55 | }
56 | 
57 | // Run gemv, then unblock the memcpy command stream (no initialization happens here)
58 | fn compute() void {
59 |   gemv();
60 |   sys_mod.unblock_cmd_stream();
61 | }
62 | 
63 | comptime {
64 |   @export_symbol(A_ptr, "A");
65 |   @export_symbol(x_ptr, "x");
66 |   @export_symbol(b_ptr, "b");
67 |   @export_symbol(y_ptr, "y");
68 |   @export_symbol(compute);
69 | }
70 | 


--------------------------------------------------------------------------------
/tutorials/gemv-04-params/pe_program.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | param memcpy_params: comptime_struct;
16 | 
17 | // Matrix dimensions
18 | param M: i16;
19 | param N: i16;
20 | 
21 | // memcpy module provides infrastructure for copying data
22 | // and launching functions from the host
23 | const sys_mod = @import_module("<memcpy/memcpy>", memcpy_params);
24 | 
25 | 
26 | // 48 kB of global memory contain A, x, b, y
27 | var A: [M*N]f32; // A is stored row major
28 | var x: [N]f32;
29 | var b: [M]f32;
30 | var y = @zeros([M]f32); // Initialize y to zero
31 | 
32 | // DSDs for accessing A, b, y
33 | // A_dsd accesses column of A
34 | var A_dsd = @get_dsd(mem1d_dsd, .{ .tensor_access = |i|{M} -> A[i*N] });
35 | var b_dsd = @get_dsd(mem1d_dsd, .{ .base_address = &b, .extent = M });
36 | var y_dsd = @get_dsd(mem1d_dsd, .{ .base_address = &y, .extent = M });
37 | 
38 | // ptrs to A, x, b, y will be advertised as symbols to host
39 | var A_ptr: [*]f32 = &A;
40 | var x_ptr: [*]f32 = &x;
41 | var b_ptr: [*]f32 = &b;
42 | const y_ptr: [*]f32 = &y;
43 | 
44 | // Compute gemv: add A*x + b to the current contents of y, in place via y_dsd
45 | fn gemv() void {
46 |   // Loop over all columns of A
47 |   for (@range(i16, N)) |i| {
48 |     // Calculate contribution to A*x from ith column of A, ith elem of x
49 |     @fmacs(y_dsd, y_dsd, A_dsd, x[i]);
50 |     // Move A_dsd to next column of A (A_dsd is not rewound after the loop)
51 |     A_dsd = @increment_dsd_offset(A_dsd, 1, f32);
52 |   }
53 |   // Add b to A*x
54 |   @fadds(y_dsd, y_dsd, b_dsd);
55 | }
56 | 
57 | // Run gemv (no initialization happens here), then unblock memcpy's command stream
58 | fn init_and_compute() void {
59 |   gemv();
60 |   sys_mod.unblock_cmd_stream();
61 | }
62 | 
63 | comptime {
64 |   @export_symbol(A_ptr, "A"); // data pointers are exported under the names "A", "x", "b", "y"
65 |   @export_symbol(x_ptr, "x");
66 |   @export_symbol(b_ptr, "b");
67 |   @export_symbol(y_ptr, "y"); // y_ptr is the only pointer declared const
68 |   @export_symbol(init_and_compute); // function symbol, exported under its own name
69 | }
70 | 


--------------------------------------------------------------------------------
/tutorials/gemv-03-memcpy/pe_program.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | param memcpy_params: comptime_struct;
16 | 
17 | // memcpy module provides infrastructure for copying data
18 | // and launching functions from the host
19 | const sys_mod = @import_module("<memcpy/memcpy>", memcpy_params);
20 | 
21 | // Constants defining dimensions of our matrix
22 | const M: i16 = 4;
23 | const N: i16 = 6;
24 | 
25 | // 48 kB of global memory contain A, x, b, y
26 | var A: [M*N]f32; // A is stored row major
27 | var x: [N]f32;
28 | var b: [M]f32;
29 | var y = @zeros([M]f32); // Initialize y to zero
30 | 
31 | // DSDs for accessing A, b, y
32 | // A_dsd accesses column of A
33 | var A_dsd = @get_dsd(mem1d_dsd, .{ .tensor_access = |i|{M} -> A[i*N] });
34 | var b_dsd = @get_dsd(mem1d_dsd, .{ .base_address = &b, .extent = M });
35 | var y_dsd = @get_dsd(mem1d_dsd, .{ .base_address = &y, .extent = M });
36 | 
37 | // ptrs to A, x, b, y will be advertised as symbols to host
38 | var A_ptr: [*]f32 = &A;
39 | var x_ptr: [*]f32 = &x;
40 | var b_ptr: [*]f32 = &b;
41 | const y_ptr: [*]f32 = &y;
42 | 
43 | // Compute gemv: add A*x + b to the current contents of y, in place via y_dsd
44 | fn gemv() void {
45 |   // Loop over all columns of A
46 |   for (@range(i16, N)) |i| {
47 |     // Calculate contribution to A*x from ith column of A, ith elem of x
48 |     @fmacs(y_dsd, y_dsd, A_dsd, x[i]);
49 |     // Move A_dsd to next column of A (A_dsd is not rewound after the loop)
50 |     A_dsd = @increment_dsd_offset(A_dsd, 1, f32);
51 |   }
52 |   // Add b to A*x
53 |   @fadds(y_dsd, y_dsd, b_dsd);
54 | }
55 | 
56 | // Run gemv (no initialization happens here), then unblock memcpy's command stream
57 | fn init_and_compute() void {
58 |   gemv();
59 |   sys_mod.unblock_cmd_stream();
60 | }
61 | 
62 | comptime {
63 |   @export_symbol(A_ptr, "A"); // data pointers are exported under the names "A", "x", "b", "y"
64 |   @export_symbol(x_ptr, "x");
65 |   @export_symbol(b_ptr, "b");
66 |   @export_symbol(y_ptr, "y"); // y_ptr is the only pointer declared const
67 |   @export_symbol(init_and_compute); // function symbol, exported under its own name
68 | }
69 | 


--------------------------------------------------------------------------------
/tutorials/pipeline-03-multiple/memcpy_edge/d2h.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | // One streaming D2H:
16 | // 1st D2H: UT 4 and UT 7
17 | 
18 | param MEMCPYD2H_1 = {};
19 | 
20 | // Color along which we expect a wavelet
21 | param USER_OUT_1 = {};
22 | 
23 | param rxdir: direction;
24 | 
25 | // Queue IDs
26 | const USER_OUT_1_iq: input_queue = @get_input_queue(7);
27 | const d2h_oq: output_queue = @get_output_queue(4);
28 | 
29 | const max_fifo_len = 256*40; // maximum length of the fifo
30 | 
31 | var fifo1_buffer = @zeros([max_fifo_len]u32);
32 | const fifo1 = @allocate_fifo(fifo1_buffer);
33 | 
34 | const INFINITE_DSD_LEN: u16 = 0x7fff;
35 | 
36 | var fab_recv_wdsd = @get_dsd(fabin_dsd, .{
37 |   .extent = INFINITE_DSD_LEN,
38 |   .fabric_color = USER_OUT_1,
39 |   .input_queue = USER_OUT_1_iq
40 | });
41 | 
42 | var fab_trans_wdsd = @get_dsd(fabout_dsd, .{
43 |   .extent = INFINITE_DSD_LEN,
44 |   .fabric_color = MEMCPYD2H_1,
45 |   .output_queue = d2h_oq
46 | });
47 | 
48 | // f_startup() is empty unless both MEMCPYD2H_1 and USER_OUT_1 are set (non-void)
49 | fn f_startup() void {
50 |   if (!@is_same_type(@type_of(MEMCPYD2H_1), void) and !@is_same_type(@type_of(USER_OUT_1), void)) {
51 |     // receive data from USER_OUT_1 into fifo1 (asynchronous)
52 |     @mov32(fifo1, fab_recv_wdsd, .{ .async = true });
53 | 
54 |     // forward data from fifo1 to MEMCPYD2H_1 (asynchronous)
55 |     @mov32(fab_trans_wdsd, fifo1, .{ .async = true });
56 |   }
57 | }
58 | 
59 | comptime {
60 |   if (!@is_same_type(@type_of(USER_OUT_1), void)) { // skip all configuration when USER_OUT_1 is unset
61 |     @set_local_color_config(USER_OUT_1, .{ .routes = .{ .rx = .{ rxdir }, .tx = .{ RAMP }}});
62 |     // NOTE(review): only USER_OUT_1 is checked here, but d2h_oq below uses MEMCPYD2H_1; f_startup() checks both - confirm
63 |     // On WSE-3, we must explicitly initialize input and output queues
64 |     if (@is_arch("wse3")) {
65 |       @initialize_queue(d2h_oq, .{ .color = MEMCPYD2H_1 });
66 |       @initialize_queue(USER_OUT_1_iq, .{ .color = USER_OUT_1 });
67 |     }
68 |   }
69 | }
70 | 


--------------------------------------------------------------------------------
/tutorials/topic-14-color-swap/pe_program.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | param memcpy_params: comptime_struct;
16 | 
17 | const sys_mod = @import_module("<memcpy/memcpy>", memcpy_params);
18 | 
19 | // Colors
20 | param red:  color;
21 | param blue: color;
22 | 
23 | // Queue IDs
24 | const blue_oq: output_queue = @get_output_queue(2);
25 | 
26 | // Task IDs
27 | // Task ID for data task that recvs from memcpy
28 | const h2d_task_id: data_task_id = @get_data_task_id(sys_mod.MEMCPYH2D_1);
29 | 
30 | // Task ID for data task red, consumes red wlts
31 | const red_task_id: data_task_id = @get_data_task_id(red);
32 | 
33 | // Task ID for data task blue, consumes blue wlts
34 | const blue_task_id: data_task_id = @get_data_task_id(blue);
35 | 
36 | // Single-elem array to hold sum of received wlts
37 | var sum = @zeros([1]u32);
38 | var ptr_sum: [*]u32 = &sum;
39 | 
40 | // Data task bound to red_task_id: adds the received wavelet's value to sum[0]
41 | task red_task(in_data : u32) void {
42 |   sum[0] += in_data;
43 | }
44 | 
45 | // Data task bound to blue_task_id: adds twice the received wavelet's value to sum[0]
46 | task blue_task(in_data : u32) void {
47 |   sum[0] += in_data * 2;
48 | }
49 | 
50 | var buf = @zeros([1]u32);
51 | const buf_dsd = @get_dsd(mem1d_dsd, .{ .tensor_access = |i|{1} -> buf[i] });
52 | 
53 | // PEs 0, 2 activate blue task; 1, 3 activate red task.
54 | const out_dsd = @get_dsd(fabout_dsd, .{
55 |   .extent = 1,
56 |   .fabric_color = blue,
57 |   .output_queue = blue_oq
58 | });
59 | 
60 | // receive data from streaming H2D and forward it along out_dsd (fabric color blue)
61 | task wtt_h2d(data: u32) void {
62 |   @block(h2d_task_id); // blocked until the async move below finishes (.unblock)
63 |   buf[0] = data;
64 |   @mov32(out_dsd, buf_dsd, .{ .async = true, .unblock = h2d_task_id }); // 32-bit move: buf and data are u32
65 | }
66 | 
67 | comptime { // bind each data task to its task ID
68 |   @bind_data_task(red_task, red_task_id);
69 |   @bind_data_task(blue_task, blue_task_id);
70 |   @bind_data_task(wtt_h2d, h2d_task_id);
71 |   // export the pointer to the running sum under the symbol name "sum"
72 |   @export_symbol(ptr_sum, "sum");
73 | }
74 | 


--------------------------------------------------------------------------------
/tutorials/gemv-06-routes-1/README.rst:
--------------------------------------------------------------------------------
 1 | GEMV 6: Routes and Fabric DSDs, Part I
 2 | ======================================
 3 | 
 4 | Continuing from the previous example, we now break up a single GEMV
 5 | computation among two PEs.
 6 | 
 7 | The host program copies ``b`` into the ``y`` tensor of the left PE.
 8 | The left PE also gets the first ``N/2`` columns of ``A`` and the first ``N/2``
 9 | values of ``x``, and the right PE gets the last ``N/2`` columns of ``A``
10 | and last ``N/2`` values of ``x``.
11 | 
12 | The left and right PE both increment their local ``y`` tensors by computing
13 | their piece of ``Ax``.
14 | Then, the left PE sends its result to the right PE, which increments its ``y``
15 | tensor by the received values.
16 | 
17 | Last, the host copies ``y`` from the right PE, and checks that the result is
18 | correct.
19 | 
20 | To send data from the left PE to the right PE, we must specify a route, known
21 | as a color.
22 | In ``layout.csl``, ``@set_color_config`` specifies that on the left PE,
23 | color 0 will receive data, or wavelets, from the compute element (CE)
24 | up the RAMP, and transmit them to the EAST.
25 | On the right PE, color 0 will receive wavelets from the ``WEST``, and then
26 | transmit them down the RAMP to the CE.
27 | ``@set_tile_code`` passes the ID of this color to ``pe_program`` as a
28 | parameter named ``send_color``, and also sets a parameter called ``pe_id``,
29 | to differentiate whether the program is running on the left or the right PE.
30 | 
31 | The ``send_right`` function executed on the left PE defines a ``fabout_dsd``
32 | called ``out_dsd`` that sends ``M`` wavelets along the color route specified
33 | by ``send_color``.
34 | ``out_dsd`` is used as the destination operand of ``@fmovs``, and ``y_dsd``
35 | as the source operand.
36 | Thus, this operation sends the ``M`` elements accessed by ``y_dsd`` along the
37 | fabric as specified by ``out_dsd``.
38 | 
39 | The ``recv_left`` function executed on the right PE receives the data in a
40 | ``fabin_dsd`` named ``in_dsd``, used in an ``@fadds`` operation that
41 | increments the ``M`` elements of ``y`` on this PE by the ``M`` received values.
42 | 
43 | Note that this program also provides an example of a local task.
44 | The ``@fmovs`` and ``@fadds`` operations are performed asynchronously;
45 | when these operations are done, the color ``exit_color`` is activated, which
46 | activates the task ``exit_task``.
47 | This task unblocks ``memcpy``'s command stream, allowing additional commands
48 | from the host program to proceed.
49 | 


--------------------------------------------------------------------------------
/tutorials/gemv-01-complete-program/README.rst:
--------------------------------------------------------------------------------
 1 | GEMV 1: A Complete Program
 2 | ==========================
 3 | 
 4 | This example demonstrates a complete CSL program.
 5 | 
 6 | A complete program consists of a host program (a Python script, in this example)
 7 | and at least two CSL code files,
 8 | one of which defines the layout of the program across a collection of
 9 | processing elements (PEs) on the Wafer-Scale Engine (hereafter referred to
10 | as "device"),
11 | and one or more of which define the programs running on the individual PEs.
12 | In this example, there is just one PE.
13 | 
14 | When executing the program, the user first compiles the CSL code files, and
15 | then invokes the host program to copy data on and off the device and launch
16 | functions on the device using a remote procedure call (RPC) mechanism.
17 | The device used may be an actual CS system,
18 | or it may be simulated without access to an actual CS system using the
19 | Cerebras Fabric Simulator.
20 | 
21 | The host program here is defined in the ``run.py`` script, and the layout and
22 | device code are defined in ``layout.csl`` and ``pe_program.csl``.
23 | 
24 | The movement of data from host to device and back is done with memory to memory
25 | copy semantics, which is provided by an SDK utility called ``memcpy``.
26 | The top of the ``layout.csl`` file imports a module which is used to
27 | parameterize the program's ``memcpy`` infrastructure.
28 | This file also includes a layout block which specifies the number
29 | and spatial arrangement of PEs used by this program, as well as the instructions
30 | to execute on each PE.
31 | Here, we instruct the compiler to produce executable code for 1 PE using the
32 | code in ``pe_program.csl``.
33 | 
34 | This program executes as follows.
35 | The host code ``run.py`` uses the remote procedure call (RPC) mechanism to
36 | launch a function called ``init_and_compute`` on the device.
37 | This function initializes a 4 x 6 matrix ``A``, stored in row major format,
38 | a 6 x 1 vector ``x``, and a 4 x 1 vector ``b``.
39 | Then, it computes the matrix-vector product of ``Ax + b``
40 | and stores it in ``y``.
41 | 
42 | Once ``init_and_compute`` finishes on the device,
43 | the host program performs a device-to-host memcpy with
44 | the ``memcpy_d2h`` command to copy back the result stored in ``y``,
45 | and then checks that the answer is correct.
46 | Notice the ``unblock_cmd_stream`` call in ``pe_program.csl`` that occurs
47 | at the end of ``init_and_compute``;
48 | this call allows the device-to-host ``memcpy_d2h`` to proceed.
49 | 


--------------------------------------------------------------------------------
/benchmarks/bandwidth-test/src/sync/layout.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | 
16 | param colors:[5]color;
17 | param entrypoints:[4]local_task_id;
18 | param width : i16 ;   // width of the core
19 | param height: i16 ;   // height of the core
20 | 
21 | const C0 : color = colors[0];
22 | const C1 : color = colors[1];
23 | const C2 : color = colors[2];
24 | const C3 : color = colors[3];
25 | const C4 : color = colors[4];
26 | 
27 | const STARTUP: local_task_id = entrypoints[0];
28 | const SYNC_Y: local_task_id = entrypoints[1];
29 | const SYNC_BCAST: local_task_id = entrypoints[2];
30 | const EXIT: local_task_id = entrypoints[3];
31 | 
32 | fn get_params(px:i16, py:i16) comptime_struct {
33 |     // Build the parameter struct for the PE at (px, py); px runs along width, py along height.
34 |     var first_py: bool = (0 == py);
35 |     var last_py: bool = ((height-1) == py);
36 |     var is_py_even: bool = (0 == (py % 2));
37 | 
38 |     var first_px: bool = (0 == px);
39 |     var last_px: bool = ((width-1) == px);
40 |     var is_px_even: bool = (0 == (px % 2));
41 |     // Even px: receive on C0, send on C1; odd px: the reverse.
42 |     var c_recv_px: color = C0;
43 |     var c_send_px: color = C1;
44 |     if (is_px_even){
45 |         c_recv_px = C0;
46 |         c_send_px = C1;
47 |     }else{
48 |         c_recv_px = C1;
49 |         c_send_px = C0;
50 |     }
51 |     // Even py: receive on C2, send on C3; odd py: the reverse.
52 |     var c_recv_py: color = C2;
53 |     var c_send_py: color = C3;
54 |     if (is_py_even){
55 |         c_recv_py = C2;
56 |         c_send_py = C3;
57 |     }else{
58 |         c_recv_py = C3;
59 |         c_send_py = C2;
60 |     }
61 |     // Return the selected colors, the task IDs (passed through unchanged) and the edge flags.
62 |     return .{
63 |         .c_recv_px = c_recv_px,
64 |         .c_send_px = c_send_px,
65 |         .c_recv_py = c_recv_py,
66 |         .c_send_py = c_send_py,
67 |         .c_bcast = C4,
68 | 
69 |         .STARTUP = STARTUP,
70 |         .SYNC_Y = SYNC_Y,
71 |         .SYNC_BCAST = SYNC_BCAST,
72 |         .EXIT = EXIT,
73 | 
74 |         .first_px = first_px,
75 |         .last_px = last_px,
76 |         .first_py = first_py,
77 |         .last_py = last_py,
78 |     };
79 | }
80 | 


--------------------------------------------------------------------------------
/benchmarks/row-col-broadcast/src/sync/layout.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | 
16 | param colors:[5]color;
17 | param entrypoints:[4]local_task_id;
18 | param width : i16 ;   // width of the core
19 | param height: i16 ;   // height of the core
20 | 
21 | const C0 : color = colors[0];
22 | const C1 : color = colors[1];
23 | const C2 : color = colors[2];
24 | const C3 : color = colors[3];
25 | const C4 : color = colors[4];
26 | 
27 | const STARTUP: local_task_id = entrypoints[0];
28 | const SYNC_Y: local_task_id = entrypoints[1];
29 | const SYNC_BCAST: local_task_id = entrypoints[2];
30 | const EXIT: local_task_id = entrypoints[3];
31 | 
32 | fn get_params(px:i16, py:i16) comptime_struct {
33 |     // Build the parameter struct for the PE at (px, py); px runs along width, py along height.
34 |     var first_py: bool = (0 == py);
35 |     var last_py: bool = ((height-1) == py);
36 |     var is_py_even: bool = (0 == (py % 2));
37 | 
38 |     var first_px: bool = (0 == px);
39 |     var last_px: bool = ((width-1) == px);
40 |     var is_px_even: bool = (0 == (px % 2));
41 |     // Even px: receive on C0, send on C1; odd px: the reverse.
42 |     var c_recv_px: color = C0;
43 |     var c_send_px: color = C1;
44 |     if (is_px_even){
45 |         c_recv_px = C0;
46 |         c_send_px = C1;
47 |     }else{
48 |         c_recv_px = C1;
49 |         c_send_px = C0;
50 |     }
51 |     // Even py: receive on C2, send on C3; odd py: the reverse.
52 |     var c_recv_py: color = C2;
53 |     var c_send_py: color = C3;
54 |     if (is_py_even){
55 |         c_recv_py = C2;
56 |         c_send_py = C3;
57 |     }else{
58 |         c_recv_py = C3;
59 |         c_send_py = C2;
60 |     }
61 |     // Return the selected colors, the task IDs (passed through unchanged) and the edge flags.
62 |     return .{
63 |         .c_recv_px = c_recv_px,
64 |         .c_send_px = c_send_px,
65 |         .c_recv_py = c_recv_py,
66 |         .c_send_py = c_send_py,
67 |         .c_bcast = C4,
68 | 
69 |         .STARTUP = STARTUP,
70 |         .SYNC_Y = SYNC_Y,
71 |         .SYNC_BCAST = SYNC_BCAST,
72 |         .EXIT = EXIT,
73 | 
74 |         .first_px = first_px,
75 |         .last_px = last_px,
76 |         .first_py = first_py,
77 |         .last_py = last_py,
78 |     };
79 | }
80 | 


--------------------------------------------------------------------------------
/benchmarks/spmv-hypersparse/README.rst:
--------------------------------------------------------------------------------
 1 | Hypersparse SpMV
 2 | ================
 3 | 
 4 | This example evaluates the performance of sparse matrix-vector multiplication.
 5 | The kernel records the ``start`` and ``end`` of ``spmv`` by tsc counter. In
 6 | addition the tsc counters of all PEs are not synchronized in the beginning.
 7 | To avoid the timing variation among those PEs, ``f_sync()`` synchronizes all
 8 | PEs and samples the reference clock.
 9 | 
10 | The kernel ``kernel.csl`` defines a couple of host-callable functions,
11 | ``f_sync()``, ``f_tic()`` and ``f_toc()`` in order to synchronize the PEs and
12 | record the timing of ``spmv``.
13 | 
14 | The kernel ``allreduce2R1E/pe.csl`` performs a reduction over the whole
15 | rectangle to synchronize the PEs, then the bottom-right PE sends a signal to
16 | other PEs to sample the reference clock. The ``allreduce2R1E`` is a variant of
17 | ``allreduce`` in ``stencil-3d-7pts``. The former uses 2 routable colors and
18 | 1 entrypoint, the latter uses 1 routable color and 4 entrypoints.
19 | ``allreduce2R1E`` is designed for spmv kernel which only has three unused
20 | colors.
21 | 
22 | The kernel ``hypersparse_spmv/pe.csl`` performs a matrix-vector product (spmv)
23 | where the matrix ``A`` is hypersparse, partitioned into 2D grids. The input
24 | vector ``x`` and output vector ``y`` are also distributed into 2D grids.
25 | 
26 | The user has to provide the matrix ``A`` in Matrix Market File format with
27 | 1-based index. To obtain the best performance, the user may need to reorder the
28 | matrix such that the variation of the nonzeros of each partition is small. One
29 | option is ``util/analyze.cpp`` which provides a load balancing algorithm.
30 | 
31 | The script ``run.py`` has the following parameters:
32 | 
33 | - ``--infile_mtx=<path to mtx file>`` contains the sparse matrix A
34 | 
35 | - ``--num_pe_rows=<int>`` specifies the height of the core rectangle
36 | 
37 | - ``--num_pe_cols=<int>`` specifies the width of the core rectangle
38 | 
39 | - ``--channels=<int>`` specifies the number of I/O channels, no bigger than 16.
40 | 
41 | The ``tic()`` samples "time_start" and ``toc()`` samples "time_end". The
42 | ``sync()`` samples "time_ref" which is used to adjust "time_start" and
43 | "time_end". The elapsed time (unit: cycles) is measured by
44 | ``cycles_send = max(time_end) - min(time_start)``
45 | 
46 | The overall runtime (us) is computed via the following formula
47 | ``time_send = (cycles_send / 0.85) * 1.e-3 us``
48 | 
49 | The bandwidth is calculated by
50 | ``bandwidth = ((2*nnz+m)*4)/time_send``
51 | 


--------------------------------------------------------------------------------
/tutorials/pipeline-01-basic/pe_program.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | param memcpy_params: comptime_struct;
16 | 
17 | // number of elements received from host
18 | param size: i16;
19 | 
20 | const sys_mod = @import_module("<memcpy/memcpy>", memcpy_params);
21 | 
22 | // Queues
23 | const h2d_1_iq: input_queue = @get_input_queue(2);
24 | const d2h_1_oq: output_queue = @get_output_queue(3);
25 | 
26 | // Task IDs
27 | const main_task_id: local_task_id = @get_local_task_id(8);
28 | 
29 | const in_dsd = @get_dsd(fabin_dsd, .{
30 |   .extent = size,
31 |   .fabric_color = sys_mod.MEMCPYH2D_1,
32 |   .input_queue = h2d_1_iq
33 | });
34 | 
35 | const out_dsd = @get_dsd(fabout_dsd, .{
36 |   .extent = size,
37 |   .fabric_color = sys_mod.MEMCPYD2H_1,
38 |   .output_queue = d2h_1_oq
39 | });
40 | 
41 | var buf = @zeros([1]i16);
42 | const one_dsd = @get_dsd(mem1d_dsd, .{ .tensor_access = |i|{size} -> buf[0] });
43 | 
44 | task main_task() void {
45 |   // WARNING: large size can stall.
46 |   // H2D and D2H are serialized. It is NOT safe to run "send" and "recv"
47 |   // involving memcpy at the same time on the same PE.
48 |   //
49 |   // It only works for a small vector because the HW has some internal
50 |   // queues to hold those values from/to IO. If such queues are full,
51 |   // I/O stalls.
52 |   //
53 |   // In this case, if the length exceeds certain amount,
54 |   // H2D cannot finish and D2H has no chance to run.
55 |   // Add 1 (buf[0], read `size` times through one_dsd) to each value from in_dsd; results go to out_dsd.
56 |   buf[0] = @as(i16, 1);
57 |   @add16(out_dsd, in_dsd, one_dsd, .{ .async = true });
58 | }
59 | 
60 | comptime {
61 |   // bind main_task to main_task_id and activate it at startup
62 |   @activate(main_task_id);
63 |   @bind_local_task(main_task, main_task_id);
64 | 
65 |   // On WSE-3, we must explicitly initialize input and output queues
66 |   if (@is_arch("wse3")) {
67 |     @initialize_queue(h2d_1_iq,  .{ .color = sys_mod.MEMCPYH2D_1 });
68 |     @initialize_queue(d2h_1_oq,  .{ .color = sys_mod.MEMCPYD2H_1 });
69 |   }
70 | }
71 | 


--------------------------------------------------------------------------------
/tutorials/gemv-01-complete-program/pe_program.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | // Struct containing parameters for memcpy layout
16 | param memcpy_params: comptime_struct;
17 | 
18 | // memcpy module provides infrastructure for copying data
19 | // and launching functions from the host
20 | const sys_mod = @import_module("<memcpy/memcpy>", memcpy_params);
21 | 
22 | // Constants defining dimensions of our matrix
23 | const M: i16 = 4;
24 | const N: i16 = 6;
25 | 
26 | // 48 kB of global memory contain A, x, b, y
27 | var A: [M*N]f32; // A is stored row major
28 | var x: [N]f32;
29 | var b: [M]f32;
30 | var y: [M]f32;
31 | 
32 | // Ptr to y will be exported as symbol to host
33 | // Ptr is const, so host can read but not write to y
34 | const y_ptr: [*]f32 = &y;
35 | 
36 | // Initialize A (A[idx] = idx), x (all 1.0), b (all 2.0) and y (all 0.0)
37 | fn initialize() void {
38 |   // for loop with range syntax
39 |   for (@range(i16, M*N)) |idx| {
40 |     A[idx] = @as(f32, idx);
41 |   }
42 | 
43 |   for (@range(i16, N)) |j| {
44 |     x[j] = 1.0;
45 |   }
46 | 
47 |   // while loop with iterator syntax
48 |   var i: i16 = 0;
49 |   while (i < M) : (i += 1) {
50 |     b[i] = 2.0;
51 |     y[i] = 0.0;
52 |   }
53 | }
54 | 
55 | // Compute gemv: y = A*x + b, one row of A at a time
56 | fn gemv() void {
57 |   for (@range(i16, M)) |i| {
58 |     var tmp: f32 = 0.0; // dot product of row i of A with x
59 |     for (@range(i16, N)) |j| {
60 |       tmp += A[i*N + j] * x[j]; // row-major indexing into A
61 |     }
62 |     y[i] = tmp + b[i];
63 |   }
64 | }
65 | 
66 | // Call initialize and gemv functions, then unblock memcpy's command stream
67 | fn init_and_compute() void {
68 |   initialize();
69 |   gemv();
70 | 
71 |   // After this function finishes, memcpy's cmd_stream must
72 |   // be unblocked on all PEs for further memcpy commands
73 |   // to execute
74 |   sys_mod.unblock_cmd_stream();
75 | }
76 | 
77 | comptime {
78 |   // Export symbol pointing to y (under the name "y") so it is host-readable
79 |   @export_symbol(y_ptr, "y");
80 | 
81 |   // Export function so it is host-callable by RPC mechanism
82 |   @export_symbol(init_and_compute);
83 | }
84 | 


--------------------------------------------------------------------------------
/tutorials/gemv-07-routes-2/layout.csl:
--------------------------------------------------------------------------------
 1 | // Copyright 2025 Cerebras Systems.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | // total matrix dimensions
16 | param M: i16;
17 | param N: i16;
18 | 
19 | // Colors
20 | const ax_color: color = @get_color(0); // sends/recvs partial result Ax EAST
21 | const x_color:  color = @get_color(1); // sends/recvs elems x
22 | 
23 | // This example uses 2x2 PEs
24 | const memcpy = @import_module("<memcpy/get_params>", .{
25 |   .width = 2,
26 |   .height = 2
27 | });
28 | 
29 | layout {
30 |   // PE coordinates are (column, row)
31 |   @set_rectangle(2, 2);
32 | 
33 |   for (@range(i16, 2)) |pe_x| {
34 |     for (@range(i16, 2)) |pe_y| {
35 |       @set_tile_code(pe_x, pe_y, "pe_program.csl", .{
36 |         .memcpy_params = memcpy.get_params(pe_x),
37 |         .M_per_PE = M / 2, // NOTE(review): presumably M and N are expected to be even - confirm
38 |         .N_per_PE = N / 2,
39 |         .ax_color = ax_color,
40 |         .x_color = x_color
41 |       });
42 |     }
43 |   }
44 | 
45 |   // Top left PE (0, 0): ax_color RAMP -> EAST; x_color RAMP -> RAMP and SOUTH
46 |   @set_color_config(0, 0, ax_color, .{.routes = .{ .rx = .{RAMP}, .tx = .{EAST}  }});
47 |   @set_color_config(0, 0, x_color,  .{.routes = .{ .rx = .{RAMP}, .tx = .{RAMP, SOUTH} }});
48 | 
49 |   // Top right PE (1, 0): ax_color WEST -> RAMP; x_color RAMP -> RAMP and SOUTH
50 |   @set_color_config(1, 0, ax_color, .{.routes = .{ .rx = .{WEST},  .tx = .{RAMP} }});
51 |   @set_color_config(1, 0, x_color,  .{.routes = .{ .rx = .{RAMP}, .tx = .{RAMP, SOUTH} }});
52 | 
53 |   // Bottom left PE (0, 1): ax_color RAMP -> EAST; x_color NORTH -> RAMP
54 |   @set_color_config(0, 1, ax_color, .{.routes = .{ .rx = .{RAMP}, .tx = .{EAST} }});
55 |   @set_color_config(0, 1, x_color,  .{.routes = .{ .rx = .{NORTH}, .tx = .{RAMP} }});
56 | 
57 |   // Bottom right PE (1, 1): ax_color WEST -> RAMP; x_color NORTH -> RAMP
58 |   @set_color_config(1, 1, ax_color, .{.routes = .{ .rx = .{WEST}, .tx = .{RAMP} }});
59 |   @set_color_config(1, 1, x_color,  .{.routes = .{ .rx = .{NORTH}, .tx = .{RAMP} }});
60 | 
61 |   // export symbol names: three f32 data pointers and the compute function
62 |   @export_name("A", [*]f32, true);
63 |   @export_name("x", [*]f32, true);
64 |   @export_name("y", [*]f32, true);
65 |   @export_name("compute", fn()void);
66 | }
67 | 


--------------------------------------------------------------------------------