├── tutorials ├── sdklayout-05-gemv │ ├── commands_wse2.sh │ ├── commands_wse3.sh │ ├── gemv-4-by-4.png │ └── mux.csl ├── sdklayout-02-routing │ ├── commands_wse2.sh │ ├── commands_wse3.sh │ ├── README.rst │ └── send_receive.csl ├── sdklayout-04-h2d-d2h │ ├── commands_wse2.sh │ ├── commands_wse3.sh │ ├── README.rst │ └── add2vec.csl ├── sdklayout-01-introduction │ ├── commands_wse2.sh │ ├── commands_wse3.sh │ ├── gv.csl │ └── README.rst ├── sdklayout-03-ports-and-connections │ ├── commands_wse2.sh │ ├── commands_wse3.sh │ ├── README.rst │ ├── receiver.csl │ ├── sender.csl │ └── add2vec.csl ├── gemv-03-memcpy │ ├── commands_wse2.sh │ ├── commands_wse3.sh │ ├── README.rst │ ├── layout.csl │ └── pe_program.csl ├── gemv-02-memory-dsds │ ├── commands_wse2.sh │ ├── commands_wse3.sh │ ├── layout.csl │ ├── README.rst │ └── run.py ├── gemv-01-complete-program │ ├── commands_wse2.sh │ ├── commands_wse3.sh │ ├── layout.csl │ ├── README.rst │ └── pe_program.csl ├── gemv-04-params │ ├── commands_wse2.sh │ ├── commands_wse3.sh │ ├── README.rst │ ├── layout.csl │ └── pe_program.csl ├── topic-15-wse3-microthreads │ ├── commands_wse3.sh │ ├── layout.csl │ ├── right_pe.csl │ ├── left_pe.csl │ ├── run.py │ └── README.rst ├── gemv-06-routes-1 │ ├── commands_wse2.sh │ ├── commands_wse3.sh │ ├── layout.csl │ └── README.rst ├── gemv-07-routes-2 │ ├── commands_wse2.sh │ ├── commands_wse3.sh │ └── layout.csl ├── gemv-05-multiple-pes │ ├── commands_wse2.sh │ ├── commands_wse3.sh │ ├── README.rst │ ├── layout.csl │ └── pe_program.csl ├── topic-06-switches │ ├── commands_wse2.sh │ ├── commands_wse3.sh │ ├── empty.csl │ └── README.rst ├── topic-08-filters │ ├── commands_wse2.sh │ ├── commands_wse3.sh │ ├── README.rst │ └── run.py ├── topic-01-arrays-and-pointers │ ├── commands_wse2.sh │ ├── commands_wse3.sh │ ├── README.rst │ ├── pe_program.csl │ ├── layout.csl │ └── run.py ├── topic-07-switches-entrypt │ ├── commands_wse2.sh │ ├── commands_wse3.sh │ ├── empty.csl │ └── README.rst ├── 
gemv-08-routes-3 │ ├── commands_wse2.sh │ └── commands_wse3.sh ├── topic-10-map-builtin │ ├── commands_wse2.sh │ ├── commands_wse3.sh │ ├── README.rst │ └── layout.csl ├── topic-13-simprint │ ├── commands_wse2.sh │ └── commands_wse3.sh ├── topic-11-collectives │ ├── commands_wse2.sh │ ├── commands_wse3.sh │ └── README.rst ├── topic-12-debug-library │ ├── commands_wse2.sh │ ├── commands_wse3.sh │ └── README.rst ├── topic-02-libraries │ ├── commands_wse2.sh │ ├── commands_wse3.sh │ ├── README.rst │ └── layout.csl ├── topic-14-color-swap │ ├── commands_wse2.sh │ ├── README.rst │ └── pe_program.csl ├── topic-04-sparse-tensors │ ├── commands_wse2.sh │ ├── commands_wse3.sh │ ├── README.rst │ └── pe_program.csl ├── topic-03-streaming-wavelet-data │ ├── commands_wse2.sh │ ├── commands_wse3.sh │ ├── README.rst │ └── pe_program.csl ├── pipeline-01-basic │ ├── commands_wse2.sh │ ├── commands_wse3.sh │ ├── layout.csl │ ├── README.rst │ └── pe_program.csl ├── pipeline-02-fifo │ ├── commands_wse2.sh │ ├── commands_wse3.sh │ ├── README.rst │ └── layout.csl ├── pipeline-03-multiple │ ├── commands_wse2.sh │ ├── commands_wse3.sh │ ├── memcpy_edge │ │ ├── east.csl │ │ ├── north.csl │ │ ├── south.csl │ │ ├── west.csl │ │ └── d2h.csl │ ├── README.rst │ └── pe_program.csl ├── topic-09-fifos │ ├── commands_wse2.sh │ ├── commands_wse3.sh │ └── README.rst ├── gemv-09-streaming │ ├── commands_wse2.sh │ ├── commands_wse3.sh │ └── README.rst ├── topic-05-sentinels │ ├── commands_wse2.sh │ ├── commands_wse3.sh │ └── README.rst └── gemv-00-basic-syntax │ ├── README.rst │ └── code.csl └── benchmarks ├── residual ├── images │ └── residual-memcpy-2-by-2.png ├── commands_wse2.sh ├── commands_wse3.sh ├── nrminf.csl ├── README.rst └── axpy.csl ├── gemv-checkerboard-pattern ├── images │ └── gemv-4-by-4.png ├── commands_wse2.sh ├── commands_wse3.sh └── README.rst ├── gemm-collectives_2d ├── commands_wse2.sh ├── commands_wse3.sh └── README.rst ├── cholesky ├── commands_wse2.sh ├── commands_wse3.sh └── 
launch.csl ├── game-of-life ├── commands_wse2.sh ├── commands_wse3.sh └── README.rst ├── mandelbrot ├── commands_wse2.sh ├── commands_wse3.sh ├── common.csl └── README.rst ├── gemv-collectives_2d ├── commands_wse2.sh ├── commands_wse3.sh └── README.rst ├── single-tile-matvec ├── commands_wse2.sh ├── commands_wse3.sh ├── compile.appliance.py └── src │ └── layout_matvec.csl ├── wide-multiplication ├── commands_wse2.sh ├── commands_wse3.sh └── README.rst ├── 25-pt-stencil ├── commands_wse2.sh ├── nop.csl ├── ic.py └── util.csl ├── histogram-torus └── commands_wse2.sh ├── row-col-broadcast ├── commands_wse2.sh ├── commands_wse3.sh ├── README.rst └── src │ └── sync │ └── layout.csl ├── fft-1d-2d ├── commands_wse2.sh ├── commands_wse3.sh ├── reshape.csl └── ucode_1d.csl ├── bandwidth-test ├── commands_wse2.sh ├── commands_wse3.sh ├── README.rst └── src │ └── sync │ └── layout.csl ├── fft-3d ├── commands_wse3.sh ├── layout.csl └── README.rst ├── 7pt-stencil-spmv ├── commands_wse2.sh ├── commands_wse3.sh └── README.rst ├── bicgstab ├── commands_wse2.sh └── commands_wse3.sh ├── power-method ├── commands_wse2.sh ├── commands_wse3.sh └── power_method.py ├── conjugate-gradient ├── commands_wse2.sh └── commands_wse3.sh ├── preconditioned-conjugate-gradient ├── commands_wse2.sh └── commands_wse3.sh ├── spmv-hypersparse ├── commands_wse2.sh ├── data │ └── rmat4.4x4.lb.mtx ├── src │ └── allreduce2R1E │ │ └── layout.csl └── README.rst └── benchmark-libs └── allreduce └── layout.csl /tutorials/sdklayout-05-gemv/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cs_python run.py --arch=wse2 6 | -------------------------------------------------------------------------------- /tutorials/sdklayout-05-gemv/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cs_python run.py --arch=wse3 6 | 
-------------------------------------------------------------------------------- /tutorials/sdklayout-02-routing/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cs_python run.py --arch=wse2 6 | -------------------------------------------------------------------------------- /tutorials/sdklayout-02-routing/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cs_python run.py --arch=wse3 6 | -------------------------------------------------------------------------------- /tutorials/sdklayout-04-h2d-d2h/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cs_python run.py --arch=wse2 6 | -------------------------------------------------------------------------------- /tutorials/sdklayout-04-h2d-d2h/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cs_python run.py --arch=wse3 6 | -------------------------------------------------------------------------------- /tutorials/sdklayout-01-introduction/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cs_python run.py --arch=wse2 6 | -------------------------------------------------------------------------------- /tutorials/sdklayout-01-introduction/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cs_python run.py --arch=wse3 6 | -------------------------------------------------------------------------------- /tutorials/sdklayout-03-ports-and-connections/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cs_python run.py --arch=wse2 6 | -------------------------------------------------------------------------------- /tutorials/sdklayout-03-ports-and-connections/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cs_python run.py --arch=wse3 6 | -------------------------------------------------------------------------------- /tutorials/sdklayout-05-gemv/gemv-4-by-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cerebras/csl-examples/HEAD/tutorials/sdklayout-05-gemv/gemv-4-by-4.png -------------------------------------------------------------------------------- /benchmarks/residual/images/residual-memcpy-2-by-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cerebras/csl-examples/HEAD/benchmarks/residual/images/residual-memcpy-2-by-2.png -------------------------------------------------------------------------------- /benchmarks/gemv-checkerboard-pattern/images/gemv-4-by-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Cerebras/csl-examples/HEAD/benchmarks/gemv-checkerboard-pattern/images/gemv-4-by-4.png -------------------------------------------------------------------------------- /tutorials/gemv-03-memcpy/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=8,3 \ 6 | --fabric-offsets=4,1 -o out --memcpy --channels 1 7 | cs_python run.py --name out 8 | -------------------------------------------------------------------------------- /tutorials/gemv-03-memcpy/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 
3 | set -e 4 | 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=8,3 \ 6 | --fabric-offsets=4,1 -o out --memcpy --channels 1 7 | cs_python run.py --name out 8 | -------------------------------------------------------------------------------- /tutorials/gemv-02-memory-dsds/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=8,3 \ 6 | --fabric-offsets=4,1 -o out --memcpy --channels 1 7 | cs_python run.py --name out 8 | -------------------------------------------------------------------------------- /tutorials/gemv-02-memory-dsds/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=8,3 \ 6 | --fabric-offsets=4,1 -o out --memcpy --channels 1 7 | cs_python run.py --name out 8 | -------------------------------------------------------------------------------- /tutorials/gemv-01-complete-program/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=8,3 \ 6 | --fabric-offsets=4,1 -o out --memcpy --channels 1 7 | cs_python run.py --name out 8 | -------------------------------------------------------------------------------- /tutorials/gemv-01-complete-program/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=8,3 \ 6 | --fabric-offsets=4,1 -o out --memcpy --channels 1 7 | cs_python run.py --name out 8 | -------------------------------------------------------------------------------- /tutorials/gemv-04-params/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | 
set -e 4 | 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=8,3 \ 6 | --fabric-offsets=4,1 --params=M:4,N:6 -o out --memcpy --channels 1 7 | cs_python run.py --name out 8 | -------------------------------------------------------------------------------- /tutorials/gemv-04-params/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=8,3 \ 6 | --fabric-offsets=4,1 --params=M:4,N:6 -o out --memcpy --channels 1 7 | cs_python run.py --name out 8 | -------------------------------------------------------------------------------- /tutorials/topic-15-wse3-microthreads/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=11,3 \ 6 | --fabric-offsets=4,1 -o out --memcpy --channels 1 7 | cs_python run.py --name out 8 | -------------------------------------------------------------------------------- /tutorials/gemv-06-routes-1/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=11,3 \ 6 | --fabric-offsets=4,1 --params=M:4,N:6 -o out --memcpy --channels 1 7 | cs_python run.py --name out 8 | -------------------------------------------------------------------------------- /tutorials/gemv-06-routes-1/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=11,3 \ 6 | --fabric-offsets=4,1 --params=M:4,N:6 -o out --memcpy --channels 1 7 | cs_python run.py --name out 8 | -------------------------------------------------------------------------------- /tutorials/gemv-07-routes-2/commands_wse2.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=9,4 \ 6 | --fabric-offsets=4,1 --params=M:4,N:6 -o out --memcpy --channels 1 7 | cs_python run.py --name out 8 | -------------------------------------------------------------------------------- /tutorials/gemv-07-routes-2/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=9,4 \ 6 | --fabric-offsets=4,1 --params=M:4,N:6 -o out --memcpy --channels 1 7 | cs_python run.py --name out 8 | -------------------------------------------------------------------------------- /tutorials/gemv-05-multiple-pes/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=11,3 \ 6 | --fabric-offsets=4,1 --params=M:4,N:6,width:4 -o out --memcpy --channels 1 7 | cs_python run.py --name out 8 | -------------------------------------------------------------------------------- /tutorials/gemv-05-multiple-pes/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=11,3 \ 6 | --fabric-offsets=4,1 --params=M:4,N:6,width:4 -o out --memcpy --channels 1 7 | cs_python run.py --name out 8 | -------------------------------------------------------------------------------- /tutorials/topic-06-switches/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=10,5 --fabric-offsets=4,1 -o out \ 6 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 7 | cs_python run.py --name out 8 | 
-------------------------------------------------------------------------------- /tutorials/topic-06-switches/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=10,5 --fabric-offsets=4,1 -o out \ 6 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 7 | cs_python run.py --name out 8 | -------------------------------------------------------------------------------- /tutorials/topic-08-filters/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=11,3 --fabric-offsets=4,1 -o out \ 6 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 7 | cs_python run.py --name out 8 | -------------------------------------------------------------------------------- /tutorials/topic-08-filters/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=11,3 --fabric-offsets=4,1 -o out \ 6 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 7 | cs_python run.py --name out 8 | -------------------------------------------------------------------------------- /benchmarks/gemm-collectives_2d/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=11,6 --fabric-offsets=4,1 \ 6 | --params=P:4,Mt:14,Kt:14,Nt:14 \ 7 | --memcpy --channels=1 -o out 8 | cs_python run.py --name out 9 | -------------------------------------------------------------------------------- /benchmarks/gemm-collectives_2d/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | 
cslc --arch=wse3 ./layout.csl --fabric-dims=11,6 --fabric-offsets=4,1 \ 6 | --params=P:4,Mt:14,Kt:14,Nt:14 \ 7 | --memcpy --channels=1 -o out 8 | cs_python run.py --name out 9 | -------------------------------------------------------------------------------- /tutorials/topic-01-arrays-and-pointers/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=8,3 --fabric-offsets=4,1 -o out \ 6 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 7 | cs_python run.py --name out 8 | -------------------------------------------------------------------------------- /tutorials/topic-01-arrays-and-pointers/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=8,3 --fabric-offsets=4,1 -o out \ 6 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 7 | cs_python run.py --name out 8 | -------------------------------------------------------------------------------- /tutorials/topic-07-switches-entrypt/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=10,5 --fabric-offsets=4,1 -o out \ 6 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 7 | cs_python run.py --name out 8 | -------------------------------------------------------------------------------- /tutorials/topic-07-switches-entrypt/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=10,5 --fabric-offsets=4,1 -o out \ 6 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 7 | cs_python run.py --name out 8 | 
-------------------------------------------------------------------------------- /tutorials/gemv-08-routes-3/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=11,5 \ 6 | --fabric-offsets=4,1 --params=kernel_x_dim:4,kernel_y_dim:3,M:6,N:8 \ 7 | -o out --memcpy --channels 1 8 | cs_python run.py --name out 9 | -------------------------------------------------------------------------------- /tutorials/gemv-08-routes-3/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=11,5 \ 6 | --fabric-offsets=4,1 --params=kernel_x_dim:4,kernel_y_dim:3,M:6,N:8 \ 7 | -o out --memcpy --channels 1 8 | cs_python run.py --name out 9 | -------------------------------------------------------------------------------- /benchmarks/cholesky/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=17,12 --fabric-offsets=4,1 \ 6 | --params=P:10,Nt:4 -o out \ 7 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 8 | cs_python run.py --name out 9 | -------------------------------------------------------------------------------- /benchmarks/cholesky/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=17,12 --fabric-offsets=4,1 \ 6 | --params=P:10,Nt:4 -o out \ 7 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 8 | cs_python run.py --name out 9 | -------------------------------------------------------------------------------- /benchmarks/game-of-life/commands_wse2.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=19,14 --fabric-offsets=4,1 \ 6 | --params=x_dim:12,y_dim:12 --memcpy --channels=1 -o out 7 | cs_python run.py --name out --initial-state glider --iters 20 8 | -------------------------------------------------------------------------------- /benchmarks/game-of-life/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=19,14 --fabric-offsets=4,1 \ 6 | --params=x_dim:12,y_dim:12 --memcpy --channels=1 -o out 7 | cs_python run.py --name out --initial-state glider --iters 20 8 | -------------------------------------------------------------------------------- /tutorials/topic-10-map-builtin/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse2 ./layout.csl \ 6 | --fabric-dims=8,3 --fabric-offsets=4,1 --params=size:5 \ 7 | -o out --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 8 | cs_python run.py --name out 9 | -------------------------------------------------------------------------------- /tutorials/topic-10-map-builtin/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse3 ./layout.csl \ 6 | --fabric-dims=8,3 --fabric-offsets=4,1 --params=size:5 \ 7 | -o out --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 8 | cs_python run.py --name out 9 | -------------------------------------------------------------------------------- /benchmarks/mandelbrot/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse2 ./code.csl 
--fabric-dims=11,6 --fabric-offsets=4,1 -o out \ 6 | --params=MEMCPYD2H_DATA_1_ID:1 \ 7 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 8 | cs_python run.py --name out 9 | -------------------------------------------------------------------------------- /benchmarks/mandelbrot/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse3 ./code.csl --fabric-dims=11,6 --fabric-offsets=4,1 -o out \ 6 | --params=MEMCPYD2H_DATA_1_ID:1 \ 7 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 8 | cs_python run.py --name out 9 | -------------------------------------------------------------------------------- /benchmarks/gemv-collectives_2d/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=11,6 --fabric-offsets=4,1 \ 6 | --params=kernel_rows:4,kernel_cols:4,matrix_rows:32,matrix_cols:16 \ 7 | --memcpy --channels=1 -o out 8 | cs_python run.py --name out 9 | -------------------------------------------------------------------------------- /benchmarks/gemv-collectives_2d/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=11,6 --fabric-offsets=4,1 \ 6 | --params=kernel_rows:4,kernel_cols:4,matrix_rows:32,matrix_cols:16 \ 7 | --memcpy --channels=1 -o out 8 | cs_python run.py --name out 9 | -------------------------------------------------------------------------------- /tutorials/topic-13-simprint/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=11,3 \ 6 | --fabric-offsets=4,1 --params=width:4,num_elems:5 -o out \ 7 | --memcpy --channels=1 
--width-west-buf=0 --width-east-buf=0 8 | cs_python run.py --name out 9 | -------------------------------------------------------------------------------- /tutorials/topic-13-simprint/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=11,3 \ 6 | --fabric-offsets=4,1 --params=width:4,num_elems:5 -o out \ 7 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 8 | cs_python run.py --name out 9 | -------------------------------------------------------------------------------- /tutorials/topic-11-collectives/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=22,17 --fabric-offsets=4,1 \ 6 | --params=Pw:15,Ph:15,chunk_size:3 -o out \ 7 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 8 | cs_python run.py --name out 9 | -------------------------------------------------------------------------------- /tutorials/topic-11-collectives/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=22,17 --fabric-offsets=4,1 \ 6 | --params=Pw:15,Ph:15,chunk_size:3 -o out \ 7 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 8 | cs_python run.py --name out 9 | -------------------------------------------------------------------------------- /tutorials/topic-12-debug-library/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=11,3 \ 6 | --fabric-offsets=4,1 --params=width:4,num_elems:5 -o out \ 7 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 8 | cs_python run.py --name out 9 | 
-------------------------------------------------------------------------------- /tutorials/topic-12-debug-library/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=11,3 \ 6 | --fabric-offsets=4,1 --params=width:4,num_elems:5 -o out \ 7 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 8 | cs_python run.py --name out 9 | -------------------------------------------------------------------------------- /tutorials/topic-02-libraries/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=8,3 --fabric-offsets=4,1 \ 6 | --params=iterations:200 -o out \ 7 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 8 | cs_python run.py --name out --tolerance 0.1 9 | -------------------------------------------------------------------------------- /tutorials/topic-02-libraries/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=8,3 --fabric-offsets=4,1 \ 6 | --params=iterations:200 -o out \ 7 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 8 | cs_python run.py --name out --tolerance 0.1 9 | -------------------------------------------------------------------------------- /benchmarks/single-tile-matvec/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc ./src/layout_matvec.csl --arch wse2 --fabric-dims=9,4 \ 6 | --fabric-offsets=4,1 \ 7 | --params=width:2,height:2,tile_size:25,iters:1 \ 8 | -o out --memcpy --channels=1 9 | cs_python ./run.py --name out --verify 10 | -------------------------------------------------------------------------------- 
/benchmarks/single-tile-matvec/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc ./src/layout_matvec.csl --arch wse3 --fabric-dims=9,4 \ 6 | --fabric-offsets=4,1 \ 7 | --params=width:2,height:2,tile_size:25,iters:1 \ 8 | -o out --memcpy --channels=1 9 | cs_python ./run.py --name out --verify 10 | -------------------------------------------------------------------------------- /benchmarks/residual/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc ./layout.csl --arch=wse2 --fabric-dims=9,4 --fabric-offsets=4,1 \ 6 | --params=width:2,height:2 \ 7 | --params=LOCAL_OUT_SZ:3,LOCAL_IN_SZ:2 -o=out --memcpy --channels=1 \ 8 | --width-west-buf=0 --width-east-buf=0 9 | cs_python run.py --name out 10 | -------------------------------------------------------------------------------- /benchmarks/residual/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc ./layout.csl --arch=wse3 --fabric-dims=9,4 --fabric-offsets=4,1 \ 6 | --params=width:2,height:2 \ 7 | --params=LOCAL_OUT_SZ:3,LOCAL_IN_SZ:2 -o=out --memcpy --channels=1 \ 8 | --width-west-buf=0 --width-east-buf=0 9 | cs_python run.py --name out 10 | -------------------------------------------------------------------------------- /tutorials/topic-14-color-swap/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=11,3 \ 6 | --fabric-offsets=4,1 -o out \ 7 | --params=width:4 \ 8 | --params=MEMCPYH2D_DATA_1_ID:6 \ 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 10 | cs_python run.py --name out 11 | -------------------------------------------------------------------------------- 
/tutorials/topic-04-sparse-tensors/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=8,3 \ 6 | --fabric-offsets=4,1 -o out \ 7 | --params=MEMCPYH2D_DATA_1_ID:0 \ 8 | --params=MEMCPYD2H_DATA_1_ID:1 \ 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 10 | cs_python run.py --name out 11 | -------------------------------------------------------------------------------- /tutorials/topic-04-sparse-tensors/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=8,3 \ 6 | --fabric-offsets=4,1 -o out \ 7 | --params=MEMCPYH2D_DATA_1_ID:0 \ 8 | --params=MEMCPYD2H_DATA_1_ID:1 \ 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 10 | cs_python run.py --name out 11 | -------------------------------------------------------------------------------- /tutorials/topic-03-streaming-wavelet-data/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=8,3 \ 6 | --fabric-offsets=4,1 -o out \ 7 | --params=MEMCPYH2D_DATA_1_ID:0 \ 8 | --params=MEMCPYD2H_DATA_1_ID:1 \ 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 10 | cs_python run.py --name out 11 | -------------------------------------------------------------------------------- /tutorials/topic-03-streaming-wavelet-data/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=8,3 \ 6 | --fabric-offsets=4,1 -o out \ 7 | --params=MEMCPYH2D_DATA_1_ID:0 \ 8 | --params=MEMCPYD2H_DATA_1_ID:1 \ 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 10 | cs_python 
run.py --name out 11 | -------------------------------------------------------------------------------- /benchmarks/wide-multiplication/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse2 ./code.csl --fabric-dims=8,3 --fabric-offsets=4,1 -o out \ 6 | --params=num_bits:256 --params=MEMCPYH2D_DATA_1_ID:0 \ 7 | --params=MEMCPYD2H_DATA_1_ID:1 \ 8 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 9 | cs_python run.py --name out 10 | -------------------------------------------------------------------------------- /benchmarks/wide-multiplication/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse3 ./code.csl --fabric-dims=8,3 --fabric-offsets=4,1 -o out \ 6 | --params=num_bits:256 --params=MEMCPYH2D_DATA_1_ID:0 \ 7 | --params=MEMCPYD2H_DATA_1_ID:1 \ 8 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 9 | cs_python run.py --name out 10 | -------------------------------------------------------------------------------- /tutorials/pipeline-01-basic/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=8,3 \ 6 | --fabric-offsets=4,1 --params=size:12 -o out \ 7 | --params=MEMCPYH2D_DATA_1_ID:0 \ 8 | --params=MEMCPYD2H_DATA_1_ID:1 \ 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 10 | cs_python run.py --name out 11 | -------------------------------------------------------------------------------- /tutorials/pipeline-01-basic/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=8,3 \ 6 | --fabric-offsets=4,1 --params=size:12 -o out \ 7 | 
--params=MEMCPYH2D_DATA_1_ID:0 \ 8 | --params=MEMCPYD2H_DATA_1_ID:1 \ 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 10 | cs_python run.py --name out 11 | -------------------------------------------------------------------------------- /tutorials/pipeline-02-fifo/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=8,3 \ 6 | --fabric-offsets=4,1 --params=size:32 -o out \ 7 | --params=MEMCPYH2D_DATA_1_ID:0 \ 8 | --params=MEMCPYD2H_DATA_1_ID:1 \ 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 10 | cs_python run.py --name out 11 | -------------------------------------------------------------------------------- /tutorials/pipeline-02-fifo/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=8,3 \ 6 | --fabric-offsets=4,1 --params=size:32 -o out \ 7 | --params=MEMCPYH2D_DATA_1_ID:0 \ 8 | --params=MEMCPYD2H_DATA_1_ID:1 \ 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 10 | cs_python run.py --name out 11 | -------------------------------------------------------------------------------- /tutorials/pipeline-03-multiple/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=10,3 \ 6 | --fabric-offsets=4,1 --params=size:32 -o out \ 7 | --params=MEMCPYH2D_DATA_1_ID:0 \ 8 | --params=MEMCPYD2H_DATA_1_ID:1 \ 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 10 | cs_python run.py --name out 11 | -------------------------------------------------------------------------------- /tutorials/pipeline-03-multiple/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=10,3 \ 6 | --fabric-offsets=4,1 --params=size:32 -o out \ 7 | --params=MEMCPYH2D_DATA_1_ID:0 \ 8 | --params=MEMCPYD2H_DATA_1_ID:1 \ 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 10 | cs_python run.py --name out 11 | -------------------------------------------------------------------------------- /tutorials/gemv-03-memcpy/README.rst: -------------------------------------------------------------------------------- 1 | GEMV 3: H2D and D2H Memcpy 2 | ========================== 3 | 4 | The memcpy functionality of ``SdkRuntime`` allows the programmer to copy data 5 | between the host and device. 6 | Continuing from the previous example, we now extend it to include 7 | ``memcpy_h2d`` calls which copy data from the host to initialize ``A``, ``x``, 8 | and ``y`` on device. 9 | -------------------------------------------------------------------------------- /tutorials/topic-09-fifos/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse2 ./layout.csl \ 6 | --fabric-dims=8,3 --fabric-offsets=4,1 \ 7 | --params=num_elems_to_process:512 \ 8 | --params=MEMCPYH2D_DATA_1_ID:4 \ 9 | --params=MEMCPYD2H_DATA_1_ID:5 \ 10 | -o out \ 11 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 12 | cs_python run.py --name out 13 | -------------------------------------------------------------------------------- /tutorials/topic-09-fifos/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse3 ./layout.csl \ 6 | --fabric-dims=8,3 --fabric-offsets=4,1 \ 7 | --params=num_elems_to_process:512 \ 8 | --params=MEMCPYH2D_DATA_1_ID:4 \ 9 | --params=MEMCPYD2H_DATA_1_ID:5 \ 10 | -o out \ 11 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 12 | cs_python run.py --name 
out 13 | -------------------------------------------------------------------------------- /tutorials/gemv-09-streaming/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=11,5 \ 6 | --fabric-offsets=4,1 --params=kernel_x_dim:4,kernel_y_dim:3,M:6,N:8 \ 7 | --params=MEMCPYH2D_DATA_1_ID:0 \ 8 | --params=MEMCPYH2D_DATA_2_ID:1 \ 9 | --params=MEMCPYD2H_DATA_1_ID:2 \ 10 | -o out --memcpy --channels 1 11 | cs_python run.py --name out 12 | -------------------------------------------------------------------------------- /tutorials/gemv-09-streaming/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=11,5 \ 6 | --fabric-offsets=4,1 --params=kernel_x_dim:4,kernel_y_dim:3,M:6,N:8 \ 7 | --params=MEMCPYH2D_DATA_1_ID:0 \ 8 | --params=MEMCPYH2D_DATA_2_ID:1 \ 9 | --params=MEMCPYD2H_DATA_1_ID:2 \ 10 | -o out --memcpy --channels 1 11 | cs_python run.py --name out 12 | -------------------------------------------------------------------------------- /tutorials/topic-05-sentinels/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=11,12 \ 6 | --fabric-offsets=4,1 -o out \ 7 | --params=MEMCPYH2D_DATA_1_ID:2 \ 8 | --params=MEMCPYH2D_DATA_2_ID:3 \ 9 | --params=MEMCPYD2H_DATA_1_ID:4 \ 10 | --params=size:4 \ 11 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 12 | cs_python run.py --name out 13 | -------------------------------------------------------------------------------- /tutorials/topic-05-sentinels/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse3 
./layout.csl --fabric-dims=11,12 \ 6 | --fabric-offsets=4,1 -o out \ 7 | --params=MEMCPYH2D_DATA_1_ID:2 \ 8 | --params=MEMCPYH2D_DATA_2_ID:3 \ 9 | --params=MEMCPYD2H_DATA_1_ID:4 \ 10 | --params=size:4 \ 11 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 12 | cs_python run.py --name out 13 | -------------------------------------------------------------------------------- /benchmarks/25-pt-stencil/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc ./layout.csl --arch=wse2 --fabric-dims=17,12 --fabric-offsets=4,1 \ 6 | -o=out_code --params=width:10,height:10,zDim:10,sourceLength:10,dx:20 \ 7 | --params=srcX:0,srcY:0,srcZ:0 --verbose --memcpy --channels=1 \ 8 | --width-west-buf=0 --width-east-buf=0 9 | cs_python run.py --name out \ 10 | --iterations=10 --dx=20 --skip-compile 11 | -------------------------------------------------------------------------------- /benchmarks/histogram-torus/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse2 ./code.csl \ 6 | --params=INPUT_SIZE:16,HIST_WIDTH:8,HIST_HEIGHT:8,NUM_BUCKETS:4,BUCKET_SIZE:2 \ 7 | --colors=OUT_COLOR:8 \ 8 | --fabric-dims=15,10 --fabric-offsets=4,1 -o out \ 9 | --params=MEMCPYH2D_DATA_1_ID:10 \ 10 | --params=MEMCPYD2H_DATA_1_ID:11 \ 11 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 12 | cs_python run.py --name out 13 | -------------------------------------------------------------------------------- /benchmarks/row-col-broadcast/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc ./src/layout.csl --arch wse2 --fabric-dims=12,7 --fabric-offsets=4,1 \ 6 | --params=width:5,height:5,pe_length:5 --params=C0_ID:0 \ 7 | --params=C1_ID:1 --params=C2_ID:2 --params=C3_ID:3 --params=C4_ID:4 
-o=out \ 8 | --memcpy --channels=2 --width-west-buf=0 --width-east-buf=0 9 | cs_python ./run.py -m=5 -n=5 -k=5 --latestlink out --is_row_bcast --loop_count=1 10 | -------------------------------------------------------------------------------- /benchmarks/row-col-broadcast/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc ./src/layout.csl --arch wse3 --fabric-dims=12,7 --fabric-offsets=4,1 \ 6 | --params=width:5,height:5,pe_length:5 --params=C0_ID:0 \ 7 | --params=C1_ID:1 --params=C2_ID:2 --params=C3_ID:3 --params=C4_ID:4 -o=out \ 8 | --memcpy --channels=2 --width-west-buf=0 --width-east-buf=0 9 | cs_python ./run.py -m=5 -n=5 -k=5 --latestlink out --is_row_bcast --loop_count=1 10 | -------------------------------------------------------------------------------- /benchmarks/gemv-checkerboard-pattern/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=11,6 --fabric-offsets=4,1 \ 6 | --colors=x_in:1,ax_out:3,b_in:4 -o out \ 7 | --params=kernel_rows:4,kernel_cols:4,matrix_rows:32,matrix_cols:16 \ 8 | --params=MEMCPYH2D_DATA_1_ID:10 --params=MEMCPYH2D_DATA_2_ID:11 \ 9 | --params=MEMCPYD2H_DATA_1_ID:12 \ 10 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 11 | cs_python run.py --name out 12 | -------------------------------------------------------------------------------- /benchmarks/gemv-checkerboard-pattern/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=11,6 --fabric-offsets=4,1 \ 6 | --colors=x_in:1,ax_out:3,b_in:4 -o out \ 7 | --params=kernel_rows:4,kernel_cols:4,matrix_rows:32,matrix_cols:16 \ 8 | --params=MEMCPYH2D_DATA_1_ID:10 --params=MEMCPYH2D_DATA_2_ID:11 \ 9 | 
--params=MEMCPYD2H_DATA_1_ID:12 \ 10 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 11 | cs_python run.py --name out 12 | -------------------------------------------------------------------------------- /tutorials/topic-02-libraries/README.rst: -------------------------------------------------------------------------------- 1 | Topic 2: Libraries 2 | ================== 3 | 4 | The CSL compiler comes bundled with a few standard libraries, which can be 5 | imported into the user's program using the ``@import_module()`` builtin. This 6 | example shows three such compiler-bundled libraries: 7 | 8 | * the ``random`` library for generating uniform random numbers, 9 | * the ``timestamp`` library for reading the on-chip timestamp counter, and 10 | * the ``math`` library for square root. 11 | -------------------------------------------------------------------------------- /benchmarks/fft-1d-2d/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse2 ./layout.csl --fabric-dims=8,3 --fabric-offsets=4,1 \ 6 | --params=DIM:1,Nz:4,FP:2 --memcpy --channels=1 -o out-1D 7 | cs_python run.py --name out-1D 8 | cs_python run.py --inverse --name out-1D 9 | cslc --arch=wse2 ./layout.csl --fabric-dims=11,3 --fabric-offsets=4,1 \ 10 | --params=DIM:2,Nz:4,FP:1 --memcpy --channels=1 -o out-2D 11 | cs_python run.py --name out-2D 12 | cs_python run.py --inverse --name out-2D 13 | -------------------------------------------------------------------------------- /benchmarks/fft-1d-2d/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=8,3 --fabric-offsets=4,1 \ 6 | --params=DIM:1,Nz:4,FP:2 --memcpy --channels=1 -o out-1D 7 | cs_python run.py --name out-1D 8 | cs_python run.py --inverse --name out-1D 9 | cslc --arch=wse3 ./layout.csl 
--fabric-dims=11,3 --fabric-offsets=4,1 \ 10 | --params=DIM:2,Nz:4,FP:1 --memcpy --channels=1 -o out-2D 11 | cs_python run.py --name out-2D 12 | cs_python run.py --inverse --name out-2D 13 | -------------------------------------------------------------------------------- /benchmarks/bandwidth-test/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc ./src/bw_sync_layout.csl --arch wse2 --fabric-dims=12,7 --fabric-offsets=4,1 \ 6 | --params=width:5,height:5,pe_length:5 --params=C0_ID:0 \ 7 | --params=C1_ID:1 --params=C2_ID:2 --params=C3_ID:3 --params=C4_ID:4 -o=out \ 8 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 9 | cs_python ./run.py -m=5 -n=5 -k=5 --latestlink out --channels=1 \ 10 | --width-west-buf=0 --width-east-buf=0 --run-only --loop_count=1 11 | -------------------------------------------------------------------------------- /benchmarks/bandwidth-test/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc ./src/bw_sync_layout.csl --arch wse3 --fabric-dims=12,7 --fabric-offsets=4,1 \ 6 | --params=width:5,height:5,pe_length:5 --params=C0_ID:0 \ 7 | --params=C1_ID:1 --params=C2_ID:2 --params=C3_ID:3 --params=C4_ID:4 -o=out \ 8 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 9 | cs_python ./run.py -m=5 -n=5 -k=5 --latestlink out --channels=1 \ 10 | --width-west-buf=0 --width-east-buf=0 --run-only --loop_count=1 11 | -------------------------------------------------------------------------------- /benchmarks/fft-3d/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc --arch=wse3 ./layout.csl --fabric-dims=11,6 --fabric-offsets=4,1 \ 6 | --params=N:16,NUM_PENCILS_PER_DIM:4,FP:1 --memcpy --channels=1 -o out 7 | cs_python run.py --name out --real 
--norm 1 8 | cs_python run.py --inverse --name out --norm 1 9 | cslc --arch=wse3 ./layout.csl --fabric-dims=11,6 --fabric-offsets=4,1 \ 10 | --params=N:16,NUM_PENCILS_PER_DIM:4,FP:0 --memcpy --channels=1 -o out 11 | cs_python run.py --name out 12 | cs_python run.py --inverse --name out 13 | -------------------------------------------------------------------------------- /tutorials/topic-08-filters/README.rst: -------------------------------------------------------------------------------- 1 | Topic 8: Filters 2 | ================ 3 | 4 | Fabric filters allow a PE to selectively accept incoming wavelets. This example 5 | shows the use of so-called range filters, which specify the wavelets to allow to 6 | be forwarded to the CE based on the upper 16 bits of the wavelet contents. 7 | Specifically, PE #0 sends all 12 wavelets to the other PEs, while each recipient 8 | PE receives and processes only a quarter of the incoming wavelets. 9 | See :ref:`language-builtins-filters` for other possible filter configurations. 
10 | -------------------------------------------------------------------------------- /benchmarks/7pt-stencil-spmv/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc ./src/layout.csl --arch wse2 --fabric-dims=12,7 --fabric-offsets=4,1 \ 6 | --params=width:5,height:5,MAX_ZDIM:5 --params=BLOCK_SIZE:2 --params=C0_ID:0 \ 7 | --params=C1_ID:1 --params=C2_ID:2 --params=C3_ID:3 --params=C4_ID:4 --params=C5_ID:5 \ 8 | --params=C6_ID:6 --params=C7_ID:7 --params=C8_ID:8 -o=out \ 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 10 | cs_python ./run.py -m=5 -n=5 -k=5 --latestlink out --channels=1 \ 11 | --width-west-buf=0 --width-east-buf=0 --zDim=5 --run-only 12 | -------------------------------------------------------------------------------- /benchmarks/7pt-stencil-spmv/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc ./src/layout.csl --arch wse3 --fabric-dims=12,7 --fabric-offsets=4,1 \ 6 | --params=width:5,height:5,MAX_ZDIM:5 --params=BLOCK_SIZE:2 --params=C0_ID:0 \ 7 | --params=C1_ID:1 --params=C2_ID:2 --params=C3_ID:3 --params=C4_ID:4 --params=C5_ID:5 \ 8 | --params=C6_ID:6 --params=C7_ID:7 --params=C8_ID:8 -o=out \ 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 10 | cs_python ./run.py -m=5 -n=5 -k=5 --latestlink out --channels=1 \ 11 | --width-west-buf=0 --width-east-buf=0 --zDim=5 --run-only 12 | -------------------------------------------------------------------------------- /benchmarks/bicgstab/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc ./src/layout.csl --arch wse2 --fabric-dims=12,7 --fabric-offsets=4,1 \ 6 | --params=width:5,height:5,MAX_ZDIM:5 --params=BLOCK_SIZE:2 --params=C0_ID:0 \ 7 | --params=C1_ID:1 --params=C2_ID:2 
--params=C3_ID:3 --params=C4_ID:4 --params=C5_ID:5 \ 8 | --params=C6_ID:6 --params=C7_ID:7 --params=C8_ID:8 -o=out \ 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 10 | cs_python ./run.py -m=5 -n=5 -k=5 --latestlink out --channels=1 \ 11 | --width-west-buf=0 --width-east-buf=0 --zDim=5 --run-only --max-ite=2 12 | -------------------------------------------------------------------------------- /benchmarks/bicgstab/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc ./src/layout.csl --arch wse3 --fabric-dims=12,7 --fabric-offsets=4,1 \ 6 | --params=width:5,height:5,MAX_ZDIM:5 --params=BLOCK_SIZE:2 --params=C0_ID:0 \ 7 | --params=C1_ID:1 --params=C2_ID:2 --params=C3_ID:3 --params=C4_ID:4 --params=C5_ID:5 \ 8 | --params=C6_ID:6 --params=C7_ID:7 --params=C8_ID:8 -o=out \ 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 10 | cs_python ./run.py -m=5 -n=5 -k=5 --latestlink out --channels=1 \ 11 | --width-west-buf=0 --width-east-buf=0 --zDim=5 --run-only --max-ite=2 12 | -------------------------------------------------------------------------------- /benchmarks/power-method/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc ./src/layout.csl --arch wse2 --fabric-dims=12,7 --fabric-offsets=4,1 \ 6 | --params=width:5,height:5,MAX_ZDIM:5 --params=BLOCK_SIZE:2 --params=C0_ID:0 \ 7 | --params=C1_ID:1 --params=C2_ID:2 --params=C3_ID:3 --params=C4_ID:4 --params=C5_ID:5 \ 8 | --params=C6_ID:6 --params=C7_ID:7 --params=C8_ID:8 -o=out \ 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 10 | cs_python ./run.py -m=5 -n=5 -k=5 --latestlink out --channels=1 \ 11 | --width-west-buf=0 --width-east-buf=0 --zDim=5 --run-only --max-ite=1 12 | -------------------------------------------------------------------------------- 
/benchmarks/power-method/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc ./src/layout.csl --arch wse3 --fabric-dims=12,7 --fabric-offsets=4,1 \ 6 | --params=width:5,height:5,MAX_ZDIM:5 --params=BLOCK_SIZE:2 --params=C0_ID:0 \ 7 | --params=C1_ID:1 --params=C2_ID:2 --params=C3_ID:3 --params=C4_ID:4 --params=C5_ID:5 \ 8 | --params=C6_ID:6 --params=C7_ID:7 --params=C8_ID:8 -o=out \ 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 10 | cs_python ./run.py -m=5 -n=5 -k=5 --latestlink out --channels=1 \ 11 | --width-west-buf=0 --width-east-buf=0 --zDim=5 --run-only --max-ite=1 12 | -------------------------------------------------------------------------------- /benchmarks/conjugate-gradient/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc ./src/layout.csl --arch wse2 --fabric-dims=12,7 --fabric-offsets=4,1 \ 6 | --params=width:5,height:5,MAX_ZDIM:5 --params=BLOCK_SIZE:2 --params=C0_ID:0 \ 7 | --params=C1_ID:1 --params=C2_ID:2 --params=C3_ID:3 --params=C4_ID:4 --params=C5_ID:5 \ 8 | --params=C6_ID:6 --params=C7_ID:7 --params=C8_ID:8 -o=out \ 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 10 | cs_python ./run.py -m=5 -n=5 -k=5 --latestlink out --channels=1 \ 11 | --width-west-buf=0 --width-east-buf=0 --zDim=5 --run-only --max-ite=2 12 | -------------------------------------------------------------------------------- /benchmarks/conjugate-gradient/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc ./src/layout.csl --arch wse3 --fabric-dims=12,7 --fabric-offsets=4,1 \ 6 | --params=width:5,height:5,MAX_ZDIM:5 --params=BLOCK_SIZE:2 --params=C0_ID:0 \ 7 | --params=C1_ID:1 --params=C2_ID:2 --params=C3_ID:3 --params=C4_ID:4 --params=C5_ID:5 \ 8 | 
--params=C6_ID:6 --params=C7_ID:7 --params=C8_ID:8 -o=out \ 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 10 | cs_python ./run.py -m=5 -n=5 -k=5 --latestlink out --channels=1 \ 11 | --width-west-buf=0 --width-east-buf=0 --zDim=5 --run-only --max-ite=2 12 | -------------------------------------------------------------------------------- /benchmarks/preconditioned-conjugate-gradient/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc ./src/layout.csl --arch wse2 --fabric-dims=12,7 --fabric-offsets=4,1 \ 6 | --params=width:5,height:5,MAX_ZDIM:5 --params=BLOCK_SIZE:2 --params=C0_ID:0 \ 7 | --params=C1_ID:1 --params=C2_ID:2 --params=C3_ID:3 --params=C4_ID:4 --params=C5_ID:5 \ 8 | --params=C6_ID:6 --params=C7_ID:7 --params=C8_ID:8 -o=out \ 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 10 | cs_python ./run.py -m=5 -n=5 -k=5 --latestlink out --channels=1 \ 11 | --width-west-buf=0 --width-east-buf=0 --zDim=5 --run-only --max-ite=2 12 | -------------------------------------------------------------------------------- /benchmarks/preconditioned-conjugate-gradient/commands_wse3.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc ./src/layout.csl --arch wse3 --fabric-dims=12,7 --fabric-offsets=4,1 \ 6 | --params=width:5,height:5,MAX_ZDIM:5 --params=BLOCK_SIZE:2 --params=C0_ID:0 \ 7 | --params=C1_ID:1 --params=C2_ID:2 --params=C3_ID:3 --params=C4_ID:4 --params=C5_ID:5 \ 8 | --params=C6_ID:6 --params=C7_ID:7 --params=C8_ID:8 -o=out \ 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 10 | cs_python ./run.py -m=5 -n=5 -k=5 --latestlink out --channels=1 \ 11 | --width-west-buf=0 --width-east-buf=0 --zDim=5 --run-only --max-ite=2 12 | -------------------------------------------------------------------------------- 
/benchmarks/spmv-hypersparse/commands_wse2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | cslc ./src/layout.csl --arch wse2 --fabric-dims=11,6 --fabric-offsets=4,1 \ 6 | --params=ncols:16,nrows:16,pcols:4,prows:4,max_local_nnz:8 \ 7 | --params=max_local_nnz_cols:4,max_local_nnz_rows:4,local_vec_sz:1 \ 8 | --params=local_out_vec_sz:1,y_pad_start_row_idx:4 -o=out \ 9 | --memcpy --channels=1 --width-west-buf=0 --width-east-buf=0 10 | cs_python ./run.py --num_pe_cols=4 --num_pe_rows=4 --latestlink out --channels=1 \ 11 | --width-west-buf=0 --width-east-buf=0 --is_weight_one --run-only \ 12 | --infile_mtx=./data/rmat4.4x4.lb.mtx 13 | -------------------------------------------------------------------------------- /tutorials/sdklayout-02-routing/README.rst: -------------------------------------------------------------------------------- 1 | .. _tutorials-sdklayout-02: 2 | 3 | SdkLayout 2: Basic routing 4 | ========================== 5 | 6 | This tutorial demonstrates how to define routes between the 7 | PEs of a code region using symbolic colors. 8 | 9 | The key point here is that the colors that we use for the routes 10 | are symbolic (i.e., without physical values). This means that 11 | the ``SdkLayout`` compiler will assign the values automatically. 12 | 13 | For debugging purposes, the ``SdkLayout`` compiler will emit 14 | a JSON file called ``colors.json`` that contains the allocated 15 | physical color values. 16 | -------------------------------------------------------------------------------- /benchmarks/25-pt-stencil/nop.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /tutorials/topic-01-arrays-and-pointers/README.rst: -------------------------------------------------------------------------------- 1 | Topic 1: Arrays and Pointers 2 | ============================ 3 | 4 | Arrays can only be passed to or returned from functions used at compile-time. 5 | For functions used at runtime, pointers should be used instead. This example 6 | demonstrates a function ``increment_and_sum()``, which accepts a pointer to an 7 | array and a pointer to a scalar. When declaring an array pointer, CSL requires 8 | that the type specification contain the size of the array. CSL does not have 9 | a null pointer. 10 | 11 | Pointers are dereferenced using the ``.*`` syntax. Once dereferenced, they can 12 | be used just like non-pointer variables; for example, ``(data_ptr.*)[0]`` indexes 13 | into the first element of the array. 14 | -------------------------------------------------------------------------------- /tutorials/sdklayout-03-ports-and-connections/README.rst: -------------------------------------------------------------------------------- 1 | .. _tutorials-sdklayout-03: 2 | 3 | SdkLayout 3: Ports and connections 4 | ================================== 5 | 6 | This tutorial demonstrates how to attach ports to code regions 7 | and then connect those ports together. It instantiates two 8 | code regions that send data to a third code region.
The receiving 9 | code region adds the input streams element-wise and then sends 10 | the result out towards a fourth code region that saves the 11 | result on device memory. 12 | 13 | There are two kinds of ports: input ports and output ports. It is 14 | only possible to connect an output port to an input port. When 15 | we do that, the ``SdkLayout`` compiler will automatically find and 16 | encode a path between them. 17 | -------------------------------------------------------------------------------- /tutorials/topic-04-sparse-tensors/README.rst: -------------------------------------------------------------------------------- 1 | Topic 4: Wavelets for Sparse Tensors 2 | ==================================== 3 | 4 | When tensors are sparse, it is wasteful to send zero values. Since wavelet 5 | payloads are 32 bits wide, we can use the lower 16 bits to contain data as 6 | usual, but we can also use the upper 16 bits to contain the index of the value. 7 | 8 | This example illustrates the latter, where each wavelet of the incoming tensor 9 | has the index field populated in the upper 16 bits. Accordingly, the task 10 | definition uses two function arguments, one for the lower 16 bits and 11 | another for the upper 16 bits. 12 | 13 | Optionally, the programmer may also declare a task with just one argument of 14 | type ``u32`` for receiving 32-bit data. 15 | -------------------------------------------------------------------------------- /tutorials/gemv-04-params/README.rst: -------------------------------------------------------------------------------- 1 | GEMV 4: Parameters 2 | ================== 3 | 4 | Parameter values are compile-time constants, which implies that the compiler 5 | is fully aware of their precise value. 6 | This enables the programmer to change not just the program’s behavior at 7 | runtime, but also how the program is 8 | compiled.
9 | 10 | Continuing on from the previous example, we add two compile-time parameters 11 | to the ``layout.csl`` file that specify the dimensions ``M`` and ``N`` of our 12 | problem, instead of hardcoding them in ``pe_program.csl``. 13 | When the program is compiled, the programmer specifies ``M`` and ``N`` in the 14 | compile command. ``layout.csl`` also sets these parameter values in 15 | ``pe_program.csl`` in its ``@set_tile_code`` call. 16 | -------------------------------------------------------------------------------- /tutorials/topic-03-streaming-wavelet-data/README.rst: -------------------------------------------------------------------------------- 1 | Topic 3: Streaming Wavelet Data 2 | =============================== 3 | 4 | Often, CSL programs contain tasks that are activated in response to the 5 | arrival of wavelets of specific colors. Such tasks are also called 6 | Wavelet-Triggered Tasks, or data tasks. 7 | 8 | In this example, the ``comptime`` block binds a data task to a ``data_task_id`` 9 | created from a ``memcpy`` streaming color, which receives data from the host. 10 | The program must not define the routing of the color ``MEMCPYH2D_DATA_1``; 11 | the ``memcpy`` module determines the routing of ``MEMCPYH2D_DATA_1`` itself. 12 | 13 | Given the task and color association and the route, when a wavelet of 14 | color ``MEMCPYH2D_DATA_1`` arrives at the router, it is forwarded to the CE, 15 | which then activates ``main_task``. The wavelet's payload field is received in 16 | the argument to the task, and the code uses the wavelet data to update a global 17 | variable. 18 | -------------------------------------------------------------------------------- /tutorials/sdklayout-01-introduction/gv.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code parameter specified by the host using 'set_param_all'. 16 | param value: i16; 17 | export var gv: i16; 18 | 19 | const main_id = @get_local_task_id(8); 20 | task main() void { 21 | gv = value; 22 | } 23 | 24 | comptime { 25 | @bind_local_task(main, main_id); 26 | @activate(main_id); 27 | } 28 | -------------------------------------------------------------------------------- /tutorials/gemv-02-memory-dsds/layout.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | const memcpy = @import_module("<memcpy/get_params>", .{ .width = 1, .height = 1 }); 16 | 17 | layout { 18 | @set_rectangle(1, 1); 19 | @set_tile_code(0, 0, "pe_program.csl", .{ .memcpy_params = memcpy.get_params(0) }); 20 | 21 | // export symbol names 22 | @export_name("y", [*]f32, false); 23 | @export_name("init_and_compute", fn()void); 24 | } 25 | -------------------------------------------------------------------------------- /tutorials/topic-06-switches/empty.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Every PE needs to import memcpy module otherwise the I/O cannot 16 | // propagate the data to the destination. 17 | 18 | param memcpy_params: comptime_struct; 19 | 20 | const sys_mod = @import_module("<memcpy/memcpy>", memcpy_params); 21 | 22 | fn main_fn() void { 23 | sys_mod.unblock_cmd_stream(); 24 | } 25 | 26 | comptime { 27 | @export_symbol(main_fn); 28 | } 29 | -------------------------------------------------------------------------------- /tutorials/topic-07-switches-entrypt/empty.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Every PE needs to import memcpy module otherwise the I/O cannot 16 | // propagate the data to the destination. 17 | 18 | param memcpy_params: comptime_struct; 19 | 20 | const sys_mod = @import_module("<memcpy/memcpy>", memcpy_params); 21 | 22 | fn main_fn() void { 23 | sys_mod.unblock_cmd_stream(); 24 | } 25 | 26 | comptime { 27 | @export_symbol(main_fn); 28 | } 29 | -------------------------------------------------------------------------------- /tutorials/topic-06-switches/README.rst: -------------------------------------------------------------------------------- 1 | Topic 6: Switches 2 | ================= 3 | 4 | Fabric switches permit limited runtime control of routes. 5 | 6 | In this example, the ``layout`` block initializes the default route to receive 7 | wavelets from the ramp and forward them to the PE's north neighbor. However, it 8 | also defines routes for switch positions 1, 2, and 3. The hardware updates the 9 | route according to the specified switch positions when it receives a so-called 10 | Control Wavelet. 11 | 12 | For the payload of the control wavelet, the code creates a special wavelet using 13 | the helper function ``encode_single_payload()`` from the ``<control>`` library. 14 | The program then sends out a data wavelet along the newly-switched color. 15 | 16 | Switches can be helpful not just to change the routing configuration in limited 17 | ways at runtime, but also to reduce the number of colors used. 
For instance, this 18 | same example could be re-written to use four colors and four routes, but by 19 | using fabric switches, this example uses just one color. 20 | -------------------------------------------------------------------------------- /tutorials/sdklayout-04-h2d-d2h/README.rst: -------------------------------------------------------------------------------- 1 | .. _tutorials-sdklayout-04: 2 | 3 | SdkLayout 4: Host-to-device and device-to-host data streaming 4 | ============================================================= 5 | 6 | This tutorial demonstrates how we can connect ports to the 7 | host to allow us to stream data in and out of the WSE. 8 | 9 | It uses the 'add2vec' code region that was also used in 10 | tutorial :ref:`sdkruntime-sdklayout-03-ports-and-connections` but instead of 11 | using sender/receiver code regions it creates streams directly 12 | to/from the host. 13 | 14 | Similar to connections between input and output ports (see tutorial 15 | :ref:`sdkruntime-sdklayout-03-ports-and-connections`) paths to/from ports 16 | to/from the edge of the wafer are produced automatically. 17 | 18 | For now, it is only possible to create input/output streams 19 | to/from single-PE ports. If a port consists of more than one PE then 20 | an adaptor layer must be created explicitly to funnel the data 21 | through a single PE port. The next tutorial shows an example 22 | of such a configuration. 23 | -------------------------------------------------------------------------------- /tutorials/topic-07-switches-entrypt/README.rst: -------------------------------------------------------------------------------- 1 | Topic 7: Switches and Control Entrypoints 2 | ========================================= 3 | 4 | Following on from the last topic example, we can also encode a special 5 | task ID inside of a control wavelet. 
When that control wavelet is forwarded 6 | to the CE of the receiving PE, it will activate a task known as a control 7 | task which is bound to that ID. 8 | 9 | The lower 16 bits of the control wavelet can be used to store an optional 10 | data payload for that control task. Here, we encode the same values 11 | sent to the PEs as normal data wavelets in the previous example. 12 | 13 | Note that a PE router will move to a new switch position only after the 14 | control wavelet carrying the switch command passes through that PE. 15 | Therefore all control wavelets will continue to be routed using the current 16 | switch position setting and the new switch position will only affect 17 | subsequent wavelets. Thus, the data payload of a control wavelet is received 18 | by the PE connected by the current switch position, not the new position. 19 | -------------------------------------------------------------------------------- /benchmarks/power-method/power_method.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Cerebras Systems. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import numpy as np 16 | from numpy import linalg as LA 17 | 18 | 19 | def power_method(A_csr, x0, max_ite): 20 | prev_mu = 0 21 | nrm2_x = LA.norm(x0, 2) 22 | x = x0 / nrm2_x 23 | for i in range(max_ite): 24 | y = A_csr.dot(x) 25 | mu = np.dot(x, y) 26 | print(f"i = {i}, mu = {mu}, |prev_mu - mu| = {abs(mu - prev_mu)}") 27 | nrm2_x = LA.norm(y, 2) 28 | x = y / nrm2_x 29 | prev_mu = mu 30 | return x 31 | -------------------------------------------------------------------------------- /tutorials/pipeline-03-multiple/memcpy_edge/east.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // send data to the "core" 16 | param USER_IN_1 = {}; 17 | param USER_IN_2 = {}; 18 | 19 | // receive data from the "core" 20 | param USER_OUT_1 = {}; 21 | 22 | param memcpy_params: comptime_struct; 23 | 24 | const edge_mod = @import_module("memcpy_edge.csl", .{ 25 | .memcpy_params = memcpy_params, 26 | .USER_IN_1 = USER_IN_1, 27 | .USER_IN_2 = USER_IN_2, 28 | .USER_OUT_1 = USER_OUT_1, 29 | .dir = WEST 30 | }); 31 | -------------------------------------------------------------------------------- /tutorials/pipeline-03-multiple/memcpy_edge/north.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // send data to the "core" 16 | param USER_IN_1 = {}; 17 | param USER_IN_2 = {}; 18 | 19 | // receive data from the "core" 20 | param USER_OUT_1 = {}; 21 | 22 | param memcpy_params: comptime_struct; 23 | 24 | const edge_mod = @import_module("memcpy_edge.csl", .{ 25 | .memcpy_params = memcpy_params, 26 | .USER_IN_1 = USER_IN_1, 27 | .USER_IN_2 = USER_IN_2, 28 | .USER_OUT_1 = USER_OUT_1, 29 | .dir = SOUTH 30 | }); 31 | -------------------------------------------------------------------------------- /tutorials/pipeline-03-multiple/memcpy_edge/south.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | // send data to the "core" 16 | param USER_IN_1 = {}; 17 | param USER_IN_2 = {}; 18 | 19 | // receive data from the "core" 20 | param USER_OUT_1 = {}; 21 | 22 | param memcpy_params: comptime_struct; 23 | 24 | const edge_mod = @import_module("memcpy_edge.csl", .{ 25 | .memcpy_params = memcpy_params, 26 | .USER_IN_1 = USER_IN_1, 27 | .USER_IN_2 = USER_IN_2, 28 | .USER_OUT_1 = USER_OUT_1, 29 | .dir = NORTH 30 | }); 31 | -------------------------------------------------------------------------------- /tutorials/pipeline-03-multiple/memcpy_edge/west.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // send data to the "core" 16 | param USER_IN_1 = {}; 17 | param USER_IN_2 = {}; 18 | 19 | // receive data from the "core" 20 | param USER_OUT_1 = {}; 21 | 22 | param memcpy_params: comptime_struct; 23 | 24 | const edge_mod = @import_module("memcpy_edge.csl", .{ 25 | .memcpy_params = memcpy_params, 26 | .USER_IN_1 = USER_IN_1, 27 | .USER_IN_2 = USER_IN_2, 28 | .USER_OUT_1 = USER_OUT_1, 29 | .dir = EAST 30 | }); 31 | -------------------------------------------------------------------------------- /benchmarks/cholesky/launch.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | param memcpy_params: comptime_struct; 16 | param Nt: u16; 17 | 18 | var tile = @zeros([Nt*Nt]f32); 19 | 20 | var ptr_tile : [*]f32 = &tile; 21 | 22 | const sys_mod = @import_module( "<memcpy/memcpy>", memcpy_params); 23 | 24 | fn f_chol() void { 25 | // WARNING: the user must unblock cmd color for every PE 26 | sys_mod.unblock_cmd_stream(); 27 | } 28 | 29 | comptime{ 30 | @export_symbol(ptr_tile, "tile"); 31 | 32 | @export_symbol(f_chol); 33 | } 34 | -------------------------------------------------------------------------------- /tutorials/gemv-03-memcpy/layout.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | const memcpy = @import_module("<memcpy/get_params>", .{ .width = 1, .height = 1 }); 16 | 17 | layout { 18 | @set_rectangle(1, 1); 19 | @set_tile_code(0, 0, "pe_program.csl", .{ .memcpy_params = memcpy.get_params(0) }); 20 | 21 | // export symbol names 22 | @export_name("A", [*]f32, true); 23 | @export_name("x", [*]f32, true); 24 | @export_name("b", [*]f32, true); 25 | @export_name("y", [*]f32, false); 26 | @export_name("init_and_compute", fn()void); 27 | } 28 | -------------------------------------------------------------------------------- /tutorials/pipeline-03-multiple/README.rst: -------------------------------------------------------------------------------- 1 | 2 | Pipeline 3: Add an artificial halo 3 | ================================== 4 | 5 | The disadvantage of the FIFO in the previous example is its resource consumption. 6 | The FIFO requires two microthreads and a scratch buffer. 7 | 8 | A simple workaround is to move the FIFO outside the kernel. We add another 9 | halo, which we call an artificial halo, around the kernel (``pe_program.csl``). 10 | The west side is ``west.csl`` and the east side is ``east.csl``. 11 | The ``west.csl`` implements a FIFO to receive the data from H2D. 12 | The ``east.csl`` implements a FIFO to receive the data from ``pe_program.csl`` 13 | and redirect it to D2H. 14 | 15 | There is no longer a FIFO in ``pe_program.csl``. Instead, we replace the colors 16 | ``MEMCPYH2D_DATA_1`` by ``Cin`` and ``MEMCPYD2H_DATA_1`` by ``Cout``. 17 | The color ``Cin`` receives data from the west to the ramp. 18 | The color ``Cout`` sends the data from the ramp to the east. 19 | 20 | This example has the same property as ``pipeline-02-fifo``: as long as the 21 | parameter ``size`` does not exceed the capacity of the FIFO in ``west.csl``, 22 | H2D can always finish so the ``@add16`` can progress. 
23 | -------------------------------------------------------------------------------- /benchmarks/gemm-collectives_2d/README.rst: -------------------------------------------------------------------------------- 1 | GEMM with Collective Operations 2 | =============================== 3 | 4 | This program implements the SUMMA matrix multiplication algorithm and serves 5 | as an example of using the ``collectives_2d`` library together with 6 | ``SdkRuntime`` and the ``memcpy`` framework. 7 | 8 | The host code first copies tiles of ``A`` and ``B`` onto their corresponding 9 | PEs. It then uses the remote procedure call (RPC) mechanism to launch the 10 | function ``main``, at which point the GEMM computation begins. 11 | 12 | We perform GEMM in ``P`` many steps on a grid of ``P x P`` processors. 13 | At each step ``i``, PEs in the ith column broadcast their home tiles of ``A`` 14 | to other PEs in their row, and PEs in the ith row broadcast their home 15 | tiles of ``B`` to other PEs in their column. Once both broadcasts are complete 16 | as determined by ``x_done()`` and ``y_done()`` both being activated, 17 | each PE computes ``C_tile += Ap * Bp`` where ``Ap`` and ``Bp`` are pointers to 18 | either the PE's home tile or the tile it received through broadcasts. 19 | 20 | When computation is complete the host copies back the resulting tiles of 21 | ``C`` from the device. 22 | -------------------------------------------------------------------------------- /benchmarks/row-col-broadcast/README.rst: -------------------------------------------------------------------------------- 1 | Host-to-Device Broadcast Test 2 | ============================= 3 | 4 | This example shows how to use row or column broadcast. For example if the user 5 | wants to broadcast a column of data [1.0, 2.0, 3.0, 4.0] to a region of interest 6 | starting from (1,1) with width 3 and height 4, one element per PE, the H2D API 7 | requires the user to prepare the following 3-by-4 tensor, 8 | 9 | .. 
code-block:: 10 | 11 | | 1.0 1.0 1.0 | 12 | | 2.0 2.0 2.0 | 13 | | 3.0 3.0 3.0 | 14 | | 4.0 4.0 4.0 | 15 | 16 | and use ``memcpy_h2d()`` API to stream 12 elements into the device. This 17 | operation wastes host bandwidth by 3x. 18 | Now the user can use the new API, ``memcpy_h2d_rowbcast()``, to stream 4 19 | elements only. 20 | 21 | The same for column broadcasting, the user only needs to provide data of one 22 | row and uses ``memcpy_h2d_colbcast()`` API. 23 | 24 | The new broadcasting scheme only supports H2D, not D2H. 25 | 26 | The kernel of ``row-col-broadcast`` is the same as ``bandwidth-test``. 27 | The ``run.py`` calculates the bandwidth as well. 28 | The formula of the bandwidth calculation is the same as ``bandwidth-test``, 29 | so the user can see how much time this new API can save. 30 | -------------------------------------------------------------------------------- /tutorials/gemv-04-params/layout.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | param M: i16; 16 | param N: i16; 17 | 18 | const memcpy = @import_module("<memcpy/get_params>", .{ .width = 1, .height = 1 }); 19 | 20 | layout { 21 | @set_rectangle(1, 1); 22 | @set_tile_code(0, 0, "pe_program.csl", .{ 23 | .memcpy_params = memcpy.get_params(0), 24 | .M = M, 25 | .N = N 26 | }); 27 | 28 | // export symbol names 29 | @export_name("A", [*]f32, true); 30 | @export_name("x", [*]f32, true); 31 | @export_name("b", [*]f32, true); 32 | @export_name("y", [*]f32, false); 33 | @export_name("init_and_compute", fn()void); 34 | } 35 | -------------------------------------------------------------------------------- /tutorials/gemv-05-multiple-pes/README.rst: -------------------------------------------------------------------------------- 1 | GEMV 5: Multiple PEs 2 | ==================== 3 | 4 | Continuing on from the previous example, we now extend our program to use 5 | multiple PEs. 6 | 7 | The number of PEs used in this program is set at compile-time with the ``width`` 8 | parameter. 9 | Note that ``layout.csl`` uses this parameter to set the size of the program 10 | with the call to ``@set_rectangle``. 11 | The dimensions of a grid of PEs are always specified as width by height (or, 12 | alternatively, number of columns by number of rows), and individual PEs are 13 | indexed by (x, y), or, in other words, (column number, row number). 14 | 15 | This program involves no communication between PEs; we only duplicate the same 16 | workload on each PE. 17 | In ``run.py``, the ``memcpy_h2d`` calls now specify that data is copied into 18 | ``width x 1`` PEs, beginning at the upper left corner (0, 0) of the program 19 | rectangle. 20 | Because we are copying the same data to each PE, we use ``np.tile`` to repeat 21 | the data in ``A``, ``x``, and ``b`` multiple times. 22 | The ``memcpy_d2h`` call copies back the resulting ``y`` from each PE into 23 | an array of size ``M x width``. 
24 | 25 | The next example will expand this example to demonstrate simple communication 26 | between PEs. 27 | -------------------------------------------------------------------------------- /tutorials/gemv-09-streaming/README.rst: -------------------------------------------------------------------------------- 1 | GEMV 9: Memcpy Streaming Mode 2 | ============================= 3 | 4 | We present an alternative version of the previous example, 5 | in which we use the ``streaming`` mode of ``memcpy`` to stream ``x`` and ``b`` 6 | onto the device, and stream ``y`` off of the device. 7 | All of the previous examples used the ``copy`` mode of ``memcpy``. 8 | This example is meant to simply present the basics of ``streaming`` mode, 9 | and future tutorials will demonstrate some use cases for this mode. 10 | 11 | The host code no longer includes an explicit kernel launch. 12 | Instead, computation is started by the wavelet-triggered tasks that receive 13 | elements of ``x`` and ``b`` along the top row and left column of PEs, 14 | respectively. 15 | We finish computation when the kernel streams back the result ``y`` 16 | to the host. 17 | 18 | The colors ``MEMCPYH2D_DATA_1`` and ``MEMCPYH2D_DATA_2`` are used 19 | to stream ``x`` and ``b`` onto the device, respectively, 20 | while ``MEMCPYD2H_DATA_1`` is used to stream ``y`` off the device. 21 | 22 | Note that, because ``memcpy`` commands are serialized, the order of these 23 | ``streaming`` mode ``memcpy_h2d`` calls in this example is important. 24 | If the ``b`` values were streamed in before ``x``, the program would hang. 25 | -------------------------------------------------------------------------------- /benchmarks/single-tile-matvec/compile.appliance.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Cerebras Systems. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import json 16 | 17 | from cerebras.sdk.client import SdkCompiler # pylint: disable=import-error,no-name-in-module 18 | 19 | hash_filename = "hash.json" 20 | 21 | with SdkCompiler() as compiler: 22 | 23 | hashstr = compiler.compile( 24 | "./src", 25 | "layout_matvec.csl", 26 | "--arch wse3 --fabric-dims=9,4 --fabric-offsets=4,1 " 27 | "--params=width:2,height:2,tile_size:25,iters:1 -o latest --memcpy --channels=1", 28 | ) 29 | 30 | print("compile artifact:", hashstr) 31 | 32 | print(f"dump artifact name to file {hash_filename}") 33 | with open(hash_filename, "w", encoding="utf-8") as write_file: 34 | json.dump(hashstr, write_file) 35 | -------------------------------------------------------------------------------- /tutorials/sdklayout-03-ports-and-connections/receiver.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | param size: u16; 16 | param rx: color; 17 | 18 | const in_q = @get_input_queue(0); 19 | 20 | export var data: [size]u16; 21 | 22 | const data_dsd = @get_dsd(mem1d_dsd, .{.tensor_access = |i|{size} -> data[i]}); 23 | 24 | const input = @get_dsd(fabin_dsd, .{.extent = size, 25 | .fabric_color = rx, 26 | .input_queue = in_q}); 27 | 28 | const main_id = @get_local_task_id(8); 29 | task main() void { 30 | @mov16(data_dsd, input, .{.async = true}); 31 | } 32 | 33 | comptime { 34 | @bind_local_task(main, main_id); 35 | @activate(main_id); 36 | 37 | @initialize_queue(in_q, .{.color = rx}); 38 | } 39 | -------------------------------------------------------------------------------- /tutorials/topic-11-collectives/README.rst: -------------------------------------------------------------------------------- 1 | Topic 11: Collective Communications 2 | =================================== 3 | 4 | The ``<collectives_2d>`` library can be used for communication between PEs in 5 | the same row or column. It mimics the capabilities provided by 6 | `message passing interface <https://www.mpi-forum.org/>`_ (MPI) 7 | collective operations found in other programming languages. 8 | 9 | This example showcases each of the currently available communication primitives 10 | while using the library across two independent dimensions. The communication 11 | tasks are executed asynchronously. 12 | 13 | ``task_x`` uses the ``broadcast`` primitive to transmit data from the first PE 14 | in every row to every other PE in the same row. 
After the data is received, 15 | ``reduce_fadds`` computes the vector sum of the ``broadcast_recv``. The result 16 | is transmitted back to the first PE in every row. 17 | 18 | ``task_y`` operates concurrently along every column of PEs. The task first 19 | uses ``scatter`` to distribute ``chunk_size`` slices of ``scatter_data`` 20 | across every other PE in the same column. The task uses ``gather`` to collect 21 | ``chunk_size`` slices of data distributed by ``scatter``. Because ``scatter`` 22 | is the inversion of ``gather``, we have used collective communications to 23 | transmit the data from ``scatter_data`` to ``gather_recv``. 24 | -------------------------------------------------------------------------------- /benchmarks/fft-3d/layout.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | param FP: i16; // Precision: 0 == float16, 1 == float32 16 | 17 | param N: u16; // FFT size in each dimension 18 | param NUM_PENCILS_PER_DIM: u16; // Pencils in each dimension per PE 19 | 20 | // Number of PEs for FFT in both X and Y dimension 21 | param WIDTH: i16 = N / NUM_PENCILS_PER_DIM; 22 | 23 | const tensor_type: type = if (FP == 0) f16 else f32; 24 | 25 | const memcpy = @import_module("<memcpy/get_params>", .{ 26 | .width = WIDTH, 27 | .height = WIDTH, 28 | }); 29 | 30 | const fft_helper = @import_module("", .{ 31 | .width = WIDTH, 32 | .memcpy = memcpy, 33 | }); 34 | 35 | layout { 36 | @set_rectangle(WIDTH, WIDTH); 37 | fft_helper.FFT_kernel(WIDTH, N, tensor_type); 38 | } 39 | -------------------------------------------------------------------------------- /tutorials/topic-12-debug-library/README.rst: -------------------------------------------------------------------------------- 1 | Topic 12: Debug Library 2 | ======================= 3 | 4 | This example shows a program that uses the tracing mechanism of the ``<debug>`` 5 | library to record variable values and compile time strings as well as 6 | timestamps, for inspection by the host code. 7 | 8 | The program uses a row of four contiguous PEs. 9 | The first PE sends an array of values to three receiver PEs. 10 | Each PE program contains a global variable named ``global``, initialized to 11 | zero. 12 | When the data task ``recv_task`` on the receiver PE is activated by an incoming 13 | wavelet ``in_data``, ``global`` is incremented by ``2 * in_data``. 14 | 15 | The programs running on each PE import two instances of the ``<debug>`` library. 16 | On the receiver PEs, each time a task activates, the instance named ``trace`` 17 | logs a compile time string noting that the task has begun execution, and the 18 | updated value of ``global``. 19 | The instance named ``times`` logs a timestamp at the beginning of a task, and 20 | at the end of a task. 
21 | 22 | The host code uses the function ``read_trace`` from 23 | ``cerebras.sdk.debug.debug_util`` to read the logged values after execution of 24 | the device code finishes. 25 | Note that the PE coordinates passed to ``read_trace`` start from the northwest 26 | corner of the fabric, not from the northwest corner of the program rectangle. 27 | -------------------------------------------------------------------------------- /benchmarks/residual/nrminf.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
// SNRMINF: infinity norm of a vector, i.e. the largest absolute value among
// the first n entries of x.
//   result = max(|x|)
// Modeled on the BLAS norm interface, see
// http://www.netlib.org/lapack/explore-html/d6/d12/snrm2_8f90.html
//
// @param[in]  n       number of elements of the vector x
// @param[in]  x       array of dimension n
// @param[out] result  scalar, result = max(|x|); 0.0 when n == 0

param sizeX: i16; // size of x, sizeX >= n

fn snrminf(n: i16, x: *[sizeX]f32, result: *f32) void {
  // Running maximum of |x[i]|; stays 0.0 if the loop never executes.
  var max_abs: f32 = 0.0;

  for (@range(i16, n)) |idx| {
    // Take the magnitude of the current element.
    var mag: f32 = (x.*)[idx];
    if (mag < 0.0) {
      mag = -mag;
    }
    // Keep the largest magnitude seen so far.
    if (mag > max_abs) {
      max_abs = mag;
    }
  }

  result.* = max_abs;
}
// Sender program: when its local task runs, it copies `size` elements of the
// exported array `data` to the fabric on color `tx`.

// Number of elements to send.
// NOTE(review): `data` holds 10 elements, so presumably size <= 10 -- confirm
// against the code that instantiates this program.
param size: u16;
// Color on which the data is sent.
param tx: color;

// Output queue 0 carries the outgoing data.
const out_q = @get_output_queue(0);

// Data to send; exported under the name "data".
export var data = [10]u16{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};

// Memory DSD over the first `size` elements of `data`.
const data_dsd = @get_dsd(mem1d_dsd, .{.tensor_access = |i|{size} -> data[i]});

// Fabric output DSD: `size` elements on color `tx` through `out_q`.
const output = @get_dsd(fabout_dsd, .{.extent = size,
                                      .fabric_color = tx,
                                      .output_queue = out_q});

// Local task ID 8 is bound to `main` in the comptime block below.
const main_id = @get_local_task_id(8);
task main() void {
  // Asynchronous 16-bit move from memory to the fabric.
  @mov16(output, data_dsd, .{.async = true});
}

comptime {
  // Bind and activate `main` at comptime.
  @bind_local_task(main, main_id);
  @activate(main_id);

  // On the wse3 architecture the output queue is additionally initialized
  // with the color `tx`.
  if (@is_arch("wse3")) {
    @initialize_queue(out_q, .{.color = tx});
  }
}
22 | 23 | Each PE program contains a global variable named ``sum``, 24 | initialized to zero. 25 | When a ``red_task`` is activated by an incoming wavelet ``in_data``, 26 | ``sum`` is incremented by an amount ``in_data``. 27 | When a ``blue_task`` is activated by an incoming wavelet ``in_data``, 28 | ``sum`` is incremented by an amount ``2 * in_data``. 29 | -------------------------------------------------------------------------------- /benchmarks/spmv-hypersparse/data/rmat4.4x4.lb.mtx: -------------------------------------------------------------------------------- 1 | %%MatrixMarket matrix coordinate real general 2 | 16 16 108 3 | 1 2 1 4 | 1 4 1 5 | 1 5 1 6 | 1 6 1 7 | 1 7 1 8 | 1 8 1 9 | 1 9 1 10 | 1 10 1 11 | 1 11 1 12 | 1 13 1 13 | 1 14 1 14 | 1 15 1 15 | 1 16 1 16 | 2 1 1 17 | 2 2 1 18 | 2 3 1 19 | 2 5 1 20 | 2 9 1 21 | 2 10 1 22 | 2 12 1 23 | 2 13 1 24 | 3 1 1 25 | 3 5 1 26 | 3 8 1 27 | 3 10 1 28 | 3 13 1 29 | 4 1 1 30 | 4 5 1 31 | 5 1 1 32 | 5 2 1 33 | 5 6 1 34 | 5 7 1 35 | 5 9 1 36 | 5 10 1 37 | 5 11 1 38 | 5 13 1 39 | 5 14 1 40 | 5 15 1 41 | 5 16 1 42 | 6 1 1 43 | 6 4 1 44 | 6 5 1 45 | 6 6 1 46 | 6 10 1 47 | 6 14 1 48 | 6 15 1 49 | 7 1 1 50 | 7 2 1 51 | 7 5 1 52 | 7 9 1 53 | 7 13 1 54 | 8 1 1 55 | 8 3 1 56 | 8 11 1 57 | 8 13 1 58 | 9 1 1 59 | 9 3 1 60 | 9 4 1 61 | 9 5 1 62 | 9 6 1 63 | 9 7 1 64 | 9 10 1 65 | 9 13 1 66 | 9 14 1 67 | 9 15 1 68 | 10 1 1 69 | 10 2 1 70 | 10 5 1 71 | 10 7 1 72 | 10 9 1 73 | 10 10 1 74 | 10 13 1 75 | 11 8 1 76 | 11 9 1 77 | 11 10 1 78 | 11 14 1 79 | 12 1 1 80 | 12 2 1 81 | 12 7 1 82 | 12 9 1 83 | 13 1 1 84 | 13 5 1 85 | 13 6 1 86 | 13 7 1 87 | 13 8 1 88 | 13 9 1 89 | 13 10 1 90 | 13 11 1 91 | 13 14 1 92 | 13 15 1 93 | 14 1 1 94 | 14 2 1 95 | 14 3 1 96 | 14 5 1 97 | 14 6 1 98 | 14 9 1 99 | 14 11 1 100 | 14 12 1 101 | 14 13 1 102 | 14 14 1 103 | 15 1 1 104 | 15 4 1 105 | 15 5 1 106 | 15 6 1 107 | 15 9 1 108 | 15 13 1 109 | 16 10 1 110 | 16 14 1 111 | 
// Copyright 2025 Cerebras Systems.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Layout for a single row of `width` PEs, each running pe_program.csl with
// the same per-PE matrix dimensions.

// matrix dimensions on each PE
param M: i16;
param N: i16;

// number of PEs in program
param width: i16;

// memcpy parameter module for a width x 1 program rectangle
const memcpy = @import_module("<memcpy/get_params>", .{
  .width = width,
  .height = 1
});

layout {
  // PE coordinates are (column, row)
  @set_rectangle(width, 1);
  for (@range(i16, width)) |x| {
    // memcpy parameters are looked up by the PE's column number
    @set_tile_code(x, 0, "pe_program.csl", .{
      .memcpy_params = memcpy.get_params(x),
      .M = M,
      .N = N
    });
  }

  // export symbol names; the last argument is mutability (false only for "y")
  @export_name("A", [*]f32, true);
  @export_name("x", [*]f32, true);
  @export_name("b", [*]f32, true);
  @export_name("y", [*]f32, false);
  @export_name("compute", fn()void);
}
In other words, it is a 6 | *customizable DSD operation* that allows us to go beyond the 7 | :ref:`fixed list ` of 8 | natively supported DSD operations. 9 | 10 | This example demonstrates three use-cases of the ``@map`` builtin: 11 | 12 | 1. In the first use-case, ``@map`` is used to compute the square-root of the 13 | diagonal elements of a 2D tensor. 14 | 2. In the second use-case ``@map`` is used to perform a custom calculation with 15 | a mix of input DSDs of various kinds (``mem1d_dsd`` and ``fabin_dsd``) and 16 | scalar values while the result is stored to a ``mem1d_dsd``. It shows how we 17 | can use arbitrary callbacks combined with a variety of input and output DSDs. 18 | 3. Finally, we demonstrate how ``@map`` can be used to compute a reduction like 19 | the sum of all elements in a tensor. 20 | 21 | Without ``@map``, we would have to write explicit loops iterating over each 22 | element involved in these computations. With ``@map`` we can avoid writing such 23 | loops by utilizing the DSD descriptions which specify the loop structure 24 | implicitly. Since DSDs are supported natively by the hardware, using ``@map`` 25 | can lead to significant performance gains compared to writing explicit loops. 26 | -------------------------------------------------------------------------------- /benchmarks/mandelbrot/common.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// Mandelbrot helpers shared by all PE programs.

const math = @import_module("<math>");

// Maps column index `c` in [0, cols - 1] linearly onto [x_lo, x_hi].
// NOTE(review): the divisor is `cols - 1`, so presumably callers pass
// cols >= 2 -- confirm.
fn get_x(c: u32, x_lo: f32, x_hi: f32, cols: u32) f32 {
  return @as(f32, c) * (x_hi - x_lo) / @as(f32, cols - 1) + x_lo;
}

// Maps row index `r` in [0, rows - 1] linearly onto [y_lo, y_hi].
// NOTE(review): the divisor is `rows - 1`, so presumably callers pass
// rows >= 2 -- confirm.
fn get_y(r: u32, y_lo: f32, y_hi: f32, rows: u32) f32 {
  return @as(f32, r) * (y_hi - y_lo) / @as(f32, rows - 1) + y_lo;
}

// Advances the iteration z <- z^2 + (x + i*y) in place for at most `max_iters`
// steps, where z = rp.* + i * ip.*.
//
// The loop stops early, before updating, once |z| > 2. `iters.*` is
// incremented by 1.0 for every step actually performed, so the state can be
// carried across successive calls.
fn mandelbrot(max_iters: u32, rp: *f32, ip: *f32, iters: *f32, x: f32, y: f32) void {

  for (@range(u32, max_iters)) |i| {

    const real = rp.*;
    const imag = ip.*;

    // Escape test on the current value
    if (math.sqrt_f32(real * real + imag * imag) > 2.0) {
      break;
    }

    // z^2 = (real^2 - imag^2) + i * (2 * real * imag)
    rp.* = real * real - imag * imag;
    ip.* = real * imag + real * imag;

    // ... plus the constant c = x + i*y
    rp.* += x;
    ip.* += y;

    iters.* += 1.0;
  }
}
// Layout for the single-tile matvec benchmark: every PE of a width x height
// rectangle runs pe_matvec.csl with the same tile size and iteration count.

param width: u16;
param height: u16;
param tile_size: u16;
param iters: u16;

// memcpy parameter module for a width x height program rectangle
const memcpy = @import_module("<memcpy/get_params>", .{
  .width = width,
  .height = height,
});

layout {
  @set_rectangle(width, height);

  for (@range(u16, width)) |px| {
    // memcpy parameters depend only on the column, so fetch them once per column
    const memcpy_params = memcpy.get_params(px);
    for (@range(u16, height)) |py| {
      @set_tile_code(px, py, "pe_matvec.csl", .{ .memcpy_params = memcpy_params,
        .nb = tile_size, .iters = iters});
    }
  }

  // export symbol names
  @export_name("A", [*]f32, true);
  @export_name("x", [*]f32, true);
  @export_name("y", [*]f32, true);
  @export_name("maxmin_time", [*]f32, true);
  @export_name("compute", fn()void);
}
20 | The receiving PE then sends the data to its neighbor (``pe_program.csl``), 21 | followed by a *control wavelet* which specifies the control task ID that the 22 | neighbor will activate. 23 | 24 | Since sentinel control task IDs are not routable colors, the programmer does 25 | not specify a route, but does need to bind the control task ID to a control 26 | task, which will be activated upon receipt of the sentinel wavelet. 27 | Here, the sentinel activates the ``send_result`` task, which relays the 28 | result of the sum reduction back to the host. 29 | -------------------------------------------------------------------------------- /tutorials/topic-01-arrays-and-pointers/pe_program.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
// Not a complete program; the top-level source file is layout.csl

param memcpy_params: comptime_struct;

// memcpy module instantiated with the parameters passed in by the layout;
// used below to unblock the command stream
const sys_mod = @import_module("<memcpy/memcpy>", memcpy_params);

// One-element result buffer and the pointer exported under the name "result"
var result: [1]i16;
var result_ptr: [*]i16 = &result;

// Adds 1 to each of the three elements behind `data_ptr`, then stores the sum
// of the updated elements through `result_ptr`.
fn increment_and_sum(data_ptr: *[3]i16, result_ptr: *i16) void {
  // Write an updated value to each element of the array
  (data_ptr.*)[0] += 1;
  (data_ptr.*)[1] += 1;
  (data_ptr.*)[2] += 1;

  // Read all array values, sum them, and write the result
  result_ptr.* = (data_ptr.*)[0] + (data_ptr.*)[1] + (data_ptr.*)[2];
}

// Exported entry point: runs the computation on a local array and stores the
// sum in result[0].
fn f_run() void {
  var data = [3]i16 { 1, 2, 3 };

  increment_and_sum(&data, &result[0]);

  sys_mod.unblock_cmd_stream();
}

comptime {
  @export_symbol(result_ptr, "result");
  @export_symbol(f_run);
}
// Import memcpy layout module for 1 x 1 grid of PEs
// This module defines parameters passed to program on the single PE
const memcpy = @import_module("<memcpy/get_params>", .{ .width = 1, .height = 1 });

layout {

  // Use just one PE (columns=1, rows=1)
  @set_rectangle(1, 1);

  // The lone PE in this program should execute the code in "pe_program.csl"
  // We pass memcpy parameters as a parameter to the program. Note that
  // memcpy parameters are parameterized by the PE's column number.
  @set_tile_code(0, 0, "pe_program.csl", .{ .memcpy_params = memcpy.get_params(0) });

  // Export device symbol for array "y"
  // Last argument is mutability: host can read y, but not write to it
  @export_name("y", [*]f32, false);

  // Export host-callable device function
  @export_name("init_and_compute", fn()void);
}
14 | Once H2D is done, D2H can continue to drain the data out such that ``@add16`` 15 | can progress. 16 | 17 | To create a FIFO, we use a builtin ``@allocate_fifo`` to bind a normal tensor. 18 | We create two fabric DSDs: one pushes data from ``MEMCPYH2D_DATA_1`` to the 19 | FIFO and the other pops data from the FIFO to the color ``C1``. The two DSDs must 20 | use different microthreads. 21 | 22 | The routing configuration of color ``C1`` is RAMP to RAMP because 23 | 1) the FIFO pops data to the router via ``C1`` and 24 | 2) ``@add16`` receives data from the router via ``C1`` 25 | 26 | The disadvantage of this approach is the resource consumption. The FIFO 27 | requires two microthreads and a scratch buffer. 28 | 29 | The next example will fix this issue. 30 | -------------------------------------------------------------------------------- /tutorials/gemv-00-basic-syntax/README.rst: -------------------------------------------------------------------------------- 1 | 2 | GEMV 0: Basic CSL Syntax 3 | ======================== 4 | 5 | This example is the first in a series of successive example programs 6 | demonstrating CSL and the SDK by implementing a general matrix-vector product, 7 | or GEMV. 8 | 9 | We start by illustrating the syntax of some of CSL's core language constructs. 10 | The code in this example is not a complete program, but it shows 11 | some of the most commonly used CSL features. 12 | 13 | CSL’s syntax is like that of `Zig <https://ziglang.org>`_. 14 | Despite the similarity, both the purpose and the implementation of the CSL 15 | compiler are different from those of the Zig compiler.
16 | 17 | Types 18 | ----- 19 | 20 | CSL includes some basic types: 21 | 22 | 23 | * ``bool`` for boolean values 24 | * ``i16`` and ``i32`` for 16- and 32-bit signed integers 25 | * ``u16`` and ``u32`` for 16- and 32-bit unsigned integers 26 | * ``f16`` and ``f32`` for 16- and 32-bit IEEE-754 floating point numbers 27 | 28 | In addition to the above, CSL also supports array types and pointer types. 29 | Their use will be illustrated in subsequent examples. 30 | 31 | Functions 32 | --------- 33 | 34 | Functions are declared using the ``fn`` keyword. The compiler provides special 35 | functions called *Builtins*, whose names start with ``@`` and whose 36 | implementation is provided by the compiler. All CSL builtins are described in 37 | :ref:`language-builtins`. 38 | 39 | Conditional Statements and Loops 40 | -------------------------------- 41 | 42 | CSL includes ``if`` statements and ``while`` and ``for`` loops. 43 | These are described in greater detail in the subsequent example programs. 44 | -------------------------------------------------------------------------------- /tutorials/topic-15-wse3-microthreads/layout.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
// Layout for a two-PE program: the left PE sends data eastward on one color
// and the right PE receives it from the west.

// Colors
const send_color: color = @get_color(0); // Color used to send/recv data between PEs

// This example only uses 2 PEs
const memcpy = @import_module("<memcpy/get_params>", .{ .width = 2, .height = 1 });

layout {
  // PE coordinates are (column, row)
  @set_rectangle(2, 1);

  // Left PE (0, 0)
  @set_tile_code(0, 0, "left_pe.csl", .{
    .memcpy_params = memcpy.get_params(0), .send_color = send_color });

  // Left PE sends to the right
  @set_color_config(0, 0, send_color, .{.routes = .{ .rx = .{RAMP}, .tx = .{EAST} }});

  // Right PE (1, 0); it is given the same color under the name recv_color
  @set_tile_code(1, 0, "right_pe.csl", .{
    .memcpy_params = memcpy.get_params(1), .recv_color = send_color });

  // Right PE receives from left PE
  @set_color_config(1, 0, send_color, .{.routes = .{ .rx = .{WEST}, .tx = .{RAMP} }});

  // export symbol names
  @export_name("y", [*]f32, true);
  @export_name("compute", fn()void);
}
The ``SdkLayout`` engine will then allocate physical 17 | values automatically. See tutorials :ref:`sdkruntime-sdklayout-02-routing` 18 | and :ref:`sdkruntime-sdklayout-03-ports-and-connections`. 19 | * Automatic routing between code regions: users can create input 20 | and output ports on code regions and connect them. The ``SdkLayout`` 21 | engine will automatically find optimal routes between them. 22 | See tutorial :ref:`sdkruntime-sdklayout-03-ports-and-connections`. 23 | * Host-to-device and device-to-host connections: an input or 24 | output port can be connected to the host to create an input 25 | or output stream respectively. See tutorial 26 | :ref:`sdkruntime-sdklayout-04-h2d-d2h`. 27 | 28 | This tutorial demonstrates the most basic compilation flow, 29 | where a single-PE program with no colors and no routing sets the value 30 | of a global variable in device memory based on the value of 31 | a parameter. 32 | -------------------------------------------------------------------------------- /benchmarks/25-pt-stencil/ic.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env cs_python 2 | 3 | # Copyright 2025 Cerebras Systems. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
import numpy as np


def computeGaussianSource(iterations):
    """Build the float32 source time series and trim its trailing zeros.

    For time steps ``t = i * dt`` with ``i = 0 .. iterations - 1`` the value is
    ``-2 * scale * sigma * (sigma - 2 * sigma * scale * p) * exp(-scale * p)``
    where ``p = (sigma * t - tau) ** 2``. All arithmetic is done in float32.
    Everything after the last non-zero sample is dropped.

    Args:
        iterations: Number of time steps to generate. A value of zero (or
            less) produces an empty source.

    Returns:
        A tuple ``(source, sourceLength)``: the trimmed 1-D float32 array and
        its length as a plain ``int``. If no sample is non-zero, the array is
        empty and the length is 0.
    """
    tau = np.float32(1.0)
    scale = np.float32(8.0)
    mscale = np.float32(-8.0)
    _fmax = np.float32(25.0)
    dt = np.float32(0.001610153)
    sigma = np.float32(0.6) * _fmax

    t = np.arange(0, iterations, 1, dtype=np.float32) * dt
    power = np.power(sigma * t - tau, 2, dtype=np.float32)
    expf = np.exp(np.multiply(power, mscale))
    source = (
        np.float32(-2.0)
        * scale
        * sigma
        * np.multiply(
            sigma - np.float32(2.0) * sigma * scale * power,
            expf,
            dtype=np.float32,
        )
    )

    # Index one past the last non-zero sample. An empty or all-zero source has
    # no non-zero index at all; treat that as "everything is trailing zeros"
    # instead of indexing into an empty array.
    nonzero_idx = np.nonzero(source)[-1]
    first_zero_idx = int(nonzero_idx[-1]) + 1 if nonzero_idx.size else 0
    if first_zero_idx < source.shape[-1]:
        source = source[:first_zero_idx]
    sourceLength = int(source.shape[-1])

    print(f"sourceLength = {sourceLength}, first_zero_idx={first_zero_idx}")

    return source, sourceLength
22 | 23 | Below is a visualization of the kernel interface: 24 | 25 | .. _fig-gemv-4-by-4-checkerboard: 26 | 27 | .. figure:: ./images/gemv-4-by-4.png 28 | :align: center 29 | :width: 980 px 30 | 31 | Note that this algorithm and the implementation are not optimized for 32 | performance. It is intended to serve as a non-trivial introductory example. 33 | 34 | All computations are done in FP16 format. 35 | 36 | The matrix ``A``, of shape [M, N], 37 | is distributed across the PE memories as follows: 38 | 39 | - The first dimension of ``A``, M rows, is distributed across 40 | the height of the kernel. 41 | - The second dimension of ``A``, N columns, is distributed across 42 | the width of the kernel. 43 | 44 | Since we know that M is 32 and the height of the kernel is 4, each PE will be 45 | assigned 32÷4 = 8 rows of ``A``. 46 | 47 | Similarly, each PE will get 16÷4 = 4 columns of ``A``. This means each PE is 48 | assigned an 8×4 chunk of the original matrix ``A``. 49 | -------------------------------------------------------------------------------- /benchmarks/mandelbrot/README.rst: -------------------------------------------------------------------------------- 1 | Mandelbrot 2 | ========== 3 | 4 | This is a simple program that computes a visualization of the Mandelbrot set on 5 | a 16x16 pixel grid using a 4x4 grid of PEs. 6 | 7 | Files: 8 | - ``code.csl``: the main file that sets up the 4x4 PE grid and routing 9 | - ``left.csl``: code for the PEs on the left of the grid 10 | - ``middle.csl``: code for the PEs in the rest of the grid 11 | - ``common.csl``: Mandelbrot code used in all PEs 12 | 13 | Description: 14 | 15 | This program adopts a pipeline-parallel approach to generating the Mandelbrot 16 | set. Each row of 4 PEs is responsible for a 4x16 chunk of the grid. The PE on 17 | the left of each row generates elements, performs up to 8 iterations on them, 18 | then passes them to the right. 
Each subsequent PE in the same row will also 19 | perform up to 8 iterations, then pass the elements right. Eventually, the 20 | element is output on the EAST side of the grid after having undergone a 21 | maximum of 32 iterations. 22 | 23 | When a PE passes "an element", it is actually passing 3 32-bit floats. They are 24 | as follows: { real part, imaginary part, number of iterations so far } 25 | 26 | Middle PEs calculate the x,y of the values they receive based on the order they 27 | receive them in. 28 | 29 | An alternative approach would be to assign each PE a 4x4 tile of the 16x16 30 | overall grid and have it compute Mandelbrot for just its tile. Implementing this 31 | version and comparing its performance to this pipeline-parallel program would be 32 | interesting future work. 33 | 34 | Known problems: 35 | - Load balancing between PEs in the same row is poor 36 | - ``iters`` is stored as an ``f32``. It really should be an integer type, 37 | however, we do not yet have support for sending structs through memory DSDs. 38 | -------------------------------------------------------------------------------- /benchmarks/fft-3d/README.rst: -------------------------------------------------------------------------------- 1 | 3D FFT 2 | ====== 3 | 4 | This example implements a 3D Discrete Fourier Transform by using a pencil 5 | decomposition, in which the input data is viewed as a 2D array of 1D pencils, 6 | and each PE stores a small subarray of the 2D array of pencils. 7 | 8 | The algorithm proceeds in steps. First, the 1D FFTs of the pencils on each PE 9 | are performed. Then, the data is transposed along a coordinate axis among 10 | all PEs. This process happens two more times, resulting in three local 11 | operations in which 1D FFTs are performed independently on each PE, and three 12 | transpose operations in which all PEs communicate to change which axis of 13 | the data is stored in memory.
14 | 15 | The algorithm used to compute the 1D FFTs is Cooley-Tukey, 16 | Decimation in Time (DIT), radix 2, with the 17 | slight tweak that we use iteration instead of recursion. 18 | 19 | FFT Compilation Parameters 20 | -------------------------- 21 | 22 | * ``N``: Size of 3D FFT along one dimension. The full problem size is 23 | ``N x N x N``. 24 | * ``NUM_PENCILS_PER_DIM``: Number of pencils along a given dimension on each PE. 25 | For instance, ``NUM_PENCILS_PER_DIM == 2`` means that each PE stores 26 | ``2 x 2`` pencils. 27 | * ``FP``: Floating point precision. Valid values are ``1`` or ``2``, specifying 28 | IEEE fp16 or fp32, respectively. 29 | 30 | FFT Runtime Parameters 31 | ---------------------- 32 | 33 | * ``--inverse``: With this flag set, perform an inverse Fourier transform. 34 | * ``--real``: With this flag set, compute Fourier transform with real input 35 | data. Without this flag, complex Fourier transform is computed. 36 | * ``--norm``: Normalization strategy. Valid values are ``0``, ``1``, or ``2``, 37 | specifying ``forward``, ``backward``, or ``orthonormal``, respectively. 38 | -------------------------------------------------------------------------------- /benchmarks/bandwidth-test/README.rst: -------------------------------------------------------------------------------- 1 | Bandwidth Test 2 | ============== 3 | 4 | This example evaluates the bandwidth between the host and the device (WSE). The 5 | kernel records the ``start`` and ``end`` of H2D or D2H by tsc counter. This is 6 | better than host timer because the runtime may not send the command right after 7 | the user issues it. The runtime can aggregate multiple nonblocking commands 8 | together to reduce TCP overhead. In addition, the tsc counters of all PEs are 9 | not synchronized in the beginning. To avoid the timing variation among those PEs, 10 | we add a sync() to synchronize all PEs and sample the reference clock.
11 | 12 | The kernel ``bw_sync_kernel.csl`` defines a couple of host-callable functions, 13 | ``f_sync()``, ``f_tic()`` and ``f_toc()`` in order to synchronize the PEs and 14 | record the timing of H2D or D2H. 15 | 16 | The kernel ``sync/pe.csl`` performs a reduction over the whole rectangle to sync 17 | the PEs, then the top-left PE sends a signal to other PEs to sample the 18 | reference clock. 19 | 20 | The script ``run.py`` has the following parameters: 21 | 22 | - ``--loop_count=`` decides how many H2Ds/D2Hs are called. 23 | 24 | - ``--d2h`` measures the bandwidth of D2H, otherwise H2D is measured. 25 | 26 | - ``--channels=`` specifies the number of I/O channels, no bigger than 16. 27 | 28 | The tic() samples "time_start" and toc() samples "time_end". The sync() samples 29 | "time_ref" which is used to adjust "time_start" and "time_end". 30 | The elapsed time (unit: cycles) is measured by 31 | ``cycles_send = max(time_end) - min(time_start)`` 32 | 33 | The overall runtime (us) is computed via the following formula 34 | ``time_send = (cycles_send / 0.85) * 1.e-3 us`` 35 | 36 | The bandwidth is calculated by 37 | ``bandwidth = ((wvlts * 4)/time_send)*loop_count`` 38 | -------------------------------------------------------------------------------- /benchmarks/spmv-hypersparse/src/allreduce2R1E/layout.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | 16 | param colors: [2]color; 17 | param entrypoints: [1]local_task_id; 18 | param width: i16 ; // width of the core 19 | param height: i16 ; // height of the core 20 | 21 | // C0 and C1 are handed out by get_params as C_ROUTE and C_DISPATCH, respectively 22 | const C0: color = colors[0]; 23 | const C1: color = colors[1]; 24 | 25 | // entrypoints of allreduce module 26 | // LOCK runs only if teardown is received and the operation is done 27 | // LOCK performs the state transition 28 | // teardown handler activates LOCK 29 | // the operation blocks LOCK in the beginning and unblocks it when it finishes 30 | const C_LOCK: local_task_id = entrypoints[0]; 31 | // Build the parameter struct for the PE at coordinates (px, py): four edge flags plus the colors, the LOCK task ID and the core dimensions. 32 | fn get_params(px:i16, py:i16) comptime_struct { 33 | // first_py/last_py are true when py is 0 / height-1 34 | var first_py: bool = (0 == py); 35 | var last_py: bool = ((height-1) == py); 36 | // first_px/last_px are true when px is 0 / width-1 37 | var first_px: bool = (0 == px); 38 | var last_px: bool = ((width-1) == px); 39 | 40 | return .{ 41 | .first_px = first_px, 42 | .last_px = last_px, 43 | .first_py = first_py, 44 | .last_py = last_py, 45 | .C_ROUTE = C0, 46 | .C_DISPATCH = C1, 47 | .C_LOCK = C_LOCK, 48 | .width = width, 49 | .height = height 50 | }; 51 | } 52 | -------------------------------------------------------------------------------- /tutorials/topic-15-wse3-microthreads/right_pe.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | param memcpy_params: comptime_struct; 16 | 17 | param recv_color: color; 18 | 19 | const M: i16 = 10; 20 | 21 | // Task IDs 22 | const exit_task_id: local_task_id = @get_local_task_id(9); 23 | 24 | // Queue and microthread IDs 25 | const recv_color_iq = @get_input_queue(2); 26 | const recv_color_ut = @get_ut_id(5); 27 | // memcpy library module; provides unblock_cmd_stream(), which exit_task calls 28 | const sys_mod = @import_module("<memcpy/memcpy>", memcpy_params); 29 | // Receive buffer of M f32 values, a memory DSD over it, and the pointer exported as "y" 30 | var y: [M]f32; 31 | var y_dsd = @get_dsd(mem1d_dsd, .{ .tensor_access = |i|{M} -> y[i] }); 32 | var y_ptr: [*]f32 = &y; 33 | // Host-callable: start an asynchronous move of M values from recv_color (input queue recv_color_iq, microthread recv_color_ut) into y, naming exit_task_id as the task to activate. 34 | fn compute() void { 35 | const in_dsd = @get_dsd(fabin_dsd, .{ 36 | .fabric_color = recv_color, .extent = M, 37 | .input_queue = recv_color_iq 38 | }); 39 | @fmovs(y_dsd, in_dsd, .{ .async = true, .ut_id = recv_color_ut, 40 | .activate = exit_task_id }); 41 | } 42 | // Bound to exit_task_id below; unblocks the memcpy command stream. 43 | task exit_task() void { 44 | sys_mod.unblock_cmd_stream(); 45 | } 46 | // Bind the task, attach the input queue to recv_color, and export y and compute. 47 | comptime { 48 | @bind_local_task(exit_task, exit_task_id); 49 | 50 | @initialize_queue(recv_color_iq, .{ .color = recv_color }); 51 | 52 | @export_symbol(y_ptr, "y"); 53 | @export_symbol(compute); 54 | } 55 | -------------------------------------------------------------------------------- /tutorials/topic-15-wse3-microthreads/left_pe.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | param memcpy_params: comptime_struct; 16 | 17 | param send_color: color; 18 | 19 | const M: i16 = 10; 20 | 21 | // Task IDs 22 | const exit_task_id: local_task_id = @get_local_task_id(9); 23 | 24 | // Queue and microthread IDs 25 | const send_color_oq = @get_output_queue(2); 26 | const send_color_ut = @get_ut_id(4); 27 | // memcpy library module; provides unblock_cmd_stream(), which exit_task calls 28 | const sys_mod = @import_module("<memcpy/memcpy>", memcpy_params); 29 | // Send buffer of M f32 values, a memory DSD over it, and the pointer exported as "y" 30 | var y: [M]f32; 31 | var y_dsd = @get_dsd(mem1d_dsd, .{ .tensor_access = |i|{M} -> y[i] }); 32 | var y_ptr: [*]f32 = &y; 33 | // Host-callable: start an asynchronous move of the M values of y onto send_color (output queue send_color_oq, microthread send_color_ut), naming exit_task_id as the task to activate. 34 | fn compute() void { 35 | const out_dsd = @get_dsd(fabout_dsd, .{ 36 | .fabric_color = send_color, .extent = M, 37 | .output_queue = send_color_oq 38 | }); 39 | @fmovs(out_dsd, y_dsd, .{ .async = true, .ut_id = send_color_ut, 40 | .activate = exit_task_id }); 41 | } 42 | // Bound to exit_task_id below; unblocks the memcpy command stream. 43 | task exit_task() void { 44 | sys_mod.unblock_cmd_stream(); 45 | } 46 | // Bind the task, attach the output queue to send_color, and export y and compute. 47 | comptime { 48 | @bind_local_task(exit_task, exit_task_id); 49 | 50 | @initialize_queue(send_color_oq, .{ .color = send_color }); 51 | 52 | @export_symbol(y_ptr, "y"); 53 | @export_symbol(compute); 54 | } 55 | -------------------------------------------------------------------------------- /benchmarks/fft-1d-2d/reshape.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | 16 | param SRC_SIZE: u16; 17 | param DST_SIZE: u16; 18 | param N: u16; 19 | param FP: i16; 20 | param tensor_type: type; 21 | param dest: *[DST_SIZE]tensor_type; 22 | param src: *[SRC_SIZE]tensor_type; 23 | // Base memory DSDs over src and dest; reshape() derives offset and shortened copies of these 24 | const srcDSD = @get_dsd(mem1d_dsd, .{ .tensor_access = |i|{N} -> src[i] }); 25 | const destDSD = @get_dsd(mem1d_dsd, .{ .tensor_access = |i|{N} -> dest[i] }); 26 | // Copy chunks of `subproblems` elements from src to dest until baseB.* reaches N. After each chunk the source offset advances by `stride` and the destination offset by `subproblems`; baseA.* and baseB.* are updated in place for the caller. 27 | fn reshape(baseA: *u16, baseB: *u16, subproblems: u16, stride: u16) void { 28 | // Start both DSDs at the caller-supplied offsets and set their length to one chunk 29 | var sourceDSD = @increment_dsd_offset(srcDSD, @as(i16, baseB.*), tensor_type); 30 | sourceDSD = @set_dsd_length(sourceDSD, subproblems); 31 | 32 | var destinationDSD = @increment_dsd_offset(destDSD, @as(i16, baseA.*), tensor_type); 33 | destinationDSD = @set_dsd_length(destinationDSD, subproblems); 34 | // FP == 1 selects @fmovh, any other value selects @fmovs (presumably fp16 vs fp32 elements -- confirm against the caller's tensor_type) 35 | while ((baseB.*) < N) { 36 | if (FP==1){ 37 | @fmovh(destinationDSD, sourceDSD); 38 | } else { 39 | @fmovs(destinationDSD, sourceDSD); 40 | } 41 | destinationDSD = @increment_dsd_offset(destinationDSD, @as(i16, subproblems), tensor_type); 42 | sourceDSD = @increment_dsd_offset(sourceDSD, @as(i16, stride), tensor_type); 43 | (baseA.*) += subproblems; 44 | (baseB.*) += stride; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /tutorials/topic-09-fifos/README.rst: -------------------------------------------------------------------------------- 1 | Topic 9: FIFOs 2 | ============== 3 | 4 | A FIFO DSD is useful to buffer input going into or out of a PE, as a way to 5 | extend the small hardware queues used for fabric communication. In particular, 6 | this may prevent stalls in the communication fabric when input or output 7 | happens in bursts. It is also possible to operate on the values while they flow 8 | through the FIFO, as this code sample demonstrates.
9 | 10 | This example illustrates a typical pattern in the use of FIFOs, where a 11 | receiver receives wavelets from the fabric and forwards them to a task that 12 | performs some computation. Specifically, incoming data from the host is stored 13 | in the FIFO, thus relieving the sender from being blocked until the receiver 14 | has received all wavelets. While the incoming wavelets are being asynchronously 15 | received into the FIFO buffer, we also start a second asynchronous DSD 16 | operation that pulls data from the FIFO and forwards it to a wavelet-triggered 17 | task. 18 | 19 | This example also illustrates another common pattern, where a PE starts a 20 | wavelet-triggered task using its own wavelets, by sending them to the router 21 | which immediately sends them back to the compute element. In our example, this 22 | wavelet-triggered task simply computes the cube of the wavelet's data, before 23 | sending the result to the host. 24 | 25 | Note that, to demonstrate the use of FIFOs in this program, we use ``memcpy`` 26 | streaming mode to stream data from the host and receive in the PE program's 27 | FIFO, and to stream data out of the PE program back to the host. Because 28 | ``memcpy`` calls are serialized, the ``memcpy_h2d`` must finish before the 29 | ``memcpy_d2h``. This places an artificial restriction on our FIFO: the input 30 | size from the host cannot exceed the FIFO size, or the program will potentially 31 | stall. 32 | -------------------------------------------------------------------------------- /tutorials/topic-01-arrays-and-pointers/layout.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // The core kernel must start at P4.1 so the memcpy infrastructure has enough 16 | // resources to route the data between the host and the device. 17 | 18 | // Color/ task ID map 19 | // 20 | // ID var ID var ID var ID var 21 | // 0 9 18 27 reserved (memcpy) 22 | // 1 10 19 28 reserved (memcpy) 23 | // 2 11 20 29 reserved 24 | // 3 12 21 reserved (memcpy) 30 reserved (memcpy) 25 | // 4 13 22 reserved (memcpy) 31 reserved 26 | // 5 14 23 reserved (memcpy) 32 27 | // 6 15 24 33 28 | // 7 16 25 34 29 | // 8 17 26 35 30 | // memcpy parameter module, instantiated for a 1 x 1 kernel 31 | const memcpy = @import_module("<memcpy/get_params>", .{ 32 | .width = 1, 33 | .height = 1, 34 | }); 35 | // Single-PE layout: PE (0, 0) runs pe_program.csl with the memcpy parameters for column 0 36 | layout { 37 | @set_rectangle(1, 1); 38 | 39 | @set_tile_code(0, 0, "pe_program.csl", .{ .memcpy_params = memcpy.get_params(0) }); 40 | 41 | // export symbol name 42 | @export_name("result", [*]i16, true); 43 | @export_name("f_run", fn()void); 44 | } 45 | -------------------------------------------------------------------------------- /benchmarks/wide-multiplication/README.rst: -------------------------------------------------------------------------------- 1 | Wide Multiplication 2 | =================== 3 | 4 | This example shows a CSL program that performs multiplication of wide integers: 5 | 6 | .. code-block:: text 7 | 8 | result = X x Y 9 | 10 | where: 11 | 12 | - ``X`` and ``Y`` are 128-bit unsigned integers. 13 | - ``result`` is the 256-bit wide result of multiplying X and Y. 14 | 15 | The simulation script ``run.py`` generates random values for ``X`` and ``Y``.
16 | ``X`` is represented as a NumPy array of 16 elements of type ``uint16`` of the 17 | form 18 | 19 | .. code-block:: 20 | 21 | X = [x₀, x₁, ..., x₇, 0, 0, ..., 0] 22 | 23 | where: 24 | 25 | - The representation uses little endian. 26 | - x :subscript:`i`, i = 0, 1,..., 7, is the i-th 2-byte word of ``X``. 27 | - The eight trailing zeros are the high-order words, padding ``X`` to a full 256-bit 28 | representation. 29 | 30 | ``Y`` is represented similarly, and ``X`` and ``Y`` are concatenated and sent to 31 | the fabric as a single 32-element vector of type ``uint16``: 32 | 33 | .. code-block:: 34 | 35 | (X, Y) = [x₀, x₁, ..., x₇, 0, 0, ..., 0, y₀, y₁, ..., y₇, 0, 0, 36 | ..., 0] 37 | 38 | The multiplication is performed by a single PE which receives the input vectors 39 | (``X``, ``Y``) via the streaming H2D on color ``MEMCPYH2D_DATA_1`` and delivers 40 | the ``result`` via streaming D2H on color ``MEMCPYD2H_DATA_1``. A single color 41 | ``MEMCPYH2D_DATA_1`` is used for the delivery of both input vectors ``X`` and 42 | ``Y``. This is made possible by concatenation of ``X`` and ``Y`` into a single 43 | input vector. 44 | 45 | The multiplication is done at the bit level. In the k'th iteration of the outer 46 | loop, ``Y`` is traversed and multiplied by the bit value at position ``k`` of 47 | ``X``. This partial result is added to an accumulated result, tracking a carry 48 | bit, and ``X`` is then shifted by one position before the next iteration. 49 | -------------------------------------------------------------------------------- /benchmarks/fft-1d-2d/ucode_1d.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | param memcpyParams: comptime_struct; 16 | 17 | // Task IDs 18 | const EXIT: local_task_id = @get_local_task_id(10); 19 | // memcpy library module; provides unblock_cmd_stream(), which f_exit calls 20 | const sys_mod = @import_module( "<memcpy/memcpy>", memcpyParams); 21 | 22 | // Problem size 23 | param N: i16; 24 | const ELEM_SIZE: i16 = 2; 25 | param FP: i16; 26 | param tensor_type: type; 27 | 28 | // Zero-initialized buffers: `X` holds N*ELEM_SIZE elements and `f_twiddle` 29 | // holds N elements; both are exported to the host through the pointers below. 30 | 31 | var X = @zeros([N*ELEM_SIZE]tensor_type); 32 | var f_twiddle = @zeros([N]tensor_type); 33 | 34 | var ptr_X: [*]tensor_type = &X; 35 | var ptr_f_twiddle: [*]tensor_type = &f_twiddle; 36 | // Import the code in the file `fft.csl` as the module `mod`, instantiating its N, ARRAY_LEN, X, FP and tensor_type parameters. 37 | const mod = @import_module("fft.csl", .{ .N = N, .ARRAY_LEN = N*ELEM_SIZE, .X=&X, .FP=FP, .tensor_type=tensor_type}); 38 | // Host-callable: run mod.fft with the f_twiddle buffer, then activate the EXIT task. 39 | fn f_fft() void { 40 | mod.fft(&f_twiddle); 41 | @activate(EXIT); 42 | } 43 | // Host-callable: run mod.ifft with the f_twiddle buffer, then activate the EXIT task. 44 | fn f_ifft() void { 45 | mod.ifft(&f_twiddle); 46 | @activate(EXIT); 47 | } 48 | // Bound to EXIT in the comptime block below. 49 | task f_exit() void { 50 | // the user must unblock cmd color for every PE 51 | sys_mod.unblock_cmd_stream(); 52 | } 53 | comptime { 54 | @bind_local_task(f_exit, EXIT); 55 | @export_symbol(ptr_X, "X"); 56 | @export_symbol(ptr_f_twiddle, "f_twiddle"); 57 | @export_symbol(f_fft); 58 | @export_symbol(f_ifft); 59 | } 60 | -------------------------------------------------------------------------------- /tutorials/topic-02-libraries/layout.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems.
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Color/ task ID map 16 | // 17 | // ID var ID var ID var ID var 18 | // 0 9 18 27 reserved (memcpy) 19 | // 1 10 19 28 reserved (memcpy) 20 | // 2 11 20 29 reserved 21 | // 3 12 21 reserved (memcpy) 30 reserved (memcpy) 22 | // 4 13 22 reserved (memcpy) 31 reserved 23 | // 5 14 23 reserved (memcpy) 32 24 | // 6 15 24 33 25 | // 7 16 25 34 26 | // 8 17 26 35 27 | 28 | param iterations: u32; 29 | // memcpy parameter module, instantiated for a 1 x 1 kernel 30 | const memcpy = @import_module("<memcpy/get_params>", .{ 31 | .width = 1, 32 | .height = 1, 33 | }); 34 | // Single-PE layout: PE (0, 0) runs pe_program.csl and receives `iterations` plus the memcpy parameters for column 0 35 | layout { 36 | @set_rectangle(1, 1); 37 | 38 | @set_tile_code(0, 0, "pe_program.csl", .{ 39 | .memcpy_params = memcpy.get_params(0), 40 | .iterations = iterations 41 | }); 42 | 43 | // export symbol name 44 | @export_name("result", [*]f32, true); 45 | @export_name("start_timestamp", [*]u16, true); 46 | @export_name("finish_timestamp", [*]u16, true); 47 | @export_name("f_run", fn()void); 48 | } 49 | -------------------------------------------------------------------------------- /tutorials/gemv-02-memory-dsds/README.rst: -------------------------------------------------------------------------------- 1 | GEMV 2: Memory DSDs 2 | =================== 3 | 4 | Continuing on from the previous example, we now extend it by introducing 5 | memory Data Structure Descriptors (DSDs), an efficient mechanism for 6 | performing operations on entire tensors.
7 | 8 | This program creates three one-dimensional memory DSDs for accessing ``A``, 9 | ``b``, and ``y``, each of which specifies how to loop over the respective 10 | arrays. 11 | 12 | ``b_dsd`` and ``y_dsd`` access the ``M`` contiguous elements of ``b`` and ``y``, 13 | respectively. 14 | ``A_dsd`` accesses ``M`` elements of ``A``, but strided by ``N`` elements. 15 | Because ``A`` is stored in row major format, this means that ``A_dsd`` 16 | initially accesses the 0th column of ``A``. 17 | 18 | We demonstrate here two ways of defining DSDs. For ``y_dsd``, we specify the 19 | base memory address (``&y``) and the number of elements accessed (``M``). 20 | For ``A_dsd`` and ``b_dsd``, we demonstrate the use of a ``tensor_access`` 21 | expression. 22 | The ``tensor_access`` field specifies an induction variable, a loop bound, 23 | and an affine expression (i.e., a linear function plus a constant) to generate 24 | various addresses at runtime. 25 | 26 | 27 | These DSDs are used by the DSD operations ``@fmacs`` and ``@fadds`` to 28 | compute ``Ax + b`` and store it in ``y``. 29 | The ``gemv`` function first loops over ``N``, with the ``@fmacs`` in iteration 30 | ``i`` computing the scalar-vector product of ``x[i]`` with column ``i`` 31 | of ``A``, and incrementing ``y`` by that result. 32 | The ``increment_dsd_offset`` operation updates ``A_dsd`` by shifting its 33 | access by one element. 34 | This causes ``A_dsd`` to access the next column of ``A``. 35 | After the loop, ``y`` is incremented by ``b`` with the ``@fadds`` operation, 36 | to complete the computation. 37 | 38 | Other DSD operations and their associated operand types are described in 39 | :ref:`language-builtins-for-dsd-operations`. 
40 | -------------------------------------------------------------------------------- /benchmarks/7pt-stencil-spmv/README.rst: -------------------------------------------------------------------------------- 1 | 3D 7-Point Stencil SpMV 2 | ======================= 3 | 4 | This example evaluates the performance of a 7-point stencil. The kernel records 5 | the ``start`` and ``end`` of ``spmv`` by tsc counter. In addition, the tsc 6 | counters of all PEs are not synchronized in the beginning. To avoid the timing 7 | variation among those PEs, ``sync()`` synchronizes all PEs and samples the 8 | reference clock. 9 | 10 | The kernel ``kernel.csl`` defines a couple of host-callable functions, 11 | ``f_sync()``, ``f_tic()`` and ``f_toc()`` in order to synchronize the PEs and 12 | record the timing of ``spmv``. 13 | 14 | The kernel ``allreduce/pe.csl`` performs a reduction over the whole rectangle 15 | to synchronize the PEs, then the bottom-right PE sends a signal to other PEs 16 | to sample the reference clock. 17 | 18 | The kernel ``stencil_3d_7pts/pe.csl`` performs a matrix-vector product (spmv) 19 | where the matrix has 7 diagonals corresponding to a 7-point stencil. The stencil 20 | coefficients can vary per PE, but must be the same for the local vector. The 21 | user can change the coefficients based on the boundary condition or curvilinear 22 | coordinate transformation. 23 | 24 | The script ``run.py`` has the following parameters: 25 | 26 | - ``-k=`` specifies the maximum size of local vector. 27 | 28 | - ``--zDim=`` specifies how many elements per PE are computed. 29 | 30 | - ``--channels=`` specifies the number of I/O channels, no bigger than 16. 31 | 32 | The ``tic()`` samples "time_start" and ``toc()`` samples "time_end". The 33 | ``sync()`` samples "time_ref" which is used to adjust "time_start" and 34 | "time_end".
The elapsed time (unit: cycles) is measured by 35 | ``cycles_send = max(time_end) - min(time_start)`` 36 | 37 | The overall runtime (us) is computed via the following formula 38 | ``time_send = (cycles_send / 0.85) * 1.e-3 us`` 39 | 40 | The bandwidth is calculated by 41 | ``bandwidth = ((6*w*h*4)/time_send)`` 42 | -------------------------------------------------------------------------------- /tutorials/topic-10-map-builtin/layout.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | // Color/ task ID map 16 | // 17 | // ID var ID var ID var ID var 18 | // 0 9 18 27 reserved (memcpy) 19 | // 1 10 19 28 reserved (memcpy) 20 | // 2 11 20 29 reserved 21 | // 3 12 21 reserved (memcpy) 30 reserved (memcpy) 22 | // 4 13 22 reserved (memcpy) 31 reserved 23 | // 5 14 23 reserved (memcpy) 32 24 | // 6 15 24 33 25 | // 7 16 25 34 26 | // 8 17 26 35 27 | 28 | param size: i16; 29 | // memcpy parameter module, instantiated for a 1 x 1 kernel 30 | const memcpy = @import_module( "<memcpy/get_params>", .{ 31 | .width = 1, 32 | .height = 1, 33 | }); 34 | // Single-PE layout: PE (0, 0) runs pe_program.csl and receives `size` plus the memcpy parameters for column 0 35 | layout { 36 | @set_rectangle(1, 1); 37 | 38 | @set_tile_code(0, 0, "pe_program.csl", .{ 39 | .memcpy_params = memcpy.get_params(0), 40 | .size = size, 41 | }); 42 | 43 | // export symbol name 44 | @export_name("weight", [*]f32, true); 45 | @export_name("sqrt_diag_A", [*]f32, true); 46 | @export_name("A", [*]f32, true); 47 | @export_name("sum", [*]i32, true); 48 | @export_name("f_run", fn()void); 49 | } 50 | -------------------------------------------------------------------------------- /benchmarks/game-of-life/README.rst: -------------------------------------------------------------------------------- 1 | Conway's Game of Life 2 | ===================== 3 | 4 | This program implements 5 | `Conway's Game of Life <https://en.wikipedia.org/wiki/Conway%27s_Game_of_Life>`_ 6 | on the WSE. 7 | 8 | Conway's Game of Life is a cellular automaton which evolves on a 2D grid of 9 | square cells. Each cell is in one of two possible states, LIVE or DEAD. 10 | Every cell interacts with its neighbors, which are the cells horizontally, 11 | vertically, or diagonally adjacent. At each step in time, the following 12 | transitions occur: 13 | 14 | - Any LIVE cell with fewer than two LIVE neighbours becomes a DEAD cell. 15 | - Any LIVE cell with two or three LIVE neighbours stays a LIVE cell. 16 | - Any LIVE cell with more than three LIVE neighbours becomes a DEAD cell. 17 | - Any DEAD cell with exactly three LIVE neighbours becomes a LIVE cell. 18 | 19 | This program implements the Game of Life by assigning one cell to each PE.
20 | Zero boundary conditions are used, and thus the neighbors of a border PE that 21 | fall outside of the program rectangle are treated as always DEAD. 22 | 23 | In each generation, each PE sends its state to its four N, S, E, and W 24 | neighbors. Each PE receives the state of its four N, S, E, and W neighbors, and 25 | also forwards the received state from its N and S neighbors to its E and W 26 | neighbors. Thus, each PE receives from its E and W links both the state of its 27 | E and W adjacent neighbors and the state of its four diagonal neighbors. 28 | 29 | The program implements two initial conditions, ``random`` and ``glider``. 30 | ``random`` randomly initializes the state of all cells. ``glider`` generates 31 | several glider objects across the grid. The initial condition can be set with 32 | the ``--initial-state`` flag. 33 | 34 | The ``--show-ascii-animation`` flag will generate an ASCII animation of the 35 | cellular automaton's evolution when the program is complete. 36 | ``--save-animation`` will save a GIF of the automaton's evolution. 37 | -------------------------------------------------------------------------------- /tutorials/pipeline-01-basic/layout.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | // Color/ task ID map 16 | // 17 | // ID var ID var ID var ID var 18 | // 0 MEMCPYH2D_1 9 18 27 reserved (memcpy) 19 | // 1 MEMCPYD2H_1 10 19 28 reserved (memcpy) 20 | // 2 11 20 29 reserved 21 | // 3 12 21 reserved (memcpy) 30 reserved (memcpy) 22 | // 4 13 22 reserved (memcpy) 31 reserved 23 | // 5 14 23 reserved (memcpy) 32 24 | // 6 15 24 33 25 | // 7 16 25 34 26 | // 8 main_task_id 17 26 35 27 | 28 | param size: i16; 29 | // Numeric IDs of the streaming H2D / D2H colors, converted to colors below 30 | param MEMCPYH2D_DATA_1_ID: i16; 31 | param MEMCPYD2H_DATA_1_ID: i16; 32 | 33 | const MEMCPYH2D_DATA_1: color = @get_color(MEMCPYH2D_DATA_1_ID); 34 | const MEMCPYD2H_DATA_1: color = @get_color(MEMCPYD2H_DATA_1_ID); 35 | // memcpy parameter module for a 1 x 1 kernel, told which colors carry the H2D and D2H streams 36 | const memcpy = @import_module("<memcpy/get_params>", .{ 37 | .width = 1, 38 | .height = 1, 39 | .MEMCPYH2D_1 = MEMCPYH2D_DATA_1, 40 | .MEMCPYD2H_1 = MEMCPYD2H_DATA_1 41 | }); 42 | // Single-PE layout: PE (0, 0) runs pe_program.csl and receives `size` plus the memcpy parameters for column 0 43 | layout { 44 | @set_rectangle(1, 1); 45 | 46 | @set_tile_code(0, 0, "pe_program.csl", .{ 47 | .size = size, 48 | .memcpy_params = memcpy.get_params(0) 49 | }); 50 | } 51 | -------------------------------------------------------------------------------- /tutorials/topic-01-arrays-and-pointers/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env cs_python 2 | 3 | # Copyright 2025 Cerebras Systems. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
16 | 17 | 18 | import argparse 19 | import numpy as np 20 | 21 | from cerebras.sdk.sdk_utils import memcpy_view 22 | from cerebras.sdk.runtime.sdkruntimepybind import SdkRuntime, MemcpyDataType # pylint: disable=no-name-in-module 23 | from cerebras.sdk.runtime.sdkruntimepybind import MemcpyOrder # pylint: disable=no-name-in-module 24 | # Command-line arguments: --name is passed to SdkRuntime as its first argument, --cmaddr as its cmaddr keyword 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument('--name', help='the test name') 27 | parser.add_argument("--cmaddr", help="IP:port for CS system") 28 | args = parser.parse_args() 29 | dirname = args.name 30 | # The device-to-host copy below uses the 16-bit memcpy data type 31 | memcpy_dtype = MemcpyDataType.MEMCPY_16BIT 32 | runner = SdkRuntime(dirname, cmaddr=args.cmaddr) 33 | # Look up the device symbol named 'result' for the D2H copy 34 | result_symbol = runner.get_id('result') 35 | # Load and start the program 36 | runner.load() 37 | runner.run() 38 | # Call the device function "f_run" (nonblock=False) 39 | runner.launch("f_run", nonblock=False) 40 | 41 | # The D2H buffer must be of type u32 42 | out_tensors_u32 = np.zeros(1, np.uint32) 43 | runner.memcpy_d2h(out_tensors_u32, result_symbol, 0, 0, 1, 1, 1, \ 44 | streaming=False, data_type=memcpy_dtype, order=MemcpyOrder.COL_MAJOR, nonblock=False) 45 | 46 | # remove upper 16-bit of each u32 47 | result_tensor = memcpy_view(out_tensors_u32, np.dtype(np.int16)) 48 | 49 | runner.stop() 50 | 51 | # Ensure that the result matches our expectation 52 | np.testing.assert_equal(result_tensor, [9]) 53 | print("SUCCESS!") 54 | -------------------------------------------------------------------------------- /benchmarks/benchmark-libs/allreduce/layout.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | 16 | param colors: [1]color; 17 | param entrypoints: [4]local_task_id; 18 | param width: i16 ; // width of the core 19 | param height: i16 ; // height of the core 20 | 21 | 22 | const C0: color = colors[0]; 23 | 24 | // entrypoints of allreduce module 25 | const SEND_CTRL: local_task_id = entrypoints[0]; 26 | const SEND_DATA: local_task_id = entrypoints[1]; 27 | const STATE_ENTRY: local_task_id = entrypoints[2]; 28 | // LOCK runs only if teardown is received and the operation is done 29 | // LOCK performs the state transition 30 | // teardown handler activates LOCK 31 | // the operation blocks LOCK in the beginning and unblocks it when it finishes 32 | const C_LOCK: local_task_id = entrypoints[3]; 33 | 34 | fn get_params(px:i16, py:i16) comptime_struct { 35 | 36 | var first_py: bool = (0 == py); 37 | var last_py: bool = ((height-1) == py); 38 | 39 | var first_px: bool = (0 == px); 40 | var last_px: bool = ((width-1) == px); 41 | 42 | return .{ 43 | .first_px = first_px, 44 | .last_px = last_px, 45 | .first_py = first_py, 46 | .last_py = last_py, 47 | .C_ROUTE = C0, 48 | .C_SEND_CTRL = SEND_CTRL, 49 | .C_SEND_DATA = SEND_DATA, 50 | .C_STATE_ENTRY = STATE_ENTRY, 51 | .C_LOCK = C_LOCK, 52 | .width = width, 53 | .height = height 54 | }; 55 | } 56 | -------------------------------------------------------------------------------- /tutorials/gemv-00-basic-syntax/code.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Not a complete program; we include it here for illustrating some syntax 16 | 17 | // Every variable must be declared either "const" or "var" 18 | // Const cannot be modified after declaration, but var can 19 | 20 | // Constants defining dimensions of our matrix 21 | const M: i16 = 4; 22 | const N: i16 = 6; 23 | 24 | // 48 kB of global memory contain A, x, b, y 25 | var A: [M*N]f32; // A is stored in row-major order 26 | var x: [N]f32; 27 | var b: [M]f32; 28 | var y: [M]f32; 29 | 30 | // Initialize matrix and vectors 31 | fn initialize() void { 32 | // for loop with range syntax 33 | // loops over 0, 1, ...., M*N-1 34 | // idx stores the loop index 35 | for (@range(i16, M*N)) |idx| { 36 | // @as casts idx from i16 to f32 37 | A[idx] = @as(f32, idx); 38 | } 39 | 40 | for (@range(i16, N)) |j| { 41 | x[j] = 1.0; 42 | } 43 | 44 | // while loop with iterator syntax 45 | var i: i16 = 0; 46 | while (i < M) : (i += 1) { 47 | b[i] = 2.0; 48 | y[i] = 0.0; 49 | } 50 | } 51 | 52 | // Compute gemv 53 | fn gemv() void { 54 | for (@range(i16, M)) |i| { 55 | var tmp: f32 = 0.0; 56 | for (@range(i16, N)) |j| { 57 | tmp += A[i*N + j] * x[j]; 58 | } 59 | y[i] = tmp + b[i]; 60 | } 61 | } 62 | 63 | // Call initialize and gemv functions 64 | fn init_and_compute() void { 65 | initialize(); 66 | gemv(); 67 | } 68 | 
-------------------------------------------------------------------------------- /tutorials/gemv-06-routes-1/layout.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // matrix dimensions on each PE 16 | param M: i16; 17 | param N: i16; 18 | 19 | // Colors 20 | const send_color: color = @get_color(0); // Color used to send/recv data between PEs 21 | 22 | // This example only uses 2 PEs 23 | const memcpy = @import_module("<memcpy/get_params>", .{ 24 | .width = 2, 25 | .height = 1, 26 | }); 27 | 28 | layout { 29 | // PE coordinates are (column, row) 30 | @set_rectangle(2, 1); 31 | 32 | // Left PE (0, 0) 33 | @set_tile_code(0, 0, "pe_program.csl", .{ 34 | .memcpy_params = memcpy.get_params(0), 35 | .M = M, 36 | .N_per_PE = N / 2, 37 | .pe_id = 0, 38 | .send_color = send_color 39 | }); 40 | 41 | // Left PE sends its result to the right 42 | @set_color_config(0, 0, send_color, .{.routes = .{ .rx = .{RAMP}, .tx = .{EAST} }}); 43 | 44 | // Right PE (1, 0) 45 | @set_tile_code(1, 0, "pe_program.csl", .{ 46 | .memcpy_params = memcpy.get_params(1), 47 | .M = M, 48 | .N_per_PE = N / 2, 49 | .pe_id = 1, 50 | .send_color = send_color 51 | }); 52 | 53 | // Right PE receives result of left PE 54 | @set_color_config(1, 0, send_color, .{.routes = .{ .rx = .{WEST}, .tx = .{RAMP} }}); 55 | 56 | // export symbol names 57 | 
@export_name("A", [*]f32, true); 58 | @export_name("x", [*]f32, true); 59 | @export_name("y", [*]f32, true); 60 | @export_name("compute", fn()void); 61 | } 62 | -------------------------------------------------------------------------------- /tutorials/topic-08-filters/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env cs_python 2 | 3 | # Copyright 2025 Cerebras Systems. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | 18 | import argparse 19 | import numpy as np 20 | 21 | from cerebras.sdk.runtime.sdkruntimepybind import SdkRuntime, MemcpyDataType # pylint: disable=no-name-in-module 22 | from cerebras.sdk.runtime.sdkruntimepybind import MemcpyOrder # pylint: disable=no-name-in-module 23 | 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument('--name', help='the test name') 26 | parser.add_argument("--cmaddr", help="IP:port for CS system") 27 | args = parser.parse_args() 28 | dirname = args.name 29 | 30 | runner = SdkRuntime(dirname, cmaddr=args.cmaddr) 31 | 32 | result_symbol = runner.get_id('result') 33 | 34 | runner.load() 35 | runner.run() 36 | 37 | num_recv_pes = 3 # 3 PEs receive from the sender 38 | elems_per_pe = 3 # Each recv PE receives 3 elems after filtering 39 | 40 | print("step 1: launch function to send data to neighbors") 41 | runner.launch("main_fn", nonblock=False) 42 | 43 | print("step 2: copy back data from receiving PEs") 44 | result = np.zeros(num_recv_pes * elems_per_pe, np.float32) 45 | runner.memcpy_d2h(result, result_symbol, 1, 0, num_recv_pes, 1, elems_per_pe, streaming=False, \ 46 | data_type=MemcpyDataType.MEMCPY_32BIT, order=MemcpyOrder.ROW_MAJOR, nonblock=False) 47 | 48 | runner.stop() 49 | 50 | oracle = [6.5, 7, 7.5, 8, 8.5, 9, 9.5, 10, 10.5] 51 | np.testing.assert_allclose(result, oracle, atol=0.0001, rtol=0) 52 | print("SUCCESS!") 53 | -------------------------------------------------------------------------------- /benchmarks/25-pt-stencil/util.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | fn min(a: u16, b: u16) u16 { 16 | if (a < b) { 17 | return a; 18 | } 19 | return b; 20 | } 21 | 22 | fn computeRelativePeId(peId: u16, peCount: u16, dir: direction) u16 { 23 | if (dir == EAST or dir == SOUTH) { 24 | return peId; 25 | } 26 | if (dir == WEST or dir == NORTH) { 27 | return peCount - peId - 1; 28 | } 29 | @comptime_assert(false); 30 | } 31 | 32 | fn computeChunks(zDim: u16) u16 { 33 | // We observe that given the memory consumption of the program on chip, the 34 | // maximum number of Z-dimension values that we can allocate on chip is about 35 | // 400. The following line splits the requested Z dimension into multiple 36 | // chunks if the requested size exceeds 400. 37 | return 1 + zDim / 401; 38 | } 39 | 40 | fn computeChunkSize(zDim: u16, numChunks: u16) u16 { 41 | // If the number of chunks cleanly divides the number of elements in the Z 42 | // dimension, then use the result of the division as the size of the chunks. 43 | if (zDim % numChunks == 0) { 44 | return zDim / numChunks; 45 | } 46 | 47 | // Otherwise, bump up the chunk size by one. Note that increasing the chunk 48 | // size by one is better than increasing the number of chunks by one, since 49 | // each new chunk introduces a non-trivial overhead due to the need to perform 50 | // another round of communication with each neighbor. 
51 | return 1 + zDim / numChunks; 52 | } 53 | -------------------------------------------------------------------------------- /benchmarks/gemv-collectives_2d/README.rst: -------------------------------------------------------------------------------- 1 | GEMV with Collective Communications 2 | =================================== 3 | 4 | This example shows a CSL program that uses collective communications 5 | to perform a generalized matrix-vector (GEMV) 6 | multiplication operation of the form: 7 | 8 | .. code-block:: text 9 | 10 | y = Ax + b 11 | 12 | where: 13 | 14 | - ``A`` is a tensor of shape [M, N] (stored distributed on PE memory). 15 | - ``x`` is a tensor of shape [N, 1]. 16 | It is placed in the memory of the northwesternmost PE before computation 17 | begins, and then scattered using collective communications. 18 | - ``b`` is a tensor of shape [M, 1]. 19 | It is placed in the memory of the northwesternmost PE before computation 20 | begins, and then scattered using collective communications. 21 | - ``y`` is the output tensor of shape [M, 1]. 22 | At the end of computation, it is located in the memory of 23 | the southeasternmost PE. 24 | 25 | For simplicity, we choose N as a multiple of the 26 | width of the kernel and M as a multiple of the height of the kernel. 27 | With the default compile parameters for this example, 28 | M = 32, N = 16 and we use a PE rectangle of size 4×4 for the kernel. 29 | The parameters specifying these values can be modified at compile time. 30 | 31 | Note that this algorithm and the implementation are not optimized for 32 | performance. It is intended to serve as a non-trivial introductory example 33 | of the collectives library. 34 | 35 | All computations are done in FP32 format. 36 | 37 | The matrix ``A``, of shape [M, N], 38 | is distributed across the PE memories as follows: 39 | 40 | - The first dimension of ``A``, M rows, is distributed across 41 | the height of the kernel. 
42 | - The second dimension of ``A``, N columns, is distributed across 43 | the width of the kernel. 44 | 45 | Since we know that M is 32 and the height of the kernel is 4, each PE will be 46 | assigned 32÷4 = 8 rows of ``A``. 47 | 48 | Similarly, each PE will get 16÷4 = 4 columns of ``A``. This means each PE is 49 | assigned an 8×4 chunk of the original matrix ``A``. 50 | -------------------------------------------------------------------------------- /benchmarks/residual/README.rst: -------------------------------------------------------------------------------- 1 | Residual 2 | ======== 3 | 4 | This example shows a CSL program that uses a rectangle of 2-by-2 PEs to compute 5 | ``|b - A * x|``, i.e., the norm of the residual ``b - A * x``. 6 | 7 | ``A`` is an ``M x N`` matrix. Each PE computes a part of the ``A'*x`` 8 | multiplication, where ``A'`` is an ``M/2 x N/2`` matrix. In other words, each PE 9 | essentially does "a fourth" of the multiplication. It then does a row reduction, 10 | so that the last column of PEs has the result ``b - A*x``. Finally, the PEs of 11 | the last column compute the norm, ``|b-A*x|``, via a column reduction. 12 | 13 | The 2-by-2 rectangle is surrounded by memcpy infrastructure which occupies five 14 | columns of PEs shown below. 15 | The memcpy routes the input and output data between the host and the device. 16 | 17 | .. _fig-residual-memcpy-2-by-2: 18 | 19 | .. figure:: ./images/residual-memcpy-2-by-2.png 20 | :align: center 21 | :width: 980 px 22 | 23 | The matrix ``A``, the input vectors ``x`` and ``b`` and the output scalar (the 24 | computed norm ``|b - A * x|``) are supported by memcpy streaming. 25 | 26 | - The matrix ``A`` is distributed into the PEs. 27 | For simplicity, the matrix dimensions ``M x N`` are assumed even. 28 | 29 | - The vector ``x`` is distributed into the first row PEs. 30 | The first row receives ``x`` from the memcpy, then 31 | broadcasts ``x`` into other rows. 
The incoming vector ``x`` is distributed 32 | across all N = 4 PEs along the top side of the rectangle. 33 | 34 | - The vector ``b`` is distributed into rows of the first column. 35 | The vector ``b`` is distributed across all M = 6 PEs 36 | along the left side of the rectangle. 37 | 38 | - The scalar ``nrm_r`` is sent out by the PE with coordinates ``pe_x=1`` and 39 | ``pe_y=0``. 40 | 41 | Three functions ``GEMV``, ``AXPY``, and ``NRMINF`` are defined separately, and 42 | are loaded by ``import_module``. ``GEMV`` computes ``y = A*x``, ``AXPY`` 43 | computes ``y = alpha*x`` and ``NRMINF`` computes the norm. ``SIMD`` operations 44 | are used in ``GEMV`` and ``AXPY`` to reduce the overhead of address computation. 45 | -------------------------------------------------------------------------------- /tutorials/sdklayout-04-h2d-d2h/add2vec.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | param size: u16; 16 | param rx1: color; 17 | param rx2: color; 18 | param tx: color; 19 | 20 | const in_q1 = @get_input_queue(0); 21 | const in_q2 = @get_input_queue(1); 22 | const out_q = @get_output_queue(0); 23 | 24 | const input1 = @get_dsd(fabin_dsd, .{.extent = size, 25 | .fabric_color = rx1, 26 | .input_queue = in_q1}); 27 | 28 | const input2 = @get_dsd(fabin_dsd, .{.extent = size, 29 | .fabric_color = rx2, 30 | .input_queue = in_q2}); 31 | 32 | const output = @get_dsd(fabout_dsd, .{.extent = size, 33 | .fabric_color = tx, 34 | .output_queue = out_q}); 35 | 36 | // WSE3 does not allow multiple fabric inputs per DSD operation. 37 | // Therefore, we introduce a FIFO for portability between WSE2 38 | // and WSE3. 39 | var buffer: [size]u16; 40 | const fifo = @allocate_fifo(buffer); 41 | const main_id = @get_local_task_id(8); 42 | task main() void { 43 | @mov16(fifo, input2, .{.async = true}); 44 | @add16(output, input1, fifo, .{.async = true}); 45 | } 46 | 47 | comptime { 48 | @bind_local_task(main, main_id); 49 | @activate(main_id); 50 | 51 | @initialize_queue(in_q1, .{.color = rx1}); 52 | @initialize_queue(in_q2, .{.color = rx2}); 53 | 54 | if (@is_arch("wse3")) { 55 | @initialize_queue(out_q, .{.color = tx}); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /tutorials/sdklayout-03-ports-and-connections/add2vec.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | param size: u16; 16 | param rx1: color; 17 | param rx2: color; 18 | param tx: color; 19 | 20 | const in_q1 = @get_input_queue(0); 21 | const in_q2 = @get_input_queue(1); 22 | const out_q = @get_output_queue(0); 23 | 24 | const input1 = @get_dsd(fabin_dsd, .{.extent = size, 25 | .fabric_color = rx1, 26 | .input_queue = in_q1}); 27 | 28 | const input2 = @get_dsd(fabin_dsd, .{.extent = size, 29 | .fabric_color = rx2, 30 | .input_queue = in_q2}); 31 | 32 | const output = @get_dsd(fabout_dsd, .{.extent = size, 33 | .fabric_color = tx, 34 | .output_queue = out_q}); 35 | 36 | // WSE3 does not allow multiple fabric inputs per DSD operation. 37 | // Therefore, we introduce a FIFO for portability between WSE2 38 | // and WSE3. 39 | var buffer: [size]u16; 40 | const fifo = @allocate_fifo(buffer); 41 | const main_id = @get_local_task_id(8); 42 | task main() void { 43 | @mov16(fifo, input2, .{.async = true}); 44 | @add16(output, input1, fifo, .{.async = true}); 45 | } 46 | 47 | comptime { 48 | @bind_local_task(main, main_id); 49 | @activate(main_id); 50 | 51 | @initialize_queue(in_q1, .{.color = rx1}); 52 | @initialize_queue(in_q2, .{.color = rx2}); 53 | 54 | if (@is_arch("wse3")) { 55 | @initialize_queue(out_q, .{.color = tx}); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /tutorials/pipeline-03-multiple/pe_program.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | param memcpy_params: comptime_struct; 16 | 17 | param size: i16; 18 | 19 | param Cin: color; 20 | param Cout: color; 21 | 22 | // Queue IDs 23 | const Cin_iq: input_queue = @get_input_queue(2); 24 | const Cout_oq: output_queue = @get_output_queue(3); 25 | 26 | // Task IDs 27 | const main_task_id: local_task_id = @get_local_task_id(8); 28 | 29 | const sys_mod = @import_module("<memcpy/memcpy>", memcpy_params); 30 | 31 | const Cin_in_dsd = @get_dsd(fabin_dsd, .{ 32 | .extent = size, 33 | .fabric_color = Cin, 34 | .input_queue = Cin_iq, 35 | }); 36 | 37 | const Cout_out_dsd = @get_dsd(fabout_dsd, .{ 38 | .extent = size, 39 | .fabric_color = Cout, 40 | .output_queue = Cout_oq, 41 | }); 42 | 43 | const buf = [1]i16{ 1 }; 44 | const one_dsd = @get_dsd(mem1d_dsd, .{ .tensor_access = |i|{size} -> buf[0] }); 45 | 46 | task main_task() void { 47 | @add16(Cout_out_dsd, Cin_in_dsd, one_dsd, .{ .async = true }); 48 | } 49 | 50 | comptime { 51 | // activate local task main_task at startup 52 | @bind_local_task(main_task, main_task_id); 53 | @activate(main_task_id); 54 | 55 | @set_local_color_config(Cin, .{ .routes = .{ .rx = .{ WEST }, .tx = .{ RAMP }}}); 56 | @set_local_color_config(Cout, .{ .routes = .{ .rx = .{ RAMP }, .tx = .{ EAST }}}); 57 | 58 | // On WSE-3, we must explicitly initialize input and output queues 59 | if (@is_arch("wse3")) { 60 | @initialize_queue(Cin_iq, .{ .color = 
Cin }); 61 | @initialize_queue(Cout_oq, .{ .color = Cout }); 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /tutorials/topic-03-streaming-wavelet-data/pe_program.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Not a complete program; the top-level source file is layout.csl. 
16 | 17 | param memcpy_params: comptime_struct; 18 | 19 | const sys_mod = @import_module("<memcpy/memcpy>", memcpy_params); 20 | 21 | // Queue IDs 22 | const h2d_data_1_iq: input_queue = @get_input_queue(2); 23 | const d2h_data_1_oq: output_queue = @get_output_queue(3); 24 | 25 | // Data task main_task triggered by wlts along MEMCPYH2D_DATA_1 26 | // On WSE-2, data task IDs are created from colors; on WSE-3, from input queues 27 | const main_task_id: data_task_id = 28 | if (@is_arch("wse2")) @get_data_task_id(sys_mod.MEMCPYH2D_1) 29 | else if (@is_arch("wse3")) @get_data_task_id(h2d_data_1_iq); 30 | 31 | export var global: i16 = 0; 32 | 33 | const out_dsd = @get_dsd(fabout_dsd, .{ 34 | .extent = 1, 35 | .fabric_color = sys_mod.MEMCPYD2H_1, 36 | .output_queue = d2h_data_1_oq 37 | }); 38 | 39 | task main_task(wavelet_data: i16) void { 40 | global = wavelet_data; 41 | // The non-async operation works here because only one wavelet is sent 42 | // It would be better to use async operation with .{async = true} 43 | @mov16(out_dsd, global); 44 | } 45 | 46 | comptime { 47 | @bind_data_task(main_task, main_task_id); 48 | 49 | // On WSE-3, we must explicitly initialize input and output queues 50 | if (@is_arch("wse3")) { 51 | @initialize_queue(h2d_data_1_iq, .{ .color = sys_mod.MEMCPYH2D_1 }); 52 | @initialize_queue(d2h_data_1_oq, .{ .color = sys_mod.MEMCPYD2H_1 }); 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /tutorials/topic-15-wse3-microthreads/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env cs_python 2 | 3 | # Copyright 2025 Cerebras Systems. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | import argparse 19 | import numpy as np 20 | 21 | from cerebras.sdk.runtime.sdkruntimepybind import SdkRuntime, MemcpyDataType, MemcpyOrder # pylint: disable=no-name-in-module 22 | 23 | # Read arguments 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument('--name', help="the test compile output dir") 26 | parser.add_argument('--cmaddr', help="IP:port for CS system") 27 | args = parser.parse_args() 28 | 29 | M = 10 30 | y = np.arange(M, dtype=np.float32) 31 | y_expected = y 32 | 33 | # Construct a runner using SdkRuntime 34 | runner = SdkRuntime(args.name, cmaddr=args.cmaddr) 35 | 36 | # Get symbol for y on device 37 | y_symbol = runner.get_id('y') 38 | 39 | # Load and run the program 40 | runner.load() 41 | runner.run() 42 | 43 | 44 | # Copy y into PE (0, 0) 45 | runner.memcpy_h2d(y_symbol, y, 0, 0, 1, 1, M, streaming=False, 46 | order=MemcpyOrder.ROW_MAJOR, data_type=MemcpyDataType.MEMCPY_32BIT, nonblock=False) 47 | 48 | # Launch the compute function on device 49 | runner.launch('compute', nonblock=False) 50 | 51 | # Copy y back from PE (1, 0) 52 | y_result = np.zeros([M], dtype=np.float32) 53 | runner.memcpy_d2h(y_result, y_symbol, 1, 0, 1, 1, M, streaming=False, 54 | order=MemcpyOrder.ROW_MAJOR, data_type=MemcpyDataType.MEMCPY_32BIT, nonblock=False) 55 | 56 | # Stop the program 57 | runner.stop() 58 | 59 | # Ensure that the result matches our expectation 60 | np.testing.assert_allclose(y_result, y_expected, atol=0.01, rtol=0) 61 | print("SUCCESS!") 62 | 
-------------------------------------------------------------------------------- /tutorials/topic-04-sparse-tensors/pe_program.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Not a complete program; the top-level source file is layout.csl. 16 | 17 | param memcpy_params: comptime_struct; 18 | 19 | const sys_mod = @import_module("<memcpy/memcpy>", memcpy_params); 20 | 21 | // Queue IDs 22 | const h2d_data_1_iq: input_queue = @get_input_queue(2); 23 | const d2h_data_1_oq: output_queue = @get_output_queue(3); 24 | 25 | // Data task main_task triggered by wlts along MEMCPYH2D_DATA_1 26 | // On WSE-2, data task IDs are created from colors; on WSE-3, from input queues 27 | const main_task_id: data_task_id = 28 | if (@is_arch("wse2")) @get_data_task_id(sys_mod.MEMCPYH2D_1) 29 | else if (@is_arch("wse3")) @get_data_task_id(h2d_data_1_iq); 30 | 31 | var result = [4]i16 { 0, 0, 0, 0 }; 32 | 33 | const out_dsd = @get_dsd(fabout_dsd, .{ 34 | .extent = 1, 35 | .fabric_color = sys_mod.MEMCPYD2H_1, 36 | .output_queue = d2h_data_1_oq 37 | }); 38 | 39 | task main_task(wavelet_data: i16, index: i16) void { 40 | result[index] = wavelet_data; 41 | // The non-async operation works here because only two wavelets are sent 42 | // It would be better to use async operation with .{async = true} 43 | @mov16(out_dsd, 
wavelet_data); 44 | } 45 | 46 | comptime { 47 | @bind_data_task(main_task, main_task_id); 48 | 49 | // On WSE-3, we must explicitly initialize input and output queues 50 | if (@is_arch("wse3")) { 51 | @initialize_queue(h2d_data_1_iq, .{ .color = sys_mod.MEMCPYH2D_1 }); 52 | @initialize_queue(d2h_data_1_oq, .{ .color = sys_mod.MEMCPYD2H_1 }); 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /tutorials/gemv-02-memory-dsds/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env cs_python 2 | 3 | # Copyright 2025 Cerebras Systems. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | 18 | import argparse 19 | import numpy as np 20 | 21 | from cerebras.sdk.runtime.sdkruntimepybind import SdkRuntime, MemcpyDataType, MemcpyOrder # pylint: disable=no-name-in-module 22 | 23 | # Read arguments 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument('--name', help="the test compile output dir") 26 | parser.add_argument('--cmaddr', help="IP:port for CS system") 27 | args = parser.parse_args() 28 | 29 | # Matrix dimensions 30 | M = 4 31 | N = 6 32 | 33 | # Construct A, x, b 34 | A = np.arange(M*N, dtype=np.float32).reshape(M, N) 35 | x = np.full(shape=N, fill_value=1.0, dtype=np.float32) 36 | b = np.full(shape=M, fill_value=2.0, dtype=np.float32) 37 | 38 | # Calculate expected y 39 | y_expected = A@x + b 40 | 41 | # Construct a runner using SdkRuntime 42 | runner = SdkRuntime(args.name, cmaddr=args.cmaddr) 43 | 44 | # Get symbol for copying y result off device 45 | y_symbol = runner.get_id('y') 46 | 47 | # Load and run the program 48 | runner.load() 49 | runner.run() 50 | 51 | # Launch the init_and_compute function on device 52 | runner.launch('init_and_compute', nonblock=False) 53 | 54 | # Copy y back from device 55 | y_result = np.zeros([1*1*M], dtype=np.float32) 56 | runner.memcpy_d2h(y_result, y_symbol, 0, 0, 1, 1, M, streaming=False, 57 | order=MemcpyOrder.ROW_MAJOR, data_type=MemcpyDataType.MEMCPY_32BIT, nonblock=False) 58 | 59 | # Stop the program 60 | runner.stop() 61 | 62 | # Ensure that the result matches our expectation 63 | np.testing.assert_allclose(y_result, y_expected, atol=0.01, rtol=0) 64 | print("SUCCESS!") 65 | -------------------------------------------------------------------------------- /tutorials/topic-15-wse3-microthreads/README.rst: -------------------------------------------------------------------------------- 1 | Topic 15: WSE-3 Microthreads 2 | ============================ 3 | 4 | Unlike WSE-2, the WSE-3 architecture exposes microthread IDs. 
5 | This example demonstrates the use of explicit microthread IDs 6 | on the WSE-3 architecture. 7 | 8 | On WSE-2, the queue ID of an input or output fabric DSD corresponds to the 9 | ID of the microthread in which that operation executes. 10 | On WSE-3, queue IDs and microthreads can be decoupled, so that any 11 | microthread ID 0 to 7 can be used with any of queues 0 to 7. 12 | 13 | In this example, the left PE sends ``M`` wavelets to the right PE over 14 | the color ``send_color``. 15 | These wavelets are sent in an asynchronous ``@fmovs`` operation which 16 | copies from the ``y`` array via ``y_dsd`` into ``out_dsd``. 17 | ``out_dsd`` is a ``fabout_dsd`` associated with the color ``send_color``, 18 | and the output queue with ID 2. 19 | The ``@fmovs`` operation is launched using microthread ID 4. 20 | 21 | The right PE receives these ``M`` wavelets on the same color (called 22 | ``right_color`` in ``right_pe.csl``) via ``in_dsd``, which uses input 23 | queue with ID 2. 24 | The asynchronous ``@fmovs`` operation which receives these wavelets 25 | and copies them into ``y`` is launched using microthread ID 5. 26 | 27 | Decoupling microthread IDs from queue IDs can provide valuable flexibility 28 | in managing program resource usage, and conserve microthreads. 29 | 30 | By using explicit microthread IDs, we allow CSL's DSR allocator to use fewer 31 | DSRs in situations where fabric DSD operands are not known at compile time. 32 | 33 | Additionally, on the WSE-3, output queues cannot be re-used with a different 34 | color if they have not yet been drained, and CSL does not yet support a 35 | mechanism for guaranteeing that a given queue is empty. 36 | This may force the programmer to use more output queues than needed, which in 37 | turn can lead to overusing microthread IDs (if they are not explicitly 38 | specified, they default to the respective queue IDs). 
39 | By allowing explicit microthread IDs, a programmer can share microthreads 40 | between output queues, and thus conserve microthreads for other operations. 41 | Note, however, that two operations cannot concurrently use the same microthread. 42 | -------------------------------------------------------------------------------- /tutorials/sdklayout-05-gemv/mux.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | param size: u16; 16 | param in_color: color; 17 | param out_color: color; 18 | 19 | const ctrl = @import_module("<control>"); 20 | 21 | const input_q = @get_input_queue(0); 22 | const output_q = @get_output_queue(1); 23 | 24 | const inDSD = @get_dsd(fabin_dsd, .{.extent = size, 25 | .fabric_color = in_color, 26 | .input_queue = input_q}); 27 | 28 | const outDSD = @get_dsd(fabout_dsd, .{.extent = size, 29 | .fabric_color = out_color, 30 | .output_queue = output_q}); 31 | 32 | const ctrlOurDSD = @get_dsd(fabout_dsd, .{.extent = 1, 33 | .fabric_color = out_color, 34 | .output_queue = output_q, 35 | .control = true}); 36 | 37 | const main_id = @get_local_task_id(8); 38 | task main() void { 39 | @mov32(outDSD, inDSD, .{.async = true, .activate = send_ctrl}); 40 | } 41 | 42 | // This task sends a control wavelet to self, in order to 43 | // advance the switch position.
44 | const send_ctrl_id = @get_local_task_id(9); 45 | task send_ctrl() void { 46 | @mov32(ctrlOurDSD, ctrl.encode_single_payload(ctrl.opcode.SWITCH_ADV, true, {}, 0)); 47 | } 48 | 49 | comptime { 50 | @bind_local_task(main, main_id); 51 | @activate(main_id); 52 | 53 | @bind_local_task(send_ctrl, send_ctrl_id); 54 | 55 | @initialize_queue(input_q, .{.color = in_color}); 56 | if (@is_arch("wse3")) { 57 | @initialize_queue(output_q, .{.color = out_color}); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /tutorials/pipeline-01-basic/README.rst: -------------------------------------------------------------------------------- 1 | 2 | Pipeline 1: Redirect fabin to fabout 3 | ==================================== 4 | 5 | While wavelet-triggered tasks enable us to receive and operate on one wavelet at 6 | a time, the programmer may need a way to receive a tensor comprised of multiple 7 | wavelets using one instruction. This is enabled by fabric input DSDs. 8 | Similarly, using fabric output DSDs, the programmer can send multiple wavelets 9 | using one instruction. 10 | 11 | This example illustrates two fabric DSDs, one for input and another for output. 12 | Each fabric DSD requires a corresponding color. 13 | 14 | Crucially, when using a fabric input DSD, it is important that the programmer 15 | blocks the wavelet's color, as this example does for the color 16 | ``MEMCPYH2D_DATA_1``. 17 | Otherwise, wavelets of that color will attempt to activate the (empty) task 18 | associated with the color, which in turn will consume the wavelet before it can 19 | be consumed by the fabric input DSD. 20 | 21 | This example only has a single PE, which receives data via H2D and sends it out 22 | via D2H in one vector operation. Logically speaking it is NOT valid because H2D 23 | and D2H are serialized. The host triggers D2H only if H2D is done. 
The hardware 24 | has some internal queues to hold the data for I/O, so H2D finishes when it 25 | pushes all data into the dedicated queues. This example still works if the size 26 | does not exceed the capacity of such queues. Otherwise H2D stalls. 27 | 28 | The parameter ``size`` controls the number of wavelets of H2D and D2H. The 29 | program stalls when ``size`` exceeds 14. 30 | 31 | This programming paradigm is called the pipelined approach: the kernel receives 32 | input data without storing it into memory, instead redirecting the result to 33 | the output. The microthread is necessary because the CE (compute engine) must 34 | have some resources to run the ``memcpy`` kernel. The kernel stalls if a blocking 35 | instruction ``@add16(outDsd, inDsd, 1)`` is used. The simulation stalls, and 36 | the instruction trace shows ``@add16`` repeatedly querying data from input 37 | queue 1, which is still empty. The router receives the H2D command much later 38 | than running ``@add16``. The CE has no resource to run the H2D command received 39 | by the router, so it stalls. 40 | -------------------------------------------------------------------------------- /tutorials/pipeline-02-fifo/layout.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | // Color/ task ID map 16 | // 17 | // ID var ID var ID var ID var 18 | // 0 MEMCPYH2D_1 9 C1 18 27 reserved (memcpy) 19 | // 1 MEMCPYD2H_1 10 19 28 reserved (memcpy) 20 | // 2 11 20 29 reserved 21 | // 3 12 21 reserved (memcpy) 30 reserved (memcpy) 22 | // 4 13 22 reserved (memcpy) 31 reserved 23 | // 5 14 23 reserved (memcpy) 32 24 | // 6 15 24 33 25 | // 7 16 25 34 26 | // 8 main_task_id 17 26 35 27 | 28 | // Number of elements sent through core program rectangle 29 | param size: i16; 30 | 31 | param MEMCPYH2D_DATA_1_ID: i16; 32 | param MEMCPYD2H_DATA_1_ID: i16; 33 | 34 | const MEMCPYH2D_DATA_1: color = @get_color(MEMCPYH2D_DATA_1_ID); 35 | const MEMCPYD2H_DATA_1: color = @get_color(MEMCPYD2H_DATA_1_ID); 36 | 37 | const C1: color = @get_color(9); 38 | 39 | const memcpy = @import_module("<memcpy/get_params>", .{ 40 | .width = 1, 41 | .height = 1, 42 | .MEMCPYH2D_1 = MEMCPYH2D_DATA_1, 43 | .MEMCPYD2H_1 = MEMCPYD2H_DATA_1 44 | }); 45 | 46 | layout { 47 | @set_rectangle(1, 1); 48 | 49 | @set_tile_code(0, 0, "pe_program.csl", .{ 50 | .memcpy_params = memcpy.get_params(0), 51 | .size = size, 52 | .C1 = C1 53 | }); 54 | 55 | // fifo sends out the data via C1 --> tx = RAMP 56 | // add16 receives data via C1 --> rx = RAMP 57 | @set_color_config(0, 0, C1, .{ .routes = .{ .rx = .{ RAMP }, .tx = .{ RAMP }}}); 58 | } 59 | -------------------------------------------------------------------------------- /benchmarks/residual/axpy.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // http://www.netlib.org/lapack/explore-html/d8/daf/saxpy_8f.html 16 | // SAXPY constant times a vector plus a vector. 17 | // y = y + alpha*x 18 | // 19 | // @param[in] n number of elements of the input vectors 20 | // @param[in] alpha scalar 21 | // @param[in] x array of dimension n 22 | // x[j] can be NAN or INF if alpha is zero 23 | // @param[in,out] y array of dimension n 24 | 25 | param sizeXY: i16; // size of x and y, sizeXY >= n 26 | 27 | // To change the base address and the length of a DSD, csl requires a dummy DSD. 28 | // The type here doesn't matter 29 | const dummy = @zeros([1]i16); 30 | // The length doesn't matter either since csl will overwrite it 31 | const dummy_dsd = @get_dsd(mem1d_dsd, .{ .tensor_access = |i|{42} -> dummy[i] }); 32 | 33 | fn saxpy(n: i16, alpha: f32, x: *[sizeXY]f32, y: *[sizeXY]f32) void { 34 | // bind vector x to a DSD 35 | var mem_x_buf_dsd = @set_dsd_base_addr(dummy_dsd, x); 36 | mem_x_buf_dsd = @set_dsd_length(mem_x_buf_dsd, @as(u16, n)); 37 | 38 | // bind vector y to DSD 39 | // it is based on mem_x_buf_dsd, so no need to set the length again 40 | var mem_y_buf_dsd = @set_dsd_base_addr(mem_x_buf_dsd, y); 41 | 42 | // fast path: if alpha is zero, no need to compute 43 | if (alpha == 0.0) { 44 | return; 45 | } 46 | 47 | // y[j] = y[j] + x[j]*alpha, j = 0,1,2,...,n-1 48 | // The SIMD fmacs replaces the following for-loop 49 | // ======== 50 | // var row : i16 = 0; 51 | // while(row < n) : (row +=1) { 52 | // (y.*)[row] = (y.*)[row] + alpha * (x.*)[row]; 53 | // } 54 | 
// ======== 55 | @fmacs(mem_y_buf_dsd, mem_y_buf_dsd, mem_x_buf_dsd, alpha); 56 | } 57 | -------------------------------------------------------------------------------- /tutorials/sdklayout-02-routing/send_receive.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Select sender (0) or receiver (1) 16 | param select: u16; 17 | param c: color; 18 | 19 | const in_q = @get_input_queue(0); 20 | const out_q = @get_output_queue(1); 21 | 22 | const mode = enum(u16) {send = 0, receive = 1}; 23 | 24 | // Buffer to be sent 25 | const size = 5; 26 | const data = [size]u16{1, 2, 3, 4, 5}; 27 | 28 | // Buffer to receive data 29 | export var buffer: [size]u16; 30 | 31 | const dataDSD = @get_dsd(mem1d_dsd, .{.base_address = &data, .extent = size}); 32 | const bufferDSD = @get_dsd(mem1d_dsd, .{.base_address = &buffer, .extent = size}); 33 | 34 | const inDSD = @get_dsd(fabin_dsd, .{.extent = size, .fabric_color = c, .input_queue = in_q}); 35 | const outDSD = @get_dsd(fabout_dsd, .{.extent = size, .fabric_color = c, .output_queue = out_q}); 36 | 37 | // Sender task 38 | const send_task_id = @get_local_task_id(8); 39 | task send_task() void { 40 | @mov16(outDSD, dataDSD, .{.async = true}); 41 | } 42 | 43 | // Receiver task 44 | const receive_task_id = @get_local_task_id(9); 45 | task receive_task() void 
{ 46 | @mov16(bufferDSD, inDSD, .{.async = true}); 47 | } 48 | 49 | const main_id = @get_local_task_id(10); 50 | task main() void { 51 | // Select sender or receiver 52 | switch(@as(mode, select)) { 53 | mode.send => @activate(send_task_id), 54 | mode.receive => @activate(receive_task_id) 55 | } 56 | } 57 | 58 | comptime { 59 | @bind_local_task(send_task, send_task_id); 60 | @bind_local_task(receive_task, receive_task_id); 61 | @bind_local_task(main, main_id); 62 | @activate(main_id); 63 | 64 | @initialize_queue(in_q, .{.color = c}); 65 | if (@is_arch("wse3")) { 66 | @initialize_queue(out_q, .{.color = c}); 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /tutorials/gemv-05-multiple-pes/pe_program.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | param memcpy_params: comptime_struct; 16 | 17 | // Matrix dimensions 18 | param M: i16; 19 | param N: i16; 20 | 21 | // memcpy module provides infrastructure for copying data 22 | // and launching functions from the host 23 | const sys_mod = @import_module("<memcpy/memcpy>", memcpy_params); 24 | 25 | 26 | // 48 kB of global memory contain A, x, b, y 27 | var A: [M*N]f32; // A is stored row major 28 | var x: [N]f32; 29 | var b: [M]f32; 30 | var y = @zeros([M]f32); // Initialize y to zero 31 | 32 | // DSDs for accessing A, b, y 33 | // A_dsd accesses column of A 34 | var A_dsd = @get_dsd(mem1d_dsd, .{ .tensor_access = |i|{M} -> A[i*N] }); 35 | var b_dsd = @get_dsd(mem1d_dsd, .{ .base_address = &b, .extent = M }); 36 | var y_dsd = @get_dsd(mem1d_dsd, .{ .base_address = &y, .extent = M }); 37 | 38 | // ptrs to A, x, b, y will be advertised as symbols to host 39 | var A_ptr: [*]f32 = &A; 40 | var x_ptr: [*]f32 = &x; 41 | var b_ptr: [*]f32 = &b; 42 | const y_ptr: [*]f32 = &y; 43 | 44 | // Compute gemv 45 | fn gemv() void { 46 | // Loop over all columns of A 47 | for (@range(i16, N)) |i| { 48 | // Calculate contribution to A*x from ith column of A, ith elem of x 49 | @fmacs(y_dsd, y_dsd, A_dsd, x[i]); 50 | // Move A_dsd to next column of A 51 | A_dsd = @increment_dsd_offset(A_dsd, 1, f32); 52 | } 53 | // Add b to A*x 54 | @fadds(y_dsd, y_dsd, b_dsd); 55 | } 56 | 57 | // Call initialize and gemv functions 58 | fn compute() void { 59 | gemv(); 60 | sys_mod.unblock_cmd_stream(); 61 | } 62 | 63 | comptime { 64 | @export_symbol(A_ptr, "A"); 65 | @export_symbol(x_ptr, "x"); 66 | @export_symbol(b_ptr, "b"); 67 | @export_symbol(y_ptr, "y"); 68 | @export_symbol(compute); 69 | } 70 | -------------------------------------------------------------------------------- /tutorials/gemv-04-params/pe_program.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems.
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | param memcpy_params: comptime_struct; 16 | 17 | // Matrix dimensions 18 | param M: i16; 19 | param N: i16; 20 | 21 | // memcpy module provides infrastructure for copying data 22 | // and launching functions from the host 23 | const sys_mod = @import_module("<memcpy/memcpy>", memcpy_params); 24 | 25 | 26 | // 48 kB of global memory contain A, x, b, y 27 | var A: [M*N]f32; // A is stored row major 28 | var x: [N]f32; 29 | var b: [M]f32; 30 | var y = @zeros([M]f32); // Initialize y to zero 31 | 32 | // DSDs for accessing A, b, y 33 | // A_dsd accesses column of A 34 | var A_dsd = @get_dsd(mem1d_dsd, .{ .tensor_access = |i|{M} -> A[i*N] }); 35 | var b_dsd = @get_dsd(mem1d_dsd, .{ .base_address = &b, .extent = M }); 36 | var y_dsd = @get_dsd(mem1d_dsd, .{ .base_address = &y, .extent = M }); 37 | 38 | // ptrs to A, x, b, y will be advertised as symbols to host 39 | var A_ptr: [*]f32 = &A; 40 | var x_ptr: [*]f32 = &x; 41 | var b_ptr: [*]f32 = &b; 42 | const y_ptr: [*]f32 = &y; 43 | 44 | // Compute gemv 45 | fn gemv() void { 46 | // Loop over all columns of A 47 | for (@range(i16, N)) |i| { 48 | // Calculate contribution to A*x from ith column of A, ith elem of x 49 | @fmacs(y_dsd, y_dsd, A_dsd, x[i]); 50 | // Move A_dsd to next column of A 51 | A_dsd = @increment_dsd_offset(A_dsd, 1, f32); 52 | } 53 | // Add b to A*x 54 | @fadds(y_dsd, y_dsd, b_dsd); 55 | } 56 | 57 | // Call
initialize and gemv functions 58 | fn init_and_compute() void { 59 | gemv(); 60 | sys_mod.unblock_cmd_stream(); 61 | } 62 | 63 | comptime { 64 | @export_symbol(A_ptr, "A"); 65 | @export_symbol(x_ptr, "x"); 66 | @export_symbol(b_ptr, "b"); 67 | @export_symbol(y_ptr, "y"); 68 | @export_symbol(init_and_compute); 69 | } 70 | -------------------------------------------------------------------------------- /tutorials/gemv-03-memcpy/pe_program.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | param memcpy_params: comptime_struct; 16 | 17 | // memcpy module provides infrastructure for copying data 18 | // and launching functions from the host 19 | const sys_mod = @import_module("<memcpy/memcpy>", memcpy_params); 20 | 21 | // Constants defining dimensions of our matrix 22 | const M: i16 = 4; 23 | const N: i16 = 6; 24 | 25 | // 48 kB of global memory contain A, x, b, y 26 | var A: [M*N]f32; // A is stored row major 27 | var x: [N]f32; 28 | var b: [M]f32; 29 | var y = @zeros([M]f32); // Initialize y to zero 30 | 31 | // DSDs for accessing A, b, y 32 | // A_dsd accesses column of A 33 | var A_dsd = @get_dsd(mem1d_dsd, .{ .tensor_access = |i|{M} -> A[i*N] }); 34 | var b_dsd = @get_dsd(mem1d_dsd, .{ .base_address = &b, .extent = M }); 35 | var y_dsd = @get_dsd(mem1d_dsd, .{ .base_address = &y, .extent = M }); 36 | 37 | // ptrs to A, x, b, y will be advertised as symbols to host 38 | var A_ptr: [*]f32 = &A; 39 | var x_ptr: [*]f32 = &x; 40 | var b_ptr: [*]f32 = &b; 41 | const y_ptr: [*]f32 = &y; 42 | 43 | // Compute gemv 44 | fn gemv() void { 45 | // Loop over all columns of A 46 | for (@range(i16, N)) |i| { 47 | // Calculate contribution to A*x from ith column of A, ith elem of x 48 | @fmacs(y_dsd, y_dsd, A_dsd, x[i]); 49 | // Move A_dsd to next column of A 50 | A_dsd = @increment_dsd_offset(A_dsd, 1, f32); 51 | } 52 | // Add b to A*x 53 | @fadds(y_dsd, y_dsd, b_dsd); 54 | } 55 | 56 | // Call initialize and gemv functions 57 | fn init_and_compute() void { 58 | gemv(); 59 | sys_mod.unblock_cmd_stream(); 60 | } 61 | 62 | comptime { 63 | @export_symbol(A_ptr, "A"); 64 | @export_symbol(x_ptr, "x"); 65 | @export_symbol(b_ptr, "b"); 66 | @export_symbol(y_ptr, "y"); 67 | @export_symbol(init_and_compute); 68 | } 69 | -------------------------------------------------------------------------------- /tutorials/pipeline-03-multiple/memcpy_edge/d2h.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems.
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // One streaming D2H: 16 | // 1st D2H: UT 4 and UT 7 17 | 18 | param MEMCPYD2H_1 = {}; 19 | 20 | // Color along which we expect a wavelet 21 | param USER_OUT_1 = {}; 22 | 23 | param rxdir: direction; 24 | 25 | // Queue IDs 26 | const USER_OUT_1_iq: input_queue = @get_input_queue(7); 27 | const d2h_oq: output_queue = @get_output_queue(4); 28 | 29 | const max_fifo_len = 256*40; // maximum length of the fifo 30 | 31 | var fifo1_buffer = @zeros([max_fifo_len]u32); 32 | const fifo1 = @allocate_fifo(fifo1_buffer); 33 | 34 | const INFINITE_DSD_LEN: u16 = 0x7fff; 35 | 36 | var fab_recv_wdsd = @get_dsd(fabin_dsd, .{ 37 | .extent = INFINITE_DSD_LEN, 38 | .fabric_color = USER_OUT_1, 39 | .input_queue = USER_OUT_1_iq 40 | }); 41 | 42 | var fab_trans_wdsd = @get_dsd(fabout_dsd, .{ 43 | .extent = INFINITE_DSD_LEN, 44 | .fabric_color = MEMCPYD2H_1, 45 | .output_queue = d2h_oq 46 | }); 47 | 48 | // if USER_OUT_1 is not valid, f_startup() is empty 49 | fn f_startup() void { 50 | if (!@is_same_type(@type_of(MEMCPYD2H_1), void) and !@is_same_type(@type_of(USER_OUT_1), void)) { 51 | // receive data from USER_OUT_1 52 | @mov32(fifo1, fab_recv_wdsd, .{ .async = true }); 53 | 54 | // forward data to MEMCPYD2H_1 55 | @mov32(fab_trans_wdsd, fifo1, .{ .async = true }); 56 | } 57 | } 58 | 59 | comptime { 60 | if (!@is_same_type(@type_of(USER_OUT_1), void)) { 61 | 
@set_local_color_config(USER_OUT_1, .{ .routes = .{ .rx = .{ rxdir }, .tx = .{ RAMP }}}); 62 | 63 | // On WSE-3, we must explicitly initialize input and output queues 64 | if (@is_arch("wse3")) { 65 | @initialize_queue(d2h_oq, .{ .color = MEMCPYD2H_1 }); 66 | @initialize_queue(USER_OUT_1_iq, .{ .color = USER_OUT_1 }); 67 | } 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /tutorials/topic-14-color-swap/pe_program.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | param memcpy_params: comptime_struct; 16 | 17 | const sys_mod = @import_module("<memcpy/memcpy>", memcpy_params); 18 | 19 | // Colors 20 | param red: color; 21 | param blue: color; 22 | 23 | // Queue IDs 24 | const blue_oq: output_queue = @get_output_queue(2); 25 | 26 | // Task IDs 27 | // Task ID for data task that recvs from memcpy 28 | const h2d_task_id: data_task_id = @get_data_task_id(sys_mod.MEMCPYH2D_1); 29 | 30 | // Task ID for data task red, consumes red wlts 31 | const red_task_id: data_task_id = @get_data_task_id(red); 32 | 33 | // Task ID for data task blue, consumes blue wlts 34 | const blue_task_id: data_task_id = @get_data_task_id(blue); 35 | 36 | // Single-elem array to hold sum of received wlts 37 | var sum = @zeros([1]u32); 38 | var ptr_sum: [*]u32 = &sum; 39 | 40 | // Task that will be triggered by red wavelet 41 | task red_task(in_data : u32) void { 42 | sum[0] += in_data; 43 | } 44 | 45 | // Task that will be triggered by blue wavelet 46 | task blue_task(in_data : u32) void { 47 | sum[0] += in_data * 2; 48 | } 49 | 50 | var buf = @zeros([1]u32); 51 | const buf_dsd = @get_dsd(mem1d_dsd, .{ .tensor_access = |i|{1} -> buf[i] }); 52 | 53 | // PEs 0, 2 activate blue task; 1, 3 activate red task.
54 | const out_dsd = @get_dsd(fabout_dsd, .{ 55 | .extent = 1, 56 | .fabric_color = blue, 57 | .output_queue = blue_oq 58 | }); 59 | 60 | // receive data from streaming H2D and forward it to color red 61 | task wtt_h2d(data: u32) void { 62 | @block(h2d_task_id); 63 | buf[0] = data; 64 | @mov32(out_dsd, buf_dsd, .{ .async = true, .unblock = h2d_task_id }); 65 | } 66 | 67 | comptime { 68 | @bind_data_task(red_task, red_task_id); 69 | @bind_data_task(blue_task, blue_task_id); 70 | @bind_data_task(wtt_h2d, h2d_task_id); 71 | 72 | @export_symbol(ptr_sum, "sum"); 73 | } 74 | -------------------------------------------------------------------------------- /tutorials/gemv-06-routes-1/README.rst: -------------------------------------------------------------------------------- 1 | GEMV 6: Routes and Fabric DSDs, Part I 2 | ====================================== 3 | 4 | Continuing from the previous example, we now break up a single GEMV 5 | computation among two PEs. 6 | 7 | The host program copies ``b`` into the ``y`` tensor of the left PE. 8 | The left PE also gets the first ``N/2`` columns of ``A`` and the first ``N/2`` 9 | values of ``x``, and the right PE gets the last ``N/2`` columns of ``A`` 10 | and last ``N/2`` values of ``x``. 11 | 12 | The left and right PE both increment their local ``y`` tensors by computing 13 | their piece of ``Ax``. 14 | Then, the left PE sends its result to the right PE, which increments its ``y`` 15 | tensor by the received values. 16 | 17 | Last, the host copies ``y`` from the right PE, and checks that the result is 18 | correct. 19 | 20 | To send data from the left PE to the right PE, we must specify a route, known 21 | as a color. 22 | In ``layout.csl``, ``@set_color_config`` specifies that on the left PE, 23 | color 0 will receive data, or wavelets, from the compute element (CE) 24 | up the RAMP, and transmit them to the EAST.
25 | On the right PE, color 0 will receive wavelets from the ``WEST``, and then 26 | transmit them down the RAMP to the CE. 27 | ``@set_tile_code`` passes the ID of this color to ``pe_program`` as a 28 | parameter named ``send_color``, and also sets a parameter called ``pe_id``, 29 | to differentiate if the program is running on the left or the right PE. 30 | 31 | The ``send_right`` function executed on the left PE defines a ``fabout_dsd`` 32 | called ``out_dsd`` that sends ``M`` wavelets along the color route specified 33 | by ``send_color``. 34 | ``out_dsd`` is used as the destination operand of ``@fmovs``, and ``y_dsd`` 35 | as the source operand. 36 | Thus, this operation sends the ``M`` elements accessed by ``y_dsd`` along the 37 | fabric as specified by ``out_dsd``. 38 | 39 | The ``recv_left`` function executed on the right PE receives the data in a 40 | ``fabin_dsd`` named ``in_dsd``, used in an ``@fadds`` operation that 41 | increments the ``M`` elements of ``y`` on this PE by the ``M`` received values. 42 | 43 | Note that this program also provides an example of a local task. 44 | The ``@fmovs`` and ``@fadds`` operations are performed asynchronously; 45 | when these operations are done, the color ``exit_color`` is activated, which 46 | activates the task ``exit_task``. 47 | This task unblocks ``memcpy``'s command stream, allowing additional commands 48 | from the host program to proceed. 49 | -------------------------------------------------------------------------------- /tutorials/gemv-01-complete-program/README.rst: -------------------------------------------------------------------------------- 1 | GEMV 1: A Complete Program 2 | ========================== 3 | 4 | This example demonstrates a complete CSL program.
5 | 6 | A complete program consists of a host program (a Python script, in this example) 7 | and at least two CSL code files, 8 | one of which defines the layout of the program across a collection of 9 | processing elements (PEs) on the Wafer-Scale Engine (hereafter referred to 10 | as "device"), 11 | and one or more of which define the programs running on the individual PEs. 12 | In this example, there is just one PE. 13 | 14 | When executing the program, the user first compiles the CSL code files, and 15 | then invokes the host program to copy data on and off the device and launch 16 | functions on the device using a remote procedure call (RPC) mechanism. 17 | The device used may be an actual CS system, 18 | or it may be simulated without access to an actual CS system using the 19 | Cerebras Fabric Simulator. 20 | 21 | The host program here is defined in the ``run.py`` script, and the layout and 22 | device code are defined in ``layout.csl`` and ``pe_program.csl``. 23 | 24 | The movement of data from host to device and back is done with memory to memory 25 | copy semantics, which is provided by an SDK utility called ``memcpy``. 26 | The top of the ``layout.csl`` file imports a module which is used to 27 | parameterize the program's ``memcpy`` infrastructure. 28 | This file also includes a layout block which specifies the number 29 | and spatial arrangement of PEs used by this program, as well as the instructions 30 | to execute on each PE. 31 | Here, we instruct the compiler to produce executable code for 1 PE using the 32 | code in ``pe_program.csl``. 33 | 34 | This program executes as follows. 35 | The host code ``run.py`` uses the remote procedure call (RPC) mechanism to 36 | launch a function called ``init_and_compute`` on the device. 37 | This function initializes a 4 x 6 matrix ``A``, stored in row major format, 38 | a 6 x 1 vector ``x``, and a 4 x 1 vector ``b``. 39 | Then, it computes the matrix-vector product of ``Ax + b`` 40 | and stores it in ``y``. 
41 | 42 | Once ``init_and_compute`` finishes on the device, 43 | the host program performs a device-to-host memcpy with 44 | the ``memcpy_d2h`` command to copy back the result stored in ``y``, 45 | and then checks that the answer is correct. 46 | Notice the ``unblock_cmd_stream`` call in ``pe_program.csl`` that occurs 47 | at the end of ``init_and_compute``; 48 | this call allows the device-to-host ``memcpy_d2h`` to proceed. 49 | -------------------------------------------------------------------------------- /benchmarks/bandwidth-test/src/sync/layout.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | 16 | param colors:[5]color; 17 | param entrypoints:[4]local_task_id; 18 | param width : i16 ; // width of the core 19 | param height: i16 ; // height of the core 20 | 21 | const C0 : color = colors[0]; 22 | const C1 : color = colors[1]; 23 | const C2 : color = colors[2]; 24 | const C3 : color = colors[3]; 25 | const C4 : color = colors[4]; 26 | 27 | const STARTUP: local_task_id = entrypoints[0]; 28 | const SYNC_Y: local_task_id = entrypoints[1]; 29 | const SYNC_BCAST: local_task_id = entrypoints[2]; 30 | const EXIT: local_task_id = entrypoints[3]; 31 | 32 | fn get_params(px:i16, py:i16) comptime_struct { 33 | 34 | var first_py: bool = (0 == py); 35 | var last_py: bool = ((height-1) == py); 36 | var is_py_even: bool = (0 == (py % 2)); 37 | 38 | var first_px: bool = (0 == px); 39 | var last_px: bool = ((width-1) == px); 40 | var is_px_even: bool = (0 == (px % 2)); 41 | 42 | var c_recv_px: color = C0; 43 | var c_send_px: color = C1; 44 | if (is_px_even){ 45 | c_recv_px = C0; 46 | c_send_px = C1; 47 | }else{ 48 | c_recv_px = C1; 49 | c_send_px = C0; 50 | } 51 | 52 | var c_recv_py: color = C2; 53 | var c_send_py: color = C3; 54 | if (is_py_even){ 55 | c_recv_py = C2; 56 | c_send_py = C3; 57 | }else{ 58 | c_recv_py = C3; 59 | c_send_py = C2; 60 | } 61 | 62 | return .{ 63 | .c_recv_px = c_recv_px, 64 | .c_send_px = c_send_px, 65 | .c_recv_py = c_recv_py, 66 | .c_send_py = c_send_py, 67 | .c_bcast = C4, 68 | 69 | .STARTUP = STARTUP, 70 | .SYNC_Y = SYNC_Y, 71 | .SYNC_BCAST = SYNC_BCAST, 72 | .EXIT = EXIT, 73 | 74 | .first_px = first_px, 75 | .last_px = last_px, 76 | .first_py = first_py, 77 | .last_py = last_py, 78 | }; 79 | } 80 | -------------------------------------------------------------------------------- /benchmarks/row-col-broadcast/src/sync/layout.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Module inputs: five colors, four local task IDs, and the dimensions of the core rectangle. 16 | param colors:[5]color; 17 | param entrypoints:[4]local_task_id; 18 | param width : i16 ; // width of the core 19 | param height: i16 ; // height of the core 20 | // Named aliases for the entries of colors. 21 | const C0 : color = colors[0]; 22 | const C1 : color = colors[1]; 23 | const C2 : color = colors[2]; 24 | const C3 : color = colors[3]; 25 | const C4 : color = colors[4]; 26 | // Named aliases for the entries of entrypoints. 27 | const STARTUP: local_task_id = entrypoints[0]; 28 | const SYNC_Y: local_task_id = entrypoints[1]; 29 | const SYNC_BCAST: local_task_id = entrypoints[2]; 30 | const EXIT: local_task_id = entrypoints[3]; 31 | // Builds the comptime struct describing the PE at (px, py): its four send/receive colors, the broadcast color, the four task IDs and its boundary flags. 32 | fn get_params(px:i16, py:i16) comptime_struct { 33 | // Position of py within [0, height) and its parity. 34 | const py_first: bool = (py == 0); 35 | const py_last: bool = (py == (height-1)); 36 | const py_odd: bool = ((py % 2) != 0); 37 | // Position of px within [0, width) and its parity. 38 | const px_first: bool = (px == 0); 39 | const px_last: bool = (px == (width-1)); 40 | const px_odd: bool = ((px % 2) != 0); 41 | // x direction: C0 and C1 trade roles between even and odd px, so a PE's send color is the receive color of its x neighbors. 42 | var recv_x: color = C0; 43 | var send_x: color = C1; 44 | if (px_odd){ 45 | recv_x = C1; 46 | send_x = C0; 47 | }else{ 48 | recv_x = C0; 49 | send_x = C1; 50 | } 51 | // y direction: the same alternation, using C2 and C3. 52 | var recv_y: color = C2; 53 | var send_y: color = C3; 54 | if (py_odd){ 55 | recv_y = C3; 56 | send_y = C2; 57 | }else{ 58 | recv_y = C2; 59 | send_y = C3; 60 | } 61 | // The parity flags are internal; only colors, task IDs and boundary flags are returned. 62 | return .{ 63 | .c_recv_px = recv_x, 64 | .c_send_px = send_x, 65 | .c_recv_py = recv_y, 66 |
.c_send_py = send_y, 67 | .c_bcast = C4, 68 | 69 | .STARTUP = STARTUP, 70 | .SYNC_Y = SYNC_Y, 71 | .SYNC_BCAST = SYNC_BCAST, 72 | .EXIT = EXIT, 73 | 74 | .first_px = px_first, 75 | .last_px = px_last, 76 | .first_py = py_first, 77 | .last_py = py_last, 78 | }; 79 | } 80 | -------------------------------------------------------------------------------- /benchmarks/spmv-hypersparse/README.rst: -------------------------------------------------------------------------------- 1 | Hypersparse SpMV 2 | ================ 3 | 4 | This example evaluates the performance of sparse matrix-vector multiplication. 5 | The kernel records the ``start`` and ``end`` of ``spmv`` by tsc counter. In 6 | addition the tsc counters of all PEs are not synchronized in the beginning. 7 | To avoid the timing variation among those PEs, ``f_sync()`` synchronizes all 8 | PEs and samples the reference clock. 9 | 10 | The kernel ``kernel.csl`` defines a couple of host-callable functions, 11 | ``f_sync()``, ``f_tic()`` and ``f_toc()`` in order to synchronize the PEs and 12 | record the timing of ``spmv``. 13 | 14 | The kernel ``allreduce2R1E/pe.csl`` performs a reduction over the whole 15 | rectangle to synchronize the PEs, then the bottom-right PE sends a signal to 16 | other PEs to sample the reference clock. The ``allreduce2R1E`` is a variant of 17 | ``allreduce`` in ``stencil-3d-7pts``. The former uses 2 routable colors and 18 | 1 entrypoint, the latter uses 1 routable color and 4 entrypoints. 19 | ``allreduce2R1E`` is designed for spmv kernel which only has three unused 20 | colors. 21 | 22 | The kernel ``hypersparse_spmv/pe.csl`` performs a matrix-vector product (spmv) 23 | where the matrix ``A`` is hypersparse, partitioned into 2D grids. The input 24 | vector ``x`` and output vector ``y`` are also distributed into 2D grids. 25 | 26 | The user has to provide the matrix ``A`` in Matrix Market File format with 27 | 1-based index.
To obtain the best performance, the user may need to reorder the 28 | matrix such that the variation of the nonzeros of each partition is small. One 29 | option is ``util/analyze.cpp`` which provides a load balancing algorithm. 30 | 31 | The script ``run.py`` has the following parameters: 32 | 33 | - ``--infile_mtx=`` contains the sparse matrix A 34 | 35 | - ``--num_pe_rows=`` specifies the height of the core rectangle 36 | 37 | - ``--num_pe_cols=`` specifies the width of the core rectangle 38 | 39 | - ``--channels=`` specifies the number of I/O channels, no bigger than 16. 40 | 41 | The ``tic()`` samples "time_start" and ``toc()`` samples "time_end". The 42 | ``sync()`` samples "time_ref" which is used to adjust "time_start" and 43 | "time_end". The elapsed time (unit: cycles) is measured by 44 | ``cycles_send = max(time_end) - min(time_start)`` 45 | 46 | The overall runtime (us) is computed via the following formula 47 | ``time_send = (cycles_send / 0.85) * 1.e-3 us`` 48 | 49 | The bandwidth is calculated by 50 | ``bandwidth = ((2*nnz+m)*4)/time_send`` 51 | -------------------------------------------------------------------------------- /tutorials/pipeline-01-basic/pe_program.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | // Parameters forwarded to the memcpy module import below. 15 | param memcpy_params: comptime_struct; 16 | 17 | // number of elements received from host 18 | param size: i16; 19 | // memcpy library module; supplies the MEMCPYH2D_1 and MEMCPYD2H_1 colors used below. 20 | const sys_mod = @import_module("<memcpy/memcpy>", memcpy_params); 21 | 22 | // Queues 23 | const h2d_1_iq: input_queue = @get_input_queue(2); 24 | const d2h_1_oq: output_queue = @get_output_queue(3); 25 | 26 | // Task IDs 27 | const main_task_id: local_task_id = @get_local_task_id(8); 28 | // Fabric input: size elements arriving on MEMCPYH2D_1 through input queue 2. 29 | const in_dsd = @get_dsd(fabin_dsd, .{ 30 | .extent = size, 31 | .fabric_color = sys_mod.MEMCPYH2D_1, 32 | .input_queue = h2d_1_iq 33 | }); 34 | // Fabric output: size elements sent on MEMCPYD2H_1 through output queue 3. 35 | const out_dsd = @get_dsd(fabout_dsd, .{ 36 | .extent = size, 37 | .fabric_color = sys_mod.MEMCPYD2H_1, 38 | .output_queue = d2h_1_oq 39 | }); 40 | // one_dsd addresses buf[0] on every one of its size iterations, so it acts as a repeated scalar. 41 | var buf = @zeros([1]i16); 42 | const one_dsd = @get_dsd(mem1d_dsd, .{ .tensor_access = |i|{size} -> buf[0] }); 43 | // Adds buf[0] to each incoming element and streams the sums out on the D2H color. 44 | task main_task() void { 45 | // WARNING: large size can stall. 46 | // H2D and D2H are serialized. It is NOT safe to run "send" and "recv" 47 | // involving memcpy at the same time on the same PE. 48 | // 49 | // It only works for a small vector because the HW has some internal 50 | // queues to hold those values from/to IO. If such queues are full, 51 | // I/O stalls. 52 | // 53 | // In this case, if the length exceeds certain amount, 54 | // H2D cannot finish and D2H has no chance to run.
55 | // buf[0] holds the constant 1 that one_dsd feeds to the add as its second source operand. 56 | buf[0] = @as(i16, 1); 57 | @add16(out_dsd, in_dsd, one_dsd, .{ .async = true }); 58 | } 59 | 60 | comptime { 61 | // activate local task main_task at startup 62 | @activate(main_task_id); 63 | @bind_local_task(main_task, main_task_id); 64 | 65 | // On WSE-3, we must explicitly initialize input and output queues 66 | if (@is_arch("wse3")) { 67 | @initialize_queue(h2d_1_iq, .{ .color = sys_mod.MEMCPYH2D_1 }); 68 | @initialize_queue(d2h_1_oq, .{ .color = sys_mod.MEMCPYD2H_1 }); 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /tutorials/gemv-01-complete-program/pe_program.csl: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | // Struct containing parameters for memcpy layout 16 | param memcpy_params: comptime_struct; 17 | 18 | // memcpy module provides infrastructure for copying data 19 | // and launching functions from the host 20 | const sys_mod = @import_module("<memcpy/memcpy>", memcpy_params); 21 | 22 | // Constants defining dimensions of our matrix 23 | const M: i16 = 4; 24 | const N: i16 = 6; 25 | 26 | // 48 kB of global memory contain A, x, b, y 27 | var A: [M*N]f32; // A is stored row major 28 | var x: [N]f32; 29 | var b: [M]f32; 30 | var y: [M]f32; 31 | 32 | // Ptr to y will be exported as symbol to host 33 | // Ptr is const, so host can read but not write to y 34 | const y_ptr: [*]f32 = &y; 35 | 36 | // Initialize matrix and vectors: A[idx] = idx, x = 1, b = 2, y = 0 37 | fn initialize() void { 38 | // for loop with range syntax 39 | for (@range(i16, M*N)) |idx| { 40 | A[idx] = @as(f32, idx); 41 | } 42 | 43 | for (@range(i16, N)) |j| { 44 | x[j] = 1.0; 45 | } 46 | 47 | // while loop with iterator syntax 48 | var i: i16 = 0; 49 | while (i < M) : (i += 1) { 50 | b[i] = 2.0; 51 | y[i] = 0.0; 52 | } 53 | } 54 | 55 | // Compute gemv: y = A*x + b, with A indexed row major as A[i*N + j] 56 | fn gemv() void { 57 | for (@range(i16, M)) |i| { 58 | var tmp: f32 = 0.0; 59 | for (@range(i16, N)) |j| { 60 | tmp += A[i*N + j] * x[j]; 61 | } 62 | y[i] = tmp + b[i]; 63 | } 64 | } 65 | 66 | // Call initialize and gemv functions 67 | fn init_and_compute() void { 68 | initialize(); 69 | gemv(); 70 | 71 | // After this function finishes, memcpy's cmd_stream must 72 | // be unblocked on all PEs for further memcpy commands 73 | // to execute 74 | sys_mod.unblock_cmd_stream(); 75 | } 76 | 77 | comptime { 78 | // Export symbol pointing to y so it is host-readable 79 | @export_symbol(y_ptr, "y"); 80 | 81 | // Export function so it is host-callable by RPC mechanism 82 | @export_symbol(init_and_compute); 83 | } 84 | -------------------------------------------------------------------------------- /tutorials/gemv-07-routes-2/layout.csl:
-------------------------------------------------------------------------------- 1 | // Copyright 2025 Cerebras Systems. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // total matrix dimensions 16 | param M: i16; 17 | param N: i16; 18 | 19 | // Colors 20 | const ax_color: color = @get_color(0); // sends/recvs partial result Ax EAST 21 | const x_color: color = @get_color(1); // sends/recvs elems x 22 | 23 | // This example uses 2x2 PEs 24 | const memcpy = @import_module("<memcpy/get_params>", .{ 25 | .width = 2, 26 | .height = 2 27 | }); 28 | 29 | layout { 30 | // PE coordinates are (column, row) 31 | @set_rectangle(2, 2); 32 | // Every PE runs pe_program.csl with M_per_PE = M / 2 and N_per_PE = N / 2. 33 | for (@range(i16, 2)) |pe_x| { 34 | for (@range(i16, 2)) |pe_y| { 35 | @set_tile_code(pe_x, pe_y, "pe_program.csl", .{ 36 | .memcpy_params = memcpy.get_params(pe_x), 37 | .M_per_PE = M / 2, 38 | .N_per_PE = N / 2, 39 | .ax_color = ax_color, 40 | .x_color = x_color 41 | }); 42 | } 43 | } 44 | // Routes: ax_color goes from each left-column RAMP EAST into the right-column RAMP; x_color goes from each top-row RAMP back to that RAMP and SOUTH to the PE below. 45 | // Top left PE (0, 0) 46 | @set_color_config(0, 0, ax_color, .{.routes = .{ .rx = .{RAMP}, .tx = .{EAST} }}); 47 | @set_color_config(0, 0, x_color, .{.routes = .{ .rx = .{RAMP}, .tx = .{RAMP, SOUTH} }}); 48 | 49 | // Top right PE (1, 0) 50 | @set_color_config(1, 0, ax_color, .{.routes = .{ .rx = .{WEST}, .tx = .{RAMP} }}); 51 | @set_color_config(1, 0, x_color, .{.routes = .{ .rx = .{RAMP}, .tx = .{RAMP, SOUTH} }}); 52 | 53 | // Bottom left PE (0, 1) 54 | @set_color_config(0, 1, ax_color, .{.routes = .{ .rx
= .{RAMP}, .tx = .{EAST} }}); 55 | @set_color_config(0, 1, x_color, .{.routes = .{ .rx = .{NORTH}, .tx = .{RAMP} }}); 56 | 57 | // Bottom right PE (1, 1) 58 | @set_color_config(1, 1, ax_color, .{.routes = .{ .rx = .{WEST}, .tx = .{RAMP} }}); 59 | @set_color_config(1, 1, x_color, .{.routes = .{ .rx = .{NORTH}, .tx = .{RAMP} }}); 60 | 61 | // export symbol names 62 | @export_name("A", [*]f32, true); 63 | @export_name("x", [*]f32, true); 64 | @export_name("y", [*]f32, true); 65 | @export_name("compute", fn()void); 66 | } 67 | --------------------------------------------------------------------------------