├── design └── api.md ├── tests ├── cases │ ├── 07-test-fusion-attention │ │ ├── NULL │ │ ├── edge.yaml │ │ └── result │ │ │ ├── attention-fused.csv │ │ │ └── attention-nofuse.csv │ ├── 13-test-attention │ │ ├── map │ │ │ ├── pipelined.yaml │ │ │ ├── fuse-L0-pipeline.yaml │ │ │ ├── flash-attention.yaml │ │ │ ├── pipeline.yaml │ │ │ ├── tileflow.yaml │ │ │ ├── fuse-L0.yaml │ │ │ ├── no-fuse.yaml │ │ │ ├── flat.yaml │ │ │ └── chimera.yaml │ │ ├── readme.md │ │ ├── run.sh │ │ ├── map-raw │ │ │ ├── flash-attention.yaml │ │ │ ├── pipeline.yaml │ │ │ ├── fuse-L0-pipeline.yaml │ │ │ ├── tileflow.yaml │ │ │ ├── no-fuse.yaml │ │ │ ├── flat.yaml │ │ │ └── chimera.yaml │ │ ├── arch │ │ │ ├── cloud-4X4.yaml │ │ │ ├── cloud.yaml │ │ │ ├── edge.yaml │ │ │ ├── cloud-12X14.yaml │ │ │ ├── cloud-16X16.yaml │ │ │ ├── cloud-32X32.yaml │ │ │ ├── cloud-lowBW.yaml │ │ │ ├── cloud-midBW.yaml │ │ │ ├── cloud-highBW.yaml │ │ │ └── cloud-largeBW.yaml │ │ ├── prob │ │ │ └── attention.yaml │ │ └── parser.py │ ├── 00-validation │ │ └── 02-attention │ │ │ ├── .gitignore │ │ │ ├── config.yaml │ │ │ ├── macro │ │ │ └── macro.yaml │ │ │ ├── prob │ │ │ └── prob.yaml │ │ │ ├── map │ │ │ └── map.yaml │ │ │ └── topk-analysis.py │ ├── 06-test-mapper │ │ ├── out.txt │ │ └── test1.yaml │ ├── 12-test-fused-cnn │ │ ├── macro.yaml │ │ ├── run.sh │ │ ├── map-raw │ │ │ ├── isos.yaml │ │ │ ├── fused-layer.yaml │ │ │ ├── tileflow.yaml │ │ │ └── naive.yaml │ │ ├── arch │ │ │ ├── edge.yaml │ │ │ └── cloud.yaml │ │ ├── prob │ │ │ └── prob.yaml │ │ ├── map │ │ │ ├── isos.yaml │ │ │ ├── tileflow.yaml │ │ │ ├── fused-layer.yaml │ │ │ ├── pipeline.yaml │ │ │ └── naive.yaml │ │ ├── parser.py │ │ ├── fused-layer.yaml │ │ └── script.py │ ├── 04-test-attention │ │ ├── macro │ │ │ └── macro.yaml │ │ ├── prob │ │ │ └── prob.yaml │ │ ├── arch │ │ │ └── arch.yaml │ │ └── map │ │ │ └── map-flat.yaml │ ├── 01-test-linear │ │ ├── map │ │ │ └── map.yaml │ │ ├── prob │ │ │ └── prob.yaml │ │ └── arch │ │ │ └── arch.yaml │ ├── 
03-test-systolic │ │ ├── prob │ │ │ ├── prob-timeloop.yaml │ │ │ └── prob.yaml │ │ ├── macro.yaml │ │ ├── map │ │ │ ├── map-timeloop.yaml │ │ │ └── map.yaml │ │ ├── arch │ │ │ └── arch.yaml │ │ └── script.py │ ├── 02-test-spatial │ │ ├── prob.yaml │ │ ├── map.yaml │ │ └── arch.yaml │ ├── 11-fail-domino-self-attention │ │ ├── arch.yaml │ │ ├── prob.yaml │ │ └── map.yaml │ ├── 08-test-2mm │ │ ├── arch │ │ │ └── arch-spatial.yaml │ │ ├── prob │ │ │ └── prob-2mm.yaml │ │ ├── map │ │ │ └── map.yaml │ │ └── reference_output.txt │ └── 10-domino-2mm │ │ ├── arch │ │ └── arch-spatial.yaml │ │ ├── prob │ │ └── prob-2mm.yaml │ │ └── map │ │ └── map.yaml ├── results │ ├── .gitignore │ └── changes │ │ └── configs_mapper_sample │ │ └── reference_stats.pkl ├── scripts │ └── parser.cpp └── SConscript ├── .gitignore ├── setup-env.sh ├── AE └── validation │ ├── timeloop │ ├── data.pkl │ ├── readme.md │ ├── prob │ │ ├── prob-timeloop.yaml │ │ └── prob.yaml │ ├── map │ │ ├── map-timeloop.yaml │ │ └── map.yaml │ └── arch │ │ └── arch.yaml │ └── accelerator │ ├── data │ ├── data.pkl │ └── io_data.csv │ ├── readme.md │ ├── prob │ ├── prob-gemm.yaml │ └── prob.yaml │ ├── map │ ├── map-gemm.yaml │ └── map.yaml │ └── arch │ └── arch.yaml ├── .gitmodules ├── src ├── common.cpp ├── mapping │ ├── loop.cpp │ └── mapping.cpp ├── loop-analysis │ └── memory-state.cpp ├── mapper │ └── op.cpp ├── model │ └── topology.cpp ├── SConscript └── application │ └── main.cpp ├── tutorials ├── 00-GEMM │ ├── macro.yaml │ ├── prob │ │ └── prob.yaml │ ├── map │ │ └── map.yaml │ ├── arch │ │ └── arch.yaml │ └── readme.md └── 01-self-attention │ ├── macro │ └── macro.yaml │ ├── prob │ └── prob.yaml │ ├── arch │ └── arch.yaml │ ├── readme.md │ └── map │ └── map.yaml ├── include ├── tileflow │ ├── mapping │ │ └── loop.hpp │ ├── common.hpp │ ├── model │ │ └── topology.hpp │ ├── mapper │ │ ├── op.hpp │ │ └── checker.hpp │ ├── loop-analysis │ │ └── memory-state.hpp │ └── problem │ │ └── problem.hpp └── application │ 
└── model.hpp ├── SConstruct ├── LICENSE ├── docs ├── tileflow-metrics.md ├── mcts.md └── frontend-syntax.md └── README.md /design/api.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/cases/07-test-fusion-attention/NULL: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/cases/13-test-attention/map/pipelined.yaml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/cases/00-validation/02-attention/.gitignore: -------------------------------------------------------------------------------- 1 | config/ 2 | *png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | bin/ 2 | build/ 3 | lib/ 4 | *.o 5 | *.png 6 | *.txt 7 | *.pdf 8 | -------------------------------------------------------------------------------- /tests/results/.gitignore: -------------------------------------------------------------------------------- 1 | *.cfg 2 | *.out 3 | *.log 4 | *.pkl 5 | !reference_stats.pkl 6 | -------------------------------------------------------------------------------- /setup-env.sh: -------------------------------------------------------------------------------- 1 | export PATH=$(pwd)/build/bin:$PATH 2 | export PATH=$(pwd)/3rdparty/timeloop/bin:$PATH -------------------------------------------------------------------------------- /tests/cases/06-test-mapper/out.txt: -------------------------------------------------------------------------------- 1 | input file: flat-mgran.yaml 2 | ERROR: key not found: problem, at line: 0 3 | 
-------------------------------------------------------------------------------- /AE/validation/timeloop/data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pku-liang/TileFlow/HEAD/AE/validation/timeloop/data.pkl -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "3rdparty/timeloop"] 2 | path = 3rdparty/timeloop 3 | url = git@github.com:gulang2019/timeloop.git 4 | -------------------------------------------------------------------------------- /AE/validation/accelerator/data/data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pku-liang/TileFlow/HEAD/AE/validation/accelerator/data/data.pkl -------------------------------------------------------------------------------- /tests/results/changes/configs_mapper_sample/reference_stats.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pku-liang/TileFlow/HEAD/tests/results/changes/configs_mapper_sample/reference_stats.pkl -------------------------------------------------------------------------------- /src/common.cpp: -------------------------------------------------------------------------------- 1 | #include "tileflow/common.hpp" 2 | 3 | 4 | namespace TileFlow { 5 | 6 | int verbose_level = 0; 7 | 8 | config::CompoundConfigNode macros; 9 | 10 | } // namespace TileFlow -------------------------------------------------------------------------------- /tests/cases/13-test-attention/readme.md: -------------------------------------------------------------------------------- 1 | ## Attention Experiments 2 | 3 | Write architectures, mappings, problems in arch/, map-raw/, prob/, run experiments by 4 | 5 | ``` 6 | python ./script.py 7 | ``` 
-------------------------------------------------------------------------------- /tutorials/00-GEMM/macro.yaml: -------------------------------------------------------------------------------- 1 | output: gemm 2 | verbose: 0 3 | macro: 4 | M: 16 5 | N: 64 6 | K: 16 7 | # MO: 1 8 | # MM: 1 9 | # NO: 4 10 | # NI: 16 11 | # KO: 1 12 | # KM: 1 -------------------------------------------------------------------------------- /tests/cases/00-validation/02-attention/config.yaml: -------------------------------------------------------------------------------- 1 | macro: 2 | M: 512 3 | N: 64 4 | K: 64 5 | L: 512 6 | SX: 16 7 | SY: 16 8 | 9 | tileflow-mapper: 10 | topk: 20 11 | timeout: 1200 12 | verbose: 1 13 | output: topk-experiment -------------------------------------------------------------------------------- /tests/cases/12-test-fused-cnn/macro.yaml: -------------------------------------------------------------------------------- 1 | macro: 2 | B: 1 3 | H: 112 4 | W: 112 5 | C: 64 6 | L: 192 7 | K: 128 8 | R: 3 9 | S: 3 10 | U: 1 11 | V: 1 12 | tileflow-mapper: 13 | timeout: 60 14 | verbose: 0 15 | check: 16 | loopcount: False -------------------------------------------------------------------------------- /tests/cases/04-test-attention/macro/macro.yaml: -------------------------------------------------------------------------------- 1 | macro: 2 | M: 512 3 | N: 64 4 | K: 64 5 | L: 512 6 | MO: 4 7 | MM: 8 8 | NM: 4 9 | KM: 4 10 | LM: 512 11 | verbose: 2 12 | # M = MO * MM * 16 13 | # N = NO * NM * 16 14 | # K = KO * KM * 16 15 | # L = LO * LM -------------------------------------------------------------------------------- /tests/cases/00-validation/02-attention/macro/macro.yaml: -------------------------------------------------------------------------------- 1 | macro: 2 | M: 512 3 | N: 64 4 | K: 64 5 | L: 512 6 | 7 | MO: 4 8 | MM: 8 9 | KO: 2 10 | KM: 2 11 | LO: 4 12 | LM: 8 13 | 14 | 15 | SX: 16 16 | SY: 16 17 | check: 18 | mem: False 19 | output: doublegemm 20 | 
verbose: 1 -------------------------------------------------------------------------------- /tutorials/01-self-attention/macro/macro.yaml: -------------------------------------------------------------------------------- 1 | macro: 2 | M: 512 3 | N: 64 4 | K: 64 5 | L: 512 6 | 7 | # MO: 4 8 | # MM: 8 9 | # KO: 2 10 | # KM: 2 11 | # LO: 4 12 | # LM: 8 13 | 14 | SX: 16 15 | SY: 16 16 | check: 17 | mem: False 18 | output: self_attention 19 | verbose: 1 -------------------------------------------------------------------------------- /tests/cases/12-test-fused-cnn/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MAPFILE=$1.yaml 3 | ARCHFILE=arch/$4.yaml 4 | PROBFILE=prob/prob.yaml 5 | MACROFILE=$2 6 | python parser.py map-raw/$MAPFILE map/$MAPFILE 7 | tileflow map/$MAPFILE $ARCHFILE $PROBFILE $MACROFILE > $3 2>&1 8 | test $? -eq 0 || echo tileflow map/$MAPFILE $ARCHFILE $PROBFILE $MACROFILE > $3 2>&1 >> error.sh -------------------------------------------------------------------------------- /tests/cases/13-test-attention/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MAPFILE=$1.yaml 3 | ARCHFILE=arch/$4.yaml 4 | PROBFILE=prob/attention.yaml 5 | MACROFILE=$2 6 | python parser.py map-raw/$MAPFILE map/$MAPFILE 7 | tileflow map/$MAPFILE $ARCHFILE $PROBFILE $MACROFILE > $3 2>&1 8 | test $? 
-eq 0 || echo tileflow map/$MAPFILE $ARCHFILE $PROBFILE $MACROFILE > $3 2>&1 >> error.sh -------------------------------------------------------------------------------- /include/tileflow/mapping/loop.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "mapping/loop.hpp" 4 | 5 | namespace loop { 6 | 7 | namespace TileFlow{ 8 | 9 | class Descriptor: public loop::Descriptor { 10 | public: 11 | std::string name_; 12 | void Print(std::ostream& out, bool long_form = true) const; 13 | }; 14 | 15 | } // namespace TileFlow 16 | 17 | } // namespace loop 18 | 19 | -------------------------------------------------------------------------------- /tests/cases/01-test-linear/map/map.yaml: -------------------------------------------------------------------------------- 1 | mapping: 2 | node-type: Tile 3 | type: temporal 4 | factors: M=32 N=64 K=128 5 | permutation: MNK 6 | target: MainMemory 7 | 8 | 9 | subtree: 10 | - node-type: Tile 11 | type: temporal 12 | factors: M=16 N=8 K=4 13 | permutation: MKN 14 | target: RegFile 15 | 16 | subtree: 17 | - node-type: Op 18 | name: GEMM 19 | binding: M:M N:N K:K -------------------------------------------------------------------------------- /AE/validation/timeloop/readme.md: -------------------------------------------------------------------------------- 1 | ### Validation with timeloop 2 | 3 | This folder reproduces the experiment in Fig.7 a/b 4 | 5 | - Folder Description 6 | - `data.pkl`: RTL simulation result of different mappings of GEMM operation on a systolic architecture. 7 | - `arch/`, `map/`, `prob/`: the architecture/mapping/workload description. 8 | - `sample_output`: the output figures shown in the paper. 
9 | - Run Script 10 | ```sh 11 | python script.py 12 | ``` -------------------------------------------------------------------------------- /AE/validation/timeloop/prob/prob-timeloop.yaml: -------------------------------------------------------------------------------- 1 | problem: 2 | shape: 3 | name: GEMM 4 | dimensions: [M,N,K] 5 | data-spaces: 6 | - name: C 7 | projection: 8 | - [[M]] 9 | - [[N]] 10 | read-write: True 11 | - name: A 12 | projection: 13 | - [[M]] 14 | - [[K]] 15 | - name: B 16 | projection: 17 | - [[K]] 18 | - [[N]] 19 | 20 | instance: 21 | M: M 22 | N: N 23 | K: K -------------------------------------------------------------------------------- /AE/validation/accelerator/readme.md: -------------------------------------------------------------------------------- 1 | ### Validation with accelerator 2 | 3 | This folder produces the result of Fig.7 c/d. 4 | 5 | - Folder Description 6 | - `data/`: RTL simulation result of systolic-based hardware. 7 | - `prob/`, `map/`, `arch/`: the workload, mapping, and architecture descriptions. 8 | - `sample_outputs`: the Fig.7 c/d in the paper. 9 | 10 | - Run Scripts 11 | ```sh 12 | python ./validation.py 13 | ``` 14 | 15 | - Run time: 5 seconds on a 112-core CPU. 
-------------------------------------------------------------------------------- /tests/cases/03-test-systolic/prob/prob-timeloop.yaml: -------------------------------------------------------------------------------- 1 | problem: 2 | shape: 3 | name: GEMM 4 | dimensions: [M,N,K] 5 | data-spaces: 6 | - name: C 7 | projection: 8 | - [[M]] 9 | - [[N]] 10 | read-write: True 11 | - name: A 12 | projection: 13 | - [[M]] 14 | - [[K]] 15 | - name: B 16 | projection: 17 | - [[K]] 18 | - [[N]] 19 | 20 | instance: 21 | M: M 22 | N: N 23 | K: K -------------------------------------------------------------------------------- /tests/cases/03-test-systolic/macro.yaml: -------------------------------------------------------------------------------- 1 | # M, N, K, micro_M, micro_N, micro_K 2 | # 5 3 | # 256 256 256 128 128 128 4 | # 256 384 256 128 128 128 5 | # 256 128 96 64 32 48 6 | # 1024 512 256 64 128 64 7 | # 64 64 1024 32 32 256 8 | 9 | # 78283 10 | # 111271 11 | # 38206 12 | # 793662 13 | # 67070 14 | 15 | output: /tmp/_tmp-0.csv 16 | verbose: 1 17 | macro: 18 | M: 16 19 | N: 64 20 | K: 16 21 | MO: 1 22 | MM: 1 23 | NO: 4 24 | NI: 16 25 | KO: 1 26 | KM: 1 27 | -------------------------------------------------------------------------------- /tests/cases/02-test-spatial/prob.yaml: -------------------------------------------------------------------------------- 1 | problem: 2 | io: 3 | ins: Input, Kernel 4 | outs: Output 5 | dimensions: [K,R] 6 | instance: 7 | K: 128 8 | R: 3 9 | 10 | ops: 11 | - name: Conv1D 12 | dimensions: [K,R] 13 | data-spaces: 14 | - name: Output 15 | projection: 16 | - [[K]] 17 | read-write: True 18 | - name: Input 19 | projection: 20 | - [[K],[R]] 21 | - name: Kernel 22 | projection: 23 | - [[R]] 24 | ins: Input, Kernel 25 | out: Output -------------------------------------------------------------------------------- /tests/cases/02-test-spatial/map.yaml: -------------------------------------------------------------------------------- 1 | mapping: 2 | 
node-type: Tile 3 | type: temporal 4 | factors: K=8 5 | permutation: K 6 | target: MainMemory 7 | 8 | subtree: 9 | - node-type: Tile 10 | type: spatial 11 | factors: K=16 12 | permutation: K 13 | target: MainMemory 14 | 15 | subtree: 16 | - node-type: Tile 17 | type: temporal 18 | factors: R=3 19 | permutation: R 20 | target: RegFile 21 | 22 | subtree: 23 | - node-type: Op 24 | name: Conv1D 25 | binding: K:K R:R -------------------------------------------------------------------------------- /tutorials/00-GEMM/prob/prob.yaml: -------------------------------------------------------------------------------- 1 | problem: 2 | io: 3 | ins: A, B 4 | outs: C 5 | dimensions: [M,N,K] 6 | instance: 7 | M: M 8 | N: N 9 | K: K 10 | 11 | ops: 12 | - name: GEMM 13 | dimensions: [M,N,K] 14 | data-spaces: 15 | - name: C 16 | projection: 17 | - [[M]] 18 | - [[N]] 19 | read-write: True 20 | - name: A 21 | projection: 22 | - [[M]] 23 | - [[K]] 24 | - name: B 25 | projection: 26 | - [[K]] 27 | - [[N]] 28 | ins: A, B 29 | out: C -------------------------------------------------------------------------------- /AE/validation/timeloop/prob/prob.yaml: -------------------------------------------------------------------------------- 1 | problem: 2 | io: 3 | ins: A, B 4 | outs: C 5 | dimensions: [M,N,K] 6 | instance: 7 | M: M 8 | N: N 9 | K: K 10 | 11 | ops: 12 | - name: GEMM 13 | dimensions: [M,N,K] 14 | data-spaces: 15 | - name: C 16 | projection: 17 | - [[M]] 18 | - [[N]] 19 | read-write: True 20 | - name: A 21 | projection: 22 | - [[M]] 23 | - [[K]] 24 | - name: B 25 | projection: 26 | - [[K]] 27 | - [[N]] 28 | ins: A, B 29 | out: C -------------------------------------------------------------------------------- /tests/cases/12-test-fused-cnn/map-raw/isos.yaml: -------------------------------------------------------------------------------- 1 | for b,h in (BO,HO): # target: L2 2 | scope Pipeline 3 | pfor h,w in (HS,WS): #target: L2, split:1 4 | for c,l,h,w in (HM, WM, CM, LM): #target: L1 
5 | pfor c,l in (CS, LS): # target: L1, split:1 6 | for r,s in (RI,SI): # target: L0 7 | op ProduceA 8 | pfor h,w in (HS,WS): #target: L2, split:1 9 | for l,k,h,w in (HM2,WM2,LM2,KM2): #target: L1 10 | pfor l,k in (LS2, KS): # target: L1, split:1 11 | for u,v in (UI, VI): # target: L0 12 | op ProduceD -------------------------------------------------------------------------------- /AE/validation/accelerator/prob/prob-gemm.yaml: -------------------------------------------------------------------------------- 1 | problem: 2 | io: 3 | ins: A, B 4 | outs: C 5 | dimensions: [M,N,K] 6 | instance: 7 | M: M 8 | N: N 9 | K: K 10 | 11 | ops: 12 | - name: GEMM 13 | dimensions: [M,N,K] 14 | data-spaces: 15 | - name: C 16 | projection: 17 | - [[M]] 18 | - [[N]] 19 | read-write: True 20 | - name: A 21 | projection: 22 | - [[M]] 23 | - [[K]] 24 | - name: B 25 | projection: 26 | - [[K]] 27 | - [[N]] 28 | ins: A, B 29 | out: C -------------------------------------------------------------------------------- /tests/cases/03-test-systolic/prob/prob.yaml: -------------------------------------------------------------------------------- 1 | problem: 2 | io: 3 | ins: A, B 4 | outs: C 5 | dimensions: [M,N,K] 6 | instance: 7 | M: M 8 | N: N 9 | K: K 10 | 11 | ops: 12 | - name: GEMM 13 | dimensions: [M,N,K] 14 | data-spaces: 15 | - name: C 16 | projection: 17 | - [[M]] 18 | - [[N]] 19 | read-write: True 20 | - name: A 21 | projection: 22 | - [[M]] 23 | - [[K]] 24 | - name: B 25 | projection: 26 | - [[K]] 27 | - [[N]] 28 | ins: A, B 29 | out: C -------------------------------------------------------------------------------- /tests/cases/13-test-attention/map-raw/flash-attention.yaml: -------------------------------------------------------------------------------- 1 | for b,h,m,l in (?,?,?,?): # target: L2 2 | scope Sequential 3 | pfor b,h,m in (?,?,?): # target: L2, split:1 4 | for m,l,a in (?,?,?): # target: L1 5 | pfor m,l in (?,?): # target: L1, split:1 6 | for m,a,l in (1,1,1): # target: 
L0 7 | op ProduceC 8 | pfor b,h,m in (?,?,?): # target: L2, split:1 9 | for m,l,n in (?,?,?): # target: L1 10 | pfor m,l in (?,?): # target: L1, split:1 11 | for m,n,l in (1,1,1): # target: L0 12 | op ProduceO -------------------------------------------------------------------------------- /tests/cases/01-test-linear/prob/prob.yaml: -------------------------------------------------------------------------------- 1 | problem: 2 | io: 3 | ins: A, B 4 | outs: C 5 | dimensions: [M,N,K] 6 | instance: 7 | M: 512 8 | N: 512 9 | K: 512 10 | 11 | ops: 12 | - name: GEMM 13 | dimensions: [M,N,K] 14 | data-spaces: 15 | - name: C 16 | projection: 17 | - [[M]] 18 | - [[N]] 19 | read-write: True 20 | - name: A 21 | projection: 22 | - [[M]] 23 | - [[K]] 24 | - name: B 25 | projection: 26 | - [[K]] 27 | - [[N]] 28 | ins: A, B 29 | out: C -------------------------------------------------------------------------------- /tests/cases/12-test-fused-cnn/map-raw/fused-layer.yaml: -------------------------------------------------------------------------------- 1 | for b,w,h in (BO,WO,HO): # target: L2 2 | scope Pipeline 3 | pfor h,w in (HS,WS): #target: L2, split: 1 4 | for c,l,h,w in (HM, WM, CM, LM): #target: L1 5 | pfor c,l in (CS, LS): # target: L1, split: 1 6 | for r,s in (RI,SI): # target: L0 7 | op ProduceA 8 | pfor h,w in (HS,WS): #target: L2, split: 1 9 | for l,k,h,w in (HM2,WM2,LM2,KM2): #target: L1 10 | pfor l,k in (LS2, KS): # target: L1, split: 1 11 | for u,v in (UI, VI): # target: L0 12 | op ProduceD -------------------------------------------------------------------------------- /tests/cases/12-test-fused-cnn/map-raw/tileflow.yaml: -------------------------------------------------------------------------------- 1 | for b,w,h in (BO,WO,HO): # target: L2 2 | scope Sequential 3 | pfor h,w in (HS,WS): #target: L2, split: 1 4 | for c,l,h,w in (HM, WM, CM, LM): #target: L1 5 | pfor c,l in (CS, LS): # target: L1, split: 1 6 | for r,s in (RI,SI): # target: L0 7 | op ProduceA 8 | 
pfor h,w in (HS,WS): #target: L2, split: 1 9 | for l,k,h,w in (HM2,WM2,LM2,KM2): #target: L1 10 | pfor l,k in (LS2, KS): # target: L1, split: 1 11 | for u,v in (UI, VI): # target: L0 12 | op ProduceD -------------------------------------------------------------------------------- /tests/cases/13-test-attention/map-raw/pipeline.yaml: -------------------------------------------------------------------------------- 1 | for b,h,m,l in (?,?,?,?): # target: L2 2 | scope Pipeline 3 | pfor b,h,m in (?,?,?): # target: L2, split:1 4 | for m,l,a in (?,?,?): # target: L1, tag: op1 5 | pfor m,l in (?,?): # target: L1, split:1, tag: op1 6 | for m,a,l in (1,1,1): # target: L0, tag: op1 7 | op ProduceC 8 | pfor b,h,m in (?,?,?): # target: L2, split:1 9 | for m,l,n in (?,?,?): # target: L1 10 | pfor m,l in (?,?): # target: L1, split:1 11 | for m,l,n in (1,1,1): # target: L0 12 | op ProduceO -------------------------------------------------------------------------------- /AE/validation/timeloop/map/map-timeloop.yaml: -------------------------------------------------------------------------------- 1 | mapping: 2 | - node-type: Tile 3 | type: temporal 4 | factors: M = MO N = NO K= KO 5 | permutation: KMN 6 | target: MainMemory 7 | 8 | - node-type: Tile 9 | type: temporal 10 | factors: M=MM K=KM N=NI 11 | permutation: NMK 12 | target: Cache 13 | 14 | - node-type: Tile 15 | type: spatial 16 | factors: M=16 K=16 17 | permutation: MK 18 | split: 1 19 | target: Cache 20 | 21 | - node-type: Tile 22 | type: temporal 23 | factors: N=1 M=1 K=1 24 | permutation: MK 25 | target: RegFile -------------------------------------------------------------------------------- /tests/cases/03-test-systolic/map/map-timeloop.yaml: -------------------------------------------------------------------------------- 1 | mapping: 2 | - node-type: Tile 3 | type: temporal 4 | factors: M = MO N = NO K= KO 5 | permutation: KMN 6 | target: MainMemory 7 | 8 | - node-type: Tile 9 | type: temporal 10 | factors: M=MM K=KM 
N=NI 11 | permutation: NMK 12 | target: Cache 13 | 14 | - node-type: Tile 15 | type: spatial 16 | factors: M=16 K=16 17 | permutation: MK 18 | split: 1 19 | target: Cache 20 | 21 | - node-type: Tile 22 | type: temporal 23 | factors: N=1 M=1 K=1 24 | permutation: MK 25 | target: RegFile -------------------------------------------------------------------------------- /tests/cases/13-test-attention/map-raw/fuse-L0-pipeline.yaml: -------------------------------------------------------------------------------- 1 | for b,h,m,l in (?,?,?,?): # target: L2 2 | pfor b,h,m in (?,?,?): # target: L2, split:1 3 | for m,l,b,h in (?,?,?,?): # target: L1 4 | scope Pipeline 5 | for a in (?): # target:L1, profile: False 6 | pfor m,l in (?,?): # target: L1, split:1, tag: op1 7 | for m,a,l in (1,1,1): # target: L0, tag: op1 8 | op ProduceC 9 | for n in (?): # target:L1, profile: False 10 | pfor m,l in (?,?): # target: L1, split:1 11 | for m,l,n in (1,1,1): # target: L0 12 | op ProduceO -------------------------------------------------------------------------------- /tests/cases/12-test-fused-cnn/map-raw/naive.yaml: -------------------------------------------------------------------------------- 1 | scope Sequential 2 | for b,h,w in (BO,HO,WO): # target: L2 3 | pfor h,w in (HS,WS): #target: L2, split:1 4 | for c,l,h,w in (HM, WM, CM, LM): #target: L1 5 | pfor c,l in (CS, LS): # target: L1, split:1 6 | for r,s in (RI,SI): # target: L0 7 | op ProduceA 8 | for b,h,w in (BO,HO,WO): # target: L2 9 | pfor h,w in (HS,WS): #target: L2, split:1 10 | for l,k,h,w in (HM2,WM2,LM2,KM2): #target: L1 11 | pfor l,k in (LS2, KS): # target: L1, split:1 12 | for u,v in (UI, VI): # target: L0 13 | op ProduceD -------------------------------------------------------------------------------- /tests/cases/13-test-attention/map-raw/tileflow.yaml: -------------------------------------------------------------------------------- 1 | for b,h,m,l in (?,?,?,?): # target: L2 2 | pfor b,h,m in (?,?,?): # target: L2, 
split:1 3 | for m,l,b,h in (?,?,?,?): # target: L1 4 | scope Sequential 5 | for a in (?): # target: L1, profile: False, bypass: [C] 6 | pfor m,l in (?,?): # target: L1, split:1, tag: op1 7 | for m,a,l in (1,1,1): # target: L0, tag: op1 8 | op ProduceC 9 | for n in (?): # target: L1, profile: False, bypass: [C] 10 | pfor m,l in (?,?): # target: L1, split:1 11 | for m,l,n in (1,1,1): # target: L0 12 | op ProduceO -------------------------------------------------------------------------------- /tests/cases/13-test-attention/map-raw/no-fuse.yaml: -------------------------------------------------------------------------------- 1 | scope Sequential 2 | for b,h,m,l in (?,?,?,?): # target: L2 3 | pfor b,h,m in (?,?,?): # target: L2, split:1 4 | for m,l,a in (?,?,?): # target: L1, tag: op1 5 | pfor m,l in (?,?): # target: L1, split:1, tag: op1 6 | for m,a,l in (1,1,1): # target: L0, tag: op1 7 | op ProduceC 8 | for b,h,m,l in (?,?,?,?): # target: L2 9 | pfor b,h,m in (?,?,?): # target: L2, split:1 10 | for m,l,n in (?,?,?): # target: L1 11 | pfor m,l in (?,?): # target: L1, split:1 12 | for m,l,n in (1,1,1): # target: L0 13 | op ProduceO -------------------------------------------------------------------------------- /tests/cases/01-test-linear/arch/arch.yaml: -------------------------------------------------------------------------------- 1 | architecture: 2 | version: 0.2 3 | 4 | subtree: 5 | - name: System 6 | 7 | local: 8 | - name: MainMemory 9 | class: DRAM 10 | attributes: 11 | depth: 1 12 | block-size: 2048 13 | word-bits: 8 14 | 15 | subtree: 16 | - name: PE 17 | 18 | local: 19 | - name: RegFile 20 | class: regfile 21 | attributes: 22 | depth: 112 23 | width: 16 24 | word-bits: 8 25 | read_bandwidth: 1 26 | write_bandwidth: 1 27 | 28 | - name: mac 29 | class: intmac 30 | attributes: 31 | datawidth: 16 -------------------------------------------------------------------------------- /tests/cases/13-test-attention/map-raw/flat.yaml: 
-------------------------------------------------------------------------------- 1 | for b,h,m in (?,?,?): # target: L2 2 | scope Sequential 3 | for a in (?): # target: L2, bypass:[C], profile:False 4 | pfor b,h in (?,?): # target: L2, split: 1 5 | for m,l,a in (?,?,?): # target: L1 6 | pfor m,l in (?,?): # target: L1, split: 1 7 | for m,a,l in (1,1,1): # target: L0 8 | op ProduceC 9 | for l in (?): # target: L2, bypass:[C], profile:False 10 | pfor b,h in (?,?): # target: L2, split: 1 11 | for m,l,n in (?,?,?): # target: L1 12 | pfor m,l in (?,?): # target: L1, split: 1 13 | for m,n,l in (1,1,1): # target: L0 14 | op ProduceO -------------------------------------------------------------------------------- /tests/cases/13-test-attention/map-raw/chimera.yaml: -------------------------------------------------------------------------------- 1 | for b,h,m,l in (?,?,?,?): # target: L2 2 | scope Sequential 3 | for a in (?): # target: L2, bypass:[C], profile:False 4 | pfor b,h,m in (?,?,?): # target: L2, split:1 5 | for m,l,a in (?,?,?): # target: L1, tag: op1 6 | pfor m,l in (?,?): # target: L1, split:1, tag: op1 7 | for m,a,l in (1,1,1): # target: L0, tag: op1 8 | op ProduceC 9 | for n in (?): # target: L2, bypass:[C], profile:False 10 | pfor b,h,m in (?,?,?): # target: L2, split:1 11 | for m,l,n in (?,?,?): # target: L1 12 | pfor m,l in (?,?): # target: L1, split:1 13 | for m,l,n in (1,1,1): # target: L0 14 | op ProduceO -------------------------------------------------------------------------------- /tests/cases/02-test-spatial/arch.yaml: -------------------------------------------------------------------------------- 1 | architecture: 2 | version: 0.2 3 | 4 | subtree: 5 | - name: System 6 | 7 | local: 8 | - name: MainMemory 9 | class: DRAM 10 | attributes: 11 | width: 256 12 | block-size: 32 13 | word-bits: 8 14 | 15 | subtree: 16 | - name: PE 17 | 18 | local: 19 | - name: RegFile[0..15] 20 | class: regfile 21 | attributes: 22 | meshX: 16 23 | depth: 64 24 | width: 16 
25 | word-bits: 8 26 | read_bandwidth: 2 27 | write_bandwidth: 2 28 | 29 | - name: mac[0..15] 30 | class: intmac 31 | attributes: 32 | datawidth: 16 33 | meshX: 16 34 | -------------------------------------------------------------------------------- /AE/validation/accelerator/map/map-gemm.yaml: -------------------------------------------------------------------------------- 1 | mapping: 2 | node-type: Tile 3 | type: temporal 4 | factors: M=MO K=KO N=NO 5 | target: MainMemory 6 | permutation: NKM 7 | 8 | subtree: 9 | - node-type: Tile 10 | type: temporal 11 | factors: K=KI N=NI M=MI 12 | permutation: NMK 13 | target: Cache 14 | 15 | subtree: 16 | - node-type: Tile 17 | type: Spatial 18 | factors: M=SX K=SY 19 | split: 1 20 | permutation: MK 21 | target: Cache 22 | 23 | subtree: 24 | - node-type: Tile 25 | type: temporal 26 | factors: M=1 N=1 K=1 27 | permutation: MNK 28 | target: RegFile 29 | 30 | subtree: 31 | - node-type: Op 32 | name: GEMM -------------------------------------------------------------------------------- /tests/cases/11-fail-domino-self-attention/arch.yaml: -------------------------------------------------------------------------------- 1 | architecture: 2 | version: 0.2 3 | subtree: 4 | - name: System 5 | attributes: 6 | local: 7 | - name: L1 8 | class: DRAM 9 | attributes: 10 | width: 512 11 | word-bits: 16 12 | block-size: 32 13 | technology: 45 14 | sizeKB: 2000 15 | subtree: 16 | - name: PE 17 | attributes: 18 | local: 19 | - name: L0[0..1023] 20 | class: regfile 21 | attributes: 22 | width: 16 23 | meshX: 1024 24 | word-bits: 16 25 | technology: 16nm 26 | read_bandwidth: 64 27 | sizeKB: 200 28 | - name: mac[0..1023] 29 | class: intmac 30 | attributes: 31 | meshX: 1024 -------------------------------------------------------------------------------- /tests/cases/08-test-2mm/arch/arch-spatial.yaml: -------------------------------------------------------------------------------- 1 | architecture: 2 | version: 0.2 3 | 4 | subtree: 5 | - name: 
System 6 | 7 | local: 8 | - name: MainMemory 9 | class: DRAM 10 | attributes: 11 | sizeKB: 1048576 12 | word-bits: 8 13 | 14 | subtree: 15 | - name: PE 16 | 17 | local: 18 | - name: RegFile[0..15] 19 | class: regfile 20 | attributes: 21 | meshX: 4 22 | meshY: 4 23 | depth: 64 24 | block_size: 1024 25 | word-bits: 8 26 | read_bandwidth: 2 27 | write_bandwidth: 2 28 | 29 | - name: mac[0..15] 30 | class: intmac 31 | attributes: 32 | word-bits: 16 33 | meshX: 4 34 | meshY: 4 -------------------------------------------------------------------------------- /tutorials/00-GEMM/map/map.yaml: -------------------------------------------------------------------------------- 1 | mapping: 2 | node-type: Tile 3 | type: temporal 4 | factors: M = MO N = NO K= KO 5 | permutation: KMN 6 | target: MainMemory 7 | 8 | 9 | subtree: 10 | - node-type: Tile 11 | type: temporal 12 | factors: M=MM K=KM N=NI 13 | permutation: NMK 14 | target: Cache 15 | 16 | subtree: 17 | - node-type: Tile 18 | type: spatial 19 | factors: M=16 K=16 20 | permutation: MK 21 | split: 1 22 | target: Cache 23 | multicast: true 24 | 25 | subtree: 26 | - node-type: Tile 27 | type: temporal 28 | factors: M=1 N=1 K=1 29 | permutation: MNK 30 | target: RegFile 31 | 32 | subtree: 33 | - node-type: Op 34 | name: GEMM -------------------------------------------------------------------------------- /AE/validation/timeloop/map/map.yaml: -------------------------------------------------------------------------------- 1 | mapping: 2 | node-type: Tile 3 | type: temporal 4 | factors: M = MO N = NO K= KO 5 | permutation: KMN 6 | target: MainMemory 7 | 8 | 9 | subtree: 10 | - node-type: Tile 11 | type: temporal 12 | factors: M=MM K=KM N=NI 13 | permutation: NMK 14 | target: Cache 15 | 16 | subtree: 17 | - node-type: Tile 18 | type: spatial 19 | factors: M=16 K=16 20 | permutation: MK 21 | split: 1 22 | target: Cache 23 | multicast: true 24 | 25 | subtree: 26 | - node-type: Tile 27 | type: temporal 28 | factors: M=1 N=1 K=1 29 | 
permutation: MNK 30 | target: RegFile 31 | 32 | subtree: 33 | - node-type: Op 34 | name: GEMM -------------------------------------------------------------------------------- /tests/cases/10-domino-2mm/arch/arch-spatial.yaml: -------------------------------------------------------------------------------- 1 | architecture: 2 | version: 0.2 3 | 4 | subtree: 5 | - name: System 6 | 7 | local: 8 | - name: L1 9 | class: DRAM 10 | attributes: 11 | width: 256 12 | block-size: 32 13 | word-bits: 8 14 | 15 | subtree: 16 | - name: PE 17 | 18 | local: 19 | - name: L0[0..15] 20 | class: regfile 21 | attributes: 22 | meshX: 4 23 | meshY: 4 24 | depth: 64 25 | width: 16 26 | word-bits: 8 27 | read_bandwidth: 2 28 | write_bandwidth: 2 29 | 30 | - name: mac[0..15] 31 | class: intmac 32 | attributes: 33 | datawidth: 16 34 | meshX: 4 35 | meshY: 4 -------------------------------------------------------------------------------- /tests/cases/03-test-systolic/map/map.yaml: -------------------------------------------------------------------------------- 1 | mapping: 2 | node-type: Tile 3 | type: temporal 4 | factors: M = MO N = NO K= KO 5 | permutation: KMN 6 | target: MainMemory 7 | 8 | 9 | subtree: 10 | - node-type: Tile 11 | type: temporal 12 | factors: M=MM K=KM N=NI 13 | permutation: NMK 14 | target: Cache 15 | 16 | subtree: 17 | - node-type: Tile 18 | type: spatial 19 | factors: M=16 K=16 20 | permutation: MK 21 | split: 1 22 | target: Cache 23 | multicast: true 24 | 25 | subtree: 26 | - node-type: Tile 27 | type: temporal 28 | factors: M=1 N=1 K=1 29 | permutation: MNK 30 | target: RegFile 31 | 32 | subtree: 33 | - node-type: Op 34 | name: GEMM -------------------------------------------------------------------------------- /include/tileflow/common.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define MCARO_WRAPPER(code) do {code} while(0) 4 | 5 | #define TILEFLOW_ERROR(msg) do{std::cerr << "[ERROR]: " << msg << 
std::endl;exit(1);} while(0) 6 | 7 | #define TILEFLOW_ASSERT(cond, msg) do{if(!(cond)) {std::cerr << "[ASSERT ERROR]: " << msg << std::endl; exit(1);} }while(0) 8 | 9 | #define TILEFLOW_WARNING(msg) do{std::cerr << "[WARNING]: " << msg << std::endl;}while(0) 10 | 11 | #define TILEFLOW_LOG(msg) do{std::cerr << "[LOG]: " << msg << std::endl;}while(0) 12 | 13 | #define TILEFLOW_COND_WARNING(cond, msg) do{if(!(cond)) {std::cerr << "[WARNING]: " << msg << std::endl;}}while(0) 14 | 15 | #include "compound-config/compound-config.hpp" 16 | 17 | const int MaxTensors = 32; 18 | 19 | namespace TileFlow { 20 | 21 | extern int verbose_level; 22 | 23 | extern config::CompoundConfigNode macros; 24 | 25 | } -------------------------------------------------------------------------------- /AE/validation/accelerator/prob/prob.yaml: -------------------------------------------------------------------------------- 1 | problem: 2 | io: 3 | ins: A, B, D 4 | outs: E 5 | dimensions: [M,N,K,L] 6 | instance: 7 | M: M 8 | N: N 9 | L: L 10 | K: K 11 | 12 | ops: 13 | - name: GEMM1 14 | dimensions: [M,L,K] 15 | data-spaces: 16 | - name: C 17 | projection: 18 | - [[M]] 19 | - [[L]] 20 | read-write: True 21 | - name: A 22 | projection: 23 | - [[M]] 24 | - [[K]] 25 | - name: B 26 | projection: 27 | - [[K]] 28 | - [[L]] 29 | ins: A, B 30 | out: C 31 | 32 | - name: GEMM2 33 | dimensions: [M,L,N] 34 | data-spaces: 35 | - name: E 36 | projection: 37 | - [[M]] 38 | - [[N]] 39 | read-write: True 40 | - name: C 41 | projection: 42 | - [[M]] 43 | - [[L]] 44 | - name: D 45 | projection: 46 | - [[L]] 47 | - [[N]] 48 | ins: C, D 49 | out: E -------------------------------------------------------------------------------- /tests/cases/04-test-attention/prob/prob.yaml: -------------------------------------------------------------------------------- 1 | problem: 2 | io: 3 | ins: A, B, D 4 | outs: E 5 | dimensions: [M,N,K,L] 6 | instance: 7 | M: M 8 | N: N 9 | K: K 10 | L: L 11 | 12 | ops: 13 | - name: GEMM1 14 | 
dimensions: [M,L,K] 15 | data-spaces: 16 | - name: C 17 | projection: 18 | - [[M]] 19 | - [[L]] 20 | read-write: True 21 | - name: A 22 | projection: 23 | - [[M]] 24 | - [[K]] 25 | - name: B 26 | projection: 27 | - [[K]] 28 | - [[L]] 29 | ins: A, B 30 | out: C 31 | 32 | - name: GEMM2 33 | dimensions: [M,L,N] 34 | data-spaces: 35 | - name: E 36 | projection: 37 | - [[M]] 38 | - [[N]] 39 | read-write: True 40 | - name: C 41 | projection: 42 | - [[M]] 43 | - [[L]] 44 | - name: D 45 | projection: 46 | - [[L]] 47 | - [[N]] 48 | ins: C, D 49 | out: E -------------------------------------------------------------------------------- /tutorials/01-self-attention/prob/prob.yaml: -------------------------------------------------------------------------------- 1 | problem: 2 | io: 3 | ins: A, B, D 4 | outs: E 5 | dimensions: [M,N,K,L] 6 | instance: 7 | M: M 8 | N: N 9 | L: L 10 | K: K 11 | 12 | ops: 13 | - name: GEMM1 14 | dimensions: [M,L,K] 15 | data-spaces: 16 | - name: C 17 | projection: 18 | - [[M]] 19 | - [[L]] 20 | read-write: True 21 | - name: A 22 | projection: 23 | - [[M]] 24 | - [[K]] 25 | - name: B 26 | projection: 27 | - [[K]] 28 | - [[L]] 29 | ins: A, B 30 | out: C 31 | 32 | - name: GEMM2 33 | dimensions: [M,L,N] 34 | data-spaces: 35 | - name: E 36 | projection: 37 | - [[M]] 38 | - [[N]] 39 | read-write: True 40 | - name: C 41 | projection: 42 | - [[M]] 43 | - [[L]] 44 | - name: D 45 | projection: 46 | - [[L]] 47 | - [[N]] 48 | ins: C, D 49 | out: E -------------------------------------------------------------------------------- /tests/cases/00-validation/02-attention/prob/prob.yaml: -------------------------------------------------------------------------------- 1 | problem: 2 | io: 3 | ins: A, B, D 4 | outs: E 5 | dimensions: [M,N,K,L] 6 | instance: 7 | M: M 8 | N: N 9 | L: L 10 | K: K 11 | 12 | ops: 13 | - name: GEMM1 14 | dimensions: [M,L,K] 15 | data-spaces: 16 | - name: C 17 | projection: 18 | - [[M]] 19 | - [[L]] 20 | read-write: True 21 | - name: A 22 | 
projection: 23 | - [[M]] 24 | - [[K]] 25 | - name: B 26 | projection: 27 | - [[K]] 28 | - [[L]] 29 | ins: A, B 30 | out: C 31 | 32 | - name: GEMM2 33 | dimensions: [M,L,N] 34 | data-spaces: 35 | - name: E 36 | projection: 37 | - [[M]] 38 | - [[N]] 39 | read-write: True 40 | - name: C 41 | projection: 42 | - [[M]] 43 | - [[L]] 44 | - name: D 45 | projection: 46 | - [[L]] 47 | - [[N]] 48 | ins: C, D 49 | out: E -------------------------------------------------------------------------------- /src/mapping/loop.cpp: -------------------------------------------------------------------------------- 1 | #include "tileflow/mapping/loop.hpp" 2 | 3 | namespace loop{ 4 | 5 | namespace TileFlow { 6 | 7 | void Descriptor::Print(std::ostream& out, bool long_form) const 8 | { 9 | if (long_form) 10 | { 11 | out << "for " << name_ << " in [" << start << ":" << end; 12 | if (residual_end != end) 13 | out << "," << residual_end; 14 | out << ")"; 15 | if (loop::IsSpatial(spacetime_dimension)) 16 | { 17 | if (loop::IsSpatialX(spacetime_dimension)) 18 | out << " (Spatial-X)"; 19 | else 20 | out << " (Spatial-Y)"; 21 | } 22 | } 23 | else 24 | { 25 | out << "(" << name_ << "," << end; 26 | if (residual_end != end) 27 | out << "," << residual_end; 28 | if (loop::IsSpatial(spacetime_dimension)) 29 | { 30 | if (loop::IsSpatialX(spacetime_dimension)) 31 | out << ",spX"; 32 | else 33 | out << ",spY"; 34 | } 35 | out << ") "; 36 | } 37 | } 38 | 39 | } // namespace TileFlow 40 | 41 | } // namespace loop -------------------------------------------------------------------------------- /SConstruct: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | VariantDir('build', 'src', duplicate = 0) 4 | 5 | AddOption('--static', dest='link_static', default=False, action='store_true', help='Use static linking (default is dynamic)') 6 | AddOption('--d', dest='debug', default=False, action='store_true', help='Debug build (default is off)') 7 | 
AddOption('--with-isl', dest='with_isl', default=False, action='store_true', help='Build with ISL support (default is false)') 8 | AddOption('--clang', dest='clang', default=(str(Platform())=='darwin'), action='store_true', help='Build using clang (default is true for MacOS, otherwise false)') 9 | AddOption('--parser', dest='parser', default=False, action = 'store_true', help='Build frontend parser') 10 | 11 | env = Environment(ENV = os.environ) 12 | 13 | 14 | if GetOption('clang'): 15 | print('Building with clang instead of gcc.') 16 | 17 | if not GetOption('clang'): 18 | env.Replace(AR = "gcc-ar") 19 | env.Replace(RANLIB = "gcc-ranlib") 20 | 21 | env.Append(TIMELOOP_BASE_DIR = Dir('./3rdparty/timeloop/').abspath) 22 | env.Append(BUILD_BASE_DIR = Dir('.').abspath) 23 | env.SConscript('build/SConscript', exports='env') 24 | -------------------------------------------------------------------------------- /tests/cases/07-test-fusion-attention/edge.yaml: -------------------------------------------------------------------------------- 1 | architecture: 2 | version: 0.2 3 | subtree: 4 | - name: System 5 | attributes: 6 | local: 7 | - name: L2 8 | class: DRAM 9 | attributes: 10 | word-bits: 16 11 | block-size: 32 12 | technology: 16nm 13 | read_bandwidth: 3 14 | sizeKB: 1600000000 15 | subtree: 16 | - name: Buffer[0..3] 17 | attributes: 18 | meshX: 4 19 | local: 20 | - name: L1 21 | class: SRAM 22 | attributes: 23 | width: 16 24 | word-bits: 16 25 | technology: 16nm 26 | read_bandwidth: 60 27 | sizeKB: 4000 28 | subtree: 29 | - name: PE 30 | attributes: 31 | local: 32 | - name: L0[0..1023] 33 | class: regfile 34 | attributes: 35 | depth: 1 36 | meshX: 1024 37 | word-bits: 16 38 | block-size: 3 39 | technology: 16nm 40 | read_bandwidth: 3 41 | - name: mac[0..1023] 42 | class: intmac 43 | attributes: 44 | meshX: 1024 -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 PKU Yun (Eric) Liang Research Group 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /tests/cases/13-test-attention/arch/cloud-4X4.yaml: -------------------------------------------------------------------------------- 1 | architecture: 2 | version: 0.2 3 | subtree: 4 | - name: System 5 | attributes: 6 | local: 7 | - name: L2 8 | class: DRAM 9 | attributes: 10 | word-bits: 16 11 | block-size: 32 12 | technology: 16nm 13 | read_bandwidth: 1 14 | sizeKB: 1600000000 15 | subtree: 16 | - name: Buffer 17 | attributes: 18 | local: 19 | - name: L1[0..3] 20 | class: SRAM 21 | attributes: 22 | meshX: 2 23 | meshY: 2 24 | width: 16 25 | word-bits: 16 26 | technology: 16nm 27 | read_bandwidth: 20 28 | sizeKB: 2000 29 | subtree: 30 | - name: PE 31 | attributes: 32 | local: 33 | - name: L0[0..63] 34 | class: regfile 35 | attributes: 36 | depth: 1 37 | meshX: 8 38 | meshY: 8 39 | word-bits: 16 40 | block-size: 6 41 | technology: 16nm 42 | read_bandwidth: 3 43 | - name: mac[0..63] 44 | class: intmac 45 | attributes: 46 | meshX: 8 47 | meshY: 8 -------------------------------------------------------------------------------- /tests/cases/03-test-systolic/arch/arch.yaml: -------------------------------------------------------------------------------- 1 | architecture: 2 | version: 0.2 3 | 4 | subtree: 5 | - name: System 6 | 7 | local: 8 | - name: MainMemory 9 | class: DRAM 10 | attributes: 11 | block-size: 1048576 12 | word-bits: 16 13 | read_bandwidth: 4.3 14 | write_bandwidth: 2.9 15 | 16 | subtree: 17 | - name: Buffer 18 | 19 | local: 20 | - name: Cache 21 | class: SRAM 22 | attributes: 23 | word-bits: 16 24 | block_size: 16384 25 | depth: 3 26 | read_bandwidth: 52 27 | write_bandwidth: 20 # 16 28 | 29 | 30 | subtree: 31 | - name: PE 32 | 33 | local: 34 | - name: RegFile[0..255] 35 | class: regfile 36 | attributes: 37 | meshX: 16 38 | meshY: 16 39 | depth: 1 40 | block_size: 3 41 | word-bits: 16 42 | read_bandwidth: 3.2 43 | write_bandwidth: 3.2 44 | 45 | - name: mac[0..255] 46 | 
class: intmac 47 | attributes: 48 | word-bits: 16 49 | meshX: 16 50 | meshY: 16 51 | -------------------------------------------------------------------------------- /tests/cases/12-test-fused-cnn/arch/edge.yaml: -------------------------------------------------------------------------------- 1 | architecture: 2 | version: 0.2 3 | subtree: 4 | - name: System 5 | attributes: 6 | local: 7 | - name: L2 8 | class: DRAM 9 | attributes: 10 | word-bits: 16 11 | block-size: 32 12 | technology: 16nm 13 | read_bandwidth: 4 14 | sizeKB: 1600000000 15 | subtree: 16 | - name: Buffer 17 | attributes: 18 | local: 19 | - name: L1[0..3] 20 | class: SRAM 21 | attributes: 22 | meshX: 2 23 | meshY: 2 24 | width: 16 25 | word-bits: 16 26 | technology: 16nm 27 | read_bandwidth: 64 28 | sizeKB: 128 29 | subtree: 30 | - name: PE 31 | attributes: 32 | local: 33 | - name: L0[0..1023] 34 | class: regfile 35 | attributes: 36 | depth: 1 37 | meshX: 32 38 | meshY: 32 39 | word-bits: 32 40 | block-size: 1024 41 | technology: 16nm 42 | read_bandwidth: 3 43 | - name: mac[0..1023] 44 | class: intmac 45 | attributes: 46 | meshX: 32 47 | meshY: 32 -------------------------------------------------------------------------------- /tests/cases/13-test-attention/arch/cloud.yaml: -------------------------------------------------------------------------------- 1 | architecture: 2 | version: 0.2 3 | subtree: 4 | - name: System 5 | attributes: 6 | local: 7 | - name: L2 8 | class: DRAM 9 | attributes: 10 | word-bits: 16 11 | block-size: 32 12 | technology: 16nm 13 | read_bandwidth: 4 14 | sizeKB: 1600000000 15 | subtree: 16 | - name: Buffer 17 | attributes: 18 | local: 19 | - name: L1[0..15] 20 | class: SRAM 21 | attributes: 22 | meshX: 4 23 | meshY: 4 24 | width: 16 25 | word-bits: 16 26 | technology: 16nm 27 | read_bandwidth: 64 28 | sizeKB: 2048 29 | subtree: 30 | - name: PE 31 | attributes: 32 | local: 33 | - name: L0[0..4095] 34 | class: regfile 35 | attributes: 36 | depth: 1 37 | meshX: 64 38 | meshY: 64 
39 | word-bits: 16 40 | block-size: 6 41 | technology: 16nm 42 | read_bandwidth: 3 43 | - name: mac[0..4095] 44 | class: intmac 45 | attributes: 46 | meshX: 64 47 | meshY: 64 -------------------------------------------------------------------------------- /tests/cases/13-test-attention/arch/edge.yaml: -------------------------------------------------------------------------------- 1 | architecture: 2 | version: 0.2 3 | subtree: 4 | - name: System 5 | attributes: 6 | local: 7 | - name: L2 8 | class: DRAM 9 | attributes: 10 | word-bits: 16 11 | block-size: 32 12 | technology: 16nm 13 | read_bandwidth: 4 14 | sizeKB: 1600000000 15 | subtree: 16 | - name: Buffer 17 | attributes: 18 | local: 19 | - name: L1[0..3] 20 | class: SRAM 21 | attributes: 22 | meshX: 2 23 | meshY: 2 24 | width: 16 25 | word-bits: 16 26 | technology: 16nm 27 | read_bandwidth: 64 28 | sizeKB: 64 29 | subtree: 30 | - name: PE 31 | attributes: 32 | local: 33 | - name: L0[0..1023] 34 | class: regfile 35 | attributes: 36 | depth: 1 37 | meshX: 32 38 | meshY: 32 39 | word-bits: 16 40 | block-size: 6 41 | technology: 16nm 42 | read_bandwidth: 3 43 | - name: mac[0..1023] 44 | class: intmac 45 | attributes: 46 | meshX: 32 47 | meshY: 32 -------------------------------------------------------------------------------- /AE/validation/accelerator/arch/arch.yaml: -------------------------------------------------------------------------------- 1 | architecture: 2 | version: 0.2 3 | 4 | subtree: 5 | - name: System 6 | 7 | local: 8 | - name: MainMemory 9 | class: DRAM 10 | attributes: 11 | block-size: 65536 12 | depth: 8 13 | word-bits: 16 14 | read_bandwidth: 4 15 | write_bandwidth: 4 16 | 17 | subtree: 18 | - name: Buffer 19 | 20 | local: 21 | - name: Cache 22 | class: SRAM 23 | attributes: 24 | word-bits: 16 25 | block_size: 16384 26 | depth: 8 27 | read_bandwidth: 32 28 | write_bandwidth: 32 29 | 30 | 31 | subtree: 32 | - name: PE 33 | 34 | local: 35 | - name: RegFile[0..255] 36 | class: regfile 37 | 
attributes: 38 | meshX: 16 39 | meshY: 16 40 | depth: 1 41 | block_size: 3 42 | word-bits: 16 43 | read_bandwidth: 3 44 | write_bandwidth: 3 45 | 46 | - name: mac[0..255] 47 | class: intmac 48 | attributes: 49 | word-bits: 16 50 | meshX: 16 51 | meshY: 16 52 | -------------------------------------------------------------------------------- /tests/cases/12-test-fused-cnn/arch/cloud.yaml: -------------------------------------------------------------------------------- 1 | architecture: 2 | version: 0.2 3 | subtree: 4 | - name: System 5 | attributes: 6 | local: 7 | - name: L2 8 | class: DRAM 9 | attributes: 10 | word-bits: 16 11 | block-size: 32 12 | technology: 16nm 13 | read_bandwidth: 4 14 | sizeKB: 1600000000 15 | subtree: 16 | - name: Buffer 17 | attributes: 18 | local: 19 | - name: L1[0..15] 20 | class: SRAM 21 | attributes: 22 | meshX: 4 23 | meshY: 4 24 | width: 16 25 | word-bits: 16 26 | technology: 16nm 27 | read_bandwidth: 64 28 | sizeKB: 2048 29 | subtree: 30 | - name: PE 31 | attributes: 32 | local: 33 | - name: L0[0..4095] 34 | class: regfile 35 | attributes: 36 | depth: 1 37 | meshX: 64 38 | meshY: 64 39 | word-bits: 16 40 | block-size: 32 41 | technology: 16nm 42 | read_bandwidth: 3 43 | - name: mac[0..4095] 44 | class: intmac 45 | attributes: 46 | meshX: 64 47 | meshY: 64 -------------------------------------------------------------------------------- /tests/cases/13-test-attention/arch/cloud-12X14.yaml: -------------------------------------------------------------------------------- 1 | architecture: 2 | version: 0.2 3 | subtree: 4 | - name: System 5 | attributes: 6 | local: 7 | - name: L2 8 | class: DRAM 9 | attributes: 10 | word-bits: 16 11 | block-size: 32 12 | technology: 16nm 13 | read_bandwidth: 1 14 | sizeKB: 1600000000 15 | subtree: 16 | - name: Buffer 17 | attributes: 18 | local: 19 | - name: L1[0..3] 20 | class: SRAM 21 | attributes: 22 | meshX: 2 23 | meshY: 2 24 | width: 16 25 | word-bits: 16 26 | technology: 16nm 27 | read_bandwidth: 20 
28 | sizeKB: 2000 29 | subtree: 30 | - name: PE 31 | attributes: 32 | local: 33 | - name: L0[0..671] 34 | class: regfile 35 | attributes: 36 | depth: 1 37 | meshX: 24 38 | meshY: 28 39 | word-bits: 16 40 | block-size: 6 41 | technology: 16nm 42 | read_bandwidth: 3 43 | - name: mac[0..671] 44 | class: intmac 45 | attributes: 46 | meshX: 24 47 | meshY: 28 -------------------------------------------------------------------------------- /tests/cases/13-test-attention/arch/cloud-16X16.yaml: -------------------------------------------------------------------------------- 1 | architecture: 2 | version: 0.2 3 | subtree: 4 | - name: System 5 | attributes: 6 | local: 7 | - name: L2 8 | class: DRAM 9 | attributes: 10 | word-bits: 16 11 | block-size: 32 12 | technology: 16nm 13 | read_bandwidth: 1 14 | sizeKB: 1600000000 15 | subtree: 16 | - name: Buffer 17 | attributes: 18 | local: 19 | - name: L1[0..3] 20 | class: SRAM 21 | attributes: 22 | meshX: 2 23 | meshY: 2 24 | width: 16 25 | word-bits: 16 26 | technology: 16nm 27 | read_bandwidth: 20 28 | sizeKB: 2000 29 | subtree: 30 | - name: PE 31 | attributes: 32 | local: 33 | - name: L0[0..1023] 34 | class: regfile 35 | attributes: 36 | depth: 1 37 | meshX: 32 38 | meshY: 32 39 | word-bits: 16 40 | block-size: 6 41 | technology: 16nm 42 | read_bandwidth: 3 43 | - name: mac[0..1023] 44 | class: intmac 45 | attributes: 46 | meshX: 32 47 | meshY: 32 -------------------------------------------------------------------------------- /tests/cases/13-test-attention/arch/cloud-32X32.yaml: -------------------------------------------------------------------------------- 1 | architecture: 2 | version: 0.2 3 | subtree: 4 | - name: System 5 | attributes: 6 | local: 7 | - name: L2 8 | class: DRAM 9 | attributes: 10 | word-bits: 16 11 | block-size: 32 12 | technology: 16nm 13 | read_bandwidth: 1 14 | sizeKB: 1600000000 15 | subtree: 16 | - name: Buffer 17 | attributes: 18 | local: 19 | - name: L1[0..3] 20 | class: SRAM 21 | attributes: 22 | 
meshX: 2 23 | meshY: 2 24 | width: 16 25 | word-bits: 16 26 | technology: 16nm 27 | read_bandwidth: 20 28 | sizeKB: 2000 29 | subtree: 30 | - name: PE 31 | attributes: 32 | local: 33 | - name: L0[0..4095] 34 | class: regfile 35 | attributes: 36 | depth: 1 37 | meshX: 64 38 | meshY: 64 39 | word-bits: 16 40 | block-size: 6 41 | technology: 16nm 42 | read_bandwidth: 3 43 | - name: mac[0..4095] 44 | class: intmac 45 | attributes: 46 | meshX: 64 47 | meshY: 64 -------------------------------------------------------------------------------- /tests/cases/13-test-attention/arch/cloud-lowBW.yaml: -------------------------------------------------------------------------------- 1 | architecture: 2 | version: 0.2 3 | subtree: 4 | - name: System 5 | attributes: 6 | local: 7 | - name: L2 8 | class: DRAM 9 | attributes: 10 | word-bits: 16 11 | block-size: 32 12 | technology: 16nm 13 | read_bandwidth: 1 14 | sizeKB: 1600000000 15 | subtree: 16 | - name: Buffer 17 | attributes: 18 | local: 19 | - name: L1[0..3] 20 | class: SRAM 21 | attributes: 22 | meshX: 2 23 | meshY: 2 24 | width: 16 25 | word-bits: 16 26 | technology: 16nm 27 | read_bandwidth: 20 28 | sizeKB: 2000 29 | subtree: 30 | - name: PE 31 | attributes: 32 | local: 33 | - name: L0[0..1023] 34 | class: regfile 35 | attributes: 36 | depth: 1 37 | meshX: 32 38 | meshY: 32 39 | word-bits: 16 40 | block-size: 6 41 | technology: 16nm 42 | read_bandwidth: 3 43 | - name: mac[0..1023] 44 | class: intmac 45 | attributes: 46 | meshX: 32 47 | meshY: 32 -------------------------------------------------------------------------------- /tests/cases/13-test-attention/arch/cloud-midBW.yaml: -------------------------------------------------------------------------------- 1 | architecture: 2 | version: 0.2 3 | subtree: 4 | - name: System 5 | attributes: 6 | local: 7 | - name: L2 8 | class: DRAM 9 | attributes: 10 | word-bits: 16 11 | block-size: 32 12 | technology: 16nm 13 | read_bandwidth: 4 14 | sizeKB: 1600000000 15 | subtree: 16 | - 
name: Buffer 17 | attributes: 18 | local: 19 | - name: L1[0..3] 20 | class: SRAM 21 | attributes: 22 | meshX: 2 23 | meshY: 2 24 | width: 16 25 | word-bits: 16 26 | technology: 16nm 27 | read_bandwidth: 80 28 | sizeKB: 2000 29 | subtree: 30 | - name: PE 31 | attributes: 32 | local: 33 | - name: L0[0..1023] 34 | class: regfile 35 | attributes: 36 | depth: 1 37 | meshX: 32 38 | meshY: 32 39 | word-bits: 16 40 | block-size: 6 41 | technology: 16nm 42 | read_bandwidth: 3 43 | - name: mac[0..1023] 44 | class: intmac 45 | attributes: 46 | meshX: 32 47 | meshY: 32 -------------------------------------------------------------------------------- /tutorials/01-self-attention/arch/arch.yaml: -------------------------------------------------------------------------------- 1 | architecture: 2 | version: 0.2 3 | 4 | subtree: 5 | - name: System 6 | 7 | local: 8 | - name: MainMemory 9 | class: DRAM 10 | attributes: 11 | block-size: 65536 12 | depth: 8 13 | word-bits: 16 14 | read_bandwidth: 4 15 | write_bandwidth: 4 16 | 17 | subtree: 18 | - name: Buffer 19 | 20 | local: 21 | - name: Cache 22 | class: SRAM 23 | attributes: 24 | word-bits: 16 25 | block_size: 16384 26 | depth: 8 27 | read_bandwidth: 32 28 | write_bandwidth: 32 29 | 30 | 31 | subtree: 32 | - name: PE 33 | 34 | local: 35 | - name: RegFile[0..255] 36 | class: regfile 37 | attributes: 38 | meshX: 16 39 | meshY: 16 40 | depth: 1 41 | block_size: 3 42 | word-bits: 16 43 | read_bandwidth: 3 44 | write_bandwidth: 3 45 | 46 | - name: mac[0..255] 47 | class: intmac 48 | attributes: 49 | word-bits: 16 50 | meshX: 16 51 | meshY: 16 52 | -------------------------------------------------------------------------------- /tests/cases/13-test-attention/arch/cloud-highBW.yaml: -------------------------------------------------------------------------------- 1 | architecture: 2 | version: 0.2 3 | subtree: 4 | - name: System 5 | attributes: 6 | local: 7 | - name: L2 8 | class: DRAM 9 | attributes: 10 | word-bits: 16 11 | block-size: 32 
12 | technology: 16nm 13 | read_bandwidth: 8 14 | sizeKB: 1600000000 15 | subtree: 16 | - name: Buffer 17 | attributes: 18 | local: 19 | - name: L1[0..3] 20 | class: SRAM 21 | attributes: 22 | meshX: 2 23 | meshY: 2 24 | width: 16 25 | word-bits: 16 26 | technology: 16nm 27 | read_bandwidth: 160 28 | sizeKB: 2000 29 | subtree: 30 | - name: PE 31 | attributes: 32 | local: 33 | - name: L0[0..1023] 34 | class: regfile 35 | attributes: 36 | depth: 1 37 | meshX: 32 38 | meshY: 32 39 | word-bits: 16 40 | block-size: 6 41 | technology: 16nm 42 | read_bandwidth: 3 43 | - name: mac[0..1023] 44 | class: intmac 45 | attributes: 46 | meshX: 32 47 | meshY: 32 -------------------------------------------------------------------------------- /tests/cases/13-test-attention/arch/cloud-largeBW.yaml: -------------------------------------------------------------------------------- 1 | architecture: 2 | version: 0.2 3 | subtree: 4 | - name: System 5 | attributes: 6 | local: 7 | - name: L2 8 | class: DRAM 9 | attributes: 10 | word-bits: 16 11 | block-size: 32 12 | technology: 16nm 13 | read_bandwidth: 16 14 | sizeKB: 1600000000 15 | subtree: 16 | - name: Buffer 17 | attributes: 18 | local: 19 | - name: L1[0..3] 20 | class: SRAM 21 | attributes: 22 | meshX: 2 23 | meshY: 2 24 | width: 16 25 | word-bits: 16 26 | technology: 16nm 27 | read_bandwidth: 320 28 | sizeKB: 2000 29 | subtree: 30 | - name: PE 31 | attributes: 32 | local: 33 | - name: L0[0..1023] 34 | class: regfile 35 | attributes: 36 | depth: 1 37 | meshX: 32 38 | meshY: 32 39 | word-bits: 16 40 | block-size: 6 41 | technology: 16nm 42 | read_bandwidth: 3 43 | - name: mac[0..1023] 44 | class: intmac 45 | attributes: 46 | meshX: 32 47 | meshY: 32 -------------------------------------------------------------------------------- /tutorials/00-GEMM/arch/arch.yaml: -------------------------------------------------------------------------------- 1 | architecture: 2 | version: 0.2 3 | 4 | subtree: 5 | - name: System 6 | 7 | local: 8 | - 
name: MainMemory 9 | class: DRAM 10 | attributes: 11 | block-size: 16384 12 | depth: 1 13 | word-bits: 16 14 | read_bandwidth: 4.3 15 | write_bandwidth: 2.9 16 | 17 | subtree: 18 | - name: Buffer 19 | 20 | local: 21 | - name: Cache 22 | class: SRAM 23 | attributes: 24 | word-bits: 16 25 | block_size: 16384 26 | depth: 3 27 | read_bandwidth: 52 28 | write_bandwidth: 20 # 16 29 | 30 | 31 | subtree: 32 | - name: PE 33 | 34 | local: 35 | - name: RegFile[0..255] 36 | class: regfile 37 | attributes: 38 | meshX: 16 39 | meshY: 16 40 | depth: 1 41 | block_size: 3 42 | word-bits: 16 43 | read_bandwidth: 3.2 44 | write_bandwidth: 3.2 45 | 46 | - name: mac[0..255] 47 | class: intmac 48 | attributes: 49 | word-bits: 16 50 | meshX: 16 51 | meshY: 16 52 | -------------------------------------------------------------------------------- /AE/validation/timeloop/arch/arch.yaml: -------------------------------------------------------------------------------- 1 | architecture: 2 | version: 0.2 3 | 4 | subtree: 5 | - name: System 6 | 7 | local: 8 | - name: MainMemory 9 | class: DRAM 10 | attributes: 11 | block-size: 16384 12 | depth: 1 13 | word-bits: 16 14 | read_bandwidth: 4.3 15 | write_bandwidth: 2.9 16 | 17 | subtree: 18 | - name: Buffer 19 | 20 | local: 21 | - name: Cache 22 | class: SRAM 23 | attributes: 24 | word-bits: 16 25 | block_size: 16384 26 | depth: 3 27 | read_bandwidth: 52 28 | write_bandwidth: 20 # 16 29 | 30 | 31 | subtree: 32 | - name: PE 33 | 34 | local: 35 | - name: RegFile[0..255] 36 | class: regfile 37 | attributes: 38 | meshX: 16 39 | meshY: 16 40 | depth: 1 41 | block_size: 3 42 | word-bits: 16 43 | read_bandwidth: 3.2 44 | write_bandwidth: 3.2 45 | 46 | - name: mac[0..255] 47 | class: intmac 48 | attributes: 49 | word-bits: 16 50 | meshX: 16 51 | meshY: 16 52 | -------------------------------------------------------------------------------- /tests/cases/04-test-attention/arch/arch.yaml: 
-------------------------------------------------------------------------------- 1 | architecture: 2 | version: 0.2 3 | 4 | subtree: 5 | - name: System 6 | 7 | local: 8 | - name: MainMemory 9 | class: DRAM 10 | attributes: 11 | block-size: 1048576 12 | depth: 1 13 | word-bits: 16 14 | read_bandwidth: 4.3 15 | write_bandwidth: 2.9 16 | 17 | subtree: 18 | - name: Buffer 19 | 20 | local: 21 | - name: Cache 22 | class: SRAM 23 | attributes: 24 | word-bits: 16 25 | block_size: 16384 26 | depth: 30 27 | read_bandwidth: 52 28 | write_bandwidth: 20 # 16 29 | 30 | 31 | subtree: 32 | - name: PE 33 | 34 | local: 35 | - name: RegFile[0..255] 36 | class: regfile 37 | attributes: 38 | meshX: 16 39 | meshY: 16 40 | depth: 1 41 | block_size: 3 42 | word-bits: 16 43 | read_bandwidth: 3.2 44 | write_bandwidth: 3.2 45 | 46 | - name: mac[0..255] 47 | class: intmac 48 | attributes: 49 | word-bits: 16 50 | meshX: 16 51 | meshY: 16 52 | -------------------------------------------------------------------------------- /tests/cases/13-test-attention/prob/attention.yaml: -------------------------------------------------------------------------------- 1 | problem: 2 | io: 3 | ins: Q, K, V 4 | outs: O 5 | dimensions: [B,H,M,N,A,L] 6 | instance: 7 | B: B 8 | H: H 9 | M: M 10 | N: N 11 | A: A 12 | L: L 13 | 14 | ops: 15 | - name: ProduceC 16 | dimensions: [B,H,A,L,M] 17 | data-spaces: 18 | - name: C 19 | projection: 20 | - [[B]] 21 | - [[H]] 22 | - [[M]] 23 | - [[L]] 24 | read-write: True 25 | - name: K 26 | projection: 27 | - [[B]] 28 | - [[H]] 29 | - [[A]] 30 | - [[L]] 31 | - name: Q 32 | projection: 33 | - [[B]] 34 | - [[H]] 35 | - [[M]] 36 | - [[A]] 37 | ins: K, Q 38 | out: C 39 | 40 | - name: ProduceO 41 | dimensions: [B,H,L,N,M] 42 | data-spaces: 43 | - name: O 44 | projection: 45 | - [[B]] 46 | - [[H]] 47 | - [[M]] 48 | - [[N]] 49 | read-write: True 50 | - name: V 51 | projection: 52 | - [[B]] 53 | - [[H]] 54 | - [[L]] 55 | - [[N]] 56 | - name: C 57 | projection: 58 | - [[B]] 59 | - [[H]] 
60 | - [[M]] 61 | - [[L]] 62 | ins: C, V 63 | out: O -------------------------------------------------------------------------------- /include/tileflow/model/topology.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "tileflow/loop-analysis/nest-analysis.hpp" 4 | 5 | #include "model/topology.hpp" 6 | 7 | namespace model { 8 | 9 | namespace TileFlow { 10 | 11 | 12 | class Topology: public model::Topology { 13 | public: 14 | void eval( 15 | const mapping::TileFlow::Mapping& mapping, 16 | const analysis::TileFlow::NestAnalysis& analysis); 17 | friend class StatCalculator; 18 | }; 19 | 20 | class StatCalculator: public mapping::TileFlow::Visitor { 21 | void visitTile(const TileNode*) override; 22 | void visitScope(const ScopeNode*) override; 23 | void visitOp(const OpNode*) override; 24 | bool break_on_failure; 25 | std::stack cycles_; 26 | double energy_; 27 | model::TileFlow::Topology& topology_; 28 | const mapping::TileFlow::Mapping& mapping_; 29 | const analysis::TileFlow::NestAnalysis& analysis_; 30 | public: 31 | StatCalculator(model::TileFlow::Topology& topology, 32 | const mapping::TileFlow::Mapping& mapping, 33 | const analysis::TileFlow::NestAnalysis& analysis) 34 | : topology_(topology), mapping_(mapping), analysis_(analysis){} 35 | void run(const Node* root) override; 36 | }; 37 | 38 | } 39 | 40 | } // namespace model -------------------------------------------------------------------------------- /tutorials/01-self-attention/readme.md: -------------------------------------------------------------------------------- 1 | # Fusion Dataflow design 2 | 3 | TileFlow is a framework focusing on modeling fusion dataflow. In this tutorial, we would delve into the design process of fusion dataflow on the workload of self-attention. 4 | 5 | An example of the fusion dataflow design is available in map/map.yaml. You might be scaerd by its length. But don't worry, we will guide you through it! 
6 | 7 | It is always a good option to start with running the script. 8 | 9 | ```sh 10 | tileflow arch/arch.yaml prob/prob.yaml map/map.yaml macro/macro.yaml 11 | ``` 12 | 13 | You can observe TileFlow tuning the dataflow continuously. 14 | 15 | Next we introduce how we implement the fusion dataflow in TileFlow. In TileFlow, fusion is performed by combining the iteration space of multiple operators. To do that, we introduce scope node in the mapping tree. For this example, an `Sequential` scope is introduced into the mapping descrition to indicates sequential execution of two operators: 16 | ```sh 17 | ... 18 | - node-type: Scope 19 | type: Sequential 20 | 21 | subtree: 22 | ... 23 | ``` 24 | The scope node is inserted below the tile for Mainmemory mapping and above tiles for the Cache's mapping, indicating the tiles are fused at the cache level. 25 | 26 | -------------------------------------------------------------------------------- /tests/cases/08-test-2mm/prob/prob-2mm.yaml: -------------------------------------------------------------------------------- 1 | problem: 2 | io: 3 | ins: A, B, D 4 | outs: E 5 | dimensions: [M,N,K,L] 6 | instance: 7 | M: 512 8 | N: 64 9 | K: 64 10 | L: 512 11 | 12 | ops: 13 | - name: GEMM1 14 | dimensions: [M,L,K] 15 | data-spaces: 16 | - name: C 17 | projection: 18 | - [[M]] 19 | - [[L]] 20 | read-write: True 21 | - name: A 22 | projection: 23 | - [[M]] 24 | - [[K]] 25 | - name: B 26 | projection: 27 | - [[K]] 28 | - [[L]] 29 | ins: A, B 30 | out: C 31 | 32 | - name: EXP 33 | dimensions: [M,L] 34 | type: exp 35 | data-spaces: 36 | - name: C 37 | projection: 38 | - [[M]] 39 | - [[L]] 40 | - name: exp 41 | projection: 42 | - [[M]] 43 | - [[L]] 44 | ins: C 45 | out: exp 46 | inplace: True 47 | 48 | - name: GEMM2 49 | dimensions: [M,L,N] 50 | data-spaces: 51 | - name: E 52 | projection: 53 | - [[M]] 54 | - [[N]] 55 | read-write: True 56 | - name: exp 57 | projection: 58 | - [[M]] 59 | - [[L]] 60 | - name: D 61 | projection: 62 | - 
[[L]] 63 | - [[N]] 64 | ins: exp, D 65 | out: E -------------------------------------------------------------------------------- /tests/cases/10-domino-2mm/prob/prob-2mm.yaml: -------------------------------------------------------------------------------- 1 | problem: 2 | io: 3 | ins: A, B, E 4 | outs: F 5 | dimensions: [M,N,K,L] 6 | instance: 7 | M: 512 8 | N: 64 9 | K: 64 10 | L: 512 11 | 12 | ops: 13 | - name: ProduceC 14 | dimensions: [M,L,K] 15 | data-spaces: 16 | - name: C 17 | projection: 18 | - [[M]] 19 | - [[L]] 20 | read-write: True 21 | - name: A 22 | projection: 23 | - [[M]] 24 | - [[K]] 25 | - name: B 26 | projection: 27 | - [[K]] 28 | - [[L]] 29 | ins: A, B 30 | out: C 31 | 32 | - name: ProduceD 33 | dimensions: [M,L] 34 | type: exp 35 | data-spaces: 36 | - name: C 37 | projection: 38 | - [[M]] 39 | - [[L]] 40 | - name: D 41 | projection: 42 | - [[M]] 43 | - [[L]] 44 | ins: C 45 | out: D 46 | inplace: True 47 | 48 | - name: ProduceF 49 | dimensions: [M,L,N] 50 | data-spaces: 51 | - name: F 52 | projection: 53 | - [[M]] 54 | - [[N]] 55 | read-write: True 56 | - name: D 57 | projection: 58 | - [[M]] 59 | - [[L]] 60 | - name: E 61 | projection: 62 | - [[L]] 63 | - [[N]] 64 | ins: D, E 65 | out: F -------------------------------------------------------------------------------- /docs/tileflow-metrics.md: -------------------------------------------------------------------------------- 1 | # Profile Metrics 2 | 3 | - Basic: 4 | `Cycle`: the total cycle 5 | `Energy`: total energy consumption 6 | 7 | - Data Movement profiling: 8 | - Key: `Level::Metric[::Tensor]`. MemScope is the name of the user specified storage/compute unit. 
9 | 10 | - Metric: 11 | - For compute level: 12 | - `Flop`: the total flops; 13 | - `Energy`: the compute energy; 14 | - For storage level: 15 | - `Read`: the total read from peer/child level; 16 | - `Update`: the write back from the peer/child level 17 | - `Fill`: the total fill from the parent level; 18 | - `Read|Update|Fill::t`: the metric for tensor t; 19 | - `SpatialUtil`: the max utilization of PE units. 20 | - `CapUtil`: the max utilized capcity X utilized PE / (total capcity X #PE) 21 | - `Energy`: the memory access energy; 22 | - `SlowDown` >= 1: the slowdown of this level compared to the child level. > 1 slowdown indicates this level is bottleneck compared to the child levels. 23 | - Value: `double` value. 24 | - Legacy: Per tile utilization 25 | - Key: `ConstraintType::Level`. `ConstraintType` = [MEM|Spatial] 26 | - Value: `double` value indicating utilization. -------------------------------------------------------------------------------- /tests/cases/12-test-fused-cnn/prob/prob.yaml: -------------------------------------------------------------------------------- 1 | problem: 2 | io: 3 | ins: I X Y 4 | outs: D 5 | dimensions: [B,H,W,C,L,K,R,S,U,V] 6 | instance: 7 | B: B 8 | H: H 9 | W: W 10 | C: C 11 | L: L 12 | K: K 13 | R: R 14 | S: S 15 | U: U 16 | V: V 17 | 18 | ops: 19 | - name: ProduceA 20 | dimensions: [L,C,R,S,B,H,W,U,V] 21 | data-spaces: 22 | - name: A 23 | projection: 24 | - [[B]] 25 | - [[L]] 26 | - [[H], [U]] 27 | - [[W], [V]] 28 | read-write: True 29 | - name: X 30 | projection: 31 | - [[L]] 32 | - [[C]] 33 | - [[R]] 34 | - [[S]] 35 | - name: I 36 | projection: 37 | - [[B]] 38 | - [[C]] 39 | - [[H],[R]] 40 | - [[W],[S]] 41 | ins: X, I 42 | out: A 43 | 44 | - name: ProduceD 45 | dimensions: [K,L,U,V,B,H,W] 46 | data-spaces: 47 | - name: D 48 | projection: 49 | - [[B]] 50 | - [[K]] 51 | - [[H]] 52 | - [[W]] 53 | read-write: True 54 | - name: Y 55 | projection: 56 | - [[K]] 57 | - [[L]] 58 | - [[U]] 59 | - [[V]] 60 | - name: A 61 | 
projection: 62 | - [[B]] 63 | - [[L]] 64 | - [[H],[U]] 65 | - [[W],[V]] 66 | ins: Y, A 67 | out: D 68 | 69 | check: 70 | loopcount: False -------------------------------------------------------------------------------- /include/tileflow/mapper/op.hpp: -------------------------------------------------------------------------------- 1 | #include "tileflow/mapper/expr.hpp" 2 | 3 | 4 | 5 | namespace TileFlow { 6 | 7 | namespace Op { 8 | 9 | std::shared_ptr max(std::vector > exprs); 10 | 11 | std::shared_ptr sum(std::vector > exprs); 12 | 13 | std::shared_ptr operator <= (const std::shared_ptr&, const std::shared_ptr&); 14 | 15 | std::shared_ptr pair(int x, int y); 16 | 17 | std::shared_ptr pair( 18 | const std::shared_ptr& x, 19 | const std::shared_ptr& y); 20 | 21 | std::shared_ptr product(std::vector >& exprs); 22 | 23 | std::shared_ptr product(std::initializer_list> exprs); 24 | 25 | std::shared_ptr product(const std::pair >& exprs); 26 | 27 | std::shared_ptr sum(std::vector >& exprs); 28 | 29 | std::shared_ptr max(std::vector >& exprs); 30 | 31 | std::shared_ptr variable(int x); 32 | 33 | std::shared_ptr operator == ( 34 | const std::shared_ptr& left, 35 | const std::shared_ptr& right 36 | ); 37 | 38 | std::shared_ptr operator <= ( 39 | const std::shared_ptr& left, 40 | const std::shared_ptr& right 41 | ); 42 | 43 | std::shared_ptr parameter(num_t x); 44 | 45 | } // namespace Op 46 | 47 | } // namespace TileFlow -------------------------------------------------------------------------------- /tests/cases/10-domino-2mm/map/map.yaml: -------------------------------------------------------------------------------- 1 | mapping: 2 | node-type: Tile 3 | type: Temporal 4 | factors: M=32 L=32 5 | permutation: ML 6 | target: L1 7 | 8 | subtree: 9 | - node-type: Scope 10 | type: Sequential 11 | 12 | subtree: 13 | - node-type: Tile 14 | type: Spatial 15 | factors: K=4 16 | permutation: K 17 | target: L1 18 | 19 | subtree: 20 | - node-type: Tile 21 | type: Temporal 22 | 
factors: M=16 L=16 K=16 23 | permutation: MLK 24 | target: L0 25 | 26 | subtree: 27 | - node-type: Op 28 | name: ProduceC 29 | binding: M:M L:L K:K 30 | - node-type: Tile 31 | type: Temporal 32 | factors: M=16 L=16 33 | permutation: ML 34 | target: L0 35 | 36 | subtree: 37 | - node-type: Op 38 | name: ProduceD 39 | binding: M:M L:L 40 | - node-type: Tile 41 | type: Spatial 42 | factors: N=4 43 | permutation: N 44 | target: L1 45 | 46 | subtree: 47 | - node-type: Tile 48 | type: Temporal 49 | factors: M=16 N=16 L=16 50 | permutation: MNL 51 | target: L0 52 | 53 | subtree: 54 | - node-type: Op 55 | name: ProduceF 56 | binding: M:M L:L N:N -------------------------------------------------------------------------------- /tests/cases/04-test-attention/map/map-flat.yaml: -------------------------------------------------------------------------------- 1 | mapping: 2 | node-type: Tile 3 | type: Temporal 4 | factors: M=MO 5 | target: MainMemory 6 | subtree: 7 | - node-type: Scope 8 | type: sequential 9 | 10 | subtree: 11 | - node-type: Tile 12 | type: temporal 13 | factors: M=MM K=KM L=LM 14 | permutation: LMK # input stationary 15 | target: Cache 16 | 17 | subtree: 18 | - node-type: Tile 19 | type: spatial 20 | factors: M=16 K=16 21 | permutation: MK 22 | split: 1 23 | target: Cache 24 | multicast: true 25 | 26 | subtree: 27 | - node-type: Tile 28 | type: temporal 29 | factors: M=1 L=1 K=1 30 | permutation: MLK 31 | target: RegFile 32 | 33 | subtree: 34 | - node-type: Op 35 | name: GEMM1 36 | 37 | - node-type: Tile 38 | type: temporal 39 | factors: M=MM L=LM N=NM 40 | permutation: LMN # output stationary 41 | target: Cache 42 | 43 | subtree: 44 | - node-type: Tile 45 | type: spatial 46 | factors: M = 16 N = 16 47 | permutation: MN 48 | split: 1 49 | target: Cache 50 | multicast: true 51 | 52 | subtree: 53 | - node-type: Tile 54 | type: temporal 55 | factors: M=1 L=1 N=1 56 | target: RegFile 57 | 58 | subtree: 59 | - node-type: Op 60 | name: GEMM2 
-------------------------------------------------------------------------------- /tests/cases/13-test-attention/map/fuse-L0-pipeline.yaml: -------------------------------------------------------------------------------- 1 | mapping: 2 | node-type: Tile 3 | type: Temporal 4 | factors: B=? H=? M=? L=? 5 | permutation: LMHB 6 | target: L2 7 | 8 | subtree: 9 | - node-type: Tile 10 | type: Spatial 11 | factors: B=? H=? M=? 12 | permutation: MHB 13 | target: L2 14 | split: 1 15 | 16 | subtree: 17 | - node-type: Tile 18 | type: Temporal 19 | factors: M=? L=? B=? H=? 20 | permutation: HBLM 21 | target: L1 22 | 23 | subtree: 24 | - node-type: Scope 25 | type: Pipeline 26 | 27 | subtree: 28 | - node-type: Tile 29 | type: Spatial 30 | factors: M=16 L=8 31 | permutation: LM 32 | target: L1 33 | split: 1 34 | tag: op1 35 | 36 | subtree: 37 | - node-type: Tile 38 | type: Temporal 39 | factors: M=1 A=1 L=1 40 | permutation: LAM 41 | target: L0 42 | tag: op1 43 | 44 | subtree: 45 | - node-type: Op 46 | name: ProduceC 47 | - node-type: Tile 48 | type: Spatial 49 | factors: M=16 L=8 50 | permutation: LM 51 | target: L1 52 | split: 1 53 | 54 | subtree: 55 | - node-type: Tile 56 | type: Temporal 57 | factors: M=1 L=1 N=1 58 | permutation: NLM 59 | target: L0 60 | 61 | subtree: 62 | - node-type: Op 63 | name: ProduceO 64 | -------------------------------------------------------------------------------- /tests/cases/06-test-mapper/test1.yaml: -------------------------------------------------------------------------------- 1 | problem: 2 | io: 3 | ins: A, B 4 | outs: C 5 | dimensions: [M,N,K] 6 | instance: 7 | M: 32 8 | N: 16 9 | K: 16 10 | 11 | ops: 12 | - name: GEMM 13 | dimensions: [M,N,K] 14 | data-spaces: 15 | - name: C 16 | projection: 17 | - [[M]] 18 | - [[N]] 19 | read-write: True 20 | - name: A 21 | projection: 22 | - [[M]] 23 | - [[K]] 24 | - name: B 25 | projection: 26 | - [[K]] 27 | - [[N]] 28 | ins: A, B 29 | out: C 30 | 31 | mapping: 32 | node-type: Tile 33 | type: temporal 
34 | factors: M=X N=X K=X 35 | permutation: MNK 36 | target: MainMemory 37 | 38 | 39 | subtree: 40 | - node-type: Tile 41 | type: temporal 42 | factors: M=X N=X K=X 43 | permutation: MKN 44 | target: RegFile 45 | 46 | subtree: 47 | - node-type: Op 48 | name: GEMM 49 | binding: M:M N:N K:K 50 | 51 | tileflow-mapper: 52 | objective: energy 53 | 54 | architecture: 55 | version: 0.2 56 | 57 | subtree: 58 | - name: System 59 | 60 | local: 61 | - name: MainMemory 62 | class: DRAM 63 | attributes: 64 | depth: 1 65 | block-size: 2048 66 | word-bits: 8 67 | 68 | subtree: 69 | - name: PE 70 | 71 | local: 72 | - name: RegFile 73 | class: regfile 74 | attributes: 75 | depth: 112 76 | width: 16 77 | word-bits: 8 78 | read_bandwidth: 1 79 | write_bandwidth: 1 80 | 81 | - name: mac 82 | class: intmac 83 | attributes: 84 | datawidth: 16 -------------------------------------------------------------------------------- /tests/cases/08-test-2mm/map/map.yaml: -------------------------------------------------------------------------------- 1 | mapping: 2 | node-type: Tile 3 | type: temporal 4 | factors: M=64 L=32 5 | permutation: ML 6 | target: MainMemory 7 | 8 | subtree: 9 | - node-type: Scope 10 | type: Pipeline 11 | 12 | subtree: 13 | - node-type: Tile 14 | type: Spatial 15 | factors: M=2 L=4 16 | split: 1 17 | permutation: ML 18 | target: MainMemory 19 | 20 | subtree: 21 | - node-type: Tile 22 | type: temporal 23 | factors: M=4 L=4 K=64 24 | permutation: MLK 25 | target: RegFile 26 | 27 | subtree: 28 | - node-type: Op 29 | name: GEMM1 30 | binding: M:M L:L K:K 31 | 32 | # A common spatial tile 33 | - node-type: Tile 34 | type: Spatial 35 | factors: M=2 L=4 36 | permutation: ML 37 | split: 1 38 | target: MainMemory 39 | 40 | subtree: 41 | - node-type: Scope 42 | type: Sequential 43 | 44 | subtree: 45 | - node-type: Tile 46 | type: temporal 47 | factors: M=4 L=4 48 | permutation: ML 49 | target: RegFile 50 | 51 | subtree: 52 | - node-type: Op 53 | name: EXP 54 | binding: M:M, L:L 55 | 
56 | 57 | - node-type: Tile 58 | type: temporal 59 | factors: M=4 L=4 N=64 60 | permutation: MLN 61 | target: RegFile 62 | 63 | subtree: 64 | - node-type: Op 65 | name: GEMM2 66 | binding: M:M, L:L, N:N 67 | -------------------------------------------------------------------------------- /tests/cases/12-test-fused-cnn/map/isos.yaml: -------------------------------------------------------------------------------- 1 | mapping: 2 | node-type: Tile 3 | type: Temporal 4 | factors: B=BO H=HO 5 | permutation: HB 6 | target: L2 7 | 8 | subtree: 9 | - node-type: Scope 10 | type: Pipeline 11 | 12 | subtree: 13 | - node-type: Tile 14 | type: Spatial 15 | factors: H=HS W=WS 16 | permutation: WH 17 | target: L2 18 | split: 1 19 | 20 | subtree: 21 | - node-type: Tile 22 | type: Temporal 23 | factors: C=HM L=WM H=CM W=LM 24 | permutation: WHLC 25 | target: L1 26 | 27 | subtree: 28 | - node-type: Tile 29 | type: Spatial 30 | factors: C=CS L=LS 31 | permutation: LC 32 | target: L1 33 | split: 1 34 | 35 | subtree: 36 | - node-type: Tile 37 | type: Temporal 38 | factors: R=RI S=SI 39 | permutation: SR 40 | target: L0 41 | 42 | subtree: 43 | - node-type: Op 44 | name: ProduceA 45 | - node-type: Tile 46 | type: Spatial 47 | factors: H=HS W=WS 48 | permutation: WH 49 | target: L2 50 | split: 1 51 | 52 | subtree: 53 | - node-type: Tile 54 | type: Temporal 55 | factors: L=HM2 K=WM2 H=LM2 W=KM2 56 | permutation: WHKL 57 | target: L1 58 | 59 | subtree: 60 | - node-type: Tile 61 | type: Spatial 62 | factors: L=LS2 K=KS 63 | permutation: KL 64 | target: L1 65 | split: 1 66 | 67 | subtree: 68 | - node-type: Tile 69 | type: Temporal 70 | factors: U=UI V=VI 71 | permutation: VU 72 | target: L0 73 | 74 | subtree: 75 | - node-type: Op 76 | name: ProduceD 77 | -------------------------------------------------------------------------------- /tests/cases/13-test-attention/map/flash-attention.yaml: -------------------------------------------------------------------------------- 1 | mapping: 2 | 
node-type: Tile 3 | type: Temporal 4 | factors: B=? H=? M=? L=? 5 | permutation: LMHB 6 | target: L2 7 | 8 | subtree: 9 | - node-type: Scope 10 | type: Sequential 11 | 12 | subtree: 13 | - node-type: Tile 14 | type: Spatial 15 | factors: B=? H=? M=? 16 | permutation: MHB 17 | target: L2 18 | split: 1 19 | 20 | subtree: 21 | - node-type: Tile 22 | type: Temporal 23 | factors: M=? L=? A=? 24 | permutation: ALM 25 | target: L1 26 | 27 | subtree: 28 | - node-type: Tile 29 | type: Spatial 30 | factors: M=? L=? 31 | permutation: LM 32 | target: L1 33 | split: 1 34 | 35 | subtree: 36 | - node-type: Tile 37 | type: Temporal 38 | factors: M=1 A=1 L=1 39 | permutation: LAM 40 | target: L0 41 | 42 | subtree: 43 | - node-type: Op 44 | name: ProduceC 45 | - node-type: Tile 46 | type: Spatial 47 | factors: B=? H=? M=? 48 | permutation: MHB 49 | target: L2 50 | split: 1 51 | 52 | subtree: 53 | - node-type: Tile 54 | type: Temporal 55 | factors: M=? L=? N=? 56 | permutation: NLM 57 | target: L1 58 | 59 | subtree: 60 | - node-type: Tile 61 | type: Spatial 62 | factors: M=? L=? 
63 | permutation: LM 64 | target: L1 65 | split: 1 66 | 67 | subtree: 68 | - node-type: Tile 69 | type: Temporal 70 | factors: M=1 N=1 L=1 71 | permutation: LNM 72 | target: L0 73 | 74 | subtree: 75 | - node-type: Op 76 | name: ProduceO 77 | -------------------------------------------------------------------------------- /tests/cases/12-test-fused-cnn/map/tileflow.yaml: -------------------------------------------------------------------------------- 1 | mapping: 2 | node-type: Tile 3 | type: Temporal 4 | factors: B=BO W=WO H=HO 5 | permutation: HWB 6 | target: L2 7 | 8 | subtree: 9 | - node-type: Scope 10 | type: Sequential 11 | 12 | subtree: 13 | - node-type: Tile 14 | type: Spatial 15 | factors: H=HS W=WS 16 | permutation: WH 17 | target: L2 18 | split: 1 19 | 20 | subtree: 21 | - node-type: Tile 22 | type: Temporal 23 | factors: C=HM L=WM H=CM W=LM 24 | permutation: WHLC 25 | target: L1 26 | 27 | subtree: 28 | - node-type: Tile 29 | type: Spatial 30 | factors: C=CS L=LS 31 | permutation: LC 32 | target: L1 33 | split: 1 34 | 35 | subtree: 36 | - node-type: Tile 37 | type: Temporal 38 | factors: R=RI S=SI 39 | permutation: SR 40 | target: L0 41 | 42 | subtree: 43 | - node-type: Op 44 | name: ProduceA 45 | - node-type: Tile 46 | type: Spatial 47 | factors: H=HS W=WS 48 | permutation: WH 49 | target: L2 50 | split: 1 51 | 52 | subtree: 53 | - node-type: Tile 54 | type: Temporal 55 | factors: L=HM2 K=WM2 H=LM2 W=KM2 56 | permutation: WHKL 57 | target: L1 58 | 59 | subtree: 60 | - node-type: Tile 61 | type: Spatial 62 | factors: L=LS2 K=KS 63 | permutation: KL 64 | target: L1 65 | split: 1 66 | 67 | subtree: 68 | - node-type: Tile 69 | type: Temporal 70 | factors: U=UI V=VI 71 | permutation: VU 72 | target: L0 73 | 74 | subtree: 75 | - node-type: Op 76 | name: ProduceD 77 | -------------------------------------------------------------------------------- /tests/cases/12-test-fused-cnn/map/fused-layer.yaml: 
-------------------------------------------------------------------------------- 1 | mapping: 2 | node-type: Tile 3 | type: Temporal 4 | factors: B=BO W=WO H=HO 5 | permutation: HWB 6 | target: L2 7 | 8 | subtree: 9 | - node-type: Scope 10 | type: Pipeline 11 | 12 | subtree: 13 | - node-type: Tile 14 | type: Spatial 15 | factors: H=HS W=WS 16 | permutation: WH 17 | target: L2 18 | split: 1 19 | 20 | subtree: 21 | - node-type: Tile 22 | type: Temporal 23 | factors: C=HM L=WM H=CM W=LM 24 | permutation: WHLC 25 | target: L1 26 | 27 | subtree: 28 | - node-type: Tile 29 | type: Spatial 30 | factors: C=CS L=LS 31 | permutation: LC 32 | target: L1 33 | split: 1 34 | 35 | subtree: 36 | - node-type: Tile 37 | type: Temporal 38 | factors: R=RI S=SI 39 | permutation: SR 40 | target: L0 41 | 42 | subtree: 43 | - node-type: Op 44 | name: ProduceA 45 | - node-type: Tile 46 | type: Spatial 47 | factors: H=HS W=WS 48 | permutation: WH 49 | target: L2 50 | split: 1 51 | 52 | subtree: 53 | - node-type: Tile 54 | type: Temporal 55 | factors: L=HM2 K=WM2 H=LM2 W=KM2 56 | permutation: WHKL 57 | target: L1 58 | 59 | subtree: 60 | - node-type: Tile 61 | type: Spatial 62 | factors: L=LS2 K=KS 63 | permutation: KL 64 | target: L1 65 | split: 1 66 | 67 | subtree: 68 | - node-type: Tile 69 | type: Temporal 70 | factors: U=UI V=VI 71 | permutation: VU 72 | target: L0 73 | 74 | subtree: 75 | - node-type: Op 76 | name: ProduceD 77 | -------------------------------------------------------------------------------- /tests/cases/12-test-fused-cnn/map/pipeline.yaml: -------------------------------------------------------------------------------- 1 | mapping: 2 | node-type: Tile 3 | type: Temporal 4 | factors: B=BO H=HO W=WO 5 | permutation: WHB 6 | target: L2 7 | 8 | subtree: 9 | - node-type: Scope 10 | type: Pipeline 11 | 12 | subtree: 13 | - node-type: Tile 14 | type: Spatial 15 | factors: H=HS1 W=WS1 16 | permutation: WH 17 | target: L2 18 | split: 1 19 | 20 | subtree: 21 | - node-type: Tile 22 
| type: Temporal 23 | factors: C=HM L=WM H=CM W=LM 24 | permutation: WHLC 25 | target: L1 26 | 27 | subtree: 28 | - node-type: Tile 29 | type: Spatial 30 | factors: C=CS L=LS 31 | permutation: LC 32 | target: L1 33 | split: 1 34 | 35 | subtree: 36 | - node-type: Tile 37 | type: Temporal 38 | factors: R=RI S=SI 39 | permutation: SR 40 | target: L0 41 | 42 | subtree: 43 | - node-type: Op 44 | name: ProduceA 45 | - node-type: Tile 46 | type: Spatial 47 | factors: H=HS2 W=WS2 48 | permutation: WH 49 | target: L2 50 | split: 1 51 | 52 | subtree: 53 | - node-type: Tile 54 | type: Temporal 55 | factors: L=HM2 K=WM2 H=LM2 W=KM2 56 | permutation: WHKL 57 | target: L1 58 | 59 | subtree: 60 | - node-type: Tile 61 | type: Spatial 62 | factors: L=LS2 K=KS 63 | permutation: KL 64 | target: L1 65 | split: 1 66 | 67 | subtree: 68 | - node-type: Tile 69 | type: Temporal 70 | factors: U=UI V=VI 71 | permutation: VU 72 | target: L0 73 | 74 | subtree: 75 | - node-type: Op 76 | name: ProduceD 77 | -------------------------------------------------------------------------------- /tests/cases/13-test-attention/map/pipeline.yaml: -------------------------------------------------------------------------------- 1 | mapping: 2 | node-type: Tile 3 | type: Temporal 4 | factors: B=? H=? M=? L=? 5 | permutation: LMHB 6 | target: L2 7 | 8 | subtree: 9 | - node-type: Scope 10 | type: Pipeline 11 | 12 | subtree: 13 | - node-type: Tile 14 | type: Spatial 15 | factors: B=? H=? M=? 16 | permutation: MHB 17 | target: L2 18 | split: 1 19 | 20 | subtree: 21 | - node-type: Tile 22 | type: Temporal 23 | factors: M=? L=? A=? 24 | permutation: ALM 25 | target: L1 26 | tag: op1 27 | 28 | subtree: 29 | - node-type: Tile 30 | type: Spatial 31 | factors: M=? L=? 
32 | permutation: LM 33 | target: L1 34 | split: 1 35 | tag: op1 36 | 37 | subtree: 38 | - node-type: Tile 39 | type: Temporal 40 | factors: M=1 A=1 L=1 41 | permutation: LAM 42 | target: L0 43 | tag: op1 44 | 45 | subtree: 46 | - node-type: Op 47 | name: ProduceC 48 | - node-type: Tile 49 | type: Spatial 50 | factors: B=? H=? M=? 51 | permutation: MHB 52 | target: L2 53 | split: 1 54 | 55 | subtree: 56 | - node-type: Tile 57 | type: Temporal 58 | factors: M=? L=? N=? 59 | permutation: NLM 60 | target: L1 61 | 62 | subtree: 63 | - node-type: Tile 64 | type: Spatial 65 | factors: M=? L=? 66 | permutation: LM 67 | target: L1 68 | split: 1 69 | 70 | subtree: 71 | - node-type: Tile 72 | type: Temporal 73 | factors: M=1 L=1 N=1 74 | permutation: NLM 75 | target: L0 76 | 77 | subtree: 78 | - node-type: Op 79 | name: ProduceO 80 | -------------------------------------------------------------------------------- /docs/mcts.md: -------------------------------------------------------------------------------- 1 | # MCTS 2 | - monte caro tree search; 3 | - problem description: 4 | - A constraint optimization problem: 5 | $$ 6 | Min_{variables} obj \\ 7 | s.t.\ Constraints(variables) = 1 8 | $$ 9 | - variables: 10 | - tiling factors; 11 | - scope type (not realized yet); 12 | - permutation (not realized); 13 | - constraints: 14 | - loop count constraint: example: $\Pi_{i} t^j_i \leq LoopCount_j$ 15 | - memory constraint: example: $\Sigma_j\Pi_i t^k_{ij} \leq MemSize_k$ 16 | - spatial constaint: example: $\Sigma_i Max_j...\Sigma_k \leq $ 17 | - objective: 18 | - energy/latency; 19 | - this is calculated as a black box by simulation; 20 | - Algorithm: for the tile size only 21 | - How the algorithm works can be seen in [this](https://hci.iwr.uni-heidelberg.de/system/files/private/downloads/297868474/report_robert-klassert.pdf) 22 | - Encoding in TileFlow: 23 | - State: 24 | - A `symbol table` recording if a variable is fixed. 
If it is fixed, record the fixed value; else record the candidate values. 25 | - The candidate values is derived by the loop count constraint. 26 | - Action: 27 | - Choose the next variable to be fixed; 28 | - Use heuristic to choose the variable with minimum feasible candidate values. 29 | - Decide the variable's value: 30 | - Use the MCTS's UCB method to decide the value. 31 | - State Transition: Fix the variable with given value; update the symbol table using all constraints: 32 | - For loop count constraints, use it to give concrete candidate values easily. 33 | - For other two type of constraints, use them as 0/1 pruning condition. 34 | - Terminate condition: all variables are fixed or no feasible solutions; 35 | - reward: energy/cycle from TileFlow's simulation 36 | 37 | -------------------------------------------------------------------------------- /tests/cases/00-validation/02-attention/map/map.yaml: -------------------------------------------------------------------------------- 1 | mapping: 2 | node-type: Tile 3 | type: temporal 4 | factors: M=MO 5 | target: MainMemory 6 | 7 | 8 | subtree: 9 | - node-type: Scope 10 | type: Sequential 11 | 12 | subtree: 13 | - node-type: Tile 14 | factors: K=KO 15 | type: temporal 16 | bypass: [C] 17 | target: MainMemory 18 | profile: False 19 | 20 | 21 | subtree: 22 | - node-type: Tile 23 | type: temporal 24 | factors: K=KM L=L M=MM 25 | permutation: LMK 26 | target: Cache 27 | 28 | subtree: 29 | - node-type: Tile 30 | type: Spatial 31 | factors: M=SX K=SY 32 | split: 1 33 | permutation: MK 34 | target: Cache 35 | 36 | subtree: 37 | - node-type: Tile 38 | type: temporal 39 | factors: M=1 L=1 K=1 40 | permutation: MLK 41 | target: RegFile 42 | 43 | subtree: 44 | - node-type: Op 45 | name: GEMM1 46 | 47 | # A common spatial tile 48 | - node-type: Tile 49 | type: temporal 50 | factors: L=LO 51 | target: MainMemory 52 | profile: False 53 | bypass: [C] 54 | 55 | subtree: 56 | - node-type: Tile 57 | type: temporal 58 | 
factors: M=MM L=LM N=N 59 | permutation: NML 60 | target: Cache 61 | 62 | subtree: 63 | - node-type: Tile 64 | type: Spatial 65 | factors: M=SX L=SY 66 | split: 1 67 | permutation: ML 68 | target: Cache 69 | 70 | subtree: 71 | - node-type: Tile 72 | type: temporal 73 | factors: M=1 L=1 N=1 74 | permutation: MLN 75 | target: RegFile 76 | 77 | subtree: 78 | - node-type: Op 79 | name: GEMM2 -------------------------------------------------------------------------------- /tests/cases/12-test-fused-cnn/map/naive.yaml: -------------------------------------------------------------------------------- 1 | mapping: 2 | node-type: Scope 3 | type: Sequential 4 | 5 | subtree: 6 | - node-type: Tile 7 | type: Temporal 8 | factors: B=BO H=HO W=WO 9 | permutation: WHB 10 | target: L2 11 | 12 | subtree: 13 | - node-type: Tile 14 | type: Spatial 15 | factors: H=HS W=WS 16 | permutation: WH 17 | target: L2 18 | split: 1 19 | 20 | subtree: 21 | - node-type: Tile 22 | type: Temporal 23 | factors: C=HM L=WM H=CM W=LM 24 | permutation: WHLC 25 | target: L1 26 | 27 | subtree: 28 | - node-type: Tile 29 | type: Spatial 30 | factors: C=CS L=LS 31 | permutation: LC 32 | target: L1 33 | split: 1 34 | 35 | subtree: 36 | - node-type: Tile 37 | type: Temporal 38 | factors: R=RI S=SI 39 | permutation: SR 40 | target: L0 41 | 42 | subtree: 43 | - node-type: Op 44 | name: ProduceA 45 | - node-type: Tile 46 | type: Temporal 47 | factors: B=BO H=HO W=WO 48 | permutation: WHB 49 | target: L2 50 | 51 | subtree: 52 | - node-type: Tile 53 | type: Spatial 54 | factors: H=HS W=WS 55 | permutation: WH 56 | target: L2 57 | split: 1 58 | 59 | subtree: 60 | - node-type: Tile 61 | type: Temporal 62 | factors: L=HM2 K=WM2 H=LM2 W=KM2 63 | permutation: WHKL 64 | target: L1 65 | 66 | subtree: 67 | - node-type: Tile 68 | type: Spatial 69 | factors: L=LS2 K=KS 70 | permutation: KL 71 | target: L1 72 | split: 1 73 | 74 | subtree: 75 | - node-type: Tile 76 | type: Temporal 77 | factors: U=UI V=VI 78 | permutation: VU 79 
| target: L0 80 | 81 | subtree: 82 | - node-type: Op 83 | name: ProduceD 84 | -------------------------------------------------------------------------------- /tests/cases/13-test-attention/map/tileflow.yaml: -------------------------------------------------------------------------------- 1 | mapping: 2 | node-type: Tile 3 | type: Temporal 4 | factors: B=? H=? M=? L=? 5 | permutation: LMHB 6 | target: L2 7 | 8 | subtree: 9 | - node-type: Tile 10 | type: Spatial 11 | factors: B=? H=? M=? 12 | permutation: MHB 13 | target: L2 14 | split: 1 15 | 16 | subtree: 17 | - node-type: Tile 18 | type: Temporal 19 | factors: M=? L=? B=? H=? 20 | permutation: HBLM 21 | target: L1 22 | 23 | subtree: 24 | - node-type: Scope 25 | type: Sequential 26 | 27 | subtree: 28 | - node-type: Tile 29 | type: Temporal 30 | factors: A=? 31 | permutation: A 32 | target: L1 33 | profile: False 34 | bypass: [C] 35 | 36 | subtree: 37 | - node-type: Tile 38 | type: Spatial 39 | factors: M=? L=? 40 | permutation: LM 41 | target: L1 42 | split: 1 43 | tag: op1 44 | 45 | subtree: 46 | - node-type: Tile 47 | type: Temporal 48 | factors: M=1 A=1 L=1 49 | permutation: LAM 50 | target: L0 51 | tag: op1 52 | 53 | subtree: 54 | - node-type: Op 55 | name: ProduceC 56 | - node-type: Tile 57 | type: Temporal 58 | factors: N=? 59 | permutation: N 60 | target: L1 61 | profile: False 62 | bypass: [C] 63 | 64 | subtree: 65 | - node-type: Tile 66 | type: Spatial 67 | factors: M=? L=? 68 | permutation: LM 69 | target: L1 70 | split: 1 71 | 72 | subtree: 73 | - node-type: Tile 74 | type: Temporal 75 | factors: M=1 L=1 N=1 76 | permutation: NLM 77 | target: L0 78 | 79 | subtree: 80 | - node-type: Op 81 | name: ProduceO 82 | -------------------------------------------------------------------------------- /tests/cases/13-test-attention/map/fuse-L0.yaml: -------------------------------------------------------------------------------- 1 | mapping: 2 | node-type: Tile 3 | type: Temporal 4 | factors: B=? H=? M=? L=? 
5 | permutation: LMHB 6 | target: L2 7 | 8 | subtree: 9 | - node-type: Tile 10 | type: Spatial 11 | factors: B=? H=? M=? 12 | permutation: MHB 13 | target: L2 14 | split: 1 15 | 16 | subtree: 17 | - node-type: Tile 18 | type: Temporal 19 | factors: M=? L=? B=? H=? 20 | permutation: HBLM 21 | target: L1 22 | 23 | subtree: 24 | - node-type: Scope 25 | type: Sequential 26 | 27 | subtree: 28 | - node-type: Tile 29 | type: Temporal 30 | factors: A=? 31 | permutation: A 32 | target: L1 33 | profile: False 34 | bypass: [C] 35 | 36 | subtree: 37 | - node-type: Tile 38 | type: Spatial 39 | factors: M=16 L=16 40 | permutation: LM 41 | target: L1 42 | split: 1 43 | tag: op1 44 | 45 | subtree: 46 | - node-type: Tile 47 | type: Temporal 48 | factors: M=1 A=1 L=1 49 | permutation: LAM 50 | target: L0 51 | tag: op1 52 | 53 | subtree: 54 | - node-type: Op 55 | name: ProduceC 56 | - node-type: Tile 57 | type: Temporal 58 | factors: N=? 59 | permutation: N 60 | target: L1 61 | profile: False 62 | bypass: [C] 63 | 64 | subtree: 65 | - node-type: Tile 66 | type: Spatial 67 | factors: M=16 L=16 68 | permutation: LM 69 | target: L1 70 | split: 1 71 | 72 | subtree: 73 | - node-type: Tile 74 | type: Temporal 75 | factors: M=1 L=1 N=1 76 | permutation: NLM 77 | target: L0 78 | 79 | subtree: 80 | - node-type: Op 81 | name: ProduceO 82 | -------------------------------------------------------------------------------- /tests/cases/13-test-attention/map/no-fuse.yaml: -------------------------------------------------------------------------------- 1 | mapping: 2 | node-type: Scope 3 | type: Sequential 4 | 5 | subtree: 6 | - node-type: Tile 7 | type: Temporal 8 | factors: B=? H=? M=? L=? 9 | permutation: LMHB 10 | target: L2 11 | 12 | subtree: 13 | - node-type: Tile 14 | type: Spatial 15 | factors: B=? H=? M=? 16 | permutation: MHB 17 | target: L2 18 | split: 1 19 | 20 | subtree: 21 | - node-type: Tile 22 | type: Temporal 23 | factors: M=? L=? A=? 
24 | permutation: ALM 25 | target: L1 26 | tag: op1 27 | 28 | subtree: 29 | - node-type: Tile 30 | type: Spatial 31 | factors: M=? L=? 32 | permutation: LM 33 | target: L1 34 | split: 1 35 | tag: op1 36 | 37 | subtree: 38 | - node-type: Tile 39 | type: Temporal 40 | factors: M=1 A=1 L=1 41 | permutation: LAM 42 | target: L0 43 | tag: op1 44 | 45 | subtree: 46 | - node-type: Op 47 | name: ProduceC 48 | - node-type: Tile 49 | type: Temporal 50 | factors: B=? H=? M=? L=? 51 | permutation: LMHB 52 | target: L2 53 | 54 | subtree: 55 | - node-type: Tile 56 | type: Spatial 57 | factors: B=? H=? M=? 58 | permutation: MHB 59 | target: L2 60 | split: 1 61 | 62 | subtree: 63 | - node-type: Tile 64 | type: Temporal 65 | factors: M=? L=? N=? 66 | permutation: NLM 67 | target: L1 68 | 69 | subtree: 70 | - node-type: Tile 71 | type: Spatial 72 | factors: M=? L=? 73 | permutation: LM 74 | target: L1 75 | split: 1 76 | 77 | subtree: 78 | - node-type: Tile 79 | type: Temporal 80 | factors: M=1 L=1 N=1 81 | permutation: NLM 82 | target: L0 83 | 84 | subtree: 85 | - node-type: Op 86 | name: ProduceO 87 | -------------------------------------------------------------------------------- /AE/validation/accelerator/map/map.yaml: -------------------------------------------------------------------------------- 1 | mapping: 2 | node-type: Tile 3 | type: temporal 4 | factors: M=MO 5 | target: MainMemory 6 | 7 | subtree: 8 | - node-type: Scope 9 | type: Sequential 10 | 11 | subtree: 12 | - node-type: Tile 13 | factors: K=KO L=LO 14 | type: temporal 15 | bypass: [C] 16 | target: MainMemory 17 | profile: False 18 | tag: op1 19 | 20 | 21 | subtree: 22 | - node-type: Tile 23 | type: temporal 24 | factors: K=KM L=LI M=MM 25 | permutation: LMK 26 | target: Cache 27 | tag: op1 28 | 29 | subtree: 30 | - node-type: Tile 31 | type: Spatial 32 | factors: M=SX K=SY 33 | split: 1 34 | permutation: MK 35 | target: Cache 36 | tag: op1 37 | 38 | subtree: 39 | - node-type: Tile 40 | type: temporal 41 | factors: 
M=1 L=1 K=1 42 | permutation: MLK 43 | target: RegFile 44 | tag: op1 45 | 46 | subtree: 47 | - node-type: Op 48 | name: GEMM1 49 | 50 | # A common spatial tile 51 | - node-type: Tile 52 | type: temporal 53 | factors: L=LO N=NO 54 | target: MainMemory 55 | profile: False 56 | bypass: [C] 57 | tag: op2 58 | 59 | subtree: 60 | - node-type: Tile 61 | type: temporal 62 | factors: M=MM L=LM N=NI 63 | permutation: NML 64 | target: Cache 65 | tag: op2 66 | 67 | subtree: 68 | - node-type: Tile 69 | type: Spatial 70 | factors: M=SX L=SY 71 | split: 1 72 | permutation: ML 73 | target: Cache 74 | tag: op2 75 | 76 | subtree: 77 | - node-type: Tile 78 | type: temporal 79 | factors: M=1 L=1 N=1 80 | permutation: MLN 81 | target: RegFile 82 | tag: op2 83 | 84 | subtree: 85 | - node-type: Op 86 | name: GEMM2 -------------------------------------------------------------------------------- /tutorials/01-self-attention/map/map.yaml: -------------------------------------------------------------------------------- 1 | mapping: 2 | node-type: Tile 3 | type: temporal 4 | factors: M=MO 5 | target: MainMemory 6 | 7 | subtree: 8 | - node-type: Scope 9 | type: Sequential 10 | 11 | subtree: 12 | - node-type: Tile 13 | factors: K=KO L=LO 14 | type: temporal 15 | bypass: [C] 16 | target: MainMemory 17 | profile: False 18 | tag: op1 19 | 20 | 21 | subtree: 22 | - node-type: Tile 23 | type: temporal 24 | factors: K=KM L=LI M=MM 25 | permutation: LMK 26 | target: Cache 27 | tag: op1 28 | 29 | subtree: 30 | - node-type: Tile 31 | type: Spatial 32 | factors: M=SX K=SY 33 | split: 1 34 | permutation: MK 35 | target: Cache 36 | tag: op1 37 | 38 | subtree: 39 | - node-type: Tile 40 | type: temporal 41 | factors: M=1 L=1 K=1 42 | permutation: MLK 43 | target: RegFile 44 | tag: op1 45 | 46 | subtree: 47 | - node-type: Op 48 | name: GEMM1 49 | 50 | # A common spatial tile 51 | - node-type: Tile 52 | type: temporal 53 | factors: L=LO N=NO 54 | target: MainMemory 55 | profile: False 56 | bypass: [C] 57 | 
tag: op2 58 | 59 | subtree: 60 | - node-type: Tile 61 | type: temporal 62 | factors: M=MM L=LM N=NI 63 | permutation: NML 64 | target: Cache 65 | tag: op2 66 | 67 | subtree: 68 | - node-type: Tile 69 | type: Spatial 70 | factors: M=SX L=SY 71 | split: 1 72 | permutation: ML 73 | target: Cache 74 | tag: op2 75 | 76 | subtree: 77 | - node-type: Tile 78 | type: temporal 79 | factors: M=1 L=1 N=1 80 | permutation: MLN 81 | target: RegFile 82 | tag: op2 83 | 84 | subtree: 85 | - node-type: Op 86 | name: GEMM2 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TileFlow 2 | 3 | This repo is an extension to `timeloop` to support more general simulation of tensor programs. 4 | 5 | ## Install 6 | 7 | 1. install required software/libs 8 | ```bash 9 | sudo apt install scons libconfig++-dev libboost-dev libboost-iostreams-dev libboost-serialization-dev libyaml-cpp-dev libncurses-dev libtinfo-dev libgpm-dev git build-essential python3-pip 10 | ``` 11 | 12 | 2. install tileflow 13 | ```bash 14 | git clone --recursive git@github.com:pku-liang/TileFlow.git 15 | cd TileFlow 16 | export TILEFLOW_BASE=$(pwd) 17 | 18 | # build timeloop 19 | cd 3rdparty/timeloop/src 20 | ln -s ../pat-public/src/pat . 21 | cd .. 22 | scons -j4 --static 23 | 24 | 25 | # build tileflow 26 | cd ../.. 27 | scons -j4 --static 28 | 29 | # add bin to path 30 | source ./setup-env.sh 31 | ``` 32 | 33 | 3. check installation 34 | 35 | ```bash 36 | # test parser 37 | cd ./tests/cases/08-test-2mm # a sample input for 2mm. 38 | tileflow arch/* prob/* map/* # the order is not important 39 | ``` 40 | 41 | 4. Run tutorials in `tutorials`. Run validation experiment in `AE/validation`. 
42 | 43 | ## Cite us 44 | ```bibtex 45 | @inproceedings{tileflow, 46 | author = {Size Zheng and 47 | Siyuan Chen and 48 | Siyuan Gao and 49 | Liancheng Jia and 50 | Guangyu Sun and 51 | Runsheng Wang and 52 | Yun Liang}, 53 | title = {TileFlow: {A} Framework for Modeling Fusion Dataflow via Tree-based 54 | Analysis}, 55 | booktitle = {Proceedings of the 56th Annual {IEEE/ACM} International Symposium 56 | on Microarchitecture, {MICRO} 2023, Toronto, ON, Canada, 28 October 57 | 2023 - 1 November 2023}, 58 | pages = {1271--1288}, 59 | publisher = {{ACM}}, 60 | year = {2023}, 61 | url = {https://doi.org/10.1145/3613424.3623792}, 62 | doi = {10.1145/3613424.3623792}, 63 | timestamp = {Sun, 31 Dec 2023 19:06:27 +0100}, 64 | biburl = {https://dblp.org/rec/conf/micro/0001CGJ0W023.bib}, 65 | bibsource = {dblp computer science bibliography, https://dblp.org} 66 | } 67 | ``` 68 | -------------------------------------------------------------------------------- /tutorials/00-GEMM/readme.md: -------------------------------------------------------------------------------- 1 | # Tutorial on GEMM 2 | In `TileFlow`, users specify the architecture, mapping, and problem description in `TileFlow`'s [frontend syntax](../../docs/frontend-syntax.md). 3 | This folder uses the general matrix multiply (GEMM) to demonstrate `TileFlow`'s workflow. 4 | 5 | In this example, we described a spatial [accelerator](arch/arch.yaml) with three memory levels. Further, we described the computation in `prob/prob.yaml`, and the dataflow (mapping) in `map/map.yaml`. To instantiate the mapping problem, we wrote a macro file (`macro.yaml`) that fixes the shape of the problem. 6 | 7 | To run `TileFlow`, please ensure the binary is in your system's path, and simply pass all configuration files as parameters (in any order): 8 | 9 | ```sh 10 | tileflow arch/arch.yaml prob/prob.yaml map/map.yaml macro.yaml 11 | ``` 12 | 13 | > Tip: All configuration files can be combined into one file. 
14 | 15 | In less than a second, TileFlow will output the best dataflow found so far, along with the profiling metrics, including latency, energy, data movement volume, etc. Example output is shown in `gemm.csv` and `gemm.mapping.txt`. 16 | 17 | Next, we will illustrate how a dataflow is described in TileFlow, i.e., the mapping description. In TileFlow, we design a dataflow by mapping each operator on the software side to each memory level of the hardware. For the matrix multiply example, the operator's mapping is represented as a chain of tile nodes, where every tile node describes the mapping of a memory level. For example, we map the computation of the `MainMemory` level using a temporal tile: 18 | 19 | ``` 20 | node-type: Tile 21 | type: temporal 22 | factors: M=MO N=NO K=KO 23 | permutation: KMN 24 | target: MainMemory 25 | ``` 26 | 27 | And we map the PE array using a spatial tile: 28 | 29 | ``` 30 | node-type: Tile 31 | type: spatial 32 | factors: M=16 K=16 33 | permutation: MK 34 | split: 1 35 | target: Cache 36 | multicast: true 37 | ``` 38 | > The `multicast` attribute indicates that the hardware is able to perform multicast. 39 | 40 | You are free to change the tiling factors and permutations of the tile node. Alternatively, you can replace concrete tile sizes with unspecified macros, whose values are then decided automatically by TileFlow. -------------------------------------------------------------------------------- /tests/cases/13-test-attention/map/flat.yaml: -------------------------------------------------------------------------------- 1 | mapping: 2 | node-type: Tile 3 | type: Temporal 4 | factors: B=? H=? M=? 5 | permutation: MHB 6 | target: L2 7 | 8 | subtree: 9 | - node-type: Scope 10 | type: Sequential 11 | 12 | subtree: 13 | - node-type: Tile 14 | type: Temporal 15 | factors: A=? 16 | permutation: A 17 | target: L2 18 | bypass: [C] 19 | profile: False 20 | 21 | subtree: 22 | - node-type: Tile 23 | type: Spatial 24 | factors: B=? H=? 
25 | permutation: HB 26 | target: L2 27 | split: 1 28 | 29 | subtree: 30 | - node-type: Tile 31 | type: Temporal 32 | factors: M=? L=? A=? 33 | permutation: ALM 34 | target: L1 35 | 36 | subtree: 37 | - node-type: Tile 38 | type: Spatial 39 | factors: M=? L=? 40 | permutation: LM 41 | target: L1 42 | split: 1 43 | 44 | subtree: 45 | - node-type: Tile 46 | type: Temporal 47 | factors: M=1 A=1 L=1 48 | permutation: LAM 49 | target: L0 50 | 51 | subtree: 52 | - node-type: Op 53 | name: ProduceC 54 | - node-type: Tile 55 | type: Temporal 56 | factors: L=? 57 | permutation: L 58 | target: L2 59 | bypass: [C] 60 | profile: False 61 | 62 | subtree: 63 | - node-type: Tile 64 | type: Spatial 65 | factors: B=? H=? 66 | permutation: HB 67 | target: L2 68 | split: 1 69 | 70 | subtree: 71 | - node-type: Tile 72 | type: Temporal 73 | factors: M=? L=? N=? 74 | permutation: NLM 75 | target: L1 76 | 77 | subtree: 78 | - node-type: Tile 79 | type: Spatial 80 | factors: M=? L=? 81 | permutation: LM 82 | target: L1 83 | split: 1 84 | 85 | subtree: 86 | - node-type: Tile 87 | type: Temporal 88 | factors: M=1 N=1 L=1 89 | permutation: LNM 90 | target: L0 91 | 92 | subtree: 93 | - node-type: Op 94 | name: ProduceO 95 | -------------------------------------------------------------------------------- /include/application/model.hpp: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "mapping/parser.hpp" 14 | #include "mapping/arch-properties.hpp" 15 | #include "mapping/constraints.hpp" 16 | #include "compound-config/compound-config.hpp" 17 | #include "model/sparse-optimization-parser.hpp" 18 | 19 | //--------------------------------------------// 20 | // Application // 21 | //--------------------------------------------// 22 | 23 | class Application 24 | { 25 | public: 26 | std::string name_; 27 | 28 | struct 
Stats 29 | { 30 | double energy; 31 | double cycles; 32 | }; 33 | 34 | protected: 35 | // Critical state. 36 | std::vector workloads_; 37 | model::Engine::Specs arch_specs_; 38 | 39 | // Many of the following submodules are dynamic objects because 40 | // we can only instantiate them after certain config files have 41 | // been parsed. 42 | 43 | // The mapping. 44 | Mapping* mapping_; 45 | 46 | // Abstract representation of the architecture. 47 | ArchProperties* arch_props_; 48 | 49 | // Constraints. 50 | mapping::Constraints* constraints_; 51 | 52 | // Application flags/config. 53 | bool verbose_ = false; 54 | bool auto_bypass_on_failure_ = false; 55 | std::string out_prefix_; 56 | 57 | // Sparse optimization 58 | sparse::SparseOptimizationInfo* sparse_optimizations_; 59 | 60 | private: 61 | 62 | // Serialization 63 | friend class boost::serialization::access; 64 | template 65 | void serialize(Archive& ar, const unsigned int version = 0); 66 | 67 | public: 68 | 69 | Application(config::CompoundConfig* config, 70 | std::string output_dir = ".", 71 | std::string name = "timeloop-model"); 72 | 73 | // This class does not support being copied 74 | Application(const Application&) = delete; 75 | Application& operator=(const Application&) = delete; 76 | 77 | ~Application(); 78 | 79 | // Run the evaluation. 
80 | Stats Run(); 81 | }; 82 | 83 | -------------------------------------------------------------------------------- /include/tileflow/loop-analysis/memory-state.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "loop-analysis/loop-state.hpp" 4 | 5 | namespace analysis { 6 | 7 | namespace TileFlow { 8 | 9 | 10 | /** 11 | * \brief the dataspaces 12 | */ 13 | struct MemoryState { 14 | static const problem::Workload* workload_; 15 | std::unordered_map data_spaces_; 16 | 17 | MemoryState& Union(const MemoryState& other); 18 | MemoryState& Substract(const MemoryState& other); 19 | MemoryState& Add(const MemoryState& other); 20 | MemoryState& Intersect(const MemoryState& other); 21 | 22 | // add 23 | MemoryState& operator += (const MemoryState& other); 24 | // union 25 | MemoryState& operator |= (const MemoryState& other); 26 | // intersect 27 | MemoryState& operator &= (const MemoryState& other); 28 | // substract 29 | MemoryState& operator -= (const MemoryState& other); 30 | 31 | MemoryState operator - (const MemoryState& other); 32 | 33 | problem::OperationSpace& operator[] (int id) { 34 | if (data_spaces_.count(id) == 0) { 35 | data_spaces_.emplace(id, workload_); 36 | } 37 | return data_spaces_.at(id); 38 | } 39 | 40 | inline const problem::OperationSpace& at(std::uint64_t idx) const { 41 | return data_spaces_.at(idx); 42 | } 43 | 44 | void insert(std::uint64_t spatial_id, 45 | const problem::OperationPoint& low_point, 46 | const problem::OperationPoint& high_point); 47 | 48 | void insert(std::uint64_t spatial_id, 49 | const problem::OperationSpace& data_space); 50 | 51 | MemoryState() = default; 52 | 53 | MemoryState(std::uint64_t id, const problem::OperationSpace& data_space){ 54 | data_spaces_.emplace(id, data_space); 55 | } 56 | 57 | const std::unordered_map& 58 | getDataSpaces() const {return data_spaces_;} 59 | 60 | 61 | static void set_workload(const problem::Workload* workload) { 62 | 
MemoryState::workload_ = workload;} 63 | 64 | void show() const; 65 | }; 66 | 67 | } // namespace TileFlow 68 | 69 | } // namespace analysis -------------------------------------------------------------------------------- /tests/cases/13-test-attention/map/chimera.yaml: -------------------------------------------------------------------------------- 1 | mapping: 2 | node-type: Tile 3 | type: Temporal 4 | factors: B=? H=? M=? L=? 5 | permutation: LMHB 6 | target: L2 7 | 8 | subtree: 9 | - node-type: Scope 10 | type: Sequential 11 | 12 | subtree: 13 | - node-type: Tile 14 | type: Temporal 15 | factors: A=? 16 | permutation: A 17 | target: L2 18 | bypass: [C] 19 | profile: False 20 | 21 | subtree: 22 | - node-type: Tile 23 | type: Spatial 24 | factors: B=? H=? M=? 25 | permutation: MHB 26 | target: L2 27 | split: 1 28 | 29 | subtree: 30 | - node-type: Tile 31 | type: Temporal 32 | factors: M=? L=? A=? 33 | permutation: ALM 34 | target: L1 35 | tag: op1 36 | 37 | subtree: 38 | - node-type: Tile 39 | type: Spatial 40 | factors: M=? L=? 41 | permutation: LM 42 | target: L1 43 | split: 1 44 | tag: op1 45 | 46 | subtree: 47 | - node-type: Tile 48 | type: Temporal 49 | factors: M=1 A=1 L=1 50 | permutation: LAM 51 | target: L0 52 | tag: op1 53 | 54 | subtree: 55 | - node-type: Op 56 | name: ProduceC 57 | - node-type: Tile 58 | type: Temporal 59 | factors: N=? 60 | permutation: N 61 | target: L2 62 | bypass: [C] 63 | profile: False 64 | 65 | subtree: 66 | - node-type: Tile 67 | type: Spatial 68 | factors: B=? H=? M=? 69 | permutation: MHB 70 | target: L2 71 | split: 1 72 | 73 | subtree: 74 | - node-type: Tile 75 | type: Temporal 76 | factors: M=? L=? N=? 77 | permutation: NLM 78 | target: L1 79 | 80 | subtree: 81 | - node-type: Tile 82 | type: Spatial 83 | factors: M=? L=? 
84 | permutation: LM 85 | target: L1 86 | split: 1 87 | 88 | subtree: 89 | - node-type: Tile 90 | type: Temporal 91 | factors: M=1 L=1 N=1 92 | permutation: NLM 93 | target: L0 94 | 95 | subtree: 96 | - node-type: Op 97 | name: ProduceO 98 | -------------------------------------------------------------------------------- /tests/scripts/parser.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "application/model.hpp" 6 | #include "compound-config/compound-config.hpp" 7 | #include "util/args.hpp" 8 | 9 | #include "tileflow/problem/problem.hpp" 10 | #include "tileflow/mapping/mapping.hpp" 11 | #include "tileflow/loop-analysis/nest-analysis.hpp" 12 | #include "tileflow/model/topology.hpp" 13 | 14 | extern bool gTerminateEval; 15 | 16 | //--------------------------------------------// 17 | // MAIN // 18 | //--------------------------------------------// 19 | 20 | int main(int argc, char* argv[]) 21 | { 22 | assert(argc >= 2); 23 | 24 | std::vector input_files; 25 | std::string output_dir = "."; 26 | bool success = ParseArgs(argc, argv, input_files, output_dir); 27 | if (!success) 28 | { 29 | std::cerr << "ERROR: error parsing command line." << std::endl; 30 | exit(1); 31 | } 32 | 33 | auto config = new config::CompoundConfig(input_files); 34 | 35 | auto root = config->getRoot(); 36 | 37 | auto problem = root.lookup("problem"); 38 | problem::TileFlow::Workloads workloads; 39 | 40 | 41 | config::CompoundConfigNode arch; 42 | 43 | if (root.exists("arch")) 44 | { 45 | arch = root.lookup("arch"); 46 | } 47 | else if (root.exists("architecture")) 48 | { 49 | arch = root.lookup("architecture"); 50 | } 51 | 52 | bool is_sparse_topology = root.exists("sparse_optimizations"); 53 | 54 | model::Engine::Specs arch_specs_ = model::Engine::ParseSpecs(arch, is_sparse_topology); 55 | 56 | std::cout << "Begin ParseWorkload..." 
<< std::endl; 57 | problem::TileFlow::ParseWorkloads(problem, workloads); 58 | 59 | auto mapping = mapping::TileFlow::ParseAndConstruct(root.lookup("mapping"), arch_specs_, workloads); 60 | 61 | mapping.Print(); 62 | 63 | workloads.Print(); 64 | 65 | problem::Workload::SetCurrShape(&workloads.get_shape()); 66 | 67 | model::TileFlow::Topology topology_; 68 | 69 | std::cout << "Begin Spec..." << std::endl; 70 | topology_.Spec(arch_specs_.topology); 71 | 72 | analysis::TileFlow::NestAnalysis analysis(workloads, mapping, arch_specs_); 73 | analysis.analyze(); 74 | analysis.Print(); 75 | 76 | std::cout << "Begin eval..." << std::endl; 77 | 78 | topology_.eval(mapping, analysis); 79 | 80 | std::cout << "Parser check passed!" << std::endl; 81 | 82 | return 0; 83 | } 84 | 85 | /** 86 | - ComputePartitionSizes 87 | - partition_size = partition_size * tile_nest[cur].size / tile_nest[cur].size or master spatial level size 88 | - ComputeParentAccessShare: 89 | - Compute the accesses by each fanout; 90 | - accumulated in parent_access_share; 91 | */ 92 | -------------------------------------------------------------------------------- /tests/cases/12-test-fused-cnn/parser.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | def main(input_file, output_file): 4 | output = 'mapping:\n' 5 | with open(input_file, 'r') as f: 6 | for line in f.readlines(): 7 | keywords = line.strip().split(' ') 8 | if len(keywords) == 0: break 9 | keyword = keywords[0] 10 | keywords = keywords[1:] 11 | ntabs = 0 12 | while ntabs < len(line) and line[ntabs] in ' \t': 13 | ntabs += 1 14 | ntabs = ntabs // 2 + 1 15 | # print (len(re.findall('(\t*)[A-Za-z].*', line)[0])) 16 | if keyword == 'for' or keyword == 'pfor': 17 | varnames, bounds, tags = re.findall(r'p?for (.*) in [\(\[](.*)[\)\]]:\s*(#.*)\s*', line)[0] 18 | varnames = [x.strip().upper() for x in varnames.strip().split(',')] 19 | bounds = [x.strip() for x in bounds.strip().split(',')] 20 | 
def main(input_file, output_file):
    """Translate a loop-nest sketch into a TileFlow ``mapping:`` YAML file.

    Every non-blank line of ``input_file`` describes one node. Its nesting
    depth is ``leading_whitespace_chars // 2 + 1`` (spaces and tabs each count
    as one character). The first word selects the node kind:

    * ``for <vars> in [<bounds>]: # key: value, ...``  -> temporal ``Tile``
    * ``pfor <vars> in [<bounds>]: # key: value, ...`` -> spatial ``Tile``
    * ``scope <Type>``                                  -> ``Scope``
    * ``endscope``                                      -> ignored
    * ``op <Name>``                                     -> ``Op``

    For tiles, the variable names are upper-cased, paired with the bounds as
    ``factors``, and written in reverse order as ``permutation``. Each
    ``key: value`` pair in the trailing ``#`` comment becomes one extra
    attribute line. Blank lines are skipped.

    Args:
        input_file: path of the sketch to read.
        output_file: path of the YAML file to write (overwritten).

    Raises:
        ValueError: a ``for``/``pfor``/``scope``/``op`` line or one of its
            tags is malformed.
        AssertionError: the number of variables and bounds differ.
        NotImplementedError: the first word of a line is not a known keyword.
    """
    # Two spaces per level: the list marker '- ' is two characters wide, so a
    # node's first key ('- node-type') lines up with its remaining keys.
    indent = '  '
    chunks = ['mapping:\n']

    with open(input_file, 'r') as f:
        for lineno, line in enumerate(f, start=1):
            stripped = line.strip()
            if not stripped:
                # ''.split(' ') yields [''], never [], so blank lines have to
                # be detected on the stripped text itself.
                continue

            keyword = stripped.split(' ')[0]
            lead = len(line) - len(line.lstrip(' \t'))
            depth = lead // 2 + 1
            pad = indent * depth
            item = indent * (depth - 1) + '- '
            # The root node hangs directly under 'mapping:' and is not a list item.
            head = pad if depth == 1 else item

            if keyword in ('for', 'pfor'):
                found = re.findall(r'p?for (.*) in [\(\[](.*)[\)\]]:\s*(#.*)\s*', line)
                if not found:
                    raise ValueError(
                        f'line {lineno}: expected '
                        f'"{keyword} <vars> in [<bounds>]: # key: value, ...", '
                        f'got {stripped!r}')
                varnames, bounds, tags = found[0]
                varnames = [x.strip().upper() for x in varnames.strip().split(',')]
                bounds = [x.strip() for x in bounds.strip().split(',')]
                assert len(varnames) == len(bounds), \
                    f'line {lineno}: {len(varnames)} variables but {len(bounds)} bounds'

                chunks.append(head + 'node-type: Tile\n')
                chunks.append(pad + 'type: ' + ('Temporal' if keyword == 'for' else 'Spatial') + '\n')
                chunks.append(pad + 'factors: ' + ' '.join(f'{k}={v}' for k, v in zip(varnames, bounds)) + '\n')
                chunks.append(pad + 'permutation: ' + ''.join(reversed(varnames)) + '\n')
                for tag in tags.strip('# ').split(','):
                    if not tag.strip():
                        continue  # tolerate a trailing or doubled comma
                    key, sep, value = tag.partition(':')
                    if not sep:
                        raise ValueError(
                            f'line {lineno}: tag {tag.strip()!r} is not of the form "key: value"')
                    chunks.append(pad + f'{key.strip()}: {value.strip()}\n')
                chunks.append('\n')
                chunks.append(pad + 'subtree:\n')
            elif keyword == 'scope':
                found = re.findall(r'scope (\w+)', line)
                if not found:
                    raise ValueError(f'line {lineno}: expected "scope <Type>", got {stripped!r}')
                chunks.append(head + 'node-type: Scope\n')
                chunks.append(pad + 'type: ' + found[0] + '\n')
                chunks.append('\n')
                chunks.append(pad + 'subtree:\n')
            elif keyword == 'endscope':
                pass
            elif keyword == 'op':
                found = re.findall(r'op (\w+)', line)
                if not found:
                    raise ValueError(f'line {lineno}: expected "op <Name>", got {stripped!r}')
                # Ops are always emitted as list items.
                chunks.append(item + 'node-type: Op\n')
                chunks.append(pad + 'name: ' + found[0] + '\n')
            else:
                raise NotImplementedError(f'line {lineno}: unknown keyword {keyword!r}')

    with open(output_file, 'w') as f:
        f.write(''.join(chunks))
-------------------------------------------------------------------------------- /src/loop-analysis/memory-state.cpp: -------------------------------------------------------------------------------- 1 | #include "tileflow/loop-analysis/memory-state.hpp" 2 | 3 | 4 | namespace analysis { 5 | 6 | namespace TileFlow { 7 | 8 | MemoryState& MemoryState::Union(const MemoryState& other){ 9 | for (auto& kv: other.data_spaces_) { 10 | if (data_spaces_.count(kv.first) == 0) { 11 | data_spaces_.emplace(kv.first, kv.second); 12 | } 13 | else { 14 | data_spaces_.at(kv.first) += kv.second; 15 | } 16 | } 17 | return *this; 18 | } 19 | 20 | MemoryState& MemoryState::Substract(const MemoryState& other){ 21 | for (auto& kv: data_spaces_) { 22 | if (other.data_spaces_.count(kv.first)) 23 | kv.second = kv.second - other.data_spaces_.at(kv.first); 24 | } 25 | return *this; 26 | } 27 | 28 | MemoryState& MemoryState::Add(const MemoryState& other){ 29 | for (auto& kv: other.data_spaces_) { 30 | if (data_spaces_.count(kv.first) == 0) { 31 | data_spaces_.emplace(kv.first, kv.second); 32 | } 33 | else { 34 | data_spaces_.at(kv.first) += kv.second; 35 | } 36 | } 37 | return *this; 38 | } 39 | 40 | MemoryState& MemoryState::Intersect(const MemoryState& ){ 41 | // TODO: realize real intersection logic here. 
42 | return *this; 43 | } 44 | 45 | void MemoryState::insert(std::uint64_t spatial_id, 46 | const problem::OperationPoint& low_point, 47 | const problem::OperationPoint& high_point) { 48 | data_spaces_.emplace(spatial_id, problem::OperationSpace(workload_, low_point, high_point)); 49 | } 50 | 51 | void MemoryState::insert(std::uint64_t spatial_id, 52 | const problem::OperationSpace& data_space) { 53 | data_spaces_.emplace(spatial_id, data_space); 54 | } 55 | 56 | const problem::Workload* MemoryState::workload_; 57 | 58 | void MemoryState::show() const{ 59 | for (auto kv: data_spaces_) { 60 | std::cout << kv.first << ":"; 61 | kv.second.Print(std::cout); 62 | std::cout << std::endl; 63 | } 64 | std::cout << std::endl; 65 | } 66 | 67 | MemoryState MemoryState::operator - (const MemoryState& other) { 68 | MemoryState ret; 69 | for (auto& kv: data_spaces_) { 70 | if (other.getDataSpaces().count(kv.first)) { 71 | ret.data_spaces_.emplace(kv.first, kv.second - other.getDataSpaces().at(kv.first)); 72 | } 73 | else { 74 | ret.data_spaces_.emplace(kv.first, kv.second); 75 | } 76 | } 77 | return ret; 78 | } 79 | 80 | } // namespace TileFlow 81 | 82 | } // namespace analysis -------------------------------------------------------------------------------- /tests/cases/11-fail-domino-self-attention/prob.yaml: -------------------------------------------------------------------------------- 1 | problem: 2 | io: 3 | ins: Q K V 4 | outs: I 5 | dimensions: [H,M,N,A,L] 6 | instance: 7 | H: 16 8 | M: 512 9 | N: 64 10 | A: 64 11 | L: 512 12 | 13 | ops: 14 | - name: ProduceC 15 | dimensions: [H,A,L,M] 16 | data-spaces: 17 | - name: C 18 | projection: 19 | - [[H]] 20 | - [[M]] 21 | - [[L]] 22 | read-write: True 23 | - name: K 24 | projection: 25 | - [[H]] 26 | - [[A]] 27 | - [[L]] 28 | - name: Q 29 | projection: 30 | - [[H]] 31 | - [[M]] 32 | - [[A]] 33 | ins: K, Q 34 | out: C 35 | 36 | - name: ProduceB 37 | dimensions: [H,M,L] 38 | data-spaces: 39 | - name: B 40 | projection: 41 | - 
namespace TileFlow {

namespace Op {


// Combines `exprs` (taken by value) into a single expression, presumably
// their maximum, going by the name. The list must not be empty; this is only
// enforced by assert. A single expression is returned as is instead of being
// wrapped in a new node.
std::shared_ptr max(std::vector > exprs){
    assert(exprs.size());
    if (exprs.size() == 1) return exprs.front();
    return std::make_shared(exprs);
}

// Same contract as max() above, for a sum (going by the name): non-empty
// input required (assert only), a single expression is passed through.
std::shared_ptr sum(std::vector > exprs){
    assert(exprs.size());
    if (exprs.size() == 1) return exprs.front();
    return std::make_shared(exprs);
}

// Builds a condition node from `expr`, `limit` and the comparison kind
// PairCondExpr::type_t::LEQ.
std::shared_ptr operator <= (
    const std::shared_ptr& expr,
    const std::shared_ptr& limit
){
    return std::make_shared (
        expr,
        limit,
        PairCondExpr::type_t::LEQ
    );
}

// Convenience overload: wraps both integers with parameter() and forwards to
// the expression-based pair() below.
std::shared_ptr pair(int x, int y) {
    return pair(
        parameter(x),parameter(y)
    );
}

// Builds a node from the two expressions `x` and `y`.
std::shared_ptr pair(
    const std::shared_ptr& x,
    const std::shared_ptr& y) {
        return std::make_shared(x,y);
}

// Combines `exprs` into a single expression, presumably their product.
// An empty list yields parameter(1); a single expression is returned as is.
std::shared_ptr product(std::vector>& exprs) {
    if (!exprs.size()) return parameter(1);
    if (exprs.size() == 1) return exprs.front();
    return std::make_shared(exprs);
}

// Initializer-list overload. Unlike the vector overload above it always
// creates a new node, also for an empty or single-element list.
std::shared_ptr product(std::initializer_list> exprs) {
    return std::make_shared(exprs);
}

// Builds a node directly from the given std::pair.
std::shared_ptr product(const std::pair >& exprs){
    return std::make_shared(exprs);
}

// Reference overload of sum(): an empty list yields parameter(0) instead of
// tripping an assert; a single expression is returned as is.
// NOTE(review): if the by-value sum() above takes the same vector type, a
// call with an lvalue vector matches both overloads — confirm callers resolve
// unambiguously.
std::shared_ptr sum(std::vector >& exprs) {
    if (!exprs.size()) return parameter(0);
    if (exprs.size() == 1) return exprs.front();
    return std::make_shared(exprs);
}

// Reference overload of max(): an empty list yields parameter(0) instead of
// tripping an assert; a single expression is returned as is.
// NOTE(review): same overload caveat as for sum() above.
std::shared_ptr max(std::vector >& exprs) {
    if (!exprs.size()) return parameter(0);
    if (exprs.size() == 1) return exprs.front();
    return std::make_shared(exprs);
}

// Builds a node from the integer `x`, presumably a variable index.
std::shared_ptr variable(int x) {
    return std::make_shared(x);
}

// Builds a condition node from `left`, `right` and CondExpr::EQU.
std::shared_ptr operator == (
    const std::shared_ptr& left,
    const std::shared_ptr& right
){
    return std::make_shared(
        left, right, CondExpr::EQU
    );
}

// Builds a condition node from `left`, `right` and CondExpr::LEQ.
std::shared_ptr operator <= (
    const std::shared_ptr& left,
    const std::shared_ptr& right
){
    return std::make_shared(
        left, right, CondExpr::LEQ
    );
}

// Builds a node holding the numeric value `x`.
std::shared_ptr parameter(num_t x) {
    return std::make_shared(x);
}

} // namespace Op

} // namespace TileFlow
coefficients_[-1] = 1; 53 | common_shape_.DefaultCoefficients[-1] = 1;} 54 | bool add_workload(const std::string& name, std::shared_ptr& workload); 55 | std::shared_ptr get_workload(const std::string & op_name) const { 56 | TILEFLOW_ASSERT(workloads_.count(op_name), op_name << " Not FOUND"); 57 | return workloads_.at(op_name); 58 | } 59 | void set_io(const std::vector& ins, const std::vector& outs); 60 | void set_coeffs(const config::CompoundConfigNode& coeffs); 61 | void set_factorized_bound(const std::string& dim, int bound); 62 | void set_dims(const std::vector& dims); 63 | 64 | void Print(); 65 | 66 | const std::vector& get_ins() const {return ins_;} 67 | const std::vector& get_outs() const {return outs_;} 68 | const problem::Workload& get_workload() const; 69 | const problem::Shape& get_shape() const {return common_shape_;} 70 | 71 | friend class Workload; 72 | }; 73 | 74 | void ParseWorkloads(config::CompoundConfigNode config, Workloads& workloads_); 75 | 76 | } // namespace TileFlow 77 | 78 | } // namespace problem -------------------------------------------------------------------------------- /tests/cases/07-test-fusion-attention/result/attention-fused.csv: -------------------------------------------------------------------------------- 1 | ,value 2 | Cycle,2490368 3 | Energy,1.08711e+11 4 | L2::Fill::V, 0 5 | L2::Update::V, 0 6 | L2::Read::V, 3.14573e+06 7 | L2::Fill::J, 0 8 | L2::Fill::I, 0 9 | L2::Update::I, 0 10 | L2::Fill::Q, 0 11 | L2::Read::K, 786432 12 | L2::Update, 3.14573e+06 13 | L2::Update::D, 0 14 | L2::Read::Q, 393216 15 | L2::Update::K, 0 16 | L2::Fill::C, 0 17 | L2::Fill::G, 0 18 | L2::Fill::F, 0 19 | L2::Update::C, 0 20 | L2::Fill::E, 0 21 | L2::Read::C, 0 22 | L2::Fill::D, 0 23 | L2::CapUtil, 8.64e-06 24 | L2::SpatialUtil, 1 25 | L2::SlowDown, 1.06645 26 | L2::Fill::K, 0 27 | L2::Accesses, 1.06168e+07 28 | L2::Update::Q, 0 29 | L2::Fill, 0 30 | L2::Read::D, 0 31 | L2::Read::E, 0 32 | L2::Update::J, 3.14573e+06 33 | L2::Update::G, 0 34 | 
L2::Update::E, 0 35 | L2::Read::J, 3.14573e+06 36 | L2::Update::F, 0 37 | L2::Read::F, 0 38 | L2::Read::G, 0 39 | L2::Read, 7.4711e+06 40 | L2::Read::I, 0 41 | L1::Fill::V, 786432 42 | L1::Update::V, 0 43 | L1::Read::V, 3.14573e+06 44 | L1::Fill::J, 786432 45 | L1::Fill::I, 786432 46 | L1::Update::I, 786432 47 | L1::Fill::Q, 98304 48 | L1::Read::K, 5.03316e+07 49 | L1::Update, 3.34479e+07 50 | L1::Update::D, 393216 51 | L1::Read::Q, 393216 52 | L1::Update::K, 0 53 | L1::Fill::C, 2.94912e+06 54 | L1::Fill::G, 1536 55 | L1::Fill::F, 1.17965e+06 56 | L1::Update::C, 2.51658e+07 57 | L1::Fill::E, 786432 58 | L1::Read::C, 3.14573e+07 59 | L1::Fill::D, 4608 60 | L1::CapUtil, 0.192 61 | L1::SpatialUtil, 1 62 | L1::SlowDown, 8.53516 63 | L1::Fill::K, 786432 64 | L1::Accesses, 5.16502e+08 65 | L1::Update::Q, 0 66 | L1::Fill, 8.16538e+06 67 | L1::Read::D, 494592 68 | L1::Read::E, 3.93216e+06 69 | L1::Update::J, 3.14573e+06 70 | L1::Update::G, 24576 71 | L1::Update::E, 3.14573e+06 72 | L1::Read::J, 3.14573e+06 73 | L1::Update::F, 786432 74 | L1::Read::F, 3.14573e+06 75 | L1::Read::G, 122880 76 | L1::Read, 1.00101e+08 77 | L1::Read::I, 3.93216e+06 78 | L0::Fill::V, 49149 79 | L0::Update::V, 0 80 | L0::Read::V, 393216 81 | L0::Fill::J, 49149 82 | L0::Fill::I, 49914 83 | L0::Update::I, 98304 84 | L0::Fill::Q, 49150 85 | L0::Read::K, 196608 86 | L0::Update, 1.2073e+06 87 | L0::Update::D, 393216 88 | L0::Read::Q, 196608 89 | L0::Update::K, 0 90 | L0::Fill::C, 53752 91 | L0::Fill::G, 2298 92 | L0::Fill::F, 3064 93 | L0::Update::C, 196608 94 | L0::Fill::E, 3835 95 | L0::Read::C, 688128 96 | L0::Fill::D, 4602 97 | L0::CapUtil, 0.25 98 | L0::SpatialUtil, 0.25 99 | L0::SlowDown, 1 100 | L0::Fill::K, 49150 101 | L0::Accesses, 1.89293e+09 102 | L0::Update::Q, 0 103 | L0::Fill, 314063 104 | L0::Read::D, 491520 105 | L0::Read::E, 101376 106 | L0::Update::J, 393216 107 | L0::Update::G, 24576 108 | L0::Update::E, 98304 109 | L0::Read::J, 393216 110 | L0::Update::F, 3072 111 | L0::Read::F, 
125952 112 | L0::Read::G, 122880 113 | L0::Read, 3.20102e+06 114 | L0::Read::I, 491520 115 | mac::Flops, 1.2073e+06 116 | MEM::L0,1 117 | MEM::L1,0.136 118 | MEM::L0,0.666667 119 | MEM::L1,0.12825 120 | MEM::L0,1 121 | MEM::L1,0.5125 122 | MEM::L0,0.666667 123 | MEM::L1,0.128 124 | MEM::L0,0.666667 125 | MEM::L1,0.12825 126 | MEM::L0,1 127 | MEM::L1,0.128125 128 | MEM::L0,1 129 | MEM::L1,0.192 130 | MEM::L2,7.695e-06 131 | SPATIAL::L1,0.25 132 | SPATIAL::L1,0.00390625 133 | SPATIAL::L1,0.03125 134 | SPATIAL::L1,0.25 135 | SPATIAL::L1,0.0625 136 | SPATIAL::L1,0.0078125 137 | SPATIAL::L1,0.125 138 | SPATIAL::L2,1 139 | SPATIAL::L2,0.25 140 | -------------------------------------------------------------------------------- /tests/cases/07-test-fusion-attention/result/attention-nofuse.csv: -------------------------------------------------------------------------------- 1 | ,value 2 | Cycle,4587520 3 | Energy,8.27667e+10 4 | L2::Fill::V, 3.14573e+06 5 | L2::Update::V, 0 6 | L2::Read::V, 3.14573e+06 7 | L2::Fill::J, 3.14573e+06 8 | L2::Fill::I, 0 9 | L2::Update::I, 0 10 | L2::Fill::Q, 393216 11 | L2::Read::K, 786432 12 | L2::Update, 6.29146e+06 13 | L2::Update::D, 0 14 | L2::Read::Q, 393216 15 | L2::Update::K, 0 16 | L2::Fill::C, 3.14573e+06 17 | L2::Fill::G, 0 18 | L2::Fill::F, 0 19 | L2::Update::C, 3.14573e+06 20 | L2::Fill::E, 0 21 | L2::Read::C, 6.29146e+06 22 | L2::Fill::D, 0 23 | L2::CapUtil, 1.152e-05 24 | L2::SpatialUtil, 1 25 | L2::SlowDown, 1.11486 26 | L2::Fill::K, 393216 27 | L2::Accesses, 3.02776e+07 28 | L2::Update::Q, 0 29 | L2::Fill, 1.02236e+07 30 | L2::Read::D, 0 31 | L2::Read::E, 0 32 | L2::Update::J, 3.14573e+06 33 | L2::Update::G, 0 34 | L2::Update::E, 0 35 | L2::Read::J, 3.14573e+06 36 | L2::Update::F, 0 37 | L2::Read::F, 0 38 | L2::Read::G, 0 39 | L2::Read, 1.37626e+07 40 | L2::Read::I, 0 41 | L1::Fill::V, 786432 42 | L1::Update::V, 0 43 | L1::Read::V, 6.29146e+06 44 | L1::Fill::J, 786432 45 | L1::Fill::I, 1.37626e+06 46 | L1::Update::I, 786432 47 | 
L1::Fill::Q, 98304 48 | L1::Read::K, 6.71089e+07 49 | L1::Update, 1.80224e+07 50 | L1::Update::D, 786432 51 | L1::Read::Q, 2.09715e+06 52 | L1::Update::K, 0 53 | L1::Fill::C, 1.57286e+06 54 | L1::Fill::G, 1536 55 | L1::Fill::F, 1.17965e+06 56 | L1::Update::C, 8.38861e+06 57 | L1::Fill::E, 786432 58 | L1::Read::C, 1.07479e+07 59 | L1::Fill::D, 1536 60 | L1::CapUtil, 0.192 61 | L1::SpatialUtil, 1 62 | L1::SlowDown, 4.93333 63 | L1::Fill::K, 196608 64 | L1::Accesses, 3.68772e+08 65 | L1::Update::Q, 0 66 | L1::Fill, 6.78605e+06 67 | L1::Read::D, 792576 68 | L1::Read::E, 1.57286e+06 69 | L1::Update::J, 6.29146e+06 70 | L1::Update::G, 196608 71 | L1::Update::E, 786432 72 | L1::Read::J, 6.29146e+06 73 | L1::Update::F, 786432 74 | L1::Read::F, 2.3593e+06 75 | L1::Read::G, 245760 76 | L1::Read, 1.04978e+08 77 | L1::Read::I, 7.4711e+06 78 | L0::Fill::V, 98301 79 | L0::Update::V, 0 80 | L0::Read::V, 1.57286e+06 81 | L0::Fill::J, 98301 82 | L0::Fill::I, 99066 83 | L0::Update::I, 49152 84 | L0::Fill::Q, 65533 85 | L0::Read::K, 262144 86 | L0::Update, 3.26656e+06 87 | L0::Update::D, 786432 88 | L0::Read::Q, 262144 89 | L0::Update::K, 0 90 | L0::Fill::C, 67831 91 | L0::Fill::G, 1530 92 | L0::Fill::F, 2295 93 | L0::Update::C, 262144 94 | L0::Fill::E, 1530 95 | L0::Read::C, 1.05472e+06 96 | L0::Fill::D, 2298 97 | L0::CapUtil, 0.1875 98 | L0::SpatialUtil, 0.1875 99 | L0::SlowDown, 1 100 | L0::Fill::K, 65533 101 | L0::Accesses, 1.8542e+09 102 | L0::Update::Q, 0 103 | L0::Fill, 502218 104 | L0::Read::D, 792576 105 | L0::Read::E, 399360 106 | L0::Update::J, 1.57286e+06 107 | L0::Update::G, 196608 108 | L0::Update::E, 6144 109 | L0::Read::J, 1.57286e+06 110 | L0::Update::F, 393216 111 | L0::Read::F, 638976 112 | L0::Read::G, 245760 113 | L0::Read, 8.42342e+06 114 | L0::Read::I, 1.62202e+06 115 | mac::Flops, 3.26656e+06 116 | MEM::L0,1 117 | MEM::L1,0.176 118 | MEM::L2,4.8e-06 119 | MEM::L0,0.666667 120 | MEM::L1,0.12825 121 | MEM::L0,1 122 | MEM::L1,0.128125 123 | MEM::L0,0.666667 124 | 
MEM::L1,0.128 125 | MEM::L0,0.666667 126 | MEM::L1,0.064125 127 | MEM::L0,1 128 | MEM::L1,0.128125 129 | MEM::L0,1 130 | MEM::L1,0.384 131 | MEM::L2,1.152e-05 132 | SPATIAL::L1,0.25 133 | SPATIAL::L2,0.75 134 | SPATIAL::L1,0.00195312 135 | SPATIAL::L1,0.125 136 | SPATIAL::L1,0.00195312 137 | SPATIAL::L1,0.00390625 138 | SPATIAL::L1,0.015625 139 | SPATIAL::L1,0.0625 140 | SPATIAL::L2,1 141 | SPATIAL::L2,0.25 142 | -------------------------------------------------------------------------------- /tests/SConscript: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | Import('env') 4 | 5 | env.Append(CPPDEFINES = [('BUILD_BASE_DIR', '\\"' + env["BUILD_BASE_DIR"] + '\\"')]) 6 | env.Append(CPPPATH = [os.path.join(env['BUILD_BASE_DIR'], 'include')]) 7 | env.Append(CPPPATH = [os.path.join(env['TIMELOOP_BASE_DIR'], 'include')]) 8 | #env.Append(CPPPATH = ['src/include']) 9 | #env["CPPPATH"] = ["."] 10 | 11 | 12 | if GetOption('debug'): 13 | env.Append(CCFLAGS = ['-g', '-O0']) 14 | else: 15 | env.Append(CCFLAGS = ['-g', '-O3', '-flto']) 16 | 17 | env.Append(CCFLAGS = ['-Wall', '-Wextra', '-std=c++17', '-pthread']) 18 | 19 | if GetOption('clang'): 20 | env.Append(CCFLAGS = ['-ferror-limit=1']) 21 | else: 22 | env.Append(CCFLAGS = ['-fmax-errors=1']) 23 | 24 | env.Append(LIBPATH = ['.', os.path.join(env['BUILD_BASE_DIR'], 'lib')]) 25 | # If we are doing a static build, the timeloop library must be the first 26 | # item in the link order. 
27 | if GetOption('link_static'): 28 | env.Append(LIBS = ['timeloop-mapper']) 29 | 30 | env.Append(LINKFLAGS = ['-std=c++17', '-pthread']) 31 | if str(Platform()) != 'darwin': 32 | env.Append(LINKFLAGS = ['-static-libgcc', '-static-libstdc++']) 33 | 34 | env.Append(LIBS = ['config++', 'yaml-cpp', 'ncurses']) 35 | if str(Platform()) != 'darwin': 36 | env.Append(LIBS = ['tinfo']) 37 | 38 | # barvinok needs to be before isl because it references isl functions 39 | if GetOption('link_static'): 40 | print("Using static linking.") 41 | env.Append(LINKFLAGS = [ '-Wl,--whole-archive', '-static', '-lpthread', '-Wl,--no-whole-archive']) 42 | env.Append(LIBS = ['tinfo', 'gpm']) 43 | else: 44 | print("Using dynamic linking.") 45 | 46 | env.Append(LIBS = ['boost_iostreams', 'boost_serialization']) 47 | 48 | if os.environ.get('BOOSTDIR'): 49 | env.Append(CPPFLAGS = ['-I' + os.environ['BOOSTDIR'] + '/include']) 50 | env.Append(LIBPATH = [os.environ['BOOSTDIR'] + '/lib']) 51 | 52 | if "LIBCONFIGPATH" in os.environ: 53 | LIBCONFIGPATH = os.environ["LIBCONFIGPATH"] 54 | env["LIBPATH"] += [LIBCONFIGPATH + '/lib'] 55 | env["CPPFLAGS"] += ['-I' + LIBCONFIGPATH + '/include'] 56 | 57 | if "HDF5PATH" in os.environ: 58 | HDF5PATH = os.environ["HDF5PATH"] 59 | env["LIBPATH"] += [HDF5PATH + '/lib'] 60 | env["CPPFLAGS"] += ['-I' + HDF5PATH + '/include'] 61 | env["LIBS"] += ['hdf5', 'hdf5_hl'] 62 | elif "HDF5PATH_INCLUDE" in os.environ: 63 | HDF5PATH_INCLUDE = os.environ["HDF5PATH_INCLUDE"] 64 | env["CPPPATH"] += ['-I' + HDF5PATH_INCLUDE] 65 | env["LIBS"] += ['hdf5_cpp', 'hdf5_hl_cpp', 'hdf5_serial', 'hdf5_serial_hl'] 66 | 67 | if "YAMLCPPPATH" in os.environ: 68 | YAMLCPPPATH = os.environ["YAMLCPPPATH"] 69 | env["LIBPATH"] += [YAMLCPPPATH + '/lib'] 70 | env["CPPFLAGS"] += ['-I' + YAMLCPPPATH + '/include'] 71 | 72 | if "NCURSESPATH" in os.environ: 73 | NCURSESPATH = os.environ["NCURSESPATH"] 74 | env.Append(LIBPATH = [NCURSESPATH + '/lib']) 75 | env.Append(CPPFLAGS = ['-I' + NCURSESPATH + 
'/include']) 76 | 77 | if "BARVINOKPATH" in os.environ: 78 | BARVINOKPATH = os.environ["BARVINOKPATH"] 79 | env.Append(LIBPATH = [BARVINOKPATH + '/lib']) 80 | env.Append(CPPFLAGS = ['-I' + BARVINOKPATH + '/include']) 81 | 82 | if "NTLPATH" in os.environ: 83 | NTLPATH = os.environ["NTLPATH"] 84 | env.Append(LIBPATH = [NTLPATH + '/lib']) 85 | env.Append(CPPFLAGS = ['-I' + NTLPATH + '/include']) 86 | 87 | if GetOption('use_accelergy'): 88 | env["CPPDEFINES"] += [('USE_ACCELERGY')] 89 | 90 | parser_sources = Split(""" 91 | ../src/problem/parser.cpp 92 | ../src/mapping/loop.cpp 93 | ../src/mapping/parser.cpp 94 | ../src/mapping/mapping.cpp 95 | ../src/loop-analysis/nest-analysis.cpp 96 | ../src/loop-analysis/memory-state.cpp 97 | ../src/model/topology.cpp 98 | scripts/parser.cpp 99 | """) 100 | 101 | print(env['LIBS']) 102 | 103 | bin_model = env.Program(target = 'parser', source = parser_sources) 104 | 105 | 106 | env.Install(env['BUILD_BASE_DIR'] + "/bin", [bin_model]) -------------------------------------------------------------------------------- /src/model/topology.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "tileflow/model/topology.hpp" 3 | 4 | namespace model { 5 | 6 | namespace TileFlow { 7 | 8 | void Topology::eval( 9 | const mapping::TileFlow::Mapping& mapping, 10 | const analysis::TileFlow::NestAnalysis& analysis){ 11 | 12 | StatCalculator pass(*this, mapping, analysis); 13 | stats_.cycles = total_network_latency_; 14 | pass.run(mapping.root); 15 | 16 | std::cout << "Energy: " << stats_.energy << std::endl; 17 | std::cout << "Cycles: " << stats_.cycles << std::endl; 18 | } 19 | 20 | void StatCalculator::visitTile(const TileNode* node) { 21 | for (auto child: node->get_children()) 22 | child->accept(this); 23 | if (!node->is_spatial()) { 24 | auto cycle = cycles_.top(); 25 | cycles_.pop(); 26 | auto& tile = analysis_.get_tile(node); 27 | auto storage_id = node->get_storage_level(); 28 | auto 
storage_level = std::static_pointer_cast(topology_.GetStorageLevel(storage_id)->Clone()); 29 | tiling::CompoundMask mask = {}; 30 | for (int pv = 0; pv < (int)problem::GetShape()->NumDataSpaces; ++pv) 31 | mask[pv] = true; 32 | storage_level->Evaluate(tile, mask, 33 | 0, 34 | cycle, break_on_failure); 35 | storage_level->FinalizeBufferEnergy(); 36 | cycles_.push(storage_level->Cycles()); 37 | energy_ += storage_level->Energy(); 38 | 39 | auto connection = topology_.connection_map_[storage_id]; 40 | auto rf_net = connection.read_fill_network->Clone(); 41 | rf_net->Evaluate(tile, break_on_failure); 42 | energy_ += rf_net->Energy(); 43 | auto du_net = connection.drain_update_network->Clone(); 44 | du_net->Evaluate(tile, break_on_failure); 45 | energy_ += du_net->Energy(); 46 | 47 | std::cout << "Storage<" << storage_id << ">:" << std::endl 48 | << *storage_level; 49 | std::cout << "Connect<" << storage_id << ">:"; 50 | std::cout << "rf_net: " << rf_net->Energy(); 51 | std::cout << "du_net: " << du_net->Energy(); 52 | std::cout << std::endl; 53 | } 54 | } 55 | 56 | void StatCalculator::visitScope(const ScopeNode* node) { 57 | 58 | auto type = node->get_scope_type(); 59 | std::uint64_t cycle; 60 | if (type == ScopeNode::Sequential || type == ScopeNode::Sharing) { 61 | cycle = 0; 62 | for (auto child: node->get_children()) { 63 | child->accept(this); 64 | cycle += cycles_.top(); 65 | cycles_.pop(); 66 | } 67 | } 68 | else if (type == ScopeNode::Parallel || type == ScopeNode::Pipeline) { 69 | cycle = 0; 70 | for (auto child: node->get_children()) { 71 | child->accept(this); 72 | cycle = std::max(cycle, cycles_.top()); 73 | cycles_.pop(); 74 | } 75 | } 76 | cycles_.push(cycle); 77 | } 78 | 79 | void StatCalculator::visitOp(const OpNode* node) { 80 | auto level = topology_.GetArithmeticLevel()->Clone(); 81 | auto &tile = analysis_.get_tile(node); 82 | tiling::CompoundMask mask = {}; 83 | for (int pv = 0; pv < (int)problem::GetShape()->NumDataSpaces; ++pv) 84 | mask[pv] = 
true; 85 | level->Evaluate(tile, mask, 0, 86 | tile.compute_info.accesses, break_on_failure); 87 | cycles_.push(level->Cycles()); 88 | energy_ += level->Energy(); 89 | std::cout << "Arithmetic::" << node->get_name() << ":" << level->Energy() << std::endl; 90 | } 91 | 92 | void StatCalculator::run(const Node* root) { 93 | break_on_failure = false; 94 | energy_ = 0.0; 95 | root->accept(this); 96 | assert(cycles_.size() == 1); 97 | topology_.stats_.energy = energy_; 98 | topology_.stats_.cycles += cycles_.top(); 99 | } 100 | 101 | } // namespace TileFlow 102 | 103 | } // namespace model -------------------------------------------------------------------------------- /docs/frontend-syntax.md: -------------------------------------------------------------------------------- 1 | # Frontend Syntax of TileFlow 2 | 3 | TileFlow uses yaml for input configuration. There are 3 required fields for an application: `architecture` field for archtecture specification, `problem` field for problem specification, and `mapping` field problem-architecture mapping specification. Normally, we implement three fields in 3 separate `yaml` files. A typical config file is like: 4 | 5 | ``` 6 | problem: 7 | ... 8 | architecture: 9 | ... 10 | mapping: 11 | ... 12 | check (optional): 13 | ... 14 | tileflow-mapper (optional): 15 | ... 16 | macro (optional): 17 | ... 18 | output (optional): 19 | verbose (optional): 20 | ``` 21 | 22 | Please see `tests/cases` for examples. 23 | 24 | ## Arch Scope 25 | 26 | The `architecture` field uses the syntax of `Timeloop`, see [this](https://timeloop.csail.mit.edu/timeloop/input-formats/design/architecture) for description. 27 | 28 | ## Prob Scope 29 | 30 | The `problem` field extends `Timeloop`'s syntax to support multi-op. The file is organized like: 31 | ``` 32 | problem: 33 | io: 34 | ... 35 | dimensions: 36 | ... 37 | instance: 38 | ... 39 | ops: 40 | - TIMELOOP-OP1 41 | - TIMELOOP-OP2 42 | ... 
43 | ``` 44 | 45 | - `io`: the inputs and outputs of the function. 46 | - `dimensions`: all the dimensions that appear in the tensor descriptions. 47 | - `instance`: the specification for parameters, see [this](https://timeloop.csail.mit.edu/timeloop/input-formats/problem#problem-shape) for description. 48 | - `ops`: a list of tensor operations, each following the same syntax as [shape](https://timeloop.csail.mit.edu/timeloop/input-formats/problem#problem-shape) in Timeloop, without the instance field. Each op additionally has `ins` and `out` fields that specify its inputs and output. 49 | 50 | ## Mapping Scope 51 | 52 | The `mapping` field describes the mapping as a tree. A node in the tree looks like: 53 | 54 | ```yaml 55 | node-type: TILE|Scope|Op 56 | # optional attributes 57 | type: [Sharing|Temporal|Spatial|Pipeline|temporal|spatial] 58 | factors: 59 | permutation: 60 | target: 61 | split: 62 | 63 | subtree: 64 | - CHILD1 65 | - CHILD2 66 | ... 67 | ``` 68 | 69 | There are three kinds of nodes: 70 | 71 | - Scope Node: specifies the boundary of the memory hierarchy. The only attribute of a scope node is its sub-type: Sharing/Temporal/Spatial/Pipeline. 72 | 73 | - Tile Node: specifies the temporal/spatial mapping of loops. The key attributes include factors, permutation, target, split, etc. See [this](https://timeloop.csail.mit.edu/timeloop/input-formats/mapping) for illustration. 74 | - Key knobs: 75 | - multicast [true|false]: used on a spatial tile to specify whether the higher memory level's bandwidth can perform multicast. 76 | 77 | - Op Node: specifies an arithmetic operation. Attributes: 78 | - name: the name of the operation; 79 | 80 | - To enable `tileflow-mapper`, users can simply replace a number in the tile-factor specification with an arbitrary string.
81 | 82 | ## Check Scope: 83 | - To customize different kinds of checking; 84 | - Attributes: 85 | - `mem`(bool): whether or not to enable the memory capacity check; 86 | - `loopcount`(bool): whether or not to enable the loopcount check (whether the product of the tile factors equals the shape); 87 | - `spatial`(bool): whether or not to check that the spatial core usage is not exceeded. 88 | 89 | ## Mapper Scope: 90 | - Specify the configuration for the mapper 91 | - Attributes: 92 | - `alg`[random, mtcs]: the search algorithm for the mapper; 93 | - `timeout`[INT]: the search timeout in seconds; 94 | - `topk`(unsigned): record the topK candidates. 95 | 96 | ## Macro Scope: 97 | - key(string): value(int) pairs of macros. The macros can be used to instantiate the `factor` scope of `tile` nodes and the `instance` scope of the `problem` scope. 98 | 99 | ## Others 100 | 101 | - `macro` attribute: lists constant values that can be used as tile factors/tensor shapes; 102 | - `verbose` attribute: specifies the verbosity level; 103 | - `output`: the prefix for output; including 1. `$(output).csv` for cycle/energy/profiling results; 2. `$(output).mapping.csv` for the best dataflow found by the search; 3.
`$(output).tuning.csv` for mapper tunning log; 104 | 105 | -------------------------------------------------------------------------------- /tests/cases/03-test-systolic/script.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | from functools import reduce 3 | import multiprocessing 4 | import time 5 | 6 | # n_proc = multiprocessing.cpu_count() 7 | n_proc = 112 8 | # data = [ 9 | # ((256, 256, 256, 128, 128, 128), 78283), 10 | # ((256, 384, 256, 128, 128, 128), 111271), 11 | # ((256, 128, 96, 64, 32, 48), 38206), 12 | # ((1024, 512, 256, 64, 128, 64), 793662), 13 | # ((64, 64, 1024, 32, 32, 256), 67070) 14 | # ] 15 | 16 | with open("data.pkl", 'rb') as f: 17 | data = pkl.load(f) 18 | 19 | import os 20 | import pandas as pd 21 | 22 | def run(data, id): 23 | M,N,K,MO,MM,NO,NI,KO,KM = data 24 | filename = 'config-' + '-'.join([str(_) for _ in data]) + '.yaml' 25 | with open(f"config/{filename}", 'w') as f: 26 | f.write( 27 | f''' 28 | output: /tmp/_tmp-{id}.csv 29 | verbose: 0 30 | macro: 31 | M: {M} 32 | N: {N} 33 | K: {K} 34 | MO: {MO} 35 | MM: {MM} 36 | NO: {NO} 37 | NI: {NI} 38 | KO: {KO} 39 | KM: {KM} 40 | ''' 41 | ) 42 | os.system(f'tileflow arch/arch.yaml map/map.yaml prob/prob.yaml config/{filename}') 43 | tileflow = pd.read_csv(f'/tmp/_tmp-{id}.csv').set_index('metric').T 44 | # df = pd.DataFrame(data = [M, N, K, MO, NO, KO, tileflow['Cycle'].iloc[0], tileflow['Energy'].iloc[0]], 45 | # index = ['M', 'N', 'K', 'MO', 'NO', 'KO','tileflow-cycle', 'tileflow-ener'], 46 | # columns = ['value']) 47 | os.system(f'timeloop-model arch/arch.yaml map/map-timeloop.yaml prob/prob-timeloop.yaml config/{filename}') 48 | timeloop = pd.read_csv(f'/tmp/_tmp-{id}.csv').set_index('metric').T 49 | df = pd.DataFrame(data = [M, N, K, MO, NO, KO, timeloop['Cycle'].iloc[0], tileflow['Cycle'].iloc[0],timeloop['Energy'].iloc[0], tileflow['Energy'].iloc[0]], 50 | index = ['M', 'N', 'K', 'MO', 'NO', 'KO','timeloop-cycle', 
'tileflow-cycle', 'timeloop-ener', 'tileflow-ener'], 51 | columns=['value']) 52 | df = df.T 53 | return df 54 | 55 | from sklearn.linear_model import LinearRegression 56 | import matplotlib.pyplot as plt 57 | import numpy as np 58 | 59 | def analyze(df, metric = 'tileflow-cycle', target = 'real'): 60 | model = LinearRegression() 61 | model.fit(df[metric].values.reshape(-1,1), df[target]) 62 | print (f'relation: {metric} v.s. {target}') 63 | print ('\tscore:', model.score(df[metric].values.reshape(-1,1), df[target])) 64 | print ('\tk: ', model.coef_) 65 | print ('\tb: ', model.intercept_) 66 | ave_err = np.mean(np.abs((df[metric] - df[target]) / df[target])) 67 | print ('\taverage error: ', ave_err) 68 | df['pred'] = model.predict(df[metric].values.reshape(-1,1)) 69 | ax = df.plot.scatter(x=metric, y = target, label=target) 70 | df.plot.line(ax = ax, x=metric, y = 'pred', label='pred', c = 'black') 71 | plt.legend() 72 | ax.get_figure().savefig(f'result-{metric}-{target}.png') 73 | 74 | def do_work(procnum, return_dict): 75 | rets = [] 76 | stride = (len(data) + n_proc - 1) // n_proc 77 | start = stride*procnum 78 | end = min(len(data), start + stride) 79 | for (M,N,K,micro_M,micro_N,micro_K), cycle in data[start:end]: 80 | MO = M // micro_M 81 | MM = micro_M // 16 82 | NO = N // micro_N 83 | NI = micro_N 84 | KO = K // micro_K 85 | KM = micro_K // 16 86 | 87 | ret = run((M,N,K,MO,MM,NO,NI,KO,KM), procnum) 88 | ret['real'] = cycle 89 | rets.append(ret) 90 | 91 | return_dict[procnum] = rets 92 | 93 | 94 | def main(): 95 | manager = multiprocessing.Manager() 96 | return_dict = manager.dict() 97 | jobs = [] 98 | for i in range(n_proc): 99 | p = multiprocessing.Process(target = do_work, args = (i, return_dict)) 100 | jobs.append(p) 101 | p.start() 102 | for proc in jobs: proc.join() 103 | 104 | print (return_dict.values()) 105 | ret = pd.concat(reduce(lambda x, y: x+y, return_dict.values(), [])) 106 | print(ret) 107 | # ret.to_csv('out.csv') 108 | analyze(ret, 
'tileflow-cycle', 'timeloop-cycle') 109 | analyze(ret, 'tileflow-cycle', 'real') 110 | analyze(ret, 'timeloop-cycle', 'real') 111 | analyze(ret, 'timeloop-ener', 'tileflow-ener') 112 | return ret 113 | 114 | 115 | if __name__ == '__main__': 116 | main() -------------------------------------------------------------------------------- /src/mapping/mapping.cpp: -------------------------------------------------------------------------------- 1 | #include "tileflow/mapping/mapping.hpp" 2 | 3 | using TileFlow::global_symbol_table_; 4 | 5 | namespace mapping { 6 | 7 | namespace TileFlow { 8 | 9 | const std::unordered_map Node::type2name_ = { 10 | {Node::Tile, "Tile"}, 11 | {Node::Op, "Op"}, 12 | {Node::Scope, "Scope"} 13 | }; 14 | 15 | void Node::add_child(const Node* child){ 16 | if (type_ == Node::Scope) { 17 | 18 | unsigned storage_level; 19 | std::string storage_level_name = "Unknown"; 20 | if (child->get_type() == Node::Tile) { 21 | // if (static_cast(child)->get_tile_type() == TileNode::Temporal){ 22 | // storage_level = child->get_storage_level() + 1; 23 | // } 24 | // else { 25 | storage_level = child->get_storage_level(); 26 | storage_level_name = child->get_storage_name(); 27 | // } 28 | } 29 | else if (child->get_type() == Node::Scope) { 30 | storage_level = child->get_storage_level(); 31 | storage_level_name = child->get_storage_name(); 32 | } 33 | else { 34 | TILEFLOW_ERROR("Scope Node should not have a op child"); 35 | } 36 | assert(storage_level_ == unsigned(-1) || storage_level_ == storage_level); 37 | storage_level_ = storage_level; 38 | storage_level_name_ = storage_level_name; 39 | } 40 | assert(child != nullptr); 41 | children_.push_back(child); 42 | child->set_parent(this); 43 | } 44 | 45 | void Visitor::visitScope(const ScopeNode* node){ 46 | for (auto child: node->children_) 47 | child->accept(this); 48 | } 49 | 50 | void Visitor::visitOp(const OpNode* node){ 51 | for (auto child: node->children_) 52 | child->accept(this); 53 | } 54 | 55 | void 
Visitor::visitTile(const TileNode* node){ 56 | for (auto child: node->children_) 57 | child->accept(this); 58 | } 59 | 60 | void Visitor::run(const Node* root) { 61 | root->accept(this); 62 | } 63 | 64 | loop::Nest TileNode::constructLoopNest(const SymbolTable* symbol_table_) const{ 65 | loop::Nest loop_nest; 66 | uint64_t num_subnests_added = 0; 67 | for (auto loop: loopnests_) 68 | { 69 | // Ignore trivial factors 70 | // This reduces computation time by 1.5x on average. 71 | if (loop.end <= 0) { 72 | assert(symbol_table_); 73 | loop.residual_end = loop.end = symbol_table_->lookup(loop.end).value_; 74 | } 75 | if (loop.start + loop.stride < loop.end){ 76 | assert((type_==TileNode::Spatial && loop::IsSpatial(loop.spacetime_dimension)) 77 | || (type_==TileNode::Temporal && !loop::IsSpatial(loop.spacetime_dimension))); 78 | loop_nest.AddLoop(loop); 79 | num_subnests_added ++; 80 | } 81 | } 82 | if (num_subnests_added == 0) { 83 | loop_nest.AddLoop(0, 0, 1, 1, type_ == TileNode::Spatial? spacetime::Dimension::SpaceX : spacetime::Dimension::Time); 84 | } 85 | loop_nest.AddStorageTilingBoundary(); 86 | return loop_nest; 87 | } 88 | 89 | void Node::display_active_tensors(std::string prefix, std::ostream&o) const { 90 | bool isEmpty = active_tensors_.read_tensors.size() 91 | + active_tensors_.update_tensors.size() 92 | + active_tensors_.fill_tensors.size() 93 | + active_tensors_.wb_tensors.size(); 94 | if (!isEmpty) return; 95 | o << prefix; 96 | if (active_tensors_.read_tensors.size()) { 97 | o << "read: "; 98 | for (auto id: active_tensors_.read_tensors) 99 | o << problem::GetShape()->DataSpaceIDToName.at(id) << " "; 100 | } 101 | if (active_tensors_.update_tensors.size()) { 102 | o << "update: "; 103 | for (auto id: active_tensors_.update_tensors) 104 | o << problem::GetShape()->DataSpaceIDToName.at(id) << " "; 105 | } 106 | if (active_tensors_.fill_tensors.size()){ 107 | o << "fill: "; 108 | for (auto id: active_tensors_.fill_tensors) 109 | o << 
problem::GetShape()->DataSpaceIDToName.at(id) << " "; 110 | } 111 | if (active_tensors_.wb_tensors.size()) { 112 | o << "write-back: "; 113 | for (auto id: active_tensors_.wb_tensors) 114 | o << problem::GetShape()->DataSpaceIDToName.at(id) << " "; 115 | } 116 | o << std::endl; 117 | } 118 | 119 | } // namespace TileFlow 120 | 121 | } // namespace mapping -------------------------------------------------------------------------------- /tests/cases/00-validation/02-attention/topk-analysis.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import re 3 | 4 | shape = [512,512,64,64] 5 | 6 | def parse_raw_data(raw_data): 7 | ret = [] 8 | for line in raw_data.split('\n'): 9 | key_tuple = shape.copy() 10 | failed = False 11 | for key in ['MO', 'MM', 'KO', 'KM', 'LO', 'LM']: 12 | pattern = f'<{key},1,(\d+)>' 13 | res = re.findall(pattern, line) 14 | if len(res) == 0: 15 | print (line, key) 16 | failed = True 17 | break 18 | key_tuple.append(int(res[0])) 19 | if failed: continue 20 | res = re.findall('value: (\d+)', line) 21 | assert len(res) 22 | ret.append((tuple(key_tuple), int(res[0]))) 23 | return ret 24 | 25 | def get_dataset(): 26 | filename = 'data/No_Softmax/data.pkl' 27 | ret = dict() 28 | with open(filename, 'rb') as f: 29 | data = pkl.load(f) 30 | for line in data: 31 | (M1,N1,K1,micro_M1,micro_N1,micro_K1,M2,N2,K2,micro_M2,micro_N2,micro_K2), value = line 32 | MO = M1 // micro_M1 33 | MM = micro_M1 // 16 34 | KO = K1 // micro_K1 35 | KM = micro_K1 // 16 36 | LO = K2 // micro_K2 37 | LM = micro_K2 // 16 38 | ret[(M1, N1, K1, N2, MO, MM, KO, KM, LO, LM)] = value 39 | return ret 40 | 41 | def reconvert(shape): 42 | M1, N1, K1, N2, MO, MM, KO, KM, LO, LM = shape 43 | micro_M1 = 16 * MM 44 | assert MO == (M1 // micro_M1) 45 | micro_N1 = N1 46 | micro_K1 = 16 * KM 47 | M2 = M1 48 | K2 = N1 49 | micro_M2 = micro_M1 50 | micro_N2 = N2 51 | micro_K2 = 16 * LM 52 | assert LO == (K2 // micro_K2) 53 | return 
(M1,N1,K1,micro_M1,micro_N1,micro_K1,M2,N2,K2,micro_M2,micro_N2,micro_K2) 54 | 55 | raw_data = ''' 56 | 0: ,,,,,,,,,, value: 140288 57 | 1: ,,,,,,,,,, value: 140288 58 | 2: ,,,,,,,,,, value: 140288 59 | 3: ,,,,,,,,,, value: 141312 60 | 4: ,,,,,,,,,, value: 141312 61 | 5: ,,,,,,,,,, value: 141312 62 | 6: ,,,,,,,,,, value: 143360 63 | 7: ,,,,,,,,,, value: 143360 64 | 8: ,,,,,,,,,, value: 143362 65 | 9: ,,,,,,,,,, value: 147456 66 | 10: ,,,,,,,,,, value: 147456 67 | 11: ,,,,,,,,,, value: 147456 68 | 12: ,,,,,,,,,, value: 148480 69 | 13: ,,,,,,,,,, value: 148480 70 | 14: ,,,,,,,,,, value: 148480 71 | 15: ,,,,,,,,,, value: 149504 72 | 16: ,,,,,,,,,, value: 149504 73 | 17: ,,,,,,,,,, value: 149504 74 | 18: ,,,,,,,,,, value: 151552 75 | 19: ,,,,,,,,,, value: 151552''' 76 | 77 | topk_data = parse_raw_data(raw_data) 78 | dataset = get_dataset() 79 | converted_dataset = [] 80 | 81 | for k, v in topk_data: 82 | if k not in dataset: 83 | print (k, 'is not in dataset') 84 | converted_dataset.append(list(reconvert(k)) + [v]) 85 | continue 86 | print (k, v, dataset[k]) 87 | 88 | import pandas as pd 89 | df = pd.DataFrame(data = converted_dataset, columns= 90 | ['M1','N1','K1','micro_M1','micro_N1','micro_K1','M2','N2','K2','micro_M2','micro_N2','micro_K2','latency']) 91 | df.to_csv('topk.csv') 92 | # print (dataset) -------------------------------------------------------------------------------- /tests/cases/08-test-2mm/reference_output.txt: -------------------------------------------------------------------------------- 1 | input file: arch/arch-spatial.yaml 2 | input file: prob/prob-2mm.yaml 3 | input file: map/map.yaml 4 | Begin ParseWorkload... 5 | Begin Spec... 6 | begin mapping by random... 
7 | ***Optimal Mapping: 8 | -----------------Nest Analysis---------------- 9 | read: A B E D update: E 10 | for L in [0:32), MainMemory 11 | for M in [0:64), MainMemory 12 | read: A B E D update: E fill: A B E D write-back: E 13 | Scope: Pipeline 14 | { 15 | read: C A B update: C fill: A B 16 | for L in [0:4) (Spatial-Y), MainMemory 17 | for M in [0:2) (Spatial-X), MainMemory 18 | read: C A B update: C fill: C A B write-back: C 19 | for K in [0:64), RegFile 20 | for L in [0:4), RegFile 21 | for M in [0:4), RegFile 22 | read: C A B update: C fill: C A B write-back: C 23 | Op: GEMM1(A,B,)->C 24 | 25 | read: E D update: E fill: E D write-back: E 26 | Scope: Sequential 27 | { 28 | read: C update: exp 29 | for L in [0:4) (Spatial-Y), MainMemory 30 | for M in [0:2) (Spatial-X), MainMemory 31 | read: C update: exp fill: C write-back: exp 32 | for L in [0:4), RegFile 33 | for M in [0:4), RegFile 34 | read: C update: exp fill: C write-back: exp 35 | Op: EXP(C,)->exp 36 | 37 | read: exp E D update: E fill: E D write-back: E 38 | for L in [0:4) (Spatial-Y), MainMemory 39 | for M in [0:2) (Spatial-X), MainMemory 40 | read: exp E D update: E fill: exp E D write-back: E 41 | for N in [0:64), RegFile 42 | for L in [0:4), RegFile 43 | for M in [0:4), RegFile 44 | read: exp E D update: E fill: exp E D write-back: E 45 | Op: GEMM2(exp,D,)->E 46 | 47 | } 48 | } 49 | Cycle: 3178496, Energy: 1.06885e+09 50 | --------------END Nest Analysis--------------- 51 | ***TileFlow Result 52 | metric,value 53 | Cycle,3178496 54 | Energy,1.06885e+09 55 | MainMemory::Write::D, 0 56 | MainMemory::Fill::D, 0 57 | MainMemory::Update::D, 0 58 | MainMemory::Update, 1.04858e+06 59 | MainMemory::Read, 6.29146e+06 60 | MainMemory::Fill::E, 0 61 | MainMemory::Update::exp, 0 62 | MainMemory::CapUtil, 0.00012207 63 | MainMemory::Write, 1.04858e+06 64 | MainMemory::Update::B, 0 65 | MainMemory::Read::E, 1.04858e+06 66 | MainMemory::Update::C, 0 67 | MainMemory::Write::E, 1.04858e+06 68 | 
MainMemory::SpatialUtil, 1 69 | MainMemory::Read::C, 0 70 | MainMemory::Read::exp, 0 71 | MainMemory::Write::C, 0 72 | MainMemory::SlowDown, 1 73 | MainMemory::Accesses, 7.34003e+06 74 | MainMemory::Fill::exp, 0 75 | MainMemory::Energy, 7.34003e+08 76 | MainMemory::Write::B, 0 77 | MainMemory::Read::A, 1.04858e+06 78 | MainMemory::Update::A, 0 79 | MainMemory::Read::D, 2.09715e+06 80 | MainMemory::Fill, 0 81 | MainMemory::Fill::A, 0 82 | MainMemory::Write::exp, 0 83 | MainMemory::Fill::C, 0 84 | MainMemory::Write::A, 0 85 | MainMemory::Read::B, 2.09715e+06 86 | MainMemory::Fill::B, 0 87 | MainMemory::Update::E, 1.04858e+06 88 | RegFile::Write::D, 262144 89 | RegFile::Fill::D, 262144 90 | RegFile::Update::D, 0 91 | RegFile::Update, 4.22707e+06 92 | RegFile::Read, 1.26484e+07 93 | RegFile::Fill::E, 262144 94 | RegFile::Update::exp, 32768 95 | RegFile::CapUtil, 0.00402832 96 | RegFile::Write, 5.30842e+06 97 | RegFile::Update::B, 0 98 | RegFile::Read::E, 2.09715e+06 99 | RegFile::Update::C, 2.09715e+06 100 | RegFile::Write::E, 2.3593e+06 101 | RegFile::SpatialUtil, 0.5 102 | RegFile::Read::C, 2.16269e+06 103 | RegFile::Read::exp, 2.09715e+06 104 | RegFile::Write::C, 2.12992e+06 105 | RegFile::SlowDown, 1.5 106 | RegFile::Accesses, 1.43655e+08 107 | RegFile::Fill::exp, 0 108 | RegFile::Energy, 3.01026e+08 109 | RegFile::Write::B, 262144 110 | RegFile::Read::A, 2.09715e+06 111 | RegFile::Update::A, 0 112 | RegFile::Read::D, 2.09715e+06 113 | RegFile::Fill, 1.08134e+06 114 | RegFile::Fill::A, 262144 115 | RegFile::Write::exp, 32768 116 | RegFile::Fill::C, 32768 117 | RegFile::Write::A, 262144 118 | RegFile::Read::B, 2.09715e+06 119 | RegFile::Fill::B, 262144 120 | RegFile::Update::E, 2.09715e+06 121 | mac::Energy, 3.38166e+07 122 | mac::Flops, 3.38166e+07 123 | MEM::RegFile,0.00805664 124 | MEM::RegFile,0.000488281 125 | MEM::RegFile,0.00805664 126 | MEM::MainMemory,0.00012207 127 | SPATIAL::MainMemory,1 128 | ***TileFlow Result Ends 129 | 
-------------------------------------------------------------------------------- /tests/cases/12-test-fused-cnn/fused-layer.yaml: -------------------------------------------------------------------------------- 1 | output: fused-layer 2 | verbose: 1 3 | problem: 4 | io: 5 | ins: I X Y 6 | outs: D 7 | dimensions: [B,H,W,C,L,K,R,S,U,V] 8 | instance: 9 | B: 1 10 | H: 112 11 | W: 112 12 | C: 64 13 | L: 192 14 | K: 128 15 | R: 3 16 | S: 3 17 | U: 3 18 | V: 3 19 | 20 | ops: 21 | - name: ProduceA 22 | dimensions: [L,C,R,S,B,H,W,U,V] 23 | data-spaces: 24 | - name: A 25 | projection: 26 | - [[B]] 27 | - [[L]] 28 | - [[H], [U]] 29 | - [[W], [V]] 30 | read-write: True 31 | - name: X 32 | projection: 33 | - [[L]] 34 | - [[C]] 35 | - [[R]] 36 | - [[S]] 37 | - name: I 38 | projection: 39 | - [[B]] 40 | - [[C]] 41 | - [[H],[R]] 42 | - [[W],[S]] 43 | ins: X, I 44 | out: A 45 | 46 | - name: ProduceD 47 | dimensions: [K,L,U,V,B,H,W] 48 | data-spaces: 49 | - name: D 50 | projection: 51 | - [[B]] 52 | - [[K]] 53 | - [[H]] 54 | - [[W]] 55 | read-write: True 56 | - name: Y 57 | projection: 58 | - [[K]] 59 | - [[L]] 60 | - [[U]] 61 | - [[V]] 62 | - name: A 63 | projection: 64 | - [[B]] 65 | - [[L]] 66 | - [[H],[U]] 67 | - [[W],[V]] 68 | ins: Y, A 69 | out: D 70 | 71 | architecture: 72 | version: 0.2 73 | subtree: 74 | - name: System 75 | attributes: 76 | local: 77 | - name: L2 78 | class: DRAM 79 | attributes: 80 | word-bits: 16 81 | block-size: 32 82 | technology: 16nm 83 | read_bandwidth: 25 84 | sizeKB: 1600000000 85 | subtree: 86 | - name: Buffer[0..3] 87 | attributes: 88 | meshX: 4 89 | local: 90 | - name: L1 91 | class: SRAM 92 | attributes: 93 | width: 16 94 | word-bits: 16 95 | technology: 16nm 96 | read_bandwidth: 500 97 | sizeKB: 2000 98 | subtree: 99 | - name: PE 100 | attributes: 101 | local: 102 | - name: L0[0..1023] 103 | class: regfile 104 | attributes: 105 | depth: 1 106 | meshX: 1024 107 | word-bits: 16 108 | block-size: 30 109 | technology: 16nm 110 | read_bandwidth: 3 
111 | - name: mac[0..1023] 112 | class: intmac 113 | attributes: 114 | meshX: 1024 115 | 116 | mapping: 117 | node-type: Tile 118 | type: Temporal 119 | factors: B=BO H=HO W=WO 120 | permutation: BHW 121 | target: L2 122 | 123 | subtree: 124 | - node-type: Tile 125 | type: Spatial 126 | factors: H=HS W=WS 127 | permutation: HW 128 | target: L2 129 | 130 | subtree: 131 | - node-type: Scope 132 | type: Sequential 133 | 134 | subtree: 135 | - node-type: Tile 136 | type: Temporal 137 | factors: H=HM W=WM C=CM L=LM 138 | permutation: HWCL 139 | target: L1 140 | 141 | subtree: 142 | - node-type: Tile 143 | type: Spatial 144 | factors: C=CS L=LS 145 | permutation: CL 146 | target: L1 147 | 148 | subtree: 149 | - node-type: Tile 150 | type: Temporal 151 | factors: R=RI S=SI 152 | permutation: RS 153 | target: L0 154 | 155 | subtree: 156 | - node-type: Op 157 | name: ProduceA 158 | - node-type: Tile 159 | type: Temporal 160 | factors: H=HM2 W=WM2 L=LM2 K=KM2 161 | permutation: HWLK 162 | target: L1 163 | 164 | subtree: 165 | - node-type: Tile 166 | type: Spatial 167 | factors: L=LS2 K=KS 168 | permutation: LK 169 | target: L1 170 | 171 | subtree: 172 | - node-type: Tile 173 | type: Temporal 174 | factors: U=UI V=VI 175 | permutation: UV 176 | target: L0 177 | 178 | subtree: 179 | - node-type: Op 180 | name: ProduceD 181 | 182 | check: 183 | loopcount: False -------------------------------------------------------------------------------- /src/SConscript: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | Import('env') 4 | 5 | env.Append(CPPDEFINES = [('BUILD_BASE_DIR', '\\"' + env["BUILD_BASE_DIR"] + '\\"')]) 6 | env.Append(CPPPATH = [os.path.join(env['BUILD_BASE_DIR'], 'include')]) 7 | env.Append(CPPPATH = [os.path.join(env['TIMELOOP_BASE_DIR'], 'include')]) 8 | #env.Append(CPPPATH = ['src/include']) 9 | #env["CPPPATH"] = ["."] 10 | 11 | if GetOption('debug'): 12 | env.Append(CCFLAGS = ['-g', '-O0']) 13 | else: 14 | 
# HDF5 configuration. Two mutually exclusive environment variables are
# honoured:
#   HDF5PATH          - install prefix; <prefix>/lib and <prefix>/include are
#                       added and the plain hdf5 libraries are linked.
#   HDF5PATH_INCLUDE  - include directory only; the *_cpp / *_serial library
#                       names are linked from the default library path.
if "HDF5PATH" in os.environ:
    HDF5PATH = os.environ["HDF5PATH"]
    env["LIBPATH"] += [HDF5PATH + '/lib']
    env["CPPFLAGS"] += ['-I' + HDF5PATH + '/include']
    env["LIBS"] += ['hdf5', 'hdf5_hl']
elif "HDF5PATH_INCLUDE" in os.environ:
    HDF5PATH_INCLUDE = os.environ["HDF5PATH_INCLUDE"]
    # CPPPATH entries are bare directories: SCons adds the -I prefix itself,
    # so prepending '-I' here would produce a bogus include path.
    env["CPPPATH"] += [HDF5PATH_INCLUDE]
    env["LIBS"] += ['hdf5_cpp', 'hdf5_hl_cpp', 'hdf5_serial', 'hdf5_serial_hl']
E.g., ln -s pat-public/src/pat src/pat.") 91 | # Exit(1) 92 | 93 | lib_sources = Split(""" 94 | ./problem/parser.cpp 95 | ./mapping/loop.cpp 96 | ./mapping/parser.cpp 97 | ./mapping/mapping.cpp 98 | ./loop-analysis/nest-analysis.cpp 99 | ./loop-analysis/memory-state.cpp 100 | ./loop-analysis/dm-calculator.cpp 101 | ./model/topology.cpp 102 | ./mapper/checker.cpp 103 | ./mapper/expr.cpp 104 | ./mapper/mapper.cpp 105 | ./mapper/op.cpp 106 | ./common.cpp 107 | """) 108 | 109 | libenv = env.Clone() 110 | if GetOption("link_static"): 111 | lib_tileflow = libenv.StaticLibrary(target="tileflow", source = lib_sources) 112 | libenv.Install("lib", [lib_tileflow]) 113 | else: 114 | lib_tileflow_shared = libenv.SharedLibrary(target = "tileflow", source = lib_sources) 115 | libenv.Install('lib', [ lib_tileflow_shared]) 116 | 117 | model_sources = Split(""" 118 | ./application/main.cpp 119 | """) 120 | 121 | bin_model = env.Program(target = 'tileflow', source = model_sources) 122 | 123 | env.Install("bin", [bin_model]) 124 | -------------------------------------------------------------------------------- /AE/validation/accelerator/data/io_data.csv: -------------------------------------------------------------------------------- 1 | mem_to_buf,buf_to_mem,buf_to_reg,reg_to_buf 2 | 34078720,524288,76021760,37748736 3 | 34078720,524288,76021760,37748736 4 | 34078720,524288,76021760,37748736 5 | 34078720,524288,76021760,37748736 6 | 34078720,524288,76021760,37748736 7 | 34078720,524288,76021760,37748736 8 | 34078720,524288,76021760,37748736 9 | 34078720,524288,76021760,37748736 10 | 34078720,524288,76021760,37748736 11 | 34078720,524288,76021760,37748736 12 | 17301504,524288,76021760,37748736 13 | 17301504,524288,76021760,37748736 14 | 17301504,524288,76021760,37748736 15 | 17301504,524288,76021760,37748736 16 | 17301504,524288,76021760,37748736 17 | 17301504,524288,76021760,37748736 18 | 17301504,524288,76021760,37748736 19 | 17301504,524288,76021760,37748736 20 | 
17301504,524288,76021760,37748736 21 | 17301504,524288,76021760,37748736 22 | 8650752,262144,19136512,9437184 23 | 8650752,262144,19136512,9437184 24 | 8650752,262144,19136512,9437184 25 | 8650752,262144,19136512,9437184 26 | 8650752,262144,19136512,9437184 27 | 8650752,262144,19136512,9437184 28 | 8650752,262144,19136512,9437184 29 | 8650752,262144,19136512,9437184 30 | 8650752,262144,19136512,9437184 31 | 8650752,262144,19136512,9437184 32 | 8650752,262144,19136512,9437184 33 | 8650752,262144,19136512,9437184 34 | 8650752,262144,19136512,9437184 35 | 8650752,262144,19136512,9437184 36 | 8650752,262144,19136512,9437184 37 | 4456448,262144,19136512,9437184 38 | 4456448,262144,19136512,9437184 39 | 4456448,262144,19136512,9437184 40 | 4456448,262144,19136512,9437184 41 | 4456448,262144,19136512,9437184 42 | 4456448,262144,19136512,9437184 43 | 4456448,262144,19136512,9437184 44 | 4456448,262144,19136512,9437184 45 | 4456448,262144,19136512,9437184 46 | 4456448,262144,19136512,9437184 47 | 4456448,262144,19136512,9437184 48 | 4456448,262144,19136512,9437184 49 | 4456448,262144,19136512,9437184 50 | 4456448,262144,19136512,9437184 51 | 4456448,262144,19136512,9437184 52 | 2359296,262144,19136512,9437184 53 | 2359296,262144,19136512,9437184 54 | 2359296,262144,19136512,9437184 55 | 2359296,262144,19136512,9437184 56 | 2359296,262144,19136512,9437184 57 | 2359296,262144,19136512,9437184 58 | 2359296,262144,19136512,9437184 59 | 2359296,262144,19136512,9437184 60 | 2359296,262144,19136512,9437184 61 | 2359296,262144,19136512,9437184 62 | 2359296,262144,19136512,9437184 63 | 2359296,262144,19136512,9437184 64 | 2359296,262144,19136512,9437184 65 | 2359296,262144,19136512,9437184 66 | 2359296,262144,19136512,9437184 67 | 10813440,327680,23396352,11534336 68 | 10813440,327680,23396352,11534336 69 | 10813440,327680,23396352,11534336 70 | 10813440,327680,23396352,11534336 71 | 5570560,327680,23396352,11534336 72 | 5570560,327680,23396352,11534336 73 | 
5570560,327680,23396352,11534336 74 | 5570560,327680,23396352,11534336 75 | 2949120,327680,23396352,11534336 76 | 2949120,327680,23396352,11534336 77 | 2949120,327680,23396352,11534336 78 | 2949120,327680,23396352,11534336 79 | 12976128,393216,29753344,14680064 80 | 12976128,393216,29753344,14680064 81 | 12976128,393216,29753344,14680064 82 | 12976128,393216,29753344,14680064 83 | 12976128,393216,29753344,14680064 84 | 12976128,393216,29753344,14680064 85 | 12976128,393216,29753344,14680064 86 | 12976128,393216,29753344,14680064 87 | 12976128,393216,29753344,14680064 88 | 12976128,393216,29753344,14680064 89 | 6684672,393216,29753344,14680064 90 | 6684672,393216,29753344,14680064 91 | 6684672,393216,29753344,14680064 92 | 6684672,393216,29753344,14680064 93 | 6684672,393216,29753344,14680064 94 | 6684672,393216,29753344,14680064 95 | 6684672,393216,29753344,14680064 96 | 6684672,393216,29753344,14680064 97 | 6684672,393216,29753344,14680064 98 | 6684672,393216,29753344,14680064 99 | 3538944,393216,29753344,14680064 100 | 3538944,393216,29753344,14680064 101 | 3538944,393216,29753344,14680064 102 | 3538944,393216,29753344,14680064 103 | 3538944,393216,29753344,14680064 104 | 3538944,393216,29753344,14680064 105 | 3538944,393216,29753344,14680064 106 | 3538944,393216,29753344,14680064 107 | 3538944,393216,29753344,14680064 108 | 3538944,393216,29753344,14680064 109 | 28901376,589824,66650112,33030144 110 | 28901376,589824,66650112,33030144 111 | 28901376,589824,66650112,33030144 112 | 28901376,589824,66650112,33030144 113 | 28901376,589824,66650112,33030144 114 | 28901376,589824,66650112,33030144 115 | 28901376,589824,66650112,33030144 116 | 14745600,589824,66650112,33030144 117 | 14745600,589824,66650112,33030144 118 | 14745600,589824,66650112,33030144 119 | 14745600,589824,66650112,33030144 120 | 14745600,589824,66650112,33030144 121 | 14745600,589824,66650112,33030144 122 | 14745600,589824,66650112,33030144 123 | 51118080,786432,118226944,58720256 124 | 
51118080,786432,118226944,58720256 125 | 51118080,786432,118226944,58720256 126 | 51118080,786432,118226944,58720256 127 | 51118080,786432,118226944,58720256 128 | 25952256,786432,118226944,58720256 129 | 25952256,786432,118226944,58720256 130 | 25952256,786432,118226944,58720256 131 | 25952256,786432,118226944,58720256 132 | 25952256,786432,118226944,58720256 133 | -------------------------------------------------------------------------------- /tests/cases/11-fail-domino-self-attention/map.yaml: -------------------------------------------------------------------------------- 1 | mapping: 2 | node-type: Tile 3 | type: Temporal 4 | factors: H=1 M=512 N=8 L=1 A=32 5 | permutation: HMNLA 6 | target: L1 7 | 8 | subtree: 9 | - node-type: Scope 10 | type: Parallel 11 | 12 | subtree: 13 | - node-type: Tile 14 | type: Spatial 15 | factors: H=1 M=1 L=1 A=1 16 | permutation: HMLA 17 | target: L1 18 | 19 | subtree: 20 | - node-type: Tile 21 | type: Temporal 22 | factors: H=16 M=1 L=512 A=2 23 | permutation: HMLA 24 | target: L0 25 | 26 | subtree: 27 | - node-type: Op 28 | name: ProduceC 29 | binding: H:H M:M L:L A:A 30 | - node-type: Scope 31 | type: Pipeline 32 | 33 | subtree: 34 | - node-type: Tile 35 | type: Spatial 36 | factors: H=1 M=1 L=1 37 | permutation: HML 38 | target: L1 39 | 40 | subtree: 41 | - node-type: Tile 42 | type: Temporal 43 | factors: H=16 M=1 L=512 44 | permutation: HML 45 | target: L0 46 | 47 | subtree: 48 | - node-type: Op 49 | name: ProduceB 50 | binding: H:H M:M L:L 51 | - node-type: Scope 52 | type: Parallel 53 | 54 | subtree: 55 | - node-type: Tile 56 | type: Spatial 57 | factors: H=1 M=1 L=1 58 | permutation: HML 59 | target: L1 60 | 61 | subtree: 62 | - node-type: Tile 63 | type: Temporal 64 | factors: H=16 M=1 L=512 65 | permutation: HML 66 | target: L0 67 | 68 | subtree: 69 | - node-type: Op 70 | name: ProduceD 71 | binding: H:H M:M L:L 72 | - node-type: Scope 73 | type: Pipeline 74 | 75 | subtree: 76 | - node-type: Tile 77 | type: Spatial 78 
| factors: H=1 M=1 L=1 79 | permutation: HML 80 | target: L1 81 | 82 | subtree: 83 | - node-type: Tile 84 | type: Temporal 85 | factors: H=16 M=1 L=512 86 | permutation: HML 87 | target: L0 88 | 89 | subtree: 90 | - node-type: Op 91 | name: ProduceE 92 | binding: H:H M:M L:L 93 | - node-type: Scope 94 | type: Sequential 95 | 96 | subtree: 97 | - node-type: Tile 98 | type: Spatial 99 | factors: H=1 M=1 L=1 100 | permutation: HML 101 | target: L1 102 | 103 | subtree: 104 | - node-type: Tile 105 | type: Temporal 106 | factors: H=16 M=1 L=512 107 | permutation: HML 108 | target: L0 109 | 110 | subtree: 111 | - node-type: Op 112 | name: ProduceF 113 | binding: H:H M:M L:L 114 | - node-type: Scope 115 | type: Sequential 116 | 117 | subtree: 118 | - node-type: Tile 119 | type: Spatial 120 | factors: H=1 M=1 L=1 121 | permutation: HML 122 | target: L1 123 | 124 | subtree: 125 | - node-type: Tile 126 | type: Temporal 127 | factors: H=16 M=1 L=512 128 | permutation: HML 129 | target: L0 130 | 131 | subtree: 132 | - node-type: Op 133 | name: ProduceG 134 | binding: H:H M:M L:L 135 | - node-type: Tile 136 | type: Spatial 137 | factors: H=1 M=1 N=4 L=1 138 | permutation: HMNL 139 | target: L1 140 | 141 | subtree: 142 | - node-type: Tile 143 | type: Temporal 144 | factors: H=16 M=1 N=2 L=512 145 | permutation: HMNL 146 | target: L0 147 | 148 | subtree: 149 | - node-type: Op 150 | name: ProduceI 151 | binding: H:H M:M N:N L:L -------------------------------------------------------------------------------- /tests/cases/12-test-fused-cnn/script.py: -------------------------------------------------------------------------------- 1 | dataflows = [ 2 | 'naive', 3 | 'isos', 4 | 'fused-layer', 5 | 'tileflow', 6 | ] 7 | 8 | # dataflows = ['flash-attention'] #, 'flash-attention', 'flat', 'pipeline'] 9 | ''' 10 | CC1 64 112 112 192 128 11 | CC2 32 147 147 64 80 12 | CC3 64 56 56 128 64 13 | CC4 128 28 28 256 128 14 | CC5 16 227 227 64 16 15 | ''' 16 | shapes = [ 17 | # B H W C L K R S U V 
def main(
    prefix,
    objectives,
    dataflows,
    shapes,
    architectures
):
    """Run every (workload, dataflow, objective, architecture) combination.

    Creates the ``result/<prefix>/{pics,out,log,macro}`` directory tree,
    launches one ``worker`` process per combination, waits for all of them,
    then calls ``visualize`` once per objective.

    Args:
        prefix: Name of the experiment; results go under ``result/<prefix>``.
        objectives: Iterable of objective names passed to each worker.
        dataflows: Iterable of dataflow names.
        shapes: Iterable of ``(workload_name, shape_list)`` pairs.
        architectures: Iterable of architecture names.
    """
    # Create the tree with os.makedirs instead of shelling out to `mkdir -p`:
    # an interpolated prefix containing spaces or shell metacharacters would
    # otherwise be split or interpreted by the shell, and failures ignored.
    for sub_dir in ('pics', 'out', 'log', 'macro'):
        os.makedirs(os.path.join('result', prefix, sub_dir), exist_ok=True)

    procs = []

    # One process per combination; all are started before any is joined.
    for workload, shape in shapes:
        for dataflow in dataflows:
            for objective in objectives:
                for architecture in architectures:
                    p = multiprocessing.Process(target = worker,
                        args = [
                            f'result/{prefix}',
                            workload,
                            shape,
                            dataflow,
                            objective,
                            architecture])
                    procs.append(p)
                    p.start()

    for p in procs:
        p.join()

    for objective in objectives:
        visualize(f'result/{prefix}', dataflows, shapes, objective, architectures)
args = [group, key, other, metric, styles[other]]) 144 | proc.start() 145 | procs.append(proc) 146 | except: 147 | print (f'error with {key} {metric}') 148 | for proc in procs: proc.join() 149 | 150 | os.system('echo "#!/bin/bash" > error.sh') 151 | 152 | if __name__ == "__main__": 153 | configs = [ 154 | ["Standard", ['cycle', 'energy'], dataflows, shapes, ['cloud']], 155 | ] 156 | procs = [] 157 | for config in configs: 158 | p = multiprocessing.Process(target=main, args = config) 159 | procs.append(p) 160 | p.start() 161 | for p in procs: p.join() -------------------------------------------------------------------------------- /include/tileflow/mapper/checker.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "tileflow/mapping/mapping.hpp" 6 | 7 | using mapping::TileFlow::Node; 8 | using mapping::TileFlow::OpNode; 9 | using mapping::TileFlow::TileNode; 10 | using mapping::TileFlow::ScopeNode; 11 | using mapping::TileFlow::Visitor; 12 | 13 | namespace TileFlow { 14 | 15 | class ShapeConstraintParser: public Visitor { 16 | void visitTile(const TileNode*) override; 17 | void visitOp(const OpNode*) override; 18 | std::vector > > > 19 | exprs_; 20 | const problem::Workload& workload_; 21 | bool allow_mismatched_; 22 | 23 | std::vector constraints; 24 | public: 25 | ShapeConstraintParser(const problem::Workload& workload, bool allow_mismatched = false): 26 | workload_(workload), allow_mismatched_(allow_mismatched){} 27 | std::vector parse(const Node*root); 28 | 29 | }; 30 | 31 | class MemoryConstraintParser: public Visitor { 32 | void visitTile(const TileNode*) override; 33 | void visitOp(const OpNode*) override; 34 | void visitScope(const ScopeNode*) override; 35 | std::shared_ptr cal_footprint(const Node * node, unsigned pv); 36 | // std::vector > > factors_; 37 | std::vector > factors_; 38 | std::unordered_map > > node2factors_; 39 | const problem::Workload& workload_; 40 | const 
/**
 * Visitor that performs sanity checks on a mapping tree.
 *
 * Rules stated by the original author:
 *  - rule1: the product of the tiling factors should equal the shape;
 *  - rule2: a Spatial TileNode's child must be a Temporal TileNode;
 *  - rule3: each level should have at most one temporal tile node.
 *
 * NOTE(review): only the declarations are visible here; which of the rules
 * each visit* method enforces must be confirmed against the implementation.
 */
class SanityChecker: public mapping::TileFlow::Visitor {
  // NOTE(review): not initialized by the constructor below; presumably set
  // in run() before the traversal starts - TODO confirm.
  unsigned storage_level_;
  // Per-node-kind hooks invoked during the traversal.
  void visitTile(const TileNode*) override;
  void visitScope(const ScopeNode*) override;
  void visitOp(const OpNode*) override;
  // Architecture topology the mapping is checked against; held by reference,
  // so the caller must keep it alive for the lifetime of the checker.
  const model::Topology& topology_;
public:
  /// @param topology Topology to validate the mapping against (not copied).
  SanityChecker(const model::Topology& topology): topology_(topology){}
  /// Entry point: overrides Visitor::run for the tree rooted at the given node.
  void run(const Node*) override;
};
const mapping::TileFlow::Mapping& mapping_; 98 | const model::Topology& topology_; 99 | bool enable_mem_check_; 100 | bool enable_spatial_check_; 101 | bool enable_loopcount_check_; 102 | 103 | std::unordered_map< 104 | const Node*, std::unordered_map > init_scope_; 105 | 106 | std::vector constraints; 107 | void swap_spatial_scope(); 108 | void get_active_tensors(); 109 | void sanity_check(); 110 | void parse_constraints(); 111 | void get_shape_constraints(); 112 | void get_memory_constraints(); 113 | void get_resource_constraints(); 114 | void add_access_pattern( 115 | problem::Shape::DataSpaceID producer_id, 116 | const Node *producer, 117 | problem::Shape::DataSpaceID consumer_id, 118 | const Node *consumer); 119 | public: 120 | Checker(const problem::TileFlow::Workloads& workloads, 121 | const mapping::TileFlow::Mapping& mapping, 122 | const model::Topology& topology, 123 | bool enable_mem_check_ = true, 124 | bool enable_spatial_check_ = true, 125 | bool enable_loopcount_check_ = true) 126 | : workloads_(workloads), mapping_(mapping), topology_(topology), 127 | enable_mem_check_(enable_mem_check_), 128 | enable_spatial_check_(enable_spatial_check_), 129 | enable_loopcount_check_(enable_loopcount_check_){} 130 | const std::vector& get_constraints() const {return constraints;} 131 | void check(const SymbolTable* symbol_table = nullptr); 132 | void display(const SymbolTable* symbol_table = nullptr); 133 | }; // Mappert 134 | 135 | } // namespace TileFlow -------------------------------------------------------------------------------- /src/application/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "application/model.hpp" 6 | #include "compound-config/compound-config.hpp" 7 | #include "util/args.hpp" 8 | 9 | #include "tileflow/problem/problem.hpp" 10 | #include "tileflow/mapping/mapping.hpp" 11 | #include "tileflow/loop-analysis/nest-analysis.hpp" 12 | #include 
"tileflow/model/topology.hpp" 13 | #include "tileflow/mapper/checker.hpp" 14 | #include "tileflow/mapper/mapper.hpp" 15 | 16 | extern bool gTerminateEval; 17 | 18 | //--------------------------------------------// 19 | // MAIN // 20 | //--------------------------------------------// 21 | 22 | void show_energy( 23 | const model::TileFlow::Topology& topology, 24 | std::ostream& o = std::cout) { 25 | auto arith = topology.GetArithmeticLevel(); 26 | o << "==========AccessEnergy===========" << std::endl; 27 | o << "metric, energy" << std::endl; 28 | o << "Arith::energy_per_op," 29 | << arith->GetSpecs().op_energy_map.at("random_compute") << std::endl; 30 | for (unsigned i = 0; i < topology.NumStorageLevels(); i++){ 31 | auto buffer = topology.GetStorageLevel(i); 32 | auto& specs = buffer->GetSpecs(); 33 | o << "Buffer::" << buffer->Name() << "::energy_per_op::read," << specs.op_energy_map.at("random_read") << std::endl; 34 | o << "Buffer::" << buffer->Name() << "::energy_per_op::update," << specs.op_energy_map.at("random_update") << std::endl; 35 | o << "Buffer::" << buffer->Name() << "::energy_per_op::fill," << specs.op_energy_map.at("random_fill") << std::endl; 36 | } 37 | o << "========End AccessEnergy=========" << std::endl; 38 | } 39 | 40 | int main(int argc, char* argv[]) 41 | { 42 | assert(argc >= 2); 43 | 44 | std::vector input_files; 45 | std::string output_dir = "."; 46 | bool success = ParseArgs(argc, argv, input_files, output_dir); 47 | if (!success) 48 | { 49 | std::cerr << "ERROR: error parsing command line." 
<< std::endl; 50 | exit(1); 51 | } 52 | 53 | auto config = new config::CompoundConfig(input_files); 54 | 55 | auto root = config->getRoot(); 56 | 57 | if (root.exists("macro")) 58 | TileFlow::macros = root.lookup("macro"); 59 | 60 | if (root.exists("verbose")) 61 | root.lookupValue("verbose", TileFlow::verbose_level); 62 | 63 | auto problem = root.lookup("problem"); 64 | problem::TileFlow::Workloads workloads; 65 | 66 | config::CompoundConfigNode arch; 67 | 68 | if (root.exists("arch")) 69 | { 70 | arch = root.lookup("arch"); 71 | } 72 | else if (root.exists("architecture")) 73 | { 74 | arch = root.lookup("architecture"); 75 | } 76 | 77 | bool is_sparse_topology = root.exists("sparse_optimizations"); 78 | 79 | model::Engine::Specs arch_specs_ = model::Engine::ParseSpecs(arch, is_sparse_topology); 80 | 81 | if (root.exists("ERT")) 82 | { 83 | std::cout << "Found Accelergy ERT (energy reference table), replacing internal energy model." << std::endl; 84 | auto ert = root.lookup("ERT"); 85 | arch_specs_.topology.ParseAccelergyERT(ert); 86 | if (root.exists("ART")){ // Nellie: well, if the users have the version of Accelergy that generates ART 87 | auto art = root.lookup("ART"); 88 | arch_specs_.topology.ParseAccelergyART(art); 89 | } 90 | } 91 | 92 | std::cout << "Begin ParseWorkload..." 
<< std::endl; 93 | problem::TileFlow::ParseWorkloads(problem, workloads); 94 | problem::Workload::SetCurrShape(&workloads.get_shape()); 95 | 96 | if (TileFlow::verbose_level) 97 | workloads.Print(); 98 | 99 | model::TileFlow::Topology topology; 100 | 101 | for (unsigned storage_level_id = 0; storage_level_id < arch_specs_.topology.NumStorageLevels(); 102 | ++ storage_level_id){ 103 | auto buffer = arch_specs_.topology.GetStorageLevel(storage_level_id); 104 | TILEFLOW_COND_WARNING(buffer->size.IsSpecified(), "No memory size specified at " << buffer->name.Get()); 105 | if (verbose_level) { 106 | std::cout << buffer->name.Get() << ": "; 107 | std::cout << buffer->size.Get() << "words" << std::endl; 108 | } 109 | } 110 | 111 | std::cout << "Begin Spec..." << std::endl; 112 | topology.Spec(arch_specs_.topology); 113 | if (verbose_level) 114 | show_energy(topology,std::cout); 115 | 116 | auto mapping = 117 | mapping::TileFlow::ParseAndConstruct(root.lookup("mapping"), arch_specs_, workloads); 118 | 119 | if (TileFlow::verbose_level) 120 | mapping.Print(); 121 | 122 | bool enable_mem_check_ = true; 123 | bool enable_spatial_check_ = true; 124 | bool enable_loopcount_check_ = true; 125 | if (root.exists("check")) { 126 | auto checknode = root.lookup("check"); 127 | checknode.lookupValue("mem", enable_mem_check_); 128 | checknode.lookupValue("spatial", enable_spatial_check_); 129 | checknode.lookupValue("loopcount", enable_loopcount_check_); 130 | } 131 | 132 | TileFlow::Checker checker(workloads, mapping, topology 133 | , enable_mem_check_, enable_spatial_check_, enable_loopcount_check_); 134 | 135 | checker.check(); 136 | 137 | if (verbose_level) 138 | checker.display(); 139 | 140 | TileFlow::mapper::Objective obj = TileFlow::mapper::CYCLE; 141 | unsigned timeout = 600; 142 | unsigned topk = 1; 143 | std::string search_alg = "random"; 144 | if (root.exists("tileflow-mapper")) { 145 | auto mapper = root.lookup("tileflow-mapper"); 146 | std::string objective; 147 | if 
(mapper.lookupValue("objective", objective)){ 148 | if (objective == "cycle") obj = TileFlow::mapper::CYCLE; 149 | else if (objective == "energy") obj = TileFlow::mapper::ENERGY; 150 | } 151 | mapper.lookupValue("timeout", timeout); 152 | mapper.lookupValue("alg", search_alg); 153 | mapper.lookupValue("topk", topk); 154 | } 155 | 156 | TileFlow::mapper::Mapper mapper(checker.get_constraints(), workloads, mapping, arch_specs_, topology, obj, timeout, search_alg, topk); 157 | 158 | auto result = mapper.search(); 159 | assert(result); 160 | 161 | TILEFLOW_LOG("Verify result..."); 162 | checker.check(result); 163 | TILEFLOW_LOG("Check passed!"); 164 | 165 | mapper.report(); 166 | 167 | if (root.exists("output")) { 168 | std::string filename; 169 | root.lookupValue("output", filename); 170 | mapper.dump(filename); 171 | } 172 | 173 | return 0; 174 | } --------------------------------------------------------------------------------