├── .clang-format ├── .github ├── ISSUE_TEMPLATE │ ├── bug.md │ ├── feature-request.md │ └── question.md └── workflows │ ├── compiler-ci.yaml │ ├── daily_ci.yaml │ ├── e2e_test.yaml │ ├── format-check.yaml │ ├── onnx-frontend-ci.yaml │ ├── runtime-ci.yaml │ ├── tf-frontend-ci.yaml │ └── torch-frontend-ci.yaml ├── .gitignore ├── .gitmodules ├── CITATION.cff ├── CONTRIBUTING.md ├── LICENSE ├── NOTICE ├── README-zh_cn.md ├── README.md ├── compiler ├── .gitignore ├── README.md ├── cmake │ ├── CMakeLists.txt │ ├── MLIR.cmake │ └── mhlo.cmake ├── dialects │ ├── CMakeLists.txt │ ├── include │ │ ├── CMakeLists.txt │ │ └── byteir │ │ │ ├── CMakeLists.txt │ │ │ └── Dialect │ │ │ ├── Ace │ │ │ ├── AceBase.td │ │ │ ├── AceDialect.h │ │ │ ├── AceOps.td │ │ │ └── CMakeLists.txt │ │ │ ├── CMakeLists.txt │ │ │ └── Ccl │ │ │ ├── CMakeLists.txt │ │ │ └── IR │ │ │ ├── CMakeLists.txt │ │ │ ├── CclBase.td │ │ │ ├── CclOpInterface.td │ │ │ ├── CclOps.h │ │ │ └── CclOps.td │ └── lib │ │ ├── CMakeLists.txt │ │ └── Dialect │ │ ├── Ace │ │ ├── CMakeLists.txt │ │ └── IR │ │ │ └── AceDialect.cpp │ │ ├── CMakeLists.txt │ │ └── Ccl │ │ ├── CMakeLists.txt │ │ └── IR │ │ ├── CMakeLists.txt │ │ └── CclOps.cpp ├── doc │ ├── attention.md │ ├── byteir_hlo_custom_call.md │ ├── codegen.md │ ├── gpu.md │ ├── linalg.md │ ├── passes.md │ └── rng.md ├── include │ ├── CMakeLists.txt │ ├── byteir-c │ │ ├── Dialects.h │ │ ├── PDLValue.h │ │ ├── Passes.h │ │ └── Translation.h │ └── byteir │ │ ├── Analysis │ │ ├── Alias.h │ │ ├── DimFlag.h │ │ ├── Liveness.h │ │ ├── OpDependence.h │ │ ├── ShapeAnalysis.h │ │ ├── SideEffect.h │ │ ├── SymbolicShape.h │ │ └── UseRange.h │ │ ├── CMakeLists.txt │ │ ├── Conversion │ │ ├── CMakeLists.txt │ │ ├── Common │ │ │ └── FunctionSupport.h │ │ ├── FuncToByre │ │ │ └── FuncToByre.h │ │ ├── GPUToNVVM │ │ │ └── GPUToNVVM.h │ │ ├── HloToByreTensor │ │ │ ├── HloToByreCustom.h │ │ │ └── HloToByreTensor.h │ │ ├── HloToCat │ │ │ ├── ConvertHloToCat.h │ │ │ ├── FuseHloToCat.h │ │ │ └── 
HloToCat.h │ │ ├── HloToTensor │ │ │ └── ConvertHloToTensor.h │ │ ├── LcclToByre │ │ │ └── LcclToByre.h │ │ ├── MemrefToByre │ │ │ └── MemrefToByre.h │ │ ├── Passes.h │ │ ├── Passes.td │ │ ├── ToAIT │ │ │ └── ToAIT.h │ │ ├── ToAce │ │ │ └── MhloToAce.h │ │ ├── ToByre │ │ │ └── ToByre.h │ │ ├── ToGPU │ │ │ ├── ToGPU.h │ │ │ └── Utils.h │ │ ├── ToHlo │ │ │ └── ArithToMhlo.h │ │ ├── ToLLVM │ │ │ └── ToLLVM.h │ │ ├── ToLinalg │ │ │ └── ToLinalg.h │ │ └── ToPTX │ │ │ └── ToPTX.h │ │ ├── Dialect │ │ ├── Ace │ │ │ ├── CMakeLists.txt │ │ │ ├── Passes.h │ │ │ ├── Passes.td │ │ │ └── Transforms │ │ │ │ └── BufferizableOpInterfaceImpl.h │ │ ├── Affine │ │ │ ├── CMakeLists.txt │ │ │ ├── Passes.h │ │ │ ├── Passes.td │ │ │ └── Transforms │ │ │ │ ├── AffineLoopFusionEx.h │ │ │ │ ├── InsertTrivialAffineLoop.h │ │ │ │ └── RewriteAffineToMemref.h │ │ ├── Byre │ │ │ ├── ByreBase.td │ │ │ ├── ByreDialect.h │ │ │ ├── ByreOps.td │ │ │ ├── CMakeLists.txt │ │ │ ├── Common.h │ │ │ ├── Passes.h │ │ │ ├── Passes.td │ │ │ ├── Serialization.h │ │ │ ├── Serialization │ │ │ │ ├── ByreSerial.td │ │ │ │ ├── ByreSerialOps.h │ │ │ │ ├── CMakeLists.txt │ │ │ │ └── Versioning.h │ │ │ └── Transforms │ │ │ │ ├── BufferizableOpInterfaceImpl.h │ │ │ │ └── Serial.h │ │ ├── CMakeLists.txt │ │ ├── Cat │ │ │ ├── CMakeLists.txt │ │ │ └── IR │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── CatBase.td │ │ │ │ ├── CatDialect.h │ │ │ │ └── CatOps.td │ │ ├── Ccl │ │ │ ├── CMakeLists.txt │ │ │ ├── Passes.h │ │ │ ├── Passes.td │ │ │ ├── TransformOps │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── CclTransformOps.h │ │ │ │ └── CclTransformOps.td │ │ │ └── Transforms │ │ │ │ ├── CclBufferizeOpInterfaceImpl.h │ │ │ │ └── CclMoveDown.h │ │ ├── GPU │ │ │ ├── CMakeLists.txt │ │ │ ├── Passes.h │ │ │ ├── Passes.td │ │ │ ├── TransformOps │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── GPUExtTransformOps.h │ │ │ │ ├── GPUExtTransformOps.td │ │ │ │ └── Utils.h │ │ │ └── Transforms │ │ │ │ ├── GPUBlockSwizzle.h │ │ │ │ ├── GPUDistributeSharedMemoryCopy.h │ 
│ │ │ ├── GPUDistributeToWarp.h │ │ │ │ ├── GPUPackSharedMemoryAlloc.h │ │ │ │ ├── GPUTensorCoreVectorization.h │ │ │ │ ├── OptimizeVectorTransfer.h │ │ │ │ ├── RemoveTrivialLoops.h │ │ │ │ ├── Transforms.h │ │ │ │ └── Utils.h │ │ ├── Lace │ │ │ ├── CMakeLists.txt │ │ │ ├── LaceBase.td │ │ │ ├── LaceDialect.h │ │ │ └── LaceOps.td │ │ ├── Lccl │ │ │ ├── CMakeLists.txt │ │ │ ├── LcclBase.td │ │ │ ├── LcclOps.h │ │ │ └── LcclOps.td │ │ ├── Linalg │ │ │ ├── CMakeLists.txt │ │ │ ├── IR │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── LinalgExtBase.td │ │ │ │ ├── LinalgExtInterfaces.h │ │ │ │ ├── LinalgExtInterfaces.td │ │ │ │ ├── LinalgExtOps.h │ │ │ │ └── LinalgExtOps.td │ │ │ ├── Passes.h │ │ │ ├── Passes.td │ │ │ ├── TransformOps │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── LinalgExtTransformOps.h │ │ │ │ └── LinalgExtTransformOps.td │ │ │ ├── Transforms │ │ │ │ ├── BufferizableOpInterfaceImpl.h │ │ │ │ ├── Bufferize.h │ │ │ │ ├── CanonicalizeExt.h │ │ │ │ ├── FuseElementwise.h │ │ │ │ ├── HoistingExt.h │ │ │ │ ├── LinalgCollapseLoops.h │ │ │ │ ├── LinalgDataPlace.h │ │ │ │ ├── LinalgExtToLoops.h │ │ │ │ ├── LinalgPrefetch.h │ │ │ │ ├── LinalgPromotion.h │ │ │ │ ├── Tiling.h │ │ │ │ ├── TilingUtils.h │ │ │ │ └── Transforms.h │ │ │ └── Util │ │ │ │ └── Util.h │ │ ├── MemRef │ │ │ ├── CMakeLists.txt │ │ │ ├── Passes.h │ │ │ ├── Passes.td │ │ │ ├── Transforms │ │ │ │ ├── ApplyMemRefAffineLayout.h │ │ │ │ ├── ExtractAddressComputation.h │ │ │ │ ├── RemoveCopy.h │ │ │ │ ├── SimplifyLinearizedIndex.h │ │ │ │ └── SimplifyView.h │ │ │ └── Utils │ │ │ │ ├── Layout.h │ │ │ │ ├── MemEffect.h │ │ │ │ └── Ops.h │ │ ├── SCF │ │ │ ├── CMakeLists.txt │ │ │ ├── Passes.h │ │ │ ├── Passes.td │ │ │ ├── Transforms │ │ │ │ ├── ForallCollapsing.h │ │ │ │ ├── FuseNestedForall.h │ │ │ │ ├── InsertTrivialSCFLoop.h │ │ │ │ ├── RemoveSingleIterationLoop.h │ │ │ │ └── TilingInterfaceToSCFFor.h │ │ │ └── Util │ │ │ │ └── Util.h │ │ ├── Shape │ │ │ ├── CMakeLists.txt │ │ │ ├── IR │ │ │ │ ├── CMakeLists.txt │ │ │ 
│ ├── ShapeExtBase.td │ │ │ │ ├── ShapeExtOps.h │ │ │ │ └── ShapeExtOps.td │ │ │ ├── Passes.h │ │ │ ├── Passes.td │ │ │ └── Transforms │ │ │ │ ├── InsertInputShapeConstraint.h │ │ │ │ ├── InsertTieShape.h │ │ │ │ ├── ResolveShapeConstraint.h │ │ │ │ └── SetAssumingAlwaysTrue.h │ │ ├── Tensor │ │ │ ├── CMakeLists.txt │ │ │ ├── IR │ │ │ │ └── TilingInterfaceImpl.h │ │ │ ├── Passes.h │ │ │ ├── Passes.td │ │ │ └── Transforms │ │ │ │ ├── CanonicalizeExt.h │ │ │ │ ├── ExtractSliceSpecialization.h │ │ │ │ └── TensorPadSpecialization.h │ │ ├── Transform │ │ │ ├── CMakeLists.txt │ │ │ ├── IR │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── TransformExtOps.h │ │ │ │ └── TransformExtOps.td │ │ │ ├── Passes.h │ │ │ ├── Passes.td │ │ │ └── Transforms │ │ │ │ ├── TransformDialectInterpreter.h │ │ │ │ └── TransformInsertion.h │ │ ├── Vector │ │ │ ├── CMakeLists.txt │ │ │ └── Transforms │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── CanonicalizeExt.h │ │ │ │ ├── MoveForallRegionIntoWarpOp.h │ │ │ │ ├── Passes.h │ │ │ │ ├── Passes.td │ │ │ │ └── VectorWarpDistribute.h │ │ └── mhlo │ │ │ ├── Analysis │ │ │ ├── DimFromBroadcast.h │ │ │ └── ShapeAnalysis.h │ │ │ ├── CMakeLists.txt │ │ │ ├── DynamicShapeOpRegister │ │ │ └── Register.h │ │ │ ├── Passes.h │ │ │ ├── Passes.td │ │ │ ├── Transforms │ │ │ ├── BoundedShapeInference.h │ │ │ ├── CanonicalizeExt.h │ │ │ ├── ClusterConstraint.h │ │ │ ├── ConvertFuncToCustomCall.h │ │ │ ├── ConvertInsertion.h │ │ │ ├── ConvertOpToCustomCall.h │ │ │ ├── DTypeConversion.h │ │ │ ├── DecomposeMhloCustomCallOps.h │ │ │ ├── DynamicShapeClustering.h │ │ │ ├── FuncArgRearrangement.h │ │ │ ├── FuseBMMDimension.h │ │ │ ├── FusionOutlining.h │ │ │ ├── GenericFusionCommon.h │ │ │ ├── HloAggressiveFusion.h │ │ │ ├── HloFolder.h │ │ │ ├── HloFuser.h │ │ │ ├── HloMove.h │ │ │ ├── HloSimplify.h │ │ │ ├── InsertShapeConstraint.h │ │ │ ├── LayoutTransformation.h │ │ │ ├── MatmulLayoutTransform.h │ │ │ ├── MoveCommon.h │ │ │ ├── RewriteWithConstraint.h │ │ │ ├── ShapeReification.h │ 
│ │ ├── StaticShapeInference.h │ │ │ └── UnfuseBatchNorm.h │ │ │ └── Util │ │ │ ├── CustomCallUtil.h │ │ │ ├── FusionUtil.h │ │ │ ├── ShapeInferUtil.h │ │ │ └── Util.h │ │ ├── Pipelines │ │ ├── AffineOpt.h │ │ ├── AllOpt.h │ │ ├── BufferizeOpt.h │ │ ├── ByreHost.h │ │ ├── ByreOpt.h │ │ ├── ByreTensorOpt.h │ │ ├── CatFusionOpt.h │ │ ├── CatPreprocess.h │ │ ├── Common │ │ │ └── Utils.h │ │ ├── GPU │ │ │ ├── ElementwiseCodegen.h │ │ │ ├── GPUOpt.h │ │ │ ├── LinalgMemrefGPU.h │ │ │ ├── MappingForall.h │ │ │ ├── NVVMCodegen.h │ │ │ └── ReductionCodegen.h │ │ ├── HloFusionOpt.h │ │ ├── HloGraphOpt.h │ │ ├── Host │ │ │ ├── Codegen.h │ │ │ ├── HostOpt.h │ │ │ └── ToLLVM.h │ │ ├── InitAllPipelines.h │ │ ├── LinalgMemrefOpt.h │ │ ├── LinalgTensorOpt.h │ │ ├── SCFOpt.h │ │ └── ShapeOpt.h │ │ ├── Stat │ │ ├── AllocCnt │ │ │ └── AllocCnt.h │ │ ├── Common │ │ │ └── Reg.h │ │ ├── InitAllStats.h │ │ └── OpCnt │ │ │ └── OpCnt.h │ │ ├── Target │ │ ├── CUDA │ │ │ ├── CUDAEmitter.h │ │ │ └── ToCUDA.h │ │ ├── Common │ │ │ ├── Common.h │ │ │ └── EmitUtil.h │ │ ├── Cpp │ │ │ ├── CppEmitter.h │ │ │ └── ToCpp.h │ │ ├── LLVM │ │ │ └── ToLLVMBC.h │ │ └── PTX │ │ │ ├── Passes.h │ │ │ └── ToPTX.h │ │ ├── Transforms │ │ ├── AnchoredPipeline.h │ │ ├── ApplyPDLPatterns.h │ │ ├── Bufferize.h │ │ ├── CMAE.h │ │ ├── CMakeLists.txt │ │ ├── CanonicalizeExt.h │ │ ├── CollectFunc.h │ │ ├── CondCanonicalize.h │ │ ├── FuncTag.h │ │ ├── GenericDeviceConfig.h │ │ ├── GraphClusteringAlgo.h │ │ ├── GraphClusteringByDevice.h │ │ ├── InsertUniqueId.h │ │ ├── LoopTag.h │ │ ├── LoopUnroll.h │ │ ├── MemoryPlanning.h │ │ ├── ModuleTag.h │ │ ├── Passes.h │ │ ├── Passes.td │ │ ├── RemoveFuncBody.h │ │ ├── RewriteOpToStdCall.h │ │ ├── SetArgShape.h │ │ ├── SetSpace.h │ │ ├── ShapeFuncOutlining.h │ │ ├── TryCatchModulePipeline.h │ │ └── Utils.h │ │ └── Utils │ │ ├── AffineUtils.h │ │ ├── AttrUtils.h │ │ ├── FuncUtils.h │ │ ├── GraphUtils.h │ │ ├── HashUtils.h │ │ ├── Hoist.h │ │ ├── IRRewrite.h │ │ ├── LoopUtils.h │ │ 
├── MemUtils.h │ │ ├── ModuleUtils.h │ │ ├── OpInterfaceUtils.h │ │ ├── OptionUtils.h │ │ ├── PatternMatch.h │ │ ├── PipelineUtils.h │ │ ├── TileUtils.h │ │ ├── TypeUtils.h │ │ └── Utils.h ├── lib │ ├── Analysis │ │ ├── CMakeLists.txt │ │ ├── DimFlag.cpp │ │ ├── Liveness.cpp │ │ ├── OpDependence.cpp │ │ ├── ShapeAnalysis.cpp │ │ ├── SideEffect.cpp │ │ ├── SymbolicShape.cpp │ │ └── UseRange.cpp │ ├── CAPI │ │ ├── CMakeLists.txt │ │ ├── Dialects.cpp │ │ ├── PDLValue.cpp │ │ ├── Passes.cpp │ │ └── Translation.cpp │ ├── CMakeLists.txt │ ├── Conversion │ │ ├── CMakeLists.txt │ │ ├── Common │ │ │ ├── CMakeLists.txt │ │ │ └── FunctionSupport.cpp │ │ ├── FuncToByre │ │ │ ├── CMakeLists.txt │ │ │ └── FuncToByre.cpp │ │ ├── GPUToNVVM │ │ │ ├── CMakeLists.txt │ │ │ └── GPUToNVVM.cpp │ │ ├── HloToByreTensor │ │ │ ├── CMakeLists.txt │ │ │ ├── HloToByreCustom.cpp │ │ │ └── HloToByreTensor.cpp │ │ ├── HloToCat │ │ │ ├── CMakeLists.txt │ │ │ ├── ConvertHloToCat.cpp │ │ │ ├── FuseHloToCat.cpp │ │ │ ├── FuseHloToCatPattern.td │ │ │ ├── HloToCat.cpp │ │ │ └── Utils.h │ │ ├── HloToTensor │ │ │ ├── CMakeLists.txt │ │ │ └── ConvertHloToTensor.cpp │ │ ├── LcclToByre │ │ │ ├── CMakeLists.txt │ │ │ └── LcclToByre.cpp │ │ ├── MemrefToByre │ │ │ ├── CMakeLists.txt │ │ │ └── MemrefToByre.cpp │ │ ├── PassDetail.h │ │ ├── ToAIT │ │ │ ├── CMakeLists.txt │ │ │ └── GenAITConfig.cpp │ │ ├── ToAce │ │ │ ├── CMakeLists.txt │ │ │ ├── MhloToAce.cpp │ │ │ └── MhloToAceActivationPattern.td │ │ ├── ToByre │ │ │ ├── CMakeLists.txt │ │ │ └── ToByre.cpp │ │ ├── ToGPU │ │ │ ├── CMakeLists.txt │ │ │ ├── CoalescedForToGPU.cpp │ │ │ ├── FuncToGPU.cpp │ │ │ └── Utils.cpp │ │ ├── ToHlo │ │ │ ├── ArithToMhlo.cpp │ │ │ ├── ArithToMhloPattern.td │ │ │ └── CMakeLists.txt │ │ ├── ToLLVM │ │ │ ├── CMakeLists.txt │ │ │ ├── CollectFuncToLLVM.cpp │ │ │ └── GenLLVMConfig.cpp │ │ ├── ToLinalg │ │ │ ├── CMakeLists.txt │ │ │ ├── HloToLinalg.cpp │ │ │ ├── LinalgExtToLinalg.cpp │ │ │ ├── MemrefCopyToLinalg.cpp │ │ │ ├── 
TensorToLinalg.cpp │ │ │ └── UnrealizedCastToLinalg.cpp │ │ └── ToPTX │ │ │ ├── CMakeLists.txt │ │ │ ├── CollectGPUKernel.cpp │ │ │ └── GenPTXConfig.cpp │ ├── Dialect │ │ ├── Ace │ │ │ ├── CMakeLists.txt │ │ │ └── Transforms │ │ │ │ ├── BufferizableOpInterfaceImpl.cpp │ │ │ │ ├── Bufferize.cpp │ │ │ │ └── PassDetail.h │ │ ├── Affine │ │ │ ├── CMakeLists.txt │ │ │ └── Transforms │ │ │ │ ├── AffineLoopFusionEx.cpp │ │ │ │ ├── InsertTrivialAffineLoop.cpp │ │ │ │ ├── PassDetail.h │ │ │ │ └── RewriteAffineToMemref.cpp │ │ ├── Byre │ │ │ ├── CMakeLists.txt │ │ │ ├── IR │ │ │ │ ├── ByreDialect.cpp │ │ │ │ ├── Common.cpp │ │ │ │ ├── Serialization.cpp │ │ │ │ └── Serialization │ │ │ │ │ ├── ByreSerialOps.cpp │ │ │ │ │ ├── Bytecode.cpp │ │ │ │ │ ├── Bytecode.h │ │ │ │ │ └── Versioning.cpp │ │ │ └── Transforms │ │ │ │ ├── BufferizableOpInterfaceImpl.cpp │ │ │ │ ├── PassDetail.h │ │ │ │ └── Serial.cpp │ │ ├── CMakeLists.txt │ │ ├── Cat │ │ │ ├── CMakeLists.txt │ │ │ └── IR │ │ │ │ ├── CMakeLists.txt │ │ │ │ └── CatDialect.cpp │ │ ├── Ccl │ │ │ ├── CMakeLists.txt │ │ │ ├── TransformOps │ │ │ │ ├── CMakeLists.txt │ │ │ │ └── CclTransformOps.cpp │ │ │ └── Transforms │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── CclBufferizeOpInterfaceImpl.cpp │ │ │ │ ├── CclMoveDown.cpp │ │ │ │ └── PassDetail.h │ │ ├── GPU │ │ │ ├── CMakeLists.txt │ │ │ ├── TransformOps │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── GPUExtTransformOps.cpp │ │ │ │ └── Utils.cpp │ │ │ └── Transforms │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── GPUBlockSwizzle.cpp │ │ │ │ ├── GPUDistributeSharedMemoryCopy.cpp │ │ │ │ ├── GPUDistributeToWarp.cpp │ │ │ │ ├── GPUPackSharedMemoryAlloc.cpp │ │ │ │ ├── GPUTensorCoreVectorization.cpp │ │ │ │ ├── OptimizeVectorTransfer.cpp │ │ │ │ ├── PassDetail.h │ │ │ │ ├── RemoveTrivialLoops.cpp │ │ │ │ ├── ShmAllocaToWorkgroupArg.cpp │ │ │ │ └── Utils.cpp │ │ ├── Lace │ │ │ ├── CMakeLists.txt │ │ │ └── IR │ │ │ │ └── LaceDialect.cpp │ │ ├── Lccl │ │ │ ├── CMakeLists.txt │ │ │ └── IR │ │ │ │ ├── 
CMakeLists.txt │ │ │ │ └── LcclOps.cpp │ │ ├── Linalg │ │ │ ├── CMakeLists.txt │ │ │ ├── IR │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── LinalgExtInterfaces.cpp │ │ │ │ └── LinalgExtOps.cpp │ │ │ ├── TransformOps │ │ │ │ ├── CMakeLists.txt │ │ │ │ └── LinalgExtTransformOps.cpp │ │ │ ├── Transforms │ │ │ │ ├── BufferizableOpInterfaceImpl.cpp │ │ │ │ ├── Bufferize.cpp │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── CanonicalizeExt.cpp │ │ │ │ ├── FuseElementwise.cpp │ │ │ │ ├── HoistingExt.cpp │ │ │ │ ├── LinalgCollapseLoops.cpp │ │ │ │ ├── LinalgDataPlace.cpp │ │ │ │ ├── LinalgExtToLoops.cpp │ │ │ │ ├── LinalgGeneralizationExt.cpp │ │ │ │ ├── LinalgPrefetch.cpp │ │ │ │ ├── LinalgPromotion.cpp │ │ │ │ ├── PassDetail.h │ │ │ │ ├── ScopeTiling.cpp │ │ │ │ ├── Tiling.cpp │ │ │ │ ├── TilingUtils.cpp │ │ │ │ └── Transforms.cpp │ │ │ └── Util │ │ │ │ ├── CMakeLists.txt │ │ │ │ └── Util.cpp │ │ ├── MemRef │ │ │ ├── CMakeLists.txt │ │ │ ├── Transforms │ │ │ │ ├── ApplyMemRefAffineLayout.cpp │ │ │ │ ├── ExtractAddressComputation.cpp │ │ │ │ ├── PassDetail.h │ │ │ │ ├── RemoveCopy.cpp │ │ │ │ ├── SimplifyLinearizedIndex.cpp │ │ │ │ └── SimplifyView.cpp │ │ │ └── Utils │ │ │ │ ├── Layout.cpp │ │ │ │ ├── MemEffect.cpp │ │ │ │ └── Ops.cpp │ │ ├── SCF │ │ │ ├── CMakeLists.txt │ │ │ ├── Transforms │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── ForallCollapsing.cpp │ │ │ │ ├── FuseNestedForall.cpp │ │ │ │ ├── InsertTrivialSCFLoop.cpp │ │ │ │ ├── PassDetail.h │ │ │ │ ├── RemoveSingleIterationLoop.cpp │ │ │ │ └── TilingInterfaceToSCFFor.cpp │ │ │ └── Util │ │ │ │ ├── CMakeLists.txt │ │ │ │ └── Util.cpp │ │ ├── Shape │ │ │ ├── CMakeLists.txt │ │ │ ├── IR │ │ │ │ └── ShapeExtOps.cpp │ │ │ └── Transforms │ │ │ │ ├── InsertInputShapeConstraint.cpp │ │ │ │ ├── InsertTieShape.cpp │ │ │ │ ├── PassDetail.h │ │ │ │ ├── ResolveShapeConstraint.cpp │ │ │ │ └── SetAssumingAlwaysTrue.cpp │ │ ├── Tensor │ │ │ ├── CMakeLists.txt │ │ │ ├── IR │ │ │ │ ├── CMakeLists.txt │ │ │ │ └── TilingInterfaceImpl.cpp │ │ │ └── 
Transforms │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── CanonicalizeExt.cpp │ │ │ │ ├── ExtractSliceSpecialization.cpp │ │ │ │ ├── PassDetail.h │ │ │ │ └── TensorPadSpecialization.cpp │ │ ├── Transform │ │ │ ├── CMakeLists.txt │ │ │ ├── IR │ │ │ │ ├── CMakeLists.txt │ │ │ │ └── TransformExtOps.cpp │ │ │ └── Transforms │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── PassDetail.h │ │ │ │ ├── TransformDialectInterpreter.cpp │ │ │ │ └── TransformInsertion.cpp │ │ ├── Vector │ │ │ ├── CMakeLists.txt │ │ │ └── Transforms │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── CanonicalizeExt.cpp │ │ │ │ ├── MoveForallRegionIntoWarpOp.cpp │ │ │ │ ├── PassDetail.h │ │ │ │ ├── VectorLowerings.cpp │ │ │ │ └── VectorWarpDistribute.cpp │ │ └── mhlo │ │ │ ├── Analysis │ │ │ ├── DimFromBroadcast.cpp │ │ │ └── ShapeAnalysis.cpp │ │ │ ├── CMakeLists.txt │ │ │ ├── DynamicShapeOpRegister │ │ │ ├── AddN.cpp │ │ │ ├── BatchMatMul.cpp │ │ │ ├── Concatenate.cpp │ │ │ ├── Convolution.cpp │ │ │ ├── DotLike.cpp │ │ │ ├── DynamicBroadcastInDim.cpp │ │ │ ├── DynamicPartition.cpp │ │ │ ├── DynamicStitchLike.cpp │ │ │ ├── Einsum.cpp │ │ │ ├── GeLU.cpp │ │ │ ├── LayerNorm.cpp │ │ │ ├── NonZero.cpp │ │ │ ├── OneHot.cpp │ │ │ ├── RealDynamicSlice.cpp │ │ │ ├── Reduce.cpp │ │ │ ├── Repeat.cpp │ │ │ ├── ReshapeLike.cpp │ │ │ ├── ScatterNd.cpp │ │ │ ├── Softmax.cpp │ │ │ ├── StridedSlice.cpp │ │ │ └── TorchIndexSelect.cpp │ │ │ ├── Transforms │ │ │ ├── BoundedShapeInference.cpp │ │ │ ├── CanonicalizeExt.cpp │ │ │ ├── ClusterConstraint.cpp │ │ │ ├── ConvBackwardFusion.cpp │ │ │ ├── ConvForwardFusion.cpp │ │ │ ├── ConvertFuncToCustomCall.cpp │ │ │ ├── ConvertInsertion.cpp │ │ │ ├── ConvertOpToCustomCall.cpp │ │ │ ├── DTypeConversion.cpp │ │ │ ├── DecomposeMhloCustomCallOps.cpp │ │ │ ├── DynamicShapeClustering.cpp │ │ │ ├── FuncArgRearrangement.cpp │ │ │ ├── FuseBMMDimension.cpp │ │ │ ├── FuseTransposeIntoDotGeneral.cpp │ │ │ ├── FusionOutlining.cpp │ │ │ ├── GenericFusion.cpp │ │ │ ├── HloFolder.cpp │ │ │ ├── HloMoveDown.cpp │ │ │ 
├── HloMoveUp.cpp │ │ │ ├── HloSimplify.cpp │ │ │ ├── IOConvertFusion.cpp │ │ │ ├── InsertShapeConstraint.cpp │ │ │ ├── LayoutTransformation.cpp │ │ │ ├── MatmulLayoutTransform.cpp │ │ │ ├── PassDetail.h │ │ │ ├── ReduceWindowFusion.cpp │ │ │ ├── RewriteWithConstraint.cpp │ │ │ ├── ShapeReification.cpp │ │ │ ├── StaticShapeInference.cpp │ │ │ ├── TrivialFusion.cpp │ │ │ └── UnfuseBatchNorm.cpp │ │ │ └── Util │ │ │ ├── FusionUtil.cpp │ │ │ ├── ShapeInferUtil.cpp │ │ │ └── Util.cpp │ ├── Pipelines │ │ ├── AffineOpt.cpp │ │ ├── AllOpt.cpp │ │ ├── BufferizeOpt.cpp │ │ ├── ByreHost.cpp │ │ ├── ByreOpt.cpp │ │ ├── ByreTensorOpt.cpp │ │ ├── CMakeLists.txt │ │ ├── CatFusionOpt.cpp │ │ ├── CatPreprocess.cpp │ │ ├── Common │ │ │ ├── CMakeLists.txt │ │ │ └── Utils.cpp │ │ ├── GPU │ │ │ ├── CMakeLists.txt │ │ │ ├── ElementwiseCodegen.cpp │ │ │ ├── GPUOpt.cpp │ │ │ ├── LinalgMemrefGPU.cpp │ │ │ ├── MappingForall.cpp │ │ │ ├── NVVMCodegen.cpp │ │ │ └── ReductionCodegen.cpp │ │ ├── HloFusionOpt.cpp │ │ ├── HloGraphOpt.cpp │ │ ├── Host │ │ │ ├── CMakeLists.txt │ │ │ ├── Codegen.cpp │ │ │ ├── HostOpt.cpp │ │ │ └── ToLLVM.cpp │ │ ├── LinalgMemrefOpt.cpp │ │ ├── LinalgTensorOpt.cpp │ │ ├── SCFOpt.cpp │ │ └── ShapeOpt.cpp │ ├── Stat │ │ ├── AllocCnt │ │ │ ├── AllocCnt.cpp │ │ │ └── CMakeLists.txt │ │ ├── CMakeLists.txt │ │ ├── Common │ │ │ ├── CMakeLists.txt │ │ │ └── Reg.cpp │ │ └── OpCnt │ │ │ ├── CMakeLists.txt │ │ │ └── OpCnt.cpp │ ├── Target │ │ ├── CMakeLists.txt │ │ ├── CUDA │ │ │ ├── CMakeLists.txt │ │ │ ├── TranslateRegistration.cpp │ │ │ └── TranslateToCUDA.cpp │ │ ├── Cpp │ │ │ ├── CMakeLists.txt │ │ │ ├── TranslateRegistration.cpp │ │ │ └── TranslateToCpp.cpp │ │ ├── LLVM │ │ │ ├── CMakeLists.txt │ │ │ └── TranslateRegistration.cpp │ │ └── PTX │ │ │ ├── CMakeLists.txt │ │ │ ├── GPUKernelToPTX.cpp │ │ │ ├── TranslateRegistration.cpp │ │ │ └── TranslateToPTX.cpp │ ├── Transforms │ │ ├── AnchoredPipeline.cpp │ │ ├── ApplyPDLPatterns.cpp │ │ ├── Bufferize.cpp │ │ ├── CMAE.cpp 
│ │ ├── CMakeLists.txt │ │ ├── CanonicalizeExt.cpp │ │ ├── CollectFunc.cpp │ │ ├── CondCanonicalize.cpp │ │ ├── FuncTag.cpp │ │ ├── GenericDeviceConfig.cpp │ │ ├── GraphClusteringByDevice.cpp │ │ ├── InsertUniqueId.cpp │ │ ├── LoopTag.cpp │ │ ├── LoopUnroll.cpp │ │ ├── MemoryPlanning.cpp │ │ ├── ModuleTag.cpp │ │ ├── PassDetail.h │ │ ├── RemoveFuncBody.cpp │ │ ├── RewriteOpToStdCall.cpp │ │ ├── SetArgShape.cpp │ │ ├── SetSpace.cpp │ │ ├── ShapeFuncOutlining.cpp │ │ ├── TryCatchModulePipeline.cpp │ │ └── Utils.cpp │ └── Utils │ │ ├── AffineUtils.cpp │ │ ├── AttrUtils.cpp │ │ ├── CMakeLists.txt │ │ ├── FuncUtils.cpp │ │ ├── GraphUtils.cpp │ │ ├── Hoist.cpp │ │ ├── IRRewrite.cpp │ │ ├── LoopUtils.cpp │ │ ├── MemUtils.cpp │ │ ├── ModuleUtils.cpp │ │ ├── OpInterfaceUtils.cpp │ │ ├── OptionUtils.cpp │ │ ├── PatternMatch.cpp │ │ ├── PipelineUtils.cpp │ │ ├── TileUtils.cpp │ │ ├── TypeUtils.cpp │ │ └── Utils.cpp ├── numerical │ ├── CMakeLists.txt │ ├── hlo │ │ ├── canonicalize_ext.mlir │ │ ├── conv_bn.mlir │ │ ├── dot_bn.mlir │ │ ├── hlo_fold.mlir │ │ ├── hlo_move_down.mlir │ │ ├── hlo_simplify.mlir │ │ ├── numerical_test.py │ │ ├── slice_move_down_and_merge.mlir │ │ └── test_broadcast_dense_elements_attr.mlir │ ├── lit.cfg.py │ └── lit.site.cfg.py.in ├── python │ ├── ByteIRModules.cpp │ ├── CMakeLists.txt │ ├── byteir │ │ ├── README.md │ │ ├── __init__.py │ │ ├── _backend_registry.py │ │ ├── _mlir_libs │ │ │ └── _site_initialize_0.py │ │ ├── compile.py │ │ ├── dialects │ │ │ ├── CatOps.td │ │ │ └── cat │ │ │ │ ├── __init__.py │ │ │ │ ├── ait_cache.py │ │ │ │ ├── ir_processor.py │ │ │ │ └── ir_translator │ │ │ │ ├── __init__.py │ │ │ │ ├── ait_builder.py │ │ │ │ ├── backend │ │ │ │ ├── __init__.py │ │ │ │ └── ait_registry.py │ │ │ │ └── translator.py │ │ ├── pattern_matches.py │ │ ├── tools │ │ │ ├── cat_executor.py │ │ │ └── compiler.py │ │ └── utils.py │ ├── gen_version.py │ ├── setup.py │ ├── test │ │ ├── CMakeLists.txt │ │ ├── api │ │ │ ├── test_pattern_matches.py │ │ 
│ └── test_py_api.py │ │ ├── dialects │ │ │ └── cat │ │ │ │ └── ait │ │ │ │ ├── numerical │ │ │ │ ├── layernorm.mlir │ │ │ │ ├── matmul.mlir │ │ │ │ ├── permute021.mlir │ │ │ │ ├── permute0213.mlir │ │ │ │ ├── permute0312.mlir │ │ │ │ ├── permute10.mlir │ │ │ │ └── softmax_f16.mlir │ │ │ │ └── profile │ │ │ │ └── matmul.mlir │ │ ├── lit.cfg.py │ │ └── lit.site.cfg.py.in │ └── version.txt ├── scripts │ ├── README.md │ ├── gen_testcases.py │ ├── gen_testcases_and_check_diff.sh │ └── sync_to_runtime.sh ├── test │ ├── Analysis │ │ ├── testPrintArgSideEffect.mlir │ │ ├── testPrintLiveness.mlir │ │ ├── testPrintShapeAnalysis.mlir │ │ ├── testPrintSymbolicShape.mlir │ │ └── testPrintUseRange.mlir │ ├── CMakeLists.txt │ ├── CPURunner │ │ ├── gelu.mlir │ │ ├── repeatCustomCall.mlir │ │ └── scatterTiling.mlir │ ├── Conversion │ │ ├── FuncToByre │ │ │ └── func_to_byre_tensor.mlir │ │ ├── HloToByreTensor │ │ │ └── compute_ops.mlir │ │ ├── HloToCat │ │ │ ├── basic_ops.mlir │ │ │ └── fused_ops.mlir │ │ ├── HloToTensor │ │ │ └── scatter_to_insertslice.mlir │ │ ├── LcclToByre │ │ │ └── lcclToByre.mlir │ │ ├── MemrefToByre │ │ │ └── memref_to_byre.mlir │ │ ├── ToAce │ │ │ └── mhloToAceActivation.mlir │ │ ├── ToByre │ │ │ ├── convertFuncAndCallToByre.mlir │ │ │ └── convertMemRefToByre.mlir │ │ ├── ToCUDAGPU │ │ │ ├── fusionGPUToNVVM.mlir │ │ │ ├── fusionGPUToNVVMBarePtr.mlir │ │ │ └── fusionHloToGPU.mlir │ │ ├── ToGPU │ │ │ ├── coalescedForToGPU.mlir │ │ │ └── funcToGPU.mlir │ │ ├── ToHlo │ │ │ └── arithConstToMhlo.mlir │ │ ├── ToLinalg │ │ │ ├── LinalgExtToLinalg.mlir │ │ │ ├── TesnorToLinalg.mlir │ │ │ ├── fusionHlo.mlir │ │ │ ├── hloConvertToLinalg.mlir │ │ │ ├── memrefcopyToLinalg.mlir │ │ │ ├── primitiveOpsHlo.mlir │ │ │ ├── reducef16.mlir │ │ │ ├── repeatCustomCallToLinalg.mlir │ │ │ ├── rngCustomCallToLinalg.mlir │ │ │ ├── simpleHlo.mlir │ │ │ └── unrealizedCastToLinalg.mlir │ │ ├── ToPTX │ │ │ ├── genPTXConfig.mlir │ │ │ └── genPTXConfigBarePtr.mlir │ │ └── VectorToGPU │ │ │ 
└── existing-vector-to-mma-ops.mlir │ ├── Dialect │ │ ├── Ace │ │ │ ├── attrs.mlir │ │ │ ├── bufferize.mlir │ │ │ ├── canonicalize.mlir │ │ │ └── ops.mlir │ │ ├── Affine │ │ │ ├── affineLoopFusionEx.mlir │ │ │ ├── affineToMemRef.mlir │ │ │ └── insertTrivialAffineLoop.mlir │ │ ├── Byre │ │ │ ├── Serialization │ │ │ │ ├── Compatibility │ │ │ │ │ ├── version_1_0_0.mlir │ │ │ │ │ ├── version_1_0_0.mlir.bc │ │ │ │ │ ├── version_1_0_0.mlir.bc.v0 │ │ │ │ │ ├── version_1_0_0_alloc.mlir │ │ │ │ │ └── version_1_0_0_alloc.mlir.bc │ │ │ │ └── round_trip.mlir │ │ │ ├── bert_transformer.mlir │ │ │ ├── buffer_ops.mlir │ │ │ ├── bufferize.mlir │ │ │ ├── canonicalize.mlir │ │ │ ├── interface.mlir │ │ │ └── invalid.mlir │ │ ├── Cat │ │ │ └── ops.mlir │ │ ├── Ccl │ │ │ ├── ccl_bufferize.mlir │ │ │ ├── ccl_canonicalize.mlir │ │ │ ├── ccl_move_down.mlir │ │ │ ├── decompose_all_reduce.mlir │ │ │ ├── invalid.mlir │ │ │ └── ops.mlir │ │ ├── GPU │ │ │ ├── gpu-block-swizzle.mlir │ │ │ ├── gpu-distribute-shared-memory-copy.mlir │ │ │ ├── gpu-distributed-to-warp.mlir │ │ │ ├── gpu-pack-shared-memory-alloc.mlir │ │ │ ├── gpu-tensorcore-vectorization.mlir │ │ │ ├── optimize-vector-transfer.mlir │ │ │ ├── remove-trivial-loops.mlir │ │ │ ├── transform-gpu-failing.mlir │ │ │ ├── transform-map-forall-to-blocks.mlir │ │ │ └── transform-map-nested-forall-to-threads.mlir │ │ ├── Lace │ │ │ ├── invalid.mlir │ │ │ └── ops.mlir │ │ ├── Linalg │ │ │ ├── annotate.mlir │ │ │ ├── bufferize.mlir │ │ │ ├── canonicalizeExt.mlir │ │ │ ├── dataPlace-lagacy.mlir │ │ │ ├── dataPlace-tensor.mlir │ │ │ ├── extension.mlir │ │ │ ├── fuse-attention-upstream.mlir │ │ │ ├── fuse-attention.mlir │ │ │ ├── generalization.mlir │ │ │ ├── linalg-collapse-loops.mlir │ │ │ ├── linalg-fuse-elementwise-ext-existing.mlir │ │ │ ├── linalg-fuse-elementwise-ext.mlir │ │ │ ├── linalg-promotion-epilogue-fusion.mlir │ │ │ ├── linalg-promotion.mlir │ │ │ ├── linalgExtToLoops.mlir │ │ │ ├── opTiling1.mlir │ │ │ ├── opTiling2.mlir │ │ │ ├── 
prefetch.mlir │ │ │ ├── scopeTiling3SplitK-dev.mlir │ │ │ ├── transform-dev.mlir │ │ │ ├── transform-lower-to-loops.mlir │ │ │ ├── transform-op-collapse-dims.mlir │ │ │ ├── transform-op-fold-unit-extent-dims.mlir │ │ │ ├── transform-op-fuse-dev.mlir │ │ │ ├── transform-op-fuse-into-containing.mlir │ │ │ ├── transform-op-fuse-multi-root.mlir │ │ │ ├── transform-op-fuse.mlir │ │ │ ├── transform-op-shared-out-to-dist-style.mlir │ │ │ ├── transform-op-tile-ext.mlir │ │ │ ├── transform-op-tile-loop-hint.mlir │ │ │ └── transform-op-tile-reduction-parallel.mlir │ │ ├── MemRef │ │ │ ├── canonicalize.mlir │ │ │ ├── layout.mlir │ │ │ ├── removeCopy.mlir │ │ │ └── simplifyView.mlir │ │ ├── Mhlo │ │ │ ├── fusion.mlir │ │ │ ├── multi_return.mlir │ │ │ ├── reduce.mlir │ │ │ ├── simple.mlir │ │ │ └── transforms │ │ │ │ ├── ConvBNFolder.mlir │ │ │ │ ├── ConvBackwardFusion.mlir │ │ │ │ ├── ConvBiasActFusion.mlir │ │ │ │ ├── ConvertOpToCustomCall.mlir │ │ │ │ ├── DecomposeMhloCustomCallOps.mlir │ │ │ │ ├── IOConvertFusion.mlir │ │ │ │ ├── LayoutTransformation.mlir │ │ │ │ ├── RewriteWithConstraint.mlir │ │ │ │ ├── SliceMoveDownAndMerge.mlir │ │ │ │ ├── TestBroadcastDenseElementsAttr.mlir │ │ │ │ ├── TestConvertFuncToCustomCall.mlir │ │ │ │ ├── TestConvertInsertion.mlir │ │ │ │ ├── TestCustomConvert.mlir │ │ │ │ ├── TestDTypeConversion.mlir │ │ │ │ ├── TestDTypeConversionModifyFunc.mlir │ │ │ │ ├── TestFuncArgRearrangement.mlir │ │ │ │ ├── aggressiveFusion.mlir │ │ │ │ ├── canonicalize │ │ │ │ ├── arithOptimize.mlir │ │ │ │ ├── dynamicGather.mlir │ │ │ │ └── transposeFolder.mlir │ │ │ │ ├── clusterConstraint.mlir │ │ │ │ ├── concatSliceFusion.mlir │ │ │ │ ├── elementFusion.mlir │ │ │ │ ├── expandHloTuples.mlir │ │ │ │ ├── fuseBMMDimension.mlir │ │ │ │ ├── fuseTransposeIntoDotGeneral.mlir │ │ │ │ ├── fusionOutlining.mlir │ │ │ │ ├── hloFolder.mlir │ │ │ │ ├── hloMoveDown.mlir │ │ │ │ ├── hloMoveUp.mlir │ │ │ │ ├── hloSimplify.mlir │ │ │ │ ├── insertShapeConstraint.mlir │ │ │ │ ├── 
matmulEpilogueFusion.mlir │ │ │ │ ├── matmulLayoutTransform.mlir │ │ │ │ ├── mhloFlattenTuple.mlir │ │ │ │ └── reduceFusion.mlir │ │ ├── SCF │ │ │ ├── forallCollapsing.mlir │ │ │ ├── fuseNestedForall.mlir │ │ │ ├── insertTrivialSCFLoop.mlir │ │ │ └── moveForallRegionIntoWarpOp.mlir │ │ ├── Shape │ │ │ ├── insertInputShapeConstraint.mlir │ │ │ ├── insertTieShape.mlir │ │ │ ├── resolveShapeConstraint.mlir │ │ │ └── setAssumingAlwaysTrue.mlir │ │ ├── Tensor │ │ │ └── canonicalizeExt.mlir │ │ ├── Transform │ │ │ ├── canonicalize.mlir │ │ │ ├── cleanup.mlir │ │ │ ├── detensorizeInsertion.mlir │ │ │ ├── dump.mlir │ │ │ ├── transformDialectInterpreter.mlir │ │ │ └── transformInsertion.mlir │ │ └── Vector │ │ │ └── canonicalizeExt.mlir │ ├── E2E │ │ ├── CUDA │ │ │ ├── AliasLikeGPU │ │ │ │ ├── 10b_ptx_codegen.mlir │ │ │ │ ├── 1_hlo_opt.mlir │ │ │ │ ├── 2_linalg_tensor_opt.mlir │ │ │ │ ├── 3_byre_tensor_opt.mlir │ │ │ │ ├── 4_bufferize_opt.mlir │ │ │ │ ├── 5_alternative_scf_opt.mlir │ │ │ │ ├── 6_gpu_opt.mlir │ │ │ │ ├── 7_set_space_opt.mlir │ │ │ │ ├── 8_byre_opt.mlir │ │ │ │ ├── 9a_byre_host.mlir │ │ │ │ ├── 9b_nvvm_codegen.mlir │ │ │ │ ├── device_output.ptx │ │ │ │ ├── host_output.mlir │ │ │ │ ├── input.mlir │ │ │ │ └── template.py │ │ │ ├── BertTiny │ │ │ │ ├── BW │ │ │ │ │ └── input.mlir │ │ │ │ └── FW │ │ │ │ │ ├── 1_hlo_opt.mlir │ │ │ │ │ └── input.mlir │ │ │ ├── CclInference │ │ │ │ └── input.mlir │ │ │ ├── MLPBasic │ │ │ │ ├── 1_preprocess_for_lowering.mlir │ │ │ │ └── input.mlir │ │ │ ├── MLPInference │ │ │ │ ├── 10b_ptx_codegen.mlir │ │ │ │ ├── 1_hlo_opt.mlir │ │ │ │ ├── 2_linalg_tensor_opt.mlir │ │ │ │ ├── 3_byre_tensor_opt.mlir │ │ │ │ ├── 4_bufferize_opt.mlir │ │ │ │ ├── 5_affine_opt.mlir │ │ │ │ ├── 5_alternative_scf_opt.mlir │ │ │ │ ├── 6_gpu_opt.mlir │ │ │ │ ├── 7_set_space_opt.mlir │ │ │ │ ├── 8_byre_opt.mlir │ │ │ │ ├── 9a_byre_host.mlir │ │ │ │ ├── 9b_nvvm_codegen.mlir │ │ │ │ ├── device_output.ptx │ │ │ │ ├── host_output.mlir │ │ │ │ ├── input.mlir │ │ │ 
│ └── template.py │ │ │ ├── NanoGPT │ │ │ │ ├── BW │ │ │ │ │ └── input.mlir │ │ │ │ └── FW │ │ │ │ │ └── input.mlir │ │ │ └── ResNet18 │ │ │ │ ├── BW │ │ │ │ ├── 10b_ptx_codegen.mlir │ │ │ │ ├── 1_hlo_opt.mlir │ │ │ │ ├── 2_linalg_tensor_opt.mlir │ │ │ │ ├── 3_byre_tensor_opt.mlir │ │ │ │ ├── 4_bufferize_opt.mlir │ │ │ │ ├── 5_affine_opt.mlir │ │ │ │ ├── 5_alternative_scf_opt.mlir │ │ │ │ ├── 6_gpu_opt.mlir │ │ │ │ ├── 7_set_space_opt.mlir │ │ │ │ ├── 8_byre_opt.mlir │ │ │ │ ├── 9a_byre_host.mlir │ │ │ │ ├── 9b_nvvm_codegen.mlir │ │ │ │ ├── device_output.ptx │ │ │ │ ├── host_output.mlir │ │ │ │ ├── input.mlir │ │ │ │ └── template.py │ │ │ │ ├── FW │ │ │ │ ├── 10b_ptx_codegen.mlir │ │ │ │ ├── 1_hlo_opt.mlir │ │ │ │ ├── 2_linalg_tensor_opt.mlir │ │ │ │ ├── 3_byre_tensor_opt.mlir │ │ │ │ ├── 4_bufferize_opt.mlir │ │ │ │ ├── 5_affine_opt.mlir │ │ │ │ ├── 5_alternative_scf_opt.mlir │ │ │ │ ├── 6_gpu_opt.mlir │ │ │ │ ├── 7_set_space_opt.mlir │ │ │ │ ├── 8_byre_opt.mlir │ │ │ │ ├── 9a_byre_host.mlir │ │ │ │ ├── 9b_nvvm_codegen.mlir │ │ │ │ ├── device_output.ptx │ │ │ │ ├── host_output.mlir │ │ │ │ ├── input.mlir │ │ │ │ └── template.py │ │ │ │ └── Whole │ │ │ │ ├── 10b_ptx_codegen.mlir │ │ │ │ ├── 1_hlo_opt.mlir │ │ │ │ ├── 2_linalg_tensor_opt.mlir │ │ │ │ ├── 3_byre_tensor_opt.mlir │ │ │ │ ├── 4_bufferize_opt.mlir │ │ │ │ ├── 5_affine_opt.mlir │ │ │ │ ├── 5_alternative_scf_opt.mlir │ │ │ │ ├── 6_gpu_opt.mlir │ │ │ │ ├── 7_set_space_opt.mlir │ │ │ │ ├── 8_byre_opt.mlir │ │ │ │ ├── 9a_byre_host.mlir │ │ │ │ ├── 9b_nvvm_codegen.mlir │ │ │ │ ├── device_output.ptx │ │ │ │ ├── host_output.mlir │ │ │ │ ├── input.mlir │ │ │ │ └── template.py │ │ └── Host │ │ │ ├── AliasLike │ │ │ ├── 00_Input.mlir │ │ │ ├── 01_HostOpt.mlir │ │ │ ├── 02a_ByreHost.mlir │ │ │ ├── 02b_ToLLVM.mlir │ │ │ ├── 03b_ToLLVMIR.mlir │ │ │ ├── Output.ll │ │ │ ├── Output.mlir │ │ │ ├── TotalPipeline.mlir │ │ │ └── template.py │ │ │ ├── Case0 │ │ │ ├── 00_Input.mlir │ │ │ ├── 01_HostOpt.mlir │ │ │ ├── 
02a_ByreHost.mlir │ │ │ ├── 02b_ToLLVM.mlir │ │ │ ├── 03b_ToLLVMIR.mlir │ │ │ ├── Output.ll │ │ │ ├── Output.mlir │ │ │ ├── TotalPipeline.mlir │ │ │ └── template.py │ │ │ ├── Case0_Bytecode │ │ │ ├── 00_Input.mlir │ │ │ ├── 01_HostOpt.mlir │ │ │ ├── 02a_ByreHost.mlir │ │ │ ├── 02b_ToLLVM.mlir │ │ │ ├── 03a_ByreSerial.mlir │ │ │ ├── 03b_ToLLVMBC.mlir │ │ │ ├── Output.bc │ │ │ ├── Output.mlirbc │ │ │ └── template.py │ │ │ ├── Case1 │ │ │ ├── 00_Input.mlir │ │ │ ├── 01_HostOpt.mlir │ │ │ ├── 02a_ByreHost.mlir │ │ │ ├── 02b_ToLLVM.mlir │ │ │ ├── 03b_ToLLVMIR.mlir │ │ │ ├── Output.ll │ │ │ ├── Output.mlir │ │ │ ├── TotalPipeline.mlir │ │ │ └── template.py │ │ │ ├── RngNormal │ │ │ ├── 00_Input.mlir │ │ │ ├── 01_HostOpt.mlir │ │ │ ├── 02a_ByreHost.mlir │ │ │ ├── 02b_ToLLVM.mlir │ │ │ ├── 03b_ToLLVMIR.mlir │ │ │ ├── Output.ll │ │ │ ├── Output.mlir │ │ │ ├── TotalPipeline.mlir │ │ │ └── template.py │ │ │ ├── RngUniform │ │ │ ├── 00_Input.mlir │ │ │ ├── 01_HostOpt.mlir │ │ │ ├── 02a_ByreHost.mlir │ │ │ ├── 02b_ToLLVM.mlir │ │ │ ├── 03b_ToLLVMIR.mlir │ │ │ ├── Output.ll │ │ │ ├── Output.mlir │ │ │ ├── TotalPipeline.mlir │ │ │ └── template.py │ │ │ ├── Transpose │ │ │ ├── 00_Input.mlir │ │ │ ├── 01_HostOpt.mlir │ │ │ ├── 02a_ByreHost.mlir │ │ │ ├── 02b_ToLLVM.mlir │ │ │ ├── 03b_ToLLVMIR.mlir │ │ │ ├── Output.ll │ │ │ ├── Output.mlir │ │ │ ├── TotalPipeline.mlir │ │ │ └── template.py │ │ │ └── TypeCvt │ │ │ ├── 00_Input.mlir │ │ │ ├── 01_HostOpt.mlir │ │ │ ├── 02a_ByreHost.mlir │ │ │ ├── 02b_ToLLVM.mlir │ │ │ ├── 03b_ToLLVMIR.mlir │ │ │ ├── Output.ll │ │ │ ├── Output.mlir │ │ │ ├── TotalPipeline.mlir │ │ │ └── template.py │ ├── Ops │ │ ├── conv.mlir │ │ └── dot.mlir │ ├── Pipelines │ │ ├── BufferizeOpts │ │ │ ├── linalg-ext.mlir │ │ │ └── tensor.mlir │ │ ├── HloOpts │ │ │ ├── mlp.mlir │ │ │ └── rng.mlir │ │ ├── Host │ │ │ ├── Codegen │ │ │ │ └── transpose.mlir │ │ │ └── ToLLVM │ │ │ │ ├── subview.mlir │ │ │ │ └── tanh.mlir │ │ ├── LinalgTensorOpt │ │ │ ├── 
elementwiseCodegen.mlir │ │ │ └── reductionCodegen.mlir │ │ └── ShapeOpts │ │ │ └── dynamicPartitionStitch.mlir │ ├── Stat │ │ ├── allocCnt.mlir │ │ ├── opCnt.mlir │ │ └── opTypes.mlir │ ├── Target │ │ ├── CUDA │ │ │ ├── all.mlir │ │ │ └── kernel.mlir │ │ ├── Cpp │ │ │ ├── attrs.mlir │ │ │ ├── binary.mlir │ │ │ ├── call.mlir │ │ │ ├── cast.mlir │ │ │ ├── common-cpp.mlir │ │ │ ├── const.mlir │ │ │ ├── control_flow.mlir │ │ │ ├── for.mlir │ │ │ ├── if.mlir │ │ │ ├── invalid.mlir │ │ │ ├── memref.mlir │ │ │ ├── opaque_types.mlir │ │ │ ├── stdops.mlir │ │ │ └── types.mlir │ │ └── PTX │ │ │ └── fusionFuncToPTX.mlir │ ├── Transforms │ │ ├── ApplyPDLPatterns │ │ │ ├── Case_0.mlir │ │ │ └── Pattern_0.mlir │ │ ├── CanonicalizeExt │ │ │ ├── basic.mlir │ │ │ ├── broadcast.mlir │ │ │ ├── concat.mlir │ │ │ ├── deprecated.mlir │ │ │ ├── elementwise.mlir │ │ │ ├── gather.mlir │ │ │ ├── reduce_like.mlir │ │ │ ├── slice_concat.mlir │ │ │ └── transpose.mlir │ │ ├── boundedShapeInference.mlir │ │ ├── cmae.mlir │ │ ├── collectFunc.mlir │ │ ├── funTag.mlir │ │ ├── genericDeviceConfig.mlir │ │ ├── gereicDeviceConfig_with_ByreOpt.mlir │ │ ├── graphCanonicalize.mlir │ │ ├── graphClusteringByDevice.mlir │ │ ├── graphClusteringByDeviceBottomUp.mlir │ │ ├── graphClusteringByDeviceGreedy.mlir │ │ ├── graphClusteringByDeviceTopDown.mlir │ │ ├── insertUniqueId.mlir │ │ ├── insertUniqueIdErase.mlir │ │ ├── loopTag.mlir │ │ ├── loopUnrollImperfect.mlir │ │ ├── loopUnrollUseAnchor.mlir │ │ ├── loopUnrollUseDepth.mlir │ │ ├── loopUnrollWithAnnotation.mlir │ │ ├── memoryPlanning.mlir │ │ ├── oneShotBufferize.mlir │ │ ├── oneShotBufferizeOutParams.mlir │ │ ├── removeFunTag.mlir │ │ ├── removeFuncBody.mlir │ │ ├── rewriteOpToStdCall.mlir │ │ ├── setAllSpace.mlir │ │ ├── setArgShape.mlir │ │ ├── setArgSpace.mlir │ │ ├── setArgSpaceAutoDeduce.mlir │ │ ├── setOpAndArgSpace.mlir │ │ ├── setOpSpace.mlir │ │ ├── shapeFuncOutlining.mlir │ │ ├── shapeReification.mlir │ │ ├── staticShapeInference.mlir │ │ └── 
testGraphClusteringByDeviceOpNum.mlir │ ├── Utils │ │ ├── testMergeTwoModulesCase0.mlir │ │ ├── testMergeTwoModulesCase0_1.mlir │ │ ├── testMergeTwoModulesCase1.mlir │ │ ├── testMergeTwoModulesCase1_1.mlir │ │ ├── testMergeTwoModulesCase2.mlir │ │ └── testMergeTwoModulesCase2_1.mlir │ ├── lib │ │ ├── Analysis │ │ │ ├── CMakeLists.txt │ │ │ ├── TestGraphClusteringByDeviceOpNum.cpp │ │ │ ├── TestPrintLiveness.cpp │ │ │ ├── TestPrintShapeAnalysis.cpp │ │ │ ├── TestPrintSideEffect.cpp │ │ │ ├── TestPrintSymbolicShape.cpp │ │ │ └── TestPrintUseRange.cpp │ │ ├── CMakeLists.txt │ │ ├── Interface │ │ │ ├── CMakeLists.txt │ │ │ └── TestByreOpInterface.cpp │ │ ├── Transformation │ │ │ ├── CMakeLists.txt │ │ │ ├── TestByreSerialRoundtrip.cpp │ │ │ ├── TestConvertFuncToCustomCall.cpp │ │ │ ├── TestConvertInsertion.cpp │ │ │ ├── TestDTypeConversion.cpp │ │ │ └── TestFuncArgRearrangement.cpp │ │ └── Utils │ │ │ ├── CMakeLists.txt │ │ │ ├── TestBroadcastDenseElementsAttr.cpp │ │ │ └── TestMergeTwoModules.cpp │ ├── lit.cfg.py │ └── lit.site.cfg.py.in └── tools │ ├── CMakeLists.txt │ ├── byteir-cpu-runner │ ├── CMakeLists.txt │ └── byteir-cpu-runner.cpp │ ├── byteir-opt │ ├── CMakeLists.txt │ └── byteir-opt.cpp │ ├── byteir-stat │ ├── CMakeLists.txt │ └── byteir-stat.cpp │ └── byteir-translate │ ├── CMakeLists.txt │ └── byteir-translate.cpp ├── docker └── Dockerfile ├── external ├── half │ ├── LICENSE.txt │ ├── README.txt │ └── include │ │ └── half │ │ └── half.hpp └── patches │ └── AITemplate │ ├── A10.patch │ ├── logging.patch │ └── num_builders.patch ├── external_libs └── runtime │ ├── CMakeLists.txt │ ├── README.md │ └── flash_attn │ ├── CMakeLists.txt │ ├── include │ └── flash_api.h │ └── lib │ ├── CMakeLists.txt │ ├── alibi.h │ ├── block_info.h │ ├── dropout.h │ ├── flash.h │ ├── flash_api.cu │ ├── flash_bwd_hdim128_fp16_sm80.cu │ ├── flash_bwd_hdim160_fp16_sm80.cu │ ├── flash_bwd_hdim192_fp16_sm80.cu │ ├── flash_bwd_hdim224_fp16_sm80.cu │ ├── flash_bwd_hdim256_fp16_sm80.cu │ 
├── flash_bwd_hdim32_fp16_sm80.cu │ ├── flash_bwd_hdim64_fp16_sm80.cu │ ├── flash_bwd_hdim96_fp16_sm80.cu │ ├── flash_bwd_kernel.h │ ├── flash_bwd_launch_template.h │ ├── flash_bwd_preprocess_kernel.h │ ├── flash_fwd_hdim128_fp16_sm80.cu │ ├── flash_fwd_hdim160_fp16_sm80.cu │ ├── flash_fwd_hdim192_fp16_sm80.cu │ ├── flash_fwd_hdim224_fp16_sm80.cu │ ├── flash_fwd_hdim256_fp16_sm80.cu │ ├── flash_fwd_hdim32_fp16_sm80.cu │ ├── flash_fwd_hdim64_fp16_sm80.cu │ ├── flash_fwd_hdim96_fp16_sm80.cu │ ├── flash_fwd_kernel.h │ ├── flash_fwd_launch_template.h │ ├── flash_fwd_split_hdim128_fp16_sm80.cu │ ├── flash_fwd_split_hdim160_fp16_sm80.cu │ ├── flash_fwd_split_hdim192_fp16_sm80.cu │ ├── flash_fwd_split_hdim224_fp16_sm80.cu │ ├── flash_fwd_split_hdim256_fp16_sm80.cu │ ├── flash_fwd_split_hdim32_fp16_sm80.cu │ ├── flash_fwd_split_hdim64_fp16_sm80.cu │ ├── flash_fwd_split_hdim96_fp16_sm80.cu │ ├── kernel_traits.h │ ├── mask.h │ ├── philox.cuh │ ├── rotary.h │ ├── softmax.h │ ├── static_switch.h │ └── utils.h ├── frontends ├── README.md ├── onnx-frontend │ ├── .gitignore │ ├── CMakeLists.txt │ ├── MLIR.cmake │ ├── README.md │ ├── onnx-frontend │ │ ├── CMakeLists.txt │ │ ├── src │ │ │ ├── CMakeLists.txt │ │ │ ├── Compiler │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── OFCompilerOptions.cpp │ │ │ │ ├── OFCompilerOptions.hpp │ │ │ │ ├── OFCompilerPipelines.cpp │ │ │ │ ├── OFCompilerPipelines.hpp │ │ │ │ ├── OFCompilerTypes.hpp │ │ │ │ ├── OFCompilerUtils.cpp │ │ │ │ └── OFCompilerUtils.hpp │ │ │ ├── Conversion │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── OFCanonicalizer.cpp │ │ │ │ ├── OFCanonicalizer.hpp │ │ │ │ ├── OFCheckNonLowered.cpp │ │ │ │ ├── OFCheckNonLowered.hpp │ │ │ │ ├── OFInsertNecessaryCast.cpp │ │ │ │ ├── OFInsertNecessaryCast.hpp │ │ │ │ ├── OFModifyEntryPoint.cpp │ │ │ │ ├── OFModifyEntryPoint.hpp │ │ │ │ ├── OFPasses.hpp │ │ │ │ ├── OFPasses.td │ │ │ │ ├── OFPassesDetail.hpp │ │ │ │ ├── OFRewriteCustomOnnxOps.cpp │ │ │ │ ├── OFRewriteCustomOnnxOps.hpp │ │ │ │ ├── 
OFRewriteCustomOnnxOps.td │ │ │ │ ├── OFRewriteToCustomCall.cpp │ │ │ │ ├── OFRewriteToCustomCall.hpp │ │ │ │ └── OFRewriteToCustomCall.td │ │ │ ├── Support │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── OFConstants.hpp │ │ │ │ ├── OFUtils.cpp │ │ │ │ └── OFUtils.hpp │ │ │ ├── onnx-frontend-opt.cpp │ │ │ └── onnx-frontend.cpp │ │ └── test │ │ │ ├── CMakeLists.txt │ │ │ ├── dynamic_shape_relu.onnx │ │ │ ├── lit.cfg.py │ │ │ ├── lit.site.cfg.py.in │ │ │ ├── of_canonicalizer.mlir │ │ │ ├── of_check_non_lowered.mlir │ │ │ ├── of_modify_entry_point.mlir │ │ │ ├── of_rewrite_custom_onnx_op.mlir │ │ │ ├── of_rewrite_to_custom_call.mlir │ │ │ ├── set_shape.mlir │ │ │ └── shape_inference.mlir │ ├── pytest.ini │ ├── requirements.txt │ ├── scripts │ │ ├── build_and_test.sh │ │ └── envsetup.sh │ ├── test │ │ ├── __init__.py │ │ ├── base.py │ │ ├── env.py │ │ ├── models │ │ │ ├── test_batch_size.py │ │ │ └── test_large_model.py │ │ └── ops │ │ │ ├── __init__.py │ │ │ ├── data │ │ │ ├── math │ │ │ │ ├── clip.onnx │ │ │ │ ├── gelu.onnx │ │ │ │ └── softmax.onnx │ │ │ ├── nn │ │ │ │ └── batch_normalization.onnx │ │ │ ├── quantize │ │ │ │ └── quantize_dequantize.onnx │ │ │ └── tensor │ │ │ │ ├── arg_max.onnx │ │ │ │ ├── arg_min.onnx │ │ │ │ ├── concat.onnx │ │ │ │ ├── concat_dynamic_shape.onnx │ │ │ │ └── resize_nearest_v10.onnx │ │ │ ├── test_math.py │ │ │ ├── test_nn.py │ │ │ ├── test_quantize.py │ │ │ ├── test_rnn.py │ │ │ ├── test_tensor.py │ │ │ └── utils.py │ └── third_party │ │ └── patches │ │ ├── OnnxMlirConcat.patch │ │ ├── OnnxMlirConvTranspose.patch │ │ ├── OnnxMlirDialectBuilder.patch │ │ ├── OnnxMlirDialectRewrite.patch │ │ ├── OnnxMlirElementwise.patch │ │ ├── OnnxMlirKeepCustomOpType.patch │ │ ├── OnnxMlirONNXToStablehloCommon.patch │ │ ├── OnnxMlirONNXToStablehloGather.patch │ │ ├── OnnxMlirOnnxOpsTensorPad.patch │ │ ├── OnnxMlirPooling.patch │ │ ├── OnnxMlirReductionUpgrade.patch │ │ ├── OnnxMlirRegisterLibrary.patch │ │ ├── OnnxMlirReshape.patch │ │ ├── 
OnnxMlirScatterElements.patch │ │ ├── OnnxMlirTestElementwise.patch │ │ ├── OnnxMlirTestPooling.patch │ │ ├── OnnxMlirWillNotPush.patch │ │ ├── OnnxMlirWillPushToUpstream.patch │ │ ├── OnnxOfficialExternalData.patch │ │ └── OnnxOfficialResize.patch ├── tf-frontend │ ├── .bazelrc │ ├── .bazelversion │ ├── .gitignore │ ├── .tf_configure.bazelrc │ ├── BUILD │ ├── README.md │ ├── WORKSPACE │ ├── byteir │ │ ├── BUILD │ │ ├── ace.BUILD │ │ └── workspace.bzl │ ├── docs │ │ ├── attributes.md │ │ └── developer_guild.md │ ├── example │ │ ├── resnet.py │ │ └── resnet50_model.py │ ├── external │ │ └── patches │ │ │ └── tensorflow │ │ │ ├── fix-bug-of-create-f16-const-for-HoistCwiseBinaryOutO.patch │ │ │ ├── for_gcc_8_5.patch │ │ │ ├── grappler.patch │ │ │ ├── mhlo_ops.patch │ │ │ ├── support-tf-shape-inference.patch │ │ │ ├── tf.Select_to_mhlo.select.patch │ │ │ ├── tf_build.patch │ │ │ ├── tf_dilated_conv.patch │ │ │ ├── tf_mkl.patch │ │ │ ├── tf_slice.patch │ │ │ └── topk.patch │ ├── scripts │ │ ├── apply_patches.sh │ │ ├── build_and_test.sh │ │ └── prepare.sh │ ├── tf_mlir_ext │ │ ├── numerical │ │ │ ├── BUILD │ │ │ ├── dilated_conv2d.mlir │ │ │ ├── fallback_to_custom_call.mlir │ │ │ ├── fuse_tf_ops.mlir │ │ │ ├── glob_lit_test.bzl │ │ │ ├── numerical_test.py │ │ │ ├── process_dynamic_stitch_as_static.mlir │ │ │ ├── reshape_movedown_string.mlir │ │ │ ├── rewrite_to_custom_call.mlir │ │ │ ├── runlit.cfg.py │ │ │ ├── runlit.site.cfg.py │ │ │ └── where.mlir │ │ ├── pipelines │ │ │ ├── BUILD │ │ │ ├── customized_tf_to_mhlo.cc │ │ │ ├── customized_tf_to_mhlo.h │ │ │ ├── passes.h │ │ │ ├── passes.td │ │ │ └── passes_detail.h │ │ ├── tests │ │ │ ├── BUILD │ │ │ ├── ace_ops.mlir │ │ │ ├── convert_repeat_to_tile.mlir │ │ │ ├── dilated_conv2d.mlir │ │ │ ├── fallback_to_custom_call.mlir │ │ │ ├── fuse_tf_ops.mlir │ │ │ ├── glob_lit_test.bzl │ │ │ ├── inline_func_call_in_scf_if.mlir │ │ │ ├── mhlo_legalize_tf_ext.mlir │ │ │ ├── process_dynamic_stitch_as_static.mlir │ │ │ ├── 
reshape_movedown_string.mlir │ │ │ ├── rewrite_func_attr_to_byteir.mlir │ │ │ ├── rewrite_to_custom_call.mlir │ │ │ ├── rewrite_to_custom_call_keep_body.mlir │ │ │ ├── rewrite_to_if.mlir │ │ │ ├── runlit.cfg.py │ │ │ ├── runlit.site.cfg.py │ │ │ ├── set_repeat_out_batch_size.mlir │ │ │ └── where.mlir │ │ ├── transforms │ │ │ ├── BUILD │ │ │ ├── constant_folding.cc │ │ │ ├── constant_folding.h │ │ │ ├── convert_repeat_to_tile.cc │ │ │ ├── convert_repeat_to_tile.h │ │ │ ├── fuse_tf_ops.cc │ │ │ ├── fuse_tf_ops.h │ │ │ ├── fuse_tf_ops.td │ │ │ ├── inline_func_call_in_scf_if.cc │ │ │ ├── inline_func_call_in_scf_if.h │ │ │ ├── mhlo_legalize_tf_ext.cc │ │ │ ├── mhlo_legalize_tf_ext.h │ │ │ ├── passes.h │ │ │ ├── passes.td │ │ │ ├── passes_detail.h │ │ │ ├── process_dynamic_stitch_as_static.cc │ │ │ ├── process_dynamic_stitch_as_static.h │ │ │ ├── remove_control_flow.cc │ │ │ ├── remove_control_flow.h │ │ │ ├── reshape_movedown_string.cc │ │ │ ├── reshape_movedown_string.h │ │ │ ├── rewrite_func_attr_to_byteir.cc │ │ │ ├── rewrite_func_attr_to_byteir.h │ │ │ ├── rewrite_to_custom_call.cc │ │ │ ├── rewrite_to_custom_call.h │ │ │ ├── rewrite_to_custom_call.td │ │ │ ├── rewrite_to_if.cc │ │ │ ├── rewrite_to_if.h │ │ │ ├── set_repeat_out_batch_size.cc │ │ │ ├── set_repeat_out_batch_size.h │ │ │ ├── tf_fallback_to_custom_call.cc │ │ │ ├── tf_fallback_to_custom_call.h │ │ │ ├── tf_switch_merge_to_if.cc │ │ │ └── tf_switch_merge_to_if.h │ │ └── utils │ │ │ ├── BUILD │ │ │ ├── customcall.cc │ │ │ ├── customcall.h │ │ │ ├── dce.cc │ │ │ ├── dce.h │ │ │ ├── utils.cc │ │ │ └── utils.h │ ├── tools │ │ ├── BUILD │ │ ├── tf_ext_opt_main.cc │ │ └── tf_frontend_main.cc │ └── utils │ │ ├── BUILD │ │ ├── attributes.h │ │ ├── graphdef_opt.cc │ │ ├── graphdef_opt.h │ │ ├── misc.cc │ │ └── misc.h └── torch-frontend │ ├── .gitignore │ ├── CMakeLists.txt │ ├── MLIR.cmake │ ├── README.md │ ├── TorchMLIR.cmake │ ├── build-requirements.txt │ ├── doc │ └── torch_2_0_training.md │ ├── examples │ ├── 
demo │ │ ├── README.md │ │ ├── backend.py │ │ ├── byteir_fusible_pattern.py │ │ ├── compile_utils.py │ │ ├── config.py │ │ ├── context.py │ │ ├── fx_match_utils.py │ │ ├── main.py │ │ ├── models │ │ │ ├── configuration_chatglm.py │ │ │ ├── modeling_chatglm.py │ │ │ └── modeling_nanogpt.py │ │ ├── partitioners.py │ │ └── requirements.txt │ ├── inference │ │ ├── brt_backend.py │ │ ├── infer_resnet.py │ │ ├── infer_tinybert.py │ │ ├── mixtral │ │ │ ├── infer_single_mixtral.py │ │ │ └── requirements.txt │ │ └── mlp.py │ └── training │ │ ├── byteir_backend.py │ │ ├── mlp.py │ │ ├── train_resnet.py │ │ └── train_tinybert.py │ ├── scripts │ ├── build.sh │ ├── build_and_test.sh │ └── envsetup.sh │ ├── test-requirements.txt │ ├── third_party │ ├── llvm_patches │ │ └── ir_printing.patch │ └── patches │ │ ├── backend_contract.patch │ │ ├── build.patch │ │ ├── communication_op.patch │ │ ├── custom_op.patch │ │ ├── fx_importer.patch │ │ ├── generated_torch_ops_td.patch │ │ ├── pipeline.patch │ │ ├── reduce_op_variants.patch │ │ └── tuple.patch │ ├── torch-cpu-requirements.txt │ ├── torch-cuda-requirements.txt │ └── torch-frontend │ ├── CMakeLists.txt │ ├── include │ ├── torch-frontend-c │ │ └── Passes.h │ └── torch-frontend │ │ ├── CMakeLists.txt │ │ ├── Conversion │ │ ├── CMakeLists.txt │ │ ├── ConvertTorchToCcl.h │ │ ├── ConvertTorchToCustomCall.h │ │ ├── ConvertTorchToStablehloExt.h │ │ ├── Passes.h │ │ └── Passes.td │ │ ├── Dialect │ │ └── Torch │ │ │ └── Transforms │ │ │ ├── CMakeLists.txt │ │ │ ├── DecomposeOnTorch.h │ │ │ ├── FuseOpOnTorch.h │ │ │ ├── Passes.h │ │ │ └── Passes.td │ │ ├── Pipelines │ │ └── Pipelines.h │ │ ├── Transforms │ │ ├── CMakeLists.txt │ │ ├── CanonicalizeExt.h │ │ ├── EliminateUselessOp.h │ │ ├── Passes.h │ │ ├── Passes.td │ │ ├── RewriteCustomOp.h │ │ ├── RewriteEntryFuncName.h │ │ └── UnpackPublicFunctionReturn.h │ │ └── Utils │ │ ├── ConvertOpFolder.h │ │ └── CustomCallUtil.h │ ├── lib │ ├── CAPI │ │ ├── CMakeLists.txt │ │ └── Passes.cpp │ ├── 
CMakeLists.txt │ ├── Conversion │ │ ├── CMakeLists.txt │ │ ├── ConvertTorchToCcl.cpp │ │ ├── ConvertTorchToCustomCall.cpp │ │ ├── ConvertTorchToStablehloExt.cpp │ │ └── PassDetail.h │ ├── CustomOp │ │ ├── CMakeLists.txt │ │ ├── dynamic_mask_stitch.cpp │ │ ├── dynamic_partition.cpp │ │ └── dynamic_stitch.cpp │ ├── Dialect │ │ └── Torch │ │ │ └── Transforms │ │ │ ├── CMakeLists.txt │ │ │ ├── DecomposeOnTorch.cpp │ │ │ ├── FuseOpOnTorch.cpp │ │ │ ├── FuseOpOnTorchPattern.td │ │ │ └── PassDetail.h │ ├── Pipelines │ │ ├── CMakeLists.txt │ │ └── Pipelines.cpp │ ├── Transforms │ │ ├── CMakeLists.txt │ │ ├── CanonicalizeExt.cpp │ │ ├── EliminateUselessOp.cpp │ │ ├── PassDetail.h │ │ ├── RewriteCustomOp.cpp │ │ ├── RewriteEntryFuncName.cpp │ │ └── UnpackPublicFunctionReturn.cpp │ └── Utils │ │ ├── CMakeLists.txt │ │ └── ConvertOpFolder.cpp │ ├── python │ ├── CMakeLists.txt │ ├── TorchFrontendModule.cpp │ ├── setup.py │ ├── test │ │ ├── pytest.ini │ │ ├── test_attn_rewrite.py │ │ ├── test_fx_utils.py │ │ ├── test_fximporter │ │ │ ├── test_ccl.py │ │ │ ├── test_custom_ops.py │ │ │ ├── test_ops_fximporter.py │ │ │ └── utils.py │ │ ├── test_math_custom_ops.py │ │ ├── test_stablehlo_bytecode.py │ │ ├── test_torchscript │ │ │ ├── test_byteir_customcall_ops.py │ │ │ ├── test_compile_option.py │ │ │ ├── test_model.py │ │ │ ├── test_ops.py │ │ │ └── test_torch_custom_ops.py │ │ └── test_utils │ │ │ └── test_jit_transforms.py │ ├── torch_frontend │ │ ├── __init__.py │ │ ├── _mlir_libs │ │ │ └── _site_initialize_0.py │ │ ├── byteir_backend │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── byteir_fusible_pattern.py │ │ │ ├── compilation_cache.py │ │ │ ├── compiled_function.py │ │ │ ├── compiler.py │ │ │ ├── config.py │ │ │ ├── debug.py │ │ │ ├── fx_match_utils.py │ │ │ ├── fx_utils.py │ │ │ ├── inner_compile.py │ │ │ ├── partitioners.py │ │ │ └── utils.py │ │ ├── compile.py │ │ ├── extra_shape_fn.py │ │ ├── flash_attn_op.py │ │ ├── fx_rewrite.py │ │ ├── fx_tracer.py │ │ ├── 
fx_utils.py │ │ ├── tools │ │ │ ├── compiler.py │ │ │ ├── extra_fn.mlir │ │ │ └── gen_extra_library.py │ │ ├── ts_utils.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ └── jit_transforms.py │ └── version.txt │ ├── test │ ├── CMakeLists.txt │ ├── Conversion │ │ ├── ConvertTorchToCcl.mlir │ │ ├── ConvertTorchToCustomCall.mlir │ │ └── ConvertTorchToStablehloExt.mlir │ ├── Dialect │ │ └── Torch │ │ │ ├── DecomposeOnTorch.mlir │ │ │ └── FuseOpOnTorch.mlir │ ├── Pipelines │ │ └── TorchFunctionToTorchPipeline.mlir │ ├── Transforms │ │ ├── EliminateUselessOp.mlir │ │ ├── RewriteEntryFuncName.mlir │ │ └── UnpackPublicFunctionReturn.mlir │ ├── lit.cfg.py │ └── lit.site.cfg.py.in │ └── tools │ ├── CMakeLists.txt │ └── torch-frontend-opt.cpp ├── runtime ├── .gitignore ├── README.md ├── VERSION_NUMBER ├── cmake │ ├── CMakeLists.txt │ ├── Modules │ │ └── FindNCCL.cmake │ ├── brt_common.cmake │ ├── brt_config.h.in │ ├── brt_device_cpu.cmake │ ├── brt_device_cuda.cmake │ ├── brt_device_nccl.cmake │ ├── brt_framework.cmake │ ├── brt_ir.cmake │ ├── brt_provider_cpu.cmake │ ├── brt_provider_cuda.cmake │ ├── brt_provider_nccl.cmake │ ├── brt_python_bindings.cmake │ ├── brt_shared.cmake │ └── brt_unittests.cmake ├── examples │ └── external_project │ │ ├── CMakeLists.txt │ │ └── main.cpp ├── include │ └── brt │ │ ├── backends │ │ ├── README.md │ │ ├── common.h │ │ ├── cpu │ │ │ ├── device │ │ │ │ ├── cpu_device_api.h │ │ │ │ ├── cpu_work_queue.h │ │ │ │ └── llvm │ │ │ │ │ └── jit.h │ │ │ └── providers │ │ │ │ └── default │ │ │ │ └── cpu_provider.h │ │ ├── cuda │ │ │ ├── device │ │ │ │ ├── common │ │ │ │ │ ├── cuda_call.h │ │ │ │ │ ├── dtype.h │ │ │ │ │ ├── fast_divmod.h │ │ │ │ │ └── util.h │ │ │ │ ├── compile │ │ │ │ │ ├── nvrtc.h │ │ │ │ │ └── ptx.h │ │ │ │ ├── cuda_allocator.h │ │ │ │ ├── cuda_device_api.h │ │ │ │ ├── cuda_env.h │ │ │ │ ├── cuda_work_queue.h │ │ │ │ └── utils │ │ │ │ │ └── op_kernel_impl_helpers.h │ │ │ └── providers │ │ │ │ └── default │ │ │ │ ├── ait │ │ │ │ ├── 
model_interface.h │ │ │ │ └── op_registration.h │ │ │ │ ├── codegen │ │ │ │ └── op_registration.h │ │ │ │ ├── copy │ │ │ │ └── op_registration.h │ │ │ │ ├── cuda_provider.h │ │ │ │ ├── cudnn_helper.h │ │ │ │ ├── custom │ │ │ │ └── op_registration.h │ │ │ │ ├── indexing │ │ │ │ └── op_registration.h │ │ │ │ ├── math │ │ │ │ ├── helper.h │ │ │ │ └── op_registration.h │ │ │ │ ├── normalization │ │ │ │ └── op_registration.h │ │ │ │ ├── reduction │ │ │ │ └── op_registration.h │ │ │ │ ├── tensor_generate │ │ │ │ └── op_registration.h │ │ │ │ └── tensor_manipulate │ │ │ │ └── op_registration.h │ │ ├── nccl │ │ │ ├── device │ │ │ │ ├── d_context_nccl.h │ │ │ │ ├── distributed_backend_nccl.h │ │ │ │ └── utils.h │ │ │ └── providers │ │ │ │ ├── nccl_provider.h │ │ │ │ └── op_registration.h │ │ └── rng_state_context.h │ │ └── core │ │ ├── common │ │ ├── code_location.h │ │ ├── common.h │ │ ├── enums.h │ │ ├── exceptions.h │ │ ├── logging │ │ │ ├── capture.h │ │ │ ├── isink.h │ │ │ ├── logging.h │ │ │ ├── macros.h │ │ │ ├── severity.h │ │ │ └── sinks │ │ │ │ ├── cerr_sink.h │ │ │ │ ├── clog_sink.h │ │ │ │ ├── composite_sink.h │ │ │ │ ├── file_sink.h │ │ │ │ └── ostream_sink.h │ │ ├── make_string.h │ │ ├── status.h │ │ ├── string_view.h │ │ └── utils │ │ │ └── math_helper.h │ │ ├── context │ │ ├── execution_context.h │ │ ├── execution_frame.h │ │ └── work_queue.h │ │ ├── distributed │ │ ├── d_context.h │ │ ├── distributed_backend.h │ │ ├── distributed_session.h │ │ └── rendezvous_socket.h │ │ ├── framework │ │ ├── allocator.h │ │ ├── arena.h │ │ ├── bfc_arena.h │ │ ├── brt_mutex.h │ │ ├── device_api.h │ │ ├── dtype.h │ │ ├── event.h │ │ ├── execution_plan.h │ │ ├── execution_provider.h │ │ ├── kernel_registry.h │ │ ├── memory_info.h │ │ ├── op_accessor.h │ │ ├── op_kernel.h │ │ ├── op_kernel_impl_base.h │ │ ├── op_kernel_info.h │ │ └── value.h │ │ ├── ir │ │ ├── builder.h │ │ ├── engine_util.h │ │ ├── graph_info.h │ │ ├── ir.h │ │ ├── op_helper.h │ │ └── util.h │ │ └── session │ 
│ ├── request_context.h │ │ └── session.h ├── lib │ ├── backends │ │ ├── cpu │ │ │ ├── device │ │ │ │ ├── cpu_device_api.cc │ │ │ │ ├── cpu_work_queue.cc │ │ │ │ └── llvm │ │ │ │ │ └── jit.cc │ │ │ └── providers │ │ │ │ └── default │ │ │ │ ├── copy │ │ │ │ ├── copy.cc │ │ │ │ └── copy.h │ │ │ │ ├── cpu_provider.cc │ │ │ │ ├── custom_call │ │ │ │ ├── non_zero.cc │ │ │ │ ├── non_zero.h │ │ │ │ ├── repeat.cc │ │ │ │ ├── repeat.h │ │ │ │ ├── tf_equal.cc │ │ │ │ ├── tf_equal.h │ │ │ │ ├── tf_select.cc │ │ │ │ ├── tf_select.h │ │ │ │ ├── tf_string_to_number.cc │ │ │ │ ├── tf_string_to_number.h │ │ │ │ ├── topk.cc │ │ │ │ └── topk.h │ │ │ │ ├── llvm │ │ │ │ ├── jit.cc │ │ │ │ └── jit.h │ │ │ │ ├── math │ │ │ │ ├── elementwise_ops.cc │ │ │ │ └── elementwise_ops.h │ │ │ │ ├── shape │ │ │ │ ├── shape_compute.cc │ │ │ │ └── shape_compute.h │ │ │ │ ├── tensor_generate │ │ │ │ ├── fill.cc │ │ │ │ ├── fill.h │ │ │ │ ├── rng_state.cc │ │ │ │ └── rng_state.h │ │ │ │ └── typecvt │ │ │ │ └── typecvt.h │ │ ├── cuda │ │ │ ├── device │ │ │ │ ├── common │ │ │ │ │ ├── cuda_call.cc │ │ │ │ │ └── util.cc │ │ │ │ ├── compile │ │ │ │ │ ├── nvrtc.cc │ │ │ │ │ └── ptx.cc │ │ │ │ ├── cuda_allocator.cc │ │ │ │ ├── cuda_device_api.cc │ │ │ │ ├── cuda_env.cc │ │ │ │ └── cuda_work_queue.cc │ │ │ └── providers │ │ │ │ └── default │ │ │ │ ├── ait │ │ │ │ ├── ait.cc │ │ │ │ ├── ait.h │ │ │ │ └── op_registration.cc │ │ │ │ ├── codegen │ │ │ │ ├── op_registration.cc │ │ │ │ ├── ptx.cc │ │ │ │ └── ptx.h │ │ │ │ ├── copy │ │ │ │ ├── copy.cc │ │ │ │ ├── copy.h │ │ │ │ └── op_registration.cc │ │ │ │ ├── cuda_provider.cc │ │ │ │ ├── custom │ │ │ │ ├── custom.cc │ │ │ │ ├── custom.h │ │ │ │ └── op_registration.cc │ │ │ │ ├── indexing │ │ │ │ ├── index_put.h │ │ │ │ ├── index_select.h │ │ │ │ ├── kernels │ │ │ │ │ ├── index_put.cu │ │ │ │ │ ├── index_put.h │ │ │ │ │ ├── index_select.cu │ │ │ │ │ └── index_select.h │ │ │ │ └── op_registration.cc │ │ │ │ ├── math │ │ │ │ ├── batch_matmul.cc │ │ │ │ ├── 
batch_matmul.h │ │ │ │ ├── conv.cc │ │ │ │ ├── conv.h │ │ │ │ ├── conv_backward.cc │ │ │ │ ├── conv_backward.h │ │ │ │ ├── elementwise_ops.cc │ │ │ │ ├── elementwise_ops.h │ │ │ │ ├── helper.cc │ │ │ │ ├── kernels │ │ │ │ │ ├── cutlass_blas.cu │ │ │ │ │ ├── cutlass_blas.h │ │ │ │ │ ├── elementwise.cu │ │ │ │ │ └── elementwise.h │ │ │ │ ├── matmul.cc │ │ │ │ ├── matmul.h │ │ │ │ ├── op_registration.cc │ │ │ │ ├── pool.cc │ │ │ │ ├── pool.h │ │ │ │ ├── pool_grad.cc │ │ │ │ └── pool_grad.h │ │ │ │ ├── normalization │ │ │ │ ├── batch_norm_grad.cc │ │ │ │ ├── batch_norm_grad.h │ │ │ │ ├── batch_norm_training.cc │ │ │ │ ├── batch_norm_training.h │ │ │ │ └── op_registration.cc │ │ │ │ ├── reduction │ │ │ │ ├── kernels │ │ │ │ │ ├── reduction.cu │ │ │ │ │ ├── reduction.h │ │ │ │ │ └── reduction_helper.h │ │ │ │ ├── op_registration.cc │ │ │ │ └── reduce_impl.h │ │ │ │ ├── tensor_generate │ │ │ │ ├── fill.cc │ │ │ │ ├── fill.h │ │ │ │ ├── kernels │ │ │ │ │ ├── fill.cu │ │ │ │ │ ├── fill.h │ │ │ │ │ ├── rng.cu │ │ │ │ │ └── rng.h │ │ │ │ ├── op_registration.cc │ │ │ │ ├── rng.h │ │ │ │ ├── rng_state.cc │ │ │ │ └── rng_state.h │ │ │ │ └── tensor_manipulate │ │ │ │ ├── kernels │ │ │ │ ├── transpose.cu │ │ │ │ └── transpose.h │ │ │ │ ├── op_registration.cc │ │ │ │ ├── transpose.cc │ │ │ │ └── transpose.h │ │ └── nccl │ │ │ ├── device │ │ │ ├── distributed_backend_nccl.cc │ │ │ └── utils.cc │ │ │ └── providers │ │ │ ├── all_gather.cc │ │ │ ├── all_gather.h │ │ │ ├── all_reduce.cc │ │ │ ├── all_reduce.h │ │ │ ├── broadcast.cc │ │ │ ├── broadcast.h │ │ │ ├── nccl_provider.cc │ │ │ ├── op_registration.cc │ │ │ ├── recv.cc │ │ │ ├── recv.h │ │ │ ├── send.cc │ │ │ └── send.h │ └── core │ │ ├── common │ │ ├── common.cc │ │ ├── logging │ │ │ ├── capture.cc │ │ │ ├── logging.cc │ │ │ └── sinks │ │ │ │ └── ostream_sink.cc │ │ ├── status.cc │ │ └── utils │ │ │ └── math_helper.cc │ │ ├── context │ │ └── execution_frame.cc │ │ ├── distributed │ │ ├── distributed_backend.cc │ │ ├── 
distributed_session.cc │ │ └── rendezvous_socket.cc │ │ ├── framework │ │ ├── allocator.cc │ │ ├── bfc_arena.cc │ │ ├── device_api.cc │ │ ├── execution_plan.cc │ │ ├── execution_provider.cc │ │ ├── kernel_registry.cc │ │ ├── op_accessor.cc │ │ └── op_kernel_info.cc │ │ ├── ir │ │ ├── builder.cc │ │ ├── ir.cc │ │ ├── op_helper.cc │ │ └── util.cc │ │ └── session │ │ ├── request_context.cc │ │ └── session.cc ├── python │ ├── README.md │ ├── brt │ │ ├── __init__.py │ │ ├── backend.py │ │ └── utils.py │ ├── examples │ │ ├── add2.mlir │ │ ├── add2.py │ │ ├── ait_op.py │ │ ├── arg_alias.mlir │ │ ├── arg_alias.py │ │ ├── distribute_mlp.py │ │ └── llm.py │ ├── setup.py │ └── src │ │ └── module.cc ├── test │ ├── backends │ │ ├── cpu │ │ │ ├── device │ │ │ │ └── llvm_jit_test.cc │ │ │ └── providers │ │ │ │ └── default │ │ │ │ ├── e2e │ │ │ │ └── e2e_test.cc │ │ │ │ ├── kernel │ │ │ │ ├── copy_test.cc │ │ │ │ ├── non_zero_test.cc │ │ │ │ ├── repeat_test.cc │ │ │ │ ├── rng_state_test.cc │ │ │ │ ├── string_equal_test.cc │ │ │ │ ├── tf_select_test.cc │ │ │ │ ├── tf_string_to_number_test.cc │ │ │ │ ├── topk_test.cc │ │ │ │ └── typecvt_test.cc │ │ │ │ └── request_context_test.cc │ │ ├── cuda │ │ │ ├── device │ │ │ │ ├── allocator_test.cc │ │ │ │ ├── cuda_work_queue_test.cc │ │ │ │ ├── nvrtc_test.cc │ │ │ │ ├── ptx_test.cc │ │ │ │ ├── test_kernels.cu │ │ │ │ └── test_kernels.h │ │ │ └── providers │ │ │ │ └── default │ │ │ │ ├── e2e │ │ │ │ └── resnet_test.cc │ │ │ │ ├── kernel │ │ │ │ ├── ait_test.cc │ │ │ │ ├── alias_test.cc │ │ │ │ ├── batch_matmul_test.cc │ │ │ │ ├── batch_norm_grad_test.cc │ │ │ │ ├── batch_norm_training_test.cc │ │ │ │ ├── codegen_test.cc │ │ │ │ ├── conv_backward_data_test.cc │ │ │ │ ├── conv_backward_filter_test.cc │ │ │ │ ├── conv_test.cc │ │ │ │ ├── copy_test.cc │ │ │ │ ├── elementwise_test.cc │ │ │ │ ├── fill_test.cc │ │ │ │ ├── flash_attn_bwd_test.cc │ │ │ │ ├── flash_attn_fwd_test.cc │ │ │ │ ├── index_test.cc │ │ │ │ ├── matmul_test.cc │ │ │ │ ├── 
multi_stream_test.cc │ │ │ │ ├── pool_grad_test.cc │ │ │ │ ├── pool_test.cc │ │ │ │ ├── reduction_test.cc │ │ │ │ ├── rng_state_test.cc │ │ │ │ ├── rng_test.cc │ │ │ │ └── transpose_test.cc │ │ │ │ ├── request_context_test.cc │ │ │ │ └── session_test.cc │ │ └── nccl │ │ │ ├── device │ │ │ ├── test_distributed_backend.cc │ │ │ └── test_utils.cc │ │ │ └── providers │ │ │ └── test_distributed_session.cc │ ├── common │ │ ├── env.cc │ │ ├── models.cc │ │ └── util.cc │ ├── context │ │ └── exec_frame_test.cc │ ├── distributed │ │ └── test_rendezvous_socket.cc │ ├── exported.ld │ ├── external_kernels │ │ ├── cpu │ │ │ └── kernels.cc │ │ └── cuda │ │ │ ├── kernels.cc │ │ │ ├── kernels.cu │ │ │ └── kernels.h │ ├── framework │ │ ├── allocator_test.cc │ │ └── misc.cc │ ├── include │ │ └── brt │ │ │ └── test │ │ │ └── common │ │ │ ├── config.h │ │ │ ├── cuda │ │ │ └── util.h │ │ │ ├── env.h │ │ │ ├── models.h │ │ │ ├── nccl │ │ │ ├── test_base.h │ │ │ └── test_utils.h │ │ │ └── util.h │ ├── ir │ │ ├── builder_test.cc │ │ └── ir_test.cc │ ├── session │ │ └── session_test.cc │ ├── test_files │ │ ├── AITOp │ │ │ ├── bmm_permute_a100.so │ │ │ ├── bmm_permute_entry.mlir │ │ │ ├── permute_a100.so │ │ │ └── permute_entry.mlir │ │ ├── Distributed │ │ │ ├── add_send.mlir │ │ │ ├── all_gather.mlir │ │ │ ├── all_reduce.mlir │ │ │ ├── broadcast.mlir │ │ │ ├── broadcast2.mlir │ │ │ ├── ccl.mlir │ │ │ ├── ccl.ptx │ │ │ ├── recv.mlir │ │ │ ├── recv_add.mlir │ │ │ └── send.mlir │ │ ├── DynamicShapes │ │ │ └── Add2 │ │ │ │ ├── entry.mlir │ │ │ │ └── shape_fn.ll │ │ ├── LLJIT │ │ │ ├── Case0 │ │ │ │ ├── entry.mlir │ │ │ │ └── host_kernels.ll │ │ │ ├── Case0_v1_0_0 │ │ │ │ ├── entry.mlirbc │ │ │ │ └── host_kernels.bc │ │ │ ├── add.ll │ │ │ ├── tanh.ll │ │ │ ├── transpose_32_64_64.ll │ │ │ ├── transpose_3_224_224.ll │ │ │ └── typecvt.ll │ │ ├── add2_cpu.mlir │ │ ├── add_splat_const_one_cuda.mlir │ │ ├── cuda_add.cu │ │ ├── custom_add_cpu2cuda.mlir │ │ ├── fill_cuda.mlir │ │ ├── flash_attn_bwd.mlir 
│ │ ├── flash_attn_bwd_outputs_dk.data │ │ ├── flash_attn_bwd_outputs_dq.data │ │ ├── flash_attn_bwd_outputs_dv.data │ │ ├── flash_attn_fwd.mlir │ │ ├── flash_attn_fwd_outputs.data │ │ ├── flash_attn_inputs_dout.data │ │ ├── flash_attn_inputs_k.data │ │ ├── flash_attn_inputs_q.data │ │ ├── flash_attn_inputs_v.data │ │ ├── flash_attn_kvcache.mlir │ │ ├── flash_attn_kvcache_inputs_cache_seqlens.data │ │ ├── flash_attn_kvcache_inputs_k.data │ │ ├── flash_attn_kvcache_inputs_kcache.data │ │ ├── flash_attn_kvcache_inputs_q.data │ │ ├── flash_attn_kvcache_inputs_v.data │ │ ├── flash_attn_kvcache_inputs_vcache.data │ │ ├── flash_attn_kvcache_outputs.data │ │ ├── flash_attn_kvcache_outputs_kcache.data │ │ ├── flash_attn_kvcache_outputs_vcache.data │ │ ├── generate_flash_attn_ground_truth.py │ │ ├── group_allocation_hook_cpu_group.mlir │ │ ├── llvm_ptx_add.ptx │ │ ├── llvm_ptx_add_bare_ptr.ptx │ │ ├── nvcc_ptx_add.ptx │ │ ├── resnet18_bw_device.ptx │ │ ├── resnet18_bw_host_cuda.mlir │ │ ├── resnet18_fw_bw_device.ptx │ │ ├── resnet18_fw_bw_host_cuda.mlir │ │ ├── resnet18_fw_device.ptx │ │ ├── resnet18_fw_host_cuda.mlir │ │ ├── rng_cuda.mlir │ │ ├── rng_state_cpu.mlir │ │ ├── rng_state_cuda.mlir │ │ ├── string_equal.mlir │ │ └── string_equal_scalar.mlir │ └── unittest_main │ │ └── test_main.cc └── version.ld ├── scripts ├── apply_patches.sh ├── clang_format_check.sh ├── compiler │ └── build_and_test.sh ├── format_check.py ├── prepare.sh └── runtime │ ├── build_and_test.sh │ └── build_external_project.sh ├── talks ├── ChinaSoftCon-ByteIR.pdf └── c4ml23_poster.pdf └── tests ├── build_and_test_e2e.sh ├── compatibility_test ├── execute.py ├── main.py └── reporting.py └── numerical_test ├── execute.py ├── gen_brt_tests.py ├── main.py ├── mlir_tests ├── cpu_ops │ ├── add.mlir │ ├── batch_norm_inference.mlir │ ├── batch_norm_inference_f16.mlir │ ├── broadcast_in_dim.mlir │ ├── compare_LT_f32.mlir │ ├── compare_LT_f64.mlir │ ├── compare_LT_i32.mlir │ ├── compare_LT_i64.mlir │ ├── 
compare_NE_f32.mlir │ ├── compare_NE_f64.mlir │ ├── compare_NE_i32.mlir │ ├── compare_NE_i64.mlir │ ├── concatenate.mlir │ ├── convert_f16_f32.mlir │ ├── convert_f16_f64.mlir │ ├── convert_f16_i16.mlir │ ├── convert_f16_i32.mlir │ ├── convert_f16_i64.mlir │ ├── convert_f32_f16.mlir │ ├── convert_f32_f64.mlir │ ├── convert_f32_i16.mlir │ ├── convert_f32_i32.mlir │ ├── convert_f32_i32_special_val.mlir │ ├── convert_f32_i64.mlir │ ├── convert_f64_f16.mlir │ ├── convert_f64_f32.mlir │ ├── convert_f64_i16.mlir │ ├── convert_f64_i32.mlir │ ├── convert_f64_i64.mlir │ ├── convert_i16_f16.mlir │ ├── convert_i16_f32.mlir │ ├── convert_i16_f64.mlir │ ├── convert_i16_i32.mlir │ ├── convert_i16_i64.mlir │ ├── convert_i32_f16.mlir │ ├── convert_i32_f32.mlir │ ├── convert_i32_f64.mlir │ ├── convert_i32_i16.mlir │ ├── convert_i32_i64.mlir │ ├── convert_i64_f16.mlir │ ├── convert_i64_f32.mlir │ ├── convert_i64_f64.mlir │ ├── convert_i64_i16.mlir │ ├── convert_i64_i32.mlir │ ├── custom_call_byteir_addn.mlir │ ├── custom_call_byteir_arg_max.mlir │ ├── custom_call_byteir_arg_max_i32.mlir │ ├── custom_call_byteir_arg_min.mlir │ ├── custom_call_byteir_arg_min_i32.mlir │ ├── custom_call_byteir_softmax.mlir │ ├── custom_call_tf_UpperBound.mlir │ ├── divide_f16.mlir │ ├── log_plus_one_f16.mlir │ ├── maximum_f32.mlir │ ├── maximum_f64.mlir │ ├── maximum_i32.mlir │ ├── maximum_i64.mlir │ ├── minimum_f32.mlir │ ├── minimum_f64.mlir │ ├── minimum_i32.mlir │ ├── minimum_i64.mlir │ ├── multiply_f32.mlir │ ├── multiply_f64.mlir │ ├── multiply_i32.mlir │ ├── multiply_i64.mlir │ ├── reduce_f32.mlir │ ├── remainder_i64.mlir │ ├── reshape_slice.mlir │ ├── rng.mlir │ ├── scatter_insert_slice.mlir │ ├── select_f32.mlir │ ├── select_f64.mlir │ ├── select_i64.mlir │ ├── slice_view_like.mlir │ └── subtrace_f16.mlir └── ops │ ├── add.mlir │ ├── bmm_rcr.mlir │ ├── bmm_rrc.mlir │ ├── bmm_rrr_add_f16.mlir │ ├── bmm_rrr_f16.mlir │ ├── bmm_rrr_permute_f16.mlir │ ├── bmm_rrr_permute_f32.mlir │ ├── broadcast.mlir 
│ ├── broadcast1.mlir │ ├── compare_eq.mlir │ ├── compare_lt.mlir │ ├── concat.mlir │ ├── concat2.mlir │ ├── convert_f16_f32.mlir │ ├── convert_f32_f16.mlir │ ├── divide.mlir │ ├── gather.mlir │ ├── gemm_crr_f16.mlir │ ├── gemm_rrr_f16.mlir │ ├── gemm_rrr_f32.mlir │ ├── insert_slice.mlir │ ├── layernorm.mlir │ ├── logistic.mlir │ ├── mul_f16.mlir │ ├── mul_f32.mlir │ ├── negate.mlir │ ├── power.mlir │ ├── reduce_first_dim.mlir │ ├── reduce_sum.mlir │ ├── reduce_sum_2d.mlir │ ├── reduce_sum_first_2d.mlir │ ├── rsqrt.mlir │ ├── scatter.mlir │ ├── scatter_insert_slice.mlir │ ├── select.mlir │ ├── slice.mlir │ ├── softmax.mlir │ ├── transpose0312.mlir │ ├── transpose102.mlir │ ├── transpose1023.mlir │ ├── transpose120.mlir │ ├── transpose1203.mlir │ ├── transpose2013.mlir │ └── transpose2d.mlir ├── profiler.py ├── reporting.py ├── testset.py ├── torch_dynamo_e2e_testing ├── backend.py ├── execute.py └── test_suite │ └── test_flash_attn.py └── torch_e2e_testing ├── framework.py ├── registry.py └── test_suite ├── __init__.py └── basic.py /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: LLVM 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug 3 | about: Create a bug report 4 | title: "[BUG]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior. The easier it is to reproduce the faster it will get maintainer attention. 15 | 16 | **Expected behavior** 17 | A clear and concise description of what you expected to happen. 18 | 19 | **Stack trace/logs** 20 | If applicable, add the stack trace or logs from the time of the error. 
21 | 22 | **Environment** 23 | 24 | **Proposed fix** 25 | If you have a proposal for how to fix the issue state it here or link to a PR. 26 | 27 | **Additional context** 28 | Add any other context about the problem here. 29 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question 3 | about: Ask a general question about ByteIR 4 | title: "[QUESTION]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Your question** 11 | Ask a clear and concise question about ByteIR. 12 | -------------------------------------------------------------------------------- /compiler/.gitignore: -------------------------------------------------------------------------------- 1 | python/byteir.egg-info/ 2 | python/byteir/version.py -------------------------------------------------------------------------------- /compiler/dialects/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(include) 2 | add_subdirectory(lib) -------------------------------------------------------------------------------- /compiler/dialects/include/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(byteir) -------------------------------------------------------------------------------- /compiler/dialects/include/byteir/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Dialect) -------------------------------------------------------------------------------- /compiler/dialects/include/byteir/Dialect/Ace/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_dialect(AceOps ace) 2 | add_mlir_doc(AceOps AceOps Dialects/ -gen-op-doc) 3 | 4 | set(LLVM_TARGET_DEFINITIONS AceOps.td) 5 | 
mlir_tablegen(AceOpsAttributes.h.inc -gen-attrdef-decls) 6 | mlir_tablegen(AceOpsAttributes.cpp.inc -gen-attrdef-defs) 7 | add_public_tablegen_target(MLIRAceOpsAttrIncGen) 8 | -------------------------------------------------------------------------------- /compiler/dialects/include/byteir/Dialect/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Ace) 2 | add_subdirectory(Ccl) -------------------------------------------------------------------------------- /compiler/dialects/include/byteir/Dialect/Ccl/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) -------------------------------------------------------------------------------- /compiler/dialects/include/byteir/Dialect/Ccl/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_dialect(CclOps ccl) 2 | add_mlir_doc(CclOps CclOps Dialects/ -gen-op-doc) 3 | add_mlir_interface(CclOpInterface) 4 | -------------------------------------------------------------------------------- /compiler/dialects/lib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Dialect) -------------------------------------------------------------------------------- /compiler/dialects/lib/Dialect/Ace/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(MLIRAceDialect 2 | IR/AceDialect.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/mlir/Dialect/Ace 6 | 7 | DEPENDS 8 | MLIRAceOpsIncGen 9 | MLIRAceOpsAttrIncGen 10 | 11 | LINK_LIBS PUBLIC 12 | MLIRIR 13 | MLIRSupport 14 | MLIRSideEffectInterfaces 15 | ) 16 | -------------------------------------------------------------------------------- /compiler/dialects/lib/Dialect/CMakeLists.txt: 
-------------------------------------------------------------------------------- 1 | add_subdirectory(Ace) 2 | add_subdirectory(Ccl) -------------------------------------------------------------------------------- /compiler/dialects/lib/Dialect/Ccl/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) -------------------------------------------------------------------------------- /compiler/dialects/lib/Dialect/Ccl/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(MLIRCclDialect 2 | CclOps.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/Ccl 6 | 7 | DEPENDS 8 | MLIRCclOpsIncGen 9 | 10 | LINK_LIBS PUBLIC 11 | MLIRIR 12 | MLIRSupport 13 | ) 14 | -------------------------------------------------------------------------------- /compiler/doc/passes.md: -------------------------------------------------------------------------------- 1 | # Useful Passes 2 | 3 | This is a placeholder for passes we built and we will call. 
4 | -------------------------------------------------------------------------------- /compiler/include/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(byteir) 2 | -------------------------------------------------------------------------------- /compiler/include/byteir/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Conversion) 2 | add_subdirectory(Dialect) 3 | add_subdirectory(Transforms) -------------------------------------------------------------------------------- /compiler/include/byteir/Conversion/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name ByteIRConversion) 3 | add_public_tablegen_target(ByteIRConversionPassIncGen) 4 | # add_mlir_doc(Passes ConversionPasses ./ -gen-pass-doc) -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Ace/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name ByteIRAce) 3 | add_public_tablegen_target(ByteIRAcePassIncGen) 4 | -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Affine/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name ByteIRAffine) 3 | add_public_tablegen_target(ByteIRAffinePassIncGen) 4 | -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Byre/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_dialect(ByreOps byre) 2 | 
add_mlir_doc(ByreOps ByreOps Dialects/ -gen-op-doc) 3 | 4 | set(LLVM_TARGET_DEFINITIONS ByreBase.td) 5 | mlir_tablegen(ByreOpInterfaces.h.inc -gen-op-interface-decls) 6 | mlir_tablegen(ByreOpInterfaces.cpp.inc -gen-op-interface-defs) 7 | mlir_tablegen(ByreEnums.h.inc -gen-enum-decls) 8 | mlir_tablegen(ByreEnums.cpp.inc -gen-enum-defs) 9 | 10 | add_public_tablegen_target(MLIRByreOpInterfacesIncGen) 11 | 12 | set(LLVM_TARGET_DEFINITIONS Passes.td) 13 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name ByteIRByre) 14 | add_public_tablegen_target(ByteIRByrePassIncGen) 15 | 16 | add_subdirectory(Serialization) 17 | -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Ace) 2 | add_subdirectory(Affine) 3 | add_subdirectory(Byre) 4 | add_subdirectory(Cat) 5 | add_subdirectory(Ccl) 6 | add_subdirectory(GPU) 7 | add_subdirectory(Lace) 8 | add_subdirectory(Linalg) 9 | add_subdirectory(MemRef) 10 | add_subdirectory(mhlo) 11 | add_subdirectory(SCF) 12 | add_subdirectory(Shape) 13 | add_subdirectory(Tensor) 14 | add_subdirectory(Transform) 15 | add_subdirectory(Vector) 16 | add_subdirectory(Lccl) 17 | -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Cat/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Cat/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_dialect(CatOps cat) 2 | add_mlir_doc(CatOps CatOps Dialects/ -gen-op-doc) 3 | 4 | set(LLVM_TARGET_DEFINITIONS CatBase.td) 5 | mlir_tablegen(CatOpInterfaces.h.inc -gen-op-interface-decls) 6 | mlir_tablegen(CatOpInterfaces.cpp.inc 
-gen-op-interface-defs) 7 | 8 | add_public_tablegen_target(MLIRCatOpInterfacesIncGen) -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Ccl/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(TransformOps) 2 | 3 | set(LLVM_TARGET_DEFINITIONS Passes.td) 4 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name ByteIRCcl) 5 | add_public_tablegen_target(ByteIRCclPassIncGen) -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Ccl/TransformOps/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS CclTransformOps.td) 2 | mlir_tablegen(CclTransformOps.h.inc -gen-op-decls) 3 | mlir_tablegen(CclTransformOps.cpp.inc -gen-op-defs) 4 | add_public_tablegen_target(MLIRCclTransformOpsIncGen) 5 | 6 | add_mlir_doc(CclTransformOps CclTransformOps Dialects/ -gen-op-doc) -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/GPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(TransformOps) 2 | 3 | set(LLVM_TARGET_DEFINITIONS Passes.td) 4 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name ByteIRGPU) 5 | add_public_tablegen_target(ByteIRGPUPassIncGen) 6 | -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/GPU/TransformOps/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS GPUExtTransformOps.td) 2 | mlir_tablegen(GPUExtTransformOps.h.inc -gen-op-decls) 3 | mlir_tablegen(GPUExtTransformOps.cpp.inc -gen-op-defs) 4 | add_public_tablegen_target(MLIRGPUExtTransformOpsIncGen) 5 | 6 | add_mlir_doc(GPUExtTransformOps GPUExtTransformOps Dialects/ -gen-op-doc) 
-------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Lace/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_dialect(LaceOps lace) 2 | add_mlir_doc(LaceOps LaceOps Dialects/ -gen-op-doc) 3 | 4 | set(LLVM_TARGET_DEFINITIONS LaceBase.td) 5 | mlir_tablegen(LaceOpInterfaces.h.inc -gen-op-interface-decls) 6 | mlir_tablegen(LaceOpInterfaces.cpp.inc -gen-op-interface-defs) 7 | 8 | add_public_tablegen_target(MLIRLaceOpInterfacesIncGen) 9 | -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Lccl/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_dialect(LcclOps lccl) 2 | add_mlir_doc(LcclOps LcclOps Dialects/ -gen-op-doc) 3 | -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Linalg/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | add_subdirectory(TransformOps) 3 | 4 | set(LLVM_TARGET_DEFINITIONS Passes.td) 5 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name ByteIRLinalg) 6 | add_public_tablegen_target(ByteIRLinalgPassIncGen) 7 | -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Linalg/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_dialect(LinalgExtOps linalg_ext) 2 | add_mlir_doc(LinalgExtOps LinalgExtOps Dialects/ -gen-op-doc) 3 | 4 | 5 | set(LLVM_TARGET_DEFINITIONS LinalgExtInterfaces.td) 6 | mlir_tablegen(LinalgExtOpInterfaces.h.inc -gen-op-interface-decls) 7 | mlir_tablegen(LinalgExtOpInterfaces.cpp.inc -gen-op-interface-defs) 8 | add_public_tablegen_target(MLIRLinalgExtInterfacesIncGen) 9 | add_dependencies(MLIRLinalgExtOpsIncGen MLIRLinalgExtInterfacesIncGen) 10 | 
-------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Linalg/TransformOps/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS LinalgExtTransformOps.td) 2 | mlir_tablegen(LinalgExtTransformOps.h.inc -gen-op-decls) 3 | mlir_tablegen(LinalgExtTransformOps.cpp.inc -gen-op-defs) 4 | add_public_tablegen_target(MLIRLinalgExtTransformOpsIncGen) 5 | 6 | add_mlir_doc(LinalgExtTransformOps LinalgExtTransformOps Dialects/ -gen-op-doc) -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/MemRef/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name ByteIRMemRef) 3 | add_public_tablegen_target(ByteIRMemRefPassIncGen) 4 | -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/SCF/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name ByteIRSCF) 3 | add_public_tablegen_target(ByteIRSCFPassIncGen) 4 | -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Shape/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | 3 | set(LLVM_TARGET_DEFINITIONS Passes.td) 4 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name ByteIRShape) 5 | add_public_tablegen_target(ByteIRShapePassIncGen) 6 | -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Shape/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_dialect(ShapeExtOps 
shape_ext) 2 | add_mlir_doc(ShapeExtOps ShapeExtOps Dialects/ -gen-op-doc) 3 | 4 | -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Tensor/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name ByteIRTensor) 3 | add_public_tablegen_target(ByteIRTensorPassIncGen) 4 | -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Transform/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | 3 | set(LLVM_TARGET_DEFINITIONS Passes.td) 4 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name ByteIRTransform) 5 | add_public_tablegen_target(ByteIRTransformPassIncGen) 6 | -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Transform/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS TransformExtOps.td) 2 | mlir_tablegen(TransformExtOps.h.inc -gen-op-decls) 3 | mlir_tablegen(TransformExtOps.cpp.inc -gen-op-defs) 4 | add_public_tablegen_target(MLIRTransformExtOpsIncGen) 5 | 6 | add_mlir_doc(TransformExtOps TransformExtOps Dialects/ -gen-op-doc) -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Vector/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Transforms) -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Vector/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name 
ByteIRVector) 3 | add_public_tablegen_target(ByteIRVectorPassIncGen) 4 | -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/mhlo/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name ByteIRMhlo) 3 | add_public_tablegen_target(ByteIRMhloPassIncGen) 4 | -------------------------------------------------------------------------------- /compiler/include/byteir/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name ByteIRTransforms) 3 | add_public_tablegen_target(ByteIRTransformsPassIncGen) 4 | # add_mlir_doc(Passes TransformsPasses ./ -gen-pass-doc) -------------------------------------------------------------------------------- /compiler/lib/Analysis/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_library(ByteIRAnalysis 2 | DimFlag.cpp 3 | Liveness.cpp 4 | OpDependence.cpp 5 | ShapeAnalysis.cpp 6 | SideEffect.cpp 7 | SymbolicShape.cpp 8 | UseRange.cpp 9 | 10 | ADDITIONAL_HEADER_DIRS 11 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Analysis 12 | 13 | LINK_LIBS PUBLIC 14 | MLIRAnalysis 15 | MLIRBufferizationTransforms 16 | MLIRIR 17 | MLIRShapeDialect 18 | MLIRTensorDialect 19 | ) -------------------------------------------------------------------------------- /compiler/lib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Analysis) 2 | add_subdirectory(Conversion) 3 | add_subdirectory(Dialect) 4 | add_subdirectory(Pipelines) 5 | add_subdirectory(Stat) 6 | add_subdirectory(Target) 7 | add_subdirectory(Transforms) 8 | add_subdirectory(Utils) 9 | 10 | # note: CAPI depends on byteir property, so add it at last 11 
| add_subdirectory(CAPI) 12 | -------------------------------------------------------------------------------- /compiler/lib/Conversion/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Common) 2 | add_subdirectory(FuncToByre) 3 | add_subdirectory(GPUToNVVM) 4 | add_subdirectory(HloToByreTensor) 5 | add_subdirectory(HloToCat) 6 | add_subdirectory(HloToTensor) 7 | add_subdirectory(MemrefToByre) 8 | add_subdirectory(ToAce) 9 | add_subdirectory(ToAIT) 10 | add_subdirectory(ToByre) 11 | add_subdirectory(ToGPU) 12 | add_subdirectory(ToHlo) 13 | add_subdirectory(ToLinalg) 14 | add_subdirectory(ToLLVM) 15 | add_subdirectory(ToPTX) 16 | add_subdirectory(LcclToByre) 17 | -------------------------------------------------------------------------------- /compiler/lib/Conversion/Common/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_conversion_library(ByteIRConversionCommon 2 | FunctionSupport.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Conversion/Common 6 | 7 | LINK_LIBS PUBLIC 8 | MLIRIR 9 | MLIRMemRefDialect 10 | MLIRTransforms 11 | ) 12 | -------------------------------------------------------------------------------- /compiler/lib/Conversion/FuncToByre/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_conversion_library(ByteIRFuncToByre 2 | FuncToByre.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Conversion/FuncToByre 6 | 7 | DEPENDS 8 | MLIRByreDialect 9 | ByteIRConversionPassIncGen 10 | ByteIRConversionCommon 11 | ByteIRMhloUtils 12 | 13 | LINK_LIBS PUBLIC 14 | MLIRArithDialect 15 | MLIRByreDialect 16 | MLIRIR 17 | MLIRMemRefDialect 18 | MLIRTensorDialect 19 | MLIRTransforms 20 | ByteIRConversionCommon 21 | ByteIRMhloUtils 22 | ) 23 | -------------------------------------------------------------------------------- 
/compiler/lib/Conversion/GPUToNVVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_conversion_library(ByteIRGPUToNVVM 2 | GPUToNVVM.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Conversion/GPUToNVVM 6 | 7 | DEPENDS 8 | ByteIRConversionPassIncGen 9 | ByteIRConversionCommon 10 | 11 | LINK_LIBS PUBLIC 12 | MLIRArithToLLVM 13 | MLIRFuncToLLVM 14 | MLIRGPUDialect 15 | MLIRGPUToGPURuntimeTransforms 16 | MLIRGPUToNVVMTransforms 17 | MLIRLLVMCommonConversion 18 | MLIRLLVMDialect 19 | MLIRMathTransforms 20 | MLIRMemRefDialect 21 | MLIRMemRefToLLVM 22 | MLIRNVVMDialect 23 | MLIRPass 24 | MLIRTransformUtils 25 | ) 26 | -------------------------------------------------------------------------------- /compiler/lib/Conversion/HloToByreTensor/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_conversion_library(ByteIRHloToByreTensor 2 | HloToByreCustom.cpp 3 | HloToByreTensor.cpp 4 | 5 | ADDITIONAL_HEADER_DIRS 6 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Conversion/HloToByreTensor 7 | 8 | DEPENDS 9 | MLIRByreDialect 10 | ByteIRConversionPassIncGen 11 | ByteIRConversionCommon 12 | ByteIRMhloUtils 13 | 14 | LINK_LIBS PUBLIC 15 | MhloDialect 16 | MLIRAceDialect 17 | MLIRArithDialect 18 | MLIRByreDialect 19 | MLIRIR 20 | MLIRTensorDialect 21 | MLIRTransforms 22 | ByteIRConversionCommon 23 | ByteIRMhloUtils 24 | ) 25 | -------------------------------------------------------------------------------- /compiler/lib/Conversion/HloToTensor/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_conversion_library(ByteIRHloToTensor 2 | ConvertHloToTensor.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Conversion/HloToTensor 6 | 7 | DEPENDS 8 | MLIRTensorDialect 9 | MLIRArithDialect 10 | ByteIRConversionPassIncGen 11 | ByteIRConversionCommon 12 | ByteIRMhloUtils 13 | 14 | 
LINK_LIBS PUBLIC 15 | MhloDialect 16 | MLIRTensorDialect 17 | MLIRArithDialect 18 | MLIRIR 19 | ByteIRConversionCommon 20 | ) 21 | -------------------------------------------------------------------------------- /compiler/lib/Conversion/LcclToByre/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_conversion_library(ByteIRLcclToByre 2 | LcclToByre.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Conversion/LcclToByre 6 | 7 | DEPENDS 8 | MLIRByreDialect 9 | ByteIRConversionPassIncGen 10 | ByteIRConversionCommon 11 | 12 | LINK_LIBS PUBLIC 13 | MLIRArithDialect 14 | MLIRByreDialect 15 | MLIRIR 16 | MLIRMemRefDialect 17 | MLIRTransforms 18 | ByteIRConversionCommon 19 | ) 20 | -------------------------------------------------------------------------------- /compiler/lib/Conversion/MemrefToByre/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_conversion_library(ByteIRMemrefToByre 2 | MemrefToByre.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Conversion/MemrefToByre 6 | 7 | DEPENDS 8 | MLIRByreDialect 9 | ByteIRConversionPassIncGen 10 | ByteIRConversionCommon 11 | 12 | LINK_LIBS PUBLIC 13 | MLIRArithDialect 14 | MLIRByreDialect 15 | MLIRIR 16 | MLIRMemRefDialect 17 | MLIRTensorDialect 18 | MLIRTransforms 19 | ByteIRConversionCommon 20 | ) 21 | -------------------------------------------------------------------------------- /compiler/lib/Conversion/ToAIT/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_conversion_library(ByteIRToAIT 2 | GenAITConfig.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Conversion/ToAIT 6 | 7 | DEPENDS 8 | ByteIRConversionPassIncGen 9 | 10 | LINK_LIBS PUBLIC 11 | MLIRIR 12 | MLIRBufferizationTransforms 13 | ByteIRUtils 14 | ) 15 | 
-------------------------------------------------------------------------------- /compiler/lib/Conversion/ToAce/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS MhloToAceActivationPattern.td) 2 | mlir_tablegen(MhloToAceActivationPattern.inc -gen-rewriters) 3 | add_public_tablegen_target(MhloToAceActivationPatternIncGen) 4 | 5 | add_byteir_conversion_library(ByteIRToAce 6 | MhloToAce.cpp 7 | 8 | DEPENDS 9 | ByteIRConversionPassIncGen 10 | ByteIRConversionCommon 11 | MhloToAceActivationPatternIncGen 12 | MLIRAceDialect 13 | 14 | LINK_LIBS PUBLIC 15 | ByteIRConversionCommon 16 | ByteIRUtils 17 | MhloDialect 18 | MLIRAceDialect 19 | MLIRIR 20 | ) 21 | 22 | target_include_directories(ByteIRToAce PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) 23 | -------------------------------------------------------------------------------- /compiler/lib/Conversion/ToByre/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_conversion_library(ByteIRToByre 2 | ToByre.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Conversion/ToByre 6 | 7 | DEPENDS 8 | MLIRAceDialect 9 | MLIRByreDialect 10 | MLIRLaceDialect 11 | ByteIRConversionPassIncGen 12 | ByteIRConversionCommon 13 | ByteIRMhloUtils 14 | 15 | LINK_LIBS PUBLIC 16 | MLIRAceDialect 17 | MLIRByreDialect 18 | MLIRIR 19 | MLIRLaceDialect 20 | MLIRMemRefDialect 21 | MLIRTransforms 22 | ByteIRConversionCommon 23 | ByteIRMhloUtils 24 | ) 25 | -------------------------------------------------------------------------------- /compiler/lib/Conversion/ToGPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_conversion_library(ByteIRToGPU 2 | CoalescedForToGPU.cpp 3 | FuncToGPU.cpp 4 | Utils.cpp 5 | 6 | ADDITIONAL_HEADER_DIRS 7 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Conversion/ToGPU 8 | 9 | DEPENDS 10 | ByteIRConversionPassIncGen 11 | 
ByteIRConversionCommon 12 | ByteIRUtils 13 | 14 | LINK_LIBS PUBLIC 15 | MLIRIR 16 | MLIRAffineDialect 17 | MLIRGPUDialect 18 | MLIRMemRefDialect 19 | MLIRSCFDialect 20 | MLIRTransforms 21 | ByteIRConversionCommon 22 | ByteIRUtils 23 | ) 24 | -------------------------------------------------------------------------------- /compiler/lib/Conversion/ToHlo/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS ArithToMhloPattern.td) 2 | mlir_tablegen(ArithToMhloPattern.inc -gen-rewriters) 3 | add_public_tablegen_target(ArithToMhloPatternIncGen) 4 | 5 | add_byteir_conversion_library(ByteIRToMhlo 6 | ArithToMhlo.cpp 7 | 8 | DEPENDS 9 | ByteIRConversionPassIncGen 10 | ArithToMhloPatternIncGen 11 | 12 | LINK_LIBS PUBLIC 13 | MhloDialect 14 | MLIRArithDialect 15 | MLIRIR 16 | ) 17 | 18 | target_include_directories(ByteIRToMhlo PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) 19 | -------------------------------------------------------------------------------- /compiler/lib/Conversion/ToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_conversion_library(ByteIRToLLVM 2 | CollectFuncToLLVM.cpp 3 | GenLLVMConfig.cpp 4 | 5 | ADDITIONAL_HEADER_DIRS 6 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Conversion/ToLLVM 7 | 8 | DEPENDS 9 | ByteIRConversionPassIncGen 10 | 11 | LINK_LIBS PUBLIC 12 | MLIRIR 13 | MLIRBufferizationTransforms 14 | ByteIRUtils 15 | ) 16 | -------------------------------------------------------------------------------- /compiler/lib/Conversion/ToLinalg/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_conversion_library(ByteIRToLinalg 2 | HloToLinalg.cpp 3 | LinalgExtToLinalg.cpp 4 | MemrefCopyToLinalg.cpp 5 | TensorToLinalg.cpp 6 | UnrealizedCastToLinalg.cpp 7 | 8 | ADDITIONAL_HEADER_DIRS 9 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Conversion/HloToLinalg 10 | 11 | DEPENDS 12 | 
ByteIRConversionPassIncGen 13 | ByteIRConversionCommon 14 | HloToLinalgUtils 15 | 16 | LINK_LIBS PUBLIC 17 | MLIRIR 18 | MhloDialect 19 | MhloToLinalg 20 | MLIRRewrite 21 | MLIRLinalgDialect 22 | MLIRMathDialect 23 | MLIRMemRefDialect 24 | MLIRSCFDialect 25 | MLIRTransforms 26 | ByteIRConversionCommon 27 | HloToLinalgUtils 28 | ) 29 | -------------------------------------------------------------------------------- /compiler/lib/Conversion/ToPTX/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_conversion_library(ByteIRToPTX 2 | CollectGPUKernel.cpp 3 | GenPTXConfig.cpp 4 | 5 | ADDITIONAL_HEADER_DIRS 6 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Conversion/ToPTX 7 | 8 | DEPENDS 9 | ByteIRConversionPassIncGen 10 | 11 | LINK_LIBS PUBLIC 12 | MLIRIR 13 | MLIRMemRefDialect 14 | MLIRGPUDialect 15 | MLIRTransforms 16 | ) 17 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/Ace/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(ByteIRAcePasses 2 | Transforms/BufferizableOpInterfaceImpl.cpp 3 | Transforms/Bufferize.cpp 4 | 5 | ADDITIONAL_HEADER_DIRS 6 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/Ace 7 | 8 | DEPENDS 9 | ByteIRAcePassIncGen 10 | MLIRAceDialect 11 | MLIRLaceDialect 12 | 13 | LINK_LIBS PUBLIC 14 | MLIRIR 15 | MLIRSupport 16 | 17 | MLIRAceDialect 18 | MLIRBufferizationDialect 19 | MLIRBufferizationTransforms 20 | MLIRLaceDialect 21 | MLIRMemRefDialect 22 | ) 23 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/Affine/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(ByteIRAffinePasses 2 | Transforms/AffineLoopFusionEx.cpp 3 | Transforms/InsertTrivialAffineLoop.cpp 4 | Transforms/RewriteAffineToMemref.cpp 5 | 6 | ADDITIONAL_HEADER_DIRS 7 | 
${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/Affine 8 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/Affine/Transforms 9 | 10 | DEPENDS 11 | ByteIRAffinePassIncGen 12 | ByteIRUtils 13 | MLIRAffineDialect 14 | MLIRMemRefDialect 15 | 16 | LINK_LIBS PUBLIC 17 | ByteIRUtils 18 | MLIRIR 19 | MLIRAffineDialect 20 | MLIRMemRefDialect 21 | MLIRSideEffectInterfaces 22 | MLIRSupport 23 | ) 24 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Ace) 2 | add_subdirectory(Affine) 3 | add_subdirectory(Byre) 4 | add_subdirectory(Cat) 5 | add_subdirectory(Ccl) 6 | add_subdirectory(GPU) 7 | add_subdirectory(Lace) 8 | add_subdirectory(Linalg) 9 | add_subdirectory(MemRef) 10 | add_subdirectory(mhlo) 11 | add_subdirectory(SCF) 12 | add_subdirectory(Shape) 13 | add_subdirectory(Tensor) 14 | add_subdirectory(Transform) 15 | add_subdirectory(Vector) 16 | add_subdirectory(Lccl) 17 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/Cat/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) -------------------------------------------------------------------------------- /compiler/lib/Dialect/Cat/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(MLIRCatDialect 2 | CatDialect.cpp 3 | 4 | DEPENDS 5 | MLIRCatOpsIncGen 6 | MLIRCatOpInterfacesIncGen 7 | 8 | LINK_LIBS PUBLIC 9 | MLIRIR 10 | MLIRSupport 11 | ) 12 | 13 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/Ccl/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(TransformOps) 2 | add_subdirectory(Transforms) 
-------------------------------------------------------------------------------- /compiler/lib/Dialect/Ccl/TransformOps/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(MLIRCclTransformOps 2 | CclTransformOps.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/Ccl/TransformOps 6 | 7 | DEPENDS 8 | MLIRCclDialect 9 | MLIRCclTransformOpsIncGen 10 | 11 | 12 | LINK_LIBS PUBLIC 13 | MLIRCclDialect 14 | MLIRIR 15 | MLIRParser 16 | MLIRPDLDialect 17 | MLIRSideEffectInterfaces 18 | MLIRTransformDialect 19 | ) 20 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/Ccl/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(ByteIRCclPasses 2 | CclMoveDown.cpp 3 | CclBufferizeOpInterfaceImpl.cpp 4 | 5 | ADDITIONAL_HEADER_DIRS 6 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/Ccl 7 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/Ccl/Transforms 8 | 9 | DEPENDS 10 | ByteIRCclPassIncGen 11 | 12 | LINK_LIBS PUBLIC 13 | MLIRLcclDialect 14 | MLIRIR 15 | MhloDialect 16 | MLIRSupport 17 | ) 18 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/GPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Transforms) 2 | add_subdirectory(TransformOps) -------------------------------------------------------------------------------- /compiler/lib/Dialect/GPU/TransformOps/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(MLIRGPUExtTransformOps 2 | GPUExtTransformOps.cpp 3 | Utils.cpp 4 | 5 | ADDITIONAL_HEADER_DIRS 6 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/GPU/TransformOps 7 | 8 | 9 | DEPENDS 10 | MLIRGPUExtTransformOpsIncGen 11 | 12 | LINK_LIBS PUBLIC 13 | 
MLIRGPUDialect 14 | MLIRGPUTransforms 15 | MLIRIR 16 | MLIRParser 17 | MLIRSideEffectInterfaces 18 | MLIRTransformDialect 19 | MLIRPDLDialect 20 | MLIRSCFDialect 21 | ) 22 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/GPU/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(ByteIRGPUPasses 2 | GPUBlockSwizzle.cpp 3 | GPUDistributeSharedMemoryCopy.cpp 4 | GPUDistributeToWarp.cpp 5 | GPUTensorCoreVectorization.cpp 6 | GPUPackSharedMemoryAlloc.cpp 7 | OptimizeVectorTransfer.cpp 8 | RemoveTrivialLoops.cpp 9 | ShmAllocaToWorkgroupArg.cpp 10 | Utils.cpp 11 | 12 | ADDITIONAL_HEADER_DIRS 13 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/GPU 14 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/GPU/Transforms 15 | 16 | DEPENDS 17 | ByteIRGPUPassIncGen 18 | ByteIRUtils 19 | MLIRGPUDialect 20 | 21 | LINK_LIBS PUBLIC 22 | ByteIRUtils 23 | MLIRIR 24 | MLIRGPUDialect 25 | MLIRMemRefDialect 26 | MLIRSupport 27 | ) 28 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/Lace/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(MLIRLaceDialect 2 | IR/LaceDialect.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/mlir/Dialect/Lace 6 | 7 | DEPENDS 8 | MLIRAceOpsIncGen # ace types 9 | MLIRLaceOpsIncGen 10 | MLIRLaceOpInterfacesIncGen 11 | 12 | LINK_LIBS PUBLIC 13 | MLIRIR 14 | MLIRSupport 15 | MLIRViewLikeInterface 16 | MLIRAceDialect 17 | ) 18 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/Lccl/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/Lccl/IR/CMakeLists.txt: 
-------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(MLIRLcclDialect 2 | LcclOps.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | 6 | DEPENDS 7 | MLIRLcclOpsIncGen 8 | 9 | LINK_LIBS PUBLIC 10 | MLIRIR 11 | MLIRSupport 12 | ) 13 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/Linalg/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | add_subdirectory(TransformOps) 3 | add_subdirectory(Transforms) 4 | add_subdirectory(Util) -------------------------------------------------------------------------------- /compiler/lib/Dialect/Linalg/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(MLIRLinalgExt 2 | LinalgExtInterfaces.cpp 3 | LinalgExtOps.cpp 4 | 5 | DEPENDS 6 | MLIRLinalgExtInterfacesIncGen 7 | MLIRLinalgExtOpsIncGen 8 | 9 | LINK_LIBS PUBLIC 10 | MLIRAnalysis 11 | MLIRIR 12 | MLIRLinalgUtils 13 | MLIRSupport 14 | MLIRLinalgExtUtils 15 | MLIRSCFExtUtils 16 | ) 17 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/Linalg/TransformOps/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(MLIRLinalgExtTransformOps 2 | LinalgExtTransformOps.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/Linalg/TransformOps 6 | 7 | DEPENDS 8 | MLIRCclDialect 9 | MLIRLinalgExtTransformOpsIncGen 10 | ByteIRLinalgPasses 11 | 12 | LINK_LIBS PUBLIC 13 | ByteIRLinalgPasses 14 | MLIRAffineDialect 15 | MLIRArithDialect 16 | MLIRCclDialect 17 | MLIRIR 18 | MLIRLinalgDialect 19 | MLIRLinalgTransforms 20 | MLIRParser 21 | MLIRPDLDialect 22 | MLIRSCFDialect 23 | MLIRSideEffectInterfaces 24 | MLIRTensorTilingInterfaceImplExt 25 | MLIRTransformDialect 26 | MLIRVectorDialect 27 | ) 28 | 
-------------------------------------------------------------------------------- /compiler/lib/Dialect/Linalg/Util/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(MLIRLinalgExtUtils 2 | Util.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/Linalg 6 | 7 | LINK_LIBS PUBLIC 8 | MLIRIR 9 | MLIRLinalgDialect 10 | ) 11 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/SCF/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Transforms) 2 | add_subdirectory(Util) 3 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/SCF/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(ByteIRSCFPasses 2 | ForallCollapsing.cpp 3 | FuseNestedForall.cpp 4 | InsertTrivialSCFLoop.cpp 5 | TilingInterfaceToSCFFor.cpp 6 | RemoveSingleIterationLoop.cpp 7 | 8 | ADDITIONAL_HEADER_DIRS 9 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/SCF 10 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/SCF/Transforms 11 | 12 | DEPENDS 13 | ByteIRSCFPassIncGen 14 | ByteIRUtils 15 | 16 | LINK_LIBS PUBLIC 17 | ByteIRUtils 18 | MLIRIR 19 | MLIRMemRefDialect 20 | MLIRSCFDialect 21 | MLIRSCFTransforms 22 | MLIRSideEffectInterfaces 23 | MLIRSupport 24 | ) 25 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/SCF/Util/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(MLIRSCFExtUtils 2 | Util.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/SCF 6 | 7 | LINK_LIBS PUBLIC 8 | MLIRIR 9 | MLIRSCFDialect 10 | ) 11 | -------------------------------------------------------------------------------- 
/compiler/lib/Dialect/Tensor/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | add_subdirectory(Transforms) 3 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/Tensor/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(MLIRTensorTilingInterfaceImplExt 2 | TilingInterfaceImpl.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/Tensor 6 | 7 | DEPENDS 8 | ByteIRUtils 9 | 10 | LINK_LIBS PUBLIC 11 | ByteIRUtils 12 | MLIRAffineDialect 13 | MLIRIR 14 | MLIRLinalgDialect 15 | MLIRSCFDialect 16 | MLIRSupport 17 | MLIRTensorDialect 18 | MLIRTilingInterface 19 | ) 20 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/Tensor/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(ByteIRTensorPasses 2 | CanonicalizeExt.cpp 3 | ExtractSliceSpecialization.cpp 4 | TensorPadSpecialization.cpp 5 | 6 | ADDITIONAL_HEADER_DIRS 7 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/mhlo 8 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/mhlo/Transforms 9 | 10 | DEPENDS 11 | ByteIRUtils 12 | ByteIRTensorPassIncGen 13 | 14 | LINK_LIBS PUBLIC 15 | MLIRIR 16 | MLIRSupport 17 | MLIRSCFDialect 18 | ByteIRUtils 19 | ) 20 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/Transform/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | add_subdirectory(Transforms) 3 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/Transform/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 
add_byteir_dialect_library(MLIRTransformExtDialect 2 | TransformExtOps.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/Transform/IR 6 | 7 | DEPENDS 8 | MLIRTransformExtOpsIncGen 9 | 10 | LINK_LIBS PUBLIC 11 | MLIRIR 12 | MLIRPass 13 | MLIRPDLDialect 14 | MLIRTransformDialect 15 | MLIRLinalgExtTransformOps 16 | ) 17 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/Transform/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(ByteIRTransformPasses 2 | TransformDialectInterpreter.cpp 3 | TransformInsertion.cpp 4 | 5 | ADDITIONAL_HEADER_DIRS 6 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/Transform/Transforms 7 | 8 | DEPENDS 9 | ByteIRTransformPassIncGen 10 | MLIRLinalgExtTransformOps 11 | MLIRTransformExtOpsIncGen 12 | 13 | LINK_LIBS PUBLIC 14 | MLIRIR 15 | MLIRPass 16 | MLIRPDLDialect 17 | MLIRTransformDialect 18 | MLIRLinalgExtTransformOps 19 | ) 20 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/Vector/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Transforms) 2 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/Vector/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(ByteIRVectorPasses 2 | CanonicalizeExt.cpp 3 | VectorLowerings.cpp 4 | VectorWarpDistribute.cpp 5 | MoveForallRegionIntoWarpOp.cpp 6 | 7 | ADDITIONAL_HEADER_DIRS 8 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/Vector/Transforms 9 | 10 | DEPENDS 11 | ByteIRVectorPassIncGen 12 | ByteIRUtils 13 | 14 | LINK_LIBS PUBLIC 15 | ByteIRUtils 16 | MLIRIR 17 | MLIRSupport 18 | 19 | MLIRAffineDialect 20 | MLIRMemRefDialect 21 | MLIRSCFDialect 22 | MLIRTensorDialect 23 | 
MLIRVectorDialect 24 | ) 25 | -------------------------------------------------------------------------------- /compiler/lib/Pipelines/Common/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_library(ByteIRPipelineCommon 2 | Utils.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Pipelines/Common 6 | 7 | DEPENDS 8 | ByteIRTransforms 9 | ByteIRUtils 10 | MhloDialect 11 | MLIRBufferTransforms 12 | 13 | LINK_LIBS PUBLIC 14 | ByteIRLinalgPasses 15 | ByteIRUtils 16 | ByteIRSCFPasses 17 | MLIRIR 18 | MLIRTransforms 19 | ) -------------------------------------------------------------------------------- /compiler/lib/Pipelines/GPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_pipeline_library(ByteIRGPUPipelines 2 | ElementwiseCodegen.cpp 3 | GPUOpt.cpp 4 | LinalgMemrefGPU.cpp 5 | MappingForall.cpp 6 | NVVMCodegen.cpp 7 | ReductionCodegen.cpp 8 | 9 | ADDITIONAL_HEADER_DIRS 10 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Pipelines/GPU 11 | 12 | DEPENDS 13 | ByteIRPipelineCommon 14 | ByteIRTransforms 15 | ByteIRUtils 16 | MhloDialect 17 | MLIRBufferTransforms 18 | 19 | LINK_LIBS PUBLIC 20 | ByteIRGPUPasses 21 | ByteIRLinalgPasses 22 | ByteIRPipelineCommon 23 | ByteIRUtils 24 | ByteIRSCFPasses 25 | ByteIRToPTX 26 | MLIRIR 27 | MLIRTransforms 28 | MLIRLinalgExtTransformOps 29 | ) -------------------------------------------------------------------------------- /compiler/lib/Pipelines/Host/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_pipeline_library(ByteIRHostPipelines 2 | Codegen.cpp 3 | HostOpt.cpp 4 | ToLLVM.cpp 5 | 6 | ADDITIONAL_HEADER_DIRS 7 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Pipelines/Host 8 | 9 | DEPENDS 10 | MLIRTransformExtDialect 11 | MLIRLinalgExtTransformOps 12 | 13 | LINK_LIBS PUBLIC 14 | MLIRIR 15 | MLIRFuncDialect 16 | ByteIRToLLVM 17 | ByteIRPipelineCommon 18 | 
ByteIRTransformPasses 19 | ByteIRVectorPasses 20 | MLIRTransformExtDialect 21 | MLIRLinalgExtTransformOps 22 | MLIRArithTransforms 23 | MLIRBufferizationTransforms 24 | MLIRFuncToLLVM 25 | MLIRMathToLLVM 26 | MLIRMemRefToLLVM 27 | MLIRReconcileUnrealizedCasts 28 | MLIRSCFToControlFlow 29 | MLIRTensorTransforms 30 | MLIRTransforms 31 | ) -------------------------------------------------------------------------------- /compiler/lib/Stat/AllocCnt/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_stat_library(ByteIRAllocCntStat 2 | AllocCnt.cpp 3 | 4 | DEPENDS 5 | ByteIRStatCommon 6 | ByteIRUtils 7 | 8 | LINK_LIBS PUBLIC 9 | ByteIRAnalysis 10 | ByteIRStatCommon 11 | ByteIRUtils 12 | MLIRIR 13 | MLIRMemRefDialect 14 | ) -------------------------------------------------------------------------------- /compiler/lib/Stat/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(AllocCnt) 2 | add_subdirectory(Common) 3 | add_subdirectory(OpCnt) -------------------------------------------------------------------------------- /compiler/lib/Stat/Common/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_library(ByteIRStatCommon 2 | Reg.cpp 3 | 4 | LINK_LIBS PUBLIC 5 | MLIRIR 6 | ) -------------------------------------------------------------------------------- /compiler/lib/Stat/OpCnt/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_stat_library(ByteIROpCntStat 2 | OpCnt.cpp 3 | 4 | DEPENDS 5 | ByteIRStatCommon 6 | 7 | LINK_LIBS PUBLIC 8 | ByteIRStatCommon 9 | MLIRIR 10 | ) -------------------------------------------------------------------------------- /compiler/lib/Target/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Cpp) 2 | add_subdirectory(CUDA) 3 | 
add_subdirectory(LLVM) 4 | add_subdirectory(PTX) 5 | -------------------------------------------------------------------------------- /compiler/lib/Target/CUDA/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_translation_library(ByteIRTargetCUDA 2 | TranslateRegistration.cpp 3 | TranslateToCUDA.cpp 4 | 5 | ADDITIONAL_HEADER_DIRS 6 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Target/CUDA 7 | 8 | DEPENDS 9 | ByteIRTargetCpp 10 | 11 | LINK_LIBS PUBLIC 12 | MLIREmitCDialect 13 | MLIRIR 14 | MLIRSCFDialect 15 | MLIRControlFlowDialect 16 | MLIRMemRefDialect 17 | MLIRGPUDialect 18 | MLIRSupport 19 | # MLIRTranslation 20 | ByteIRTargetCpp 21 | ) 22 | -------------------------------------------------------------------------------- /compiler/lib/Target/Cpp/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_translation_library(ByteIRTargetCpp 2 | TranslateRegistration.cpp 3 | TranslateToCpp.cpp 4 | 5 | ADDITIONAL_HEADER_DIRS 6 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Target/Cpp 7 | 8 | LINK_LIBS PUBLIC 9 | MLIREmitCDialect 10 | MLIRIR 11 | MLIRSCFDialect 12 | MLIRControlFlowDialect 13 | MLIRMemRefDialect 14 | MLIRSupport 15 | # MLIRTranslation 16 | ) 17 | -------------------------------------------------------------------------------- /compiler/lib/Target/LLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_translation_library(ByteIRTargetLLVM 2 | TranslateRegistration.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Target/LLVM 6 | 7 | LINK_LIBS PUBLIC 8 | LLVMBitWriter 9 | MLIRArmNeonToLLVMIRTranslation 10 | MLIRArmSMEToLLVMIRTranslation 11 | MLIRArmSVEToLLVMIRTranslation 12 | MLIRAMXToLLVMIRTranslation 13 | MLIRBuiltinToLLVMIRTranslation 14 | MLIRGPUToLLVMIRTranslation 15 | MLIRX86VectorToLLVMIRTranslation 16 | MLIRLLVMToLLVMIRTranslation 17 | MLIRNVVMToLLVMIRTranslation 18 | 
MLIROpenACCToLLVMIRTranslation 19 | MLIROpenMPToLLVMIRTranslation 20 | MLIRROCDLToLLVMIRTranslation 21 | ) 22 | -------------------------------------------------------------------------------- /compiler/lib/Utils/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_library(ByteIRUtils 2 | AffineUtils.cpp 3 | AttrUtils.cpp 4 | FuncUtils.cpp 5 | GraphUtils.cpp 6 | Hoist.cpp 7 | IRRewrite.cpp 8 | LoopUtils.cpp 9 | MemUtils.cpp 10 | ModuleUtils.cpp 11 | OpInterfaceUtils.cpp 12 | PatternMatch.cpp 13 | OptionUtils.cpp 14 | PipelineUtils.cpp 15 | TileUtils.cpp 16 | TypeUtils.cpp 17 | Utils.cpp 18 | 19 | ADDITIONAL_HEADER_DIRS 20 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Utils 21 | 22 | LINK_LIBS PUBLIC 23 | MLIRIR 24 | MLIRAffineDialect 25 | MLIRArithDialect 26 | MLIRCclDialect 27 | MLIRMemRefDialect 28 | MLIRSCFDialect 29 | MLIRSCFExtUtils 30 | ) -------------------------------------------------------------------------------- /compiler/numerical/hlo/test_broadcast_dense_elements_attr.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s -test-broadcast-dense-elements-attr -o %t 2 | // RUN: FileCheck %s < %t 3 | // RUN: python3 %S/numerical_test.py %s %t 4 | 5 | func.func @case3() -> tensor<2x1x5xi64> { 6 | %0 = mhlo.constant dense<[[[2, 3]]]> : tensor<1x1x2xi64> 7 | %1 = "mhlo.broadcast_in_dim"(%0) {broadcast_dimensions = dense<[1, 2, 0]> : tensor<3xi64>} : (tensor<1x1x2xi64>) -> (tensor<2x1x5xi64>) 8 | return %1 : tensor<2x1x5xi64> 9 | } 10 | // CHECK-LABEL: @case3 11 | // CHECK{LITERAL}: [[[2, 2, 2, 2, 2]], [[3, 3, 3, 3, 3]]] 12 | // CHECK-NOT: mhlo.broadcast_in_dim 13 | -------------------------------------------------------------------------------- /compiler/numerical/lit.site.cfg.py.in: -------------------------------------------------------------------------------- 1 | import lit.llvm 2 | 3 | config.llvm_tools_dir = r"@LLVM_TOOLS_DIR@" 4 | 
config.byteir_tools_dir = r"@BYTEIR_TOOLS_DIR@" 5 | config.byteir_numerical_build_dir = r"@BYTEIR_NUMERICAL_BUILD_DIR@" 6 | config.lit_tools_dir = config.llvm_tools_dir 7 | 8 | try: 9 | config.llvm_tools_dir = config.llvm_tools_dir % lit_config.params 10 | except KeyError: 11 | e = sys.exc_info()[1] 12 | key, = e.args 13 | lit_config.fatal("unable to find %r parameter, use '--param=%s=VALUE'" % (key,key)) 14 | 15 | lit.llvm.initialize(lit_config, config) 16 | 17 | # Let the main config do the real work. 18 | lit_config.load_config(config, r"@BYTEIR_NUMERICAL_SOURCE_DIR@/lit.cfg.py") 19 | -------------------------------------------------------------------------------- /compiler/python/byteir/dialects/cat/ir_translator/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 
13 | # ============================================================================== 14 | -------------------------------------------------------------------------------- /compiler/python/test/dialects/cat/ait/numerical/layernorm.mlir: -------------------------------------------------------------------------------- 1 | // RUN: %python -m byteir.tools.cat_executor %s --backend=ait | FileCheck %s 2 | 3 | func.func @layer_norm(%arg0 : tensor<1x16x4096xf32>, %arg1 : tensor<4096xf32>, %arg2 : tensor<4096xf32>) -> tensor<1x16x4096xf32> attributes {__byteir_cat_fusion__} { 4 | %0 = "mhlo.custom_call"(%arg0, %arg1, %arg2) {api_version = 1 : i32, backend_config = "", byteir_attrs = {axis = [2], epsilon = 1.000000e-05 : f64}, call_target_name = "byteir.layer_norm", called_computations = [], has_side_effect = false} : (tensor<1x16x4096xf32>, tensor<4096xf32>, tensor<4096xf32>) -> (tensor<1x16x4096xf32>) 5 | return %0 : tensor<1x16x4096xf32> 6 | } 7 | 8 | // CHECK: cat ait numerical test pass 9 | -------------------------------------------------------------------------------- /compiler/python/test/dialects/cat/ait/numerical/matmul.mlir: -------------------------------------------------------------------------------- 1 | // RUN: %python -m byteir.tools.cat_executor %s --backend=ait | FileCheck %s 2 | 3 | func.func @main(%arg0 : tensor<128x64xf16>, %arg1 : tensor<64x32xf16>) -> tensor<128x32xf16> attributes {__byteir_cat_fusion__} { 4 | %0 = "mhlo.dot_general"(%arg0, %arg1) {dot_dimension_numbers = #mhlo.dot} : (tensor<128x64xf16>, tensor<64x32xf16>) -> tensor<128x32xf16> 5 | return %0 : tensor<128x32xf16> 6 | } 7 | 8 | // CHECK: cat ait numerical test pass 9 | -------------------------------------------------------------------------------- /compiler/python/test/dialects/cat/ait/numerical/permute021.mlir: -------------------------------------------------------------------------------- 1 | // RUN: %python -m byteir.tools.cat_executor %s --backend=ait | FileCheck %s 2 | 3 | 
func.func @permute021(%arg0 : tensor<32x16x128xf32>) -> tensor<32x128x16xf32> attributes {__byteir_cat_fusion__} { 4 | %0 = "mhlo.transpose"(%arg0) {permutation = dense<[0, 2, 1]> : tensor<3xi64>} : (tensor<32x16x128xf32>) -> tensor<32x128x16xf32> 5 | return %0 : tensor<32x128x16xf32> 6 | } 7 | 8 | // CHECK: cat ait numerical test pass 9 | -------------------------------------------------------------------------------- /compiler/python/test/dialects/cat/ait/numerical/permute0213.mlir: -------------------------------------------------------------------------------- 1 | // RUN: %python -m byteir.tools.cat_executor %s --backend=ait | FileCheck %s 2 | 3 | func.func @permute0213(%arg0 : tensor<1x16x32x128xf32>) -> tensor<1x32x16x128xf32> attributes {__byteir_cat_fusion__} { 4 | %0 = "mhlo.transpose"(%arg0) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x16x32x128xf32>) -> tensor<1x32x16x128xf32> 5 | return %0 : tensor<1x32x16x128xf32> 6 | } 7 | 8 | // CHECK: cat ait numerical test pass 9 | -------------------------------------------------------------------------------- /compiler/python/test/dialects/cat/ait/numerical/permute0312.mlir: -------------------------------------------------------------------------------- 1 | // RUN: %python -m byteir.tools.cat_executor %s --backend=ait | FileCheck %s 2 | 3 | func.func @permute0312(%arg0 : tensor<1x16x32x128xf32>) -> tensor<1x128x16x32xf32> attributes {__byteir_cat_fusion__} { 4 | %0 = "mhlo.transpose"(%arg0) {permutation = dense<[0, 3, 1, 2]> : tensor<4xi64>} : (tensor<1x16x32x128xf32>) -> tensor<1x128x16x32xf32> 5 | return %0 : tensor<1x128x16x32xf32> 6 | } 7 | 8 | // CHECK: cat ait numerical test pass 9 | -------------------------------------------------------------------------------- /compiler/python/test/dialects/cat/ait/numerical/permute10.mlir: -------------------------------------------------------------------------------- 1 | // RUN: %python -m byteir.tools.cat_executor %s --backend=ait | FileCheck %s 2 
| 3 | func.func @permute10(%arg0 : tensor<128x64xf32>) -> tensor<64x128xf32> attributes {__byteir_cat_fusion__} { 4 | %0 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<128x64xf32>) -> tensor<64x128xf32> 5 | return %0 : tensor<64x128xf32> 6 | } 7 | 8 | // CHECK: cat ait numerical test pass 9 | -------------------------------------------------------------------------------- /compiler/python/test/dialects/cat/ait/numerical/softmax_f16.mlir: -------------------------------------------------------------------------------- 1 | // RUN: %python -m byteir.tools.cat_executor %s --backend=ait | FileCheck %s 2 | 3 | func.func @softmax_f16(%arg0 : tensor<1x12x1024x1024xf16>) -> tensor<1x12x1024x1024xf16> attributes {__byteir_cat_fusion__} { 4 | %0 = mhlo.custom_call @byteir.softmax(%arg0) {backend_config = "", byteir_attrs = {axis = 3 : i64}} : (tensor<1x12x1024x1024xf16>) -> tensor<1x12x1024x1024xf16> 5 | return %0 : tensor<1x12x1024x1024xf16> 6 | } 7 | 8 | // CHECK: cat ait numerical test pass 9 | -------------------------------------------------------------------------------- /compiler/python/test/dialects/cat/ait/profile/matmul.mlir: -------------------------------------------------------------------------------- 1 | // RUN: %python -m byteir.tools.cat_executor %s --mode=profile --backend=ait | FileCheck %s 2 | 3 | func.func @main(%arg0 : tensor<128x64xf16>, %arg1 : tensor<64x32xf16>) -> tensor<128x32xf16> attributes {__byteir_cat_fusion__} { 4 | %0 = "mhlo.dot_general"(%arg0, %arg1) {dot_dimension_numbers = #mhlo.dot} : (tensor<128x64xf16>, tensor<64x32xf16>) -> tensor<128x32xf16> 5 | return %0 : tensor<128x32xf16> 6 | } 7 | 8 | // CHECK: cat ait profile finish 9 | -------------------------------------------------------------------------------- /compiler/python/test/lit.site.cfg.py.in: -------------------------------------------------------------------------------- 1 | import lit.llvm 2 | 3 | config.llvm_tools_dir = r"@LLVM_TOOLS_DIR@" 4 
| config.byteir_python_packages_dir = r"@BYTEIR_PYTHON_PACKAGES_DIR@/byteir" 5 | config.byteir_python_test_dir = r"@PROJECT_BINARY_DIR@/python/test" 6 | config.lit_tools_dir = config.llvm_tools_dir 7 | 8 | try: 9 | config.llvm_tools_dir = config.llvm_tools_dir % lit_config.params 10 | except KeyError: 11 | e = sys.exc_info()[1] 12 | key, = e.args 13 | lit_config.fatal("unable to find %r parameter, use '--param=%s=VALUE'" % (key,key)) 14 | 15 | lit.llvm.initialize(lit_config, config) 16 | 17 | # Let the main config do the real work. 18 | lit_config.load_config(config, r"@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg.py") 19 | -------------------------------------------------------------------------------- /compiler/python/version.txt: -------------------------------------------------------------------------------- 1 | 1.9.3.0 -------------------------------------------------------------------------------- /compiler/test/Conversion/ToHlo/arithConstToMhlo.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s -convert-arith-to-mhlo | FileCheck %s 2 | 3 | // CHECK-LABEL: func.func @const 4 | func.func @const() -> tensor<4x4xf32> { 5 | // CHECK: mhlo.constant 6 | %0 = arith.constant dense<0.000000e+00> : tensor<4x4xf32> 7 | return %0 : tensor<4x4xf32> 8 | } 9 | 10 | // CHECK-LABEL: func.func @not_mhlo_const 11 | func.func @not_mhlo_const() -> i32 { 12 | // CHECK-NOT: mhlo.constant 13 | %0 = arith.constant 1 : i32 14 | return %0 : i32 15 | } 16 | -------------------------------------------------------------------------------- /compiler/test/Conversion/ToLinalg/TesnorToLinalg.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt -tensor-to-linalg -split-input-file %s | FileCheck %s 2 | 3 | 4 | func.func @expand_shape_static(%arg0: tensor<1000xf32>) -> tensor<1x1000xf32> { 5 | %expanded = tensor.expand_shape %arg0 [[0, 1]] output_shape [1, 1000] : tensor<1000xf32> into 
tensor<1x1000xf32> 6 | return %expanded : tensor<1x1000xf32> 7 | } 8 | // CHECK-LABEL: @expand_shape_static 9 | // CHECK: linalg.generic 10 | 11 | func.func @collapse_shape_static(%arg0: tensor<1x3x4x1x5xf32>) -> tensor<3x4x5xf32> { 12 | %0 = tensor.collapse_shape %arg0 [[0, 1], [2], [3, 4]] : 13 | tensor<1x3x4x1x5xf32> into tensor<3x4x5xf32> 14 | return %0 : tensor<3x4x5xf32> 15 | } 16 | // CHECK-LABEL: @collapse_shape_static 17 | // CHECK: linalg.generic 18 | -------------------------------------------------------------------------------- /compiler/test/Conversion/ToLinalg/hloConvertToLinalg.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s -hlo-fusion-to-linalg="target="cpu" arch="x86_64"" | FileCheck %s 2 | 3 | func.func @mhlo_convert_f32_i32(%arg0: tensor<2x3xf32>) -> tensor<2x3xi32> { 4 | %0 = mhlo.convert %arg0 : (tensor<2x3xf32>) -> tensor<2x3xi32> 5 | return %0 : tensor<2x3xi32> 6 | } 7 | // CHECK-LABEL: mhlo_convert_f32_i32 8 | // CHECK: linalg.map 9 | // CHECK: arith.cmpf 10 | // CHECK: arith.fptosi 11 | // CHECK: arith.select 12 | -------------------------------------------------------------------------------- /compiler/test/Conversion/ToLinalg/primitiveOpsHlo.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt -hlo-fusion-to-linalg="enable-primitive-ops=true" %s | FileCheck %s 2 | 3 | // CHECK-LABEL: mhlo_add 4 | func.func @mhlo_add(%lhs: tensor<2x2xf32>, 5 | %rhs: tensor<2x2xf32>) -> tensor<2x2xf32> { 6 | %0 = "mhlo.add"(%lhs, %rhs) {someattr} 7 | : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> 8 | return %0 : tensor<2x2xf32> 9 | // CHECK: tensor.empty 10 | // CHECK: linalg.map 11 | // CHECK: arith.addf 12 | } 13 | -------------------------------------------------------------------------------- /compiler/test/Conversion/ToLinalg/simpleHlo.mlir: -------------------------------------------------------------------------------- 
1 | // RUN: byteir-opt -hlo-legalize-to-linalg %s | FileCheck %s 2 | 3 | // CHECK-LABEL: mhlo_add 4 | func.func @mhlo_add(%lhs: tensor<2x2xf32>, 5 | %rhs: tensor<2x2xf32>) -> tensor<2x2xf32> { 6 | %0 = "mhlo.add"(%lhs, %rhs) {someattr} 7 | : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> 8 | return %0 : tensor<2x2xf32> 9 | // CHECK: linalg.generic 10 | // CHECK: addf 11 | } 12 | -------------------------------------------------------------------------------- /compiler/test/Dialect/Ace/attrs.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s | FileCheck %s 2 | 3 | func.func @extension_type(%arg0: tensor<3x?xf32, #ace.tensor_encoding>) -> tensor<3x?xf32, #ace.tensor_encoding> { 4 | return %arg0 : tensor<3x?xf32, #ace.tensor_encoding> 5 | } 6 | // CHECK: tensor<3x?xf32, #ace.tensor_encoding> -------------------------------------------------------------------------------- /compiler/test/Dialect/Ace/canonicalize.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt --canonicalize %s | FileCheck %s 2 | 3 | func.func @test_ace_constant_case0() -> tensor { 4 | %0 = "ace.constant"() {value = dense<"fork_active_pay"> : tensor} : () -> tensor 5 | return %0 : tensor 6 | } 7 | // CHECK: ace.constant 8 | -------------------------------------------------------------------------------- /compiler/test/Dialect/Affine/insertTrivialAffineLoop.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s -insert-trivial-affine-loop | FileCheck %s 2 | 3 | 4 | func.func @scalar_func(%arg0: memref) -> memref { 5 | %cst = arith.constant 1.000000e+00 : f32 6 | %0 = memref.alloc() : memref 7 | %cst_0 = arith.constant 0.000000e+00 : f32 8 | %1 = affine.load %arg0[] : memref 9 | %2 = arith.cmpf une, %1, %cst_0 : f32 10 | %3 = arith.select %2, %1, %cst : f32 11 | affine.store %3, %0[] : memref 12 | return %0 : memref 13 | 
} 14 | // CHECK-LABEL: func.func @scalar_func 15 | // CHECK: affine.for {{.*}} = 0 to 1 16 | // CHECK-NEXT: affine.load 17 | // CHECK-NEXT: arith.cmpf 18 | // CHECK-NEXT: arith.select 19 | // CHECK-NEXT: affine.store -------------------------------------------------------------------------------- /compiler/test/Dialect/Byre/Serialization/Compatibility/version_1_0_0.mlir.bc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/compiler/test/Dialect/Byre/Serialization/Compatibility/version_1_0_0.mlir.bc -------------------------------------------------------------------------------- /compiler/test/Dialect/Byre/Serialization/Compatibility/version_1_0_0.mlir.bc.v0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/compiler/test/Dialect/Byre/Serialization/Compatibility/version_1_0_0.mlir.bc.v0 -------------------------------------------------------------------------------- /compiler/test/Dialect/Byre/Serialization/Compatibility/version_1_0_0_alloc.mlir.bc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/compiler/test/Dialect/Byre/Serialization/Compatibility/version_1_0_0_alloc.mlir.bc -------------------------------------------------------------------------------- /compiler/test/Dialect/Lace/ops.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt -allow-unregistered-dialect %s | FileCheck %s 2 | 3 | func.func @test_reshape(%arg0: memref<2x3xf32>) -> memref<6xf32> { 4 | %0 = "lace.reshape" (%arg0) : (memref<2x3xf32>) -> memref<6xf32> 5 | return %0: memref<6xf32> 6 | } 7 | // CHECK: lace.reshape 8 | 9 | func.func @test_slice(%arg0: memref<2x3xf32>) -> 
memref<1x3xf32> { 10 | %0 = "lace.slice" (%arg0) {limit_indices = dense<[2, 3]> : tensor<2xi64>, start_indices = dense<[1, 0]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}: (memref<2x3xf32>) -> memref<1x3xf32> 11 | return %0: memref<1x3xf32> 12 | } -------------------------------------------------------------------------------- /compiler/test/Dialect/Mhlo/fusion.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s | FileCheck %s 2 | 3 | func.func @mhlo_add(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { 4 | %0 = "mhlo.fusion"(%arg0, %arg1) ( { 5 | %1 = mhlo.add %arg0, %arg1 : tensor<4xf32> 6 | %2 = mhlo.add %arg0, %1 : tensor<4xf32> 7 | "mhlo.return"(%2) : (tensor<4xf32>) -> () 8 | }) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> 9 | return %0 : tensor<4xf32> 10 | } 11 | // CHECK-LABEL: func.func @mhlo_add 12 | -------------------------------------------------------------------------------- /compiler/test/Dialect/Mhlo/transforms/fuseBMMDimension.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s -fuse-bmm-dimension | FileCheck %s 2 | 3 | func.func @dot_general(%arg0 : tensor<12x4x64x64xf32>, %arg1 : tensor<12x4x64x32xf32>) -> (tensor<12x4x64x32xf32>) { 4 | %0 = "mhlo.dot_general"(%arg0, %arg1) {dot_dimension_numbers = #mhlo.dot} : (tensor<12x4x64x64xf32>, tensor<12x4x64x32xf32>) -> tensor<12x4x64x32xf32> 5 | return %0 : tensor<12x4x64x32xf32> 6 | } 7 | 8 | // CHECK-LABEL: dot_general 9 | // CHECK-NEXT: mhlo.reshape 10 | // CHECK-NEXT: mhlo.reshape 11 | // CHECK-NEXT: mhlo.dot_general 12 | // CHECK-NEXT: mhlo.reshape 13 | -------------------------------------------------------------------------------- /compiler/test/Dialect/SCF/insertTrivialSCFLoop.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s -insert-trivial-scf-loop | FileCheck %s 2 | 3 | 4 | func.func 
@scalar_func(%arg0: memref) -> memref { 5 | %cst = arith.constant 1.000000e+00 : f32 6 | %0 = memref.alloc() : memref 7 | %cst_0 = arith.constant 0.000000e+00 : f32 8 | %1 = memref.load %arg0[] : memref 9 | %2 = arith.cmpf une, %1, %cst_0 : f32 10 | %3 = arith.select %2, %1, %cst : f32 11 | memref.store %3, %0[] : memref 12 | return %0 : memref 13 | } 14 | // CHECK-LABEL: func.func @scalar_func 15 | // CHECK: scf.for {{.*}} = %c0 to %c1 step %c1 16 | // CHECK-NEXT: memref.load 17 | // CHECK-NEXT: arith.cmpf 18 | // CHECK-NEXT: select 19 | // CHECK-NEXT: memref.store -------------------------------------------------------------------------------- /compiler/test/E2E/CUDA/MLPBasic/input.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s | FileCheck %s 2 | 3 | func.func @mlp(%arg0 : tensor<128x64xf32>, %arg1 : tensor<64x32xf32>, %arg2 : tensor<32xf32>) -> tensor<128x32xf32> { 4 | %0 = "mhlo.dot"(%arg0, %arg1) : (tensor<128x64xf32>, tensor<64x32xf32>) -> tensor<128x32xf32> 5 | %1 = "mhlo.broadcast_in_dim"(%arg2) {broadcast_dimensions = dense<[1]> : tensor<1xi64>} : (tensor<32xf32>) -> tensor<128x32xf32> 6 | %2 = "mhlo.add"(%0, %1) : (tensor<128x32xf32>, tensor<128x32xf32>) -> tensor<128x32xf32> 7 | return %2 : tensor<128x32xf32> 8 | } 9 | // CHECK-LABEL: func.func @mlp 10 | -------------------------------------------------------------------------------- /compiler/test/E2E/Host/Case0_Bytecode/Output.bc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/compiler/test/E2E/Host/Case0_Bytecode/Output.bc -------------------------------------------------------------------------------- /compiler/test/E2E/Host/Case0_Bytecode/Output.mlirbc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/compiler/test/E2E/Host/Case0_Bytecode/Output.mlirbc -------------------------------------------------------------------------------- /compiler/test/E2E/Host/Case1/Output.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s | FileCheck %s 2 | 3 | // CHECK-LABEL: func.func @main 4 | 5 | module attributes {byre.container_module} { 6 | func.func @main(%arg0: memref<1x100x27x48x3xf32, "cpu"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<51200xi32, "cpu"> {byre.argname = "Output0", byre.argtype = 2 : i32}) attributes {byre.entry_point} { 7 | byre.compute @LLVMJITOp(%arg0, %arg1) {kernel_name = "Unknown0", llvm_file_name = "host_kernels.ll", memory_effects = [1 : i32, 2 : i32]} : memref<1x100x27x48x3xf32, "cpu">, memref<51200xi32, "cpu"> 8 | return 9 | } 10 | } -------------------------------------------------------------------------------- /compiler/test/E2E/Host/RngNormal/00_Input.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s --hlo-graph-opt --hlo-fusion-opt="target=cpu" --linalg-tensor-opt="target=cpu" --byre-tensor-opt="entry-func=main append-arg-types" --byteir-bufferize-opt --linalg-memref-opt --scf-opt="target=cpu" | FileCheck %s 2 | 3 | // CHECK-LABEL: func.func @main 4 | 5 | func.func @main() -> tensor<1x97xf32> { 6 | %0 = mhlo.constant dense<0.000000e+00> : tensor 7 | %1 = mhlo.constant dense<1.000000e+00> : tensor 8 | %2 = mhlo.constant dense<[1, 97]> : tensor<2xi64> 9 | %3 = "mhlo.rng"(%0, %1, %2) {rng_distribution = #mhlo.rng_distribution} : (tensor, tensor, tensor<2xi64>) -> tensor<1x97xf32> 10 | return %3 : tensor<1x97xf32> 11 | } -------------------------------------------------------------------------------- /compiler/test/E2E/Host/RngUniform/00_Input.mlir: 
-------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s --hlo-graph-opt --hlo-fusion-opt="target=cpu" --linalg-tensor-opt="target=cpu" --byre-tensor-opt="entry-func=main append-arg-types" --byteir-bufferize-opt --linalg-memref-opt --scf-opt="target=cpu" | FileCheck %s 2 | 3 | // CHECK-LABEL: func.func @main 4 | 5 | func.func @main() -> tensor<1x97xf32> { 6 | %0 = mhlo.constant dense<0.000000e+00> : tensor 7 | %1 = mhlo.constant dense<1.000000e+00> : tensor 8 | %2 = mhlo.constant dense<[1, 97]> : tensor<2xi64> 9 | %3 = "mhlo.rng"(%0, %1, %2) {rng_distribution = #mhlo.rng_distribution} : (tensor, tensor, tensor<2xi64>) -> tensor<1x97xf32> 10 | return %3 : tensor<1x97xf32> 11 | } -------------------------------------------------------------------------------- /compiler/test/E2E/Host/Transpose/00_Input.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s --hlo-graph-opt --hlo-fusion-opt="target=cpu" --linalg-tensor-opt="target=cpu" --byre-tensor-opt="entry-func=main append-arg-types" --byteir-bufferize-opt --linalg-memref-opt --scf-opt="target=cpu" | FileCheck %s 2 | 3 | // CHECK-LABEL: func.func @main 4 | 5 | func.func @main(%arg0: tensor<1x32x64x64xf32>) -> tensor<1x64x64x32xf32> { 6 | %0 = "mhlo.transpose"(%arg0) {permutation = dense<[0, 2, 3, 1]>: tensor<4xi64>} : (tensor<1x32x64x64xf32>) -> tensor<1x64x64x32xf32> 7 | return %0 : tensor<1x64x64x32xf32> 8 | } -------------------------------------------------------------------------------- /compiler/test/E2E/Host/Transpose/Output.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s | FileCheck %s 2 | 3 | // CHECK-LABEL: func.func @main 4 | 5 | module attributes {byre.container_module} { 6 | func.func @main(%arg0: memref<1x32x64x64xf32, "cpu"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<1x64x64x32xf32, "cpu"> {byre.argname = 
"Output0", byre.argtype = 2 : i32}) attributes {byre.entry_point} { 7 | byre.compute @LLVMJITOp(%arg0, %arg1) {kernel_name = "Unknown0", llvm_file_name = "host_kernels.ll", memory_effects = [1 : i32, 2 : i32]} : memref<1x32x64x64xf32, "cpu">, memref<1x64x64x32xf32, "cpu"> 8 | return 9 | } 10 | } -------------------------------------------------------------------------------- /compiler/test/E2E/Host/Transpose/TotalPipeline.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s --hlo-graph-opt --hlo-fusion-opt="target=cpu" --linalg-tensor-opt="target=cpu" --byre-tensor-opt="entry-func=main append-arg-types" --byteir-bufferize-opt --linalg-memref-opt --scf-opt="target=cpu" --host-opt -set-op-space="entry-func=main space=cpu" -set-arg-space="entry-func=main all-space=cpu" --byre-opt --to-llvm | byteir-translate --mlir-to-llvmir | FileCheck %s 2 | 3 | // CHECK-LABEL: define void @_mlir_ciface_Unknown 4 | 5 | func.func @main(%arg0: tensor<1x32x64x64xf32>) -> tensor<1x64x64x32xf32> { 6 | %0 = "mhlo.transpose"(%arg0) {permutation = dense<[0, 2, 3, 1]>: tensor<4xi64>} : (tensor<1x32x64x64xf32>) -> tensor<1x64x64x32xf32> 7 | return %0 : tensor<1x64x64x32xf32> 8 | } -------------------------------------------------------------------------------- /compiler/test/E2E/Host/TypeCvt/00_Input.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s --hlo-graph-opt --hlo-fusion-opt="target=cpu" --linalg-tensor-opt="target=cpu" --byre-tensor-opt="entry-func=main append-arg-types" --byteir-bufferize-opt --linalg-memref-opt --scf-opt="target=cpu" | FileCheck %s 2 | 3 | // CHECK-LABEL: func.func @main 4 | 5 | func.func @main(%arg0 : tensor<1x224x224x3xf32>) -> tensor<1x224x224x3xf16> { 6 | %0 = mhlo.convert %arg0 : (tensor<1x224x224x3xf32>) -> tensor<1x224x224x3xf16> 7 | return %0 : tensor<1x224x224x3xf16> 8 | } 
-------------------------------------------------------------------------------- /compiler/test/E2E/Host/TypeCvt/Output.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s | FileCheck %s 2 | 3 | // CHECK-LABEL: func.func @main 4 | 5 | module attributes {byre.container_module} { 6 | func.func @main(%arg0: memref<1x224x224x3xf32, "cpu"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<1x224x224x3xf16, "cpu"> {byre.argname = "Output0", byre.argtype = 2 : i32}) attributes {byre.entry_point} { 7 | byre.compute @LLVMJITOp(%arg0, %arg1) {kernel_name = "Unknown0", llvm_file_name = "host_kernels.ll", memory_effects = [1 : i32, 2 : i32]} : memref<1x224x224x3xf32, "cpu">, memref<1x224x224x3xf16, "cpu"> 8 | return 9 | } 10 | } -------------------------------------------------------------------------------- /compiler/test/E2E/Host/TypeCvt/TotalPipeline.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s --hlo-graph-opt --hlo-fusion-opt="target=cpu" --linalg-tensor-opt="target=cpu" --byre-tensor-opt="entry-func=main append-arg-types" --byteir-bufferize-opt --linalg-memref-opt --scf-opt="target=cpu" --host-opt -set-op-space="entry-func=main space=cpu" -set-arg-space="entry-func=main all-space=cpu" --byre-opt --to-llvm | byteir-translate --mlir-to-llvmir | FileCheck %s 2 | 3 | // CHECK-LABEL: define void @_mlir_ciface_Unknown 4 | 5 | func.func @main(%arg0 : tensor<1x224x224x3xf32>) -> tensor<1x224x224x3xf16> { 6 | %0 = mhlo.convert %arg0 : (tensor<1x224x224x3xf32>) -> tensor<1x224x224x3xf16> 7 | return %0 : tensor<1x224x224x3xf16> 8 | } -------------------------------------------------------------------------------- /compiler/test/Pipelines/BufferizeOpts/tensor.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s -byteir-bufferize-opt --split-input-file | FileCheck %s 2 | 3 | // 
CHECK-LABEL: tensor_pad 4 | func.func @tensor_pad(%arg0: tensor<2x34xi32>) -> tensor<2x64xi32> { 5 | %c3_i32 = arith.constant 3 : i32 6 | // CHECK-NOT: bufferization.to_tensor 7 | // CHECK: linalg.map 8 | // CHECK-SAME: memref<2x64xi32> 9 | // CHECK-NOT: bufferization.to_memref 10 | %0 = tensor.pad %arg0 low[0, 0] high[0, 30] { 11 | ^bb0(%arg1: index, %arg2: index): 12 | tensor.yield %c3_i32 : i32 13 | } : tensor<2x34xi32> to tensor<2x64xi32> 14 | return %0 : tensor<2x64xi32> 15 | } 16 | -------------------------------------------------------------------------------- /compiler/test/Pipelines/Host/ToLLVM/subview.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt --to-llvm %s | FileCheck %s 2 | 3 | module attributes {byteir.llvm_module} { 4 | func.func @subview(%arg0: memref<32x128xi32>) -> memref <32x64xi32, strided<[128, 1]>> attributes {llvm.emit_c_interface} { 5 | %0 = memref.subview %arg0[0, 0] [32, 64] [1, 1] : memref<32x128xi32> to memref <32x64xi32, strided<[128, 1]>> 6 | return %0: memref <32x64xi32, strided<[128, 1]>> 7 | } 8 | // CHECK-LABEL: llvm.func @subview 9 | // CHECK: llvm.mlir.undef : !llvm.struct 10 | // CHECK-LABEL: llvm.func @_mlir_ciface_subview 11 | } -------------------------------------------------------------------------------- /compiler/test/Pipelines/Host/ToLLVM/tanh.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt --to-llvm %s | FileCheck %s 2 | 3 | module attributes {byteir.llvm_module} { 4 | func.func @Unknown0(%arg0: memref<32xf32>, %arg1: memref<32xf32>) attributes {llvm.emit_c_interface} { 5 | %c0 = arith.constant 0 : index 6 | %c1 = arith.constant 1 : index 7 | %c32 = arith.constant 32 : index 8 | scf.for %arg2 = %c0 to %c32 step %c1 { 9 | %0 = memref.load %arg0[%arg2] : memref<32xf32> 10 | %1 = math.tanh %0 : f32 11 | memref.store %1, %arg1[%arg2] : memref<32xf32> 12 | } 13 | return 14 | } 15 | // CHECK-LABEL: 
llvm.func @tanhf 16 | // CHECK: llvm.call @tanhf 17 | } -------------------------------------------------------------------------------- /compiler/test/Target/Cpp/attrs.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-translate -emit-cpp %s | FileCheck %s 2 | 3 | // CHECK-LABEL: void opaque_attrs() { 4 | func.func @opaque_attrs() { 5 | // CHECK-NEXT: f(OPAQUE_ENUM_VALUE); 6 | emitc.call_opaque "f"() {args = [#emitc.opaque<"OPAQUE_ENUM_VALUE">]} : () -> () 7 | // CHECK-NEXT: f("some string"); 8 | emitc.call_opaque "f"() {args = [#emitc.opaque<"\"some string\"">]} : () -> () 9 | return 10 | } 11 | -------------------------------------------------------------------------------- /compiler/test/Target/Cpp/cast.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-translate -emit-cpp %s | FileCheck %s 2 | 3 | // CHECK-LABEL: test_cast 4 | // CHECK-SAME: (int32_t [[V1:[^ ]*]], uint32_t [[V2:[^ ]*]], size_t [[V3:[^ ]*]]) 5 | func.func @test_cast(%arg0 : i32, %arg1 : ui32, %arg2: index) -> i32 { 6 | // CHECK-NEXT: float [[V4:[^ ]*]] = (float)([[V1]]); 7 | %0 = arith.sitofp %arg0: i32 to f32 8 | // CHECK-NEXT: int32_t [[V5:[^ ]*]] = (int32_t)([[V3]]); 9 | %1 = arith.index_cast %arg2: index to i32 10 | // CHECK-NEXT: float [[V6:[^ ]*]] = (float)([[V2]]); 11 | %2 = builtin.unrealized_conversion_cast %arg1: ui32 to f32 12 | // CHECK-NEXT: int32_t [[V7:[^ ]*]] = (int32_t)([[V4]]); 13 | %3 = arith.fptosi %0: f32 to i32 14 | return %3 : i32 15 | } 16 | 17 | -------------------------------------------------------------------------------- /compiler/test/Target/Cpp/types.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-translate -emit-cpp %s | FileCheck %s 2 | 3 | // CHECK-LABEL: func_external 4 | // CHECK-SAME: (int32_t, int32_t) 5 | func.func private @func_external(%arg0 : i32, %arg1 : i32) 6 | 7 | // CHECK-LABEL: 
func_int 8 | // CHECK-SAME: (int32_t [[V1:[^ ]*]], int32_t [[V2:[^ ]*]]) 9 | func.func @func_int(%arg0 : i32, %arg1 : i32) -> () { 10 | return 11 | } 12 | 13 | // CHECK-LABEL: func_memref 14 | // CHECK-SAME: (float*, int32_t) 15 | func.func private @func_memref(%arg0 : memref<2x3xf32>, %arg1 : i32) -------------------------------------------------------------------------------- /compiler/test/Transforms/ApplyPDLPatterns/Case_0.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s -apply-pdl-patterns="pdl-file=%S/Pattern_0.mlir" -allow-unregistered-dialect | FileCheck %s 2 | 3 | func.func @foo(%arg0: index) -> index { 4 | // CHECK: test.test_op_B 5 | %0 = "test.test_op_A"(%arg0) {__rewrite__} : (index) -> index 6 | // CHECK: test.test_op_A 7 | %1 = "test.test_op_A"(%0) : (index) -> index 8 | return %1 : index 9 | } -------------------------------------------------------------------------------- /compiler/test/Transforms/ApplyPDLPatterns/Pattern_0.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s 2 | 3 | module { 4 | pdl.pattern : benefit(0) { 5 | %0 = types 6 | %1 = operands 7 | %2 = attribute 8 | %3 = operation "test.test_op_A"(%1 : !pdl.range) {"__rewrite__" = %2} -> (%0 : !pdl.range) 9 | rewrite %3 { 10 | %4 = operation "test.test_op_B"(%1 : !pdl.range) -> (%0 : !pdl.range) 11 | replace %3 with %4 12 | } 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /compiler/test/Transforms/CanonicalizeExt/deprecated.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s -canonicalize-ext | FileCheck %s 2 | 3 | func.func @broadcast_to_broadcast_in_dim(%arg0: tensor<3xf32>) -> tensor<1x2x3xf32> { 4 | %0 = "mhlo.broadcast"(%arg0) {broadcast_sizes = dense<[1, 2]> : tensor<2xi64>} : (tensor<3xf32>) -> tensor<1x2x3xf32> 5 | return %0 : tensor<1x2x3xf32> 
6 | } 7 | // CHECK-LABEL: func.func @broadcast_to_broadcast_in_dim 8 | // CHECK-NEXT: mhlo.broadcast_in_dim 9 | // CHECK-NEXT: return 10 | -------------------------------------------------------------------------------- /compiler/test/Transforms/collectFunc.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s -collect-func="anchor-attr=testAttr" | FileCheck %s 2 | 3 | 4 | func.func private @test_private1() { 5 | return 6 | } 7 | // CHECK-LABEL: func.func private @test_private1() 8 | 9 | func.func private @test_private2() { 10 | return 11 | } 12 | // CHECK-NOT: func.func private @test_private2() 13 | 14 | func.func @test1() attributes {testAttr} { 15 | call @test_private1() : () -> () 16 | return 17 | } 18 | // CHECK-LABEL: func.func @test1() attributes {testAttr} 19 | 20 | func.func @test2() attributes {testAttr2} { 21 | call @test_private2() : () -> () 22 | return 23 | } 24 | // CHECK-NOT: func.func @test2() 25 | 26 | -------------------------------------------------------------------------------- /compiler/test/Transforms/genericDeviceConfig.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s -generic-device-config="anchor-attr=__byteir_test_device__ compute-name=TestDeviceOp" | FileCheck %s 2 | 3 | func.func private @device_func(memref<1x97xf32>, memref<1x6xf32>) -> memref<1x6xf32> attributes {__byteir_test_device__} 4 | // CHECK-LABEL: func.func private @device_func 5 | // CHECK-SAME: attributes {__byre__kernel_name = "device_func", __byteir_test_device__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "TestDeviceOp", byre_force_compute_name} -------------------------------------------------------------------------------- /compiler/test/Transforms/insertUniqueIdErase.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s -insert-unique-id="erase-id=true" 
-split-input-file | FileCheck %s 2 | 3 | func.func @mhlo_add(%arg0 : tensor<4xf32>, %arg1 : tensor<4xf32>) -> tensor<4xf32> { 4 | %res = "mhlo.add"(%arg0, %arg1) {__byteir_unique_id__ = "mhlo.add_0"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> 5 | return {__byteir_unique_id__ = "func.return_1"} %res : tensor<4xf32> 6 | } 7 | 8 | // CHECK-LABEL: func.func @mhlo_add 9 | // CHECK-SAME: (%[[ARG0:[a-zA-Z0-9]+]]: tensor<4xf32>, %[[ARG1:[a-zA-Z0-9]+]]: tensor<4xf32>) 10 | // CHECK: %[[V0:.*]] = mhlo.add %[[ARG0]], %[[ARG1]] : tensor<4xf32> 11 | // CHECK: return %[[V0]] : tensor<4xf32> -------------------------------------------------------------------------------- /compiler/test/Transforms/rewriteOpToStdCall.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s --rewrite-op-to-std-call="call-table=linalg.matmul:matmul_impl" --split-input-file | FileCheck %s 2 | 3 | // CHECK: func.func private @matmul_impl(memref, memref, memref) 4 | func.func @matmul(%A : memref, %B : memref, %C : memref) { 5 | // CHECK: call @matmul_impl({{.*}}, {{.*}}, {{.*}}) : (memref, memref, memref) -> () 6 | linalg.matmul ins(%A, %B: memref, memref) outs(%C: memref) 7 | return 8 | } 9 | -------------------------------------------------------------------------------- /compiler/test/Transforms/setArgShape.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s -allow-unregistered-dialect -set-arg-shape="dim=0 size=3 entry-func-name=tf_add arg-attr-name=__placeholder__byre.argname" | FileCheck %s 2 | 3 | func.func @tf_add(%arg0 : tensor {__placeholder__byre.argname = "A"}, %arg1 : tensor {__placeholder__byre.argname = "B"}) -> (tensor<*xf32> {__placeholder__byre.argname = "C"}) attributes { __placeholder__byre.entry_point} { 4 | %res = "tf.Add"(%arg0, %arg1) : (tensor, tensor) -> tensor<*xf32> 5 | return %res : tensor<*xf32> 6 | } 7 | // CHECK-LABEL: func.func @tf_add 8 | // 
CHECK-NEXT: %[[RES0:.*]] = "tf.Add"(%arg0, %arg1) : (tensor<3x4xf32>, tensor<3x4xf32>) -> tensor<*xf32> 9 | -------------------------------------------------------------------------------- /compiler/test/Transforms/setOpSpace.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s -set-op-space="entry-func=main space=cpu" --allow-unregistered-dialect| FileCheck %s 2 | 3 | // CHECK-LABEL: func.func @main 4 | func.func @main(%arg0 : memref<2x4xf32>, %arg1 : memref<2x4xf32>, %arg2 : memref<2x4xf32>) -> (memref<2x4xf32>, memref<2x4xf32>) { 5 | %0 = memref.alloc() : memref<2x4xf32> 6 | "lmhlo.add"(%arg0, %arg1, %arg2) : (memref<2x4xf32>, memref<2x4xf32>, memref<2x4xf32>) -> () 7 | // CHECK: lmhlo.add 8 | // CHECK-SAME: device = "cpu" 9 | "lmhlo.add"(%arg0, %arg1, %0) : (memref<2x4xf32>, memref<2x4xf32>, memref<2x4xf32>) -> () 10 | // CHECK-NEXT: lmhlo.add 11 | // CHECK-SAME: device = "cpu" 12 | return %0, %0: memref<2x4xf32>, memref<2x4xf32> 13 | } 14 | -------------------------------------------------------------------------------- /compiler/test/Utils/testMergeTwoModulesCase0.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s --test-merge-two-modules="second-module-path=%S/testMergeTwoModulesCase0_1.mlir" --allow-unregistered-dialect | FileCheck %s 2 | 3 | func.func @main(%arg0: tensor) -> tensor { 4 | return %arg0 : tensor 5 | } 6 | // CHECK: func.func @main 7 | // CHECK-NEXT: call @__byteir__merge_model_0 8 | // CHECK-NEXT: call @__byteir__merge_model_1 9 | // CHECK-DAG: func.func private @__byteir__merge_model_1 10 | // CHECK-DAG: func.func private @__byteir__merge_model_0 11 | -------------------------------------------------------------------------------- /compiler/test/Utils/testMergeTwoModulesCase0_1.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s --allow-unregistered-dialect 2 | 
3 | module { 4 | func.func @main(%arg0: tensor) -> tensor { 5 | %0 = "foo.add"(%arg0, %arg0) : (tensor, tensor) -> tensor 6 | return %0 : tensor 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /compiler/test/Utils/testMergeTwoModulesCase1_1.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s --allow-unregistered-dialect 2 | 3 | module { 4 | func.func @main(%arg0: tensor, %arg1: tensor) -> tensor attributes {byteir.entry_point = {inputs = ["module1_input0", "module1_input1"], outputs = ["module1_output"]}} { 5 | %0 = "foo.add"(%arg0, %arg1) : (tensor, tensor) -> tensor 6 | return %0 : tensor 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /compiler/test/Utils/testMergeTwoModulesCase2.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s --allow-unregistered-dialect 2 | 3 | func.func @main(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) attributes {byteir.entry_point = {inputs = ["module0_input0", "module0_input1"], outputs = ["x", "y"]}} { 4 | return %arg0, %arg1 : tensor, tensor 5 | } 6 | -------------------------------------------------------------------------------- /compiler/test/Utils/testMergeTwoModulesCase2_1.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s --allow-unregistered-dialect 2 | 3 | module { 4 | func.func @main(%arg0: tensor, %arg1: tensor) -> tensor attributes {byteir.entry_point = {inputs = ["xx", "yy"], outputs = ["module1_output"]}} { 5 | %0 = "foo.add"(%arg0, %arg1) : (tensor, tensor) -> tensor 6 | return %0 : tensor 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /compiler/test/lib/Analysis/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Exclude tests from 
libMLIR.so 2 | add_mlir_library(ByteIRTestAnalysis 3 | TestGraphClusteringByDeviceOpNum.cpp 4 | TestPrintLiveness.cpp 5 | TestPrintShapeAnalysis.cpp 6 | TestPrintSideEffect.cpp 7 | TestPrintSymbolicShape.cpp 8 | TestPrintUseRange.cpp 9 | 10 | EXCLUDE_FROM_LIBMLIR 11 | 12 | LINK_LIBS PUBLIC 13 | ByteIRAnalysis 14 | ) -------------------------------------------------------------------------------- /compiler/test/lib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Analysis) 2 | add_subdirectory(Interface) 3 | add_subdirectory(Transformation) 4 | add_subdirectory(Utils) 5 | -------------------------------------------------------------------------------- /compiler/test/lib/Interface/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Exclude tests from libMLIR.so 2 | add_mlir_library(ByteIRTestInterface 3 | TestByreOpInterface.cpp 4 | 5 | EXCLUDE_FROM_LIBMLIR 6 | 7 | DEPENDS 8 | MLIRByreDialect 9 | 10 | LINK_LIBS PUBLIC 11 | MLIRIR 12 | MLIRByreDialect 13 | ) -------------------------------------------------------------------------------- /compiler/test/lib/Transformation/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Exclude tests from libMLIR.so 2 | add_mlir_library(ByteIRTestTransformation 3 | TestByreSerialRoundtrip.cpp 4 | TestConvertFuncToCustomCall.cpp 5 | TestConvertInsertion.cpp 6 | TestDTypeConversion.cpp 7 | TestFuncArgRearrangement.cpp 8 | 9 | EXCLUDE_FROM_LIBMLIR 10 | 11 | LINK_LIBS PUBLIC 12 | ByteIRMhloPasses 13 | ByteIRUtils 14 | MLIRByreSerialization 15 | MhloDialect 16 | ) -------------------------------------------------------------------------------- /compiler/test/lib/Utils/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Exclude tests from libMLIR.so 2 | add_mlir_library(ByteIRTestUtils 3 | 
TestBroadcastDenseElementsAttr.cpp 4 | TestMergeTwoModules.cpp 5 | 6 | EXCLUDE_FROM_LIBMLIR 7 | 8 | LINK_LIBS PUBLIC 9 | ByteIRMhloUtils 10 | ByteIRUtils 11 | MhloDialect 12 | ) -------------------------------------------------------------------------------- /compiler/tools/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(byteir-cpu-runner) 2 | add_subdirectory(byteir-opt) 3 | add_subdirectory(byteir-stat) 4 | add_subdirectory(byteir-translate) 5 | -------------------------------------------------------------------------------- /compiler/tools/byteir-cpu-runner/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_LINK_COMPONENTS 2 | Core 3 | Support 4 | nativecodegen 5 | native 6 | ) 7 | 8 | add_mlir_tool(byteir-cpu-runner 9 | byteir-cpu-runner.cpp 10 | ) 11 | 12 | llvm_update_compile_flags(byteir-cpu-runner) 13 | target_link_libraries(byteir-cpu-runner PRIVATE 14 | MLIRAnalysis 15 | MLIRExecutionEngine 16 | MLIRIR 17 | MLIRJitRunner 18 | MLIRLLVMDialect 19 | MLIRLLVMToLLVMIRTranslation 20 | MLIRToLLVMIRTranslationRegistration 21 | MLIRParser 22 | MLIRTargetLLVMIRExport 23 | MLIRSupport 24 | ) 25 | -------------------------------------------------------------------------------- /external/patches/AITemplate/A10.patch: -------------------------------------------------------------------------------- 1 | diff --git a/python/aitemplate/testing/detect_target.py b/python/aitemplate/testing/detect_target.py 2 | index 2b2913d..95fd02f 100644 3 | --- a/python/aitemplate/testing/detect_target.py 4 | +++ b/python/aitemplate/testing/detect_target.py 5 | @@ -42,7 +42,7 @@ def _detect_cuda_with_nvidia_smi(): 6 | sm_names = { 7 | "70": ["V100"], 8 | "75": ["T4", "Quadro T2000"], 9 | - "80": ["PG509", "A100", "A10G", "RTX 30", "A30", "RTX 40"], 10 | + "80": ["PG509", "A100", "A10G", "RTX 30", "A30", "RTX 40", "A10", "A16"], 11 | "90": ["H100"], 12 | } 13 | 
for sm, names in sm_names.items(): 14 | -------------------------------------------------------------------------------- /external/patches/AITemplate/logging.patch: -------------------------------------------------------------------------------- 1 | diff --git a/static/csrc/model_container.cpp b/static/csrc/model_container.cpp 2 | index 5548a97..920ed60 100644 3 | --- a/static/csrc/model_container.cpp 4 | +++ b/static/csrc/model_container.cpp 5 | @@ -80,9 +80,9 @@ ModelContainer::ModelContainer( 6 | useDebugLogging = true; 7 | } 8 | } 9 | - LOG(INFO) 10 | - << (useDebugLogging ? PrintDebugDeviceProperties(prop) 11 | - : PrintInfoDeviceProperties(prop)); 12 | + //LOG(INFO) 13 | + // << (useDebugLogging ? PrintDebugDeviceProperties(prop) 14 | + // : PrintInfoDeviceProperties(prop)); 15 | 16 | LOG(INFO) << "Init AITemplate Runtime with " << num_models << " concurrency"; 17 | models_.reserve(num_models); 18 | -------------------------------------------------------------------------------- /external/patches/AITemplate/num_builders.patch: -------------------------------------------------------------------------------- 1 | diff --git a/python/aitemplate/backend/builder.py b/python/aitemplate/backend/builder.py 2 | index e66d97e..b0d5848 100644 3 | --- a/python/aitemplate/backend/builder.py 4 | +++ b/python/aitemplate/backend/builder.py 5 | @@ -900,6 +900,7 @@ clean: 6 | f"-C {build_dir}", 7 | ] 8 | ) 9 | + self._n_jobs = 4 10 | make_clean_cmd = f" {make_path} {make_flags} clean " 11 | make_all_cmd = f" {make_path} {make_flags} -j{self._n_jobs} all " 12 | make_clean_constants_cmd = f" {make_path} {make_flags} clean_constants " 13 | -------------------------------------------------------------------------------- /external_libs/runtime/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ## Copyright (c) ByteDance Inc. All rights reserved. 
2 | ## Licensed under the Apache License, Version 2.0 3 | 4 | # Minimum CMake required 5 | cmake_minimum_required(VERSION 3.18) 6 | set(CMAKE_CXX_STANDARD 17) 7 | 8 | project(brt-libs LANGUAGES CXX CUDA) 9 | 10 | 11 | set(REPO_ROOT ${PROJECT_SOURCE_DIR}) 12 | message("REPO_ROOT = ${REPO_ROOT}") 13 | set(BYTEIR_ROOT ${REPO_ROOT}/../..) 14 | set(CUTLASS_ROOT ${BYTEIR_ROOT}/external/cutlass) 15 | message("CUTLASS_ROOT = ${CUTLASS_ROOT}") 16 | 17 | add_subdirectory(flash_attn) 18 | -------------------------------------------------------------------------------- /external_libs/runtime/README.md: -------------------------------------------------------------------------------- 1 | # Runtime External Libs 2 | 3 | Runtime external library contains standalone kernels that can be used externally, eg. used by ByteIR Runtime. 4 | 5 | ## Build 6 | ### Linux/Mac 7 | ```bash 8 | mkdir ./build 9 | 10 | # build runtime 11 | cd build && cmake .. -G Ninja 12 | 13 | cmake --build . --target all 14 | ``` 15 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(lib) -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_bwd_hdim128_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. 
See "generate_kernels.py" 4 | 5 | #include "flash_bwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { 9 | run_mha_bwd_hdim128(params, stream); 10 | } 11 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_bwd_hdim160_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. See "generate_kernels.py" 4 | 5 | #include "flash_bwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { 9 | run_mha_bwd_hdim160(params, stream); 10 | } 11 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_bwd_hdim192_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. See "generate_kernels.py" 4 | 5 | #include "flash_bwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { 9 | run_mha_bwd_hdim192(params, stream); 10 | } 11 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_bwd_hdim224_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. 
See "generate_kernels.py" 4 | 5 | #include "flash_bwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { 9 | run_mha_bwd_hdim224(params, stream); 10 | } 11 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_bwd_hdim256_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. See "generate_kernels.py" 4 | 5 | #include "flash_bwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { 9 | run_mha_bwd_hdim256(params, stream); 10 | } 11 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_bwd_hdim32_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. See "generate_kernels.py" 4 | 5 | #include "flash_bwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { 9 | run_mha_bwd_hdim32(params, stream); 10 | } 11 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_bwd_hdim64_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. 
See "generate_kernels.py" 4 | 5 | #include "flash_bwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { 9 | run_mha_bwd_hdim64(params, stream); 10 | } 11 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_bwd_hdim96_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. See "generate_kernels.py" 4 | 5 | #include "flash_bwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { 9 | run_mha_bwd_hdim96(params, stream); 10 | } 11 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_fwd_hdim128_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. See "generate_kernels.py" 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { 9 | run_mha_fwd_hdim128(params, stream); 10 | } 11 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_fwd_hdim160_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. 
See "generate_kernels.py" 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { 9 | run_mha_fwd_hdim160(params, stream); 10 | } 11 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_fwd_hdim192_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. See "generate_kernels.py" 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { 9 | run_mha_fwd_hdim192(params, stream); 10 | } 11 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_fwd_hdim224_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. See "generate_kernels.py" 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { 9 | run_mha_fwd_hdim224(params, stream); 10 | } 11 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_fwd_hdim256_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. 
See "generate_kernels.py" 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { 9 | run_mha_fwd_hdim256(params, stream); 10 | } 11 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_fwd_hdim32_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. See "generate_kernels.py" 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { 9 | run_mha_fwd_hdim32(params, stream); 10 | } 11 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_fwd_hdim64_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. See "generate_kernels.py" 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { 9 | run_mha_fwd_hdim64(params, stream); 10 | } 11 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_fwd_hdim96_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. 
See "generate_kernels.py" 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { 9 | run_mha_fwd_hdim96(params, stream); 10 | } 11 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_fwd_split_hdim128_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. See "generate_kernels.py" 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); 8 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_fwd_split_hdim160_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. See "generate_kernels.py" 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); 8 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_fwd_split_hdim192_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. 
See "generate_kernels.py" 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); 8 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_fwd_split_hdim224_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. See "generate_kernels.py" 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); 8 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_fwd_split_hdim256_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. See "generate_kernels.py" 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); 8 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_fwd_split_hdim32_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. 
See "generate_kernels.py" 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); 8 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_fwd_split_hdim64_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. See "generate_kernels.py" 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); 8 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_fwd_split_hdim96_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. 
See "generate_kernels.py" 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); 8 | -------------------------------------------------------------------------------- /frontends/onnx-frontend/.gitignore: -------------------------------------------------------------------------------- 1 | .pytest_cache/ 2 | *.pyc 3 | *.tar.gz 4 | 5 | build/ 6 | -------------------------------------------------------------------------------- /frontends/onnx-frontend/onnx-frontend/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(src) 2 | add_subdirectory(test) 3 | -------------------------------------------------------------------------------- /frontends/onnx-frontend/onnx-frontend/src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Compiler) 2 | add_subdirectory(Conversion) 3 | add_subdirectory(Support) 4 | 5 | add_onnx_frontend_executable(onnx-frontend 6 | onnx-frontend.cpp 7 | 8 | INSTALL 9 | 10 | LINK_LIBS PRIVATE 11 | OFCompiler 12 | OMCompilerOptions 13 | StablehloPortableApi 14 | ) 15 | 16 | add_onnx_frontend_executable(onnx-frontend-opt 17 | onnx-frontend-opt.cpp 18 | 19 | INSTALL 20 | 21 | LINK_LIBS PRIVATE 22 | OMCompilerOptions 23 | OMRegisterPasses 24 | OFCompiler 25 | MLIROptLib 26 | ) 27 | -------------------------------------------------------------------------------- /frontends/onnx-frontend/onnx-frontend/src/Compiler/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_onnx_frontend_library(OFCompiler 2 | OFCompilerOptions.cpp 3 | OFCompilerPipelines.cpp 4 | OFCompilerUtils.cpp 5 | 6 | LINK_LIBS PUBLIC 7 | OFConversion 8 | ) -------------------------------------------------------------------------------- /frontends/onnx-frontend/onnx-frontend/src/Support/CMakeLists.txt: 
-------------------------------------------------------------------------------- 1 | add_onnx_frontend_library(OFSupport 2 | OFUtils.cpp 3 | ) -------------------------------------------------------------------------------- /frontends/onnx-frontend/onnx-frontend/test/dynamic_shape_relu.onnx: -------------------------------------------------------------------------------- 1 |  onnx-relu:X 2 |  3 | XYrelu"Relu 4 | test-modelZ 5 | X 6 |  7 | B 8 | T 9 | Cb 10 | Y 11 |  12 | B 13 | T 14 | CB -------------------------------------------------------------------------------- /frontends/onnx-frontend/onnx-frontend/test/of_check_non_lowered.mlir: -------------------------------------------------------------------------------- 1 | // RUN: onnx-frontend-opt -check-non-lowered %s -split-input-file -verify-diagnostics 2 | 3 | func.func @test_onnx_non_lowered(%arg0: tensor<1x2xf32>) -> tensor<1x2xf32> { 4 | // expected-warning @+2 {{onnx.NoValue: ONNX op is not lowered}} 5 | // expected-error @-2 {{Please lower all ONNX ops}} 6 | %0 = "onnx.NoValue"() : () -> none 7 | return %arg0 : tensor<1x2xf32> 8 | } 9 | -------------------------------------------------------------------------------- /frontends/onnx-frontend/onnx-frontend/test/set_shape.mlir: -------------------------------------------------------------------------------- 1 | // RUN: onnx-frontend %S/dynamic_shape_relu.onnx --input-name-and-shapes=X,1,128,80 -- | FileCheck %s 2 | 3 | // CHECK-LABEL: func.func @main 4 | // CHECK-SAME: ([[PARAM_0_:%.+]]: tensor<1x128x80xf32> {onnx.name = "X"}) -> (tensor<1x128x80xf32> {onnx.name = "Y"}) attributes {byteir.entry_point = {inputs = ["X"], outputs = ["Y"]}} { 5 | // CHECK: [[VAR_0_:%.+]] = stablehlo.constant dense<0.000000e+00> : tensor<1x128x80xf32> 6 | // CHECK: [[VAR_1_:%.+]] = stablehlo.maximum [[PARAM_0_]], [[VAR_0_]] : tensor<1x128x80xf32> 7 | // CHECK: return [[VAR_1_]] : tensor<1x128x80xf32> -------------------------------------------------------------------------------- 
/frontends/onnx-frontend/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | filterwarnings = 3 | ignore::DeprecationWarning -------------------------------------------------------------------------------- /frontends/onnx-frontend/requirements.txt: -------------------------------------------------------------------------------- 1 | lit>=14.0.0 2 | numpy>=1.21.6 3 | onnx==1.13.0 4 | onnxruntime>=1.13.1 5 | # protobuf==3.20.1 6 | pytest>=7.1.2 7 | torch>=1.12.0 8 | -------------------------------------------------------------------------------- /frontends/onnx-frontend/scripts/build_and_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | CUR_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" > /dev/null && pwd )" 7 | # path to byteir root 8 | BYTEIR_ROOT="$CUR_DIR/../../.." 9 | # path to byteir/frontends/onnx-frontend 10 | ONNX_FRONTEND_ROOT="$BYTEIR_ROOT/frontends/onnx-frontend" 11 | 12 | export BYTEIR_ROOT="$BYTEIR_ROOT" 13 | export ONNX_FRONTEND_ROOT="$ONNX_FRONTEND_ROOT" 14 | 15 | source $CUR_DIR/envsetup.sh 16 | load_onnx_llvm_rtti_prebuilt 17 | 18 | of_envsetup 19 | of_build 20 | of_test_lit 21 | of_test_ops 22 | -------------------------------------------------------------------------------- /frontends/onnx-frontend/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/frontends/onnx-frontend/test/__init__.py -------------------------------------------------------------------------------- /frontends/onnx-frontend/test/env.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import sys 4 | 5 | ONNX_FRONTEND_PATH = osp.join(os.environ["ONNX_FRONTEND_ROOT"], "build/onnx-frontend/src/onnx-frontend") 6 | 
-------------------------------------------------------------------------------- /frontends/onnx-frontend/test/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/frontends/onnx-frontend/test/ops/__init__.py -------------------------------------------------------------------------------- /frontends/onnx-frontend/test/ops/data/math/clip.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/frontends/onnx-frontend/test/ops/data/math/clip.onnx -------------------------------------------------------------------------------- /frontends/onnx-frontend/test/ops/data/math/gelu.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/frontends/onnx-frontend/test/ops/data/math/gelu.onnx -------------------------------------------------------------------------------- /frontends/onnx-frontend/test/ops/data/math/softmax.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/frontends/onnx-frontend/test/ops/data/math/softmax.onnx -------------------------------------------------------------------------------- /frontends/onnx-frontend/test/ops/data/nn/batch_normalization.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/frontends/onnx-frontend/test/ops/data/nn/batch_normalization.onnx -------------------------------------------------------------------------------- /frontends/onnx-frontend/test/ops/data/quantize/quantize_dequantize.onnx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/frontends/onnx-frontend/test/ops/data/quantize/quantize_dequantize.onnx -------------------------------------------------------------------------------- /frontends/onnx-frontend/test/ops/data/tensor/arg_max.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/frontends/onnx-frontend/test/ops/data/tensor/arg_max.onnx -------------------------------------------------------------------------------- /frontends/onnx-frontend/test/ops/data/tensor/arg_min.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/frontends/onnx-frontend/test/ops/data/tensor/arg_min.onnx -------------------------------------------------------------------------------- /frontends/onnx-frontend/test/ops/data/tensor/concat.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/frontends/onnx-frontend/test/ops/data/tensor/concat.onnx -------------------------------------------------------------------------------- /frontends/onnx-frontend/test/ops/data/tensor/concat_dynamic_shape.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/frontends/onnx-frontend/test/ops/data/tensor/concat_dynamic_shape.onnx -------------------------------------------------------------------------------- /frontends/onnx-frontend/test/ops/data/tensor/resize_nearest_v10.onnx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/frontends/onnx-frontend/test/ops/data/tensor/resize_nearest_v10.onnx -------------------------------------------------------------------------------- /frontends/onnx-frontend/test/ops/test_quantize.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | import onnx 4 | from test.base import TestBase 5 | from test.ops.utils import build_onnx 6 | 7 | 8 | class TestOpsQuantize(TestBase): 9 | 10 | @pytest.fixture(autouse=True) 11 | def setup(self, tmpdir_factory): 12 | self.setup_base(tmpdir_factory, "test/ops/data/quantize") 13 | 14 | def test_quantize_dequantize(self): 15 | input_shape_dtype = [ 16 | ["data", (16, 3, 224, 224), "float32"], 17 | ] 18 | self.run(model_filename="quantize_dequantize.onnx", input_shape_dtype=input_shape_dtype) 19 | -------------------------------------------------------------------------------- /frontends/onnx-frontend/third_party/patches/OnnxMlirRegisterLibrary.patch: -------------------------------------------------------------------------------- 1 | diff --git a/src/Tools/onnx-mlir-opt/CMakeLists.txt b/src/Tools/onnx-mlir-opt/CMakeLists.txt 2 | index a90a670a..0a80c88b 100644 3 | --- a/src/Tools/onnx-mlir-opt/CMakeLists.txt 4 | +++ b/src/Tools/onnx-mlir-opt/CMakeLists.txt 5 | @@ -20,3 +20,16 @@ add_onnx_mlir_executable(onnx-mlir-opt 6 | MLIROptLib 7 | MLIRSCFToOpenMP 8 | ) 9 | + 10 | +add_onnx_mlir_library(OMRegisterPasses 11 | + RegisterPasses.cpp 12 | + 13 | + EXCLUDE_FROM_OM_LIBS 14 | + 15 | + LINK_LIBS PUBLIC 16 | + OMCompilerPasses 17 | + OMAccelerator 18 | + MLIRAffineTransforms 19 | + MLIRLinalgTransforms 20 | + MLIRMemRefTransforms 21 | +) 22 | -------------------------------------------------------------------------------- /frontends/tf-frontend/.bazelrc: -------------------------------------------------------------------------------- 1 | 
./external/tensorflow/.bazelrc -------------------------------------------------------------------------------- /frontends/tf-frontend/.bazelversion: -------------------------------------------------------------------------------- 1 | ./external/tensorflow/.bazelversion -------------------------------------------------------------------------------- /frontends/tf-frontend/.gitignore: -------------------------------------------------------------------------------- 1 | /bazel-* 2 | example/.workspace/* 3 | -------------------------------------------------------------------------------- /frontends/tf-frontend/.tf_configure.bazelrc: -------------------------------------------------------------------------------- 1 | build --action_env PYTHON_BIN_PATH="/usr/bin/python3" 2 | build --action_env PYTHON_LIB_PATH="/usr/lib/python3/dist-packages" 3 | build --python_path="/usr/bin/python3" 4 | build:opt --copt=-Wno-sign-compare 5 | build:opt --host_copt=-Wno-sign-compare 6 | test --flaky_test_attempts=3 7 | test --test_size_filters=small,medium 8 | test:v1 --test_tag_filters=-benchmark-test,-no_oss,-gpu,-oss_serial 9 | test:v1 --build_tag_filters=-benchmark-test,-no_oss,-gpu 10 | test:v2 --test_tag_filters=-benchmark-test,-no_oss,-gpu,-oss_serial,-v1only 11 | test:v2 --build_tag_filters=-benchmark-test,-no_oss,-gpu,-v1only 12 | -------------------------------------------------------------------------------- /frontends/tf-frontend/BUILD: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/frontends/tf-frontend/BUILD -------------------------------------------------------------------------------- /frontends/tf-frontend/byteir/BUILD: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/frontends/tf-frontend/byteir/BUILD 
-------------------------------------------------------------------------------- /frontends/tf-frontend/byteir/workspace.bzl: -------------------------------------------------------------------------------- 1 | def ace_repo(): 2 | native.new_local_repository( 3 | name = "byteir", 4 | path = "./../../compiler/dialects", 5 | build_file = "//byteir:ace.BUILD", 6 | ) 7 | -------------------------------------------------------------------------------- /frontends/tf-frontend/scripts/apply_patches.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | CUR_DIR="$(cd "$(dirname "$0")" ; pwd -P)" 6 | TF_FRONTEND_DIR=$CUR_DIR/.. 7 | TF_DIR=$TF_FRONTEND_DIR/external/tensorflow 8 | TF_PATCHES_DIR=$TF_FRONTEND_DIR/external/patches/tensorflow 9 | 10 | pushd $TF_DIR 11 | git clean -fd . 12 | for patch in $TF_PATCHES_DIR/*; do 13 | git apply $patch 14 | done 15 | popd 16 | -------------------------------------------------------------------------------- /frontends/tf-frontend/tf_mlir_ext/numerical/BUILD: -------------------------------------------------------------------------------- 1 | load("@org_tensorflow//tensorflow:tensorflow.bzl", "filegroup") 2 | load("glob_lit_test.bzl", "glob_lit_tests") 3 | 4 | package(licenses = ["notice"]) 5 | 6 | glob_lit_tests( 7 | data = [":test_utilities"], 8 | driver = "@llvm-project//mlir:run_lit.sh", 9 | test_file_exts = ["mlir"], 10 | ) 11 | 12 | # Bundle together all of the test utilities that are used by tests. 
13 | filegroup( 14 | name = "test_utilities", 15 | testonly = True, 16 | data = [ 17 | ":numerical_test.py", 18 | "//tools:tf-ext-opt", 19 | "@llvm-project//llvm:FileCheck", 20 | "@llvm-project//llvm:not", 21 | ], 22 | ) 23 | 24 | filegroup( 25 | name = "litfiles", 26 | srcs = glob(["runlit*py"]), 27 | ) 28 | -------------------------------------------------------------------------------- /frontends/tf-frontend/tf_mlir_ext/tests/BUILD: -------------------------------------------------------------------------------- 1 | load("@org_tensorflow//tensorflow:tensorflow.bzl", "filegroup") 2 | load("glob_lit_test.bzl", "glob_lit_tests") 3 | 4 | package(licenses = ["notice"]) 5 | 6 | glob_lit_tests( 7 | data = [":test_utilities"], 8 | driver = "@llvm-project//mlir:run_lit.sh", 9 | test_file_exts = ["mlir"], 10 | ) 11 | 12 | # Bundle together all of the test utilities that are used by tests. 13 | filegroup( 14 | name = "test_utilities", 15 | testonly = True, 16 | data = [ 17 | "//tools:tf-ext-opt", 18 | "@llvm-project//llvm:FileCheck", 19 | "@llvm-project//llvm:not", 20 | ], 21 | ) 22 | 23 | filegroup( 24 | name = "litfiles", 25 | srcs = glob(["runlit*py"]), 26 | ) 27 | -------------------------------------------------------------------------------- /frontends/tf-frontend/tf_mlir_ext/utils/BUILD: -------------------------------------------------------------------------------- 1 | package( 2 | default_visibility = ["//visibility:public"], 3 | licenses = ["notice"], 4 | ) 5 | 6 | cc_library( 7 | name = "tfext_utils", 8 | srcs = [ 9 | "customcall.cc", 10 | "dce.cc", 11 | "utils.cc", 12 | ], 13 | hdrs = [ 14 | "customcall.h", 15 | "dce.h", 16 | "utils.h", 17 | ], 18 | deps = [ 19 | "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:tensorflow", 20 | "@llvm-project//llvm:Support", 21 | "@llvm-project//mlir:Dialect", 22 | "@llvm-project//mlir:IR", 23 | "@llvm-project//mlir:Support", 24 | ] 25 | ) -------------------------------------------------------------------------------- 
/frontends/torch-frontend/.gitignore: -------------------------------------------------------------------------------- 1 | torch-frontend/python/torch_frontend.egg-info/ 2 | torch-frontend/python/torch_frontend/version.py -------------------------------------------------------------------------------- /frontends/torch-frontend/README.md: -------------------------------------------------------------------------------- 1 | # Torch Frontend 2 | torch-frontend is a project to build customized torch model --> torch dialect --> stablehlo dialect pipeline, where we could add extended dialect and passes. 3 | 4 | 5 | ## Quick Start 6 | 7 | ### Build from source code 8 | 9 | ```bash 10 | git clone https://github.com/bytedance/byteir.git 11 | cd byteir/frontends/torch-frontend 12 | 13 | # prepare python environment and build torch-frontend 14 | bash scripts/build.sh 15 | 16 | # torch_frontend-*.whl in ./build/torch-frontend/python/dist/ 17 | ``` 18 | 19 | ### Example 20 | ```bash 21 | PYTHONPATH=./build/python_packages/:build/torch_mlir_build/python_packages/torch_mlir python3 examples/inference/infer_resnet.py 22 | ``` 23 | -------------------------------------------------------------------------------- /frontends/torch-frontend/build-requirements.txt: -------------------------------------------------------------------------------- 1 | # cpu torch and torchvision 2 | # --extra-index-url https://download.pytorch.org/whl/cpu 3 | # --pre 4 | # torch==2.1.0+cpu 5 | # torchvision==0.16.0+cpu 6 | 7 | # cuda torch and torchvision 8 | # --extra-index-url https://download.pytorch.org/whl/cu118 9 | # --pre 10 | # torch==2.1.0+cu118 11 | # torchvision==0.16.0+cu118 12 | 13 | # cuda torch and torchvision nightly 14 | # --extra-index-url https://download.pytorch.org/whl/nightly/cu118 15 | # --pre 16 | # torch==2.1.0.dev20230820+cu118 17 | # torchvision==0.16.0.dev20230820+cu118 18 | 19 | 20 | # The following copied from torch-mlir 21 | 22 | # Build requirements. 
23 | pybind11 24 | wheel 25 | setuptools 26 | cmake 27 | pyyaml 28 | packaging 29 | lit 30 | 31 | -------------------------------------------------------------------------------- /frontends/torch-frontend/examples/demo/requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cu118 2 | --pre 3 | torch==2.1.0+cu118 4 | 5 | transformers==4.29.2 6 | -------------------------------------------------------------------------------- /frontends/torch-frontend/examples/inference/mixtral/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.40.2 2 | 3 | --extra-index-url https://download.pytorch.org/whl/cu118 4 | --pre 5 | torch==2.3.0+cu118 6 | -------------------------------------------------------------------------------- /frontends/torch-frontend/test-requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | transformers==4.29.2 3 | 4 | # Test Requirements 5 | pillow 6 | pytest==8.1.0 7 | dill 8 | multiprocess 9 | expecttest 10 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-cpu-requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cpu 2 | --pre 3 | torch==2.4.1+cpu 4 | torchvision==0.19.1+cpu 5 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-cuda-requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cu118 2 | --pre 3 | torch==2.4.1+cu118 4 | torchvision==0.19.1+cu118 5 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/CMakeLists.txt: 
-------------------------------------------------------------------------------- 1 | add_subdirectory(include/torch-frontend) 2 | 3 | add_subdirectory(lib) 4 | 5 | # for torch-frontend python extensions and packages 6 | add_subdirectory(python) 7 | 8 | # for torch-frontend binary executable tools 9 | add_subdirectory(tools) 10 | 11 | add_subdirectory(test) -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/include/torch-frontend/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Conversion) 2 | add_subdirectory(Transforms) 3 | add_subdirectory(Dialect/Torch/Transforms) 4 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/include/torch-frontend/Conversion/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name TorchFrontendConversion) 3 | add_public_tablegen_target(TorchFrontendConversionPassIncGen) 4 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/include/torch-frontend/Dialect/Torch/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name TorchFrontendTorchTransforms) 3 | add_public_tablegen_target(TorchFrontendTorchTransformsPassIncGen) 4 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/include/torch-frontend/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name 
TorchFrontendTransforms) 3 | add_public_tablegen_target(TorchFrontendTransformsPassIncGen) 4 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/lib/CAPI/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_public_c_api_library(TorchFrontendCAPI 2 | Passes.cpp 3 | 4 | ENABLE_AGGREGATION 5 | LINK_COMPONENTS 6 | 7 | LINK_LIBS PUBLIC 8 | MLIRSupport 9 | TorchFrontendPipelines 10 | ) 11 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/lib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(CAPI) 2 | add_subdirectory(Conversion) 3 | add_subdirectory(Dialect/Torch/Transforms) 4 | add_subdirectory(Pipelines) 5 | add_subdirectory(Transforms) 6 | add_subdirectory(Utils) 7 | # pytorch custom op 8 | add_subdirectory(CustomOp) 9 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/lib/Conversion/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_library(TorchFrontendConversion 2 | ConvertTorchToCcl.cpp 3 | ConvertTorchToCustomCall.cpp 4 | ConvertTorchToStablehloExt.cpp 5 | 6 | DEPENDS 7 | TorchFrontendConversionPassIncGen 8 | 9 | LINK_COMPONENTS 10 | Core 11 | 12 | LINK_LIBS PUBLIC 13 | MLIRIR 14 | MLIRPass 15 | MLIRDialect 16 | MLIRTransforms 17 | ChloOps 18 | StablehloOps 19 | MLIRCclDialect 20 | TorchMLIRConversionUtils 21 | TorchMLIRTorchDialect 22 | TorchMLIRTorchPasses 23 | TorchMLIRTorchToStablehlo 24 | TorchMLIRTorchConversionPasses 25 | TorchMLIRTorchUtils 26 | ) 27 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/lib/CustomOp/dynamic_mask_stitch.cpp: 
-------------------------------------------------------------------------------- 1 | #include 2 | 3 | torch::Tensor custom_dynamic_mask_stitch(std::vector data, 4 | torch::Tensor partitions) { 5 | std::vector res; 6 | res.reserve(partitions.size(0)); 7 | std::vector count(data.size(), 0); 8 | for (int64_t i = 0; i < partitions.size(0); ++i) { 9 | int idx = partitions[i].item(); 10 | res.push_back(data[idx][count[idx]].unsqueeze(0)); 11 | count[idx]++; 12 | } 13 | return torch::cat(res, /*dim=*/0); 14 | } 15 | 16 | static auto registry = torch::RegisterOperators("byteir::dynamic_mask_stitch", 17 | &custom_dynamic_mask_stitch); 18 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/lib/CustomOp/dynamic_stitch.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | torch::Tensor custom_dynamic_stitch(std::vector indices, 4 | std::vector data) { 5 | int n = 0; 6 | for (auto &idx : indices) { 7 | n += idx.numel(); 8 | } 9 | std::vector res(n); 10 | for (size_t i = 0; i < data.size(); ++i) { 11 | for (int j = 0; j < indices[i].size(0); ++j) { 12 | res[indices[i][j].item()] = data[i][j].unsqueeze(0); 13 | } 14 | } 15 | return torch::cat(res, /*dim=*/0); 16 | } 17 | 18 | static auto registry = 19 | torch::RegisterOperators("byteir::dynamic_stitch", &custom_dynamic_stitch); 20 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/lib/Dialect/Torch/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS FuseOpOnTorchPattern.td) 2 | mlir_tablegen(FuseOpOnTorchPattern.inc -gen-rewriters) 3 | add_public_tablegen_target(FuseOpOnTorchPatternIncGen) 4 | 5 | add_mlir_library(TorchFrontendTorchTransforms 6 | DecomposeOnTorch.cpp 7 | FuseOpOnTorch.cpp 8 | 9 | DEPENDS 10 | TorchFrontendTorchTransformsPassIncGen 
11 | FuseOpOnTorchPatternIncGen 12 | 13 | LINK_LIBS PUBLIC 14 | MLIRIR 15 | MLIRPass 16 | MLIRDialect 17 | TorchMLIRTorchDialect 18 | TorchMLIRTorchUtils 19 | TorchFrontendUtils 20 | ) 21 | target_include_directories(TorchFrontendTorchTransforms PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) 22 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/lib/Pipelines/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_library(TorchFrontendPipelines 2 | Pipelines.cpp 3 | 4 | # DEPENDS 5 | 6 | LINK_COMPONENTS 7 | Core 8 | 9 | LINK_LIBS PUBLIC 10 | MLIRIR 11 | TorchMLIRTorchToStablehlo 12 | TorchMLIRTorchConversionPasses 13 | TorchFrontendConversion 14 | TorchFrontendTransforms 15 | TorchFrontendTorchTransforms 16 | StablehloPasses 17 | ) 18 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/lib/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_library(TorchFrontendTransforms 2 | CanonicalizeExt.cpp 3 | EliminateUselessOp.cpp 4 | RewriteCustomOp.cpp 5 | RewriteEntryFuncName.cpp 6 | UnpackPublicFunctionReturn.cpp 7 | 8 | DEPENDS 9 | TorchFrontendTransformsPassIncGen 10 | TorchMLIRTorchDialect 11 | 12 | LINK_COMPONENTS 13 | Core 14 | 15 | LINK_LIBS PUBLIC 16 | MLIRIR 17 | MLIRPass 18 | MLIRDialect 19 | TorchMLIRTorchDialect 20 | TorchFrontendUtils 21 | ) 22 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/lib/Utils/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_library(TorchFrontendUtils 2 | ConvertOpFolder.cpp 3 | 4 | LINK_LIBS PUBLIC 5 | MLIRIR 6 | ) -------------------------------------------------------------------------------- 
class FullModule(torch.nn.Module):
    """Traceable module that builds a boolean all-True tensor shaped like its input.

    Exists as a minimal fixture for exercising the aten.full bool-argument
    rewrite in fx_utils.
    """

    def forward(self, x):
        # aten.full with a bool fill value; output shape mirrors the input.
        return torch.ops.aten.full(x.shape, True, dtype=torch.bool)
def context_init_hook(context):
    """Register the dialects torch-frontend needs on a freshly created MLIR context.

    Invoked automatically by the MLIR python bindings' site-initialize
    mechanism for every new Context.

    Args:
        context: the MLIR Context being initialized.
    """
    # Both extension modules live alongside this file in _mlir_libs; import
    # lazily so module load does not pull in the native libraries eagerly.
    from ._stablehlo import register_dialect as register_stablehlo_dialect
    from ._torchMlir import register_dialect as register_torch_dialect

    register_stablehlo_dialect(context)
    register_torch_dialect(context)
temp_file_path = _get_extra_library_file() 17 | shutil.copyfile(temp_file_path, os.path.join(CUR_DIR, "extra_fn.mlir")) 18 | 19 | if __name__ == "__main__": 20 | main() 21 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/python/torch_frontend/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .jit_transforms import replace_copy_fill_with_slice_scatter 2 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/python/version.txt: -------------------------------------------------------------------------------- 1 | 1.3.4 -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | configure_lit_site_cfg( 2 | ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in 3 | ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py 4 | MAIN_CONFIG 5 | ${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py 6 | ) 7 | 8 | set(TORCH_FRONTEND_TEST_DEPENDS 9 | FileCheck count not 10 | torch-frontend-opt 11 | ) 12 | 13 | add_lit_testsuite(check-torch-frontend-opt "Running the torch-frontend-opt regression tests" 14 | ${CMAKE_CURRENT_BINARY_DIR} 15 | DEPENDS ${TORCH_FRONTEND_TEST_DEPENDS} 16 | ) 17 | set_target_properties(check-torch-frontend-opt PROPERTIES FOLDER "Tests") 18 | 19 | add_lit_testsuites(TORCH_FRONTEND_TEST ${CMAKE_CURRENT_SOURCE_DIR} DEPENDS ${TORCH_FRONTEND_TEST_DEPENDS}) -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/test/Transforms/RewriteEntryFuncName.mlir: -------------------------------------------------------------------------------- 1 | // RUN: torch-frontend-opt %s --rewrite-entry-func-name="target-name=main" | FileCheck %s 2 | 3 | module { 4 | func.func 
@forward(%arg0: !torch.vtensor<[3,4],f32>) -> !torch.vtensor<[3,4],f32>{ 5 | return %arg0 : !torch.vtensor<[3,4],f32> 6 | } 7 | } 8 | // CHECK-LABEL: func.func @main 9 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/test/Transforms/UnpackPublicFunctionReturn.mlir: -------------------------------------------------------------------------------- 1 | // RUN: torch-frontend-opt %s --unpack-public-function-return --canonicalize | FileCheck %s 2 | 3 | module { 4 | func.func @forward(%arg0: !torch.tensor {torch.type_bound = !torch.vtensor<[3,4],f32>}) -> !torch.list { 5 | %0 = torch.prim.ListConstruct %arg0, %arg0, %arg0 : (!torch.tensor, !torch.tensor, !torch.tensor) -> !torch.list 6 | return %0 : !torch.list 7 | } 8 | } 9 | // CHECK-LABEL: func.func @forward 10 | // CHECK: %0 = torch.prim.TupleConstruct %arg0, %arg0, %arg0 : !torch.tensor, !torch.tensor, !torch.tensor -> !torch.tuple 11 | // CHECK: return %0 : !torch.tuple 12 | -------------------------------------------------------------------------------- /runtime/.gitignore: -------------------------------------------------------------------------------- 1 | python/brt.egg-info/ 2 | python/dist/ 3 | python/brt/version.py 4 | -------------------------------------------------------------------------------- /runtime/VERSION_NUMBER: -------------------------------------------------------------------------------- 1 | 1.9.3.0 2 | -------------------------------------------------------------------------------- /runtime/cmake/brt_config.h.in: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #cmakedefine HAS_UNUSED_BUT_SET_VARIABLE 4 | #cmakedefine HAS_UNUSED_PARAMETER 5 | #cmakedefine HAS_UNUSED_VARIABLE 6 | #cmakedefine HAS_CAST_FUNCTION_TYPE 7 | #cmakedefine HAS_PARENTHESES 8 | #cmakedefine HAS_USELESS_CAST 9 | #cmakedefine HAS_NONNULL_COMPARE 10 | #cmakedefine 
# Build the BRT CPU device backend as an object library.

# Gather public headers and implementation sources for the CPU device layer.
file(GLOB_RECURSE brt_device_cpu_srcs CONFIGURE_DEPENDS
  "${BRT_INCLUDE_DIR}/brt/backends/cpu/device/*.h"
  "${LIB_ROOT}/backends/cpu/device/*.cc"
)

# Keep IDE source trees mirroring the repository layout.
source_group(TREE ${REPO_ROOT} FILES ${brt_device_cpu_srcs})

brt_add_object_library(brt_device_cpu ${brt_device_cpu_srcs})
# LLVM JIT + x86 codegen are needed for the CPU execution path.
target_link_libraries(brt_device_cpu LLVMOrcJIT LLVMX86CodeGen LLVMX86AsmParser)
brt_add_include_to_target(brt_device_cpu brt_framework brt_common)
set_target_properties(brt_device_cpu PROPERTIES FOLDER "Brt")

# Ship the public CPU device headers with the install tree.
install(
  DIRECTORY "${BRT_INCLUDE_DIR}/brt/backends/cpu/device"
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/brt/backends/cpu")
DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/brt") 20 | -------------------------------------------------------------------------------- /runtime/examples/external_project/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | cmake_minimum_required(VERSION 3.18) 3 | 4 | project(brt_external_project C CXX) 5 | 6 | option(BRT_INSTALL_PATH "The path to the installed BRT library") 7 | get_filename_component(BRT_INSTALL_PATH ${BRT_INSTALL_PATH} ABSOLUTE) 8 | 9 | list(APPEND CMAKE_MODULE_PATH "${BRT_INSTALL_PATH}/lib/cmake/brt") 10 | include(brt-targets) 11 | 12 | add_executable(main main.cpp) 13 | target_link_libraries(main brt) 14 | -------------------------------------------------------------------------------- /runtime/include/brt/core/common/logging/sinks/clog_sink.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | // =========================================================================== 4 | // Modification Copyright 2022 ByteDance Ltd. and/or its affiliates. 5 | 6 | #pragma once 7 | 8 | #include "brt/core/common/logging/sinks/ostream_sink.h" 9 | #include 10 | 11 | namespace brt { 12 | namespace logging { 13 | /// 14 | /// A std::clog based ISink 15 | /// 16 | /// 17 | class CLogSink : public OStreamSink { 18 | public: 19 | CLogSink() : OStreamSink(std::clog, /*flush*/ true) {} 20 | }; 21 | } // namespace logging 22 | } // namespace brt 23 | -------------------------------------------------------------------------------- /runtime/include/brt/core/distributed/d_context.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Megvii Inc. 2 | // Licensed under Apache License, Version 2.0 3 | // =========================================================================== 4 | // Modification Copyright 2022 ByteDance Ltd. 
// Copyright (c) Megvii Inc.
// Licensed under Apache License, Version 2.0
// ===========================================================================
// Modification Copyright 2022 ByteDance Ltd. and/or its affiliates.

#pragma once

// NOTE(review): the include target was stripped by extraction; std::string
// requires <string>, restored here — confirm against upstream.
#include <string>

namespace brt {

// DContext is an abstraction of communication contexts (e.g. cuda stream)
// on different platforms, a context should be passed as a parameter when
// a communicator operation is called
class DContext {
public:
  // Identifies the concrete context kind; implemented by platform-specific
  // subclasses.
  virtual std::string type() const = 0;
  virtual ~DContext() = default;
};

} // namespace brt
python3 examples/add2.py 11 | ``` -------------------------------------------------------------------------------- /runtime/python/brt/__init__.py: -------------------------------------------------------------------------------- 1 | from ._brt import * 2 | -------------------------------------------------------------------------------- /runtime/test/exported.ld: -------------------------------------------------------------------------------- 1 | { 2 | extern "C++" { 3 | /* export all symbols in brt to enable external kernel registration */ 4 | *brt::*; 5 | }; 6 | }; 7 | -------------------------------------------------------------------------------- /runtime/test/test_files/AITOp/bmm_permute_a100.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/runtime/test/test_files/AITOp/bmm_permute_a100.so -------------------------------------------------------------------------------- /runtime/test/test_files/AITOp/bmm_permute_entry.mlir: -------------------------------------------------------------------------------- 1 | module attributes {byre.container_module} { 2 | func.func @main(%arg0 : memref<384x256x256xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, 3 | %arg1 : memref<384x256x64xf32, "cuda"> {byre.argname = "Input1", byre.argtype = 1 : i32}, 4 | %arg2 : memref<64x256x6x64xf32, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}) attributes {byre.entry_point} { 5 | byre.compute @AITOp(%arg0, %arg1, %arg2) {kernel_name = "bmm_permute", ait_lib_file = "bmm_permute_a100.so"} : memref<384x256x256xf32, "cuda">, memref<384x256x64xf32, "cuda">, memref<64x256x6x64xf32, "cuda"> 6 | return 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /runtime/test/test_files/AITOp/permute_a100.so: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/runtime/test/test_files/AITOp/permute_a100.so -------------------------------------------------------------------------------- /runtime/test/test_files/AITOp/permute_entry.mlir: -------------------------------------------------------------------------------- 1 | module attributes {byre.container_module} { 2 | func.func @main(%arg0 : memref<64x256x6x64xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, 3 | %arg1 : memref<64x6x256x64xf32, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}) attributes {byre.entry_point} { 4 | byre.compute @AITOp(%arg0, %arg1) {kernel_name = "permute", ait_lib_file = "permute_a100.so"} : memref<64x256x6x64xf32, "cuda">, memref<64x6x256x64xf32, "cuda"> 5 | return 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /runtime/test/test_files/Distributed/add_send.mlir: -------------------------------------------------------------------------------- 1 | module attributes {byre.container_module} { 2 | func.func @test_add_send(%arg0 : memref<4xf32, "cuda"> {byre.argname = "in0", byre.argtype = 1: i32}, 3 | %arg1 : memref<4xf32, "cuda"> {byre.argname = "in1", byre.argtype = 1: i32}, 4 | %arg2 : memref<4xf32, "cuda"> {byre.argname = "out", byre.argtype = 2: i32}) attributes {byre.entry_point} { 5 | byre.compute @AddOp_f32f32_f32(%arg0, %arg1, %arg2) : memref<4xf32, "cuda">, memref<4xf32, "cuda">, memref<4xf32, "cuda"> 6 | byre.compute @nccl.Send(%arg2) {rank = 1 : i64} : memref<4xf32, "cuda"> 7 | return 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /runtime/test/test_files/Distributed/all_gather.mlir: -------------------------------------------------------------------------------- 1 | module attributes {byre.container_module} { 2 | func.func @test_all_gather(%arg0 : memref<4xf32, "cuda"> {byre.argname = "in0", byre.argtype = 1: i32}, 3 | 
%arg1 : memref<8xf32, "cuda"> {byre.argname = "out", byre.argtype = 2: i32}) attributes {byre.entry_point} { 4 | byre.compute @nccl.AllGather(%arg0, %arg1) {replica_group = [2, 3]} : memref<4xf32, "cuda">, memref<8xf32, "cuda"> 5 | return 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /runtime/test/test_files/Distributed/all_reduce.mlir: -------------------------------------------------------------------------------- 1 | module attributes {byre.container_module} { 2 | func.func @test_all_reduce(%arg0 : memref<4xf32, "cuda"> {byre.argname = "in0", byre.argtype = 1: i32}, 3 | %arg1 : memref<4xf32, "cuda"> {byre.argname = "out", byre.argtype = 2: i32}) attributes {byre.entry_point} { 4 | byre.compute @nccl.AllReduce(%arg0, %arg1) { reduction = "sum" , replica_group = [1 ,2, 3]} : memref<4xf32, "cuda">, memref<4xf32, "cuda"> 5 | return 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /runtime/test/test_files/Distributed/broadcast.mlir: -------------------------------------------------------------------------------- 1 | module attributes {byre.container_module} { 2 | func.func @test_broadcast(%arg0 : memref<8xf32, "cuda"> {byre.argname = "in0", byre.argtype = 1: i32}) attributes {byre.entry_point} { 3 | byre.compute @nccl.Broadcast(%arg0) {replica_group = [1, 0, 2]} : memref<8xf32, "cuda"> 4 | return 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /runtime/test/test_files/Distributed/broadcast2.mlir: -------------------------------------------------------------------------------- 1 | module attributes {byre.container_module} { 2 | func.func @test_broadcast(%arg0 : memref<8xf32, "cuda"> {byre.argname = "in0", byre.argtype = 1: i32}) attributes {byre.entry_point} { 3 | byre.compute @nccl.Broadcast(%arg0) {replica_group = [2, 0, 3]} : memref<8xf32, "cuda"> 4 | return 5 | } 6 | } 7 | 
-------------------------------------------------------------------------------- /runtime/test/test_files/Distributed/recv.mlir: -------------------------------------------------------------------------------- 1 | module attributes {byre.container_module} { 2 | func.func @test_recv(%arg0 : memref<4xf32, "cuda"> {byre.argname = "src", byre.argtype = 2: i32}) attributes {byre.entry_point} { 3 | byre.compute @nccl.Recv(%arg0) {rank = 0 : i64} : memref<4xf32, "cuda"> 4 | return 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /runtime/test/test_files/Distributed/recv_add.mlir: -------------------------------------------------------------------------------- 1 | module attributes {byre.container_module} { 2 | func.func @test_recv_add(%arg0 : memref<4xf32, "cuda"> {byre.argname = "in", byre.argtype = 1: i32}, 3 | %arg1 : memref<4xf32, "cuda"> {byre.argname = "out0", byre.argtype = 2: i32}, 4 | %arg2 : memref<4xf32, "cuda"> {byre.argname = "out1", byre.argtype = 2: i32}) attributes {byre.entry_point} { 5 | byre.compute @nccl.Recv(%arg1) {rank = 0 : i64} : memref<4xf32, "cuda"> 6 | byre.compute @AddOp_f32f32_f32(%arg0, %arg1, %arg2) : memref<4xf32, "cuda">, memref<4xf32, "cuda">, memref<4xf32, "cuda"> 7 | return 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /runtime/test/test_files/Distributed/send.mlir: -------------------------------------------------------------------------------- 1 | module attributes {byre.container_module} { 2 | func.func @test_send(%arg0 : memref<4xf32, "cuda"> {byre.argname = "src", byre.argtype = 1: i32}) attributes {byre.entry_point} { 3 | byre.compute @nccl.Send(%arg0) {rank = 1 : i64} : memref<4xf32, "cuda"> 4 | return 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /runtime/test/test_files/LLJIT/Case0_v1_0_0/entry.mlirbc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/runtime/test/test_files/LLJIT/Case0_v1_0_0/entry.mlirbc -------------------------------------------------------------------------------- /runtime/test/test_files/LLJIT/Case0_v1_0_0/host_kernels.bc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/runtime/test/test_files/LLJIT/Case0_v1_0_0/host_kernels.bc -------------------------------------------------------------------------------- /runtime/test/test_files/add_splat_const_one_cuda.mlir: -------------------------------------------------------------------------------- 1 | module attributes {byre.container_module} { 2 | func.func @mhlo_add_splat_const(%arg0: memref<100x32xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, 3 | %arg1: memref<100x32xf32, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}) attributes {byre.entry_point} { 4 | %0 = memref.alloc() : memref<100x32xf32, "cuda"> 5 | byre.compute @FillOp(%0) {value = dense<1.000000e+00> : tensor<100x32xf32>} : memref<100x32xf32, "cuda"> 6 | byre.compute @AddOp_f32f32_f32(%arg0, %0, %arg1) : memref<100x32xf32, "cuda">, memref<100x32xf32, "cuda">, memref<100x32xf32, "cuda"> 7 | return 8 | } 9 | } -------------------------------------------------------------------------------- /runtime/test/test_files/cuda_add.cu: -------------------------------------------------------------------------------- 1 | extern "C" __global__ void nvrtc_add_kernel(const float* input, float* output, int n, float val) { 2 | int i = blockIdx.x*blockDim.x + threadIdx.x; 3 | if (i < n) { 4 | output[i] = input[i]+ val; 5 | } 6 | } -------------------------------------------------------------------------------- /runtime/test/test_files/flash_attn_kvcache_inputs_cache_seqlens.data: 
-------------------------------------------------------------------------------- 1 | 64 64 -------------------------------------------------------------------------------- /runtime/test/test_files/group_allocation_hook_cpu_group.mlir: -------------------------------------------------------------------------------- 1 | module attributes {byre.container_module} { 2 | func.func @main(%arg0: memref<32xf32, "cpu_group"> {byre.argname = "Input0", byre.argtype = 1 : i32}, 3 | %arg1: memref<32xf32, "cpu_group"> {byre.argname = "Output0", byre.argtype = 2 : i32}) attributes {byre.entry_point} { 4 | %0 = memref.alloc() : memref<32xf32, "cpu_group"> 5 | byre.compute @CheckGroupAllocationHook(%arg0, %0, %arg1) {base = 0xdeadbeef: i64} : memref<32xf32, "cpu_group">, memref<32xf32, "cpu_group">, memref<32xf32, "cpu_group"> 6 | return 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /runtime/test/test_files/string_equal.mlir: -------------------------------------------------------------------------------- 1 | module attributes {byre.container_module} { 2 | func.func @main(%arg0 : memref<4x!ace.string, "cpu"> {byre.argname = "Input", byre.argtype = 1: i32}, 3 | %arg1 : memref<4xi1, "cpu"> {byre.argname = "Output", byre.argtype = 2: i32}) attributes {byre.entry_point} { 4 | %0 = memref.alloc() : memref<4x!ace.string, "cpu"> 5 | byre.compute @FillOp(%0) {memory_effects = [2 : i32], value = dense<"aaa"> : tensor<4x!ace.string, "cpu">} : memref<4x!ace.string, "cpu"> 6 | byre.compute @tf.Equal(%arg0, %0, %arg1) {memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x!ace.string, "cpu">, memref<4x!ace.string, "cpu">, memref<4xi1, "cpu"> 7 | return 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /runtime/test/test_files/string_equal_scalar.mlir: -------------------------------------------------------------------------------- 1 | module attributes {byre.container_module} { 2 | func.func 
@main(%arg0 : memref<1x!ace.string, "cpu"> {byre.argname = "Input", byre.argtype = 1: i32}, 3 | %arg1 : memref<4xi1, "cpu"> {byre.argname = "Output", byre.argtype = 2: i32}) attributes {byre.entry_point} { 4 | %0 = memref.alloc() : memref<4x!ace.string, "cpu"> 5 | byre.compute @FillOp(%0) {memory_effects = [2 : i32], value = dense<"aaa"> : tensor<4x!ace.string, "cpu">} : memref<4x!ace.string, "cpu"> 6 | byre.compute @tf.Equal(%arg0, %0, %arg1) {memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x!ace.string, "cpu">, memref<4x!ace.string, "cpu">, memref<4xi1, "cpu"> 7 | return 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /runtime/version.ld: -------------------------------------------------------------------------------- 1 | { 2 | global: 3 | extern "C++" { 4 | /* since we didn't have pubapi or capi yet, export all of symbols defined in brt namespace */ 5 | *brt::*; 6 | }; 7 | 8 | local: 9 | extern "C++" { 10 | /* hide all of symbols defined in MLIR to avoid symbol conflict */ 11 | *mlir::*; 12 | *llvm::*; 13 | }; 14 | }; 15 | -------------------------------------------------------------------------------- /scripts/clang_format_check.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | git_status=$(git status --porcelain) 4 | if [[ $git_status ]]; then 5 | echo "Checkout code is not clean" 6 | echo "${git_status}" 7 | exit 1 8 | fi 9 | 10 | find \( -name '*.cpp' -or -name '*.h' -or -name '*.cc' \) -not -path "./external/*" -not -path "./external_libs/*" | xargs clang-format-13 -i -style=file 11 | git_status=$(git status --porcelain) 12 | if [[ $git_status ]]; then 13 | echo "clang-format-13 is not happy, please run \"clang-format-13 -i -style=file /PATH/TO/foo.cpp\" to the following files" 14 | echo "${git_status}" 15 | exit 1 16 | else 17 | echo "PASSED C++ format" 18 | fi 19 | 
-------------------------------------------------------------------------------- /scripts/format_check.py: -------------------------------------------------------------------------------- 1 | from formatCheck.check import * 2 | import argparse 3 | 4 | # parse directory path 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--dir", type=str, help="path to directory") 7 | args = parser.parse_args() 8 | 9 | format_check(args.dir) 10 | -------------------------------------------------------------------------------- /scripts/runtime/build_external_project.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | CUR_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" 6 | PROJ_DIR="$CUR_DIR/../../runtime" 7 | BRT_INSTALL_DIR="$PROJ_DIR/build/install" 8 | EXTERNAL_PROJECT_SRC_DIR="$PROJ_DIR/examples/external_project" 9 | EXTERNAL_PROJECT_BUILD_DIR="$EXTERNAL_PROJECT_SRC_DIR/build" 10 | 11 | rm -rf "$EXTERNAL_PROJECT_BUILD_DIR" 12 | mkdir -p "$EXTERNAL_PROJECT_BUILD_DIR" 13 | cmake -GNinja \ 14 | "-H$EXTERNAL_PROJECT_SRC_DIR" \ 15 | "-B$EXTERNAL_PROJECT_BUILD_DIR" \ 16 | -DBRT_INSTALL_PATH="$BRT_INSTALL_DIR" 17 | 18 | cmake --build "$EXTERNAL_PROJECT_BUILD_DIR" --target all 19 | pushd $EXTERNAL_PROJECT_BUILD_DIR 20 | ./main 21 | popd 22 | -------------------------------------------------------------------------------- /talks/ChinaSoftCon-ByteIR.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/talks/ChinaSoftCon-ByteIR.pdf -------------------------------------------------------------------------------- /talks/c4ml23_poster.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/talks/c4ml23_poster.pdf 
-------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/add.mlir: -------------------------------------------------------------------------------- 1 | func.func @add(%arg0 : tensor<128x2xf32>, %arg1 : tensor<128x2xf32>) -> tensor<128x2xf32> { 2 | %0 = stablehlo.add %arg0, %arg1 : tensor<128x2xf32> 3 | func.return %0 : tensor<128x2xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/batch_norm_inference.mlir: -------------------------------------------------------------------------------- 1 | module { 2 | func.func @main(%arg0: tensor<10x32x10xf32>, %arg1: tensor<32xf32>, %arg2: tensor<32xf32>, %arg3: tensor<32xf32>, %arg4: tensor<32xf32>) -> tensor<10x32x10xf32> { 3 | %0 = "stablehlo.batch_norm_inference"(%arg0, %arg1, %arg2, %arg3, %arg4) <{epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64}> : (tensor<10x32x10xf32>, tensor<32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<32xf32>) -> tensor<10x32x10xf32> 4 | return %0 : tensor<10x32x10xf32> 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/batch_norm_inference_f16.mlir: -------------------------------------------------------------------------------- 1 | module { 2 | func.func @main(%arg0: tensor<10x32x10xf16>, %arg1: tensor<32xf16>, %arg2: tensor<32xf16>, %arg3: tensor<32xf16>, %arg4: tensor<32xf16>) -> tensor<10x32x10xf16> { 3 | %0 = "stablehlo.batch_norm_inference"(%arg0, %arg1, %arg2, %arg3, %arg4) <{epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64}> : (tensor<10x32x10xf16>, tensor<32xf16>, tensor<32xf16>, tensor<32xf16>, tensor<32xf16>) -> tensor<10x32x10xf16> 4 | return %0 : tensor<10x32x10xf16> 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/broadcast_in_dim.mlir: 
-------------------------------------------------------------------------------- 1 | func.func @broadcast_in_dim(%arg0 : tensor<128x1xi64>) -> tensor<128x3xi64> { 2 | %0 = "stablehlo.broadcast_in_dim"(%arg0) { 3 | broadcast_dimensions = array 4 | } : (tensor<128x1xi64>) -> tensor<128x3xi64> 5 | func.return %0 : tensor<128x3xi64> 6 | } 7 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/compare_LT_f32.mlir: -------------------------------------------------------------------------------- 1 | func.func @compare_LT_f32(%arg0 : tensor<256x1xf32>, %arg1 : tensor<256x1xf32>) -> tensor<256x1xi1> { 2 | %0 = "stablehlo.compare"(%arg0, %arg1) { 3 | comparison_direction = #stablehlo, 4 | compare_type = #stablehlo 5 | } : (tensor<256x1xf32>, tensor<256x1xf32>) -> tensor<256x1xi1> 6 | func.return %0 : tensor<256x1xi1> 7 | } 8 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/compare_LT_f64.mlir: -------------------------------------------------------------------------------- 1 | func.func @compare_LT_f64(%arg0 : tensor<256x1xf64>, %arg1 : tensor<256x1xf64>) -> tensor<256x1xi1> { 2 | %0 = "stablehlo.compare"(%arg0, %arg1) { 3 | comparison_direction = #stablehlo, 4 | compare_type = #stablehlo 5 | } : (tensor<256x1xf64>, tensor<256x1xf64>) -> tensor<256x1xi1> 6 | func.return %0 : tensor<256x1xi1> 7 | } 8 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/compare_LT_i32.mlir: -------------------------------------------------------------------------------- 1 | func.func @compare_LT_i32(%arg0 : tensor<256x1xi32>, %arg1 : tensor<256x1xi32>) -> tensor<256x1xi1> { 2 | %0 = "stablehlo.compare"(%arg0, %arg1) { 3 | comparison_direction = #stablehlo, 4 | compare_type = #stablehlo 5 | } : (tensor<256x1xi32>, tensor<256x1xi32>) -> tensor<256x1xi1> 6 | func.return %0 : 
tensor<256x1xi1> 7 | } 8 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/compare_LT_i64.mlir: -------------------------------------------------------------------------------- 1 | func.func @compare_LT_i64(%arg0 : tensor<256x1xi64>, %arg1 : tensor<256x1xi64>) -> tensor<256x1xi1> { 2 | %0 = "stablehlo.compare"(%arg0, %arg1) { 3 | comparison_direction = #stablehlo, 4 | compare_type = #stablehlo 5 | } : (tensor<256x1xi64>, tensor<256x1xi64>) -> tensor<256x1xi1> 6 | func.return %0 : tensor<256x1xi1> 7 | } 8 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/compare_NE_f32.mlir: -------------------------------------------------------------------------------- 1 | func.func @compare_LT_f32(%arg0 : tensor<256x1xf32>, %arg1 : tensor<256x1xf32>) -> tensor<256x1xi1> { 2 | %0 = "stablehlo.compare"(%arg0, %arg1) { 3 | comparison_direction = #stablehlo, 4 | compare_type = #stablehlo 5 | } : (tensor<256x1xf32>, tensor<256x1xf32>) -> tensor<256x1xi1> 6 | func.return %0 : tensor<256x1xi1> 7 | } 8 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/compare_NE_f64.mlir: -------------------------------------------------------------------------------- 1 | func.func @compare_LT_f64(%arg0 : tensor<256x1xf64>, %arg1 : tensor<256x1xf64>) -> tensor<256x1xi1> { 2 | %0 = "stablehlo.compare"(%arg0, %arg1) { 3 | comparison_direction = #stablehlo, 4 | compare_type = #stablehlo 5 | } : (tensor<256x1xf64>, tensor<256x1xf64>) -> tensor<256x1xi1> 6 | func.return %0 : tensor<256x1xi1> 7 | } 8 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/compare_NE_i32.mlir: -------------------------------------------------------------------------------- 1 | func.func @compare_LT_i32(%arg0 : tensor<256x1xi32>, %arg1 : 
tensor<256x1xi32>) -> tensor<256x1xi1> { 2 | %0 = "stablehlo.compare"(%arg0, %arg1) { 3 | comparison_direction = #stablehlo, 4 | compare_type = #stablehlo 5 | } : (tensor<256x1xi32>, tensor<256x1xi32>) -> tensor<256x1xi1> 6 | func.return %0 : tensor<256x1xi1> 7 | } 8 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/compare_NE_i64.mlir: -------------------------------------------------------------------------------- 1 | func.func @compare_LT_i64(%arg0 : tensor<256x1xi64>, %arg1 : tensor<256x1xi64>) -> tensor<256x1xi1> { 2 | %0 = "stablehlo.compare"(%arg0, %arg1) { 3 | comparison_direction = #stablehlo, 4 | compare_type = #stablehlo 5 | } : (tensor<256x1xi64>, tensor<256x1xi64>) -> tensor<256x1xi1> 6 | func.return %0 : tensor<256x1xi1> 7 | } 8 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/concatenate.mlir: -------------------------------------------------------------------------------- 1 | func.func @concatenate(%arg0 : tensor<256x1xi64>) -> tensor<256x2xi64> { 2 | %0 = stablehlo.constant dense<86400> : tensor<256x1xi64> 3 | %1 = "stablehlo.concatenate"(%0, %arg0) { 4 | dimension = 1 : i64 5 | } : (tensor<256x1xi64>, tensor<256x1xi64>) -> tensor<256x2xi64> 6 | func.return %1 : tensor<256x2xi64> 7 | } 8 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_f16_f32.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_f16_f32(%arg0 : tensor<1x256xf16>) -> tensor<1x256xf32> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xf16>) -> tensor<1x256xf32> 3 | func.return %0 : tensor<1x256xf32> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_f16_f64.mlir: 
-------------------------------------------------------------------------------- 1 | func.func @convert_f16_f64(%arg0 : tensor<1x256xf16>) -> tensor<1x256xf64> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xf16>) -> tensor<1x256xf64> 3 | func.return %0 : tensor<1x256xf64> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_f16_i16.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_f16_i16(%arg0 : tensor<1x256xf16>) -> tensor<1x256xi16> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xf16>) -> tensor<1x256xi16> 3 | func.return %0 : tensor<1x256xi16> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_f16_i32.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_f16_i32(%arg0 : tensor<1x256xf16>) -> tensor<1x256xi32> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xf16>) -> tensor<1x256xi32> 3 | func.return %0 : tensor<1x256xi32> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_f16_i64.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_f16_i64(%arg0 : tensor<1x256xf16>) -> tensor<1x256xi64> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xf16>) -> tensor<1x256xi64> 3 | func.return %0 : tensor<1x256xi64> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_f32_f16.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_f32_f16(%arg0 : tensor<1x256xf32>) -> tensor<1x256xf16> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xf32>) -> tensor<1x256xf16> 3 | func.return %0 : tensor<1x256xf16> 4 | } 
-------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_f32_f64.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_f32_f64(%arg0 : tensor<1x256xf32>) -> tensor<1x256xf64> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xf32>) -> tensor<1x256xf64> 3 | func.return %0 : tensor<1x256xf64> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_f32_i16.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_f32_i16(%arg0 : tensor<1x256xf32>) -> tensor<1x256xi16> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xf32>) -> tensor<1x256xi16> 3 | func.return %0 : tensor<1x256xi16> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_f32_i32.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_f32_i32(%arg0 : tensor<1x256xf32>) -> tensor<1x256xi32> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xf32>) -> tensor<1x256xi32> 3 | func.return %0 : tensor<1x256xi32> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_f32_i32_special_val.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_f32_i32_special_val(%arg0 : tensor<2x3xf32>) -> tensor<2x3xi32> { 2 | %0 = stablehlo.convert %arg0 : (tensor<2x3xf32>) -> tensor<2x3xi32> 3 | func.return %0 : tensor<2x3xi32> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_f32_i64.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_f32_i64(%arg0 : 
tensor<1x256xf32>) -> tensor<1x256xi64> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xf32>) -> tensor<1x256xi64> 3 | func.return %0 : tensor<1x256xi64> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_f64_f16.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_f64_f16(%arg0 : tensor<1x256xf64>) -> tensor<1x256xf16> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xf64>) -> tensor<1x256xf16> 3 | func.return %0 : tensor<1x256xf16> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_f64_f32.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_f64_f32(%arg0 : tensor<1x256xf64>) -> tensor<1x256xf32> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xf64>) -> tensor<1x256xf32> 3 | func.return %0 : tensor<1x256xf32> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_f64_i16.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_f64_i16(%arg0 : tensor<1x256xf64>) -> tensor<1x256xi16> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xf64>) -> tensor<1x256xi16> 3 | func.return %0 : tensor<1x256xi16> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_f64_i32.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_f64_i32(%arg0 : tensor<1x256xf64>) -> tensor<1x256xi32> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xf64>) -> tensor<1x256xi32> 3 | func.return %0 : tensor<1x256xi32> 4 | } -------------------------------------------------------------------------------- 
/tests/numerical_test/mlir_tests/cpu_ops/convert_f64_i64.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_f64_i64(%arg0 : tensor<1x256xf64>) -> tensor<1x256xi64> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xf64>) -> tensor<1x256xi64> 3 | func.return %0 : tensor<1x256xi64> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_i16_f16.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_i16_f16(%arg0 : tensor<1x256xi16>) -> tensor<1x256xf16> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xi16>) -> tensor<1x256xf16> 3 | func.return %0 : tensor<1x256xf16> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_i16_f32.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_i16_f32(%arg0 : tensor<1x256xi16>) -> tensor<1x256xf32> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xi16>) -> tensor<1x256xf32> 3 | func.return %0 : tensor<1x256xf32> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_i16_f64.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_i16_f64(%arg0 : tensor<1x256xi16>) -> tensor<1x256xf64> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xi16>) -> tensor<1x256xf64> 3 | func.return %0 : tensor<1x256xf64> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_i16_i32.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_i16_i32(%arg0 : tensor<1x256xi16>) -> tensor<1x256xi32> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xi16>) -> 
tensor<1x256xi32> 3 | func.return %0 : tensor<1x256xi32> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_i16_i64.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_i16_i64(%arg0 : tensor<1x256xi16>) -> tensor<1x256xi64> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xi16>) -> tensor<1x256xi64> 3 | func.return %0 : tensor<1x256xi64> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_i32_f16.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_i32_f16(%arg0 : tensor<1x256xi32>) -> tensor<1x256xf16> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xi32>) -> tensor<1x256xf16> 3 | func.return %0 : tensor<1x256xf16> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_i32_f32.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_i32_f32(%arg0 : tensor<1x256xi32>) -> tensor<1x256xf32> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xi32>) -> tensor<1x256xf32> 3 | func.return %0 : tensor<1x256xf32> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_i32_f64.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_i32_f64(%arg0 : tensor<1x256xi32>) -> tensor<1x256xf64> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xi32>) -> tensor<1x256xf64> 3 | func.return %0 : tensor<1x256xf64> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_i32_i16.mlir: -------------------------------------------------------------------------------- 1 | 
func.func @convert_i32_i16(%arg0 : tensor<1x256xi32>) -> tensor<1x256xi16> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xi32>) -> tensor<1x256xi16> 3 | func.return %0 : tensor<1x256xi16> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_i32_i64.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_i32_i64(%arg0 : tensor<1x256xi32>) -> tensor<1x256xi64> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xi32>) -> tensor<1x256xi64> 3 | func.return %0 : tensor<1x256xi64> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_i64_f16.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_i64_f16(%arg0 : tensor<1x256xi64>) -> tensor<1x256xf16> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xi64>) -> tensor<1x256xf16> 3 | func.return %0 : tensor<1x256xf16> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_i64_f32.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_i64_f32(%arg0 : tensor<1x256xi64>) -> tensor<1x256xf32> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xi64>) -> tensor<1x256xf32> 3 | func.return %0 : tensor<1x256xf32> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_i64_f64.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_i64_f64(%arg0 : tensor<1x256xi64>) -> tensor<1x256xf64> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xi64>) -> tensor<1x256xf64> 3 | func.return %0 : tensor<1x256xf64> 4 | } -------------------------------------------------------------------------------- 
/tests/numerical_test/mlir_tests/cpu_ops/convert_i64_i16.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_i64_i16(%arg0 : tensor<1x256xi64>) -> tensor<1x256xi16> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xi64>) -> tensor<1x256xi16> 3 | func.return %0 : tensor<1x256xi16> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_i64_i32.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_i64_i32(%arg0 : tensor<1x256xi64>) -> tensor<1x256xi32> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xi64>) -> tensor<1x256xi32> 3 | func.return %0 : tensor<1x256xi32> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/custom_call_byteir_addn.mlir: -------------------------------------------------------------------------------- 1 | func.func @byteir.addn(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> tensor<128xf32> { 2 | %0 = stablehlo.custom_call @byteir.addn(%arg0, %arg1, %arg2) {byteir_attrs = {}} : (tensor<128xf32>, tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> 3 | return %0 : tensor<128xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/custom_call_byteir_arg_max.mlir: -------------------------------------------------------------------------------- 1 | func.func @byteir.arg_max$return_2(%arg0: tensor<3x128xf32>) -> (tensor<3xf32>, tensor<3xi64>) { 2 | %0:2 = stablehlo.custom_call @byteir.arg_max(%arg0) {byteir_attrs = {axis = 1 : i64, keep_dims = false, select_last_index = false}} : (tensor<3x128xf32>) -> (tensor<3xf32>, tensor<3xi64>) 3 | return %0#0, %0#1 : tensor<3xf32>, tensor<3xi64> 4 | } 5 | -------------------------------------------------------------------------------- 
/tests/numerical_test/mlir_tests/cpu_ops/custom_call_byteir_arg_max_i32.mlir: -------------------------------------------------------------------------------- 1 | func.func @byteir.arg_max$return_2(%arg0: tensor<3x128xi32>) -> (tensor<3xi32>, tensor<3xi64>) { 2 | %0:2 = stablehlo.custom_call @byteir.arg_max(%arg0) {byteir_attrs = {axis = 1 : i64, keep_dims = false, select_last_index = false}} : (tensor<3x128xi32>) -> (tensor<3xi32>, tensor<3xi64>) 3 | return %0#0, %0#1 : tensor<3xi32>, tensor<3xi64> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/custom_call_byteir_arg_min.mlir: -------------------------------------------------------------------------------- 1 | func.func @byteir.arg_min$return_2(%arg0: tensor<3x128xf32>) -> (tensor<3xf32>, tensor<3xi64>) { 2 | %0:2 = stablehlo.custom_call @byteir.arg_min(%arg0) {byteir_attrs = {axis = 1 : i64, keep_dims = false, select_last_index = false}} : (tensor<3x128xf32>) -> (tensor<3xf32>, tensor<3xi64>) 3 | return %0#0, %0#1 : tensor<3xf32>, tensor<3xi64> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/custom_call_byteir_arg_min_i32.mlir: -------------------------------------------------------------------------------- 1 | func.func @byteir.arg_min$return_2(%arg0: tensor<3x128xi32>) -> (tensor<3xi32>, tensor<3xi64>) { 2 | %0:2 = stablehlo.custom_call @byteir.arg_min(%arg0) {byteir_attrs = {axis = 1 : i64, keep_dims = false, select_last_index = false}} : (tensor<3x128xi32>) -> (tensor<3xi32>, tensor<3xi64>) 3 | return %0#0, %0#1 : tensor<3xi32>, tensor<3xi64> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/custom_call_byteir_softmax.mlir: -------------------------------------------------------------------------------- 1 | func.func @byteir.softmax(%arg0: tensor<10x128xf32>) -> 
tensor<10x128xf32> { 2 | %0 = stablehlo.custom_call @byteir.softmax(%arg0) {byteir_attrs = {axis = 1 : i64}} : (tensor<10x128xf32>) -> tensor<10x128xf32> 3 | return %0 : tensor<10x128xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/custom_call_tf_UpperBound.mlir: -------------------------------------------------------------------------------- 1 | func.func @custom_call_tf_UpperBound(%arg0 : tensor<1x2560xf16>) -> tensor<1x2560xi32> { 2 | %0 = stablehlo.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 6.000000e+00, 1.000000e+01, 2.000000e+01, 5.000000e+01]]> : tensor<1x8xf16> 3 | %1 = "stablehlo.custom_call"(%0, %arg0) { 4 | call_target_name = "tf.UpperBound", 5 | has_side_effect = false, 6 | backend_config = "", 7 | byteir_attrs = {}, 8 | api_version = 1 : i32, 9 | called_computations = [@tf.UpperBound] 10 | } : (tensor<1x8xf16>, tensor<1x2560xf16>) -> tensor<1x2560xi32> 11 | func.return %1 : tensor<1x2560xi32> 12 | } 13 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/divide_f16.mlir: -------------------------------------------------------------------------------- 1 | func.func @divide_f16(%arg0 : tensor<256x1xf16>, %arg1 : tensor<256x1xf16>) -> tensor<256x1xf16> { 2 | %0 = stablehlo.divide %arg0, %arg1 : tensor<256x1xf16> 3 | func.return %0 : tensor<256x1xf16> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/log_plus_one_f16.mlir: -------------------------------------------------------------------------------- 1 | func.func @log_plus_one(%arg0 : tensor<256x1xf16>) -> tensor<256x1xf16> { 2 | %0 = "stablehlo.log_plus_one"(%arg0) : (tensor<256x1xf16>) -> tensor<256x1xf16> 3 | func.return %0 : tensor<256x1xf16> 4 | } 5 | -------------------------------------------------------------------------------- 
/tests/numerical_test/mlir_tests/cpu_ops/maximum_f32.mlir: -------------------------------------------------------------------------------- 1 | func.func @maximum(%arg0 : tensor<256x1xf32>, %arg1 : tensor<256x1xf32>) -> tensor<256x1xf32> { 2 | %0 = "stablehlo.maximum"(%arg0, %arg1) : (tensor<256x1xf32>, tensor<256x1xf32>) -> tensor<256x1xf32> 3 | func.return %0 : tensor<256x1xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/maximum_f64.mlir: -------------------------------------------------------------------------------- 1 | func.func @maximum_f64(%arg0 : tensor<256x1xf64>, %arg1 : tensor<256x1xf64>) -> tensor<256x1xf64> { 2 | %0 = "stablehlo.maximum"(%arg0, %arg1) : (tensor<256x1xf64>, tensor<256x1xf64>) -> tensor<256x1xf64> 3 | func.return %0 : tensor<256x1xf64> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/maximum_i32.mlir: -------------------------------------------------------------------------------- 1 | func.func @maximum_i32(%arg0 : tensor<256x1xi32>, %arg1 : tensor<256x1xi32>) -> tensor<256x1xi32> { 2 | %0 = "stablehlo.maximum"(%arg0, %arg1) : (tensor<256x1xi32>, tensor<256x1xi32>) -> tensor<256x1xi32> 3 | func.return %0 : tensor<256x1xi32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/maximum_i64.mlir: -------------------------------------------------------------------------------- 1 | func.func @maximum_i64(%arg0 : tensor<256x1xi64>, %arg1 : tensor<256x1xi64>) -> tensor<256x1xi64> { 2 | %0 = "stablehlo.maximum"(%arg0, %arg1) : (tensor<256x1xi64>, tensor<256x1xi64>) -> tensor<256x1xi64> 3 | func.return %0 : tensor<256x1xi64> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/minimum_f32.mlir: 
-------------------------------------------------------------------------------- 1 | func.func @minimum(%arg0 : tensor<256x1xf32>, %arg1 : tensor<256x1xf32>) -> tensor<256x1xf32> { 2 | %0 = "stablehlo.minimum"(%arg0, %arg1) : (tensor<256x1xf32>, tensor<256x1xf32>) -> tensor<256x1xf32> 3 | func.return %0 : tensor<256x1xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/minimum_f64.mlir: -------------------------------------------------------------------------------- 1 | func.func @minimum_f64(%arg0 : tensor<256x1xf64>, %arg1 : tensor<256x1xf64>) -> tensor<256x1xf64> { 2 | %0 = "stablehlo.minimum"(%arg0, %arg1) : (tensor<256x1xf64>, tensor<256x1xf64>) -> tensor<256x1xf64> 3 | func.return %0 : tensor<256x1xf64> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/minimum_i32.mlir: -------------------------------------------------------------------------------- 1 | func.func @minimum_i32(%arg0 : tensor<256x1xi32>, %arg1 : tensor<256x1xi32>) -> tensor<256x1xi32> { 2 | %0 = "stablehlo.minimum"(%arg0, %arg1) : (tensor<256x1xi32>, tensor<256x1xi32>) -> tensor<256x1xi32> 3 | func.return %0 : tensor<256x1xi32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/minimum_i64.mlir: -------------------------------------------------------------------------------- 1 | func.func @minimum_i64(%arg0 : tensor<256x1xi64>, %arg1 : tensor<256x1xi64>) -> tensor<256x1xi64> { 2 | %0 = "stablehlo.minimum"(%arg0, %arg1) : (tensor<256x1xi64>, tensor<256x1xi64>) -> tensor<256x1xi64> 3 | func.return %0 : tensor<256x1xi64> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/multiply_f32.mlir: -------------------------------------------------------------------------------- 1 | 
func.func @multiply_256x1xf32(%arg0 : tensor<256x1xf32>, %arg1 : tensor<256x1xf32>) -> tensor<256x1xf32> { 2 | %0 = "stablehlo.multiply"(%arg0, %arg1) : (tensor<256x1xf32>, tensor<256x1xf32>) -> tensor<256x1xf32> 3 | func.return %0 : tensor<256x1xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/multiply_f64.mlir: -------------------------------------------------------------------------------- 1 | func.func @multiply_256x1xf64(%arg0 : tensor<256x1xf64>, %arg1 : tensor<256x1xf64>) -> tensor<256x1xf64> { 2 | %0 = "stablehlo.multiply"(%arg0, %arg1) : (tensor<256x1xf64>, tensor<256x1xf64>) -> tensor<256x1xf64> 3 | func.return %0 : tensor<256x1xf64> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/multiply_i32.mlir: -------------------------------------------------------------------------------- 1 | func.func @multiply_256x1xi32(%arg0 : tensor<256x1xi32>, %arg1 : tensor<256x1xi32>) -> tensor<256x1xi32> { 2 | %0 = "stablehlo.multiply"(%arg0, %arg1) : (tensor<256x1xi32>, tensor<256x1xi32>) -> tensor<256x1xi32> 3 | func.return %0 : tensor<256x1xi32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/multiply_i64.mlir: -------------------------------------------------------------------------------- 1 | func.func @multiply_256x1xi64(%arg0 : tensor<256x1xi64>, %arg1 : tensor<256x1xi64>) -> tensor<256x1xi64> { 2 | %0 = "stablehlo.multiply"(%arg0, %arg1) : (tensor<256x1xi64>, tensor<256x1xi64>) -> tensor<256x1xi64> 3 | func.return %0 : tensor<256x1xi64> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/reduce_f32.mlir: -------------------------------------------------------------------------------- 1 | func.func @reduce_f32(%input : tensor<256x5xf32>) 
-> tensor<256xf32> { 2 | %0 = stablehlo.constant dense<-0.000000e+00> : tensor 3 | %1 = "stablehlo.reduce"(%input, %0) ({ 4 | ^bb0(%arg0: tensor, %arg1: tensor): 5 | %2 = "stablehlo.add"(%arg0, %arg1) : (tensor, tensor) -> tensor 6 | "stablehlo.return"(%2) : (tensor) -> () 7 | }) { 8 | dimensions = array 9 | } : (tensor<256x5xf32>, tensor) -> tensor<256xf32> 10 | func.return %1 : tensor<256xf32> 11 | } 12 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/remainder_i64.mlir: -------------------------------------------------------------------------------- 1 | func.func @remainder(%arg0 : tensor<256x1xi64>) -> tensor<256x1xi64> { 2 | %0 = stablehlo.constant dense<86400> : tensor<256x1xi64> 3 | %1 = "stablehlo.remainder"(%arg0, %0) : (tensor<256x1xi64>, tensor<256x1xi64>) -> tensor<256x1xi64> 4 | func.return %1 : tensor<256x1xi64> 5 | } 6 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/reshape_slice.mlir: -------------------------------------------------------------------------------- 1 | func.func @reshape_slice_reshape(%arg0: tensor<256x2xf16>) -> (tensor<256xf16>) { 2 | %0 = "stablehlo.reshape"(%arg0) : (tensor<256x2xf16>) -> tensor<256x1x2xf16> 3 | %1 = "stablehlo.slice"(%0) {limit_indices = array, start_indices = array, strides =array} : (tensor<256x1x2xf16>) -> tensor<256x1x1xf16> 4 | %2 = "stablehlo.reshape"(%1) : (tensor<256x1x1xf16>) -> tensor<256xf16> 5 | func.return %2 : tensor<256xf16> 6 | } 7 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/rng.mlir: -------------------------------------------------------------------------------- 1 | func.func @rng_f16() -> tensor<256x120xf16> { 2 | %0 = stablehlo.constant dense<[256, 120]> : tensor<2xi64> 3 | %1 = stablehlo.constant dense<1.000000e+00> : tensor 4 | %2 = stablehlo.constant 
dense<0.000000e+00> : tensor 5 | %3 = "stablehlo.rng"(%2, %1, %0) { 6 | rng_distribution = #stablehlo, 7 | device = "host" 8 | } : (tensor, tensor, tensor<2xi64>) -> tensor<256x120xf16> 9 | func.return %3 : tensor<256x120xf16> 10 | } 11 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/scatter_insert_slice.mlir: -------------------------------------------------------------------------------- 1 | func.func @forward(%arg0: tensor<6x8x5xf32>, %arg1: tensor<6x1x5xf32>) -> tensor<6x8x5xf32> { 2 | %c = stablehlo.constant dense<0> : tensor<1x1xi64> 3 | %0 = "stablehlo.scatter"(%arg0, %c, %arg1) <{indices_are_sorted = false, scatter_dimension_numbers = #stablehlo.scatter, unique_indices = false}> ({ 4 | ^bb0(%arg2: tensor, %arg3: tensor): 5 | stablehlo.return %arg3 : tensor 6 | }) : (tensor<6x8x5xf32>, tensor<1x1xi64>, tensor<6x1x5xf32>) -> tensor<6x8x5xf32> 7 | return %0 : tensor<6x8x5xf32> 8 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/select_f32.mlir: -------------------------------------------------------------------------------- 1 | func.func @select_f32(%pred : tensor<256x1xi1>, %on_true : tensor<256x1xf32>, %on_false : tensor<256x1xf32>) -> tensor<256x1xf32> { 2 | %0 = "stablehlo.select"(%pred, %on_true, %on_false) : (tensor<256x1xi1>, tensor<256x1xf32>, tensor<256x1xf32>) -> tensor<256x1xf32> 3 | func.return %0 : tensor<256x1xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/select_f64.mlir: -------------------------------------------------------------------------------- 1 | func.func @select_f64(%pred : tensor<256x1xi1>, %on_true : tensor<256x1xf64>, %on_false : tensor<256x1xf64>) -> tensor<256x1xf64> { 2 | %0 = "stablehlo.select"(%pred, %on_true, %on_false) : (tensor<256x1xi1>, tensor<256x1xf64>, tensor<256x1xf64>) -> 
tensor<256x1xf64> 3 | func.return %0 : tensor<256x1xf64> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/select_i64.mlir: -------------------------------------------------------------------------------- 1 | func.func @select_i64(%pred : tensor<256x1xi1>, %on_true : tensor<256x1xi64>, %on_false : tensor<256x1xi64>) -> tensor<256x1xi64> { 2 | %0 = "stablehlo.select"(%pred, %on_true, %on_false) : (tensor<256x1xi1>, tensor<256x1xi64>, tensor<256x1xi64>) -> tensor<256x1xi64> 3 | func.return %0 : tensor<256x1xi64> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/subtrace_f16.mlir: -------------------------------------------------------------------------------- 1 | func.func @subtract_f16(%arg0 : tensor<256x1xf16>, %arg1 : tensor<256x1xf16>) -> tensor<256x1xf16> { 2 | %0 = "stablehlo.subtract"(%arg0, %arg1) : (tensor<256x1xf16>, tensor<256x1xf16>) -> (tensor<256x1xf16>) 3 | func.return %0 : tensor<256x1xf16> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/add.mlir: -------------------------------------------------------------------------------- 1 | func.func @add(%arg0 : tensor<256x256xf32>, %arg1 : tensor<256x256xf32>) -> tensor<256x256xf32> { 2 | %0 = mhlo.add %arg0, %arg1 : tensor<256x256xf32> 3 | return %0 : tensor<256x256xf32> 4 | } 5 | 6 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/bmm_rcr.mlir: -------------------------------------------------------------------------------- 1 | func.func @bmm_rcr(%arg0 : tensor<1x32x256x128xf16>, %arg1 : tensor<32x256x128xf16>) -> tensor<32x256x256xf16> { 2 | %0 = "mhlo.transpose"(%arg0) {permutation = dense<[0, 1, 3, 2]> : tensor<4xi64>} : (tensor<1x32x256x128xf16>) -> tensor<1x32x128x256xf16> 3 | %1 = mhlo.reshape 
%0 : (tensor<1x32x128x256xf16>) -> tensor<32x128x256xf16> 4 | %2 = "mhlo.dot_general"(%arg1, %1) {dot_dimension_numbers = #mhlo.dot} : (tensor<32x256x128xf16>, tensor<32x128x256xf16>) -> tensor<32x256x256xf16> 5 | return %2 : tensor<32x256x256xf16> 6 | } 7 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/bmm_rrc.mlir: -------------------------------------------------------------------------------- 1 | 2 | func.func @bmm_rrc(%arg0 : tensor<32x128x256xf16>, %arg1 : tensor<32x256x256xf16>) -> tensor<1x32x256x128xf16> { 3 | %0 = "mhlo.dot_general"(%arg0, %arg1) {dot_dimension_numbers = #mhlo.dot} : (tensor<32x128x256xf16>, tensor<32x256x256xf16>) -> tensor<32x128x256xf16> 4 | %1 = mhlo.reshape %0 : (tensor<32x128x256xf16>) -> tensor<1x32x128x256xf16> 5 | %2 = "mhlo.transpose"(%1) {permutation = dense<[0, 1, 3, 2]> : tensor<4xi64>} : (tensor<1x32x128x256xf16>) -> tensor<1x32x256x128xf16> 6 | return %2 : tensor<1x32x256x128xf16> 7 | } 8 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/bmm_rrr_add_f16.mlir: -------------------------------------------------------------------------------- 1 | func.func @bmm_rrr_add(%arg0 : tensor<32x256x256xf16>, %arg1 : tensor<32x256x128xf16>, %arg2 : tensor<1x32x256x128xf16>) -> tensor<1x32x256x128xf16> { 2 | %0 = "mhlo.dot_general"(%arg0, %arg1) {dot_dimension_numbers = #mhlo.dot} : (tensor<32x256x256xf16>, tensor<32x256x128xf16>) -> tensor<32x256x128xf16> 3 | %1 = mhlo.reshape %0 : (tensor<32x256x128xf16>) -> tensor<1x32x256x128xf16> 4 | %2 = mhlo.add %arg2, %1 : tensor<1x32x256x128xf16> 5 | return %2 : tensor<1x32x256x128xf16> 6 | } 7 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/bmm_rrr_f16.mlir: -------------------------------------------------------------------------------- 1 | func.func @bmm_rrr(%arg0 : 
tensor<32x256x256xf16>, %arg1 : tensor<32x256x128xf16>) -> tensor<32x256x128xf16> { 2 | %0 = "mhlo.dot_general"(%arg0, %arg1) {dot_dimension_numbers = #mhlo.dot} : (tensor<32x256x256xf16>, tensor<32x256x128xf16>) -> tensor<32x256x128xf16> 3 | return %0 : tensor<32x256x128xf16> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/bmm_rrr_permute_f16.mlir: -------------------------------------------------------------------------------- 1 | func.func @bmm_rrr_permute(%arg0: tensor<32x64x64xf16>, %arg1: tensor<32x64x128xf16>) -> tensor<1x64x32x128xf16> { 2 | %0 = "mhlo.dot_general"(%arg0, %arg1) {dot_dimension_numbers = #mhlo.dot} : (tensor<32x64x64xf16>, tensor<32x64x128xf16>) -> tensor<32x64x128xf16> 3 | %1 = mhlo.reshape %0 : (tensor<32x64x128xf16>) -> tensor<1x32x64x128xf16> 4 | %2 = "mhlo.transpose"(%1) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x32x64x128xf16>) -> tensor<1x64x32x128xf16> 5 | return %2 : tensor<1x64x32x128xf16> 6 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/bmm_rrr_permute_f32.mlir: -------------------------------------------------------------------------------- 1 | func.func @bmm_rrr_permute_f32(%arg0: tensor<4x2x2xf32>, %arg1: tensor<4x2x2xf32>) -> tensor<2x2x2x2xf32> { 2 | %0 = "mhlo.dot_general"(%arg0, %arg1) {dot_dimension_numbers = #mhlo.dot} : (tensor<4x2x2xf32>, tensor<4x2x2xf32>) -> tensor<4x2x2xf32> 3 | %1 = mhlo.reshape %0 : (tensor<4x2x2xf32>) -> tensor<2x2x2x2xf32> 4 | %2 = "mhlo.transpose"(%1) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<2x2x2x2xf32>) -> tensor<2x2x2x2xf32> 5 | return %2 : tensor<2x2x2x2xf32> 6 | } 7 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/broadcast.mlir: -------------------------------------------------------------------------------- 1 | func.func 
@broadcast(%arg0 : tensor<1x1x256x128xf16>) -> tensor<1x32x256x128xf16> { 2 | %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1, 2, 3]> : tensor<4xi64>} : (tensor<1x1x256x128xf16>) -> tensor<1x32x256x128xf16> 3 | return %0 : tensor<1x32x256x128xf16> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/broadcast1.mlir: -------------------------------------------------------------------------------- 1 | func.func @broadcast1(%arg0 : tensor<4096xf32>) -> tensor<1x256x4096xf32> { 2 | %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<4096xf32>) -> tensor<1x256x4096xf32> 3 | return %0 : tensor<1x256x4096xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/compare_eq.mlir: -------------------------------------------------------------------------------- 1 | func.func @compare_eq(%arg0 : tensor<1x256x1024xi64>, %arg1 : tensor<1x256x1024xi64>) -> tensor<1x256x1024xi1> { 2 | %0 = mhlo.compare EQ, %arg0, %arg1, SIGNED : (tensor<1x256x1024xi64>, tensor<1x256x1024xi64>) -> tensor<1x256x1024xi1> 3 | return %0 : tensor<1x256x1024xi1> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/compare_lt.mlir: -------------------------------------------------------------------------------- 1 | func.func @compare_lt(%arg0 : tensor<1x32x256x256xf32>, %arg1 : tensor<1x32x256x256xf32>) -> tensor<1x32x256x256xi1> { 2 | %0 = mhlo.compare LT, %arg0, %arg1, FLOAT : (tensor<1x32x256x256xf32>, tensor<1x32x256x256xf32>) -> tensor<1x32x256x256xi1> 3 | return %0 : tensor<1x32x256x256xi1> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/concat.mlir: -------------------------------------------------------------------------------- 1 
| func.func @concat(%arg0 : tensor<1x32x256x64xf16>, %arg1 : tensor<1x32x256x64xf16>) -> tensor<1x32x256x128xf16> { 2 | %0 = "mhlo.concatenate"(%arg0, %arg1) {dimension = 3 : i64} : (tensor<1x32x256x64xf16>, tensor<1x32x256x64xf16>) -> tensor<1x32x256x128xf16> 3 | return %0 : tensor<1x32x256x128xf16> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/concat2.mlir: -------------------------------------------------------------------------------- 1 | func.func @concat2(%arg0: tensor, %arg1: tensor) -> (tensor<2xi64>) { 2 | %0 = mhlo.reshape %arg0 : (tensor) -> tensor<1xi64> 3 | %1 = mhlo.reshape %arg1 : (tensor) -> tensor<1xi64> 4 | %2 = "mhlo.concatenate"(%0, %1) {dimension = 0 : i64} : (tensor<1xi64>, tensor<1xi64>) -> tensor<2xi64> 5 | return %2 : tensor<2xi64> 6 | } 7 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/convert_f16_f32.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_f16_f32(%arg0 : tensor<1x256x1024xf16>) -> tensor<1x256x1024xf32> { 2 | %0 = mhlo.convert %arg0 : (tensor<1x256x1024xf16>) -> tensor<1x256x1024xf32> 3 | return %0 : tensor<1x256x1024xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/convert_f32_f16.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_f32_f16(%arg0 : tensor<1x256x1024xf32>) -> tensor<1x256x1024xf16> { 2 | %0 = mhlo.convert %arg0 : (tensor<1x256x1024xf32>) -> tensor<1x256x1024xf16> 3 | return %0 : tensor<1x256x1024xf16> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/divide.mlir: -------------------------------------------------------------------------------- 1 | func.func @divide(%arg0 : 
tensor<1x256x4096xf32>) -> tensor<1x256x4096xf32> { 2 | %cst = mhlo.constant dense<4.096000e+03> : tensor<1x256x4096xf32> 3 | %0 = mhlo.divide %arg0, %cst : tensor<1x256x4096xf32> 4 | return %0 : tensor<1x256x4096xf32> 5 | } 6 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/gather.mlir: -------------------------------------------------------------------------------- 1 | func.func @gather(%arg0 : tensor<256x128xf16>, %arg1 : tensor<1x256xi64>) -> tensor<1x256x128xf16> { 2 | %0 = "mhlo.gather"(%arg0, %arg1) {dimension_numbers = #mhlo.gather, indices_are_sorted = false, slice_sizes = dense<[1, 128]> : tensor<2xi64>} : (tensor<256x128xf16>, tensor<1x256xi64>) -> tensor<1x256x128xf16> 3 | return %0 : tensor<1x256x128xf16> 4 | } 5 | 6 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/gemm_crr_f16.mlir: -------------------------------------------------------------------------------- 1 | func.func @bmm_crr(%arg0 : tensor<1x256x4096xf16>, %arg1 : tensor<256x11008xf16>) -> tensor<4096x11008xf16> { 2 | %0 = mhlo.reshape %arg0 : (tensor<1x256x4096xf16>) -> tensor<256x4096xf16> 3 | %1 = "mhlo.transpose"(%0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<256x4096xf16>) -> tensor<4096x256xf16> 4 | %2 = "mhlo.dot"(%1, %arg1) : (tensor<4096x256xf16>, tensor<256x11008xf16>) -> tensor<4096x11008xf16> 5 | return %2: tensor<4096x11008xf16> 6 | } 7 | 8 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/gemm_rrr_f16.mlir: -------------------------------------------------------------------------------- 1 | func.func @gemm_rrr_f16(%arg0 : tensor<1024x4096xf16>, %arg1 : tensor<4096x4096xf16>) -> tensor<1024x4096xf16> { 2 | %0 = "mhlo.dot"(%arg0, %arg1) : (tensor<1024x4096xf16>, tensor<4096x4096xf16>) -> tensor<1024x4096xf16> 3 | return %0 : tensor<1024x4096xf16> 4 | } 5 | 
-------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/gemm_rrr_f32.mlir: -------------------------------------------------------------------------------- 1 | func.func @gemm_rrr_f32(%arg0 : tensor<4x4xf32>, %arg1 : tensor<4x4xf32>) -> tensor<4x4xf32> { 2 | %0 = "mhlo.dot"(%arg0, %arg1) : (tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4xf32> 3 | return %0 : tensor<4x4xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/insert_slice.mlir: -------------------------------------------------------------------------------- 1 | func.func @insert_slice(%arg0 : tensor<1x32x256x64xf16>, %arg1 : tensor<1x32x256x64xf16>) -> tensor<1x32x256x128xf16> { 2 | %cst = mhlo.constant dense<0.000000e+00> : tensor<1x32x256x128xf16> 3 | %inserted_slice_0 = tensor.insert_slice %arg0 into %cst[0, 0, 0, 64] [1, 32, 256, 64] [1, 1, 1, 1] : tensor<1x32x256x64xf16> into tensor<1x32x256x128xf16> 4 | %inserted_slice_1 = tensor.insert_slice %arg1 into %inserted_slice_0[0, 0, 0, 0] [1, 32, 256, 64] [1, 1, 1, 1] : tensor<1x32x256x64xf16> into tensor<1x32x256x128xf16> 5 | return %inserted_slice_1 : tensor<1x32x256x128xf16> 6 | } 7 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/layernorm.mlir: -------------------------------------------------------------------------------- 1 | func.func @layer_norm(%arg0 : tensor<1x16x4096xf32>, %arg1 : tensor<4096xf32>, %arg2 : tensor<4096xf32>) -> tensor<1x16x4096xf32> { 2 | %0 = "mhlo.custom_call"(%arg0, %arg1, %arg2) {api_version = 1 : i32, backend_config = "", byteir_attrs = {axis = [2], epsilon = 1.000000e-05 : f64}, call_target_name = "byteir.layer_norm", called_computations = [], has_side_effect = false} : (tensor<1x16x4096xf32>, tensor<4096xf32>, tensor<4096xf32>) -> tensor<1x16x4096xf32> 3 | return %0 : tensor<1x16x4096xf32> 4 | } 5 | 
-------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/logistic.mlir: -------------------------------------------------------------------------------- 1 | func.func @logistic(%arg0 : tensor<1x256x1024xf16>) -> tensor<1x256x1024xf16> { 2 | %0 = mhlo.logistic %arg0 : tensor<1x256x1024xf16> 3 | return %0 : tensor<1x256x1024xf16> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/mul_f16.mlir: -------------------------------------------------------------------------------- 1 | func.func @multiply(%arg0 : tensor<1x32x256x128xf16>, %arg1 : tensor<1x32x256x128xf16>) -> tensor<1x32x256x128xf16> { 2 | %0 = mhlo.multiply %arg0, %arg1 : tensor<1x32x256x128xf16> 3 | return %0 : tensor<1x32x256x128xf16> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/mul_f32.mlir: -------------------------------------------------------------------------------- 1 | func.func @multiply_f32(%arg0 : tensor<1x32x256x128xf32>, %arg1 : tensor<1x32x256x128xf32>) -> tensor<1x32x256x128xf32> { 2 | %0 = mhlo.multiply %arg0, %arg1 : tensor<1x32x256x128xf32> 3 | return %0 : tensor<1x32x256x128xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/negate.mlir: -------------------------------------------------------------------------------- 1 | func.func @negate(%arg0 : tensor<1x256xf32>) -> tensor<1x256xf32> { 2 | %0 = mhlo.negate %arg0 : tensor<1x256xf32> 3 | return %0 : tensor<1x256xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/power.mlir: -------------------------------------------------------------------------------- 1 | func.func @power(%arg0 : tensor<1x256x4096xf32>) -> tensor<1x256x4096xf32> { 2 | %cst = mhlo.constant 
dense<3.000000e+00> : tensor<1x256x4096xf32> 3 | %0 = mhlo.power %arg0, %cst : tensor<1x256x4096xf32> 4 | return %0 : tensor<1x256x4096xf32> 5 | } 6 | 7 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/reduce_first_dim.mlir: -------------------------------------------------------------------------------- 1 | func.func @reduce_sum(%arg0 : tensor<256x2304xf32>) -> tensor<2304xf32> { 2 | %cst = mhlo.constant dense<0.000000e+00> : tensor 3 | %0 = mhlo.reduce(%arg0 init: %cst) across dimensions = [0] : (tensor<256x2304xf32>, tensor) -> tensor<2304xf32> 4 | reducer(%arg1: tensor, %arg2: tensor) { 5 | %1 = mhlo.add %arg1, %arg2 : tensor 6 | mhlo.return %1 : tensor 7 | } 8 | return %0 : tensor<2304xf32> 9 | } 10 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/reduce_sum.mlir: -------------------------------------------------------------------------------- 1 | func.func @reduce_sum(%arg0 : tensor<1x32x256x256xf32>) -> tensor<1x32x256xf32> { 2 | %cst = mhlo.constant dense<0.000000e+00> : tensor 3 | %0 = mhlo.reduce(%arg0 init: %cst) across dimensions = [3] : (tensor<1x32x256x256xf32>, tensor) -> tensor<1x32x256xf32> 4 | reducer(%arg1: tensor, %arg2: tensor) { 5 | %1 = mhlo.add %arg1, %arg2 : tensor 6 | mhlo.return %1 : tensor 7 | } 8 | return %0 : tensor<1x32x256xf32> 9 | } 10 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/reduce_sum_2d.mlir: -------------------------------------------------------------------------------- 1 | func.func @reduce(%arg0 : tensor<1x256x1024xf32>) -> tensor<1x256xf32> { 2 | %cst = mhlo.constant dense<0.000000e+00> : tensor 3 | %0 = mhlo.reduce(%arg0 init: %cst) across dimensions = [2] : (tensor<1x256x1024xf32>, tensor) -> tensor<1x256xf32> 4 | reducer(%arg1: tensor, %arg2: tensor) { 5 | %1 = mhlo.add %arg1, %arg2 : tensor 6 | 
mhlo.return %1 : tensor 7 | } 8 | return %0 : tensor<1x256xf32> 9 | } 10 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/reduce_sum_first_2d.mlir: -------------------------------------------------------------------------------- 1 | func.func @reduce(%arg0 : tensor<1x256x1024xf32>) -> tensor<1024xf32> { 2 | %cst = mhlo.constant dense<0.000000e+00> : tensor 3 | %0 = mhlo.reduce(%arg0 init: %cst) across dimensions = [0, 1] : (tensor<1x256x1024xf32>, tensor) -> tensor<1024xf32> 4 | reducer(%arg1: tensor, %arg2: tensor) { 5 | %1 = mhlo.add %arg1, %arg2 : tensor 6 | mhlo.return %1 : tensor 7 | } 8 | return %0 : tensor<1024xf32> 9 | } 10 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/rsqrt.mlir: -------------------------------------------------------------------------------- 1 | func.func @rsqrt(%arg0 : tensor<1x256xf32>) -> tensor<1x256xf32> { 2 | %0 = mhlo.rsqrt %arg0 : tensor<1x256xf32> 3 | return %0 : tensor<1x256xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/scatter.mlir: -------------------------------------------------------------------------------- 1 | 2 | func.func @scatter(%arg0 : tensor<256x1xi64>, %arg1 : tensor<256x4096xf32>) -> tensor<32000x4096xf32> { 3 | %cst = mhlo.constant dense<0.000000e+00> : tensor<32000x4096xf32> 4 | %0 = "mhlo.scatter"(%cst, %arg0, %arg1) ({ 5 | ^bb0(%arg66: tensor, %arg67: tensor): 6 | %395 = mhlo.add %arg66, %arg67 : tensor 7 | mhlo.return %395 : tensor 8 | }) {indices_are_sorted = false, scatter_dimension_numbers = #mhlo.scatter, unique_indices = false} : (tensor<32000x4096xf32>, tensor<256x1xi64>, tensor<256x4096xf32>) -> tensor<32000x4096xf32> 9 | return %0 : tensor<32000x4096xf32> 10 | } 11 | -------------------------------------------------------------------------------- 
/tests/numerical_test/mlir_tests/ops/scatter_insert_slice.mlir: -------------------------------------------------------------------------------- 1 | func.func @forward(%arg0: tensor<6x8x5xf32>, %arg1: tensor<6x1x5xf32>) -> tensor<6x8x5xf32> { 2 | %0 = mhlo.constant dense<0> : tensor<1x1xi64> 3 | %1 = "mhlo.scatter"(%arg0, %0, %arg1) <{indices_are_sorted = false, scatter_dimension_numbers = #mhlo.scatter, unique_indices = false}> ({ 4 | ^bb0(%arg2: tensor, %arg3: tensor): 5 | mhlo.return %arg3 : tensor 6 | }) : (tensor<6x8x5xf32>, tensor<1x1xi64>, tensor<6x1x5xf32>) -> tensor<6x8x5xf32> 7 | return %1 : tensor<6x8x5xf32> 8 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/select.mlir: -------------------------------------------------------------------------------- 1 | func.func @select(%arg0 : tensor<1x32x256x256xi1>, %arg1 : tensor<1x32x256x256xf32>, %arg2 : tensor<1x32x256x256xf32>) -> tensor<1x32x256x256xf32> { 2 | %0 = mhlo.select %arg0, %arg1, %arg2 : tensor<1x32x256x256xi1>, tensor<1x32x256x256xf32> 3 | return %0 : tensor<1x32x256x256xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/slice.mlir: -------------------------------------------------------------------------------- 1 | func.func @slice(%arg0 : tensor<1x1x1024x1024xi1>) -> tensor<1x1x256x1024xi1> { 2 | %0 = "mhlo.slice"(%arg0) {limit_indices = dense<[1, 1, 256, 1024]> : tensor<4xi64>, start_indices = dense<0> : tensor<4xi64>, strides = dense<1> : tensor<4xi64>} : (tensor<1x1x1024x1024xi1>) -> tensor<1x1x256x1024xi1> 3 | return %0 : tensor<1x1x256x1024xi1> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/softmax.mlir: -------------------------------------------------------------------------------- 1 | func.func @softmax(%arg0 : tensor<8x12x256x256xf32>) -> 
tensor<8x12x256x256xf32> { 2 | %0 = mhlo.custom_call @byteir.softmax(%arg0) {backend_config = "", byteir_attrs = {axis = 3 : i64}} : (tensor<8x12x256x256xf32>) -> tensor<8x12x256x256xf32> 3 | return %0 : tensor<8x12x256x256xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/transpose0312.mlir: -------------------------------------------------------------------------------- 1 | func.func @transpose(%arg0 : tensor<1x12x64x256xf32>) -> tensor<1x256x12x64xf32> { 2 | %0 = "mhlo.transpose"(%arg0) {permutation = dense<[0, 3, 1, 2]> : tensor<4xi64>} : (tensor<1x12x64x256xf32>) -> tensor<1x256x12x64xf32> 3 | return %0 : tensor<1x256x12x64xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/transpose102.mlir: -------------------------------------------------------------------------------- 1 | func.func @transpose102(%arg0 : tensor<12x64x256xf32>) -> tensor<64x12x256xf32> { 2 | %0 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<12x64x256xf32>) -> tensor<64x12x256xf32> 3 | return %0 : tensor<64x12x256xf32> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/transpose1023.mlir: -------------------------------------------------------------------------------- 1 | func.func @transpose1023(%arg0 : tensor<12x64x256x128xf32>) -> tensor<64x12x256x128xf32> { 2 | %0 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0, 2, 3]> : tensor<4xi64>} : (tensor<12x64x256x128xf32>) -> tensor<64x12x256x128xf32> 3 | return %0 : tensor<64x12x256x128xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/transpose120.mlir: -------------------------------------------------------------------------------- 1 | func.func @transpose120(%arg0 : 
tensor<12x64x256xf32>) -> tensor<64x256x12xf32> { 2 | %0 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 2, 0]> : tensor<3xi64>} : (tensor<12x64x256xf32>) -> tensor<64x256x12xf32> 3 | return %0 : tensor<64x256x12xf32> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/transpose1203.mlir: -------------------------------------------------------------------------------- 1 | func.func @transpose1203(%arg0 : tensor<12x3x7x6xf16>) -> tensor<3x7x12x6xf16> { 2 | %0 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 2, 0, 3]> : tensor<4xi64>} : (tensor<12x3x7x6xf16>) -> tensor<3x7x12x6xf16> 3 | return %0 : tensor<3x7x12x6xf16> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/transpose2013.mlir: -------------------------------------------------------------------------------- 1 | func.func @transpose2013(%arg0 : tensor<12x64x256x128xf32>) -> tensor<256x12x64x128xf32> { 2 | %0 = "mhlo.transpose"(%arg0) {permutation = dense<[2, 0, 1, 3]> : tensor<4xi64>} : (tensor<12x64x256x128xf32>) -> tensor<256x12x64x128xf32> 3 | return %0 : tensor<256x12x64x128xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/transpose2d.mlir: -------------------------------------------------------------------------------- 1 | func.func @transpose2d(%arg0 : tensor<4096x13696xf16>) -> tensor<13696x4096xf16> { 2 | %0 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<4096x13696xf16>) -> tensor<13696x4096xf16> 3 | return %0 : tensor<13696x4096xf16> 4 | } --------------------------------------------------------------------------------