├── .clang-format ├── .github ├── ISSUE_TEMPLATE │ ├── bug.md │ ├── feature-request.md │ └── question.md └── workflows │ ├── compiler-ci.yaml │ ├── daily_ci.yaml │ ├── e2e_test.yaml │ ├── format-check.yaml │ ├── onnx-frontend-ci.yaml │ ├── runtime-ci.yaml │ ├── tf-frontend-ci.yaml │ └── torch-frontend-ci.yaml ├── .gitignore ├── .gitmodules ├── CITATION.cff ├── CONTRIBUTING.md ├── LICENSE ├── NOTICE ├── README-zh_cn.md ├── README.md ├── compiler ├── .gitignore ├── README.md ├── cmake │ ├── CMakeLists.txt │ ├── MLIR.cmake │ └── mhlo.cmake ├── dialects │ ├── CMakeLists.txt │ ├── include │ │ ├── CMakeLists.txt │ │ └── byteir │ │ │ ├── CMakeLists.txt │ │ │ └── Dialect │ │ │ ├── Ace │ │ │ ├── AceBase.td │ │ │ ├── AceDialect.h │ │ │ ├── AceOps.td │ │ │ └── CMakeLists.txt │ │ │ ├── CMakeLists.txt │ │ │ └── Ccl │ │ │ ├── CMakeLists.txt │ │ │ └── IR │ │ │ ├── CMakeLists.txt │ │ │ ├── CclBase.td │ │ │ ├── CclOpInterface.td │ │ │ ├── CclOps.h │ │ │ └── CclOps.td │ └── lib │ │ ├── CMakeLists.txt │ │ └── Dialect │ │ ├── Ace │ │ ├── CMakeLists.txt │ │ └── IR │ │ │ └── AceDialect.cpp │ │ ├── CMakeLists.txt │ │ └── Ccl │ │ ├── CMakeLists.txt │ │ └── IR │ │ ├── CMakeLists.txt │ │ └── CclOps.cpp ├── doc │ ├── attention.md │ ├── byteir_hlo_custom_call.md │ ├── codegen.md │ ├── gpu.md │ ├── linalg.md │ ├── passes.md │ └── rng.md ├── include │ ├── CMakeLists.txt │ ├── byteir-c │ │ ├── Dialects.h │ │ ├── PDLValue.h │ │ ├── Passes.h │ │ └── Translation.h │ └── byteir │ │ ├── Analysis │ │ ├── Alias.h │ │ ├── DimFlag.h │ │ ├── Liveness.h │ │ ├── OpDependence.h │ │ ├── ShapeAnalysis.h │ │ ├── SideEffect.h │ │ ├── SymbolicShape.h │ │ └── UseRange.h │ │ ├── CMakeLists.txt │ │ ├── Conversion │ │ ├── CMakeLists.txt │ │ ├── Common │ │ │ └── FunctionSupport.h │ │ ├── FuncToByre │ │ │ └── FuncToByre.h │ │ ├── GPUToNVVM │ │ │ └── GPUToNVVM.h │ │ ├── HloToByreTensor │ │ │ ├── HloToByreCustom.h │ │ │ └── HloToByreTensor.h │ │ ├── HloToCat │ │ │ ├── ConvertHloToCat.h │ │ │ ├── FuseHloToCat.h │ │ │ └── 
HloToCat.h │ │ ├── HloToTensor │ │ │ └── ConvertHloToTensor.h │ │ ├── LcclToByre │ │ │ └── LcclToByre.h │ │ ├── MemrefToByre │ │ │ └── MemrefToByre.h │ │ ├── Passes.h │ │ ├── Passes.td │ │ ├── ToAIT │ │ │ └── ToAIT.h │ │ ├── ToAce │ │ │ └── MhloToAce.h │ │ ├── ToByre │ │ │ └── ToByre.h │ │ ├── ToGPU │ │ │ ├── ToGPU.h │ │ │ └── Utils.h │ │ ├── ToHlo │ │ │ └── ArithToMhlo.h │ │ ├── ToLLVM │ │ │ └── ToLLVM.h │ │ ├── ToLinalg │ │ │ └── ToLinalg.h │ │ └── ToPTX │ │ │ └── ToPTX.h │ │ ├── Dialect │ │ ├── Ace │ │ │ ├── CMakeLists.txt │ │ │ ├── Passes.h │ │ │ ├── Passes.td │ │ │ └── Transforms │ │ │ │ └── BufferizableOpInterfaceImpl.h │ │ ├── Affine │ │ │ ├── CMakeLists.txt │ │ │ ├── Passes.h │ │ │ ├── Passes.td │ │ │ └── Transforms │ │ │ │ ├── AffineLoopFusionEx.h │ │ │ │ ├── InsertTrivialAffineLoop.h │ │ │ │ └── RewriteAffineToMemref.h │ │ ├── Byre │ │ │ ├── ByreBase.td │ │ │ ├── ByreDialect.h │ │ │ ├── ByreOps.td │ │ │ ├── CMakeLists.txt │ │ │ ├── Common.h │ │ │ ├── Passes.h │ │ │ ├── Passes.td │ │ │ ├── Serialization.h │ │ │ ├── Serialization │ │ │ │ ├── ByreSerial.td │ │ │ │ ├── ByreSerialOps.h │ │ │ │ ├── CMakeLists.txt │ │ │ │ └── Versioning.h │ │ │ └── Transforms │ │ │ │ ├── BufferizableOpInterfaceImpl.h │ │ │ │ └── Serial.h │ │ ├── CMakeLists.txt │ │ ├── Cat │ │ │ ├── CMakeLists.txt │ │ │ └── IR │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── CatBase.td │ │ │ │ ├── CatDialect.h │ │ │ │ └── CatOps.td │ │ ├── Ccl │ │ │ ├── CMakeLists.txt │ │ │ ├── Passes.h │ │ │ ├── Passes.td │ │ │ ├── TransformOps │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── CclTransformOps.h │ │ │ │ └── CclTransformOps.td │ │ │ └── Transforms │ │ │ │ ├── CclBufferizeOpInterfaceImpl.h │ │ │ │ └── CclMoveDown.h │ │ ├── GPU │ │ │ ├── CMakeLists.txt │ │ │ ├── Passes.h │ │ │ ├── Passes.td │ │ │ ├── TransformOps │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── GPUExtTransformOps.h │ │ │ │ ├── GPUExtTransformOps.td │ │ │ │ └── Utils.h │ │ │ └── Transforms │ │ │ │ ├── GPUBlockSwizzle.h │ │ │ │ ├── GPUDistributeSharedMemoryCopy.h │ 
│ │ │ ├── GPUDistributeToWarp.h │ │ │ │ ├── GPUPackSharedMemoryAlloc.h │ │ │ │ ├── GPUTensorCoreVectorization.h │ │ │ │ ├── OptimizeVectorTransfer.h │ │ │ │ ├── RemoveTrivialLoops.h │ │ │ │ ├── Transforms.h │ │ │ │ └── Utils.h │ │ ├── Lace │ │ │ ├── CMakeLists.txt │ │ │ ├── LaceBase.td │ │ │ ├── LaceDialect.h │ │ │ └── LaceOps.td │ │ ├── Lccl │ │ │ ├── CMakeLists.txt │ │ │ ├── LcclBase.td │ │ │ ├── LcclOps.h │ │ │ └── LcclOps.td │ │ ├── Linalg │ │ │ ├── CMakeLists.txt │ │ │ ├── IR │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── LinalgExtBase.td │ │ │ │ ├── LinalgExtInterfaces.h │ │ │ │ ├── LinalgExtInterfaces.td │ │ │ │ ├── LinalgExtOps.h │ │ │ │ └── LinalgExtOps.td │ │ │ ├── Passes.h │ │ │ ├── Passes.td │ │ │ ├── TransformOps │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── LinalgExtTransformOps.h │ │ │ │ └── LinalgExtTransformOps.td │ │ │ ├── Transforms │ │ │ │ ├── BufferizableOpInterfaceImpl.h │ │ │ │ ├── Bufferize.h │ │ │ │ ├── CanonicalizeExt.h │ │ │ │ ├── FuseElementwise.h │ │ │ │ ├── HoistingExt.h │ │ │ │ ├── LinalgCollapseLoops.h │ │ │ │ ├── LinalgDataPlace.h │ │ │ │ ├── LinalgExtToLoops.h │ │ │ │ ├── LinalgPrefetch.h │ │ │ │ ├── LinalgPromotion.h │ │ │ │ ├── Tiling.h │ │ │ │ ├── TilingUtils.h │ │ │ │ └── Transforms.h │ │ │ └── Util │ │ │ │ └── Util.h │ │ ├── MemRef │ │ │ ├── CMakeLists.txt │ │ │ ├── Passes.h │ │ │ ├── Passes.td │ │ │ ├── Transforms │ │ │ │ ├── ApplyMemRefAffineLayout.h │ │ │ │ ├── ExtractAddressComputation.h │ │ │ │ ├── RemoveCopy.h │ │ │ │ ├── SimplifyLinearizedIndex.h │ │ │ │ └── SimplifyView.h │ │ │ └── Utils │ │ │ │ ├── Layout.h │ │ │ │ ├── MemEffect.h │ │ │ │ └── Ops.h │ │ ├── SCF │ │ │ ├── CMakeLists.txt │ │ │ ├── Passes.h │ │ │ ├── Passes.td │ │ │ ├── Transforms │ │ │ │ ├── ForallCollapsing.h │ │ │ │ ├── FuseNestedForall.h │ │ │ │ ├── InsertTrivialSCFLoop.h │ │ │ │ ├── RemoveSingleIterationLoop.h │ │ │ │ └── TilingInterfaceToSCFFor.h │ │ │ └── Util │ │ │ │ └── Util.h │ │ ├── Shape │ │ │ ├── CMakeLists.txt │ │ │ ├── IR │ │ │ │ ├── CMakeLists.txt │ │ │ 
│ ├── ShapeExtBase.td │ │ │ │ ├── ShapeExtOps.h │ │ │ │ └── ShapeExtOps.td │ │ │ ├── Passes.h │ │ │ ├── Passes.td │ │ │ └── Transforms │ │ │ │ ├── InsertInputShapeConstraint.h │ │ │ │ ├── InsertTieShape.h │ │ │ │ ├── ResolveShapeConstraint.h │ │ │ │ └── SetAssumingAlwaysTrue.h │ │ ├── Tensor │ │ │ ├── CMakeLists.txt │ │ │ ├── IR │ │ │ │ └── TilingInterfaceImpl.h │ │ │ ├── Passes.h │ │ │ ├── Passes.td │ │ │ └── Transforms │ │ │ │ ├── CanonicalizeExt.h │ │ │ │ ├── ExtractSliceSpecialization.h │ │ │ │ └── TensorPadSpecialization.h │ │ ├── Transform │ │ │ ├── CMakeLists.txt │ │ │ ├── IR │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── TransformExtOps.h │ │ │ │ └── TransformExtOps.td │ │ │ ├── Passes.h │ │ │ ├── Passes.td │ │ │ └── Transforms │ │ │ │ ├── TransformDialectInterpreter.h │ │ │ │ └── TransformInsertion.h │ │ ├── Vector │ │ │ ├── CMakeLists.txt │ │ │ └── Transforms │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── CanonicalizeExt.h │ │ │ │ ├── MoveForallRegionIntoWarpOp.h │ │ │ │ ├── Passes.h │ │ │ │ ├── Passes.td │ │ │ │ └── VectorWarpDistribute.h │ │ └── mhlo │ │ │ ├── Analysis │ │ │ ├── DimFromBroadcast.h │ │ │ └── ShapeAnalysis.h │ │ │ ├── CMakeLists.txt │ │ │ ├── DynamicShapeOpRegister │ │ │ └── Register.h │ │ │ ├── Passes.h │ │ │ ├── Passes.td │ │ │ ├── Transforms │ │ │ ├── BoundedShapeInference.h │ │ │ ├── CanonicalizeExt.h │ │ │ ├── ClusterConstraint.h │ │ │ ├── ConvertFuncToCustomCall.h │ │ │ ├── ConvertInsertion.h │ │ │ ├── ConvertOpToCustomCall.h │ │ │ ├── DTypeConversion.h │ │ │ ├── DecomposeMhloCustomCallOps.h │ │ │ ├── DynamicShapeClustering.h │ │ │ ├── FuncArgRearrangement.h │ │ │ ├── FuseBMMDimension.h │ │ │ ├── FusionOutlining.h │ │ │ ├── GenericFusionCommon.h │ │ │ ├── HloAggressiveFusion.h │ │ │ ├── HloFolder.h │ │ │ ├── HloFuser.h │ │ │ ├── HloMove.h │ │ │ ├── HloSimplify.h │ │ │ ├── InsertShapeConstraint.h │ │ │ ├── LayoutTransformation.h │ │ │ ├── MatmulLayoutTransform.h │ │ │ ├── MoveCommon.h │ │ │ ├── RewriteWithConstraint.h │ │ │ ├── ShapeReification.h │ 
│ │ ├── StaticShapeInference.h │ │ │ └── UnfuseBatchNorm.h │ │ │ └── Util │ │ │ ├── CustomCallUtil.h │ │ │ ├── FusionUtil.h │ │ │ ├── ShapeInferUtil.h │ │ │ └── Util.h │ │ ├── Pipelines │ │ ├── AffineOpt.h │ │ ├── AllOpt.h │ │ ├── BufferizeOpt.h │ │ ├── ByreHost.h │ │ ├── ByreOpt.h │ │ ├── ByreTensorOpt.h │ │ ├── CatFusionOpt.h │ │ ├── CatPreprocess.h │ │ ├── Common │ │ │ └── Utils.h │ │ ├── GPU │ │ │ ├── ElementwiseCodegen.h │ │ │ ├── GPUOpt.h │ │ │ ├── LinalgMemrefGPU.h │ │ │ ├── MappingForall.h │ │ │ ├── NVVMCodegen.h │ │ │ └── ReductionCodegen.h │ │ ├── HloFusionOpt.h │ │ ├── HloGraphOpt.h │ │ ├── Host │ │ │ ├── Codegen.h │ │ │ ├── HostOpt.h │ │ │ └── ToLLVM.h │ │ ├── InitAllPipelines.h │ │ ├── LinalgMemrefOpt.h │ │ ├── LinalgTensorOpt.h │ │ ├── SCFOpt.h │ │ └── ShapeOpt.h │ │ ├── Stat │ │ ├── AllocCnt │ │ │ └── AllocCnt.h │ │ ├── Common │ │ │ └── Reg.h │ │ ├── InitAllStats.h │ │ └── OpCnt │ │ │ └── OpCnt.h │ │ ├── Target │ │ ├── CUDA │ │ │ ├── CUDAEmitter.h │ │ │ └── ToCUDA.h │ │ ├── Common │ │ │ ├── Common.h │ │ │ └── EmitUtil.h │ │ ├── Cpp │ │ │ ├── CppEmitter.h │ │ │ └── ToCpp.h │ │ ├── LLVM │ │ │ └── ToLLVMBC.h │ │ └── PTX │ │ │ ├── Passes.h │ │ │ └── ToPTX.h │ │ ├── Transforms │ │ ├── AnchoredPipeline.h │ │ ├── ApplyPDLPatterns.h │ │ ├── Bufferize.h │ │ ├── CMAE.h │ │ ├── CMakeLists.txt │ │ ├── CanonicalizeExt.h │ │ ├── CollectFunc.h │ │ ├── CondCanonicalize.h │ │ ├── FuncTag.h │ │ ├── GenericDeviceConfig.h │ │ ├── GraphClusteringAlgo.h │ │ ├── GraphClusteringByDevice.h │ │ ├── InsertUniqueId.h │ │ ├── LoopTag.h │ │ ├── LoopUnroll.h │ │ ├── MemoryPlanning.h │ │ ├── ModuleTag.h │ │ ├── Passes.h │ │ ├── Passes.td │ │ ├── RemoveFuncBody.h │ │ ├── RewriteOpToStdCall.h │ │ ├── SetArgShape.h │ │ ├── SetSpace.h │ │ ├── ShapeFuncOutlining.h │ │ ├── TryCatchModulePipeline.h │ │ └── Utils.h │ │ └── Utils │ │ ├── AffineUtils.h │ │ ├── AttrUtils.h │ │ ├── FuncUtils.h │ │ ├── GraphUtils.h │ │ ├── HashUtils.h │ │ ├── Hoist.h │ │ ├── IRRewrite.h │ │ ├── LoopUtils.h │ │ 
├── MemUtils.h │ │ ├── ModuleUtils.h │ │ ├── OpInterfaceUtils.h │ │ ├── OptionUtils.h │ │ ├── PatternMatch.h │ │ ├── PipelineUtils.h │ │ ├── TileUtils.h │ │ ├── TypeUtils.h │ │ └── Utils.h ├── lib │ ├── Analysis │ │ ├── CMakeLists.txt │ │ ├── DimFlag.cpp │ │ ├── Liveness.cpp │ │ ├── OpDependence.cpp │ │ ├── ShapeAnalysis.cpp │ │ ├── SideEffect.cpp │ │ ├── SymbolicShape.cpp │ │ └── UseRange.cpp │ ├── CAPI │ │ ├── CMakeLists.txt │ │ ├── Dialects.cpp │ │ ├── PDLValue.cpp │ │ ├── Passes.cpp │ │ └── Translation.cpp │ ├── CMakeLists.txt │ ├── Conversion │ │ ├── CMakeLists.txt │ │ ├── Common │ │ │ ├── CMakeLists.txt │ │ │ └── FunctionSupport.cpp │ │ ├── FuncToByre │ │ │ ├── CMakeLists.txt │ │ │ └── FuncToByre.cpp │ │ ├── GPUToNVVM │ │ │ ├── CMakeLists.txt │ │ │ └── GPUToNVVM.cpp │ │ ├── HloToByreTensor │ │ │ ├── CMakeLists.txt │ │ │ ├── HloToByreCustom.cpp │ │ │ └── HloToByreTensor.cpp │ │ ├── HloToCat │ │ │ ├── CMakeLists.txt │ │ │ ├── ConvertHloToCat.cpp │ │ │ ├── FuseHloToCat.cpp │ │ │ ├── FuseHloToCatPattern.td │ │ │ ├── HloToCat.cpp │ │ │ └── Utils.h │ │ ├── HloToTensor │ │ │ ├── CMakeLists.txt │ │ │ └── ConvertHloToTensor.cpp │ │ ├── LcclToByre │ │ │ ├── CMakeLists.txt │ │ │ └── LcclToByre.cpp │ │ ├── MemrefToByre │ │ │ ├── CMakeLists.txt │ │ │ └── MemrefToByre.cpp │ │ ├── PassDetail.h │ │ ├── ToAIT │ │ │ ├── CMakeLists.txt │ │ │ └── GenAITConfig.cpp │ │ ├── ToAce │ │ │ ├── CMakeLists.txt │ │ │ ├── MhloToAce.cpp │ │ │ └── MhloToAceActivationPattern.td │ │ ├── ToByre │ │ │ ├── CMakeLists.txt │ │ │ └── ToByre.cpp │ │ ├── ToGPU │ │ │ ├── CMakeLists.txt │ │ │ ├── CoalescedForToGPU.cpp │ │ │ ├── FuncToGPU.cpp │ │ │ └── Utils.cpp │ │ ├── ToHlo │ │ │ ├── ArithToMhlo.cpp │ │ │ ├── ArithToMhloPattern.td │ │ │ └── CMakeLists.txt │ │ ├── ToLLVM │ │ │ ├── CMakeLists.txt │ │ │ ├── CollectFuncToLLVM.cpp │ │ │ └── GenLLVMConfig.cpp │ │ ├── ToLinalg │ │ │ ├── CMakeLists.txt │ │ │ ├── HloToLinalg.cpp │ │ │ ├── LinalgExtToLinalg.cpp │ │ │ ├── MemrefCopyToLinalg.cpp │ │ │ ├── 
TensorToLinalg.cpp │ │ │ └── UnrealizedCastToLinalg.cpp │ │ └── ToPTX │ │ │ ├── CMakeLists.txt │ │ │ ├── CollectGPUKernel.cpp │ │ │ └── GenPTXConfig.cpp │ ├── Dialect │ │ ├── Ace │ │ │ ├── CMakeLists.txt │ │ │ └── Transforms │ │ │ │ ├── BufferizableOpInterfaceImpl.cpp │ │ │ │ ├── Bufferize.cpp │ │ │ │ └── PassDetail.h │ │ ├── Affine │ │ │ ├── CMakeLists.txt │ │ │ └── Transforms │ │ │ │ ├── AffineLoopFusionEx.cpp │ │ │ │ ├── InsertTrivialAffineLoop.cpp │ │ │ │ ├── PassDetail.h │ │ │ │ └── RewriteAffineToMemref.cpp │ │ ├── Byre │ │ │ ├── CMakeLists.txt │ │ │ ├── IR │ │ │ │ ├── ByreDialect.cpp │ │ │ │ ├── Common.cpp │ │ │ │ ├── Serialization.cpp │ │ │ │ └── Serialization │ │ │ │ │ ├── ByreSerialOps.cpp │ │ │ │ │ ├── Bytecode.cpp │ │ │ │ │ ├── Bytecode.h │ │ │ │ │ └── Versioning.cpp │ │ │ └── Transforms │ │ │ │ ├── BufferizableOpInterfaceImpl.cpp │ │ │ │ ├── PassDetail.h │ │ │ │ └── Serial.cpp │ │ ├── CMakeLists.txt │ │ ├── Cat │ │ │ ├── CMakeLists.txt │ │ │ └── IR │ │ │ │ ├── CMakeLists.txt │ │ │ │ └── CatDialect.cpp │ │ ├── Ccl │ │ │ ├── CMakeLists.txt │ │ │ ├── TransformOps │ │ │ │ ├── CMakeLists.txt │ │ │ │ └── CclTransformOps.cpp │ │ │ └── Transforms │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── CclBufferizeOpInterfaceImpl.cpp │ │ │ │ ├── CclMoveDown.cpp │ │ │ │ └── PassDetail.h │ │ ├── GPU │ │ │ ├── CMakeLists.txt │ │ │ ├── TransformOps │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── GPUExtTransformOps.cpp │ │ │ │ └── Utils.cpp │ │ │ └── Transforms │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── GPUBlockSwizzle.cpp │ │ │ │ ├── GPUDistributeSharedMemoryCopy.cpp │ │ │ │ ├── GPUDistributeToWarp.cpp │ │ │ │ ├── GPUPackSharedMemoryAlloc.cpp │ │ │ │ ├── GPUTensorCoreVectorization.cpp │ │ │ │ ├── OptimizeVectorTransfer.cpp │ │ │ │ ├── PassDetail.h │ │ │ │ ├── RemoveTrivialLoops.cpp │ │ │ │ ├── ShmAllocaToWorkgroupArg.cpp │ │ │ │ └── Utils.cpp │ │ ├── Lace │ │ │ ├── CMakeLists.txt │ │ │ └── IR │ │ │ │ └── LaceDialect.cpp │ │ ├── Lccl │ │ │ ├── CMakeLists.txt │ │ │ └── IR │ │ │ │ ├── 
CMakeLists.txt │ │ │ │ └── LcclOps.cpp │ │ ├── Linalg │ │ │ ├── CMakeLists.txt │ │ │ ├── IR │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── LinalgExtInterfaces.cpp │ │ │ │ └── LinalgExtOps.cpp │ │ │ ├── TransformOps │ │ │ │ ├── CMakeLists.txt │ │ │ │ └── LinalgExtTransformOps.cpp │ │ │ ├── Transforms │ │ │ │ ├── BufferizableOpInterfaceImpl.cpp │ │ │ │ ├── Bufferize.cpp │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── CanonicalizeExt.cpp │ │ │ │ ├── FuseElementwise.cpp │ │ │ │ ├── HoistingExt.cpp │ │ │ │ ├── LinalgCollapseLoops.cpp │ │ │ │ ├── LinalgDataPlace.cpp │ │ │ │ ├── LinalgExtToLoops.cpp │ │ │ │ ├── LinalgGeneralizationExt.cpp │ │ │ │ ├── LinalgPrefetch.cpp │ │ │ │ ├── LinalgPromotion.cpp │ │ │ │ ├── PassDetail.h │ │ │ │ ├── ScopeTiling.cpp │ │ │ │ ├── Tiling.cpp │ │ │ │ ├── TilingUtils.cpp │ │ │ │ └── Transforms.cpp │ │ │ └── Util │ │ │ │ ├── CMakeLists.txt │ │ │ │ └── Util.cpp │ │ ├── MemRef │ │ │ ├── CMakeLists.txt │ │ │ ├── Transforms │ │ │ │ ├── ApplyMemRefAffineLayout.cpp │ │ │ │ ├── ExtractAddressComputation.cpp │ │ │ │ ├── PassDetail.h │ │ │ │ ├── RemoveCopy.cpp │ │ │ │ ├── SimplifyLinearizedIndex.cpp │ │ │ │ └── SimplifyView.cpp │ │ │ └── Utils │ │ │ │ ├── Layout.cpp │ │ │ │ ├── MemEffect.cpp │ │ │ │ └── Ops.cpp │ │ ├── SCF │ │ │ ├── CMakeLists.txt │ │ │ ├── Transforms │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── ForallCollapsing.cpp │ │ │ │ ├── FuseNestedForall.cpp │ │ │ │ ├── InsertTrivialSCFLoop.cpp │ │ │ │ ├── PassDetail.h │ │ │ │ ├── RemoveSingleIterationLoop.cpp │ │ │ │ └── TilingInterfaceToSCFFor.cpp │ │ │ └── Util │ │ │ │ ├── CMakeLists.txt │ │ │ │ └── Util.cpp │ │ ├── Shape │ │ │ ├── CMakeLists.txt │ │ │ ├── IR │ │ │ │ └── ShapeExtOps.cpp │ │ │ └── Transforms │ │ │ │ ├── InsertInputShapeConstraint.cpp │ │ │ │ ├── InsertTieShape.cpp │ │ │ │ ├── PassDetail.h │ │ │ │ ├── ResolveShapeConstraint.cpp │ │ │ │ └── SetAssumingAlwaysTrue.cpp │ │ ├── Tensor │ │ │ ├── CMakeLists.txt │ │ │ ├── IR │ │ │ │ ├── CMakeLists.txt │ │ │ │ └── TilingInterfaceImpl.cpp │ │ │ └── 
Transforms │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── CanonicalizeExt.cpp │ │ │ │ ├── ExtractSliceSpecialization.cpp │ │ │ │ ├── PassDetail.h │ │ │ │ └── TensorPadSpecialization.cpp │ │ ├── Transform │ │ │ ├── CMakeLists.txt │ │ │ ├── IR │ │ │ │ ├── CMakeLists.txt │ │ │ │ └── TransformExtOps.cpp │ │ │ └── Transforms │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── PassDetail.h │ │ │ │ ├── TransformDialectInterpreter.cpp │ │ │ │ └── TransformInsertion.cpp │ │ ├── Vector │ │ │ ├── CMakeLists.txt │ │ │ └── Transforms │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── CanonicalizeExt.cpp │ │ │ │ ├── MoveForallRegionIntoWarpOp.cpp │ │ │ │ ├── PassDetail.h │ │ │ │ ├── VectorLowerings.cpp │ │ │ │ └── VectorWarpDistribute.cpp │ │ └── mhlo │ │ │ ├── Analysis │ │ │ ├── DimFromBroadcast.cpp │ │ │ └── ShapeAnalysis.cpp │ │ │ ├── CMakeLists.txt │ │ │ ├── DynamicShapeOpRegister │ │ │ ├── AddN.cpp │ │ │ ├── BatchMatMul.cpp │ │ │ ├── Concatenate.cpp │ │ │ ├── Convolution.cpp │ │ │ ├── DotLike.cpp │ │ │ ├── DynamicBroadcastInDim.cpp │ │ │ ├── DynamicPartition.cpp │ │ │ ├── DynamicStitchLike.cpp │ │ │ ├── Einsum.cpp │ │ │ ├── GeLU.cpp │ │ │ ├── LayerNorm.cpp │ │ │ ├── NonZero.cpp │ │ │ ├── OneHot.cpp │ │ │ ├── RealDynamicSlice.cpp │ │ │ ├── Reduce.cpp │ │ │ ├── Repeat.cpp │ │ │ ├── ReshapeLike.cpp │ │ │ ├── ScatterNd.cpp │ │ │ ├── Softmax.cpp │ │ │ ├── StridedSlice.cpp │ │ │ └── TorchIndexSelect.cpp │ │ │ ├── Transforms │ │ │ ├── BoundedShapeInference.cpp │ │ │ ├── CanonicalizeExt.cpp │ │ │ ├── ClusterConstraint.cpp │ │ │ ├── ConvBackwardFusion.cpp │ │ │ ├── ConvForwardFusion.cpp │ │ │ ├── ConvertFuncToCustomCall.cpp │ │ │ ├── ConvertInsertion.cpp │ │ │ ├── ConvertOpToCustomCall.cpp │ │ │ ├── DTypeConversion.cpp │ │ │ ├── DecomposeMhloCustomCallOps.cpp │ │ │ ├── DynamicShapeClustering.cpp │ │ │ ├── FuncArgRearrangement.cpp │ │ │ ├── FuseBMMDimension.cpp │ │ │ ├── FuseTransposeIntoDotGeneral.cpp │ │ │ ├── FusionOutlining.cpp │ │ │ ├── GenericFusion.cpp │ │ │ ├── HloFolder.cpp │ │ │ ├── HloMoveDown.cpp │ │ │ 
├── HloMoveUp.cpp │ │ │ ├── HloSimplify.cpp │ │ │ ├── IOConvertFusion.cpp │ │ │ ├── InsertShapeConstraint.cpp │ │ │ ├── LayoutTransformation.cpp │ │ │ ├── MatmulLayoutTransform.cpp │ │ │ ├── PassDetail.h │ │ │ ├── ReduceWindowFusion.cpp │ │ │ ├── RewriteWithConstraint.cpp │ │ │ ├── ShapeReification.cpp │ │ │ ├── StaticShapeInference.cpp │ │ │ ├── TrivialFusion.cpp │ │ │ └── UnfuseBatchNorm.cpp │ │ │ └── Util │ │ │ ├── FusionUtil.cpp │ │ │ ├── ShapeInferUtil.cpp │ │ │ └── Util.cpp │ ├── Pipelines │ │ ├── AffineOpt.cpp │ │ ├── AllOpt.cpp │ │ ├── BufferizeOpt.cpp │ │ ├── ByreHost.cpp │ │ ├── ByreOpt.cpp │ │ ├── ByreTensorOpt.cpp │ │ ├── CMakeLists.txt │ │ ├── CatFusionOpt.cpp │ │ ├── CatPreprocess.cpp │ │ ├── Common │ │ │ ├── CMakeLists.txt │ │ │ └── Utils.cpp │ │ ├── GPU │ │ │ ├── CMakeLists.txt │ │ │ ├── ElementwiseCodegen.cpp │ │ │ ├── GPUOpt.cpp │ │ │ ├── LinalgMemrefGPU.cpp │ │ │ ├── MappingForall.cpp │ │ │ ├── NVVMCodegen.cpp │ │ │ └── ReductionCodegen.cpp │ │ ├── HloFusionOpt.cpp │ │ ├── HloGraphOpt.cpp │ │ ├── Host │ │ │ ├── CMakeLists.txt │ │ │ ├── Codegen.cpp │ │ │ ├── HostOpt.cpp │ │ │ └── ToLLVM.cpp │ │ ├── LinalgMemrefOpt.cpp │ │ ├── LinalgTensorOpt.cpp │ │ ├── SCFOpt.cpp │ │ └── ShapeOpt.cpp │ ├── Stat │ │ ├── AllocCnt │ │ │ ├── AllocCnt.cpp │ │ │ └── CMakeLists.txt │ │ ├── CMakeLists.txt │ │ ├── Common │ │ │ ├── CMakeLists.txt │ │ │ └── Reg.cpp │ │ └── OpCnt │ │ │ ├── CMakeLists.txt │ │ │ └── OpCnt.cpp │ ├── Target │ │ ├── CMakeLists.txt │ │ ├── CUDA │ │ │ ├── CMakeLists.txt │ │ │ ├── TranslateRegistration.cpp │ │ │ └── TranslateToCUDA.cpp │ │ ├── Cpp │ │ │ ├── CMakeLists.txt │ │ │ ├── TranslateRegistration.cpp │ │ │ └── TranslateToCpp.cpp │ │ ├── LLVM │ │ │ ├── CMakeLists.txt │ │ │ └── TranslateRegistration.cpp │ │ └── PTX │ │ │ ├── CMakeLists.txt │ │ │ ├── GPUKernelToPTX.cpp │ │ │ ├── TranslateRegistration.cpp │ │ │ └── TranslateToPTX.cpp │ ├── Transforms │ │ ├── AnchoredPipeline.cpp │ │ ├── ApplyPDLPatterns.cpp │ │ ├── Bufferize.cpp │ │ ├── CMAE.cpp 
│ │ ├── CMakeLists.txt │ │ ├── CanonicalizeExt.cpp │ │ ├── CollectFunc.cpp │ │ ├── CondCanonicalize.cpp │ │ ├── FuncTag.cpp │ │ ├── GenericDeviceConfig.cpp │ │ ├── GraphClusteringByDevice.cpp │ │ ├── InsertUniqueId.cpp │ │ ├── LoopTag.cpp │ │ ├── LoopUnroll.cpp │ │ ├── MemoryPlanning.cpp │ │ ├── ModuleTag.cpp │ │ ├── PassDetail.h │ │ ├── RemoveFuncBody.cpp │ │ ├── RewriteOpToStdCall.cpp │ │ ├── SetArgShape.cpp │ │ ├── SetSpace.cpp │ │ ├── ShapeFuncOutlining.cpp │ │ ├── TryCatchModulePipeline.cpp │ │ └── Utils.cpp │ └── Utils │ │ ├── AffineUtils.cpp │ │ ├── AttrUtils.cpp │ │ ├── CMakeLists.txt │ │ ├── FuncUtils.cpp │ │ ├── GraphUtils.cpp │ │ ├── Hoist.cpp │ │ ├── IRRewrite.cpp │ │ ├── LoopUtils.cpp │ │ ├── MemUtils.cpp │ │ ├── ModuleUtils.cpp │ │ ├── OpInterfaceUtils.cpp │ │ ├── OptionUtils.cpp │ │ ├── PatternMatch.cpp │ │ ├── PipelineUtils.cpp │ │ ├── TileUtils.cpp │ │ ├── TypeUtils.cpp │ │ └── Utils.cpp ├── numerical │ ├── CMakeLists.txt │ ├── hlo │ │ ├── canonicalize_ext.mlir │ │ ├── conv_bn.mlir │ │ ├── dot_bn.mlir │ │ ├── hlo_fold.mlir │ │ ├── hlo_move_down.mlir │ │ ├── hlo_simplify.mlir │ │ ├── numerical_test.py │ │ ├── slice_move_down_and_merge.mlir │ │ └── test_broadcast_dense_elements_attr.mlir │ ├── lit.cfg.py │ └── lit.site.cfg.py.in ├── python │ ├── ByteIRModules.cpp │ ├── CMakeLists.txt │ ├── byteir │ │ ├── README.md │ │ ├── __init__.py │ │ ├── _backend_registry.py │ │ ├── _mlir_libs │ │ │ └── _site_initialize_0.py │ │ ├── compile.py │ │ ├── dialects │ │ │ ├── CatOps.td │ │ │ └── cat │ │ │ │ ├── __init__.py │ │ │ │ ├── ait_cache.py │ │ │ │ ├── ir_processor.py │ │ │ │ └── ir_translator │ │ │ │ ├── __init__.py │ │ │ │ ├── ait_builder.py │ │ │ │ ├── backend │ │ │ │ ├── __init__.py │ │ │ │ └── ait_registry.py │ │ │ │ └── translator.py │ │ ├── pattern_matches.py │ │ ├── tools │ │ │ ├── cat_executor.py │ │ │ └── compiler.py │ │ └── utils.py │ ├── gen_version.py │ ├── setup.py │ ├── test │ │ ├── CMakeLists.txt │ │ ├── api │ │ │ ├── test_pattern_matches.py │ │ 
│ └── test_py_api.py │ │ ├── dialects │ │ │ └── cat │ │ │ │ └── ait │ │ │ │ ├── numerical │ │ │ │ ├── layernorm.mlir │ │ │ │ ├── matmul.mlir │ │ │ │ ├── permute021.mlir │ │ │ │ ├── permute0213.mlir │ │ │ │ ├── permute0312.mlir │ │ │ │ ├── permute10.mlir │ │ │ │ └── softmax_f16.mlir │ │ │ │ └── profile │ │ │ │ └── matmul.mlir │ │ ├── lit.cfg.py │ │ └── lit.site.cfg.py.in │ └── version.txt ├── scripts │ ├── README.md │ ├── gen_testcases.py │ ├── gen_testcases_and_check_diff.sh │ └── sync_to_runtime.sh ├── test │ ├── Analysis │ │ ├── testPrintArgSideEffect.mlir │ │ ├── testPrintLiveness.mlir │ │ ├── testPrintShapeAnalysis.mlir │ │ ├── testPrintSymbolicShape.mlir │ │ └── testPrintUseRange.mlir │ ├── CMakeLists.txt │ ├── CPURunner │ │ ├── gelu.mlir │ │ ├── repeatCustomCall.mlir │ │ └── scatterTiling.mlir │ ├── Conversion │ │ ├── FuncToByre │ │ │ └── func_to_byre_tensor.mlir │ │ ├── HloToByreTensor │ │ │ └── compute_ops.mlir │ │ ├── HloToCat │ │ │ ├── basic_ops.mlir │ │ │ └── fused_ops.mlir │ │ ├── HloToTensor │ │ │ └── scatter_to_insertslice.mlir │ │ ├── LcclToByre │ │ │ └── lcclToByre.mlir │ │ ├── MemrefToByre │ │ │ └── memref_to_byre.mlir │ │ ├── ToAce │ │ │ └── mhloToAceActivation.mlir │ │ ├── ToByre │ │ │ ├── convertFuncAndCallToByre.mlir │ │ │ └── convertMemRefToByre.mlir │ │ ├── ToCUDAGPU │ │ │ ├── fusionGPUToNVVM.mlir │ │ │ ├── fusionGPUToNVVMBarePtr.mlir │ │ │ └── fusionHloToGPU.mlir │ │ ├── ToGPU │ │ │ ├── coalescedForToGPU.mlir │ │ │ └── funcToGPU.mlir │ │ ├── ToHlo │ │ │ └── arithConstToMhlo.mlir │ │ ├── ToLinalg │ │ │ ├── LinalgExtToLinalg.mlir │ │ │ ├── TesnorToLinalg.mlir │ │ │ ├── fusionHlo.mlir │ │ │ ├── hloConvertToLinalg.mlir │ │ │ ├── memrefcopyToLinalg.mlir │ │ │ ├── primitiveOpsHlo.mlir │ │ │ ├── reducef16.mlir │ │ │ ├── repeatCustomCallToLinalg.mlir │ │ │ ├── rngCustomCallToLinalg.mlir │ │ │ ├── simpleHlo.mlir │ │ │ └── unrealizedCastToLinalg.mlir │ │ ├── ToPTX │ │ │ ├── genPTXConfig.mlir │ │ │ └── genPTXConfigBarePtr.mlir │ │ └── VectorToGPU │ │ │ 
└── existing-vector-to-mma-ops.mlir │ ├── Dialect │ │ ├── Ace │ │ │ ├── attrs.mlir │ │ │ ├── bufferize.mlir │ │ │ ├── canonicalize.mlir │ │ │ └── ops.mlir │ │ ├── Affine │ │ │ ├── affineLoopFusionEx.mlir │ │ │ ├── affineToMemRef.mlir │ │ │ └── insertTrivialAffineLoop.mlir │ │ ├── Byre │ │ │ ├── Serialization │ │ │ │ ├── Compatibility │ │ │ │ │ ├── version_1_0_0.mlir │ │ │ │ │ ├── version_1_0_0.mlir.bc │ │ │ │ │ ├── version_1_0_0.mlir.bc.v0 │ │ │ │ │ ├── version_1_0_0_alloc.mlir │ │ │ │ │ └── version_1_0_0_alloc.mlir.bc │ │ │ │ └── round_trip.mlir │ │ │ ├── bert_transformer.mlir │ │ │ ├── buffer_ops.mlir │ │ │ ├── bufferize.mlir │ │ │ ├── canonicalize.mlir │ │ │ ├── interface.mlir │ │ │ └── invalid.mlir │ │ ├── Cat │ │ │ └── ops.mlir │ │ ├── Ccl │ │ │ ├── ccl_bufferize.mlir │ │ │ ├── ccl_canonicalize.mlir │ │ │ ├── ccl_move_down.mlir │ │ │ ├── decompose_all_reduce.mlir │ │ │ ├── invalid.mlir │ │ │ └── ops.mlir │ │ ├── GPU │ │ │ ├── gpu-block-swizzle.mlir │ │ │ ├── gpu-distribute-shared-memory-copy.mlir │ │ │ ├── gpu-distributed-to-warp.mlir │ │ │ ├── gpu-pack-shared-memory-alloc.mlir │ │ │ ├── gpu-tensorcore-vectorization.mlir │ │ │ ├── optimize-vector-transfer.mlir │ │ │ ├── remove-trivial-loops.mlir │ │ │ ├── transform-gpu-failing.mlir │ │ │ ├── transform-map-forall-to-blocks.mlir │ │ │ └── transform-map-nested-forall-to-threads.mlir │ │ ├── Lace │ │ │ ├── invalid.mlir │ │ │ └── ops.mlir │ │ ├── Linalg │ │ │ ├── annotate.mlir │ │ │ ├── bufferize.mlir │ │ │ ├── canonicalizeExt.mlir │ │ │ ├── dataPlace-lagacy.mlir │ │ │ ├── dataPlace-tensor.mlir │ │ │ ├── extension.mlir │ │ │ ├── fuse-attention-upstream.mlir │ │ │ ├── fuse-attention.mlir │ │ │ ├── generalization.mlir │ │ │ ├── linalg-collapse-loops.mlir │ │ │ ├── linalg-fuse-elementwise-ext-existing.mlir │ │ │ ├── linalg-fuse-elementwise-ext.mlir │ │ │ ├── linalg-promotion-epilogue-fusion.mlir │ │ │ ├── linalg-promotion.mlir │ │ │ ├── linalgExtToLoops.mlir │ │ │ ├── opTiling1.mlir │ │ │ ├── opTiling2.mlir │ │ │ ├── 
prefetch.mlir │ │ │ ├── scopeTiling3SplitK-dev.mlir │ │ │ ├── transform-dev.mlir │ │ │ ├── transform-lower-to-loops.mlir │ │ │ ├── transform-op-collapse-dims.mlir │ │ │ ├── transform-op-fold-unit-extent-dims.mlir │ │ │ ├── transform-op-fuse-dev.mlir │ │ │ ├── transform-op-fuse-into-containing.mlir │ │ │ ├── transform-op-fuse-multi-root.mlir │ │ │ ├── transform-op-fuse.mlir │ │ │ ├── transform-op-shared-out-to-dist-style.mlir │ │ │ ├── transform-op-tile-ext.mlir │ │ │ ├── transform-op-tile-loop-hint.mlir │ │ │ └── transform-op-tile-reduction-parallel.mlir │ │ ├── MemRef │ │ │ ├── canonicalize.mlir │ │ │ ├── layout.mlir │ │ │ ├── removeCopy.mlir │ │ │ └── simplifyView.mlir │ │ ├── Mhlo │ │ │ ├── fusion.mlir │ │ │ ├── multi_return.mlir │ │ │ ├── reduce.mlir │ │ │ ├── simple.mlir │ │ │ └── transforms │ │ │ │ ├── ConvBNFolder.mlir │ │ │ │ ├── ConvBackwardFusion.mlir │ │ │ │ ├── ConvBiasActFusion.mlir │ │ │ │ ├── ConvertOpToCustomCall.mlir │ │ │ │ ├── DecomposeMhloCustomCallOps.mlir │ │ │ │ ├── IOConvertFusion.mlir │ │ │ │ ├── LayoutTransformation.mlir │ │ │ │ ├── RewriteWithConstraint.mlir │ │ │ │ ├── SliceMoveDownAndMerge.mlir │ │ │ │ ├── TestBroadcastDenseElementsAttr.mlir │ │ │ │ ├── TestConvertFuncToCustomCall.mlir │ │ │ │ ├── TestConvertInsertion.mlir │ │ │ │ ├── TestCustomConvert.mlir │ │ │ │ ├── TestDTypeConversion.mlir │ │ │ │ ├── TestDTypeConversionModifyFunc.mlir │ │ │ │ ├── TestFuncArgRearrangement.mlir │ │ │ │ ├── aggressiveFusion.mlir │ │ │ │ ├── canonicalize │ │ │ │ ├── arithOptimize.mlir │ │ │ │ ├── dynamicGather.mlir │ │ │ │ └── transposeFolder.mlir │ │ │ │ ├── clusterConstraint.mlir │ │ │ │ ├── concatSliceFusion.mlir │ │ │ │ ├── elementFusion.mlir │ │ │ │ ├── expandHloTuples.mlir │ │ │ │ ├── fuseBMMDimension.mlir │ │ │ │ ├── fuseTransposeIntoDotGeneral.mlir │ │ │ │ ├── fusionOutlining.mlir │ │ │ │ ├── hloFolder.mlir │ │ │ │ ├── hloMoveDown.mlir │ │ │ │ ├── hloMoveUp.mlir │ │ │ │ ├── hloSimplify.mlir │ │ │ │ ├── insertShapeConstraint.mlir │ │ │ │ ├── 
matmulEpilogueFusion.mlir │ │ │ │ ├── matmulLayoutTransform.mlir │ │ │ │ ├── mhloFlattenTuple.mlir │ │ │ │ └── reduceFusion.mlir │ │ ├── SCF │ │ │ ├── forallCollapsing.mlir │ │ │ ├── fuseNestedForall.mlir │ │ │ ├── insertTrivialSCFLoop.mlir │ │ │ └── moveForallRegionIntoWarpOp.mlir │ │ ├── Shape │ │ │ ├── insertInputShapeConstraint.mlir │ │ │ ├── insertTieShape.mlir │ │ │ ├── resolveShapeConstraint.mlir │ │ │ └── setAssumingAlwaysTrue.mlir │ │ ├── Tensor │ │ │ └── canonicalizeExt.mlir │ │ ├── Transform │ │ │ ├── canonicalize.mlir │ │ │ ├── cleanup.mlir │ │ │ ├── detensorizeInsertion.mlir │ │ │ ├── dump.mlir │ │ │ ├── transformDialectInterpreter.mlir │ │ │ └── transformInsertion.mlir │ │ └── Vector │ │ │ └── canonicalizeExt.mlir │ ├── E2E │ │ ├── CUDA │ │ │ ├── AliasLikeGPU │ │ │ │ ├── 10b_ptx_codegen.mlir │ │ │ │ ├── 1_hlo_opt.mlir │ │ │ │ ├── 2_linalg_tensor_opt.mlir │ │ │ │ ├── 3_byre_tensor_opt.mlir │ │ │ │ ├── 4_bufferize_opt.mlir │ │ │ │ ├── 5_alternative_scf_opt.mlir │ │ │ │ ├── 6_gpu_opt.mlir │ │ │ │ ├── 7_set_space_opt.mlir │ │ │ │ ├── 8_byre_opt.mlir │ │ │ │ ├── 9a_byre_host.mlir │ │ │ │ ├── 9b_nvvm_codegen.mlir │ │ │ │ ├── device_output.ptx │ │ │ │ ├── host_output.mlir │ │ │ │ ├── input.mlir │ │ │ │ └── template.py │ │ │ ├── BertTiny │ │ │ │ ├── BW │ │ │ │ │ └── input.mlir │ │ │ │ └── FW │ │ │ │ │ ├── 1_hlo_opt.mlir │ │ │ │ │ └── input.mlir │ │ │ ├── CclInference │ │ │ │ └── input.mlir │ │ │ ├── MLPBasic │ │ │ │ ├── 1_preprocess_for_lowering.mlir │ │ │ │ └── input.mlir │ │ │ ├── MLPInference │ │ │ │ ├── 10b_ptx_codegen.mlir │ │ │ │ ├── 1_hlo_opt.mlir │ │ │ │ ├── 2_linalg_tensor_opt.mlir │ │ │ │ ├── 3_byre_tensor_opt.mlir │ │ │ │ ├── 4_bufferize_opt.mlir │ │ │ │ ├── 5_affine_opt.mlir │ │ │ │ ├── 5_alternative_scf_opt.mlir │ │ │ │ ├── 6_gpu_opt.mlir │ │ │ │ ├── 7_set_space_opt.mlir │ │ │ │ ├── 8_byre_opt.mlir │ │ │ │ ├── 9a_byre_host.mlir │ │ │ │ ├── 9b_nvvm_codegen.mlir │ │ │ │ ├── device_output.ptx │ │ │ │ ├── host_output.mlir │ │ │ │ ├── input.mlir │ │ │ 
│ └── template.py │ │ │ ├── NanoGPT │ │ │ │ ├── BW │ │ │ │ │ └── input.mlir │ │ │ │ └── FW │ │ │ │ │ └── input.mlir │ │ │ └── ResNet18 │ │ │ │ ├── BW │ │ │ │ ├── 10b_ptx_codegen.mlir │ │ │ │ ├── 1_hlo_opt.mlir │ │ │ │ ├── 2_linalg_tensor_opt.mlir │ │ │ │ ├── 3_byre_tensor_opt.mlir │ │ │ │ ├── 4_bufferize_opt.mlir │ │ │ │ ├── 5_affine_opt.mlir │ │ │ │ ├── 5_alternative_scf_opt.mlir │ │ │ │ ├── 6_gpu_opt.mlir │ │ │ │ ├── 7_set_space_opt.mlir │ │ │ │ ├── 8_byre_opt.mlir │ │ │ │ ├── 9a_byre_host.mlir │ │ │ │ ├── 9b_nvvm_codegen.mlir │ │ │ │ ├── device_output.ptx │ │ │ │ ├── host_output.mlir │ │ │ │ ├── input.mlir │ │ │ │ └── template.py │ │ │ │ ├── FW │ │ │ │ ├── 10b_ptx_codegen.mlir │ │ │ │ ├── 1_hlo_opt.mlir │ │ │ │ ├── 2_linalg_tensor_opt.mlir │ │ │ │ ├── 3_byre_tensor_opt.mlir │ │ │ │ ├── 4_bufferize_opt.mlir │ │ │ │ ├── 5_affine_opt.mlir │ │ │ │ ├── 5_alternative_scf_opt.mlir │ │ │ │ ├── 6_gpu_opt.mlir │ │ │ │ ├── 7_set_space_opt.mlir │ │ │ │ ├── 8_byre_opt.mlir │ │ │ │ ├── 9a_byre_host.mlir │ │ │ │ ├── 9b_nvvm_codegen.mlir │ │ │ │ ├── device_output.ptx │ │ │ │ ├── host_output.mlir │ │ │ │ ├── input.mlir │ │ │ │ └── template.py │ │ │ │ └── Whole │ │ │ │ ├── 10b_ptx_codegen.mlir │ │ │ │ ├── 1_hlo_opt.mlir │ │ │ │ ├── 2_linalg_tensor_opt.mlir │ │ │ │ ├── 3_byre_tensor_opt.mlir │ │ │ │ ├── 4_bufferize_opt.mlir │ │ │ │ ├── 5_affine_opt.mlir │ │ │ │ ├── 5_alternative_scf_opt.mlir │ │ │ │ ├── 6_gpu_opt.mlir │ │ │ │ ├── 7_set_space_opt.mlir │ │ │ │ ├── 8_byre_opt.mlir │ │ │ │ ├── 9a_byre_host.mlir │ │ │ │ ├── 9b_nvvm_codegen.mlir │ │ │ │ ├── device_output.ptx │ │ │ │ ├── host_output.mlir │ │ │ │ ├── input.mlir │ │ │ │ └── template.py │ │ └── Host │ │ │ ├── AliasLike │ │ │ ├── 00_Input.mlir │ │ │ ├── 01_HostOpt.mlir │ │ │ ├── 02a_ByreHost.mlir │ │ │ ├── 02b_ToLLVM.mlir │ │ │ ├── 03b_ToLLVMIR.mlir │ │ │ ├── Output.ll │ │ │ ├── Output.mlir │ │ │ ├── TotalPipeline.mlir │ │ │ └── template.py │ │ │ ├── Case0 │ │ │ ├── 00_Input.mlir │ │ │ ├── 01_HostOpt.mlir │ │ │ ├── 
02a_ByreHost.mlir │ │ │ ├── 02b_ToLLVM.mlir │ │ │ ├── 03b_ToLLVMIR.mlir │ │ │ ├── Output.ll │ │ │ ├── Output.mlir │ │ │ ├── TotalPipeline.mlir │ │ │ └── template.py │ │ │ ├── Case0_Bytecode │ │ │ ├── 00_Input.mlir │ │ │ ├── 01_HostOpt.mlir │ │ │ ├── 02a_ByreHost.mlir │ │ │ ├── 02b_ToLLVM.mlir │ │ │ ├── 03a_ByreSerial.mlir │ │ │ ├── 03b_ToLLVMBC.mlir │ │ │ ├── Output.bc │ │ │ ├── Output.mlirbc │ │ │ └── template.py │ │ │ ├── Case1 │ │ │ ├── 00_Input.mlir │ │ │ ├── 01_HostOpt.mlir │ │ │ ├── 02a_ByreHost.mlir │ │ │ ├── 02b_ToLLVM.mlir │ │ │ ├── 03b_ToLLVMIR.mlir │ │ │ ├── Output.ll │ │ │ ├── Output.mlir │ │ │ ├── TotalPipeline.mlir │ │ │ └── template.py │ │ │ ├── RngNormal │ │ │ ├── 00_Input.mlir │ │ │ ├── 01_HostOpt.mlir │ │ │ ├── 02a_ByreHost.mlir │ │ │ ├── 02b_ToLLVM.mlir │ │ │ ├── 03b_ToLLVMIR.mlir │ │ │ ├── Output.ll │ │ │ ├── Output.mlir │ │ │ ├── TotalPipeline.mlir │ │ │ └── template.py │ │ │ ├── RngUniform │ │ │ ├── 00_Input.mlir │ │ │ ├── 01_HostOpt.mlir │ │ │ ├── 02a_ByreHost.mlir │ │ │ ├── 02b_ToLLVM.mlir │ │ │ ├── 03b_ToLLVMIR.mlir │ │ │ ├── Output.ll │ │ │ ├── Output.mlir │ │ │ ├── TotalPipeline.mlir │ │ │ └── template.py │ │ │ ├── Transpose │ │ │ ├── 00_Input.mlir │ │ │ ├── 01_HostOpt.mlir │ │ │ ├── 02a_ByreHost.mlir │ │ │ ├── 02b_ToLLVM.mlir │ │ │ ├── 03b_ToLLVMIR.mlir │ │ │ ├── Output.ll │ │ │ ├── Output.mlir │ │ │ ├── TotalPipeline.mlir │ │ │ └── template.py │ │ │ └── TypeCvt │ │ │ ├── 00_Input.mlir │ │ │ ├── 01_HostOpt.mlir │ │ │ ├── 02a_ByreHost.mlir │ │ │ ├── 02b_ToLLVM.mlir │ │ │ ├── 03b_ToLLVMIR.mlir │ │ │ ├── Output.ll │ │ │ ├── Output.mlir │ │ │ ├── TotalPipeline.mlir │ │ │ └── template.py │ ├── Ops │ │ ├── conv.mlir │ │ └── dot.mlir │ ├── Pipelines │ │ ├── BufferizeOpts │ │ │ ├── linalg-ext.mlir │ │ │ └── tensor.mlir │ │ ├── HloOpts │ │ │ ├── mlp.mlir │ │ │ └── rng.mlir │ │ ├── Host │ │ │ ├── Codegen │ │ │ │ └── transpose.mlir │ │ │ └── ToLLVM │ │ │ │ ├── subview.mlir │ │ │ │ └── tanh.mlir │ │ ├── LinalgTensorOpt │ │ │ ├── 
elementwiseCodegen.mlir │ │ │ └── reductionCodegen.mlir │ │ └── ShapeOpts │ │ │ └── dynamicPartitionStitch.mlir │ ├── Stat │ │ ├── allocCnt.mlir │ │ ├── opCnt.mlir │ │ └── opTypes.mlir │ ├── Target │ │ ├── CUDA │ │ │ ├── all.mlir │ │ │ └── kernel.mlir │ │ ├── Cpp │ │ │ ├── attrs.mlir │ │ │ ├── binary.mlir │ │ │ ├── call.mlir │ │ │ ├── cast.mlir │ │ │ ├── common-cpp.mlir │ │ │ ├── const.mlir │ │ │ ├── control_flow.mlir │ │ │ ├── for.mlir │ │ │ ├── if.mlir │ │ │ ├── invalid.mlir │ │ │ ├── memref.mlir │ │ │ ├── opaque_types.mlir │ │ │ ├── stdops.mlir │ │ │ └── types.mlir │ │ └── PTX │ │ │ └── fusionFuncToPTX.mlir │ ├── Transforms │ │ ├── ApplyPDLPatterns │ │ │ ├── Case_0.mlir │ │ │ └── Pattern_0.mlir │ │ ├── CanonicalizeExt │ │ │ ├── basic.mlir │ │ │ ├── broadcast.mlir │ │ │ ├── concat.mlir │ │ │ ├── deprecated.mlir │ │ │ ├── elementwise.mlir │ │ │ ├── gather.mlir │ │ │ ├── reduce_like.mlir │ │ │ ├── slice_concat.mlir │ │ │ └── transpose.mlir │ │ ├── boundedShapeInference.mlir │ │ ├── cmae.mlir │ │ ├── collectFunc.mlir │ │ ├── funTag.mlir │ │ ├── genericDeviceConfig.mlir │ │ ├── gereicDeviceConfig_with_ByreOpt.mlir │ │ ├── graphCanonicalize.mlir │ │ ├── graphClusteringByDevice.mlir │ │ ├── graphClusteringByDeviceBottomUp.mlir │ │ ├── graphClusteringByDeviceGreedy.mlir │ │ ├── graphClusteringByDeviceTopDown.mlir │ │ ├── insertUniqueId.mlir │ │ ├── insertUniqueIdErase.mlir │ │ ├── loopTag.mlir │ │ ├── loopUnrollImperfect.mlir │ │ ├── loopUnrollUseAnchor.mlir │ │ ├── loopUnrollUseDepth.mlir │ │ ├── loopUnrollWithAnnotation.mlir │ │ ├── memoryPlanning.mlir │ │ ├── oneShotBufferize.mlir │ │ ├── oneShotBufferizeOutParams.mlir │ │ ├── removeFunTag.mlir │ │ ├── removeFuncBody.mlir │ │ ├── rewriteOpToStdCall.mlir │ │ ├── setAllSpace.mlir │ │ ├── setArgShape.mlir │ │ ├── setArgSpace.mlir │ │ ├── setArgSpaceAutoDeduce.mlir │ │ ├── setOpAndArgSpace.mlir │ │ ├── setOpSpace.mlir │ │ ├── shapeFuncOutlining.mlir │ │ ├── shapeReification.mlir │ │ ├── staticShapeInference.mlir │ │ └── 
testGraphClusteringByDeviceOpNum.mlir │ ├── Utils │ │ ├── testMergeTwoModulesCase0.mlir │ │ ├── testMergeTwoModulesCase0_1.mlir │ │ ├── testMergeTwoModulesCase1.mlir │ │ ├── testMergeTwoModulesCase1_1.mlir │ │ ├── testMergeTwoModulesCase2.mlir │ │ └── testMergeTwoModulesCase2_1.mlir │ ├── lib │ │ ├── Analysis │ │ │ ├── CMakeLists.txt │ │ │ ├── TestGraphClusteringByDeviceOpNum.cpp │ │ │ ├── TestPrintLiveness.cpp │ │ │ ├── TestPrintShapeAnalysis.cpp │ │ │ ├── TestPrintSideEffect.cpp │ │ │ ├── TestPrintSymbolicShape.cpp │ │ │ └── TestPrintUseRange.cpp │ │ ├── CMakeLists.txt │ │ ├── Interface │ │ │ ├── CMakeLists.txt │ │ │ └── TestByreOpInterface.cpp │ │ ├── Transformation │ │ │ ├── CMakeLists.txt │ │ │ ├── TestByreSerialRoundtrip.cpp │ │ │ ├── TestConvertFuncToCustomCall.cpp │ │ │ ├── TestConvertInsertion.cpp │ │ │ ├── TestDTypeConversion.cpp │ │ │ └── TestFuncArgRearrangement.cpp │ │ └── Utils │ │ │ ├── CMakeLists.txt │ │ │ ├── TestBroadcastDenseElementsAttr.cpp │ │ │ └── TestMergeTwoModules.cpp │ ├── lit.cfg.py │ └── lit.site.cfg.py.in └── tools │ ├── CMakeLists.txt │ ├── byteir-cpu-runner │ ├── CMakeLists.txt │ └── byteir-cpu-runner.cpp │ ├── byteir-opt │ ├── CMakeLists.txt │ └── byteir-opt.cpp │ ├── byteir-stat │ ├── CMakeLists.txt │ └── byteir-stat.cpp │ └── byteir-translate │ ├── CMakeLists.txt │ └── byteir-translate.cpp ├── docker └── Dockerfile ├── external ├── half │ ├── LICENSE.txt │ ├── README.txt │ └── include │ │ └── half │ │ └── half.hpp └── patches │ └── AITemplate │ ├── A10.patch │ ├── logging.patch │ └── num_builders.patch ├── external_libs └── runtime │ ├── CMakeLists.txt │ ├── README.md │ └── flash_attn │ ├── CMakeLists.txt │ ├── include │ └── flash_api.h │ └── lib │ ├── CMakeLists.txt │ ├── alibi.h │ ├── block_info.h │ ├── dropout.h │ ├── flash.h │ ├── flash_api.cu │ ├── flash_bwd_hdim128_fp16_sm80.cu │ ├── flash_bwd_hdim160_fp16_sm80.cu │ ├── flash_bwd_hdim192_fp16_sm80.cu │ ├── flash_bwd_hdim224_fp16_sm80.cu │ ├── flash_bwd_hdim256_fp16_sm80.cu │ 
├── flash_bwd_hdim32_fp16_sm80.cu │ ├── flash_bwd_hdim64_fp16_sm80.cu │ ├── flash_bwd_hdim96_fp16_sm80.cu │ ├── flash_bwd_kernel.h │ ├── flash_bwd_launch_template.h │ ├── flash_bwd_preprocess_kernel.h │ ├── flash_fwd_hdim128_fp16_sm80.cu │ ├── flash_fwd_hdim160_fp16_sm80.cu │ ├── flash_fwd_hdim192_fp16_sm80.cu │ ├── flash_fwd_hdim224_fp16_sm80.cu │ ├── flash_fwd_hdim256_fp16_sm80.cu │ ├── flash_fwd_hdim32_fp16_sm80.cu │ ├── flash_fwd_hdim64_fp16_sm80.cu │ ├── flash_fwd_hdim96_fp16_sm80.cu │ ├── flash_fwd_kernel.h │ ├── flash_fwd_launch_template.h │ ├── flash_fwd_split_hdim128_fp16_sm80.cu │ ├── flash_fwd_split_hdim160_fp16_sm80.cu │ ├── flash_fwd_split_hdim192_fp16_sm80.cu │ ├── flash_fwd_split_hdim224_fp16_sm80.cu │ ├── flash_fwd_split_hdim256_fp16_sm80.cu │ ├── flash_fwd_split_hdim32_fp16_sm80.cu │ ├── flash_fwd_split_hdim64_fp16_sm80.cu │ ├── flash_fwd_split_hdim96_fp16_sm80.cu │ ├── kernel_traits.h │ ├── mask.h │ ├── philox.cuh │ ├── rotary.h │ ├── softmax.h │ ├── static_switch.h │ └── utils.h ├── frontends ├── README.md ├── onnx-frontend │ ├── .gitignore │ ├── CMakeLists.txt │ ├── MLIR.cmake │ ├── README.md │ ├── onnx-frontend │ │ ├── CMakeLists.txt │ │ ├── src │ │ │ ├── CMakeLists.txt │ │ │ ├── Compiler │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── OFCompilerOptions.cpp │ │ │ │ ├── OFCompilerOptions.hpp │ │ │ │ ├── OFCompilerPipelines.cpp │ │ │ │ ├── OFCompilerPipelines.hpp │ │ │ │ ├── OFCompilerTypes.hpp │ │ │ │ ├── OFCompilerUtils.cpp │ │ │ │ └── OFCompilerUtils.hpp │ │ │ ├── Conversion │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── OFCanonicalizer.cpp │ │ │ │ ├── OFCanonicalizer.hpp │ │ │ │ ├── OFCheckNonLowered.cpp │ │ │ │ ├── OFCheckNonLowered.hpp │ │ │ │ ├── OFInsertNecessaryCast.cpp │ │ │ │ ├── OFInsertNecessaryCast.hpp │ │ │ │ ├── OFModifyEntryPoint.cpp │ │ │ │ ├── OFModifyEntryPoint.hpp │ │ │ │ ├── OFPasses.hpp │ │ │ │ ├── OFPasses.td │ │ │ │ ├── OFPassesDetail.hpp │ │ │ │ ├── OFRewriteCustomOnnxOps.cpp │ │ │ │ ├── OFRewriteCustomOnnxOps.hpp │ │ │ │ ├── 
OFRewriteCustomOnnxOps.td │ │ │ │ ├── OFRewriteToCustomCall.cpp │ │ │ │ ├── OFRewriteToCustomCall.hpp │ │ │ │ └── OFRewriteToCustomCall.td │ │ │ ├── Support │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── OFConstants.hpp │ │ │ │ ├── OFUtils.cpp │ │ │ │ └── OFUtils.hpp │ │ │ ├── onnx-frontend-opt.cpp │ │ │ └── onnx-frontend.cpp │ │ └── test │ │ │ ├── CMakeLists.txt │ │ │ ├── dynamic_shape_relu.onnx │ │ │ ├── lit.cfg.py │ │ │ ├── lit.site.cfg.py.in │ │ │ ├── of_canonicalizer.mlir │ │ │ ├── of_check_non_lowered.mlir │ │ │ ├── of_modify_entry_point.mlir │ │ │ ├── of_rewrite_custom_onnx_op.mlir │ │ │ ├── of_rewrite_to_custom_call.mlir │ │ │ ├── set_shape.mlir │ │ │ └── shape_inference.mlir │ ├── pytest.ini │ ├── requirements.txt │ ├── scripts │ │ ├── build_and_test.sh │ │ └── envsetup.sh │ ├── test │ │ ├── __init__.py │ │ ├── base.py │ │ ├── env.py │ │ ├── models │ │ │ ├── test_batch_size.py │ │ │ └── test_large_model.py │ │ └── ops │ │ │ ├── __init__.py │ │ │ ├── data │ │ │ ├── math │ │ │ │ ├── clip.onnx │ │ │ │ ├── gelu.onnx │ │ │ │ └── softmax.onnx │ │ │ ├── nn │ │ │ │ └── batch_normalization.onnx │ │ │ ├── quantize │ │ │ │ └── quantize_dequantize.onnx │ │ │ └── tensor │ │ │ │ ├── arg_max.onnx │ │ │ │ ├── arg_min.onnx │ │ │ │ ├── concat.onnx │ │ │ │ ├── concat_dynamic_shape.onnx │ │ │ │ └── resize_nearest_v10.onnx │ │ │ ├── test_math.py │ │ │ ├── test_nn.py │ │ │ ├── test_quantize.py │ │ │ ├── test_rnn.py │ │ │ ├── test_tensor.py │ │ │ └── utils.py │ └── third_party │ │ └── patches │ │ ├── OnnxMlirConcat.patch │ │ ├── OnnxMlirConvTranspose.patch │ │ ├── OnnxMlirDialectBuilder.patch │ │ ├── OnnxMlirDialectRewrite.patch │ │ ├── OnnxMlirElementwise.patch │ │ ├── OnnxMlirKeepCustomOpType.patch │ │ ├── OnnxMlirONNXToStablehloCommon.patch │ │ ├── OnnxMlirONNXToStablehloGather.patch │ │ ├── OnnxMlirOnnxOpsTensorPad.patch │ │ ├── OnnxMlirPooling.patch │ │ ├── OnnxMlirReductionUpgrade.patch │ │ ├── OnnxMlirRegisterLibrary.patch │ │ ├── OnnxMlirReshape.patch │ │ ├── 
OnnxMlirScatterElements.patch │ │ ├── OnnxMlirTestElementwise.patch │ │ ├── OnnxMlirTestPooling.patch │ │ ├── OnnxMlirWillNotPush.patch │ │ ├── OnnxMlirWillPushToUpstream.patch │ │ ├── OnnxOfficialExternalData.patch │ │ └── OnnxOfficialResize.patch ├── tf-frontend │ ├── .bazelrc │ ├── .bazelversion │ ├── .gitignore │ ├── .tf_configure.bazelrc │ ├── BUILD │ ├── README.md │ ├── WORKSPACE │ ├── byteir │ │ ├── BUILD │ │ ├── ace.BUILD │ │ └── workspace.bzl │ ├── docs │ │ ├── attributes.md │ │ └── developer_guild.md │ ├── example │ │ ├── resnet.py │ │ └── resnet50_model.py │ ├── external │ │ └── patches │ │ │ └── tensorflow │ │ │ ├── fix-bug-of-create-f16-const-for-HoistCwiseBinaryOutO.patch │ │ │ ├── for_gcc_8_5.patch │ │ │ ├── grappler.patch │ │ │ ├── mhlo_ops.patch │ │ │ ├── support-tf-shape-inference.patch │ │ │ ├── tf.Select_to_mhlo.select.patch │ │ │ ├── tf_build.patch │ │ │ ├── tf_dilated_conv.patch │ │ │ ├── tf_mkl.patch │ │ │ ├── tf_slice.patch │ │ │ └── topk.patch │ ├── scripts │ │ ├── apply_patches.sh │ │ ├── build_and_test.sh │ │ └── prepare.sh │ ├── tf_mlir_ext │ │ ├── numerical │ │ │ ├── BUILD │ │ │ ├── dilated_conv2d.mlir │ │ │ ├── fallback_to_custom_call.mlir │ │ │ ├── fuse_tf_ops.mlir │ │ │ ├── glob_lit_test.bzl │ │ │ ├── numerical_test.py │ │ │ ├── process_dynamic_stitch_as_static.mlir │ │ │ ├── reshape_movedown_string.mlir │ │ │ ├── rewrite_to_custom_call.mlir │ │ │ ├── runlit.cfg.py │ │ │ ├── runlit.site.cfg.py │ │ │ └── where.mlir │ │ ├── pipelines │ │ │ ├── BUILD │ │ │ ├── customized_tf_to_mhlo.cc │ │ │ ├── customized_tf_to_mhlo.h │ │ │ ├── passes.h │ │ │ ├── passes.td │ │ │ └── passes_detail.h │ │ ├── tests │ │ │ ├── BUILD │ │ │ ├── ace_ops.mlir │ │ │ ├── convert_repeat_to_tile.mlir │ │ │ ├── dilated_conv2d.mlir │ │ │ ├── fallback_to_custom_call.mlir │ │ │ ├── fuse_tf_ops.mlir │ │ │ ├── glob_lit_test.bzl │ │ │ ├── inline_func_call_in_scf_if.mlir │ │ │ ├── mhlo_legalize_tf_ext.mlir │ │ │ ├── process_dynamic_stitch_as_static.mlir │ │ │ ├── 
reshape_movedown_string.mlir │ │ │ ├── rewrite_func_attr_to_byteir.mlir │ │ │ ├── rewrite_to_custom_call.mlir │ │ │ ├── rewrite_to_custom_call_keep_body.mlir │ │ │ ├── rewrite_to_if.mlir │ │ │ ├── runlit.cfg.py │ │ │ ├── runlit.site.cfg.py │ │ │ ├── set_repeat_out_batch_size.mlir │ │ │ └── where.mlir │ │ ├── transforms │ │ │ ├── BUILD │ │ │ ├── constant_folding.cc │ │ │ ├── constant_folding.h │ │ │ ├── convert_repeat_to_tile.cc │ │ │ ├── convert_repeat_to_tile.h │ │ │ ├── fuse_tf_ops.cc │ │ │ ├── fuse_tf_ops.h │ │ │ ├── fuse_tf_ops.td │ │ │ ├── inline_func_call_in_scf_if.cc │ │ │ ├── inline_func_call_in_scf_if.h │ │ │ ├── mhlo_legalize_tf_ext.cc │ │ │ ├── mhlo_legalize_tf_ext.h │ │ │ ├── passes.h │ │ │ ├── passes.td │ │ │ ├── passes_detail.h │ │ │ ├── process_dynamic_stitch_as_static.cc │ │ │ ├── process_dynamic_stitch_as_static.h │ │ │ ├── remove_control_flow.cc │ │ │ ├── remove_control_flow.h │ │ │ ├── reshape_movedown_string.cc │ │ │ ├── reshape_movedown_string.h │ │ │ ├── rewrite_func_attr_to_byteir.cc │ │ │ ├── rewrite_func_attr_to_byteir.h │ │ │ ├── rewrite_to_custom_call.cc │ │ │ ├── rewrite_to_custom_call.h │ │ │ ├── rewrite_to_custom_call.td │ │ │ ├── rewrite_to_if.cc │ │ │ ├── rewrite_to_if.h │ │ │ ├── set_repeat_out_batch_size.cc │ │ │ ├── set_repeat_out_batch_size.h │ │ │ ├── tf_fallback_to_custom_call.cc │ │ │ ├── tf_fallback_to_custom_call.h │ │ │ ├── tf_switch_merge_to_if.cc │ │ │ └── tf_switch_merge_to_if.h │ │ └── utils │ │ │ ├── BUILD │ │ │ ├── customcall.cc │ │ │ ├── customcall.h │ │ │ ├── dce.cc │ │ │ ├── dce.h │ │ │ ├── utils.cc │ │ │ └── utils.h │ ├── tools │ │ ├── BUILD │ │ ├── tf_ext_opt_main.cc │ │ └── tf_frontend_main.cc │ └── utils │ │ ├── BUILD │ │ ├── attributes.h │ │ ├── graphdef_opt.cc │ │ ├── graphdef_opt.h │ │ ├── misc.cc │ │ └── misc.h └── torch-frontend │ ├── .gitignore │ ├── CMakeLists.txt │ ├── MLIR.cmake │ ├── README.md │ ├── TorchMLIR.cmake │ ├── build-requirements.txt │ ├── doc │ └── torch_2_0_training.md │ ├── examples │ ├── 
demo │ │ ├── README.md │ │ ├── backend.py │ │ ├── byteir_fusible_pattern.py │ │ ├── compile_utils.py │ │ ├── config.py │ │ ├── context.py │ │ ├── fx_match_utils.py │ │ ├── main.py │ │ ├── models │ │ │ ├── configuration_chatglm.py │ │ │ ├── modeling_chatglm.py │ │ │ └── modeling_nanogpt.py │ │ ├── partitioners.py │ │ └── requirements.txt │ ├── inference │ │ ├── brt_backend.py │ │ ├── infer_resnet.py │ │ ├── infer_tinybert.py │ │ ├── mixtral │ │ │ ├── infer_single_mixtral.py │ │ │ └── requirements.txt │ │ └── mlp.py │ └── training │ │ ├── byteir_backend.py │ │ ├── mlp.py │ │ ├── train_resnet.py │ │ └── train_tinybert.py │ ├── scripts │ ├── build.sh │ ├── build_and_test.sh │ └── envsetup.sh │ ├── test-requirements.txt │ ├── third_party │ ├── llvm_patches │ │ └── ir_printing.patch │ └── patches │ │ ├── backend_contract.patch │ │ ├── build.patch │ │ ├── communication_op.patch │ │ ├── custom_op.patch │ │ ├── fx_importer.patch │ │ ├── generated_torch_ops_td.patch │ │ ├── pipeline.patch │ │ ├── reduce_op_variants.patch │ │ └── tuple.patch │ ├── torch-cpu-requirements.txt │ ├── torch-cuda-requirements.txt │ └── torch-frontend │ ├── CMakeLists.txt │ ├── include │ ├── torch-frontend-c │ │ └── Passes.h │ └── torch-frontend │ │ ├── CMakeLists.txt │ │ ├── Conversion │ │ ├── CMakeLists.txt │ │ ├── ConvertTorchToCcl.h │ │ ├── ConvertTorchToCustomCall.h │ │ ├── ConvertTorchToStablehloExt.h │ │ ├── Passes.h │ │ └── Passes.td │ │ ├── Dialect │ │ └── Torch │ │ │ └── Transforms │ │ │ ├── CMakeLists.txt │ │ │ ├── DecomposeOnTorch.h │ │ │ ├── FuseOpOnTorch.h │ │ │ ├── Passes.h │ │ │ └── Passes.td │ │ ├── Pipelines │ │ └── Pipelines.h │ │ ├── Transforms │ │ ├── CMakeLists.txt │ │ ├── CanonicalizeExt.h │ │ ├── EliminateUselessOp.h │ │ ├── Passes.h │ │ ├── Passes.td │ │ ├── RewriteCustomOp.h │ │ ├── RewriteEntryFuncName.h │ │ └── UnpackPublicFunctionReturn.h │ │ └── Utils │ │ ├── ConvertOpFolder.h │ │ └── CustomCallUtil.h │ ├── lib │ ├── CAPI │ │ ├── CMakeLists.txt │ │ └── Passes.cpp │ ├── 
CMakeLists.txt │ ├── Conversion │ │ ├── CMakeLists.txt │ │ ├── ConvertTorchToCcl.cpp │ │ ├── ConvertTorchToCustomCall.cpp │ │ ├── ConvertTorchToStablehloExt.cpp │ │ └── PassDetail.h │ ├── CustomOp │ │ ├── CMakeLists.txt │ │ ├── dynamic_mask_stitch.cpp │ │ ├── dynamic_partition.cpp │ │ └── dynamic_stitch.cpp │ ├── Dialect │ │ └── Torch │ │ │ └── Transforms │ │ │ ├── CMakeLists.txt │ │ │ ├── DecomposeOnTorch.cpp │ │ │ ├── FuseOpOnTorch.cpp │ │ │ ├── FuseOpOnTorchPattern.td │ │ │ └── PassDetail.h │ ├── Pipelines │ │ ├── CMakeLists.txt │ │ └── Pipelines.cpp │ ├── Transforms │ │ ├── CMakeLists.txt │ │ ├── CanonicalizeExt.cpp │ │ ├── EliminateUselessOp.cpp │ │ ├── PassDetail.h │ │ ├── RewriteCustomOp.cpp │ │ ├── RewriteEntryFuncName.cpp │ │ └── UnpackPublicFunctionReturn.cpp │ └── Utils │ │ ├── CMakeLists.txt │ │ └── ConvertOpFolder.cpp │ ├── python │ ├── CMakeLists.txt │ ├── TorchFrontendModule.cpp │ ├── setup.py │ ├── test │ │ ├── pytest.ini │ │ ├── test_attn_rewrite.py │ │ ├── test_fx_utils.py │ │ ├── test_fximporter │ │ │ ├── test_ccl.py │ │ │ ├── test_custom_ops.py │ │ │ ├── test_ops_fximporter.py │ │ │ └── utils.py │ │ ├── test_math_custom_ops.py │ │ ├── test_stablehlo_bytecode.py │ │ ├── test_torchscript │ │ │ ├── test_byteir_customcall_ops.py │ │ │ ├── test_compile_option.py │ │ │ ├── test_model.py │ │ │ ├── test_ops.py │ │ │ └── test_torch_custom_ops.py │ │ └── test_utils │ │ │ └── test_jit_transforms.py │ ├── torch_frontend │ │ ├── __init__.py │ │ ├── _mlir_libs │ │ │ └── _site_initialize_0.py │ │ ├── byteir_backend │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── byteir_fusible_pattern.py │ │ │ ├── compilation_cache.py │ │ │ ├── compiled_function.py │ │ │ ├── compiler.py │ │ │ ├── config.py │ │ │ ├── debug.py │ │ │ ├── fx_match_utils.py │ │ │ ├── fx_utils.py │ │ │ ├── inner_compile.py │ │ │ ├── partitioners.py │ │ │ └── utils.py │ │ ├── compile.py │ │ ├── extra_shape_fn.py │ │ ├── flash_attn_op.py │ │ ├── fx_rewrite.py │ │ ├── fx_tracer.py │ │ ├── 
fx_utils.py │ │ ├── tools │ │ │ ├── compiler.py │ │ │ ├── extra_fn.mlir │ │ │ └── gen_extra_library.py │ │ ├── ts_utils.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ └── jit_transforms.py │ └── version.txt │ ├── test │ ├── CMakeLists.txt │ ├── Conversion │ │ ├── ConvertTorchToCcl.mlir │ │ ├── ConvertTorchToCustomCall.mlir │ │ └── ConvertTorchToStablehloExt.mlir │ ├── Dialect │ │ └── Torch │ │ │ ├── DecomposeOnTorch.mlir │ │ │ └── FuseOpOnTorch.mlir │ ├── Pipelines │ │ └── TorchFunctionToTorchPipeline.mlir │ ├── Transforms │ │ ├── EliminateUselessOp.mlir │ │ ├── RewriteEntryFuncName.mlir │ │ └── UnpackPublicFunctionReturn.mlir │ ├── lit.cfg.py │ └── lit.site.cfg.py.in │ └── tools │ ├── CMakeLists.txt │ └── torch-frontend-opt.cpp ├── runtime ├── .gitignore ├── README.md ├── VERSION_NUMBER ├── cmake │ ├── CMakeLists.txt │ ├── Modules │ │ └── FindNCCL.cmake │ ├── brt_common.cmake │ ├── brt_config.h.in │ ├── brt_device_cpu.cmake │ ├── brt_device_cuda.cmake │ ├── brt_device_nccl.cmake │ ├── brt_framework.cmake │ ├── brt_ir.cmake │ ├── brt_provider_cpu.cmake │ ├── brt_provider_cuda.cmake │ ├── brt_provider_nccl.cmake │ ├── brt_python_bindings.cmake │ ├── brt_shared.cmake │ └── brt_unittests.cmake ├── examples │ └── external_project │ │ ├── CMakeLists.txt │ │ └── main.cpp ├── include │ └── brt │ │ ├── backends │ │ ├── README.md │ │ ├── common.h │ │ ├── cpu │ │ │ ├── device │ │ │ │ ├── cpu_device_api.h │ │ │ │ ├── cpu_work_queue.h │ │ │ │ └── llvm │ │ │ │ │ └── jit.h │ │ │ └── providers │ │ │ │ └── default │ │ │ │ └── cpu_provider.h │ │ ├── cuda │ │ │ ├── device │ │ │ │ ├── common │ │ │ │ │ ├── cuda_call.h │ │ │ │ │ ├── dtype.h │ │ │ │ │ ├── fast_divmod.h │ │ │ │ │ └── util.h │ │ │ │ ├── compile │ │ │ │ │ ├── nvrtc.h │ │ │ │ │ └── ptx.h │ │ │ │ ├── cuda_allocator.h │ │ │ │ ├── cuda_device_api.h │ │ │ │ ├── cuda_env.h │ │ │ │ ├── cuda_work_queue.h │ │ │ │ └── utils │ │ │ │ │ └── op_kernel_impl_helpers.h │ │ │ └── providers │ │ │ │ └── default │ │ │ │ ├── ait │ │ │ │ ├── 
model_interface.h │ │ │ │ └── op_registration.h │ │ │ │ ├── codegen │ │ │ │ └── op_registration.h │ │ │ │ ├── copy │ │ │ │ └── op_registration.h │ │ │ │ ├── cuda_provider.h │ │ │ │ ├── cudnn_helper.h │ │ │ │ ├── custom │ │ │ │ └── op_registration.h │ │ │ │ ├── indexing │ │ │ │ └── op_registration.h │ │ │ │ ├── math │ │ │ │ ├── helper.h │ │ │ │ └── op_registration.h │ │ │ │ ├── normalization │ │ │ │ └── op_registration.h │ │ │ │ ├── reduction │ │ │ │ └── op_registration.h │ │ │ │ ├── tensor_generate │ │ │ │ └── op_registration.h │ │ │ │ └── tensor_manipulate │ │ │ │ └── op_registration.h │ │ ├── nccl │ │ │ ├── device │ │ │ │ ├── d_context_nccl.h │ │ │ │ ├── distributed_backend_nccl.h │ │ │ │ └── utils.h │ │ │ └── providers │ │ │ │ ├── nccl_provider.h │ │ │ │ └── op_registration.h │ │ └── rng_state_context.h │ │ └── core │ │ ├── common │ │ ├── code_location.h │ │ ├── common.h │ │ ├── enums.h │ │ ├── exceptions.h │ │ ├── logging │ │ │ ├── capture.h │ │ │ ├── isink.h │ │ │ ├── logging.h │ │ │ ├── macros.h │ │ │ ├── severity.h │ │ │ └── sinks │ │ │ │ ├── cerr_sink.h │ │ │ │ ├── clog_sink.h │ │ │ │ ├── composite_sink.h │ │ │ │ ├── file_sink.h │ │ │ │ └── ostream_sink.h │ │ ├── make_string.h │ │ ├── status.h │ │ ├── string_view.h │ │ └── utils │ │ │ └── math_helper.h │ │ ├── context │ │ ├── execution_context.h │ │ ├── execution_frame.h │ │ └── work_queue.h │ │ ├── distributed │ │ ├── d_context.h │ │ ├── distributed_backend.h │ │ ├── distributed_session.h │ │ └── rendezvous_socket.h │ │ ├── framework │ │ ├── allocator.h │ │ ├── arena.h │ │ ├── bfc_arena.h │ │ ├── brt_mutex.h │ │ ├── device_api.h │ │ ├── dtype.h │ │ ├── event.h │ │ ├── execution_plan.h │ │ ├── execution_provider.h │ │ ├── kernel_registry.h │ │ ├── memory_info.h │ │ ├── op_accessor.h │ │ ├── op_kernel.h │ │ ├── op_kernel_impl_base.h │ │ ├── op_kernel_info.h │ │ └── value.h │ │ ├── ir │ │ ├── builder.h │ │ ├── engine_util.h │ │ ├── graph_info.h │ │ ├── ir.h │ │ ├── op_helper.h │ │ └── util.h │ │ └── session │ 
│ ├── request_context.h │ │ └── session.h ├── lib │ ├── backends │ │ ├── cpu │ │ │ ├── device │ │ │ │ ├── cpu_device_api.cc │ │ │ │ ├── cpu_work_queue.cc │ │ │ │ └── llvm │ │ │ │ │ └── jit.cc │ │ │ └── providers │ │ │ │ └── default │ │ │ │ ├── copy │ │ │ │ ├── copy.cc │ │ │ │ └── copy.h │ │ │ │ ├── cpu_provider.cc │ │ │ │ ├── custom_call │ │ │ │ ├── non_zero.cc │ │ │ │ ├── non_zero.h │ │ │ │ ├── repeat.cc │ │ │ │ ├── repeat.h │ │ │ │ ├── tf_equal.cc │ │ │ │ ├── tf_equal.h │ │ │ │ ├── tf_select.cc │ │ │ │ ├── tf_select.h │ │ │ │ ├── tf_string_to_number.cc │ │ │ │ ├── tf_string_to_number.h │ │ │ │ ├── topk.cc │ │ │ │ └── topk.h │ │ │ │ ├── llvm │ │ │ │ ├── jit.cc │ │ │ │ └── jit.h │ │ │ │ ├── math │ │ │ │ ├── elementwise_ops.cc │ │ │ │ └── elementwise_ops.h │ │ │ │ ├── shape │ │ │ │ ├── shape_compute.cc │ │ │ │ └── shape_compute.h │ │ │ │ ├── tensor_generate │ │ │ │ ├── fill.cc │ │ │ │ ├── fill.h │ │ │ │ ├── rng_state.cc │ │ │ │ └── rng_state.h │ │ │ │ └── typecvt │ │ │ │ └── typecvt.h │ │ ├── cuda │ │ │ ├── device │ │ │ │ ├── common │ │ │ │ │ ├── cuda_call.cc │ │ │ │ │ └── util.cc │ │ │ │ ├── compile │ │ │ │ │ ├── nvrtc.cc │ │ │ │ │ └── ptx.cc │ │ │ │ ├── cuda_allocator.cc │ │ │ │ ├── cuda_device_api.cc │ │ │ │ ├── cuda_env.cc │ │ │ │ └── cuda_work_queue.cc │ │ │ └── providers │ │ │ │ └── default │ │ │ │ ├── ait │ │ │ │ ├── ait.cc │ │ │ │ ├── ait.h │ │ │ │ └── op_registration.cc │ │ │ │ ├── codegen │ │ │ │ ├── op_registration.cc │ │ │ │ ├── ptx.cc │ │ │ │ └── ptx.h │ │ │ │ ├── copy │ │ │ │ ├── copy.cc │ │ │ │ ├── copy.h │ │ │ │ └── op_registration.cc │ │ │ │ ├── cuda_provider.cc │ │ │ │ ├── custom │ │ │ │ ├── custom.cc │ │ │ │ ├── custom.h │ │ │ │ └── op_registration.cc │ │ │ │ ├── indexing │ │ │ │ ├── index_put.h │ │ │ │ ├── index_select.h │ │ │ │ ├── kernels │ │ │ │ │ ├── index_put.cu │ │ │ │ │ ├── index_put.h │ │ │ │ │ ├── index_select.cu │ │ │ │ │ └── index_select.h │ │ │ │ └── op_registration.cc │ │ │ │ ├── math │ │ │ │ ├── batch_matmul.cc │ │ │ │ ├── 
batch_matmul.h │ │ │ │ ├── conv.cc │ │ │ │ ├── conv.h │ │ │ │ ├── conv_backward.cc │ │ │ │ ├── conv_backward.h │ │ │ │ ├── elementwise_ops.cc │ │ │ │ ├── elementwise_ops.h │ │ │ │ ├── helper.cc │ │ │ │ ├── kernels │ │ │ │ │ ├── cutlass_blas.cu │ │ │ │ │ ├── cutlass_blas.h │ │ │ │ │ ├── elementwise.cu │ │ │ │ │ └── elementwise.h │ │ │ │ ├── matmul.cc │ │ │ │ ├── matmul.h │ │ │ │ ├── op_registration.cc │ │ │ │ ├── pool.cc │ │ │ │ ├── pool.h │ │ │ │ ├── pool_grad.cc │ │ │ │ └── pool_grad.h │ │ │ │ ├── normalization │ │ │ │ ├── batch_norm_grad.cc │ │ │ │ ├── batch_norm_grad.h │ │ │ │ ├── batch_norm_training.cc │ │ │ │ ├── batch_norm_training.h │ │ │ │ └── op_registration.cc │ │ │ │ ├── reduction │ │ │ │ ├── kernels │ │ │ │ │ ├── reduction.cu │ │ │ │ │ ├── reduction.h │ │ │ │ │ └── reduction_helper.h │ │ │ │ ├── op_registration.cc │ │ │ │ └── reduce_impl.h │ │ │ │ ├── tensor_generate │ │ │ │ ├── fill.cc │ │ │ │ ├── fill.h │ │ │ │ ├── kernels │ │ │ │ │ ├── fill.cu │ │ │ │ │ ├── fill.h │ │ │ │ │ ├── rng.cu │ │ │ │ │ └── rng.h │ │ │ │ ├── op_registration.cc │ │ │ │ ├── rng.h │ │ │ │ ├── rng_state.cc │ │ │ │ └── rng_state.h │ │ │ │ └── tensor_manipulate │ │ │ │ ├── kernels │ │ │ │ ├── transpose.cu │ │ │ │ └── transpose.h │ │ │ │ ├── op_registration.cc │ │ │ │ ├── transpose.cc │ │ │ │ └── transpose.h │ │ └── nccl │ │ │ ├── device │ │ │ ├── distributed_backend_nccl.cc │ │ │ └── utils.cc │ │ │ └── providers │ │ │ ├── all_gather.cc │ │ │ ├── all_gather.h │ │ │ ├── all_reduce.cc │ │ │ ├── all_reduce.h │ │ │ ├── broadcast.cc │ │ │ ├── broadcast.h │ │ │ ├── nccl_provider.cc │ │ │ ├── op_registration.cc │ │ │ ├── recv.cc │ │ │ ├── recv.h │ │ │ ├── send.cc │ │ │ └── send.h │ └── core │ │ ├── common │ │ ├── common.cc │ │ ├── logging │ │ │ ├── capture.cc │ │ │ ├── logging.cc │ │ │ └── sinks │ │ │ │ └── ostream_sink.cc │ │ ├── status.cc │ │ └── utils │ │ │ └── math_helper.cc │ │ ├── context │ │ └── execution_frame.cc │ │ ├── distributed │ │ ├── distributed_backend.cc │ │ ├── 
distributed_session.cc │ │ └── rendezvous_socket.cc │ │ ├── framework │ │ ├── allocator.cc │ │ ├── bfc_arena.cc │ │ ├── device_api.cc │ │ ├── execution_plan.cc │ │ ├── execution_provider.cc │ │ ├── kernel_registry.cc │ │ ├── op_accessor.cc │ │ └── op_kernel_info.cc │ │ ├── ir │ │ ├── builder.cc │ │ ├── ir.cc │ │ ├── op_helper.cc │ │ └── util.cc │ │ └── session │ │ ├── request_context.cc │ │ └── session.cc ├── python │ ├── README.md │ ├── brt │ │ ├── __init__.py │ │ ├── backend.py │ │ └── utils.py │ ├── examples │ │ ├── add2.mlir │ │ ├── add2.py │ │ ├── ait_op.py │ │ ├── arg_alias.mlir │ │ ├── arg_alias.py │ │ ├── distribute_mlp.py │ │ └── llm.py │ ├── setup.py │ └── src │ │ └── module.cc ├── test │ ├── backends │ │ ├── cpu │ │ │ ├── device │ │ │ │ └── llvm_jit_test.cc │ │ │ └── providers │ │ │ │ └── default │ │ │ │ ├── e2e │ │ │ │ └── e2e_test.cc │ │ │ │ ├── kernel │ │ │ │ ├── copy_test.cc │ │ │ │ ├── non_zero_test.cc │ │ │ │ ├── repeat_test.cc │ │ │ │ ├── rng_state_test.cc │ │ │ │ ├── string_equal_test.cc │ │ │ │ ├── tf_select_test.cc │ │ │ │ ├── tf_string_to_number_test.cc │ │ │ │ ├── topk_test.cc │ │ │ │ └── typecvt_test.cc │ │ │ │ └── request_context_test.cc │ │ ├── cuda │ │ │ ├── device │ │ │ │ ├── allocator_test.cc │ │ │ │ ├── cuda_work_queue_test.cc │ │ │ │ ├── nvrtc_test.cc │ │ │ │ ├── ptx_test.cc │ │ │ │ ├── test_kernels.cu │ │ │ │ └── test_kernels.h │ │ │ └── providers │ │ │ │ └── default │ │ │ │ ├── e2e │ │ │ │ └── resnet_test.cc │ │ │ │ ├── kernel │ │ │ │ ├── ait_test.cc │ │ │ │ ├── alias_test.cc │ │ │ │ ├── batch_matmul_test.cc │ │ │ │ ├── batch_norm_grad_test.cc │ │ │ │ ├── batch_norm_training_test.cc │ │ │ │ ├── codegen_test.cc │ │ │ │ ├── conv_backward_data_test.cc │ │ │ │ ├── conv_backward_filter_test.cc │ │ │ │ ├── conv_test.cc │ │ │ │ ├── copy_test.cc │ │ │ │ ├── elementwise_test.cc │ │ │ │ ├── fill_test.cc │ │ │ │ ├── flash_attn_bwd_test.cc │ │ │ │ ├── flash_attn_fwd_test.cc │ │ │ │ ├── index_test.cc │ │ │ │ ├── matmul_test.cc │ │ │ │ ├── 
multi_stream_test.cc │ │ │ │ ├── pool_grad_test.cc │ │ │ │ ├── pool_test.cc │ │ │ │ ├── reduction_test.cc │ │ │ │ ├── rng_state_test.cc │ │ │ │ ├── rng_test.cc │ │ │ │ └── transpose_test.cc │ │ │ │ ├── request_context_test.cc │ │ │ │ └── session_test.cc │ │ └── nccl │ │ │ ├── device │ │ │ ├── test_distributed_backend.cc │ │ │ └── test_utils.cc │ │ │ └── providers │ │ │ └── test_distributed_session.cc │ ├── common │ │ ├── env.cc │ │ ├── models.cc │ │ └── util.cc │ ├── context │ │ └── exec_frame_test.cc │ ├── distributed │ │ └── test_rendezvous_socket.cc │ ├── exported.ld │ ├── external_kernels │ │ ├── cpu │ │ │ └── kernels.cc │ │ └── cuda │ │ │ ├── kernels.cc │ │ │ ├── kernels.cu │ │ │ └── kernels.h │ ├── framework │ │ ├── allocator_test.cc │ │ └── misc.cc │ ├── include │ │ └── brt │ │ │ └── test │ │ │ └── common │ │ │ ├── config.h │ │ │ ├── cuda │ │ │ └── util.h │ │ │ ├── env.h │ │ │ ├── models.h │ │ │ ├── nccl │ │ │ ├── test_base.h │ │ │ └── test_utils.h │ │ │ └── util.h │ ├── ir │ │ ├── builder_test.cc │ │ └── ir_test.cc │ ├── session │ │ └── session_test.cc │ ├── test_files │ │ ├── AITOp │ │ │ ├── bmm_permute_a100.so │ │ │ ├── bmm_permute_entry.mlir │ │ │ ├── permute_a100.so │ │ │ └── permute_entry.mlir │ │ ├── Distributed │ │ │ ├── add_send.mlir │ │ │ ├── all_gather.mlir │ │ │ ├── all_reduce.mlir │ │ │ ├── broadcast.mlir │ │ │ ├── broadcast2.mlir │ │ │ ├── ccl.mlir │ │ │ ├── ccl.ptx │ │ │ ├── recv.mlir │ │ │ ├── recv_add.mlir │ │ │ └── send.mlir │ │ ├── DynamicShapes │ │ │ └── Add2 │ │ │ │ ├── entry.mlir │ │ │ │ └── shape_fn.ll │ │ ├── LLJIT │ │ │ ├── Case0 │ │ │ │ ├── entry.mlir │ │ │ │ └── host_kernels.ll │ │ │ ├── Case0_v1_0_0 │ │ │ │ ├── entry.mlirbc │ │ │ │ └── host_kernels.bc │ │ │ ├── add.ll │ │ │ ├── tanh.ll │ │ │ ├── transpose_32_64_64.ll │ │ │ ├── transpose_3_224_224.ll │ │ │ └── typecvt.ll │ │ ├── add2_cpu.mlir │ │ ├── add_splat_const_one_cuda.mlir │ │ ├── cuda_add.cu │ │ ├── custom_add_cpu2cuda.mlir │ │ ├── fill_cuda.mlir │ │ ├── flash_attn_bwd.mlir 
│ │ ├── flash_attn_bwd_outputs_dk.data │ │ ├── flash_attn_bwd_outputs_dq.data │ │ ├── flash_attn_bwd_outputs_dv.data │ │ ├── flash_attn_fwd.mlir │ │ ├── flash_attn_fwd_outputs.data │ │ ├── flash_attn_inputs_dout.data │ │ ├── flash_attn_inputs_k.data │ │ ├── flash_attn_inputs_q.data │ │ ├── flash_attn_inputs_v.data │ │ ├── flash_attn_kvcache.mlir │ │ ├── flash_attn_kvcache_inputs_cache_seqlens.data │ │ ├── flash_attn_kvcache_inputs_k.data │ │ ├── flash_attn_kvcache_inputs_kcache.data │ │ ├── flash_attn_kvcache_inputs_q.data │ │ ├── flash_attn_kvcache_inputs_v.data │ │ ├── flash_attn_kvcache_inputs_vcache.data │ │ ├── flash_attn_kvcache_outputs.data │ │ ├── flash_attn_kvcache_outputs_kcache.data │ │ ├── flash_attn_kvcache_outputs_vcache.data │ │ ├── generate_flash_attn_ground_truth.py │ │ ├── group_allocation_hook_cpu_group.mlir │ │ ├── llvm_ptx_add.ptx │ │ ├── llvm_ptx_add_bare_ptr.ptx │ │ ├── nvcc_ptx_add.ptx │ │ ├── resnet18_bw_device.ptx │ │ ├── resnet18_bw_host_cuda.mlir │ │ ├── resnet18_fw_bw_device.ptx │ │ ├── resnet18_fw_bw_host_cuda.mlir │ │ ├── resnet18_fw_device.ptx │ │ ├── resnet18_fw_host_cuda.mlir │ │ ├── rng_cuda.mlir │ │ ├── rng_state_cpu.mlir │ │ ├── rng_state_cuda.mlir │ │ ├── string_equal.mlir │ │ └── string_equal_scalar.mlir │ └── unittest_main │ │ └── test_main.cc └── version.ld ├── scripts ├── apply_patches.sh ├── clang_format_check.sh ├── compiler │ └── build_and_test.sh ├── format_check.py ├── prepare.sh └── runtime │ ├── build_and_test.sh │ └── build_external_project.sh ├── talks ├── ChinaSoftCon-ByteIR.pdf └── c4ml23_poster.pdf └── tests ├── build_and_test_e2e.sh ├── compatibility_test ├── execute.py ├── main.py └── reporting.py └── numerical_test ├── execute.py ├── gen_brt_tests.py ├── main.py ├── mlir_tests ├── cpu_ops │ ├── add.mlir │ ├── batch_norm_inference.mlir │ ├── batch_norm_inference_f16.mlir │ ├── broadcast_in_dim.mlir │ ├── compare_LT_f32.mlir │ ├── compare_LT_f64.mlir │ ├── compare_LT_i32.mlir │ ├── compare_LT_i64.mlir │ ├── 
compare_NE_f32.mlir │ ├── compare_NE_f64.mlir │ ├── compare_NE_i32.mlir │ ├── compare_NE_i64.mlir │ ├── concatenate.mlir │ ├── convert_f16_f32.mlir │ ├── convert_f16_f64.mlir │ ├── convert_f16_i16.mlir │ ├── convert_f16_i32.mlir │ ├── convert_f16_i64.mlir │ ├── convert_f32_f16.mlir │ ├── convert_f32_f64.mlir │ ├── convert_f32_i16.mlir │ ├── convert_f32_i32.mlir │ ├── convert_f32_i32_special_val.mlir │ ├── convert_f32_i64.mlir │ ├── convert_f64_f16.mlir │ ├── convert_f64_f32.mlir │ ├── convert_f64_i16.mlir │ ├── convert_f64_i32.mlir │ ├── convert_f64_i64.mlir │ ├── convert_i16_f16.mlir │ ├── convert_i16_f32.mlir │ ├── convert_i16_f64.mlir │ ├── convert_i16_i32.mlir │ ├── convert_i16_i64.mlir │ ├── convert_i32_f16.mlir │ ├── convert_i32_f32.mlir │ ├── convert_i32_f64.mlir │ ├── convert_i32_i16.mlir │ ├── convert_i32_i64.mlir │ ├── convert_i64_f16.mlir │ ├── convert_i64_f32.mlir │ ├── convert_i64_f64.mlir │ ├── convert_i64_i16.mlir │ ├── convert_i64_i32.mlir │ ├── custom_call_byteir_addn.mlir │ ├── custom_call_byteir_arg_max.mlir │ ├── custom_call_byteir_arg_max_i32.mlir │ ├── custom_call_byteir_arg_min.mlir │ ├── custom_call_byteir_arg_min_i32.mlir │ ├── custom_call_byteir_softmax.mlir │ ├── custom_call_tf_UpperBound.mlir │ ├── divide_f16.mlir │ ├── log_plus_one_f16.mlir │ ├── maximum_f32.mlir │ ├── maximum_f64.mlir │ ├── maximum_i32.mlir │ ├── maximum_i64.mlir │ ├── minimum_f32.mlir │ ├── minimum_f64.mlir │ ├── minimum_i32.mlir │ ├── minimum_i64.mlir │ ├── multiply_f32.mlir │ ├── multiply_f64.mlir │ ├── multiply_i32.mlir │ ├── multiply_i64.mlir │ ├── reduce_f32.mlir │ ├── remainder_i64.mlir │ ├── reshape_slice.mlir │ ├── rng.mlir │ ├── scatter_insert_slice.mlir │ ├── select_f32.mlir │ ├── select_f64.mlir │ ├── select_i64.mlir │ ├── slice_view_like.mlir │ └── subtrace_f16.mlir └── ops │ ├── add.mlir │ ├── bmm_rcr.mlir │ ├── bmm_rrc.mlir │ ├── bmm_rrr_add_f16.mlir │ ├── bmm_rrr_f16.mlir │ ├── bmm_rrr_permute_f16.mlir │ ├── bmm_rrr_permute_f32.mlir │ ├── broadcast.mlir 
│ ├── broadcast1.mlir │ ├── compare_eq.mlir │ ├── compare_lt.mlir │ ├── concat.mlir │ ├── concat2.mlir │ ├── convert_f16_f32.mlir │ ├── convert_f32_f16.mlir │ ├── divide.mlir │ ├── gather.mlir │ ├── gemm_crr_f16.mlir │ ├── gemm_rrr_f16.mlir │ ├── gemm_rrr_f32.mlir │ ├── insert_slice.mlir │ ├── layernorm.mlir │ ├── logistic.mlir │ ├── mul_f16.mlir │ ├── mul_f32.mlir │ ├── negate.mlir │ ├── power.mlir │ ├── reduce_first_dim.mlir │ ├── reduce_sum.mlir │ ├── reduce_sum_2d.mlir │ ├── reduce_sum_first_2d.mlir │ ├── rsqrt.mlir │ ├── scatter.mlir │ ├── scatter_insert_slice.mlir │ ├── select.mlir │ ├── slice.mlir │ ├── softmax.mlir │ ├── transpose0312.mlir │ ├── transpose102.mlir │ ├── transpose1023.mlir │ ├── transpose120.mlir │ ├── transpose1203.mlir │ ├── transpose2013.mlir │ └── transpose2d.mlir ├── profiler.py ├── reporting.py ├── testset.py ├── torch_dynamo_e2e_testing ├── backend.py ├── execute.py └── test_suite │ └── test_flash_attn.py └── torch_e2e_testing ├── framework.py ├── registry.py └── test_suite ├── __init__.py └── basic.py /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: LLVM 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug 3 | about: Create a bug report 4 | title: "[BUG]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior. The easier it is to reproduce the faster it will get maintainer attention. 15 | 16 | **Expected behavior** 17 | A clear and concise description of what you expected to happen. 18 | 19 | **Stack trace/logs** 20 | If applicable, add the stack trace or logs from the time of the error. 
21 | 22 | **Environment** 23 | 24 | **Proposed fix** 25 | If you have a proposal for how to fix the issue state it here or link to a PR. 26 | 27 | **Additional context** 28 | Add any other context about the problem here. 29 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question 3 | about: Ask a general question about ByteIR 4 | title: "[QUESTION]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Your question** 11 | Ask a clear and concise question about ByteIR. 12 | -------------------------------------------------------------------------------- /compiler/.gitignore: -------------------------------------------------------------------------------- 1 | python/byteir.egg-info/ 2 | python/byteir/version.py -------------------------------------------------------------------------------- /compiler/dialects/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(include) 2 | add_subdirectory(lib) -------------------------------------------------------------------------------- /compiler/dialects/include/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(byteir) -------------------------------------------------------------------------------- /compiler/dialects/include/byteir/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Dialect) -------------------------------------------------------------------------------- /compiler/dialects/include/byteir/Dialect/Ace/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_dialect(AceOps ace) 2 | add_mlir_doc(AceOps AceOps Dialects/ -gen-op-doc) 3 | 4 | set(LLVM_TARGET_DEFINITIONS AceOps.td) 5 | 
mlir_tablegen(AceOpsAttributes.h.inc -gen-attrdef-decls) 6 | mlir_tablegen(AceOpsAttributes.cpp.inc -gen-attrdef-defs) 7 | add_public_tablegen_target(MLIRAceOpsAttrIncGen) 8 | -------------------------------------------------------------------------------- /compiler/dialects/include/byteir/Dialect/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Ace) 2 | add_subdirectory(Ccl) -------------------------------------------------------------------------------- /compiler/dialects/include/byteir/Dialect/Ccl/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) -------------------------------------------------------------------------------- /compiler/dialects/include/byteir/Dialect/Ccl/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_dialect(CclOps ccl) 2 | add_mlir_doc(CclOps CclOps Dialects/ -gen-op-doc) 3 | add_mlir_interface(CclOpInterface) 4 | -------------------------------------------------------------------------------- /compiler/dialects/lib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Dialect) -------------------------------------------------------------------------------- /compiler/dialects/lib/Dialect/Ace/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(MLIRAceDialect 2 | IR/AceDialect.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/mlir/Dialect/Ace 6 | 7 | DEPENDS 8 | MLIRAceOpsIncGen 9 | MLIRAceOpsAttrIncGen 10 | 11 | LINK_LIBS PUBLIC 12 | MLIRIR 13 | MLIRSupport 14 | MLIRSideEffectInterfaces 15 | ) 16 | -------------------------------------------------------------------------------- /compiler/dialects/lib/Dialect/CMakeLists.txt: 
-------------------------------------------------------------------------------- 1 | add_subdirectory(Ace) 2 | add_subdirectory(Ccl) -------------------------------------------------------------------------------- /compiler/dialects/lib/Dialect/Ccl/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) -------------------------------------------------------------------------------- /compiler/dialects/lib/Dialect/Ccl/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(MLIRCclDialect 2 | CclOps.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/Ccl 6 | 7 | DEPENDS 8 | MLIRCclOpsIncGen 9 | 10 | LINK_LIBS PUBLIC 11 | MLIRIR 12 | MLIRSupport 13 | ) 14 | -------------------------------------------------------------------------------- /compiler/doc/passes.md: -------------------------------------------------------------------------------- 1 | # Useful Passes 2 | 3 | This is a placeholder for passes we built and we will call. 
4 | -------------------------------------------------------------------------------- /compiler/include/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(byteir) 2 | -------------------------------------------------------------------------------- /compiler/include/byteir/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Conversion) 2 | add_subdirectory(Dialect) 3 | add_subdirectory(Transforms) -------------------------------------------------------------------------------- /compiler/include/byteir/Conversion/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name ByteIRConversion) 3 | add_public_tablegen_target(ByteIRConversionPassIncGen) 4 | # add_mlir_doc(Passes ConversionPasses ./ -gen-pass-doc) -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Ace/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name ByteIRAce) 3 | add_public_tablegen_target(ByteIRAcePassIncGen) 4 | -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Affine/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name ByteIRAffine) 3 | add_public_tablegen_target(ByteIRAffinePassIncGen) 4 | -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Byre/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_dialect(ByreOps byre) 2 | 
add_mlir_doc(ByreOps ByreOps Dialects/ -gen-op-doc) 3 | 4 | set(LLVM_TARGET_DEFINITIONS ByreBase.td) 5 | mlir_tablegen(ByreOpInterfaces.h.inc -gen-op-interface-decls) 6 | mlir_tablegen(ByreOpInterfaces.cpp.inc -gen-op-interface-defs) 7 | mlir_tablegen(ByreEnums.h.inc -gen-enum-decls) 8 | mlir_tablegen(ByreEnums.cpp.inc -gen-enum-defs) 9 | 10 | add_public_tablegen_target(MLIRByreOpInterfacesIncGen) 11 | 12 | set(LLVM_TARGET_DEFINITIONS Passes.td) 13 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name ByteIRByre) 14 | add_public_tablegen_target(ByteIRByrePassIncGen) 15 | 16 | add_subdirectory(Serialization) 17 | -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Ace) 2 | add_subdirectory(Affine) 3 | add_subdirectory(Byre) 4 | add_subdirectory(Cat) 5 | add_subdirectory(Ccl) 6 | add_subdirectory(GPU) 7 | add_subdirectory(Lace) 8 | add_subdirectory(Linalg) 9 | add_subdirectory(MemRef) 10 | add_subdirectory(mhlo) 11 | add_subdirectory(SCF) 12 | add_subdirectory(Shape) 13 | add_subdirectory(Tensor) 14 | add_subdirectory(Transform) 15 | add_subdirectory(Vector) 16 | add_subdirectory(Lccl) 17 | -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Cat/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Cat/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_dialect(CatOps cat) 2 | add_mlir_doc(CatOps CatOps Dialects/ -gen-op-doc) 3 | 4 | set(LLVM_TARGET_DEFINITIONS CatBase.td) 5 | mlir_tablegen(CatOpInterfaces.h.inc -gen-op-interface-decls) 6 | mlir_tablegen(CatOpInterfaces.cpp.inc 
-gen-op-interface-defs) 7 | 8 | add_public_tablegen_target(MLIRCatOpInterfacesIncGen) -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Ccl/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(TransformOps) 2 | 3 | set(LLVM_TARGET_DEFINITIONS Passes.td) 4 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name ByteIRCcl) 5 | add_public_tablegen_target(ByteIRCclPassIncGen) -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Ccl/TransformOps/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS CclTransformOps.td) 2 | mlir_tablegen(CclTransformOps.h.inc -gen-op-decls) 3 | mlir_tablegen(CclTransformOps.cpp.inc -gen-op-defs) 4 | add_public_tablegen_target(MLIRCclTransformOpsIncGen) 5 | 6 | add_mlir_doc(CclTransformOps CclTransformOps Dialects/ -gen-op-doc) -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/GPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(TransformOps) 2 | 3 | set(LLVM_TARGET_DEFINITIONS Passes.td) 4 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name ByteIRGPU) 5 | add_public_tablegen_target(ByteIRGPUPassIncGen) 6 | -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/GPU/TransformOps/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS GPUExtTransformOps.td) 2 | mlir_tablegen(GPUExtTransformOps.h.inc -gen-op-decls) 3 | mlir_tablegen(GPUExtTransformOps.cpp.inc -gen-op-defs) 4 | add_public_tablegen_target(MLIRGPUExtTransformOpsIncGen) 5 | 6 | add_mlir_doc(GPUExtTransformOps GPUExtTransformOps Dialects/ -gen-op-doc) 
-------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Lace/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_dialect(LaceOps lace) 2 | add_mlir_doc(LaceOps LaceOps Dialects/ -gen-op-doc) 3 | 4 | set(LLVM_TARGET_DEFINITIONS LaceBase.td) 5 | mlir_tablegen(LaceOpInterfaces.h.inc -gen-op-interface-decls) 6 | mlir_tablegen(LaceOpInterfaces.cpp.inc -gen-op-interface-defs) 7 | 8 | add_public_tablegen_target(MLIRLaceOpInterfacesIncGen) 9 | -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Lccl/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_dialect(LcclOps lccl) 2 | add_mlir_doc(LcclOps LcclOps Dialects/ -gen-op-doc) 3 | -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Linalg/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | add_subdirectory(TransformOps) 3 | 4 | set(LLVM_TARGET_DEFINITIONS Passes.td) 5 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name ByteIRLinalg) 6 | add_public_tablegen_target(ByteIRLinalgPassIncGen) 7 | -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Linalg/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_dialect(LinalgExtOps linalg_ext) 2 | add_mlir_doc(LinalgExtOps LinalgExtOps Dialects/ -gen-op-doc) 3 | 4 | 5 | set(LLVM_TARGET_DEFINITIONS LinalgExtInterfaces.td) 6 | mlir_tablegen(LinalgExtOpInterfaces.h.inc -gen-op-interface-decls) 7 | mlir_tablegen(LinalgExtOpInterfaces.cpp.inc -gen-op-interface-defs) 8 | add_public_tablegen_target(MLIRLinalgExtInterfacesIncGen) 9 | add_dependencies(MLIRLinalgExtOpsIncGen MLIRLinalgExtInterfacesIncGen) 10 | 
-------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Linalg/TransformOps/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS LinalgExtTransformOps.td) 2 | mlir_tablegen(LinalgExtTransformOps.h.inc -gen-op-decls) 3 | mlir_tablegen(LinalgExtTransformOps.cpp.inc -gen-op-defs) 4 | add_public_tablegen_target(MLIRLinalgExtTransformOpsIncGen) 5 | 6 | add_mlir_doc(LinalgExtTransformOps LinalgExtTransformOps Dialects/ -gen-op-doc) -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/MemRef/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name ByteIRMemRef) 3 | add_public_tablegen_target(ByteIRMemRefPassIncGen) 4 | -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/SCF/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name ByteIRSCF) 3 | add_public_tablegen_target(ByteIRSCFPassIncGen) 4 | -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Shape/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | 3 | set(LLVM_TARGET_DEFINITIONS Passes.td) 4 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name ByteIRShape) 5 | add_public_tablegen_target(ByteIRShapePassIncGen) 6 | -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Shape/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_dialect(ShapeExtOps 
shape_ext) 2 | add_mlir_doc(ShapeExtOps ShapeExtOps Dialects/ -gen-op-doc) 3 | 4 | -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Tensor/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name ByteIRTensor) 3 | add_public_tablegen_target(ByteIRTensorPassIncGen) 4 | -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Transform/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | 3 | set(LLVM_TARGET_DEFINITIONS Passes.td) 4 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name ByteIRTransform) 5 | add_public_tablegen_target(ByteIRTransformPassIncGen) 6 | -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Transform/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS TransformExtOps.td) 2 | mlir_tablegen(TransformExtOps.h.inc -gen-op-decls) 3 | mlir_tablegen(TransformExtOps.cpp.inc -gen-op-defs) 4 | add_public_tablegen_target(MLIRTransformExtOpsIncGen) 5 | 6 | add_mlir_doc(TransformExtOps TransformExtOps Dialects/ -gen-op-doc) -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Vector/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Transforms) -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/Vector/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name 
ByteIRVector) 3 | add_public_tablegen_target(ByteIRVectorPassIncGen) 4 | -------------------------------------------------------------------------------- /compiler/include/byteir/Dialect/mhlo/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name ByteIRMhlo) 3 | add_public_tablegen_target(ByteIRMhloPassIncGen) 4 | -------------------------------------------------------------------------------- /compiler/include/byteir/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name ByteIRTransforms) 3 | add_public_tablegen_target(ByteIRTransformsPassIncGen) 4 | # add_mlir_doc(Passes TransformsPasses ./ -gen-pass-doc) -------------------------------------------------------------------------------- /compiler/lib/Analysis/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_library(ByteIRAnalysis 2 | DimFlag.cpp 3 | Liveness.cpp 4 | OpDependence.cpp 5 | ShapeAnalysis.cpp 6 | SideEffect.cpp 7 | SymbolicShape.cpp 8 | UseRange.cpp 9 | 10 | ADDITIONAL_HEADER_DIRS 11 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Analysis 12 | 13 | LINK_LIBS PUBLIC 14 | MLIRAnalysis 15 | MLIRBufferizationTransforms 16 | MLIRIR 17 | MLIRShapeDialect 18 | MLIRTensorDialect 19 | ) -------------------------------------------------------------------------------- /compiler/lib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Analysis) 2 | add_subdirectory(Conversion) 3 | add_subdirectory(Dialect) 4 | add_subdirectory(Pipelines) 5 | add_subdirectory(Stat) 6 | add_subdirectory(Target) 7 | add_subdirectory(Transforms) 8 | add_subdirectory(Utils) 9 | 10 | # note: CAPI depends on byteir property, so add it at last 11 
| add_subdirectory(CAPI) 12 | -------------------------------------------------------------------------------- /compiler/lib/Conversion/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Common) 2 | add_subdirectory(FuncToByre) 3 | add_subdirectory(GPUToNVVM) 4 | add_subdirectory(HloToByreTensor) 5 | add_subdirectory(HloToCat) 6 | add_subdirectory(HloToTensor) 7 | add_subdirectory(MemrefToByre) 8 | add_subdirectory(ToAce) 9 | add_subdirectory(ToAIT) 10 | add_subdirectory(ToByre) 11 | add_subdirectory(ToGPU) 12 | add_subdirectory(ToHlo) 13 | add_subdirectory(ToLinalg) 14 | add_subdirectory(ToLLVM) 15 | add_subdirectory(ToPTX) 16 | add_subdirectory(LcclToByre) 17 | -------------------------------------------------------------------------------- /compiler/lib/Conversion/Common/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_conversion_library(ByteIRConversionCommon 2 | FunctionSupport.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Conversion/Common 6 | 7 | LINK_LIBS PUBLIC 8 | MLIRIR 9 | MLIRMemRefDialect 10 | MLIRTransforms 11 | ) 12 | -------------------------------------------------------------------------------- /compiler/lib/Conversion/FuncToByre/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_conversion_library(ByteIRFuncToByre 2 | FuncToByre.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Conversion/FuncToByre 6 | 7 | DEPENDS 8 | MLIRByreDialect 9 | ByteIRConversionPassIncGen 10 | ByteIRConversionCommon 11 | ByteIRMhloUtils 12 | 13 | LINK_LIBS PUBLIC 14 | MLIRArithDialect 15 | MLIRByreDialect 16 | MLIRIR 17 | MLIRMemRefDialect 18 | MLIRTensorDialect 19 | MLIRTransforms 20 | ByteIRConversionCommon 21 | ByteIRMhloUtils 22 | ) 23 | -------------------------------------------------------------------------------- 
/compiler/lib/Conversion/GPUToNVVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_conversion_library(ByteIRGPUToNVVM 2 | GPUToNVVM.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Conversion/GPUToNVVM 6 | 7 | DEPENDS 8 | ByteIRConversionPassIncGen 9 | ByteIRConversionCommon 10 | 11 | LINK_LIBS PUBLIC 12 | MLIRArithToLLVM 13 | MLIRFuncToLLVM 14 | MLIRGPUDialect 15 | MLIRGPUToGPURuntimeTransforms 16 | MLIRGPUToNVVMTransforms 17 | MLIRLLVMCommonConversion 18 | MLIRLLVMDialect 19 | MLIRMathTransforms 20 | MLIRMemRefDialect 21 | MLIRMemRefToLLVM 22 | MLIRNVVMDialect 23 | MLIRPass 24 | MLIRTransformUtils 25 | ) 26 | -------------------------------------------------------------------------------- /compiler/lib/Conversion/HloToByreTensor/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_conversion_library(ByteIRHloToByreTensor 2 | HloToByreCustom.cpp 3 | HloToByreTensor.cpp 4 | 5 | ADDITIONAL_HEADER_DIRS 6 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Conversion/HloToByreTensor 7 | 8 | DEPENDS 9 | MLIRByreDialect 10 | ByteIRConversionPassIncGen 11 | ByteIRConversionCommon 12 | ByteIRMhloUtils 13 | 14 | LINK_LIBS PUBLIC 15 | MhloDialect 16 | MLIRAceDialect 17 | MLIRArithDialect 18 | MLIRByreDialect 19 | MLIRIR 20 | MLIRTensorDialect 21 | MLIRTransforms 22 | ByteIRConversionCommon 23 | ByteIRMhloUtils 24 | ) 25 | -------------------------------------------------------------------------------- /compiler/lib/Conversion/HloToTensor/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_conversion_library(ByteIRHloToTensor 2 | ConvertHloToTensor.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Conversion/HloToTensor 6 | 7 | DEPENDS 8 | MLIRTensorDialect 9 | MLIRArithDialect 10 | ByteIRConversionPassIncGen 11 | ByteIRConversionCommon 12 | ByteIRMhloUtils 13 | 14 | 
LINK_LIBS PUBLIC 15 | MhloDialect 16 | MLIRTensorDialect 17 | MLIRArithDialect 18 | MLIRIR 19 | ByteIRConversionCommon 20 | ) 21 | -------------------------------------------------------------------------------- /compiler/lib/Conversion/LcclToByre/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_conversion_library(ByteIRLcclToByre 2 | LcclToByre.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Conversion/LcclToByre 6 | 7 | DEPENDS 8 | MLIRByreDialect 9 | ByteIRConversionPassIncGen 10 | ByteIRConversionCommon 11 | 12 | LINK_LIBS PUBLIC 13 | MLIRArithDialect 14 | MLIRByreDialect 15 | MLIRIR 16 | MLIRMemRefDialect 17 | MLIRTransforms 18 | ByteIRConversionCommon 19 | ) 20 | -------------------------------------------------------------------------------- /compiler/lib/Conversion/MemrefToByre/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_conversion_library(ByteIRMemrefToByre 2 | MemrefToByre.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Conversion/MemrefToByre 6 | 7 | DEPENDS 8 | MLIRByreDialect 9 | ByteIRConversionPassIncGen 10 | ByteIRConversionCommon 11 | 12 | LINK_LIBS PUBLIC 13 | MLIRArithDialect 14 | MLIRByreDialect 15 | MLIRIR 16 | MLIRMemRefDialect 17 | MLIRTensorDialect 18 | MLIRTransforms 19 | ByteIRConversionCommon 20 | ) 21 | -------------------------------------------------------------------------------- /compiler/lib/Conversion/ToAIT/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_conversion_library(ByteIRToAIT 2 | GenAITConfig.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Conversion/ToAIT 6 | 7 | DEPENDS 8 | ByteIRConversionPassIncGen 9 | 10 | LINK_LIBS PUBLIC 11 | MLIRIR 12 | MLIRBufferizationTransforms 13 | ByteIRUtils 14 | ) 15 | 
-------------------------------------------------------------------------------- /compiler/lib/Conversion/ToAce/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS MhloToAceActivationPattern.td) 2 | mlir_tablegen(MhloToAceActivationPattern.inc -gen-rewriters) 3 | add_public_tablegen_target(MhloToAceActivationPatternIncGen) 4 | 5 | add_byteir_conversion_library(ByteIRToAce 6 | MhloToAce.cpp 7 | 8 | DEPENDS 9 | ByteIRConversionPassIncGen 10 | ByteIRConversionCommon 11 | MhloToAceActivationPatternIncGen 12 | MLIRAceDialect 13 | 14 | LINK_LIBS PUBLIC 15 | ByteIRConversionCommon 16 | ByteIRUtils 17 | MhloDialect 18 | MLIRAceDialect 19 | MLIRIR 20 | ) 21 | 22 | target_include_directories(ByteIRToAce PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) 23 | -------------------------------------------------------------------------------- /compiler/lib/Conversion/ToByre/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_conversion_library(ByteIRToByre 2 | ToByre.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Conversion/ToByre 6 | 7 | DEPENDS 8 | MLIRAceDialect 9 | MLIRByreDialect 10 | MLIRLaceDialect 11 | ByteIRConversionPassIncGen 12 | ByteIRConversionCommon 13 | ByteIRMhloUtils 14 | 15 | LINK_LIBS PUBLIC 16 | MLIRAceDialect 17 | MLIRByreDialect 18 | MLIRIR 19 | MLIRLaceDialect 20 | MLIRMemRefDialect 21 | MLIRTransforms 22 | ByteIRConversionCommon 23 | ByteIRMhloUtils 24 | ) 25 | -------------------------------------------------------------------------------- /compiler/lib/Conversion/ToGPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_conversion_library(ByteIRToGPU 2 | CoalescedForToGPU.cpp 3 | FuncToGPU.cpp 4 | Utils.cpp 5 | 6 | ADDITIONAL_HEADER_DIRS 7 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Conversion/ToGPU 8 | 9 | DEPENDS 10 | ByteIRConversionPassIncGen 11 | 
ByteIRConversionCommon 12 | ByteIRUtils 13 | 14 | LINK_LIBS PUBLIC 15 | MLIRIR 16 | MLIRAffineDialect 17 | MLIRGPUDialect 18 | MLIRMemRefDialect 19 | MLIRSCFDialect 20 | MLIRTransforms 21 | ByteIRConversionCommon 22 | ByteIRUtils 23 | ) 24 | -------------------------------------------------------------------------------- /compiler/lib/Conversion/ToHlo/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS ArithToMhloPattern.td) 2 | mlir_tablegen(ArithToMhloPattern.inc -gen-rewriters) 3 | add_public_tablegen_target(ArithToMhloPatternIncGen) 4 | 5 | add_byteir_conversion_library(ByteIRToMhlo 6 | ArithToMhlo.cpp 7 | 8 | DEPENDS 9 | ByteIRConversionPassIncGen 10 | ArithToMhloPatternIncGen 11 | 12 | LINK_LIBS PUBLIC 13 | MhloDialect 14 | MLIRArithDialect 15 | MLIRIR 16 | ) 17 | 18 | target_include_directories(ByteIRToMhlo PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) 19 | -------------------------------------------------------------------------------- /compiler/lib/Conversion/ToLLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_conversion_library(ByteIRToLLVM 2 | CollectFuncToLLVM.cpp 3 | GenLLVMConfig.cpp 4 | 5 | ADDITIONAL_HEADER_DIRS 6 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Conversion/ToLLVM 7 | 8 | DEPENDS 9 | ByteIRConversionPassIncGen 10 | 11 | LINK_LIBS PUBLIC 12 | MLIRIR 13 | MLIRBufferizationTransforms 14 | ByteIRUtils 15 | ) 16 | -------------------------------------------------------------------------------- /compiler/lib/Conversion/ToLinalg/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_conversion_library(ByteIRToLinalg 2 | HloToLinalg.cpp 3 | LinalgExtToLinalg.cpp 4 | MemrefCopyToLinalg.cpp 5 | TensorToLinalg.cpp 6 | UnrealizedCastToLinalg.cpp 7 | 8 | ADDITIONAL_HEADER_DIRS 9 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Conversion/HloToLinalg 10 | 11 | DEPENDS 12 | 
ByteIRConversionPassIncGen 13 | ByteIRConversionCommon 14 | HloToLinalgUtils 15 | 16 | LINK_LIBS PUBLIC 17 | MLIRIR 18 | MhloDialect 19 | MhloToLinalg 20 | MLIRRewrite 21 | MLIRLinalgDialect 22 | MLIRMathDialect 23 | MLIRMemRefDialect 24 | MLIRSCFDialect 25 | MLIRTransforms 26 | ByteIRConversionCommon 27 | HloToLinalgUtils 28 | ) 29 | -------------------------------------------------------------------------------- /compiler/lib/Conversion/ToPTX/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_conversion_library(ByteIRToPTX 2 | CollectGPUKernel.cpp 3 | GenPTXConfig.cpp 4 | 5 | ADDITIONAL_HEADER_DIRS 6 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Conversion/ToPTX 7 | 8 | DEPENDS 9 | ByteIRConversionPassIncGen 10 | 11 | LINK_LIBS PUBLIC 12 | MLIRIR 13 | MLIRMemRefDialect 14 | MLIRGPUDialect 15 | MLIRTransforms 16 | ) 17 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/Ace/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(ByteIRAcePasses 2 | Transforms/BufferizableOpInterfaceImpl.cpp 3 | Transforms/Bufferize.cpp 4 | 5 | ADDITIONAL_HEADER_DIRS 6 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/Ace 7 | 8 | DEPENDS 9 | ByteIRAcePassIncGen 10 | MLIRAceDialect 11 | MLIRLaceDialect 12 | 13 | LINK_LIBS PUBLIC 14 | MLIRIR 15 | MLIRSupport 16 | 17 | MLIRAceDialect 18 | MLIRBufferizationDialect 19 | MLIRBufferizationTransforms 20 | MLIRLaceDialect 21 | MLIRMemRefDialect 22 | ) 23 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/Affine/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(ByteIRAffinePasses 2 | Transforms/AffineLoopFusionEx.cpp 3 | Transforms/InsertTrivialAffineLoop.cpp 4 | Transforms/RewriteAffineToMemref.cpp 5 | 6 | ADDITIONAL_HEADER_DIRS 7 | 
${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/Affine 8 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/Affine/Transforms 9 | 10 | DEPENDS 11 | ByteIRAffinePassIncGen 12 | ByteIRUtils 13 | MLIRAffineDialect 14 | MLIRMemRefDialect 15 | 16 | LINK_LIBS PUBLIC 17 | ByteIRUtils 18 | MLIRIR 19 | MLIRAffineDialect 20 | MLIRMemRefDialect 21 | MLIRSideEffectInterfaces 22 | MLIRSupport 23 | ) 24 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Ace) 2 | add_subdirectory(Affine) 3 | add_subdirectory(Byre) 4 | add_subdirectory(Cat) 5 | add_subdirectory(Ccl) 6 | add_subdirectory(GPU) 7 | add_subdirectory(Lace) 8 | add_subdirectory(Linalg) 9 | add_subdirectory(MemRef) 10 | add_subdirectory(mhlo) 11 | add_subdirectory(SCF) 12 | add_subdirectory(Shape) 13 | add_subdirectory(Tensor) 14 | add_subdirectory(Transform) 15 | add_subdirectory(Vector) 16 | add_subdirectory(Lccl) 17 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/Cat/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) -------------------------------------------------------------------------------- /compiler/lib/Dialect/Cat/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(MLIRCatDialect 2 | CatDialect.cpp 3 | 4 | DEPENDS 5 | MLIRCatOpsIncGen 6 | MLIRCatOpInterfacesIncGen 7 | 8 | LINK_LIBS PUBLIC 9 | MLIRIR 10 | MLIRSupport 11 | ) 12 | 13 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/Ccl/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(TransformOps) 2 | add_subdirectory(Transforms) 
-------------------------------------------------------------------------------- /compiler/lib/Dialect/Ccl/TransformOps/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(MLIRCclTransformOps 2 | CclTransformOps.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/Ccl/TransformOps 6 | 7 | DEPENDS 8 | MLIRCclDialect 9 | MLIRCclTransformOpsIncGen 10 | 11 | 12 | LINK_LIBS PUBLIC 13 | MLIRCclDialect 14 | MLIRIR 15 | MLIRParser 16 | MLIRPDLDialect 17 | MLIRSideEffectInterfaces 18 | MLIRTransformDialect 19 | ) 20 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/Ccl/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(ByteIRCclPasses 2 | CclMoveDown.cpp 3 | CclBufferizeOpInterfaceImpl.cpp 4 | 5 | ADDITIONAL_HEADER_DIRS 6 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/Ccl 7 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/Ccl/Transforms 8 | 9 | DEPENDS 10 | ByteIRCclPassIncGen 11 | 12 | LINK_LIBS PUBLIC 13 | MLIRLcclDialect 14 | MLIRIR 15 | MhloDialect 16 | MLIRSupport 17 | ) 18 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/GPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Transforms) 2 | add_subdirectory(TransformOps) -------------------------------------------------------------------------------- /compiler/lib/Dialect/GPU/TransformOps/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(MLIRGPUExtTransformOps 2 | GPUExtTransformOps.cpp 3 | Utils.cpp 4 | 5 | ADDITIONAL_HEADER_DIRS 6 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/GPU/TransformOps 7 | 8 | 9 | DEPENDS 10 | MLIRGPUExtTransformOpsIncGen 11 | 12 | LINK_LIBS PUBLIC 13 | 
MLIRGPUDialect 14 | MLIRGPUTransforms 15 | MLIRIR 16 | MLIRParser 17 | MLIRSideEffectInterfaces 18 | MLIRTransformDialect 19 | MLIRPDLDialect 20 | MLIRSCFDialect 21 | ) 22 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/GPU/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(ByteIRGPUPasses 2 | GPUBlockSwizzle.cpp 3 | GPUDistributeSharedMemoryCopy.cpp 4 | GPUDistributeToWarp.cpp 5 | GPUTensorCoreVectorization.cpp 6 | GPUPackSharedMemoryAlloc.cpp 7 | OptimizeVectorTransfer.cpp 8 | RemoveTrivialLoops.cpp 9 | ShmAllocaToWorkgroupArg.cpp 10 | Utils.cpp 11 | 12 | ADDITIONAL_HEADER_DIRS 13 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/GPU 14 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/GPU/Transforms 15 | 16 | DEPENDS 17 | ByteIRGPUPassIncGen 18 | ByteIRUtils 19 | MLIRGPUDialect 20 | 21 | LINK_LIBS PUBLIC 22 | ByteIRUtils 23 | MLIRIR 24 | MLIRGPUDialect 25 | MLIRMemRefDialect 26 | MLIRSupport 27 | ) 28 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/Lace/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(MLIRLaceDialect 2 | IR/LaceDialect.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/mlir/Dialect/Lace 6 | 7 | DEPENDS 8 | MLIRAceOpsIncGen # ace types 9 | MLIRLaceOpsIncGen 10 | MLIRLaceOpInterfacesIncGen 11 | 12 | LINK_LIBS PUBLIC 13 | MLIRIR 14 | MLIRSupport 15 | MLIRViewLikeInterface 16 | MLIRAceDialect 17 | ) 18 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/Lccl/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/Lccl/IR/CMakeLists.txt: 
-------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(MLIRLcclDialect 2 | LcclOps.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | 6 | DEPENDS 7 | MLIRLcclOpsIncGen 8 | 9 | LINK_LIBS PUBLIC 10 | MLIRIR 11 | MLIRSupport 12 | ) 13 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/Linalg/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | add_subdirectory(TransformOps) 3 | add_subdirectory(Transforms) 4 | add_subdirectory(Util) -------------------------------------------------------------------------------- /compiler/lib/Dialect/Linalg/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(MLIRLinalgExt 2 | LinalgExtInterfaces.cpp 3 | LinalgExtOps.cpp 4 | 5 | DEPENDS 6 | MLIRLinalgExtInterfacesIncGen 7 | MLIRLinalgExtOpsIncGen 8 | 9 | LINK_LIBS PUBLIC 10 | MLIRAnalysis 11 | MLIRIR 12 | MLIRLinalgUtils 13 | MLIRSupport 14 | MLIRLinalgExtUtils 15 | MLIRSCFExtUtils 16 | ) 17 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/Linalg/TransformOps/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(MLIRLinalgExtTransformOps 2 | LinalgExtTransformOps.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/Linalg/TransformOps 6 | 7 | DEPENDS 8 | MLIRCclDialect 9 | MLIRLinalgExtTransformOpsIncGen 10 | ByteIRLinalgPasses 11 | 12 | LINK_LIBS PUBLIC 13 | ByteIRLinalgPasses 14 | MLIRAffineDialect 15 | MLIRArithDialect 16 | MLIRCclDialect 17 | MLIRIR 18 | MLIRLinalgDialect 19 | MLIRLinalgTransforms 20 | MLIRParser 21 | MLIRPDLDialect 22 | MLIRSCFDialect 23 | MLIRSideEffectInterfaces 24 | MLIRTensorTilingInterfaceImplExt 25 | MLIRTransformDialect 26 | MLIRVectorDialect 27 | ) 28 | 
-------------------------------------------------------------------------------- /compiler/lib/Dialect/Linalg/Util/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(MLIRLinalgExtUtils 2 | Util.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/Linalg 6 | 7 | LINK_LIBS PUBLIC 8 | MLIRIR 9 | MLIRLinalgDialect 10 | ) 11 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/SCF/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Transforms) 2 | add_subdirectory(Util) 3 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/SCF/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(ByteIRSCFPasses 2 | ForallCollapsing.cpp 3 | FuseNestedForall.cpp 4 | InsertTrivialSCFLoop.cpp 5 | TilingInterfaceToSCFFor.cpp 6 | RemoveSingleIterationLoop.cpp 7 | 8 | ADDITIONAL_HEADER_DIRS 9 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/SCF 10 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/SCF/Transforms 11 | 12 | DEPENDS 13 | ByteIRSCFPassIncGen 14 | ByteIRUtils 15 | 16 | LINK_LIBS PUBLIC 17 | ByteIRUtils 18 | MLIRIR 19 | MLIRMemRefDialect 20 | MLIRSCFDialect 21 | MLIRSCFTransforms 22 | MLIRSideEffectInterfaces 23 | MLIRSupport 24 | ) 25 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/SCF/Util/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(MLIRSCFExtUtils 2 | Util.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/SCF 6 | 7 | LINK_LIBS PUBLIC 8 | MLIRIR 9 | MLIRSCFDialect 10 | ) 11 | -------------------------------------------------------------------------------- 
/compiler/lib/Dialect/Tensor/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | add_subdirectory(Transforms) 3 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/Tensor/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(MLIRTensorTilingInterfaceImplExt 2 | TilingInterfaceImpl.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/Tensor 6 | 7 | DEPENDS 8 | ByteIRUtils 9 | 10 | LINK_LIBS PUBLIC 11 | ByteIRUtils 12 | MLIRAffineDialect 13 | MLIRIR 14 | MLIRLinalgDialect 15 | MLIRSCFDialect 16 | MLIRSupport 17 | MLIRTensorDialect 18 | MLIRTilingInterface 19 | ) 20 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/Tensor/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(ByteIRTensorPasses 2 | CanonicalizeExt.cpp 3 | ExtractSliceSpecialization.cpp 4 | TensorPadSpecialization.cpp 5 | 6 | ADDITIONAL_HEADER_DIRS 7 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/mhlo 8 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/mhlo/Transforms 9 | 10 | DEPENDS 11 | ByteIRUtils 12 | ByteIRTensorPassIncGen 13 | 14 | LINK_LIBS PUBLIC 15 | MLIRIR 16 | MLIRSupport 17 | MLIRSCFDialect 18 | ByteIRUtils 19 | ) 20 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/Transform/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(IR) 2 | add_subdirectory(Transforms) 3 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/Transform/IR/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 
add_byteir_dialect_library(MLIRTransformExtDialect 2 | TransformExtOps.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/Transform/IR 6 | 7 | DEPENDS 8 | MLIRTransformExtOpsIncGen 9 | 10 | LINK_LIBS PUBLIC 11 | MLIRIR 12 | MLIRPass 13 | MLIRPDLDialect 14 | MLIRTransformDialect 15 | MLIRLinalgExtTransformOps 16 | ) 17 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/Transform/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(ByteIRTransformPasses 2 | TransformDialectInterpreter.cpp 3 | TransformInsertion.cpp 4 | 5 | ADDITIONAL_HEADER_DIRS 6 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/Transform/Transforms 7 | 8 | DEPENDS 9 | ByteIRTransformPassIncGen 10 | MLIRLinalgExtTransformOps 11 | MLIRTransformExtOpsIncGen 12 | 13 | LINK_LIBS PUBLIC 14 | MLIRIR 15 | MLIRPass 16 | MLIRPDLDialect 17 | MLIRTransformDialect 18 | MLIRLinalgExtTransformOps 19 | ) 20 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/Vector/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Transforms) 2 | -------------------------------------------------------------------------------- /compiler/lib/Dialect/Vector/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_dialect_library(ByteIRVectorPasses 2 | CanonicalizeExt.cpp 3 | VectorLowerings.cpp 4 | VectorWarpDistribute.cpp 5 | MoveForallRegionIntoWarpOp.cpp 6 | 7 | ADDITIONAL_HEADER_DIRS 8 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Dialect/Vector/Transforms 9 | 10 | DEPENDS 11 | ByteIRVectorPassIncGen 12 | ByteIRUtils 13 | 14 | LINK_LIBS PUBLIC 15 | ByteIRUtils 16 | MLIRIR 17 | MLIRSupport 18 | 19 | MLIRAffineDialect 20 | MLIRMemRefDialect 21 | MLIRSCFDialect 22 | MLIRTensorDialect 23 | 
MLIRVectorDialect 24 | ) 25 | -------------------------------------------------------------------------------- /compiler/lib/Pipelines/Common/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_library(ByteIRPipelineCommon 2 | Utils.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Pipelines/Common 6 | 7 | DEPENDS 8 | ByteIRTransforms 9 | ByteIRUtils 10 | MhloDialect 11 | MLIRBufferTransforms 12 | 13 | LINK_LIBS PUBLIC 14 | ByteIRLinalgPasses 15 | ByteIRUtils 16 | ByteIRSCFPasses 17 | MLIRIR 18 | MLIRTransforms 19 | ) -------------------------------------------------------------------------------- /compiler/lib/Pipelines/GPU/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_pipeline_library(ByteIRGPUPipelines 2 | ElementwiseCodegen.cpp 3 | GPUOpt.cpp 4 | LinalgMemrefGPU.cpp 5 | MappingForall.cpp 6 | NVVMCodegen.cpp 7 | ReductionCodegen.cpp 8 | 9 | ADDITIONAL_HEADER_DIRS 10 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Pipelines/GPU 11 | 12 | DEPENDS 13 | ByteIRPipelineCommon 14 | ByteIRTransforms 15 | ByteIRUtils 16 | MhloDialect 17 | MLIRBufferTransforms 18 | 19 | LINK_LIBS PUBLIC 20 | ByteIRGPUPasses 21 | ByteIRLinalgPasses 22 | ByteIRPipelineCommon 23 | ByteIRUtils 24 | ByteIRSCFPasses 25 | ByteIRToPTX 26 | MLIRIR 27 | MLIRTransforms 28 | MLIRLinalgExtTransformOps 29 | ) -------------------------------------------------------------------------------- /compiler/lib/Pipelines/Host/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_pipeline_library(ByteIRHostPipelines 2 | Codegen.cpp 3 | HostOpt.cpp 4 | ToLLVM.cpp 5 | 6 | ADDITIONAL_HEADER_DIRS 7 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Pipelines/Host 8 | 9 | DEPENDS 10 | MLIRTransformExtDialect 11 | MLIRLinalgExtTransformOps 12 | 13 | LINK_LIBS PUBLIC 14 | MLIRIR 15 | MLIRFuncDialect 16 | ByteIRToLLVM 17 | ByteIRPipelineCommon 18 | 
ByteIRTransformPasses 19 | ByteIRVectorPasses 20 | MLIRTransformExtDialect 21 | MLIRLinalgExtTransformOps 22 | MLIRArithTransforms 23 | MLIRBufferizationTransforms 24 | MLIRFuncToLLVM 25 | MLIRMathToLLVM 26 | MLIRMemRefToLLVM 27 | MLIRReconcileUnrealizedCasts 28 | MLIRSCFToControlFlow 29 | MLIRTensorTransforms 30 | MLIRTransforms 31 | ) -------------------------------------------------------------------------------- /compiler/lib/Stat/AllocCnt/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_stat_library(ByteIRAllocCntStat 2 | AllocCnt.cpp 3 | 4 | DEPENDS 5 | ByteIRStatCommon 6 | ByteIRUtils 7 | 8 | LINK_LIBS PUBLIC 9 | ByteIRAnalysis 10 | ByteIRStatCommon 11 | ByteIRUtils 12 | MLIRIR 13 | MLIRMemRefDialect 14 | ) -------------------------------------------------------------------------------- /compiler/lib/Stat/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(AllocCnt) 2 | add_subdirectory(Common) 3 | add_subdirectory(OpCnt) -------------------------------------------------------------------------------- /compiler/lib/Stat/Common/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_library(ByteIRStatCommon 2 | Reg.cpp 3 | 4 | LINK_LIBS PUBLIC 5 | MLIRIR 6 | ) -------------------------------------------------------------------------------- /compiler/lib/Stat/OpCnt/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_stat_library(ByteIROpCntStat 2 | OpCnt.cpp 3 | 4 | DEPENDS 5 | ByteIRStatCommon 6 | 7 | LINK_LIBS PUBLIC 8 | ByteIRStatCommon 9 | MLIRIR 10 | ) -------------------------------------------------------------------------------- /compiler/lib/Target/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Cpp) 2 | add_subdirectory(CUDA) 3 | 
add_subdirectory(LLVM) 4 | add_subdirectory(PTX) 5 | -------------------------------------------------------------------------------- /compiler/lib/Target/CUDA/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_translation_library(ByteIRTargetCUDA 2 | TranslateRegistration.cpp 3 | TranslateToCUDA.cpp 4 | 5 | ADDITIONAL_HEADER_DIRS 6 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Target/CUDA 7 | 8 | DEPENDS 9 | ByteIRTargetCpp 10 | 11 | LINK_LIBS PUBLIC 12 | MLIREmitCDialect 13 | MLIRIR 14 | MLIRSCFDialect 15 | MLIRControlFlowDialect 16 | MLIRMemRefDialect 17 | MLIRGPUDialect 18 | MLIRSupport 19 | # MLIRTranslation 20 | ByteIRTargetCpp 21 | ) 22 | -------------------------------------------------------------------------------- /compiler/lib/Target/Cpp/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_translation_library(ByteIRTargetCpp 2 | TranslateRegistration.cpp 3 | TranslateToCpp.cpp 4 | 5 | ADDITIONAL_HEADER_DIRS 6 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Target/Cpp 7 | 8 | LINK_LIBS PUBLIC 9 | MLIREmitCDialect 10 | MLIRIR 11 | MLIRSCFDialect 12 | MLIRControlFlowDialect 13 | MLIRMemRefDialect 14 | MLIRSupport 15 | # MLIRTranslation 16 | ) 17 | -------------------------------------------------------------------------------- /compiler/lib/Target/LLVM/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_byteir_translation_library(ByteIRTargetLLVM 2 | TranslateRegistration.cpp 3 | 4 | ADDITIONAL_HEADER_DIRS 5 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Target/LLVM 6 | 7 | LINK_LIBS PUBLIC 8 | LLVMBitWriter 9 | MLIRArmNeonToLLVMIRTranslation 10 | MLIRArmSMEToLLVMIRTranslation 11 | MLIRArmSVEToLLVMIRTranslation 12 | MLIRAMXToLLVMIRTranslation 13 | MLIRBuiltinToLLVMIRTranslation 14 | MLIRGPUToLLVMIRTranslation 15 | MLIRX86VectorToLLVMIRTranslation 16 | MLIRLLVMToLLVMIRTranslation 17 | MLIRNVVMToLLVMIRTranslation 18 | 
MLIROpenACCToLLVMIRTranslation 19 | MLIROpenMPToLLVMIRTranslation 20 | MLIRROCDLToLLVMIRTranslation 21 | ) 22 | -------------------------------------------------------------------------------- /compiler/lib/Utils/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_library(ByteIRUtils 2 | AffineUtils.cpp 3 | AttrUtils.cpp 4 | FuncUtils.cpp 5 | GraphUtils.cpp 6 | Hoist.cpp 7 | IRRewrite.cpp 8 | LoopUtils.cpp 9 | MemUtils.cpp 10 | ModuleUtils.cpp 11 | OpInterfaceUtils.cpp 12 | PatternMatch.cpp 13 | OptionUtils.cpp 14 | PipelineUtils.cpp 15 | TileUtils.cpp 16 | TypeUtils.cpp 17 | Utils.cpp 18 | 19 | ADDITIONAL_HEADER_DIRS 20 | ${BYTEIR_SRC_INCLUDE_DIR}/byteir/Utils 21 | 22 | LINK_LIBS PUBLIC 23 | MLIRIR 24 | MLIRAffineDialect 25 | MLIRArithDialect 26 | MLIRCclDialect 27 | MLIRMemRefDialect 28 | MLIRSCFDialect 29 | MLIRSCFExtUtils 30 | ) -------------------------------------------------------------------------------- /compiler/numerical/hlo/test_broadcast_dense_elements_attr.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s -test-broadcast-dense-elements-attr -o %t 2 | // RUN: FileCheck %s < %t 3 | // RUN: python3 %S/numerical_test.py %s %t 4 | 5 | func.func @case3() -> tensor<2x1x5xi64> { 6 | %0 = mhlo.constant dense<[[[2, 3]]]> : tensor<1x1x2xi64> 7 | %1 = "mhlo.broadcast_in_dim"(%0) {broadcast_dimensions = dense<[1, 2, 0]> : tensor<3xi64>} : (tensor<1x1x2xi64>) -> (tensor<2x1x5xi64>) 8 | return %1 : tensor<2x1x5xi64> 9 | } 10 | // CHECK-LABEL: @case3 11 | // CHECK{LITERAL}: [[[2, 2, 2, 2, 2]], [[3, 3, 3, 3, 3]]] 12 | // CHECK-NOT: mhlo.broadcast_in_dim 13 | -------------------------------------------------------------------------------- /compiler/numerical/lit.site.cfg.py.in: -------------------------------------------------------------------------------- 1 | import lit.llvm 2 | 3 | config.llvm_tools_dir = r"@LLVM_TOOLS_DIR@" 4 | 
config.byteir_tools_dir = r"@BYTEIR_TOOLS_DIR@" 5 | config.byteir_numerical_build_dir = r"@BYTEIR_NUMERICAL_BUILD_DIR@" 6 | config.lit_tools_dir = config.llvm_tools_dir 7 | 8 | try: 9 | config.llvm_tools_dir = config.llvm_tools_dir % lit_config.params 10 | except KeyError: 11 | e = sys.exc_info()[1] 12 | key, = e.args 13 | lit_config.fatal("unable to find %r parameter, use '--param=%s=VALUE'" % (key,key)) 14 | 15 | lit.llvm.initialize(lit_config, config) 16 | 17 | # Let the main config do the real work. 18 | lit_config.load_config(config, r"@BYTEIR_NUMERICAL_SOURCE_DIR@/lit.cfg.py") 19 | -------------------------------------------------------------------------------- /compiler/python/byteir/dialects/cat/ir_translator/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 ByteDance Ltd. and/or its affiliates. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 
13 | # ============================================================================== 14 | -------------------------------------------------------------------------------- /compiler/python/test/dialects/cat/ait/numerical/layernorm.mlir: -------------------------------------------------------------------------------- 1 | // RUN: %python -m byteir.tools.cat_executor %s --backend=ait | FileCheck %s 2 | 3 | func.func @layer_norm(%arg0 : tensor<1x16x4096xf32>, %arg1 : tensor<4096xf32>, %arg2 : tensor<4096xf32>) -> tensor<1x16x4096xf32> attributes {__byteir_cat_fusion__} { 4 | %0 = "mhlo.custom_call"(%arg0, %arg1, %arg2) {api_version = 1 : i32, backend_config = "", byteir_attrs = {axis = [2], epsilon = 1.000000e-05 : f64}, call_target_name = "byteir.layer_norm", called_computations = [], has_side_effect = false} : (tensor<1x16x4096xf32>, tensor<4096xf32>, tensor<4096xf32>) -> (tensor<1x16x4096xf32>) 5 | return %0 : tensor<1x16x4096xf32> 6 | } 7 | 8 | // CHECK: cat ait numerical test pass 9 | -------------------------------------------------------------------------------- /compiler/python/test/dialects/cat/ait/numerical/matmul.mlir: -------------------------------------------------------------------------------- 1 | // RUN: %python -m byteir.tools.cat_executor %s --backend=ait | FileCheck %s 2 | 3 | func.func @main(%arg0 : tensor<128x64xf16>, %arg1 : tensor<64x32xf16>) -> tensor<128x32xf16> attributes {__byteir_cat_fusion__} { 4 | %0 = "mhlo.dot_general"(%arg0, %arg1) {dot_dimension_numbers = #mhlo.dot} : (tensor<128x64xf16>, tensor<64x32xf16>) -> tensor<128x32xf16> 5 | return %0 : tensor<128x32xf16> 6 | } 7 | 8 | // CHECK: cat ait numerical test pass 9 | -------------------------------------------------------------------------------- /compiler/python/test/dialects/cat/ait/numerical/permute021.mlir: -------------------------------------------------------------------------------- 1 | // RUN: %python -m byteir.tools.cat_executor %s --backend=ait | FileCheck %s 2 | 3 | 
func.func @permute021(%arg0 : tensor<32x16x128xf32>) -> tensor<32x128x16xf32> attributes {__byteir_cat_fusion__} { 4 | %0 = "mhlo.transpose"(%arg0) {permutation = dense<[0, 2, 1]> : tensor<3xi64>} : (tensor<32x16x128xf32>) -> tensor<32x128x16xf32> 5 | return %0 : tensor<32x128x16xf32> 6 | } 7 | 8 | // CHECK: cat ait numerical test pass 9 | -------------------------------------------------------------------------------- /compiler/python/test/dialects/cat/ait/numerical/permute0213.mlir: -------------------------------------------------------------------------------- 1 | // RUN: %python -m byteir.tools.cat_executor %s --backend=ait | FileCheck %s 2 | 3 | func.func @permute0213(%arg0 : tensor<1x16x32x128xf32>) -> tensor<1x32x16x128xf32> attributes {__byteir_cat_fusion__} { 4 | %0 = "mhlo.transpose"(%arg0) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x16x32x128xf32>) -> tensor<1x32x16x128xf32> 5 | return %0 : tensor<1x32x16x128xf32> 6 | } 7 | 8 | // CHECK: cat ait numerical test pass 9 | -------------------------------------------------------------------------------- /compiler/python/test/dialects/cat/ait/numerical/permute0312.mlir: -------------------------------------------------------------------------------- 1 | // RUN: %python -m byteir.tools.cat_executor %s --backend=ait | FileCheck %s 2 | 3 | func.func @permute0312(%arg0 : tensor<1x16x32x128xf32>) -> tensor<1x128x16x32xf32> attributes {__byteir_cat_fusion__} { 4 | %0 = "mhlo.transpose"(%arg0) {permutation = dense<[0, 3, 1, 2]> : tensor<4xi64>} : (tensor<1x16x32x128xf32>) -> tensor<1x128x16x32xf32> 5 | return %0 : tensor<1x128x16x32xf32> 6 | } 7 | 8 | // CHECK: cat ait numerical test pass 9 | -------------------------------------------------------------------------------- /compiler/python/test/dialects/cat/ait/numerical/permute10.mlir: -------------------------------------------------------------------------------- 1 | // RUN: %python -m byteir.tools.cat_executor %s --backend=ait | FileCheck %s 2 
| 3 | func.func @permute10(%arg0 : tensor<128x64xf32>) -> tensor<64x128xf32> attributes {__byteir_cat_fusion__} { 4 | %0 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<128x64xf32>) -> tensor<64x128xf32> 5 | return %0 : tensor<64x128xf32> 6 | } 7 | 8 | // CHECK: cat ait numerical test pass 9 | -------------------------------------------------------------------------------- /compiler/python/test/dialects/cat/ait/numerical/softmax_f16.mlir: -------------------------------------------------------------------------------- 1 | // RUN: %python -m byteir.tools.cat_executor %s --backend=ait | FileCheck %s 2 | 3 | func.func @softmax_f16(%arg0 : tensor<1x12x1024x1024xf16>) -> tensor<1x12x1024x1024xf16> attributes {__byteir_cat_fusion__} { 4 | %0 = mhlo.custom_call @byteir.softmax(%arg0) {backend_config = "", byteir_attrs = {axis = 3 : i64}} : (tensor<1x12x1024x1024xf16>) -> tensor<1x12x1024x1024xf16> 5 | return %0 : tensor<1x12x1024x1024xf16> 6 | } 7 | 8 | // CHECK: cat ait numerical test pass 9 | -------------------------------------------------------------------------------- /compiler/python/test/dialects/cat/ait/profile/matmul.mlir: -------------------------------------------------------------------------------- 1 | // RUN: %python -m byteir.tools.cat_executor %s --mode=profile --backend=ait | FileCheck %s 2 | 3 | func.func @main(%arg0 : tensor<128x64xf16>, %arg1 : tensor<64x32xf16>) -> tensor<128x32xf16> attributes {__byteir_cat_fusion__} { 4 | %0 = "mhlo.dot_general"(%arg0, %arg1) {dot_dimension_numbers = #mhlo.dot} : (tensor<128x64xf16>, tensor<64x32xf16>) -> tensor<128x32xf16> 5 | return %0 : tensor<128x32xf16> 6 | } 7 | 8 | // CHECK: cat ait profile finish 9 | -------------------------------------------------------------------------------- /compiler/python/test/lit.site.cfg.py.in: -------------------------------------------------------------------------------- 1 | import lit.llvm 2 | 3 | config.llvm_tools_dir = r"@LLVM_TOOLS_DIR@" 4 
| config.byteir_python_packages_dir = r"@BYTEIR_PYTHON_PACKAGES_DIR@/byteir" 5 | config.byteir_python_test_dir = r"@PROJECT_BINARY_DIR@/python/test" 6 | config.lit_tools_dir = config.llvm_tools_dir 7 | 8 | try: 9 | config.llvm_tools_dir = config.llvm_tools_dir % lit_config.params 10 | except KeyError: 11 | e = sys.exc_info()[1] 12 | key, = e.args 13 | lit_config.fatal("unable to find %r parameter, use '--param=%s=VALUE'" % (key,key)) 14 | 15 | lit.llvm.initialize(lit_config, config) 16 | 17 | # Let the main config do the real work. 18 | lit_config.load_config(config, r"@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg.py") 19 | -------------------------------------------------------------------------------- /compiler/python/version.txt: -------------------------------------------------------------------------------- 1 | 1.9.3.0 -------------------------------------------------------------------------------- /compiler/test/Conversion/ToHlo/arithConstToMhlo.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s -convert-arith-to-mhlo | FileCheck %s 2 | 3 | // CHECK-LABEL: func.func @const 4 | func.func @const() -> tensor<4x4xf32> { 5 | // CHECK: mhlo.constant 6 | %0 = arith.constant dense<0.000000e+00> : tensor<4x4xf32> 7 | return %0 : tensor<4x4xf32> 8 | } 9 | 10 | // CHECK-LABEL: func.func @not_mhlo_const 11 | func.func @not_mhlo_const() -> i32 { 12 | // CHECK-NOT: mhlo.constant 13 | %0 = arith.constant 1 : i32 14 | return %0 : i32 15 | } 16 | -------------------------------------------------------------------------------- /compiler/test/Conversion/ToLinalg/TesnorToLinalg.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt -tensor-to-linalg -split-input-file %s | FileCheck %s 2 | 3 | 4 | func.func @expand_shape_static(%arg0: tensor<1000xf32>) -> tensor<1x1000xf32> { 5 | %expanded = tensor.expand_shape %arg0 [[0, 1]] output_shape [1, 1000] : tensor<1000xf32> into 
tensor<1x1000xf32> 6 | return %expanded : tensor<1x1000xf32> 7 | } 8 | // CHECK-LABEL: @expand_shape_static 9 | // CHECK: linalg.generic 10 | 11 | func.func @collapse_shape_static(%arg0: tensor<1x3x4x1x5xf32>) -> tensor<3x4x5xf32> { 12 | %0 = tensor.collapse_shape %arg0 [[0, 1], [2], [3, 4]] : 13 | tensor<1x3x4x1x5xf32> into tensor<3x4x5xf32> 14 | return %0 : tensor<3x4x5xf32> 15 | } 16 | // CHECK-LABEL: @collapse_shape_static 17 | // CHECK: linalg.generic 18 | -------------------------------------------------------------------------------- /compiler/test/Conversion/ToLinalg/hloConvertToLinalg.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s -hlo-fusion-to-linalg="target="cpu" arch="x86_64"" | FileCheck %s 2 | 3 | func.func @mhlo_convert_f32_i32(%arg0: tensor<2x3xf32>) -> tensor<2x3xi32> { 4 | %0 = mhlo.convert %arg0 : (tensor<2x3xf32>) -> tensor<2x3xi32> 5 | return %0 : tensor<2x3xi32> 6 | } 7 | // CHECK-LABEL: mhlo_convert_f32_i32 8 | // CHECK: linalg.map 9 | // CHECK: arith.cmpf 10 | // CHECK: arith.fptosi 11 | // CHECK: arith.select 12 | -------------------------------------------------------------------------------- /compiler/test/Conversion/ToLinalg/primitiveOpsHlo.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt -hlo-fusion-to-linalg="enable-primitive-ops=true" %s | FileCheck %s 2 | 3 | // CHECK-LABEL: mhlo_add 4 | func.func @mhlo_add(%lhs: tensor<2x2xf32>, 5 | %rhs: tensor<2x2xf32>) -> tensor<2x2xf32> { 6 | %0 = "mhlo.add"(%lhs, %rhs) {someattr} 7 | : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> 8 | return %0 : tensor<2x2xf32> 9 | // CHECK: tensor.empty 10 | // CHECK: linalg.map 11 | // CHECK: arith.addf 12 | } 13 | -------------------------------------------------------------------------------- /compiler/test/Conversion/ToLinalg/simpleHlo.mlir: -------------------------------------------------------------------------------- 
1 | // RUN: byteir-opt -hlo-legalize-to-linalg %s | FileCheck %s 2 | 3 | // CHECK-LABEL: mhlo_add 4 | func.func @mhlo_add(%lhs: tensor<2x2xf32>, 5 | %rhs: tensor<2x2xf32>) -> tensor<2x2xf32> { 6 | %0 = "mhlo.add"(%lhs, %rhs) {someattr} 7 | : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> 8 | return %0 : tensor<2x2xf32> 9 | // CHECK: linalg.generic 10 | // CHECK: addf 11 | } 12 | -------------------------------------------------------------------------------- /compiler/test/Dialect/Ace/attrs.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s | FileCheck %s 2 | 3 | func.func @extension_type(%arg0: tensor<3x?xf32, #ace.tensor_encoding>) -> tensor<3x?xf32, #ace.tensor_encoding> { 4 | return %arg0 : tensor<3x?xf32, #ace.tensor_encoding> 5 | } 6 | // CHECK: tensor<3x?xf32, #ace.tensor_encoding> -------------------------------------------------------------------------------- /compiler/test/Dialect/Ace/canonicalize.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt --canonicalize %s | FileCheck %s 2 | 3 | func.func @test_ace_constant_case0() -> tensor { 4 | %0 = "ace.constant"() {value = dense<"fork_active_pay"> : tensor} : () -> tensor 5 | return %0 : tensor 6 | } 7 | // CHECK: ace.constant 8 | -------------------------------------------------------------------------------- /compiler/test/Dialect/Affine/insertTrivialAffineLoop.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s -insert-trivial-affine-loop | FileCheck %s 2 | 3 | 4 | func.func @scalar_func(%arg0: memref) -> memref { 5 | %cst = arith.constant 1.000000e+00 : f32 6 | %0 = memref.alloc() : memref 7 | %cst_0 = arith.constant 0.000000e+00 : f32 8 | %1 = affine.load %arg0[] : memref 9 | %2 = arith.cmpf une, %1, %cst_0 : f32 10 | %3 = arith.select %2, %1, %cst : f32 11 | affine.store %3, %0[] : memref 12 | return %0 : memref 13 | 
} 14 | // CHECK-LABEL: func.func @scalar_func 15 | // CHECK: affine.for {{.*}} = 0 to 1 16 | // CHECK-NEXT: affine.load 17 | // CHECK-NEXT: arith.cmpf 18 | // CHECK-NEXT: arith.select 19 | // CHECK-NEXT: affine.store -------------------------------------------------------------------------------- /compiler/test/Dialect/Byre/Serialization/Compatibility/version_1_0_0.mlir.bc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/compiler/test/Dialect/Byre/Serialization/Compatibility/version_1_0_0.mlir.bc -------------------------------------------------------------------------------- /compiler/test/Dialect/Byre/Serialization/Compatibility/version_1_0_0.mlir.bc.v0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/compiler/test/Dialect/Byre/Serialization/Compatibility/version_1_0_0.mlir.bc.v0 -------------------------------------------------------------------------------- /compiler/test/Dialect/Byre/Serialization/Compatibility/version_1_0_0_alloc.mlir.bc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/compiler/test/Dialect/Byre/Serialization/Compatibility/version_1_0_0_alloc.mlir.bc -------------------------------------------------------------------------------- /compiler/test/Dialect/Lace/ops.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt -allow-unregistered-dialect %s | FileCheck %s 2 | 3 | func.func @test_reshape(%arg0: memref<2x3xf32>) -> memref<6xf32> { 4 | %0 = "lace.reshape" (%arg0) : (memref<2x3xf32>) -> memref<6xf32> 5 | return %0: memref<6xf32> 6 | } 7 | // CHECK: lace.reshape 8 | 9 | func.func @test_slice(%arg0: memref<2x3xf32>) -> 
memref<1x3xf32> { 10 | %0 = "lace.slice" (%arg0) {limit_indices = dense<[2, 3]> : tensor<2xi64>, start_indices = dense<[1, 0]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}: (memref<2x3xf32>) -> memref<1x3xf32> 11 | return %0: memref<1x3xf32> 12 | } -------------------------------------------------------------------------------- /compiler/test/Dialect/Mhlo/fusion.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s | FileCheck %s 2 | 3 | func.func @mhlo_add(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { 4 | %0 = "mhlo.fusion"(%arg0, %arg1) ( { 5 | %1 = mhlo.add %arg0, %arg1 : tensor<4xf32> 6 | %2 = mhlo.add %arg0, %1 : tensor<4xf32> 7 | "mhlo.return"(%2) : (tensor<4xf32>) -> () 8 | }) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> 9 | return %0 : tensor<4xf32> 10 | } 11 | // CHECK-LABEL: func.func @mhlo_add 12 | -------------------------------------------------------------------------------- /compiler/test/Dialect/Mhlo/transforms/fuseBMMDimension.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s -fuse-bmm-dimension | FileCheck %s 2 | 3 | func.func @dot_general(%arg0 : tensor<12x4x64x64xf32>, %arg1 : tensor<12x4x64x32xf32>) -> (tensor<12x4x64x32xf32>) { 4 | %0 = "mhlo.dot_general"(%arg0, %arg1) {dot_dimension_numbers = #mhlo.dot} : (tensor<12x4x64x64xf32>, tensor<12x4x64x32xf32>) -> tensor<12x4x64x32xf32> 5 | return %0 : tensor<12x4x64x32xf32> 6 | } 7 | 8 | // CHECK-LABEL: dot_general 9 | // CHECK-NEXT: mhlo.reshape 10 | // CHECK-NEXT: mhlo.reshape 11 | // CHECK-NEXT: mhlo.dot_general 12 | // CHECK-NEXT: mhlo.reshape 13 | -------------------------------------------------------------------------------- /compiler/test/Dialect/SCF/insertTrivialSCFLoop.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s -insert-trivial-scf-loop | FileCheck %s 2 | 3 | 4 | func.func 
@scalar_func(%arg0: memref) -> memref { 5 | %cst = arith.constant 1.000000e+00 : f32 6 | %0 = memref.alloc() : memref 7 | %cst_0 = arith.constant 0.000000e+00 : f32 8 | %1 = memref.load %arg0[] : memref 9 | %2 = arith.cmpf une, %1, %cst_0 : f32 10 | %3 = arith.select %2, %1, %cst : f32 11 | memref.store %3, %0[] : memref 12 | return %0 : memref 13 | } 14 | // CHECK-LABEL: func.func @scalar_func 15 | // CHECK: scf.for {{.*}} = %c0 to %c1 step %c1 16 | // CHECK-NEXT: memref.load 17 | // CHECK-NEXT: arith.cmpf 18 | // CHECK-NEXT: select 19 | // CHECK-NEXT: memref.store -------------------------------------------------------------------------------- /compiler/test/E2E/CUDA/MLPBasic/input.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s | FileCheck %s 2 | 3 | func.func @mlp(%arg0 : tensor<128x64xf32>, %arg1 : tensor<64x32xf32>, %arg2 : tensor<32xf32>) -> tensor<128x32xf32> { 4 | %0 = "mhlo.dot"(%arg0, %arg1) : (tensor<128x64xf32>, tensor<64x32xf32>) -> tensor<128x32xf32> 5 | %1 = "mhlo.broadcast_in_dim"(%arg2) {broadcast_dimensions = dense<[1]> : tensor<1xi64>} : (tensor<32xf32>) -> tensor<128x32xf32> 6 | %2 = "mhlo.add"(%0, %1) : (tensor<128x32xf32>, tensor<128x32xf32>) -> tensor<128x32xf32> 7 | return %2 : tensor<128x32xf32> 8 | } 9 | // CHECK-LABEL: func.func @mlp 10 | -------------------------------------------------------------------------------- /compiler/test/E2E/Host/Case0_Bytecode/Output.bc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/compiler/test/E2E/Host/Case0_Bytecode/Output.bc -------------------------------------------------------------------------------- /compiler/test/E2E/Host/Case0_Bytecode/Output.mlirbc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/compiler/test/E2E/Host/Case0_Bytecode/Output.mlirbc -------------------------------------------------------------------------------- /compiler/test/E2E/Host/Case1/Output.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s | FileCheck %s 2 | 3 | // CHECK-LABEL: func.func @main 4 | 5 | module attributes {byre.container_module} { 6 | func.func @main(%arg0: memref<1x100x27x48x3xf32, "cpu"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<51200xi32, "cpu"> {byre.argname = "Output0", byre.argtype = 2 : i32}) attributes {byre.entry_point} { 7 | byre.compute @LLVMJITOp(%arg0, %arg1) {kernel_name = "Unknown0", llvm_file_name = "host_kernels.ll", memory_effects = [1 : i32, 2 : i32]} : memref<1x100x27x48x3xf32, "cpu">, memref<51200xi32, "cpu"> 8 | return 9 | } 10 | } -------------------------------------------------------------------------------- /compiler/test/E2E/Host/RngNormal/00_Input.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s --hlo-graph-opt --hlo-fusion-opt="target=cpu" --linalg-tensor-opt="target=cpu" --byre-tensor-opt="entry-func=main append-arg-types" --byteir-bufferize-opt --linalg-memref-opt --scf-opt="target=cpu" | FileCheck %s 2 | 3 | // CHECK-LABEL: func.func @main 4 | 5 | func.func @main() -> tensor<1x97xf32> { 6 | %0 = mhlo.constant dense<0.000000e+00> : tensor 7 | %1 = mhlo.constant dense<1.000000e+00> : tensor 8 | %2 = mhlo.constant dense<[1, 97]> : tensor<2xi64> 9 | %3 = "mhlo.rng"(%0, %1, %2) {rng_distribution = #mhlo.rng_distribution} : (tensor, tensor, tensor<2xi64>) -> tensor<1x97xf32> 10 | return %3 : tensor<1x97xf32> 11 | } -------------------------------------------------------------------------------- /compiler/test/E2E/Host/RngUniform/00_Input.mlir: 
-------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s --hlo-graph-opt --hlo-fusion-opt="target=cpu" --linalg-tensor-opt="target=cpu" --byre-tensor-opt="entry-func=main append-arg-types" --byteir-bufferize-opt --linalg-memref-opt --scf-opt="target=cpu" | FileCheck %s 2 | 3 | // CHECK-LABEL: func.func @main 4 | 5 | func.func @main() -> tensor<1x97xf32> { 6 | %0 = mhlo.constant dense<0.000000e+00> : tensor 7 | %1 = mhlo.constant dense<1.000000e+00> : tensor 8 | %2 = mhlo.constant dense<[1, 97]> : tensor<2xi64> 9 | %3 = "mhlo.rng"(%0, %1, %2) {rng_distribution = #mhlo.rng_distribution} : (tensor, tensor, tensor<2xi64>) -> tensor<1x97xf32> 10 | return %3 : tensor<1x97xf32> 11 | } -------------------------------------------------------------------------------- /compiler/test/E2E/Host/Transpose/00_Input.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s --hlo-graph-opt --hlo-fusion-opt="target=cpu" --linalg-tensor-opt="target=cpu" --byre-tensor-opt="entry-func=main append-arg-types" --byteir-bufferize-opt --linalg-memref-opt --scf-opt="target=cpu" | FileCheck %s 2 | 3 | // CHECK-LABEL: func.func @main 4 | 5 | func.func @main(%arg0: tensor<1x32x64x64xf32>) -> tensor<1x64x64x32xf32> { 6 | %0 = "mhlo.transpose"(%arg0) {permutation = dense<[0, 2, 3, 1]>: tensor<4xi64>} : (tensor<1x32x64x64xf32>) -> tensor<1x64x64x32xf32> 7 | return %0 : tensor<1x64x64x32xf32> 8 | } -------------------------------------------------------------------------------- /compiler/test/E2E/Host/Transpose/Output.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s | FileCheck %s 2 | 3 | // CHECK-LABEL: func.func @main 4 | 5 | module attributes {byre.container_module} { 6 | func.func @main(%arg0: memref<1x32x64x64xf32, "cpu"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<1x64x64x32xf32, "cpu"> {byre.argname = 
"Output0", byre.argtype = 2 : i32}) attributes {byre.entry_point} { 7 | byre.compute @LLVMJITOp(%arg0, %arg1) {kernel_name = "Unknown0", llvm_file_name = "host_kernels.ll", memory_effects = [1 : i32, 2 : i32]} : memref<1x32x64x64xf32, "cpu">, memref<1x64x64x32xf32, "cpu"> 8 | return 9 | } 10 | } -------------------------------------------------------------------------------- /compiler/test/E2E/Host/Transpose/TotalPipeline.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s --hlo-graph-opt --hlo-fusion-opt="target=cpu" --linalg-tensor-opt="target=cpu" --byre-tensor-opt="entry-func=main append-arg-types" --byteir-bufferize-opt --linalg-memref-opt --scf-opt="target=cpu" --host-opt -set-op-space="entry-func=main space=cpu" -set-arg-space="entry-func=main all-space=cpu" --byre-opt --to-llvm | byteir-translate --mlir-to-llvmir | FileCheck %s 2 | 3 | // CHECK-LABEL: define void @_mlir_ciface_Unknown 4 | 5 | func.func @main(%arg0: tensor<1x32x64x64xf32>) -> tensor<1x64x64x32xf32> { 6 | %0 = "mhlo.transpose"(%arg0) {permutation = dense<[0, 2, 3, 1]>: tensor<4xi64>} : (tensor<1x32x64x64xf32>) -> tensor<1x64x64x32xf32> 7 | return %0 : tensor<1x64x64x32xf32> 8 | } -------------------------------------------------------------------------------- /compiler/test/E2E/Host/TypeCvt/00_Input.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s --hlo-graph-opt --hlo-fusion-opt="target=cpu" --linalg-tensor-opt="target=cpu" --byre-tensor-opt="entry-func=main append-arg-types" --byteir-bufferize-opt --linalg-memref-opt --scf-opt="target=cpu" | FileCheck %s 2 | 3 | // CHECK-LABEL: func.func @main 4 | 5 | func.func @main(%arg0 : tensor<1x224x224x3xf32>) -> tensor<1x224x224x3xf16> { 6 | %0 = mhlo.convert %arg0 : (tensor<1x224x224x3xf32>) -> tensor<1x224x224x3xf16> 7 | return %0 : tensor<1x224x224x3xf16> 8 | } 
-------------------------------------------------------------------------------- /compiler/test/E2E/Host/TypeCvt/Output.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s | FileCheck %s 2 | 3 | // CHECK-LABEL: func.func @main 4 | 5 | module attributes {byre.container_module} { 6 | func.func @main(%arg0: memref<1x224x224x3xf32, "cpu"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<1x224x224x3xf16, "cpu"> {byre.argname = "Output0", byre.argtype = 2 : i32}) attributes {byre.entry_point} { 7 | byre.compute @LLVMJITOp(%arg0, %arg1) {kernel_name = "Unknown0", llvm_file_name = "host_kernels.ll", memory_effects = [1 : i32, 2 : i32]} : memref<1x224x224x3xf32, "cpu">, memref<1x224x224x3xf16, "cpu"> 8 | return 9 | } 10 | } -------------------------------------------------------------------------------- /compiler/test/E2E/Host/TypeCvt/TotalPipeline.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s --hlo-graph-opt --hlo-fusion-opt="target=cpu" --linalg-tensor-opt="target=cpu" --byre-tensor-opt="entry-func=main append-arg-types" --byteir-bufferize-opt --linalg-memref-opt --scf-opt="target=cpu" --host-opt -set-op-space="entry-func=main space=cpu" -set-arg-space="entry-func=main all-space=cpu" --byre-opt --to-llvm | byteir-translate --mlir-to-llvmir | FileCheck %s 2 | 3 | // CHECK-LABEL: define void @_mlir_ciface_Unknown 4 | 5 | func.func @main(%arg0 : tensor<1x224x224x3xf32>) -> tensor<1x224x224x3xf16> { 6 | %0 = mhlo.convert %arg0 : (tensor<1x224x224x3xf32>) -> tensor<1x224x224x3xf16> 7 | return %0 : tensor<1x224x224x3xf16> 8 | } -------------------------------------------------------------------------------- /compiler/test/Pipelines/BufferizeOpts/tensor.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s -byteir-bufferize-opt --split-input-file | FileCheck %s 2 | 3 | // 
CHECK-LABEL: tensor_pad 4 | func.func @tensor_pad(%arg0: tensor<2x34xi32>) -> tensor<2x64xi32> { 5 | %c3_i32 = arith.constant 3 : i32 6 | // CHECK-NOT: bufferization.to_tensor 7 | // CHECK: linalg.map 8 | // CHECK-SAME: memref<2x64xi32> 9 | // CHECK-NOT: bufferization.to_memref 10 | %0 = tensor.pad %arg0 low[0, 0] high[0, 30] { 11 | ^bb0(%arg1: index, %arg2: index): 12 | tensor.yield %c3_i32 : i32 13 | } : tensor<2x34xi32> to tensor<2x64xi32> 14 | return %0 : tensor<2x64xi32> 15 | } 16 | -------------------------------------------------------------------------------- /compiler/test/Pipelines/Host/ToLLVM/subview.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt --to-llvm %s | FileCheck %s 2 | 3 | module attributes {byteir.llvm_module} { 4 | func.func @subview(%arg0: memref<32x128xi32>) -> memref <32x64xi32, strided<[128, 1]>> attributes {llvm.emit_c_interface} { 5 | %0 = memref.subview %arg0[0, 0] [32, 64] [1, 1] : memref<32x128xi32> to memref <32x64xi32, strided<[128, 1]>> 6 | return %0: memref <32x64xi32, strided<[128, 1]>> 7 | } 8 | // CHECK-LABEL: llvm.func @subview 9 | // CHECK: llvm.mlir.undef : !llvm.struct 10 | // CHECK-LABEL: llvm.func @_mlir_ciface_subview 11 | } -------------------------------------------------------------------------------- /compiler/test/Pipelines/Host/ToLLVM/tanh.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt --to-llvm %s | FileCheck %s 2 | 3 | module attributes {byteir.llvm_module} { 4 | func.func @Unknown0(%arg0: memref<32xf32>, %arg1: memref<32xf32>) attributes {llvm.emit_c_interface} { 5 | %c0 = arith.constant 0 : index 6 | %c1 = arith.constant 1 : index 7 | %c32 = arith.constant 32 : index 8 | scf.for %arg2 = %c0 to %c32 step %c1 { 9 | %0 = memref.load %arg0[%arg2] : memref<32xf32> 10 | %1 = math.tanh %0 : f32 11 | memref.store %1, %arg1[%arg2] : memref<32xf32> 12 | } 13 | return 14 | } 15 | // CHECK-LABEL: 
llvm.func @tanhf 16 | // CHECK: llvm.call @tanhf 17 | } -------------------------------------------------------------------------------- /compiler/test/Target/Cpp/attrs.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-translate -emit-cpp %s | FileCheck %s 2 | 3 | // CHECK-LABEL: void opaque_attrs() { 4 | func.func @opaque_attrs() { 5 | // CHECK-NEXT: f(OPAQUE_ENUM_VALUE); 6 | emitc.call_opaque "f"() {args = [#emitc.opaque<"OPAQUE_ENUM_VALUE">]} : () -> () 7 | // CHECK-NEXT: f("some string"); 8 | emitc.call_opaque "f"() {args = [#emitc.opaque<"\"some string\"">]} : () -> () 9 | return 10 | } 11 | -------------------------------------------------------------------------------- /compiler/test/Target/Cpp/cast.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-translate -emit-cpp %s | FileCheck %s 2 | 3 | // CHECK-LABEL: test_cast 4 | // CHECK-SAME: (int32_t [[V1:[^ ]*]], uint32_t [[V2:[^ ]*]], size_t [[V3:[^ ]*]]) 5 | func.func @test_cast(%arg0 : i32, %arg1 : ui32, %arg2: index) -> i32 { 6 | // CHECK-NEXT: float [[V4:[^ ]*]] = (float)([[V1]]); 7 | %0 = arith.sitofp %arg0: i32 to f32 8 | // CHECK-NEXT: int32_t [[V5:[^ ]*]] = (int32_t)([[V3]]); 9 | %1 = arith.index_cast %arg2: index to i32 10 | // CHECK-NEXT: float [[V6:[^ ]*]] = (float)([[V2]]); 11 | %2 = builtin.unrealized_conversion_cast %arg1: ui32 to f32 12 | // CHECK-NEXT: int32_t [[V7:[^ ]*]] = (int32_t)([[V4]]); 13 | %3 = arith.fptosi %0: f32 to i32 14 | return %3 : i32 15 | } 16 | 17 | -------------------------------------------------------------------------------- /compiler/test/Target/Cpp/types.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-translate -emit-cpp %s | FileCheck %s 2 | 3 | // CHECK-LABEL: func_external 4 | // CHECK-SAME: (int32_t, int32_t) 5 | func.func private @func_external(%arg0 : i32, %arg1 : i32) 6 | 7 | // CHECK-LABEL: 
func_int 8 | // CHECK-SAME: (int32_t [[V1:[^ ]*]], int32_t [[V2:[^ ]*]]) 9 | func.func @func_int(%arg0 : i32, %arg1 : i32) -> () { 10 | return 11 | } 12 | 13 | // CHECK-LABEL: func_memref 14 | // CHECK-SAME: (float*, int32_t) 15 | func.func private @func_memref(%arg0 : memref<2x3xf32>, %arg1 : i32) -------------------------------------------------------------------------------- /compiler/test/Transforms/ApplyPDLPatterns/Case_0.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s -apply-pdl-patterns="pdl-file=%S/Pattern_0.mlir" -allow-unregistered-dialect | FileCheck %s 2 | 3 | func.func @foo(%arg0: index) -> index { 4 | // CHECK: test.test_op_B 5 | %0 = "test.test_op_A"(%arg0) {__rewrite__} : (index) -> index 6 | // CHECK: test.test_op_A 7 | %1 = "test.test_op_A"(%0) : (index) -> index 8 | return %1 : index 9 | } -------------------------------------------------------------------------------- /compiler/test/Transforms/ApplyPDLPatterns/Pattern_0.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s 2 | 3 | module { 4 | pdl.pattern : benefit(0) { 5 | %0 = types 6 | %1 = operands 7 | %2 = attribute 8 | %3 = operation "test.test_op_A"(%1 : !pdl.range) {"__rewrite__" = %2} -> (%0 : !pdl.range) 9 | rewrite %3 { 10 | %4 = operation "test.test_op_B"(%1 : !pdl.range) -> (%0 : !pdl.range) 11 | replace %3 with %4 12 | } 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /compiler/test/Transforms/CanonicalizeExt/deprecated.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s -canonicalize-ext | FileCheck %s 2 | 3 | func.func @broadcast_to_broadcast_in_dim(%arg0: tensor<3xf32>) -> tensor<1x2x3xf32> { 4 | %0 = "mhlo.broadcast"(%arg0) {broadcast_sizes = dense<[1, 2]> : tensor<2xi64>} : (tensor<3xf32>) -> tensor<1x2x3xf32> 5 | return %0 : tensor<1x2x3xf32> 
6 | } 7 | // CHECK-LABEL: func.func @broadcast_to_broadcast_in_dim 8 | // CHECK-NEXT: mhlo.broadcast_in_dim 9 | // CHECK-NEXT: return 10 | -------------------------------------------------------------------------------- /compiler/test/Transforms/collectFunc.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s -collect-func="anchor-attr=testAttr" | FileCheck %s 2 | 3 | 4 | func.func private @test_private1() { 5 | return 6 | } 7 | // CHECK-LABEL: func.func private @test_private1() 8 | 9 | func.func private @test_private2() { 10 | return 11 | } 12 | // CHECK-NOT: func.func private @test_private2() 13 | 14 | func.func @test1() attributes {testAttr} { 15 | call @test_private1() : () -> () 16 | return 17 | } 18 | // CHECK-LABEL: func.func @test1() attributes {testAttr} 19 | 20 | func.func @test2() attributes {testAttr2} { 21 | call @test_private2() : () -> () 22 | return 23 | } 24 | // CHECK-NOT: func.func @test2() 25 | 26 | -------------------------------------------------------------------------------- /compiler/test/Transforms/genericDeviceConfig.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s -generic-device-config="anchor-attr=__byteir_test_device__ compute-name=TestDeviceOp" | FileCheck %s 2 | 3 | func.func private @device_func(memref<1x97xf32>, memref<1x6xf32>) -> memref<1x6xf32> attributes {__byteir_test_device__} 4 | // CHECK-LABEL: func.func private @device_func 5 | // CHECK-SAME: attributes {__byre__kernel_name = "device_func", __byteir_test_device__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "TestDeviceOp", byre_force_compute_name} -------------------------------------------------------------------------------- /compiler/test/Transforms/insertUniqueIdErase.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s -insert-unique-id="erase-id=true" 
-split-input-file | FileCheck %s 2 | 3 | func.func @mhlo_add(%arg0 : tensor<4xf32>, %arg1 : tensor<4xf32>) -> tensor<4xf32> { 4 | %res = "mhlo.add"(%arg0, %arg1) {__byteir_unique_id__ = "mhlo.add_0"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> 5 | return {__byteir_unique_id__ = "func.return_1"} %res : tensor<4xf32> 6 | } 7 | 8 | // CHECK-LABEL: func.func @mhlo_add 9 | // CHECK-SAME: (%[[ARG0:[a-zA-Z0-9]+]]: tensor<4xf32>, %[[ARG1:[a-zA-Z0-9]+]]: tensor<4xf32>) 10 | // CHECK: %[[V0:.*]] = mhlo.add %[[ARG0]], %[[ARG1]] : tensor<4xf32> 11 | // CHECK: return %[[V0]] : tensor<4xf32> -------------------------------------------------------------------------------- /compiler/test/Transforms/rewriteOpToStdCall.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s --rewrite-op-to-std-call="call-table=linalg.matmul:matmul_impl" --split-input-file | FileCheck %s 2 | 3 | // CHECK: func.func private @matmul_impl(memref, memref, memref) 4 | func.func @matmul(%A : memref, %B : memref, %C : memref) { 5 | // CHECK: call @matmul_impl({{.*}}, {{.*}}, {{.*}}) : (memref, memref, memref) -> () 6 | linalg.matmul ins(%A, %B: memref, memref) outs(%C: memref) 7 | return 8 | } 9 | -------------------------------------------------------------------------------- /compiler/test/Transforms/setArgShape.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s -allow-unregistered-dialect -set-arg-shape="dim=0 size=3 entry-func-name=tf_add arg-attr-name=__placeholder__byre.argname" | FileCheck %s 2 | 3 | func.func @tf_add(%arg0 : tensor {__placeholder__byre.argname = "A"}, %arg1 : tensor {__placeholder__byre.argname = "B"}) -> (tensor<*xf32> {__placeholder__byre.argname = "C"}) attributes { __placeholder__byre.entry_point} { 4 | %res = "tf.Add"(%arg0, %arg1) : (tensor, tensor) -> tensor<*xf32> 5 | return %res : tensor<*xf32> 6 | } 7 | // CHECK-LABEL: func.func @tf_add 8 | // 
CHECK-NEXT: %[[RES0:.*]] = "tf.Add"(%arg0, %arg1) : (tensor<3x4xf32>, tensor<3x4xf32>) -> tensor<*xf32> 9 | -------------------------------------------------------------------------------- /compiler/test/Transforms/setOpSpace.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s -set-op-space="entry-func=main space=cpu" --allow-unregistered-dialect| FileCheck %s 2 | 3 | // CHECK-LABEL: func.func @main 4 | func.func @main(%arg0 : memref<2x4xf32>, %arg1 : memref<2x4xf32>, %arg2 : memref<2x4xf32>) -> (memref<2x4xf32>, memref<2x4xf32>) { 5 | %0 = memref.alloc() : memref<2x4xf32> 6 | "lmhlo.add"(%arg0, %arg1, %arg2) : (memref<2x4xf32>, memref<2x4xf32>, memref<2x4xf32>) -> () 7 | // CHECK: lmhlo.add 8 | // CHECK-SAME: device = "cpu" 9 | "lmhlo.add"(%arg0, %arg1, %0) : (memref<2x4xf32>, memref<2x4xf32>, memref<2x4xf32>) -> () 10 | // CHECK-NEXT: lmhlo.add 11 | // CHECK-SAME: device = "cpu" 12 | return %0, %0: memref<2x4xf32>, memref<2x4xf32> 13 | } 14 | -------------------------------------------------------------------------------- /compiler/test/Utils/testMergeTwoModulesCase0.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s --test-merge-two-modules="second-module-path=%S/testMergeTwoModulesCase0_1.mlir" --allow-unregistered-dialect | FileCheck %s 2 | 3 | func.func @main(%arg0: tensor) -> tensor { 4 | return %arg0 : tensor 5 | } 6 | // CHECK: func.func @main 7 | // CHECK-NEXT: call @__byteir__merge_model_0 8 | // CHECK-NEXT: call @__byteir__merge_model_1 9 | // CHECK-DAG: func.func private @__byteir__merge_model_1 10 | // CHECK-DAG: func.func private @__byteir__merge_model_0 11 | -------------------------------------------------------------------------------- /compiler/test/Utils/testMergeTwoModulesCase0_1.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s --allow-unregistered-dialect 2 | 
3 | module { 4 | func.func @main(%arg0: tensor) -> tensor { 5 | %0 = "foo.add"(%arg0, %arg0) : (tensor, tensor) -> tensor 6 | return %0 : tensor 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /compiler/test/Utils/testMergeTwoModulesCase1_1.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s --allow-unregistered-dialect 2 | 3 | module { 4 | func.func @main(%arg0: tensor, %arg1: tensor) -> tensor attributes {byteir.entry_point = {inputs = ["module1_input0", "module1_input1"], outputs = ["module1_output"]}} { 5 | %0 = "foo.add"(%arg0, %arg1) : (tensor, tensor) -> tensor 6 | return %0 : tensor 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /compiler/test/Utils/testMergeTwoModulesCase2.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s --allow-unregistered-dialect 2 | 3 | func.func @main(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) attributes {byteir.entry_point = {inputs = ["module0_input0", "module0_input1"], outputs = ["x", "y"]}} { 4 | return %arg0, %arg1 : tensor, tensor 5 | } 6 | -------------------------------------------------------------------------------- /compiler/test/Utils/testMergeTwoModulesCase2_1.mlir: -------------------------------------------------------------------------------- 1 | // RUN: byteir-opt %s --allow-unregistered-dialect 2 | 3 | module { 4 | func.func @main(%arg0: tensor, %arg1: tensor) -> tensor attributes {byteir.entry_point = {inputs = ["xx", "yy"], outputs = ["module1_output"]}} { 5 | %0 = "foo.add"(%arg0, %arg1) : (tensor, tensor) -> tensor 6 | return %0 : tensor 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /compiler/test/lib/Analysis/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Exclude tests from 
libMLIR.so 2 | add_mlir_library(ByteIRTestAnalysis 3 | TestGraphClusteringByDeviceOpNum.cpp 4 | TestPrintLiveness.cpp 5 | TestPrintShapeAnalysis.cpp 6 | TestPrintSideEffect.cpp 7 | TestPrintSymbolicShape.cpp 8 | TestPrintUseRange.cpp 9 | 10 | EXCLUDE_FROM_LIBMLIR 11 | 12 | LINK_LIBS PUBLIC 13 | ByteIRAnalysis 14 | ) -------------------------------------------------------------------------------- /compiler/test/lib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Analysis) 2 | add_subdirectory(Interface) 3 | add_subdirectory(Transformation) 4 | add_subdirectory(Utils) 5 | -------------------------------------------------------------------------------- /compiler/test/lib/Interface/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Exclude tests from libMLIR.so 2 | add_mlir_library(ByteIRTestInterface 3 | TestByreOpInterface.cpp 4 | 5 | EXCLUDE_FROM_LIBMLIR 6 | 7 | DEPENDS 8 | MLIRByreDialect 9 | 10 | LINK_LIBS PUBLIC 11 | MLIRIR 12 | MLIRByreDialect 13 | ) -------------------------------------------------------------------------------- /compiler/test/lib/Transformation/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Exclude tests from libMLIR.so 2 | add_mlir_library(ByteIRTestTransformation 3 | TestByreSerialRoundtrip.cpp 4 | TestConvertFuncToCustomCall.cpp 5 | TestConvertInsertion.cpp 6 | TestDTypeConversion.cpp 7 | TestFuncArgRearrangement.cpp 8 | 9 | EXCLUDE_FROM_LIBMLIR 10 | 11 | LINK_LIBS PUBLIC 12 | ByteIRMhloPasses 13 | ByteIRUtils 14 | MLIRByreSerialization 15 | MhloDialect 16 | ) -------------------------------------------------------------------------------- /compiler/test/lib/Utils/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Exclude tests from libMLIR.so 2 | add_mlir_library(ByteIRTestUtils 3 | 
TestBroadcastDenseElementsAttr.cpp 4 | TestMergeTwoModules.cpp 5 | 6 | EXCLUDE_FROM_LIBMLIR 7 | 8 | LINK_LIBS PUBLIC 9 | ByteIRMhloUtils 10 | ByteIRUtils 11 | MhloDialect 12 | ) -------------------------------------------------------------------------------- /compiler/tools/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(byteir-cpu-runner) 2 | add_subdirectory(byteir-opt) 3 | add_subdirectory(byteir-stat) 4 | add_subdirectory(byteir-translate) 5 | -------------------------------------------------------------------------------- /compiler/tools/byteir-cpu-runner/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_LINK_COMPONENTS 2 | Core 3 | Support 4 | nativecodegen 5 | native 6 | ) 7 | 8 | add_mlir_tool(byteir-cpu-runner 9 | byteir-cpu-runner.cpp 10 | ) 11 | 12 | llvm_update_compile_flags(byteir-cpu-runner) 13 | target_link_libraries(byteir-cpu-runner PRIVATE 14 | MLIRAnalysis 15 | MLIRExecutionEngine 16 | MLIRIR 17 | MLIRJitRunner 18 | MLIRLLVMDialect 19 | MLIRLLVMToLLVMIRTranslation 20 | MLIRToLLVMIRTranslationRegistration 21 | MLIRParser 22 | MLIRTargetLLVMIRExport 23 | MLIRSupport 24 | ) 25 | -------------------------------------------------------------------------------- /external/patches/AITemplate/A10.patch: -------------------------------------------------------------------------------- 1 | diff --git a/python/aitemplate/testing/detect_target.py b/python/aitemplate/testing/detect_target.py 2 | index 2b2913d..95fd02f 100644 3 | --- a/python/aitemplate/testing/detect_target.py 4 | +++ b/python/aitemplate/testing/detect_target.py 5 | @@ -42,7 +42,7 @@ def _detect_cuda_with_nvidia_smi(): 6 | sm_names = { 7 | "70": ["V100"], 8 | "75": ["T4", "Quadro T2000"], 9 | - "80": ["PG509", "A100", "A10G", "RTX 30", "A30", "RTX 40"], 10 | + "80": ["PG509", "A100", "A10G", "RTX 30", "A30", "RTX 40", "A10", "A16"], 11 | "90": ["H100"], 12 | } 13 | 
for sm, names in sm_names.items(): 14 | -------------------------------------------------------------------------------- /external/patches/AITemplate/logging.patch: -------------------------------------------------------------------------------- 1 | diff --git a/static/csrc/model_container.cpp b/static/csrc/model_container.cpp 2 | index 5548a97..920ed60 100644 3 | --- a/static/csrc/model_container.cpp 4 | +++ b/static/csrc/model_container.cpp 5 | @@ -80,9 +80,9 @@ ModelContainer::ModelContainer( 6 | useDebugLogging = true; 7 | } 8 | } 9 | - LOG(INFO) 10 | - << (useDebugLogging ? PrintDebugDeviceProperties(prop) 11 | - : PrintInfoDeviceProperties(prop)); 12 | + //LOG(INFO) 13 | + // << (useDebugLogging ? PrintDebugDeviceProperties(prop) 14 | + // : PrintInfoDeviceProperties(prop)); 15 | 16 | LOG(INFO) << "Init AITemplate Runtime with " << num_models << " concurrency"; 17 | models_.reserve(num_models); 18 | -------------------------------------------------------------------------------- /external/patches/AITemplate/num_builders.patch: -------------------------------------------------------------------------------- 1 | diff --git a/python/aitemplate/backend/builder.py b/python/aitemplate/backend/builder.py 2 | index e66d97e..b0d5848 100644 3 | --- a/python/aitemplate/backend/builder.py 4 | +++ b/python/aitemplate/backend/builder.py 5 | @@ -900,6 +900,7 @@ clean: 6 | f"-C {build_dir}", 7 | ] 8 | ) 9 | + self._n_jobs = 4 10 | make_clean_cmd = f" {make_path} {make_flags} clean " 11 | make_all_cmd = f" {make_path} {make_flags} -j{self._n_jobs} all " 12 | make_clean_constants_cmd = f" {make_path} {make_flags} clean_constants " 13 | -------------------------------------------------------------------------------- /external_libs/runtime/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ## Copyright (c) ByteDance Inc. All rights reserved. 
2 | ## Licensed under the Apache License, Version 2.0 3 | 4 | # Minimum CMake required 5 | cmake_minimum_required(VERSION 3.18) 6 | set(CMAKE_CXX_STANDARD 17) 7 | 8 | project(brt-libs LANGUAGES CXX CUDA) 9 | 10 | 11 | set(REPO_ROOT ${PROJECT_SOURCE_DIR}) 12 | message("REPO_ROOT = ${REPO_ROOT}") 13 | set(BYTEIR_ROOT ${REPO_ROOT}/../..) 14 | set(CUTLASS_ROOT ${BYTEIR_ROOT}/external/cutlass) 15 | message("CUTLASS_ROOT = ${CUTLASS_ROOT}") 16 | 17 | add_subdirectory(flash_attn) 18 | -------------------------------------------------------------------------------- /external_libs/runtime/README.md: -------------------------------------------------------------------------------- 1 | # Runtime External Libs 2 | 3 | Runtime external library contains standalone kernels that can be used externally, eg. used by ByteIR Runtime. 4 | 5 | ## Build 6 | ### Linux/Mac 7 | ```bash 8 | mkdir ./build 9 | 10 | # build runtime 11 | cd build && cmake .. -G Ninja 12 | 13 | cmake --build . --target all 14 | ``` 15 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(lib) -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_bwd_hdim128_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. 
See "generate_kernels.py" 4 | 5 | #include "flash_bwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { 9 | run_mha_bwd_hdim128(params, stream); 10 | } 11 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_bwd_hdim160_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. See "generate_kernels.py" 4 | 5 | #include "flash_bwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { 9 | run_mha_bwd_hdim160(params, stream); 10 | } 11 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_bwd_hdim192_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. See "generate_kernels.py" 4 | 5 | #include "flash_bwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { 9 | run_mha_bwd_hdim192(params, stream); 10 | } 11 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_bwd_hdim224_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. 
See "generate_kernels.py" 4 | 5 | #include "flash_bwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { 9 | run_mha_bwd_hdim224(params, stream); 10 | } 11 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_bwd_hdim256_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. See "generate_kernels.py" 4 | 5 | #include "flash_bwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { 9 | run_mha_bwd_hdim256(params, stream); 10 | } 11 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_bwd_hdim32_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. See "generate_kernels.py" 4 | 5 | #include "flash_bwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { 9 | run_mha_bwd_hdim32(params, stream); 10 | } 11 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_bwd_hdim64_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. 
See "generate_kernels.py" 4 | 5 | #include "flash_bwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { 9 | run_mha_bwd_hdim64(params, stream); 10 | } 11 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_bwd_hdim96_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. See "generate_kernels.py" 4 | 5 | #include "flash_bwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { 9 | run_mha_bwd_hdim96(params, stream); 10 | } 11 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_fwd_hdim128_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. See "generate_kernels.py" 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { 9 | run_mha_fwd_hdim128(params, stream); 10 | } 11 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_fwd_hdim160_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. 
See "generate_kernels.py" 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { 9 | run_mha_fwd_hdim160(params, stream); 10 | } 11 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_fwd_hdim192_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. See "generate_kernels.py" 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { 9 | run_mha_fwd_hdim192(params, stream); 10 | } 11 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_fwd_hdim224_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. See "generate_kernels.py" 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { 9 | run_mha_fwd_hdim224(params, stream); 10 | } 11 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_fwd_hdim256_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. 
See "generate_kernels.py" 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { 9 | run_mha_fwd_hdim256(params, stream); 10 | } 11 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_fwd_hdim32_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. See "generate_kernels.py" 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { 9 | run_mha_fwd_hdim32(params, stream); 10 | } 11 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_fwd_hdim64_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. See "generate_kernels.py" 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { 9 | run_mha_fwd_hdim64(params, stream); 10 | } 11 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_fwd_hdim96_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. 
See "generate_kernels.py" 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template<> 8 | void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { 9 | run_mha_fwd_hdim96(params, stream); 10 | } 11 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_fwd_split_hdim128_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. See "generate_kernels.py" 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); 8 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_fwd_split_hdim160_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. See "generate_kernels.py" 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); 8 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_fwd_split_hdim192_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. 
See "generate_kernels.py" 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); 8 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_fwd_split_hdim224_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. See "generate_kernels.py" 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); 8 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_fwd_split_hdim256_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. See "generate_kernels.py" 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); 8 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_fwd_split_hdim32_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. 
See "generate_kernels.py" 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); 8 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_fwd_split_hdim64_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. See "generate_kernels.py" 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); 8 | -------------------------------------------------------------------------------- /external_libs/runtime/flash_attn/lib/flash_fwd_split_hdim96_fp16_sm80.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023, Tri Dao. 2 | // Splitting the different head dimensions to different files to speed up compilation. 3 | // This file is auto-generated. 
See "generate_kernels.py" 4 | 5 | #include "flash_fwd_launch_template.h" 6 | 7 | template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); 8 | -------------------------------------------------------------------------------- /frontends/onnx-frontend/.gitignore: -------------------------------------------------------------------------------- 1 | .pytest_cache/ 2 | *.pyc 3 | *.tar.gz 4 | 5 | build/ 6 | -------------------------------------------------------------------------------- /frontends/onnx-frontend/onnx-frontend/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(src) 2 | add_subdirectory(test) 3 | -------------------------------------------------------------------------------- /frontends/onnx-frontend/onnx-frontend/src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Compiler) 2 | add_subdirectory(Conversion) 3 | add_subdirectory(Support) 4 | 5 | add_onnx_frontend_executable(onnx-frontend 6 | onnx-frontend.cpp 7 | 8 | INSTALL 9 | 10 | LINK_LIBS PRIVATE 11 | OFCompiler 12 | OMCompilerOptions 13 | StablehloPortableApi 14 | ) 15 | 16 | add_onnx_frontend_executable(onnx-frontend-opt 17 | onnx-frontend-opt.cpp 18 | 19 | INSTALL 20 | 21 | LINK_LIBS PRIVATE 22 | OMCompilerOptions 23 | OMRegisterPasses 24 | OFCompiler 25 | MLIROptLib 26 | ) 27 | -------------------------------------------------------------------------------- /frontends/onnx-frontend/onnx-frontend/src/Compiler/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_onnx_frontend_library(OFCompiler 2 | OFCompilerOptions.cpp 3 | OFCompilerPipelines.cpp 4 | OFCompilerUtils.cpp 5 | 6 | LINK_LIBS PUBLIC 7 | OFConversion 8 | ) -------------------------------------------------------------------------------- /frontends/onnx-frontend/onnx-frontend/src/Support/CMakeLists.txt: 
-------------------------------------------------------------------------------- 1 | add_onnx_frontend_library(OFSupport 2 | OFUtils.cpp 3 | ) -------------------------------------------------------------------------------- /frontends/onnx-frontend/onnx-frontend/test/dynamic_shape_relu.onnx: -------------------------------------------------------------------------------- 1 |  onnx-relu:X 2 |  3 | XYrelu"Relu 4 | test-modelZ 5 | X 6 |  7 | B 8 | T 9 | Cb 10 | Y 11 |  12 | B 13 | T 14 | CB -------------------------------------------------------------------------------- /frontends/onnx-frontend/onnx-frontend/test/of_check_non_lowered.mlir: -------------------------------------------------------------------------------- 1 | // RUN: onnx-frontend-opt -check-non-lowered %s -split-input-file -verify-diagnostics 2 | 3 | func.func @test_onnx_non_lowered(%arg0: tensor<1x2xf32>) -> tensor<1x2xf32> { 4 | // expected-warning @+2 {{onnx.NoValue: ONNX op is not lowered}} 5 | // expected-error @-2 {{Please lower all ONNX ops}} 6 | %0 = "onnx.NoValue"() : () -> none 7 | return %arg0 : tensor<1x2xf32> 8 | } 9 | -------------------------------------------------------------------------------- /frontends/onnx-frontend/onnx-frontend/test/set_shape.mlir: -------------------------------------------------------------------------------- 1 | // RUN: onnx-frontend %S/dynamic_shape_relu.onnx --input-name-and-shapes=X,1,128,80 -- | FileCheck %s 2 | 3 | // CHECK-LABEL: func.func @main 4 | // CHECK-SAME: ([[PARAM_0_:%.+]]: tensor<1x128x80xf32> {onnx.name = "X"}) -> (tensor<1x128x80xf32> {onnx.name = "Y"}) attributes {byteir.entry_point = {inputs = ["X"], outputs = ["Y"]}} { 5 | // CHECK: [[VAR_0_:%.+]] = stablehlo.constant dense<0.000000e+00> : tensor<1x128x80xf32> 6 | // CHECK: [[VAR_1_:%.+]] = stablehlo.maximum [[PARAM_0_]], [[VAR_0_]] : tensor<1x128x80xf32> 7 | // CHECK: return [[VAR_1_]] : tensor<1x128x80xf32> -------------------------------------------------------------------------------- 
/frontends/onnx-frontend/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | filterwarnings = 3 | ignore::DeprecationWarning -------------------------------------------------------------------------------- /frontends/onnx-frontend/requirements.txt: -------------------------------------------------------------------------------- 1 | lit>=14.0.0 2 | numpy>=1.21.6 3 | onnx==1.13.0 4 | onnxruntime>=1.13.1 5 | # protobuf==3.20.1 6 | pytest>=7.1.2 7 | torch>=1.12.0 8 | -------------------------------------------------------------------------------- /frontends/onnx-frontend/scripts/build_and_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | CUR_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" > /dev/null && pwd )" 7 | # path to byteir root 8 | BYTEIR_ROOT="$CUR_DIR/../../.." 9 | # path to byteir/frontends/onnx-frontend 10 | ONNX_FRONTEND_ROOT="$BYTEIR_ROOT/frontends/onnx-frontend" 11 | 12 | export BYTEIR_ROOT="$BYTEIR_ROOT" 13 | export ONNX_FRONTEND_ROOT="$ONNX_FRONTEND_ROOT" 14 | 15 | source $CUR_DIR/envsetup.sh 16 | load_onnx_llvm_rtti_prebuilt 17 | 18 | of_envsetup 19 | of_build 20 | of_test_lit 21 | of_test_ops 22 | -------------------------------------------------------------------------------- /frontends/onnx-frontend/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/frontends/onnx-frontend/test/__init__.py -------------------------------------------------------------------------------- /frontends/onnx-frontend/test/env.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import sys 4 | 5 | ONNX_FRONTEND_PATH = osp.join(os.environ["ONNX_FRONTEND_ROOT"], "build/onnx-frontend/src/onnx-frontend") 6 | 
-------------------------------------------------------------------------------- /frontends/onnx-frontend/test/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/frontends/onnx-frontend/test/ops/__init__.py -------------------------------------------------------------------------------- /frontends/onnx-frontend/test/ops/data/math/clip.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/frontends/onnx-frontend/test/ops/data/math/clip.onnx -------------------------------------------------------------------------------- /frontends/onnx-frontend/test/ops/data/math/gelu.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/frontends/onnx-frontend/test/ops/data/math/gelu.onnx -------------------------------------------------------------------------------- /frontends/onnx-frontend/test/ops/data/math/softmax.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/frontends/onnx-frontend/test/ops/data/math/softmax.onnx -------------------------------------------------------------------------------- /frontends/onnx-frontend/test/ops/data/nn/batch_normalization.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/frontends/onnx-frontend/test/ops/data/nn/batch_normalization.onnx -------------------------------------------------------------------------------- /frontends/onnx-frontend/test/ops/data/quantize/quantize_dequantize.onnx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/frontends/onnx-frontend/test/ops/data/quantize/quantize_dequantize.onnx -------------------------------------------------------------------------------- /frontends/onnx-frontend/test/ops/data/tensor/arg_max.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/frontends/onnx-frontend/test/ops/data/tensor/arg_max.onnx -------------------------------------------------------------------------------- /frontends/onnx-frontend/test/ops/data/tensor/arg_min.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/frontends/onnx-frontend/test/ops/data/tensor/arg_min.onnx -------------------------------------------------------------------------------- /frontends/onnx-frontend/test/ops/data/tensor/concat.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/frontends/onnx-frontend/test/ops/data/tensor/concat.onnx -------------------------------------------------------------------------------- /frontends/onnx-frontend/test/ops/data/tensor/concat_dynamic_shape.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/frontends/onnx-frontend/test/ops/data/tensor/concat_dynamic_shape.onnx -------------------------------------------------------------------------------- /frontends/onnx-frontend/test/ops/data/tensor/resize_nearest_v10.onnx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/frontends/onnx-frontend/test/ops/data/tensor/resize_nearest_v10.onnx -------------------------------------------------------------------------------- /frontends/onnx-frontend/test/ops/test_quantize.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | import onnx 4 | from test.base import TestBase 5 | from test.ops.utils import build_onnx 6 | 7 | 8 | class TestOpsQuantize(TestBase): 9 | 10 | @pytest.fixture(autouse=True) 11 | def setup(self, tmpdir_factory): 12 | self.setup_base(tmpdir_factory, "test/ops/data/quantize") 13 | 14 | def test_quantize_dequantize(self): 15 | input_shape_dtype = [ 16 | ["data", (16, 3, 224, 224), "float32"], 17 | ] 18 | self.run(model_filename="quantize_dequantize.onnx", input_shape_dtype=input_shape_dtype) 19 | -------------------------------------------------------------------------------- /frontends/onnx-frontend/third_party/patches/OnnxMlirRegisterLibrary.patch: -------------------------------------------------------------------------------- 1 | diff --git a/src/Tools/onnx-mlir-opt/CMakeLists.txt b/src/Tools/onnx-mlir-opt/CMakeLists.txt 2 | index a90a670a..0a80c88b 100644 3 | --- a/src/Tools/onnx-mlir-opt/CMakeLists.txt 4 | +++ b/src/Tools/onnx-mlir-opt/CMakeLists.txt 5 | @@ -20,3 +20,16 @@ add_onnx_mlir_executable(onnx-mlir-opt 6 | MLIROptLib 7 | MLIRSCFToOpenMP 8 | ) 9 | + 10 | +add_onnx_mlir_library(OMRegisterPasses 11 | + RegisterPasses.cpp 12 | + 13 | + EXCLUDE_FROM_OM_LIBS 14 | + 15 | + LINK_LIBS PUBLIC 16 | + OMCompilerPasses 17 | + OMAccelerator 18 | + MLIRAffineTransforms 19 | + MLIRLinalgTransforms 20 | + MLIRMemRefTransforms 21 | +) 22 | -------------------------------------------------------------------------------- /frontends/tf-frontend/.bazelrc: -------------------------------------------------------------------------------- 1 | 
./external/tensorflow/.bazelrc -------------------------------------------------------------------------------- /frontends/tf-frontend/.bazelversion: -------------------------------------------------------------------------------- 1 | ./external/tensorflow/.bazelversion -------------------------------------------------------------------------------- /frontends/tf-frontend/.gitignore: -------------------------------------------------------------------------------- 1 | /bazel-* 2 | example/.workspace/* 3 | -------------------------------------------------------------------------------- /frontends/tf-frontend/.tf_configure.bazelrc: -------------------------------------------------------------------------------- 1 | build --action_env PYTHON_BIN_PATH="/usr/bin/python3" 2 | build --action_env PYTHON_LIB_PATH="/usr/lib/python3/dist-packages" 3 | build --python_path="/usr/bin/python3" 4 | build:opt --copt=-Wno-sign-compare 5 | build:opt --host_copt=-Wno-sign-compare 6 | test --flaky_test_attempts=3 7 | test --test_size_filters=small,medium 8 | test:v1 --test_tag_filters=-benchmark-test,-no_oss,-gpu,-oss_serial 9 | test:v1 --build_tag_filters=-benchmark-test,-no_oss,-gpu 10 | test:v2 --test_tag_filters=-benchmark-test,-no_oss,-gpu,-oss_serial,-v1only 11 | test:v2 --build_tag_filters=-benchmark-test,-no_oss,-gpu,-v1only 12 | -------------------------------------------------------------------------------- /frontends/tf-frontend/BUILD: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/frontends/tf-frontend/BUILD -------------------------------------------------------------------------------- /frontends/tf-frontend/byteir/BUILD: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/frontends/tf-frontend/byteir/BUILD 
-------------------------------------------------------------------------------- /frontends/tf-frontend/byteir/workspace.bzl: -------------------------------------------------------------------------------- 1 | def ace_repo(): 2 | native.new_local_repository( 3 | name = "byteir", 4 | path = "./../../compiler/dialects", 5 | build_file = "//byteir:ace.BUILD", 6 | ) 7 | -------------------------------------------------------------------------------- /frontends/tf-frontend/scripts/apply_patches.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | CUR_DIR="$(cd "$(dirname "$0")" ; pwd -P)" 6 | TF_FRONTEND_DIR=$CUR_DIR/.. 7 | TF_DIR=$TF_FRONTEND_DIR/external/tensorflow 8 | TF_PATCHES_DIR=$TF_FRONTEND_DIR/external/patches/tensorflow 9 | 10 | pushd $TF_DIR 11 | git clean -fd . 12 | for patch in $TF_PATCHES_DIR/*; do 13 | git apply $patch 14 | done 15 | popd 16 | -------------------------------------------------------------------------------- /frontends/tf-frontend/tf_mlir_ext/numerical/BUILD: -------------------------------------------------------------------------------- 1 | load("@org_tensorflow//tensorflow:tensorflow.bzl", "filegroup") 2 | load("glob_lit_test.bzl", "glob_lit_tests") 3 | 4 | package(licenses = ["notice"]) 5 | 6 | glob_lit_tests( 7 | data = [":test_utilities"], 8 | driver = "@llvm-project//mlir:run_lit.sh", 9 | test_file_exts = ["mlir"], 10 | ) 11 | 12 | # Bundle together all of the test utilities that are used by tests. 
13 | filegroup( 14 | name = "test_utilities", 15 | testonly = True, 16 | data = [ 17 | ":numerical_test.py", 18 | "//tools:tf-ext-opt", 19 | "@llvm-project//llvm:FileCheck", 20 | "@llvm-project//llvm:not", 21 | ], 22 | ) 23 | 24 | filegroup( 25 | name = "litfiles", 26 | srcs = glob(["runlit*py"]), 27 | ) 28 | -------------------------------------------------------------------------------- /frontends/tf-frontend/tf_mlir_ext/tests/BUILD: -------------------------------------------------------------------------------- 1 | load("@org_tensorflow//tensorflow:tensorflow.bzl", "filegroup") 2 | load("glob_lit_test.bzl", "glob_lit_tests") 3 | 4 | package(licenses = ["notice"]) 5 | 6 | glob_lit_tests( 7 | data = [":test_utilities"], 8 | driver = "@llvm-project//mlir:run_lit.sh", 9 | test_file_exts = ["mlir"], 10 | ) 11 | 12 | # Bundle together all of the test utilities that are used by tests. 13 | filegroup( 14 | name = "test_utilities", 15 | testonly = True, 16 | data = [ 17 | "//tools:tf-ext-opt", 18 | "@llvm-project//llvm:FileCheck", 19 | "@llvm-project//llvm:not", 20 | ], 21 | ) 22 | 23 | filegroup( 24 | name = "litfiles", 25 | srcs = glob(["runlit*py"]), 26 | ) 27 | -------------------------------------------------------------------------------- /frontends/tf-frontend/tf_mlir_ext/utils/BUILD: -------------------------------------------------------------------------------- 1 | package( 2 | default_visibility = ["//visibility:public"], 3 | licenses = ["notice"], 4 | ) 5 | 6 | cc_library( 7 | name = "tfext_utils", 8 | srcs = [ 9 | "customcall.cc", 10 | "dce.cc", 11 | "utils.cc", 12 | ], 13 | hdrs = [ 14 | "customcall.h", 15 | "dce.h", 16 | "utils.h", 17 | ], 18 | deps = [ 19 | "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:tensorflow", 20 | "@llvm-project//llvm:Support", 21 | "@llvm-project//mlir:Dialect", 22 | "@llvm-project//mlir:IR", 23 | "@llvm-project//mlir:Support", 24 | ] 25 | ) -------------------------------------------------------------------------------- 
/frontends/torch-frontend/.gitignore: -------------------------------------------------------------------------------- 1 | torch-frontend/python/torch_frontend.egg-info/ 2 | torch-frontend/python/torch_frontend/version.py -------------------------------------------------------------------------------- /frontends/torch-frontend/README.md: -------------------------------------------------------------------------------- 1 | # Torch Frontend 2 | torch-frontend is a project to build customized torch model --> torch dialect --> stablehlo dialect pipeline, where we could add extended dialect and passes. 3 | 4 | 5 | ## Quick Start 6 | 7 | ### Build from source code 8 | 9 | ```bash 10 | git clone https://github.com/bytedance/byteir.git 11 | cd byteir/frontends/torch-frontend 12 | 13 | # prepare python environment and build torch-frontend 14 | bash scripts/build.sh 15 | 16 | # torch_frontend-*.whl in ./build/torch-frontend/python/dist/ 17 | ``` 18 | 19 | ### Example 20 | ```bash 21 | PYTHONPATH=./build/python_packages/:build/torch_mlir_build/python_packages/torch_mlir python3 examples/inference/infer_resnet.py 22 | ``` 23 | -------------------------------------------------------------------------------- /frontends/torch-frontend/build-requirements.txt: -------------------------------------------------------------------------------- 1 | # cpu torch and torchvision 2 | # --extra-index-url https://download.pytorch.org/whl/cpu 3 | # --pre 4 | # torch==2.1.0+cpu 5 | # torchvision==0.16.0+cpu 6 | 7 | # cuda torch and torchvision 8 | # --extra-index-url https://download.pytorch.org/whl/cu118 9 | # --pre 10 | # torch==2.1.0+cu118 11 | # torchvision==0.16.0+cu118 12 | 13 | # cuda torch and torchvision nightly 14 | # --extra-index-url https://download.pytorch.org/whl/nightly/cu118 15 | # --pre 16 | # torch==2.1.0.dev20230820+cu118 17 | # torchvision==0.16.0.dev20230820+cu118 18 | 19 | 20 | # The following copied from torch-mlir 21 | 22 | # Build requirements. 
23 | pybind11 24 | wheel 25 | setuptools 26 | cmake 27 | pyyaml 28 | packaging 29 | lit 30 | 31 | -------------------------------------------------------------------------------- /frontends/torch-frontend/examples/demo/requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cu118 2 | --pre 3 | torch==2.1.0+cu118 4 | 5 | transformers==4.29.2 6 | -------------------------------------------------------------------------------- /frontends/torch-frontend/examples/inference/mixtral/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.40.2 2 | 3 | --extra-index-url https://download.pytorch.org/whl/cu118 4 | --pre 5 | torch==2.3.0+cu118 6 | -------------------------------------------------------------------------------- /frontends/torch-frontend/test-requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | transformers==4.29.2 3 | 4 | # Test Requirements 5 | pillow 6 | pytest==8.1.0 7 | dill 8 | multiprocess 9 | expecttest 10 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-cpu-requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cpu 2 | --pre 3 | torch==2.4.1+cpu 4 | torchvision==0.19.1+cpu 5 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-cuda-requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cu118 2 | --pre 3 | torch==2.4.1+cu118 4 | torchvision==0.19.1+cu118 5 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/CMakeLists.txt: 
-------------------------------------------------------------------------------- 1 | add_subdirectory(include/torch-frontend) 2 | 3 | add_subdirectory(lib) 4 | 5 | # for torch-frontend python extensions and packages 6 | add_subdirectory(python) 7 | 8 | # for torch-frontend binary executable tools 9 | add_subdirectory(tools) 10 | 11 | add_subdirectory(test) -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/include/torch-frontend/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Conversion) 2 | add_subdirectory(Transforms) 3 | add_subdirectory(Dialect/Torch/Transforms) 4 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/include/torch-frontend/Conversion/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name TorchFrontendConversion) 3 | add_public_tablegen_target(TorchFrontendConversionPassIncGen) 4 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/include/torch-frontend/Dialect/Torch/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name TorchFrontendTorchTransforms) 3 | add_public_tablegen_target(TorchFrontendTorchTransformsPassIncGen) 4 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/include/torch-frontend/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS Passes.td) 2 | mlir_tablegen(Passes.h.inc -gen-pass-decls -name 
TorchFrontendTransforms) 3 | add_public_tablegen_target(TorchFrontendTransformsPassIncGen) 4 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/lib/CAPI/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_public_c_api_library(TorchFrontendCAPI 2 | Passes.cpp 3 | 4 | ENABLE_AGGREGATION 5 | LINK_COMPONENTS 6 | 7 | LINK_LIBS PUBLIC 8 | MLIRSupport 9 | TorchFrontendPipelines 10 | ) 11 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/lib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(CAPI) 2 | add_subdirectory(Conversion) 3 | add_subdirectory(Dialect/Torch/Transforms) 4 | add_subdirectory(Pipelines) 5 | add_subdirectory(Transforms) 6 | add_subdirectory(Utils) 7 | # pytorch custom op 8 | add_subdirectory(CustomOp) 9 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/lib/Conversion/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_library(TorchFrontendConversion 2 | ConvertTorchToCcl.cpp 3 | ConvertTorchToCustomCall.cpp 4 | ConvertTorchToStablehloExt.cpp 5 | 6 | DEPENDS 7 | TorchFrontendConversionPassIncGen 8 | 9 | LINK_COMPONENTS 10 | Core 11 | 12 | LINK_LIBS PUBLIC 13 | MLIRIR 14 | MLIRPass 15 | MLIRDialect 16 | MLIRTransforms 17 | ChloOps 18 | StablehloOps 19 | MLIRCclDialect 20 | TorchMLIRConversionUtils 21 | TorchMLIRTorchDialect 22 | TorchMLIRTorchPasses 23 | TorchMLIRTorchToStablehlo 24 | TorchMLIRTorchConversionPasses 25 | TorchMLIRTorchUtils 26 | ) 27 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/lib/CustomOp/dynamic_mask_stitch.cpp: 
-------------------------------------------------------------------------------- 1 | #include 2 | 3 | torch::Tensor custom_dynamic_mask_stitch(std::vector data, 4 | torch::Tensor partitions) { 5 | std::vector res; 6 | res.reserve(partitions.size(0)); 7 | std::vector count(data.size(), 0); 8 | for (int64_t i = 0; i < partitions.size(0); ++i) { 9 | int idx = partitions[i].item(); 10 | res.push_back(data[idx][count[idx]].unsqueeze(0)); 11 | count[idx]++; 12 | } 13 | return torch::cat(res, /*dim=*/0); 14 | } 15 | 16 | static auto registry = torch::RegisterOperators("byteir::dynamic_mask_stitch", 17 | &custom_dynamic_mask_stitch); 18 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/lib/CustomOp/dynamic_stitch.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | torch::Tensor custom_dynamic_stitch(std::vector indices, 4 | std::vector data) { 5 | int n = 0; 6 | for (auto &idx : indices) { 7 | n += idx.numel(); 8 | } 9 | std::vector res(n); 10 | for (size_t i = 0; i < data.size(); ++i) { 11 | for (int j = 0; j < indices[i].size(0); ++j) { 12 | res[indices[i][j].item()] = data[i][j].unsqueeze(0); 13 | } 14 | } 15 | return torch::cat(res, /*dim=*/0); 16 | } 17 | 18 | static auto registry = 19 | torch::RegisterOperators("byteir::dynamic_stitch", &custom_dynamic_stitch); 20 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/lib/Dialect/Torch/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(LLVM_TARGET_DEFINITIONS FuseOpOnTorchPattern.td) 2 | mlir_tablegen(FuseOpOnTorchPattern.inc -gen-rewriters) 3 | add_public_tablegen_target(FuseOpOnTorchPatternIncGen) 4 | 5 | add_mlir_library(TorchFrontendTorchTransforms 6 | DecomposeOnTorch.cpp 7 | FuseOpOnTorch.cpp 8 | 9 | DEPENDS 10 | TorchFrontendTorchTransformsPassIncGen 
11 | FuseOpOnTorchPatternIncGen 12 | 13 | LINK_LIBS PUBLIC 14 | MLIRIR 15 | MLIRPass 16 | MLIRDialect 17 | TorchMLIRTorchDialect 18 | TorchMLIRTorchUtils 19 | TorchFrontendUtils 20 | ) 21 | target_include_directories(TorchFrontendTorchTransforms PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) 22 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/lib/Pipelines/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_library(TorchFrontendPipelines 2 | Pipelines.cpp 3 | 4 | # DEPENDS 5 | 6 | LINK_COMPONENTS 7 | Core 8 | 9 | LINK_LIBS PUBLIC 10 | MLIRIR 11 | TorchMLIRTorchToStablehlo 12 | TorchMLIRTorchConversionPasses 13 | TorchFrontendConversion 14 | TorchFrontendTransforms 15 | TorchFrontendTorchTransforms 16 | StablehloPasses 17 | ) 18 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/lib/Transforms/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_library(TorchFrontendTransforms 2 | CanonicalizeExt.cpp 3 | EliminateUselessOp.cpp 4 | RewriteCustomOp.cpp 5 | RewriteEntryFuncName.cpp 6 | UnpackPublicFunctionReturn.cpp 7 | 8 | DEPENDS 9 | TorchFrontendTransformsPassIncGen 10 | TorchMLIRTorchDialect 11 | 12 | LINK_COMPONENTS 13 | Core 14 | 15 | LINK_LIBS PUBLIC 16 | MLIRIR 17 | MLIRPass 18 | MLIRDialect 19 | TorchMLIRTorchDialect 20 | TorchFrontendUtils 21 | ) 22 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/lib/Utils/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_library(TorchFrontendUtils 2 | ConvertOpFolder.cpp 3 | 4 | LINK_LIBS PUBLIC 5 | MLIRIR 6 | ) -------------------------------------------------------------------------------- 
class FullModule(torch.nn.Module):
    """Traceable module that builds a boolean all-True tensor shaped like its input.

    Exists as a minimal fixture for exercising the aten.full bool-argument
    rewrite in fx_utils.
    """

    def forward(self, x):
        # aten.full with a bool fill value; output shape mirrors the input.
        return torch.ops.aten.full(x.shape, True, dtype=torch.bool)
def context_init_hook(context):
    """Register the dialects torch-frontend needs on a freshly created MLIR context.

    Invoked automatically by the MLIR python bindings' site-initialize
    mechanism for every new Context.

    Args:
        context: the MLIR Context being initialized.
    """
    # Both extension modules live alongside this file in _mlir_libs; import
    # lazily so module load does not pull in the native libraries eagerly.
    from ._stablehlo import register_dialect as register_stablehlo_dialect
    from ._torchMlir import register_dialect as register_torch_dialect

    register_stablehlo_dialect(context)
    register_torch_dialect(context)
temp_file_path = _get_extra_library_file() 17 | shutil.copyfile(temp_file_path, os.path.join(CUR_DIR, "extra_fn.mlir")) 18 | 19 | if __name__ == "__main__": 20 | main() 21 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/python/torch_frontend/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .jit_transforms import replace_copy_fill_with_slice_scatter 2 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/python/version.txt: -------------------------------------------------------------------------------- 1 | 1.3.4 -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | configure_lit_site_cfg( 2 | ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in 3 | ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py 4 | MAIN_CONFIG 5 | ${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py 6 | ) 7 | 8 | set(TORCH_FRONTEND_TEST_DEPENDS 9 | FileCheck count not 10 | torch-frontend-opt 11 | ) 12 | 13 | add_lit_testsuite(check-torch-frontend-opt "Running the torch-frontend-opt regression tests" 14 | ${CMAKE_CURRENT_BINARY_DIR} 15 | DEPENDS ${TORCH_FRONTEND_TEST_DEPENDS} 16 | ) 17 | set_target_properties(check-torch-frontend-opt PROPERTIES FOLDER "Tests") 18 | 19 | add_lit_testsuites(TORCH_FRONTEND_TEST ${CMAKE_CURRENT_SOURCE_DIR} DEPENDS ${TORCH_FRONTEND_TEST_DEPENDS}) -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/test/Transforms/RewriteEntryFuncName.mlir: -------------------------------------------------------------------------------- 1 | // RUN: torch-frontend-opt %s --rewrite-entry-func-name="target-name=main" | FileCheck %s 2 | 3 | module { 4 | func.func 
@forward(%arg0: !torch.vtensor<[3,4],f32>) -> !torch.vtensor<[3,4],f32>{ 5 | return %arg0 : !torch.vtensor<[3,4],f32> 6 | } 7 | } 8 | // CHECK-LABEL: func.func @main 9 | -------------------------------------------------------------------------------- /frontends/torch-frontend/torch-frontend/test/Transforms/UnpackPublicFunctionReturn.mlir: -------------------------------------------------------------------------------- 1 | // RUN: torch-frontend-opt %s --unpack-public-function-return --canonicalize | FileCheck %s 2 | 3 | module { 4 | func.func @forward(%arg0: !torch.tensor {torch.type_bound = !torch.vtensor<[3,4],f32>}) -> !torch.list { 5 | %0 = torch.prim.ListConstruct %arg0, %arg0, %arg0 : (!torch.tensor, !torch.tensor, !torch.tensor) -> !torch.list 6 | return %0 : !torch.list 7 | } 8 | } 9 | // CHECK-LABEL: func.func @forward 10 | // CHECK: %0 = torch.prim.TupleConstruct %arg0, %arg0, %arg0 : !torch.tensor, !torch.tensor, !torch.tensor -> !torch.tuple 11 | // CHECK: return %0 : !torch.tuple 12 | -------------------------------------------------------------------------------- /runtime/.gitignore: -------------------------------------------------------------------------------- 1 | python/brt.egg-info/ 2 | python/dist/ 3 | python/brt/version.py 4 | -------------------------------------------------------------------------------- /runtime/VERSION_NUMBER: -------------------------------------------------------------------------------- 1 | 1.9.3.0 2 | -------------------------------------------------------------------------------- /runtime/cmake/brt_config.h.in: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #cmakedefine HAS_UNUSED_BUT_SET_VARIABLE 4 | #cmakedefine HAS_UNUSED_PARAMETER 5 | #cmakedefine HAS_UNUSED_VARIABLE 6 | #cmakedefine HAS_CAST_FUNCTION_TYPE 7 | #cmakedefine HAS_PARENTHESES 8 | #cmakedefine HAS_USELESS_CAST 9 | #cmakedefine HAS_NONNULL_COMPARE 10 | #cmakedefine 
# Build the BRT CPU device backend as an object library.

# Gather public headers and implementation sources for the CPU device layer.
file(GLOB_RECURSE brt_device_cpu_srcs CONFIGURE_DEPENDS
  "${BRT_INCLUDE_DIR}/brt/backends/cpu/device/*.h"
  "${LIB_ROOT}/backends/cpu/device/*.cc"
)

# Keep IDE source trees mirroring the repository layout.
source_group(TREE ${REPO_ROOT} FILES ${brt_device_cpu_srcs})

brt_add_object_library(brt_device_cpu ${brt_device_cpu_srcs})
# LLVM JIT + x86 codegen are needed for the CPU execution path.
target_link_libraries(brt_device_cpu LLVMOrcJIT LLVMX86CodeGen LLVMX86AsmParser)
brt_add_include_to_target(brt_device_cpu brt_framework brt_common)
set_target_properties(brt_device_cpu PROPERTIES FOLDER "Brt")

# Ship the public CPU device headers with the install tree.
install(
  DIRECTORY "${BRT_INCLUDE_DIR}/brt/backends/cpu/device"
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/brt/backends/cpu")
DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/brt") 20 | -------------------------------------------------------------------------------- /runtime/examples/external_project/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | cmake_minimum_required(VERSION 3.18) 3 | 4 | project(brt_external_project C CXX) 5 | 6 | option(BRT_INSTALL_PATH "The path to the installed BRT library") 7 | get_filename_component(BRT_INSTALL_PATH ${BRT_INSTALL_PATH} ABSOLUTE) 8 | 9 | list(APPEND CMAKE_MODULE_PATH "${BRT_INSTALL_PATH}/lib/cmake/brt") 10 | include(brt-targets) 11 | 12 | add_executable(main main.cpp) 13 | target_link_libraries(main brt) 14 | -------------------------------------------------------------------------------- /runtime/include/brt/core/common/logging/sinks/clog_sink.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | // =========================================================================== 4 | // Modification Copyright 2022 ByteDance Ltd. and/or its affiliates. 5 | 6 | #pragma once 7 | 8 | #include "brt/core/common/logging/sinks/ostream_sink.h" 9 | #include 10 | 11 | namespace brt { 12 | namespace logging { 13 | /// 14 | /// A std::clog based ISink 15 | /// 16 | /// 17 | class CLogSink : public OStreamSink { 18 | public: 19 | CLogSink() : OStreamSink(std::clog, /*flush*/ true) {} 20 | }; 21 | } // namespace logging 22 | } // namespace brt 23 | -------------------------------------------------------------------------------- /runtime/include/brt/core/distributed/d_context.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Megvii Inc. 2 | // Licensed under Apache License, Version 2.0 3 | // =========================================================================== 4 | // Modification Copyright 2022 ByteDance Ltd. 
// Copyright (c) Megvii Inc.
// Licensed under Apache License, Version 2.0
// ===========================================================================
// Modification Copyright 2022 ByteDance Ltd. and/or its affiliates.

#pragma once

// NOTE(review): the include target was stripped by extraction; std::string
// requires <string>, restored here — confirm against upstream.
#include <string>

namespace brt {

// DContext is an abstraction of communication contexts (e.g. cuda stream)
// on different platforms, a context should be passed as a parameter when
// a communicator operation is called
class DContext {
public:
  // Identifies the concrete context kind; implemented by platform-specific
  // subclasses.
  virtual std::string type() const = 0;
  virtual ~DContext() = default;
};

} // namespace brt
python3 examples/add2.py 11 | ``` -------------------------------------------------------------------------------- /runtime/python/brt/__init__.py: -------------------------------------------------------------------------------- 1 | from ._brt import * 2 | -------------------------------------------------------------------------------- /runtime/test/exported.ld: -------------------------------------------------------------------------------- 1 | { 2 | extern "C++" { 3 | /* export all symbols in brt to enable external kernel registration */ 4 | *brt::*; 5 | }; 6 | }; 7 | -------------------------------------------------------------------------------- /runtime/test/test_files/AITOp/bmm_permute_a100.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/runtime/test/test_files/AITOp/bmm_permute_a100.so -------------------------------------------------------------------------------- /runtime/test/test_files/AITOp/bmm_permute_entry.mlir: -------------------------------------------------------------------------------- 1 | module attributes {byre.container_module} { 2 | func.func @main(%arg0 : memref<384x256x256xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, 3 | %arg1 : memref<384x256x64xf32, "cuda"> {byre.argname = "Input1", byre.argtype = 1 : i32}, 4 | %arg2 : memref<64x256x6x64xf32, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}) attributes {byre.entry_point} { 5 | byre.compute @AITOp(%arg0, %arg1, %arg2) {kernel_name = "bmm_permute", ait_lib_file = "bmm_permute_a100.so"} : memref<384x256x256xf32, "cuda">, memref<384x256x64xf32, "cuda">, memref<64x256x6x64xf32, "cuda"> 6 | return 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /runtime/test/test_files/AITOp/permute_a100.so: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/runtime/test/test_files/AITOp/permute_a100.so -------------------------------------------------------------------------------- /runtime/test/test_files/AITOp/permute_entry.mlir: -------------------------------------------------------------------------------- 1 | module attributes {byre.container_module} { 2 | func.func @main(%arg0 : memref<64x256x6x64xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, 3 | %arg1 : memref<64x6x256x64xf32, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}) attributes {byre.entry_point} { 4 | byre.compute @AITOp(%arg0, %arg1) {kernel_name = "permute", ait_lib_file = "permute_a100.so"} : memref<64x256x6x64xf32, "cuda">, memref<64x6x256x64xf32, "cuda"> 5 | return 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /runtime/test/test_files/Distributed/add_send.mlir: -------------------------------------------------------------------------------- 1 | module attributes {byre.container_module} { 2 | func.func @test_add_send(%arg0 : memref<4xf32, "cuda"> {byre.argname = "in0", byre.argtype = 1: i32}, 3 | %arg1 : memref<4xf32, "cuda"> {byre.argname = "in1", byre.argtype = 1: i32}, 4 | %arg2 : memref<4xf32, "cuda"> {byre.argname = "out", byre.argtype = 2: i32}) attributes {byre.entry_point} { 5 | byre.compute @AddOp_f32f32_f32(%arg0, %arg1, %arg2) : memref<4xf32, "cuda">, memref<4xf32, "cuda">, memref<4xf32, "cuda"> 6 | byre.compute @nccl.Send(%arg2) {rank = 1 : i64} : memref<4xf32, "cuda"> 7 | return 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /runtime/test/test_files/Distributed/all_gather.mlir: -------------------------------------------------------------------------------- 1 | module attributes {byre.container_module} { 2 | func.func @test_all_gather(%arg0 : memref<4xf32, "cuda"> {byre.argname = "in0", byre.argtype = 1: i32}, 3 | 
%arg1 : memref<8xf32, "cuda"> {byre.argname = "out", byre.argtype = 2: i32}) attributes {byre.entry_point} { 4 | byre.compute @nccl.AllGather(%arg0, %arg1) {replica_group = [2, 3]} : memref<4xf32, "cuda">, memref<8xf32, "cuda"> 5 | return 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /runtime/test/test_files/Distributed/all_reduce.mlir: -------------------------------------------------------------------------------- 1 | module attributes {byre.container_module} { 2 | func.func @test_all_reduce(%arg0 : memref<4xf32, "cuda"> {byre.argname = "in0", byre.argtype = 1: i32}, 3 | %arg1 : memref<4xf32, "cuda"> {byre.argname = "out", byre.argtype = 2: i32}) attributes {byre.entry_point} { 4 | byre.compute @nccl.AllReduce(%arg0, %arg1) { reduction = "sum" , replica_group = [1 ,2, 3]} : memref<4xf32, "cuda">, memref<4xf32, "cuda"> 5 | return 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /runtime/test/test_files/Distributed/broadcast.mlir: -------------------------------------------------------------------------------- 1 | module attributes {byre.container_module} { 2 | func.func @test_broadcast(%arg0 : memref<8xf32, "cuda"> {byre.argname = "in0", byre.argtype = 1: i32}) attributes {byre.entry_point} { 3 | byre.compute @nccl.Broadcast(%arg0) {replica_group = [1, 0, 2]} : memref<8xf32, "cuda"> 4 | return 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /runtime/test/test_files/Distributed/broadcast2.mlir: -------------------------------------------------------------------------------- 1 | module attributes {byre.container_module} { 2 | func.func @test_broadcast(%arg0 : memref<8xf32, "cuda"> {byre.argname = "in0", byre.argtype = 1: i32}) attributes {byre.entry_point} { 3 | byre.compute @nccl.Broadcast(%arg0) {replica_group = [2, 0, 3]} : memref<8xf32, "cuda"> 4 | return 5 | } 6 | } 7 | 
-------------------------------------------------------------------------------- /runtime/test/test_files/Distributed/recv.mlir: -------------------------------------------------------------------------------- 1 | module attributes {byre.container_module} { 2 | func.func @test_recv(%arg0 : memref<4xf32, "cuda"> {byre.argname = "src", byre.argtype = 2: i32}) attributes {byre.entry_point} { 3 | byre.compute @nccl.Recv(%arg0) {rank = 0 : i64} : memref<4xf32, "cuda"> 4 | return 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /runtime/test/test_files/Distributed/recv_add.mlir: -------------------------------------------------------------------------------- 1 | module attributes {byre.container_module} { 2 | func.func @test_recv_add(%arg0 : memref<4xf32, "cuda"> {byre.argname = "in", byre.argtype = 1: i32}, 3 | %arg1 : memref<4xf32, "cuda"> {byre.argname = "out0", byre.argtype = 2: i32}, 4 | %arg2 : memref<4xf32, "cuda"> {byre.argname = "out1", byre.argtype = 2: i32}) attributes {byre.entry_point} { 5 | byre.compute @nccl.Recv(%arg1) {rank = 0 : i64} : memref<4xf32, "cuda"> 6 | byre.compute @AddOp_f32f32_f32(%arg0, %arg1, %arg2) : memref<4xf32, "cuda">, memref<4xf32, "cuda">, memref<4xf32, "cuda"> 7 | return 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /runtime/test/test_files/Distributed/send.mlir: -------------------------------------------------------------------------------- 1 | module attributes {byre.container_module} { 2 | func.func @test_send(%arg0 : memref<4xf32, "cuda"> {byre.argname = "src", byre.argtype = 1: i32}) attributes {byre.entry_point} { 3 | byre.compute @nccl.Send(%arg0) {rank = 1 : i64} : memref<4xf32, "cuda"> 4 | return 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /runtime/test/test_files/LLJIT/Case0_v1_0_0/entry.mlirbc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/runtime/test/test_files/LLJIT/Case0_v1_0_0/entry.mlirbc -------------------------------------------------------------------------------- /runtime/test/test_files/LLJIT/Case0_v1_0_0/host_kernels.bc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/runtime/test/test_files/LLJIT/Case0_v1_0_0/host_kernels.bc -------------------------------------------------------------------------------- /runtime/test/test_files/add_splat_const_one_cuda.mlir: -------------------------------------------------------------------------------- 1 | module attributes {byre.container_module} { 2 | func.func @mhlo_add_splat_const(%arg0: memref<100x32xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, 3 | %arg1: memref<100x32xf32, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}) attributes {byre.entry_point} { 4 | %0 = memref.alloc() : memref<100x32xf32, "cuda"> 5 | byre.compute @FillOp(%0) {value = dense<1.000000e+00> : tensor<100x32xf32>} : memref<100x32xf32, "cuda"> 6 | byre.compute @AddOp_f32f32_f32(%arg0, %0, %arg1) : memref<100x32xf32, "cuda">, memref<100x32xf32, "cuda">, memref<100x32xf32, "cuda"> 7 | return 8 | } 9 | } -------------------------------------------------------------------------------- /runtime/test/test_files/cuda_add.cu: -------------------------------------------------------------------------------- 1 | extern "C" __global__ void nvrtc_add_kernel(const float* input, float* output, int n, float val) { 2 | int i = blockIdx.x*blockDim.x + threadIdx.x; 3 | if (i < n) { 4 | output[i] = input[i]+ val; 5 | } 6 | } -------------------------------------------------------------------------------- /runtime/test/test_files/flash_attn_kvcache_inputs_cache_seqlens.data: 
-------------------------------------------------------------------------------- 1 | 64 64 -------------------------------------------------------------------------------- /runtime/test/test_files/group_allocation_hook_cpu_group.mlir: -------------------------------------------------------------------------------- 1 | module attributes {byre.container_module} { 2 | func.func @main(%arg0: memref<32xf32, "cpu_group"> {byre.argname = "Input0", byre.argtype = 1 : i32}, 3 | %arg1: memref<32xf32, "cpu_group"> {byre.argname = "Output0", byre.argtype = 2 : i32}) attributes {byre.entry_point} { 4 | %0 = memref.alloc() : memref<32xf32, "cpu_group"> 5 | byre.compute @CheckGroupAllocationHook(%arg0, %0, %arg1) {base = 0xdeadbeef: i64} : memref<32xf32, "cpu_group">, memref<32xf32, "cpu_group">, memref<32xf32, "cpu_group"> 6 | return 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /runtime/test/test_files/string_equal.mlir: -------------------------------------------------------------------------------- 1 | module attributes {byre.container_module} { 2 | func.func @main(%arg0 : memref<4x!ace.string, "cpu"> {byre.argname = "Input", byre.argtype = 1: i32}, 3 | %arg1 : memref<4xi1, "cpu"> {byre.argname = "Output", byre.argtype = 2: i32}) attributes {byre.entry_point} { 4 | %0 = memref.alloc() : memref<4x!ace.string, "cpu"> 5 | byre.compute @FillOp(%0) {memory_effects = [2 : i32], value = dense<"aaa"> : tensor<4x!ace.string, "cpu">} : memref<4x!ace.string, "cpu"> 6 | byre.compute @tf.Equal(%arg0, %0, %arg1) {memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x!ace.string, "cpu">, memref<4x!ace.string, "cpu">, memref<4xi1, "cpu"> 7 | return 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /runtime/test/test_files/string_equal_scalar.mlir: -------------------------------------------------------------------------------- 1 | module attributes {byre.container_module} { 2 | func.func 
@main(%arg0 : memref<1x!ace.string, "cpu"> {byre.argname = "Input", byre.argtype = 1: i32}, 3 | %arg1 : memref<4xi1, "cpu"> {byre.argname = "Output", byre.argtype = 2: i32}) attributes {byre.entry_point} { 4 | %0 = memref.alloc() : memref<4x!ace.string, "cpu"> 5 | byre.compute @FillOp(%0) {memory_effects = [2 : i32], value = dense<"aaa"> : tensor<4x!ace.string, "cpu">} : memref<4x!ace.string, "cpu"> 6 | byre.compute @tf.Equal(%arg0, %0, %arg1) {memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x!ace.string, "cpu">, memref<4x!ace.string, "cpu">, memref<4xi1, "cpu"> 7 | return 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /runtime/version.ld: -------------------------------------------------------------------------------- 1 | { 2 | global: 3 | extern "C++" { 4 | /* since we didn't have pubapi or capi yet, export all of symbols defined in brt namespace */ 5 | *brt::*; 6 | }; 7 | 8 | local: 9 | extern "C++" { 10 | /* hide all of symbols defined in MLIR to avoid symbol conflict */ 11 | *mlir::*; 12 | *llvm::*; 13 | }; 14 | }; 15 | -------------------------------------------------------------------------------- /scripts/clang_format_check.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | git_status=$(git status --porcelain) 4 | if [[ $git_status ]]; then 5 | echo "Checkout code is not clean" 6 | echo "${git_status}" 7 | exit 1 8 | fi 9 | 10 | find \( -name '*.cpp' -or -name '*.h' -or -name '*.cc' \) -not -path "./external/*" -not -path "./external_libs/*" | xargs clang-format-13 -i -style=file 11 | git_status=$(git status --porcelain) 12 | if [[ $git_status ]]; then 13 | echo "clang-format-13 is not happy, please run \"clang-format-13 -i -style=file /PATH/TO/foo.cpp\" to the following files" 14 | echo "${git_status}" 15 | exit 1 16 | else 17 | echo "PASSED C++ format" 18 | fi 19 | 
-------------------------------------------------------------------------------- /scripts/format_check.py: -------------------------------------------------------------------------------- 1 | from formatCheck.check import * 2 | import argparse 3 | 4 | # parse directory path 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--dir", type=str, help="path to directory") 7 | args = parser.parse_args() 8 | 9 | format_check(args.dir) 10 | -------------------------------------------------------------------------------- /scripts/runtime/build_external_project.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | CUR_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" 6 | PROJ_DIR="$CUR_DIR/../../runtime" 7 | BRT_INSTALL_DIR="$PROJ_DIR/build/install" 8 | EXTERNAL_PROJECT_SRC_DIR="$PROJ_DIR/examples/external_project" 9 | EXTERNAL_PROJECT_BUILD_DIR="$EXTERNAL_PROJECT_SRC_DIR/build" 10 | 11 | rm -rf "$EXTERNAL_PROJECT_BUILD_DIR" 12 | mkdir -p "$EXTERNAL_PROJECT_BUILD_DIR" 13 | cmake -GNinja \ 14 | "-H$EXTERNAL_PROJECT_SRC_DIR" \ 15 | "-B$EXTERNAL_PROJECT_BUILD_DIR" \ 16 | -DBRT_INSTALL_PATH="$BRT_INSTALL_DIR" 17 | 18 | cmake --build "$EXTERNAL_PROJECT_BUILD_DIR" --target all 19 | pushd $EXTERNAL_PROJECT_BUILD_DIR 20 | ./main 21 | popd 22 | -------------------------------------------------------------------------------- /talks/ChinaSoftCon-ByteIR.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/talks/ChinaSoftCon-ByteIR.pdf -------------------------------------------------------------------------------- /talks/c4ml23_poster.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytedance/byteir/5c485e2857e277934ec1f72878b1d9673f3614be/talks/c4ml23_poster.pdf 
-------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/add.mlir: -------------------------------------------------------------------------------- 1 | func.func @add(%arg0 : tensor<128x2xf32>, %arg1 : tensor<128x2xf32>) -> tensor<128x2xf32> { 2 | %0 = stablehlo.add %arg0, %arg1 : tensor<128x2xf32> 3 | func.return %0 : tensor<128x2xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/batch_norm_inference.mlir: -------------------------------------------------------------------------------- 1 | module { 2 | func.func @main(%arg0: tensor<10x32x10xf32>, %arg1: tensor<32xf32>, %arg2: tensor<32xf32>, %arg3: tensor<32xf32>, %arg4: tensor<32xf32>) -> tensor<10x32x10xf32> { 3 | %0 = "stablehlo.batch_norm_inference"(%arg0, %arg1, %arg2, %arg3, %arg4) <{epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64}> : (tensor<10x32x10xf32>, tensor<32xf32>, tensor<32xf32>, tensor<32xf32>, tensor<32xf32>) -> tensor<10x32x10xf32> 4 | return %0 : tensor<10x32x10xf32> 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/batch_norm_inference_f16.mlir: -------------------------------------------------------------------------------- 1 | module { 2 | func.func @main(%arg0: tensor<10x32x10xf16>, %arg1: tensor<32xf16>, %arg2: tensor<32xf16>, %arg3: tensor<32xf16>, %arg4: tensor<32xf16>) -> tensor<10x32x10xf16> { 3 | %0 = "stablehlo.batch_norm_inference"(%arg0, %arg1, %arg2, %arg3, %arg4) <{epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64}> : (tensor<10x32x10xf16>, tensor<32xf16>, tensor<32xf16>, tensor<32xf16>, tensor<32xf16>) -> tensor<10x32x10xf16> 4 | return %0 : tensor<10x32x10xf16> 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/broadcast_in_dim.mlir: 
-------------------------------------------------------------------------------- 1 | func.func @broadcast_in_dim(%arg0 : tensor<128x1xi64>) -> tensor<128x3xi64> { 2 | %0 = "stablehlo.broadcast_in_dim"(%arg0) { 3 | broadcast_dimensions = array 4 | } : (tensor<128x1xi64>) -> tensor<128x3xi64> 5 | func.return %0 : tensor<128x3xi64> 6 | } 7 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/compare_LT_f32.mlir: -------------------------------------------------------------------------------- 1 | func.func @compare_LT_f32(%arg0 : tensor<256x1xf32>, %arg1 : tensor<256x1xf32>) -> tensor<256x1xi1> { 2 | %0 = "stablehlo.compare"(%arg0, %arg1) { 3 | comparison_direction = #stablehlo, 4 | compare_type = #stablehlo 5 | } : (tensor<256x1xf32>, tensor<256x1xf32>) -> tensor<256x1xi1> 6 | func.return %0 : tensor<256x1xi1> 7 | } 8 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/compare_LT_f64.mlir: -------------------------------------------------------------------------------- 1 | func.func @compare_LT_f64(%arg0 : tensor<256x1xf64>, %arg1 : tensor<256x1xf64>) -> tensor<256x1xi1> { 2 | %0 = "stablehlo.compare"(%arg0, %arg1) { 3 | comparison_direction = #stablehlo, 4 | compare_type = #stablehlo 5 | } : (tensor<256x1xf64>, tensor<256x1xf64>) -> tensor<256x1xi1> 6 | func.return %0 : tensor<256x1xi1> 7 | } 8 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/compare_LT_i32.mlir: -------------------------------------------------------------------------------- 1 | func.func @compare_LT_i32(%arg0 : tensor<256x1xi32>, %arg1 : tensor<256x1xi32>) -> tensor<256x1xi1> { 2 | %0 = "stablehlo.compare"(%arg0, %arg1) { 3 | comparison_direction = #stablehlo, 4 | compare_type = #stablehlo 5 | } : (tensor<256x1xi32>, tensor<256x1xi32>) -> tensor<256x1xi1> 6 | func.return %0 : 
tensor<256x1xi1> 7 | } 8 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/compare_LT_i64.mlir: -------------------------------------------------------------------------------- 1 | func.func @compare_LT_i64(%arg0 : tensor<256x1xi64>, %arg1 : tensor<256x1xi64>) -> tensor<256x1xi1> { 2 | %0 = "stablehlo.compare"(%arg0, %arg1) { 3 | comparison_direction = #stablehlo, 4 | compare_type = #stablehlo 5 | } : (tensor<256x1xi64>, tensor<256x1xi64>) -> tensor<256x1xi1> 6 | func.return %0 : tensor<256x1xi1> 7 | } 8 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/compare_NE_f32.mlir: -------------------------------------------------------------------------------- 1 | func.func @compare_LT_f32(%arg0 : tensor<256x1xf32>, %arg1 : tensor<256x1xf32>) -> tensor<256x1xi1> { 2 | %0 = "stablehlo.compare"(%arg0, %arg1) { 3 | comparison_direction = #stablehlo, 4 | compare_type = #stablehlo 5 | } : (tensor<256x1xf32>, tensor<256x1xf32>) -> tensor<256x1xi1> 6 | func.return %0 : tensor<256x1xi1> 7 | } 8 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/compare_NE_f64.mlir: -------------------------------------------------------------------------------- 1 | func.func @compare_LT_f64(%arg0 : tensor<256x1xf64>, %arg1 : tensor<256x1xf64>) -> tensor<256x1xi1> { 2 | %0 = "stablehlo.compare"(%arg0, %arg1) { 3 | comparison_direction = #stablehlo, 4 | compare_type = #stablehlo 5 | } : (tensor<256x1xf64>, tensor<256x1xf64>) -> tensor<256x1xi1> 6 | func.return %0 : tensor<256x1xi1> 7 | } 8 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/compare_NE_i32.mlir: -------------------------------------------------------------------------------- 1 | func.func @compare_LT_i32(%arg0 : tensor<256x1xi32>, %arg1 : 
tensor<256x1xi32>) -> tensor<256x1xi1> { 2 | %0 = "stablehlo.compare"(%arg0, %arg1) { 3 | comparison_direction = #stablehlo, 4 | compare_type = #stablehlo 5 | } : (tensor<256x1xi32>, tensor<256x1xi32>) -> tensor<256x1xi1> 6 | func.return %0 : tensor<256x1xi1> 7 | } 8 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/compare_NE_i64.mlir: -------------------------------------------------------------------------------- 1 | func.func @compare_LT_i64(%arg0 : tensor<256x1xi64>, %arg1 : tensor<256x1xi64>) -> tensor<256x1xi1> { 2 | %0 = "stablehlo.compare"(%arg0, %arg1) { 3 | comparison_direction = #stablehlo, 4 | compare_type = #stablehlo 5 | } : (tensor<256x1xi64>, tensor<256x1xi64>) -> tensor<256x1xi1> 6 | func.return %0 : tensor<256x1xi1> 7 | } 8 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/concatenate.mlir: -------------------------------------------------------------------------------- 1 | func.func @concatenate(%arg0 : tensor<256x1xi64>) -> tensor<256x2xi64> { 2 | %0 = stablehlo.constant dense<86400> : tensor<256x1xi64> 3 | %1 = "stablehlo.concatenate"(%0, %arg0) { 4 | dimension = 1 : i64 5 | } : (tensor<256x1xi64>, tensor<256x1xi64>) -> tensor<256x2xi64> 6 | func.return %1 : tensor<256x2xi64> 7 | } 8 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_f16_f32.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_f16_f32(%arg0 : tensor<1x256xf16>) -> tensor<1x256xf32> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xf16>) -> tensor<1x256xf32> 3 | func.return %0 : tensor<1x256xf32> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_f16_f64.mlir: 
-------------------------------------------------------------------------------- 1 | func.func @convert_f16_f64(%arg0 : tensor<1x256xf16>) -> tensor<1x256xf64> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xf16>) -> tensor<1x256xf64> 3 | func.return %0 : tensor<1x256xf64> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_f16_i16.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_f16_i16(%arg0 : tensor<1x256xf16>) -> tensor<1x256xi16> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xf16>) -> tensor<1x256xi16> 3 | func.return %0 : tensor<1x256xi16> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_f16_i32.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_f16_i32(%arg0 : tensor<1x256xf16>) -> tensor<1x256xi32> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xf16>) -> tensor<1x256xi32> 3 | func.return %0 : tensor<1x256xi32> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_f16_i64.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_f16_i64(%arg0 : tensor<1x256xf16>) -> tensor<1x256xi64> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xf16>) -> tensor<1x256xi64> 3 | func.return %0 : tensor<1x256xi64> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_f32_f16.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_f32_f16(%arg0 : tensor<1x256xf32>) -> tensor<1x256xf16> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xf32>) -> tensor<1x256xf16> 3 | func.return %0 : tensor<1x256xf16> 4 | } 
-------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_f32_f64.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_f32_f64(%arg0 : tensor<1x256xf32>) -> tensor<1x256xf64> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xf32>) -> tensor<1x256xf64> 3 | func.return %0 : tensor<1x256xf64> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_f32_i16.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_f32_i16(%arg0 : tensor<1x256xf32>) -> tensor<1x256xi16> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xf32>) -> tensor<1x256xi16> 3 | func.return %0 : tensor<1x256xi16> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_f32_i32.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_f32_i32(%arg0 : tensor<1x256xf32>) -> tensor<1x256xi32> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xf32>) -> tensor<1x256xi32> 3 | func.return %0 : tensor<1x256xi32> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_f32_i32_special_val.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_f32_i32_special_val(%arg0 : tensor<2x3xf32>) -> tensor<2x3xi32> { 2 | %0 = stablehlo.convert %arg0 : (tensor<2x3xf32>) -> tensor<2x3xi32> 3 | func.return %0 : tensor<2x3xi32> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_f32_i64.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_f32_i64(%arg0 : 
tensor<1x256xf32>) -> tensor<1x256xi64> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xf32>) -> tensor<1x256xi64> 3 | func.return %0 : tensor<1x256xi64> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_f64_f16.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_f64_f16(%arg0 : tensor<1x256xf64>) -> tensor<1x256xf16> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xf64>) -> tensor<1x256xf16> 3 | func.return %0 : tensor<1x256xf16> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_f64_f32.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_f64_f32(%arg0 : tensor<1x256xf64>) -> tensor<1x256xf32> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xf64>) -> tensor<1x256xf32> 3 | func.return %0 : tensor<1x256xf32> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_f64_i16.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_f64_i16(%arg0 : tensor<1x256xf64>) -> tensor<1x256xi16> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xf64>) -> tensor<1x256xi16> 3 | func.return %0 : tensor<1x256xi16> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_f64_i32.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_f64_i32(%arg0 : tensor<1x256xf64>) -> tensor<1x256xi32> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xf64>) -> tensor<1x256xi32> 3 | func.return %0 : tensor<1x256xi32> 4 | } -------------------------------------------------------------------------------- 
/tests/numerical_test/mlir_tests/cpu_ops/convert_f64_i64.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_f64_i64(%arg0 : tensor<1x256xf64>) -> tensor<1x256xi64> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xf64>) -> tensor<1x256xi64> 3 | func.return %0 : tensor<1x256xi64> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_i16_f16.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_i16_f16(%arg0 : tensor<1x256xi16>) -> tensor<1x256xf16> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xi16>) -> tensor<1x256xf16> 3 | func.return %0 : tensor<1x256xf16> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_i16_f32.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_i16_f32(%arg0 : tensor<1x256xi16>) -> tensor<1x256xf32> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xi16>) -> tensor<1x256xf32> 3 | func.return %0 : tensor<1x256xf32> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_i16_f64.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_i16_f64(%arg0 : tensor<1x256xi16>) -> tensor<1x256xf64> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xi16>) -> tensor<1x256xf64> 3 | func.return %0 : tensor<1x256xf64> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_i16_i32.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_i16_i32(%arg0 : tensor<1x256xi16>) -> tensor<1x256xi32> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xi16>) -> 
tensor<1x256xi32> 3 | func.return %0 : tensor<1x256xi32> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_i16_i64.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_i16_i64(%arg0 : tensor<1x256xi16>) -> tensor<1x256xi64> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xi16>) -> tensor<1x256xi64> 3 | func.return %0 : tensor<1x256xi64> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_i32_f16.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_i32_f16(%arg0 : tensor<1x256xi32>) -> tensor<1x256xf16> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xi32>) -> tensor<1x256xf16> 3 | func.return %0 : tensor<1x256xf16> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_i32_f32.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_i32_f32(%arg0 : tensor<1x256xi32>) -> tensor<1x256xf32> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xi32>) -> tensor<1x256xf32> 3 | func.return %0 : tensor<1x256xf32> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_i32_f64.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_i32_f64(%arg0 : tensor<1x256xi32>) -> tensor<1x256xf64> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xi32>) -> tensor<1x256xf64> 3 | func.return %0 : tensor<1x256xf64> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_i32_i16.mlir: -------------------------------------------------------------------------------- 1 | 
func.func @convert_i32_i16(%arg0 : tensor<1x256xi32>) -> tensor<1x256xi16> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xi32>) -> tensor<1x256xi16> 3 | func.return %0 : tensor<1x256xi16> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_i32_i64.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_i32_i64(%arg0 : tensor<1x256xi32>) -> tensor<1x256xi64> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xi32>) -> tensor<1x256xi64> 3 | func.return %0 : tensor<1x256xi64> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_i64_f16.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_i64_f16(%arg0 : tensor<1x256xi64>) -> tensor<1x256xf16> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xi64>) -> tensor<1x256xf16> 3 | func.return %0 : tensor<1x256xf16> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_i64_f32.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_i64_f32(%arg0 : tensor<1x256xi64>) -> tensor<1x256xf32> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xi64>) -> tensor<1x256xf32> 3 | func.return %0 : tensor<1x256xf32> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_i64_f64.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_i64_f64(%arg0 : tensor<1x256xi64>) -> tensor<1x256xf64> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xi64>) -> tensor<1x256xf64> 3 | func.return %0 : tensor<1x256xf64> 4 | } -------------------------------------------------------------------------------- 
/tests/numerical_test/mlir_tests/cpu_ops/convert_i64_i16.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_i64_i16(%arg0 : tensor<1x256xi64>) -> tensor<1x256xi16> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xi64>) -> tensor<1x256xi16> 3 | func.return %0 : tensor<1x256xi16> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/convert_i64_i32.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_i64_i32(%arg0 : tensor<1x256xi64>) -> tensor<1x256xi32> { 2 | %0 = stablehlo.convert %arg0 : (tensor<1x256xi64>) -> tensor<1x256xi32> 3 | func.return %0 : tensor<1x256xi32> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/custom_call_byteir_addn.mlir: -------------------------------------------------------------------------------- 1 | func.func @byteir.addn(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> tensor<128xf32> { 2 | %0 = stablehlo.custom_call @byteir.addn(%arg0, %arg1, %arg2) {byteir_attrs = {}} : (tensor<128xf32>, tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> 3 | return %0 : tensor<128xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/custom_call_byteir_arg_max.mlir: -------------------------------------------------------------------------------- 1 | func.func @byteir.arg_max$return_2(%arg0: tensor<3x128xf32>) -> (tensor<3xf32>, tensor<3xi64>) { 2 | %0:2 = stablehlo.custom_call @byteir.arg_max(%arg0) {byteir_attrs = {axis = 1 : i64, keep_dims = false, select_last_index = false}} : (tensor<3x128xf32>) -> (tensor<3xf32>, tensor<3xi64>) 3 | return %0#0, %0#1 : tensor<3xf32>, tensor<3xi64> 4 | } 5 | -------------------------------------------------------------------------------- 
/tests/numerical_test/mlir_tests/cpu_ops/custom_call_byteir_arg_max_i32.mlir: -------------------------------------------------------------------------------- 1 | func.func @byteir.arg_max$return_2(%arg0: tensor<3x128xi32>) -> (tensor<3xi32>, tensor<3xi64>) { 2 | %0:2 = stablehlo.custom_call @byteir.arg_max(%arg0) {byteir_attrs = {axis = 1 : i64, keep_dims = false, select_last_index = false}} : (tensor<3x128xi32>) -> (tensor<3xi32>, tensor<3xi64>) 3 | return %0#0, %0#1 : tensor<3xi32>, tensor<3xi64> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/custom_call_byteir_arg_min.mlir: -------------------------------------------------------------------------------- 1 | func.func @byteir.arg_min$return_2(%arg0: tensor<3x128xf32>) -> (tensor<3xf32>, tensor<3xi64>) { 2 | %0:2 = stablehlo.custom_call @byteir.arg_min(%arg0) {byteir_attrs = {axis = 1 : i64, keep_dims = false, select_last_index = false}} : (tensor<3x128xf32>) -> (tensor<3xf32>, tensor<3xi64>) 3 | return %0#0, %0#1 : tensor<3xf32>, tensor<3xi64> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/custom_call_byteir_arg_min_i32.mlir: -------------------------------------------------------------------------------- 1 | func.func @byteir.arg_min$return_2(%arg0: tensor<3x128xi32>) -> (tensor<3xi32>, tensor<3xi64>) { 2 | %0:2 = stablehlo.custom_call @byteir.arg_min(%arg0) {byteir_attrs = {axis = 1 : i64, keep_dims = false, select_last_index = false}} : (tensor<3x128xi32>) -> (tensor<3xi32>, tensor<3xi64>) 3 | return %0#0, %0#1 : tensor<3xi32>, tensor<3xi64> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/custom_call_byteir_softmax.mlir: -------------------------------------------------------------------------------- 1 | func.func @byteir.softmax(%arg0: tensor<10x128xf32>) -> 
tensor<10x128xf32> { 2 | %0 = stablehlo.custom_call @byteir.softmax(%arg0) {byteir_attrs = {axis = 1 : i64}} : (tensor<10x128xf32>) -> tensor<10x128xf32> 3 | return %0 : tensor<10x128xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/custom_call_tf_UpperBound.mlir: -------------------------------------------------------------------------------- 1 | func.func @custom_call_tf_UpperBound(%arg0 : tensor<1x2560xf16>) -> tensor<1x2560xi32> { 2 | %0 = stablehlo.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 6.000000e+00, 1.000000e+01, 2.000000e+01, 5.000000e+01]]> : tensor<1x8xf16> 3 | %1 = "stablehlo.custom_call"(%0, %arg0) { 4 | call_target_name = "tf.UpperBound", 5 | has_side_effect = false, 6 | backend_config = "", 7 | byteir_attrs = {}, 8 | api_version = 1 : i32, 9 | called_computations = [@tf.UpperBound] 10 | } : (tensor<1x8xf16>, tensor<1x2560xf16>) -> tensor<1x2560xi32> 11 | func.return %1 : tensor<1x2560xi32> 12 | } 13 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/divide_f16.mlir: -------------------------------------------------------------------------------- 1 | func.func @divide_f16(%arg0 : tensor<256x1xf16>, %arg1 : tensor<256x1xf16>) -> tensor<256x1xf16> { 2 | %0 = stablehlo.divide %arg0, %arg1 : tensor<256x1xf16> 3 | func.return %0 : tensor<256x1xf16> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/log_plus_one_f16.mlir: -------------------------------------------------------------------------------- 1 | func.func @log_plus_one(%arg0 : tensor<256x1xf16>) -> tensor<256x1xf16> { 2 | %0 = "stablehlo.log_plus_one"(%arg0) : (tensor<256x1xf16>) -> tensor<256x1xf16> 3 | func.return %0 : tensor<256x1xf16> 4 | } 5 | -------------------------------------------------------------------------------- 
/tests/numerical_test/mlir_tests/cpu_ops/maximum_f32.mlir: -------------------------------------------------------------------------------- 1 | func.func @maximum(%arg0 : tensor<256x1xf32>, %arg1 : tensor<256x1xf32>) -> tensor<256x1xf32> { 2 | %0 = "stablehlo.maximum"(%arg0, %arg1) : (tensor<256x1xf32>, tensor<256x1xf32>) -> tensor<256x1xf32> 3 | func.return %0 : tensor<256x1xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/maximum_f64.mlir: -------------------------------------------------------------------------------- 1 | func.func @maximum_f64(%arg0 : tensor<256x1xf64>, %arg1 : tensor<256x1xf64>) -> tensor<256x1xf64> { 2 | %0 = "stablehlo.maximum"(%arg0, %arg1) : (tensor<256x1xf64>, tensor<256x1xf64>) -> tensor<256x1xf64> 3 | func.return %0 : tensor<256x1xf64> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/maximum_i32.mlir: -------------------------------------------------------------------------------- 1 | func.func @maximum_i32(%arg0 : tensor<256x1xi32>, %arg1 : tensor<256x1xi32>) -> tensor<256x1xi32> { 2 | %0 = "stablehlo.maximum"(%arg0, %arg1) : (tensor<256x1xi32>, tensor<256x1xi32>) -> tensor<256x1xi32> 3 | func.return %0 : tensor<256x1xi32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/maximum_i64.mlir: -------------------------------------------------------------------------------- 1 | func.func @maximum_i64(%arg0 : tensor<256x1xi64>, %arg1 : tensor<256x1xi64>) -> tensor<256x1xi64> { 2 | %0 = "stablehlo.maximum"(%arg0, %arg1) : (tensor<256x1xi64>, tensor<256x1xi64>) -> tensor<256x1xi64> 3 | func.return %0 : tensor<256x1xi64> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/minimum_f32.mlir: 
-------------------------------------------------------------------------------- 1 | func.func @minimum(%arg0 : tensor<256x1xf32>, %arg1 : tensor<256x1xf32>) -> tensor<256x1xf32> { 2 | %0 = "stablehlo.minimum"(%arg0, %arg1) : (tensor<256x1xf32>, tensor<256x1xf32>) -> tensor<256x1xf32> 3 | func.return %0 : tensor<256x1xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/minimum_f64.mlir: -------------------------------------------------------------------------------- 1 | func.func @minimum_f64(%arg0 : tensor<256x1xf64>, %arg1 : tensor<256x1xf64>) -> tensor<256x1xf64> { 2 | %0 = "stablehlo.minimum"(%arg0, %arg1) : (tensor<256x1xf64>, tensor<256x1xf64>) -> tensor<256x1xf64> 3 | func.return %0 : tensor<256x1xf64> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/minimum_i32.mlir: -------------------------------------------------------------------------------- 1 | func.func @minimum_i32(%arg0 : tensor<256x1xi32>, %arg1 : tensor<256x1xi32>) -> tensor<256x1xi32> { 2 | %0 = "stablehlo.minimum"(%arg0, %arg1) : (tensor<256x1xi32>, tensor<256x1xi32>) -> tensor<256x1xi32> 3 | func.return %0 : tensor<256x1xi32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/minimum_i64.mlir: -------------------------------------------------------------------------------- 1 | func.func @minimum_i64(%arg0 : tensor<256x1xi64>, %arg1 : tensor<256x1xi64>) -> tensor<256x1xi64> { 2 | %0 = "stablehlo.minimum"(%arg0, %arg1) : (tensor<256x1xi64>, tensor<256x1xi64>) -> tensor<256x1xi64> 3 | func.return %0 : tensor<256x1xi64> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/multiply_f32.mlir: -------------------------------------------------------------------------------- 1 | 
func.func @multiply_256x1xf32(%arg0 : tensor<256x1xf32>, %arg1 : tensor<256x1xf32>) -> tensor<256x1xf32> { 2 | %0 = "stablehlo.multiply"(%arg0, %arg1) : (tensor<256x1xf32>, tensor<256x1xf32>) -> tensor<256x1xf32> 3 | func.return %0 : tensor<256x1xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/multiply_f64.mlir: -------------------------------------------------------------------------------- 1 | func.func @multiply_256x1xf64(%arg0 : tensor<256x1xf64>, %arg1 : tensor<256x1xf64>) -> tensor<256x1xf64> { 2 | %0 = "stablehlo.multiply"(%arg0, %arg1) : (tensor<256x1xf64>, tensor<256x1xf64>) -> tensor<256x1xf64> 3 | func.return %0 : tensor<256x1xf64> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/multiply_i32.mlir: -------------------------------------------------------------------------------- 1 | func.func @multiply_256x1xi32(%arg0 : tensor<256x1xi32>, %arg1 : tensor<256x1xi32>) -> tensor<256x1xi32> { 2 | %0 = "stablehlo.multiply"(%arg0, %arg1) : (tensor<256x1xi32>, tensor<256x1xi32>) -> tensor<256x1xi32> 3 | func.return %0 : tensor<256x1xi32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/multiply_i64.mlir: -------------------------------------------------------------------------------- 1 | func.func @multiply_256x1xi64(%arg0 : tensor<256x1xi64>, %arg1 : tensor<256x1xi64>) -> tensor<256x1xi64> { 2 | %0 = "stablehlo.multiply"(%arg0, %arg1) : (tensor<256x1xi64>, tensor<256x1xi64>) -> tensor<256x1xi64> 3 | func.return %0 : tensor<256x1xi64> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/reduce_f32.mlir: -------------------------------------------------------------------------------- 1 | func.func @reduce_f32(%input : tensor<256x5xf32>) 
-> tensor<256xf32> { 2 | %0 = stablehlo.constant dense<-0.000000e+00> : tensor 3 | %1 = "stablehlo.reduce"(%input, %0) ({ 4 | ^bb0(%arg0: tensor, %arg1: tensor): 5 | %2 = "stablehlo.add"(%arg0, %arg1) : (tensor, tensor) -> tensor 6 | "stablehlo.return"(%2) : (tensor) -> () 7 | }) { 8 | dimensions = array 9 | } : (tensor<256x5xf32>, tensor) -> tensor<256xf32> 10 | func.return %1 : tensor<256xf32> 11 | } 12 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/remainder_i64.mlir: -------------------------------------------------------------------------------- 1 | func.func @remainder(%arg0 : tensor<256x1xi64>) -> tensor<256x1xi64> { 2 | %0 = stablehlo.constant dense<86400> : tensor<256x1xi64> 3 | %1 = "stablehlo.remainder"(%arg0, %0) : (tensor<256x1xi64>, tensor<256x1xi64>) -> tensor<256x1xi64> 4 | func.return %1 : tensor<256x1xi64> 5 | } 6 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/reshape_slice.mlir: -------------------------------------------------------------------------------- 1 | func.func @reshape_slice_reshape(%arg0: tensor<256x2xf16>) -> (tensor<256xf16>) { 2 | %0 = "stablehlo.reshape"(%arg0) : (tensor<256x2xf16>) -> tensor<256x1x2xf16> 3 | %1 = "stablehlo.slice"(%0) {limit_indices = array, start_indices = array, strides =array} : (tensor<256x1x2xf16>) -> tensor<256x1x1xf16> 4 | %2 = "stablehlo.reshape"(%1) : (tensor<256x1x1xf16>) -> tensor<256xf16> 5 | func.return %2 : tensor<256xf16> 6 | } 7 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/rng.mlir: -------------------------------------------------------------------------------- 1 | func.func @rng_f16() -> tensor<256x120xf16> { 2 | %0 = stablehlo.constant dense<[256, 120]> : tensor<2xi64> 3 | %1 = stablehlo.constant dense<1.000000e+00> : tensor 4 | %2 = stablehlo.constant 
dense<0.000000e+00> : tensor 5 | %3 = "stablehlo.rng"(%2, %1, %0) { 6 | rng_distribution = #stablehlo, 7 | device = "host" 8 | } : (tensor, tensor, tensor<2xi64>) -> tensor<256x120xf16> 9 | func.return %3 : tensor<256x120xf16> 10 | } 11 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/scatter_insert_slice.mlir: -------------------------------------------------------------------------------- 1 | func.func @forward(%arg0: tensor<6x8x5xf32>, %arg1: tensor<6x1x5xf32>) -> tensor<6x8x5xf32> { 2 | %c = stablehlo.constant dense<0> : tensor<1x1xi64> 3 | %0 = "stablehlo.scatter"(%arg0, %c, %arg1) <{indices_are_sorted = false, scatter_dimension_numbers = #stablehlo.scatter, unique_indices = false}> ({ 4 | ^bb0(%arg2: tensor, %arg3: tensor): 5 | stablehlo.return %arg3 : tensor 6 | }) : (tensor<6x8x5xf32>, tensor<1x1xi64>, tensor<6x1x5xf32>) -> tensor<6x8x5xf32> 7 | return %0 : tensor<6x8x5xf32> 8 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/select_f32.mlir: -------------------------------------------------------------------------------- 1 | func.func @select_f32(%pred : tensor<256x1xi1>, %on_true : tensor<256x1xf32>, %on_false : tensor<256x1xf32>) -> tensor<256x1xf32> { 2 | %0 = "stablehlo.select"(%pred, %on_true, %on_false) : (tensor<256x1xi1>, tensor<256x1xf32>, tensor<256x1xf32>) -> tensor<256x1xf32> 3 | func.return %0 : tensor<256x1xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/select_f64.mlir: -------------------------------------------------------------------------------- 1 | func.func @select_f64(%pred : tensor<256x1xi1>, %on_true : tensor<256x1xf64>, %on_false : tensor<256x1xf64>) -> tensor<256x1xf64> { 2 | %0 = "stablehlo.select"(%pred, %on_true, %on_false) : (tensor<256x1xi1>, tensor<256x1xf64>, tensor<256x1xf64>) -> 
tensor<256x1xf64> 3 | func.return %0 : tensor<256x1xf64> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/select_i64.mlir: -------------------------------------------------------------------------------- 1 | func.func @select_i64(%pred : tensor<256x1xi1>, %on_true : tensor<256x1xi64>, %on_false : tensor<256x1xi64>) -> tensor<256x1xi64> { 2 | %0 = "stablehlo.select"(%pred, %on_true, %on_false) : (tensor<256x1xi1>, tensor<256x1xi64>, tensor<256x1xi64>) -> tensor<256x1xi64> 3 | func.return %0 : tensor<256x1xi64> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/cpu_ops/subtrace_f16.mlir: -------------------------------------------------------------------------------- 1 | func.func @subtract_f16(%arg0 : tensor<256x1xf16>, %arg1 : tensor<256x1xf16>) -> tensor<256x1xf16> { 2 | %0 = "stablehlo.subtract"(%arg0, %arg1) : (tensor<256x1xf16>, tensor<256x1xf16>) -> (tensor<256x1xf16>) 3 | func.return %0 : tensor<256x1xf16> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/add.mlir: -------------------------------------------------------------------------------- 1 | func.func @add(%arg0 : tensor<256x256xf32>, %arg1 : tensor<256x256xf32>) -> tensor<256x256xf32> { 2 | %0 = mhlo.add %arg0, %arg1 : tensor<256x256xf32> 3 | return %0 : tensor<256x256xf32> 4 | } 5 | 6 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/bmm_rcr.mlir: -------------------------------------------------------------------------------- 1 | func.func @bmm_rcr(%arg0 : tensor<1x32x256x128xf16>, %arg1 : tensor<32x256x128xf16>) -> tensor<32x256x256xf16> { 2 | %0 = "mhlo.transpose"(%arg0) {permutation = dense<[0, 1, 3, 2]> : tensor<4xi64>} : (tensor<1x32x256x128xf16>) -> tensor<1x32x128x256xf16> 3 | %1 = mhlo.reshape 
%0 : (tensor<1x32x128x256xf16>) -> tensor<32x128x256xf16> 4 | %2 = "mhlo.dot_general"(%arg1, %1) {dot_dimension_numbers = #mhlo.dot} : (tensor<32x256x128xf16>, tensor<32x128x256xf16>) -> tensor<32x256x256xf16> 5 | return %2 : tensor<32x256x256xf16> 6 | } 7 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/bmm_rrc.mlir: -------------------------------------------------------------------------------- 1 | 2 | func.func @bmm_rrc(%arg0 : tensor<32x128x256xf16>, %arg1 : tensor<32x256x256xf16>) -> tensor<1x32x256x128xf16> { 3 | %0 = "mhlo.dot_general"(%arg0, %arg1) {dot_dimension_numbers = #mhlo.dot} : (tensor<32x128x256xf16>, tensor<32x256x256xf16>) -> tensor<32x128x256xf16> 4 | %1 = mhlo.reshape %0 : (tensor<32x128x256xf16>) -> tensor<1x32x128x256xf16> 5 | %2 = "mhlo.transpose"(%1) {permutation = dense<[0, 1, 3, 2]> : tensor<4xi64>} : (tensor<1x32x128x256xf16>) -> tensor<1x32x256x128xf16> 6 | return %2 : tensor<1x32x256x128xf16> 7 | } 8 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/bmm_rrr_add_f16.mlir: -------------------------------------------------------------------------------- 1 | func.func @bmm_rrr_add(%arg0 : tensor<32x256x256xf16>, %arg1 : tensor<32x256x128xf16>, %arg2 : tensor<1x32x256x128xf16>) -> tensor<1x32x256x128xf16> { 2 | %0 = "mhlo.dot_general"(%arg0, %arg1) {dot_dimension_numbers = #mhlo.dot} : (tensor<32x256x256xf16>, tensor<32x256x128xf16>) -> tensor<32x256x128xf16> 3 | %1 = mhlo.reshape %0 : (tensor<32x256x128xf16>) -> tensor<1x32x256x128xf16> 4 | %2 = mhlo.add %arg2, %1 : tensor<1x32x256x128xf16> 5 | return %2 : tensor<1x32x256x128xf16> 6 | } 7 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/bmm_rrr_f16.mlir: -------------------------------------------------------------------------------- 1 | func.func @bmm_rrr(%arg0 : 
tensor<32x256x256xf16>, %arg1 : tensor<32x256x128xf16>) -> tensor<32x256x128xf16> { 2 | %0 = "mhlo.dot_general"(%arg0, %arg1) {dot_dimension_numbers = #mhlo.dot} : (tensor<32x256x256xf16>, tensor<32x256x128xf16>) -> tensor<32x256x128xf16> 3 | return %0 : tensor<32x256x128xf16> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/bmm_rrr_permute_f16.mlir: -------------------------------------------------------------------------------- 1 | func.func @bmm_rrr_permute(%arg0: tensor<32x64x64xf16>, %arg1: tensor<32x64x128xf16>) -> tensor<1x64x32x128xf16> { 2 | %0 = "mhlo.dot_general"(%arg0, %arg1) {dot_dimension_numbers = #mhlo.dot} : (tensor<32x64x64xf16>, tensor<32x64x128xf16>) -> tensor<32x64x128xf16> 3 | %1 = mhlo.reshape %0 : (tensor<32x64x128xf16>) -> tensor<1x32x64x128xf16> 4 | %2 = "mhlo.transpose"(%1) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<1x32x64x128xf16>) -> tensor<1x64x32x128xf16> 5 | return %2 : tensor<1x64x32x128xf16> 6 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/bmm_rrr_permute_f32.mlir: -------------------------------------------------------------------------------- 1 | func.func @bmm_rrr_permute_f32(%arg0: tensor<4x2x2xf32>, %arg1: tensor<4x2x2xf32>) -> tensor<2x2x2x2xf32> { 2 | %0 = "mhlo.dot_general"(%arg0, %arg1) {dot_dimension_numbers = #mhlo.dot} : (tensor<4x2x2xf32>, tensor<4x2x2xf32>) -> tensor<4x2x2xf32> 3 | %1 = mhlo.reshape %0 : (tensor<4x2x2xf32>) -> tensor<2x2x2x2xf32> 4 | %2 = "mhlo.transpose"(%1) {permutation = dense<[0, 2, 1, 3]> : tensor<4xi64>} : (tensor<2x2x2x2xf32>) -> tensor<2x2x2x2xf32> 5 | return %2 : tensor<2x2x2x2xf32> 6 | } 7 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/broadcast.mlir: -------------------------------------------------------------------------------- 1 | func.func 
@broadcast(%arg0 : tensor<1x1x256x128xf16>) -> tensor<1x32x256x128xf16> { 2 | %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1, 2, 3]> : tensor<4xi64>} : (tensor<1x1x256x128xf16>) -> tensor<1x32x256x128xf16> 3 | return %0 : tensor<1x32x256x128xf16> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/broadcast1.mlir: -------------------------------------------------------------------------------- 1 | func.func @broadcast1(%arg0 : tensor<4096xf32>) -> tensor<1x256x4096xf32> { 2 | %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<4096xf32>) -> tensor<1x256x4096xf32> 3 | return %0 : tensor<1x256x4096xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/compare_eq.mlir: -------------------------------------------------------------------------------- 1 | func.func @compare_eq(%arg0 : tensor<1x256x1024xi64>, %arg1 : tensor<1x256x1024xi64>) -> tensor<1x256x1024xi1> { 2 | %0 = mhlo.compare EQ, %arg0, %arg1, SIGNED : (tensor<1x256x1024xi64>, tensor<1x256x1024xi64>) -> tensor<1x256x1024xi1> 3 | return %0 : tensor<1x256x1024xi1> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/compare_lt.mlir: -------------------------------------------------------------------------------- 1 | func.func @compare_lt(%arg0 : tensor<1x32x256x256xf32>, %arg1 : tensor<1x32x256x256xf32>) -> tensor<1x32x256x256xi1> { 2 | %0 = mhlo.compare LT, %arg0, %arg1, FLOAT : (tensor<1x32x256x256xf32>, tensor<1x32x256x256xf32>) -> tensor<1x32x256x256xi1> 3 | return %0 : tensor<1x32x256x256xi1> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/concat.mlir: -------------------------------------------------------------------------------- 1 
| func.func @concat(%arg0 : tensor<1x32x256x64xf16>, %arg1 : tensor<1x32x256x64xf16>) -> tensor<1x32x256x128xf16> { 2 | %0 = "mhlo.concatenate"(%arg0, %arg1) {dimension = 3 : i64} : (tensor<1x32x256x64xf16>, tensor<1x32x256x64xf16>) -> tensor<1x32x256x128xf16> 3 | return %0 : tensor<1x32x256x128xf16> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/concat2.mlir: -------------------------------------------------------------------------------- 1 | func.func @concat2(%arg0: tensor, %arg1: tensor) -> (tensor<2xi64>) { 2 | %0 = mhlo.reshape %arg0 : (tensor) -> tensor<1xi64> 3 | %1 = mhlo.reshape %arg1 : (tensor) -> tensor<1xi64> 4 | %2 = "mhlo.concatenate"(%0, %1) {dimension = 0 : i64} : (tensor<1xi64>, tensor<1xi64>) -> tensor<2xi64> 5 | return %2 : tensor<2xi64> 6 | } 7 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/convert_f16_f32.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_f16_f32(%arg0 : tensor<1x256x1024xf16>) -> tensor<1x256x1024xf32> { 2 | %0 = mhlo.convert %arg0 : (tensor<1x256x1024xf16>) -> tensor<1x256x1024xf32> 3 | return %0 : tensor<1x256x1024xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/convert_f32_f16.mlir: -------------------------------------------------------------------------------- 1 | func.func @convert_f32_f16(%arg0 : tensor<1x256x1024xf32>) -> tensor<1x256x1024xf16> { 2 | %0 = mhlo.convert %arg0 : (tensor<1x256x1024xf32>) -> tensor<1x256x1024xf16> 3 | return %0 : tensor<1x256x1024xf16> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/divide.mlir: -------------------------------------------------------------------------------- 1 | func.func @divide(%arg0 : 
tensor<1x256x4096xf32>) -> tensor<1x256x4096xf32> { 2 | %cst = mhlo.constant dense<4.096000e+03> : tensor<1x256x4096xf32> 3 | %0 = mhlo.divide %arg0, %cst : tensor<1x256x4096xf32> 4 | return %0 : tensor<1x256x4096xf32> 5 | } 6 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/gather.mlir: -------------------------------------------------------------------------------- 1 | func.func @gather(%arg0 : tensor<256x128xf16>, %arg1 : tensor<1x256xi64>) -> tensor<1x256x128xf16> { 2 | %0 = "mhlo.gather"(%arg0, %arg1) {dimension_numbers = #mhlo.gather, indices_are_sorted = false, slice_sizes = dense<[1, 128]> : tensor<2xi64>} : (tensor<256x128xf16>, tensor<1x256xi64>) -> tensor<1x256x128xf16> 3 | return %0 : tensor<1x256x128xf16> 4 | } 5 | 6 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/gemm_crr_f16.mlir: -------------------------------------------------------------------------------- 1 | func.func @bmm_crr(%arg0 : tensor<1x256x4096xf16>, %arg1 : tensor<256x11008xf16>) -> tensor<4096x11008xf16> { 2 | %0 = mhlo.reshape %arg0 : (tensor<1x256x4096xf16>) -> tensor<256x4096xf16> 3 | %1 = "mhlo.transpose"(%0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<256x4096xf16>) -> tensor<4096x256xf16> 4 | %2 = "mhlo.dot"(%1, %arg1) : (tensor<4096x256xf16>, tensor<256x11008xf16>) -> tensor<4096x11008xf16> 5 | return %2: tensor<4096x11008xf16> 6 | } 7 | 8 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/gemm_rrr_f16.mlir: -------------------------------------------------------------------------------- 1 | func.func @gemm_rrr_f16(%arg0 : tensor<1024x4096xf16>, %arg1 : tensor<4096x4096xf16>) -> tensor<1024x4096xf16> { 2 | %0 = "mhlo.dot"(%arg0, %arg1) : (tensor<1024x4096xf16>, tensor<4096x4096xf16>) -> tensor<1024x4096xf16> 3 | return %0 : tensor<1024x4096xf16> 4 | } 5 | 
-------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/gemm_rrr_f32.mlir: -------------------------------------------------------------------------------- 1 | func.func @gemm_rrr_f32(%arg0 : tensor<4x4xf32>, %arg1 : tensor<4x4xf32>) -> tensor<4x4xf32> { 2 | %0 = "mhlo.dot"(%arg0, %arg1) : (tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4xf32> 3 | return %0 : tensor<4x4xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/insert_slice.mlir: -------------------------------------------------------------------------------- 1 | func.func @insert_slice(%arg0 : tensor<1x32x256x64xf16>, %arg1 : tensor<1x32x256x64xf16>) -> tensor<1x32x256x128xf16> { 2 | %cst = mhlo.constant dense<0.000000e+00> : tensor<1x32x256x128xf16> 3 | %inserted_slice_0 = tensor.insert_slice %arg0 into %cst[0, 0, 0, 64] [1, 32, 256, 64] [1, 1, 1, 1] : tensor<1x32x256x64xf16> into tensor<1x32x256x128xf16> 4 | %inserted_slice_1 = tensor.insert_slice %arg1 into %inserted_slice_0[0, 0, 0, 0] [1, 32, 256, 64] [1, 1, 1, 1] : tensor<1x32x256x64xf16> into tensor<1x32x256x128xf16> 5 | return %inserted_slice_1 : tensor<1x32x256x128xf16> 6 | } 7 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/layernorm.mlir: -------------------------------------------------------------------------------- 1 | func.func @layer_norm(%arg0 : tensor<1x16x4096xf32>, %arg1 : tensor<4096xf32>, %arg2 : tensor<4096xf32>) -> tensor<1x16x4096xf32> { 2 | %0 = "mhlo.custom_call"(%arg0, %arg1, %arg2) {api_version = 1 : i32, backend_config = "", byteir_attrs = {axis = [2], epsilon = 1.000000e-05 : f64}, call_target_name = "byteir.layer_norm", called_computations = [], has_side_effect = false} : (tensor<1x16x4096xf32>, tensor<4096xf32>, tensor<4096xf32>) -> tensor<1x16x4096xf32> 3 | return %0 : tensor<1x16x4096xf32> 4 | } 5 | 
-------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/logistic.mlir: -------------------------------------------------------------------------------- 1 | func.func @logistic(%arg0 : tensor<1x256x1024xf16>) -> tensor<1x256x1024xf16> { 2 | %0 = mhlo.logistic %arg0 : tensor<1x256x1024xf16> 3 | return %0 : tensor<1x256x1024xf16> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/mul_f16.mlir: -------------------------------------------------------------------------------- 1 | func.func @multiply(%arg0 : tensor<1x32x256x128xf16>, %arg1 : tensor<1x32x256x128xf16>) -> tensor<1x32x256x128xf16> { 2 | %0 = mhlo.multiply %arg0, %arg1 : tensor<1x32x256x128xf16> 3 | return %0 : tensor<1x32x256x128xf16> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/mul_f32.mlir: -------------------------------------------------------------------------------- 1 | func.func @multiply_f32(%arg0 : tensor<1x32x256x128xf32>, %arg1 : tensor<1x32x256x128xf32>) -> tensor<1x32x256x128xf32> { 2 | %0 = mhlo.multiply %arg0, %arg1 : tensor<1x32x256x128xf32> 3 | return %0 : tensor<1x32x256x128xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/negate.mlir: -------------------------------------------------------------------------------- 1 | func.func @negate(%arg0 : tensor<1x256xf32>) -> tensor<1x256xf32> { 2 | %0 = mhlo.negate %arg0 : tensor<1x256xf32> 3 | return %0 : tensor<1x256xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/power.mlir: -------------------------------------------------------------------------------- 1 | func.func @power(%arg0 : tensor<1x256x4096xf32>) -> tensor<1x256x4096xf32> { 2 | %cst = mhlo.constant 
dense<3.000000e+00> : tensor<1x256x4096xf32> 3 | %0 = mhlo.power %arg0, %cst : tensor<1x256x4096xf32> 4 | return %0 : tensor<1x256x4096xf32> 5 | } 6 | 7 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/reduce_first_dim.mlir: -------------------------------------------------------------------------------- 1 | func.func @reduce_sum(%arg0 : tensor<256x2304xf32>) -> tensor<2304xf32> { 2 | %cst = mhlo.constant dense<0.000000e+00> : tensor 3 | %0 = mhlo.reduce(%arg0 init: %cst) across dimensions = [0] : (tensor<256x2304xf32>, tensor) -> tensor<2304xf32> 4 | reducer(%arg1: tensor, %arg2: tensor) { 5 | %1 = mhlo.add %arg1, %arg2 : tensor 6 | mhlo.return %1 : tensor 7 | } 8 | return %0 : tensor<2304xf32> 9 | } 10 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/reduce_sum.mlir: -------------------------------------------------------------------------------- 1 | func.func @reduce_sum(%arg0 : tensor<1x32x256x256xf32>) -> tensor<1x32x256xf32> { 2 | %cst = mhlo.constant dense<0.000000e+00> : tensor 3 | %0 = mhlo.reduce(%arg0 init: %cst) across dimensions = [3] : (tensor<1x32x256x256xf32>, tensor) -> tensor<1x32x256xf32> 4 | reducer(%arg1: tensor, %arg2: tensor) { 5 | %1 = mhlo.add %arg1, %arg2 : tensor 6 | mhlo.return %1 : tensor 7 | } 8 | return %0 : tensor<1x32x256xf32> 9 | } 10 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/reduce_sum_2d.mlir: -------------------------------------------------------------------------------- 1 | func.func @reduce(%arg0 : tensor<1x256x1024xf32>) -> tensor<1x256xf32> { 2 | %cst = mhlo.constant dense<0.000000e+00> : tensor 3 | %0 = mhlo.reduce(%arg0 init: %cst) across dimensions = [2] : (tensor<1x256x1024xf32>, tensor) -> tensor<1x256xf32> 4 | reducer(%arg1: tensor, %arg2: tensor) { 5 | %1 = mhlo.add %arg1, %arg2 : tensor 6 | 
mhlo.return %1 : tensor 7 | } 8 | return %0 : tensor<1x256xf32> 9 | } 10 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/reduce_sum_first_2d.mlir: -------------------------------------------------------------------------------- 1 | func.func @reduce(%arg0 : tensor<1x256x1024xf32>) -> tensor<1024xf32> { 2 | %cst = mhlo.constant dense<0.000000e+00> : tensor 3 | %0 = mhlo.reduce(%arg0 init: %cst) across dimensions = [0, 1] : (tensor<1x256x1024xf32>, tensor) -> tensor<1024xf32> 4 | reducer(%arg1: tensor, %arg2: tensor) { 5 | %1 = mhlo.add %arg1, %arg2 : tensor 6 | mhlo.return %1 : tensor 7 | } 8 | return %0 : tensor<1024xf32> 9 | } 10 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/rsqrt.mlir: -------------------------------------------------------------------------------- 1 | func.func @rsqrt(%arg0 : tensor<1x256xf32>) -> tensor<1x256xf32> { 2 | %0 = mhlo.rsqrt %arg0 : tensor<1x256xf32> 3 | return %0 : tensor<1x256xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/scatter.mlir: -------------------------------------------------------------------------------- 1 | 2 | func.func @scatter(%arg0 : tensor<256x1xi64>, %arg1 : tensor<256x4096xf32>) -> tensor<32000x4096xf32> { 3 | %cst = mhlo.constant dense<0.000000e+00> : tensor<32000x4096xf32> 4 | %0 = "mhlo.scatter"(%cst, %arg0, %arg1) ({ 5 | ^bb0(%arg66: tensor, %arg67: tensor): 6 | %395 = mhlo.add %arg66, %arg67 : tensor 7 | mhlo.return %395 : tensor 8 | }) {indices_are_sorted = false, scatter_dimension_numbers = #mhlo.scatter, unique_indices = false} : (tensor<32000x4096xf32>, tensor<256x1xi64>, tensor<256x4096xf32>) -> tensor<32000x4096xf32> 9 | return %0 : tensor<32000x4096xf32> 10 | } 11 | -------------------------------------------------------------------------------- 
/tests/numerical_test/mlir_tests/ops/scatter_insert_slice.mlir: -------------------------------------------------------------------------------- 1 | func.func @forward(%arg0: tensor<6x8x5xf32>, %arg1: tensor<6x1x5xf32>) -> tensor<6x8x5xf32> { 2 | %0 = mhlo.constant dense<0> : tensor<1x1xi64> 3 | %1 = "mhlo.scatter"(%arg0, %0, %arg1) <{indices_are_sorted = false, scatter_dimension_numbers = #mhlo.scatter, unique_indices = false}> ({ 4 | ^bb0(%arg2: tensor, %arg3: tensor): 5 | mhlo.return %arg3 : tensor 6 | }) : (tensor<6x8x5xf32>, tensor<1x1xi64>, tensor<6x1x5xf32>) -> tensor<6x8x5xf32> 7 | return %1 : tensor<6x8x5xf32> 8 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/select.mlir: -------------------------------------------------------------------------------- 1 | func.func @select(%arg0 : tensor<1x32x256x256xi1>, %arg1 : tensor<1x32x256x256xf32>, %arg2 : tensor<1x32x256x256xf32>) -> tensor<1x32x256x256xf32> { 2 | %0 = mhlo.select %arg0, %arg1, %arg2 : tensor<1x32x256x256xi1>, tensor<1x32x256x256xf32> 3 | return %0 : tensor<1x32x256x256xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/slice.mlir: -------------------------------------------------------------------------------- 1 | func.func @slice(%arg0 : tensor<1x1x1024x1024xi1>) -> tensor<1x1x256x1024xi1> { 2 | %0 = "mhlo.slice"(%arg0) {limit_indices = dense<[1, 1, 256, 1024]> : tensor<4xi64>, start_indices = dense<0> : tensor<4xi64>, strides = dense<1> : tensor<4xi64>} : (tensor<1x1x1024x1024xi1>) -> tensor<1x1x256x1024xi1> 3 | return %0 : tensor<1x1x256x1024xi1> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/softmax.mlir: -------------------------------------------------------------------------------- 1 | func.func @softmax(%arg0 : tensor<8x12x256x256xf32>) -> 
tensor<8x12x256x256xf32> { 2 | %0 = mhlo.custom_call @byteir.softmax(%arg0) {backend_config = "", byteir_attrs = {axis = 3 : i64}} : (tensor<8x12x256x256xf32>) -> tensor<8x12x256x256xf32> 3 | return %0 : tensor<8x12x256x256xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/transpose0312.mlir: -------------------------------------------------------------------------------- 1 | func.func @transpose(%arg0 : tensor<1x12x64x256xf32>) -> tensor<1x256x12x64xf32> { 2 | %0 = "mhlo.transpose"(%arg0) {permutation = dense<[0, 3, 1, 2]> : tensor<4xi64>} : (tensor<1x12x64x256xf32>) -> tensor<1x256x12x64xf32> 3 | return %0 : tensor<1x256x12x64xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/transpose102.mlir: -------------------------------------------------------------------------------- 1 | func.func @transpose102(%arg0 : tensor<12x64x256xf32>) -> tensor<64x12x256xf32> { 2 | %0 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<12x64x256xf32>) -> tensor<64x12x256xf32> 3 | return %0 : tensor<64x12x256xf32> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/transpose1023.mlir: -------------------------------------------------------------------------------- 1 | func.func @transpose1023(%arg0 : tensor<12x64x256x128xf32>) -> tensor<64x12x256x128xf32> { 2 | %0 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0, 2, 3]> : tensor<4xi64>} : (tensor<12x64x256x128xf32>) -> tensor<64x12x256x128xf32> 3 | return %0 : tensor<64x12x256x128xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/transpose120.mlir: -------------------------------------------------------------------------------- 1 | func.func @transpose120(%arg0 : 
tensor<12x64x256xf32>) -> tensor<64x256x12xf32> { 2 | %0 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 2, 0]> : tensor<3xi64>} : (tensor<12x64x256xf32>) -> tensor<64x256x12xf32> 3 | return %0 : tensor<64x256x12xf32> 4 | } -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/transpose1203.mlir: -------------------------------------------------------------------------------- 1 | func.func @transpose1203(%arg0 : tensor<12x3x7x6xf16>) -> tensor<3x7x12x6xf16> { 2 | %0 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 2, 0, 3]> : tensor<4xi64>} : (tensor<12x3x7x6xf16>) -> tensor<3x7x12x6xf16> 3 | return %0 : tensor<3x7x12x6xf16> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/transpose2013.mlir: -------------------------------------------------------------------------------- 1 | func.func @transpose2013(%arg0 : tensor<12x64x256x128xf32>) -> tensor<256x12x64x128xf32> { 2 | %0 = "mhlo.transpose"(%arg0) {permutation = dense<[2, 0, 1, 3]> : tensor<4xi64>} : (tensor<12x64x256x128xf32>) -> tensor<256x12x64x128xf32> 3 | return %0 : tensor<256x12x64x128xf32> 4 | } 5 | -------------------------------------------------------------------------------- /tests/numerical_test/mlir_tests/ops/transpose2d.mlir: -------------------------------------------------------------------------------- 1 | func.func @transpose2d(%arg0 : tensor<4096x13696xf16>) -> tensor<13696x4096xf16> { 2 | %0 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<4096x13696xf16>) -> tensor<13696x4096xf16> 3 | return %0 : tensor<13696x4096xf16> 4 | } --------------------------------------------------------------------------------