├── .clang-format ├── .clang-tidy ├── .cmakelintrc ├── .flake8 ├── .github ├── ISSUE_TEMPLATE │ ├── bug-report.yml │ ├── documentation.yml │ └── feature-request.yml ├── actions │ ├── inductor-xpu-e2e-test │ │ └── action.yml │ ├── print-environment │ │ └── action.yml │ └── pt2e │ │ └── action.yml ├── ci_commit_pins │ ├── torchbench.txt │ └── triton.txt ├── ci_expected_accuracy │ ├── check_expected.py │ ├── inductor_huggingface_inference.csv │ ├── inductor_huggingface_training.csv │ ├── inductor_timm_models_inference.csv │ ├── inductor_timm_models_training.csv │ ├── inductor_torchbench_inference.csv │ └── inductor_torchbench_training.csv ├── scripts │ ├── apply_torch_pr.py │ ├── build.sh │ ├── calculate_best_perf.py │ ├── check-transformers.py │ ├── check-ut.py │ ├── e2e_summary.sh │ ├── env.sh │ ├── inductor_summary.py │ ├── inductor_xpu_test.sh │ ├── install_xpu.bat │ ├── lintrunner.sh │ ├── microbench_summary.sh │ ├── parse-junitxml.py │ ├── perf_comparison.py │ ├── rpath.sh │ ├── spec.py │ ├── summary_pt2e.py │ └── ut_result_check.sh └── workflows │ ├── _linux_accelerate.yml │ ├── _linux_build.yml │ ├── _linux_op_benchmark.yml │ ├── _linux_transformers.yml │ ├── _linux_ut.yml │ ├── _performance_comparison.yml │ ├── _windows_ut.yml │ ├── nightly_ondemand.yml │ ├── nightly_ondemand_rolling.yml │ ├── nightly_ondemand_whl.yml │ └── pull.yml ├── .gitignore ├── .lintrunner.toml ├── CMakeLists.txt ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── SECURITY.md ├── cmake ├── BuildFlags.cmake ├── ClangFormat.cmake ├── Codegen.cmake ├── Modules │ ├── FindONEMKL.cmake │ ├── FindSYCL.cmake │ ├── FindSYCL │ │ ├── make2cmake.cmake │ │ └── run_sycl.cmake │ ├── FindSYCLToolkit.cmake │ └── FindXCCL.cmake ├── ONEMKL.cmake ├── SYCL.cmake └── XCCL.cmake ├── docs └── torch_xpu_ops.jpg ├── mypy-strict.ini ├── mypy.ini ├── pyproject.toml ├── src ├── ATen │ ├── CMakeLists.txt │ ├── native │ │ ├── nested │ │ │ ├── NestedTensorTransformerFunctions.cpp │ │ │ └── xpu │ │ │ │ ├── NestedTensorTransformerFunctions.cpp │ │ │ │ └── sycl │ │ │ │ ├── NestedTensorTransformerFunctionKernels.cpp │ │ │ │ └── NestedTensorTransformerFunctionKernels.h │ │ ├── quantized │ │ │ ├── AffineQuantizer.cpp │ │ │ ├── FakeQuantizeCore.cpp │ │ │ ├── FusedObsFakeQuant.cpp │ │ │ ├── MakePerTensorQuantizedTensor.cpp │ │ │ ├── QuantizedMaxPool2d.cpp │ │ │ └── sycl │ │ │ │ ├── AffineQuantizerKernels.cpp │ │ │ │ ├── AffineQuantizerKernels.h │ │ │ │ ├── FakeQuantizeCoreKernels.cpp │ │ │ │ ├── FakeQuantizeCoreKernels.h │ │ │ │ ├── FusedObsFakeQuantKernels.cpp │ │ │ │ ├── FusedObsFakeQuantKernels.h │ │ │ │ ├── MakePerTensorQuantizedTensorKernels.cpp │ │ │ │ ├── MakePerTensorQuantizedTensorKernels.h │ │ │ │ ├── QuantizedMaxPool2d.cpp │ │ │ │ └── QuantizedMaxPool2d.h │ │ ├── sparse │ │ │ └── xpu │ │ │ │ ├── SparseBinaryOpIntersection.cpp │ │ │ │ ├── SparseCsrTensorMath.cpp │ │ │ │ ├── SparseSoftmax.cpp │ │ │ │ ├── SparseTensor.cpp │ │ │ │ ├── SparseTensorMath.cpp │ │ │ │ └── sycl │ │ │ │ ├── SparseBinaryOpIntersectionKernels.cpp │ │ │ │ ├── SparseBinaryOpIntersectionKernels.h │ │ │ │ ├── SparseCsrTensorMathKernels.cpp │ │ │ │ ├── SparseCsrTensorMathKernels.h │ │ │ │ ├── SparseSoftmaxKernels.cpp │ │ │ │ ├── SparseSoftmaxKernels.h │ │ │ │ ├── SparseTensorKernels.cpp │ │ │ │ ├── SparseTensorKernels.h │ │ │ │ ├── SparseTensorMathKernels.cpp │ │ │ │ └── SparseTensorMathKernels.h │ │ ├── transformers │ │ │ ├── Attention.cpp │ │ │ ├── SDPUtils.cpp │ │ │ ├── SDPUtils.h │ │ │ └── sycl │ │ │ │ ├── AttentionKernels.cpp │ │ │ │ └── 
AttentionKernels.h │ │ └── xpu │ │ │ ├── Activation.cpp │ │ │ ├── AdaptiveAveragePooling2d.cpp │ │ │ ├── AdaptiveAveragePooling3d.cpp │ │ │ ├── AdaptiveMaxPooling2d.cpp │ │ │ ├── AdaptiveMaxPooling3d.cpp │ │ │ ├── AiryAi.cpp │ │ │ ├── AmpKernels.cpp │ │ │ ├── AveragePool2d.cpp │ │ │ ├── AveragePool3d.cpp │ │ │ ├── BatchLinearAlgebra.cpp │ │ │ ├── BatchNorm.cpp │ │ │ ├── Bessel.cpp │ │ │ ├── BinaryOps.cpp │ │ │ ├── Bucketization.cpp │ │ │ ├── Col2Im.cpp │ │ │ ├── CompareOps.cpp │ │ │ ├── Copy.cpp │ │ │ ├── Cross.cpp │ │ │ ├── DeformConv2d.cpp │ │ │ ├── DepthwiseConv2d.cpp │ │ │ ├── DepthwiseConv3d.cpp │ │ │ ├── DilatedMaxPool2d.cpp │ │ │ ├── DilatedMaxPool3d.cpp │ │ │ ├── Distance.cpp │ │ │ ├── Distributions.cpp │ │ │ ├── Dropout.cpp │ │ │ ├── Embedding.cpp │ │ │ ├── EmbeddingBag.cpp │ │ │ ├── Equal.cpp │ │ │ ├── Fill.cpp │ │ │ ├── ForeachOpList.cpp │ │ │ ├── ForeachOpScalar.cpp │ │ │ ├── ForeachOpScalarList.cpp │ │ │ ├── ForeachOpScalarTensor.cpp │ │ │ ├── ForeachReduceOp.cpp │ │ │ ├── ForeachUnaryOp.cpp │ │ │ ├── FractionalMaxPool2d.cpp │ │ │ ├── FractionalMaxPool3d.cpp │ │ │ ├── FunctionOfAMatrixUtils.cpp │ │ │ ├── FusedAdam.cpp │ │ │ ├── FusedAdamW.cpp │ │ │ ├── FusedSgd.cpp │ │ │ ├── GatedLinearUnit.cpp │ │ │ ├── GridSampler.cpp │ │ │ ├── GroupNorm.cpp │ │ │ ├── Histogram.cpp │ │ │ ├── Im2Col.cpp │ │ │ ├── Indexing.cpp │ │ │ ├── LayerNorm.cpp │ │ │ ├── Lerp.cpp │ │ │ ├── LinearAlgebra.cpp │ │ │ ├── LinearInt4.cpp │ │ │ ├── Loss.cpp │ │ │ ├── LossCTC.cpp │ │ │ ├── LossMultiLabelMargin.cpp │ │ │ ├── LossMultiMargin.cpp │ │ │ ├── LossNLL.cpp │ │ │ ├── LossNLL2d.cpp │ │ │ ├── MaxUnpooling.cpp │ │ │ ├── NMS.cpp │ │ │ ├── Nonzero.cpp │ │ │ ├── Normalization.cpp │ │ │ ├── PinnedMemoryAllocator.cpp │ │ │ ├── PointwiseOps.cpp │ │ │ ├── Pow.cpp │ │ │ ├── PsRoiAlign.cpp │ │ │ ├── PsRoiPool.cpp │ │ │ ├── RNN.cpp │ │ │ ├── RangeFactories.cpp │ │ │ ├── RecordStream.cpp │ │ │ ├── ReduceAllOps.cpp │ │ │ ├── ReduceOps.cpp │ │ │ ├── ReflectionPad.cpp │ │ │ ├── Repeat.cpp │ │ │ ├── ReplicationPadding.cpp │ │ │ ├── Resize.cpp │ │ │ ├── RoiAlign.cpp │ │ │ ├── RoiPool.cpp │ │ │ ├── RreluWithNoise.cpp │ │ │ ├── ScanKernels.cpp │ │ │ ├── ScanKernels.h │ │ │ ├── SegmentReduce.cpp │ │ │ ├── SoftMax.cpp │ │ │ ├── Sorting.cpp │ │ │ ├── SpectralOps.cpp │ │ │ ├── SummaryOps.cpp │ │ │ ├── TensorAdvancedIndexing.cpp │ │ │ ├── TensorCompare.cpp │ │ │ ├── TensorFactories.cpp │ │ │ ├── TensorProperties.cpp │ │ │ ├── TensorShape.cpp │ │ │ ├── TensorTopK.cpp │ │ │ ├── TensorTransformations.cpp │ │ │ ├── TriangluarOps.cpp │ │ │ ├── UnaryOps.cpp │ │ │ ├── UnfoldBackward.cpp │ │ │ ├── Unique.cpp │ │ │ ├── UpSample.h │ │ │ ├── UpSampleBicubic2d.cpp │ │ │ ├── UpSampleBilinear2d.cpp │ │ │ ├── UpSampleLinear1d.cpp │ │ │ ├── UpSampleNearest1d.cpp │ │ │ ├── UpSampleNearest2d.cpp │ │ │ ├── UpSampleNearest3d.cpp │ │ │ ├── UpSampleTrilinear3d.cpp │ │ │ ├── WeightInt4Pack.cpp │ │ │ ├── WeightNorm.cpp │ │ │ ├── XPUFallback.template │ │ │ ├── XPUScalar.cpp │ │ │ ├── mkl │ │ │ ├── BatchLinearAlgebra.cpp │ │ │ ├── BatchLinearAlgebra.h │ │ │ ├── SpectralOps.cpp │ │ │ └── SpectralOps.h │ │ │ └── sycl │ │ │ ├── AbsKernel.cpp │ │ │ ├── AbsKernel.h │ │ │ ├── ActivationEluKernels.cpp │ │ │ ├── ActivationEluKernels.h │ │ │ ├── ActivationGeluKernel.cpp │ │ │ ├── ActivationGeluKernel.h │ │ │ ├── ActivationGluKernels.cpp │ │ │ ├── ActivationGluKernels.h │ │ │ ├── ActivationHardshrinkKernels.cpp │ │ │ ├── ActivationHardshrinkKernels.h │ │ │ ├── ActivationHardsigmoidKernels.cpp │ │ │ ├── ActivationHardsigmoidKernels.h │ │ │ ├── 
ActivationHardswishKernels.cpp │ │ │ ├── ActivationHardswishKernels.h │ │ │ ├── ActivationHardtanhKernels.cpp │ │ │ ├── ActivationHardtanhKernels.h │ │ │ ├── ActivationLeakyReluKernels.cpp │ │ │ ├── ActivationLeakyReluKernels.h │ │ │ ├── ActivationLogSigmoidKernels.cpp │ │ │ ├── ActivationLogSigmoidKernels.h │ │ │ ├── ActivationMishKernels.cpp │ │ │ ├── ActivationMishKernels.h │ │ │ ├── ActivationPreluKernels.cpp │ │ │ ├── ActivationPreluKernels.h │ │ │ ├── ActivationSiluKernels.cpp │ │ │ ├── ActivationSiluKernels.h │ │ │ ├── ActivationSoftplusKernels.cpp │ │ │ ├── ActivationSoftplusKernels.h │ │ │ ├── ActivationSoftshrinkKernels.cpp │ │ │ ├── ActivationSoftshrinkKernels.h │ │ │ ├── ActivationThresholdKernel.cpp │ │ │ ├── ActivationThresholdKernel.h │ │ │ ├── AdaptiveAveragePooling2dKernels.cpp │ │ │ ├── AdaptiveAveragePooling2dKernels.h │ │ │ ├── AdaptiveAveragePooling3dKernels.cpp │ │ │ ├── AdaptiveAveragePooling3dKernels.h │ │ │ ├── AdaptiveMaxPooling2dKernels.cpp │ │ │ ├── AdaptiveMaxPooling2dKernels.h │ │ │ ├── AdaptiveMaxPooling3dKernels.cpp │ │ │ ├── AdaptiveMaxPooling3dKernels.h │ │ │ ├── AiryAiKernel.cpp │ │ │ ├── AiryAiKernel.h │ │ │ ├── AmpKernels.cpp │ │ │ ├── AmpKernels.h │ │ │ ├── Atomics.h │ │ │ ├── AveragePool2dKernels.cpp │ │ │ ├── AveragePool2dKernels.h │ │ │ ├── AveragePool3dKernels.cpp │ │ │ ├── AveragePool3dKernels.h │ │ │ ├── BatchKernel.h │ │ │ ├── BatchNormKernels.cpp │ │ │ ├── BatchNormKernels.h │ │ │ ├── BesselJ0Kernel.cpp │ │ │ ├── BesselJ0Kernel.h │ │ │ ├── BesselJ1Kernel.cpp │ │ │ ├── BesselJ1Kernel.h │ │ │ ├── BesselY0Kernel.cpp │ │ │ ├── BesselY0Kernel.h │ │ │ ├── BesselY1Kernel.cpp │ │ │ ├── BesselY1Kernel.h │ │ │ ├── BinaryBitwiseOpsKernels.cpp │ │ │ ├── BinaryBitwiseOpsKernels.h │ │ │ ├── BinaryDivFloorKernel.cpp │ │ │ ├── BinaryDivTrueKernel.cpp │ │ │ ├── BinaryDivTruncKernel.cpp │ │ │ ├── BinaryGeometricKernels.cpp │ │ │ ├── BinaryGeometricKernels.h │ │ │ ├── BinaryInternal.h │ │ │ ├── BinaryKernels.cpp │ │ │ ├── BinaryKernels.h │ │ │ ├── BinaryLogicalOpsKernels.cpp │ │ │ ├── BinaryLogicalOpsKernels.h │ │ │ ├── BinaryMiscBackwardOpsKernels.cpp │ │ │ ├── BinaryMiscBackwardOpsKernels.h │ │ │ ├── BinaryMiscOpsKernels.cpp │ │ │ ├── BinaryMiscOpsKernels.h │ │ │ ├── BinaryRemainderKernel.cpp │ │ │ ├── BinaryRemainderKernel.h │ │ │ ├── BinaryShiftOpsKernels.cpp │ │ │ ├── BinaryShiftOpsKernels.h │ │ │ ├── BucketizationKernels.cpp │ │ │ ├── BucketizationKernels.h │ │ │ ├── ChebyshevPolynomialKernels.h │ │ │ ├── ChebyshevPolynomialTKernel.cpp │ │ │ ├── ChebyshevPolynomialUKernel.cpp │ │ │ ├── ChebyshevPolynomialVKernel.cpp │ │ │ ├── ChebyshevPolynomialWKernel.cpp │ │ │ ├── Col2ImKernel.cpp │ │ │ ├── Col2ImKernel.h │ │ │ ├── CompareKernels.cpp │ │ │ ├── CompareKernels.h │ │ │ ├── ComplexKernels.cpp │ │ │ ├── ComplexKernels.h │ │ │ ├── CopyKernel.cpp │ │ │ ├── CopyKernel.h │ │ │ ├── CopysignKernel.cpp │ │ │ ├── CopysignKernel.h │ │ │ ├── CrossKernel.cpp │ │ │ ├── CrossKernel.h │ │ │ ├── CumminmaxKernel.cpp │ │ │ ├── CumprodKernel.cpp │ │ │ ├── CumsumKernel.cpp │ │ │ ├── DeformConv2dKernels.cpp │ │ │ ├── DeformConv2dKernels.h │ │ │ ├── DepthwiseConv2dKernels.cpp │ │ │ ├── DepthwiseConv2dKernels.h │ │ │ ├── DepthwiseConv3dKernels.cpp │ │ │ ├── DepthwiseConv3dKernels.h │ │ │ ├── Dequant_int4.cpp │ │ │ ├── Dequant_int4.h │ │ │ ├── DilatedMaxPool2d.cpp │ │ │ ├── DilatedMaxPool2d.h │ │ │ ├── DilatedMaxPool3d.cpp │ │ │ ├── DilatedMaxPool3d.h │ │ │ ├── DistanceKernels.cpp │ │ │ ├── DistanceKernels.h │ │ │ ├── DistributionBernoulli.cpp │ │ │ ├── DistributionCauchyKernel.cpp 
│ │ │ ├── DistributionExponentialKernel.cpp │ │ │ ├── DistributionGeometricKernel.cpp │ │ │ ├── DistributionKernels.h │ │ │ ├── DistributionLogNormalKernel.cpp │ │ │ ├── DistributionNormal.cpp │ │ │ ├── DistributionRandomKernel.cpp │ │ │ ├── DistributionTemplates.h │ │ │ ├── DistributionUniform.cpp │ │ │ ├── Distributions.cpp │ │ │ ├── Distributions.h │ │ │ ├── Dropout.cpp │ │ │ ├── DropoutKernels.h │ │ │ ├── ElementwiseInvoke.h │ │ │ ├── Embedding.cpp │ │ │ ├── EmbeddingBackwardKernel.h │ │ │ ├── EmbeddingBag.cpp │ │ │ ├── EmbeddingBag.h │ │ │ ├── EmbeddingBagKernels.h │ │ │ ├── EmbeddingKernels.h │ │ │ ├── FFTKernelFunctor.cpp │ │ │ ├── FFTKernelFunctor.h │ │ │ ├── FillKernel.cpp │ │ │ ├── FillKernel.h │ │ │ ├── ForeachBinaryOpListKernels.cpp │ │ │ ├── ForeachBinaryOpListKernels.h │ │ │ ├── ForeachBinaryOpScalarKernels.cpp │ │ │ ├── ForeachBinaryOpScalarKernels.h │ │ │ ├── ForeachBinaryOpScalarListKernels.cpp │ │ │ ├── ForeachBinaryOpScalarListKernels.h │ │ │ ├── ForeachBinaryOpScalarTensorKernels.cpp │ │ │ ├── ForeachBinaryOpScalarTensorKernels.h │ │ │ ├── ForeachCopyKernels.cpp │ │ │ ├── ForeachCopyKernels.h │ │ │ ├── ForeachFunctors.h │ │ │ ├── ForeachPointwiseKernels.cpp │ │ │ ├── ForeachPointwiseOpListKernels.h │ │ │ ├── ForeachPointwiseOpScalarKernels.h │ │ │ ├── ForeachPointwiseOpScalarListKernels.h │ │ │ ├── ForeachReduceKernels.cpp │ │ │ ├── ForeachReduceKernels.h │ │ │ ├── ForeachTernaryKernels.cpp │ │ │ ├── ForeachTernaryOpListKernels.h │ │ │ ├── ForeachTernaryOpScalarKernels.h │ │ │ ├── ForeachTernaryOpScalarListKernels.h │ │ │ ├── ForeachUnaryKernels.cpp │ │ │ ├── ForeachUnaryKernels.h │ │ │ ├── FractionalMaxPool2dKernels.cpp │ │ │ ├── FractionalMaxPool2dKernels.h │ │ │ ├── FractionalMaxPool3dKernels.cpp │ │ │ ├── FractionalMaxPool3dKernels.h │ │ │ ├── FunctionOfAMatrixUtilsKernels.cpp │ │ │ ├── FunctionOfAMatrixUtilsKernels.h │ │ │ ├── FusedAdamAmsgradKernels.cpp │ │ │ ├── FusedAdamKernels.cpp │ │ │ ├── FusedAdamKernels.h │ │ │ ├── FusedAdamUtils.h │ │ │ ├── FusedAdamWAmsgradKernels.cpp │ │ │ ├── FusedAdamWKernels.cpp │ │ │ ├── FusedAdamWKernels.h │ │ │ ├── FusedSgdKernels.cpp │ │ │ ├── FusedSgdKernels.h │ │ │ ├── GcdLcmKernels.cpp │ │ │ ├── GcdLcmKernels.h │ │ │ ├── GridSampler.cpp │ │ │ ├── GridSampler.h │ │ │ ├── GridSamplerKernels.h │ │ │ ├── GroupNormKernels.cpp │ │ │ ├── GroupNormKernels.h │ │ │ ├── GroupReduceUtils.h │ │ │ ├── HermitePolynomialHKernel.cpp │ │ │ ├── HermitePolynomialHKernel.h │ │ │ ├── HermitePolynomialHeKernel.cpp │ │ │ ├── HermitePolynomialHeKernel.h │ │ │ ├── HistogramKernels.h │ │ │ ├── HistogramddKernels.cpp │ │ │ ├── IGammaKernel.cpp │ │ │ ├── IGammaKernel.h │ │ │ ├── Im2ColKernel.cpp │ │ │ ├── Im2ColKernel.h │ │ │ ├── IndexKernelUtils.h │ │ │ ├── IndexUtils.h │ │ │ ├── Indexing.cpp │ │ │ ├── Indexing.h │ │ │ ├── IndexingKernels.h │ │ │ ├── IndexingUtils.h │ │ │ ├── IntegerDivider.h │ │ │ ├── KernelUtils.h │ │ │ ├── LaguerrePolynomialLKernel.cpp │ │ │ ├── LaguerrePolynomialLKernel.h │ │ │ ├── LaunchUtils.h │ │ │ ├── LayerNormKernels.cpp │ │ │ ├── LayerNormKernels.h │ │ │ ├── LegendrePolynomialPKernel.cpp │ │ │ ├── LegendrePolynomialPKernel.h │ │ │ ├── LerpKernels.cpp │ │ │ ├── LerpKernels.h │ │ │ ├── LinearAlgebraKernels.cpp │ │ │ ├── LinearAlgebraKernels.h │ │ │ ├── LinearInt4.cpp │ │ │ ├── LinearInt4.h │ │ │ ├── LogAddExpKernels.cpp │ │ │ ├── LogAddExpKernels.h │ │ │ ├── LogcumsumexpKernel.cpp │ │ │ ├── Loops.h │ │ │ ├── LossCTCKernels.cpp │ │ │ ├── LossCTCKernels.h │ │ │ ├── LossKernels.cpp │ │ │ ├── LossKernels.h │ │ │ ├── 
LossNLL2dKernels.cpp │ │ │ ├── LossNLL2dKernels.h │ │ │ ├── LossNLLKernel.cpp │ │ │ ├── LossNLLKernel.h │ │ │ ├── MathExtensions.h │ │ │ ├── MaxMinElementwiseKernels.cpp │ │ │ ├── MaxMinElementwiseKernels.h │ │ │ ├── MaxUnpoolingKernels.cpp │ │ │ ├── MaxUnpoolingKernels.h │ │ │ ├── MemoryAccess.h │ │ │ ├── MemoryAccessUtils.h │ │ │ ├── ModifiedBesselI0Kernel.cpp │ │ │ ├── ModifiedBesselI0Kernel.h │ │ │ ├── ModifiedBesselI1Kernel.cpp │ │ │ ├── ModifiedBesselI1Kernel.h │ │ │ ├── ModifiedBesselK0Kernel.cpp │ │ │ ├── ModifiedBesselK0Kernel.h │ │ │ ├── ModifiedBesselK1Kernel.cpp │ │ │ ├── ModifiedBesselK1Kernel.h │ │ │ ├── MultiLabelMarginLossKernels.cpp │ │ │ ├── MultiLabelMarginLossKernels.h │ │ │ ├── MultiMarginLossKernels.cpp │ │ │ ├── MultiMarginLossKernels.h │ │ │ ├── MultiTensorApply.h │ │ │ ├── MultinomialKernel.cpp │ │ │ ├── MultinomialKernel.h │ │ │ ├── NMSKernel.cpp │ │ │ ├── NMSKernel.h │ │ │ ├── NonzeroKernel.cpp │ │ │ ├── NonzeroKernel.h │ │ │ ├── Norm.h │ │ │ ├── NumericLimits.h │ │ │ ├── OffsetCalculator.h │ │ │ ├── Philox4x32.h │ │ │ ├── PointwiseOpsKernels.cpp │ │ │ ├── PointwiseOpsKernels.h │ │ │ ├── Pow.h │ │ │ ├── PowKernels.cpp │ │ │ ├── PowKernels.h │ │ │ ├── PsRoiAlignKernels.cpp │ │ │ ├── PsRoiAlignKernels.h │ │ │ ├── PsRoiPoolKernels.cpp │ │ │ ├── PsRoiPoolKernels.h │ │ │ ├── RNNKernels.cpp │ │ │ ├── RNNKernels.h │ │ │ ├── RandpermKernel.cpp │ │ │ ├── RandpermKernel.h │ │ │ ├── RangeFactoriesKernel.cpp │ │ │ ├── RangeFactoriesKernel.h │ │ │ ├── Reduce.h │ │ │ ├── ReduceAMinMaxKernel.cpp │ │ │ ├── ReduceArgMaxKernel.cpp │ │ │ ├── ReduceArgMinKernel.cpp │ │ │ ├── ReduceLogicKernels.cpp │ │ │ ├── ReduceMaxValuesKernels.cpp │ │ │ ├── ReduceMaxValuesKernels.h │ │ │ ├── ReduceMinValuesKernels.cpp │ │ │ ├── ReduceMinValuesKernels.h │ │ │ ├── ReduceMomentKernels.cpp │ │ │ ├── ReduceNormKernel.cpp │ │ │ ├── ReduceNormKernel.h │ │ │ ├── ReduceOps.h │ │ │ ├── ReduceOpsKernels.h │ │ │ ├── ReduceSumProdKernels.cpp │ │ │ ├── ReflectionPadKernels.cpp │ │ │ ├── ReflectionPadKernels.h │ │ │ ├── RenormKernel.cpp │ │ │ ├── RenormKernel.h │ │ │ ├── RepeatKernel.cpp │ │ │ ├── RepeatKernel.h │ │ │ ├── ReplicationPaddingKernels.cpp │ │ │ ├── ReplicationPaddingKernels.h │ │ │ ├── ResizeKernel.cpp │ │ │ ├── ResizeKernel.h │ │ │ ├── RoiAlignKernels.cpp │ │ │ ├── RoiAlignKernels.h │ │ │ ├── RoiPoolKernels.cpp │ │ │ ├── RoiPoolKernels.h │ │ │ ├── RreluWithNoiseKernels.cpp │ │ │ ├── RreluWithNoiseKernels.h │ │ │ ├── SYCLGroupAlgorithm.h │ │ │ ├── ScaledModifiedBesselK0Kernel.cpp │ │ │ ├── ScaledModifiedBesselK0Kernel.h │ │ │ ├── ScaledModifiedBesselK1Kernel.cpp │ │ │ ├── ScaledModifiedBesselK1Kernel.h │ │ │ ├── ScanUtils.h │ │ │ ├── ScatterGatherKernels.cpp │ │ │ ├── ScatterGatherKernels.h │ │ │ ├── SegmentReduceKernels.cpp │ │ │ ├── SegmentReduceKernels.h │ │ │ ├── Shape.cpp │ │ │ ├── ShapeKernels.h │ │ │ ├── SharedReduceOps.h │ │ │ ├── ShiftedChebyshevPolynomialKernels.h │ │ │ ├── ShiftedChebyshevPolynomialTKernel.cpp │ │ │ ├── ShiftedChebyshevPolynomialUKernel.cpp │ │ │ ├── ShiftedChebyshevPolynomialVKernel.cpp │ │ │ ├── ShiftedChebyshevPolynomialWKernel.cpp │ │ │ ├── SoftMaxKernels.cpp │ │ │ ├── SoftMaxKernels.h │ │ │ ├── Sorting.cpp │ │ │ ├── Sorting.h │ │ │ ├── SortingCommon.h │ │ │ ├── SortingKernels.h │ │ │ ├── SortingRadixSelect.h │ │ │ ├── SortingRadixSort.h │ │ │ ├── SphericalBesselJ0Kernel.cpp │ │ │ ├── SphericalBesselJ0Kernel.h │ │ │ ├── StepKernels.cpp │ │ │ ├── StepKernels.h │ │ │ ├── SummaryOpsKernels.cpp │ │ │ ├── SummaryOpsKernels.h │ │ │ ├── TensorApplyUtils.h │ │ │ ├── 
TensorCompare.cpp │ │ │ ├── TensorCompareKernels.cpp │ │ │ ├── TensorCompareKernels.h │ │ │ ├── TensorFactoriesKernels.cpp │ │ │ ├── TensorFactoriesKernels.h │ │ │ ├── TensorModeKernel.cpp │ │ │ ├── TensorModeKernel.h │ │ │ ├── TensorShapeKernels.cpp │ │ │ ├── TensorShapeKernels.h │ │ │ ├── TensorTopKKernel.cpp │ │ │ ├── TensorTopKKernel.h │ │ │ ├── TensorTransformationsKernels.cpp │ │ │ ├── TensorTransformationsKernels.h │ │ │ ├── TriangularOpsKernels.cpp │ │ │ ├── TriangularOpsKernels.h │ │ │ ├── UnaryComplexKernels.cpp │ │ │ ├── UnaryComplexKernels.h │ │ │ ├── UnaryFractionKernels.cpp │ │ │ ├── UnaryFractionKernels.h │ │ │ ├── UnaryGammaKernels.cpp │ │ │ ├── UnaryGammaKernels.h │ │ │ ├── UnaryGeometricAcosKernel.cpp │ │ │ ├── UnaryGeometricAcosKernel.h │ │ │ ├── UnaryGeometricAcoshKernel.cpp │ │ │ ├── UnaryGeometricAcoshKernel.h │ │ │ ├── UnaryGeometricAsinKernel.cpp │ │ │ ├── UnaryGeometricAsinKernel.h │ │ │ ├── UnaryGeometricAsinhKernel.cpp │ │ │ ├── UnaryGeometricAsinhKernel.h │ │ │ ├── UnaryGeometricAtanKernel.cpp │ │ │ ├── UnaryGeometricAtanKernel.h │ │ │ ├── UnaryGeometricAtanhKernel.cpp │ │ │ ├── UnaryGeometricAtanhKernel.h │ │ │ ├── UnaryGeometricCosKernel.cpp │ │ │ ├── UnaryGeometricCosKernel.h │ │ │ ├── UnaryGeometricCoshKernel.cpp │ │ │ ├── UnaryGeometricCoshKernel.h │ │ │ ├── UnaryGeometricSinKernel.cpp │ │ │ ├── UnaryGeometricSinKernel.h │ │ │ ├── UnaryGeometricSinhKernel.cpp │ │ │ ├── UnaryGeometricSinhKernel.h │ │ │ ├── UnaryGeometricTanKernel.cpp │ │ │ ├── UnaryGeometricTanKernel.h │ │ │ ├── UnaryGeometricTanhKernel.cpp │ │ │ ├── UnaryGeometricTanhKernel.h │ │ │ ├── UnaryKernels.cpp │ │ │ ├── UnaryKernels.h │ │ │ ├── UnaryLogKernels.cpp │ │ │ ├── UnaryLogKernels.h │ │ │ ├── UnarySignKernels.cpp │ │ │ ├── UnarySignKernels.h │ │ │ ├── UnarySpecialOpsKernels.cpp │ │ │ ├── UnarySpecialOpsKernels.h │ │ │ ├── UnfoldBackwardKernels.cpp │ │ │ ├── UnfoldBackwardKernels.h │ │ │ ├── UniqueKernels.cpp │ │ │ ├── UniqueKernels.h │ │ │ ├── UpSampleBicubic2dKernels.cpp │ │ │ ├── UpSampleBicubic2dKernels.h │ │ │ ├── UpSampleBilinear2dKernels.cpp │ │ │ ├── UpSampleBilinear2dKernels.h │ │ │ ├── UpSampleLinear1dKernels.cpp │ │ │ ├── UpSampleLinear1dKernels.h │ │ │ ├── UpSampleNearest1dKernels.cpp │ │ │ ├── UpSampleNearest1dKernels.h │ │ │ ├── UpSampleNearest2dKernels.cpp │ │ │ ├── UpSampleNearest2dKernels.h │ │ │ ├── UpSampleNearest3dKernels.cpp │ │ │ ├── UpSampleNearest3dKernels.h │ │ │ ├── UpSampleTrilinear3dKernels.cpp │ │ │ ├── UpSampleTrilinear3dKernels.h │ │ │ ├── WeightInt4PackKernel.cpp │ │ │ ├── WeightInt4PackKernel.h │ │ │ ├── WeightNormKernels.cpp │ │ │ ├── WeightNormKernels.h │ │ │ ├── WelfordNorm.h │ │ │ ├── ZetaKernel.cpp │ │ │ ├── ZetaKernel.h │ │ │ └── pstl │ │ │ └── PSTLFunctions.h │ └── xpu │ │ ├── EmptyTensor.cpp │ │ └── EmptyTensor.h ├── BuildOnLinux.cmake ├── BuildOnWindows.cmake ├── CMakeLists.txt ├── comm │ ├── DeviceProperties.h │ ├── Macros.h │ ├── Memory.h │ ├── MemoryFormat.h │ ├── ReduceOpsUtils.h │ ├── RegisterUtils.h │ ├── Runtime.h │ ├── SYCLContext.h │ ├── SYCLHelpers.h │ ├── Scalar.h │ ├── TensorInfo.h │ ├── TensorOptions.h │ ├── XPUMathCompat.h │ ├── XPUPair.h │ └── xpu_aten.h └── xccl │ ├── CMakeLists.txt │ ├── ProcessGroupXCCL.cpp │ ├── ProcessGroupXCCL.hpp │ ├── Register.cpp │ └── reducer_xpu.cpp ├── test ├── microbench │ ├── adaptive_avg_pool2d.py │ ├── avg_pool2d.py │ ├── avg_pool3d.py │ ├── batch_norm_1d.py │ ├── batch_norm_2d.py │ ├── batch_norm_3d.py │ ├── col2im.py │ ├── distance.cdist.py │ ├── distance.pdist.py │ ├── distribution.bernoulli.py │ 
├── distribution.cauchy.py │ ├── distribution.exponential.py │ ├── distribution.geometric.py │ ├── distribution.log_normal.py │ ├── distribution.multinomial.py │ ├── distribution.normal.py │ ├── distribution.random.py │ ├── distribution.uniform.py │ ├── dropout.py │ ├── eltwise.add.py │ ├── flip.py │ ├── grid_sampler.grid_sampler_2d.py │ ├── grid_sampler.grid_sampler_3d.py │ ├── group_norm.py │ ├── im2col.py │ ├── layer_norm.py │ ├── loss.binary_cross_entropy.py │ ├── loss.ctc_loss.py │ ├── loss.l1_loss.py │ ├── loss.mse_loss.py │ ├── loss.multilabel_margin_loss.py │ ├── loss.nll_loss.py │ ├── loss.smooth_l1_loss.py │ ├── matmul.py │ ├── pad_sequence.py │ ├── pooling.adaptive_max_pool2d.py │ ├── pooling.fractional_max_pool2d.py │ ├── pooling.fractional_max_pool3d.py │ ├── pooling.max_pool2d.py │ ├── pooling.max_pool3d.py │ ├── reduce.max.py │ ├── reduce.sum.py │ ├── remainder.py │ ├── roll.py │ ├── scan.cumsum.py │ ├── scan.topk.py │ ├── scan.unique.py │ ├── softmax.py │ ├── sort.py │ ├── sort.randperm.py │ ├── upsample_bicubic2d.py │ ├── upsample_bilinear2d.py │ ├── upsample_nearest2d.py │ ├── upsample_nearest3d.py │ └── upsample_nearest_exact2d.py ├── profiling │ ├── correlation_id_mixed.py │ ├── profile_partial_runtime_ops.py │ ├── reproducer.missing.gpu.kernel.time.py │ ├── rn50.py │ ├── time_precision_in_profile.py │ └── triton_xpu_ops_time.py ├── regressions │ ├── optests_failures_dict.json │ ├── test_binary.py │ ├── test_cat.py │ ├── test_clamp_promotion.py │ ├── test_compare.py │ ├── test_copy.py │ ├── test_copy_downcast_fp8.py │ ├── test_deform_conv.py │ ├── test_div_mode.py │ ├── test_foreach_list.py │ ├── test_foreach_scalar.py │ ├── test_foreach_scalarlist.py │ ├── test_index_and_index_put.py │ ├── test_int4pack.py │ ├── test_layer_norm.py │ ├── test_loops.py │ ├── test_nms.py │ ├── test_operation_on_device_1.py │ ├── test_rand.py │ ├── test_record_stream.py │ ├── test_resize.py │ ├── test_roi_align.py │ ├── test_safe_softmax.py │ ├── test_softmax.py │ ├── test_sort.py │ ├── test_tensor_factory.py │ ├── test_torchvision_roi_ops.py │ ├── test_unary.py │ ├── test_upsample_bilinear_bwd.py │ └── test_upsample_nearest.py ├── sycl │ ├── CMakeLists.txt │ ├── main.cpp │ ├── simple_kernel.cpp │ └── simple_kernel.hpp └── xpu │ ├── __init__.py │ ├── distributed │ ├── __init__.py │ ├── test_c10d_ops_xccl.py │ └── test_c10d_xccl.py │ ├── extended │ ├── __init__.py │ ├── run_test_with_skip.py │ ├── run_test_with_skip_arc.py │ ├── run_test_with_skip_bmg.py │ ├── run_test_with_skip_lnl.py │ ├── run_test_with_skip_mtl.py │ ├── skip_list_arc.py │ ├── skip_list_common.py │ ├── skip_list_win.py │ ├── skip_list_win_arc.py │ ├── skip_list_win_bmg.py │ ├── skip_list_win_lnl.py │ ├── skip_list_win_mtl.py │ ├── test_ops_xpu.py │ └── test_tensor_creation_ops_xpu.py │ ├── nn │ ├── __init__.py │ ├── test_convolution_xpu.py │ ├── test_dropout_xpu.py │ ├── test_embedding_xpu.py │ ├── test_init_xpu.py │ ├── test_lazy_modules_xpu.py │ ├── test_load_state_dict_xpu.py │ ├── test_module_hooks_xpu.py │ ├── test_multihead_attention_xpu.py │ ├── test_packed_sequence_xpu.py │ ├── test_parametrization_xpu.py │ ├── test_pooling_xpu.py │ └── test_pruning_xpu.py │ ├── quantization │ └── core │ │ ├── __init__.py │ │ ├── test_quantized_op_xpu.py │ │ ├── test_quantized_tensor_xpu.py │ │ ├── test_workflow_module_xpu.py │ │ └── test_workflow_ops_xpu.py │ ├── run_distributed.py │ ├── run_test_win_with_skip_mtl.py │ ├── run_test_with_only.py │ ├── run_test_with_skip.py │ ├── run_test_with_skip_arc.py │ ├── 
run_test_with_skip_bmg.py │ ├── run_test_with_skip_lnl.py │ ├── run_test_with_skip_mtl.py │ ├── skip_list_arc.py │ ├── skip_list_common.py │ ├── skip_list_dist.py │ ├── skip_list_mtl.py │ ├── skip_list_win.py │ ├── skip_list_win_arc.py │ ├── skip_list_win_bmg.py │ ├── skip_list_win_lnl.py │ ├── skip_list_win_mtl.py │ ├── test_autocast_xpu.py │ ├── test_autograd_fallback_xpu.py │ ├── test_autograd_xpu.py │ ├── test_binary_ufuncs_xpu.py │ ├── test_comparison_utils_xpu.py │ ├── test_complex_xpu.py │ ├── test_content_store_xpu.py │ ├── test_dataloader_xpu.py │ ├── test_decomp_xpu.py │ ├── test_distributions_xpu.py │ ├── test_dynamic_shapes_xpu.py │ ├── test_foreach_xpu.py │ ├── test_indexing_xpu.py │ ├── test_linalg_xpu.py │ ├── test_masked_xpu.py │ ├── test_maskedtensor_xpu.py │ ├── test_matmul_cuda_xpu.py │ ├── test_meta_xpu.py │ ├── test_modules_xpu.py │ ├── test_namedtensor_xpu.py │ ├── test_native_functions_xpu.py │ ├── test_native_mha_xpu.py │ ├── test_nestedtensor_xpu.py │ ├── test_nn_xpu.py │ ├── test_ops_fwd_gradients_xpu.py │ ├── test_ops_gradients_xpu.py │ ├── test_ops_xpu.py │ ├── test_optim_xpu.py │ ├── test_reductions_xpu.py │ ├── test_scatter_gather_ops_xpu.py │ ├── test_segment_reductions_xpu.py │ ├── test_shape_ops_xpu.py │ ├── test_sort_and_select_xpu.py │ ├── test_sparse_xpu.py │ ├── test_spectral_ops_xpu.py │ ├── test_tensor_creation_ops_xpu.py │ ├── test_torch_xpu.py │ ├── test_transformers_xpu.py │ ├── test_type_promotion_xpu.py │ ├── test_unary_ufuncs_xpu.py │ ├── test_view_ops_xpu.py │ └── xpu_test_utils.py ├── tools ├── check_ops.py ├── codegen │ └── remove_headers.py └── linter │ ├── __init__.py │ ├── adapters │ ├── README.md │ ├── _linter.py │ ├── actionlint_linter.py │ ├── bazel_linter.py │ ├── black_linter.py │ ├── clangformat_linter.py │ ├── clangtidy_linter.py │ ├── cmake_linter.py │ ├── constexpr_linter.py │ ├── docstring_linter.py │ ├── exec_linter.py │ ├── flake8_linter.py │ ├── gha_linter.py │ ├── grep_linter.py │ ├── import_linter.py │ ├── lintrunner_version_linter.py │ ├── mypy_linter.py │ ├── nativefunctions_linter.py │ ├── newlines_linter.py │ ├── no_merge_conflict_csv_linter.py │ ├── no_workflows_on_fork.py │ ├── pip_init.py │ ├── pyfmt_linter.py │ ├── ruff_linter.py │ ├── s3_init.py │ ├── s3_init_config.json │ ├── set_linter.py │ ├── shellcheck_linter.py │ ├── test_has_main_linter.py │ ├── testowners_linter.py │ ├── ufmt_linter.py │ ├── update_s3.py │ └── workflow_consistency_linter.py │ └── clang_tidy │ ├── __init__.py │ └── generate_build_files.py └── yaml ├── native ├── native_functions.yaml └── tags.yaml └── xpu_functions.yaml /.cmakelintrc: -------------------------------------------------------------------------------- 1 | filter=-convention/filename,-linelength,-package/consistency,-readability/logic,-readability/mixedcase,-readability/wonkycase,-syntax,-whitespace/eol,+whitespace/extra,-whitespace/indent,-whitespace/mismatch,-whitespace/newline,-whitespace/tabs 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation.yml: -------------------------------------------------------------------------------- 1 | name: 📚 Documentation 2 | description: Report an issue related to https://pytorch.org/docs/stable/index.html 3 | 4 | body: 5 | - type: textarea 6 | attributes: 7 | label: 📚 The doc issue 8 | description: > 9 | A clear and concise description of what content in https://pytorch.org/docs/stable/index.html is an issue. 
If this has to do with the general https://pytorch.org website, please file an issue at https://github.com/pytorch/pytorch.github.io/issues/new/choose instead. If this has to do with https://pytorch.org/tutorials, please file an issue at https://github.com/pytorch/tutorials/issues/new. 10 | validations: 11 | required: true 12 | - type: textarea 13 | attributes: 14 | label: Suggest a potential alternative/fix 15 | description: > 16 | Tell us how we could improve the documentation in this regard. 17 | - type: markdown 18 | attributes: 19 | value: > 20 | Thanks for contributing 🎉! 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.yml: -------------------------------------------------------------------------------- 1 | name: 🚀 Feature request 2 | description: Submit a proposal/request for a new PyTorch feature 3 | 4 | body: 5 | - type: textarea 6 | attributes: 7 | label: 🚀 The feature, motivation and pitch 8 | description: > 9 | A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too. 10 | validations: 11 | required: true 12 | - type: textarea 13 | attributes: 14 | label: Alternatives 15 | description: > 16 | A description of any alternative solutions or features you've considered, if any. 17 | - type: textarea 18 | attributes: 19 | label: Additional context 20 | description: > 21 | Add any other context or screenshots about the feature request. 22 | - type: markdown 23 | attributes: 24 | value: > 25 | Thanks for contributing 🎉! 26 | -------------------------------------------------------------------------------- /.github/ci_commit_pins/torchbench.txt: -------------------------------------------------------------------------------- 1 | 03cde49eba0580ed17f9ae2250832fd8af4ed756 2 | -------------------------------------------------------------------------------- /.github/ci_commit_pins/triton.txt: -------------------------------------------------------------------------------- 1 | b8c64f64c18d8cac598b3adb355c21e7439c21de 2 | -------------------------------------------------------------------------------- /.github/scripts/env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /opt/intel/oneapi/compiler/latest/env/vars.sh 4 | source /opt/intel/oneapi/pti/latest/env/vars.sh 5 | source /opt/intel/oneapi/umf/latest/env/vars.sh 6 | source /opt/intel/oneapi/ccl/latest/env/vars.sh 7 | source /opt/intel/oneapi/mpi/latest/env/vars.sh 8 | icpx --version 9 | sycl-ls 10 | -------------------------------------------------------------------------------- /.github/scripts/spec.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | DEVICE_NAME = 'xpu' 4 | 5 | MANUAL_SEED_FN = torch.xpu.manual_seed 6 | EMPTY_CACHE_FN = torch.xpu.empty_cache 7 | DEVICE_COUNT_FN = torch.xpu.device_count 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | */*.pyc 2 | */*.so* 3 | */**/__pycache__ 4 | */**/*.dylib* 5 | */**/*.pyc 6 | */**/*.pyd 7 | */**/*.so* 8 | */**/**/*.pyc 9 | */**/**/**/*.pyc 10 | */**/**/**/**/*.pyc 11 | .lintbin 12 | yaml/templates 13 | 
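The .github/scripts/spec.py module above binds the device hooks that device-generic test utilities expect (device name, seeding, cache clearing, device count) to their torch.xpu implementations. A minimal sketch of how a test harness might consume such a spec; the harness code below is hypothetical and assumes an XPU-enabled PyTorch build with the module importable as "spec":

import torch
import spec  # the device spec module shown above

def reset_device(seed):
    # Seed the active XPU device and release cached allocator blocks
    # so each test case starts from a reproducible, clean state.
    spec.MANUAL_SEED_FN(seed)
    spec.EMPTY_CACHE_FN()

if spec.DEVICE_COUNT_FN() > 0:
    reset_device(42)
    x = torch.ones(4, device=spec.DEVICE_NAME)  # allocates on 'xpu'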
-------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Report a Vulnerability 4 | 5 | Report security issues or vulnerabilities to the [Intel Security Center]. 6 | 7 | For more information on how Intel works to resolve security issues, see 8 | [Vulnerability Handling Guidelines]. 9 | 10 | [Intel Security Center]:https://www.intel.com/security 11 | 12 | [Vulnerability Handling Guidelines]:https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html 13 | -------------------------------------------------------------------------------- /cmake/ClangFormat.cmake: -------------------------------------------------------------------------------- 1 | ## Include to trigger clang-format 2 | if(BUILD_NO_CLANGFORMAT) 3 | return() 4 | endif() 5 | 6 | if(CLANGFORMAT_enabled) 7 | return() 8 | endif() 9 | set(CLANGFORMAT_enabled true) 10 | 11 | set(CFMT_STYLE ${PROJECT_SOURCE_DIR}/.clang-format) 12 | if(NOT EXISTS ${CFMT_STYLE}) 13 | message(WARNING "Cannot find style file ${CFMT_STYLE}!") 14 | return() 15 | endif() 16 | 17 | find_program(CLANG_FORMAT "clang-format-12") 18 | if(NOT CLANG_FORMAT) 19 | message("Please install clang-format-12 before contributing to torch-xpu-ops!") 20 | else() 21 | set(CLANG_FORMAT_EXEC clang-format-12) 22 | endif() 23 | -------------------------------------------------------------------------------- /cmake/ONEMKL.cmake: -------------------------------------------------------------------------------- 1 | option(USE_ONEMKL_XPU "Build with ONEMKL XPU support" ON) 2 | 3 | if(DEFINED ENV{USE_ONEMKL_XPU}) 4 | set(USE_ONEMKL_XPU $ENV{USE_ONEMKL_XPU}) 5 | endif() 6 | 7 | message(STATUS "USE_ONEMKL_XPU is set to ${USE_ONEMKL_XPU}") 8 | 9 | if(NOT USE_ONEMKL_XPU) 10 | return() 11 | endif() 12 | 13 | find_package(ONEMKL) 14 | if(NOT ONEMKL_FOUND) 15 | message(FATAL_ERROR "Can NOT find ONEMKL cmake helpers module!") 16 | endif() 17 | 18 | set(TORCH_XPU_OPS_ONEMKL_INCLUDE_DIR ${ONEMKL_INCLUDE_DIR}) 19 | 20 | set(TORCH_XPU_OPS_ONEMKL_LIBRARIES ${ONEMKL_LIBRARIES}) 21 | 22 | list(INSERT TORCH_XPU_OPS_ONEMKL_LIBRARIES 1 "-Wl,--start-group") 23 | list(APPEND TORCH_XPU_OPS_ONEMKL_LIBRARIES "-Wl,--end-group") 24 | -------------------------------------------------------------------------------- /cmake/XCCL.cmake: -------------------------------------------------------------------------------- 1 | if(NOT __XCCL_INCLUDED) 2 | set(__XCCL_INCLUDED TRUE) 3 | 4 | # XCCL_ROOT, XCCL_LIBRARY_DIR, XCCL_INCLUDE_DIR are handled by FindXCCL.cmake. 
5 | find_package(XCCL REQUIRED)
6 | if(NOT XCCL_FOUND)
7 |   set(PYTORCH_FOUND_XCCL FALSE)
8 |   message(WARNING "${XCCL_NOT_FOUND_MESSAGE}")
9 |   return()
10 | endif()
11 | 
12 | set(PYTORCH_FOUND_XCCL TRUE)
13 | add_library(torch::xccl INTERFACE IMPORTED)
14 | set_property(
15 |   TARGET torch::xccl PROPERTY INTERFACE_INCLUDE_DIRECTORIES
16 |   ${XCCL_INCLUDE_DIR})
17 | set_property(
18 |   TARGET torch::xccl PROPERTY INTERFACE_LINK_LIBRARIES
19 |   ${XCCL_LIBRARY})
20 | endif()
21 | 
--------------------------------------------------------------------------------
/docs/torch_xpu_ops.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/torch-xpu-ops/5b4d7444484576f721d2295761cf8fafa924ef36/docs/torch_xpu_ops.jpg
--------------------------------------------------------------------------------
/src/ATen/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # ATen XPU sources
2 | 
3 | file(GLOB xpu_h "xpu/*.h")
4 | file(GLOB xpu_cpp "xpu/*.cpp")
5 | file(GLOB xpu_mkl "native/xpu/mkl/*.cpp")
6 | file(GLOB xpu_native_cpp "native/xpu/*.cpp" "native/sparse/*.cpp" "native/sparse/xpu/*.cpp" "native/nested/*.cpp" "native/nested/xpu/*.cpp" "native/transformers/*.cpp" "native/quantized/*.cpp")
7 | file(GLOB xpu_sycl "native/xpu/sycl/*.cpp" "native/sparse/xpu/sycl/*.cpp" "native/nested/xpu/sycl/*.cpp" "native/transformers/sycl/*.cpp" "native/quantized/sycl/*.cpp")
8 | 
9 | list(APPEND ATen_XPU_CPP_SRCS ${xpu_cpp})
10 | list(APPEND ATen_XPU_MKL_SRCS ${xpu_mkl})
11 | list(APPEND ATen_XPU_NATIVE_CPP_SRCS ${xpu_native_cpp})
12 | list(APPEND ATen_XPU_SYCL_SRCS ${xpu_sycl})
13 | 
14 | set(ATen_XPU_CPP_SRCS ${ATen_XPU_CPP_SRCS} PARENT_SCOPE)
15 | set(ATen_XPU_MKL_SRCS ${ATen_XPU_MKL_SRCS} PARENT_SCOPE)
16 | set(ATen_XPU_NATIVE_CPP_SRCS ${ATen_XPU_NATIVE_CPP_SRCS} PARENT_SCOPE)
17 | set(ATen_XPU_SYCL_SRCS ${ATen_XPU_SYCL_SRCS} PARENT_SCOPE)
18 | 
19 | foreach(HEADER ${xpu_h})
20 |   install(FILES ${HEADER} DESTINATION "${AT_INSTALL_INCLUDE_DIR}/ATen/xpu")
21 | endforeach()
22 | 
--------------------------------------------------------------------------------
/src/ATen/native/nested/NestedTensorTransformerFunctions.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | 
4 | namespace at::native {
5 | 
6 | Tensor NestedTensor_softmax_dropout_xpu(
7 |     const Tensor& self,
8 |     const Tensor& query) {
9 |   std::optional<Tensor> attn_mask;
10 | 
11 |   attn_mask = NestedTensor_to_mask(query, 2, self.size(2));
12 |   attn_mask = attn_mask->to(query.device(), /*non-blocking=*/true);
13 |   return _masked_softmax(
14 |       self,
15 |       *attn_mask,
16 |       self.dim() - 1,
17 |       /*mask type */ 1); // NestedTensor_to_mask produces a BxT mask
18 | }
19 | 
20 | } // namespace at::native
--------------------------------------------------------------------------------
/src/ATen/native/quantized/FakeQuantizeCore.cpp:
--------------------------------------------------------------------------------
1 | #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
2 | 
3 | #include
4 | #include
5 | 
6 | #include
7 | 
8 | namespace at::native {
9 | 
10 | REGISTER_XPU_DISPATCH(
11 |     fake_quant_tensor_cachemask_stub,
12 |     &xpu::fake_quantize_tensor_cachemask_kernel)
13 | REGISTER_XPU_DISPATCH(
14 |     fake_quant_tensor_cachemask_tensor_qparams_stub,
15 |     &xpu::fake_quantize_tensor_cachemask_tensor_qparams_kernel)
16 | REGISTER_XPU_DISPATCH(
17 |     fake_quant_grad_learnable_tensor_stub,
18 |     &xpu::_fake_quantize_grad_learnable_tensor_kernel)
19 | REGISTER_XPU_DISPATCH(
20 |     fake_quant_per_channel_cachemask_stub,
21 |     &xpu::fake_quant_per_channel_cachemask_kernel)
22 | REGISTER_XPU_DISPATCH(
23 |     fake_quant_grad_learnable_channel_stub,
24 |     &xpu::_fake_quantize_grad_learnable_channel_kernel)
25 | 
26 | } // namespace at::native
27 | 
--------------------------------------------------------------------------------
/src/ATen/native/quantized/sycl/FusedObsFakeQuantKernels.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include
4 | 
5 | namespace at::native::xpu {
6 | 
7 | TORCH_XPU_API void _calculate_moving_average(
8 |     const at::Tensor& x,
9 |     const at::Tensor& observer_on,
10 |     at::Tensor& running_min,
11 |     at::Tensor& running_max,
12 |     const float averaging_const,
13 |     const int64_t size,
14 |     bool per_row_fake_quant);
15 | 
16 | TORCH_XPU_API
17 | void _calc_moving_avg_qparams_helper(
18 |     const at::Tensor& x,
19 |     const at::Tensor fake_quant_on,
20 |     at::Tensor& running_min,
21 |     at::Tensor& running_max,
22 |     float* scale_ptr,
23 |     int32_t* zp_ptr,
24 |     int32_t qmin,
25 |     int32_t qmax,
26 |     bool symmetric_quant,
27 |     const int64_t size,
28 |     bool per_row_fq);
29 | 
30 | } // namespace at::native::xpu
31 | 
--------------------------------------------------------------------------------
/src/ATen/native/quantized/sycl/MakePerTensorQuantizedTensorKernels.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | 
4 | #include
5 | #include
6 | 
7 | namespace at::native::xpu {
8 | 
9 | template <typename scalar_t, typename underlying_t>
10 | struct AssignQuantizedTensorFunctor {
11 |   scalar_t operator()(underlying_t value) const {
12 |     return scalar_t(value);
13 |   }
14 | };
15 | 
16 | void assign_quantized_tensor_kernel(const Tensor& self, Tensor& dst) {
17 |   AT_DISPATCH_QINT_TYPES(
18 |       dst.scalar_type(), "assign_quantized_tensor_xpu", [&]() {
19 |         auto iter = TensorIteratorConfig()
20 |                         .check_all_same_dtype(false)
21 |                         .add_output(dst)
22 |                         .add_input(self)
23 |                         .build();
24 |         auto caller = AssignQuantizedTensorFunctor<scalar_t, underlying_t>();
25 |         gpu_kernel(iter, caller);
26 |       });
27 | }
28 | 
29 | } // namespace at::native::xpu
30 | 
--------------------------------------------------------------------------------
/src/ATen/native/quantized/sycl/MakePerTensorQuantizedTensorKernels.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include
4 | 
5 | namespace at::native::xpu {
6 | 
7 | TORCH_XPU_API void assign_quantized_tensor_kernel(
8 |     const Tensor& self,
9 |     Tensor& dst);
10 | 
11 | } // namespace at::native::xpu
12 | 
--------------------------------------------------------------------------------
/src/ATen/native/quantized/sycl/QuantizedMaxPool2d.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include
4 | 
5 | namespace at::native::xpu {
6 | 
7 | TORCH_XPU_API Tensor quantized_max_pool2d_kernel(
8 |     const Tensor& input,
9 |     IntArrayRef kernel_size,
10 |     IntArrayRef stride,
11 |     IntArrayRef padding,
12 |     IntArrayRef dilation,
13 |     bool ceil_mode);
14 | 
15 | } // namespace at::native::xpu
16 | 
--------------------------------------------------------------------------------
/src/ATen/native/sparse/xpu/SparseBinaryOpIntersection.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | 
4 | namespace at::native {
5 | 
6 | REGISTER_XPU_DISPATCH(
7 |     mul_sparse_sparse_out_stub,
8 |     &xpu::mul_sparse_sparse_kernel);
9 | REGISTER_XPU_DISPATCH(
10 |     sparse_mask_intersection_out_stub,
11 |     &xpu::sparse_mask_intersection_kernel);
12 | REGISTER_XPU_DISPATCH(
13 |     sparse_mask_projection_out_stub,
14 |     &xpu::sparse_mask_projection_kernel);
15 | 
16 | } // namespace at::native
17 | 
--------------------------------------------------------------------------------
/src/ATen/native/sparse/xpu/SparseTensor.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | 
4 | namespace at::native {
5 | 
6 | using namespace at::sparse;
7 | 
8 | SparseTensor _coalesce_sparse_xpu(const SparseTensor& self) {
9 |   return xpu::coalesce_sparse_kernel(self);
10 | }
11 | 
12 | REGISTER_XPU_DISPATCH(flatten_indices_stub, &xpu::flatten_indices_kernel);
13 | 
14 | } // namespace at::native
15 | 
--------------------------------------------------------------------------------
/src/ATen/native/sparse/xpu/SparseTensorMath.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | 
3 | namespace at::native {
4 | 
5 | using namespace at::sparse;
6 | 
7 | SparseTensor& add_out_sparse_xpu(
8 |     const SparseTensor& t,
9 |     const SparseTensor& src,
10 |     const Scalar& value,
11 |     SparseTensor& r_) {
12 |   return xpu::add_sparse_kernel(t, src, value, r_);
13 | }
14 | 
15 | SparseTensor& mul_out_sparse_xpu(
16 |     const Tensor& t_,
17 |     const Tensor& src_,
18 |     SparseTensor& r_) {
19 |   return xpu::mul_sparse_kernel(t_, src_, r_);
20 | }
21 | 
22 | Tensor _sparse_sum_backward_xpu(
23 |     const Tensor& grad_,
24 |     const SparseTensor& input_,
25 |     IntArrayRef dims_to_sum) {
26 |   return xpu::_sparse_sum_backward_kernel(grad_, input_, dims_to_sum);
27 | }
28 | 
29 | } // namespace at::native
30 | 
--------------------------------------------------------------------------------
/src/ATen/native/sparse/xpu/sycl/SparseBinaryOpIntersectionKernels.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include
4 | #include
5 | 
6 | namespace at::native::xpu {
7 | 
8 | using namespace at::sparse;
9 | using OptTensor = std::optional<Tensor>;
10 | 
11 | TORCH_XPU_API void mul_sparse_sparse_kernel(
12 |     Tensor& result,
13 |     const Tensor& x,
14 |     const Tensor& y);
15 | 
16 | TORCH_XPU_API void sparse_mask_intersection_kernel(
17 |     Tensor& result,
18 |     const Tensor& x,
19 |     const Tensor& y,
20 |     const OptTensor& x_hash_opt = std::nullopt);
21 | 
22 | TORCH_XPU_API void sparse_mask_projection_kernel(
23 |     Tensor& result,
24 |     const Tensor& x,
25 |     const Tensor& y,
26 |     const OptTensor& x_hash_opt,
27 |     bool accumulate_matches);
28 | 
29 | } // namespace at::native::xpu
30 | 
--------------------------------------------------------------------------------
/src/ATen/native/sparse/xpu/sycl/SparseCsrTensorMathKernels.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include
4 | #include
5 | 
6 | namespace at::native::xpu {
7 | 
8 | TORCH_XPU_API void convert_indices_from_coo_to_csr_structured_kernel(
9 |     const Tensor& input,
10 |     const int64_t size,
11 |     const bool out_int32,
12 |     const Tensor& result);
13 | 
14 | TORCH_XPU_API void convert_indices_from_csr_to_coo_structured_kernel(
15 |     const Tensor& crow_indices,
16 |     const Tensor& col_indices,
17 |     const bool out_int32,
18 |     const bool transpose,
19 |     const Tensor& result);
20 | 
21 | TORCH_XPU_API Tensor _sparse_csr_sum_xpu_kernel(
22 |     const Tensor& input,
23 |     IntArrayRef dims_to_sum,
24 |     bool keepdim,
25 |     std::optional<ScalarType> dtype);
26 | 
27 | TORCH_XPU_API Tensor _sparse_csr_prod_xpu_kernel(
28 |     const Tensor& input,
29 |     IntArrayRef dims_to_reduce,
30 |     bool keepdim,
31 |     std::optional<ScalarType> dtype);
32 | 
33 | } // namespace at::native::xpu
34 | 
--------------------------------------------------------------------------------
/src/ATen/native/sparse/xpu/sycl/SparseSoftmaxKernels.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include
4 | #include
5 | 
6 | namespace at::native::xpu {
7 | 
8 | using namespace at::sparse;
9 | 
10 | TORCH_XPU_API Tensor softmax_sparse_xpu_kernel(
11 |     const Tensor& input_,
12 |     const int64_t dim_,
13 |     const bool half_to_float);
14 | 
15 | TORCH_XPU_API Tensor log_softmax_sparse_xpu_kernel(
16 |     const Tensor& input_,
17 |     const int64_t dim_,
18 |     const bool half_to_float);
19 | 
20 | TORCH_XPU_API Tensor softmax_backward_sparse_xpu_kernel(
21 |     const Tensor& grad_,
22 |     const Tensor& output_,
23 |     int64_t dim_,
24 |     const Tensor& input_);
25 | 
26 | TORCH_XPU_API Tensor log_softmax_backward_sparse_xpu_kernel(
27 |     const Tensor& grad_,
28 |     const Tensor& output_,
29 |     int64_t dim_,
30 |     const Tensor& input_);
31 | 
32 | } // namespace at::native::xpu
33 | 
--------------------------------------------------------------------------------
/src/ATen/native/sparse/xpu/sycl/SparseTensorKernels.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include
4 | #include
5 | 
6 | namespace at::native::xpu {
7 | 
8 | using namespace at::sparse;
9 | 
10 | TORCH_XPU_API SparseTensor coalesce_sparse_kernel(const SparseTensor& self);
11 | 
12 | TORCH_XPU_API Tensor
13 | flatten_indices_kernel(const Tensor& indices, IntArrayRef size);
14 | 
15 | } // namespace at::native::xpu
16 | 
--------------------------------------------------------------------------------
/src/ATen/native/sparse/xpu/sycl/SparseTensorMathKernels.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include
4 | #include
5 | 
6 | namespace at::native::xpu {
7 | 
8 | using namespace at::sparse;
9 | 
10 | TORCH_XPU_API SparseTensor& add_sparse_kernel(
11 |     const SparseTensor& t,
12 |     const SparseTensor& src,
13 |     const Scalar& value,
14 |     SparseTensor& r_);
15 | 
16 | TORCH_XPU_API SparseTensor& mul_sparse_kernel(
17 |     const Tensor& t_,
18 |     const Tensor& src_,
19 |     SparseTensor& r_);
20 | 
21 | TORCH_XPU_API Tensor _sparse_sum_backward_kernel(
22 |     const Tensor& grad_,
23 |     const SparseTensor& input_,
24 |     IntArrayRef dims_to_sum);
25 | 
26 | } // namespace at::native::xpu
27 | 
--------------------------------------------------------------------------------
/src/ATen/native/transformers/SDPUtils.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include
4 | 
5 | namespace sdp {
6 | 
7 | bool can_use_mem_efficient_attention(sdp::sdp_params params, bool debug);
8 | 
9 | } // namespace sdp
10 | 
--------------------------------------------------------------------------------
/src/ATen/native/transformers/sycl/AttentionKernels.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include
4 | 
5 | namespace at::native::xpu {
6 | 
7 | TORCH_XPU_API void _transform_bias_rescale_qkv_kernel(
8 |     const Tensor& qkv,
9 |     const Tensor& qkv_bias,
10 |     const int64_t num_head,
11 |     Tensor& q_k_v,
12 |     int64_t B,
13 |     int64_t T,
14 |     int64_t D,
15 | int64_t dim_per_head); 16 | 17 | } // namespace at::native::xpu 18 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/AiryAi.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | namespace at { 7 | namespace native { 8 | REGISTER_XPU_DISPATCH(special_airy_ai_stub, &xpu::airy_ai_kernel); 9 | 10 | } // namespace native 11 | } // namespace at -------------------------------------------------------------------------------- /src/ATen/native/xpu/CompareOps.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace at { 8 | 9 | namespace native { 10 | REGISTER_XPU_DISPATCH(eq_stub, &xpu::eq_kernel); 11 | REGISTER_XPU_DISPATCH(ne_stub, &xpu::ne_kernel); 12 | REGISTER_XPU_DISPATCH(le_stub, &xpu::le_kernel); 13 | REGISTER_XPU_DISPATCH(lt_stub, &xpu::lt_kernel); 14 | REGISTER_XPU_DISPATCH(ge_stub, &xpu::ge_kernel); 15 | REGISTER_XPU_DISPATCH(gt_stub, &xpu::gt_kernel); 16 | } // namespace native 17 | } // namespace at 18 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/Cross.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace at { 9 | namespace native { 10 | REGISTER_XPU_DISPATCH(cross_stub, &xpu::linalg_cross_kernel); 11 | } // namespace native 12 | } // namespace at 13 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/Distance.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | namespace at { 6 | namespace native { 7 | 8 | REGISTER_XPU_DISPATCH(cdist_stub, &xpu::cdist_kernel); 9 | REGISTER_XPU_DISPATCH(cdist_backward_stub, &xpu::cdist_backward_kernel); 10 | REGISTER_XPU_DISPATCH(pdist_forward_stub, &xpu::pdist_forward_kernel); 11 | REGISTER_XPU_DISPATCH(pdist_backward_stub, &xpu::pdist_backward_kernel); 12 | 13 | } // namespace native 14 | } // namespace at 15 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/Fill.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | namespace at::native { 9 | REGISTER_XPU_DISPATCH(fill_stub, &native::xpu::fill_kernel); 10 | } // namespace at::native 11 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/FunctionOfAMatrixUtils.cpp: -------------------------------------------------------------------------------- 1 | #define TORCH_ASSERT_NO_OPERATORS 2 | #include 3 | 4 | #include 5 | 6 | #include 7 | 8 | namespace at { 9 | namespace native { 10 | 11 | REGISTER_XPU_DISPATCH( 12 | _compute_linear_combination_stub, 13 | &xpu::_compute_linear_combination_kernel); 14 | 15 | } 16 | } // namespace at 17 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/GroupNorm.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace at { 9 | namespace native { 10 | REGISTER_XPU_DISPATCH(GroupNormKernel, 
&xpu::group_norm_kernel); 11 | REGISTER_XPU_DISPATCH( 12 | GroupNormBackwardKernel, 13 | &xpu::group_norm_backward_kernel); 14 | } // namespace native 15 | } // namespace at 16 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/Histogram.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace at { 8 | 9 | namespace native { 10 | REGISTER_XPU_DISPATCH(histogramdd_stub, &xpu::histogramdd_kernel); 11 | REGISTER_XPU_DISPATCH(histogramdd_linear_stub, &xpu::histogramdd_linear_kernel); 12 | REGISTER_XPU_DISPATCH( 13 | histogram_select_outer_bin_edges_stub, 14 | &xpu::histogram_select_outer_bin_edges_kernel); 15 | 16 | } // namespace native 17 | } // namespace at -------------------------------------------------------------------------------- /src/ATen/native/xpu/Lerp.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace at { 9 | namespace native { 10 | 11 | REGISTER_XPU_DISPATCH(lerp_kernel_tensor_weight, &xpu::lerp_tensor_kernel); 12 | REGISTER_XPU_DISPATCH(lerp_kernel_scalar_weight, &xpu::lerp_scalar_kernel); 13 | 14 | } // namespace native 15 | 16 | } // namespace at 17 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/LinearAlgebra.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include 14 | #include 15 | #include 16 | 17 | namespace at { 18 | namespace native { 19 | REGISTER_XPU_DISPATCH(addr_stub, &xpu::addr_kernel); 20 | REGISTER_XPU_DISPATCH(norm_stub, &xpu::norm_kernel); 21 | } // namespace native 22 | } // namespace at 23 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/Normalization.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace at { 10 | namespace native { 11 | REGISTER_XPU_DISPATCH( 12 | renorm_scale_factor_stub, 13 | &xpu::renorm_scale_factor_kernel); 14 | } 15 | } // namespace at 16 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/PointwiseOps.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | namespace at { 7 | namespace native { 8 | REGISTER_XPU_DISPATCH(addcmul_stub, &xpu::addcmul_kernel); 9 | REGISTER_XPU_DISPATCH(addcdiv_stub, &xpu::addcdiv_kernel); 10 | } // namespace native 11 | } // namespace at 12 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/Pow.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace at { 9 | namespace native { 10 | REGISTER_XPU_DISPATCH(pow_tensor_tensor_stub, &xpu::pow_tensor_tensor_kernel); 11 | REGISTER_XPU_DISPATCH(pow_tensor_scalar_stub, &xpu::pow_tensor_scalar_kernel); 12 | } // namespace native 13 | } // namespace at 14 | 
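Each of the wrapper files above follows the same registration pattern: include the ATen dispatch-stub declaration and the matching sycl/ kernel header, then point the stub at the XPU kernel with REGISTER_XPU_DISPATCH. Once the stubs are registered, ordinary PyTorch operators route to these kernels whenever their inputs live on an XPU device. A short sketch from the Python side, assuming an XPU-enabled PyTorch build with at least one device available:

import torch

if torch.xpu.is_available():
    a = torch.randn(8, device="xpu")
    b = torch.randn(8, device="xpu")
    # pow_tensor_tensor_stub -> xpu::pow_tensor_tensor_kernel
    c = torch.pow(a, b)
    # addcmul_stub -> xpu::addcmul_kernel
    d = torch.addcmul(c, a, b, value=0.5)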
--------------------------------------------------------------------------------
/src/ATen/native/xpu/RecordStream.cpp:
--------------------------------------------------------------------------------
1 | #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
2 | #include
3 | #include
4 | 
5 | #ifndef AT_PER_OPERATOR_HEADERS
6 | #include
7 | #else
8 | #include
9 | #endif
10 | 
11 | namespace at::native {
12 | void record_stream_xpu(Tensor& self, c10::Stream stream) {
13 |   struct c10::StreamData3 data = stream.pack3();
14 |   c10::xpu::XPUCachingAllocator::recordStream(
15 |       self.storage().data_ptr(),
16 |       at::xpu::XPUStream::unpack3(
17 |           data.stream_id, data.device_index, data.device_type));
18 | }
19 | } // namespace at::native
20 | 
--------------------------------------------------------------------------------
/src/ATen/native/xpu/Repeat.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | 
4 | namespace at {
5 | namespace native {
6 | Tensor repeat_interleave_xpu(
7 |     const Tensor& repeats,
8 |     std::optional<int64_t> output_size) {
9 |   return at::native::xpu::repeat_interleave_kernel(repeats, output_size);
10 | }
11 | 
12 | } // namespace native
13 | } // namespace at
14 | 
--------------------------------------------------------------------------------
/src/ATen/native/xpu/SegmentReduce.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | 
4 | #include
5 | 
6 | namespace at {
7 | namespace native {
8 | 
9 | REGISTER_XPU_DISPATCH(
10 |     _segment_reduce_lengths_stub,
11 |     &xpu::_segment_reduce_lengths_kernel);
12 | REGISTER_XPU_DISPATCH(
13 |     _segment_reduce_offsets_stub,
14 |     &xpu::_segment_reduce_offsets_kernel);
15 | REGISTER_XPU_DISPATCH(
16 |     _segment_reduce_lengths_backward_stub,
17 |     &xpu::_segment_reduce_lengths_backward_kernel);
18 | REGISTER_XPU_DISPATCH(
19 |     _segment_reduce_offsets_backward_stub,
20 |     &xpu::_segment_reduce_offsets_backward_kernel);
21 | 
22 | } // namespace native
23 | } // namespace at
24 | 
--------------------------------------------------------------------------------
/src/ATen/native/xpu/TensorProperties.cpp:
--------------------------------------------------------------------------------
1 | 
2 | namespace at {} // namespace at
3 | 
--------------------------------------------------------------------------------
/src/ATen/native/xpu/TriangluarOps.cpp:
--------------------------------------------------------------------------------
1 | 
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | 
8 | #include
9 | #include
10 | 
11 | namespace at::native {
12 | 
13 | TORCH_IMPL_FUNC(tril_xpu)(const Tensor& self, int64_t k, const Tensor& result) {
14 |   if (self.numel() != 0) {
15 |     xpu::tril_kernel(result, self, k);
16 |   }
17 | }
18 | 
19 | TORCH_IMPL_FUNC(triu_xpu)(const Tensor& self, int64_t k, const Tensor& result) {
20 |   if (self.numel() != 0) {
21 |     xpu::triu_kernel(result, self, k);
22 |   }
23 | }
24 | 
25 | Tensor trace_xpu(const Tensor& self) {
26 |   TORCH_CHECK(self.dim() == 2, "expected a matrix");
27 |   return self.diagonal().sum();
28 | }
29 | 
30 | } // namespace at::native
--------------------------------------------------------------------------------
/src/ATen/native/xpu/UnfoldBackward.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 | 
7 | namespace at {
8 | 
9 | namespace native {
10 | REGISTER_XPU_DISPATCH(unfold_backward_stub, &xpu::unfold_backward_kernel);
11 | }
12 | } // namespace at
13 | 
--------------------------------------------------------------------------------
/src/ATen/native/xpu/WeightInt4Pack.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | 
3 | namespace at::native {
4 | 
5 | // input is [n][k / 2] (uint8 dtype)
6 | // output is [n][k / 8]
7 | Tensor _convert_weight_to_int4pack_xpu(const Tensor& in, int64_t innerKTiles) {
8 |   TORCH_CHECK(in.dim() == 2, __func__, " : expect weight to be 2D tensor.");
9 |   TORCH_CHECK(
10 |       in.dtype() == at::kByte, __func__, " : expect weight to be kByte.");
11 |   TORCH_CHECK(
12 |       innerKTiles == 2 || innerKTiles == 4 || innerKTiles == 8,
13 |       __func__,
14 |       " : innerKTiles need to be 2, 4, or 8, got ",
15 |       innerKTiles);
16 | 
17 |   auto weight = in.contiguous();
18 |   auto N = weight.size(0);
19 |   auto K = weight.size(1) * 2;
20 |   TORCH_CHECK(
21 |       K % 8 == 0, "The K dimension of int4 GEMM should be a multiple of 8.");
22 |   auto weight_packed = at::empty(
23 |       {N, K / 8}, at::TensorOptions().dtype(at::kInt).device(in.device()));
24 | 
25 |   xpu::weight_to_int4pack_kernel(weight_packed, weight, N, K);
26 |   return weight_packed;
27 | }
28 | 
29 | } // namespace at::native
30 | 
--------------------------------------------------------------------------------
/src/ATen/native/xpu/WeightNorm.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | 
3 | namespace at {
4 | namespace native {
5 | std::tuple<Tensor, Tensor> weight_norm_xpu(
6 |     const Tensor& v,
7 |     const Tensor& g,
8 |     int64_t dim) {
9 |   return native::xpu::weight_norm_kernel(v, g, dim);
10 | }
11 | 
12 | std::tuple<Tensor, Tensor> weight_norm_backward_xpu(
13 |     const Tensor& grad_w,
14 |     const Tensor& saved_v,
15 |     const Tensor& saved_g,
16 |     const Tensor& saved_norms,
17 |     int64_t dim) {
18 |   TORCH_CHECK(saved_v.is_contiguous(), "saved_v must be contiguous");
19 |   TORCH_CHECK(saved_g.is_contiguous(), "saved_g must be contiguous");
20 |   TORCH_CHECK(saved_norms.is_contiguous(), "saved_norms must be contiguous");
21 |   TORCH_CHECK(
22 |       dim == 0 || dim == saved_v.dim() - 1,
23 |       "fused kernels can only be applied for first or last dim")
24 | 
25 |   return native::xpu::weight_norm_backward_kernel(
26 |       grad_w, saved_v, saved_g, saved_norms, dim);
27 | }
28 | 
29 | } // namespace native
30 | } // namespace at
--------------------------------------------------------------------------------
/src/ATen/native/xpu/mkl/BatchLinearAlgebra.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include
4 | 
5 | namespace at::native::xpu {
6 | 
7 | TORCH_XPU_API void lu_solve_mkl(
8 |     const Tensor& LU,
9 |     const Tensor& pivots,
10 |     const Tensor& B,
11 |     TransposeType trans);
12 | 
13 | TORCH_XPU_API void lu_factor_mkl(
14 |     const Tensor& LU,
15 |     const Tensor& pivots,
16 |     const Tensor& info,
17 |     bool pivot);
18 | 
19 | } // namespace at::native::xpu
20 | 
--------------------------------------------------------------------------------
/src/ATen/native/xpu/sycl/AbsKernel.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | 
6 | #include
7 | 
8 | #include
9 | 
10 | namespace at::native::xpu {
11 | 
12 | template <typename scalar_t>
13 | struct AbsFunctor {
14 |   scalar_t operator()(const scalar_t a) const {
15 |     return std::abs(a);
16 |   }
17 | };
18 | 
19 | void abs_kernel(TensorIteratorBase& iter) {
20 |   auto dtype = iter.dtype();
21 |   if (at::isComplexType(dtype)) {
22 |     AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "abs_xpu", [&]() {
23 |       using opmath_t = at::opmath_type<scalar_t>;
24 |       gpu_kernel(iter, AbsFunctor<opmath_t>());
25 |     });
26 |   } else {
27 |     AT_DISPATCH_ALL_TYPES_AND3(
28 |         ScalarType::Half,
29 |         ScalarType::BFloat16,
30 |         ScalarType::Bool,
31 |         iter.dtype(),
32 |         "abs_xpu",
33 |         [&]() { gpu_kernel(iter, AbsFunctor<scalar_t>()); });
34 |   }
35 | }
36 | 
37 | } // namespace at::native::xpu
38 | 
(at::isComplexType(dtype)) { 22 | AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "abs_xpu", [&]() { 23 | using opmath_t = at::opmath_type<scalar_t>; 24 | gpu_kernel(iter, AbsFunctor<opmath_t>()); 25 | }); 26 | } else { 27 | AT_DISPATCH_ALL_TYPES_AND3( 28 | ScalarType::Half, 29 | ScalarType::BFloat16, 30 | ScalarType::Bool, 31 | iter.dtype(), 32 | "abs_xpu", 33 | [&]() { gpu_kernel(iter, AbsFunctor<scalar_t>()); }); 34 | } 35 | } 36 | 37 | } // namespace at::native::xpu 38 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/AbsKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void abs_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ActivationEluKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void elu_kernel( 8 | TensorIteratorBase& iter, 9 | const Scalar& alpha, 10 | const Scalar& scale, 11 | const Scalar& input_scale); 12 | 13 | TORCH_XPU_API void elu_backward_kernel( 14 | TensorIteratorBase& iter, 15 | const Scalar& alpha, 16 | const Scalar& scale, 17 | const Scalar& input_scale, 18 | bool is_result); 19 | 20 | } // namespace at::native::xpu 21 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ActivationGeluKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at { 6 | namespace native { 7 | namespace xpu { 8 | 9 | TORCH_XPU_API void gelu_kernel( 10 | TensorIteratorBase& iter, 11 | std::string_view approximate); 12 | 13 | TORCH_XPU_API void gelu_backward_kernel( 14 | TensorIteratorBase& iter, 15 | std::string_view approximate); 16 | 17 | } // namespace xpu 18 | } // namespace native 19 | } // namespace at 20 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ActivationGluKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void glu_kernel(TensorIteratorBase& iter); 8 | TORCH_XPU_API void glu_jvp_kernel(TensorIteratorBase& iter); 9 | 10 | TORCH_XPU_API void glu_backward_kernel( 11 | const TensorIteratorBase& iter, 12 | int64_t gI_stride, 13 | int64_t I_stride); 14 | 15 | } // namespace at::native::xpu 16 |
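Every elementwise XPU kernel in this tree follows the shape AbsKernel.cpp shows above: a small device functor plus a gpu_kernel launch inside an AT_DISPATCH macro, which instantiates the functor once per dtype. A minimal sketch of a new unary kernel in the same style (NegExampleFunctor and neg_example_kernel are hypothetical, and the include paths, which this dump elides, are assumptions):

#include <ATen/Dispatch.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/xpu/sycl/Loops.h> // assumed home of gpu_kernel

namespace at::native::xpu {

// Hypothetical example: elementwise negation.
template <typename scalar_t>
struct NegExampleFunctor {
  scalar_t operator()(scalar_t a) const {
    return -a;
  }
};

void neg_example_kernel(TensorIteratorBase& iter) {
  // The dispatch macro defines scalar_t for each dtype in its list and
  // runs the lambda once per instantiation.
  AT_DISPATCH_ALL_TYPES_AND2(
      ScalarType::Half,
      ScalarType::BFloat16,
      iter.dtype(),
      "neg_example_xpu",
      [&]() { gpu_kernel(iter, NegExampleFunctor<scalar_t>()); });
}

} // namespace at::native::xpu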
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ActivationHardshrinkKernels.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | namespace at::native::xpu { 7 | 8 | template <typename scalar_t> 9 | struct HardshrinkFunctor { 10 | scalar_t operator()(scalar_t a) const { 11 | return (a >= -lambd_ && a <= lambd_) ? scalar_t(0) : a; 12 | } 13 | 14 | HardshrinkFunctor(const scalar_t lambd) : lambd_(lambd) {} 15 | 16 | private: 17 | const scalar_t lambd_; 18 | }; 19 | 20 | void hardshrink_kernel(TensorIteratorBase& iter, const Scalar& value) { 21 | AT_DISPATCH_FLOATING_TYPES_AND2( 22 | at::ScalarType::Half, 23 | at::ScalarType::BFloat16, 24 | iter.dtype(), 25 | "hardshrink_xpu", 26 | [&]() { 27 | auto lambd = value.to<scalar_t>(); 28 | auto caller = HardshrinkFunctor<scalar_t>(lambd); 29 | gpu_kernel(iter, caller); 30 | }); 31 | } 32 | 33 | } // namespace at::native::xpu 34 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ActivationHardshrinkKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void hardshrink_kernel( 8 | TensorIteratorBase& iter, 9 | const Scalar& value); 10 | 11 | } // namespace at::native::xpu 12 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void hardsigmoid_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void hardsigmoid_backward_kernel(TensorIteratorBase& iter); 10 | 11 | } // namespace at::native::xpu 12 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ActivationHardswishKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at { 6 | namespace native { 7 | namespace xpu { 8 | 9 | TORCH_XPU_API void hardswish_kernel(TensorIterator& iter); 10 | 11 | TORCH_XPU_API void hardswish_backward_kernel(TensorIterator& iter); 12 | 13 | } // namespace xpu 14 | } // namespace native 15 | } // namespace at 16 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ActivationHardtanhKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at { 6 | namespace native { 7 | namespace xpu { 8 | 9 | TORCH_XPU_API void hardtanh_backward_kernel( 10 | TensorIterator& iter, 11 | const Scalar& min, 12 | const Scalar& max); 13 | 14 | } // namespace xpu 15 | } // namespace native 16 | } // namespace at 17 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void leaky_relu_kernel( 8 | TensorIteratorBase& iter, 9 | const Scalar& negval_); 10 | 11 | TORCH_XPU_API void leaky_relu_backward_kernel( 12 | TensorIteratorBase& iter, 13 | const Scalar& negval_); 14 | 15 | } // namespace at::native::xpu 16 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void log_sigmoid_forward_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void log_sigmoid_backward_kernel(TensorIterator&
iter); 10 | 11 | } // namespace at::native::xpu 12 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ActivationMishKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void mish_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void mish_backward_kernel(TensorIterator& iter); 10 | 11 | } // namespace at::native::xpu 12 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ActivationPreluKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void prelu_kernel(TensorIterator& iter); 8 | 9 | TORCH_XPU_API void prelu_backward_kernel(TensorIterator& iter); 10 | 11 | } // namespace at::native::xpu 12 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ActivationSiluKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void silu_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void silu_backward_kernel(TensorIteratorBase& iter); 10 | 11 | } // namespace at::native::xpu 12 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ActivationSoftplusKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void softplus_kernel( 8 | TensorIteratorBase& iter, 9 | const Scalar& beta_, 10 | const Scalar& threshold_); 11 | 12 | TORCH_XPU_API void softplus_backward_kernel( 13 | TensorIteratorBase& iter, 14 | const Scalar& beta_, 15 | const Scalar& threshold_); 16 | 17 | } // namespace at::native::xpu 18 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void softshrink_kernel( 8 | TensorIteratorBase& iter, 9 | const Scalar& value); 10 | 11 | TORCH_XPU_API void softshrink_backward_kernel( 12 | TensorIteratorBase& iter, 13 | const Scalar& value); 14 | 15 | } // namespace at::native::xpu 16 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ActivationThresholdKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at { 6 | namespace native { 7 | namespace xpu { 8 | 9 | TORCH_XPU_API void threshold_kernel( 10 | TensorIteratorBase& iter, 11 | const Scalar& threshold, 12 | const Scalar& value); 13 | 14 | } 15 | } // namespace native 16 | } // namespace at 17 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void adaptive_avg_pool2d_backward_kernel( 8 | Tensor& gradInput, 9 | const Tensor& gradOutput, 10 | const 
Tensor& input); 11 | 12 | TORCH_XPU_API void adaptive_avg_pool2d_kernel( 13 | Tensor& output, 14 | const Tensor& input, 15 | IntArrayRef output_size); 16 | 17 | } // namespace at::native::xpu 18 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/AdaptiveAveragePooling3dKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void adaptive_avg_pool3d_backward_kernel( 8 | Tensor& gradInput, 9 | const Tensor& gradOutput_, 10 | const Tensor& input); 11 | 12 | TORCH_XPU_API void adaptive_avg_pool3d_kernel( 13 | Tensor& output, 14 | const Tensor& input_, 15 | IntArrayRef& output_size); 16 | 17 | } // namespace at::native::xpu -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void adaptive_max_pool2d_kernel( 8 | const Tensor& input, 9 | IntArrayRef output_size, 10 | const Tensor& output, 11 | const Tensor& indices); 12 | 13 | TORCH_XPU_API void adaptive_max_pool2d_backward_kernel( 14 | const Tensor& grad_output, 15 | const Tensor& input, 16 | const Tensor& indices, 17 | const Tensor& grad_input); 18 | 19 | } // namespace at::native::xpu 20 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/AdaptiveMaxPooling3dKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void adaptive_max_pool3d_kernel( 8 | const Tensor& input, 9 | IntArrayRef output_size, 10 | const Tensor& output, 11 | const Tensor& indices); 12 | 13 | TORCH_XPU_API void adaptive_max_pool3d_backward_kernel( 14 | const Tensor& gradOutput, 15 | const Tensor& input, 16 | const Tensor& indices, 17 | const Tensor& gradInput); 18 | 19 | } // namespace at::native::xpu 20 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/AiryAiKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | namespace at::native::xpu { 9 | template 10 | struct AiryAiFunctor { 11 | scalar_t operator()(scalar_t a) const { 12 | return airy_ai_forward(a); 13 | } 14 | }; 15 | 16 | void airy_ai_kernel(TensorIteratorBase& iter) { 17 | AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "airy_ai_xpu", [&]() { 18 | gpu_kernel(iter, AiryAiFunctor()); 19 | }); 20 | } 21 | 22 | } // namespace at::native::xpu -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/AiryAiKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void airy_ai_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/AmpKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void 
amp_non_finite_check_and_unscale_kernel( 7 | Tensor& scaled_grad, 8 | Tensor& found_inf, 9 | const Tensor& inv_scale); 10 | 11 | TORCH_XPU_API void amp_foreach_non_finite_check_and_unscale_kernel( 12 | std::vector> scaled_grads, 13 | Tensor& found_inf, 14 | const Tensor& inv_scale); 15 | 16 | TORCH_XPU_API Tensor& amp_update_scale_kernel( 17 | Tensor& current_scale, 18 | Tensor& growth_tracker, 19 | const Tensor& found_inf, 20 | double growth_factor, 21 | double backoff_factor, 22 | int64_t growth_interval); 23 | 24 | } // namespace at::native::xpu 25 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/AveragePool2dKernels.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | namespace at::native::xpu { 4 | 5 | TORCH_XPU_API void avg_pool2d_kernel( 6 | const Tensor& input_, 7 | int64_t kH_, 8 | int64_t kW_, 9 | int64_t dH_, 10 | int64_t dW_, 11 | int64_t padH_, 12 | int64_t padW_, 13 | bool ceil_mode, 14 | bool count_include_pad, 15 | std::optional divisor_override, 16 | const Tensor& output); 17 | 18 | TORCH_XPU_API void avg_pool2d_backward_kernel( 19 | const Tensor& gradOutput_, 20 | const Tensor& input_, 21 | IntArrayRef kernel_size, 22 | IntArrayRef stride, 23 | IntArrayRef padding, 24 | bool ceil_mode, 25 | bool count_include_pad, 26 | std::optional divisor_override, 27 | const Tensor& gradInput); 28 | 29 | } // namespace at::native::xpu 30 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/AveragePool3dKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void avg_pool3d_kernel( 8 | const Tensor& input_, 9 | IntArrayRef kernel_size, 10 | IntArrayRef stride, 11 | IntArrayRef padding, 12 | bool ceil_mode, 13 | bool count_include_pad, 14 | std::optional divisor_override, 15 | const Tensor& output); 16 | 17 | TORCH_XPU_API void avg_pool3d_backward_kernel( 18 | const Tensor& gradOutput_, 19 | const Tensor& input_, 20 | IntArrayRef kernel_size, 21 | IntArrayRef stride, 22 | IntArrayRef padding, 23 | bool ceil_mode, 24 | bool count_include_pad, 25 | std::optional divisor_override, 26 | const Tensor& gradInput); 27 | 28 | } // namespace at::native::xpu 29 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BesselJ0Kernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace at::native::xpu { 10 | 11 | template 12 | struct BesselJ0Functor { 13 | scalar_t operator()(scalar_t a) const { 14 | return bessel_j0_forward(a); 15 | } 16 | }; 17 | 18 | void bessel_j0_kernel(TensorIteratorBase& iter) { 19 | AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "bessel_j0_xpu", [&]() { 20 | gpu_kernel(iter, BesselJ0Functor()); 21 | }); 22 | } 23 | 24 | } // namespace at::native::xpu 25 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BesselJ0Kernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void bessel_j0_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 | 
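The sycl/ headers above expose kernels that take a prebuilt TensorIterator rather than raw tensors, so allocation and type promotion stay with the caller. A hedged usage sketch for the Bessel J0 kernel just declared (the wrapper itself is hypothetical; the real callers are ATen's generated structured-op glue, and unary_float_op is assumed to behave as in the CPU/CUDA paths):

#include <ATen/core/Tensor.h>
#include <ATen/native/TensorIterator.h>

at::Tensor bessel_j0_example(const at::Tensor& self) {
  at::Tensor out; // left undefined; the iterator allocates it
  auto iter = at::TensorIterator::unary_float_op(out, self);
  at::native::xpu::bessel_j0_kernel(iter); // dispatches on iter.common_dtype()
  return iter.output();
}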
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BesselJ1Kernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace at::native::xpu { 10 | 11 | template 12 | struct BesselJ1Functor { 13 | scalar_t operator()(scalar_t a) const { 14 | if (a < scalar_t(0.0f)) { 15 | return -bessel_j1_forward(-a); 16 | } 17 | return bessel_j1_forward(a); 18 | } 19 | }; 20 | 21 | void bessel_j1_kernel(TensorIteratorBase& iter) { 22 | AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "bessel_j1_xpu", [&]() { 23 | gpu_kernel(iter, BesselJ1Functor()); 24 | }); 25 | } 26 | 27 | } // namespace at::native::xpu 28 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BesselJ1Kernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void bessel_j1_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BesselY0Kernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace at::native::xpu { 10 | 11 | template 12 | struct BesselY0Functor { 13 | scalar_t operator()(scalar_t a) const { 14 | return bessel_y0_forward(a); 15 | } 16 | }; 17 | 18 | void bessel_y0_kernel(TensorIteratorBase& iter) { 19 | AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "bessel_y0_xpu", [&]() { 20 | gpu_kernel(iter, BesselY0Functor()); 21 | }); 22 | } 23 | 24 | } // namespace at::native::xpu 25 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BesselY0Kernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void bessel_y0_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BesselY1Kernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace at::native::xpu { 10 | 11 | template 12 | struct BesselY1Functor { 13 | scalar_t operator()(scalar_t a) const { 14 | return bessel_y1_forward(a); 15 | } 16 | }; 17 | 18 | void bessel_y1_kernel(TensorIteratorBase& iter) { 19 | AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "bessel_y1_xpu", [&]() { 20 | gpu_kernel(iter, BesselY1Functor()); 21 | }); 22 | } 23 | 24 | } // namespace at::native::xpu 25 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BesselY1Kernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void bessel_y1_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at { 6 | namespace native { 7 | namespace xpu { 8 | 9 | TORCH_XPU_API void bitwise_and_kernel(TensorIteratorBase& iter); 10 | 11 | TORCH_XPU_API void bitwise_or_kernel(TensorIteratorBase& iter); 12 | 13 | TORCH_XPU_API void bitwise_xor_kernel(TensorIteratorBase& iter); 14 | 15 | } // namespace xpu 16 | } // namespace native 17 | } // namespace at 18 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BinaryGeometricKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void atan2_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void hypot_kernel(TensorIteratorBase& iter); 10 | 11 | } // namespace at::native::xpu 12 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BinaryInternal.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | template <typename scalar_t> 8 | struct DivFunctor { 9 | scalar_t operator()(scalar_t a, scalar_t b) const { 10 | return c10::xpu::compat::div(a, b); 11 | } 12 | }; 13 | 14 | template <typename scalar_t> 15 | struct MulFunctor { 16 | scalar_t operator()(scalar_t a, scalar_t b) const { 17 | return a * b; 18 | } 19 | }; 20 | 21 | // Workaround for the error: '*' in boolean context, suggest '&&' instead 22 | // [-Werror=int-in-bool-context] 23 | template <> 24 | struct MulFunctor<bool> { 25 | bool operator()(bool a, bool b) const { 26 | return a && b; 27 | } 28 | }; 29 | 30 | } // namespace at::native::xpu 31 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BinaryKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void add_kernel(TensorIteratorBase& iter, const Scalar& alpha); 8 | 9 | TORCH_XPU_API void sub_kernel(TensorIteratorBase& iter, const Scalar& alpha); 10 | 11 | TORCH_XPU_API void mul_kernel(TensorIteratorBase& iter); 12 | 13 | TORCH_XPU_API void div_true_kernel(TensorIteratorBase& iter); 14 | 15 | TORCH_XPU_API void div_trunc_kernel(TensorIteratorBase& iter); 16 | 17 | TORCH_XPU_API void div_floor_kernel(TensorIteratorBase& iter); 18 | 19 | } // namespace at::native::xpu 20 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void logical_and_kernel(TensorIterator& iter); 8 | 9 | TORCH_XPU_API void logical_or_kernel(TensorIterator& iter); 10 | 11 | TORCH_XPU_API void logical_xor_kernel(TensorIterator& iter); 12 | 13 | } // namespace at::native::xpu 14 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void sigmoid_backward_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void
tanh_backward_kernel(TensorIteratorBase& iter); 10 | 11 | TORCH_XPU_API void logit_backward_kernel( 12 | TensorIteratorBase& iter, 13 | const Scalar& eps_scalar); 14 | 15 | } // namespace at::native::xpu 16 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void mse_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void smooth_l1_kernel(TensorIteratorBase& iter, double beta); 10 | 11 | TORCH_XPU_API void huber_kernel(TensorIterator& iter, double delta); 12 | 13 | TORCH_XPU_API void xlogy_kernel(TensorIteratorBase& iter); 14 | 15 | TORCH_XPU_API void xlog1py_kernel(TensorIteratorBase& iter); 16 | 17 | } // namespace at::native::xpu 18 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BinaryRemainderKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at { 6 | namespace native { 7 | namespace xpu { 8 | 9 | TORCH_XPU_API void remainder_kernel(TensorIteratorBase& iter); 10 | 11 | TORCH_XPU_API void fmod_kernel(TensorIteratorBase& iter); 12 | 13 | } // namespace xpu 14 | } // namespace native 15 | } // namespace at 16 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BinaryShiftOpsKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void lshift_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void rshift_kernel(TensorIteratorBase& iter); 10 | 11 | } // namespace at::native::xpu 12 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BucketizationKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void searchsorted_kernel( 7 | Tensor& result, 8 | const Tensor& input, 9 | const Tensor& sorted_sequence, 10 | bool out_int32, 11 | bool right, 12 | const Tensor& sorter); 13 | 14 | } // namespace at::native::xpu 15 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ChebyshevPolynomialKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void chebyshev_polynomial_t_kernel(TensorIteratorBase& iterator); 8 | 9 | TORCH_XPU_API void chebyshev_polynomial_u_kernel(TensorIteratorBase& iterator); 10 | 11 | TORCH_XPU_API void chebyshev_polynomial_v_kernel(TensorIteratorBase& iterator); 12 | 13 | TORCH_XPU_API void chebyshev_polynomial_w_kernel(TensorIteratorBase& iterator); 14 | 15 | } // namespace at::native::xpu 16 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ChebyshevPolynomialTKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace at::native::xpu { 8 | 9 | template 10 | struct ChebyshevPolynomialTFunctor { 11 | scalar_t operator()(scalar_t x, scalar_t n) const { 12 
| return chebyshev_polynomial_t_forward(x, n); 13 | } 14 | }; 15 | 16 | void chebyshev_polynomial_t_kernel(TensorIteratorBase& iterator) { 17 | AT_DISPATCH_FLOATING_TYPES( 18 | iterator.common_dtype(), "chebyshev_polynomial_t_xpu", [&]() { 19 | ChebyshevPolynomialTFunctor<scalar_t> f; 20 | gpu_kernel_with_scalars(iterator, f); 21 | }); 22 | } 23 | 24 | } // namespace at::native::xpu 25 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ChebyshevPolynomialUKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace at::native::xpu { 8 | 9 | template <typename scalar_t> 10 | struct ChebyshevPolynomialUFunctor { 11 | scalar_t operator()(scalar_t x, scalar_t n) const { 12 | return chebyshev_polynomial_u_forward(x, n); 13 | } 14 | }; 15 | 16 | void chebyshev_polynomial_u_kernel(TensorIteratorBase& iterator) { 17 | AT_DISPATCH_FLOATING_TYPES( 18 | iterator.common_dtype(), "chebyshev_polynomial_u_xpu", [&]() { 19 | ChebyshevPolynomialUFunctor<scalar_t> f; 20 | gpu_kernel_with_scalars(iterator, f); 21 | }); 22 | } 23 | 24 | } // namespace at::native::xpu 25 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ChebyshevPolynomialVKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace at::native::xpu { 8 | 9 | template <typename scalar_t> 10 | struct ChebyshevPolynomialVFunctor { 11 | scalar_t operator()(scalar_t x, scalar_t n) const { 12 | return chebyshev_polynomial_v_forward(x, n); 13 | } 14 | }; 15 | 16 | void chebyshev_polynomial_v_kernel(TensorIteratorBase& iterator) { 17 | AT_DISPATCH_FLOATING_TYPES( 18 | iterator.common_dtype(), "chebyshev_polynomial_v_xpu", [&]() { 19 | ChebyshevPolynomialVFunctor<scalar_t> f; 20 | gpu_kernel_with_scalars(iterator, f); 21 | }); 22 | } 23 | 24 | } // namespace at::native::xpu 25 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ChebyshevPolynomialWKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace at::native::xpu { 8 | 9 | template <typename scalar_t> 10 | struct ChebyshevPolynomialWFunctor { 11 | scalar_t operator()(scalar_t x, scalar_t n) const { 12 | return chebyshev_polynomial_w_forward(x, n); 13 | } 14 | }; 15 | 16 | void chebyshev_polynomial_w_kernel(TensorIteratorBase& iterator) { 17 | AT_DISPATCH_FLOATING_TYPES( 18 | iterator.common_dtype(), "chebyshev_polynomial_w_xpu", [&]() { 19 | ChebyshevPolynomialWFunctor<scalar_t> f; 20 | gpu_kernel_with_scalars(iterator, f); 21 | }); 22 | } 23 | 24 | } // namespace at::native::xpu 25 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/Col2ImKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void col2im_kernel( 8 | Tensor& output, 9 | const Tensor& input_, 10 | IntArrayRef output_size, 11 | IntArrayRef kernel_size, 12 | IntArrayRef dilation, 13 | IntArrayRef padding, 14 | IntArrayRef stride); 15 | 16 | } // namespace at::native::xpu 17 |
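The four Chebyshev kernels above are one template stamped out four times: a two-argument functor handed to gpu_kernel_with_scalars, which also serves the Tensor-Scalar overloads without a second compiled kernel. A hedged sketch of how a further polynomial family would slot in (the names are hypothetical, and the *_forward helper is assumed to live in the shared math headers the way chebyshev_polynomial_t_forward does):

namespace at::native::xpu {

template <typename scalar_t>
struct ShiftedChebyshevTFunctor {
  scalar_t operator()(scalar_t x, scalar_t n) const {
    // shifted_chebyshev_polynomial_t_forward is an assumed helper here.
    return shifted_chebyshev_polynomial_t_forward(x, n);
  }
};

void shifted_chebyshev_t_example_kernel(TensorIteratorBase& iterator) {
  AT_DISPATCH_FLOATING_TYPES(
      iterator.common_dtype(), "shifted_chebyshev_t_xpu", [&]() {
        ShiftedChebyshevTFunctor<scalar_t> f;
        gpu_kernel_with_scalars(iterator, f);
      });
}

} // namespace at::native::xpu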
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/CompareKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at { 6 | namespace native { 7 | namespace xpu { 8 | 9 | TORCH_XPU_API void eq_kernel(TensorIteratorBase& iter); 10 | 11 | TORCH_XPU_API void ne_kernel(TensorIteratorBase& iter); 12 | 13 | TORCH_XPU_API void lt_kernel(TensorIteratorBase& iter); 14 | 15 | TORCH_XPU_API void le_kernel(TensorIteratorBase& iter); 16 | 17 | TORCH_XPU_API void gt_kernel(TensorIteratorBase& iter); 18 | 19 | TORCH_XPU_API void ge_kernel(TensorIteratorBase& iter); 20 | 21 | } // namespace xpu 22 | } // namespace native 23 | } // namespace at 24 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ComplexKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void complex_kernel(TensorIterator& iter); 8 | 9 | TORCH_XPU_API void polar_kernel(TensorIterator& iter); 10 | 11 | } // namespace at::native::xpu 12 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/CopyKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void copy_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/CopysignKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | 6 | #include 7 | 8 | namespace at::native::xpu { 9 | 10 | template <typename scalar_t> 11 | struct CopysignFunctor { 12 | scalar_t operator()(scalar_t a, scalar_t b) const { 13 | return std::copysign(a, b); 14 | } 15 | }; 16 | 17 | void copysign_kernel(TensorIteratorBase& iter) { 18 | AT_DISPATCH_FLOATING_TYPES_AND2( 19 | at::ScalarType::Half, 20 | at::ScalarType::BFloat16, 21 | iter.common_dtype(), 22 | "copysign_xpu", 23 | [&]() { gpu_kernel_with_scalars(iter, CopysignFunctor<scalar_t>()); }); 24 | } 25 | 26 | } // namespace at::native::xpu 27 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/CopysignKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void copysign_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/CrossKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void linalg_cross_kernel( 7 | const Tensor& result, 8 | const Tensor& x1, 9 | const Tensor& x2, 10 | int64_t dim); 11 | 12 | } // namespace at::native::xpu 13 |
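Because copysign_kernel above is built on gpu_kernel_with_scalars, one compiled kernel covers both tensor-tensor and tensor-scalar iterators. A hedged sketch of the calling side (the wrapper is hypothetical; real callers are generated structured-op code, and binary_float_op is assumed to behave as in ATen's CPU/CUDA paths):

#include <ATen/core/Tensor.h>
#include <ATen/native/TensorIterator.h>

at::Tensor copysign_example(const at::Tensor& a, const at::Tensor& b) {
  at::Tensor out; // undefined on entry; the iterator allocates it
  auto iter = at::TensorIterator::binary_float_op(out, a, b);
  at::native::xpu::copysign_kernel(iter);
  return iter.output();
}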
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/CumprodKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | #include 6 | 7 | namespace at::native::xpu { 8 | 9 | void launch_cumprod_kernel( 10 | const Tensor& result, 11 | const Tensor& self, 12 | int64_t dim) { 13 | AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( 14 | ScalarType::Half, 15 | ScalarType::BFloat16, 16 | self.scalar_type(), 17 | "cumprod_xpu", 18 | [&]() { 19 | scalar_t init = 1; 20 | scan( 21 | result, self, dim, init, std::multiplies<scalar_t>()); 22 | }); 23 | } 24 | 25 | } // namespace at::native::xpu 26 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/CumsumKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | #include 6 | 7 | namespace at::native::xpu { 8 | 9 | void launch_cumsum_kernel( 10 | const Tensor& result, 11 | const Tensor& self, 12 | int64_t dim) { 13 | AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( 14 | ScalarType::Half, 15 | ScalarType::BFloat16, 16 | self.scalar_type(), 17 | "cumsum_xpu", 18 | [&]() { 19 | scalar_t init = 0; 20 | scan( 21 | result, self, dim, init, std::plus<scalar_t>()); 22 | }); 23 | } 24 | 25 | } // namespace at::native::xpu 26 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/DepthwiseConv3dKernels.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | #include 4 | #include 5 | 6 | namespace at::native::xpu { 7 | 8 | TORCH_XPU_API Tensor conv_depthwise3d_kernel( 9 | const Tensor& input, 10 | const Tensor& weight, 11 | IntArrayRef kernel_size, 12 | const std::optional<Tensor>& bias_opt, 13 | IntArrayRef stride, 14 | IntArrayRef padding, 15 | IntArrayRef dilation); 16 | 17 | TORCH_XPU_API std::tuple<Tensor, Tensor, Tensor> 18 | _depthwise_3d_backward_kernel( 19 | Tensor& grad_input, 20 | Tensor& grad_weight, 21 | Tensor& grad_bias, 22 | const Tensor& grad_output, 23 | const Tensor& input, 24 | const Tensor& weight, 25 | IntArrayRef kernel_size, 26 | IntArrayRef stride, 27 | IntArrayRef padding, 28 | IntArrayRef dilation, 29 | const std::array<bool, 3> output_mask); 30 | 31 | } // namespace at::native::xpu 32 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/Dequant_int4.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | #include 4 | #include 5 | 6 | namespace at::native::xpu { 7 | 8 | TORCH_XPU_API void dequant_int4_kernel( 9 | const Tensor& weight_int4, 10 | Tensor& weight, 11 | int qGroupSize, 12 | const Tensor& qScaleAndZeros); 13 | 14 | } // namespace at::native::xpu 15 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/DilatedMaxPool2d.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void max_pool2d_with_indices_kernel( 8 | const Tensor& input, 9 | IntArrayRef kernel_size, 10 | IntArrayRef stride, 11 | IntArrayRef padding, 12 | IntArrayRef dilation, 13 | bool ceil_mode, 14 | const Tensor& output, 15 | const Tensor& indices); 16 | 17 | TORCH_XPU_API void max_pool2d_with_indices_backward_kernel( 18 | const Tensor& gradInput, 19 | const Tensor& gradOutput, 20 | const Tensor& input, 21 | const Tensor& indices, 22 | IntArrayRef kernel_size, 23 | IntArrayRef stride, 24 | IntArrayRef padding, 25 | IntArrayRef dilation, 26 | bool ceil_mode); 27 | 28 | } // namespace at::native::xpu 29 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/DilatedMaxPool3d.h:
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void max_pool3d_with_indices_kernel( 8 | const Tensor& input, 9 | IntArrayRef kernel_size, 10 | IntArrayRef stride, 11 | IntArrayRef padding, 12 | IntArrayRef dilation, 13 | bool ceil_mode, 14 | Tensor& output, 15 | Tensor& indices); 16 | 17 | TORCH_XPU_API void max_pool3d_with_indices_backward_kernel( 18 | Tensor& gradInput, 19 | const Tensor& gradOutput, 20 | const Tensor& input, 21 | const Tensor& indices, 22 | IntArrayRef kernel_size, 23 | IntArrayRef stride, 24 | IntArrayRef padding, 25 | IntArrayRef dilation, 26 | bool ceil_mode); 27 | 28 | } // namespace at::native::xpu 29 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/DistanceKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void cdist_kernel( 7 | Tensor& result, 8 | const Tensor& x1_expanded, 9 | const Tensor& x2_expanded, 10 | double p); 11 | 12 | TORCH_XPU_API void cdist_backward_kernel( 13 | Tensor& grad_x1, 14 | const Tensor& grad, 15 | const Tensor& x1, 16 | const Tensor& x2, 17 | const double p, 18 | const Tensor& cdist); 19 | 20 | TORCH_XPU_API void pdist_forward_kernel( 21 | Tensor& result, 22 | const Tensor& self, 23 | double p); 24 | 25 | TORCH_XPU_API void pdist_backward_kernel( 26 | Tensor& result, 27 | const Tensor& grad, 28 | const Tensor& self, 29 | const double p, 30 | const Tensor& dist); 31 | 32 | } // namespace at::native::xpu 33 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/DistributionCauchyKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | namespace at::native::xpu { 7 | 8 | void cauchy_kernel( 9 | TensorIteratorBase& iter, 10 | double median, 11 | double sigma, 12 | std::optional gen) { 13 | auto generator = get_generator_or_default( 14 | gen, at::xpu::detail::getDefaultXPUGenerator()); 15 | at::native::templates::xpu::cauchy_kernel(iter, median, sigma, generator); 16 | } 17 | 18 | } // namespace at::native::xpu 19 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/DistributionExponentialKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | 12 | namespace at::native::xpu { 13 | 14 | void exponential_kernel( 15 | TensorIteratorBase& iter, 16 | double lambda, 17 | std::optional gen) { 18 | auto generator = get_generator_or_default( 19 | gen, at::xpu::detail::getDefaultXPUGenerator()); 20 | at::native::templates::xpu::exponential_kernel(iter, lambda, generator); 21 | } 22 | 23 | } // namespace at::native::xpu 24 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/DistributionGeometricKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | namespace at::native::xpu { 7 | 8 | void geometric_kernel( 9 | TensorIteratorBase& iter, 10 | double p_, 11 | std::optional gen) { 12 | auto generator = get_generator_or_default( 13 | 
gen, at::xpu::detail::getDefaultXPUGenerator()); 14 | at::native::templates::xpu::geometric_kernel(iter, p_, generator); 15 | } 16 | 17 | } // namespace at::native::xpu 18 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/DistributionLogNormalKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | namespace at::native::xpu { 7 | 8 | void log_normal_kernel( 9 | TensorIteratorBase& iter, 10 | double mean, 11 | double std, 12 | std::optional<Generator> gen) { 13 | auto generator = get_generator_or_default<XPUGeneratorImpl>( 14 | gen, at::xpu::detail::getDefaultXPUGenerator()); 15 | at::native::templates::xpu::log_normal_kernel(iter, mean, std, generator); 16 | } 17 | 18 | } // namespace at::native::xpu 19 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/DistributionNormal.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | 14 | namespace at { 15 | namespace native { 16 | namespace xpu { 17 | 18 | void normal_kernel( 19 | const TensorBase& self, 20 | double mean, 21 | double std, 22 | std::optional<Generator> gen) { 23 | auto generator = get_generator_or_default<XPUGeneratorImpl>( 24 | gen, at::xpu::detail::getDefaultXPUGenerator()); 25 | at::native::templates::xpu::normal_kernel(self, mean, std, generator); 26 | } 27 | 28 | } // namespace xpu 29 | } // namespace native 30 | } // namespace at 31 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/DistributionUniform.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | 14 | namespace at { 15 | namespace native { 16 | namespace xpu { 17 | 18 | void uniform_kernel( 19 | TensorIteratorBase& iter, 20 | double from, 21 | double to, 22 | std::optional<Generator> gen) { 23 | auto generator = get_generator_or_default<XPUGeneratorImpl>( 24 | gen, at::xpu::detail::getDefaultXPUGenerator()); 25 | at::native::templates::xpu::uniform_kernel(iter, from, to, generator); 26 | } 27 | 28 | } // namespace xpu 29 | } // namespace native 30 | } // namespace at 31 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/Distributions.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace at::native::xpu { 7 | 8 | TORCH_XPU_API void launch_poisson_kernel( 9 | const TensorBase& ret, 10 | const TensorBase& lambda, 11 | XPUGeneratorImpl* gen); 12 | 13 | TORCH_XPU_API void launch_binomial_kernel( 14 | TensorIteratorBase& iter, 15 | XPUGeneratorImpl* gen); 16 | 17 | TORCH_XPU_API void launch_gamma_kernel( 18 | Tensor& ret, 19 | const Tensor& alpha, 20 | XPUGeneratorImpl* gen); 21 | 22 | TORCH_XPU_API void launch_standard_gamma_grad_kernel(TensorIteratorBase& iter); 23 | 24 | TORCH_XPU_API void launch_dirichlet_kernel(TensorIteratorBase& iter); 25 | 26 | TORCH_XPU_API void launch_dirichlet_grad_kernel(TensorIteratorBase& iter); 27 | 28 | } // namespace at::native::xpu 29 |
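Each DistributionXKernel.cpp above is the same ten-line adapter: unwrap the optional Generator, fall back to the device-default XPU generator, and forward to the shared templates::xpu implementation. A hedged sketch of what one more distribution would look like (laplace_example_kernel and the templates::xpu::laplace_kernel helper are hypothetical; the five real files above are the model):

namespace at::native::xpu {

void laplace_example_kernel(
    TensorIteratorBase& iter,
    double loc,
    double scale,
    std::optional<Generator> gen) {
  // Use the caller's generator if given, else the per-device default.
  auto generator = get_generator_or_default<XPUGeneratorImpl>(
      gen, at::xpu::detail::getDefaultXPUGenerator());
  at::native::templates::xpu::laplace_kernel(iter, loc, scale, generator);
}

} // namespace at::native::xpu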
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/DropoutKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at { 6 | namespace native { 7 | namespace xpu { 8 | 9 | TORCH_XPU_API std::tuple<Tensor, Tensor> dropout_kernel( 10 | const Tensor& self, 11 | double p, 12 | std::optional<bool> train); 13 | 14 | TORCH_XPU_API Tensor 15 | dropout_backward_kernel(const Tensor& grad, const Tensor& mask, double scale); 16 | 17 | TORCH_XPU_API std::tuple<Tensor, Tensor> fused_dropout_kernel( 18 | const Tensor& self, 19 | double p, 20 | std::optional<Generator> gen_); 21 | 22 | TORCH_XPU_API Tensor 23 | masked_scale_kernel(const Tensor& self, const Tensor& mask, double scale); 24 | 25 | } // namespace xpu 26 | } // namespace native 27 | } // namespace at 28 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/EmbeddingKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API Tensor embedding_dense_backward_kernel( 8 | const Tensor& grad_, 9 | const Tensor& indices_, 10 | int64_t num_weights, 11 | int64_t padding_idx, 12 | bool scale_grad_by_freq); 13 | 14 | TORCH_XPU_API Tensor& embedding_renorm_kernel( 15 | Tensor& self, 16 | const Tensor& indices, 17 | double max_norm, 18 | double norm_type); 19 | 20 | } // namespace at::native::xpu 21 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/FFTKernelFunctor.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace at { 4 | namespace native { 5 | namespace xpu { 6 | 7 | void _fft_fill_with_conjugate_symmetry_xpu( 8 | ScalarType dtype, 9 | IntArrayRef mirror_dims, 10 | IntArrayRef signal_half_sizes, 11 | IntArrayRef in_strides, 12 | const void* in_data, 13 | IntArrayRef out_strides, 14 | void* out_data); 15 | 16 | template <typename scalar_t, typename inp_calc_t, typename out_calc_t> 17 | void _fft_conjugate_copy_kernel( 18 | int64_t numel, 19 | scalar_t* out_data, 20 | const scalar_t* in_data, 21 | inp_calc_t ic, 22 | out_calc_t oc); 23 | 24 | void _fft_fill_with_conjugate_symmetry_(const Tensor& input, IntArrayRef dim_); 25 | 26 | } // namespace xpu 27 | } // namespace native 28 | } // namespace at 29 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/FillKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | 7 | #include 8 | 9 | namespace at { 10 | namespace native { 11 | namespace xpu { 12 | 13 | template <typename scalar_t> 14 | struct FillFunctor { 15 | scalar_t operator()() const { 16 | return val_; 17 | } 18 | FillFunctor(scalar_t val) : val_(val) {} 19 | 20 | private: 21 | scalar_t val_; 22 | }; 23 | 24 | void fill_kernel(TensorIterator& iter, const Scalar& value) { 25 | AT_DISPATCH_V2( 26 | iter.dtype(), 27 | "fill_xpu", 28 | AT_WRAP([&]() { 29 | gpu_kernel(iter, FillFunctor<scalar_t>(value.to<scalar_t>())); 30 | }), 31 | AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), 32 | kComplexHalf, 33 | kBool, 34 | kHalf, 35 | kBFloat16, 36 | AT_EXPAND(AT_FLOAT8_TYPES), 37 | AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); 38 | } 39 | 40 | } // namespace xpu 41 | } // namespace native 42 | } // namespace at 43 |
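fill_kernel above runs a nullary functor over every element of the iterator, and its AT_DISPATCH_V2 list is what extends Tensor.fill_ on XPU to float8 and the barebones unsigned types. A hedged sketch of the calling side (the wrapper is hypothetical; nullary_op is the standard TensorIterator factory for output-only kernels):

#include <ATen/core/Tensor.h>
#include <ATen/native/TensorIterator.h>

at::Tensor& fill_example(at::Tensor& self, const at::Scalar& value) {
  auto iter = at::TensorIterator::nullary_op(self);
  at::native::xpu::fill_kernel(iter, value);
  return self;
}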
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/FillKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at { 6 | namespace native { 7 | namespace xpu { 8 | 9 | TORCH_XPU_API void fill_kernel(TensorIterator& iter, const Scalar& scalar); 10 | 11 | } 12 | } // namespace native 13 | } // namespace at 14 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ForeachCopyKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void foreach_copy_list_kernel_(TensorList self, TensorList src); 7 | 8 | } // namespace at::native::xpu 9 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ForeachPointwiseOpListKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include 5 | 6 | namespace at::native::xpu { 7 | 8 | #define FOREACH_POINTWISE_OP_TENSOR_KERNEL(NAME) \ 9 | FOREACH_POINTWISE_OP_SCALARLIST_KERNEL(NAME) 10 | 11 | #define FOREACH_POINTWISE_OP_TENSOR_INPLACE_KERNEL(NAME) \ 12 | FOREACH_POINTWISE_OP_SCALARLIST_INPLACE_KERNEL(NAME) 13 | 14 | TORCH_XPU_API FOREACH_POINTWISE_OP_TENSOR_KERNEL(addcmul); 15 | TORCH_XPU_API FOREACH_POINTWISE_OP_TENSOR_INPLACE_KERNEL(addcmul); 16 | TORCH_XPU_API FOREACH_POINTWISE_OP_TENSOR_KERNEL(addcdiv); 17 | TORCH_XPU_API FOREACH_POINTWISE_OP_TENSOR_INPLACE_KERNEL(addcdiv); 18 | 19 | } // namespace at::native::xpu 20 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ForeachPointwiseOpScalarKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | #define FOREACH_POINTWISE_OP_SCALAR_KERNEL(NAME) \ 7 | std::vector<Tensor> foreach_##NAME##_kernel( \ 8 | TensorList input, \ 9 | TensorList tensors1, \ 10 | TensorList tensors2, \ 11 | const Scalar& scalar) 12 | 13 | #define FOREACH_POINTWISE_OP_SCALAR_INPLACE_KERNEL(NAME) \ 14 | void foreach_##NAME##_kernel_( \ 15 | TensorList input, \ 16 | TensorList tensors1, \ 17 | TensorList tensors2, \ 18 | const Scalar& scalar) 19 | 20 | TORCH_XPU_API FOREACH_POINTWISE_OP_SCALAR_KERNEL(addcmul); 21 | TORCH_XPU_API FOREACH_POINTWISE_OP_SCALAR_INPLACE_KERNEL(addcmul); 22 | TORCH_XPU_API FOREACH_POINTWISE_OP_SCALAR_KERNEL(addcdiv); 23 | TORCH_XPU_API FOREACH_POINTWISE_OP_SCALAR_INPLACE_KERNEL(addcdiv); 24 | 25 | } // namespace at::native::xpu 26 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ForeachReduceKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API std::vector<Tensor> foreach_norm_kernel( 7 | TensorList tensors, 8 | const Scalar& ord, 9 | double p, 10 | std::optional<ScalarType> dtype); 11 | 12 | TORCH_XPU_API std::vector<Tensor> foreach_max_kernel(TensorList tensors); 13 | 14 | } // namespace at::native::xpu 15 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ForeachTernaryOpListKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void foreach_lerp_list_kernel( 7 | TensorList tensors1, 8 | TensorList tensors2, 9 | TensorList tensors3, 10 | TensorList result); 11 | 12 |
TORCH_XPU_API void foreach_lerp_list_kernel_( 13 | TensorList tensors1, 14 | TensorList tensors2, 15 | TensorList tensors3); 16 | 17 | } // namespace at::native::xpu 18 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ForeachTernaryOpScalarKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void foreach_lerp_scalar_kernel( 7 | TensorList tensors1, 8 | TensorList tensors2, 9 | const Scalar& weight, 10 | TensorList result); 11 | 12 | TORCH_XPU_API void foreach_lerp_scalar_kernel_( 13 | TensorList tensors1, 14 | TensorList tensors2, 15 | const Scalar& weight); 16 | 17 | } // namespace at::native::xpu 18 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ForeachTernaryOpScalarListKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void foreach_lerp_scalarlist_kernel( 7 | TensorList tensors1, 8 | TensorList tensors2, 9 | at::ArrayRef scalars, 10 | TensorList result); 11 | 12 | TORCH_XPU_API void foreach_lerp_scalarlist_kernel_( 13 | TensorList tensors1, 14 | TensorList tensors2, 15 | at::ArrayRef scalars); 16 | 17 | } // namespace at::native::xpu 18 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/FractionalMaxPool2dKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void fractional_max_pool2d_kernel( 7 | const Tensor& input, 8 | IntArrayRef pool_size, 9 | IntArrayRef output_size, 10 | const Tensor& randomSamples, 11 | const Tensor& output, 12 | const Tensor& indices); 13 | 14 | TORCH_XPU_API void fractional_max_pool2d_backward_kernel( 15 | const Tensor& gradOutput, 16 | const Tensor& input, 17 | IntArrayRef pool_size /* unused */, 18 | IntArrayRef output_size, 19 | const Tensor& indices, 20 | const Tensor& gradInput); 21 | 22 | } // namespace at::native::xpu 23 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/FractionalMaxPool3dKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void fractional_max_pool3d_kernel( 8 | const Tensor& input, 9 | int64_t poolSizeT, 10 | int64_t poolSizeH, 11 | int64_t poolSizeW, 12 | int64_t outputT, 13 | int64_t outputH, 14 | int64_t outputW, 15 | const Tensor& randomSamples, 16 | int64_t numBatch, 17 | int64_t numPlanes, 18 | int64_t inputT, 19 | int64_t inputH, 20 | int64_t inputW, 21 | const Tensor& output, 22 | const Tensor& indices); 23 | 24 | TORCH_XPU_API void fractional_max_pool3d_backward_kernel( 25 | Tensor& gradInput, 26 | const Tensor& gradOutput, 27 | const Tensor& input, 28 | IntArrayRef output_size, 29 | const Tensor& indices); 30 | 31 | } // namespace at::native::xpu 32 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/FunctionOfAMatrixUtilsKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void 
_compute_linear_combination_kernel( 7 | TensorIterator& iter, 8 | int64_t in_stride, 9 | int64_t coeff_stride, 10 | int64_t num_summations); 11 | 12 | } // namespace at::native::xpu 13 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/FusedSgdKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void fused_sgd_kernel( 7 | at::TensorList params, 8 | at::TensorList grads, 9 | const double weight_decay, 10 | const double momentum, 11 | const float* lr_ptr, 12 | const double lr, 13 | const double dampening, 14 | const bool nesterov, 15 | const bool maximize, 16 | const bool is_first_step, 17 | const float* grad_scale_ptr, 18 | const float* found_inf_ptr); 19 | 20 | TORCH_XPU_API void fused_sgd_with_momentum_kernel( 21 | at::TensorList params, 22 | at::TensorList grads, 23 | at::TensorList momentum_buffer_list, 24 | const double weight_decay, 25 | const double momentum, 26 | const float* lr_ptr, 27 | const double lr, 28 | const double dampening, 29 | const bool nesterov, 30 | const bool maximize, 31 | const bool is_first_step, 32 | const float* grad_scale_ptr, 33 | const float* found_inf_ptr); 34 | 35 | } // namespace at::native::xpu 36 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/GcdLcmKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at { 6 | namespace native { 7 | namespace xpu { 8 | 9 | TORCH_XPU_API void gcd_kernel(TensorIteratorBase& iter); 10 | 11 | TORCH_XPU_API void lcm_kernel(TensorIteratorBase& iter); 12 | 13 | } // namespace xpu 14 | } // namespace native 15 | } // namespace at 16 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/GroupNormKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void group_norm_kernel( 8 | const Tensor& X, 9 | const Tensor& gamma, 10 | const Tensor& beta, 11 | int64_t N, 12 | int64_t C, 13 | int64_t HxW, 14 | int64_t group, 15 | double eps, 16 | Tensor& Y, 17 | Tensor& mean, 18 | Tensor& rstd); 19 | 20 | TORCH_XPU_API void group_norm_backward_kernel( 21 | const Tensor& dY, 22 | const Tensor& X, 23 | const Tensor& mean, 24 | const Tensor& rstd, 25 | const Tensor& gamma, 26 | int64_t N, 27 | int64_t C, 28 | int64_t HxW, 29 | int64_t group, 30 | Tensor& dX, 31 | Tensor& dgamma, 32 | Tensor& dbeta); 33 | 34 | } // namespace at::native::xpu 35 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/HermitePolynomialHKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void hermite_polynomial_h_kernel(TensorIteratorBase& iterator); 7 | 8 | } // namespace at::native::xpu 9 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/HermitePolynomialHeKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void hermite_polynomial_he_kernel(TensorIteratorBase& iterator); 7 | 8 | } // namespace 
at::native::xpu 9 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/HistogramKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void histogramdd_kernel( 8 | const Tensor& self, 9 | const std::optional<Tensor>& weight, 10 | bool density, 11 | Tensor& hist, 12 | const TensorList& bin_edges_); 13 | 14 | TORCH_XPU_API void histogramdd_linear_kernel( 15 | const Tensor& self, 16 | const std::optional<Tensor>& weight, 17 | bool density, 18 | Tensor& hist, 19 | const TensorList& bin_edges_, 20 | bool local_search); 21 | 22 | TORCH_XPU_API void histogram_select_outer_bin_edges_kernel( 23 | const Tensor& input, 24 | const int64_t N, 25 | std::vector<double>& leftmost_edges, 26 | std::vector<double>& rightmost_edges); 27 | 28 | } // namespace at::native::xpu 29 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/IGammaKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void igamma_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void igammac_kernel(TensorIteratorBase& iter); 10 | 11 | } // namespace at::native::xpu 12 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/Im2ColKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void im2col_kernel( 8 | Tensor& output, 9 | const Tensor& input_, 10 | IntArrayRef kernel_size, 11 | IntArrayRef dilation, 12 | IntArrayRef padding, 13 | IntArrayRef stride); 14 | 15 | } // namespace at::native::xpu 16 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/KernelUtils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #define XPU_KERNEL_LOOP_TYPE(item, i, n, index_type) \ 7 | int64_t _i_n_d_e_x = \ 8 | item.get_group(0) * item.get_local_range(0) + item.get_local_id(0); \ 9 | for (index_type i = _i_n_d_e_x; _i_n_d_e_x < (n); \ 10 | _i_n_d_e_x += item.get_local_range(0) * item.get_group_range(0), \ 11 | i = _i_n_d_e_x) 12 | 13 | #define XPU_KERNEL_LOOP(item, i, n) XPU_KERNEL_LOOP_TYPE(item, i, n, int) 14 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/LaguerrePolynomialLKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void laguerre_polynomial_l_kernel(TensorIteratorBase& iterator); 7 | 8 | } // namespace at::native::xpu 9 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/LaunchUtils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native { 5 | namespace xpu { 6 | // returns 2**floor(log2(n)) 7 | static int lastPow2(unsigned int n) { 8 | n |= (n >> 1); 9 | n |= (n >> 2); 10 | n |= (n >> 4); 11 | n |= (n >> 8); 12 | n |= (n >> 16); 13 | return std::max<int>(1, n - (n >> 1)); 14 | } 15 | } // namespace xpu 16 | } // namespace at::native
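The XPU_KERNEL_LOOP macro in KernelUtils.h above implements a grid-stride loop: each work-item starts at its global linear index and advances by the total number of launched work-items until it passes n, so a fixed-size launch covers a tensor of any length. Below is a minimal sketch of how a SYCL functor might use it; the functor and its fields are hypothetical, not from this repository. (lastPow2 from LaunchUtils.h, by contrast, simply rounds a size down to a power of two, e.g. lastPow2(1023) == 512.)

#include <sycl/sycl.hpp>
// Assumes the XPU_KERNEL_LOOP macro from KernelUtils.h above is in scope.

struct FillValueFunctor {
  void operator()(sycl::nd_item<1> item) const {
    // i visits global_id, global_id + total_work_items, ... while i < n_.
    XPU_KERNEL_LOOP(item, i, n_) {
      out_[i] = value_;
    }
  }
  float* out_;   // device pointer to n_ elements
  float value_;  // value broadcast to every element
  int64_t n_;    // number of elements to fill
};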
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/LayerNormKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at { 6 | namespace native { 7 | namespace xpu { 8 | 9 | TORCH_XPU_API void layer_norm_kernel( 10 | const Tensor& X, 11 | const Tensor& gamma, 12 | const Tensor& beta, 13 | int64_t M, 14 | int64_t N, 15 | double eps, 16 | Tensor* Y, 17 | Tensor* mean, 18 | Tensor* rstd); 19 | 20 | TORCH_XPU_API std::tuple layer_norm_backward_kernel( 21 | const Tensor& dY, 22 | const Tensor& X, 23 | const Tensor& mean, 24 | const Tensor& rstd, 25 | const Tensor& gamma, 26 | int64_t M, 27 | int64_t N, 28 | Tensor& dX, 29 | Tensor& dgamma, 30 | Tensor& dbeta, 31 | std::array grad_input_mask); 32 | 33 | } // namespace xpu 34 | } // namespace native 35 | } // namespace at 36 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/LegendrePolynomialPKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void legendre_polynomial_p_kernel(TensorIteratorBase& iterator); 7 | 8 | } // namespace at::native::xpu 9 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/LerpKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void lerp_tensor_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void lerp_scalar_kernel( 10 | TensorIteratorBase& iter, 11 | const c10::Scalar& weight); 12 | 13 | } // namespace at::native::xpu 14 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/LinearAlgebraKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void addr_kernel( 7 | TensorIterator& iter, 8 | const Scalar& beta, 9 | const Scalar& alpha); 10 | 11 | } // namespace at::native::xpu 12 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/LinearInt4.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void linear_int4_kernel( 8 | const Tensor& input, 9 | const Tensor& weight, 10 | int qGroupSize, 11 | const Tensor& weight_scale_zero_point, 12 | Tensor& output); 13 | 14 | } // namespace at::native::xpu 15 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/LogAddExpKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void logaddexp_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void logaddexp2_kernel(TensorIteratorBase& iter); 10 | 11 | } // namespace at::native::xpu 12 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/LossCTCKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | 
TORCH_XPU_API std::tuple ctc_loss_kernel( 7 | const Tensor& log_probs, 8 | const Tensor& targets, 9 | IntArrayRef input_lengths, 10 | IntArrayRef target_lengths, 11 | int64_t BLANK, 12 | bool zero_infinity); 13 | 14 | TORCH_XPU_API Tensor ctc_loss_backward_kernel( 15 | const Tensor& grad, 16 | const Tensor& log_probs, 17 | const Tensor& targets, 18 | IntArrayRef input_lengths, 19 | IntArrayRef target_lengths, 20 | const Tensor& neg_log_likelihood, 21 | const Tensor& log_alpha, 22 | int64_t BLANK, 23 | bool zero_infinity); 24 | 25 | } // namespace at::native::xpu 26 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/LossKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API Tensor& binary_cross_entropy_kernel( 7 | const Tensor& input, 8 | const Tensor& target, 9 | const Tensor& weight, 10 | int64_t reduction, 11 | Tensor& loss); 12 | 13 | TORCH_XPU_API Tensor& binary_cross_entropy_backward_kernel( 14 | const Tensor& grad, 15 | const Tensor& input, 16 | const Tensor& target, 17 | const Tensor& weight, 18 | int64_t reduction, 19 | Tensor& grad_input); 20 | 21 | } // namespace at::native::xpu 22 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/LossNLL2dKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void nll_loss2d_forward_kernel( 8 | Tensor& output, 9 | Tensor& total_weight, 10 | const Tensor& input, 11 | const Tensor& target, 12 | const Tensor& weight, 13 | int64_t reduction, 14 | int64_t ignore_index); 15 | 16 | TORCH_XPU_API void nll_loss2d_backward_kernel( 17 | Tensor& grad_input, 18 | const Tensor& grad_output, 19 | const Tensor& input, 20 | const Tensor& target, 21 | const Tensor& weight, 22 | int64_t reduction, 23 | int64_t ignore_index, 24 | const Tensor& total_weight); 25 | 26 | } // namespace at::native::xpu 27 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/LossNLLKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void nll_loss_forward_kernel( 7 | const Tensor& self, 8 | const Tensor& target, 9 | const OptionalTensorRef weight_opt, 10 | int64_t reduction, 11 | int64_t ignore_index, 12 | const Tensor& output, 13 | const Tensor& total_weight); 14 | 15 | TORCH_XPU_API void nll_loss_backward_kernel( 16 | const Tensor& grad_output, 17 | const Tensor& self, 18 | const Tensor& target, 19 | const OptionalTensorRef weight_opt, 20 | int64_t reduction, 21 | int64_t ignore_index, 22 | const Tensor& total_weight, 23 | const Tensor& grad_input); 24 | 25 | } // namespace at::native::xpu 26 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/MaxMinElementwiseKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void maximum_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void minimum_kernel(TensorIteratorBase& iter); 10 | 11 | TORCH_XPU_API void fmax_kernel(TensorIteratorBase& iter); 12 | 13 | TORCH_XPU_API void fmin_kernel(TensorIteratorBase& 
iter); 14 | 15 | } // namespace at::native::xpu 16 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/MaxUnpoolingKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API Tensor& max_unpooling2d_forward_kernel( 7 | Tensor& output, 8 | const Tensor& self_, 9 | const Tensor& indices_, 10 | IntArrayRef output_size); 11 | 12 | TORCH_XPU_API Tensor& max_unpooling3d_forward_kernel( 13 | Tensor& output, 14 | const Tensor& self_, 15 | const Tensor& indices_, 16 | IntArrayRef output_size, 17 | IntArrayRef stride, 18 | IntArrayRef padding); 19 | 20 | } // namespace at::native::xpu 21 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ModifiedBesselI0Kernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace at::native::xpu { 10 | 11 | template <typename scalar_t> 12 | struct ModifiedBesselI0Functor { 13 | scalar_t operator()(scalar_t a) const { 14 | return modified_bessel_i0_forward(a); 15 | } 16 | }; 17 | 18 | void modified_bessel_i0_kernel(TensorIteratorBase& iter) { 19 | AT_DISPATCH_FLOATING_TYPES( 20 | iter.common_dtype(), "modified_bessel_i0_xpu", [&]() { 21 | gpu_kernel(iter, ModifiedBesselI0Functor<scalar_t>()); 22 | }); 23 | } 24 | 25 | } // namespace at::native::xpu 26 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ModifiedBesselI0Kernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void modified_bessel_i0_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ModifiedBesselI1Kernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace at::native::xpu { 10 | 11 | template <typename scalar_t> 12 | struct ModifiedBesselI1Functor { 13 | scalar_t operator()(scalar_t a) const { 14 | return modified_bessel_i1_forward(a); 15 | } 16 | }; 17 | 18 | void modified_bessel_i1_kernel(TensorIteratorBase& iter) { 19 | AT_DISPATCH_FLOATING_TYPES( 20 | iter.common_dtype(), "modified_bessel_i1_xpu", [&]() { 21 | gpu_kernel(iter, ModifiedBesselI1Functor<scalar_t>()); 22 | }); 23 | } 24 | 25 | } // namespace at::native::xpu 26 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ModifiedBesselI1Kernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void modified_bessel_i1_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ModifiedBesselK0Kernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace at::native::xpu { 10 | 11 | template <typename scalar_t> 12 | struct ModifiedBesselK0Functor { 13 | scalar_t operator()(scalar_t a) const { 14 | return modified_bessel_k0_forward(a); 15 | } 16 | }; 17 | 18 | void modified_bessel_k0_kernel(TensorIteratorBase& iter) { 19 | AT_DISPATCH_FLOATING_TYPES( 20 | iter.common_dtype(), "modified_bessel_k0_xpu", [&]() { 21 | gpu_kernel(iter, ModifiedBesselK0Functor<scalar_t>()); 22 | }); 23 | } 24 | 25 | } // namespace at::native::xpu 26 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ModifiedBesselK0Kernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void modified_bessel_k0_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ModifiedBesselK1Kernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace at::native::xpu { 10 | 11 | template <typename scalar_t> 12 | struct ModifiedBesselK1Functor { 13 | scalar_t operator()(scalar_t a) const { 14 | return modified_bessel_k1_forward(a); 15 | } 16 | }; 17 | 18 | void modified_bessel_k1_kernel(TensorIteratorBase& iter) { 19 | AT_DISPATCH_FLOATING_TYPES( 20 | iter.common_dtype(), "modified_bessel_k1_xpu", [&]() { 21 | gpu_kernel(iter, ModifiedBesselK1Functor<scalar_t>()); 22 | }); 23 | } 24 | 25 | } // namespace at::native::xpu 26 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ModifiedBesselK1Kernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void modified_bessel_k1_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/MultiLabelMarginLossKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void multilabel_margin_loss_kernel( 7 | const Tensor& input, 8 | const Tensor& target, 9 | int64_t reduction, 10 | Tensor& output, 11 | Tensor& is_target); 12 | 13 | TORCH_XPU_API void multilabel_margin_loss_backward_kernel( 14 | const Tensor& grad_output, 15 | const Tensor& input, 16 | const Tensor& target, 17 | int64_t reduction, 18 | const Tensor& is_target, 19 | Tensor& grad_input); 20 | 21 | } // namespace at::native::xpu
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/MultiMarginLossKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API Tensor& multi_margin_loss_kernel( 7 | const Tensor& input, 8 | const Tensor& target, 9 | const Scalar& p, 10 | const Scalar& margin, 11 | const std::optional<Tensor>& weight, 12 | int64_t reduction, 13 | Tensor& out); 14 | 15 | TORCH_XPU_API Tensor& multi_margin_loss_backward_kernel( 16 | const Tensor& grad_output, 17 | const Tensor& input, 18 | const Tensor& target, 19 | const Scalar& p, 20 | const Scalar& margin, 21 | const std::optional<Tensor>& weight, 22 | int64_t reduction, 23 | Tensor& grad_input); 24 | 25 | } // namespace at::native::xpu 26 |
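The four ModifiedBessel*Kernel.cpp files above all instantiate the same elementwise pattern: a stateless per-element functor, an AT_DISPATCH_FLOATING_TYPES switch on the iterator's common dtype (which defines scalar_t inside the lambda), and a launch through the shared gpu_kernel helper. A minimal sketch of that skeleton for a hypothetical op follows; my_op_forward and the other names are placeholders, and includes are elided here just as they are in the files above.

namespace at::native::xpu {

template <typename scalar_t>
struct MyOpFunctor {
  scalar_t operator()(scalar_t a) const {
    return my_op_forward(a); // placeholder for the scalar math routine
  }
};

void my_op_kernel(TensorIteratorBase& iter) {
  // AT_DISPATCH_FLOATING_TYPES binds scalar_t to the runtime dtype,
  // then gpu_kernel applies the functor once per element.
  AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "my_op_xpu", [&]() {
    gpu_kernel(iter, MyOpFunctor<scalar_t>());
  });
}

} // namespace at::native::xpu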
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/MultinomialKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void multinomial_kernel( 7 | Tensor& result, 8 | const Tensor& self, 9 | const int64_t n_sample, 10 | std::optional generator); 11 | 12 | } // namespace at::native::xpu 13 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/NMSKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at { 6 | namespace native { 7 | namespace xpu { 8 | 9 | TORCH_XPU_API Tensor nms_kernel(const Tensor& dets_sorted, float iou_threshold); 10 | 11 | } 12 | } // namespace native 13 | } // namespace at 14 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/NonzeroKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void nonzero_kernel(const Tensor& self, Tensor& out); 7 | 8 | } // namespace at::native::xpu 9 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/PointwiseOpsKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void addcmul_kernel( 8 | TensorIteratorBase& iter, 9 | const Scalar& value); 10 | 11 | TORCH_XPU_API void addcdiv_kernel( 12 | TensorIteratorBase& iter, 13 | const Scalar& value); 14 | 15 | TORCH_XPU_API void mse_backward_kernel( 16 | TensorIterator& iter, 17 | const Scalar& value); 18 | 19 | TORCH_XPU_API void smooth_l1_backward_kernel( 20 | TensorIterator& iter, 21 | const Scalar& norm, 22 | double beta); 23 | 24 | TORCH_XPU_API void huber_backward_kernel( 25 | TensorIterator& iter, 26 | const Scalar& norm, 27 | double delta); 28 | 29 | } // namespace at::native::xpu 30 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/PowKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at { 6 | namespace native { 7 | namespace xpu { 8 | 9 | TORCH_XPU_API void pow_tensor_scalar_kernel( 10 | TensorIteratorBase& iter, 11 | const Scalar& exp_scalar); 12 | 13 | TORCH_XPU_API void pow_tensor_tensor_kernel(TensorIteratorBase& iter); 14 | 15 | } // namespace xpu 16 | } // namespace native 17 | } // namespace at 18 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/PsRoiAlignKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API std::tuple ps_roi_align_kernel( 7 | const at::Tensor& input, 8 | const at::Tensor& rois, 9 | double spatial_scale, 10 | int64_t pooled_height, 11 | int64_t pooled_width, 12 | int64_t sampling_ratio); 13 | 14 | TORCH_XPU_API Tensor ps_roi_align_backward_kernel( 15 | const at::Tensor& grad, 16 | const at::Tensor& rois, 17 | const at::Tensor& channel_mapping, 18 | double spatial_scale, 19 | int64_t pooled_height, 20 | int64_t pooled_width, 21 | int64_t sampling_ratio, 22 
| int64_t batch_size, 23 | int64_t channels, 24 | int64_t height, 25 | int64_t width); 26 | } // namespace at::native::xpu 27 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/PsRoiPoolKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API std::tuple ps_roi_pool_kernel( 7 | const at::Tensor& input, 8 | const at::Tensor& rois, 9 | double spatial_scale, 10 | int64_t pooled_height, 11 | int64_t pooled_width); 12 | 13 | TORCH_XPU_API Tensor ps_roi_pool_backward_kernel( 14 | const at::Tensor& grad, 15 | const at::Tensor& rois, 16 | const at::Tensor& channel_mapping, 17 | double spatial_scale, 18 | int64_t pooled_height, 19 | int64_t pooled_width, 20 | int64_t batch_size, 21 | int64_t channels, 22 | int64_t height, 23 | int64_t width); 24 | } // namespace at::native::xpu -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/RandpermKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API Tensor 7 | randperm_kernel(Tensor& result, int64_t n, std::optional generator); 8 | 9 | } // namespace at::native::xpu 10 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/RangeFactoriesKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API Tensor& arange_kernel( 8 | const Scalar& start, 9 | const Scalar& end, 10 | const Scalar& step, 11 | Tensor& result); 12 | 13 | TORCH_XPU_API Tensor& range_kernel( 14 | const Scalar& start, 15 | const Scalar& end, 16 | const Scalar& step, 17 | Tensor& result); 18 | 19 | TORCH_XPU_API Tensor& linspace_kernel( 20 | const Scalar& start, 21 | const Scalar& end, 22 | int64_t steps, 23 | Tensor& result); 24 | 25 | TORCH_XPU_API Tensor& logspace_kernel( 26 | const Scalar& start, 27 | const Scalar& end, 28 | int64_t steps, 29 | double base, 30 | Tensor& result); 31 | 32 | } // namespace at::native::xpu 33 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void max_values_kernel(TensorIterator& iter); 8 | 9 | TORCH_XPU_API void max_kernel(TensorIterator& iter); 10 | 11 | TORCH_XPU_API void max_all_kernel(TensorIterator& iter); 12 | 13 | } // namespace at::native::xpu 14 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ReduceMinValuesKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void min_values_kernel(TensorIterator& iter); 8 | 9 | TORCH_XPU_API void min_kernel(TensorIterator& iter); 10 | 11 | TORCH_XPU_API void min_all_kernel(TensorIterator& iter); 12 | 13 | } // namespace at::native::xpu 14 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ReduceNormKernel.h: -------------------------------------------------------------------------------- 1 | 
#pragma once 2 | #include 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void norm_kernel(TensorIterator& iter, const Scalar& val); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ReduceOps.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | void argmax_kernel(TensorIterator& iter); 8 | 9 | void and_kernel(TensorIterator& iter); 10 | 11 | void or_kernel(TensorIterator& iter); 12 | 13 | void mean_kernel(TensorIterator& iter); 14 | 15 | void sum_kernel(TensorIterator& iter); 16 | 17 | } // namespace at::native::xpu 18 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ReduceOpsKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void argmax_kernel(TensorIterator& iter); 8 | 9 | TORCH_XPU_API void argmin_kernel(TensorIterator& iter); 10 | 11 | TORCH_XPU_API void and_kernel(TensorIterator& iter); 12 | 13 | TORCH_XPU_API void or_kernel(TensorIterator& iter); 14 | 15 | TORCH_XPU_API void mean_kernel(TensorIterator& iter); 16 | 17 | TORCH_XPU_API void sum_kernel(TensorIterator& iter); 18 | 19 | TORCH_XPU_API void prod_kernel(TensorIterator& iter); 20 | 21 | TORCH_XPU_API void nansum_kernel(TensorIterator& iter); 22 | 23 | TORCH_XPU_API void std_var_kernel( 24 | TensorIterator& iter, 25 | double correction, 26 | bool take_sqrt); 27 | 28 | TORCH_XPU_API void aminmax_kernel(TensorIterator& iter); 29 | 30 | TORCH_XPU_API void aminmax_allreduce_kernel(TensorIterator& iter); 31 | 32 | } // namespace at::native::xpu 33 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/RenormKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | 7 | namespace at::native::xpu { 8 | 9 | template <typename scalar_t> 10 | struct RenormScalarFactorFunctor { 11 | scalar_t operator()(scalar_t norm) const { 12 | const auto eps = static_cast<scalar_t>(1e-7); 13 | const auto one = static_cast<scalar_t>(1.0); 14 | return (norm > maxnorm_elm) ? maxnorm_elm / (norm + eps) : one; 15 | } 16 | 17 | RenormScalarFactorFunctor(scalar_t maxnorm_elm) : maxnorm_elm(maxnorm_elm) {} 18 | 19 | private: 20 | scalar_t maxnorm_elm; 21 | }; 22 | 23 | void renorm_scale_factor_kernel(TensorIteratorBase& iter, double maxnorm) { 24 | AT_DISPATCH_FLOATING_TYPES_AND2( 25 | at::ScalarType::Half, 26 | at::ScalarType::BFloat16, 27 | iter.common_dtype(), 28 | "renorm_scale_factor_xpu", 29 | [&] { 30 | RenormScalarFactorFunctor<scalar_t> f(maxnorm); 31 | gpu_kernel(iter, f); 32 | }); 33 | } 34 | 35 | } // namespace at::native::xpu 36 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/RenormKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void renorm_scale_factor_kernel( 7 | TensorIteratorBase& iter, 8 | double maxnorm); 9 | 10 | } 11 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/RepeatKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | namespace at::native::xpu { 4 | 5 | TORCH_XPU_API Tensor repeat_interleave_kernel( 6 | const Tensor& repeats, 7 | std::optional<int64_t> output_size); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ResizeKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API TensorImpl* resize_impl_xpu_( 8 | TensorImpl* self, 9 | IntArrayRef size, 10 | at::OptionalIntArrayRef stride, 11 | bool device_guard = true); 12 | 13 | } 14 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/RoiAlignKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API Tensor roi_align_kernel( 7 | const at::Tensor& input, 8 | const at::Tensor& rois, 9 | double spatial_scale, 10 | int64_t pooled_height, 11 | int64_t pooled_width, 12 | int64_t sampling_ratio, 13 | bool aligned); 14 | 15 | TORCH_XPU_API Tensor roi_align_backward_kernel( 16 | const at::Tensor& grad, 17 | const at::Tensor& rois, 18 | double spatial_scale, 19 | int64_t pooled_height, 20 | int64_t pooled_width, 21 | int64_t batch_size, 22 | int64_t channels, 23 | int64_t height, 24 | int64_t width, 25 | int64_t sampling_ratio, 26 | bool aligned); 27 | } // namespace at::native::xpu
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/RoiPoolKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API std::tuple<Tensor, Tensor> roi_pool_kernel( 7 | const at::Tensor& input, 8 | const at::Tensor& rois, 9 | double spatial_scale, 10 | int64_t pooled_height, 11 | int64_t pooled_width); 12 | 13 | TORCH_XPU_API Tensor roi_pool_backward_kernel( 14 | const at::Tensor& grad, 15 | const at::Tensor& rois, 16 | const at::Tensor& argmax, 17 | double spatial_scale, 18 | int64_t pooled_height, 19 | int64_t pooled_width, 20 | int64_t batch_size, 21 | int64_t channels, 22 | int64_t height, 23 | int64_t width); 24 | } // namespace at::native::xpu
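RenormKernel.cpp above is the stateful variant of the same elementwise pattern: instead of a stateless functor, the functor stores a scalar that is captured once per dispatch and then reused for every element. A minimal sketch under the same assumptions (hypothetical names, elided includes, and the same dispatch/launch helpers used by the surrounding files):

namespace at::native::xpu {

template <typename scalar_t>
struct ScaleByFunctor {
  scalar_t operator()(scalar_t x) const {
    return x * scale_; // per-element work reuses the captured scalar
  }
  ScaleByFunctor(scalar_t scale) : scale_(scale) {}

 private:
  scalar_t scale_; // captured once per dispatch, not per element
};

void scale_by_kernel(TensorIteratorBase& iter, double scale) {
  // The AND2 dispatch also covers the reduced-precision dtypes,
  // mirroring renorm_scale_factor_kernel above.
  AT_DISPATCH_FLOATING_TYPES_AND2(
      at::ScalarType::Half,
      at::ScalarType::BFloat16,
      iter.common_dtype(),
      "scale_by_xpu",
      [&] {
        ScaleByFunctor<scalar_t> f(static_cast<scalar_t>(scale));
        gpu_kernel(iter, f);
      });
}

} // namespace at::native::xpu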
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/RreluWithNoiseKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace at::native::xpu { 7 | 8 | TORCH_XPU_API Tensor& rrelu_with_noise_kernel( 9 | const Tensor& self, 10 | Tensor& noise, 11 | const Scalar& lower, 12 | const Scalar& upper, 13 | bool training, 14 | std::optional generator, 15 | Tensor& output); 16 | 17 | } 18 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ScaledModifiedBesselK0Kernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace at::native::xpu { 10 | 11 | template 12 | struct ScaledModifiedBesselK0Functor { 13 | scalar_t operator()(scalar_t a) const { 14 | return scaled_modified_bessel_k0_forward(a); 15 | } 16 | }; 17 | 18 | void scaled_modified_bessel_k0_kernel(TensorIteratorBase& iter) { 19 | AT_DISPATCH_FLOATING_TYPES( 20 | iter.common_dtype(), "scaled_modified_bessel_k0_xpu", [&]() { 21 | gpu_kernel(iter, ScaledModifiedBesselK0Functor()); 22 | }); 23 | } 24 | 25 | } // namespace at::native::xpu 26 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ScaledModifiedBesselK0Kernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void scaled_modified_bessel_k0_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ScaledModifiedBesselK1Kernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace at::native::xpu { 10 | 11 | template 12 | struct ScaledModifiedBesselK1Functor { 13 | scalar_t operator()(scalar_t a) const { 14 | return scaled_modified_bessel_k1_forward(a); 15 | } 16 | }; 17 | 18 | void scaled_modified_bessel_k1_kernel(TensorIteratorBase& iter) { 19 | AT_DISPATCH_FLOATING_TYPES( 20 | iter.common_dtype(), "scaled_modified_bessel_k1_xpu", [&]() { 21 | gpu_kernel(iter, ScaledModifiedBesselK1Functor()); 22 | }); 23 | } 24 | 25 | } // namespace at::native::xpu 26 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ScaledModifiedBesselK1Kernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void scaled_modified_bessel_k1_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ShapeKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void cat_out_kernel( 8 | const ITensorListRef& tensors, 9 | int64_t dim, 10 | int64_t valid, 11 | bool all_contiguous, 12 | bool all_same_dtype, 13 | bool all_same_sizes_and_stride, 14 | MemoryFormat memory_format, 15 | const Tensor& result); 16 | 17 | } // namespace 
at::native::xpu 18 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ShiftedChebyshevPolynomialKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void shifted_chebyshev_polynomial_t_kernel( 8 | TensorIteratorBase& iterator); 9 | 10 | TORCH_XPU_API void shifted_chebyshev_polynomial_u_kernel( 11 | TensorIteratorBase& iterator); 12 | 13 | TORCH_XPU_API void shifted_chebyshev_polynomial_v_kernel( 14 | TensorIteratorBase& iterator); 15 | 16 | TORCH_XPU_API void shifted_chebyshev_polynomial_w_kernel( 17 | TensorIteratorBase& iterator); 18 | 19 | } // namespace at::native::xpu 20 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ShiftedChebyshevPolynomialTKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace at::native::xpu { 8 | 9 | template <typename scalar_t> 10 | struct ShiftedChebyshevPolynomialTFunctor { 11 | scalar_t operator()(scalar_t x, scalar_t n) const { 12 | return shifted_chebyshev_polynomial_t_forward(x, n); 13 | } 14 | }; 15 | 16 | void shifted_chebyshev_polynomial_t_kernel(TensorIteratorBase& iterator) { 17 | AT_DISPATCH_FLOATING_TYPES( 18 | iterator.common_dtype(), "shifted_chebyshev_polynomial_t_xpu", [&]() { 19 | ShiftedChebyshevPolynomialTFunctor<scalar_t> f; 20 | gpu_kernel_with_scalars(iterator, f); 21 | }); 22 | } 23 | 24 | } // namespace at::native::xpu 25 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ShiftedChebyshevPolynomialUKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace at::native::xpu { 8 | 9 | template <typename scalar_t> 10 | struct ShiftedChebyshevPolynomialUFunctor { 11 | scalar_t operator()(scalar_t x, scalar_t n) const { 12 | return shifted_chebyshev_polynomial_u_forward(x, n); 13 | } 14 | }; 15 | 16 | void shifted_chebyshev_polynomial_u_kernel(TensorIteratorBase& iterator) { 17 | AT_DISPATCH_FLOATING_TYPES( 18 | iterator.common_dtype(), "shifted_chebyshev_polynomial_u_xpu", [&]() { 19 | ShiftedChebyshevPolynomialUFunctor<scalar_t> f; 20 | gpu_kernel_with_scalars(iterator, f); 21 | }); 22 | } 23 | 24 | } // namespace at::native::xpu 25 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ShiftedChebyshevPolynomialVKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace at::native::xpu { 8 | 9 | template <typename scalar_t> 10 | struct ShiftedChebyshevPolynomialVFunctor { 11 | scalar_t operator()(scalar_t x, scalar_t n) const { 12 | return shifted_chebyshev_polynomial_v_forward(x, n); 13 | } 14 | }; 15 | 16 | void shifted_chebyshev_polynomial_v_kernel(TensorIteratorBase& iterator) { 17 | AT_DISPATCH_FLOATING_TYPES( 18 | iterator.common_dtype(), "shifted_chebyshev_polynomial_v_xpu", [&]() { 19 | ShiftedChebyshevPolynomialVFunctor<scalar_t> f; 20 | gpu_kernel_with_scalars(iterator, f); 21 | }); 22 | } 23 | 24 | } // namespace at::native::xpu 25 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ShiftedChebyshevPolynomialWKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace at::native::xpu { 8 | 9 | template <typename scalar_t> 10 | struct ShiftedChebyshevPolynomialWFunctor { 11 | scalar_t operator()(scalar_t x, scalar_t n) const { 12 | return shifted_chebyshev_polynomial_w_forward(x, n); 13 | } 14 | }; 15 | 16 | void shifted_chebyshev_polynomial_w_kernel(TensorIteratorBase& iterator) { 17 | AT_DISPATCH_FLOATING_TYPES( 18 | iterator.common_dtype(), "shifted_chebyshev_polynomial_w_xpu", [&]() { 19 | ShiftedChebyshevPolynomialWFunctor<scalar_t> f; 20 | gpu_kernel_with_scalars(iterator, f); 21 | }); 22 | } 23 | 24 | } // namespace at::native::xpu 25 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/Sorting.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void sort_stable_kernel( 8 | const TensorBase& self_base, 9 | const TensorBase& values_base, 10 | const TensorBase& indices_base, 11 | int64_t dim, 12 | bool descending, 13 | bool stable); 14 | 15 | TORCH_XPU_API void launch_median_kernel( 16 | const TensorBase& vals, 17 | const TensorBase& inds, 18 | const TensorBase& self, 19 | int64_t dim, 20 | bool ignore_nan); 21 | 22 | TORCH_XPU_API void launch_kthvalue_kernel( 23 | const TensorBase& values, 24 | const TensorBase& indices, 25 | const TensorBase& self, 26 | int64_t dim, 27 | int64_t k); 28 | 29 | } // namespace at::native::xpu 30 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/SphericalBesselJ0Kernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace at::native::xpu { 10 | 11 | template <typename scalar_t> 12 | struct SphericalBesselJ0Functor { 13 | scalar_t operator()(scalar_t a) const { 14 | return spherical_bessel_j0_forward(a); 15 | } 16 | }; 17 | 18 | void spherical_bessel_j0_kernel(TensorIteratorBase& iter) { 19 | AT_DISPATCH_FLOATING_TYPES( 20 | iter.common_dtype(), "spherical_bessel_j0_xpu", [&]() { 21 | gpu_kernel(iter, SphericalBesselJ0Functor<scalar_t>()); 22 | }); 23 | } 24 | 25 | } // namespace at::native::xpu 26 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/SphericalBesselJ0Kernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void spherical_bessel_j0_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/StepKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void nextafter_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void heaviside_kernel(TensorIteratorBase& iter); 10 | 11 | } // namespace at::native::xpu 12 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/SummaryOpsKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API Tensor
7 | bincount_kernel(const Tensor& self, const Tensor& weights, int64_t minlength); 8 | 9 | TORCH_XPU_API Tensor _histc_kernel( 10 | const Tensor& self, 11 | int64_t nbins, 12 | const Scalar& min, 13 | const Scalar& max); 14 | 15 | } // namespace at::native::xpu 16 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/TensorCompareKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void where_kernel(TensorIterator& iter); 8 | 9 | TORCH_XPU_API void isposinf_kernel(TensorIteratorBase& iter); 10 | 11 | TORCH_XPU_API void isneginf_kernel(TensorIteratorBase& iter); 12 | 13 | TORCH_XPU_API void clamp_kernel(TensorIteratorBase& iter); 14 | 15 | TORCH_XPU_API void clamp_scalar_kernel( 16 | TensorIteratorBase& iter, 17 | const Scalar& min, 18 | const Scalar& max); 19 | 20 | TORCH_XPU_API void clamp_min_scalar_kernel( 21 | TensorIteratorBase& iter, 22 | Scalar min); 23 | 24 | TORCH_XPU_API void clamp_max_scalar_kernel( 25 | TensorIteratorBase& iter, 26 | Scalar max); 27 | 28 | TORCH_XPU_API void isin_kernel( 29 | const Tensor& elements, 30 | const Tensor& test_elements, 31 | bool invert, 32 | const Tensor& out); 33 | 34 | TORCH_XPU_API void _assert_async_msg_kernel( 35 | const Tensor& self_tensor, 36 | std::string_view assert_msg); 37 | 38 | } // namespace at::native::xpu 39 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/TensorFactoriesKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace at::native::xpu { 7 | 8 | TORCH_XPU_API Tensor tril_indices_kernel( 9 | int64_t row, 10 | int64_t col, 11 | int64_t offset, 12 | const TensorOptions& options); 13 | 14 | TORCH_XPU_API Tensor triu_indices_kernel( 15 | int64_t row, 16 | int64_t col, 17 | int64_t offset, 18 | const TensorOptions& options); 19 | 20 | } // namespace at::native::xpu 21 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/TensorModeKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void mode_kernel( 8 | Tensor& values, 9 | Tensor& indices, 10 | const Tensor& self, 11 | int64_t dim, 12 | bool keepdim); 13 | 14 | } // namespace at::native::xpu 15 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/TensorShapeKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void split_with_sizes_copy_out_xpu_kernel( 8 | const Tensor& self, 9 | IntArrayRef split_sizes, 10 | int64_t dim, 11 | TensorList out); 12 | 13 | TORCH_XPU_API Tensor 14 | _chunk_cat_xpu_kernel(TensorList tensors, int64_t dim, int64_t num_chunks); 15 | 16 | TORCH_XPU_API Tensor& _chunk_cat_out_xpu_kernel( 17 | TensorList tensors, 18 | int64_t dim, 19 | int64_t num_chunks, 20 | Tensor& out); 21 | 22 | } // namespace at::native::xpu 23 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/TensorTopKKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | 
#include 4 | 5 | namespace at { 6 | namespace native { 7 | namespace xpu { 8 | 9 | TORCH_XPU_API void topk_kernel( 10 | const at::Tensor& input, 11 | int64_t k, 12 | int64_t dim, 13 | bool largest, 14 | bool sorted, 15 | const at::Tensor& values, 16 | const at::Tensor& indices); 17 | 18 | } // namespace xpu 19 | } // namespace native 20 | } // namespace at 21 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/TensorTransformationsKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void flip_kernel(TensorIterator& iter, bool quantized); 8 | 9 | TORCH_XPU_API void roll_kernel( 10 | const Tensor& input, 11 | Tensor& output, 12 | IntArrayRef shifts, 13 | IntArrayRef dims); 14 | 15 | } // namespace at::native::xpu 16 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/TriangularOpsKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void tril_kernel( 7 | const Tensor& result, 8 | const Tensor& self, 9 | int64_t k); 10 | 11 | TORCH_XPU_API void triu_kernel( 12 | const Tensor& result, 13 | const Tensor& self, 14 | int64_t k); 15 | 16 | } // namespace at::native::xpu 17 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryComplexKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void conj_kernel(TensorIterator& iter); 8 | 9 | TORCH_XPU_API void conj_physical_kernel(TensorIteratorBase& iter); 10 | 11 | TORCH_XPU_API void neg_conj_kernel(TensorIterator& iter); 12 | 13 | TORCH_XPU_API void neg_kernel(TensorIterator& iter); 14 | 15 | TORCH_XPU_API void angle_kernel(TensorIteratorBase& iter); 16 | 17 | } // namespace at::native::xpu 18 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryFractionKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void reciprocal_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void floor_kernel(TensorIteratorBase& iter); 10 | 11 | TORCH_XPU_API void ceil_kernel(TensorIteratorBase& iter); 12 | 13 | TORCH_XPU_API void round_kernel(TensorIteratorBase& iter); 14 | 15 | TORCH_XPU_API void round_decimals_kernel( 16 | TensorIteratorBase& iter, 17 | int64_t decimals); 18 | 19 | TORCH_XPU_API void frac_kernel(TensorIteratorBase& iter); 20 | 21 | TORCH_XPU_API void trunc_kernel(TensorIteratorBase& iter); 22 | 23 | } // namespace at::native::xpu 24 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryGammaKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void digamma_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void polygamma_kernel(TensorIteratorBase& iter, int64_t n); 10 | 11 | TORCH_XPU_API void lgamma_kernel(TensorIteratorBase& iter); 12 | 13 | } // namespace at::native::xpu 14 | 
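The Unary*Kernels.h headers above all follow one declaration convention: #pragma once, a small set of includes, and one TORCH_XPU_API function per op taking a TensorIteratorBase& inside namespace at::native::xpu. A sketch of what a header for a hypothetical new unary op would look like; the op name is made up and the include path is an assumption, since the real include targets are elided throughout this dump.

#pragma once

#include <ATen/native/TensorIterator.h> // assumed; actual includes are elided above
// The TORCH_XPU_API visibility macro comes from a project header, also elided.

namespace at::native::xpu {

TORCH_XPU_API void my_unary_kernel(TensorIteratorBase& iter);

} // namespace at::native::xpu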
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryGeometricAcosKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void acos_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryGeometricAcoshKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void acosh_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void asin_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void asinh_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void atan_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void atanh_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | #include 9 | 10 | namespace at::native::xpu { 11 | 12 | template <typename scalar_t> 13 | struct CosFunctor { 14 | scalar_t operator()(const scalar_t a) const { 15 | return std::cos(a); 16 | } 17 | }; 18 | 19 | void cos_kernel(TensorIteratorBase& iter) { 20 | auto common_dtype = iter.common_dtype(); 21 | if (at::isComplexType(common_dtype)) { 22 | AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, common_dtype, "cos_xpu", [&]() { 23 | using opmath_t = at::opmath_type<scalar_t>; 24 | gpu_kernel(iter, CosFunctor<opmath_t>()); 25 | }); 26 | } else { 27 | AT_DISPATCH_FLOATING_TYPES_AND2( 28 | ScalarType::Half, ScalarType::BFloat16, common_dtype, "cos_xpu", [&]() { 29 | gpu_kernel(iter, CosFunctor<scalar_t>()); 30 | }); 31 | } 32 | } 33 | 34 | } // namespace at::native::xpu 35 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void cos_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void cosh_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | #include 9 | 10 | namespace at::native::xpu { 11 | 12 | template <typename scalar_t> 13 | struct SinFunctor { 14 | scalar_t operator()(const scalar_t a) const { 15 | return std::sin(a); 16 | } 17 | }; 18 | 19 | void sin_kernel(TensorIteratorBase& iter) { 20 | auto common_dtype = iter.common_dtype(); 21 | if (at::isComplexType(common_dtype)) { 22 | AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, common_dtype, "sin_xpu", [&]() { 23 | using opmath_t = at::opmath_type<scalar_t>; 24 | gpu_kernel(iter, SinFunctor<opmath_t>()); 25 | }); 26 | } else { 27 | AT_DISPATCH_FLOATING_TYPES_AND2( 28 | ScalarType::Half, ScalarType::BFloat16, common_dtype, "sin_xpu", [&]() { 29 | gpu_kernel(iter, SinFunctor<scalar_t>()); 30 | }); 31 | } 32 | } 33 | 34 | } // namespace at::native::xpu 35 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void sin_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryGeometricSinhKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void sinh_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryGeometricTanKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void tan_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryGeometricTanhKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void tanh_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API
void sqrt_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void rsqrt_kernel(TensorIteratorBase& iter); 10 | 11 | TORCH_XPU_API void bitwise_not_kernel(TensorIteratorBase& iter); 12 | 13 | TORCH_XPU_API void exp_kernel(TensorIteratorBase& iter); 14 | 15 | TORCH_XPU_API void expm1_kernel(TensorIteratorBase& iter); 16 | 17 | TORCH_XPU_API void nan_to_num_kernel( 18 | TensorIteratorBase& iter, 19 | std::optional<double> nan, 20 | std::optional<double> pos_inf, 21 | std::optional<double> neg_inf); 22 | 23 | TORCH_XPU_API void frexp_kernel(TensorIteratorBase& iter); 24 | 25 | } // namespace at::native::xpu 26 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryLogKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void log_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void log10_kernel(TensorIteratorBase& iter); 10 | 11 | TORCH_XPU_API void log1p_kernel(TensorIteratorBase& iter); 12 | 13 | TORCH_XPU_API void log2_kernel(TensorIteratorBase& iter); 14 | 15 | } // namespace at::native::xpu 16 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnarySignKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void logical_not_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void neg_kernel(TensorIteratorBase& iter); 10 | 11 | TORCH_XPU_API void sgn_kernel(TensorIteratorBase& iter); 12 | 13 | TORCH_XPU_API void sign_kernel(TensorIteratorBase& iter); 14 | 15 | TORCH_XPU_API void signbit_kernel(TensorIteratorBase& iter); 16 | 17 | } // namespace at::native::xpu 18 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnfoldBackwardKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void unfold_backward_kernel( 8 | Tensor& grad_out, 9 | const Tensor& grad_in, 10 | int64_t dim, 11 | int64_t size, 12 | int64_t step); 13 | 14 | } // namespace at::native::xpu 15 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UniqueKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API std::tuple<Tensor, Tensor, Tensor> unique_consecutive_kernel( 8 | const Tensor& self, 9 | const bool return_inverse, 10 | const bool return_counts, 11 | std::optional<int64_t> dim); 12 | 13 | TORCH_XPU_API std::tuple<Tensor, Tensor, Tensor> unique_dim_consecutive_kernel( 14 | const Tensor& self, 15 | const int64_t dim, 16 | const bool return_inverse, 17 | const bool return_counts); 18 | 19 | TORCH_XPU_API std::tuple<Tensor, Tensor, Tensor> unique_dim_kernel( 20 | const Tensor& self, 21 | const int64_t dim, 22 | const bool return_inverse, 23 | const bool return_counts); 24 | 25 | TORCH_XPU_API std::tuple<Tensor, Tensor> _unique_kernel( 26 | const Tensor& self, 27 | const bool return_inverse); 28 | 29 | TORCH_XPU_API std::tuple<Tensor, Tensor, Tensor> _unique2_kernel( 30 | const Tensor& self, 31 | const bool return_inverse, 32 | const bool return_counts); 33 | 34 | } // namespace at::native::xpu 35 | --------------------------------------------------------------------------------
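The unique kernels declared above back torch.unique and torch.unique_consecutive for tensors on the XPU device. A minimal usage sketch, assuming a working XPU device is present (the input values and device string are illustrative, not taken from the source):

import torch

x = torch.tensor([1, 1, 2, 2, 2, 3, 1], device="xpu")

# Presumably routes to _unique2_kernel (no dim, inverse and counts requested).
values, inverse, counts = torch.unique(
    x, sorted=True, return_inverse=True, return_counts=True
)

# Presumably routes to unique_consecutive_kernel: only adjacent runs collapse.
cvalues, ccounts = torch.unique_consecutive(x, return_counts=True)

print(values.cpu(), counts.cpu())    # tensor([1, 2, 3]) tensor([3, 3, 1])
print(cvalues.cpu(), ccounts.cpu())  # tensor([1, 2, 3, 1]) tensor([2, 3, 1, 1])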
/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void upsample_bicubic2d_kernel( 8 | const Tensor& output, 9 | const Tensor& input, 10 | IntArrayRef output_size, 11 | bool align_corners, 12 | std::optional<double> scales_h, 13 | std::optional<double> scales_w); 14 | 15 | TORCH_XPU_API void upsample_bicubic2d_backward_kernel( 16 | const Tensor& grad_input, 17 | const Tensor& grad_output_, 18 | IntArrayRef output_size, 19 | IntArrayRef input_size, 20 | bool align_corners, 21 | std::optional<double> scales_h, 22 | std::optional<double> scales_w); 23 | 24 | } // namespace at::native::xpu 25 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UpSampleLinear1dKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void upsample_linear1d_kernel( 8 | const Tensor& input, 9 | IntArrayRef output_size, 10 | bool align_corners, 11 | std::optional<double> scales, 12 | const Tensor& output); 13 | 14 | TORCH_XPU_API void upsample_linear1d_backward_kernel( 15 | const Tensor& grad_output_, 16 | IntArrayRef output_size, 17 | IntArrayRef input_size, 18 | bool align_corners, 19 | std::optional<double> scales, 20 | const Tensor& grad_input); 21 | 22 | } // namespace at::native::xpu 23 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace at::native::xpu { 7 | 8 | TORCH_XPU_API void upsample_nearest1d_kernel( 9 | const Tensor& output, 10 | const Tensor& input_, 11 | IntArrayRef output_size, 12 | std::optional<double> scales, 13 | bool is_exact); 14 | 15 | TORCH_XPU_API void upsample_nearest1d_backward_kernel( 16 | const Tensor& grad_input, 17 | const Tensor& grad_output_, 18 | IntArrayRef output_size, 19 | IntArrayRef input_size, 20 | std::optional<double> scales, 21 | bool is_exact); 22 | 23 | } // namespace at::native::xpu 24 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace at::native::xpu { 7 | 8 | TORCH_XPU_API void upsample_nearest2d_kernel( 9 | const Tensor& output, 10 | const Tensor& input_, 11 | IntArrayRef output_size, 12 | std::optional<double> scales_h, 13 | std::optional<double> scales_w, 14 | bool is_exact); 15 | 16 | TORCH_XPU_API void upsample_nearest2d_backward_kernel( 17 | const Tensor& grad_input, 18 | const Tensor& grad_output_, 19 | IntArrayRef output_size, 20 | IntArrayRef input_size, 21 | std::optional<double> scales_h, 22 | std::optional<double> scales_w, 23 | bool is_exact); 24 | 25 | } // namespace at::native::xpu 26 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UpSampleNearest3dKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void upsample_nearest3d_kernel( 8 | const Tensor& output, 9 | const Tensor& input_, 10 | IntArrayRef output_size, 11 | std::optional<double> scales_d, 12 |
std::optional<double> scales_h, 13 | std::optional<double> scales_w, 14 | bool is_exact); 15 | 16 | TORCH_XPU_API void upsample_nearest3d_backward_kernel( 17 | const Tensor& grad_input, 18 | const Tensor& grad_output_, 19 | IntArrayRef output_size, 20 | IntArrayRef input_size, 21 | std::optional<double> scales_d, 22 | std::optional<double> scales_h, 23 | std::optional<double> scales_w, 24 | bool is_exact); 25 | 26 | } // namespace at::native::xpu 27 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UpSampleTrilinear3dKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void upsample_trilinear3d_out_kernel( 8 | const Tensor& output, 9 | const Tensor& input, 10 | IntArrayRef output_size, 11 | bool align_corners, 12 | std::optional<double> scales_d, 13 | std::optional<double> scales_h, 14 | std::optional<double> scales_w); 15 | 16 | TORCH_XPU_API void upsample_trilinear3d_backward_out_kernel( 17 | const Tensor& grad_input_, 18 | const Tensor& grad_output_, 19 | IntArrayRef output_size, 20 | IntArrayRef input_size, 21 | bool align_corners, 22 | std::optional<double> scales_d, 23 | std::optional<double> scales_h, 24 | std::optional<double> scales_w); 25 | 26 | } // namespace at::native::xpu 27 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/WeightInt4PackKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void weight_to_int4pack_kernel( 7 | const Tensor& weight_packed, 8 | const Tensor& weight, 9 | int N, 10 | int K); 11 | 12 | } // namespace at::native::xpu 13 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/WeightNormKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API std::tuple<Tensor, Tensor> weight_norm_kernel( 7 | const Tensor& v, 8 | const Tensor& g, 9 | int64_t dim); 10 | 11 | TORCH_XPU_API std::tuple<Tensor, Tensor> weight_norm_backward_kernel( 12 | const Tensor& grad_w, 13 | const Tensor& saved_v, 14 | const Tensor& saved_g, 15 | const Tensor& saved_norms, 16 | int64_t dim); 17 | 18 | } // namespace at::native::xpu 19 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ZetaKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | namespace at::native::xpu { 11 | 12 | template <typename scalar_t> 13 | struct ZetaFunctor { 14 | scalar_t operator()(scalar_t x, scalar_t q) const { 15 | return zeta(x, q); 16 | } 17 | }; 18 | 19 | constexpr char zeta_name[] = "zeta"; 20 | void zeta_kernel(TensorIteratorBase& iter) { 21 | AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "zeta_xpu", [&]() { 22 | gpu_kernel_with_scalars(iter, ZetaFunctor<scalar_t>()); 23 | }); 24 | } 25 | 26 | } // namespace at::native::xpu 27 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ZetaKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void zeta_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu
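zeta_kernel implements the two-argument (Hurwitz) zeta function, reachable from Python as torch.special.zeta. A short sketch, assuming an available XPU device (the sample values are chosen only because their results are well known):

import torch

x = torch.tensor([2.0, 3.0, 4.0], device="xpu")
q = torch.ones(3, device="xpu")

# Tensor-tensor case: ZetaFunctor is applied elementwise.
out = torch.special.zeta(x, q)
print(out.cpu())  # ~[1.6449, 1.2021, 1.0823], i.e. zeta(2), zeta(3), zeta(4)

# Tensor-scalar case: gpu_kernel_with_scalars presumably handles the scalar
# second argument without materializing a full tensor for it.
out2 = torch.special.zeta(x, 1.0)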
-------------------------------------------------------------------------------- /src/comm/Macros.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef _WIN32 4 | #define RESTRICT __restrict 5 | #else 6 | #define RESTRICT __restrict__ 7 | #endif 8 | -------------------------------------------------------------------------------- /src/comm/Runtime.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::xpu { 6 | 7 | static inline at::DeviceIndex getDeviceIndexOfCurrentQueue() { 8 | return c10::xpu::getCurrentXPUStream().device_index(); 9 | } 10 | 11 | static inline sycl::queue& getCurrentSYCLQueue() { 12 | return c10::xpu::getCurrentXPUStream().queue(); 13 | } 14 | 15 | } // namespace at::xpu 16 | -------------------------------------------------------------------------------- /src/comm/SYCLContext.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | using namespace at::xpu; 9 | using namespace xpu::sycl; 10 | -------------------------------------------------------------------------------- /src/comm/xpu_aten.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include -------------------------------------------------------------------------------- /src/xccl/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # XCCL sources 2 | 3 | file(GLOB xccl_h "*.hpp") 4 | file(GLOB xccl_cpp "*.cpp") 5 | 6 | list(APPEND ATen_XPU_XCCL_SRCS ${xccl_cpp}) 7 | 8 | set(ATen_XPU_XCCL_SRCS ${ATen_XPU_XCCL_SRCS} PARENT_SCOPE) 9 | 10 | # Why copy the header file to the build directory? 11 | # We want to register the XCCL backend with PyTorch c10d in torch/csrc/distributed/c10d/init.cpp#L27-L29. 12 | # To align with other backends, we need to copy the header file into the build tree's torch/csrc/distributed/c10d directory. 13 | # A further solution would be to add an include search path for torch/csrc/distributed/c10d/init.cpp#L27-L29.
14 | foreach(HEADER ${xccl_h}) 15 | file(COPY ${HEADER} DESTINATION "${CMAKE_BINARY_DIR}/torch/csrc/distributed/c10d") 16 | endforeach() 17 | -------------------------------------------------------------------------------- /test/microbench/distribution.cauchy.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.profiler import profile, ProfilerActivity 3 | 4 | device = "xpu" 5 | shape_list = [(8192, 8192)] 6 | backward = False 7 | 8 | for shape in shape_list: 9 | for dtype in [torch.bfloat16, torch.float16, torch.float32]: 10 | input = torch.randn(shape, dtype=dtype, device=device) 11 | # warm up 12 | input.cauchy_() 13 | 14 | # go 15 | print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) 16 | with profile( 17 | activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True 18 | ) as prof: 19 | for i in range(20): 20 | input.cauchy_() 21 | print(prof.key_averages().table(sort_by="xpu_time_total")) 22 | -------------------------------------------------------------------------------- /test/microbench/distribution.exponential.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.profiler import profile, ProfilerActivity 3 | 4 | device = "xpu" 5 | shape_list = [(8192, 8192)] 6 | backward = False 7 | 8 | for shape in shape_list: 9 | for dtype in [torch.bfloat16, torch.float16, torch.float32]: 10 | input = torch.randn(shape, dtype=dtype, device=device) 11 | # warm up 12 | input.exponential_(0.5) 13 | 14 | # go 15 | print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) 16 | with profile( 17 | activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True 18 | ) as prof: 19 | for i in range(20): 20 | input.exponential_(0.5) 21 | print(prof.key_averages().table(sort_by="xpu_time_total")) 22 | -------------------------------------------------------------------------------- /test/microbench/distribution.geometric.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.profiler import profile, ProfilerActivity 3 | 4 | shape_list = [(8192, 8192)] 5 | backward = False 6 | 7 | if __name__ == "__main__": 8 | for shape in shape_list: 9 | for dtype in [torch.bfloat16, torch.float16, torch.float32]: 10 | input = torch.randn(shape, dtype=dtype, device=torch.device("xpu")) 11 | 12 | # warm up 13 | input.geometric_(0.5) 14 | 15 | # go 16 | print( 17 | "shape:", 18 | (shape), 19 | "; datatype:", 20 | dtype, 21 | "; P:", 22 | 0.5, 23 | "; backward:", 24 | backward, 25 | ) 26 | with profile( 27 | activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], 28 | record_shapes=True, 29 | ) as prof: 30 | for i in range(20): 31 | input.geometric_(0.5) 32 | print(prof.key_averages().table(sort_by="xpu_time_total")) 33 | -------------------------------------------------------------------------------- /test/microbench/distribution.log_normal.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.profiler import profile, ProfilerActivity 3 | 4 | device = "xpu" 5 | shape_list = [(8192, 8192)] 6 | backward = False 7 | 8 | for shape in shape_list: 9 | for dtype in [torch.bfloat16, torch.float16, torch.float32]: 10 | input = torch.randn(shape, dtype=dtype, device=device) 11 | # warm up 12 | input.log_normal_(128, 128) 13 | 14 | # go 15 | print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) 16 |
with profile( 17 | activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True 18 | ) as prof: 19 | for i in range(20): 20 | input.log_normal_(128, 128) 21 | print(prof.key_averages().table(sort_by="xpu_time_total")) 22 | -------------------------------------------------------------------------------- /test/microbench/distribution.normal.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.profiler import profile, ProfilerActivity 3 | 4 | shape_list = [(8192, 8192)] 5 | backward = False 6 | 7 | if __name__ == "__main__": 8 | for shape in shape_list: 9 | for dtype in [torch.bfloat16, torch.float16, torch.float32]: 10 | input = torch.randn(shape, dtype=dtype, device=torch.device("xpu")) 11 | 12 | # warm up 13 | input.normal_() 14 | 15 | # go 16 | print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) 17 | with profile( 18 | activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], 19 | record_shapes=True, 20 | ) as prof: 21 | for i in range(20): 22 | input.normal_() 23 | print(prof.key_averages().table(sort_by="xpu_time_total")) 24 | -------------------------------------------------------------------------------- /test/microbench/distribution.random.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.profiler import profile, ProfilerActivity 3 | 4 | shape_list = [(8192, 8192)] 5 | backward = False 6 | 7 | if __name__ == "__main__": 8 | for shape in shape_list: 9 | for dtype in [torch.bfloat16, torch.float16, torch.float32]: 10 | input = torch.randn(shape, dtype=dtype, device=torch.device("xpu")) 11 | 12 | # warm up 13 | input.random_(-(2**8), 2**8) 14 | 15 | # go 16 | print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) 17 | with profile( 18 | activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], 19 | record_shapes=True, 20 | ) as prof: 21 | for i in range(20): 22 | input.random_(-(2**8), 2**8) 23 | print(prof.key_averages().table(sort_by="xpu_time_total")) 24 | -------------------------------------------------------------------------------- /test/microbench/distribution.uniform.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.profiler import profile, ProfilerActivity 3 | 4 | shape_list = [(8192, 8192)] 5 | backward = False 6 | 7 | if __name__ == "__main__": 8 | for shape in shape_list: 9 | for dtype in [torch.bfloat16, torch.float16, torch.float32]: 10 | input = torch.randn(shape, dtype=dtype, device=torch.device("xpu")) 11 | 12 | # warm up 13 | input.uniform_() 14 | 15 | # go 16 | print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) 17 | with profile( 18 | activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], 19 | record_shapes=True, 20 | ) as prof: 21 | for i in range(20): 22 | input.uniform_() 23 | print(prof.key_averages().table(sort_by="xpu_time_total")) 24 | -------------------------------------------------------------------------------- /test/microbench/scan.unique.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.profiler import profile, ProfilerActivity 3 | 4 | device = "xpu" 5 | backward = False 6 | 7 | shape_list = [(2049, 2049)] 8 | 9 | for shape in shape_list: 10 | for dtype in [torch.bfloat16, torch.float16, torch.float32]: 11 | input = torch.randint(100, shape, dtype=dtype, device=device) 12 | 13 | # warm up 14 |
torch.unique(input, sorted=True, return_inverse=True, return_counts=True) 15 | 16 | # go 17 | print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) 18 | with profile( 19 | activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True 20 | ) as prof: 21 | for i in range(20): 22 | output = torch.unique( 23 | input, sorted=True, return_inverse=True, return_counts=True 24 | ) 25 | print(prof.key_averages().table(sort_by="xpu_time_total")) 26 | -------------------------------------------------------------------------------- /test/microbench/sort.randperm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.profiler import profile, ProfilerActivity 3 | 4 | device = "xpu" 5 | backward = False 6 | 7 | shape_list = [(8193)] 8 | 9 | for shape in shape_list: 10 | for dtype in [torch.float32]: 11 | # warm up 12 | torch.randperm(shape, dtype=dtype, device=device) 13 | 14 | # go 15 | print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) 16 | with profile( 17 | activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True 18 | ) as prof: 19 | for i in range(20): 20 | torch.randperm(shape, dtype=dtype, device=device) 21 | print(prof.key_averages().table(sort_by="xpu_time_total")) 22 | -------------------------------------------------------------------------------- /test/profiling/correlation_id_mixed.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | input1 = torch.randn(3, 3, device="xpu") 4 | input2 = torch.randn(3, 3, device="xpu") 5 | 6 | with torch.profiler.profile( 7 | activities=[ 8 | torch.profiler.ProfilerActivity.CPU, 9 | torch.profiler.ProfilerActivity.XPU, 10 | ] 11 | ) as prof: 12 | output1 = input1 + 1.0 13 | output2 = input2 + 2.0 14 | output = output1 + output2 15 | print(prof.key_averages().table(sort_by="xpu_time_total")) 16 | -------------------------------------------------------------------------------- /test/profiling/profile_partial_runtime_ops.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def compute(input1, input2): 5 | input1 = input1.to(device="xpu") 6 | return input1 + 1.0 7 | 8 | 9 | input1 = torch.randn(3, 3, device="cpu") 10 | input2 = torch.randn(3, 3, device="cpu") 11 | 12 | # warm up 13 | output = compute(input1, input2) 14 | 15 | for id in range(1): 16 | with torch.profiler.profile( 17 | activities=[ 18 | torch.profiler.ProfilerActivity.CPU, 19 | torch.profiler.ProfilerActivity.XPU, 20 | ] 21 | ) as p: 22 | output = compute(input1, input2) 23 | print(p.key_averages().table(sort_by="self_xpu_time_total", row_limit=-1)) 24 | -------------------------------------------------------------------------------- /test/profiling/time_precision_in_profile.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def compute(input1, input2): 5 | input1 = input1.to(device="xpu") 6 | return input1 + 1.0 7 | 8 | 9 | input1 = torch.randn(3, 3, device="cpu") 10 | input2 = torch.randn(3, 3, device="cpu") 11 | 12 | # warm up 13 | output = compute(input1, input2) 14 | 15 | for id in range(1000): 16 | with torch.profiler.profile( 17 | activities=[ 18 | torch.profiler.ProfilerActivity.CPU, 19 | torch.profiler.ProfilerActivity.XPU, 20 | ] 21 | ) as p: 22 | output = compute(input1, input2) 23 | print(p.key_averages().table(sort_by="self_xpu_time_total", row_limit=-1)) 24 | 
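All of the profiling scripts above print the aggregated key_averages() table. When individual event timestamps matter, as in time_precision_in_profile.py, the same profile object can also be exported as a Chrome trace and inspected in chrome://tracing or Perfetto. A minimal sketch of that variant, following the same warm-up convention as the scripts above (the output filename is an arbitrary choice):

import torch

input1 = torch.randn(3, 3, device="xpu")
_ = input1 + 1.0  # warm up

with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.XPU,
    ]
) as p:
    output = input1 + 1.0

# Writes per-event begin/end timestamps to a JSON trace file.
p.export_chrome_trace("xpu_trace.json")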
-------------------------------------------------------------------------------- /test/profiling/triton_xpu_ops_time.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | device = "xpu" 4 | 5 | 6 | @torch.compile 7 | def test(x): 8 | x = x + 1.0 9 | x = x * x 10 | x = x + 2.0 11 | return x 12 | 13 | 14 | input = torch.randn(128, 128, device=device) 15 | 16 | # warm 17 | output = test(input) 18 | print("[info] finish warm up") 19 | 20 | with torch.profiler.profile( 21 | activities=[ 22 | torch.profiler.ProfilerActivity.CPU, 23 | torch.profiler.ProfilerActivity.XPU, 24 | ] 25 | ) as p: 26 | print("[info] start running") 27 | output = test(input) 28 | print(p.key_averages().table(sort_by="self_xpu_time_total", row_limit=-1)) 29 | -------------------------------------------------------------------------------- /test/regressions/optests_failures_dict.json: -------------------------------------------------------------------------------- 1 | { 2 | "_description": "This is a dict containing failures for tests autogenerated by generate_opcheck_tests. For more details, please see https://docs.google.com/document/d/1Pj5HRZvdOq3xpFpbEjUZp2hBovhy7Wnxw14m6lF2154/edit", 3 | "_version": 1, 4 | "data": {} 5 | } 6 | -------------------------------------------------------------------------------- /test/regressions/test_copy.py: -------------------------------------------------------------------------------- 1 | # Owner(s): ["module: intel"] 2 | import torch 3 | from torch.testing._internal.common_utils import TestCase 4 | 5 | cpu_device = torch.device("cpu") 6 | xpu_device = torch.device("xpu") 7 | 8 | 9 | class TestSimpleCopy(TestCase): 10 | def test_copy_and_clone(self, dtype=torch.float): 11 | a_cpu = torch.randn(16, 64, 28, 28) 12 | b_cpu = torch.randn(16, 64, 28, 28) 13 | a_xpu = a_cpu.to(xpu_device) 14 | b_xpu = b_cpu.to(xpu_device) 15 | # naive 16 | b_cpu.copy_(a_cpu) 17 | b_xpu.copy_(a_xpu) 18 | self.assertEqual(b_cpu, b_xpu.to(cpu_device)) 19 | # clone + permutation 20 | b_cpu = a_cpu.clone(memory_format=torch.channels_last) 21 | b_xpu = a_xpu.clone(memory_format=torch.channels_last) 22 | self.assertEqual(b_cpu, b_xpu.to(cpu_device)) 23 | -------------------------------------------------------------------------------- /test/regressions/test_div_mode.py: -------------------------------------------------------------------------------- 1 | # Owner(s): ["module: intel"] 2 | import torch 3 | from torch.testing._internal.common_dtype import get_all_dtypes 4 | from torch.testing._internal.common_utils import TestCase 5 | 6 | 7 | class TestDivMode(TestCase): 8 | def test_div_true_dtype(self): 9 | claimed_dtypes = get_all_dtypes() 10 | for dtype in claimed_dtypes: 11 | a_cpu = torch.randint(1, 100, [8, 8]).to(dtype) 12 | a_xpu = a_cpu.to("xpu") 13 | ref = torch.ops.aten.div(a_cpu * 2, a_cpu, rounding_mode=None) 14 | res = torch.ops.aten.div(a_xpu * 2, a_xpu, rounding_mode=None) 15 | self.assertEqual(ref, res.to("cpu")) 16 | -------------------------------------------------------------------------------- /test/regressions/test_layer_norm.py: -------------------------------------------------------------------------------- 1 | # Owner(s): ["module: intel"] 2 | import torch 3 | import torch.nn as nn 4 | from torch.testing._internal.common_utils import TestCase 5 | 6 | cpu_device = torch.device("cpu") 7 | xpu_device = torch.device("xpu") 8 | 9 | 10 | class TestLayerNorm(TestCase): 11 | def test_layer_norm_no_nan(self, dtype=torch.float): 12 | dim = [5] 13 | 
x_cpu = torch.tensor([[1e15, 1e15 + 1, 1e15 + 2, 1e15 + 3, 1e15 + 4]]) 14 | layernorm_cpu = nn.LayerNorm(dim) 15 | y_cpu = layernorm_cpu(x_cpu) 16 | 17 | x_xpu = x_cpu.to(xpu_device) 18 | layernorm_xpu = nn.LayerNorm(dim).to(xpu_device) 19 | y_xpu = layernorm_xpu(x_xpu) 20 | self.assertEqual(y_cpu, y_xpu.to(cpu_device)) 21 | -------------------------------------------------------------------------------- /test/regressions/test_operation_on_device_1.py: -------------------------------------------------------------------------------- 1 | # Owner(s): ["module: intel"] 2 | import torch 3 | from torch.testing._internal.common_utils import TestCase 4 | 5 | 6 | class TestOperationOnDevice1(TestCase): 7 | def test_sum_on_device1(self, dtype=torch.float): 8 | if torch.xpu.device_count() >= 2: 9 | a = torch.randn(2, 3, device=torch.device("xpu:1")) 10 | torch.xpu.set_device(1) 11 | res = a.sum() 12 | ref = a.cpu().sum() 13 | self.assertEqual(ref, res) 14 | -------------------------------------------------------------------------------- /test/sycl/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "simple_kernel.hpp" 3 | 4 | void test_simple_kernel() { 5 | int numel = 1024; 6 | float a[1024]; 7 | 8 | // a simple sycl kernel 9 | itoa(a, numel); 10 | 11 | bool success = true; 12 | for (int i = 0; i < numel; i++) { 13 | if (a[i] != i) { 14 | success = false; 15 | break; 16 | } 17 | } 18 | 19 | if (success) { 20 | std::cout << "Pass" << std::endl; 21 | } else { 22 | std::cout << "Fail" << std::endl; 23 | } 24 | } 25 | 26 | int main(int argc, char* argv[]) { 27 | test_simple_kernel(); 28 | return 0; 29 | } 30 | -------------------------------------------------------------------------------- /test/sycl/simple_kernel.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // Create an idx array on SYCL GPU device 4 | // res - host buffer for result 5 | // numel - length of the idx array 6 | void itoa(float* res, int numel); 7 | -------------------------------------------------------------------------------- /test/xpu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/torch-xpu-ops/5b4d7444484576f721d2295761cf8fafa924ef36/test/xpu/__init__.py -------------------------------------------------------------------------------- /test/xpu/distributed/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/torch-xpu-ops/5b4d7444484576f721d2295761cf8fafa924ef36/test/xpu/distributed/__init__.py -------------------------------------------------------------------------------- /test/xpu/extended/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/torch-xpu-ops/5b4d7444484576f721d2295761cf8fafa924ef36/test/xpu/extended/__init__.py -------------------------------------------------------------------------------- /test/xpu/extended/run_test_with_skip.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from skip_list_common import skip_dict 5 | from skip_list_win import skip_dict as skip_dict_win 6 | 7 | IS_WINDOWS = sys.platform == "win32" 8 | 9 | skip_list = skip_dict["test_ops_xpu.py"] 10 | if IS_WINDOWS: 11 | skip_list += skip_dict_win["test_ops_xpu.py"] 12 | 13 | skip_options = ' -k "not ' + 
skip_list[0] 14 | for skip_case in skip_list[1:]: 15 | skip_option = " and not " + skip_case 16 | skip_options += skip_option 17 | skip_options += '"' 18 | 19 | os.environ["PYTORCH_TEST_WITH_SLOW"] = "1" 20 | test_command = "pytest --timeout 600 -v --junit-xml=./op_extended.xml test_ops_xpu.py" 21 | test_command += skip_options 22 | res = os.system(test_command) 23 | sys.exit(res) 24 | -------------------------------------------------------------------------------- /test/xpu/extended/run_test_with_skip_arc.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from skip_list_arc import skip_dict as skip_dict_specifical 5 | from skip_list_common import skip_dict 6 | from skip_list_win import skip_dict as skip_dict_win 7 | from skip_list_win_arc import skip_dict as skip_dict_win_arc 8 | 9 | IS_WINDOWS = sys.platform == "win32" 10 | 11 | skip_list = skip_dict["test_ops_xpu.py"] + skip_dict_specifical["test_ops_xpu.py"] 12 | if IS_WINDOWS: 13 | skip_list += skip_dict_win["test_ops_xpu.py"] + skip_dict_win_arc["test_ops_xpu.py"] 14 | 15 | skip_options = ' -k "not ' + skip_list[0] 16 | for skip_case in skip_list[1:]: 17 | skip_option = " and not " + skip_case 18 | skip_options += skip_option 19 | skip_options += '"' 20 | 21 | os.environ["PYTORCH_TEST_WITH_SLOW"] = "1" 22 | test_command = "pytest -v test_ops_xpu.py" 23 | test_command += skip_options 24 | res = os.system(test_command) 25 | sys.exit(res) 26 | -------------------------------------------------------------------------------- /test/xpu/extended/run_test_with_skip_bmg.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import pytest 5 | from skip_list_common import skip_dict 6 | from skip_list_win import skip_dict as skip_dict_win 7 | from skip_list_win_bmg import skip_dict as skip_dict_win_bmg 8 | 9 | IS_WINDOWS = sys.platform == "win32" 10 | 11 | skip_list = skip_dict["test_ops_xpu.py"] 12 | if IS_WINDOWS: 13 | skip_list += skip_dict_win["test_ops_xpu.py"] + skip_dict_win_bmg["test_ops_xpu.py"] 14 | 15 | skip_options = "not " + skip_list[0] 16 | for skip_case in skip_list[1:]: 17 | skip_option = " and not " + skip_case 18 | skip_options += skip_option 19 | 20 | os.environ["PYTORCH_TEST_WITH_SLOW"] = "1" 21 | test_command = ["-k", skip_options, "test_ops_xpu.py", "-v"] 22 | res = pytest.main(test_command) 23 | sys.exit(res) 24 | -------------------------------------------------------------------------------- /test/xpu/extended/run_test_with_skip_lnl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import pytest 5 | from skip_list_common import skip_dict 6 | from skip_list_win import skip_dict as skip_dict_win 7 | from skip_list_win_lnl import skip_dict as skip_dict_win_lnl 8 | 9 | IS_WINDOWS = sys.platform == "win32" 10 | 11 | skip_list = skip_dict["test_ops_xpu.py"] 12 | if IS_WINDOWS: 13 | skip_list += skip_dict_win["test_ops_xpu.py"] + skip_dict_win_lnl["test_ops_xpu.py"] 14 | 15 | skip_options = "not " + skip_list[0] 16 | for skip_case in skip_list[1:]: 17 | skip_option = " and not " + skip_case 18 | skip_options += skip_option 19 | 20 | os.environ["PYTORCH_TEST_WITH_SLOW"] = "1" 21 | test_command = ["-k", skip_options, "test_ops_xpu.py", "-v"] 22 | res = pytest.main(test_command) 23 | sys.exit(res) 24 | -------------------------------------------------------------------------------- 
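Each run_test_with_skip_* variant builds a single pytest -k expression by chaining the skip cases with "and not". For a hypothetical two-entry skip list (the test names below are placeholders, not real cases from the skip files), the construction and the resulting invocation look like this:

skip_list = ["test_foo_xpu_float32", "test_bar_xpu_bfloat16"]

skip_options = "not " + skip_list[0]
for skip_case in skip_list[1:]:
    skip_options += " and not " + skip_case

print(skip_options)
# not test_foo_xpu_float32 and not test_bar_xpu_bfloat16
# equivalent to:
#   pytest -k "not test_foo_xpu_float32 and not test_bar_xpu_bfloat16" test_ops_xpu.py -v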
/test/xpu/extended/run_test_with_skip_mtl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import pytest 5 | from skip_list_common import skip_dict 6 | from skip_list_win import skip_dict as skip_dict_win 7 | from skip_list_win_mtl import skip_dict as skip_dict_win_mtl 8 | 9 | IS_WINDOWS = sys.platform == "win32" 10 | 11 | skip_list = skip_dict["test_ops_xpu.py"] 12 | if IS_WINDOWS: 13 | skip_list += skip_dict_win["test_ops_xpu.py"] + skip_dict_win_mtl["test_ops_xpu.py"] 14 | 15 | skip_options = "not " + skip_list[0] 16 | for skip_case in skip_list[1:]: 17 | skip_option = " and not " + skip_case 18 | skip_options += skip_option 19 | 20 | os.environ["PYTORCH_TEST_WITH_SLOW"] = "1" 21 | test_command = ["-k", skip_options, "test_ops_xpu.py", "-v"] 22 | res = pytest.main(test_command) 23 | sys.exit(res) 24 | -------------------------------------------------------------------------------- /test/xpu/extended/skip_list_win.py: -------------------------------------------------------------------------------- 1 | skip_dict = { 2 | "test_ops_xpu.py": ( 3 | "test_compare_cpu_pow_xpu_bfloat16", # https://github.com/intel/torch-xpu-ops/pull/764 4 | "test_compare_cpu_argmin_xpu_int", 5 | ), 6 | } 7 | -------------------------------------------------------------------------------- /test/xpu/extended/skip_list_win_arc.py: -------------------------------------------------------------------------------- 1 | skip_dict = { 2 | # SYCL Compiler on Windows removed the following operations when '-cl-poison-unsupported-fp64-kernels' is on 3 | # Hence, skip the following windows specific errors 4 | "test_ops_xpu.py": ( 5 | "test_compare_cpu_sqrt_xpu_complex64", 6 | "test_backward_nn_functional_adaptive_avg_pool2d_xpu_float32", 7 | ), 8 | } 9 | -------------------------------------------------------------------------------- /test/xpu/extended/skip_list_win_bmg.py: -------------------------------------------------------------------------------- 1 | skip_dict = { 2 | "test_ops_xpu.py": ( 3 | # https://github.com/intel/torch-xpu-ops/issues/1173 4 | # Fatal Python error: Illegal instruction 5 | "test_compare_cpu_grid_sampler_2d_xpu_float64", 6 | "test_compare_cpu_cosh_xpu_complex64", 7 | "test_compare_cpu_nn_functional_softshrink_xpu_bfloat16", 8 | "test_compare_cpu_nn_functional_softshrink_xpu_float16", 9 | "test_compare_cpu_nn_functional_softshrink_xpu_float32", 10 | "test_compare_cpu_nn_functional_softshrink_xpu_float64", 11 | "test_compare_cpu_square_xpu_complex128", 12 | ), 13 | } 14 | -------------------------------------------------------------------------------- /test/xpu/extended/skip_list_win_lnl.py: -------------------------------------------------------------------------------- 1 | skip_dict = { 2 | "test_ops_xpu.py": ( 3 | # https://github.com/intel/torch-xpu-ops/issues/1173 4 | # Fatal Python error: Illegal instruction 5 | "test_compare_cpu_grid_sampler_2d_xpu_float64", 6 | "test_compare_cpu_cosh_xpu_complex64", 7 | "test_compare_cpu_nn_functional_softshrink_xpu_bfloat16", 8 | "test_compare_cpu_nn_functional_softshrink_xpu_float16", 9 | "test_compare_cpu_nn_functional_softshrink_xpu_float32", 10 | "test_compare_cpu_nn_functional_softshrink_xpu_float64", 11 | "test_compare_cpu_square_xpu_complex128", 12 | ), 13 | } 14 | -------------------------------------------------------------------------------- /test/xpu/nn/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/intel/torch-xpu-ops/5b4d7444484576f721d2295761cf8fafa924ef36/test/xpu/nn/__init__.py -------------------------------------------------------------------------------- /test/xpu/nn/test_embedding_xpu.py: -------------------------------------------------------------------------------- 1 | # Owner(s): ["module: intel"] 2 | 3 | from torch.testing._internal.common_device_type import instantiate_device_type_tests 4 | from torch.testing._internal.common_utils import run_tests 5 | 6 | try: 7 | from .xpu_test_utils import XPUPatchForImport 8 | except Exception as e: 9 | from ..xpu_test_utils import XPUPatchForImport 10 | 11 | with XPUPatchForImport(False): 12 | from test_embedding import TestEmbeddingNNDeviceType 13 | 14 | 15 | instantiate_device_type_tests( 16 | TestEmbeddingNNDeviceType, globals(), only_for="xpu", allow_xpu=True 17 | ) 18 | 19 | 20 | if __name__ == "__main__": 21 | run_tests() 22 | -------------------------------------------------------------------------------- /test/xpu/nn/test_init_xpu.py: -------------------------------------------------------------------------------- 1 | # Owner(s): ["module: intel"] 2 | 3 | from torch.testing._internal.common_utils import run_tests 4 | 5 | try: 6 | from .xpu_test_utils import XPUPatchForImport 7 | except Exception as e: 8 | from ..xpu_test_utils import XPUPatchForImport 9 | 10 | with XPUPatchForImport(False): 11 | from test_init import TestNNInit # noqa: F401` 12 | 13 | 14 | if __name__ == "__main__": 15 | run_tests() 16 | -------------------------------------------------------------------------------- /test/xpu/nn/test_lazy_modules_xpu.py: -------------------------------------------------------------------------------- 1 | # Owner(s): ["module: intel"] 2 | 3 | from torch.nn.parameter import UninitializedParameter 4 | from torch.testing._internal.common_utils import run_tests, suppress_warnings 5 | 6 | try: 7 | from .xpu_test_utils import XPUPatchForImport 8 | except Exception as e: 9 | from ..xpu_test_utils import XPUPatchForImport 10 | 11 | with XPUPatchForImport(False): 12 | from test_lazy_modules import LazyModule, TestLazyModules 13 | 14 | 15 | @suppress_warnings 16 | def materialize_device(self): 17 | module = LazyModule() 18 | module.register_parameter("test_param", UninitializedParameter()) 19 | module.test_param.materialize(10) 20 | self.assertTrue(module.test_param.device.type == "cpu") 21 | device = "xpu" 22 | module = LazyModule() 23 | module.register_parameter("test_param", UninitializedParameter()) 24 | module.to(device) 25 | module.test_param.materialize(10) 26 | self.assertTrue(module.test_param.device.type == device) 27 | 28 | 29 | TestLazyModules.test_materialize_device = materialize_device 30 | 31 | if __name__ == "__main__": 32 | run_tests() 33 | -------------------------------------------------------------------------------- /test/xpu/nn/test_load_state_dict_xpu.py: -------------------------------------------------------------------------------- 1 | # Owner(s): ["module: intel"] 2 | 3 | from torch.testing._internal.common_utils import ( 4 | instantiate_parametrized_tests, 5 | run_tests, 6 | TestCase, 7 | ) 8 | 9 | try: 10 | from .xpu_test_utils import XPUPatchForImport 11 | except Exception as e: 12 | from ..xpu_test_utils import XPUPatchForImport 13 | 14 | with XPUPatchForImport(False): 15 | from test_load_state_dict import TestLoadStateDict, TestLoadStateDictSwap 16 | 17 | 18 | instantiate_parametrized_tests(TestLoadStateDict) 19 | instantiate_parametrized_tests(TestLoadStateDictSwap) 
20 | 21 | 22 | if __name__ == "__main__": 23 | TestCase._default_dtype_check_enabled = True 24 | run_tests() 25 | -------------------------------------------------------------------------------- /test/xpu/nn/test_module_hooks_xpu.py: -------------------------------------------------------------------------------- 1 | # Owner(s): ["module: intel"] 2 | 3 | from torch.testing._internal.common_utils import ( 4 | instantiate_parametrized_tests, 5 | run_tests, 6 | TestCase, 7 | ) 8 | 9 | try: 10 | from .xpu_test_utils import XPUPatchForImport 11 | except Exception as e: 12 | from ..xpu_test_utils import XPUPatchForImport 13 | 14 | with XPUPatchForImport(False): 15 | from test_module_hooks import TestModuleHooks 16 | 17 | instantiate_parametrized_tests(TestModuleHooks) 18 | 19 | 20 | if __name__ == "__main__": 21 | TestCase._default_dtype_check_enabled = True 22 | run_tests() 23 | -------------------------------------------------------------------------------- /test/xpu/nn/test_parametrization_xpu.py: -------------------------------------------------------------------------------- 1 | # Owner(s): ["module: intel"] 2 | 3 | from torch.testing._internal.common_device_type import instantiate_device_type_tests 4 | from torch.testing._internal.common_utils import ( 5 | instantiate_parametrized_tests, 6 | run_tests, 7 | ) 8 | 9 | try: 10 | from .xpu_test_utils import XPUPatchForImport 11 | except Exception as e: 12 | from ..xpu_test_utils import XPUPatchForImport 13 | 14 | with XPUPatchForImport(False): 15 | from test_parametrization import TestNNParametrization, TestNNParametrizationDevice 16 | 17 | 18 | instantiate_device_type_tests( 19 | TestNNParametrizationDevice, globals(), only_for="xpu", allow_xpu=True 20 | ) 21 | instantiate_parametrized_tests(TestNNParametrization) 22 | 23 | 24 | if __name__ == "__main__": 25 | run_tests() 26 | -------------------------------------------------------------------------------- /test/xpu/nn/test_pruning_xpu.py: -------------------------------------------------------------------------------- 1 | # Owner(s): ["module: intel"] 2 | 3 | from torch.testing._internal.common_utils import ( 4 | instantiate_parametrized_tests, 5 | run_tests, 6 | ) 7 | 8 | try: 9 | from .xpu_test_utils import XPUPatchForImport 10 | except Exception as e: 11 | from ..xpu_test_utils import XPUPatchForImport 12 | 13 | with XPUPatchForImport(False): 14 | from test_pruning import TestPruningNN 15 | 16 | 17 | instantiate_parametrized_tests(TestPruningNN) 18 | 19 | 20 | if __name__ == "__main__": 21 | run_tests() 22 | -------------------------------------------------------------------------------- /test/xpu/quantization/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/torch-xpu-ops/5b4d7444484576f721d2295761cf8fafa924ef36/test/xpu/quantization/core/__init__.py -------------------------------------------------------------------------------- /test/xpu/run_test_win_with_skip_mtl.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | from io import StringIO 4 | 5 | import pytest 6 | from skip_list_win_mtl import skip_dict 7 | 8 | IS_WINDOWS = sys.platform == "win32" 9 | 10 | skip_list = skip_dict["test_xpu.py"] 11 | 12 | skip_options = "not " + skip_list[0] 13 | for skip_case in skip_list[1:]: 14 | skip_option = " and not " + skip_case 15 | skip_options += skip_option 16 | 17 | original_stdout = sys.stdout 18 | sys.stdout = StringIO() 19 | 20 | 
test_command = ["-k", skip_options, "../../../../test/test_xpu.py", "-v"] 21 | res = pytest.main(test_command) 22 | 23 | output = sys.stdout.getvalue() 24 | sys.stdout = original_stdout 25 | 26 | cleaned_output = re.sub( 27 | r"\.\.(\/|\\)\.\.(\/|\\)\.\.(\/|\\)\.\.(\/|\\)test(\/|\\)", "", output 28 | ) 29 | print(cleaned_output, end="") 30 | sys.exit(res) 31 | -------------------------------------------------------------------------------- /test/xpu/run_test_with_skip.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from skip_list_common import skip_dict 5 | from xpu_test_utils import launch_test 6 | 7 | res = 0 8 | fail_test = [] 9 | 10 | for key in skip_dict: 11 | skip_list = skip_dict[key] 12 | fail = launch_test(key, skip_list) 13 | res += fail 14 | if fail: 15 | fail_test.append(key) 16 | if fail_test: 17 | print(",".join(fail_test) + " have failures") 18 | 19 | 20 | if os.name == "nt": 21 | sys.exit(res) 22 | else: 23 | exit_code = os.WEXITSTATUS(res) 24 | sys.exit(exit_code) 25 | -------------------------------------------------------------------------------- /test/xpu/run_test_with_skip_arc.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from skip_list_arc import skip_dict as skip_dict_specifical 5 | from skip_list_common import skip_dict 6 | from skip_list_win import skip_dict as skip_dict_win 7 | from skip_list_win_arc import skip_dict as skip_dict_win_arc 8 | from xpu_test_utils import launch_test 9 | 10 | res = 0 11 | IS_WINDOWS = sys.platform == "win32" 12 | 13 | for key in skip_dict: 14 | skip_list = skip_dict[key] 15 | if key in skip_dict_specifical: 16 | skip_list += skip_dict_specifical[key] 17 | if IS_WINDOWS and key in skip_dict_win: 18 | skip_list += skip_dict_win[key] 19 | if IS_WINDOWS and key in skip_dict_win_arc: 20 | skip_list += skip_dict_win_arc[key] 21 | res += launch_test(key, skip_list) 22 | 23 | if os.name == "nt": 24 | sys.exit(res) 25 | else: 26 | exit_code = os.WEXITSTATUS(res) 27 | sys.exit(exit_code) 28 | -------------------------------------------------------------------------------- /test/xpu/run_test_with_skip_bmg.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from skip_list_common import skip_dict 5 | from skip_list_win import skip_dict as skip_dict_win 6 | from skip_list_win_bmg import skip_dict as skip_dict_win_bmg 7 | from xpu_test_utils import launch_test 8 | 9 | res = 0 10 | IS_WINDOWS = sys.platform == "win32" 11 | 12 | for key in skip_dict: 13 | skip_list = skip_dict[key] 14 | if IS_WINDOWS and key in skip_dict_win: 15 | skip_list += skip_dict_win[key] 16 | if IS_WINDOWS and key in skip_dict_win_bmg: 17 | skip_list += skip_dict_win_bmg[key] 18 | res += launch_test(key, skip_list) 19 | 20 | if os.name == "nt": 21 | sys.exit(res) 22 | else: 23 | exit_code = os.WEXITSTATUS(res) 24 | sys.exit(exit_code) 25 | -------------------------------------------------------------------------------- /test/xpu/run_test_with_skip_lnl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from skip_list_common import skip_dict 5 | from skip_list_win import skip_dict as skip_dict_win 6 | from skip_list_win_lnl import skip_dict as skip_dict_win_lnl 7 | from xpu_test_utils import launch_test 8 | 9 | res = 0 10 | IS_WINDOWS = sys.platform == "win32" 11 | 12 | for key in skip_dict: 
13 | skip_list = skip_dict[key] 14 | if IS_WINDOWS and key in skip_dict_win: 15 | skip_list += skip_dict_win[key] 16 | if IS_WINDOWS and key in skip_dict_win_lnl: 17 | skip_list += skip_dict_win_lnl[key] 18 | res += launch_test(key, skip_list) 19 | 20 | if os.name == "nt": 21 | sys.exit(res) 22 | else: 23 | exit_code = os.WEXITSTATUS(res) 24 | sys.exit(exit_code) 25 | -------------------------------------------------------------------------------- /test/xpu/run_test_with_skip_mtl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from skip_list_common import skip_dict 5 | from skip_list_mtl import skip_dict as skip_dict_specifical 6 | from skip_list_win import skip_dict as skip_dict_win 7 | from xpu_test_utils import launch_test 8 | 9 | res = 0 10 | IS_WINDOWS = sys.platform == "win32" 11 | 12 | for key in skip_dict: 13 | skip_list = skip_dict[key] 14 | if key in skip_dict_specifical: 15 | skip_list += skip_dict_specifical[key] 16 | if IS_WINDOWS and key in skip_dict_win: 17 | skip_list += skip_dict_win[key] 18 | res += launch_test(key, skip_list) 19 | 20 | if os.name == "nt": 21 | sys.exit(res) 22 | else: 23 | exit_code = os.WEXITSTATUS(res) 24 | sys.exit(exit_code) 25 | -------------------------------------------------------------------------------- /test/xpu/skip_list_arc.py: -------------------------------------------------------------------------------- 1 | skip_dict = { 2 | "test_indexing_xpu.py": ("test_index_put_accumulate_large_tensor_xpu",), 3 | "test_nn_xpu.py": ("test_grid_sample_large_xpu",), 4 | "test_tensor_creation_ops_xpu.py": ( 5 | "test_float_to_int_conversion_finite_xpu_int64", 6 | ), 7 | } 8 | -------------------------------------------------------------------------------- /test/xpu/skip_list_mtl.py: -------------------------------------------------------------------------------- 1 | skip_dict = { 2 | "test_indexing_xpu.py": ("test_index_put_accumulate_large_tensor_xpu",), 3 | "test_nn_xpu.py": ("test_grid_sample_large_xpu",), 4 | "test_tensor_creation_ops_xpu.py": ( 5 | "test_float_to_int_conversion_finite_xpu_int64", 6 | ), 7 | } 8 | -------------------------------------------------------------------------------- /test/xpu/skip_list_win_mtl.py: -------------------------------------------------------------------------------- 1 | skip_dict = { 2 | # failed on MTL windows, skip first for Preci 3 | "test_xpu.py": ("test_mem_get_info_xpu",), 4 | } 5 | -------------------------------------------------------------------------------- /test/xpu/test_autograd_fallback_xpu.py: -------------------------------------------------------------------------------- 1 | # Owner(s): ["module: intel"] 2 | 3 | from torch.testing._internal.common_utils import ( 4 | instantiate_parametrized_tests, 5 | run_tests, 6 | ) 7 | 8 | try: 9 | from xpu_test_utils import XPUPatchForImport 10 | except Exception as e: 11 | from .xpu_test_utils import XPUPatchForImport 12 | 13 | with XPUPatchForImport(False): 14 | from test_autograd_fallback import TestAutogradFallback 15 | 16 | 17 | instantiate_parametrized_tests(TestAutogradFallback) 18 | 19 | 20 | if __name__ == "__main__": 21 | run_tests() 22 | -------------------------------------------------------------------------------- /test/xpu/test_comparison_utils_xpu.py: -------------------------------------------------------------------------------- 1 | # Owner(s): ["module: intel"] 2 | 3 | from torch.testing._internal.common_utils import run_tests 4 | 5 | try: 6 | from xpu_test_utils 
import XPUPatchForImport 7 | except Exception as e: 8 | from .xpu_test_utils import XPUPatchForImport 9 | 10 | with XPUPatchForImport(False): 11 | from test_comparison_utils import TestComparisonUtils # noqa: F401` 12 | 13 | 14 | if __name__ == "__main__": 15 | run_tests() 16 | -------------------------------------------------------------------------------- /test/xpu/test_complex_xpu.py: -------------------------------------------------------------------------------- 1 | # Owner(s): ["module: intel"] 2 | 3 | from torch.testing._internal.common_device_type import instantiate_device_type_tests 4 | from torch.testing._internal.common_utils import run_tests, TestCase 5 | 6 | try: 7 | from xpu_test_utils import XPUPatchForImport 8 | except Exception as e: 9 | from .xpu_test_utils import XPUPatchForImport 10 | 11 | with XPUPatchForImport(False): 12 | from test_complex import TestComplexTensor 13 | 14 | instantiate_device_type_tests( 15 | TestComplexTensor, globals(), only_for="xpu", allow_xpu=True 16 | ) 17 | 18 | 19 | if __name__ == "__main__": 20 | TestCase._default_dtype_check_enabled = True 21 | run_tests() 22 | -------------------------------------------------------------------------------- /test/xpu/test_content_store_xpu.py: -------------------------------------------------------------------------------- 1 | # Owner(s): ["module: intel"] 2 | 3 | from torch.testing._internal.common_device_type import instantiate_device_type_tests 4 | from torch.testing._internal.common_utils import run_tests 5 | 6 | try: 7 | from xpu_test_utils import XPUPatchForImport 8 | except Exception as e: 9 | from .xpu_test_utils import XPUPatchForImport 10 | 11 | with XPUPatchForImport(False): 12 | from test_content_store import TestContentStore 13 | 14 | 15 | instantiate_device_type_tests( 16 | TestContentStore, globals(), only_for="xpu", allow_xpu=True 17 | ) 18 | 19 | 20 | if __name__ == "__main__": 21 | run_tests() 22 | -------------------------------------------------------------------------------- /test/xpu/test_dynamic_shapes_xpu.py: -------------------------------------------------------------------------------- 1 | # Owner(s): ["module: intel"] 2 | 3 | from torch.testing._internal.common_utils import ( 4 | instantiate_parametrized_tests, 5 | run_tests, 6 | ) 7 | 8 | try: 9 | from xpu_test_utils import XPUPatchForImport 10 | except Exception as e: 11 | from .xpu_test_utils import XPUPatchForImport 12 | 13 | with XPUPatchForImport(False): 14 | from test_dynamic_shapes import TestSymNumberMagicMethods 15 | 16 | instantiate_parametrized_tests(TestSymNumberMagicMethods) 17 | 18 | 19 | if __name__ == "__main__": 20 | run_tests() 21 | -------------------------------------------------------------------------------- /test/xpu/test_masked_xpu.py: -------------------------------------------------------------------------------- 1 | # Owner(s): ["module: intel"] 2 | 3 | from torch.testing._internal.common_device_type import instantiate_device_type_tests 4 | from torch.testing._internal.common_utils import run_tests 5 | 6 | try: 7 | from xpu_test_utils import XPUPatchForImport 8 | except Exception as e: 9 | from .xpu_test_utils import XPUPatchForImport 10 | 11 | with XPUPatchForImport(False): 12 | from test_masked import TestMasked 13 | 14 | instantiate_device_type_tests(TestMasked, globals(), only_for="xpu", allow_xpu=True) 15 | if __name__ == "__main__": 16 | run_tests() 17 | -------------------------------------------------------------------------------- /test/xpu/test_maskedtensor_xpu.py: 
--------------------------------------------------------------------------------
# Owner(s): ["module: intel"]

from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_utils import (
    instantiate_parametrized_tests,
    run_tests,
)

try:
    from xpu_test_utils import XPUPatchForImport
except Exception:
    from .xpu_test_utils import XPUPatchForImport

with XPUPatchForImport(False):
    from test_maskedtensor import (
        TestBasics,
        TestBinary,
        TestOperators,
        TestReductions,
        TestUnary,
    )

instantiate_device_type_tests(TestBasics, globals(), only_for="xpu", allow_xpu=True)

instantiate_device_type_tests(
    TestOperators, globals(), only_for="xpu", allow_xpu=True
)
instantiate_parametrized_tests(TestUnary)
instantiate_parametrized_tests(TestBinary)
instantiate_parametrized_tests(TestReductions)

if __name__ == "__main__":
    run_tests()
--------------------------------------------------------------------------------
/test/xpu/test_namedtensor_xpu.py:
--------------------------------------------------------------------------------
# Owner(s): ["module: intel"]

from torch.testing._internal.common_utils import run_tests

try:
    from xpu_test_utils import XPUPatchForImport
except Exception:
    from .xpu_test_utils import XPUPatchForImport


# Upstream names these tests *_cuda; re-point their bodies at the XPU backend
# (a generic sketch of this attribute-swap idiom follows these listings).
def select_cuda(self):
    self._test_select("xpu")


def as_strided_cuda(self):
    self._test_as_strided("xpu")


with XPUPatchForImport(False):
    from test_namedtensor import TestNamedTensor

    TestNamedTensor.test_select_cuda = select_cuda
    TestNamedTensor.test_as_strided_cuda = as_strided_cuda

if __name__ == "__main__":
    run_tests()
--------------------------------------------------------------------------------
/test/xpu/test_native_functions_xpu.py:
--------------------------------------------------------------------------------
# Owner(s): ["module: intel"]

from torch.testing._internal.common_utils import run_tests

try:
    from xpu_test_utils import XPUPatchForImport
except Exception:
    from .xpu_test_utils import XPUPatchForImport

with XPUPatchForImport(False):
    from test_native_functions import TestNativeFunctions  # noqa: F401

if __name__ == "__main__":
    run_tests()
--------------------------------------------------------------------------------
/test/xpu/test_native_mha_xpu.py:
--------------------------------------------------------------------------------
# Owner(s): ["module: intel"]

from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_utils import run_tests

try:
    from xpu_test_utils import XPUPatchForImport
except Exception:
    from .xpu_test_utils import XPUPatchForImport

with XPUPatchForImport(False):
    from test_native_mha import TestMHADeviceType

instantiate_device_type_tests(
    TestMHADeviceType, globals(), only_for="xpu", allow_xpu=True
)

if __name__ == "__main__":
    run_tests()
--------------------------------------------------------------------------------
/test/xpu/test_ops_fwd_gradients_xpu.py:
--------------------------------------------------------------------------------
# Owner(s): ["module: intel"]

from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_utils import run_tests, TestCase

try:
    from xpu_test_utils import XPUPatchForImport
except Exception:
    from .xpu_test_utils import XPUPatchForImport

with XPUPatchForImport(False):
    from test_ops_fwd_gradients import TestFwdGradients

TestFwdGradients._default_dtype_check_enabled = True
instantiate_device_type_tests(
    TestFwdGradients, globals(), only_for="xpu", allow_xpu=True
)

if __name__ == "__main__":
    TestCase._default_dtype_check_enabled = True
    run_tests()
--------------------------------------------------------------------------------
/test/xpu/test_ops_gradients_xpu.py:
--------------------------------------------------------------------------------
# Owner(s): ["module: intel"]

from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_utils import run_tests

try:
    from xpu_test_utils import XPUPatchForImport
except Exception:
    from .xpu_test_utils import XPUPatchForImport

with XPUPatchForImport(False):
    from test_ops_gradients import TestBwdGradients

instantiate_device_type_tests(
    TestBwdGradients, globals(), only_for="xpu", allow_xpu=True
)

if __name__ == "__main__":
    run_tests()
--------------------------------------------------------------------------------
/test/xpu/test_ops_xpu.py:
--------------------------------------------------------------------------------
# Owner(s): ["module: intel"]


from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_utils import run_tests

try:
    from xpu_test_utils import XPUPatchForImport
except Exception:
    from .xpu_test_utils import XPUPatchForImport

with XPUPatchForImport(False):
    from test_ops import TestCommon, TestMathBits

instantiate_device_type_tests(TestCommon, globals(), only_for="xpu", allow_xpu=True)
instantiate_device_type_tests(TestMathBits, globals(), only_for="xpu", allow_xpu=True)
# run in fine-grained testing
# instantiate_device_type_tests(TestCompositeCompliance, globals(), only_for="xpu", allow_xpu=True)
# CPU-only
# instantiate_device_type_tests(TestRefsOpsInfo, globals(), only_for="xpu", allow_xpu=True)
# not important
# instantiate_device_type_tests(TestFakeTensor, globals(), only_for="xpu", allow_xpu=True)
# instantiate_device_type_tests(TestTags, globals(), only_for="xpu", allow_xpu=True)


if __name__ == "__main__":
    run_tests()
--------------------------------------------------------------------------------
/test/xpu/test_reductions_xpu.py:
--------------------------------------------------------------------------------
# Owner(s): ["module: intel"]

from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_utils import run_tests

try:
    from xpu_test_utils import XPUPatchForImport
except Exception:
    from .xpu_test_utils import XPUPatchForImport

with XPUPatchForImport(False):
    from test_reductions import TestReductions


instantiate_device_type_tests(TestReductions, globals(), only_for="xpu", allow_xpu=True)


if __name__ == "__main__":
    run_tests()
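Every wrapper in these listings follows the same shape: import the upstream PyTorch suite inside `XPUPatchForImport`, optionally re-point CUDA-named test bodies at XPU (as `test_namedtensor_xpu.py` does above), then instantiate the suite for the `xpu` device. Below is a minimal, self-contained sketch of that attribute-swap idiom only; `DummySuite`, `_run_on`, and `run_on_xpu` are hypothetical names invented for illustration and do not exist in torch-xpu-ops or PyTorch.

```python
# Hedged illustration of the wrapper pattern used by the test files above.
# All names here are stand-ins, not torch-xpu-ops APIs.
import unittest


class DummySuite(unittest.TestCase):
    def _run_on(self, device: str) -> None:
        # Stand-in for an upstream helper parameterized by a device string.
        self.assertIn(device, ("cpu", "cuda", "xpu"))

    def test_run_on_cuda(self) -> None:
        self._run_on("cuda")


def run_on_xpu(self) -> None:
    # Same body as the upstream CUDA test, but targeting XPU.
    self._run_on("xpu")


# Swap the function in before the suite is collected, mirroring
# "TestNamedTensor.test_select_cuda = select_cuda" above.
DummySuite.test_run_on_cuda = run_on_xpu

if __name__ == "__main__":
    unittest.main()
```

Keeping the upstream test name (`test_run_on_cuda`) while swapping its body means existing skip lists and CI filters that match on test ids keep working, which appears to be why the wrappers above patch attributes instead of subclassing.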
--------------------------------------------------------------------------------
/test/xpu/test_scatter_gather_ops_xpu.py:
--------------------------------------------------------------------------------
# Owner(s): ["module: intel"]

from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_utils import run_tests

try:
    from xpu_test_utils import XPUPatchForImport
except Exception:
    from .xpu_test_utils import XPUPatchForImport

with XPUPatchForImport(False):
    from test_scatter_gather_ops import TestScatterGather


instantiate_device_type_tests(
    TestScatterGather, globals(), only_for="xpu", allow_xpu=True
)


if __name__ == "__main__":
    run_tests()
--------------------------------------------------------------------------------
/test/xpu/test_segment_reductions_xpu.py:
--------------------------------------------------------------------------------
# Owner(s): ["module: intel"]


from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_utils import run_tests

try:
    from xpu_test_utils import XPUPatchForImport
except Exception:
    from .xpu_test_utils import XPUPatchForImport

with XPUPatchForImport(False):
    from test_segment_reductions import TestSegmentReductions

instantiate_device_type_tests(
    TestSegmentReductions, globals(), only_for="xpu", allow_xpu=True
)


if __name__ == "__main__":
    run_tests()
--------------------------------------------------------------------------------
/test/xpu/test_shape_ops_xpu.py:
--------------------------------------------------------------------------------
# Owner(s): ["module: intel"]

from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_utils import run_tests

try:
    from xpu_test_utils import XPUPatchForImport
except Exception:
    from .xpu_test_utils import XPUPatchForImport

with XPUPatchForImport(False):
    from test_shape_ops import TestShapeOps

instantiate_device_type_tests(TestShapeOps, globals(), only_for="xpu", allow_xpu=True)


if __name__ == "__main__":
    run_tests()
--------------------------------------------------------------------------------
/tools/linter/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/torch-xpu-ops/5b4d7444484576f721d2295761cf8fafa924ef36/tools/linter/__init__.py
--------------------------------------------------------------------------------
/tools/linter/adapters/README.md:
--------------------------------------------------------------------------------
# lintrunner adapters

These files adapt our various linters to work with `lintrunner`.

## Adding a new linter
1. Provide both the `init` and `linter` entry points.
2. Accept the `{{DRYRUN}}` and `{{PATHSFILE}}` placeholder arguments.
3. Never exit uncleanly; report failures as lint messages instead.
4. Follow the communication protocol: emit lint messages on stdout.
5. Keep the adapter self-contained.

A sketch of a minimal adapter following these rules appears after this listing.
--------------------------------------------------------------------------------
/tools/linter/clang_tidy/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/torch-xpu-ops/5b4d7444484576f721d2295761cf8fafa924ef36/tools/linter/clang_tidy/__init__.py
--------------------------------------------------------------------------------
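To make the checklist in `tools/linter/adapters/README.md` concrete, here is a sketch of a minimal adapter. It assumes lintrunner's JSON-lines protocol as used in upstream PyTorch (one lint-message object per stdout line, with fields such as `path`, `line`, `code`, `severity`, `name`, and `description`); the file name `minimal_linter.py`, the trailing-whitespace check, and the argument handling are illustrative assumptions, not an adapter that ships in this repo.

```python
# minimal_linter.py -- hypothetical lintrunner adapter sketch (not part of this repo).
import argparse
import json
import sys


def lint_file(path: str):
    """Yield one lint message per line with trailing whitespace (a stand-in check)."""
    with open(path, encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            if line.rstrip("\n") != line.rstrip():
                yield {
                    "path": path,
                    "line": lineno,
                    "char": None,
                    "code": "MINIMAL",
                    "severity": "warning",
                    "name": "trailing-whitespace",
                    "original": None,
                    "replacement": None,
                    "description": "Trailing whitespace found.",
                }


def main() -> None:
    parser = argparse.ArgumentParser(description="toy lintrunner adapter")
    # lintrunner would normally expand {{PATHSFILE}} into these paths.
    parser.add_argument("filenames", nargs="*", help="files to lint")
    args = parser.parse_args()

    for path in args.filenames:
        try:
            for message in lint_file(path):
                # Communication protocol: one JSON object per stdout line.
                print(json.dumps(message), flush=True)
        except OSError as exc:
            # Never exit uncleanly: surface I/O failures as lint messages too.
            print(
                json.dumps(
                    {
                        "path": path,
                        "line": None,
                        "char": None,
                        "code": "MINIMAL",
                        "severity": "error",
                        "name": "command-failed",
                        "original": None,
                        "replacement": None,
                        "description": str(exc),
                    }
                ),
                flush=True,
            )
    sys.exit(0)


if __name__ == "__main__":
    main()
```

Invoked by hand as `python minimal_linter.py some_file.py`, it prints nothing for clean files and one JSON message per finding otherwise, and it always exits 0 so that the harness, not the adapter, decides how findings are treated.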