├── .clang-format ├── .clang-tidy ├── .cmakelintrc ├── .flake8 ├── .github ├── ISSUE_TEMPLATE │ ├── bug-report.yml │ ├── documentation.yml │ └── feature-request.yml ├── actions │ ├── inductor-xpu-e2e-test │ │ └── action.yml │ ├── print-environment │ │ └── action.yml │ └── pt2e │ │ └── action.yml ├── ci_commit_pins │ ├── torchbench.txt │ └── triton.txt ├── ci_expected_accuracy │ ├── check_expected.py │ ├── inductor_huggingface_inference.csv │ ├── inductor_huggingface_training.csv │ ├── inductor_timm_models_inference.csv │ ├── inductor_timm_models_training.csv │ ├── inductor_torchbench_inference.csv │ └── inductor_torchbench_training.csv ├── scripts │ ├── apply_torch_pr.py │ ├── build.sh │ ├── calculate_best_perf.py │ ├── check-transformers.py │ ├── check-ut.py │ ├── e2e_summary.sh │ ├── env.sh │ ├── inductor_summary.py │ ├── inductor_xpu_test.sh │ ├── install_xpu.bat │ ├── lintrunner.sh │ ├── microbench_summary.sh │ ├── parse-junitxml.py │ ├── perf_comparison.py │ ├── rpath.sh │ ├── spec.py │ ├── summary_pt2e.py │ └── ut_result_check.sh └── workflows │ ├── _linux_accelerate.yml │ ├── _linux_build.yml │ ├── _linux_op_benchmark.yml │ ├── _linux_transformers.yml │ ├── _linux_ut.yml │ ├── _performance_comparison.yml │ ├── _windows_ut.yml │ ├── nightly_ondemand.yml │ ├── nightly_ondemand_rolling.yml │ ├── nightly_ondemand_whl.yml │ └── pull.yml ├── .gitignore ├── .lintrunner.toml ├── CMakeLists.txt ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── SECURITY.md ├── cmake ├── BuildFlags.cmake ├── ClangFormat.cmake ├── Codegen.cmake ├── Modules │ ├── FindONEMKL.cmake │ ├── FindSYCL.cmake │ ├── FindSYCL │ │ ├── make2cmake.cmake │ │ └── run_sycl.cmake │ ├── FindSYCLToolkit.cmake │ └── FindXCCL.cmake ├── ONEMKL.cmake ├── SYCL.cmake └── XCCL.cmake ├── docs └── torch_xpu_ops.jpg ├── mypy-strict.ini ├── mypy.ini ├── pyproject.toml ├── src ├── ATen │ ├── CMakeLists.txt │ ├── native │ │ ├── nested │ │ │ ├── NestedTensorTransformerFunctions.cpp │ │ │ └── xpu │ │ │ │ ├── NestedTensorTransformerFunctions.cpp │ │ │ │ └── sycl │ │ │ │ ├── NestedTensorTransformerFunctionKernels.cpp │ │ │ │ └── NestedTensorTransformerFunctionKernels.h │ │ ├── quantized │ │ │ ├── AffineQuantizer.cpp │ │ │ ├── FakeQuantizeCore.cpp │ │ │ ├── FusedObsFakeQuant.cpp │ │ │ ├── MakePerTensorQuantizedTensor.cpp │ │ │ ├── QuantizedMaxPool2d.cpp │ │ │ └── sycl │ │ │ │ ├── AffineQuantizerKernels.cpp │ │ │ │ ├── AffineQuantizerKernels.h │ │ │ │ ├── FakeQuantizeCoreKernels.cpp │ │ │ │ ├── FakeQuantizeCoreKernels.h │ │ │ │ ├── FusedObsFakeQuantKernels.cpp │ │ │ │ ├── FusedObsFakeQuantKernels.h │ │ │ │ ├── MakePerTensorQuantizedTensorKernels.cpp │ │ │ │ ├── MakePerTensorQuantizedTensorKernels.h │ │ │ │ ├── QuantizedMaxPool2d.cpp │ │ │ │ └── QuantizedMaxPool2d.h │ │ ├── sparse │ │ │ └── xpu │ │ │ │ ├── SparseBinaryOpIntersection.cpp │ │ │ │ ├── SparseCsrTensorMath.cpp │ │ │ │ ├── SparseSoftmax.cpp │ │ │ │ ├── SparseTensor.cpp │ │ │ │ ├── SparseTensorMath.cpp │ │ │ │ └── sycl │ │ │ │ ├── SparseBinaryOpIntersectionKernels.cpp │ │ │ │ ├── SparseBinaryOpIntersectionKernels.h │ │ │ │ ├── SparseCsrTensorMathKernels.cpp │ │ │ │ ├── SparseCsrTensorMathKernels.h │ │ │ │ ├── SparseSoftmaxKernels.cpp │ │ │ │ ├── SparseSoftmaxKernels.h │ │ │ │ ├── SparseTensorKernels.cpp │ │ │ │ ├── SparseTensorKernels.h │ │ │ │ ├── SparseTensorMathKernels.cpp │ │ │ │ └── SparseTensorMathKernels.h │ │ ├── transformers │ │ │ ├── Attention.cpp │ │ │ ├── SDPUtils.cpp │ │ │ ├── SDPUtils.h │ │ │ └── sycl │ │ │ │ ├── AttentionKernels.cpp │ │ │ │ └── 
AttentionKernels.h │ │ └── xpu │ │ │ ├── Activation.cpp │ │ │ ├── AdaptiveAveragePooling2d.cpp │ │ │ ├── AdaptiveAveragePooling3d.cpp │ │ │ ├── AdaptiveMaxPooling2d.cpp │ │ │ ├── AdaptiveMaxPooling3d.cpp │ │ │ ├── AiryAi.cpp │ │ │ ├── AmpKernels.cpp │ │ │ ├── AveragePool2d.cpp │ │ │ ├── AveragePool3d.cpp │ │ │ ├── BatchLinearAlgebra.cpp │ │ │ ├── BatchNorm.cpp │ │ │ ├── Bessel.cpp │ │ │ ├── BinaryOps.cpp │ │ │ ├── Bucketization.cpp │ │ │ ├── Col2Im.cpp │ │ │ ├── CompareOps.cpp │ │ │ ├── Copy.cpp │ │ │ ├── Cross.cpp │ │ │ ├── DeformConv2d.cpp │ │ │ ├── DepthwiseConv2d.cpp │ │ │ ├── DepthwiseConv3d.cpp │ │ │ ├── DilatedMaxPool2d.cpp │ │ │ ├── DilatedMaxPool3d.cpp │ │ │ ├── Distance.cpp │ │ │ ├── Distributions.cpp │ │ │ ├── Dropout.cpp │ │ │ ├── Embedding.cpp │ │ │ ├── EmbeddingBag.cpp │ │ │ ├── Equal.cpp │ │ │ ├── Fill.cpp │ │ │ ├── ForeachOpList.cpp │ │ │ ├── ForeachOpScalar.cpp │ │ │ ├── ForeachOpScalarList.cpp │ │ │ ├── ForeachOpScalarTensor.cpp │ │ │ ├── ForeachReduceOp.cpp │ │ │ ├── ForeachUnaryOp.cpp │ │ │ ├── FractionalMaxPool2d.cpp │ │ │ ├── FractionalMaxPool3d.cpp │ │ │ ├── FunctionOfAMatrixUtils.cpp │ │ │ ├── FusedAdam.cpp │ │ │ ├── FusedAdamW.cpp │ │ │ ├── FusedSgd.cpp │ │ │ ├── GatedLinearUnit.cpp │ │ │ ├── GridSampler.cpp │ │ │ ├── GroupNorm.cpp │ │ │ ├── Histogram.cpp │ │ │ ├── Im2Col.cpp │ │ │ ├── Indexing.cpp │ │ │ ├── LayerNorm.cpp │ │ │ ├── Lerp.cpp │ │ │ ├── LinearAlgebra.cpp │ │ │ ├── LinearInt4.cpp │ │ │ ├── Loss.cpp │ │ │ ├── LossCTC.cpp │ │ │ ├── LossMultiLabelMargin.cpp │ │ │ ├── LossMultiMargin.cpp │ │ │ ├── LossNLL.cpp │ │ │ ├── LossNLL2d.cpp │ │ │ ├── MaxUnpooling.cpp │ │ │ ├── NMS.cpp │ │ │ ├── Nonzero.cpp │ │ │ ├── Normalization.cpp │ │ │ ├── PinnedMemoryAllocator.cpp │ │ │ ├── PointwiseOps.cpp │ │ │ ├── Pow.cpp │ │ │ ├── PsRoiAlign.cpp │ │ │ ├── PsRoiPool.cpp │ │ │ ├── RNN.cpp │ │ │ ├── RangeFactories.cpp │ │ │ ├── RecordStream.cpp │ │ │ ├── ReduceAllOps.cpp │ │ │ ├── ReduceOps.cpp │ │ │ ├── ReflectionPad.cpp │ │ │ ├── Repeat.cpp │ │ │ ├── ReplicationPadding.cpp │ │ │ ├── Resize.cpp │ │ │ ├── RoiAlign.cpp │ │ │ ├── RoiPool.cpp │ │ │ ├── RreluWithNoise.cpp │ │ │ ├── ScanKernels.cpp │ │ │ ├── ScanKernels.h │ │ │ ├── SegmentReduce.cpp │ │ │ ├── SoftMax.cpp │ │ │ ├── Sorting.cpp │ │ │ ├── SpectralOps.cpp │ │ │ ├── SummaryOps.cpp │ │ │ ├── TensorAdvancedIndexing.cpp │ │ │ ├── TensorCompare.cpp │ │ │ ├── TensorFactories.cpp │ │ │ ├── TensorProperties.cpp │ │ │ ├── TensorShape.cpp │ │ │ ├── TensorTopK.cpp │ │ │ ├── TensorTransformations.cpp │ │ │ ├── TriangluarOps.cpp │ │ │ ├── UnaryOps.cpp │ │ │ ├── UnfoldBackward.cpp │ │ │ ├── Unique.cpp │ │ │ ├── UpSample.h │ │ │ ├── UpSampleBicubic2d.cpp │ │ │ ├── UpSampleBilinear2d.cpp │ │ │ ├── UpSampleLinear1d.cpp │ │ │ ├── UpSampleNearest1d.cpp │ │ │ ├── UpSampleNearest2d.cpp │ │ │ ├── UpSampleNearest3d.cpp │ │ │ ├── UpSampleTrilinear3d.cpp │ │ │ ├── WeightInt4Pack.cpp │ │ │ ├── WeightNorm.cpp │ │ │ ├── XPUFallback.template │ │ │ ├── XPUScalar.cpp │ │ │ ├── mkl │ │ │ ├── BatchLinearAlgebra.cpp │ │ │ ├── BatchLinearAlgebra.h │ │ │ ├── SpectralOps.cpp │ │ │ └── SpectralOps.h │ │ │ └── sycl │ │ │ ├── AbsKernel.cpp │ │ │ ├── AbsKernel.h │ │ │ ├── ActivationEluKernels.cpp │ │ │ ├── ActivationEluKernels.h │ │ │ ├── ActivationGeluKernel.cpp │ │ │ ├── ActivationGeluKernel.h │ │ │ ├── ActivationGluKernels.cpp │ │ │ ├── ActivationGluKernels.h │ │ │ ├── ActivationHardshrinkKernels.cpp │ │ │ ├── ActivationHardshrinkKernels.h │ │ │ ├── ActivationHardsigmoidKernels.cpp │ │ │ ├── ActivationHardsigmoidKernels.h │ │ │ ├── 
ActivationHardswishKernels.cpp │ │ │ ├── ActivationHardswishKernels.h │ │ │ ├── ActivationHardtanhKernels.cpp │ │ │ ├── ActivationHardtanhKernels.h │ │ │ ├── ActivationLeakyReluKernels.cpp │ │ │ ├── ActivationLeakyReluKernels.h │ │ │ ├── ActivationLogSigmoidKernels.cpp │ │ │ ├── ActivationLogSigmoidKernels.h │ │ │ ├── ActivationMishKernels.cpp │ │ │ ├── ActivationMishKernels.h │ │ │ ├── ActivationPreluKernels.cpp │ │ │ ├── ActivationPreluKernels.h │ │ │ ├── ActivationSiluKernels.cpp │ │ │ ├── ActivationSiluKernels.h │ │ │ ├── ActivationSoftplusKernels.cpp │ │ │ ├── ActivationSoftplusKernels.h │ │ │ ├── ActivationSoftshrinkKernels.cpp │ │ │ ├── ActivationSoftshrinkKernels.h │ │ │ ├── ActivationThresholdKernel.cpp │ │ │ ├── ActivationThresholdKernel.h │ │ │ ├── AdaptiveAveragePooling2dKernels.cpp │ │ │ ├── AdaptiveAveragePooling2dKernels.h │ │ │ ├── AdaptiveAveragePooling3dKernels.cpp │ │ │ ├── AdaptiveAveragePooling3dKernels.h │ │ │ ├── AdaptiveMaxPooling2dKernels.cpp │ │ │ ├── AdaptiveMaxPooling2dKernels.h │ │ │ ├── AdaptiveMaxPooling3dKernels.cpp │ │ │ ├── AdaptiveMaxPooling3dKernels.h │ │ │ ├── AiryAiKernel.cpp │ │ │ ├── AiryAiKernel.h │ │ │ ├── AmpKernels.cpp │ │ │ ├── AmpKernels.h │ │ │ ├── Atomics.h │ │ │ ├── AveragePool2dKernels.cpp │ │ │ ├── AveragePool2dKernels.h │ │ │ ├── AveragePool3dKernels.cpp │ │ │ ├── AveragePool3dKernels.h │ │ │ ├── BatchKernel.h │ │ │ ├── BatchNormKernels.cpp │ │ │ ├── BatchNormKernels.h │ │ │ ├── BesselJ0Kernel.cpp │ │ │ ├── BesselJ0Kernel.h │ │ │ ├── BesselJ1Kernel.cpp │ │ │ ├── BesselJ1Kernel.h │ │ │ ├── BesselY0Kernel.cpp │ │ │ ├── BesselY0Kernel.h │ │ │ ├── BesselY1Kernel.cpp │ │ │ ├── BesselY1Kernel.h │ │ │ ├── BinaryBitwiseOpsKernels.cpp │ │ │ ├── BinaryBitwiseOpsKernels.h │ │ │ ├── BinaryDivFloorKernel.cpp │ │ │ ├── BinaryDivTrueKernel.cpp │ │ │ ├── BinaryDivTruncKernel.cpp │ │ │ ├── BinaryGeometricKernels.cpp │ │ │ ├── BinaryGeometricKernels.h │ │ │ ├── BinaryInternal.h │ │ │ ├── BinaryKernels.cpp │ │ │ ├── BinaryKernels.h │ │ │ ├── BinaryLogicalOpsKernels.cpp │ │ │ ├── BinaryLogicalOpsKernels.h │ │ │ ├── BinaryMiscBackwardOpsKernels.cpp │ │ │ ├── BinaryMiscBackwardOpsKernels.h │ │ │ ├── BinaryMiscOpsKernels.cpp │ │ │ ├── BinaryMiscOpsKernels.h │ │ │ ├── BinaryRemainderKernel.cpp │ │ │ ├── BinaryRemainderKernel.h │ │ │ ├── BinaryShiftOpsKernels.cpp │ │ │ ├── BinaryShiftOpsKernels.h │ │ │ ├── BucketizationKernels.cpp │ │ │ ├── BucketizationKernels.h │ │ │ ├── ChebyshevPolynomialKernels.h │ │ │ ├── ChebyshevPolynomialTKernel.cpp │ │ │ ├── ChebyshevPolynomialUKernel.cpp │ │ │ ├── ChebyshevPolynomialVKernel.cpp │ │ │ ├── ChebyshevPolynomialWKernel.cpp │ │ │ ├── Col2ImKernel.cpp │ │ │ ├── Col2ImKernel.h │ │ │ ├── CompareKernels.cpp │ │ │ ├── CompareKernels.h │ │ │ ├── ComplexKernels.cpp │ │ │ ├── ComplexKernels.h │ │ │ ├── CopyKernel.cpp │ │ │ ├── CopyKernel.h │ │ │ ├── CopysignKernel.cpp │ │ │ ├── CopysignKernel.h │ │ │ ├── CrossKernel.cpp │ │ │ ├── CrossKernel.h │ │ │ ├── CumminmaxKernel.cpp │ │ │ ├── CumprodKernel.cpp │ │ │ ├── CumsumKernel.cpp │ │ │ ├── DeformConv2dKernels.cpp │ │ │ ├── DeformConv2dKernels.h │ │ │ ├── DepthwiseConv2dKernels.cpp │ │ │ ├── DepthwiseConv2dKernels.h │ │ │ ├── DepthwiseConv3dKernels.cpp │ │ │ ├── DepthwiseConv3dKernels.h │ │ │ ├── Dequant_int4.cpp │ │ │ ├── Dequant_int4.h │ │ │ ├── DilatedMaxPool2d.cpp │ │ │ ├── DilatedMaxPool2d.h │ │ │ ├── DilatedMaxPool3d.cpp │ │ │ ├── DilatedMaxPool3d.h │ │ │ ├── DistanceKernels.cpp │ │ │ ├── DistanceKernels.h │ │ │ ├── DistributionBernoulli.cpp │ │ │ ├── DistributionCauchyKernel.cpp 
│ │ │ ├── DistributionExponentialKernel.cpp │ │ │ ├── DistributionGeometricKernel.cpp │ │ │ ├── DistributionKernels.h │ │ │ ├── DistributionLogNormalKernel.cpp │ │ │ ├── DistributionNormal.cpp │ │ │ ├── DistributionRandomKernel.cpp │ │ │ ├── DistributionTemplates.h │ │ │ ├── DistributionUniform.cpp │ │ │ ├── Distributions.cpp │ │ │ ├── Distributions.h │ │ │ ├── Dropout.cpp │ │ │ ├── DropoutKernels.h │ │ │ ├── ElementwiseInvoke.h │ │ │ ├── Embedding.cpp │ │ │ ├── EmbeddingBackwardKernel.h │ │ │ ├── EmbeddingBag.cpp │ │ │ ├── EmbeddingBag.h │ │ │ ├── EmbeddingBagKernels.h │ │ │ ├── EmbeddingKernels.h │ │ │ ├── FFTKernelFunctor.cpp │ │ │ ├── FFTKernelFunctor.h │ │ │ ├── FillKernel.cpp │ │ │ ├── FillKernel.h │ │ │ ├── ForeachBinaryOpListKernels.cpp │ │ │ ├── ForeachBinaryOpListKernels.h │ │ │ ├── ForeachBinaryOpScalarKernels.cpp │ │ │ ├── ForeachBinaryOpScalarKernels.h │ │ │ ├── ForeachBinaryOpScalarListKernels.cpp │ │ │ ├── ForeachBinaryOpScalarListKernels.h │ │ │ ├── ForeachBinaryOpScalarTensorKernels.cpp │ │ │ ├── ForeachBinaryOpScalarTensorKernels.h │ │ │ ├── ForeachCopyKernels.cpp │ │ │ ├── ForeachCopyKernels.h │ │ │ ├── ForeachFunctors.h │ │ │ ├── ForeachPointwiseKernels.cpp │ │ │ ├── ForeachPointwiseOpListKernels.h │ │ │ ├── ForeachPointwiseOpScalarKernels.h │ │ │ ├── ForeachPointwiseOpScalarListKernels.h │ │ │ ├── ForeachReduceKernels.cpp │ │ │ ├── ForeachReduceKernels.h │ │ │ ├── ForeachTernaryKernels.cpp │ │ │ ├── ForeachTernaryOpListKernels.h │ │ │ ├── ForeachTernaryOpScalarKernels.h │ │ │ ├── ForeachTernaryOpScalarListKernels.h │ │ │ ├── ForeachUnaryKernels.cpp │ │ │ ├── ForeachUnaryKernels.h │ │ │ ├── FractionalMaxPool2dKernels.cpp │ │ │ ├── FractionalMaxPool2dKernels.h │ │ │ ├── FractionalMaxPool3dKernels.cpp │ │ │ ├── FractionalMaxPool3dKernels.h │ │ │ ├── FunctionOfAMatrixUtilsKernels.cpp │ │ │ ├── FunctionOfAMatrixUtilsKernels.h │ │ │ ├── FusedAdamAmsgradKernels.cpp │ │ │ ├── FusedAdamKernels.cpp │ │ │ ├── FusedAdamKernels.h │ │ │ ├── FusedAdamUtils.h │ │ │ ├── FusedAdamWAmsgradKernels.cpp │ │ │ ├── FusedAdamWKernels.cpp │ │ │ ├── FusedAdamWKernels.h │ │ │ ├── FusedSgdKernels.cpp │ │ │ ├── FusedSgdKernels.h │ │ │ ├── GcdLcmKernels.cpp │ │ │ ├── GcdLcmKernels.h │ │ │ ├── GridSampler.cpp │ │ │ ├── GridSampler.h │ │ │ ├── GridSamplerKernels.h │ │ │ ├── GroupNormKernels.cpp │ │ │ ├── GroupNormKernels.h │ │ │ ├── GroupReduceUtils.h │ │ │ ├── HermitePolynomialHKernel.cpp │ │ │ ├── HermitePolynomialHKernel.h │ │ │ ├── HermitePolynomialHeKernel.cpp │ │ │ ├── HermitePolynomialHeKernel.h │ │ │ ├── HistogramKernels.h │ │ │ ├── HistogramddKernels.cpp │ │ │ ├── IGammaKernel.cpp │ │ │ ├── IGammaKernel.h │ │ │ ├── Im2ColKernel.cpp │ │ │ ├── Im2ColKernel.h │ │ │ ├── IndexKernelUtils.h │ │ │ ├── IndexUtils.h │ │ │ ├── Indexing.cpp │ │ │ ├── Indexing.h │ │ │ ├── IndexingKernels.h │ │ │ ├── IndexingUtils.h │ │ │ ├── IntegerDivider.h │ │ │ ├── KernelUtils.h │ │ │ ├── LaguerrePolynomialLKernel.cpp │ │ │ ├── LaguerrePolynomialLKernel.h │ │ │ ├── LaunchUtils.h │ │ │ ├── LayerNormKernels.cpp │ │ │ ├── LayerNormKernels.h │ │ │ ├── LegendrePolynomialPKernel.cpp │ │ │ ├── LegendrePolynomialPKernel.h │ │ │ ├── LerpKernels.cpp │ │ │ ├── LerpKernels.h │ │ │ ├── LinearAlgebraKernels.cpp │ │ │ ├── LinearAlgebraKernels.h │ │ │ ├── LinearInt4.cpp │ │ │ ├── LinearInt4.h │ │ │ ├── LogAddExpKernels.cpp │ │ │ ├── LogAddExpKernels.h │ │ │ ├── LogcumsumexpKernel.cpp │ │ │ ├── Loops.h │ │ │ ├── LossCTCKernels.cpp │ │ │ ├── LossCTCKernels.h │ │ │ ├── LossKernels.cpp │ │ │ ├── LossKernels.h │ │ │ ├── 
LossNLL2dKernels.cpp │ │ │ ├── LossNLL2dKernels.h │ │ │ ├── LossNLLKernel.cpp │ │ │ ├── LossNLLKernel.h │ │ │ ├── MathExtensions.h │ │ │ ├── MaxMinElementwiseKernels.cpp │ │ │ ├── MaxMinElementwiseKernels.h │ │ │ ├── MaxUnpoolingKernels.cpp │ │ │ ├── MaxUnpoolingKernels.h │ │ │ ├── MemoryAccess.h │ │ │ ├── MemoryAccessUtils.h │ │ │ ├── ModifiedBesselI0Kernel.cpp │ │ │ ├── ModifiedBesselI0Kernel.h │ │ │ ├── ModifiedBesselI1Kernel.cpp │ │ │ ├── ModifiedBesselI1Kernel.h │ │ │ ├── ModifiedBesselK0Kernel.cpp │ │ │ ├── ModifiedBesselK0Kernel.h │ │ │ ├── ModifiedBesselK1Kernel.cpp │ │ │ ├── ModifiedBesselK1Kernel.h │ │ │ ├── MultiLabelMarginLossKernels.cpp │ │ │ ├── MultiLabelMarginLossKernels.h │ │ │ ├── MultiMarginLossKernels.cpp │ │ │ ├── MultiMarginLossKernels.h │ │ │ ├── MultiTensorApply.h │ │ │ ├── MultinomialKernel.cpp │ │ │ ├── MultinomialKernel.h │ │ │ ├── NMSKernel.cpp │ │ │ ├── NMSKernel.h │ │ │ ├── NonzeroKernel.cpp │ │ │ ├── NonzeroKernel.h │ │ │ ├── Norm.h │ │ │ ├── NumericLimits.h │ │ │ ├── OffsetCalculator.h │ │ │ ├── Philox4x32.h │ │ │ ├── PointwiseOpsKernels.cpp │ │ │ ├── PointwiseOpsKernels.h │ │ │ ├── Pow.h │ │ │ ├── PowKernels.cpp │ │ │ ├── PowKernels.h │ │ │ ├── PsRoiAlignKernels.cpp │ │ │ ├── PsRoiAlignKernels.h │ │ │ ├── PsRoiPoolKernels.cpp │ │ │ ├── PsRoiPoolKernels.h │ │ │ ├── RNNKernels.cpp │ │ │ ├── RNNKernels.h │ │ │ ├── RandpermKernel.cpp │ │ │ ├── RandpermKernel.h │ │ │ ├── RangeFactoriesKernel.cpp │ │ │ ├── RangeFactoriesKernel.h │ │ │ ├── Reduce.h │ │ │ ├── ReduceAMinMaxKernel.cpp │ │ │ ├── ReduceArgMaxKernel.cpp │ │ │ ├── ReduceArgMinKernel.cpp │ │ │ ├── ReduceLogicKernels.cpp │ │ │ ├── ReduceMaxValuesKernels.cpp │ │ │ ├── ReduceMaxValuesKernels.h │ │ │ ├── ReduceMinValuesKernels.cpp │ │ │ ├── ReduceMinValuesKernels.h │ │ │ ├── ReduceMomentKernels.cpp │ │ │ ├── ReduceNormKernel.cpp │ │ │ ├── ReduceNormKernel.h │ │ │ ├── ReduceOps.h │ │ │ ├── ReduceOpsKernels.h │ │ │ ├── ReduceSumProdKernels.cpp │ │ │ ├── ReflectionPadKernels.cpp │ │ │ ├── ReflectionPadKernels.h │ │ │ ├── RenormKernel.cpp │ │ │ ├── RenormKernel.h │ │ │ ├── RepeatKernel.cpp │ │ │ ├── RepeatKernel.h │ │ │ ├── ReplicationPaddingKernels.cpp │ │ │ ├── ReplicationPaddingKernels.h │ │ │ ├── ResizeKernel.cpp │ │ │ ├── ResizeKernel.h │ │ │ ├── RoiAlignKernels.cpp │ │ │ ├── RoiAlignKernels.h │ │ │ ├── RoiPoolKernels.cpp │ │ │ ├── RoiPoolKernels.h │ │ │ ├── RreluWithNoiseKernels.cpp │ │ │ ├── RreluWithNoiseKernels.h │ │ │ ├── SYCLGroupAlgorithm.h │ │ │ ├── ScaledModifiedBesselK0Kernel.cpp │ │ │ ├── ScaledModifiedBesselK0Kernel.h │ │ │ ├── ScaledModifiedBesselK1Kernel.cpp │ │ │ ├── ScaledModifiedBesselK1Kernel.h │ │ │ ├── ScanUtils.h │ │ │ ├── ScatterGatherKernels.cpp │ │ │ ├── ScatterGatherKernels.h │ │ │ ├── SegmentReduceKernels.cpp │ │ │ ├── SegmentReduceKernels.h │ │ │ ├── Shape.cpp │ │ │ ├── ShapeKernels.h │ │ │ ├── SharedReduceOps.h │ │ │ ├── ShiftedChebyshevPolynomialKernels.h │ │ │ ├── ShiftedChebyshevPolynomialTKernel.cpp │ │ │ ├── ShiftedChebyshevPolynomialUKernel.cpp │ │ │ ├── ShiftedChebyshevPolynomialVKernel.cpp │ │ │ ├── ShiftedChebyshevPolynomialWKernel.cpp │ │ │ ├── SoftMaxKernels.cpp │ │ │ ├── SoftMaxKernels.h │ │ │ ├── Sorting.cpp │ │ │ ├── Sorting.h │ │ │ ├── SortingCommon.h │ │ │ ├── SortingKernels.h │ │ │ ├── SortingRadixSelect.h │ │ │ ├── SortingRadixSort.h │ │ │ ├── SphericalBesselJ0Kernel.cpp │ │ │ ├── SphericalBesselJ0Kernel.h │ │ │ ├── StepKernels.cpp │ │ │ ├── StepKernels.h │ │ │ ├── SummaryOpsKernels.cpp │ │ │ ├── SummaryOpsKernels.h │ │ │ ├── TensorApplyUtils.h │ │ │ ├── 
TensorCompare.cpp │ │ │ ├── TensorCompareKernels.cpp │ │ │ ├── TensorCompareKernels.h │ │ │ ├── TensorFactoriesKernels.cpp │ │ │ ├── TensorFactoriesKernels.h │ │ │ ├── TensorModeKernel.cpp │ │ │ ├── TensorModeKernel.h │ │ │ ├── TensorShapeKernels.cpp │ │ │ ├── TensorShapeKernels.h │ │ │ ├── TensorTopKKernel.cpp │ │ │ ├── TensorTopKKernel.h │ │ │ ├── TensorTransformationsKernels.cpp │ │ │ ├── TensorTransformationsKernels.h │ │ │ ├── TriangularOpsKernels.cpp │ │ │ ├── TriangularOpsKernels.h │ │ │ ├── UnaryComplexKernels.cpp │ │ │ ├── UnaryComplexKernels.h │ │ │ ├── UnaryFractionKernels.cpp │ │ │ ├── UnaryFractionKernels.h │ │ │ ├── UnaryGammaKernels.cpp │ │ │ ├── UnaryGammaKernels.h │ │ │ ├── UnaryGeometricAcosKernel.cpp │ │ │ ├── UnaryGeometricAcosKernel.h │ │ │ ├── UnaryGeometricAcoshKernel.cpp │ │ │ ├── UnaryGeometricAcoshKernel.h │ │ │ ├── UnaryGeometricAsinKernel.cpp │ │ │ ├── UnaryGeometricAsinKernel.h │ │ │ ├── UnaryGeometricAsinhKernel.cpp │ │ │ ├── UnaryGeometricAsinhKernel.h │ │ │ ├── UnaryGeometricAtanKernel.cpp │ │ │ ├── UnaryGeometricAtanKernel.h │ │ │ ├── UnaryGeometricAtanhKernel.cpp │ │ │ ├── UnaryGeometricAtanhKernel.h │ │ │ ├── UnaryGeometricCosKernel.cpp │ │ │ ├── UnaryGeometricCosKernel.h │ │ │ ├── UnaryGeometricCoshKernel.cpp │ │ │ ├── UnaryGeometricCoshKernel.h │ │ │ ├── UnaryGeometricSinKernel.cpp │ │ │ ├── UnaryGeometricSinKernel.h │ │ │ ├── UnaryGeometricSinhKernel.cpp │ │ │ ├── UnaryGeometricSinhKernel.h │ │ │ ├── UnaryGeometricTanKernel.cpp │ │ │ ├── UnaryGeometricTanKernel.h │ │ │ ├── UnaryGeometricTanhKernel.cpp │ │ │ ├── UnaryGeometricTanhKernel.h │ │ │ ├── UnaryKernels.cpp │ │ │ ├── UnaryKernels.h │ │ │ ├── UnaryLogKernels.cpp │ │ │ ├── UnaryLogKernels.h │ │ │ ├── UnarySignKernels.cpp │ │ │ ├── UnarySignKernels.h │ │ │ ├── UnarySpecialOpsKernels.cpp │ │ │ ├── UnarySpecialOpsKernels.h │ │ │ ├── UnfoldBackwardKernels.cpp │ │ │ ├── UnfoldBackwardKernels.h │ │ │ ├── UniqueKernels.cpp │ │ │ ├── UniqueKernels.h │ │ │ ├── UpSampleBicubic2dKernels.cpp │ │ │ ├── UpSampleBicubic2dKernels.h │ │ │ ├── UpSampleBilinear2dKernels.cpp │ │ │ ├── UpSampleBilinear2dKernels.h │ │ │ ├── UpSampleLinear1dKernels.cpp │ │ │ ├── UpSampleLinear1dKernels.h │ │ │ ├── UpSampleNearest1dKernels.cpp │ │ │ ├── UpSampleNearest1dKernels.h │ │ │ ├── UpSampleNearest2dKernels.cpp │ │ │ ├── UpSampleNearest2dKernels.h │ │ │ ├── UpSampleNearest3dKernels.cpp │ │ │ ├── UpSampleNearest3dKernels.h │ │ │ ├── UpSampleTrilinear3dKernels.cpp │ │ │ ├── UpSampleTrilinear3dKernels.h │ │ │ ├── WeightInt4PackKernel.cpp │ │ │ ├── WeightInt4PackKernel.h │ │ │ ├── WeightNormKernels.cpp │ │ │ ├── WeightNormKernels.h │ │ │ ├── WelfordNorm.h │ │ │ ├── ZetaKernel.cpp │ │ │ ├── ZetaKernel.h │ │ │ └── pstl │ │ │ └── PSTLFunctions.h │ └── xpu │ │ ├── EmptyTensor.cpp │ │ └── EmptyTensor.h ├── BuildOnLinux.cmake ├── BuildOnWindows.cmake ├── CMakeLists.txt ├── comm │ ├── DeviceProperties.h │ ├── Macros.h │ ├── Memory.h │ ├── MemoryFormat.h │ ├── ReduceOpsUtils.h │ ├── RegisterUtils.h │ ├── Runtime.h │ ├── SYCLContext.h │ ├── SYCLHelpers.h │ ├── Scalar.h │ ├── TensorInfo.h │ ├── TensorOptions.h │ ├── XPUMathCompat.h │ ├── XPUPair.h │ └── xpu_aten.h └── xccl │ ├── CMakeLists.txt │ ├── ProcessGroupXCCL.cpp │ ├── ProcessGroupXCCL.hpp │ ├── Register.cpp │ └── reducer_xpu.cpp ├── test ├── microbench │ ├── adaptive_avg_pool2d.py │ ├── avg_pool2d.py │ ├── avg_pool3d.py │ ├── batch_norm_1d.py │ ├── batch_norm_2d.py │ ├── batch_norm_3d.py │ ├── col2im.py │ ├── distance.cdist.py │ ├── distance.pdist.py │ ├── distribution.bernoulli.py │ 
├── distribution.cauchy.py │ ├── distribution.exponential.py │ ├── distribution.geometric.py │ ├── distribution.log_normal.py │ ├── distribution.multinomial.py │ ├── distribution.normal.py │ ├── distribution.random.py │ ├── distribution.uniform.py │ ├── dropout.py │ ├── eltwise.add.py │ ├── flip.py │ ├── grid_sampler.grid_sampler_2d.py │ ├── grid_sampler.grid_sampler_3d.py │ ├── group_norm.py │ ├── im2col.py │ ├── layer_norm.py │ ├── loss.binary_cross_entropy.py │ ├── loss.ctc_loss.py │ ├── loss.l1_loss.py │ ├── loss.mse_loss.py │ ├── loss.multilabel_margin_loss.py │ ├── loss.nll_loss.py │ ├── loss.smooth_l1_loss.py │ ├── matmul.py │ ├── pad_sequence.py │ ├── pooling.adaptive_max_pool2d.py │ ├── pooling.fractional_max_pool2d.py │ ├── pooling.fractional_max_pool3d.py │ ├── pooling.max_pool2d.py │ ├── pooling.max_pool3d.py │ ├── reduce.max.py │ ├── reduce.sum.py │ ├── remainder.py │ ├── roll.py │ ├── scan.cumsum.py │ ├── scan.topk.py │ ├── scan.unique.py │ ├── softmax.py │ ├── sort.py │ ├── sort.randperm.py │ ├── upsample_bicubic2d.py │ ├── upsample_bilinear2d.py │ ├── upsample_nearest2d.py │ ├── upsample_nearest3d.py │ └── upsample_nearest_exact2d.py ├── profiling │ ├── correlation_id_mixed.py │ ├── profile_partial_runtime_ops.py │ ├── reproducer.missing.gpu.kernel.time.py │ ├── rn50.py │ ├── time_precision_in_profile.py │ └── triton_xpu_ops_time.py ├── regressions │ ├── optests_failures_dict.json │ ├── test_binary.py │ ├── test_cat.py │ ├── test_clamp_promotion.py │ ├── test_compare.py │ ├── test_copy.py │ ├── test_copy_downcast_fp8.py │ ├── test_deform_conv.py │ ├── test_div_mode.py │ ├── test_foreach_list.py │ ├── test_foreach_scalar.py │ ├── test_foreach_scalarlist.py │ ├── test_index_and_index_put.py │ ├── test_int4pack.py │ ├── test_layer_norm.py │ ├── test_loops.py │ ├── test_nms.py │ ├── test_operation_on_device_1.py │ ├── test_rand.py │ ├── test_record_stream.py │ ├── test_resize.py │ ├── test_roi_align.py │ ├── test_safe_softmax.py │ ├── test_softmax.py │ ├── test_sort.py │ ├── test_tensor_factory.py │ ├── test_torchvision_roi_ops.py │ ├── test_unary.py │ ├── test_upsample_bilinear_bwd.py │ └── test_upsample_nearest.py ├── sycl │ ├── CMakeLists.txt │ ├── main.cpp │ ├── simple_kernel.cpp │ └── simple_kernel.hpp └── xpu │ ├── __init__.py │ ├── distributed │ ├── __init__.py │ ├── test_c10d_ops_xccl.py │ └── test_c10d_xccl.py │ ├── extended │ ├── __init__.py │ ├── run_test_with_skip.py │ ├── run_test_with_skip_arc.py │ ├── run_test_with_skip_bmg.py │ ├── run_test_with_skip_lnl.py │ ├── run_test_with_skip_mtl.py │ ├── skip_list_arc.py │ ├── skip_list_common.py │ ├── skip_list_win.py │ ├── skip_list_win_arc.py │ ├── skip_list_win_bmg.py │ ├── skip_list_win_lnl.py │ ├── skip_list_win_mtl.py │ ├── test_ops_xpu.py │ └── test_tensor_creation_ops_xpu.py │ ├── nn │ ├── __init__.py │ ├── test_convolution_xpu.py │ ├── test_dropout_xpu.py │ ├── test_embedding_xpu.py │ ├── test_init_xpu.py │ ├── test_lazy_modules_xpu.py │ ├── test_load_state_dict_xpu.py │ ├── test_module_hooks_xpu.py │ ├── test_multihead_attention_xpu.py │ ├── test_packed_sequence_xpu.py │ ├── test_parametrization_xpu.py │ ├── test_pooling_xpu.py │ └── test_pruning_xpu.py │ ├── quantization │ └── core │ │ ├── __init__.py │ │ ├── test_quantized_op_xpu.py │ │ ├── test_quantized_tensor_xpu.py │ │ ├── test_workflow_module_xpu.py │ │ └── test_workflow_ops_xpu.py │ ├── run_distributed.py │ ├── run_test_win_with_skip_mtl.py │ ├── run_test_with_only.py │ ├── run_test_with_skip.py │ ├── run_test_with_skip_arc.py │ ├── 
run_test_with_skip_bmg.py │ ├── run_test_with_skip_lnl.py │ ├── run_test_with_skip_mtl.py │ ├── skip_list_arc.py │ ├── skip_list_common.py │ ├── skip_list_dist.py │ ├── skip_list_mtl.py │ ├── skip_list_win.py │ ├── skip_list_win_arc.py │ ├── skip_list_win_bmg.py │ ├── skip_list_win_lnl.py │ ├── skip_list_win_mtl.py │ ├── test_autocast_xpu.py │ ├── test_autograd_fallback_xpu.py │ ├── test_autograd_xpu.py │ ├── test_binary_ufuncs_xpu.py │ ├── test_comparison_utils_xpu.py │ ├── test_complex_xpu.py │ ├── test_content_store_xpu.py │ ├── test_dataloader_xpu.py │ ├── test_decomp_xpu.py │ ├── test_distributions_xpu.py │ ├── test_dynamic_shapes_xpu.py │ ├── test_foreach_xpu.py │ ├── test_indexing_xpu.py │ ├── test_linalg_xpu.py │ ├── test_masked_xpu.py │ ├── test_maskedtensor_xpu.py │ ├── test_matmul_cuda_xpu.py │ ├── test_meta_xpu.py │ ├── test_modules_xpu.py │ ├── test_namedtensor_xpu.py │ ├── test_native_functions_xpu.py │ ├── test_native_mha_xpu.py │ ├── test_nestedtensor_xpu.py │ ├── test_nn_xpu.py │ ├── test_ops_fwd_gradients_xpu.py │ ├── test_ops_gradients_xpu.py │ ├── test_ops_xpu.py │ ├── test_optim_xpu.py │ ├── test_reductions_xpu.py │ ├── test_scatter_gather_ops_xpu.py │ ├── test_segment_reductions_xpu.py │ ├── test_shape_ops_xpu.py │ ├── test_sort_and_select_xpu.py │ ├── test_sparse_xpu.py │ ├── test_spectral_ops_xpu.py │ ├── test_tensor_creation_ops_xpu.py │ ├── test_torch_xpu.py │ ├── test_transformers_xpu.py │ ├── test_type_promotion_xpu.py │ ├── test_unary_ufuncs_xpu.py │ ├── test_view_ops_xpu.py │ └── xpu_test_utils.py ├── tools ├── check_ops.py ├── codegen │ └── remove_headers.py └── linter │ ├── __init__.py │ ├── adapters │ ├── README.md │ ├── _linter.py │ ├── actionlint_linter.py │ ├── bazel_linter.py │ ├── black_linter.py │ ├── clangformat_linter.py │ ├── clangtidy_linter.py │ ├── cmake_linter.py │ ├── constexpr_linter.py │ ├── docstring_linter.py │ ├── exec_linter.py │ ├── flake8_linter.py │ ├── gha_linter.py │ ├── grep_linter.py │ ├── import_linter.py │ ├── lintrunner_version_linter.py │ ├── mypy_linter.py │ ├── nativefunctions_linter.py │ ├── newlines_linter.py │ ├── no_merge_conflict_csv_linter.py │ ├── no_workflows_on_fork.py │ ├── pip_init.py │ ├── pyfmt_linter.py │ ├── ruff_linter.py │ ├── s3_init.py │ ├── s3_init_config.json │ ├── set_linter.py │ ├── shellcheck_linter.py │ ├── test_has_main_linter.py │ ├── testowners_linter.py │ ├── ufmt_linter.py │ ├── update_s3.py │ └── workflow_consistency_linter.py │ └── clang_tidy │ ├── __init__.py │ └── generate_build_files.py └── yaml ├── native ├── native_functions.yaml └── tags.yaml └── xpu_functions.yaml /.cmakelintrc: -------------------------------------------------------------------------------- 1 | filter=-convention/filename,-linelength,-package/consistency,-readability/logic,-readability/mixedcase,-readability/wonkycase,-syntax,-whitespace/eol,+whitespace/extra,-whitespace/indent,-whitespace/mismatch,-whitespace/newline,-whitespace/tabs 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation.yml: -------------------------------------------------------------------------------- 1 | name: 📚 Documentation 2 | description: Report an issue related to https://pytorch.org/docs/stable/index.html 3 | 4 | body: 5 | - type: textarea 6 | attributes: 7 | label: 📚 The doc issue 8 | description: > 9 | A clear and concise description of what content in https://pytorch.org/docs/stable/index.html is an issue. 
If this has to do with the general https://pytorch.org website, please file an issue at https://github.com/pytorch/pytorch.github.io/issues/new/choose instead. If this has to do with https://pytorch.org/tutorials, please file an issue at https://github.com/pytorch/tutorials/issues/new. 10 | validations: 11 | required: true 12 | - type: textarea 13 | attributes: 14 | label: Suggest a potential alternative/fix 15 | description: > 16 | Tell us how we could improve the documentation in this regard. 17 | - type: markdown 18 | attributes: 19 | value: > 20 | Thanks for contributing 🎉! 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.yml: -------------------------------------------------------------------------------- 1 | name: 🚀 Feature request 2 | description: Submit a proposal/request for a new PyTorch feature 3 | 4 | body: 5 | - type: textarea 6 | attributes: 7 | label: 🚀 The feature, motivation and pitch 8 | description: > 9 | A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too. 10 | validations: 11 | required: true 12 | - type: textarea 13 | attributes: 14 | label: Alternatives 15 | description: > 16 | A description of any alternative solutions or features you've considered, if any. 17 | - type: textarea 18 | attributes: 19 | label: Additional context 20 | description: > 21 | Add any other context or screenshots about the feature request. 22 | - type: markdown 23 | attributes: 24 | value: > 25 | Thanks for contributing 🎉! 26 | -------------------------------------------------------------------------------- /.github/ci_commit_pins/torchbench.txt: -------------------------------------------------------------------------------- 1 | 03cde49eba0580ed17f9ae2250832fd8af4ed756 2 | -------------------------------------------------------------------------------- /.github/ci_commit_pins/triton.txt: -------------------------------------------------------------------------------- 1 | b8c64f64c18d8cac598b3adb355c21e7439c21de 2 | -------------------------------------------------------------------------------- /.github/scripts/env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /opt/intel/oneapi/compiler/latest/env/vars.sh 4 | source /opt/intel/oneapi/pti/latest/env/vars.sh 5 | source /opt/intel/oneapi/umf/latest/env/vars.sh 6 | source /opt/intel/oneapi/ccl/latest/env/vars.sh 7 | source /opt/intel/oneapi/mpi/latest/env/vars.sh 8 | icpx --version 9 | sycl-ls 10 | -------------------------------------------------------------------------------- /.github/scripts/spec.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | DEVICE_NAME = 'xpu' 4 | 5 | MANUAL_SEED_FN = torch.xpu.manual_seed 6 | EMPTY_CACHE_FN = torch.xpu.empty_cache 7 | DEVICE_COUNT_FN = torch.xpu.device_count 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | */*.pyc 2 | */*.so* 3 | */**/__pycache__ 4 | */**/*.dylib* 5 | */**/*.pyc 6 | */**/*.pyd 7 | */**/*.so* 8 | */**/**/*.pyc 9 | */**/**/**/*.pyc 10 | */**/**/**/**/*.pyc 11 | .lintbin 12 | yaml/templates 13 | 
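The .github/scripts/spec.py module above binds the device hooks that device-generic test utilities expect (device name, seeding, cache clearing, device count) to their torch.xpu implementations. A minimal sketch of how a test harness might consume such a spec; the harness code below is hypothetical and assumes an XPU-enabled PyTorch build with the module importable as "spec":

import torch
import spec  # the device spec module shown above

def reset_device(seed):
    # Seed the active XPU device and release cached allocator blocks
    # so each test case starts from a reproducible, clean state.
    spec.MANUAL_SEED_FN(seed)
    spec.EMPTY_CACHE_FN()

if spec.DEVICE_COUNT_FN() > 0:
    reset_device(42)
    x = torch.ones(4, device=spec.DEVICE_NAME)  # allocates on 'xpu'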
-------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Report a Vulnerability 4 | 5 | Report security issues or vulnerabilities to the [Intel Security Center]. 6 | 7 | For more information on how Intel works to resolve security issues, see 8 | [Vulnerability Handling Guidelines]. 9 | 10 | [Intel Security Center]:https://www.intel.com/security 11 | 12 | [Vulnerability Handling Guidelines]:https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html 13 | -------------------------------------------------------------------------------- /cmake/ClangFormat.cmake: -------------------------------------------------------------------------------- 1 | ## Include to trigger clang-format 2 | if(BUILD_NO_CLANGFORMAT) 3 | return() 4 | endif() 5 | 6 | if(CLANGFORMAT_enabled) 7 | return() 8 | endif() 9 | set(CLANGFORMAT_enabled true) 10 | 11 | set(CFMT_STYLE ${PROJECT_SOURCE_DIR}/.clang-format) 12 | if(NOT EXISTS ${CFMT_STYLE}) 13 | message(WARNING "Cannot find style file ${CFMT_STYLE}!") 14 | return() 15 | endif() 16 | 17 | find_program(CLANG_FORMAT "clang-format-12") 18 | if(NOT CLANG_FORMAT) 19 | message("Please install clang-format-12 before contributing to torch-xpu-ops!") 20 | else() 21 | set(CLANG_FORMAT_EXEC clang-format-12) 22 | endif() 23 | -------------------------------------------------------------------------------- /cmake/ONEMKL.cmake: -------------------------------------------------------------------------------- 1 | option(USE_ONEMKL_XPU "Build with ONEMKL XPU support" ON) 2 | 3 | if(DEFINED ENV{USE_ONEMKL_XPU}) 4 | set(USE_ONEMKL_XPU $ENV{USE_ONEMKL_XPU}) 5 | endif() 6 | 7 | message(STATUS "USE_ONEMKL_XPU is set to ${USE_ONEMKL_XPU}") 8 | 9 | if(NOT USE_ONEMKL_XPU) 10 | return() 11 | endif() 12 | 13 | find_package(ONEMKL) 14 | if(NOT ONEMKL_FOUND) 15 | message(FATAL_ERROR "Can NOT find ONEMKL cmake helpers module!") 16 | endif() 17 | 18 | set(TORCH_XPU_OPS_ONEMKL_INCLUDE_DIR ${ONEMKL_INCLUDE_DIR}) 19 | 20 | set(TORCH_XPU_OPS_ONEMKL_LIBRARIES ${ONEMKL_LIBRARIES}) 21 | 22 | list(INSERT TORCH_XPU_OPS_ONEMKL_LIBRARIES 1 "-Wl,--start-group") 23 | list(APPEND TORCH_XPU_OPS_ONEMKL_LIBRARIES "-Wl,--end-group") 24 | -------------------------------------------------------------------------------- /cmake/XCCL.cmake: -------------------------------------------------------------------------------- 1 | if(NOT __XCCL_INCLUDED) 2 | set(__XCCL_INCLUDED TRUE) 3 | 4 | # XCCL_ROOT, XCCL_LIBRARY_DIR, XCCL_INCLUDE_DIR are handled by FindXCCL.cmake. 
5 | find_package(XCCL REQUIRED)
6 | if(NOT XCCL_FOUND)
7 |   set(PYTORCH_FOUND_XCCL FALSE)
8 |   message(WARNING "${XCCL_NOT_FOUND_MESSAGE}")
9 |   return()
10 | endif()
11 | 
12 | set(PYTORCH_FOUND_XCCL TRUE)
13 | add_library(torch::xccl INTERFACE IMPORTED)
14 | set_property(
15 |   TARGET torch::xccl PROPERTY INTERFACE_INCLUDE_DIRECTORIES
16 |   ${XCCL_INCLUDE_DIR})
17 | set_property(
18 |   TARGET torch::xccl PROPERTY INTERFACE_LINK_LIBRARIES
19 |   ${XCCL_LIBRARY})
20 | endif()
21 | 
--------------------------------------------------------------------------------
/docs/torch_xpu_ops.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/torch-xpu-ops/5b4d7444484576f721d2295761cf8fafa924ef36/docs/torch_xpu_ops.jpg
--------------------------------------------------------------------------------
/src/ATen/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # ATen XPU sources
2 | 
3 | file(GLOB xpu_h "xpu/*.h")
4 | file(GLOB xpu_cpp "xpu/*.cpp")
5 | file(GLOB xpu_mkl "native/xpu/mkl/*.cpp")
6 | file(GLOB xpu_native_cpp "native/xpu/*.cpp" "native/sparse/*.cpp" "native/sparse/xpu/*.cpp" "native/nested/*.cpp" "native/nested/xpu/*.cpp" "native/transformers/*.cpp" "native/quantized/*.cpp")
7 | file(GLOB xpu_sycl "native/xpu/sycl/*.cpp" "native/sparse/xpu/sycl/*.cpp" "native/nested/xpu/sycl/*.cpp" "native/transformers/sycl/*.cpp" "native/quantized/sycl/*.cpp")
8 | 
9 | list(APPEND ATen_XPU_CPP_SRCS ${xpu_cpp})
10 | list(APPEND ATen_XPU_MKL_SRCS ${xpu_mkl})
11 | list(APPEND ATen_XPU_NATIVE_CPP_SRCS ${xpu_native_cpp})
12 | list(APPEND ATen_XPU_SYCL_SRCS ${xpu_sycl})
13 | 
14 | set(ATen_XPU_CPP_SRCS ${ATen_XPU_CPP_SRCS} PARENT_SCOPE)
15 | set(ATen_XPU_MKL_SRCS ${ATen_XPU_MKL_SRCS} PARENT_SCOPE)
16 | set(ATen_XPU_NATIVE_CPP_SRCS ${ATen_XPU_NATIVE_CPP_SRCS} PARENT_SCOPE)
17 | set(ATen_XPU_SYCL_SRCS ${ATen_XPU_SYCL_SRCS} PARENT_SCOPE)
18 | 
19 | foreach(HEADER ${xpu_h})
20 |   install(FILES ${HEADER} DESTINATION "${AT_INSTALL_INCLUDE_DIR}/ATen/xpu")
21 | endforeach()
22 | 
--------------------------------------------------------------------------------
/src/ATen/native/nested/NestedTensorTransformerFunctions.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | 
4 | namespace at::native {
5 | 
6 | Tensor NestedTensor_softmax_dropout_xpu(
7 |     const Tensor& self,
8 |     const Tensor& query) {
9 |   std::optional<Tensor> attn_mask;
10 | 
11 |   attn_mask = NestedTensor_to_mask(query, 2, self.size(2));
12 |   attn_mask = attn_mask->to(query.device(), /*non-blocking=*/true);
13 |   return _masked_softmax(
14 |       self,
15 |       *attn_mask,
16 |       self.dim() - 1,
17 |       /*mask type */ 1); // NestedTensor_to_mask produces a BxT mask
18 | }
19 | 
20 | } // namespace at::native
--------------------------------------------------------------------------------
/src/ATen/native/quantized/FakeQuantizeCore.cpp:
--------------------------------------------------------------------------------
1 | #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
2 | 
3 | #include
4 | #include
5 | 
6 | #include
7 | 
8 | namespace at::native {
9 | 
10 | REGISTER_XPU_DISPATCH(
11 |     fake_quant_tensor_cachemask_stub,
12 |     &xpu::fake_quantize_tensor_cachemask_kernel)
13 | REGISTER_XPU_DISPATCH(
14 |     fake_quant_tensor_cachemask_tensor_qparams_stub,
15 |     &xpu::fake_quantize_tensor_cachemask_tensor_qparams_kernel)
16 | REGISTER_XPU_DISPATCH(
17 |     fake_quant_grad_learnable_tensor_stub,
18 |     &xpu::_fake_quantize_grad_learnable_tensor_kernel)
19 | REGISTER_XPU_DISPATCH(
20 |     fake_quant_per_channel_cachemask_stub,
21 |     &xpu::fake_quant_per_channel_cachemask_kernel)
22 | REGISTER_XPU_DISPATCH(
23 |     fake_quant_grad_learnable_channel_stub,
24 |     &xpu::_fake_quantize_grad_learnable_channel_kernel)
25 | 
26 | } // namespace at::native
27 | 
--------------------------------------------------------------------------------
/src/ATen/native/quantized/sycl/FusedObsFakeQuantKernels.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include
4 | 
5 | namespace at::native::xpu {
6 | 
7 | TORCH_XPU_API void _calculate_moving_average(
8 |     const at::Tensor& x,
9 |     const at::Tensor& observer_on,
10 |     at::Tensor& running_min,
11 |     at::Tensor& running_max,
12 |     const float averaging_const,
13 |     const int64_t size,
14 |     bool per_row_fake_quant);
15 | 
16 | TORCH_XPU_API
17 | void _calc_moving_avg_qparams_helper(
18 |     const at::Tensor& x,
19 |     const at::Tensor fake_quant_on,
20 |     at::Tensor& running_min,
21 |     at::Tensor& running_max,
22 |     float* scale_ptr,
23 |     int32_t* zp_ptr,
24 |     int32_t qmin,
25 |     int32_t qmax,
26 |     bool symmetric_quant,
27 |     const int64_t size,
28 |     bool per_row_fq);
29 | 
30 | } // namespace at::native::xpu
31 | 
--------------------------------------------------------------------------------
/src/ATen/native/quantized/sycl/MakePerTensorQuantizedTensorKernels.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | 
4 | #include
5 | #include
6 | 
7 | namespace at::native::xpu {
8 | 
9 | template <typename scalar_t, typename underlying_t>
10 | struct AssignQuantizedTensorFunctor {
11 |   scalar_t operator()(underlying_t value) const {
12 |     return scalar_t(value);
13 |   }
14 | };
15 | 
16 | void assign_quantized_tensor_kernel(const Tensor& self, Tensor& dst) {
17 |   AT_DISPATCH_QINT_TYPES(
18 |       dst.scalar_type(), "assign_quantized_tensor_xpu", [&]() {
19 |         auto iter = TensorIteratorConfig()
20 |                         .check_all_same_dtype(false)
21 |                         .add_output(dst)
22 |                         .add_input(self)
23 |                         .build();
24 |         auto caller = AssignQuantizedTensorFunctor<scalar_t, underlying_t>();
25 |         gpu_kernel(iter, caller);
26 |       });
27 | }
28 | 
29 | } // namespace at::native::xpu
30 | 
--------------------------------------------------------------------------------
/src/ATen/native/quantized/sycl/MakePerTensorQuantizedTensorKernels.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include
4 | 
5 | namespace at::native::xpu {
6 | 
7 | TORCH_XPU_API void assign_quantized_tensor_kernel(
8 |     const Tensor& self,
9 |     Tensor& dst);
10 | 
11 | } // namespace at::native::xpu
12 | 
--------------------------------------------------------------------------------
/src/ATen/native/quantized/sycl/QuantizedMaxPool2d.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include
4 | 
5 | namespace at::native::xpu {
6 | 
7 | TORCH_XPU_API Tensor quantized_max_pool2d_kernel(
8 |     const Tensor& input,
9 |     IntArrayRef kernel_size,
10 |     IntArrayRef stride,
11 |     IntArrayRef padding,
12 |     IntArrayRef dilation,
13 |     bool ceil_mode);
14 | 
15 | } // namespace at::native::xpu
16 | 
--------------------------------------------------------------------------------
/src/ATen/native/sparse/xpu/SparseBinaryOpIntersection.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | 
4 | namespace at::native {
5 | 
6 | REGISTER_XPU_DISPATCH(
7 |     mul_sparse_sparse_out_stub,
8 |     &xpu::mul_sparse_sparse_kernel);
9 | REGISTER_XPU_DISPATCH(
10 |     sparse_mask_intersection_out_stub,
11 |     &xpu::sparse_mask_intersection_kernel);
12 | REGISTER_XPU_DISPATCH(
13 |     sparse_mask_projection_out_stub,
14 |     &xpu::sparse_mask_projection_kernel);
15 | 
16 | } // namespace at::native
17 | 
--------------------------------------------------------------------------------
/src/ATen/native/sparse/xpu/SparseTensor.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | 
4 | namespace at::native {
5 | 
6 | using namespace at::sparse;
7 | 
8 | SparseTensor _coalesce_sparse_xpu(const SparseTensor& self) {
9 |   return xpu::coalesce_sparse_kernel(self);
10 | }
11 | 
12 | REGISTER_XPU_DISPATCH(flatten_indices_stub, &xpu::flatten_indices_kernel);
13 | 
14 | } // namespace at::native
15 | 
--------------------------------------------------------------------------------
/src/ATen/native/sparse/xpu/SparseTensorMath.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | 
3 | namespace at::native {
4 | 
5 | using namespace at::sparse;
6 | 
7 | SparseTensor& add_out_sparse_xpu(
8 |     const SparseTensor& t,
9 |     const SparseTensor& src,
10 |     const Scalar& value,
11 |     SparseTensor& r_) {
12 |   return xpu::add_sparse_kernel(t, src, value, r_);
13 | }
14 | 
15 | SparseTensor& mul_out_sparse_xpu(
16 |     const Tensor& t_,
17 |     const Tensor& src_,
18 |     SparseTensor& r_) {
19 |   return xpu::mul_sparse_kernel(t_, src_, r_);
20 | }
21 | 
22 | Tensor _sparse_sum_backward_xpu(
23 |     const Tensor& grad_,
24 |     const SparseTensor& input_,
25 |     IntArrayRef dims_to_sum) {
26 |   return xpu::_sparse_sum_backward_kernel(grad_, input_, dims_to_sum);
27 | }
28 | 
29 | } // namespace at::native
30 | 
--------------------------------------------------------------------------------
/src/ATen/native/sparse/xpu/sycl/SparseBinaryOpIntersectionKernels.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include
4 | #include
5 | 
6 | namespace at::native::xpu {
7 | 
8 | using namespace at::sparse;
9 | using OptTensor = std::optional<Tensor>;
10 | 
11 | TORCH_XPU_API void mul_sparse_sparse_kernel(
12 |     Tensor& result,
13 |     const Tensor& x,
14 |     const Tensor& y);
15 | 
16 | TORCH_XPU_API void sparse_mask_intersection_kernel(
17 |     Tensor& result,
18 |     const Tensor& x,
19 |     const Tensor& y,
20 |     const OptTensor& x_hash_opt = std::nullopt);
21 | 
22 | TORCH_XPU_API void sparse_mask_projection_kernel(
23 |     Tensor& result,
24 |     const Tensor& x,
25 |     const Tensor& y,
26 |     const OptTensor& x_hash_opt,
27 |     bool accumulate_matches);
28 | 
29 | } // namespace at::native::xpu
30 | 
--------------------------------------------------------------------------------
/src/ATen/native/sparse/xpu/sycl/SparseCsrTensorMathKernels.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include
4 | #include
5 | 
6 | namespace at::native::xpu {
7 | 
8 | TORCH_XPU_API void convert_indices_from_coo_to_csr_structured_kernel(
9 |     const Tensor& input,
10 |     const int64_t size,
11 |     const bool out_int32,
12 |     const Tensor& result);
13 | 
14 | TORCH_XPU_API void convert_indices_from_csr_to_coo_structured_kernel(
15 |     const Tensor& crow_indices,
16 |     const Tensor& col_indices,
17 |     const bool out_int32,
18 |     const bool transpose,
19 |     const Tensor& result);
20 | 
21 | TORCH_XPU_API Tensor _sparse_csr_sum_xpu_kernel(
22 |     const Tensor& input,
23 |     IntArrayRef dims_to_sum,
24 |     bool keepdim,
25 |     std::optional<ScalarType> dtype);
26 | 
27 | TORCH_XPU_API Tensor _sparse_csr_prod_xpu_kernel(
28 |     const Tensor& input,
29 |     IntArrayRef dims_to_reduce,
30 |     bool keepdim,
31 |     std::optional<ScalarType> dtype);
32 | 
33 | } // namespace at::native::xpu
34 | 
--------------------------------------------------------------------------------
/src/ATen/native/sparse/xpu/sycl/SparseSoftmaxKernels.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include
4 | #include
5 | 
6 | namespace at::native::xpu {
7 | 
8 | using namespace at::sparse;
9 | 
10 | TORCH_XPU_API Tensor softmax_sparse_xpu_kernel(
11 |     const Tensor& input_,
12 |     const int64_t dim_,
13 |     const bool half_to_float);
14 | 
15 | TORCH_XPU_API Tensor log_softmax_sparse_xpu_kernel(
16 |     const Tensor& input_,
17 |     const int64_t dim_,
18 |     const bool half_to_float);
19 | 
20 | TORCH_XPU_API Tensor softmax_backward_sparse_xpu_kernel(
21 |     const Tensor& grad_,
22 |     const Tensor& output_,
23 |     int64_t dim_,
24 |     const Tensor& input_);
25 | 
26 | TORCH_XPU_API Tensor log_softmax_backward_sparse_xpu_kernel(
27 |     const Tensor& grad_,
28 |     const Tensor& output_,
29 |     int64_t dim_,
30 |     const Tensor& input_);
31 | 
32 | } // namespace at::native::xpu
33 | 
--------------------------------------------------------------------------------
/src/ATen/native/sparse/xpu/sycl/SparseTensorKernels.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include
4 | #include
5 | 
6 | namespace at::native::xpu {
7 | 
8 | using namespace at::sparse;
9 | 
10 | TORCH_XPU_API SparseTensor coalesce_sparse_kernel(const SparseTensor& self);
11 | 
12 | TORCH_XPU_API Tensor
13 | flatten_indices_kernel(const Tensor& indices, IntArrayRef size);
14 | 
15 | } // namespace at::native::xpu
16 | 
--------------------------------------------------------------------------------
/src/ATen/native/sparse/xpu/sycl/SparseTensorMathKernels.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include
4 | #include
5 | 
6 | namespace at::native::xpu {
7 | 
8 | using namespace at::sparse;
9 | 
10 | TORCH_XPU_API SparseTensor& add_sparse_kernel(
11 |     const SparseTensor& t,
12 |     const SparseTensor& src,
13 |     const Scalar& value,
14 |     SparseTensor& r_);
15 | 
16 | TORCH_XPU_API SparseTensor& mul_sparse_kernel(
17 |     const Tensor& t_,
18 |     const Tensor& src_,
19 |     SparseTensor& r_);
20 | 
21 | TORCH_XPU_API Tensor _sparse_sum_backward_kernel(
22 |     const Tensor& grad_,
23 |     const SparseTensor& input_,
24 |     IntArrayRef dims_to_sum);
25 | 
26 | } // namespace at::native::xpu
27 | 
--------------------------------------------------------------------------------
/src/ATen/native/transformers/SDPUtils.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include
4 | 
5 | namespace sdp {
6 | 
7 | bool can_use_mem_efficient_attention(sdp::sdp_params params, bool debug);
8 | 
9 | } // namespace sdp
10 | 
--------------------------------------------------------------------------------
/src/ATen/native/transformers/sycl/AttentionKernels.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include
4 | 
5 | namespace at::native::xpu {
6 | 
7 | TORCH_XPU_API void _transform_bias_rescale_qkv_kernel(
8 |     const Tensor& qkv,
9 |     const Tensor& qkv_bias,
10 |     const int64_t num_head,
11 |     Tensor& q_k_v,
12 |     int64_t B,
13 |     int64_t T,
14 |     int64_t D,
15 | int64_t dim_per_head); 16 | 17 | } // namespace at::native::xpu 18 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/AiryAi.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | namespace at { 7 | namespace native { 8 | REGISTER_XPU_DISPATCH(special_airy_ai_stub, &xpu::airy_ai_kernel); 9 | 10 | } // namespace native 11 | } // namespace at -------------------------------------------------------------------------------- /src/ATen/native/xpu/CompareOps.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace at { 8 | 9 | namespace native { 10 | REGISTER_XPU_DISPATCH(eq_stub, &xpu::eq_kernel); 11 | REGISTER_XPU_DISPATCH(ne_stub, &xpu::ne_kernel); 12 | REGISTER_XPU_DISPATCH(le_stub, &xpu::le_kernel); 13 | REGISTER_XPU_DISPATCH(lt_stub, &xpu::lt_kernel); 14 | REGISTER_XPU_DISPATCH(ge_stub, &xpu::ge_kernel); 15 | REGISTER_XPU_DISPATCH(gt_stub, &xpu::gt_kernel); 16 | } // namespace native 17 | } // namespace at 18 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/Cross.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace at { 9 | namespace native { 10 | REGISTER_XPU_DISPATCH(cross_stub, &xpu::linalg_cross_kernel); 11 | } // namespace native 12 | } // namespace at 13 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/Distance.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | namespace at { 6 | namespace native { 7 | 8 | REGISTER_XPU_DISPATCH(cdist_stub, &xpu::cdist_kernel); 9 | REGISTER_XPU_DISPATCH(cdist_backward_stub, &xpu::cdist_backward_kernel); 10 | REGISTER_XPU_DISPATCH(pdist_forward_stub, &xpu::pdist_forward_kernel); 11 | REGISTER_XPU_DISPATCH(pdist_backward_stub, &xpu::pdist_backward_kernel); 12 | 13 | } // namespace native 14 | } // namespace at 15 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/Fill.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | namespace at::native { 9 | REGISTER_XPU_DISPATCH(fill_stub, &native::xpu::fill_kernel); 10 | } // namespace at::native 11 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/FunctionOfAMatrixUtils.cpp: -------------------------------------------------------------------------------- 1 | #define TORCH_ASSERT_NO_OPERATORS 2 | #include 3 | 4 | #include 5 | 6 | #include 7 | 8 | namespace at { 9 | namespace native { 10 | 11 | REGISTER_XPU_DISPATCH( 12 | _compute_linear_combination_stub, 13 | &xpu::_compute_linear_combination_kernel); 14 | 15 | } 16 | } // namespace at 17 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/GroupNorm.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace at { 9 | namespace native { 10 | REGISTER_XPU_DISPATCH(GroupNormKernel, 
&xpu::group_norm_kernel); 11 | REGISTER_XPU_DISPATCH( 12 | GroupNormBackwardKernel, 13 | &xpu::group_norm_backward_kernel); 14 | } // namespace native 15 | } // namespace at 16 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/Histogram.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace at { 8 | 9 | namespace native { 10 | REGISTER_XPU_DISPATCH(histogramdd_stub, &xpu::histogramdd_kernel); 11 | REGISTER_XPU_DISPATCH(histogramdd_linear_stub, &xpu::histogramdd_linear_kernel); 12 | REGISTER_XPU_DISPATCH( 13 | histogram_select_outer_bin_edges_stub, 14 | &xpu::histogram_select_outer_bin_edges_kernel); 15 | 16 | } // namespace native 17 | } // namespace at -------------------------------------------------------------------------------- /src/ATen/native/xpu/Lerp.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace at { 9 | namespace native { 10 | 11 | REGISTER_XPU_DISPATCH(lerp_kernel_tensor_weight, &xpu::lerp_tensor_kernel); 12 | REGISTER_XPU_DISPATCH(lerp_kernel_scalar_weight, &xpu::lerp_scalar_kernel); 13 | 14 | } // namespace native 15 | 16 | } // namespace at 17 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/LinearAlgebra.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include 14 | #include 15 | #include 16 | 17 | namespace at { 18 | namespace native { 19 | REGISTER_XPU_DISPATCH(addr_stub, &xpu::addr_kernel); 20 | REGISTER_XPU_DISPATCH(norm_stub, &xpu::norm_kernel); 21 | } // namespace native 22 | } // namespace at 23 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/Normalization.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace at { 10 | namespace native { 11 | REGISTER_XPU_DISPATCH( 12 | renorm_scale_factor_stub, 13 | &xpu::renorm_scale_factor_kernel); 14 | } 15 | } // namespace at 16 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/PointwiseOps.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | namespace at { 7 | namespace native { 8 | REGISTER_XPU_DISPATCH(addcmul_stub, &xpu::addcmul_kernel); 9 | REGISTER_XPU_DISPATCH(addcdiv_stub, &xpu::addcdiv_kernel); 10 | } // namespace native 11 | } // namespace at 12 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/Pow.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace at { 9 | namespace native { 10 | REGISTER_XPU_DISPATCH(pow_tensor_tensor_stub, &xpu::pow_tensor_tensor_kernel); 11 | REGISTER_XPU_DISPATCH(pow_tensor_scalar_stub, &xpu::pow_tensor_scalar_kernel); 12 | } // namespace native 13 | } // namespace at 14 | 
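Each of the wrapper files above follows the same registration pattern: include the ATen dispatch-stub declaration and the matching sycl/ kernel header, then point the stub at the XPU kernel with REGISTER_XPU_DISPATCH. Once the stubs are registered, ordinary PyTorch operators route to these kernels whenever their inputs live on an XPU device. A short sketch from the Python side, assuming an XPU-enabled PyTorch build with at least one device available:

import torch

if torch.xpu.is_available():
    a = torch.randn(8, device="xpu")
    b = torch.randn(8, device="xpu")
    # pow_tensor_tensor_stub -> xpu::pow_tensor_tensor_kernel
    c = torch.pow(a, b)
    # addcmul_stub -> xpu::addcmul_kernel
    d = torch.addcmul(c, a, b, value=0.5)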
--------------------------------------------------------------------------------
/src/ATen/native/xpu/RecordStream.cpp:
--------------------------------------------------------------------------------
1 | #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
2 | #include
3 | #include
4 | 
5 | #ifndef AT_PER_OPERATOR_HEADERS
6 | #include
7 | #else
8 | #include
9 | #endif
10 | 
11 | namespace at::native {
12 | void record_stream_xpu(Tensor& self, c10::Stream stream) {
13 |   struct c10::StreamData3 data = stream.pack3();
14 |   c10::xpu::XPUCachingAllocator::recordStream(
15 |       self.storage().data_ptr(),
16 |       at::xpu::XPUStream::unpack3(
17 |           data.stream_id, data.device_index, data.device_type));
18 | }
19 | } // namespace at::native
20 | 
--------------------------------------------------------------------------------
/src/ATen/native/xpu/Repeat.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | 
4 | namespace at {
5 | namespace native {
6 | Tensor repeat_interleave_xpu(
7 |     const Tensor& repeats,
8 |     std::optional<int64_t> output_size) {
9 |   return at::native::xpu::repeat_interleave_kernel(repeats, output_size);
10 | }
11 | 
12 | } // namespace native
13 | } // namespace at
14 | 
--------------------------------------------------------------------------------
/src/ATen/native/xpu/SegmentReduce.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | 
4 | #include
5 | 
6 | namespace at {
7 | namespace native {
8 | 
9 | REGISTER_XPU_DISPATCH(
10 |     _segment_reduce_lengths_stub,
11 |     &xpu::_segment_reduce_lengths_kernel);
12 | REGISTER_XPU_DISPATCH(
13 |     _segment_reduce_offsets_stub,
14 |     &xpu::_segment_reduce_offsets_kernel);
15 | REGISTER_XPU_DISPATCH(
16 |     _segment_reduce_lengths_backward_stub,
17 |     &xpu::_segment_reduce_lengths_backward_kernel);
18 | REGISTER_XPU_DISPATCH(
19 |     _segment_reduce_offsets_backward_stub,
20 |     &xpu::_segment_reduce_offsets_backward_kernel);
21 | 
22 | } // namespace native
23 | } // namespace at
24 | 
--------------------------------------------------------------------------------
/src/ATen/native/xpu/TensorProperties.cpp:
--------------------------------------------------------------------------------
1 | 
2 | namespace at {} // namespace at
3 | 
--------------------------------------------------------------------------------
/src/ATen/native/xpu/TriangluarOps.cpp:
--------------------------------------------------------------------------------
1 | 
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | 
8 | #include
9 | #include
10 | 
11 | namespace at::native {
12 | 
13 | TORCH_IMPL_FUNC(tril_xpu)(const Tensor& self, int64_t k, const Tensor& result) {
14 |   if (self.numel() != 0) {
15 |     xpu::tril_kernel(result, self, k);
16 |   }
17 | }
18 | 
19 | TORCH_IMPL_FUNC(triu_xpu)(const Tensor& self, int64_t k, const Tensor& result) {
20 |   if (self.numel() != 0) {
21 |     xpu::triu_kernel(result, self, k);
22 |   }
23 | }
24 | 
25 | Tensor trace_xpu(const Tensor& self) {
26 |   TORCH_CHECK(self.dim() == 2, "expected a matrix");
27 |   return self.diagonal().sum();
28 | }
29 | 
30 | } // namespace at::native
--------------------------------------------------------------------------------
/src/ATen/native/xpu/UnfoldBackward.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 | 
7 | namespace at {
8 | 
9 | namespace native {
10 | REGISTER_XPU_DISPATCH(unfold_backward_stub, &xpu::unfold_backward_kernel);
11 | }
12 | } // namespace at
13 | 
--------------------------------------------------------------------------------
/src/ATen/native/xpu/WeightInt4Pack.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | 
3 | namespace at::native {
4 | 
5 | // input is [n][k / 2] (uint8 dtype)
6 | // output is [n][k / 8]
7 | Tensor _convert_weight_to_int4pack_xpu(const Tensor& in, int64_t innerKTiles) {
8 |   TORCH_CHECK(in.dim() == 2, __func__, " : expect weight to be 2D tensor.");
9 |   TORCH_CHECK(
10 |       in.dtype() == at::kByte, __func__, " : expect weight to be kByte.");
11 |   TORCH_CHECK(
12 |       innerKTiles == 2 || innerKTiles == 4 || innerKTiles == 8,
13 |       __func__,
14 |       " : innerKTiles need to be 2, 4, or 8, got ",
15 |       innerKTiles);
16 | 
17 |   auto weight = in.contiguous();
18 |   auto N = weight.size(0);
19 |   auto K = weight.size(1) * 2;
20 |   TORCH_CHECK(
21 |       K % 8 == 0, "The K dimension of int4 GEMM should be a multiple of 8.");
22 |   auto weight_packed = at::empty(
23 |       {N, K / 8}, at::TensorOptions().dtype(at::kInt).device(in.device()));
24 | 
25 |   xpu::weight_to_int4pack_kernel(weight_packed, weight, N, K);
26 |   return weight_packed;
27 | }
28 | 
29 | } // namespace at::native
30 | 
--------------------------------------------------------------------------------
/src/ATen/native/xpu/WeightNorm.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | 
3 | namespace at {
4 | namespace native {
5 | std::tuple<Tensor, Tensor> weight_norm_xpu(
6 |     const Tensor& v,
7 |     const Tensor& g,
8 |     int64_t dim) {
9 |   return native::xpu::weight_norm_kernel(v, g, dim);
10 | }
11 | 
12 | std::tuple<Tensor, Tensor> weight_norm_backward_xpu(
13 |     const Tensor& grad_w,
14 |     const Tensor& saved_v,
15 |     const Tensor& saved_g,
16 |     const Tensor& saved_norms,
17 |     int64_t dim) {
18 |   TORCH_CHECK(saved_v.is_contiguous(), "saved_v must be contiguous");
19 |   TORCH_CHECK(saved_g.is_contiguous(), "saved_g must be contiguous");
20 |   TORCH_CHECK(saved_norms.is_contiguous(), "saved_norms must be contiguous");
21 |   TORCH_CHECK(
22 |       dim == 0 || dim == saved_v.dim() - 1,
23 |       "fused kernels can only be applied for first or last dim")
24 | 
25 |   return native::xpu::weight_norm_backward_kernel(
26 |       grad_w, saved_v, saved_g, saved_norms, dim);
27 | }
28 | 
29 | } // namespace native
30 | } // namespace at
--------------------------------------------------------------------------------
/src/ATen/native/xpu/mkl/BatchLinearAlgebra.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include
4 | 
5 | namespace at::native::xpu {
6 | 
7 | TORCH_XPU_API void lu_solve_mkl(
8 |     const Tensor& LU,
9 |     const Tensor& pivots,
10 |     const Tensor& B,
11 |     TransposeType trans);
12 | 
13 | TORCH_XPU_API void lu_factor_mkl(
14 |     const Tensor& LU,
15 |     const Tensor& pivots,
16 |     const Tensor& info,
17 |     bool pivot);
18 | 
19 | } // namespace at::native::xpu
20 | 
--------------------------------------------------------------------------------
/src/ATen/native/xpu/sycl/AbsKernel.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | 
6 | #include
7 | 
8 | #include
9 | 
10 | namespace at::native::xpu {
11 | 
12 | template <typename scalar_t>
13 | struct AbsFunctor {
14 |   scalar_t operator()(const scalar_t a) const {
15 |     return std::abs(a);
16 |   }
17 | };
18 | 
19 | void abs_kernel(TensorIteratorBase& iter) {
20 |   auto dtype = iter.dtype();
21 |   if (at::isComplexType(dtype)) {
22 |     AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "abs_xpu", [&]() {
23 |       using opmath_t = at::opmath_type<scalar_t>;
24 |       gpu_kernel(iter, AbsFunctor<opmath_t>());
25 |     });
26 |   } else {
27 |     AT_DISPATCH_ALL_TYPES_AND3(
28 |         ScalarType::Half,
29 |         ScalarType::BFloat16,
30 |         ScalarType::Bool,
31 |         iter.dtype(),
32 |         "abs_xpu",
33 |         [&]() { gpu_kernel(iter, AbsFunctor<scalar_t>()); });
34 |   }
35 | }
36 | 
37 | } // namespace at::native::xpu
38 | 
(at::isComplexType(dtype)) { 22 | AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, dtype, "abs_xpu", [&]() { 23 | using opmath_t = at::opmath_type<scalar_t>; 24 | gpu_kernel(iter, AbsFunctor<opmath_t>()); 25 | }); 26 | } else { 27 | AT_DISPATCH_ALL_TYPES_AND3( 28 | ScalarType::Half, 29 | ScalarType::BFloat16, 30 | ScalarType::Bool, 31 | iter.dtype(), 32 | "abs_xpu", 33 | [&]() { gpu_kernel(iter, AbsFunctor<scalar_t>()); }); 34 | } 35 | } 36 | 37 | } // namespace at::native::xpu 38 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/AbsKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void abs_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ActivationEluKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void elu_kernel( 8 | TensorIteratorBase& iter, 9 | const Scalar& alpha, 10 | const Scalar& scale, 11 | const Scalar& input_scale); 12 | 13 | TORCH_XPU_API void elu_backward_kernel( 14 | TensorIteratorBase& iter, 15 | const Scalar& alpha, 16 | const Scalar& scale, 17 | const Scalar& input_scale, 18 | bool is_result); 19 | 20 | } // namespace at::native::xpu 21 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ActivationGeluKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at { 6 | namespace native { 7 | namespace xpu { 8 | 9 | TORCH_XPU_API void gelu_kernel( 10 | TensorIteratorBase& iter, 11 | std::string_view approximate); 12 | 13 | TORCH_XPU_API void gelu_backward_kernel( 14 | TensorIteratorBase& iter, 15 | std::string_view approximate); 16 | 17 | } // namespace xpu 18 | } // namespace native 19 | } // namespace at 20 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ActivationGluKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void glu_kernel(TensorIteratorBase& iter); 8 | TORCH_XPU_API void glu_jvp_kernel(TensorIteratorBase& iter); 9 | 10 | TORCH_XPU_API void glu_backward_kernel( 11 | const TensorIteratorBase& iter, 12 | int64_t gI_stride, 13 | int64_t I_stride); 14 | 15 | } // namespace at::native::xpu 16 |
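Every elementwise XPU kernel in this tree follows the shape AbsKernel.cpp shows above: a small device functor plus a gpu_kernel launch inside an AT_DISPATCH macro, which instantiates the functor once per dtype. A minimal sketch of a new unary kernel in the same style (NegExampleFunctor and neg_example_kernel are hypothetical, and the include paths, which this dump elides, are assumptions):

#include <ATen/Dispatch.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/xpu/sycl/Loops.h> // assumed home of gpu_kernel

namespace at::native::xpu {

// Hypothetical example: elementwise negation.
template <typename scalar_t>
struct NegExampleFunctor {
  scalar_t operator()(scalar_t a) const {
    return -a;
  }
};

void neg_example_kernel(TensorIteratorBase& iter) {
  // The dispatch macro defines scalar_t for each dtype in its list and
  // runs the lambda once per instantiation.
  AT_DISPATCH_ALL_TYPES_AND2(
      ScalarType::Half,
      ScalarType::BFloat16,
      iter.dtype(),
      "neg_example_xpu",
      [&]() { gpu_kernel(iter, NegExampleFunctor<scalar_t>()); });
}

} // namespace at::native::xpu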
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ActivationHardshrinkKernels.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | namespace at::native::xpu { 7 | 8 | template <typename scalar_t> 9 | struct HardshrinkFunctor { 10 | scalar_t operator()(scalar_t a) const { 11 | return (a >= -lambd_ && a <= lambd_) ? scalar_t(0) : a; 12 | } 13 | 14 | HardshrinkFunctor(const scalar_t lambd) : lambd_(lambd) {} 15 | 16 | private: 17 | const scalar_t lambd_; 18 | }; 19 | 20 | void hardshrink_kernel(TensorIteratorBase& iter, const Scalar& value) { 21 | AT_DISPATCH_FLOATING_TYPES_AND2( 22 | at::ScalarType::Half, 23 | at::ScalarType::BFloat16, 24 | iter.dtype(), 25 | "hardshrink_xpu", 26 | [&]() { 27 | auto lambd = value.to<scalar_t>(); 28 | auto caller = HardshrinkFunctor<scalar_t>(lambd); 29 | gpu_kernel(iter, caller); 30 | }); 31 | } 32 | 33 | } // namespace at::native::xpu 34 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ActivationHardshrinkKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void hardshrink_kernel( 8 | TensorIteratorBase& iter, 9 | const Scalar& value); 10 | 11 | } // namespace at::native::xpu 12 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ActivationHardsigmoidKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void hardsigmoid_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void hardsigmoid_backward_kernel(TensorIteratorBase& iter); 10 | 11 | } // namespace at::native::xpu 12 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ActivationHardswishKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at { 6 | namespace native { 7 | namespace xpu { 8 | 9 | TORCH_XPU_API void hardswish_kernel(TensorIterator& iter); 10 | 11 | TORCH_XPU_API void hardswish_backward_kernel(TensorIterator& iter); 12 | 13 | } // namespace xpu 14 | } // namespace native 15 | } // namespace at 16 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ActivationHardtanhKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at { 6 | namespace native { 7 | namespace xpu { 8 | 9 | TORCH_XPU_API void hardtanh_backward_kernel( 10 | TensorIterator& iter, 11 | const Scalar& min, 12 | const Scalar& max); 13 | 14 | } // namespace xpu 15 | } // namespace native 16 | } // namespace at 17 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ActivationLeakyReluKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void leaky_relu_kernel( 8 | TensorIteratorBase& iter, 9 | const Scalar& negval_); 10 | 11 | TORCH_XPU_API void leaky_relu_backward_kernel( 12 | TensorIteratorBase& iter, 13 | const Scalar& negval_); 14 | 15 | } // namespace at::native::xpu 16 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ActivationLogSigmoidKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void log_sigmoid_forward_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void log_sigmoid_backward_kernel(TensorIterator&
iter); 10 | 11 | } // namespace at::native::xpu 12 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ActivationMishKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void mish_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void mish_backward_kernel(TensorIterator& iter); 10 | 11 | } // namespace at::native::xpu 12 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ActivationPreluKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void prelu_kernel(TensorIterator& iter); 8 | 9 | TORCH_XPU_API void prelu_backward_kernel(TensorIterator& iter); 10 | 11 | } // namespace at::native::xpu 12 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ActivationSiluKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void silu_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void silu_backward_kernel(TensorIteratorBase& iter); 10 | 11 | } // namespace at::native::xpu 12 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ActivationSoftplusKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void softplus_kernel( 8 | TensorIteratorBase& iter, 9 | const Scalar& beta_, 10 | const Scalar& threshold_); 11 | 12 | TORCH_XPU_API void softplus_backward_kernel( 13 | TensorIteratorBase& iter, 14 | const Scalar& beta_, 15 | const Scalar& threshold_); 16 | 17 | } // namespace at::native::xpu 18 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ActivationSoftshrinkKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void softshrink_kernel( 8 | TensorIteratorBase& iter, 9 | const Scalar& value); 10 | 11 | TORCH_XPU_API void softshrink_backward_kernel( 12 | TensorIteratorBase& iter, 13 | const Scalar& value); 14 | 15 | } // namespace at::native::xpu 16 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ActivationThresholdKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at { 6 | namespace native { 7 | namespace xpu { 8 | 9 | TORCH_XPU_API void threshold_kernel( 10 | TensorIteratorBase& iter, 11 | const Scalar& threshold, 12 | const Scalar& value); 13 | 14 | } 15 | } // namespace native 16 | } // namespace at 17 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/AdaptiveAveragePooling2dKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void adaptive_avg_pool2d_backward_kernel( 8 | Tensor& gradInput, 9 | const Tensor& gradOutput, 10 | const 
Tensor& input); 11 | 12 | TORCH_XPU_API void adaptive_avg_pool2d_kernel( 13 | Tensor& output, 14 | const Tensor& input, 15 | IntArrayRef output_size); 16 | 17 | } // namespace at::native::xpu 18 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/AdaptiveAveragePooling3dKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void adaptive_avg_pool3d_backward_kernel( 8 | Tensor& gradInput, 9 | const Tensor& gradOutput_, 10 | const Tensor& input); 11 | 12 | TORCH_XPU_API void adaptive_avg_pool3d_kernel( 13 | Tensor& output, 14 | const Tensor& input_, 15 | IntArrayRef& output_size); 16 | 17 | } // namespace at::native::xpu -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/AdaptiveMaxPooling2dKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void adaptive_max_pool2d_kernel( 8 | const Tensor& input, 9 | IntArrayRef output_size, 10 | const Tensor& output, 11 | const Tensor& indices); 12 | 13 | TORCH_XPU_API void adaptive_max_pool2d_backward_kernel( 14 | const Tensor& grad_output, 15 | const Tensor& input, 16 | const Tensor& indices, 17 | const Tensor& grad_input); 18 | 19 | } // namespace at::native::xpu 20 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/AdaptiveMaxPooling3dKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void adaptive_max_pool3d_kernel( 8 | const Tensor& input, 9 | IntArrayRef output_size, 10 | const Tensor& output, 11 | const Tensor& indices); 12 | 13 | TORCH_XPU_API void adaptive_max_pool3d_backward_kernel( 14 | const Tensor& gradOutput, 15 | const Tensor& input, 16 | const Tensor& indices, 17 | const Tensor& gradInput); 18 | 19 | } // namespace at::native::xpu 20 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/AiryAiKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | namespace at::native::xpu { 9 | template 10 | struct AiryAiFunctor { 11 | scalar_t operator()(scalar_t a) const { 12 | return airy_ai_forward(a); 13 | } 14 | }; 15 | 16 | void airy_ai_kernel(TensorIteratorBase& iter) { 17 | AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "airy_ai_xpu", [&]() { 18 | gpu_kernel(iter, AiryAiFunctor()); 19 | }); 20 | } 21 | 22 | } // namespace at::native::xpu -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/AiryAiKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void airy_ai_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/AmpKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void 
amp_non_finite_check_and_unscale_kernel( 7 | Tensor& scaled_grad, 8 | Tensor& found_inf, 9 | const Tensor& inv_scale); 10 | 11 | TORCH_XPU_API void amp_foreach_non_finite_check_and_unscale_kernel( 12 | std::vector> scaled_grads, 13 | Tensor& found_inf, 14 | const Tensor& inv_scale); 15 | 16 | TORCH_XPU_API Tensor& amp_update_scale_kernel( 17 | Tensor& current_scale, 18 | Tensor& growth_tracker, 19 | const Tensor& found_inf, 20 | double growth_factor, 21 | double backoff_factor, 22 | int64_t growth_interval); 23 | 24 | } // namespace at::native::xpu 25 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/AveragePool2dKernels.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | namespace at::native::xpu { 4 | 5 | TORCH_XPU_API void avg_pool2d_kernel( 6 | const Tensor& input_, 7 | int64_t kH_, 8 | int64_t kW_, 9 | int64_t dH_, 10 | int64_t dW_, 11 | int64_t padH_, 12 | int64_t padW_, 13 | bool ceil_mode, 14 | bool count_include_pad, 15 | std::optional divisor_override, 16 | const Tensor& output); 17 | 18 | TORCH_XPU_API void avg_pool2d_backward_kernel( 19 | const Tensor& gradOutput_, 20 | const Tensor& input_, 21 | IntArrayRef kernel_size, 22 | IntArrayRef stride, 23 | IntArrayRef padding, 24 | bool ceil_mode, 25 | bool count_include_pad, 26 | std::optional divisor_override, 27 | const Tensor& gradInput); 28 | 29 | } // namespace at::native::xpu 30 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/AveragePool3dKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void avg_pool3d_kernel( 8 | const Tensor& input_, 9 | IntArrayRef kernel_size, 10 | IntArrayRef stride, 11 | IntArrayRef padding, 12 | bool ceil_mode, 13 | bool count_include_pad, 14 | std::optional divisor_override, 15 | const Tensor& output); 16 | 17 | TORCH_XPU_API void avg_pool3d_backward_kernel( 18 | const Tensor& gradOutput_, 19 | const Tensor& input_, 20 | IntArrayRef kernel_size, 21 | IntArrayRef stride, 22 | IntArrayRef padding, 23 | bool ceil_mode, 24 | bool count_include_pad, 25 | std::optional divisor_override, 26 | const Tensor& gradInput); 27 | 28 | } // namespace at::native::xpu 29 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BesselJ0Kernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace at::native::xpu { 10 | 11 | template 12 | struct BesselJ0Functor { 13 | scalar_t operator()(scalar_t a) const { 14 | return bessel_j0_forward(a); 15 | } 16 | }; 17 | 18 | void bessel_j0_kernel(TensorIteratorBase& iter) { 19 | AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "bessel_j0_xpu", [&]() { 20 | gpu_kernel(iter, BesselJ0Functor()); 21 | }); 22 | } 23 | 24 | } // namespace at::native::xpu 25 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BesselJ0Kernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void bessel_j0_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 | 
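The sycl/ headers above expose kernels that take a prebuilt TensorIterator rather than raw tensors, so allocation and type promotion stay with the caller. A hedged usage sketch for the Bessel J0 kernel just declared (the wrapper itself is hypothetical; the real callers are ATen's generated structured-op glue, and unary_float_op is assumed to behave as in the CPU/CUDA paths):

#include <ATen/core/Tensor.h>
#include <ATen/native/TensorIterator.h>

at::Tensor bessel_j0_example(const at::Tensor& self) {
  at::Tensor out; // left undefined; the iterator allocates it
  auto iter = at::TensorIterator::unary_float_op(out, self);
  at::native::xpu::bessel_j0_kernel(iter); // dispatches on iter.common_dtype()
  return iter.output();
}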
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BesselJ1Kernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace at::native::xpu { 10 | 11 | template 12 | struct BesselJ1Functor { 13 | scalar_t operator()(scalar_t a) const { 14 | if (a < scalar_t(0.0f)) { 15 | return -bessel_j1_forward(-a); 16 | } 17 | return bessel_j1_forward(a); 18 | } 19 | }; 20 | 21 | void bessel_j1_kernel(TensorIteratorBase& iter) { 22 | AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "bessel_j1_xpu", [&]() { 23 | gpu_kernel(iter, BesselJ1Functor()); 24 | }); 25 | } 26 | 27 | } // namespace at::native::xpu 28 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BesselJ1Kernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void bessel_j1_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BesselY0Kernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace at::native::xpu { 10 | 11 | template 12 | struct BesselY0Functor { 13 | scalar_t operator()(scalar_t a) const { 14 | return bessel_y0_forward(a); 15 | } 16 | }; 17 | 18 | void bessel_y0_kernel(TensorIteratorBase& iter) { 19 | AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "bessel_y0_xpu", [&]() { 20 | gpu_kernel(iter, BesselY0Functor()); 21 | }); 22 | } 23 | 24 | } // namespace at::native::xpu 25 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BesselY0Kernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void bessel_y0_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BesselY1Kernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace at::native::xpu { 10 | 11 | template 12 | struct BesselY1Functor { 13 | scalar_t operator()(scalar_t a) const { 14 | return bessel_y1_forward(a); 15 | } 16 | }; 17 | 18 | void bessel_y1_kernel(TensorIteratorBase& iter) { 19 | AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "bessel_y1_xpu", [&]() { 20 | gpu_kernel(iter, BesselY1Functor()); 21 | }); 22 | } 23 | 24 | } // namespace at::native::xpu 25 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BesselY1Kernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void bessel_y1_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BinaryBitwiseOpsKernels.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at { 6 | namespace native { 7 | namespace xpu { 8 | 9 | TORCH_XPU_API void bitwise_and_kernel(TensorIteratorBase& iter); 10 | 11 | TORCH_XPU_API void bitwise_or_kernel(TensorIteratorBase& iter); 12 | 13 | TORCH_XPU_API void bitwise_xor_kernel(TensorIteratorBase& iter); 14 | 15 | } // namespace xpu 16 | } // namespace native 17 | } // namespace at 18 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BinaryGeometricKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void atan2_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void hypot_kernel(TensorIteratorBase& iter); 10 | 11 | } // namespace at::native::xpu 12 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BinaryInternal.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | template <typename scalar_t> 8 | struct DivFunctor { 9 | scalar_t operator()(scalar_t a, scalar_t b) const { 10 | return c10::xpu::compat::div(a, b); 11 | } 12 | }; 13 | 14 | template <typename scalar_t> 15 | struct MulFunctor { 16 | scalar_t operator()(scalar_t a, scalar_t b) const { 17 | return a * b; 18 | } 19 | }; 20 | 21 | // Workaround for the error: '*' in boolean context, suggest '&&' instead 22 | // [-Werror=int-in-bool-context] 23 | template <> 24 | struct MulFunctor<bool> { 25 | bool operator()(bool a, bool b) const { 26 | return a && b; 27 | } 28 | }; 29 | 30 | } // namespace at::native::xpu 31 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BinaryKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void add_kernel(TensorIteratorBase& iter, const Scalar& alpha); 8 | 9 | TORCH_XPU_API void sub_kernel(TensorIteratorBase& iter, const Scalar& alpha); 10 | 11 | TORCH_XPU_API void mul_kernel(TensorIteratorBase& iter); 12 | 13 | TORCH_XPU_API void div_true_kernel(TensorIteratorBase& iter); 14 | 15 | TORCH_XPU_API void div_trunc_kernel(TensorIteratorBase& iter); 16 | 17 | TORCH_XPU_API void div_floor_kernel(TensorIteratorBase& iter); 18 | 19 | } // namespace at::native::xpu 20 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BinaryLogicalOpsKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void logical_and_kernel(TensorIterator& iter); 8 | 9 | TORCH_XPU_API void logical_or_kernel(TensorIterator& iter); 10 | 11 | TORCH_XPU_API void logical_xor_kernel(TensorIterator& iter); 12 | 13 | } // namespace at::native::xpu 14 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BinaryMiscBackwardOpsKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void sigmoid_backward_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void
tanh_backward_kernel(TensorIteratorBase& iter); 10 | 11 | TORCH_XPU_API void logit_backward_kernel( 12 | TensorIteratorBase& iter, 13 | const Scalar& eps_scalar); 14 | 15 | } // namespace at::native::xpu 16 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BinaryMiscOpsKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void mse_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void smooth_l1_kernel(TensorIteratorBase& iter, double beta); 10 | 11 | TORCH_XPU_API void huber_kernel(TensorIterator& iter, double delta); 12 | 13 | TORCH_XPU_API void xlogy_kernel(TensorIteratorBase& iter); 14 | 15 | TORCH_XPU_API void xlog1py_kernel(TensorIteratorBase& iter); 16 | 17 | } // namespace at::native::xpu 18 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BinaryRemainderKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at { 6 | namespace native { 7 | namespace xpu { 8 | 9 | TORCH_XPU_API void remainder_kernel(TensorIteratorBase& iter); 10 | 11 | TORCH_XPU_API void fmod_kernel(TensorIteratorBase& iter); 12 | 13 | } // namespace xpu 14 | } // namespace native 15 | } // namespace at 16 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BinaryShiftOpsKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void lshift_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void rshift_kernel(TensorIteratorBase& iter); 10 | 11 | } // namespace at::native::xpu 12 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/BucketizationKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void searchsorted_kernel( 7 | Tensor& result, 8 | const Tensor& input, 9 | const Tensor& sorted_sequence, 10 | bool out_int32, 11 | bool right, 12 | const Tensor& sorter); 13 | 14 | } // namespace at::native::xpu 15 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ChebyshevPolynomialKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void chebyshev_polynomial_t_kernel(TensorIteratorBase& iterator); 8 | 9 | TORCH_XPU_API void chebyshev_polynomial_u_kernel(TensorIteratorBase& iterator); 10 | 11 | TORCH_XPU_API void chebyshev_polynomial_v_kernel(TensorIteratorBase& iterator); 12 | 13 | TORCH_XPU_API void chebyshev_polynomial_w_kernel(TensorIteratorBase& iterator); 14 | 15 | } // namespace at::native::xpu 16 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ChebyshevPolynomialTKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace at::native::xpu { 8 | 9 | template 10 | struct ChebyshevPolynomialTFunctor { 11 | scalar_t operator()(scalar_t x, scalar_t n) const { 12 
| return chebyshev_polynomial_t_forward(x, n); 13 | } 14 | }; 15 | 16 | void chebyshev_polynomial_t_kernel(TensorIteratorBase& iterator) { 17 | AT_DISPATCH_FLOATING_TYPES( 18 | iterator.common_dtype(), "chebyshev_polynomial_t_xpu", [&]() { 19 | ChebyshevPolynomialTFunctor<scalar_t> f; 20 | gpu_kernel_with_scalars(iterator, f); 21 | }); 22 | } 23 | 24 | } // namespace at::native::xpu 25 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ChebyshevPolynomialUKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace at::native::xpu { 8 | 9 | template <typename scalar_t> 10 | struct ChebyshevPolynomialUFunctor { 11 | scalar_t operator()(scalar_t x, scalar_t n) const { 12 | return chebyshev_polynomial_u_forward(x, n); 13 | } 14 | }; 15 | 16 | void chebyshev_polynomial_u_kernel(TensorIteratorBase& iterator) { 17 | AT_DISPATCH_FLOATING_TYPES( 18 | iterator.common_dtype(), "chebyshev_polynomial_u_xpu", [&]() { 19 | ChebyshevPolynomialUFunctor<scalar_t> f; 20 | gpu_kernel_with_scalars(iterator, f); 21 | }); 22 | } 23 | 24 | } // namespace at::native::xpu 25 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ChebyshevPolynomialVKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace at::native::xpu { 8 | 9 | template <typename scalar_t> 10 | struct ChebyshevPolynomialVFunctor { 11 | scalar_t operator()(scalar_t x, scalar_t n) const { 12 | return chebyshev_polynomial_v_forward(x, n); 13 | } 14 | }; 15 | 16 | void chebyshev_polynomial_v_kernel(TensorIteratorBase& iterator) { 17 | AT_DISPATCH_FLOATING_TYPES( 18 | iterator.common_dtype(), "chebyshev_polynomial_v_xpu", [&]() { 19 | ChebyshevPolynomialVFunctor<scalar_t> f; 20 | gpu_kernel_with_scalars(iterator, f); 21 | }); 22 | } 23 | 24 | } // namespace at::native::xpu 25 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ChebyshevPolynomialWKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace at::native::xpu { 8 | 9 | template <typename scalar_t> 10 | struct ChebyshevPolynomialWFunctor { 11 | scalar_t operator()(scalar_t x, scalar_t n) const { 12 | return chebyshev_polynomial_w_forward(x, n); 13 | } 14 | }; 15 | 16 | void chebyshev_polynomial_w_kernel(TensorIteratorBase& iterator) { 17 | AT_DISPATCH_FLOATING_TYPES( 18 | iterator.common_dtype(), "chebyshev_polynomial_w_xpu", [&]() { 19 | ChebyshevPolynomialWFunctor<scalar_t> f; 20 | gpu_kernel_with_scalars(iterator, f); 21 | }); 22 | } 23 | 24 | } // namespace at::native::xpu 25 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/Col2ImKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void col2im_kernel( 8 | Tensor& output, 9 | const Tensor& input_, 10 | IntArrayRef output_size, 11 | IntArrayRef kernel_size, 12 | IntArrayRef dilation, 13 | IntArrayRef padding, 14 | IntArrayRef stride); 15 | 16 | } // namespace at::native::xpu 17 |
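The four Chebyshev kernels above are one template stamped out four times: a two-argument functor handed to gpu_kernel_with_scalars, which also serves the Tensor-Scalar overloads without a second compiled kernel. A hedged sketch of how a further polynomial family would slot in (the names are hypothetical, and the *_forward helper is assumed to live in the shared math headers the way chebyshev_polynomial_t_forward does):

namespace at::native::xpu {

template <typename scalar_t>
struct ShiftedChebyshevTFunctor {
  scalar_t operator()(scalar_t x, scalar_t n) const {
    // shifted_chebyshev_polynomial_t_forward is an assumed helper here.
    return shifted_chebyshev_polynomial_t_forward(x, n);
  }
};

void shifted_chebyshev_t_example_kernel(TensorIteratorBase& iterator) {
  AT_DISPATCH_FLOATING_TYPES(
      iterator.common_dtype(), "shifted_chebyshev_t_xpu", [&]() {
        ShiftedChebyshevTFunctor<scalar_t> f;
        gpu_kernel_with_scalars(iterator, f);
      });
}

} // namespace at::native::xpu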
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/CompareKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at { 6 | namespace native { 7 | namespace xpu { 8 | 9 | TORCH_XPU_API void eq_kernel(TensorIteratorBase& iter); 10 | 11 | TORCH_XPU_API void ne_kernel(TensorIteratorBase& iter); 12 | 13 | TORCH_XPU_API void lt_kernel(TensorIteratorBase& iter); 14 | 15 | TORCH_XPU_API void le_kernel(TensorIteratorBase& iter); 16 | 17 | TORCH_XPU_API void gt_kernel(TensorIteratorBase& iter); 18 | 19 | TORCH_XPU_API void ge_kernel(TensorIteratorBase& iter); 20 | 21 | } // namespace xpu 22 | } // namespace native 23 | } // namespace at 24 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ComplexKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void complex_kernel(TensorIterator& iter); 8 | 9 | TORCH_XPU_API void polar_kernel(TensorIterator& iter); 10 | 11 | } // namespace at::native::xpu 12 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/CopyKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void copy_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/CopysignKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | 6 | #include 7 | 8 | namespace at::native::xpu { 9 | 10 | template <typename scalar_t> 11 | struct CopysignFunctor { 12 | scalar_t operator()(scalar_t a, scalar_t b) const { 13 | return std::copysign(a, b); 14 | } 15 | }; 16 | 17 | void copysign_kernel(TensorIteratorBase& iter) { 18 | AT_DISPATCH_FLOATING_TYPES_AND2( 19 | at::ScalarType::Half, 20 | at::ScalarType::BFloat16, 21 | iter.common_dtype(), 22 | "copysign_xpu", 23 | [&]() { gpu_kernel_with_scalars(iter, CopysignFunctor<scalar_t>()); }); 24 | } 25 | 26 | } // namespace at::native::xpu 27 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/CopysignKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void copysign_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/CrossKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void linalg_cross_kernel( 7 | const Tensor& result, 8 | const Tensor& x1, 9 | const Tensor& x2, 10 | int64_t dim); 11 | 12 | } // namespace at::native::xpu 13 |
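Because copysign_kernel above is built on gpu_kernel_with_scalars, one compiled kernel covers both tensor-tensor and tensor-scalar iterators. A hedged sketch of the calling side (the wrapper is hypothetical; real callers are generated structured-op code, and binary_float_op is assumed to behave as in ATen's CPU/CUDA paths):

#include <ATen/core/Tensor.h>
#include <ATen/native/TensorIterator.h>

at::Tensor copysign_example(const at::Tensor& a, const at::Tensor& b) {
  at::Tensor out; // undefined on entry; the iterator allocates it
  auto iter = at::TensorIterator::binary_float_op(out, a, b);
  at::native::xpu::copysign_kernel(iter);
  return iter.output();
}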
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/CumprodKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | #include 6 | 7 | namespace at::native::xpu { 8 | 9 | void launch_cumprod_kernel( 10 | const Tensor& result, 11 | const Tensor& self, 12 | int64_t dim) { 13 | AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( 14 | ScalarType::Half, 15 | ScalarType::BFloat16, 16 | self.scalar_type(), 17 | "cumprod_xpu", 18 | [&]() { 19 | scalar_t init = 1; 20 | scan( 21 | result, self, dim, init, std::multiplies<scalar_t>()); 22 | }); 23 | } 24 | 25 | } // namespace at::native::xpu 26 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/CumsumKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | #include 6 | 7 | namespace at::native::xpu { 8 | 9 | void launch_cumsum_kernel( 10 | const Tensor& result, 11 | const Tensor& self, 12 | int64_t dim) { 13 | AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( 14 | ScalarType::Half, 15 | ScalarType::BFloat16, 16 | self.scalar_type(), 17 | "cumsum_xpu", 18 | [&]() { 19 | scalar_t init = 0; 20 | scan( 21 | result, self, dim, init, std::plus<scalar_t>()); 22 | }); 23 | } 24 | 25 | } // namespace at::native::xpu 26 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/DepthwiseConv3dKernels.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | #include 4 | #include 5 | 6 | namespace at::native::xpu { 7 | 8 | TORCH_XPU_API Tensor conv_depthwise3d_kernel( 9 | const Tensor& input, 10 | const Tensor& weight, 11 | IntArrayRef kernel_size, 12 | const std::optional<Tensor>& bias_opt, 13 | IntArrayRef stride, 14 | IntArrayRef padding, 15 | IntArrayRef dilation); 16 | 17 | TORCH_XPU_API std::tuple<Tensor, Tensor, Tensor> 18 | _depthwise_3d_backward_kernel( 19 | Tensor& grad_input, 20 | Tensor& grad_weight, 21 | Tensor& grad_bias, 22 | const Tensor& grad_output, 23 | const Tensor& input, 24 | const Tensor& weight, 25 | IntArrayRef kernel_size, 26 | IntArrayRef stride, 27 | IntArrayRef padding, 28 | IntArrayRef dilation, 29 | const std::array<bool, 3> output_mask); 30 | 31 | } // namespace at::native::xpu 32 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/Dequant_int4.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | #include 4 | #include 5 | 6 | namespace at::native::xpu { 7 | 8 | TORCH_XPU_API void dequant_int4_kernel( 9 | const Tensor& weight_int4, 10 | Tensor& weight, 11 | int qGroupSize, 12 | const Tensor& qScaleAndZeros); 13 | 14 | } // namespace at::native::xpu 15 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/DilatedMaxPool2d.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void max_pool2d_with_indices_kernel( 8 | const Tensor& input, 9 | IntArrayRef kernel_size, 10 | IntArrayRef stride, 11 | IntArrayRef padding, 12 | IntArrayRef dilation, 13 | bool ceil_mode, 14 | const Tensor& output, 15 | const Tensor& indices); 16 | 17 | TORCH_XPU_API void max_pool2d_with_indices_backward_kernel( 18 | const Tensor& gradInput, 19 | const Tensor& gradOutput, 20 | const Tensor& input, 21 | const Tensor& indices, 22 | IntArrayRef kernel_size, 23 | IntArrayRef stride, 24 | IntArrayRef padding, 25 | IntArrayRef dilation, 26 | bool ceil_mode); 27 | 28 | } // namespace at::native::xpu 29 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/DilatedMaxPool3d.h:
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void max_pool3d_with_indices_kernel( 8 | const Tensor& input, 9 | IntArrayRef kernel_size, 10 | IntArrayRef stride, 11 | IntArrayRef padding, 12 | IntArrayRef dilation, 13 | bool ceil_mode, 14 | Tensor& output, 15 | Tensor& indices); 16 | 17 | TORCH_XPU_API void max_pool3d_with_indices_backward_kernel( 18 | Tensor& gradInput, 19 | const Tensor& gradOutput, 20 | const Tensor& input, 21 | const Tensor& indices, 22 | IntArrayRef kernel_size, 23 | IntArrayRef stride, 24 | IntArrayRef padding, 25 | IntArrayRef dilation, 26 | bool ceil_mode); 27 | 28 | } // namespace at::native::xpu 29 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/DistanceKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void cdist_kernel( 7 | Tensor& result, 8 | const Tensor& x1_expanded, 9 | const Tensor& x2_expanded, 10 | double p); 11 | 12 | TORCH_XPU_API void cdist_backward_kernel( 13 | Tensor& grad_x1, 14 | const Tensor& grad, 15 | const Tensor& x1, 16 | const Tensor& x2, 17 | const double p, 18 | const Tensor& cdist); 19 | 20 | TORCH_XPU_API void pdist_forward_kernel( 21 | Tensor& result, 22 | const Tensor& self, 23 | double p); 24 | 25 | TORCH_XPU_API void pdist_backward_kernel( 26 | Tensor& result, 27 | const Tensor& grad, 28 | const Tensor& self, 29 | const double p, 30 | const Tensor& dist); 31 | 32 | } // namespace at::native::xpu 33 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/DistributionCauchyKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | namespace at::native::xpu { 7 | 8 | void cauchy_kernel( 9 | TensorIteratorBase& iter, 10 | double median, 11 | double sigma, 12 | std::optional gen) { 13 | auto generator = get_generator_or_default( 14 | gen, at::xpu::detail::getDefaultXPUGenerator()); 15 | at::native::templates::xpu::cauchy_kernel(iter, median, sigma, generator); 16 | } 17 | 18 | } // namespace at::native::xpu 19 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/DistributionExponentialKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | 12 | namespace at::native::xpu { 13 | 14 | void exponential_kernel( 15 | TensorIteratorBase& iter, 16 | double lambda, 17 | std::optional gen) { 18 | auto generator = get_generator_or_default( 19 | gen, at::xpu::detail::getDefaultXPUGenerator()); 20 | at::native::templates::xpu::exponential_kernel(iter, lambda, generator); 21 | } 22 | 23 | } // namespace at::native::xpu 24 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/DistributionGeometricKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | namespace at::native::xpu { 7 | 8 | void geometric_kernel( 9 | TensorIteratorBase& iter, 10 | double p_, 11 | std::optional gen) { 12 | auto generator = get_generator_or_default( 13 | 
gen, at::xpu::detail::getDefaultXPUGenerator()); 14 | at::native::templates::xpu::geometric_kernel(iter, p_, generator); 15 | } 16 | 17 | } // namespace at::native::xpu 18 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/DistributionLogNormalKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | namespace at::native::xpu { 7 | 8 | void log_normal_kernel( 9 | TensorIteratorBase& iter, 10 | double mean, 11 | double std, 12 | std::optional<Generator> gen) { 13 | auto generator = get_generator_or_default<XPUGeneratorImpl>( 14 | gen, at::xpu::detail::getDefaultXPUGenerator()); 15 | at::native::templates::xpu::log_normal_kernel(iter, mean, std, generator); 16 | } 17 | 18 | } // namespace at::native::xpu 19 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/DistributionNormal.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | 14 | namespace at { 15 | namespace native { 16 | namespace xpu { 17 | 18 | void normal_kernel( 19 | const TensorBase& self, 20 | double mean, 21 | double std, 22 | std::optional<Generator> gen) { 23 | auto generator = get_generator_or_default<XPUGeneratorImpl>( 24 | gen, at::xpu::detail::getDefaultXPUGenerator()); 25 | at::native::templates::xpu::normal_kernel(self, mean, std, generator); 26 | } 27 | 28 | } // namespace xpu 29 | } // namespace native 30 | } // namespace at 31 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/DistributionUniform.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | 14 | namespace at { 15 | namespace native { 16 | namespace xpu { 17 | 18 | void uniform_kernel( 19 | TensorIteratorBase& iter, 20 | double from, 21 | double to, 22 | std::optional<Generator> gen) { 23 | auto generator = get_generator_or_default<XPUGeneratorImpl>( 24 | gen, at::xpu::detail::getDefaultXPUGenerator()); 25 | at::native::templates::xpu::uniform_kernel(iter, from, to, generator); 26 | } 27 | 28 | } // namespace xpu 29 | } // namespace native 30 | } // namespace at 31 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/Distributions.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace at::native::xpu { 7 | 8 | TORCH_XPU_API void launch_poisson_kernel( 9 | const TensorBase& ret, 10 | const TensorBase& lambda, 11 | XPUGeneratorImpl* gen); 12 | 13 | TORCH_XPU_API void launch_binomial_kernel( 14 | TensorIteratorBase& iter, 15 | XPUGeneratorImpl* gen); 16 | 17 | TORCH_XPU_API void launch_gamma_kernel( 18 | Tensor& ret, 19 | const Tensor& alpha, 20 | XPUGeneratorImpl* gen); 21 | 22 | TORCH_XPU_API void launch_standard_gamma_grad_kernel(TensorIteratorBase& iter); 23 | 24 | TORCH_XPU_API void launch_dirichlet_kernel(TensorIteratorBase& iter); 25 | 26 | TORCH_XPU_API void launch_dirichlet_grad_kernel(TensorIteratorBase& iter); 27 | 28 | } // namespace at::native::xpu 29 |
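Each DistributionXKernel.cpp above is the same ten-line adapter: unwrap the optional Generator, fall back to the device-default XPU generator, and forward to the shared templates::xpu implementation. A hedged sketch of what one more distribution would look like (laplace_example_kernel and the templates::xpu::laplace_kernel helper are hypothetical; the five real files above are the model):

namespace at::native::xpu {

void laplace_example_kernel(
    TensorIteratorBase& iter,
    double loc,
    double scale,
    std::optional<Generator> gen) {
  // Use the caller's generator if given, else the per-device default.
  auto generator = get_generator_or_default<XPUGeneratorImpl>(
      gen, at::xpu::detail::getDefaultXPUGenerator());
  at::native::templates::xpu::laplace_kernel(iter, loc, scale, generator);
}

} // namespace at::native::xpu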
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/DropoutKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at { 6 | namespace native { 7 | namespace xpu { 8 | 9 | TORCH_XPU_API std::tuple<Tensor, Tensor> dropout_kernel( 10 | const Tensor& self, 11 | double p, 12 | std::optional<bool> train); 13 | 14 | TORCH_XPU_API Tensor 15 | dropout_backward_kernel(const Tensor& grad, const Tensor& mask, double scale); 16 | 17 | TORCH_XPU_API std::tuple<Tensor, Tensor> fused_dropout_kernel( 18 | const Tensor& self, 19 | double p, 20 | std::optional<Generator> gen_); 21 | 22 | TORCH_XPU_API Tensor 23 | masked_scale_kernel(const Tensor& self, const Tensor& mask, double scale); 24 | 25 | } // namespace xpu 26 | } // namespace native 27 | } // namespace at 28 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/EmbeddingKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API Tensor embedding_dense_backward_kernel( 8 | const Tensor& grad_, 9 | const Tensor& indices_, 10 | int64_t num_weights, 11 | int64_t padding_idx, 12 | bool scale_grad_by_freq); 13 | 14 | TORCH_XPU_API Tensor& embedding_renorm_kernel( 15 | Tensor& self, 16 | const Tensor& indices, 17 | double max_norm, 18 | double norm_type); 19 | 20 | } // namespace at::native::xpu 21 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/FFTKernelFunctor.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace at { 4 | namespace native { 5 | namespace xpu { 6 | 7 | void _fft_fill_with_conjugate_symmetry_xpu( 8 | ScalarType dtype, 9 | IntArrayRef mirror_dims, 10 | IntArrayRef signal_half_sizes, 11 | IntArrayRef in_strides, 12 | const void* in_data, 13 | IntArrayRef out_strides, 14 | void* out_data); 15 | 16 | template <typename scalar_t, typename inp_calc_t, typename out_calc_t> 17 | void _fft_conjugate_copy_kernel( 18 | int64_t numel, 19 | scalar_t* out_data, 20 | const scalar_t* in_data, 21 | inp_calc_t ic, 22 | out_calc_t oc); 23 | 24 | void _fft_fill_with_conjugate_symmetry_(const Tensor& input, IntArrayRef dim_); 25 | 26 | } // namespace xpu 27 | } // namespace native 28 | } // namespace at 29 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/FillKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | 7 | #include 8 | 9 | namespace at { 10 | namespace native { 11 | namespace xpu { 12 | 13 | template <typename scalar_t> 14 | struct FillFunctor { 15 | scalar_t operator()() const { 16 | return val_; 17 | } 18 | FillFunctor(scalar_t val) : val_(val) {} 19 | 20 | private: 21 | scalar_t val_; 22 | }; 23 | 24 | void fill_kernel(TensorIterator& iter, const Scalar& value) { 25 | AT_DISPATCH_V2( 26 | iter.dtype(), 27 | "fill_xpu", 28 | AT_WRAP([&]() { 29 | gpu_kernel(iter, FillFunctor<scalar_t>(value.to<scalar_t>())); 30 | }), 31 | AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), 32 | kComplexHalf, 33 | kBool, 34 | kHalf, 35 | kBFloat16, 36 | AT_EXPAND(AT_FLOAT8_TYPES), 37 | AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); 38 | } 39 | 40 | } // namespace xpu 41 | } // namespace native 42 | } // namespace at 43 |
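fill_kernel above runs a nullary functor over every element of the iterator, and its AT_DISPATCH_V2 list is what extends Tensor.fill_ on XPU to float8 and the barebones unsigned types. A hedged sketch of the calling side (the wrapper is hypothetical; nullary_op is the standard TensorIterator factory for output-only kernels):

#include <ATen/core/Tensor.h>
#include <ATen/native/TensorIterator.h>

at::Tensor& fill_example(at::Tensor& self, const at::Scalar& value) {
  auto iter = at::TensorIterator::nullary_op(self);
  at::native::xpu::fill_kernel(iter, value);
  return self;
}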
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/FillKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at { 6 | namespace native { 7 | namespace xpu { 8 | 9 | TORCH_XPU_API void fill_kernel(TensorIterator& iter, const Scalar& scalar); 10 | 11 | } 12 | } // namespace native 13 | } // namespace at 14 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ForeachCopyKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void foreach_copy_list_kernel_(TensorList self, TensorList src); 7 | 8 | } // namespace at::native::xpu 9 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ForeachPointwiseOpListKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include 5 | 6 | namespace at::native::xpu { 7 | 8 | #define FOREACH_POINTWISE_OP_TENSOR_KERNEL(NAME) \ 9 | FOREACH_POINTWISE_OP_SCALARLIST_KERNEL(NAME) 10 | 11 | #define FOREACH_POINTWISE_OP_TENSOR_INPLACE_KERNEL(NAME) \ 12 | FOREACH_POINTWISE_OP_SCALARLIST_INPLACE_KERNEL(NAME) 13 | 14 | TORCH_XPU_API FOREACH_POINTWISE_OP_TENSOR_KERNEL(addcmul); 15 | TORCH_XPU_API FOREACH_POINTWISE_OP_TENSOR_INPLACE_KERNEL(addcmul); 16 | TORCH_XPU_API FOREACH_POINTWISE_OP_TENSOR_KERNEL(addcdiv); 17 | TORCH_XPU_API FOREACH_POINTWISE_OP_TENSOR_INPLACE_KERNEL(addcdiv); 18 | 19 | } // namespace at::native::xpu 20 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ForeachPointwiseOpScalarKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | #define FOREACH_POINTWISE_OP_SCALAR_KERNEL(NAME) \ 7 | std::vector<Tensor> foreach_##NAME##_kernel( \ 8 | TensorList input, \ 9 | TensorList tensors1, \ 10 | TensorList tensors2, \ 11 | const Scalar& scalar) 12 | 13 | #define FOREACH_POINTWISE_OP_SCALAR_INPLACE_KERNEL(NAME) \ 14 | void foreach_##NAME##_kernel_( \ 15 | TensorList input, \ 16 | TensorList tensors1, \ 17 | TensorList tensors2, \ 18 | const Scalar& scalar) 19 | 20 | TORCH_XPU_API FOREACH_POINTWISE_OP_SCALAR_KERNEL(addcmul); 21 | TORCH_XPU_API FOREACH_POINTWISE_OP_SCALAR_INPLACE_KERNEL(addcmul); 22 | TORCH_XPU_API FOREACH_POINTWISE_OP_SCALAR_KERNEL(addcdiv); 23 | TORCH_XPU_API FOREACH_POINTWISE_OP_SCALAR_INPLACE_KERNEL(addcdiv); 24 | 25 | } // namespace at::native::xpu 26 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ForeachReduceKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API std::vector<Tensor> foreach_norm_kernel( 7 | TensorList tensors, 8 | const Scalar& ord, 9 | double p, 10 | std::optional<ScalarType> dtype); 11 | 12 | TORCH_XPU_API std::vector<Tensor> foreach_max_kernel(TensorList tensors); 13 | 14 | } // namespace at::native::xpu 15 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ForeachTernaryOpListKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void foreach_lerp_list_kernel( 7 | TensorList tensors1, 8 | TensorList tensors2, 9 | TensorList tensors3, 10 | TensorList result); 11 | 12 |
TORCH_XPU_API void foreach_lerp_list_kernel_( 13 | TensorList tensors1, 14 | TensorList tensors2, 15 | TensorList tensors3); 16 | 17 | } // namespace at::native::xpu 18 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ForeachTernaryOpScalarKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void foreach_lerp_scalar_kernel( 7 | TensorList tensors1, 8 | TensorList tensors2, 9 | const Scalar& weight, 10 | TensorList result); 11 | 12 | TORCH_XPU_API void foreach_lerp_scalar_kernel_( 13 | TensorList tensors1, 14 | TensorList tensors2, 15 | const Scalar& weight); 16 | 17 | } // namespace at::native::xpu 18 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ForeachTernaryOpScalarListKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void foreach_lerp_scalarlist_kernel( 7 | TensorList tensors1, 8 | TensorList tensors2, 9 | at::ArrayRef scalars, 10 | TensorList result); 11 | 12 | TORCH_XPU_API void foreach_lerp_scalarlist_kernel_( 13 | TensorList tensors1, 14 | TensorList tensors2, 15 | at::ArrayRef scalars); 16 | 17 | } // namespace at::native::xpu 18 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/FractionalMaxPool2dKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void fractional_max_pool2d_kernel( 7 | const Tensor& input, 8 | IntArrayRef pool_size, 9 | IntArrayRef output_size, 10 | const Tensor& randomSamples, 11 | const Tensor& output, 12 | const Tensor& indices); 13 | 14 | TORCH_XPU_API void fractional_max_pool2d_backward_kernel( 15 | const Tensor& gradOutput, 16 | const Tensor& input, 17 | IntArrayRef pool_size /* unused */, 18 | IntArrayRef output_size, 19 | const Tensor& indices, 20 | const Tensor& gradInput); 21 | 22 | } // namespace at::native::xpu 23 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/FractionalMaxPool3dKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void fractional_max_pool3d_kernel( 8 | const Tensor& input, 9 | int64_t poolSizeT, 10 | int64_t poolSizeH, 11 | int64_t poolSizeW, 12 | int64_t outputT, 13 | int64_t outputH, 14 | int64_t outputW, 15 | const Tensor& randomSamples, 16 | int64_t numBatch, 17 | int64_t numPlanes, 18 | int64_t inputT, 19 | int64_t inputH, 20 | int64_t inputW, 21 | const Tensor& output, 22 | const Tensor& indices); 23 | 24 | TORCH_XPU_API void fractional_max_pool3d_backward_kernel( 25 | Tensor& gradInput, 26 | const Tensor& gradOutput, 27 | const Tensor& input, 28 | IntArrayRef output_size, 29 | const Tensor& indices); 30 | 31 | } // namespace at::native::xpu 32 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/FunctionOfAMatrixUtilsKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void 
_compute_linear_combination_kernel( 7 | TensorIterator& iter, 8 | int64_t in_stride, 9 | int64_t coeff_stride, 10 | int64_t num_summations); 11 | 12 | } // namespace at::native::xpu 13 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/FusedSgdKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void fused_sgd_kernel( 7 | at::TensorList params, 8 | at::TensorList grads, 9 | const double weight_decay, 10 | const double momentum, 11 | const float* lr_ptr, 12 | const double lr, 13 | const double dampening, 14 | const bool nesterov, 15 | const bool maximize, 16 | const bool is_first_step, 17 | const float* grad_scale_ptr, 18 | const float* found_inf_ptr); 19 | 20 | TORCH_XPU_API void fused_sgd_with_momentum_kernel( 21 | at::TensorList params, 22 | at::TensorList grads, 23 | at::TensorList momentum_buffer_list, 24 | const double weight_decay, 25 | const double momentum, 26 | const float* lr_ptr, 27 | const double lr, 28 | const double dampening, 29 | const bool nesterov, 30 | const bool maximize, 31 | const bool is_first_step, 32 | const float* grad_scale_ptr, 33 | const float* found_inf_ptr); 34 | 35 | } // namespace at::native::xpu 36 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/GcdLcmKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at { 6 | namespace native { 7 | namespace xpu { 8 | 9 | TORCH_XPU_API void gcd_kernel(TensorIteratorBase& iter); 10 | 11 | TORCH_XPU_API void lcm_kernel(TensorIteratorBase& iter); 12 | 13 | } // namespace xpu 14 | } // namespace native 15 | } // namespace at 16 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/GroupNormKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void group_norm_kernel( 8 | const Tensor& X, 9 | const Tensor& gamma, 10 | const Tensor& beta, 11 | int64_t N, 12 | int64_t C, 13 | int64_t HxW, 14 | int64_t group, 15 | double eps, 16 | Tensor& Y, 17 | Tensor& mean, 18 | Tensor& rstd); 19 | 20 | TORCH_XPU_API void group_norm_backward_kernel( 21 | const Tensor& dY, 22 | const Tensor& X, 23 | const Tensor& mean, 24 | const Tensor& rstd, 25 | const Tensor& gamma, 26 | int64_t N, 27 | int64_t C, 28 | int64_t HxW, 29 | int64_t group, 30 | Tensor& dX, 31 | Tensor& dgamma, 32 | Tensor& dbeta); 33 | 34 | } // namespace at::native::xpu 35 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/HermitePolynomialHKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void hermite_polynomial_h_kernel(TensorIteratorBase& iterator); 7 | 8 | } // namespace at::native::xpu 9 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/HermitePolynomialHeKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void hermite_polynomial_he_kernel(TensorIteratorBase& iterator); 7 | 8 | } // namespace 
at::native::xpu 9 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/HistogramKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void histogramdd_kernel( 8 | const Tensor& self, 9 | const std::optional<Tensor>& weight, 10 | bool density, 11 | Tensor& hist, 12 | const TensorList& bin_edges_); 13 | 14 | TORCH_XPU_API void histogramdd_linear_kernel( 15 | const Tensor& self, 16 | const std::optional<Tensor>& weight, 17 | bool density, 18 | Tensor& hist, 19 | const TensorList& bin_edges_, 20 | bool local_search); 21 | 22 | TORCH_XPU_API void histogram_select_outer_bin_edges_kernel( 23 | const Tensor& input, 24 | const int64_t N, 25 | std::vector<double>& leftmost_edges, 26 | std::vector<double>& rightmost_edges); 27 | 28 | } // namespace at::native::xpu 29 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/IGammaKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void igamma_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void igammac_kernel(TensorIteratorBase& iter); 10 | 11 | } // namespace at::native::xpu 12 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/Im2ColKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void im2col_kernel( 8 | Tensor& output, 9 | const Tensor& input_, 10 | IntArrayRef kernel_size, 11 | IntArrayRef dilation, 12 | IntArrayRef padding, 13 | IntArrayRef stride); 14 | 15 | } // namespace at::native::xpu 16 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/KernelUtils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #define XPU_KERNEL_LOOP_TYPE(item, i, n, index_type) \ 7 | int64_t _i_n_d_e_x = \ 8 | item.get_group(0) * item.get_local_range(0) + item.get_local_id(0); \ 9 | for (index_type i = _i_n_d_e_x; _i_n_d_e_x < (n); \ 10 | _i_n_d_e_x += item.get_local_range(0) * item.get_group_range(0), \ 11 | i = _i_n_d_e_x) 12 | 13 | #define XPU_KERNEL_LOOP(item, i, n) XPU_KERNEL_LOOP_TYPE(item, i, n, int) 14 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/LaguerrePolynomialLKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void laguerre_polynomial_l_kernel(TensorIteratorBase& iterator); 7 | 8 | } // namespace at::native::xpu 9 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/LaunchUtils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native { 5 | namespace xpu { 6 | // returns 2**floor(log2(n)) 7 | static int lastPow2(unsigned int n) { 8 | n |= (n >> 1); 9 | n |= (n >> 2); 10 | n |= (n >> 4); 11 | n |= (n >> 8); 12 | n |= (n >> 16); 13 | return std::max<int>(1, n - (n >> 1)); 14 | } 15 | } // namespace xpu 16 | } // namespace at::native
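The XPU_KERNEL_LOOP macro in KernelUtils.h above implements a grid-stride loop: each work-item starts at its global linear index and advances by the total number of launched work-items until it passes n, so a fixed-size launch covers a tensor of any length. Below is a minimal sketch of how a SYCL functor might use it; the functor and its fields are hypothetical, not from this repository. (lastPow2 from LaunchUtils.h, by contrast, simply rounds a size down to a power of two, e.g. lastPow2(1023) == 512.)

#include <sycl/sycl.hpp>
// Assumes the XPU_KERNEL_LOOP macro from KernelUtils.h above is in scope.

struct FillValueFunctor {
  void operator()(sycl::nd_item<1> item) const {
    // i visits global_id, global_id + total_work_items, ... while i < n_.
    XPU_KERNEL_LOOP(item, i, n_) {
      out_[i] = value_;
    }
  }
  float* out_;   // device pointer to n_ elements
  float value_;  // value broadcast to every element
  int64_t n_;    // number of elements to fill
};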
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/LayerNormKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at { 6 | namespace native { 7 | namespace xpu { 8 | 9 | TORCH_XPU_API void layer_norm_kernel( 10 | const Tensor& X, 11 | const Tensor& gamma, 12 | const Tensor& beta, 13 | int64_t M, 14 | int64_t N, 15 | double eps, 16 | Tensor* Y, 17 | Tensor* mean, 18 | Tensor* rstd); 19 | 20 | TORCH_XPU_API std::tuple layer_norm_backward_kernel( 21 | const Tensor& dY, 22 | const Tensor& X, 23 | const Tensor& mean, 24 | const Tensor& rstd, 25 | const Tensor& gamma, 26 | int64_t M, 27 | int64_t N, 28 | Tensor& dX, 29 | Tensor& dgamma, 30 | Tensor& dbeta, 31 | std::array grad_input_mask); 32 | 33 | } // namespace xpu 34 | } // namespace native 35 | } // namespace at 36 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/LegendrePolynomialPKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void legendre_polynomial_p_kernel(TensorIteratorBase& iterator); 7 | 8 | } // namespace at::native::xpu 9 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/LerpKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void lerp_tensor_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void lerp_scalar_kernel( 10 | TensorIteratorBase& iter, 11 | const c10::Scalar& weight); 12 | 13 | } // namespace at::native::xpu 14 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/LinearAlgebraKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void addr_kernel( 7 | TensorIterator& iter, 8 | const Scalar& beta, 9 | const Scalar& alpha); 10 | 11 | } // namespace at::native::xpu 12 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/LinearInt4.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void linear_int4_kernel( 8 | const Tensor& input, 9 | const Tensor& weight, 10 | int qGroupSize, 11 | const Tensor& weight_scale_zero_point, 12 | Tensor& output); 13 | 14 | } // namespace at::native::xpu 15 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/LogAddExpKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void logaddexp_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void logaddexp2_kernel(TensorIteratorBase& iter); 10 | 11 | } // namespace at::native::xpu 12 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/LossCTCKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | 
TORCH_XPU_API std::tuple ctc_loss_kernel( 7 | const Tensor& log_probs, 8 | const Tensor& targets, 9 | IntArrayRef input_lengths, 10 | IntArrayRef target_lengths, 11 | int64_t BLANK, 12 | bool zero_infinity); 13 | 14 | TORCH_XPU_API Tensor ctc_loss_backward_kernel( 15 | const Tensor& grad, 16 | const Tensor& log_probs, 17 | const Tensor& targets, 18 | IntArrayRef input_lengths, 19 | IntArrayRef target_lengths, 20 | const Tensor& neg_log_likelihood, 21 | const Tensor& log_alpha, 22 | int64_t BLANK, 23 | bool zero_infinity); 24 | 25 | } // namespace at::native::xpu 26 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/LossKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API Tensor& binary_cross_entropy_kernel( 7 | const Tensor& input, 8 | const Tensor& target, 9 | const Tensor& weight, 10 | int64_t reduction, 11 | Tensor& loss); 12 | 13 | TORCH_XPU_API Tensor& binary_cross_entropy_backward_kernel( 14 | const Tensor& grad, 15 | const Tensor& input, 16 | const Tensor& target, 17 | const Tensor& weight, 18 | int64_t reduction, 19 | Tensor& grad_input); 20 | 21 | } // namespace at::native::xpu 22 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/LossNLL2dKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void nll_loss2d_forward_kernel( 8 | Tensor& output, 9 | Tensor& total_weight, 10 | const Tensor& input, 11 | const Tensor& target, 12 | const Tensor& weight, 13 | int64_t reduction, 14 | int64_t ignore_index); 15 | 16 | TORCH_XPU_API void nll_loss2d_backward_kernel( 17 | Tensor& grad_input, 18 | const Tensor& grad_output, 19 | const Tensor& input, 20 | const Tensor& target, 21 | const Tensor& weight, 22 | int64_t reduction, 23 | int64_t ignore_index, 24 | const Tensor& total_weight); 25 | 26 | } // namespace at::native::xpu 27 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/LossNLLKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void nll_loss_forward_kernel( 7 | const Tensor& self, 8 | const Tensor& target, 9 | const OptionalTensorRef weight_opt, 10 | int64_t reduction, 11 | int64_t ignore_index, 12 | const Tensor& output, 13 | const Tensor& total_weight); 14 | 15 | TORCH_XPU_API void nll_loss_backward_kernel( 16 | const Tensor& grad_output, 17 | const Tensor& self, 18 | const Tensor& target, 19 | const OptionalTensorRef weight_opt, 20 | int64_t reduction, 21 | int64_t ignore_index, 22 | const Tensor& total_weight, 23 | const Tensor& grad_input); 24 | 25 | } // namespace at::native::xpu 26 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/MaxMinElementwiseKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void maximum_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void minimum_kernel(TensorIteratorBase& iter); 10 | 11 | TORCH_XPU_API void fmax_kernel(TensorIteratorBase& iter); 12 | 13 | TORCH_XPU_API void fmin_kernel(TensorIteratorBase& 
iter); 14 | 15 | } // namespace at::native::xpu 16 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/MaxUnpoolingKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API Tensor& max_unpooling2d_forward_kernel( 7 | Tensor& output, 8 | const Tensor& self_, 9 | const Tensor& indices_, 10 | IntArrayRef output_size); 11 | 12 | TORCH_XPU_API Tensor& max_unpooling3d_forward_kernel( 13 | Tensor& output, 14 | const Tensor& self_, 15 | const Tensor& indices_, 16 | IntArrayRef output_size, 17 | IntArrayRef stride, 18 | IntArrayRef padding); 19 | 20 | } // namespace at::native::xpu 21 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ModifiedBesselI0Kernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace at::native::xpu { 10 | 11 | template <typename scalar_t> 12 | struct ModifiedBesselI0Functor { 13 | scalar_t operator()(scalar_t a) const { 14 | return modified_bessel_i0_forward(a); 15 | } 16 | }; 17 | 18 | void modified_bessel_i0_kernel(TensorIteratorBase& iter) { 19 | AT_DISPATCH_FLOATING_TYPES( 20 | iter.common_dtype(), "modified_bessel_i0_xpu", [&]() { 21 | gpu_kernel(iter, ModifiedBesselI0Functor<scalar_t>()); 22 | }); 23 | } 24 | 25 | } // namespace at::native::xpu 26 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ModifiedBesselI0Kernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void modified_bessel_i0_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ModifiedBesselI1Kernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace at::native::xpu { 10 | 11 | template <typename scalar_t> 12 | struct ModifiedBesselI1Functor { 13 | scalar_t operator()(scalar_t a) const { 14 | return modified_bessel_i1_forward(a); 15 | } 16 | }; 17 | 18 | void modified_bessel_i1_kernel(TensorIteratorBase& iter) { 19 | AT_DISPATCH_FLOATING_TYPES( 20 | iter.common_dtype(), "modified_bessel_i1_xpu", [&]() { 21 | gpu_kernel(iter, ModifiedBesselI1Functor<scalar_t>()); 22 | }); 23 | } 24 | 25 | } // namespace at::native::xpu 26 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ModifiedBesselI1Kernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void modified_bessel_i1_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ModifiedBesselK0Kernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace at::native::xpu { 10 | 11 | template <typename scalar_t> 12 | struct ModifiedBesselK0Functor { 13 | scalar_t operator()(scalar_t a) const { 14 | return modified_bessel_k0_forward(a); 15 | } 16 | }; 17 | 18 | void modified_bessel_k0_kernel(TensorIteratorBase& iter) { 19 | AT_DISPATCH_FLOATING_TYPES( 20 | iter.common_dtype(), "modified_bessel_k0_xpu", [&]() { 21 | gpu_kernel(iter, ModifiedBesselK0Functor<scalar_t>()); 22 | }); 23 | } 24 | 25 | } // namespace at::native::xpu 26 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ModifiedBesselK0Kernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void modified_bessel_k0_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ModifiedBesselK1Kernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace at::native::xpu { 10 | 11 | template <typename scalar_t> 12 | struct ModifiedBesselK1Functor { 13 | scalar_t operator()(scalar_t a) const { 14 | return modified_bessel_k1_forward(a); 15 | } 16 | }; 17 | 18 | void modified_bessel_k1_kernel(TensorIteratorBase& iter) { 19 | AT_DISPATCH_FLOATING_TYPES( 20 | iter.common_dtype(), "modified_bessel_k1_xpu", [&]() { 21 | gpu_kernel(iter, ModifiedBesselK1Functor<scalar_t>()); 22 | }); 23 | } 24 | 25 | } // namespace at::native::xpu 26 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ModifiedBesselK1Kernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void modified_bessel_k1_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/MultiLabelMarginLossKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void multilabel_margin_loss_kernel( 7 | const Tensor& input, 8 | const Tensor& target, 9 | int64_t reduction, 10 | Tensor& output, 11 | Tensor& is_target); 12 | 13 | TORCH_XPU_API void multilabel_margin_loss_backward_kernel( 14 | const Tensor& grad_output, 15 | const Tensor& input, 16 | const Tensor& target, 17 | int64_t reduction, 18 | const Tensor& is_target, 19 | Tensor& grad_input); 20 | 21 | } // namespace at::native::xpu
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/MultiMarginLossKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API Tensor& multi_margin_loss_kernel( 7 | const Tensor& input, 8 | const Tensor& target, 9 | const Scalar& p, 10 | const Scalar& margin, 11 | const std::optional<Tensor>& weight, 12 | int64_t reduction, 13 | Tensor& out); 14 | 15 | TORCH_XPU_API Tensor& multi_margin_loss_backward_kernel( 16 | const Tensor& grad_output, 17 | const Tensor& input, 18 | const Tensor& target, 19 | const Scalar& p, 20 | const Scalar& margin, 21 | const std::optional<Tensor>& weight, 22 | int64_t reduction, 23 | Tensor& grad_input); 24 | 25 | } // namespace at::native::xpu 26 |
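The four ModifiedBessel*Kernel.cpp files above all instantiate the same elementwise pattern: a stateless per-element functor, an AT_DISPATCH_FLOATING_TYPES switch on the iterator's common dtype (which defines scalar_t inside the lambda), and a launch through the shared gpu_kernel helper. A minimal sketch of that skeleton for a hypothetical op follows; my_op_forward and the other names are placeholders, and includes are elided here just as they are in the files above.

namespace at::native::xpu {

template <typename scalar_t>
struct MyOpFunctor {
  scalar_t operator()(scalar_t a) const {
    return my_op_forward(a); // placeholder for the scalar math routine
  }
};

void my_op_kernel(TensorIteratorBase& iter) {
  // AT_DISPATCH_FLOATING_TYPES binds scalar_t to the runtime dtype,
  // then gpu_kernel applies the functor once per element.
  AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "my_op_xpu", [&]() {
    gpu_kernel(iter, MyOpFunctor<scalar_t>());
  });
}

} // namespace at::native::xpu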
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/MultinomialKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void multinomial_kernel( 7 | Tensor& result, 8 | const Tensor& self, 9 | const int64_t n_sample, 10 | std::optional generator); 11 | 12 | } // namespace at::native::xpu 13 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/NMSKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at { 6 | namespace native { 7 | namespace xpu { 8 | 9 | TORCH_XPU_API Tensor nms_kernel(const Tensor& dets_sorted, float iou_threshold); 10 | 11 | } 12 | } // namespace native 13 | } // namespace at 14 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/NonzeroKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void nonzero_kernel(const Tensor& self, Tensor& out); 7 | 8 | } // namespace at::native::xpu 9 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/PointwiseOpsKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void addcmul_kernel( 8 | TensorIteratorBase& iter, 9 | const Scalar& value); 10 | 11 | TORCH_XPU_API void addcdiv_kernel( 12 | TensorIteratorBase& iter, 13 | const Scalar& value); 14 | 15 | TORCH_XPU_API void mse_backward_kernel( 16 | TensorIterator& iter, 17 | const Scalar& value); 18 | 19 | TORCH_XPU_API void smooth_l1_backward_kernel( 20 | TensorIterator& iter, 21 | const Scalar& norm, 22 | double beta); 23 | 24 | TORCH_XPU_API void huber_backward_kernel( 25 | TensorIterator& iter, 26 | const Scalar& norm, 27 | double delta); 28 | 29 | } // namespace at::native::xpu 30 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/PowKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at { 6 | namespace native { 7 | namespace xpu { 8 | 9 | TORCH_XPU_API void pow_tensor_scalar_kernel( 10 | TensorIteratorBase& iter, 11 | const Scalar& exp_scalar); 12 | 13 | TORCH_XPU_API void pow_tensor_tensor_kernel(TensorIteratorBase& iter); 14 | 15 | } // namespace xpu 16 | } // namespace native 17 | } // namespace at 18 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/PsRoiAlignKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API std::tuple ps_roi_align_kernel( 7 | const at::Tensor& input, 8 | const at::Tensor& rois, 9 | double spatial_scale, 10 | int64_t pooled_height, 11 | int64_t pooled_width, 12 | int64_t sampling_ratio); 13 | 14 | TORCH_XPU_API Tensor ps_roi_align_backward_kernel( 15 | const at::Tensor& grad, 16 | const at::Tensor& rois, 17 | const at::Tensor& channel_mapping, 18 | double spatial_scale, 19 | int64_t pooled_height, 20 | int64_t pooled_width, 21 | int64_t sampling_ratio, 22 
| int64_t batch_size, 23 | int64_t channels, 24 | int64_t height, 25 | int64_t width); 26 | } // namespace at::native::xpu 27 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/PsRoiPoolKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API std::tuple ps_roi_pool_kernel( 7 | const at::Tensor& input, 8 | const at::Tensor& rois, 9 | double spatial_scale, 10 | int64_t pooled_height, 11 | int64_t pooled_width); 12 | 13 | TORCH_XPU_API Tensor ps_roi_pool_backward_kernel( 14 | const at::Tensor& grad, 15 | const at::Tensor& rois, 16 | const at::Tensor& channel_mapping, 17 | double spatial_scale, 18 | int64_t pooled_height, 19 | int64_t pooled_width, 20 | int64_t batch_size, 21 | int64_t channels, 22 | int64_t height, 23 | int64_t width); 24 | } // namespace at::native::xpu -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/RandpermKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API Tensor 7 | randperm_kernel(Tensor& result, int64_t n, std::optional generator); 8 | 9 | } // namespace at::native::xpu 10 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/RangeFactoriesKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API Tensor& arange_kernel( 8 | const Scalar& start, 9 | const Scalar& end, 10 | const Scalar& step, 11 | Tensor& result); 12 | 13 | TORCH_XPU_API Tensor& range_kernel( 14 | const Scalar& start, 15 | const Scalar& end, 16 | const Scalar& step, 17 | Tensor& result); 18 | 19 | TORCH_XPU_API Tensor& linspace_kernel( 20 | const Scalar& start, 21 | const Scalar& end, 22 | int64_t steps, 23 | Tensor& result); 24 | 25 | TORCH_XPU_API Tensor& logspace_kernel( 26 | const Scalar& start, 27 | const Scalar& end, 28 | int64_t steps, 29 | double base, 30 | Tensor& result); 31 | 32 | } // namespace at::native::xpu 33 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ReduceMaxValuesKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void max_values_kernel(TensorIterator& iter); 8 | 9 | TORCH_XPU_API void max_kernel(TensorIterator& iter); 10 | 11 | TORCH_XPU_API void max_all_kernel(TensorIterator& iter); 12 | 13 | } // namespace at::native::xpu 14 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ReduceMinValuesKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void min_values_kernel(TensorIterator& iter); 8 | 9 | TORCH_XPU_API void min_kernel(TensorIterator& iter); 10 | 11 | TORCH_XPU_API void min_all_kernel(TensorIterator& iter); 12 | 13 | } // namespace at::native::xpu 14 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ReduceNormKernel.h: -------------------------------------------------------------------------------- 1 | 
#pragma once 2 | #include 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void norm_kernel(TensorIterator& iter, const Scalar& val); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ReduceOps.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | void argmax_kernel(TensorIterator& iter); 8 | 9 | void and_kernel(TensorIterator& iter); 10 | 11 | void or_kernel(TensorIterator& iter); 12 | 13 | void mean_kernel(TensorIterator& iter); 14 | 15 | void sum_kernel(TensorIterator& iter); 16 | 17 | } // namespace at::native::xpu 18 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ReduceOpsKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void argmax_kernel(TensorIterator& iter); 8 | 9 | TORCH_XPU_API void argmin_kernel(TensorIterator& iter); 10 | 11 | TORCH_XPU_API void and_kernel(TensorIterator& iter); 12 | 13 | TORCH_XPU_API void or_kernel(TensorIterator& iter); 14 | 15 | TORCH_XPU_API void mean_kernel(TensorIterator& iter); 16 | 17 | TORCH_XPU_API void sum_kernel(TensorIterator& iter); 18 | 19 | TORCH_XPU_API void prod_kernel(TensorIterator& iter); 20 | 21 | TORCH_XPU_API void nansum_kernel(TensorIterator& iter); 22 | 23 | TORCH_XPU_API void std_var_kernel( 24 | TensorIterator& iter, 25 | double correction, 26 | bool take_sqrt); 27 | 28 | TORCH_XPU_API void aminmax_kernel(TensorIterator& iter); 29 | 30 | TORCH_XPU_API void aminmax_allreduce_kernel(TensorIterator& iter); 31 | 32 | } // namespace at::native::xpu 33 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/RenormKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | 7 | namespace at::native::xpu { 8 | 9 | template <typename scalar_t> 10 | struct RenormScalarFactorFunctor { 11 | scalar_t operator()(scalar_t norm) const { 12 | const auto eps = static_cast<scalar_t>(1e-7); 13 | const auto one = static_cast<scalar_t>(1.0); 14 | return (norm > maxnorm_elm) ? maxnorm_elm / (norm + eps) : one; 15 | } 16 | 17 | RenormScalarFactorFunctor(scalar_t maxnorm_elm) : maxnorm_elm(maxnorm_elm) {} 18 | 19 | private: 20 | scalar_t maxnorm_elm; 21 | }; 22 | 23 | void renorm_scale_factor_kernel(TensorIteratorBase& iter, double maxnorm) { 24 | AT_DISPATCH_FLOATING_TYPES_AND2( 25 | at::ScalarType::Half, 26 | at::ScalarType::BFloat16, 27 | iter.common_dtype(), 28 | "renorm_scale_factor_xpu", 29 | [&] { 30 | RenormScalarFactorFunctor<scalar_t> f(maxnorm); 31 | gpu_kernel(iter, f); 32 | }); 33 | } 34 | 35 | } // namespace at::native::xpu 36 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/RenormKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void renorm_scale_factor_kernel( 7 | TensorIteratorBase& iter, 8 | double maxnorm); 9 | 10 | } 11 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/RepeatKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | namespace at::native::xpu { 4 | 5 | TORCH_XPU_API Tensor repeat_interleave_kernel( 6 | const Tensor& repeats, 7 | std::optional<int64_t> output_size); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ResizeKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API TensorImpl* resize_impl_xpu_( 8 | TensorImpl* self, 9 | IntArrayRef size, 10 | at::OptionalIntArrayRef stride, 11 | bool device_guard = true); 12 | 13 | } 14 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/RoiAlignKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API Tensor roi_align_kernel( 7 | const at::Tensor& input, 8 | const at::Tensor& rois, 9 | double spatial_scale, 10 | int64_t pooled_height, 11 | int64_t pooled_width, 12 | int64_t sampling_ratio, 13 | bool aligned); 14 | 15 | TORCH_XPU_API Tensor roi_align_backward_kernel( 16 | const at::Tensor& grad, 17 | const at::Tensor& rois, 18 | double spatial_scale, 19 | int64_t pooled_height, 20 | int64_t pooled_width, 21 | int64_t batch_size, 22 | int64_t channels, 23 | int64_t height, 24 | int64_t width, 25 | int64_t sampling_ratio, 26 | bool aligned); 27 | } // namespace at::native::xpu
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/RoiPoolKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API std::tuple<Tensor, Tensor> roi_pool_kernel( 7 | const at::Tensor& input, 8 | const at::Tensor& rois, 9 | double spatial_scale, 10 | int64_t pooled_height, 11 | int64_t pooled_width); 12 | 13 | TORCH_XPU_API Tensor roi_pool_backward_kernel( 14 | const at::Tensor& grad, 15 | const at::Tensor& rois, 16 | const at::Tensor& argmax, 17 | double spatial_scale, 18 | int64_t pooled_height, 19 | int64_t pooled_width, 20 | int64_t batch_size, 21 | int64_t channels, 22 | int64_t height, 23 | int64_t width); 24 | } // namespace at::native::xpu
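RenormKernel.cpp above is the stateful variant of the same elementwise pattern: instead of a stateless functor, the functor stores a scalar that is captured once per dispatch and then reused for every element. A minimal sketch under the same assumptions (hypothetical names, elided includes, and the same dispatch/launch helpers used by the surrounding files):

namespace at::native::xpu {

template <typename scalar_t>
struct ScaleByFunctor {
  scalar_t operator()(scalar_t x) const {
    return x * scale_; // per-element work reuses the captured scalar
  }
  ScaleByFunctor(scalar_t scale) : scale_(scale) {}

 private:
  scalar_t scale_; // captured once per dispatch, not per element
};

void scale_by_kernel(TensorIteratorBase& iter, double scale) {
  // The AND2 dispatch also covers the reduced-precision dtypes,
  // mirroring renorm_scale_factor_kernel above.
  AT_DISPATCH_FLOATING_TYPES_AND2(
      at::ScalarType::Half,
      at::ScalarType::BFloat16,
      iter.common_dtype(),
      "scale_by_xpu",
      [&] {
        ScaleByFunctor<scalar_t> f(static_cast<scalar_t>(scale));
        gpu_kernel(iter, f);
      });
}

} // namespace at::native::xpu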
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/RreluWithNoiseKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace at::native::xpu { 7 | 8 | TORCH_XPU_API Tensor& rrelu_with_noise_kernel( 9 | const Tensor& self, 10 | Tensor& noise, 11 | const Scalar& lower, 12 | const Scalar& upper, 13 | bool training, 14 | std::optional generator, 15 | Tensor& output); 16 | 17 | } 18 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ScaledModifiedBesselK0Kernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace at::native::xpu { 10 | 11 | template 12 | struct ScaledModifiedBesselK0Functor { 13 | scalar_t operator()(scalar_t a) const { 14 | return scaled_modified_bessel_k0_forward(a); 15 | } 16 | }; 17 | 18 | void scaled_modified_bessel_k0_kernel(TensorIteratorBase& iter) { 19 | AT_DISPATCH_FLOATING_TYPES( 20 | iter.common_dtype(), "scaled_modified_bessel_k0_xpu", [&]() { 21 | gpu_kernel(iter, ScaledModifiedBesselK0Functor()); 22 | }); 23 | } 24 | 25 | } // namespace at::native::xpu 26 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ScaledModifiedBesselK0Kernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void scaled_modified_bessel_k0_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ScaledModifiedBesselK1Kernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace at::native::xpu { 10 | 11 | template 12 | struct ScaledModifiedBesselK1Functor { 13 | scalar_t operator()(scalar_t a) const { 14 | return scaled_modified_bessel_k1_forward(a); 15 | } 16 | }; 17 | 18 | void scaled_modified_bessel_k1_kernel(TensorIteratorBase& iter) { 19 | AT_DISPATCH_FLOATING_TYPES( 20 | iter.common_dtype(), "scaled_modified_bessel_k1_xpu", [&]() { 21 | gpu_kernel(iter, ScaledModifiedBesselK1Functor()); 22 | }); 23 | } 24 | 25 | } // namespace at::native::xpu 26 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ScaledModifiedBesselK1Kernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void scaled_modified_bessel_k1_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ShapeKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void cat_out_kernel( 8 | const ITensorListRef& tensors, 9 | int64_t dim, 10 | int64_t valid, 11 | bool all_contiguous, 12 | bool all_same_dtype, 13 | bool all_same_sizes_and_stride, 14 | MemoryFormat memory_format, 15 | const Tensor& result); 16 | 17 | } // namespace 
at::native::xpu 18 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ShiftedChebyshevPolynomialKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void shifted_chebyshev_polynomial_t_kernel( 8 | TensorIteratorBase& iterator); 9 | 10 | TORCH_XPU_API void shifted_chebyshev_polynomial_u_kernel( 11 | TensorIteratorBase& iterator); 12 | 13 | TORCH_XPU_API void shifted_chebyshev_polynomial_v_kernel( 14 | TensorIteratorBase& iterator); 15 | 16 | TORCH_XPU_API void shifted_chebyshev_polynomial_w_kernel( 17 | TensorIteratorBase& iterator); 18 | 19 | } // namespace at::native::xpu 20 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ShiftedChebyshevPolynomialTKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace at::native::xpu { 8 | 9 | template <typename scalar_t> 10 | struct ShiftedChebyshevPolynomialTFunctor { 11 | scalar_t operator()(scalar_t x, scalar_t n) const { 12 | return shifted_chebyshev_polynomial_t_forward(x, n); 13 | } 14 | }; 15 | 16 | void shifted_chebyshev_polynomial_t_kernel(TensorIteratorBase& iterator) { 17 | AT_DISPATCH_FLOATING_TYPES( 18 | iterator.common_dtype(), "shifted_chebyshev_polynomial_t_xpu", [&]() { 19 | ShiftedChebyshevPolynomialTFunctor<scalar_t> f; 20 | gpu_kernel_with_scalars(iterator, f); 21 | }); 22 | } 23 | 24 | } // namespace at::native::xpu 25 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ShiftedChebyshevPolynomialUKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace at::native::xpu { 8 | 9 | template <typename scalar_t> 10 | struct ShiftedChebyshevPolynomialUFunctor { 11 | scalar_t operator()(scalar_t x, scalar_t n) const { 12 | return shifted_chebyshev_polynomial_u_forward(x, n); 13 | } 14 | }; 15 | 16 | void shifted_chebyshev_polynomial_u_kernel(TensorIteratorBase& iterator) { 17 | AT_DISPATCH_FLOATING_TYPES( 18 | iterator.common_dtype(), "shifted_chebyshev_polynomial_u_xpu", [&]() { 19 | ShiftedChebyshevPolynomialUFunctor<scalar_t> f; 20 | gpu_kernel_with_scalars(iterator, f); 21 | }); 22 | } 23 | 24 | } // namespace at::native::xpu 25 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ShiftedChebyshevPolynomialVKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace at::native::xpu { 8 | 9 | template <typename scalar_t> 10 | struct ShiftedChebyshevPolynomialVFunctor { 11 | scalar_t operator()(scalar_t x, scalar_t n) const { 12 | return shifted_chebyshev_polynomial_v_forward(x, n); 13 | } 14 | }; 15 | 16 | void shifted_chebyshev_polynomial_v_kernel(TensorIteratorBase& iterator) { 17 | AT_DISPATCH_FLOATING_TYPES( 18 | iterator.common_dtype(), "shifted_chebyshev_polynomial_v_xpu", [&]() { 19 | ShiftedChebyshevPolynomialVFunctor<scalar_t> f; 20 | gpu_kernel_with_scalars(iterator, f); 21 | }); 22 | } 23 | 24 | } // namespace at::native::xpu 25 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ShiftedChebyshevPolynomialWKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace at::native::xpu { 8 | 9 | template <typename scalar_t> 10 | struct ShiftedChebyshevPolynomialWFunctor { 11 | scalar_t operator()(scalar_t x, scalar_t n) const { 12 | return shifted_chebyshev_polynomial_w_forward(x, n); 13 | } 14 | }; 15 | 16 | void shifted_chebyshev_polynomial_w_kernel(TensorIteratorBase& iterator) { 17 | AT_DISPATCH_FLOATING_TYPES( 18 | iterator.common_dtype(), "shifted_chebyshev_polynomial_w_xpu", [&]() { 19 | ShiftedChebyshevPolynomialWFunctor<scalar_t> f; 20 | gpu_kernel_with_scalars(iterator, f); 21 | }); 22 | } 23 | 24 | } // namespace at::native::xpu 25 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/Sorting.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void sort_stable_kernel( 8 | const TensorBase& self_base, 9 | const TensorBase& values_base, 10 | const TensorBase& indices_base, 11 | int64_t dim, 12 | bool descending, 13 | bool stable); 14 | 15 | TORCH_XPU_API void launch_median_kernel( 16 | const TensorBase& vals, 17 | const TensorBase& inds, 18 | const TensorBase& self, 19 | int64_t dim, 20 | bool ignore_nan); 21 | 22 | TORCH_XPU_API void launch_kthvalue_kernel( 23 | const TensorBase& values, 24 | const TensorBase& indices, 25 | const TensorBase& self, 26 | int64_t dim, 27 | int64_t k); 28 | 29 | } // namespace at::native::xpu 30 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/SphericalBesselJ0Kernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace at::native::xpu { 10 | 11 | template <typename scalar_t> 12 | struct SphericalBesselJ0Functor { 13 | scalar_t operator()(scalar_t a) const { 14 | return spherical_bessel_j0_forward(a); 15 | } 16 | }; 17 | 18 | void spherical_bessel_j0_kernel(TensorIteratorBase& iter) { 19 | AT_DISPATCH_FLOATING_TYPES( 20 | iter.common_dtype(), "spherical_bessel_j0_xpu", [&]() { 21 | gpu_kernel(iter, SphericalBesselJ0Functor<scalar_t>()); 22 | }); 23 | } 24 | 25 | } // namespace at::native::xpu 26 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/SphericalBesselJ0Kernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void spherical_bessel_j0_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/StepKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void nextafter_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void heaviside_kernel(TensorIteratorBase& iter); 10 | 11 | } // namespace at::native::xpu 12 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/SummaryOpsKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API Tensor
7 | bincount_kernel(const Tensor& self, const Tensor& weights, int64_t minlength); 8 | 9 | TORCH_XPU_API Tensor _histc_kernel( 10 | const Tensor& self, 11 | int64_t nbins, 12 | const Scalar& min, 13 | const Scalar& max); 14 | 15 | } // namespace at::native::xpu 16 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/TensorCompareKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void where_kernel(TensorIterator& iter); 8 | 9 | TORCH_XPU_API void isposinf_kernel(TensorIteratorBase& iter); 10 | 11 | TORCH_XPU_API void isneginf_kernel(TensorIteratorBase& iter); 12 | 13 | TORCH_XPU_API void clamp_kernel(TensorIteratorBase& iter); 14 | 15 | TORCH_XPU_API void clamp_scalar_kernel( 16 | TensorIteratorBase& iter, 17 | const Scalar& min, 18 | const Scalar& max); 19 | 20 | TORCH_XPU_API void clamp_min_scalar_kernel( 21 | TensorIteratorBase& iter, 22 | Scalar min); 23 | 24 | TORCH_XPU_API void clamp_max_scalar_kernel( 25 | TensorIteratorBase& iter, 26 | Scalar max); 27 | 28 | TORCH_XPU_API void isin_kernel( 29 | const Tensor& elements, 30 | const Tensor& test_elements, 31 | bool invert, 32 | const Tensor& out); 33 | 34 | TORCH_XPU_API void _assert_async_msg_kernel( 35 | const Tensor& self_tensor, 36 | std::string_view assert_msg); 37 | 38 | } // namespace at::native::xpu 39 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/TensorFactoriesKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace at::native::xpu { 7 | 8 | TORCH_XPU_API Tensor tril_indices_kernel( 9 | int64_t row, 10 | int64_t col, 11 | int64_t offset, 12 | const TensorOptions& options); 13 | 14 | TORCH_XPU_API Tensor triu_indices_kernel( 15 | int64_t row, 16 | int64_t col, 17 | int64_t offset, 18 | const TensorOptions& options); 19 | 20 | } // namespace at::native::xpu 21 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/TensorModeKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void mode_kernel( 8 | Tensor& values, 9 | Tensor& indices, 10 | const Tensor& self, 11 | int64_t dim, 12 | bool keepdim); 13 | 14 | } // namespace at::native::xpu 15 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/TensorShapeKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void split_with_sizes_copy_out_xpu_kernel( 8 | const Tensor& self, 9 | IntArrayRef split_sizes, 10 | int64_t dim, 11 | TensorList out); 12 | 13 | TORCH_XPU_API Tensor 14 | _chunk_cat_xpu_kernel(TensorList tensors, int64_t dim, int64_t num_chunks); 15 | 16 | TORCH_XPU_API Tensor& _chunk_cat_out_xpu_kernel( 17 | TensorList tensors, 18 | int64_t dim, 19 | int64_t num_chunks, 20 | Tensor& out); 21 | 22 | } // namespace at::native::xpu 23 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/TensorTopKKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | 
#include 4 | 5 | namespace at { 6 | namespace native { 7 | namespace xpu { 8 | 9 | TORCH_XPU_API void topk_kernel( 10 | const at::Tensor& input, 11 | int64_t k, 12 | int64_t dim, 13 | bool largest, 14 | bool sorted, 15 | const at::Tensor& values, 16 | const at::Tensor& indices); 17 | 18 | } // namespace xpu 19 | } // namespace native 20 | } // namespace at 21 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/TensorTransformationsKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void flip_kernel(TensorIterator& iter, bool quantized); 8 | 9 | TORCH_XPU_API void roll_kernel( 10 | const Tensor& input, 11 | Tensor& output, 12 | IntArrayRef shifts, 13 | IntArrayRef dims); 14 | 15 | } // namespace at::native::xpu 16 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/TriangularOpsKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void tril_kernel( 7 | const Tensor& result, 8 | const Tensor& self, 9 | int64_t k); 10 | 11 | TORCH_XPU_API void triu_kernel( 12 | const Tensor& result, 13 | const Tensor& self, 14 | int64_t k); 15 | 16 | } // namespace at::native::xpu 17 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryComplexKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void conj_kernel(TensorIterator& iter); 8 | 9 | TORCH_XPU_API void conj_physical_kernel(TensorIteratorBase& iter); 10 | 11 | TORCH_XPU_API void neg_conj_kernel(TensorIterator& iter); 12 | 13 | TORCH_XPU_API void neg_kernel(TensorIterator& iter); 14 | 15 | TORCH_XPU_API void angle_kernel(TensorIteratorBase& iter); 16 | 17 | } // namespace at::native::xpu 18 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryFractionKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void reciprocal_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void floor_kernel(TensorIteratorBase& iter); 10 | 11 | TORCH_XPU_API void ceil_kernel(TensorIteratorBase& iter); 12 | 13 | TORCH_XPU_API void round_kernel(TensorIteratorBase& iter); 14 | 15 | TORCH_XPU_API void round_decimals_kernel( 16 | TensorIteratorBase& iter, 17 | int64_t decimals); 18 | 19 | TORCH_XPU_API void frac_kernel(TensorIteratorBase& iter); 20 | 21 | TORCH_XPU_API void trunc_kernel(TensorIteratorBase& iter); 22 | 23 | } // namespace at::native::xpu 24 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryGammaKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void digamma_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void polygamma_kernel(TensorIteratorBase& iter, int64_t n); 10 | 11 | TORCH_XPU_API void lgamma_kernel(TensorIteratorBase& iter); 12 | 13 | } // namespace at::native::xpu 14 | 
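The Unary*Kernels.h headers above all follow one declaration convention: #pragma once, a small set of includes, and one TORCH_XPU_API function per op taking a TensorIteratorBase& inside namespace at::native::xpu. A sketch of what a header for a hypothetical new unary op would look like; the op name is made up and the include path is an assumption, since the real include targets are elided throughout this dump.

#pragma once

#include <ATen/native/TensorIterator.h> // assumed; actual includes are elided above
// The TORCH_XPU_API visibility macro comes from a project header, also elided.

namespace at::native::xpu {

TORCH_XPU_API void my_unary_kernel(TensorIteratorBase& iter);

} // namespace at::native::xpu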
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryGeometricAcosKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void acos_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryGeometricAcoshKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void acosh_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryGeometricAsinKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void asin_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryGeometricAsinhKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void asinh_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryGeometricAtanKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void atan_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryGeometricAtanhKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void atanh_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | #include 9 | 10 | namespace at::native::xpu { 11 | 12 | template <typename scalar_t> 13 | struct CosFunctor { 14 | scalar_t operator()(const scalar_t a) const { 15 | return std::cos(a); 16 | } 17 | }; 18 | 19 | void cos_kernel(TensorIteratorBase& iter) { 20 | auto common_dtype = iter.common_dtype(); 21 | if (at::isComplexType(common_dtype)) { 22 | AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, common_dtype, "cos_xpu", [&]() { 23 | using opmath_t = at::opmath_type<scalar_t>; 24 | gpu_kernel(iter, CosFunctor<opmath_t>()); 25 | }); 26 | } else { 27 | AT_DISPATCH_FLOATING_TYPES_AND2( 28 | ScalarType::Half, ScalarType::BFloat16, common_dtype, "cos_xpu", [&]() { 29 | gpu_kernel(iter, CosFunctor<scalar_t>()); 30 | }); 31 | } 32 | } 33 | 34 | } // namespace at::native::xpu 35 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryGeometricCosKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void cos_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryGeometricCoshKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void cosh_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | #include 9 | 10 | namespace at::native::xpu { 11 | 12 | template <typename scalar_t> 13 | struct SinFunctor { 14 | scalar_t operator()(const scalar_t a) const { 15 | return std::sin(a); 16 | } 17 | }; 18 | 19 | void sin_kernel(TensorIteratorBase& iter) { 20 | auto common_dtype = iter.common_dtype(); 21 | if (at::isComplexType(common_dtype)) { 22 | AT_DISPATCH_COMPLEX_TYPES_AND(kComplexHalf, common_dtype, "sin_xpu", [&]() { 23 | using opmath_t = at::opmath_type<scalar_t>; 24 | gpu_kernel(iter, SinFunctor<opmath_t>()); 25 | }); 26 | } else { 27 | AT_DISPATCH_FLOATING_TYPES_AND2( 28 | ScalarType::Half, ScalarType::BFloat16, common_dtype, "sin_xpu", [&]() { 29 | gpu_kernel(iter, SinFunctor<scalar_t>()); 30 | }); 31 | } 32 | } 33 | 34 | } // namespace at::native::xpu 35 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryGeometricSinKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void sin_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryGeometricSinhKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void sinh_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryGeometricTanKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void tan_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryGeometricTanhKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void tanh_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu 10 |
-------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API
void sqrt_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void rsqrt_kernel(TensorIteratorBase& iter); 10 | 11 | TORCH_XPU_API void bitwise_not_kernel(TensorIteratorBase& iter); 12 | 13 | TORCH_XPU_API void exp_kernel(TensorIteratorBase& iter); 14 | 15 | TORCH_XPU_API void expm1_kernel(TensorIteratorBase& iter); 16 | 17 | TORCH_XPU_API void nan_to_num_kernel( 18 | TensorIteratorBase& iter, 19 | std::optional<double> nan, 20 | std::optional<double> pos_inf, 21 | std::optional<double> neg_inf); 22 | 23 | TORCH_XPU_API void frexp_kernel(TensorIteratorBase& iter); 24 | 25 | } // namespace at::native::xpu 26 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnaryLogKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void log_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void log10_kernel(TensorIteratorBase& iter); 10 | 11 | TORCH_XPU_API void log1p_kernel(TensorIteratorBase& iter); 12 | 13 | TORCH_XPU_API void log2_kernel(TensorIteratorBase& iter); 14 | 15 | } // namespace at::native::xpu 16 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnarySignKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void logical_not_kernel(TensorIteratorBase& iter); 8 | 9 | TORCH_XPU_API void neg_kernel(TensorIteratorBase& iter); 10 | 11 | TORCH_XPU_API void sgn_kernel(TensorIteratorBase& iter); 12 | 13 | TORCH_XPU_API void sign_kernel(TensorIteratorBase& iter); 14 | 15 | TORCH_XPU_API void signbit_kernel(TensorIteratorBase& iter); 16 | 17 | } // namespace at::native::xpu 18 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UnfoldBackwardKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void unfold_backward_kernel( 8 | Tensor& grad_out, 9 | const Tensor& grad_in, 10 | int64_t dim, 11 | int64_t size, 12 | int64_t step); 13 | 14 | } // namespace at::native::xpu 15 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UniqueKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API std::tuple<Tensor, Tensor, Tensor> unique_consecutive_kernel( 8 | const Tensor& self, 9 | const bool return_inverse, 10 | const bool return_counts, 11 | std::optional<int64_t> dim); 12 | 13 | TORCH_XPU_API std::tuple<Tensor, Tensor, Tensor> unique_dim_consecutive_kernel( 14 | const Tensor& self, 15 | const int64_t dim, 16 | const bool return_inverse, 17 | const bool return_counts); 18 | 19 | TORCH_XPU_API std::tuple<Tensor, Tensor, Tensor> unique_dim_kernel( 20 | const Tensor& self, 21 | const int64_t dim, 22 | const bool return_inverse, 23 | const bool return_counts); 24 | 25 | TORCH_XPU_API std::tuple<Tensor, Tensor> _unique_kernel( 26 | const Tensor& self, 27 | const bool return_inverse); 28 | 29 | TORCH_XPU_API std::tuple<Tensor, Tensor, Tensor> _unique2_kernel( 30 | const Tensor& self, 31 | const bool return_inverse, 32 | const bool return_counts); 33 | 34 | } // namespace at::native::xpu 35 | --------------------------------------------------------------------------------
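The unique kernels declared above back torch.unique and torch.unique_consecutive for tensors on the XPU device. A minimal usage sketch, assuming a working XPU device is present (the input values and device string are illustrative, not taken from the source):

import torch

x = torch.tensor([1, 1, 2, 2, 2, 3, 1], device="xpu")

# Presumably routes to _unique2_kernel (no dim, inverse and counts requested).
values, inverse, counts = torch.unique(
    x, sorted=True, return_inverse=True, return_counts=True
)

# Presumably routes to unique_consecutive_kernel: only adjacent runs collapse.
cvalues, ccounts = torch.unique_consecutive(x, return_counts=True)

print(values.cpu(), counts.cpu())    # tensor([1, 2, 3]) tensor([3, 3, 1])
print(cvalues.cpu(), ccounts.cpu())  # tensor([1, 2, 3, 1]) tensor([2, 3, 1, 1])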
/src/ATen/native/xpu/sycl/UpSampleBicubic2dKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void upsample_bicubic2d_kernel( 8 | const Tensor& output, 9 | const Tensor& input, 10 | IntArrayRef output_size, 11 | bool align_corners, 12 | std::optional<double> scales_h, 13 | std::optional<double> scales_w); 14 | 15 | TORCH_XPU_API void upsample_bicubic2d_backward_kernel( 16 | const Tensor& grad_input, 17 | const Tensor& grad_output_, 18 | IntArrayRef output_size, 19 | IntArrayRef input_size, 20 | bool align_corners, 21 | std::optional<double> scales_h, 22 | std::optional<double> scales_w); 23 | 24 | } // namespace at::native::xpu 25 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UpSampleLinear1dKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void upsample_linear1d_kernel( 8 | const Tensor& input, 9 | IntArrayRef output_size, 10 | bool align_corners, 11 | std::optional<double> scales, 12 | const Tensor& output); 13 | 14 | TORCH_XPU_API void upsample_linear1d_backward_kernel( 15 | const Tensor& grad_output_, 16 | IntArrayRef output_size, 17 | IntArrayRef input_size, 18 | bool align_corners, 19 | std::optional<double> scales, 20 | const Tensor& grad_input); 21 | 22 | } // namespace at::native::xpu 23 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UpSampleNearest1dKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace at::native::xpu { 7 | 8 | TORCH_XPU_API void upsample_nearest1d_kernel( 9 | const Tensor& output, 10 | const Tensor& input_, 11 | IntArrayRef output_size, 12 | std::optional<double> scales, 13 | bool is_exact); 14 | 15 | TORCH_XPU_API void upsample_nearest1d_backward_kernel( 16 | const Tensor& grad_input, 17 | const Tensor& grad_output_, 18 | IntArrayRef output_size, 19 | IntArrayRef input_size, 20 | std::optional<double> scales, 21 | bool is_exact); 22 | 23 | } // namespace at::native::xpu 24 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UpSampleNearest2dKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace at::native::xpu { 7 | 8 | TORCH_XPU_API void upsample_nearest2d_kernel( 9 | const Tensor& output, 10 | const Tensor& input_, 11 | IntArrayRef output_size, 12 | std::optional<double> scales_h, 13 | std::optional<double> scales_w, 14 | bool is_exact); 15 | 16 | TORCH_XPU_API void upsample_nearest2d_backward_kernel( 17 | const Tensor& grad_input, 18 | const Tensor& grad_output_, 19 | IntArrayRef output_size, 20 | IntArrayRef input_size, 21 | std::optional<double> scales_h, 22 | std::optional<double> scales_w, 23 | bool is_exact); 24 | 25 | } // namespace at::native::xpu 26 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UpSampleNearest3dKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void upsample_nearest3d_kernel( 8 | const Tensor& output, 9 | const Tensor& input_, 10 | IntArrayRef output_size, 11 | std::optional<double> scales_d, 12 |
std::optional<double> scales_h, 13 | std::optional<double> scales_w, 14 | bool is_exact); 15 | 16 | TORCH_XPU_API void upsample_nearest3d_backward_kernel( 17 | const Tensor& grad_input, 18 | const Tensor& grad_output_, 19 | IntArrayRef output_size, 20 | IntArrayRef input_size, 21 | std::optional<double> scales_d, 22 | std::optional<double> scales_h, 23 | std::optional<double> scales_w, 24 | bool is_exact); 25 | 26 | } // namespace at::native::xpu 27 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/UpSampleTrilinear3dKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void upsample_trilinear3d_out_kernel( 8 | const Tensor& output, 9 | const Tensor& input, 10 | IntArrayRef output_size, 11 | bool align_corners, 12 | std::optional<double> scales_d, 13 | std::optional<double> scales_h, 14 | std::optional<double> scales_w); 15 | 16 | TORCH_XPU_API void upsample_trilinear3d_backward_out_kernel( 17 | const Tensor& grad_input_, 18 | const Tensor& grad_output_, 19 | IntArrayRef output_size, 20 | IntArrayRef input_size, 21 | bool align_corners, 22 | std::optional<double> scales_d, 23 | std::optional<double> scales_h, 24 | std::optional<double> scales_w); 25 | 26 | } // namespace at::native::xpu 27 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/WeightInt4PackKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API void weight_to_int4pack_kernel( 7 | const Tensor& weight_packed, 8 | const Tensor& weight, 9 | int N, 10 | int K); 11 | 12 | } // namespace at::native::xpu 13 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/WeightNormKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace at::native::xpu { 5 | 6 | TORCH_XPU_API std::tuple<Tensor, Tensor> weight_norm_kernel( 7 | const Tensor& v, 8 | const Tensor& g, 9 | int64_t dim); 10 | 11 | TORCH_XPU_API std::tuple<Tensor, Tensor> weight_norm_backward_kernel( 12 | const Tensor& grad_w, 13 | const Tensor& saved_v, 14 | const Tensor& saved_g, 15 | const Tensor& saved_norms, 16 | int64_t dim); 17 | 18 | } // namespace at::native::xpu 19 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ZetaKernel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | namespace at::native::xpu { 11 | 12 | template <typename scalar_t> 13 | struct ZetaFunctor { 14 | scalar_t operator()(scalar_t x, scalar_t q) const { 15 | return zeta(x, q); 16 | } 17 | }; 18 | 19 | constexpr char zeta_name[] = "zeta"; 20 | void zeta_kernel(TensorIteratorBase& iter) { 21 | AT_DISPATCH_FLOATING_TYPES(iter.common_dtype(), "zeta_xpu", [&]() { 22 | gpu_kernel_with_scalars(iter, ZetaFunctor<scalar_t>()); 23 | }); 24 | } 25 | 26 | } // namespace at::native::xpu 27 | -------------------------------------------------------------------------------- /src/ATen/native/xpu/sycl/ZetaKernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::native::xpu { 6 | 7 | TORCH_XPU_API void zeta_kernel(TensorIteratorBase& iter); 8 | 9 | } // namespace at::native::xpu
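zeta_kernel implements the two-argument (Hurwitz) zeta function, reachable from Python as torch.special.zeta. A short sketch, assuming an available XPU device (the sample values are chosen only because their results are well known):

import torch

x = torch.tensor([2.0, 3.0, 4.0], device="xpu")
q = torch.ones(3, device="xpu")

# Tensor-tensor case: ZetaFunctor is applied elementwise.
out = torch.special.zeta(x, q)
print(out.cpu())  # ~[1.6449, 1.2021, 1.0823], i.e. zeta(2), zeta(3), zeta(4)

# Tensor-scalar case: gpu_kernel_with_scalars presumably handles the scalar
# second argument without materializing a full tensor for it.
out2 = torch.special.zeta(x, 1.0)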
-------------------------------------------------------------------------------- /src/comm/Macros.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef _WIN32 4 | #define RESTRICT __restrict 5 | #else 6 | #define RESTRICT __restrict__ 7 | #endif 8 | -------------------------------------------------------------------------------- /src/comm/Runtime.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace at::xpu { 6 | 7 | static inline at::DeviceIndex getDeviceIndexOfCurrentQueue() { 8 | return c10::xpu::getCurrentXPUStream().device_index(); 9 | } 10 | 11 | static inline sycl::queue& getCurrentSYCLQueue() { 12 | return c10::xpu::getCurrentXPUStream().queue(); 13 | } 14 | 15 | } // namespace at::xpu 16 | -------------------------------------------------------------------------------- /src/comm/SYCLContext.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | using namespace at::xpu; 9 | using namespace xpu::sycl; 10 | -------------------------------------------------------------------------------- /src/comm/xpu_aten.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include -------------------------------------------------------------------------------- /src/xccl/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # XCCL sources 2 | 3 | file(GLOB xccl_h "*.hpp") 4 | file(GLOB xccl_cpp "*.cpp") 5 | 6 | list(APPEND ATen_XPU_XCCL_SRCS ${xccl_cpp}) 7 | 8 | set(ATen_XPU_XCCL_SRCS ${ATen_XPU_XCCL_SRCS} PARENT_SCOPE) 9 | 10 | # Why copy the header file to the build directory? 11 | # We want to register the XCCL backend with PyTorch c10d in torch/csrc/distributed/c10d/init.cpp#L27-L29. 12 | # To align with other backends, we need to copy the header file into the build tree's torch/csrc/distributed/c10d directory. 13 | # A further solution would be to add an include search path for torch/csrc/distributed/c10d/init.cpp#L27-L29.
14 | foreach(HEADER ${xccl_h}) 15 | file(COPY ${HEADER} DESTINATION "${CMAKE_BINARY_DIR}/torch/csrc/distributed/c10d") 16 | endforeach() 17 | -------------------------------------------------------------------------------- /test/microbench/distribution.cauchy.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.profiler import profile, ProfilerActivity 3 | 4 | device = "xpu" 5 | shape_list = [(8192, 8192)] 6 | backward = False 7 | 8 | for shape in shape_list: 9 | for dtype in [torch.bfloat16, torch.float16, torch.float32]: 10 | input = torch.randn(shape, dtype=dtype, device=device) 11 | # warm up 12 | input.cauchy_() 13 | 14 | # go 15 | print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) 16 | with profile( 17 | activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True 18 | ) as prof: 19 | for i in range(20): 20 | input.cauchy_() 21 | print(prof.key_averages().table(sort_by="xpu_time_total")) 22 | -------------------------------------------------------------------------------- /test/microbench/distribution.exponential.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.profiler import profile, ProfilerActivity 3 | 4 | device = "xpu" 5 | shape_list = [(8192, 8192)] 6 | backward = False 7 | 8 | for shape in shape_list: 9 | for dtype in [torch.bfloat16, torch.float16, torch.float32]: 10 | input = torch.randn(shape, dtype=dtype, device=device) 11 | # warm up 12 | input.exponential_(0.5) 13 | 14 | # go 15 | print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) 16 | with profile( 17 | activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True 18 | ) as prof: 19 | for i in range(20): 20 | input.exponential_(0.5) 21 | print(prof.key_averages().table(sort_by="xpu_time_total")) 22 | -------------------------------------------------------------------------------- /test/microbench/distribution.geometric.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.profiler import profile, ProfilerActivity 3 | 4 | shape_list = [(8192, 8192)] 5 | backward = False 6 | 7 | if __name__ == "__main__": 8 | for shape in shape_list: 9 | for dtype in [torch.bfloat16, torch.float16, torch.float32]: 10 | input = torch.randn(shape, dtype=dtype, device=torch.device("xpu")) 11 | 12 | # warm up 13 | input.geometric_(0.5) 14 | 15 | # go 16 | print( 17 | "shape:", 18 | (shape), 19 | "; datatype:", 20 | dtype, 21 | "; P:", 22 | 0.5, 23 | "; backward:", 24 | backward, 25 | ) 26 | with profile( 27 | activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], 28 | record_shapes=True, 29 | ) as prof: 30 | for i in range(20): 31 | input.geometric_(0.5) 32 | print(prof.key_averages().table(sort_by="xpu_time_total")) 33 | -------------------------------------------------------------------------------- /test/microbench/distribution.log_normal.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.profiler import profile, ProfilerActivity 3 | 4 | device = "xpu" 5 | shape_list = [(8192, 8192)] 6 | backward = False 7 | 8 | for shape in shape_list: 9 | for dtype in [torch.bfloat16, torch.float16, torch.float32]: 10 | input = torch.randn(shape, dtype=dtype, device=device) 11 | # warm up 12 | input.log_normal_(128, 128) 13 | 14 | # go 15 | print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) 16 |
with profile( 17 | activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True 18 | ) as prof: 19 | for i in range(20): 20 | input.log_normal_(128, 128) 21 | print(prof.key_averages().table(sort_by="xpu_time_total")) 22 | -------------------------------------------------------------------------------- /test/microbench/distribution.normal.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.profiler import profile, ProfilerActivity 3 | 4 | shape_list = [(8192, 8192)] 5 | backward = False 6 | 7 | if __name__ == "__main__": 8 | for shape in shape_list: 9 | for dtype in [torch.bfloat16, torch.float16, torch.float32]: 10 | input = torch.randn(shape, dtype=dtype, device=torch.device("xpu")) 11 | 12 | # warm up 13 | input.normal_() 14 | 15 | # go 16 | print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) 17 | with profile( 18 | activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], 19 | record_shapes=True, 20 | ) as prof: 21 | for i in range(20): 22 | input.normal_() 23 | print(prof.key_averages().table(sort_by="xpu_time_total")) 24 | -------------------------------------------------------------------------------- /test/microbench/distribution.random.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.profiler import profile, ProfilerActivity 3 | 4 | shape_list = [(8192, 8192)] 5 | backward = False 6 | 7 | if __name__ == "__main__": 8 | for shape in shape_list: 9 | for dtype in [torch.bfloat16, torch.float16, torch.float32]: 10 | input = torch.randn(shape, dtype=dtype, device=torch.device("xpu")) 11 | 12 | # warm up 13 | input.random_(-(2**8), 2**8) 14 | 15 | # go 16 | print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) 17 | with profile( 18 | activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], 19 | record_shapes=True, 20 | ) as prof: 21 | for i in range(20): 22 | input.random_(-(2**8), 2**8) 23 | print(prof.key_averages().table(sort_by="xpu_time_total")) 24 | -------------------------------------------------------------------------------- /test/microbench/distribution.uniform.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.profiler import profile, ProfilerActivity 3 | 4 | shape_list = [(8192, 8192)] 5 | backward = False 6 | 7 | if __name__ == "__main__": 8 | for shape in shape_list: 9 | for dtype in [torch.bfloat16, torch.float16, torch.float32]: 10 | input = torch.randn(shape, dtype=dtype, device=torch.device("xpu")) 11 | 12 | # warm up 13 | input.uniform_() 14 | 15 | # go 16 | print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) 17 | with profile( 18 | activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], 19 | record_shapes=True, 20 | ) as prof: 21 | for i in range(20): 22 | input.uniform_() 23 | print(prof.key_averages().table(sort_by="xpu_time_total")) 24 | -------------------------------------------------------------------------------- /test/microbench/scan.unique.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.profiler import profile, ProfilerActivity 3 | 4 | device = "xpu" 5 | backward = False 6 | 7 | shape_list = [(2049, 2049)] 8 | 9 | for shape in shape_list: 10 | for dtype in [torch.bfloat16, torch.float16, torch.float32]: 11 | input = torch.randint(100, shape, dtype=dtype, device=device) 12 | 13 | # warm up 14 |
torch.unique(input, sorted=True, return_inverse=True, return_counts=True) 15 | 16 | # go 17 | print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) 18 | with profile( 19 | activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True 20 | ) as prof: 21 | for i in range(20): 22 | output = torch.unique( 23 | input, sorted=True, return_inverse=True, return_counts=True 24 | ) 25 | print(prof.key_averages().table(sort_by="xpu_time_total")) 26 | -------------------------------------------------------------------------------- /test/microbench/sort.randperm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.profiler import profile, ProfilerActivity 3 | 4 | device = "xpu" 5 | backward = False 6 | 7 | shape_list = [(8193)] 8 | 9 | for shape in shape_list: 10 | for dtype in [torch.float32]: 11 | # warm up 12 | torch.randperm(shape, dtype=dtype, device=device) 13 | 14 | # go 15 | print("shape:", (shape), "; datatype:", dtype, "; backward:", backward) 16 | with profile( 17 | activities=[ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True 18 | ) as prof: 19 | for i in range(20): 20 | torch.randperm(shape, dtype=dtype, device=device) 21 | print(prof.key_averages().table(sort_by="xpu_time_total")) 22 | -------------------------------------------------------------------------------- /test/profiling/correlation_id_mixed.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | input1 = torch.randn(3, 3, device="xpu") 4 | input2 = torch.randn(3, 3, device="xpu") 5 | 6 | with torch.profiler.profile( 7 | activities=[ 8 | torch.profiler.ProfilerActivity.CPU, 9 | torch.profiler.ProfilerActivity.XPU, 10 | ] 11 | ) as prof: 12 | output1 = input1 + 1.0 13 | output2 = input2 + 2.0 14 | output = output1 + output2 15 | print(prof.key_averages().table(sort_by="xpu_time_total")) 16 | -------------------------------------------------------------------------------- /test/profiling/profile_partial_runtime_ops.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def compute(input1, input2): 5 | input1 = input1.to(device="xpu") 6 | return input1 + 1.0 7 | 8 | 9 | input1 = torch.randn(3, 3, device="cpu") 10 | input2 = torch.randn(3, 3, device="cpu") 11 | 12 | # warm up 13 | output = compute(input1, input2) 14 | 15 | for id in range(1): 16 | with torch.profiler.profile( 17 | activities=[ 18 | torch.profiler.ProfilerActivity.CPU, 19 | torch.profiler.ProfilerActivity.XPU, 20 | ] 21 | ) as p: 22 | output = compute(input1, input2) 23 | print(p.key_averages().table(sort_by="self_xpu_time_total", row_limit=-1)) 24 | -------------------------------------------------------------------------------- /test/profiling/time_precision_in_profile.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def compute(input1, input2): 5 | input1 = input1.to(device="xpu") 6 | return input1 + 1.0 7 | 8 | 9 | input1 = torch.randn(3, 3, device="cpu") 10 | input2 = torch.randn(3, 3, device="cpu") 11 | 12 | # warm up 13 | output = compute(input1, input2) 14 | 15 | for id in range(1000): 16 | with torch.profiler.profile( 17 | activities=[ 18 | torch.profiler.ProfilerActivity.CPU, 19 | torch.profiler.ProfilerActivity.XPU, 20 | ] 21 | ) as p: 22 | output = compute(input1, input2) 23 | print(p.key_averages().table(sort_by="self_xpu_time_total", row_limit=-1)) 24 | 
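All of the profiling scripts above print the aggregated key_averages() table. When individual event timestamps matter, as in time_precision_in_profile.py, the same profile object can also be exported as a Chrome trace and inspected in chrome://tracing or Perfetto. A minimal sketch of that variant, following the same warm-up convention as the scripts above (the output filename is an arbitrary choice):

import torch

input1 = torch.randn(3, 3, device="xpu")
_ = input1 + 1.0  # warm up

with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.XPU,
    ]
) as p:
    output = input1 + 1.0

# Writes per-event begin/end timestamps to a JSON trace file.
p.export_chrome_trace("xpu_trace.json")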
-------------------------------------------------------------------------------- /test/profiling/triton_xpu_ops_time.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | device = "xpu" 4 | 5 | 6 | @torch.compile 7 | def test(x): 8 | x = x + 1.0 9 | x = x * x 10 | x = x + 2.0 11 | return x 12 | 13 | 14 | input = torch.randn(128, 128, device=device) 15 | 16 | # warm 17 | output = test(input) 18 | print("[info] finish warm up") 19 | 20 | with torch.profiler.profile( 21 | activities=[ 22 | torch.profiler.ProfilerActivity.CPU, 23 | torch.profiler.ProfilerActivity.XPU, 24 | ] 25 | ) as p: 26 | print("[info] start running") 27 | output = test(input) 28 | print(p.key_averages().table(sort_by="self_xpu_time_total", row_limit=-1)) 29 | -------------------------------------------------------------------------------- /test/regressions/optests_failures_dict.json: -------------------------------------------------------------------------------- 1 | { 2 | "_description": "This is a dict containing failures for tests autogenerated by generate_opcheck_tests. For more details, please see https://docs.google.com/document/d/1Pj5HRZvdOq3xpFpbEjUZp2hBovhy7Wnxw14m6lF2154/edit", 3 | "_version": 1, 4 | "data": {} 5 | } 6 | -------------------------------------------------------------------------------- /test/regressions/test_copy.py: -------------------------------------------------------------------------------- 1 | # Owner(s): ["module: intel"] 2 | import torch 3 | from torch.testing._internal.common_utils import TestCase 4 | 5 | cpu_device = torch.device("cpu") 6 | xpu_device = torch.device("xpu") 7 | 8 | 9 | class TestSimpleCopy(TestCase): 10 | def test_copy_and_clone(self, dtype=torch.float): 11 | a_cpu = torch.randn(16, 64, 28, 28) 12 | b_cpu = torch.randn(16, 64, 28, 28) 13 | a_xpu = a_cpu.to(xpu_device) 14 | b_xpu = b_cpu.to(xpu_device) 15 | # naive 16 | b_cpu.copy_(a_cpu) 17 | b_xpu.copy_(a_xpu) 18 | self.assertEqual(b_cpu, b_xpu.to(cpu_device)) 19 | # clone + permutation 20 | b_cpu = a_cpu.clone(memory_format=torch.channels_last) 21 | b_xpu = a_xpu.clone(memory_format=torch.channels_last) 22 | self.assertEqual(b_cpu, b_xpu.to(cpu_device)) 23 | -------------------------------------------------------------------------------- /test/regressions/test_div_mode.py: -------------------------------------------------------------------------------- 1 | # Owner(s): ["module: intel"] 2 | import torch 3 | from torch.testing._internal.common_dtype import get_all_dtypes 4 | from torch.testing._internal.common_utils import TestCase 5 | 6 | 7 | class TestDivMode(TestCase): 8 | def test_div_true_dtype(self): 9 | claimed_dtypes = get_all_dtypes() 10 | for dtype in claimed_dtypes: 11 | a_cpu = torch.randint(1, 100, [8, 8]).to(dtype) 12 | a_xpu = a_cpu.to("xpu") 13 | ref = torch.ops.aten.div(a_cpu * 2, a_cpu, rounding_mode=None) 14 | res = torch.ops.aten.div(a_xpu * 2, a_xpu, rounding_mode=None) 15 | self.assertEqual(ref, res.to("cpu")) 16 | -------------------------------------------------------------------------------- /test/regressions/test_layer_norm.py: -------------------------------------------------------------------------------- 1 | # Owner(s): ["module: intel"] 2 | import torch 3 | import torch.nn as nn 4 | from torch.testing._internal.common_utils import TestCase 5 | 6 | cpu_device = torch.device("cpu") 7 | xpu_device = torch.device("xpu") 8 | 9 | 10 | class TestLayerNorm(TestCase): 11 | def test_layer_norm_no_nan(self, dtype=torch.float): 12 | dim = [5] 13 | 
x_cpu = torch.tensor([[1e15, 1e15 + 1, 1e15 + 2, 1e15 + 3, 1e15 + 4]]) 14 | layernorm_cpu = nn.LayerNorm(dim) 15 | y_cpu = layernorm_cpu(x_cpu) 16 | 17 | x_xpu = x_cpu.to(xpu_device) 18 | layernorm_xpu = nn.LayerNorm(dim).to(xpu_device) 19 | y_xpu = layernorm_xpu(x_xpu) 20 | self.assertEqual(y_cpu, y_xpu.to(cpu_device)) 21 | -------------------------------------------------------------------------------- /test/regressions/test_operation_on_device_1.py: -------------------------------------------------------------------------------- 1 | # Owner(s): ["module: intel"] 2 | import torch 3 | from torch.testing._internal.common_utils import TestCase 4 | 5 | 6 | class TestOperationOnDevice1(TestCase): 7 | def test_sum_on_device1(self, dtype=torch.float): 8 | if torch.xpu.device_count() >= 2: 9 | a = torch.randn(2, 3, device=torch.device("xpu:1")) 10 | torch.xpu.set_device(1) 11 | res = a.sum() 12 | ref = a.cpu().sum() 13 | self.assertEqual(ref, res) 14 | -------------------------------------------------------------------------------- /test/sycl/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "simple_kernel.hpp" 3 | 4 | void test_simple_kernel() { 5 | int numel = 1024; 6 | float a[1024]; 7 | 8 | // a simple sycl kernel 9 | itoa(a, numel); 10 | 11 | bool success = true; 12 | for (int i = 0; i < numel; i++) { 13 | if (a[i] != i) { 14 | success = false; 15 | break; 16 | } 17 | } 18 | 19 | if (success) { 20 | std::cout << "Pass" << std::endl; 21 | } else { 22 | std::cout << "Fail" << std::endl; 23 | } 24 | } 25 | 26 | int main(int argc, char* argv[]) { 27 | test_simple_kernel(); 28 | return 0; 29 | } 30 | -------------------------------------------------------------------------------- /test/sycl/simple_kernel.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // Create an idx array on SYCL GPU device 4 | // res - host buffer for result 5 | // numel - length of the idx array 6 | void itoa(float* res, int numel); 7 | -------------------------------------------------------------------------------- /test/xpu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/torch-xpu-ops/5b4d7444484576f721d2295761cf8fafa924ef36/test/xpu/__init__.py -------------------------------------------------------------------------------- /test/xpu/distributed/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/torch-xpu-ops/5b4d7444484576f721d2295761cf8fafa924ef36/test/xpu/distributed/__init__.py -------------------------------------------------------------------------------- /test/xpu/extended/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/torch-xpu-ops/5b4d7444484576f721d2295761cf8fafa924ef36/test/xpu/extended/__init__.py -------------------------------------------------------------------------------- /test/xpu/extended/run_test_with_skip.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from skip_list_common import skip_dict 5 | from skip_list_win import skip_dict as skip_dict_win 6 | 7 | IS_WINDOWS = sys.platform == "win32" 8 | 9 | skip_list = skip_dict["test_ops_xpu.py"] 10 | if IS_WINDOWS: 11 | skip_list += skip_dict_win["test_ops_xpu.py"] 12 | 13 | skip_options = ' -k "not ' + 
skip_list[0] 14 | for skip_case in skip_list[1:]: 15 | skip_option = " and not " + skip_case 16 | skip_options += skip_option 17 | skip_options += '"' 18 | 19 | os.environ["PYTORCH_TEST_WITH_SLOW"] = "1" 20 | test_command = "pytest --timeout 600 -v --junit-xml=./op_extended.xml test_ops_xpu.py" 21 | test_command += skip_options 22 | res = os.system(test_command) 23 | sys.exit(res) 24 | -------------------------------------------------------------------------------- /test/xpu/extended/run_test_with_skip_arc.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from skip_list_arc import skip_dict as skip_dict_specifical 5 | from skip_list_common import skip_dict 6 | from skip_list_win import skip_dict as skip_dict_win 7 | from skip_list_win_arc import skip_dict as skip_dict_win_arc 8 | 9 | IS_WINDOWS = sys.platform == "win32" 10 | 11 | skip_list = skip_dict["test_ops_xpu.py"] + skip_dict_specifical["test_ops_xpu.py"] 12 | if IS_WINDOWS: 13 | skip_list += skip_dict_win["test_ops_xpu.py"] + skip_dict_win_arc["test_ops_xpu.py"] 14 | 15 | skip_options = ' -k "not ' + skip_list[0] 16 | for skip_case in skip_list[1:]: 17 | skip_option = " and not " + skip_case 18 | skip_options += skip_option 19 | skip_options += '"' 20 | 21 | os.environ["PYTORCH_TEST_WITH_SLOW"] = "1" 22 | test_command = "pytest -v test_ops_xpu.py" 23 | test_command += skip_options 24 | res = os.system(test_command) 25 | sys.exit(res) 26 | -------------------------------------------------------------------------------- /test/xpu/extended/run_test_with_skip_bmg.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import pytest 5 | from skip_list_common import skip_dict 6 | from skip_list_win import skip_dict as skip_dict_win 7 | from skip_list_win_bmg import skip_dict as skip_dict_win_bmg 8 | 9 | IS_WINDOWS = sys.platform == "win32" 10 | 11 | skip_list = skip_dict["test_ops_xpu.py"] 12 | if IS_WINDOWS: 13 | skip_list += skip_dict_win["test_ops_xpu.py"] + skip_dict_win_bmg["test_ops_xpu.py"] 14 | 15 | skip_options = "not " + skip_list[0] 16 | for skip_case in skip_list[1:]: 17 | skip_option = " and not " + skip_case 18 | skip_options += skip_option 19 | 20 | os.environ["PYTORCH_TEST_WITH_SLOW"] = "1" 21 | test_command = ["-k", skip_options, "test_ops_xpu.py", "-v"] 22 | res = pytest.main(test_command) 23 | sys.exit(res) 24 | -------------------------------------------------------------------------------- /test/xpu/extended/run_test_with_skip_lnl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import pytest 5 | from skip_list_common import skip_dict 6 | from skip_list_win import skip_dict as skip_dict_win 7 | from skip_list_win_lnl import skip_dict as skip_dict_win_lnl 8 | 9 | IS_WINDOWS = sys.platform == "win32" 10 | 11 | skip_list = skip_dict["test_ops_xpu.py"] 12 | if IS_WINDOWS: 13 | skip_list += skip_dict_win["test_ops_xpu.py"] + skip_dict_win_lnl["test_ops_xpu.py"] 14 | 15 | skip_options = "not " + skip_list[0] 16 | for skip_case in skip_list[1:]: 17 | skip_option = " and not " + skip_case 18 | skip_options += skip_option 19 | 20 | os.environ["PYTORCH_TEST_WITH_SLOW"] = "1" 21 | test_command = ["-k", skip_options, "test_ops_xpu.py", "-v"] 22 | res = pytest.main(test_command) 23 | sys.exit(res) 24 | -------------------------------------------------------------------------------- 
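Each run_test_with_skip_* variant builds a single pytest -k expression by chaining the skip cases with "and not". For a hypothetical two-entry skip list (the test names below are placeholders, not real cases from the skip files), the construction and the resulting invocation look like this:

skip_list = ["test_foo_xpu_float32", "test_bar_xpu_bfloat16"]

skip_options = "not " + skip_list[0]
for skip_case in skip_list[1:]:
    skip_options += " and not " + skip_case

print(skip_options)
# not test_foo_xpu_float32 and not test_bar_xpu_bfloat16
# equivalent to:
#   pytest -k "not test_foo_xpu_float32 and not test_bar_xpu_bfloat16" test_ops_xpu.py -v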
/test/xpu/extended/run_test_with_skip_mtl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import pytest 5 | from skip_list_common import skip_dict 6 | from skip_list_win import skip_dict as skip_dict_win 7 | from skip_list_win_mtl import skip_dict as skip_dict_win_mtl 8 | 9 | IS_WINDOWS = sys.platform == "win32" 10 | 11 | skip_list = skip_dict["test_ops_xpu.py"] 12 | if IS_WINDOWS: 13 | skip_list += skip_dict_win["test_ops_xpu.py"] + skip_dict_win_mtl["test_ops_xpu.py"] 14 | 15 | skip_options = "not " + skip_list[0] 16 | for skip_case in skip_list[1:]: 17 | skip_option = " and not " + skip_case 18 | skip_options += skip_option 19 | 20 | os.environ["PYTORCH_TEST_WITH_SLOW"] = "1" 21 | test_command = ["-k", skip_options, "test_ops_xpu.py", "-v"] 22 | res = pytest.main(test_command) 23 | sys.exit(res) 24 | -------------------------------------------------------------------------------- /test/xpu/extended/skip_list_win.py: -------------------------------------------------------------------------------- 1 | skip_dict = { 2 | "test_ops_xpu.py": ( 3 | "test_compare_cpu_pow_xpu_bfloat16", # https://github.com/intel/torch-xpu-ops/pull/764 4 | "test_compare_cpu_argmin_xpu_int", 5 | ), 6 | } 7 | -------------------------------------------------------------------------------- /test/xpu/extended/skip_list_win_arc.py: -------------------------------------------------------------------------------- 1 | skip_dict = { 2 | # SYCL Compiler on Windows removed the following operations when '-cl-poison-unsupported-fp64-kernels' is on 3 | # Hence, skip the following windows specific errors 4 | "test_ops_xpu.py": ( 5 | "test_compare_cpu_sqrt_xpu_complex64", 6 | "test_backward_nn_functional_adaptive_avg_pool2d_xpu_float32", 7 | ), 8 | } 9 | -------------------------------------------------------------------------------- /test/xpu/extended/skip_list_win_bmg.py: -------------------------------------------------------------------------------- 1 | skip_dict = { 2 | "test_ops_xpu.py": ( 3 | # https://github.com/intel/torch-xpu-ops/issues/1173 4 | # Fatal Python error: Illegal instruction 5 | "test_compare_cpu_grid_sampler_2d_xpu_float64", 6 | "test_compare_cpu_cosh_xpu_complex64", 7 | "test_compare_cpu_nn_functional_softshrink_xpu_bfloat16", 8 | "test_compare_cpu_nn_functional_softshrink_xpu_float16", 9 | "test_compare_cpu_nn_functional_softshrink_xpu_float32", 10 | "test_compare_cpu_nn_functional_softshrink_xpu_float64", 11 | "test_compare_cpu_square_xpu_complex128", 12 | ), 13 | } 14 | -------------------------------------------------------------------------------- /test/xpu/extended/skip_list_win_lnl.py: -------------------------------------------------------------------------------- 1 | skip_dict = { 2 | "test_ops_xpu.py": ( 3 | # https://github.com/intel/torch-xpu-ops/issues/1173 4 | # Fatal Python error: Illegal instruction 5 | "test_compare_cpu_grid_sampler_2d_xpu_float64", 6 | "test_compare_cpu_cosh_xpu_complex64", 7 | "test_compare_cpu_nn_functional_softshrink_xpu_bfloat16", 8 | "test_compare_cpu_nn_functional_softshrink_xpu_float16", 9 | "test_compare_cpu_nn_functional_softshrink_xpu_float32", 10 | "test_compare_cpu_nn_functional_softshrink_xpu_float64", 11 | "test_compare_cpu_square_xpu_complex128", 12 | ), 13 | } 14 | -------------------------------------------------------------------------------- /test/xpu/nn/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/intel/torch-xpu-ops/5b4d7444484576f721d2295761cf8fafa924ef36/test/xpu/nn/__init__.py -------------------------------------------------------------------------------- /test/xpu/nn/test_embedding_xpu.py: -------------------------------------------------------------------------------- 1 | # Owner(s): ["module: intel"] 2 | 3 | from torch.testing._internal.common_device_type import instantiate_device_type_tests 4 | from torch.testing._internal.common_utils import run_tests 5 | 6 | try: 7 | from .xpu_test_utils import XPUPatchForImport 8 | except Exception as e: 9 | from ..xpu_test_utils import XPUPatchForImport 10 | 11 | with XPUPatchForImport(False): 12 | from test_embedding import TestEmbeddingNNDeviceType 13 | 14 | 15 | instantiate_device_type_tests( 16 | TestEmbeddingNNDeviceType, globals(), only_for="xpu", allow_xpu=True 17 | ) 18 | 19 | 20 | if __name__ == "__main__": 21 | run_tests() 22 | -------------------------------------------------------------------------------- /test/xpu/nn/test_init_xpu.py: -------------------------------------------------------------------------------- 1 | # Owner(s): ["module: intel"] 2 | 3 | from torch.testing._internal.common_utils import run_tests 4 | 5 | try: 6 | from .xpu_test_utils import XPUPatchForImport 7 | except Exception as e: 8 | from ..xpu_test_utils import XPUPatchForImport 9 | 10 | with XPUPatchForImport(False): 11 | from test_init import TestNNInit # noqa: F401` 12 | 13 | 14 | if __name__ == "__main__": 15 | run_tests() 16 | -------------------------------------------------------------------------------- /test/xpu/nn/test_lazy_modules_xpu.py: -------------------------------------------------------------------------------- 1 | # Owner(s): ["module: intel"] 2 | 3 | from torch.nn.parameter import UninitializedParameter 4 | from torch.testing._internal.common_utils import run_tests, suppress_warnings 5 | 6 | try: 7 | from .xpu_test_utils import XPUPatchForImport 8 | except Exception as e: 9 | from ..xpu_test_utils import XPUPatchForImport 10 | 11 | with XPUPatchForImport(False): 12 | from test_lazy_modules import LazyModule, TestLazyModules 13 | 14 | 15 | @suppress_warnings 16 | def materialize_device(self): 17 | module = LazyModule() 18 | module.register_parameter("test_param", UninitializedParameter()) 19 | module.test_param.materialize(10) 20 | self.assertTrue(module.test_param.device.type == "cpu") 21 | device = "xpu" 22 | module = LazyModule() 23 | module.register_parameter("test_param", UninitializedParameter()) 24 | module.to(device) 25 | module.test_param.materialize(10) 26 | self.assertTrue(module.test_param.device.type == device) 27 | 28 | 29 | TestLazyModules.test_materialize_device = materialize_device 30 | 31 | if __name__ == "__main__": 32 | run_tests() 33 | -------------------------------------------------------------------------------- /test/xpu/nn/test_load_state_dict_xpu.py: -------------------------------------------------------------------------------- 1 | # Owner(s): ["module: intel"] 2 | 3 | from torch.testing._internal.common_utils import ( 4 | instantiate_parametrized_tests, 5 | run_tests, 6 | TestCase, 7 | ) 8 | 9 | try: 10 | from .xpu_test_utils import XPUPatchForImport 11 | except Exception as e: 12 | from ..xpu_test_utils import XPUPatchForImport 13 | 14 | with XPUPatchForImport(False): 15 | from test_load_state_dict import TestLoadStateDict, TestLoadStateDictSwap 16 | 17 | 18 | instantiate_parametrized_tests(TestLoadStateDict) 19 | instantiate_parametrized_tests(TestLoadStateDictSwap) 
20 | 21 | 22 | if __name__ == "__main__": 23 | TestCase._default_dtype_check_enabled = True 24 | run_tests() 25 | -------------------------------------------------------------------------------- /test/xpu/nn/test_module_hooks_xpu.py: -------------------------------------------------------------------------------- 1 | # Owner(s): ["module: intel"] 2 | 3 | from torch.testing._internal.common_utils import ( 4 | instantiate_parametrized_tests, 5 | run_tests, 6 | TestCase, 7 | ) 8 | 9 | try: 10 | from .xpu_test_utils import XPUPatchForImport 11 | except Exception as e: 12 | from ..xpu_test_utils import XPUPatchForImport 13 | 14 | with XPUPatchForImport(False): 15 | from test_module_hooks import TestModuleHooks 16 | 17 | instantiate_parametrized_tests(TestModuleHooks) 18 | 19 | 20 | if __name__ == "__main__": 21 | TestCase._default_dtype_check_enabled = True 22 | run_tests() 23 | -------------------------------------------------------------------------------- /test/xpu/nn/test_parametrization_xpu.py: -------------------------------------------------------------------------------- 1 | # Owner(s): ["module: intel"] 2 | 3 | from torch.testing._internal.common_device_type import instantiate_device_type_tests 4 | from torch.testing._internal.common_utils import ( 5 | instantiate_parametrized_tests, 6 | run_tests, 7 | ) 8 | 9 | try: 10 | from .xpu_test_utils import XPUPatchForImport 11 | except Exception as e: 12 | from ..xpu_test_utils import XPUPatchForImport 13 | 14 | with XPUPatchForImport(False): 15 | from test_parametrization import TestNNParametrization, TestNNParametrizationDevice 16 | 17 | 18 | instantiate_device_type_tests( 19 | TestNNParametrizationDevice, globals(), only_for="xpu", allow_xpu=True 20 | ) 21 | instantiate_parametrized_tests(TestNNParametrization) 22 | 23 | 24 | if __name__ == "__main__": 25 | run_tests() 26 | -------------------------------------------------------------------------------- /test/xpu/nn/test_pruning_xpu.py: -------------------------------------------------------------------------------- 1 | # Owner(s): ["module: intel"] 2 | 3 | from torch.testing._internal.common_utils import ( 4 | instantiate_parametrized_tests, 5 | run_tests, 6 | ) 7 | 8 | try: 9 | from .xpu_test_utils import XPUPatchForImport 10 | except Exception as e: 11 | from ..xpu_test_utils import XPUPatchForImport 12 | 13 | with XPUPatchForImport(False): 14 | from test_pruning import TestPruningNN 15 | 16 | 17 | instantiate_parametrized_tests(TestPruningNN) 18 | 19 | 20 | if __name__ == "__main__": 21 | run_tests() 22 | -------------------------------------------------------------------------------- /test/xpu/quantization/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/torch-xpu-ops/5b4d7444484576f721d2295761cf8fafa924ef36/test/xpu/quantization/core/__init__.py -------------------------------------------------------------------------------- /test/xpu/run_test_win_with_skip_mtl.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | from io import StringIO 4 | 5 | import pytest 6 | from skip_list_win_mtl import skip_dict 7 | 8 | IS_WINDOWS = sys.platform == "win32" 9 | 10 | skip_list = skip_dict["test_xpu.py"] 11 | 12 | skip_options = "not " + skip_list[0] 13 | for skip_case in skip_list[1:]: 14 | skip_option = " and not " + skip_case 15 | skip_options += skip_option 16 | 17 | original_stdout = sys.stdout 18 | sys.stdout = StringIO() 19 | 20 | 
test_command = ["-k", skip_options, "../../../../test/test_xpu.py", "-v"] 21 | res = pytest.main(test_command) 22 | 23 | output = sys.stdout.getvalue() 24 | sys.stdout = original_stdout 25 | 26 | cleaned_output = re.sub( 27 | r"\.\.(\/|\\)\.\.(\/|\\)\.\.(\/|\\)\.\.(\/|\\)test(\/|\\)", "", output 28 | ) 29 | print(cleaned_output, end="") 30 | sys.exit(res) 31 | -------------------------------------------------------------------------------- /test/xpu/run_test_with_skip.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from skip_list_common import skip_dict 5 | from xpu_test_utils import launch_test 6 | 7 | res = 0 8 | fail_test = [] 9 | 10 | for key in skip_dict: 11 | skip_list = skip_dict[key] 12 | fail = launch_test(key, skip_list) 13 | res += fail 14 | if fail: 15 | fail_test.append(key) 16 | if fail_test: 17 | print(",".join(fail_test) + " have failures") 18 | 19 | 20 | if os.name == "nt": 21 | sys.exit(res) 22 | else: 23 | exit_code = os.WEXITSTATUS(res) 24 | sys.exit(exit_code) 25 | -------------------------------------------------------------------------------- /test/xpu/run_test_with_skip_arc.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from skip_list_arc import skip_dict as skip_dict_specifical 5 | from skip_list_common import skip_dict 6 | from skip_list_win import skip_dict as skip_dict_win 7 | from skip_list_win_arc import skip_dict as skip_dict_win_arc 8 | from xpu_test_utils import launch_test 9 | 10 | res = 0 11 | IS_WINDOWS = sys.platform == "win32" 12 | 13 | for key in skip_dict: 14 | skip_list = skip_dict[key] 15 | if key in skip_dict_specifical: 16 | skip_list += skip_dict_specifical[key] 17 | if IS_WINDOWS and key in skip_dict_win: 18 | skip_list += skip_dict_win[key] 19 | if IS_WINDOWS and key in skip_dict_win_arc: 20 | skip_list += skip_dict_win_arc[key] 21 | res += launch_test(key, skip_list) 22 | 23 | if os.name == "nt": 24 | sys.exit(res) 25 | else: 26 | exit_code = os.WEXITSTATUS(res) 27 | sys.exit(exit_code) 28 | -------------------------------------------------------------------------------- /test/xpu/run_test_with_skip_bmg.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from skip_list_common import skip_dict 5 | from skip_list_win import skip_dict as skip_dict_win 6 | from skip_list_win_bmg import skip_dict as skip_dict_win_bmg 7 | from xpu_test_utils import launch_test 8 | 9 | res = 0 10 | IS_WINDOWS = sys.platform == "win32" 11 | 12 | for key in skip_dict: 13 | skip_list = skip_dict[key] 14 | if IS_WINDOWS and key in skip_dict_win: 15 | skip_list += skip_dict_win[key] 16 | if IS_WINDOWS and key in skip_dict_win_bmg: 17 | skip_list += skip_dict_win_bmg[key] 18 | res += launch_test(key, skip_list) 19 | 20 | if os.name == "nt": 21 | sys.exit(res) 22 | else: 23 | exit_code = os.WEXITSTATUS(res) 24 | sys.exit(exit_code) 25 | -------------------------------------------------------------------------------- /test/xpu/run_test_with_skip_lnl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from skip_list_common import skip_dict 5 | from skip_list_win import skip_dict as skip_dict_win 6 | from skip_list_win_lnl import skip_dict as skip_dict_win_lnl 7 | from xpu_test_utils import launch_test 8 | 9 | res = 0 10 | IS_WINDOWS = sys.platform == "win32" 11 | 12 | for key in skip_dict: 
13 | skip_list = skip_dict[key] 14 | if IS_WINDOWS and key in skip_dict_win: 15 | skip_list += skip_dict_win[key] 16 | if IS_WINDOWS and key in skip_dict_win_lnl: 17 | skip_list += skip_dict_win_lnl[key] 18 | res += launch_test(key, skip_list) 19 | 20 | if os.name == "nt": 21 | sys.exit(res) 22 | else: 23 | exit_code = os.WEXITSTATUS(res) 24 | sys.exit(exit_code) 25 | -------------------------------------------------------------------------------- /test/xpu/run_test_with_skip_mtl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from skip_list_common import skip_dict 5 | from skip_list_mtl import skip_dict as skip_dict_specifical 6 | from skip_list_win import skip_dict as skip_dict_win 7 | from xpu_test_utils import launch_test 8 | 9 | res = 0 10 | IS_WINDOWS = sys.platform == "win32" 11 | 12 | for key in skip_dict: 13 | skip_list = skip_dict[key] 14 | if key in skip_dict_specifical: 15 | skip_list += skip_dict_specifical[key] 16 | if IS_WINDOWS and key in skip_dict_win: 17 | skip_list += skip_dict_win[key] 18 | res += launch_test(key, skip_list) 19 | 20 | if os.name == "nt": 21 | sys.exit(res) 22 | else: 23 | exit_code = os.WEXITSTATUS(res) 24 | sys.exit(exit_code) 25 | -------------------------------------------------------------------------------- /test/xpu/skip_list_arc.py: -------------------------------------------------------------------------------- 1 | skip_dict = { 2 | "test_indexing_xpu.py": ("test_index_put_accumulate_large_tensor_xpu",), 3 | "test_nn_xpu.py": ("test_grid_sample_large_xpu",), 4 | "test_tensor_creation_ops_xpu.py": ( 5 | "test_float_to_int_conversion_finite_xpu_int64", 6 | ), 7 | } 8 | -------------------------------------------------------------------------------- /test/xpu/skip_list_mtl.py: -------------------------------------------------------------------------------- 1 | skip_dict = { 2 | "test_indexing_xpu.py": ("test_index_put_accumulate_large_tensor_xpu",), 3 | "test_nn_xpu.py": ("test_grid_sample_large_xpu",), 4 | "test_tensor_creation_ops_xpu.py": ( 5 | "test_float_to_int_conversion_finite_xpu_int64", 6 | ), 7 | } 8 | -------------------------------------------------------------------------------- /test/xpu/skip_list_win_mtl.py: -------------------------------------------------------------------------------- 1 | skip_dict = { 2 | # failed on MTL windows, skip first for Preci 3 | "test_xpu.py": ("test_mem_get_info_xpu",), 4 | } 5 | -------------------------------------------------------------------------------- /test/xpu/test_autograd_fallback_xpu.py: -------------------------------------------------------------------------------- 1 | # Owner(s): ["module: intel"] 2 | 3 | from torch.testing._internal.common_utils import ( 4 | instantiate_parametrized_tests, 5 | run_tests, 6 | ) 7 | 8 | try: 9 | from xpu_test_utils import XPUPatchForImport 10 | except Exception as e: 11 | from .xpu_test_utils import XPUPatchForImport 12 | 13 | with XPUPatchForImport(False): 14 | from test_autograd_fallback import TestAutogradFallback 15 | 16 | 17 | instantiate_parametrized_tests(TestAutogradFallback) 18 | 19 | 20 | if __name__ == "__main__": 21 | run_tests() 22 | -------------------------------------------------------------------------------- /test/xpu/test_comparison_utils_xpu.py: -------------------------------------------------------------------------------- 1 | # Owner(s): ["module: intel"] 2 | 3 | from torch.testing._internal.common_utils import run_tests 4 | 5 | try: 6 | from xpu_test_utils 
import XPUPatchForImport 7 | except Exception as e: 8 | from .xpu_test_utils import XPUPatchForImport 9 | 10 | with XPUPatchForImport(False): 11 | from test_comparison_utils import TestComparisonUtils # noqa: F401` 12 | 13 | 14 | if __name__ == "__main__": 15 | run_tests() 16 | -------------------------------------------------------------------------------- /test/xpu/test_complex_xpu.py: -------------------------------------------------------------------------------- 1 | # Owner(s): ["module: intel"] 2 | 3 | from torch.testing._internal.common_device_type import instantiate_device_type_tests 4 | from torch.testing._internal.common_utils import run_tests, TestCase 5 | 6 | try: 7 | from xpu_test_utils import XPUPatchForImport 8 | except Exception as e: 9 | from .xpu_test_utils import XPUPatchForImport 10 | 11 | with XPUPatchForImport(False): 12 | from test_complex import TestComplexTensor 13 | 14 | instantiate_device_type_tests( 15 | TestComplexTensor, globals(), only_for="xpu", allow_xpu=True 16 | ) 17 | 18 | 19 | if __name__ == "__main__": 20 | TestCase._default_dtype_check_enabled = True 21 | run_tests() 22 | -------------------------------------------------------------------------------- /test/xpu/test_content_store_xpu.py: -------------------------------------------------------------------------------- 1 | # Owner(s): ["module: intel"] 2 | 3 | from torch.testing._internal.common_device_type import instantiate_device_type_tests 4 | from torch.testing._internal.common_utils import run_tests 5 | 6 | try: 7 | from xpu_test_utils import XPUPatchForImport 8 | except Exception as e: 9 | from .xpu_test_utils import XPUPatchForImport 10 | 11 | with XPUPatchForImport(False): 12 | from test_content_store import TestContentStore 13 | 14 | 15 | instantiate_device_type_tests( 16 | TestContentStore, globals(), only_for="xpu", allow_xpu=True 17 | ) 18 | 19 | 20 | if __name__ == "__main__": 21 | run_tests() 22 | -------------------------------------------------------------------------------- /test/xpu/test_dynamic_shapes_xpu.py: -------------------------------------------------------------------------------- 1 | # Owner(s): ["module: intel"] 2 | 3 | from torch.testing._internal.common_utils import ( 4 | instantiate_parametrized_tests, 5 | run_tests, 6 | ) 7 | 8 | try: 9 | from xpu_test_utils import XPUPatchForImport 10 | except Exception as e: 11 | from .xpu_test_utils import XPUPatchForImport 12 | 13 | with XPUPatchForImport(False): 14 | from test_dynamic_shapes import TestSymNumberMagicMethods 15 | 16 | instantiate_parametrized_tests(TestSymNumberMagicMethods) 17 | 18 | 19 | if __name__ == "__main__": 20 | run_tests() 21 | -------------------------------------------------------------------------------- /test/xpu/test_masked_xpu.py: -------------------------------------------------------------------------------- 1 | # Owner(s): ["module: intel"] 2 | 3 | from torch.testing._internal.common_device_type import instantiate_device_type_tests 4 | from torch.testing._internal.common_utils import run_tests 5 | 6 | try: 7 | from xpu_test_utils import XPUPatchForImport 8 | except Exception as e: 9 | from .xpu_test_utils import XPUPatchForImport 10 | 11 | with XPUPatchForImport(False): 12 | from test_masked import TestMasked 13 | 14 | instantiate_device_type_tests(TestMasked, globals(), only_for="xpu", allow_xpu=True) 15 | if __name__ == "__main__": 16 | run_tests() 17 | -------------------------------------------------------------------------------- /test/xpu/test_maskedtensor_xpu.py: 
--------------------------------------------------------------------------------
# Owner(s): ["module: intel"]

from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_utils import (
    instantiate_parametrized_tests,
    run_tests,
)

try:
    from xpu_test_utils import XPUPatchForImport
except Exception:
    from .xpu_test_utils import XPUPatchForImport

with XPUPatchForImport(False):
    from test_maskedtensor import (
        TestBasics,
        TestBinary,
        TestOperators,
        TestReductions,
        TestUnary,
    )

instantiate_device_type_tests(TestBasics, globals(), only_for="xpu", allow_xpu=True)

instantiate_device_type_tests(
    TestOperators, globals(), only_for="xpu", allow_xpu=True
)
instantiate_parametrized_tests(TestUnary)
instantiate_parametrized_tests(TestBinary)
instantiate_parametrized_tests(TestReductions)

if __name__ == "__main__":
    run_tests()
--------------------------------------------------------------------------------
/test/xpu/test_namedtensor_xpu.py:
--------------------------------------------------------------------------------
# Owner(s): ["module: intel"]

from torch.testing._internal.common_utils import run_tests

try:
    from xpu_test_utils import XPUPatchForImport
except Exception:
    from .xpu_test_utils import XPUPatchForImport


# Upstream names these tests *_cuda; re-point their bodies at the XPU backend
# (a generic sketch of this attribute-swap idiom follows these listings).
def select_cuda(self):
    self._test_select("xpu")


def as_strided_cuda(self):
    self._test_as_strided("xpu")


with XPUPatchForImport(False):
    from test_namedtensor import TestNamedTensor

    TestNamedTensor.test_select_cuda = select_cuda
    TestNamedTensor.test_as_strided_cuda = as_strided_cuda

if __name__ == "__main__":
    run_tests()
--------------------------------------------------------------------------------
/test/xpu/test_native_functions_xpu.py:
--------------------------------------------------------------------------------
# Owner(s): ["module: intel"]

from torch.testing._internal.common_utils import run_tests

try:
    from xpu_test_utils import XPUPatchForImport
except Exception:
    from .xpu_test_utils import XPUPatchForImport

with XPUPatchForImport(False):
    from test_native_functions import TestNativeFunctions  # noqa: F401

if __name__ == "__main__":
    run_tests()
--------------------------------------------------------------------------------
/test/xpu/test_native_mha_xpu.py:
--------------------------------------------------------------------------------
# Owner(s): ["module: intel"]

from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_utils import run_tests

try:
    from xpu_test_utils import XPUPatchForImport
except Exception:
    from .xpu_test_utils import XPUPatchForImport

with XPUPatchForImport(False):
    from test_native_mha import TestMHADeviceType

instantiate_device_type_tests(
    TestMHADeviceType, globals(), only_for="xpu", allow_xpu=True
)

if __name__ == "__main__":
    run_tests()
--------------------------------------------------------------------------------
/test/xpu/test_ops_fwd_gradients_xpu.py:
--------------------------------------------------------------------------------
# Owner(s): ["module: intel"]

from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_utils import run_tests, TestCase

try:
    from xpu_test_utils import XPUPatchForImport
except Exception:
    from .xpu_test_utils import XPUPatchForImport

with XPUPatchForImport(False):
    from test_ops_fwd_gradients import TestFwdGradients

TestFwdGradients._default_dtype_check_enabled = True
instantiate_device_type_tests(
    TestFwdGradients, globals(), only_for="xpu", allow_xpu=True
)

if __name__ == "__main__":
    TestCase._default_dtype_check_enabled = True
    run_tests()
--------------------------------------------------------------------------------
/test/xpu/test_ops_gradients_xpu.py:
--------------------------------------------------------------------------------
# Owner(s): ["module: intel"]

from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_utils import run_tests

try:
    from xpu_test_utils import XPUPatchForImport
except Exception:
    from .xpu_test_utils import XPUPatchForImport

with XPUPatchForImport(False):
    from test_ops_gradients import TestBwdGradients

instantiate_device_type_tests(
    TestBwdGradients, globals(), only_for="xpu", allow_xpu=True
)

if __name__ == "__main__":
    run_tests()
--------------------------------------------------------------------------------
/test/xpu/test_ops_xpu.py:
--------------------------------------------------------------------------------
# Owner(s): ["module: intel"]


from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_utils import run_tests

try:
    from xpu_test_utils import XPUPatchForImport
except Exception:
    from .xpu_test_utils import XPUPatchForImport

with XPUPatchForImport(False):
    from test_ops import TestCommon, TestMathBits

instantiate_device_type_tests(TestCommon, globals(), only_for="xpu", allow_xpu=True)
instantiate_device_type_tests(TestMathBits, globals(), only_for="xpu", allow_xpu=True)
# run in fine-grained testing
# instantiate_device_type_tests(TestCompositeCompliance, globals(), only_for="xpu", allow_xpu=True)
# CPU-only
# instantiate_device_type_tests(TestRefsOpsInfo, globals(), only_for="xpu", allow_xpu=True)
# not important
# instantiate_device_type_tests(TestFakeTensor, globals(), only_for="xpu", allow_xpu=True)
# instantiate_device_type_tests(TestTags, globals(), only_for="xpu", allow_xpu=True)


if __name__ == "__main__":
    run_tests()
--------------------------------------------------------------------------------
/test/xpu/test_reductions_xpu.py:
--------------------------------------------------------------------------------
# Owner(s): ["module: intel"]

from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_utils import run_tests

try:
    from xpu_test_utils import XPUPatchForImport
except Exception:
    from .xpu_test_utils import XPUPatchForImport

with XPUPatchForImport(False):
    from test_reductions import TestReductions


instantiate_device_type_tests(TestReductions, globals(), only_for="xpu", allow_xpu=True)


if __name__ == "__main__":
    run_tests()
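Every wrapper in these listings follows the same shape: import the upstream PyTorch suite inside `XPUPatchForImport`, optionally re-point CUDA-named test bodies at XPU (as `test_namedtensor_xpu.py` does above), then instantiate the suite for the `xpu` device. Below is a minimal, self-contained sketch of that attribute-swap idiom only; `DummySuite`, `_run_on`, and `run_on_xpu` are hypothetical names invented for illustration and do not exist in torch-xpu-ops or PyTorch.

```python
# Hedged illustration of the wrapper pattern used by the test files above.
# All names here are stand-ins, not torch-xpu-ops APIs.
import unittest


class DummySuite(unittest.TestCase):
    def _run_on(self, device: str) -> None:
        # Stand-in for an upstream helper parameterized by a device string.
        self.assertIn(device, ("cpu", "cuda", "xpu"))

    def test_run_on_cuda(self) -> None:
        self._run_on("cuda")


def run_on_xpu(self) -> None:
    # Same body as the upstream CUDA test, but targeting XPU.
    self._run_on("xpu")


# Swap the function in before the suite is collected, mirroring
# "TestNamedTensor.test_select_cuda = select_cuda" above.
DummySuite.test_run_on_cuda = run_on_xpu

if __name__ == "__main__":
    unittest.main()
```

Keeping the upstream test name (`test_run_on_cuda`) while swapping its body means existing skip lists and CI filters that match on test ids keep working, which appears to be why the wrappers above patch attributes instead of subclassing.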
--------------------------------------------------------------------------------
/test/xpu/test_scatter_gather_ops_xpu.py:
--------------------------------------------------------------------------------
# Owner(s): ["module: intel"]

from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_utils import run_tests

try:
    from xpu_test_utils import XPUPatchForImport
except Exception:
    from .xpu_test_utils import XPUPatchForImport

with XPUPatchForImport(False):
    from test_scatter_gather_ops import TestScatterGather


instantiate_device_type_tests(
    TestScatterGather, globals(), only_for="xpu", allow_xpu=True
)


if __name__ == "__main__":
    run_tests()
--------------------------------------------------------------------------------
/test/xpu/test_segment_reductions_xpu.py:
--------------------------------------------------------------------------------
# Owner(s): ["module: intel"]


from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_utils import run_tests

try:
    from xpu_test_utils import XPUPatchForImport
except Exception:
    from .xpu_test_utils import XPUPatchForImport

with XPUPatchForImport(False):
    from test_segment_reductions import TestSegmentReductions

instantiate_device_type_tests(
    TestSegmentReductions, globals(), only_for="xpu", allow_xpu=True
)


if __name__ == "__main__":
    run_tests()
--------------------------------------------------------------------------------
/test/xpu/test_shape_ops_xpu.py:
--------------------------------------------------------------------------------
# Owner(s): ["module: intel"]

from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_utils import run_tests

try:
    from xpu_test_utils import XPUPatchForImport
except Exception:
    from .xpu_test_utils import XPUPatchForImport

with XPUPatchForImport(False):
    from test_shape_ops import TestShapeOps

instantiate_device_type_tests(TestShapeOps, globals(), only_for="xpu", allow_xpu=True)


if __name__ == "__main__":
    run_tests()
--------------------------------------------------------------------------------
/tools/linter/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/torch-xpu-ops/5b4d7444484576f721d2295761cf8fafa924ef36/tools/linter/__init__.py
--------------------------------------------------------------------------------
/tools/linter/adapters/README.md:
--------------------------------------------------------------------------------
# lintrunner adapters

These files adapt our various linters to work with `lintrunner`.

## Adding a new linter
1. Provide both the `init` and `linter` entry points.
2. Accept the `{{DRYRUN}}` and `{{PATHSFILE}}` placeholder arguments.
3. Never exit uncleanly; report failures as lint messages instead.
4. Follow the communication protocol: emit lint messages on stdout.
5. Keep the adapter self-contained.

A sketch of a minimal adapter following these rules appears after this listing.
--------------------------------------------------------------------------------
/tools/linter/clang_tidy/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/torch-xpu-ops/5b4d7444484576f721d2295761cf8fafa924ef36/tools/linter/clang_tidy/__init__.py
--------------------------------------------------------------------------------
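To make the checklist in `tools/linter/adapters/README.md` concrete, here is a sketch of a minimal adapter. It assumes lintrunner's JSON-lines protocol as used in upstream PyTorch (one lint-message object per stdout line, with fields such as `path`, `line`, `code`, `severity`, `name`, and `description`); the file name `minimal_linter.py`, the trailing-whitespace check, and the argument handling are illustrative assumptions, not an adapter that ships in this repo.

```python
# minimal_linter.py -- hypothetical lintrunner adapter sketch (not part of this repo).
import argparse
import json
import sys


def lint_file(path: str):
    """Yield one lint message per line with trailing whitespace (a stand-in check)."""
    with open(path, encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            if line.rstrip("\n") != line.rstrip():
                yield {
                    "path": path,
                    "line": lineno,
                    "char": None,
                    "code": "MINIMAL",
                    "severity": "warning",
                    "name": "trailing-whitespace",
                    "original": None,
                    "replacement": None,
                    "description": "Trailing whitespace found.",
                }


def main() -> None:
    parser = argparse.ArgumentParser(description="toy lintrunner adapter")
    # lintrunner would normally expand {{PATHSFILE}} into these paths.
    parser.add_argument("filenames", nargs="*", help="files to lint")
    args = parser.parse_args()

    for path in args.filenames:
        try:
            for message in lint_file(path):
                # Communication protocol: one JSON object per stdout line.
                print(json.dumps(message), flush=True)
        except OSError as exc:
            # Never exit uncleanly: surface I/O failures as lint messages too.
            print(
                json.dumps(
                    {
                        "path": path,
                        "line": None,
                        "char": None,
                        "code": "MINIMAL",
                        "severity": "error",
                        "name": "command-failed",
                        "original": None,
                        "replacement": None,
                        "description": str(exc),
                    }
                ),
                flush=True,
            )
    sys.exit(0)


if __name__ == "__main__":
    main()
```

Invoked by hand as `python minimal_linter.py some_file.py`, it prints nothing for clean files and one JSON message per finding otherwise, and it always exits 0 so that the harness, not the adapter, decides how findings are treated.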