SharedMemTrait Struct Reference
ck::GridwiseBatchedGemmSoftmaxGemm_Wmma< ADataType, B0DataType, Acc0DataType, B1DataType, Acc1DataType, CShuffleDataType, CDataType, AElementwiseOperation, B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, CGlobalMemoryDataOperation, AGridDesc, B0GridDesc, B1GridDesc, CGridDesc_M_N, MPerBlock, LPerBlock, KPerBlock, AK1Value, BK1Value, NPerBlock, LTilePerBlock, L1Value, MPerWmma, LPerWmma, NPerWmma, MRepeat, LRepeat, NRepeat, BlockSize, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, AThreadTransferSrcResetCoordinateAfterRun, AEnableLds, ABlockLdsExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, B0ThreadTransferSrcResetCoordinateAfterRun, B0EnableLds, B0BlockLdsExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, B1ThreadTransferSrcResetCoordinateAfterRun, B1EnableLds, B1BlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, PadN, MaskOutUpperTriangle, NumGemmKPrefetchStage, LoopSched, PipelineVer >::SharedMemTrait Struct Reference
#include <gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp>
Static Public Attributes | |
| static constexpr auto | max_lds_align = math::lcm(math::lcm(AK1, BK1), BL1) |
| static constexpr auto | a_block_space_size_aligned |
| static constexpr auto | b0_block_space_size_aligned |
| static constexpr auto | b1_block_space_size_aligned |
| static constexpr auto | a_block_space_offset = 0 |
| static constexpr auto | b0_block_space_offset = a_block_space_size_aligned |
| static constexpr auto | b1_block_space_offset = 0 |
| static constexpr index_t | reduction_space_size_aligned |
| static constexpr auto | reduction_space_offset = 0 |
| static constexpr auto | c_block_space_size |
Member Data Documentation
◆ a_block_space_offset
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
static constexpr |
◆ a_block_space_size_aligned
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
static constexpr |
Initial value:
=
AEnableLds ? math::integer_least_multiple(MakeABlockDescriptor().GetElementSpaceSize(),
max_lds_align)
: 0
__host__ __device__ constexpr auto integer_least_multiple(X x, Y y)
Definition utility/math.hpp:78
static constexpr auto max_lds_align
Definition gridwise_batched_gemm_gemm_wmma_cshuffle_v3.hpp:504
__host__ static __device__ constexpr auto MakeABlockDescriptor()
Definition gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp:128
◆ b0_block_space_offset
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
static constexpr |
◆ b0_block_space_size_aligned
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
static constexpr |
Initial value:
=
B0EnableLds ? math::integer_least_multiple(
MakeB0BlockDescriptor().GetElementSpaceSize(), max_lds_align)
: 0
__host__ static __device__ constexpr auto MakeB0BlockDescriptor()
Definition gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp:175
◆ b1_block_space_offset
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
static constexpr |
◆ b1_block_space_size_aligned
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
static constexpr |
Initial value:
=
B1EnableLds ? math::integer_least_multiple(
MakeB1BlockDescriptor().GetElementSpaceSize(), max_lds_align)
: 0
__host__ static __device__ constexpr auto MakeB1BlockDescriptor()
Definition gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp:222
◆ c_block_space_size
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
static constexpr |
Initial value:
=
GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat()
.GetElementSpaceSize()
__host__ static __device__ constexpr auto GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat()
Definition gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp:473
◆ max_lds_align
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
static constexpr |
◆ reduction_space_offset
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
static constexpr |
◆ reduction_space_size_aligned
template<typename ADataType, typename B0DataType, typename Acc0DataType, typename B1DataType, typename Acc1DataType, typename CShuffleDataType, typename CDataType, typename AElementwiseOperation, typename B0ElementwiseOperation, typename AccElementwiseOperation, typename B1ElementwiseOperation, typename CElementwiseOperation, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename AGridDesc, typename B0GridDesc, typename B1GridDesc, typename CGridDesc_M_N, index_t MPerBlock, index_t LPerBlock, index_t KPerBlock, index_t AK1Value, index_t BK1Value, index_t NPerBlock, index_t LTilePerBlock, index_t L1Value, index_t MPerWmma, index_t LPerWmma, index_t NPerWmma, index_t MRepeat, index_t LRepeat, index_t NRepeat, index_t BlockSize, typename ABlockTransferThreadClusterLengths_K0_M_K1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_K1, bool AThreadTransferSrcResetCoordinateAfterRun, bool AEnableLds, bool ABlockLdsExtraM, typename B0BlockTransferThreadClusterLengths_K0_L_K1, typename B0BlockTransferThreadClusterArrangeOrder, typename B0BlockTransferSrcAccessOrder, index_t B0BlockTransferSrcVectorDim, index_t B0BlockTransferSrcScalarPerVector, index_t B0BlockTransferDstScalarPerVector_K1, bool B0ThreadTransferSrcResetCoordinateAfterRun, bool B0EnableLds, bool B0BlockLdsExtraL, typename B1BlockTransferThreadClusterLengths_L0_N_L1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_L1, bool B1ThreadTransferSrcResetCoordinateAfterRun, bool B1EnableLds, bool B1BlockLdsExtraN, index_t CShuffleMRepeatPerShuffle, index_t CShuffleNRepeatPerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t 
CShuffleBlockTransferScalarPerVector_NPerBlock, bool PadN, bool MaskOutUpperTriangle, index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1>
|
static constexpr |
Initial value:
=
math::integer_least_multiple(BlockSize, max_lds_align)
The documentation for this struct was generated from the following file: