ThreadGroupTensorSliceTransfer_Gather_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector, IndexType, GatherDim > Struct Template Reference#
Public Types |
Public Member Functions |
Static Public Member Functions |
Static Public Attributes |
List of all members
ck::ThreadGroupTensorSliceTransfer_Gather_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector, IndexType, GatherDim > Struct Template Reference
#include <thread_group_tensor_slice_transfer_gather_direct_load.hpp>
Public Types | |
| using | Index = MultiIndex<nDim> |
| using | SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})) |
| using | DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})) |
| using | SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})) |
| using | DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{})) |
Public Member Functions | |
| __device__ constexpr | ThreadGroupTensorSliceTransfer_Gather_DirectLoad (const SrcDesc &src_desc, const Index &src_block_slice_origin, const DstDesc &dst_desc, const Index &dst_block_slice_origin, const StaticallyIndexedArray< IndexType, gather_num > &gather_offsets) |
| __device__ void | SetSrcSliceOrigin (const SrcDesc &src_desc, const Index &src_slice_origin_idx) |
| __device__ void | SetDstSliceOrigin (const DstDesc &dst_desc, const Index &dst_slice_origin_idx) |
| __device__ void | ResetDstSliceWindow (const DstDesc &dst_desc) |
| template<typename SrcBuffer, typename DstBuffer> | |
| __device__ void | Run (const SrcDesc &src_desc, const SrcBuffer &src_buf, const DstDesc &dst_desc, DstBuffer &dst_buf) |
| __device__ void | MoveSrcSliceWindow (const SrcDesc &src_desc, const Index &step) |
| template<typename DescType> | |
| __device__ auto | generate_steps (const DescType &desc, int sign) |
Static Public Member Functions | |
| static __device__ constexpr bool | AreThreadClusterLengthsValid () |
Static Public Attributes | |
| static constexpr index_t | nDim = remove_reference_t<SrcDesc>::GetNumOfDimension() |
| static constexpr auto | I0 = Number<0>{} |
| static constexpr auto | I1 = Number<1>{} |
| static constexpr auto | block_slice_lengths = BlockSliceLengths{} |
| static constexpr auto | thread_cluster_lengths = ThreadClusterLengths{} |
| static constexpr auto | thread_single_load_size |
| static constexpr auto | thread_steps = thread_cluster_lengths * thread_single_load_size |
| static constexpr auto | thread_slice_lengths = block_slice_lengths / thread_steps |
| static constexpr index_t | gather_num = thread_slice_lengths.At(Number<GatherDim>{}) |
Detailed Description
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector, typename IndexType, index_t GatherDim = 1>
struct ck::ThreadGroupTensorSliceTransfer_Gather_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector, IndexType, GatherDim >
struct ck::ThreadGroupTensorSliceTransfer_Gather_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector, IndexType, GatherDim >
Transfer that uses direct load instructions to copy data from global to LDS memory.
Traditional loads first copy data from global to registers, and then from registers to LDS. Direct loads do not need an intermediate step, data is copied directly from global to LDS, without the use of additional registers.
However, the instruction has limitations:
- each thread must copy exactly a single DWORD - 4 bytes;
- threads within a single wavefront must write consecutive DWORDS into LDS, (data in global do not need to be contiguous, each thread might have its own offset).
To make sure that all the transfers finished, the waitcnt instruction must be used with vmcnt instead of lgkmcnt.
Limitations of the transfer class:
- SrcData must be the same as DstData - no possibility to convert the data type in flight;
- DstVectorDim must be the last dimension;
- SrcVectorDim must be the last dimension if ScalarPerVector is greater than 1;
- ScalarPerVector times the number of bytes of DstData must be equal to a single DWORD = 4B (for examlpe if DstData is fp32, then ScalarPerVector must be 1; if DstData is fp16, ScalarPerVector must be 2);
- if ScalarPerVector is greater than 1, the contiguous dimension in src and dst must be the same dimension;
- threads in a wavefront must write contiguous data to LDS (when wavefront size is 64, they must write 64 contiguous DWORDs) - ThreadClusterLengths must be prepared in such a way to guarantee that.
Member Typedef Documentation
◆ DstCoord
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector, typename IndexType, index_t GatherDim = 1>
| using ck::ThreadGroupTensorSliceTransfer_Gather_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector, IndexType, GatherDim >::DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})) |
◆ DstCoordStep
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector, typename IndexType, index_t GatherDim = 1>
| using ck::ThreadGroupTensorSliceTransfer_Gather_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector, IndexType, GatherDim >::DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{})) |
◆ Index
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector, typename IndexType, index_t GatherDim = 1>
| using ck::ThreadGroupTensorSliceTransfer_Gather_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector, IndexType, GatherDim >::Index = MultiIndex<nDim> |
◆ SrcCoord
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector, typename IndexType, index_t GatherDim = 1>
| using ck::ThreadGroupTensorSliceTransfer_Gather_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector, IndexType, GatherDim >::SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})) |
◆ SrcCoordStep
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector, typename IndexType, index_t GatherDim = 1>
| using ck::ThreadGroupTensorSliceTransfer_Gather_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector, IndexType, GatherDim >::SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})) |
Constructor & Destructor Documentation
◆ ThreadGroupTensorSliceTransfer_Gather_DirectLoad()
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector, typename IndexType, index_t GatherDim = 1>
|
inlineconstexpr |
Member Function Documentation
◆ AreThreadClusterLengthsValid()
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector, typename IndexType, index_t GatherDim = 1>
|
inlinestaticconstexpr |
◆ generate_steps()
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector, typename IndexType, index_t GatherDim = 1>
template<typename DescType>
|
inline |
◆ MoveSrcSliceWindow()
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector, typename IndexType, index_t GatherDim = 1>
|
inline |
◆ ResetDstSliceWindow()
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector, typename IndexType, index_t GatherDim = 1>
|
inline |
◆ Run()
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector, typename IndexType, index_t GatherDim = 1>
template<typename SrcBuffer, typename DstBuffer>
|
inline |
◆ SetDstSliceOrigin()
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector, typename IndexType, index_t GatherDim = 1>
|
inline |
◆ SetSrcSliceOrigin()
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector, typename IndexType, index_t GatherDim = 1>
|
inline |
Member Data Documentation
◆ block_slice_lengths
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector, typename IndexType, index_t GatherDim = 1>
|
staticconstexpr |
◆ gather_num
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector, typename IndexType, index_t GatherDim = 1>
|
staticconstexpr |
◆ I0
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector, typename IndexType, index_t GatherDim = 1>
|
staticconstexpr |
◆ I1
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector, typename IndexType, index_t GatherDim = 1>
|
staticconstexpr |
◆ nDim
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector, typename IndexType, index_t GatherDim = 1>
|
staticconstexpr |
◆ thread_cluster_lengths
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector, typename IndexType, index_t GatherDim = 1>
|
staticconstexpr |
◆ thread_single_load_size
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector, typename IndexType, index_t GatherDim = 1>
|
staticconstexpr |
Initial value:
__host__ __device__ constexpr auto generate_sequence(F, Number< N >)
Definition sequence_helper.hpp:18
Definition threadwise_tensor_slice_transfer_util.hpp:20
◆ thread_slice_lengths
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector, typename IndexType, index_t GatherDim = 1>
|
staticconstexpr |
◆ thread_steps
template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector, typename IndexType, index_t GatherDim = 1>
|
staticconstexpr |
The documentation for this struct was generated from the following file: