20 typename ComputeDataType,
24 typename AMmaTileDesc,
25 typename BMmaTileDesc,
26 index_t ABlockTransferSrcScalarPerVector,
27 index_t BBlockTransferSrcScalarPerVector,
43 typename ComputeDataType,
47 typename AMmaTileDesc,
48 typename BMmaTileDesc,
49 index_t ABlockTransferSrcScalarPerVector,
50 index_t BBlockTransferSrcScalarPerVector,
72 ABlockTransferSrcScalarPerVector,
73 BBlockTransferSrcScalarPerVector,
90 ABlockTransferSrcScalarPerVector,
91 BBlockTransferSrcScalarPerVector,
111 ABlockTransferSrcScalarPerVector,
112 BBlockTransferSrcScalarPerVector,
151 template <
typename TileDesc_M0_M1_M2_K>
162 TileDesc_M0_M1_M2_K{},
194 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
195 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
201 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
202 __builtin_amdgcn_sched_group_barrier(0x200, 1, 0);
203 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
204 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
208 static_for<0, num_ds_read_inst_a / 2, 1>{}([&](
auto i) {
210 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
211 __builtin_amdgcn_sched_group_barrier(0x100, 2, 0);
215 template <
bool HasMainLoop,
219 typename ABlockTransfer,
220 typename AGridBuffer,
221 typename ABlockBuffer,
222 typename ABlockTransferStep,
224 typename BBlockTransfer,
225 typename BGridBuffer,
226 typename BBlockBuffer,
227 typename BBlockTransferStep,
228 typename CThreadBuffer>
229 __device__
void Run(
const AGridDesc& a_grid_desc,
230 const ABlockDesc& a_block_desc,
231 ABlockTransfer& a_blockwise_copy,
232 const AGridBuffer& a_grid_buf,
233 ABlockBuffer& a_block_buf,
234 const ABlockTransferStep& a_block_copy_step,
235 const BGridDesc& b_grid_desc,
236 BBlockTransfer& b_blockwise_copy,
237 BBlockTransfer& b_blockwise_copy_up,
238 const BGridBuffer& b_grid_buf,
239 const BGridBuffer& b_grid_buf_up,
240 BBlockBuffer& b_block_buf,
241 const BBlockTransferStep& b_block_copy_step,
242 CThreadBuffer& c_thread_buf,
243 CThreadBuffer& c_thread_buf_up,
248 __builtin_amdgcn_sched_barrier(0);
263 b_thread_dequant_bufs_up;
266 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf,
I0);
267 b_blockwise_copy.Run(b_grid_desc,
272 b_blockwise_copy_up.Run(b_grid_desc,
276 b_thread_bufs_up(
I0));
278 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
279 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
280 b_blockwise_copy_up.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
281 __builtin_amdgcn_sched_barrier(0);
284 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf,
I0);
287 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf,
I0);
288 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
308 b_thread_dequant_bufs(
I0));
311 b_thread_bufs_up(
I0),
314 b_thread_dequant_bufs_up(
I0));
317 c_thread_buf.Clear();
318 c_thread_buf_up.Clear();
320 __builtin_amdgcn_sched_barrier(0);
323 if constexpr(HasMainLoop)
328 auto LoopFunc = [&](
auto mfma_reg_buf,
auto local_read_buf) {
329 b_blockwise_copy.Run(b_grid_desc,
333 b_thread_bufs(local_read_buf));
334 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
335 b_blockwise_copy_up.Run(b_grid_desc,
339 b_thread_bufs_up(local_read_buf));
340 b_blockwise_copy_up.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
343 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, mfma_reg_buf);
345 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, local_read_buf);
346 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
356 a_thread_vec.template AsType<ComputeDataType>()(ik) =
359 b_thread_vec.template AsType<ComputeDataType>()(ik) =
360 b_thread_dequant_bufs[mfma_reg_buf]
363 b_thread_vec_up.template AsType<ComputeDataType>()(ik) =
364 b_thread_dequant_bufs_up
368 using mfma_input_type =
376 a_thread_vec.template AsType<mfma_input_type>(),
377 b_thread_vec.template AsType<mfma_input_type>(),
381 a_thread_vec.template AsType<mfma_input_type>(),
382 b_thread_vec_up.template AsType<mfma_input_type>(),
403 b_thread_bufs(local_read_buf),
406 b_thread_dequant_bufs(local_read_buf));
409 b_thread_bufs_up(local_read_buf),
412 b_thread_dequant_bufs_up(local_read_buf));
415 __builtin_amdgcn_sched_barrier(0);
422 }
while(i < (num_loop - 2));
427 b_blockwise_copy.Run(b_grid_desc,
433 b_blockwise_copy_up.Run(b_grid_desc,
437 b_thread_bufs_up(
I1));
440 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
450 a_thread_vec.template AsType<ComputeDataType>()(ik) =
453 b_thread_vec.template AsType<ComputeDataType>()(ik) =
456 b_thread_vec_up.template AsType<ComputeDataType>()(ik) =
461 using mfma_input_type =
467 xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
468 b_thread_vec.template AsType<mfma_input_type>(),
470 xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
471 b_thread_vec_up.template AsType<mfma_input_type>(),
495 b_thread_dequant_bufs(
I1));
499 b_thread_bufs_up(
I1),
502 b_thread_dequant_bufs_up(
I1));
503 __builtin_amdgcn_sched_barrier(0);
513 a_thread_vec.template AsType<ComputeDataType>()(ik) =
516 b_thread_vec.template AsType<ComputeDataType>()(ik) =
519 b_thread_vec_up.template AsType<ComputeDataType>()(ik) =
524 using mfma_input_type =
530 xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
531 b_thread_vec.template AsType<mfma_input_type>(),
533 xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
534 b_thread_vec_up.template AsType<mfma_input_type>(),
553 a_thread_vec.template AsType<ComputeDataType>()(ik) =
556 b_thread_vec.template AsType<ComputeDataType>()(ik) =
559 b_thread_vec_up.template AsType<ComputeDataType>()(ik) =
564 using mfma_input_type =
570 xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
571 b_thread_vec.template AsType<mfma_input_type>(),
573 xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
574 b_thread_vec_up.template AsType<mfma_input_type>(),
614 Sequence<1, 2, 0, 3>,
__host__ __device__ constexpr auto make_static_buffer(Number< N >)
Definition static_buffer.hpp:186
__host__ __device__ constexpr auto make_pass_through_transform(const LowLength &low_length)
Definition multi_index_transform_helper.hpp:12
typename detail::StaticallyIndexedArrayImpl< T, N >::type StaticallyIndexedArray
Definition utility/statically_indexed_array.hpp:45
int32_t index_t
Definition ck.hpp:299
integral_constant< index_t, N > Number
Definition number.hpp:12
TailNumber
Definition blkgemmpipe_scheduler.hpp:31
@ Even
Definition blkgemmpipe_scheduler.hpp:34
@ Odd
Definition blkgemmpipe_scheduler.hpp:33
constexpr detail::ignore_t ignore
Definition utility/ignore.hpp:20
BlockGemmPipelineScheduler
Definition blkgemmpipe_scheduler.hpp:25
@ Intrawave
Definition blkgemmpipe_scheduler.hpp:26
__host__ __device__ constexpr auto make_naive_tensor_descriptor_packed(const Tuple< Lengths... > &lengths)
Definition tensor_descriptor_helper.hpp:101
__host__ __device__ constexpr auto make_tuple(Xs &&... xs)
Definition utility/tuple.hpp:211
__host__ __device__ constexpr auto transform_tensor_descriptor(const OldTensorDescriptor &old_tensor_desc, const NewTransforms &new_transforms, NewLowerDimensionOldVisibleIdss, NewUpperDimensionNewVisibleIdss)
Definition tensor_description/tensor_descriptor.hpp:319
__device__ void block_sync_lds()
Definition synchronization.hpp:16
__host__ __device__ constexpr auto make_unmerge_transform(const UpLengths &up_lengths, integral_constant< bool, Use24BitIntegerCalculation >=integral_constant< bool, false >{})
Definition multi_index_transform_helper.hpp:90
__host__ __device__ BlockwiseGemmXdlops_pipeline_base(Tuple4 a_origin=CalculateAThreadOriginDataIndex(), Tuple4 b_origin=CalculateBThreadOriginDataIndex())
Constructor for BlockwiseGemmXdlops_pipeline_base.
Definition blockwise_gemm_pipeline_xdlops_base.hpp:222
__host__ static __device__ constexpr auto GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:280
__host__ static __device__ constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:239
static constexpr auto c_thread_desc_
Definition blockwise_gemm_pipeline_xdlops_base.hpp:378
static constexpr auto xdlops_gemm
Definition blockwise_gemm_pipeline_xdlops_base.hpp:54
static constexpr auto I1
Definition blockwise_gemm_pipeline_xdlops_base.hpp:37
__host__ static __device__ constexpr auto GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:266
__host__ static __device__ constexpr auto GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:294
static constexpr index_t AMmaKStride
Definition blockwise_gemm_pipeline_xdlops_base.hpp:60
static __device__ auto CalculateAThreadOriginDataIndex6D()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:136
static constexpr index_t WaveSize
Definition blockwise_gemm_pipeline_xdlops_base.hpp:46
__host__ static __device__ constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:253
static constexpr index_t B_K1
Definition blockwise_gemm_pipeline_xdlops_base.hpp:51
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops > HotLoopInstList
Definition blockwise_gemm_pipeline_xdlops_base.hpp:82
__host__ __device__ constexpr auto & GetCThreadBuffer()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:111
static constexpr auto I0
Definition blockwise_gemm_pipeline_xdlops_base.hpp:36
static __device__ auto CalculateCThreadOriginDataIndex(Number< m0 >, Number< n0 >, Number< xdlops_i >, Number< blk_i >)
Definition blockwise_gemm_pipeline_xdlops_base.hpp:160
static __device__ auto CalculateCThreadOriginDataIndex8D(Number< m0 >, Number< n0 >, Number< xdlops_i >, Number< blk_i >)
Definition blockwise_gemm_pipeline_xdlops_base.hpp:189
static constexpr index_t KRepeat
Definition blockwise_gemm_pipeline_xdlops_base.hpp:64
static constexpr AMmaTileDesc a_block_desc_m0_m1_m2_k
Definition blockwise_gemm_pipeline_xdlops_base.hpp:359
static constexpr index_t A_K1
Definition blockwise_gemm_pipeline_xdlops_base.hpp:50
static constexpr index_t BMmaKStride
Definition blockwise_gemm_pipeline_xdlops_base.hpp:61
__host__ static __device__ constexpr auto MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_G_M_N &c_grid_desc_g_m_n)
Definition blockwise_gemm_pipeline_xdlops_base.hpp:341
__host__ static __device__ constexpr auto GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:307
__host__ static __device__ constexpr auto MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_M_N &c_grid_desc_m_n)
Definition blockwise_gemm_pipeline_xdlops_base.hpp:324
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_thread_copy_ AThreadCopy a_thread_copy_
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp:598
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BlockHasHotloop __host__ static __device__ constexpr bool BlockHasHotloop(index_t num_loop)
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp:175
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_block_desc_n0_n1_k0_k1 static constexpr BTileDesc b_block_desc_n0_n1_k0_k1
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp:603
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GlobalBufferNum static constexpr index_t GlobalBufferNum
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp:149
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_block_desc_m0_m1_m2_k0_k1_k2 static constexpr auto a_block_desc_m0_m1_m2_k0_k1_k2
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp:172
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::c_thread_desc_ static constexpr auto c_thread_desc_
Definition blockwise_gemm_pipeline_xdlops_base.hpp:378
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::xdlops_gemm static constexpr auto xdlops_gemm
Definition blockwise_gemm_pipeline_xdlops_base.hpp:54
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_thread_desc_ static constexpr auto a_thread_desc_
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp:585
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::AThreadCopy ThreadwiseTensorSliceTransfer_v4< ADataType, ComputeDataType, decltype(a_block_desc_m0_m1_m2_k0_k1_k2), decltype(a_thread_desc_), Sequence< 1, 1, 1, 1, 1, KPack >, Sequence< 0, 1, 2, 3, 4, 5 >, 5, A_K1, A_K1 > AThreadCopy
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp:588
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_thread_desc_ static constexpr auto b_thread_desc_
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp:600
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::I1 static constexpr auto I1
Definition blockwise_gemm_pipeline_xdlops_base.hpp:37
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::HotLoopScheduler static __device__ constexpr auto HotLoopScheduler()
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp:185
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::Base BlockwiseGemmXdlops_pipeline_base< BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack > Base
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp:102
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::PassThrough ck::tensor_operation::element_wise::PassThrough PassThrough
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp:605
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::WaveSize static constexpr index_t WaveSize
Definition blockwise_gemm_pipeline_xdlops_base.hpp:46
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::I0 static constexpr auto I0
Definition blockwise_gemm_pipeline_xdlops_base.hpp:36
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::MakeAGemmMmaTileDescriptor __host__ static __device__ constexpr auto MakeAGemmMmaTileDescriptor(const TileDesc_M0_M1_M2_K &)
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp:152
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BThreadDequantCopy ThreadwiseTensorSliceTransfer_StaticToStatic< BDataType, ComputeDataType, decltype(b_block_desc_n0_n1_k0_k1), decltype(b_block_desc_n0_n1_k0_k1), tensor_operation::element_wise::PassThrough, Sequence< Number< NRepeat >{}, I1, Number< KRepeat >{}, Number< KPack >{}>, Sequence< 1, 2, 0, 3 >, 3, KPack > BThreadDequantCopy
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp:607
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_thread_dequant_copy_ BThreadDequantCopy b_thread_dequant_copy_
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp:619
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_element_op const PassThrough b_element_op
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp:618
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BlockLoopTailNum __host__ static __device__ constexpr TailNumber BlockLoopTailNum(index_t num_loop)
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp:180
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::KRepeat static constexpr index_t KRepeat
Definition blockwise_gemm_pipeline_xdlops_base.hpp:64
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_block_desc_m0_m1_m2_k static constexpr AMmaTileDesc a_block_desc_m0_m1_m2_k
Definition blockwise_gemm_pipeline_xdlops_base.hpp:359
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::PrefillStages static constexpr index_t PrefillStages
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp:148
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::A_K1 static constexpr index_t A_K1
Definition blockwise_gemm_pipeline_xdlops_base.hpp:50
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::Run __device__ void Run(const AGridDesc &a_grid_desc, const ABlockDesc &a_block_desc, ABlockTransfer &a_blockwise_copy, const AGridBuffer &a_grid_buf, ABlockBuffer &a_block_buf, const ABlockTransferStep &a_block_copy_step, const BGridDesc &b_grid_desc, BBlockTransfer &b_blockwise_copy, BBlockTransfer &b_blockwise_copy_up, const BGridBuffer &b_grid_buf, const BGridBuffer &b_grid_buf_up, BBlockBuffer &b_block_buf, const BBlockTransferStep &b_block_copy_step, CThreadBuffer &c_thread_buf, CThreadBuffer &c_thread_buf_up, index_t num_loop) const
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp:229
ck::BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::PrefetchStages static constexpr index_t PrefetchStages
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp:147
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp:37
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops >::A_LDS_Read_Inst_Num static constexpr index_t A_LDS_Read_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:49
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops >::A_Buffer_Load_Inst_Num static constexpr index_t A_Buffer_Load_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:39
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops >::B_Buffer_Load_Inst_Num static constexpr index_t B_Buffer_Load_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:41
Definition utility/sequence.hpp:43
Threadwise data transfer.
Definition threadwise_tensor_slice_transfer.hpp:1720
Definition threadwise_tensor_slice_transfer.hpp:1260
Definition functional2.hpp:33
Definition tensor_operation/gpu/element/unary_element_wise_operation.hpp:340
Definition dtype_vector.hpp:10