20 typename AScaleDataType,
22 typename BScaleDataType,
25 typename AMmaTileDesc,
26 typename BMmaTileDesc,
27 index_t ABlockTransferSrcScalarPerVector,
28 index_t BBlockTransferSrcScalarPerVector,
41template <
index_t ThreadBlockSize,
44 typename AScaleDataType,
46 typename BScaleDataType,
49 typename AMmaTileDesc,
50 typename BMmaTileDesc,
51 index_t ABlockTransferSrcScalarPerVector,
52 index_t BBlockTransferSrcScalarPerVector,
72 ABlockTransferSrcScalarPerVector,
73 BBlockTransferSrcScalarPerVector,
89 ABlockTransferSrcScalarPerVector,
90 BBlockTransferSrcScalarPerVector,
109 ABlockTransferSrcScalarPerVector,
110 BBlockTransferSrcScalarPerVector,
163 KPerBlock / ScaleBlockSize;
177 "A scale pack data type too large!");
179 "B scale pack data type too large!");
197 constexpr auto num_ds_read_inst_a =
201 constexpr auto num_ds_read_inst_b =
215 constexpr auto ds_read_a_issue_cycle =
217 constexpr auto ds_read_b_issue_cycle =
220 constexpr auto ds_read_a_mfma_rate =
221 (mfma_cycle - 8 + 2 * ds_read_a_issue_cycle - 1) / (2 * ds_read_a_issue_cycle);
222 constexpr auto ds_read_b_mfma_rate =
223 (mfma_cycle - 8 + 2 * ds_read_b_issue_cycle - 1) / (2 * ds_read_b_issue_cycle);
225 constexpr auto num_dsread_a_mfma =
226 (num_ds_read_inst_a + ds_read_a_mfma_rate - 1) / ds_read_a_mfma_rate;
227 constexpr auto num_dsread_b_mfma =
228 (num_ds_read_inst_b + ds_read_b_mfma_rate - 1) / ds_read_b_mfma_rate;
231 constexpr auto num_mfma_stage1 = num_mfma_inst - (num_dsread_a_mfma + num_dsread_b_mfma);
232 constexpr auto num_buffer_load_total = num_buffer_load_inst_a + num_buffer_load_inst_b +
233 num_buffer_load_a_scale + num_buffer_load_b_scale;
235 constexpr auto mfma_perstage_more =
237 constexpr auto mfma_perstage_less =
240 constexpr auto mfma_stages_more =
241 num_mfma_stage1 - mfma_perstage_less * num_buffer_load_total;
244 if constexpr(i < mfma_stages_more)
247 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
249 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
254 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
256 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
261 if constexpr((i + num_buffer_load_inst_a) < mfma_stages_more)
264 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
266 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
271 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
273 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
278 if constexpr((i + num_buffer_load_inst_a + num_buffer_load_inst_b) < mfma_stages_more)
281 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
283 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
288 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
290 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
295 if constexpr((i + num_buffer_load_inst_a + num_buffer_load_inst_b +
296 num_buffer_load_a_scale) < mfma_stages_more)
299 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
301 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
306 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
308 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);
314 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
315 if constexpr((num_ds_read_inst_a - (i + 1) * ds_read_a_mfma_rate) >=
318 __builtin_amdgcn_sched_group_barrier(0x100, ds_read_a_mfma_rate, 0);
322 __builtin_amdgcn_sched_group_barrier(0x100,
323 num_ds_read_inst_a - (num_dsread_a_mfma - 1) *
330 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);
331 if constexpr((num_ds_read_inst_b - (i + 1) * ds_read_b_mfma_rate) >=
334 __builtin_amdgcn_sched_group_barrier(0x100, ds_read_b_mfma_rate, 0);
338 __builtin_amdgcn_sched_group_barrier(0x100,
339 num_ds_read_inst_b - (num_dsread_b_mfma - 1) *
346 template <
bool HasMainLoop,
350 typename ABlockTransfer,
351 typename AGridBuffer,
352 typename ABlockBuffer,
353 typename ABlockTransferStep,
356 typename BBlockTransfer,
357 typename BGridBuffer,
358 typename BBlockBuffer,
359 typename BBlockTransferStep,
360 typename CThreadBuffer,
361 typename AScaleGridBuffer,
362 typename AScaleGridDesc,
363 typename AScaleThreadTransfer,
364 typename BScaleGridBuffer,
365 typename BScaleGridDesc,
366 typename BScaleThreadTransfer>
369 const AGridDesc& a_grid_desc,
370 const ABlockDesc& a_block_desc,
371 ABlockTransfer& a_blockwise_copy,
372 const AGridBuffer& a_grid_buf,
373 ABlockBuffer& a_block_bufs,
374 const ABlockTransferStep& a_block_copy_step,
376 const BGridDesc& b_grid_desc,
377 const BBlockDesc& b_block_desc,
378 BBlockTransfer& b_blockwise_copy,
379 const BGridBuffer& b_grid_buf,
380 BBlockBuffer& b_block_bufs,
381 const BBlockTransferStep& b_block_copy_step,
383 CThreadBuffer& c_thread_buf,
385 const AScaleGridDesc& a_scale_grid_desc,
386 AScaleThreadTransfer& a_scale_thread_copy,
387 const AScaleGridBuffer& a_scale_grid_buf,
388 const BScaleGridDesc& b_scale_grid_desc,
389 BScaleThreadTransfer& b_scale_thread_copy,
390 const BScaleGridBuffer& b_scale_grid_buf,
408 a_blockwise_copy.Run(a_grid_desc, a_grid_buf, a_block_desc, a_block_bufs(
I0));
409 b_blockwise_copy.Run(b_grid_desc, b_grid_buf, b_block_desc, b_block_bufs(
I0));
411 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
412 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
417 a_scale_thread_copy.Run(a_scale_grid_desc,
421 a_scale_thread_bufs(
I0));
423 a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
426 a_scale_thread_copy.MoveSrcSliceWindow(
431 a_scale_thread_copy.MoveSrcSliceWindow(
438 b_scale_thread_copy.Run(b_scale_grid_desc,
442 b_scale_thread_bufs(
I0));
444 b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
447 b_scale_thread_copy.MoveSrcSliceWindow(
453 b_scale_thread_copy.MoveSrcSliceWindow(
458 __builtin_amdgcn_s_waitcnt(3952);
465 constexpr auto a_k_step_chunk =
487 constexpr auto b_k_step_chunk =
508 a_blockwise_copy.Run(a_grid_desc, a_grid_buf, a_block_desc, a_block_bufs(
I1));
509 b_blockwise_copy.Run(b_grid_desc, b_grid_buf, b_block_desc, b_block_bufs(
I1));
511 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
512 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
515 c_thread_buf.Clear();
516 __builtin_amdgcn_sched_barrier(0);
519 if constexpr(HasMainLoop)
525 auto LoopFunc = [&](
auto scale_comp_buf,
auto scale_mem_buf) {
526 __builtin_amdgcn_s_waitcnt(3952);
529 a_blockwise_copy.Run(
530 a_grid_desc, a_grid_buf, a_block_desc, a_block_bufs(scale_comp_buf));
531 b_blockwise_copy.Run(
532 b_grid_desc, b_grid_buf, b_block_desc, b_block_bufs(scale_comp_buf));
537 a_scale_thread_copy.Run(a_scale_grid_desc,
541 a_scale_thread_bufs(scale_mem_buf));
543 a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
546 a_scale_thread_copy.MoveSrcSliceWindow(
551 a_scale_thread_copy.MoveSrcSliceWindow(
558 b_scale_thread_copy.Run(b_scale_grid_desc,
562 b_scale_thread_bufs(scale_mem_buf));
564 b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
567 b_scale_thread_copy.MoveSrcSliceWindow(
573 b_scale_thread_copy.MoveSrcSliceWindow(
577 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
578 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
583 constexpr index_t a_scale_offset =
585 constexpr index_t b_scale_offset =
589 "Must have at least one scale per Xdlops "
599 a_scale_thread_vec.template AsType<AScaleDataType>()(s) =
605 b_scale_thread_vec.template AsType<BScaleDataType>()(s) =
613 constexpr auto kxdl = ikxdl + k0 *
KXdlPack;
619 a_thread_vec.template AsType<ComputeTypeA>()(
623 b_thread_vec.template AsType<ComputeTypeB>()(
637 using mfma_scale_input_type_a =
typename vector_type<
640 using mfma_scale_input_type_b =
typename vector_type<
651 a_thread_vec.template AsType<mfma_input_type_a>(),
653 .template AsType<mfma_scale_input_type_a>(),
654 b_thread_vec.template AsType<mfma_input_type_b>(),
656 .template AsType<mfma_scale_input_type_b>(),
657 c_thread_buf.GetVectorTypeReference(
680 constexpr auto k_step =
685 1>{}([&](
auto chunk) {
686 constexpr auto a_k_step_chunk =
695 a_block_bufs(scale_mem_buf),
709 1>{}([&](
auto chunk) {
710 constexpr auto b_k_step_chunk =
719 b_block_bufs(scale_mem_buf),
732 __builtin_amdgcn_sched_barrier(0);
739 }
while(i < (num_loop - 2));
748 a_scale_thread_copy.Run(a_scale_grid_desc,
752 a_scale_thread_bufs(
I1));
754 a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
757 a_scale_thread_copy.MoveSrcSliceWindow(
764 b_scale_thread_copy.Run(b_scale_grid_desc,
768 b_scale_thread_bufs(
I1));
770 b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
773 b_scale_thread_copy.MoveSrcSliceWindow(
780 constexpr index_t a_scale_offset =
782 constexpr index_t b_scale_offset =
786 "Must have at least one scale per Xdlops "
794 a_scale_thread_vec.template AsType<AScaleDataType>()(s) =
799 b_scale_thread_vec.template AsType<BScaleDataType>()(s) =
806 constexpr auto kxdl = ikxdl + k0 *
KXdlPack;
812 a_thread_vec.template AsType<ComputeTypeA>()(ik) =
815 b_thread_vec.template AsType<ComputeTypeB>()(ik) =
828 using mfma_scale_input_type_a =
typename vector_type<
831 using mfma_scale_input_type_b =
typename vector_type<
841 a_thread_vec.template AsType<mfma_input_type_a>(),
843 .template AsType<mfma_scale_input_type_a>(),
844 b_thread_vec.template AsType<mfma_input_type_b>(),
846 .template AsType<mfma_scale_input_type_b>(),
855 __builtin_amdgcn_s_waitcnt(3952);
859 constexpr auto k_step =
864 constexpr auto a_k_step_chunk =
887 constexpr auto b_k_step_chunk =
911 constexpr index_t a_scale_offset =
913 constexpr index_t b_scale_offset =
917 "Must have at least one scale per Xdlops "
925 a_scale_thread_vec.template AsType<AScaleDataType>()(s) =
930 b_scale_thread_vec.template AsType<BScaleDataType>()(s) =
937 constexpr auto kxdl = ikxdl + k0 *
KXdlPack;
943 a_thread_vec.template AsType<ComputeTypeA>()(ik) =
946 b_thread_vec.template AsType<ComputeTypeB>()(ik) =
959 using mfma_scale_input_type_a =
typename vector_type<
962 using mfma_scale_input_type_b =
typename vector_type<
972 a_thread_vec.template AsType<mfma_input_type_a>(),
974 .template AsType<mfma_scale_input_type_a>(),
975 b_thread_vec.template AsType<mfma_input_type_b>(),
977 .template AsType<mfma_scale_input_type_b>(),
991 constexpr index_t a_scale_offset =
993 constexpr index_t b_scale_offset =
997 "Must have at least one scale per Xdlops "
1005 a_scale_thread_vec.template AsType<AScaleDataType>()(s) =
1010 b_scale_thread_vec.template AsType<BScaleDataType>()(s) =
1017 constexpr auto kxdl = ikxdl + k0 *
KXdlPack;
1023 a_thread_vec.template AsType<ComputeTypeA>()(ik) =
1026 b_thread_vec.template AsType<ComputeTypeB>()(ik) =
1039 using mfma_scale_input_type_a =
typename vector_type<
1042 using mfma_scale_input_type_b =
typename vector_type<
1052 a_thread_vec.template AsType<mfma_input_type_a>(),
1054 .template AsType<mfma_scale_input_type_a>(),
1055 b_thread_vec.template AsType<mfma_input_type_b>(),
1057 .template AsType<mfma_scale_input_type_b>(),
1072 Number<KRepeat / KXdlPack>{},
1079 Number<KRepeat / KXdlPack>{},
1083 using Base::a_thread_copy_;
1084 using Base::a_thread_desc_;
1085 using Base::b_thread_copy_;
1086 using Base::b_thread_desc_;
1087 using Base::c_thread_desc_;
__host__ __device__ constexpr auto integer_divide_floor(X x, Y y)
Definition utility/math.hpp:66
__host__ __device__ constexpr auto integer_divide_ceil(X x, Y y)
Definition utility/math.hpp:72
__host__ __device__ constexpr auto make_multi_index(Xs &&... xs)
Definition array_multi_index.hpp:15
__host__ __device__ constexpr auto make_static_buffer(Number< N >)
Definition static_buffer.hpp:186
typename detail::StaticallyIndexedArrayImpl< T, N >::type StaticallyIndexedArray
Definition utility/statically_indexed_array.hpp:45
int32_t index_t
Definition ck.hpp:299
integral_constant< index_t, N > Number
Definition number.hpp:12
TailNumber
Definition blkgemmpipe_scheduler.hpp:31
@ Even
Definition blkgemmpipe_scheduler.hpp:34
@ Odd
Definition blkgemmpipe_scheduler.hpp:33
BlockGemmPipelineScheduler
Definition blkgemmpipe_scheduler.hpp:25
@ Intrawave
Definition blkgemmpipe_scheduler.hpp:26
__host__ __device__ constexpr auto make_naive_tensor_descriptor_packed(const Tuple< Lengths... > &lengths)
Definition tensor_descriptor_helper.hpp:101
__host__ __device__ constexpr auto make_tuple(Xs &&... xs)
Definition utility/tuple.hpp:211
__device__ void block_sync_lds()
Definition synchronization.hpp:16
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:33
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GetWaveIdx static __device__ auto GetWaveIdx()
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:118
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::AccType float AccType
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:36
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::HotLoopInstList ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops,(packed_size_v< ComputeTypeA > > 1||packed_size_v< ComputeTypeB > > 1)> HotLoopInstList
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:88
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::ComputeTypeA ADataType ComputeTypeA
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:34
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2 __host__ static __device__ constexpr auto MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_M_N &c_grid_desc_m_n)
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:344
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::AMmaKStride static constexpr index_t AMmaKStride
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:68
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::MWaves static constexpr index_t MWaves
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:49
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4 __host__ static __device__ constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4()
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:220
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2 __host__ static __device__ constexpr auto GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:269
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_block_desc_n0_n1_n2_n3_k static constexpr BMmaTileDesc b_block_desc_n0_n1_n2_n3_k
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:382
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2 __host__ static __device__ constexpr auto GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:297
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::Tuple5 decltype(CalculateAThreadOriginDataIndex()) Tuple5
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:184
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::I0 static constexpr auto I0
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:41
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::I1 static constexpr auto I1
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:42
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::KRepeat static constexpr index_t KRepeat
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:79
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::CalculateCThreadOriginDataIndex static __device__ auto CalculateCThreadOriginDataIndex(Number< m0 >, Number< n0 >, Number< xdlops_i >, Number< blk_i >)
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:154
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::MXdlPack static constexpr index_t MXdlPack
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:84
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4 __host__ static __device__ constexpr auto GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4()
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:283
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2 __host__ static __device__ constexpr auto MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_G_M_N &c_grid_desc_g_m_n)
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:361
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::xdlops_gemm static constexpr auto xdlops_gemm
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:60
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_block_desc_m0_m1_m2_m3_k static constexpr AMmaTileDesc a_block_desc_m0_m1_m2_m3_k
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:381
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::KThreadChunk static constexpr index_t KThreadChunk
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:74
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::KXdlPack static constexpr index_t KXdlPack
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:86
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2 __host__ static __device__ constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:234
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GetCThreadBuffer __host__ __device__ constexpr auto & GetCThreadBuffer()
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:116
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::ComputeTypeB BDataType ComputeTypeB
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:35
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::APackedSize static constexpr index_t APackedSize
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:38
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BlockwiseGemmXdlops_mx_pipeline_base __host__ __device__ BlockwiseGemmXdlops_mx_pipeline_base(Tuple5 a_origin=CalculateAThreadOriginDataIndex(), Tuple5 b_origin=CalculateBThreadOriginDataIndex())
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:204
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BMmaKStride static constexpr index_t BMmaKStride
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:69
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::NXdlPack static constexpr index_t NXdlPack
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:85
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2 __host__ static __device__ constexpr auto GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:327
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::NWaves static constexpr index_t NWaves
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:50
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::WaveSize static constexpr index_t WaveSize
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:51
ck::BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BPackedSize static constexpr index_t BPackedSize
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:39
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops,(packed_size_v< ComputeTypeA > > 1||packed_size_v< ComputeTypeB > > 1)>::A_LDS_Read_Inst_Num static constexpr index_t A_LDS_Read_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:49
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops,(packed_size_v< ComputeTypeA > > 1||packed_size_v< ComputeTypeB > > 1)>::A_LDS_Read_Width static constexpr index_t A_LDS_Read_Width
Definition blkgemmpipe_scheduler.hpp:82
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops,(packed_size_v< ComputeTypeA > > 1||packed_size_v< ComputeTypeB > > 1)>::B_LDS_Read_Inst_Num static constexpr index_t B_LDS_Read_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:51
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops,(packed_size_v< ComputeTypeA > > 1||packed_size_v< ComputeTypeB > > 1)>::C_MFMA_Inst_Num static constexpr index_t C_MFMA_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:54
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops,(packed_size_v< ComputeTypeA > > 1||packed_size_v< ComputeTypeB > > 1)>::C_MFMA_Inst_Cycle static constexpr index_t C_MFMA_Inst_Cycle
Definition blkgemmpipe_scheduler.hpp:105
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops,(packed_size_v< ComputeTypeA > > 1||packed_size_v< ComputeTypeB > > 1)>::A_Buffer_Load_Inst_Num static constexpr index_t A_Buffer_Load_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:39
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops,(packed_size_v< ComputeTypeA > > 1||packed_size_v< ComputeTypeB > > 1)>::B_Buffer_Load_Inst_Num static constexpr index_t B_Buffer_Load_Inst_Num
Definition blockwise_gemm_pipeline_xdlops.hpp:41
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops,(packed_size_v< ComputeTypeA > > 1||packed_size_v< ComputeTypeB > > 1)>::B_LDS_Read_Width static constexpr index_t B_LDS_Read_Width
Definition blkgemmpipe_scheduler.hpp:83
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_thread_copy_ BThreadCopy b_thread_copy_
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:424
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::I1 static constexpr auto I1
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:42
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::Run __device__ void Run(const AGridDesc &a_grid_desc, const ABlockDesc &a_block_desc, ABlockTransfer &a_blockwise_copy, const AGridBuffer &a_grid_buf, ABlockBuffer &a_block_buf, const ABlockTransferStep &a_block_copy_step, const BGridDesc &b_grid_desc, const BBlockDesc &b_block_desc, BBlockTransfer &b_blockwise_copy, const BGridBuffer &b_grid_buf, BBlockBuffer &b_block_buf, const BBlockTransferStep &b_block_copy_step, CThreadBuffer &c_thread_buf, const AScaleGridDesc &a_scale_grid_desc, AScaleThreadTransfer &a_scale_thread_copy, const AScaleGridBuffer &a_scale_grid_buf, const BScaleGridDesc &b_scale_grid_desc, BScaleThreadTransfer &b_scale_thread_copy, const BScaleGridBuffer &b_scale_grid_buf, index_t num_loop) const
Definition blockwise_gemm_pipeline_xdlops_mx_moe_nbs_v3.hpp:389
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::ComputeTypeA typename Base::ComputeTypeA ComputeTypeA
Definition blockwise_gemm_pipeline_xdlops_mx_moe_nbs_v3.hpp:155
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::MXdlPack static constexpr index_t MXdlPack
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:84
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_scale_thread_desc static constexpr auto a_scale_thread_desc
Definition blockwise_gemm_pipeline_xdlops_mx_moe_nbs_v3.hpp:1106
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::Tuple5 typename Base::Tuple5 Tuple5
Definition blockwise_gemm_pipeline_xdlops_mx_moe_nbs_v3.hpp:154
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::MWaves static constexpr index_t MWaves
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:49
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_block_desc_m0_m1_m2_m3_k static constexpr AMmaTileDesc a_block_desc_m0_m1_m2_m3_k
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:381
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::I0 static constexpr auto I0
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:41
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::KThreadChunk static constexpr index_t KThreadChunk
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:74
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_thread_copy_ AThreadCopy a_thread_copy_
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:423
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_block_desc_n0_n1_n2_n3_k static constexpr BMmaTileDesc b_block_desc_n0_n1_n2_n3_k
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:382
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BlockHasHotloop static __host__ constexpr bool BlockHasHotloop(index_t num_loop)
Definition blockwise_gemm_pipeline_xdlops_mx_moe_v3.hpp:183
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::scale_pack_size_b static constexpr auto scale_pack_size_b
Definition blockwise_gemm_pipeline_xdlops_mx_moe_nbs_v3.hpp:175
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_thread_desc_ static constexpr auto b_thread_desc_
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:392
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::c_thread_desc_ static constexpr auto c_thread_desc_
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:396
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::GlobalBufferNum static constexpr index_t GlobalBufferNum
Definition blockwise_gemm_pipeline_xdlops_mx_moe_nbs_v3.hpp:160
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::Base BlockwiseGemmXdlops_mx_pipeline_base< ThreadBlockSize, ADataType, BDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack > Base
Definition blockwise_gemm_pipeline_xdlops_mx_moe_nbs_v3.hpp:102
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_scale_thread_vec_size static constexpr auto b_scale_thread_vec_size
Definition blockwise_gemm_pipeline_xdlops_mx_moe_nbs_v3.hpp:181
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::ScalesPerXdlopsRunPerThread static constexpr auto ScalesPerXdlopsRunPerThread
Definition blockwise_gemm_pipeline_xdlops_mx_moe_nbs_v3.hpp:170
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::KXdlPack static constexpr index_t KXdlPack
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:86
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::PrefillStages static constexpr index_t PrefillStages
Definition blockwise_gemm_pipeline_xdlops_mx_moe_nbs_v3.hpp:159
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::b_scale_thread_desc static constexpr auto b_scale_thread_desc
Definition blockwise_gemm_pipeline_xdlops_mx_moe_nbs_v3.hpp:1113
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::NXdlPack static constexpr index_t NXdlPack
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:85
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::ScalesPerXdlopsRun static constexpr auto ScalesPerXdlopsRun
Definition blockwise_gemm_pipeline_xdlops_mx_moe_nbs_v3.hpp:166
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::KRepeat static constexpr index_t KRepeat
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:79
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::AccType typename Base::AccType AccType
Definition blockwise_gemm_pipeline_xdlops_mx_moe_nbs_v3.hpp:153
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::PrefetchStages static constexpr index_t PrefetchStages
Definition blockwise_gemm_pipeline_xdlops_mx_moe_nbs_v3.hpp:158
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::xdlops_gemm static constexpr auto xdlops_gemm
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:60
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_thread_desc_ static constexpr auto a_thread_desc_
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:388
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::a_scale_thread_vec_size static constexpr auto a_scale_thread_vec_size
Definition blockwise_gemm_pipeline_xdlops_mx_moe_nbs_v3.hpp:180
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::APackedSize static constexpr index_t APackedSize
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:38
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::ComputeTypeB typename Base::ComputeTypeB ComputeTypeB
Definition blockwise_gemm_pipeline_xdlops_mx_moe_nbs_v3.hpp:156
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::HotLoopScheduler static __device__ constexpr auto HotLoopScheduler()
Definition blockwise_gemm_pipeline_xdlops_mx_moe_v3.hpp:193
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::ScalesPerKBlockSize static constexpr auto ScalesPerKBlockSize
Definition blockwise_gemm_pipeline_xdlops_mx_moe_nbs_v3.hpp:162
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::Run __device__ void Run(const AGridDesc &a_grid_desc, const ABlockDesc &a_block_desc, ABlockTransfer &a_blockwise_copy, const AGridBuffer &a_grid_buf, ABlockBuffer &a_block_bufs, const ABlockTransferStep &a_block_copy_step, const BGridDesc &b_grid_desc, const BBlockDesc &b_block_desc, BBlockTransfer &b_blockwise_copy, const BGridBuffer &b_grid_buf, BBlockBuffer &b_block_bufs, const BBlockTransferStep &b_block_copy_step, CThreadBuffer &c_thread_buf, const AScaleGridDesc &a_scale_grid_desc, AScaleThreadTransfer &a_scale_thread_copy, const AScaleGridBuffer &a_scale_grid_buf, const BScaleGridDesc &b_scale_grid_desc, BScaleThreadTransfer &b_scale_thread_copy, const BScaleGridBuffer &b_scale_grid_buf, index_t num_loop) const
Definition blockwise_gemm_pipeline_xdlops_mx_moe_v3.hpp:367
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::scale_pack_size_a static constexpr auto scale_pack_size_a
Definition blockwise_gemm_pipeline_xdlops_mx_moe_nbs_v3.hpp:174
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BPackedSize static constexpr index_t BPackedSize
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:39
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::mx_scale_t e8m0_bexp_t mx_scale_t
Definition blockwise_gemm_pipeline_xdlops_mx_moe_nbs_v3.hpp:173
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::NWaves static constexpr index_t NWaves
Definition blockwise_gemm_mx_pipeline_xdlops_base.hpp:50
ck::BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >::BlockLoopTailNum static __host__ constexpr TailNumber BlockLoopTailNum(index_t num_loop)
Definition blockwise_gemm_pipeline_xdlops_mx_moe_v3.hpp:188
Definition blockwise_gemm_pipeline_xdlops_mx_moe_nbs_v3.hpp:38
Unsigned representation of a conventional biased Float32 exponent.
Definition utility/e8m0.hpp:26
Definition functional2.hpp:33
Definition dtype_vector.hpp:10