gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp Source File

gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp Source File#

Composable Kernel: gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp Source File
gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: MIT
2// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
3
4#pragma once
5
17
18namespace ck {
19
20template <typename A0B0B1DataType, // FIXME: don't assume A0/B0/B1 have same datatype
21 typename Acc0DataType,
22 typename D0sDataType,
23 typename Acc1DataType,
24 typename C1ShuffleDataType,
25 typename D1sDataType,
26 typename E1DataType,
27 typename A0ElementwiseOperation,
28 typename B0ElementwiseOperation,
29 typename CDE0ElementwiseOperation,
30 typename B1ElementwiseOperation,
31 typename CDE1ElementwiseOperation,
32 InMemoryDataOperationEnum E1GlobalMemoryDataOperation,
33 typename A0GridDesc_M_K,
34 typename B0GridDesc_N_K,
35 typename D0sGridDesc_M_N,
36 typename B1GridDesc_N_K,
37 typename D1sGridDesc_M_N,
38 typename E1GridDesc_M_N,
39 index_t NumGemm0KPrefetchStage,
40 index_t BlockSize,
41 index_t Gemm0MPerBlock,
42 index_t Gemm0NPerBlock,
43 index_t Gemm0KPerBlock,
44 index_t Gemm1NPerBlock,
45 index_t Gemm1KPerBlock,
46 index_t A0K1Value,
47 index_t B0K1Value,
48 index_t B1K1Value,
49 index_t Gemm0MPerXdl,
50 index_t Gemm0NPerXdl,
51 index_t Gemm0MXdlPerWave,
52 index_t Gemm0NXdlPerWave,
53 index_t Gemm1NXdlPerWave,
54 typename A0BlockTransferThreadClusterLengths_AK0_M_AK1,
55 typename A0BlockTransferThreadClusterArrangeOrder,
56 typename A0BlockTransferSrcAccessOrder,
57 index_t A0BlockTransferSrcVectorDim,
58 index_t A0BlockTransferSrcScalarPerVector,
59 index_t A0BlockTransferDstScalarPerVector_AK1,
60 bool A0ThreadTransferSrcResetCoordinateAfterRun, // ignored
61 index_t A0BlockLdsExtraM,
62 typename B0BlockTransferThreadClusterLengths_BK0_N_BK1,
63 typename B0BlockTransferThreadClusterArrangeOrder,
64 typename B0BlockTransferSrcAccessOrder,
65 index_t B0BlockTransferSrcVectorDim,
66 index_t B0BlockTransferSrcScalarPerVector,
67 index_t B0BlockTransferDstScalarPerVector_BK1,
68 bool B0ThreadTransferSrcResetCoordinateAfterRun, // ignored
69 index_t B0BlockLdsExtraN,
70 index_t CDE0BlockTransferSrcVectorDim,
71 index_t CDE0BlockTransferSrcScalarPerVector,
72 typename B1BlockTransferThreadClusterLengths_BK0_N_BK1,
73 typename B1BlockTransferThreadClusterArrangeOrder,
74 typename B1BlockTransferSrcAccessOrder,
75 index_t B1BlockTransferSrcVectorDim,
76 index_t B1BlockTransferSrcScalarPerVector,
77 index_t B1BlockTransferDstScalarPerVector_BK1,
78 bool B1ThreadTransferSrcResetCoordinateAfterRun,
79 index_t B1BlockLdsExtraN,
80 index_t C1ShuffleGemm0MXdlPerWavePerShuffle,
81 index_t C1ShuffleGemm0NXdlPerWavePerShuffle,
82 typename CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
83 index_t CDE1ShuffleBlockTransferScalarPerVector_NPerBlock,
84 LoopScheduler LoopSched>
86{
87 static_assert(LoopSched == LoopScheduler::Default,
88 "Non-default loop scheduler is currently not supported");
89
90 static constexpr index_t NumD0Tensor = D0sDataType::Size();
91 static constexpr index_t NumD1Tensor = D1sDataType::Size();
92
93 static constexpr auto I0 = Number<0>{};
94 static constexpr auto I1 = Number<1>{};
95 static constexpr auto I2 = Number<2>{};
96 static constexpr auto I3 = Number<3>{};
97 static constexpr auto I4 = Number<4>{};
98 static constexpr auto I5 = Number<5>{};
99 static constexpr auto I6 = Number<6>{};
100 static constexpr auto I7 = Number<7>{};
101
102 // K1 should be Number<...>
103 // Gemm0
104 static constexpr auto A0K1 = Number<A0K1Value>{};
105 static constexpr auto B0K1 = Number<B0K1Value>{};
106
107 static constexpr auto A0K0PerBlock = Number<Gemm0KPerBlock / A0K1Value>{};
108 static constexpr auto B0K0PerBlock = Number<Gemm0KPerBlock / B0K1Value>{};
109
110 static constexpr auto Gemm0MWaves = Gemm0MPerBlock / (Gemm0MPerXdl * Gemm0MXdlPerWave);
111 static constexpr auto Gemm0NWaves = Gemm0NPerBlock / (Gemm0NPerXdl * Gemm0NXdlPerWave);
112 static constexpr auto WaveSize = BlockSize / (Gemm0MWaves * Gemm0NWaves);
113 // Gemm1
114 static constexpr auto B1K1 = Number<B1K1Value>{};
115 static constexpr auto B1K0PerBlock = Number<Gemm1KPerBlock / B1K1Value>{};
116
118
120
121 // ck::Tuple<const D0DataType1*, const D0DataType2*, ...>
122 static constexpr auto MakeD0sGridPointer()
123 {
124 return generate_tuple(
125 [&](auto i) {
126 using D0DataType = remove_cvref_t<tuple_element_t<i.value, D0sDataType>>;
127
128 return static_cast<const D0DataType*>(nullptr);
129 },
131 }
132
133 // ck::Tuple<const D1DataType1*, const D1DataType2*, ...>
134 static constexpr auto MakeD1sGridPointer()
135 {
136 return generate_tuple(
137 [&](auto i) {
138 using D1DataType = remove_cvref_t<tuple_element_t<i.value, D1sDataType>>;
139
140 return static_cast<const D1DataType*>(nullptr);
141 },
143 }
144
145 __device__ static auto GetGemm0WaveIdx()
146 {
147 const index_t thread_id = get_thread_local_1d_id();
148
149 constexpr auto threadid_to_wave_idx_adaptor = make_single_stage_tensor_adaptor(
153
154 return threadid_to_wave_idx_adaptor.CalculateBottomIndex(make_multi_index(thread_id));
155 }
156
157 __device__ static auto GetGemm0WaveMNIdx(const index_t thread_id)
158 {
159 constexpr auto wave_threadid_to_mn_idx_adaptor = make_single_stage_tensor_adaptor(
160 make_tuple(make_merge_transform(make_tuple(WaveSize / Gemm0NPerXdl, Gemm0NPerXdl))),
163
164 return wave_threadid_to_mn_idx_adaptor.CalculateBottomIndex(make_multi_index(thread_id));
165 }
166
167 template <typename A0BlockDesc_AK0_M_AK1>
168 __host__ __device__ static constexpr auto
169 MakeGemm0AMmaTileDescriptor_M0_M1_M2_K(const A0BlockDesc_AK0_M_AK1&)
170 {
171 constexpr index_t MWaves = Gemm0MPerBlock / (Gemm0MXdlPerWave * Gemm0MPerXdl);
172
173 return MakeGemmMmaTileDescriptor_MN0_MN1_MN2_K<Gemm0MXdlPerWave, MWaves, Gemm0MPerXdl>(
174 A0BlockDesc_AK0_M_AK1{});
175 }
176
177 template <typename BBlockDesc_BK0_N_BK1>
178 __host__ __device__ static constexpr auto
179 MakeGemm0BMmaTileDescriptor_N0_N1_N2_K(const BBlockDesc_BK0_N_BK1&)
180 {
181 constexpr index_t NWaves = Gemm0NPerBlock / (Gemm0NXdlPerWave * Gemm0NPerXdl);
182
183 return MakeGemmMmaTileDescriptor_MN0_MN1_MN2_K<Gemm0NXdlPerWave, NWaves, Gemm0NPerXdl>(
184 BBlockDesc_BK0_N_BK1{});
185 }
186
187 template <typename A0BlockDesc_AK0_M_AK1>
188 __host__ __device__ static constexpr auto
189 MakeGemm1AMmaTileDescriptor_M0_M1_M2_K(const A0BlockDesc_AK0_M_AK1&)
190 {
191 return MakeGemmMmaTileDescriptor_MN0_MN1_MN2_K<Gemm0MXdlPerWave, 1, 1>(
192 A0BlockDesc_AK0_M_AK1{});
193 }
194
195 template <typename BBlockDesc_BK0_N_BK1>
196 __host__ __device__ static constexpr auto
197 MakeGemm1BMmaTileDescriptor_N0_N1_N2_K(const BBlockDesc_BK0_N_BK1&)
198 {
199 constexpr index_t Gemm1NWaves = Gemm1NPerBlock / (Gemm1NXdlPerWave * Gemm0NPerXdl);
200 return MakeGemmMmaTileDescriptor_MN0_MN1_MN2_K<Gemm1NXdlPerWave, Gemm1NWaves, Gemm0NPerXdl>(
201 BBlockDesc_BK0_N_BK1{});
202 }
203
204 __host__ __device__ static constexpr auto GetA0BlockDescriptor_AK0PerBlock_MPerBlock_AK1()
205 {
206 // A0 matrix in LDS memory, dst of blockwise copy
210 }
211
212 __host__ __device__ static constexpr auto GetB0BlockDescriptor_BK0PerBlock_NPerBlock_BK1()
213 {
214 // B0 matrix in LDS memory, dst of blockwise copy
218 }
219
220 __host__ __device__ static constexpr auto GetB1BlockDescriptor_BK0PerBlock_NPerBlock_BK1()
221 {
222 // B1 matrix in LDS memory, dst of blockwise copy
226 }
227
228 __host__ __device__ static constexpr auto
230 {
231 constexpr index_t MWave = Gemm0MPerBlock / (Gemm0MXdlPerWave * Gemm0MPerXdl);
232 constexpr index_t NWave = Gemm1NPerBlock / (Gemm1NXdlPerWave * Gemm0NPerXdl);
233
234 constexpr auto c1_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
238 I1,
240
241 return c1_shuffle_block_desc_mblock_mperblock_nblock_nperblock;
242 }
243
244 __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
245 {
248 sizeof(A0B0B1DataType);
249 const index_t gemm1_bytes_end =
251 sizeof(A0B0B1DataType);
252 const index_t c1_block_bytes_end =
253 SharedMemTrait::c1_block_space_size * sizeof(C1ShuffleDataType);
254
255 return math::max(gemm0_bytes_end, gemm1_bytes_end, c1_block_bytes_end);
256 }
257
258 template <
259 InMemoryDataOperationEnum CGlobalMemoryDataOperation_ = InMemoryDataOperationEnum::Set>
260 __device__ static bool constexpr IsValidCompilationParameter()
261 {
262 return ck::tensor_operation::device::IsValidGemmCompilationParameter<
263 BlockSize,
264 Gemm0MPerBlock,
265 Gemm0NPerBlock,
266 Gemm0MPerXdl,
267 Gemm0NPerXdl,
268 Gemm0MXdlPerWave,
269 Gemm0NXdlPerWave,
270 E1DataType,
271 CGlobalMemoryDataOperation_>() &&
272 ck::tensor_operation::device::IsValidGemmCompilationParameter<
273 BlockSize,
274 Gemm0MPerBlock,
275 Gemm1NPerBlock,
276 Gemm0MPerXdl,
277 Gemm0NPerXdl,
278 Gemm0MXdlPerWave,
279 Gemm1NXdlPerWave,
280 E1DataType,
281 CGlobalMemoryDataOperation_>();
282 }
283
284 // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
285 template <typename Block2E1TileMap>
286 __host__ static constexpr bool CheckValidity(const A0GridDesc_M_K& a0_grid_desc_m_k,
287 const B0GridDesc_N_K& b0_grid_desc_n_k,
288 const B1GridDesc_N_K& b1_grid_desc_n_k,
289 const E1GridDesc_M_N& e1_grid_desc_m_n,
290 const Block2E1TileMap& block_2_e1tile_map)
291 {
292 static_assert((Gemm0MPerBlock % (Gemm0MPerXdl * Gemm0MXdlPerWave) == 0) &&
293 (Gemm0NPerBlock % (Gemm0NXdlPerWave * Gemm0NPerXdl)) == 0,
294 "Invalid tuning param!");
295 if constexpr((Gemm0MPerXdl * Gemm0MXdlPerWave) == 0 ||
296 (Gemm0NXdlPerWave * Gemm0NPerXdl) == 0)
297 {
298 return false;
299 }
300 else
301 {
302 if constexpr((Gemm0MPerBlock % (Gemm0MPerXdl * Gemm0MXdlPerWave) != 0) ||
303 (Gemm0NPerBlock % (Gemm0NXdlPerWave * Gemm0NPerXdl) != 0))
304 {
305 return false;
306 }
307 else
308 {
309 if(WaveSize != get_warp_size())
310 {
311 return false;
312 }
313 }
314 }
315
316 const auto M = a0_grid_desc_m_k.GetLength(I0);
317 const auto N = b0_grid_desc_n_k.GetLength(I0);
318 const auto K = a0_grid_desc_m_k.GetLength(I1);
319 const auto Gemm1N = b1_grid_desc_n_k.GetLength(I0);
320
321 if(!(M == e1_grid_desc_m_n.GetLength(I0) && Gemm1N == e1_grid_desc_m_n.GetLength(I1)))
322 {
323 return false;
324 }
325
326 if(!(M % Gemm0MPerBlock == 0 && N % Gemm0NPerBlock == 0 && K % Gemm0KPerBlock == 0 &&
327 Gemm1N % Gemm1NPerBlock == 0))
328 {
329 return false;
330 }
331
332 // check gemm0 gridwise gemm pipeline
333 const auto num_gemm0_k_loop = K / Gemm0KPerBlock;
334 if(!GridwiseGemmPipe::IsSupported(num_gemm0_k_loop))
335 {
336 return false;
337 }
338
339 // check gemm1 gridwise gemm pipeline
340 if(!(Gemm0NPerBlock % Gemm1KPerBlock == 0))
341 {
342 return false;
343 }
344
345 const auto num_gemm1_k_inner_loop = Gemm0NPerBlock / Gemm1KPerBlock;
346 if(!GridwiseGemmPipe::IsSupported(num_gemm1_k_inner_loop))
347 {
348 return false;
349 }
350
351 if(!block_2_e1tile_map.CheckValidity(e1_grid_desc_m_n))
352 {
353 return false;
354 }
355
356 // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
357 return true;
358 }
359
360 __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K)
361 {
362 const index_t num_loop = K / Gemm0KPerBlock;
363
364 return GridwiseGemmPipe::CalculateHasMainLoop(num_loop);
365 }
366
367 // A0 desc for source in blockwise copy
368 __host__ __device__ static constexpr auto
369 MakeDefaultA0GridDescriptor_AK0_M_AK1(const A0GridDesc_M_K& a0_grid_desc_m_k)
370 {
371 const auto M = a0_grid_desc_m_k.GetLength(I0);
372 const auto K = a0_grid_desc_m_k.GetLength(I1);
373
374 const auto A0K0 = K / A0K1;
375
377 a0_grid_desc_m_k,
382 }
383
384 // B0 desc for source in blockwise copy
385 __host__ __device__ static constexpr auto
386 MakeDefaultB0GridDescriptor_BK0_N_BK1(const B0GridDesc_N_K& b0_grid_desc_n_k)
387 {
388 const auto N = b0_grid_desc_n_k.GetLength(I0);
389 const auto K = b0_grid_desc_n_k.GetLength(I1);
390
391 const auto B0K0 = K / B0K1;
392
394 b0_grid_desc_n_k,
399 }
400
401 // D0 desc for source in blockwise copy
402 template <typename D0GridDesc_M_N>
403 __host__ __device__ static constexpr auto
404 MakeGemm0D0GridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5(const D0GridDesc_M_N& d0_grid_desc_m_n)
405 {
406 const auto M = d0_grid_desc_m_n.GetLength(I0);
407 const auto N = d0_grid_desc_m_n.GetLength(I1);
408
409 constexpr auto lcm_A0K1_B0K1 = math::lcm(A0K1, B0K1);
410 constexpr bool is_single_rate_mfma =
412 lcm_A0K1_B0K1 <= 4) ||
413 (is_same<A0B0B1DataType, int8_t>::value && lcm_A0K1_B0K1 <= 8) ||
415 lcm_A0K1_B0K1 < 32))
416 ? true
417 : false;
418 constexpr auto is_scale_mfma = false;
419 constexpr auto mfma = MfmaSelector<A0B0B1DataType,
420 Gemm0MPerXdl,
421 Gemm0NPerXdl,
422 A0B0B1DataType,
423 is_single_rate_mfma,
424 is_scale_mfma>::selected_mfma;
425 constexpr auto N3 = mfma.num_groups_per_blk;
426 constexpr auto N5 = mfma.group_size;
428 d0_grid_desc_m_n,
430 M / Gemm0MPerBlock, Gemm0MXdlPerWave, Gemm0MWaves, Gemm0MPerXdl)),
431 make_unmerge_transform(make_tuple(N / Gemm0NPerBlock,
432 Gemm0NXdlPerWave,
434 N3,
435 WaveSize / Gemm0NPerXdl,
436 N5))),
439 }
440
441 // B1 desc for source in blockwise copy
442 __host__ __device__ static constexpr auto
443 MakeDefaultB1GridDescriptor_BK0_N_BK1(const B1GridDesc_N_K& b1_grid_desc_n_k)
444 {
445 const auto N = b1_grid_desc_n_k.GetLength(I0);
446 const auto K = b1_grid_desc_n_k.GetLength(I1);
447
448 const auto B1K0 = K / B1K1;
449
451 b1_grid_desc_n_k,
456 }
457
458 // C1 desc for destination in blockwise copy
459 __host__ __device__ static constexpr auto
460 MakeE1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const E1GridDesc_M_N& e1_grid_desc_m_n)
461 {
462 const auto M = e1_grid_desc_m_n.GetLength(I0);
463 const auto N = e1_grid_desc_m_n.GetLength(I1);
464
465 const auto MBlock = M / Gemm0MPerBlock;
466 const auto NBlock = N / Gemm1NPerBlock;
467
468 const auto e1_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor(
469 e1_grid_desc_m_n,
474
475 return e1_grid_desc_mblock_mperblock_nblock_nperblock;
476 }
477 // D0s desc for source in blockwise copy
478 __host__ __device__ static constexpr auto
479 MakeD0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5(const D0sGridDesc_M_N& ds_grid_desc_m_n)
480 {
481 return generate_tuple(
482 [&](auto i) {
484 },
486 }
487 // Ds desc for source in blockwise copy
488 template <typename DsGridDescriptor_M_N>
489 __host__ __device__ static constexpr auto
491 const DsGridDescriptor_M_N& ds_grid_desc_m_n)
492 {
493 return generate_tuple(
494 [&](auto i) {
496 },
498 }
499
500 // return block_id to C1 matrix tile idx (m0, n0) mapping
501 __host__ __device__ static constexpr auto
502 MakeDefaultBlock2E1TileMap(const E1GridDesc_M_N& e1_grid_desc_m_n)
503 {
505 e1_grid_desc_m_n);
506 }
507
510 E1GridDesc_M_N{}))>;
511
514 D0sGridDesc_M_N{}))>;
515
518 D1sGridDesc_M_N{}))>;
519
521 remove_cvref_t<decltype(MakeDefaultBlock2E1TileMap(E1GridDesc_M_N{}))>;
522
524 {
525 // LDS allocation for A0 and B0: be careful of alignment
526 static constexpr auto a0_block_desc_ak0_m_ak1 =
528 static constexpr auto b0_block_desc_bk0_n_bk1 =
530 static constexpr auto b1_block_desc_bk0_n_bk1 =
532
533 static constexpr auto max_lds_align = math::lcm(math::lcm(A0K1, B0K1), B1K1);
534
536 a0_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
538 b0_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align);
540 b1_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align);
541
542 static constexpr auto a0_block_space_offset = 0;
544 static constexpr auto b1_block_space_offset = 0;
545
546 // LDS allocation for C1 shuffle in LDS
549 static constexpr auto c1_block_space_size =
551 };
552
555
556 template <bool HasMainKBlockLoop,
557 typename A0GridDesc_AK0_M_AK1,
558 typename B0GridDesc_BK0_N_BK1,
559 typename B1GridDesc_BK0_N_BK1,
560 typename Block2E1TileMap>
561 __device__ static void Run(const A0B0B1DataType* __restrict__ p_a0_grid,
562 const A0B0B1DataType* __restrict__ p_b0_grid,
563 D0sGridPointer p_d0s_grid,
564 const A0B0B1DataType* __restrict__ p_b1_grid,
565 D1sGridPointer p_d1s_grid,
566 E1DataType* __restrict__ p_e1_grid,
567 void* __restrict__ p_shared,
568 const A0ElementwiseOperation& a0_element_op,
569 const B0ElementwiseOperation& b0_element_op,
570 const CDE0ElementwiseOperation& cde0_element_op,
571 const B1ElementwiseOperation& b1_element_op,
572 const CDE1ElementwiseOperation& cde1_element_op,
573 const A0GridDesc_AK0_M_AK1& a0_grid_desc_ak0_m_ak1,
574 const B0GridDesc_BK0_N_BK1& b0_grid_desc_bk0_n_bk1,
575 const D0sGridDesc_M_N& d0s_griddesc_m_n,
576 const B1GridDesc_BK0_N_BK1& b1_grid_desc_bk0_n_bk1,
578 d1s_grid_desc_mblock_mperblock_nblock_nperblock,
580 e1_grid_desc_mblock_mperblock_nblock_nperblock,
581 const Block2E1TileMap& block_2_e1tile_map)
582 {
583 const auto d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5 =
585 const auto a0_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
586 p_a0_grid, a0_grid_desc_ak0_m_ak1.GetElementSpaceSize());
587 const auto b0_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
588 p_b0_grid, b0_grid_desc_bk0_n_bk1.GetElementSpaceSize());
589 const auto b1_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
590 p_b1_grid, b1_grid_desc_bk0_n_bk1.GetElementSpaceSize());
592 p_e1_grid, e1_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
593 const auto d0s_grid_buf = generate_tuple(
594 [&](auto i) {
596 p_d0s_grid[i],
597 d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5[i].GetElementSpaceSize());
598 },
600 const auto d1s_grid_buf = generate_tuple(
601 [&](auto i) {
603 p_d1s_grid[i],
604 d1s_grid_desc_mblock_mperblock_nblock_nperblock[i].GetElementSpaceSize());
605 },
607
608 // divide block work by [M, N]
609 const auto block_work_idx =
610 block_2_e1tile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id()));
611
612 if(!block_2_e1tile_map.ValidCTileIndex(
613 block_work_idx,
614 make_tuple(e1_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0),
615 e1_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2))))
616 {
617 return;
618 }
619
620 // HACK: this force m/n_block_data_idx_on_grid into SGPR
621 const index_t m_block_data_idx_on_grid =
622 __builtin_amdgcn_readfirstlane(block_work_idx[I0] * Gemm0MPerBlock);
623
624 const index_t n_block_data_idx_on_grid =
625 __builtin_amdgcn_readfirstlane(block_work_idx[I1] * Gemm1NPerBlock);
626
627 // A0 matrix in LDS memory, dst of blockwise copy
628 constexpr auto a0_block_desc_ak0_m_ak1 = GetA0BlockDescriptor_AK0PerBlock_MPerBlock_AK1();
629
630 // B0 matrix in LDS memory, dst of blockwise copy
631 constexpr auto b0_block_desc_bk0_n_bk1 = GetB0BlockDescriptor_BK0PerBlock_NPerBlock_BK1();
632
633 //
634 // set up Gemm0
635 //
636
637 // A0 matrix blockwise copy
638 auto a0_blockwise_copy =
640 A0ElementwiseOperation,
644 A0BlockTransferThreadClusterLengths_AK0_M_AK1,
645 A0BlockTransferThreadClusterArrangeOrder,
646 A0B0B1DataType,
647 A0B0B1DataType,
648 decltype(a0_grid_desc_ak0_m_ak1),
649 decltype(a0_block_desc_ak0_m_ak1),
650 A0BlockTransferSrcAccessOrder,
652 A0BlockTransferSrcVectorDim,
653 2,
654 A0BlockTransferSrcScalarPerVector,
655 A0BlockTransferDstScalarPerVector_AK1,
656 1,
657 1,
658 true, // SrcResetCoord
659 true, // DstResetCoord
660 NumGemm0KPrefetchStage>(
661 a0_grid_desc_ak0_m_ak1,
662 make_multi_index(0, m_block_data_idx_on_grid, 0),
663 a0_element_op,
664 a0_block_desc_ak0_m_ak1,
665 make_multi_index(0, 0, 0),
667
668 // B0 matrix blockwise copy
669 auto b0_blockwise_copy =
671 B0ElementwiseOperation,
675 B0BlockTransferThreadClusterLengths_BK0_N_BK1,
676 B0BlockTransferThreadClusterArrangeOrder,
677 A0B0B1DataType,
678 A0B0B1DataType,
679 decltype(b0_grid_desc_bk0_n_bk1),
680 decltype(b0_block_desc_bk0_n_bk1),
681 B0BlockTransferSrcAccessOrder,
683 B0BlockTransferSrcVectorDim,
684 2,
685 B0BlockTransferSrcScalarPerVector,
686 B0BlockTransferDstScalarPerVector_BK1,
687 1,
688 1,
689 true, // SrcResetCoord
690 true, // DstResetCoord
691 NumGemm0KPrefetchStage>(
692 b0_grid_desc_bk0_n_bk1,
693 make_multi_index(0, 0, 0), // will loop over GemmN dimension
694 b0_element_op,
695 b0_block_desc_bk0_n_bk1,
696 make_multi_index(0, 0, 0),
698
699 // Fused Gemm+Gemm pipeline
700 // for n in N0:
701 // for k in K0:
702 // acc[m][n] += A[m][k] * B0[k][n]
703 // acc1[m][o] += acc[m][n] * B1[n][o]
704
705 // sanity check
706 constexpr auto lcm_A0K1_B0K1 = math::lcm(A0K1, B0K1);
707 constexpr bool is_single_rate_mfma =
709 lcm_A0K1_B0K1 <= 4) ||
710 (is_same<A0B0B1DataType, int8_t>::value && lcm_A0K1_B0K1 <= 8) ||
712 lcm_A0K1_B0K1 < 32))
713 ? true
714 : false;
715 constexpr auto is_scale_mfma = false;
716 constexpr index_t KPack = math::max(lcm_A0K1_B0K1,
717 MfmaSelector<A0B0B1DataType,
718 Gemm0MPerXdl,
719 Gemm0NPerXdl,
720 A0B0B1DataType,
721 is_single_rate_mfma,
722 is_scale_mfma>::selected_mfma.k_per_blk);
723
724 auto blockwise_gemm0 = BlockwiseGemmXdlops_v2<
725 BlockSize,
726 A0B0B1DataType,
727 Acc0DataType,
728 decltype(a0_block_desc_ak0_m_ak1),
729 decltype(b0_block_desc_bk0_n_bk1),
730 decltype(MakeGemm0AMmaTileDescriptor_M0_M1_M2_K(a0_block_desc_ak0_m_ak1)),
731 decltype(MakeGemm0BMmaTileDescriptor_N0_N1_N2_K(b0_block_desc_bk0_n_bk1)),
732 Gemm0MPerBlock,
733 Gemm0NPerBlock,
734 Gemm0KPerBlock,
735 Gemm0MPerXdl,
736 Gemm0NPerXdl,
737 Gemm0MXdlPerWave,
738 Gemm0NXdlPerWave,
739 KPack,
740 true>{}; // TransposeC
741
742 auto acc0_thread_buf = blockwise_gemm0.GetCThreadBuffer();
743
744 // LDS allocation for A0 and B0: be careful of alignment
746 static_cast<A0B0B1DataType*>(p_shared) + SharedMemTrait::a0_block_space_offset,
747 a0_block_desc_ak0_m_ak1.GetElementSpaceSize());
748
750 static_cast<A0B0B1DataType*>(p_shared) + SharedMemTrait::b0_block_space_offset,
751 b0_block_desc_bk0_n_bk1.GetElementSpaceSize());
752
753 constexpr auto a0_block_slice_copy_step = make_multi_index(Gemm0KPerBlock / A0K1, 0, 0);
754 constexpr auto b0_block_slice_copy_step = make_multi_index(Gemm0KPerBlock / B0K1, 0, 0);
755 const auto a0_block_reset_copy_step =
756 make_multi_index(-a0_grid_desc_ak0_m_ak1.GetLength(I0), 0, 0);
757 const auto b0_block_reset_copy_step =
758 make_multi_index(-b0_grid_desc_bk0_n_bk1.GetLength(I0), Gemm0NPerBlock, 0);
759
760 // gridwise GEMM pipeline
761 // Only supports LoopScheduler::Default
762 const auto gridwise_gemm0_pipeline =
764
765 const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(
766 (a0_grid_desc_ak0_m_ak1.GetLength(I0) * a0_grid_desc_ak0_m_ak1.GetLength(I2)) /
767 Gemm0KPerBlock);
768
769 //
770 // set up Gemm1
771 //
772
773 // Acc0 matrix threadwise copy: AccVGPR to VGPR and downcast to XDL input data type
774 constexpr auto acc0_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4 =
775 blockwise_gemm0.GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4();
776
777 constexpr auto m0 = acc0_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I0);
778 constexpr auto n0 = acc0_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I1);
779 constexpr auto m1 = acc0_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I2);
780 constexpr auto n1 = acc0_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I3);
781 constexpr auto m2 = acc0_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I4);
782 constexpr auto n2 = acc0_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I5);
783 constexpr auto n3 = acc0_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I6);
784 constexpr auto n4 = acc0_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I7);
785
786 constexpr auto b1_block_slice_copy_step = make_multi_index(Gemm1KPerBlock / B1K1, 0, 0);
787
788 // d0 matrix threadwise copy
789 constexpr auto d0_thread_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5 =
791 I1, // NBlockID
792 m0, // MRepeat
793 n0, // NRepeat
794 m1, // MWaveId
795 n1, // NWaveId
796 m2, // MPerXdl
797 n2, // NGroupNum
798 n3, // NInputNum
799 n4)); // registerNum
800
801 auto d0s_thread_buf = generate_tuple(
802 [&](auto) {
803 return StaticBuffer<
805 A0B0B1DataType,
806 d0_thread_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5.GetElementSpaceSize(),
807 true>{};
808 },
810
811 const auto wave_id = GetGemm0WaveIdx();
812 const auto wave_m_n_id = GetGemm0WaveMNIdx(wave_id[I2]); // I2: 0~63
813
814 static_assert(CDE0BlockTransferSrcScalarPerVector <= n4,
815 "vector load must be not greater than n4");
816 static_assert(n4 % CDE0BlockTransferSrcScalarPerVector == 0);
817
818 auto d0s_threadwise_copy = generate_tuple(
819 [&](auto i) {
821 A0B0B1DataType,
822 A0B0B1DataType,
823 decltype(d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5[i]),
824 decltype(d0_thread_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5),
825 Sequence<I1, // MBlockId
826 I1, // NBlockID
827 m0, // MRepeat
828 n0, // NRepeat
829 m1, // MWaveId
830 n1, // NWaveId
831 m2, // MPerXdl
832 n2, // NGroupNum
833 n3, // NInputNum
834 n4>,
836 9, // CDE0BlockTransferSrcVectorDim
837 CDE0BlockTransferSrcScalarPerVector,
838 1,
839 false>(d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5[i],
840 make_multi_index(block_work_idx[I0], // MBlockId
841 0, // NBlockId
842 0, // mrepeat
843 0, // nrepeat
844 wave_id[I0], // MWaveId
845 wave_id[I1], // NWaveId
846 wave_m_n_id[I1], // MPerXdl
847 0, // group
848 wave_m_n_id[I0], // NInputIndex
849 0)); // register number
850 },
852 // acc0_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4 to acc0_thread_desc_k0_m_k1
853 // n0_n1_n2_n3 -> k0
854 // m0_m1_m2 -> m
855 // n4 -> k1
856 // NOTE: had to use merge_v3 or will spit out compilation errors
857 constexpr auto acc0_thread_desc_k0_m_k1 = transform_tensor_descriptor(
858 acc0_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4,
864
865 // A1 matrix in AccVGPR
866 // N2 num_groups_per_blk, N3 num_input_blks, N4 group_size
867 constexpr auto Acc0N3 =
868 blockwise_gemm0.GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4().GetLength(I6);
869
870 constexpr auto A1ThreadSlice_K0_M_K1 = make_tuple(
872
873 constexpr auto A1ThreadSliceK0 = A1ThreadSlice_K0_M_K1[I0];
874 constexpr auto A1ThreadSliceM = A1ThreadSlice_K0_M_K1[I1];
875 constexpr auto A1ThreadSliceK1 = A1ThreadSlice_K0_M_K1[I2];
876#if defined(__gfx11__)
877 constexpr auto a1_thread_desc_k0_m_k1 = make_naive_tensor_descriptor_packed(
878 make_tuple(A1ThreadSliceK0, A1ThreadSliceM, Number<A1ThreadSliceK1 * 2>{}));
880 Acc0DataType,
881 A0B0B1DataType,
882 decltype(acc0_thread_desc_k0_m_k1),
883 decltype(a1_thread_desc_k0_m_k1),
887 2,
888 n4,
889 0x76543210,
890 0xfedcba98,
891 true>{make_tuple(0, 0, 0)};
892#else
893 constexpr auto a1_thread_desc_k0_m_k1 = make_naive_tensor_descriptor(
894 A1ThreadSlice_K0_M_K1,
895 make_tuple(A1ThreadSliceM * A1ThreadSliceK1, A1ThreadSliceK1, I1));
896
897 // A1 matrix blockwise copy
898 auto a1_blockwise_copy = ThreadwiseTensorSliceTransfer_StaticToStatic<
899 Acc0DataType,
900 A0B0B1DataType,
901 decltype(acc0_thread_desc_k0_m_k1),
902 decltype(a1_thread_desc_k0_m_k1),
906 2,
908#endif
909
910 // B1 matrix in LDS memory, dst of blockwise copy
911 constexpr auto b1_block_desc_bk0_n_bk1 = GetB1BlockDescriptor_BK0PerBlock_NPerBlock_BK1();
912 // B1 matrix blockwise copy
913 auto b1_blockwise_copy =
915 B0ElementwiseOperation,
919 B1BlockTransferThreadClusterLengths_BK0_N_BK1,
920 B1BlockTransferThreadClusterArrangeOrder,
921 A0B0B1DataType,
922 A0B0B1DataType,
923 decltype(b1_grid_desc_bk0_n_bk1),
924 decltype(b1_block_desc_bk0_n_bk1),
925 B1BlockTransferSrcAccessOrder,
927 B1BlockTransferSrcVectorDim,
928 2,
929 B1BlockTransferSrcScalarPerVector,
930 B1BlockTransferDstScalarPerVector_BK1,
931 1,
932 1,
933 B1ThreadTransferSrcResetCoordinateAfterRun,
934 true, // DstResetCoord
935 1>(b1_grid_desc_bk0_n_bk1,
936 make_multi_index(0, n_block_data_idx_on_grid, 0),
937 b1_element_op,
938 b1_block_desc_bk0_n_bk1,
939 make_multi_index(0, 0, 0),
941
943 a1_thread_desc_k0_m_k1.GetElementSpaceSize());
944
945 // reuse LDS space for gemm0's b0_block_buf
947 static_cast<A0B0B1DataType*>(p_shared) + SharedMemTrait::b1_block_space_offset,
948 b1_block_desc_bk0_n_bk1.GetElementSpaceSize());
949
950 // selected_mfma.group_size or B1K1 <= Gemm1KPack <= selected_mfma.group_size
951 // selected_mfma.k_per_blk <= Gemm1KPack
952 //
953 // Following similar rationale behind Gemm0KPack, let Gemm1KPack be the lowest common
954 // multiples of A1K1 (predetermined by selected_mfma.group_size) and B1K1. But in this case
955 // Gemm1KPack can't be higher than A1K1 itself because A1 matrix is distributed in VGPRs
956 // with 'group_size' amount of contiguous elements. Having Gemm1KPack greater than A1K1 will
957 // cause mismatch in summation index for example c[0:7] = a1[[0:3, 8:11]] * b1[0:7].
958 // therefore we may just as well assign Gemm1KPack = group_size
959#if defined(__gfx11__)
960 constexpr index_t Gemm1KPack =
962#else
963 constexpr index_t Gemm1KPack =
965#endif
966 auto blockwise_gemm1 = BlockwiseGemmXdlops_v2<
967 BlockSize,
968 A0B0B1DataType,
969 Acc1DataType,
970 decltype(a1_thread_desc_k0_m_k1),
971 decltype(b1_block_desc_bk0_n_bk1),
972 decltype(MakeGemm1AMmaTileDescriptor_M0_M1_M2_K(a1_thread_desc_k0_m_k1)),
973 decltype(MakeGemm1BMmaTileDescriptor_N0_N1_N2_K(b1_block_desc_bk0_n_bk1)),
974 Gemm0MPerBlock,
975 Gemm1NPerBlock,
976 Gemm1KPerBlock,
977 Gemm0MPerXdl,
978 Gemm0NPerXdl,
979 Gemm0MXdlPerWave,
980 Gemm1NXdlPerWave,
981 Gemm1KPack,
982 false, // TransposeC
983 Gemm1KPack, // AMmaKStride
984 Gemm1KPack * XdlopsGemm<A0B0B1DataType,
985 Gemm0MPerXdl,
986 Gemm0NPerXdl,
987 Gemm1KPack,
988 A0B0B1DataType,
989 false>{}
990 .K0PerXdlops>{ // BMmaKStride
991 make_tuple(0, 0, 0, 0)}; // A_origin
992
993 auto c1_thread_buf = blockwise_gemm1.GetCThreadBuffer();
994
995 const index_t num_gemm1_k_block_outer_loop =
996 b0_grid_desc_bk0_n_bk1.GetLength(I1) / Gemm0NPerBlock;
997 constexpr index_t num_gemm1_k_block_inner_loop = Gemm0NPerBlock / Gemm1KPerBlock;
998
999 // Initialize C1
1000 c1_thread_buf.Clear();
1001
1002 // gemm1 K loop
1003 index_t gemm1_k_block_outer_index = 0;
1004 do
1005 {
1006 // gemm0
1007 gridwise_gemm0_pipeline.template Run<HasMainKBlockLoop>(a0_grid_desc_ak0_m_ak1,
1008 a0_block_desc_ak0_m_ak1,
1009 a0_blockwise_copy,
1010 a0_grid_buf,
1011 a0_block_buf,
1012 a0_block_slice_copy_step,
1013 b0_grid_desc_bk0_n_bk1,
1014 b0_block_desc_bk0_n_bk1,
1015 b0_blockwise_copy,
1016 b0_grid_buf,
1017 b0_block_buf,
1018 b0_block_slice_copy_step,
1019 blockwise_gemm0,
1020 acc0_thread_buf,
1021 num_k_block_main_loop);
1022 // multiple d
1023 if constexpr(NumD0Tensor)
1024 {
1025 static_for<0, NumD0Tensor, 1>{}([&](auto i) {
1026 d0s_threadwise_copy(i).Run(d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5[i],
1027 d0s_grid_buf[i],
1028 d0_thread_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
1029 make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
1030 d0s_thread_buf(i));
1031 });
1033 // get reference to src data
1034 const auto src_data_refs = generate_tie(
1035 // return type should be lvalue
1036 [&](auto iSrc) -> const auto& { return d0s_thread_buf[iSrc][i]; },
1038
1039 // get reference to dst data
1040 auto dst_data_refs = generate_tie(
1041 // return type should be lvalue
1042 [&](auto) -> auto& { return acc0_thread_buf(i); },
1043 Number<2>{});
1044
1045 unpack2(cde0_element_op, dst_data_refs, src_data_refs);
1046 });
1047 static_for<0, NumD0Tensor, 1>{}([&](auto i) {
1048 d0s_threadwise_copy(i).MoveSrcSliceWindow(
1049 d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5[i],
1050 make_multi_index(0, 1, 0, 0, 0, 0, 0, 0, 0, 0));
1051 });
1052 }
1053 else
1054 {
1055 static_for<0, acc0_thread_buf.Size(), 1>{}(
1056 [&](auto i) { cde0_element_op(acc0_thread_buf(i), acc0_thread_buf[i]); });
1057 }
1058 // gemm1
1059 {
1060 // TODO: explore using dynamic buffer for a1 thread buffer
1061 // For a1_blockwise_copy, the goal is to satisfy pipeline requirements RunRead(),
1062 // RunWrite(), and MoveSliceWindow(). But it is impossible to implement given that
1063 // the A1 source buffer is static buffer holding the output of first GEMM and
1064 // requires constexpr offset by design. Therefore, we pass tensor coordinate offset
1065 // explicitly in Run() below.
1066
1067 // preload data into LDS
1068 b1_blockwise_copy.RunRead(b1_grid_desc_bk0_n_bk1, b1_grid_buf);
1069
1070 b1_blockwise_copy.MoveSrcSliceWindow(b1_grid_desc_bk0_n_bk1,
1071 b1_block_slice_copy_step);
1072
1073 block_sync_lds(); // wait for gemm0 LDS read
1074
1075 b1_blockwise_copy.RunWrite(b1_block_desc_bk0_n_bk1, b1_block_buf);
1076
1077 // main body
1078 if constexpr(num_gemm1_k_block_inner_loop > 1)
1079 {
1080 static_for<0, num_gemm1_k_block_inner_loop - 1, 1>{}([&](auto i) {
1081 a1_blockwise_copy.Run(acc0_thread_desc_k0_m_k1,
1083 acc0_thread_buf,
1084 a1_thread_desc_k0_m_k1,
1085 make_tuple(I0, I0, I0),
1086 a1_thread_buf);
1087
1088 b1_blockwise_copy.RunRead(b1_grid_desc_bk0_n_bk1, b1_grid_buf);
1089
1091
1092 blockwise_gemm1.Run(a1_thread_buf, b1_block_buf, c1_thread_buf);
1093
1095
1096 b1_blockwise_copy.MoveSrcSliceWindow(b1_grid_desc_bk0_n_bk1,
1097 b1_block_slice_copy_step);
1098
1099 b1_blockwise_copy.RunWrite(b1_block_desc_bk0_n_bk1, b1_block_buf);
1100 });
1101 }
1102 // tail
1103 {
1104 a1_blockwise_copy.Run(
1105 acc0_thread_desc_k0_m_k1,
1106 make_tuple(
1107 Number<(num_gemm1_k_block_inner_loop - 1) * A1ThreadSliceK0>{}, I0, I0),
1108 acc0_thread_buf,
1109 a1_thread_desc_k0_m_k1,
1110 make_tuple(I0, I0, I0),
1111 a1_thread_buf);
1112
1114
1115 blockwise_gemm1.Run(a1_thread_buf, b1_block_buf, c1_thread_buf);
1116 }
1117 } // end gemm1
1118
1119 a0_blockwise_copy.MoveSrcSliceWindow(a0_grid_desc_ak0_m_ak1,
1120 a0_block_reset_copy_step); // rewind K
1121 b0_blockwise_copy.MoveSrcSliceWindow(b0_grid_desc_bk0_n_bk1,
1122 b0_block_reset_copy_step); // rewind K and step N
1123
1124 block_sync_lds(); // wait for gemm1 LDS read
1125 } while(++gemm1_k_block_outer_index < num_gemm1_k_block_outer_loop); // end j loop
1126
1127 // shuffle C1 and write out
1128 {
1129 static_assert(Gemm0MXdlPerWave % C1ShuffleGemm0MXdlPerWavePerShuffle == 0 &&
1130 Gemm1NXdlPerWave % C1ShuffleGemm0NXdlPerWavePerShuffle == 0,
1131 "wrong!");
1132
1133 constexpr index_t MWave = Gemm0MPerBlock / (Gemm0MXdlPerWave * Gemm0MPerXdl);
1134 constexpr index_t NWave = Gemm1NPerBlock / (Gemm1NXdlPerWave * Gemm0NPerXdl);
1135
1136 // TODO: hacky, fix it!
1137 constexpr auto c1_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 =
1138 blockwise_gemm1.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
1139
1140 // TODO: hacky, fix it!
1141 // c1_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp is only used to get lengths
1142 constexpr auto c1_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp =
1143 blockwise_gemm1.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
1144
1145 constexpr auto M0 = c1_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I0);
1146 constexpr auto N0 = c1_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I1);
1147 constexpr auto M1 = c1_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I2);
1148 constexpr auto N1 = c1_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I3);
1149 constexpr auto M2 = c1_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I4);
1150 constexpr auto M3 = c1_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I5);
1151 constexpr auto M4 = c1_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6);
1152 constexpr auto N2 = c1_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7);
1153
1154 constexpr auto c1_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
1156
1157 auto c1_shuffle_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
1158 static_cast<C1ShuffleDataType*>(p_shared),
1159 c1_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
1160
1161 constexpr auto c1_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor(
1162 c1_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
1163 make_tuple(
1166 Number<C1ShuffleGemm0MXdlPerWavePerShuffle>{}, // M0 (Gemm0MXdlPerWave) per
1167 // shuffle
1168 M1, // M1 = MWave
1169 M2, // M2 * M3 * M4 = Gemm0MPerXdl
1170 M3,
1171 M4)),
1174 Number<C1ShuffleGemm0NXdlPerWavePerShuffle>{}, // N0 (Gemm0NXdlPerWave) per
1175 // shuffle
1176 N1, // N1 = NWave
1177 N2))), // N2 = Gemm0NPerXdl
1179 make_tuple(
1181
1182 // calculate origin of thread output tensor on global memory
1183 // blockwise GEMM C1 matrix starting index
1184 const auto c1_thread_mtx_on_block =
1185 blockwise_gemm1.CalculateCThreadOriginDataIndex(I0, I0, I0, I0);
1186
1187 const index_t m_thread_data_on_block = c1_thread_mtx_on_block[I0];
1188 const index_t n_thread_data_on_block = c1_thread_mtx_on_block[I1];
1189
1190 const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor =
1192 make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))),
1195
1196 const auto m_thread_data_on_block_idx =
1197 m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex(
1198 make_multi_index(m_thread_data_on_block));
1199
1200 const auto n_thread_data_on_block_to_n0_n1_n2_adaptor =
1205
1206 const auto n_thread_data_on_block_idx =
1207 n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex(
1208 make_multi_index(n_thread_data_on_block));
1209
1210 // shuffle: threadwise copy C from VGPR to LDS
1211 auto c1_thread_copy_vgpr_to_lds =
1213 C1ShuffleDataType,
1214 decltype(c1_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2),
1215 decltype(c1_block_desc_m0_n0_m1_n1_m2_m3_m4_n2),
1217 Sequence<C1ShuffleGemm0MXdlPerWavePerShuffle,
1218 C1ShuffleGemm0NXdlPerWavePerShuffle,
1219 I1,
1220 I1,
1221 M2,
1222 I1,
1223 M4,
1224 I1>,
1226 7,
1227 1,
1229 1,
1230 true>{
1231 c1_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
1233 0,
1234 m_thread_data_on_block_idx[I1],
1235 n_thread_data_on_block_idx[I1],
1236 m_thread_data_on_block_idx[I2],
1237 m_thread_data_on_block_idx[I3],
1238 m_thread_data_on_block_idx[I4],
1239 n_thread_data_on_block_idx[I2]),
1241
1242 // tuple of reference to C/Ds tensor descriptors
1243 const auto c1_d1s_desc_refs = concat_tuple_of_reference(
1244 tie(c1_shuffle_block_desc_mblock_mperblock_nblock_nperblock),
1245 generate_tie([&](auto i) -> const auto& // return type should be reference
1246 { return d1s_grid_desc_mblock_mperblock_nblock_nperblock[i]; },
1248
1249 // tuple of reference to C/Ds tensor descriptors
1250 const auto c1_d1s_buf_refs = concat_tuple_of_reference(
1251 tie(c1_shuffle_block_buf),
1252 generate_tie([&](auto i) -> const auto& // return type should be reference
1253 { return d1s_grid_buf[i]; },
1255
1256 // tuple of starting index of C/Ds blockwise copy
1257 const auto idx_c1_d1s_block_begin = container_concat(
1258 make_tuple(make_multi_index(0, 0, 0, 0)),
1260 [&](auto) {
1261 return make_multi_index(block_work_idx[I0], 0, block_work_idx[I1], 0);
1262 },
1264
1265 // shuffle: blockwise copy C from LDS to global
1266 auto cde1_shuffle_block_copy_lds_to_global = ThreadGroupTensorSliceTransfer_v7<
1268 decltype(container_concat(make_tuple(C1ShuffleDataType{}), D1sDataType{})),
1270 decltype(c1_d1s_desc_refs),
1271 decltype(tie(e1_grid_desc_mblock_mperblock_nblock_nperblock)),
1272 CDE1ElementwiseOperation,
1273 Sequence<static_cast<index_t>(E1GlobalMemoryDataOperation)>, // FIXME: make Sequence
1274 // support arbitray
1275 // type
1276 Sequence<1,
1277 C1ShuffleGemm0MXdlPerWavePerShuffle * MWave * Gemm0MPerXdl,
1278 1,
1279 C1ShuffleGemm0NXdlPerWavePerShuffle * NWave *
1280 Gemm0NPerXdl>, // BlockSliceLengths,
1281 CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
1282 Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder,
1283 Sequence<0, 1, 2, 3>, // typename DimAccessOrder,
1284 3, // index_t VectorDim,
1285 CDE1ShuffleBlockTransferScalarPerVector_NPerBlock,
1289 false>>, // ThreadTransferSrcResetCoordinateAfterRunFlags
1290 Sequence<false>> // ThreadTransferDstResetCoordinateAfterRunFlags
1291 {c1_d1s_desc_refs,
1292 idx_c1_d1s_block_begin,
1293 tie(e1_grid_desc_mblock_mperblock_nblock_nperblock),
1294 make_tuple(make_multi_index(block_work_idx[I0], 0, block_work_idx[I1], 0)),
1295 cde1_element_op};
1296
1297 // space filling curve for threadwise C in VGPR
1298 constexpr auto sfc_c1_vgpr =
1301 Sequence<C1ShuffleGemm0MXdlPerWavePerShuffle,
1302 C1ShuffleGemm0NXdlPerWavePerShuffle,
1303 1,
1304 1,
1305 M2,
1306 1,
1307 M4,
1308 1>>{};
1309
1310 // space filling curve for shuffled blockwise C in global mem
1311 constexpr auto sfc_e1_global = SpaceFillingCurve<
1314 Sequence<1,
1315 C1ShuffleGemm0MXdlPerWavePerShuffle * MWave * Gemm0MPerXdl,
1316 1,
1317 C1ShuffleGemm0NXdlPerWavePerShuffle * NWave * Gemm0NPerXdl>>{};
1318
1319 constexpr index_t num_access = sfc_c1_vgpr.GetNumOfAccess();
1320
1321 static_assert(num_access == sfc_e1_global.GetNumOfAccess(), "wrong!");
1322
1323 static_for<0, num_access, 1>{}([&](auto access_id) {
1324 // make sure it's safe to write to LDS
1326
1327 // each thread write its data from VGPR to LDS
1328 c1_thread_copy_vgpr_to_lds.Run(c1_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2,
1329 sfc_c1_vgpr.GetIndexTupleOfNumber(access_id),
1330 c1_thread_buf,
1331 c1_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
1332 c1_shuffle_block_buf);
1333
1334 // make sure it's safe to read from LDS
1336
1337 // each block copy its data from LDS to global
1338 cde1_shuffle_block_copy_lds_to_global.Run(
1339 c1_d1s_desc_refs,
1340 c1_d1s_buf_refs,
1341 tie(e1_grid_desc_mblock_mperblock_nblock_nperblock),
1342 tie(e1_grid_buf));
1343
1344 if constexpr(access_id < num_access - 1)
1345 {
1346 constexpr auto e1_global_step = sfc_e1_global.GetForwardStep(access_id);
1347
1348 // move on D1s
1349 static_for<0, NumD1Tensor, 1>{}([&](auto i) {
1350 cde1_shuffle_block_copy_lds_to_global.MoveSrcSliceWindow(
1351 c1_d1s_desc_refs, i + I1, e1_global_step);
1352 });
1353
1354 // move on C
1355 cde1_shuffle_block_copy_lds_to_global.MoveDstSliceWindow(
1356 tie(e1_grid_desc_mblock_mperblock_nblock_nperblock), I0, e1_global_step);
1357 }
1358 });
1359 }
1360 }
1361};
1362
1363} // namespace ck
__host__ __device__ constexpr auto integer_least_multiple(X x, Y y)
Definition utility/math.hpp:78
__host__ __device__ constexpr T max(T x)
Definition utility/math.hpp:84
__host__ __device__ constexpr auto lcm(X x, Y y)
Definition utility/math.hpp:198
Definition ck.hpp:268
__host__ __device__ constexpr auto make_multi_index(Xs &&... xs)
Definition array_multi_index.hpp:15
typename uniform_sequence_gen< NSize, I >::type uniform_sequence_gen_t
Definition utility/sequence.hpp:928
__host__ __device__ constexpr auto make_static_buffer(Number< N >)
Definition static_buffer.hpp:186
__host__ __device__ constexpr auto make_pass_through_transform(const LowLength &low_length)
Definition multi_index_transform_helper.hpp:12
__host__ __device__ constexpr auto container_concat(const X &x, const Ys &... ys)
Definition utility/container_helper.hpp:320
int32_t index_t
Definition ck.hpp:299
__host__ __device__ constexpr auto make_naive_tensor_descriptor(const Tuple< Lengths... > &lengths, const Tuple< Strides... > &strides)
Definition tensor_descriptor_helper.hpp:49
InMemoryDataOperationEnum
Definition ck.hpp:277
@ Set
Definition ck.hpp:278
__host__ __device__ constexpr auto make_single_stage_tensor_adaptor(const Transforms &transforms, LowerDimensionOldTopIdss, UpperDimensionNewTopIdss)
Definition tensor_description/tensor_adaptor.hpp:425
remove_cv_t< remove_reference_t< T > > remove_cvref_t
Definition type.hpp:297
__host__ __device__ constexpr auto make_freeze_transform(const LowerIndex &low_idx)
Definition multi_index_transform_helper.hpp:151
constexpr Tuple< Args &... > tie(Args &... args) noexcept
Definition utility/tuple.hpp:218
integral_constant< index_t, N > Number
Definition number.hpp:12
constexpr auto GridwiseGemmPipeline_v1_Selector()
Definition gridwise_gemm_pipeline_v1.hpp:758
@ Vgpr
Definition amd_address_space.hpp:20
typename tuple_element< I, TTuple >::type tuple_element_t
Definition utility/tuple.hpp:208
__host__ __device__ constexpr auto make_merge_transform(const LowLengths &low_lengths)
Definition multi_index_transform_helper.hpp:55
__device__ index_t get_block_1d_id()
Definition get_id.hpp:47
__device__ constexpr index_t get_warp_size()
Definition get_id.hpp:10
__host__ __device__ constexpr auto make_merge_transform_v3_division_mod(const LowLengths &low_lengths)
Definition multi_index_transform_helper.hpp:84
__host__ __device__ constexpr auto generate_tuple(F &&f, Number< N >)
Definition tuple_helper.hpp:21
__host__ __device__ constexpr auto make_naive_tensor_descriptor_packed(const Tuple< Lengths... > &lengths)
Definition tensor_descriptor_helper.hpp:101
__host__ __device__ constexpr auto make_tuple(Xs &&... xs)
Definition utility/tuple.hpp:211
LoopScheduler
Definition loop_scheduler.hpp:15
@ Default
Definition loop_scheduler.hpp:16
__device__ index_t get_thread_local_1d_id()
Definition get_id.hpp:41
typename sequence_merge< Sx, Sy >::type sequence_merge_t
Definition utility/sequence.hpp:925
__host__ __device__ constexpr auto transform_tensor_descriptor(const OldTensorDescriptor &old_tensor_desc, const NewTransforms &new_transforms, NewLowerDimensionOldVisibleIdss, NewUpperDimensionNewVisibleIdss)
Definition tensor_description/tensor_descriptor.hpp:319
__device__ void block_sync_lds()
Definition synchronization.hpp:16
__host__ __device__ constexpr auto make_unmerge_transform(const UpLengths &up_lengths, integral_constant< bool, Use24BitIntegerCalculation >=integral_constant< bool, false >{})
Definition multi_index_transform_helper.hpp:90
__host__ __device__ constexpr auto unpack2(F &&f, X &&x, Y &&y)
Definition functional4.hpp:55
__host__ __device__ constexpr auto make_dynamic_buffer(T *p, ElementSpaceSize element_space_size)
Definition dynamic_buffer.hpp:472
__host__ __device__ constexpr auto generate_tie(F &&f, Number< N >)
Definition tuple_helper.hpp:34
__host__ __device__ constexpr auto concat_tuple_of_reference(const Tuple< X &... > &tx, const Tuple< Y &... > &ty)
Definition tuple_helper.hpp:42
Definition block_to_ctile_map.hpp:261
Blockwise gemm.
Definition blockwise_gemm_xdlops.hpp:690
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:524
static constexpr auto b0_block_desc_bk0_n_bk1
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:528
static constexpr auto max_lds_align
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:533
static constexpr auto b1_block_desc_bk0_n_bk1
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:530
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >< math::max(Gemm0MXdlPerWave64, 1)>::SharedMemTrait::b1_block_space_offset
static constexpr auto b1_block_space_offset
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:544
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >< math::max(Gemm0MXdlPerWave64, 1)>::SharedMemTrait::b1_block_space_size_aligned
static constexpr auto b1_block_space_size_aligned
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:539
static constexpr auto a0_block_desc_ak0_m_ak1
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:526
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >< math::max(Gemm0MXdlPerWave64, 1)>::SharedMemTrait::a0_block_space_size_aligned
static constexpr auto a0_block_space_size_aligned
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:535
static constexpr auto c1_shuffle_block_desc_mblock_mperblock_nblock_nperblock
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:547
static constexpr auto a0_block_space_offset
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:542
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >< math::max(Gemm0MXdlPerWave64, 1)>::SharedMemTrait::b0_block_space_size_aligned
static constexpr auto b0_block_space_size_aligned
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:537
static constexpr auto b0_block_space_offset
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:543
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >< math::max(Gemm0MXdlPerWave64, 1)>::SharedMemTrait::c1_block_space_size
static constexpr auto c1_block_space_size
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:549
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:86
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::MakeGemm1AMmaTileDescriptor_M0_M1_M2_K
__host__ static __device__ constexpr auto MakeGemm1AMmaTileDescriptor_M0_M1_M2_K(const A0BlockDesc_AK0_M_AK1 &)
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:189
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::CheckValidity
static __host__ constexpr bool CheckValidity(const A0GridDesc_M_K &a0_grid_desc_m_k, const B0GridDesc_N_K &b0_grid_desc_n_k, const B1GridDesc_N_K &b1_grid_desc_n_k, const E1GridDesc_M_N &e1_grid_desc_m_n, const Block2E1TileMap &block_2_e1tile_map)
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:286
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >< math::max(Gemm0MXdlPerWave64, 1)>::D0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5
remove_cvref_t< decltype(MakeD0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5(D0sGridDesc_M_N{}))> D0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:512
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::MakeGemm0D0GridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5
__host__ static __device__ constexpr auto MakeGemm0D0GridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5(const D0GridDesc_M_N &d0_grid_desc_m_n)
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:404
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >< math::max(Gemm0MXdlPerWave64, 1)>::D1sGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
remove_cvref_t< decltype(MakeD1sGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(D1sGridDesc_M_N{}))> D1sGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:516
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >< math::max(Gemm0MXdlPerWave64, 1)>::GridwiseGemmPipe
GridwiseGemmPipeline_v1< NumGemm0KPrefetchStage, true, true > GridwiseGemmPipe
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:119
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::GetGemm0WaveIdx
static __device__ auto GetGemm0WaveIdx()
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:145
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >< math::max(Gemm0MXdlPerWave64, 1)>::A0K0PerBlock
static constexpr auto A0K0PerBlock
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:107
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::MakeDefaultBlock2E1TileMap
__host__ static __device__ constexpr auto MakeDefaultBlock2E1TileMap(const E1GridDesc_M_N &e1_grid_desc_m_n)
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:502
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >< math::max(Gemm0MXdlPerWave64, 1)>::B0K1
static constexpr auto B0K1
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:105
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::GetGemm0WaveMNIdx
static __device__ auto GetGemm0WaveMNIdx(const index_t thread_id)
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:157
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >< math::max(Gemm0MXdlPerWave64, 1)>::I2
static constexpr auto I2
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:95
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >< math::max(Gemm0MXdlPerWave64, 1)>::B1K1
static constexpr auto B1K1
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:114
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::GetC1ShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
__host__ static __device__ constexpr auto GetC1ShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock()
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:229
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::MakeD1sGridPointer
static constexpr auto MakeD1sGridPointer()
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:134
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >< math::max(Gemm0MXdlPerWave64, 1)>::I3
static constexpr auto I3
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:96
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >< math::max(Gemm0MXdlPerWave64, 1)>::NumD0Tensor
static constexpr index_t NumD0Tensor
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:90
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::GetB1BlockDescriptor_BK0PerBlock_NPerBlock_BK1
__host__ static __device__ constexpr auto GetB1BlockDescriptor_BK0PerBlock_NPerBlock_BK1()
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:220
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >< math::max(Gemm0MXdlPerWave64, 1)>::D0sGridPointer
decltype(MakeD0sGridPointer()) D0sGridPointer
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:553
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::MakeGemm1BMmaTileDescriptor_N0_N1_N2_K
__host__ static __device__ constexpr auto MakeGemm1BMmaTileDescriptor_N0_N1_N2_K(const BBlockDesc_BK0_N_BK1 &)
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:197
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::GetSharedMemoryNumberOfByte
__host__ static __device__ constexpr index_t GetSharedMemoryNumberOfByte()
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:244
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::GetA0BlockDescriptor_AK0PerBlock_MPerBlock_AK1
__host__ static __device__ constexpr auto GetA0BlockDescriptor_AK0PerBlock_MPerBlock_AK1()
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:204
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::CalculateHasMainKBlockLoop
__host__ static __device__ constexpr bool CalculateHasMainKBlockLoop(index_t K)
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:360
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >< math::max(Gemm0MXdlPerWave64, 1)>::A0K1
static constexpr auto A0K1
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:104
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >< math::max(Gemm0MXdlPerWave64, 1)>::B1K0PerBlock
static constexpr auto B1K0PerBlock
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:115
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >< math::max(Gemm0MXdlPerWave64, 1)>::I4
static constexpr auto I4
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:97
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >< math::max(Gemm0MXdlPerWave64, 1)>::Gemm0MWaves
static constexpr auto Gemm0MWaves
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:110
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::MakeD1sGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
__host__ static __device__ constexpr auto MakeD1sGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const DsGridDescriptor_M_N &ds_grid_desc_m_n)
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:490
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >< math::max(Gemm0MXdlPerWave64, 1)>::Gemm0NWaves
static constexpr auto Gemm0NWaves
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:111
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::MakeGemm0AMmaTileDescriptor_M0_M1_M2_K
__host__ static __device__ constexpr auto MakeGemm0AMmaTileDescriptor_M0_M1_M2_K(const A0BlockDesc_AK0_M_AK1 &)
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:169
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >< math::max(Gemm0MXdlPerWave64, 1)>::I6
static constexpr auto I6
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:99
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::MakeD0sGridPointer
static constexpr auto MakeD0sGridPointer()
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:122
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::Run
static __device__ void Run(const A0DataType *__restrict__ p_a0_grid, const A0DataType *__restrict__ p_b0_grid, D0sGridPointer p_d0s_grid, const A0DataType *__restrict__ p_b1_grid, D1sGridPointer p_d1s_grid, E1DataType *__restrict__ p_e1_grid, void *__restrict__ p_shared, const A0ElementwiseOperation &a0_element_op, const B0ElementwiseOperation &b0_element_op, const CDE0ElementwiseOperation &cde0_element_op, const B1ElementwiseOperation &b1_element_op, const CDE1ElementwiseOperation &cde1_element_op, const A0GridDesc_AK0_M_AK1 &a0_grid_desc_ak0_m_ak1, const B0GridDesc_BK0_N_BK1 &b0_grid_desc_bk0_n_bk1, const D0sGridDesc_M_N &d0s_griddesc_m_n, const B1GridDesc_BK0_N_BK1 &b1_grid_desc_bk0_n_bk1, const D1sGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock &d1s_grid_desc_mblock_mperblock_nblock_nperblock, const E1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock &e1_grid_desc_mblock_mperblock_nblock_nperblock, const Block2E1TileMap &block_2_e1tile_map)
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:561
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >< math::max(Gemm0MXdlPerWave64, 1)>::I7
static constexpr auto I7
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:100
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::MakeE1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
__host__ static __device__ constexpr auto MakeE1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const E1GridDesc_M_N &e1_grid_desc_m_n)
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:460
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::MakeDefaultB1GridDescriptor_BK0_N_BK1
__host__ static __device__ constexpr auto MakeDefaultB1GridDescriptor_BK0_N_BK1(const B1GridDesc_N_K &b1_grid_desc_n_k)
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:443
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::IsValidCompilationParameter
static __device__ bool constexpr IsValidCompilationParameter()
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:260
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >< math::max(Gemm0MXdlPerWave64, 1)>::ThisThreadBlock
ThisThreadBlock< BlockSize > ThisThreadBlock
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:117
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::MakeD0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5
__host__ static __device__ constexpr auto MakeD0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5(const D0sGridDesc_M_N &ds_grid_desc_m_n)
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:479
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::GetB0BlockDescriptor_BK0PerBlock_NPerBlock_BK1
__host__ static __device__ constexpr auto GetB0BlockDescriptor_BK0PerBlock_NPerBlock_BK1()
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:212
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::MakeGemm0BMmaTileDescriptor_N0_N1_N2_K
__host__ static __device__ constexpr auto MakeGemm0BMmaTileDescriptor_N0_N1_N2_K(const BBlockDesc_BK0_N_BK1 &)
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:179
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >< math::max(Gemm0MXdlPerWave64, 1)>::DefaultBlock2E1TileMap
remove_cvref_t< decltype(MakeDefaultBlock2E1TileMap(E1GridDesc_M_N{}))> DefaultBlock2E1TileMap
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:520
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >< math::max(Gemm0MXdlPerWave64, 1)>::I0
static constexpr auto I0
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:93
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::MakeDefaultB0GridDescriptor_BK0_N_BK1
__host__ static __device__ constexpr auto MakeDefaultB0GridDescriptor_BK0_N_BK1(const B0GridDesc_N_K &b0_grid_desc_n_k)
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:386
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >< math::max(Gemm0MXdlPerWave64, 1)>::I5
static constexpr auto I5
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:98
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >< math::max(Gemm0MXdlPerWave64, 1)>::E1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
remove_cvref_t< decltype(MakeE1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(E1GridDesc_M_N{}))> E1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:508
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >::MakeDefaultA0GridDescriptor_AK0_M_AK1
__host__ static __device__ constexpr auto MakeDefaultA0GridDescriptor_AK0_M_AK1(const A0GridDesc_M_K &a0_grid_desc_m_k)
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:369
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >< math::max(Gemm0MXdlPerWave64, 1)>::B0K0PerBlock
static constexpr auto B0K0PerBlock
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:108
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >< math::max(Gemm0MXdlPerWave64, 1)>::NumD1Tensor
static constexpr index_t NumD1Tensor
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:91
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >< math::max(Gemm0MXdlPerWave64, 1)>::I1
static constexpr auto I1
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:94
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >< math::max(Gemm0MXdlPerWave64, 1)>::WaveSize
static constexpr auto WaveSize
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:112
ck::GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< A0DataType, Acc0DataType, D0sDataType, Acc1DataType, C1ShuffleDataType, D1sDataType, E1DataType, A0ElementwiseOperation, B0ElementwiseOperation, CDE0ElementwiseOperation, B1ElementwiseOperation, CDE1ElementwiseOperation, InMemoryDataOperationEnum::Set, A0GridDesc_M_K, B0GridDesc_N_K, D0sGridDesc_M_N, B1GridDesc_N_K, D1sGridDesc_M_N, E1GridDesc_M_N, NumGemm0KPrefetchStage, BlockSize, Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, A0K1, B0K1, B1K1, Gemm0MPerXdl, Gemm0NPerXdl, Gemm0MXdlPerWave_, Gemm0NXdlPerWave, Gemm1NXdlPerWave, A0BlockTransferThreadClusterLengths_AK0_M_AK1, A0BlockTransferThreadClusterArrangeOrder, A0BlockTransferSrcAccessOrder, A0BlockTransferSrcVectorDim, A0BlockTransferSrcScalarPerVector, A0BlockTransferDstScalarPerVector_AK1, true, A0BlockLdsExtraM, B0BlockTransferThreadClusterLengths_BK0_N_BK1, B0BlockTransferThreadClusterArrangeOrder, B0BlockTransferSrcAccessOrder, B0BlockTransferSrcVectorDim, B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_BK1, true, B0BlockLdsExtraN, CDE0BlockTransferSrcVectorDim, CDE0BlockTransferSrcScalaerPerVector, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, false, B1BlockLdsExtraN, C1ShuffleMXdlPerWavePerShuffle, C1ShuffleGemm0NXdlPerWavePerShuffle, CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, LoopSched >< math::max(Gemm0MXdlPerWave64, 1)>::D1sGridPointer
decltype(MakeD1sGridPointer()) D1sGridPointer
Definition gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp:554
Definition gridwise_gemm_pipeline_v1.hpp:13
Selects the appropriate MFMA instruction type and configuration for given data types and tile sizes o...
Definition xdlops_gemm.hpp:1208
static constexpr auto selected_mfma
Definition xdlops_gemm.hpp:1757
Definition utility/sequence.hpp:43
Definition tensor_space_filling_curve.hpp:20
Definition static_buffer.hpp:16
Blockwise data transfer.
Definition thread_group_tensor_slice_transfer_v4r1.hpp:46
Definition thread_group_tensor_slice_transfer_v7.hpp:42
Definition threadwise_tensor_slice_transfer.hpp:1877
Threadwise data transfer.
Definition threadwise_tensor_slice_transfer.hpp:1720
Definition threadwise_tensor_slice_transfer.hpp:39
Helper structure that facilitates transfer of source (grid) data to destination threads.
Definition threadwise_tensor_slice_transfer.hpp:234
Definition utility/tuple.hpp:117
Definition xdlops_gemm.hpp:1821
static constexpr value_type value
Definition utility/integral_constant.hpp:13
Definition functional2.hpp:33
Definition tensor_operation/gpu/element/unary_element_wise_operation.hpp:340