#include <blockwise_gemm_dl_v2r3.hpp>
◆ AIndex
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_BK0_BM_BK1, typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread, typename BM10BN10ThreadClusterBM10Xs, typename BM10BN10ThreadClusterBN10Xs,
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11, typename
enable_if< ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(), bool >::type = false>
| using ck::BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_BK0_BM_BK1, BBlockDesc_BK0_BN_BK1, BM1PerThreadBM11, BN1PerThreadBN11, BK0PerThread, BM10BN10ThreadClusterBM10Xs, BM10BN10ThreadClusterBN10Xs, AThreadCopyScalarPerVector_BM11, BThreadCopyScalarPerVector_BN11, type >::AIndex = MultiIndex<3> |
◆ BIndex
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_BK0_BM_BK1, typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread, typename BM10BN10ThreadClusterBM10Xs, typename BM10BN10ThreadClusterBN10Xs,
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11, typename
enable_if< ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(), bool >::type = false>
| using ck::BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_BK0_BM_BK1, BBlockDesc_BK0_BN_BK1, BM1PerThreadBM11, BN1PerThreadBN11, BK0PerThread, BM10BN10ThreadClusterBM10Xs, BM10BN10ThreadClusterBN10Xs, AThreadCopyScalarPerVector_BM11, BThreadCopyScalarPerVector_BN11, type >::BIndex = MultiIndex<3> |
◆ CIndex
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_BK0_BM_BK1, typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread, typename BM10BN10ThreadClusterBM10Xs, typename BM10BN10ThreadClusterBN10Xs,
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11, typename
enable_if< ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(), bool >::type = false>
| using ck::BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_BK0_BM_BK1, BBlockDesc_BK0_BN_BK1, BM1PerThreadBM11, BN1PerThreadBN11, BK0PerThread, BM10BN10ThreadClusterBM10Xs, BM10BN10ThreadClusterBN10Xs, AThreadCopyScalarPerVector_BM11, BThreadCopyScalarPerVector_BN11, type >::CIndex = MultiIndex<4> |
◆ BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2()
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_BK0_BM_BK1, typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread, typename BM10BN10ThreadClusterBM10Xs, typename BM10BN10ThreadClusterBN10Xs,
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11, typename
enable_if< ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(), bool >::type = false>
| __device__ ck::BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_BK0_BM_BK1, BBlockDesc_BK0_BN_BK1, BM1PerThreadBM11, BN1PerThreadBN11, BK0PerThread, BM10BN10ThreadClusterBM10Xs, BM10BN10ThreadClusterBN10Xs, AThreadCopyScalarPerVector_BM11, BThreadCopyScalarPerVector_BN11, type >::BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2 |
( |
| ) |
|
|
inline |
◆ CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1()
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_BK0_BM_BK1, typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread, typename BM10BN10ThreadClusterBM10Xs, typename BM10BN10ThreadClusterBN10Xs,
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11, typename
enable_if< ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(), bool >::type = false>
| __device__ CIndex ck::BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_BK0_BM_BK1, BBlockDesc_BK0_BN_BK1, BM1PerThreadBM11, BN1PerThreadBN11, BK0PerThread, BM10BN10ThreadClusterBM10Xs, BM10BN10ThreadClusterBN10Xs, AThreadCopyScalarPerVector_BM11, BThreadCopyScalarPerVector_BN11, type >::CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1 |
( |
index_t | thread_id | ) |
|
|
inline static |
◆ GetCThreadTensorLengths_BM0_BM1_BN0_BN1()
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_BK0_BM_BK1, typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread, typename BM10BN10ThreadClusterBM10Xs, typename BM10BN10ThreadClusterBN10Xs,
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11, typename
enable_if< ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(), bool >::type = false>
| __host__ static __device__ constexpr auto ck::BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_BK0_BM_BK1, BBlockDesc_BK0_BN_BK1, BM1PerThreadBM11, BN1PerThreadBN11, BK0PerThread, BM10BN10ThreadClusterBM10Xs, BM10BN10ThreadClusterBN10Xs, AThreadCopyScalarPerVector_BM11, BThreadCopyScalarPerVector_BN11, type >::GetCThreadTensorLengths_BM0_BM1_BN0_BN1 |
( |
| ) |
|
|
inline static constexpr |
◆ MakeABlockDescriptor_BK0_BM0_BM1_BK1()
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_BK0_BM_BK1, typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread, typename BM10BN10ThreadClusterBM10Xs, typename BM10BN10ThreadClusterBN10Xs,
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11, typename
enable_if< ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(), bool >::type = false>
| __host__ static __device__ constexpr auto ck::BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_BK0_BM_BK1, BBlockDesc_BK0_BN_BK1, BM1PerThreadBM11, BN1PerThreadBN11, BK0PerThread, BM10BN10ThreadClusterBM10Xs, BM10BN10ThreadClusterBN10Xs, AThreadCopyScalarPerVector_BM11, BThreadCopyScalarPerVector_BN11, type >::MakeABlockDescriptor_BK0_BM0_BM1_BK1 |
( |
const ABlockDesc_BK0_BM_BK1 & | a_block_desc_bk0_bm_bk1 | ) |
|
|
inline static constexpr |
◆ MakeBBlockDescriptor_BK0_BN0_BN1_BK1()
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_BK0_BM_BK1, typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread, typename BM10BN10ThreadClusterBM10Xs, typename BM10BN10ThreadClusterBN10Xs,
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11, typename
enable_if< ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(), bool >::type = false>
| __host__ static __device__ constexpr auto ck::BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_BK0_BM_BK1, BBlockDesc_BK0_BN_BK1, BM1PerThreadBM11, BN1PerThreadBN11, BK0PerThread, BM10BN10ThreadClusterBM10Xs, BM10BN10ThreadClusterBN10Xs, AThreadCopyScalarPerVector_BM11, BThreadCopyScalarPerVector_BN11, type >::MakeBBlockDescriptor_BK0_BN0_BN1_BK1 |
( |
const BBlockDesc_BK0_BN_BK1 & | b_block_desc_bk0_bn_bk1 | ) |
|
|
inline static constexpr |
◆ MakeCBlockAdaptor_BM0_BM100_BM101_BM11_BN0_BN100_BN101_BN11_To_BM0_BM1_BN0_BN1()
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_BK0_BM_BK1, typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread, typename BM10BN10ThreadClusterBM10Xs, typename BM10BN10ThreadClusterBN10Xs,
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11, typename
enable_if< ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(), bool >::type = false>
| __host__ static __device__ constexpr auto ck::BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_BK0_BM_BK1, BBlockDesc_BK0_BN_BK1, BM1PerThreadBM11, BN1PerThreadBN11, BK0PerThread, BM10BN10ThreadClusterBM10Xs, BM10BN10ThreadClusterBN10Xs, AThreadCopyScalarPerVector_BM11, BThreadCopyScalarPerVector_BN11, type >::MakeCBlockAdaptor_BM0_BM100_BM101_BM11_BN0_BN100_BN101_BN11_To_BM0_BM1_BN0_BN1 |
( |
| ) |
|
|
inline static constexpr |
◆ MakeCBlockAdaptor_BM0_BM100_BM101_BM11_BN0_BN100_BN101_BN11_To_BM_BN()
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_BK0_BM_BK1, typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread, typename BM10BN10ThreadClusterBM10Xs, typename BM10BN10ThreadClusterBN10Xs,
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11, typename
enable_if< ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(), bool >::type = false>
| __host__ static __device__ constexpr auto ck::BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_BK0_BM_BK1, BBlockDesc_BK0_BN_BK1, BM1PerThreadBM11, BN1PerThreadBN11, BK0PerThread, BM10BN10ThreadClusterBM10Xs, BM10BN10ThreadClusterBN10Xs, AThreadCopyScalarPerVector_BM11, BThreadCopyScalarPerVector_BN11, type >::MakeCBlockAdaptor_BM0_BM100_BM101_BM11_BN0_BN100_BN101_BN11_To_BM_BN |
( |
| ) |
|
|
inline static constexpr |
◆ Run()
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_BK0_BM_BK1, typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread, typename BM10BN10ThreadClusterBM10Xs, typename BM10BN10ThreadClusterBN10Xs,
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11, typename
enable_if< ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(), bool >::type = false>
template<typename CThreadDesc_BM0_BM11_BN0_BN11, typename ABlockBuffer, typename BBlockBuffer, typename CThreadBuffer>
| __device__ void ck::BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_BK0_BM_BK1, BBlockDesc_BK0_BN_BK1, BM1PerThreadBM11, BN1PerThreadBN11, BK0PerThread, BM10BN10ThreadClusterBM10Xs, BM10BN10ThreadClusterBN10Xs, AThreadCopyScalarPerVector_BM11, BThreadCopyScalarPerVector_BN11, type >::Run |
( |
const CThreadDesc_BM0_BM11_BN0_BN11 & | , |
|
|
const ABlockBuffer & | a_block_buf, |
|
|
const BBlockBuffer & | b_block_buf, |
|
|
CThreadBuffer & | c_thread_buf ) const |
|
inline |
◆ a_block_desc_bk0_bm0_bm1_bk1_
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_BK0_BM_BK1, typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread, typename BM10BN10ThreadClusterBM10Xs, typename BM10BN10ThreadClusterBN10Xs,
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11, typename
enable_if< ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(), bool >::type = false>
| auto ck::BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_BK0_BM_BK1, BBlockDesc_BK0_BN_BK1, BM1PerThreadBM11, BN1PerThreadBN11, BK0PerThread, BM10BN10ThreadClusterBM10Xs, BM10BN10ThreadClusterBN10Xs, AThreadCopyScalarPerVector_BM11, BThreadCopyScalarPerVector_BN11, type >::a_block_desc_bk0_bm0_bm1_bk1_ |
|
static constexpr |
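Initial value: an expression that references the following function (the expression itself was not preserved in this extract):
__host__ static __device__ constexpr auto MakeABlockDescriptor_BK0_BM0_BM1_BK1(const ABlockDesc_BK0_BM_BK1 &a_block_desc_bk0_bm_bk1)
Defined at blockwise_gemm_dl_v2r3.hpp:78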
◆ b_block_desc_bk0_bn0_bn1_bk1_
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_BK0_BM_BK1, typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread, typename BM10BN10ThreadClusterBM10Xs, typename BM10BN10ThreadClusterBN10Xs,
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11, typename
enable_if< ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(), bool >::type = false>
| auto ck::BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_BK0_BM_BK1, BBlockDesc_BK0_BN_BK1, BM1PerThreadBM11, BN1PerThreadBN11, BK0PerThread, BM10BN10ThreadClusterBM10Xs, BM10BN10ThreadClusterBN10Xs, AThreadCopyScalarPerVector_BM11, BThreadCopyScalarPerVector_BN11, type >::b_block_desc_bk0_bn0_bn1_bk1_ |
|
static constexpr |
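Initial value: an expression that references the following function (the expression itself was not preserved in this extract):
__host__ static __device__ constexpr auto MakeBBlockDescriptor_BK0_BN0_BN1_BK1(const BBlockDesc_BK0_BN_BK1 &b_block_desc_bk0_bn_bk1)
Defined at blockwise_gemm_dl_v2r3.hpp:92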
◆ BK0
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_BK0_BM_BK1, typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread, typename BM10BN10ThreadClusterBM10Xs, typename BM10BN10ThreadClusterBN10Xs,
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11, typename
enable_if< ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(), bool >::type = false>
| index_t ck::BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_BK0_BM_BK1, BBlockDesc_BK0_BN_BK1, BM1PerThreadBM11, BN1PerThreadBN11, BK0PerThread, BM10BN10ThreadClusterBM10Xs, BM10BN10ThreadClusterBN10Xs, AThreadCopyScalarPerVector_BM11, BThreadCopyScalarPerVector_BN11, type >::BK0 = ABlockDesc_BK0_BM_BK1{}.GetLength(I0) |
|
static constexpr |
◆ BK1
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_BK0_BM_BK1, typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread, typename BM10BN10ThreadClusterBM10Xs, typename BM10BN10ThreadClusterBN10Xs,
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11, typename
enable_if< ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(), bool >::type = false>
| index_t ck::BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_BK0_BM_BK1, BBlockDesc_BK0_BN_BK1, BM1PerThreadBM11, BN1PerThreadBN11, BK0PerThread, BM10BN10ThreadClusterBM10Xs, BM10BN10ThreadClusterBN10Xs, AThreadCopyScalarPerVector_BM11, BThreadCopyScalarPerVector_BN11, type >::BK1 = ABlockDesc_BK0_BM_BK1{}.GetLength(I2) |
|
static constexpr |
◆ BM
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_BK0_BM_BK1, typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread, typename BM10BN10ThreadClusterBM10Xs, typename BM10BN10ThreadClusterBN10Xs,
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11, typename
enable_if< ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(), bool >::type = false>
| index_t ck::BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_BK0_BM_BK1, BBlockDesc_BK0_BN_BK1, BM1PerThreadBM11, BN1PerThreadBN11, BK0PerThread, BM10BN10ThreadClusterBM10Xs, BM10BN10ThreadClusterBN10Xs, AThreadCopyScalarPerVector_BM11, BThreadCopyScalarPerVector_BN11, type >::BM = ABlockDesc_BK0_BM_BK1{}.GetLength(I1) |
|
static constexpr |
◆ BM0
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_BK0_BM_BK1, typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread, typename BM10BN10ThreadClusterBM10Xs, typename BM10BN10ThreadClusterBN10Xs,
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11, typename
enable_if< ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(), bool >::type = false>
| index_t ck::BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_BK0_BM_BK1, BBlockDesc_BK0_BN_BK1, BM1PerThreadBM11, BN1PerThreadBN11, BK0PerThread, BM10BN10ThreadClusterBM10Xs, BM10BN10ThreadClusterBN10Xs, AThreadCopyScalarPerVector_BM11, BThreadCopyScalarPerVector_BN11, type >::BM0 = BM / BM1 |
|
static constexpr |
◆ BM1
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_BK0_BM_BK1, typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread, typename BM10BN10ThreadClusterBM10Xs, typename BM10BN10ThreadClusterBN10Xs,
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11, typename
enable_if< ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(), bool >::type = false>
| index_t ck::BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_BK0_BM_BK1, BBlockDesc_BK0_BN_BK1, BM1PerThreadBM11, BN1PerThreadBN11, BK0PerThread, BM10BN10ThreadClusterBM10Xs, BM10BN10ThreadClusterBN10Xs, AThreadCopyScalarPerVector_BM11, BThreadCopyScalarPerVector_BN11, type >::BM1 = BM100 * BM101 * BM11 |
|
static constexpr |
◆ BM100
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_BK0_BM_BK1, typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread, typename BM10BN10ThreadClusterBM10Xs, typename BM10BN10ThreadClusterBN10Xs,
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11, typename
enable_if< ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(), bool >::type = false>
| index_t ck::BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_BK0_BM_BK1, BBlockDesc_BK0_BN_BK1, BM1PerThreadBM11, BN1PerThreadBN11, BK0PerThread, BM10BN10ThreadClusterBM10Xs, BM10BN10ThreadClusterBN10Xs, AThreadCopyScalarPerVector_BM11, BThreadCopyScalarPerVector_BN11, type >::BM100 = BM10BN10ThreadClusterBM10Xs{}[I0] |
|
static constexpr |
◆ BM101
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_BK0_BM_BK1, typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread, typename BM10BN10ThreadClusterBM10Xs, typename BM10BN10ThreadClusterBN10Xs,
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11, typename
enable_if< ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(), bool >::type = false>
| index_t ck::BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_BK0_BM_BK1, BBlockDesc_BK0_BN_BK1, BM1PerThreadBM11, BN1PerThreadBN11, BK0PerThread, BM10BN10ThreadClusterBM10Xs, BM10BN10ThreadClusterBN10Xs, AThreadCopyScalarPerVector_BM11, BThreadCopyScalarPerVector_BN11, type >::BM101 = BM10BN10ThreadClusterBM10Xs{}[I1] |
|
static constexpr |
◆ BM11
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_BK0_BM_BK1, typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread, typename BM10BN10ThreadClusterBM10Xs, typename BM10BN10ThreadClusterBN10Xs,
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11, typename
enable_if< ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(), bool >::type = false>
| index_t ck::BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_BK0_BM_BK1, BBlockDesc_BK0_BN_BK1, BM1PerThreadBM11, BN1PerThreadBN11, BK0PerThread, BM10BN10ThreadClusterBM10Xs, BM10BN10ThreadClusterBN10Xs, AThreadCopyScalarPerVector_BM11, BThreadCopyScalarPerVector_BN11, type >::BM11 = BM1PerThreadBM11 |
|
static constexpr |
◆ BN
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_BK0_BM_BK1, typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread, typename BM10BN10ThreadClusterBM10Xs, typename BM10BN10ThreadClusterBN10Xs,
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11, typename
enable_if< ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(), bool >::type = false>
| index_t ck::BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_BK0_BM_BK1, BBlockDesc_BK0_BN_BK1, BM1PerThreadBM11, BN1PerThreadBN11, BK0PerThread, BM10BN10ThreadClusterBM10Xs, BM10BN10ThreadClusterBN10Xs, AThreadCopyScalarPerVector_BM11, BThreadCopyScalarPerVector_BN11, type >::BN = BBlockDesc_BK0_BN_BK1{}.GetLength(I1) |
|
static constexpr |
◆ BN0
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_BK0_BM_BK1, typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread, typename BM10BN10ThreadClusterBM10Xs, typename BM10BN10ThreadClusterBN10Xs,
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11, typename
enable_if< ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(), bool >::type = false>
| index_t ck::BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_BK0_BM_BK1, BBlockDesc_BK0_BN_BK1, BM1PerThreadBM11, BN1PerThreadBN11, BK0PerThread, BM10BN10ThreadClusterBM10Xs, BM10BN10ThreadClusterBN10Xs, AThreadCopyScalarPerVector_BM11, BThreadCopyScalarPerVector_BN11, type >::BN0 = BN / BN1 |
|
static constexpr |
◆ BN1
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_BK0_BM_BK1, typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread, typename BM10BN10ThreadClusterBM10Xs, typename BM10BN10ThreadClusterBN10Xs,
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11, typename
enable_if< ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(), bool >::type = false>
| index_t ck::BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_BK0_BM_BK1, BBlockDesc_BK0_BN_BK1, BM1PerThreadBM11, BN1PerThreadBN11, BK0PerThread, BM10BN10ThreadClusterBM10Xs, BM10BN10ThreadClusterBN10Xs, AThreadCopyScalarPerVector_BM11, BThreadCopyScalarPerVector_BN11, type >::BN1 = BN100 * BN101 * BN11 |
|
static constexpr |
◆ BN100
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_BK0_BM_BK1, typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread, typename BM10BN10ThreadClusterBM10Xs, typename BM10BN10ThreadClusterBN10Xs,
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11, typename
enable_if< ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(), bool >::type = false>
| index_t ck::BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_BK0_BM_BK1, BBlockDesc_BK0_BN_BK1, BM1PerThreadBM11, BN1PerThreadBN11, BK0PerThread, BM10BN10ThreadClusterBM10Xs, BM10BN10ThreadClusterBN10Xs, AThreadCopyScalarPerVector_BM11, BThreadCopyScalarPerVector_BN11, type >::BN100 = BM10BN10ThreadClusterBN10Xs{}[I0] |
|
static constexpr |
◆ BN101
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_BK0_BM_BK1, typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread, typename BM10BN10ThreadClusterBM10Xs, typename BM10BN10ThreadClusterBN10Xs,
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11, typename
enable_if< ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(), bool >::type = false>
| index_t ck::BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_BK0_BM_BK1, BBlockDesc_BK0_BN_BK1, BM1PerThreadBM11, BN1PerThreadBN11, BK0PerThread, BM10BN10ThreadClusterBM10Xs, BM10BN10ThreadClusterBN10Xs, AThreadCopyScalarPerVector_BM11, BThreadCopyScalarPerVector_BN11, type >::BN101 = BM10BN10ThreadClusterBN10Xs{}[I1] |
|
static constexpr |
◆ BN11
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_BK0_BM_BK1, typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread, typename BM10BN10ThreadClusterBM10Xs, typename BM10BN10ThreadClusterBN10Xs,
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11, typename
enable_if< ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(), bool >::type = false>
| index_t ck::BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_BK0_BM_BK1, BBlockDesc_BK0_BN_BK1, BM1PerThreadBM11, BN1PerThreadBN11, BK0PerThread, BM10BN10ThreadClusterBM10Xs, BM10BN10ThreadClusterBN10Xs, AThreadCopyScalarPerVector_BM11, BThreadCopyScalarPerVector_BN11, type >::BN11 = BN1PerThreadBN11 |
|
static constexpr |
◆ I0
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_BK0_BM_BK1, typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread, typename BM10BN10ThreadClusterBM10Xs, typename BM10BN10ThreadClusterBN10Xs,
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11, typename
enable_if< ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(), bool >::type = false>
| auto ck::BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_BK0_BM_BK1, BBlockDesc_BK0_BN_BK1, BM1PerThreadBM11, BN1PerThreadBN11, BK0PerThread, BM10BN10ThreadClusterBM10Xs, BM10BN10ThreadClusterBN10Xs, AThreadCopyScalarPerVector_BM11, BThreadCopyScalarPerVector_BN11, type >::I0 = Number<0>{} |
|
static constexpr |
◆ I1
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_BK0_BM_BK1, typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread, typename BM10BN10ThreadClusterBM10Xs, typename BM10BN10ThreadClusterBN10Xs,
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11, typename
enable_if< ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(), bool >::type = false>
| auto ck::BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_BK0_BM_BK1, BBlockDesc_BK0_BN_BK1, BM1PerThreadBM11, BN1PerThreadBN11, BK0PerThread, BM10BN10ThreadClusterBM10Xs, BM10BN10ThreadClusterBN10Xs, AThreadCopyScalarPerVector_BM11, BThreadCopyScalarPerVector_BN11, type >::I1 = Number<1>{} |
|
static constexpr |
◆ I2
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_BK0_BM_BK1, typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread, typename BM10BN10ThreadClusterBM10Xs, typename BM10BN10ThreadClusterBN10Xs,
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11, typename
enable_if< ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(), bool >::type = false>
| auto ck::BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_BK0_BM_BK1, BBlockDesc_BK0_BN_BK1, BM1PerThreadBM11, BN1PerThreadBN11, BK0PerThread, BM10BN10ThreadClusterBM10Xs, BM10BN10ThreadClusterBN10Xs, AThreadCopyScalarPerVector_BM11, BThreadCopyScalarPerVector_BN11, type >::I2 = Number<2>{} |
|
static constexpr |
◆ I3
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_BK0_BM_BK1, typename BBlockDesc_BK0_BN_BK1,
index_t BM1PerThreadBM11,
index_t BN1PerThreadBN11,
index_t BK0PerThread, typename BM10BN10ThreadClusterBM10Xs, typename BM10BN10ThreadClusterBN10Xs,
index_t AThreadCopyScalarPerVector_BM11,
index_t BThreadCopyScalarPerVector_BN11, typename
enable_if< ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(), bool >::type = false>
| auto ck::BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_BK0_BM_BK1, BBlockDesc_BK0_BN_BK1, BM1PerThreadBM11, BN1PerThreadBN11, BK0PerThread, BM10BN10ThreadClusterBM10Xs, BM10BN10ThreadClusterBN10Xs, AThreadCopyScalarPerVector_BM11, BThreadCopyScalarPerVector_BN11, type >::I3 = Number<3>{} |
|
static constexpr |
The documentation for this struct was generated from the following file: