intrin_mfma_f32_16x16x128f8f6f4< 16, 16 > Struct Reference

intrin_mfma_f32_16x16x128f8f6f4< 16, 16 > Struct Reference

Composable Kernel: ck::intrin_mfma_f32_16x16x128f8f6f4< 16, 16 > Struct Reference
ck::intrin_mfma_f32_16x16x128f8f6f4< 16, 16 > Struct Reference

Performs a matrix fused multiply-accumulate operation on 16x16x128 submatrices for f8f6f4 data types. More...

#include <amd_xdlops.hpp>

Static Public Member Functions

template<class FloatC>
static __device__ void Run (const f8x32_t &reg_a, const f8x32_t &reg_b, FloatC &reg_c)
template<class FloatC>
static __device__ void Run (const bf8x32_t &reg_a, const bf8x32_t &reg_b, FloatC &reg_c)
template<class FloatC>
static __device__ void Run (const bf8x32_t &reg_a, const f8x32_t &reg_b, FloatC &reg_c)
template<class FloatC>
static __device__ void Run (const f8x32_t &reg_a, const bf8x32_t &reg_b, FloatC &reg_c)
template<class FloatC>
static __device__ void Run (const f4x32_t &reg_a, const f4x32_t &reg_b, FloatC &reg_c)
template<class FloatC>
static __device__ void Run (const f6x32_t &reg_a, const f6x32_t &reg_b, FloatC &reg_c)
template<class FloatC>
static __device__ void Run (const bf6x32_t &reg_a, const bf6x32_t &reg_b, FloatC &reg_c)

Detailed Description

Performs a matrix fused multiply-accumulate operation on 16x16x128 submatrices for f8f6f4 data types.

Note
Calls the scaled version of the instruction, because the original (unscaled) instruction is not supported in the backend. This is the intended use: the backend has an optimization that selects the unscaled operation if the scale is 0.

Member Function Documentation

◆ Run() [1/7]

template<class FloatC>
__device__ void ck::intrin_mfma_f32_16x16x128f8f6f4< 16, 16 >::Run ( const bf6x32_t & reg_a,
const bf6x32_t & reg_b,
FloatC & reg_c )
inline static

◆ Run() [2/7]

template<class FloatC>
__device__ void ck::intrin_mfma_f32_16x16x128f8f6f4< 16, 16 >::Run ( const bf8x32_t & reg_a,
const bf8x32_t & reg_b,
FloatC & reg_c )
inline static

◆ Run() [3/7]

template<class FloatC>
__device__ void ck::intrin_mfma_f32_16x16x128f8f6f4< 16, 16 >::Run ( const bf8x32_t & reg_a,
const f8x32_t & reg_b,
FloatC & reg_c )
inline static

◆ Run() [4/7]

template<class FloatC>
__device__ void ck::intrin_mfma_f32_16x16x128f8f6f4< 16, 16 >::Run ( const f4x32_t & reg_a,
const f4x32_t & reg_b,
FloatC & reg_c )
inline static

◆ Run() [5/7]

template<class FloatC>
__device__ void ck::intrin_mfma_f32_16x16x128f8f6f4< 16, 16 >::Run ( const f6x32_t & reg_a,
const f6x32_t & reg_b,
FloatC & reg_c )
inline static

◆ Run() [6/7]

template<class FloatC>
__device__ void ck::intrin_mfma_f32_16x16x128f8f6f4< 16, 16 >::Run ( const f8x32_t & reg_a,
const bf8x32_t & reg_b,
FloatC & reg_c )
inline static

◆ Run() [7/7]

template<class FloatC>
__device__ void ck::intrin_mfma_f32_16x16x128f8f6f4< 16, 16 >::Run ( const f8x32_t & reg_a,
const f8x32_t & reg_b,
FloatC & reg_c )
inline static

The documentation for this struct was generated from the following file: