rotating_buffers.hpp Source File

rotating_buffers.hpp Source File#

Composable Kernel: rotating_buffers.hpp Source File
rotating_buffers.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: MIT
2// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
3
4#pragma once
5
8#include <hip/hip_runtime.h>
9
10namespace ck_tile {
11
12// RotatingMemWrapper: Prevents GPU data cache reuse during kernel benchmarking.
13//
14// Purpose:
15// When benchmarking a kernel repeatedly with the same input buffers, the GPU L2 cache
16// will serve data from cache (hot) instead of HBM (cold), leading to artificially fast
17// timing measurements. This wrapper rotates through multiple copies of buffers at different
18// memory addresses to force cache misses.
19//
20// How it works:
21// Constructor: Creates rotating_count copies of matrices A and B in GPU memory
22// Next(): Switches pointers to the next buffer copy (cycles through all copies)
23// Destructor: Frees extra buffer copies and restores original pointers
24//
25// Combined with flush_icache(), this ensures realistic "cold cache" performance measurements.
26template <typename ADataType, typename BDataType>
28{
30 RotatingMemWrapper(const void* a_ptr_,
31 const void* b_ptr_,
32 std::size_t rotating_count_hint,
33 std::size_t size_a_,
34 std::size_t size_b_)
35 : a_ptr(a_ptr_),
36 b_ptr(b_ptr_),
37 rotating_count(rotating_count_hint),
38 size_a(size_a_),
39 size_b(size_b_)
40 {
41 // Store original buffer pointers as first entry
42 p_a_grids.push_back(a_ptr);
43 p_b_grids.push_back(b_ptr);
44
45 // limit the rotating count to prevent oom
46 const uint64_t footprint = (size_a + size_b);
47 const uint64_t max_rotating_count = (1ULL << 31) / footprint;
48 rotating_count = std::min(rotating_count, max_rotating_count);
49
50 // Create (rotating_count - 1) additional copies at different memory addresses
51 for(size_t i = 1; i < rotating_count; i++)
52 {
53 {
54 void* pADeviceBuf;
55 HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&pADeviceBuf), size_a_));
56 HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pADeviceBuf), // target buffer
57 const_cast<void*>(p_a_grids[0]), // source buffer
58 size_a_,
59 hipMemcpyDeviceToDevice));
60 p_a_grids.push_back(pADeviceBuf);
61 }
62
63 {
64 void* pBDeviceBuf;
65 HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&pBDeviceBuf), size_b_));
66 HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pBDeviceBuf), // target buffer
67 const_cast<void*>(p_b_grids[0]), // source buffer
68 size_b_,
69 hipMemcpyDeviceToDevice));
70 p_b_grids.push_back(pBDeviceBuf);
71 }
72 }
73 }
74 // Rotate to the next buffer copy. Call this before each kernel run to use different
75 // memory addresses, forcing the GPU to fetch data from HBM instead of cache.
76 void Next()
77 {
78 if(rotating_count > 1)
79 {
80 std::size_t idx = iter++ % rotating_count; // Cycle through all buffer copies
81 a_ptr = p_a_grids[idx];
82 b_ptr = p_b_grids[idx];
83 }
84 }
85 void Print()
86 {
87 std::cout << "RotatingMemWrapper: { size_a: " << size_a << ", size_b: " << size_b
88 << ", rotating_count: " << rotating_count << "}" << std::endl;
89 }
90 // Cleanup: Free all extra buffer copies (keeping original) and restore original pointers
92 {
93 if(rotating_count > 1)
94 {
95 // Restore original buffer pointers
96 a_ptr = p_a_grids[0];
97 b_ptr = p_b_grids[0];
98
99 // Free extra buffer copies (index 0 is the original, don't free it)
100 for(size_t i = 1; i < rotating_count; i++)
101 {
102 ck_tile::hip_check_error(hipFree(const_cast<void*>(p_a_grids[i])));
103 ck_tile::hip_check_error(hipFree(const_cast<void*>(p_b_grids[i])));
104 }
105 }
106 }
107
108 private:
109 const void* a_ptr;
110 const void* b_ptr;
111 std::size_t iter = 0;
112 std::size_t rotating_count = 1;
113 std::size_t size_a = 0;
114 std::size_t size_b = 0;
115 std::vector<const void*> p_a_grids;
116 std::vector<const void*> p_b_grids;
117};
118inline void flush_icache()
119{
120 hipDeviceProp_t deviceProps;
121 HIP_CHECK_ERROR(hipGetDeviceProperties(&deviceProps, 0));
122
123 // Over-provision blocks to ensure all CUs execute the flush instruction.
124 // With imperfect scheduling, launching exactly 1 block per CU doesn't guarantee coverage.
125 // 60x over-provisioning provides statistical certainty that every CU gets at least one block.
126 constexpr int32_t blocks_per_cu = 60;
127 int32_t gpu_block3 = deviceProps.multiProcessorCount * blocks_per_cu;
128
129 ck_tile::flush_cache<<<dim3(gpu_block3), dim3(64), 0, nullptr>>>();
130 HIP_CHECK_ERROR(hipGetLastError());
131}
132} // namespace ck_tile
#define HIP_CHECK_ERROR(retval_or_funcall)
Definition host_utility/hip_check_error.hpp:21
Definition tile/core/algorithm/cluster_descriptor.hpp:13
CK_TILE_HOST void hip_check_error(hipError_t x)
Definition tile/host/hip_check_error.hpp:13
int32_t int32_t
Definition integer.hpp:10
void flush_icache()
Definition rotating_buffers.hpp:118
unsigned __int64 uint64_t
Definition stdint.h:136
void Print()
Definition rotating_buffers.hpp:85
void Next()
Definition rotating_buffers.hpp:76
RotatingMemWrapper(const void *a_ptr_, const void *b_ptr_, std::size_t rotating_count_hint, std::size_t size_a_, std::size_t size_b_)
Definition rotating_buffers.hpp:30
~RotatingMemWrapper() noexcept
Definition rotating_buffers.hpp:91