HIP: Heterogeneous-computing Interface for Portability
hip_cooperative_groups_helper.h
/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H

#if __cplusplus
#if !defined(__HIPCC_RTC__)
#include <hip/amd_detail/amd_hip_runtime.h>  // threadIdx, blockIdx
#include <hip/amd_detail/amd_device_functions.h>
#endif
#if !defined(__align__)
#define __align__(x) __attribute__((aligned(x)))
#endif

#if !defined(__CG_QUALIFIER__)
#define __CG_QUALIFIER__ __device__ __forceinline__
#endif

#if !defined(__CG_STATIC_QUALIFIER__)
#define __CG_STATIC_QUALIFIER__ __device__ static __forceinline__
#endif

#if !defined(_CG_STATIC_CONST_DECL_)
#define _CG_STATIC_CONST_DECL_ static constexpr
#endif
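
// A lane_mask holds one bit per lane of a wavefront: 32 bits on wave32
// targets, 64 bits on wave64 targets.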
#if __AMDGCN_WAVEFRONT_SIZE == 32
using lane_mask = unsigned int;
#else
using lane_mask = unsigned long long int;
#endif

namespace cooperative_groups {

/* Global scope */
template <unsigned int size>
using is_power_of_2 = std::integral_constant<bool, (size & (size - 1)) == 0>;

template <unsigned int size>
using is_valid_wavefront = std::integral_constant<bool, (size <= __AMDGCN_WAVEFRONT_SIZE)>;

template <unsigned int size>
using is_valid_tile_size =
    std::integral_constant<bool, is_power_of_2<size>::value && is_valid_wavefront<size>::value>;

template <typename T>
using is_valid_type =
    std::integral_constant<bool, std::is_integral<T>::value || std::is_floating_point<T>::value>;

namespace internal {
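
// Identifies which kind of cooperative group an object represents.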
79
85typedef enum {
86 cg_invalid,
87 cg_multi_grid,
88 cg_grid,
89 cg_workgroup,
90 cg_tiled_group,
91 cg_coalesced_group
92} group_type;
105namespace helper {
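
// Compacts input_mask onto the lanes that are active in base_mask: for each
// set bit i of base_mask, the corresponding bit of input_mask is copied into
// the next free (lowest unused) bit of the result.
// Example: base_mask = 0b1010, input_mask = 0b1000 -> returns 0b10.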
__CG_STATIC_QUALIFIER__ unsigned long long adjust_mask(
    unsigned long long base_mask, unsigned long long input_mask) {
  unsigned long long out = 0;
  for (unsigned int i = 0, index = 0; i < __AMDGCN_WAVEFRONT_SIZE; i++) {
    auto lane_active = base_mask & (1ull << i);
    if (lane_active) {
      auto result = input_mask & (1ull << i);
      out |= ((result ? 1ull : 0ull) << index);
      index++;
    }
  }
  return out;
}
} // namespace helper

namespace multi_grid {
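
// Thin wrappers over the __ockl_multi_grid_* device-library calls that back
// multi-grid cooperative groups.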
__CG_STATIC_QUALIFIER__ uint32_t num_grids() {
  return static_cast<uint32_t>(__ockl_multi_grid_num_grids());
}

__CG_STATIC_QUALIFIER__ uint32_t grid_rank() {
  return static_cast<uint32_t>(__ockl_multi_grid_grid_rank());
}

__CG_STATIC_QUALIFIER__ uint32_t size() { return static_cast<uint32_t>(__ockl_multi_grid_size()); }

__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
  return static_cast<uint32_t>(__ockl_multi_grid_thread_rank());
}

__CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast<bool>(__ockl_multi_grid_is_valid()); }

__CG_STATIC_QUALIFIER__ void sync() { __ockl_multi_grid_sync(); }

} // namespace multi_grid

namespace grid {
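
// Helpers for grid-wide cooperative groups: size and thread rank are derived
// from the built-in block/grid dimensions, while validity and synchronization
// go through the __ockl_grid_* device-library calls.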
__CG_STATIC_QUALIFIER__ uint32_t size() {
  return static_cast<uint32_t>((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y) *
                               (blockDim.x * gridDim.x));
}

__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
  // Compute the global id of the workgroup to which the current thread belongs
  uint32_t blkIdx = static_cast<uint32_t>((blockIdx.z * gridDim.y * gridDim.x) +
                                          (blockIdx.y * gridDim.x) + (blockIdx.x));

  // Compute the total number of threads in the workgroups that precede the
  // current workgroup within the grid
  uint32_t num_threads_till_current_workgroup =
      static_cast<uint32_t>(blkIdx * (blockDim.x * blockDim.y * blockDim.z));

  // Compute the thread's local rank within the current workgroup
  uint32_t local_thread_rank = static_cast<uint32_t>((threadIdx.z * blockDim.y * blockDim.x) +
                                                     (threadIdx.y * blockDim.x) + (threadIdx.x));

  return (num_threads_till_current_workgroup + local_thread_rank);
}

__CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast<bool>(__ockl_grid_is_valid()); }

__CG_STATIC_QUALIFIER__ void sync() { __ockl_grid_sync(); }

} // namespace grid

namespace workgroup {
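
// Helpers describing the current workgroup (thread block): its index, its
// dimensions, the calling thread's index and rank within it, and a
// block-level barrier.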
__CG_STATIC_QUALIFIER__ dim3 group_index() {
  return (dim3(static_cast<uint32_t>(blockIdx.x), static_cast<uint32_t>(blockIdx.y),
               static_cast<uint32_t>(blockIdx.z)));
}

__CG_STATIC_QUALIFIER__ dim3 thread_index() {
  return (dim3(static_cast<uint32_t>(threadIdx.x), static_cast<uint32_t>(threadIdx.y),
               static_cast<uint32_t>(threadIdx.z)));
}

__CG_STATIC_QUALIFIER__ uint32_t size() {
  return (static_cast<uint32_t>(blockDim.x * blockDim.y * blockDim.z));
}

__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
  return (static_cast<uint32_t>((threadIdx.z * blockDim.y * blockDim.x) +
                                (threadIdx.y * blockDim.x) + (threadIdx.x)));
}

__CG_STATIC_QUALIFIER__ bool is_valid() { return true; }

__CG_STATIC_QUALIFIER__ void sync() { __syncthreads(); }

__CG_STATIC_QUALIFIER__ dim3 block_dim() {
  return (dim3(static_cast<uint32_t>(blockDim.x), static_cast<uint32_t>(blockDim.y),
               static_cast<uint32_t>(blockDim.z)));
}

} // namespace workgroup

namespace tiled_group {

// Enforce ordering of memory instructions
__CG_STATIC_QUALIFIER__ void sync() { __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent"); }

} // namespace tiled_group

namespace coalesced_group {

// Enforce ordering of memory instructions
__CG_STATIC_QUALIFIER__ void sync() { __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent"); }

// Masked bit count
//
// For each thread, this function returns the number of lanes that come before
// the current thread and have their bit set in x, plus the optional starting
// count add.
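// For example, on a wave32 device with x = 0b1011 and add = 0, the thread in
// lane 3 gets 2: lanes 0 and 1 lie below it and have their bits set in x.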
__CG_STATIC_QUALIFIER__ unsigned int masked_bit_count(lane_mask x, unsigned int add = 0) {
  unsigned int counter = 0;
#if __AMDGCN_WAVEFRONT_SIZE == 32
  counter = __builtin_amdgcn_mbcnt_lo(x, add);
#else
  counter = __builtin_amdgcn_mbcnt_lo(static_cast<lane_mask>(x), add);
  counter = __builtin_amdgcn_mbcnt_hi(static_cast<lane_mask>(x >> 32), counter);
#endif

  return counter;
}

} // namespace coalesced_group

} // namespace internal

} // namespace cooperative_groups
#endif // __cplusplus
#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H