HIP: Heterogeneous-computing Interface for Portability
Loading...
Searching...
No Matches
amd_device_functions.h
1/*
2Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
3
4Permission is hereby granted, free of charge, to any person obtaining a copy
5of this software and associated documentation files (the "Software"), to deal
6in the Software without restriction, including without limitation the rights
7to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8copies of the Software, and to permit persons to whom the Software is
9furnished to do so, subject to the following conditions:
10
11The above copyright notice and this permission notice shall be included in
12all copies or substantial portions of the Software.
13
14THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20THE SOFTWARE.
21*/
22
23#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
24#define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
25
26#if !defined(__HIPCC_RTC__)
27#include <hip/amd_detail/amd_hip_common.h>
29#include <hip/amd_detail/hip_assert.h>
30#include "host_defines.h"
31#include "math_fwd.h"
32#include <hip/hip_runtime_api.h>
33#include <stddef.h>
34#include <hip/hip_vector_types.h>
35#endif // !defined(__HIPCC_RTC__)
36
37#if defined(__clang__) && defined(__HIP__)
38extern "C" __device__ int printf(const char *fmt, ...);
39#else
40template <typename... All>
41static inline __device__ void printf(const char* format, All... all) {}
42#endif
43
44extern "C" __device__ unsigned long long __ockl_steadyctr_u64();
45
46/*
47Integer Intrinsics
48*/
49
50// integer intrinsic function __poc __clz __ffs __brev
51__device__ static inline unsigned int __popc(unsigned int input) {
52 return __builtin_popcount(input);
53}
54__device__ static inline unsigned int __popcll(unsigned long long int input) {
55 return __builtin_popcountll(input);
56}
57
58__device__ static inline int __clz(int input) {
59 return __ockl_clz_u32((uint)input);
60}
61
62__device__ static inline int __clzll(long long int input) {
63 return __ockl_clz_u64((uint64_t)input);
64}
65
66__device__ static inline int __ffs(unsigned int input) {
67 return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
68}
69
70__device__ static inline int __ffsll(unsigned long long int input) {
71 return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
72}
73
74__device__ static inline int __ffs(int input) {
75 return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
76}
77
78__device__ static inline int __ffsll(long long int input) {
79 return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
80}
81
82// Given a 32/64-bit value exec mask and an integer value base (between 0 and WAVEFRONT_SIZE),
83// find the n-th (given by offset) set bit in the exec mask from the base bit, and return the bit position.
84// If not found, return -1.
85__device__ static int32_t __fns64(uint64_t mask, uint32_t base, int32_t offset) {
86 uint64_t temp_mask = mask;
87 int32_t temp_offset = offset;
88
89 if (offset == 0) {
90 temp_mask &= (1 << base);
91 temp_offset = 1;
92 }
93 else if (offset < 0) {
94 temp_mask = __builtin_bitreverse64(mask);
95 base = 63 - base;
96 temp_offset = -offset;
97 }
98
99 temp_mask = temp_mask & ((~0ULL) << base);
100 if (__builtin_popcountll(temp_mask) < temp_offset)
101 return -1;
102 int32_t total = 0;
103 for (int i = 0x20; i > 0; i >>= 1) {
104 uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
105 int32_t pcnt = __builtin_popcountll(temp_mask_lo);
106 if (pcnt < temp_offset) {
107 temp_mask = temp_mask >> i;
108 temp_offset -= pcnt;
109 total += i;
110 }
111 else {
112 temp_mask = temp_mask_lo;
113 }
114 }
115 if (offset < 0)
116 return 63 - total;
117 else
118 return total;
119}
120
121__device__ static int32_t __fns32(uint32_t mask, uint32_t base, int32_t offset) {
122 uint32_t temp_mask = mask;
123 int32_t temp_offset = offset;
124 if (offset == 0) {
125 temp_mask &= (1 << base);
126 temp_offset = 1;
127 }
128 else if (offset < 0) {
129 temp_mask = __builtin_bitreverse32(mask);
130 base = 31 - base;
131 temp_offset = -offset;
132 }
133 temp_mask = temp_mask & ((~0U) << base);
134 if (__builtin_popcount(temp_mask) < temp_offset)
135 return -1;
136 int32_t total = 0;
137 for (int i = 0x10; i > 0; i >>= 1) {
138 uint32_t temp_mask_lo = temp_mask & ((1U << i) - 1);
139 int32_t pcnt = __builtin_popcount(temp_mask_lo);
140 if (pcnt < temp_offset) {
141 temp_mask = temp_mask >> i;
142 temp_offset -= pcnt;
143 total += i;
144 }
145 else {
146 temp_mask = temp_mask_lo;
147 }
148 }
149 if (offset < 0)
150 return 31 - total;
151 else
152 return total;
153}
154
155// Wrapper around __fns32() to make porting from CUDA easier
156__device__ static int32_t __fns(unsigned int mask, unsigned int base, int offset) {
157 return __fns32(mask, base, offset);
158}
159
160__device__ static inline unsigned int __brev(unsigned int input) {
161 return __builtin_bitreverse32(input);
162}
163
164__device__ static inline unsigned long long int __brevll(unsigned long long int input) {
165 return __builtin_bitreverse64(input);
166}
167
168__device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
169 return input == 0 ? -1 : __builtin_ctzl(input);
170}
171
172__device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
173 uint32_t offset = src1 & 31;
174 uint32_t width = src2 & 31;
175 return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
176}
177
178__device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
179 uint64_t offset = src1 & 63;
180 uint64_t width = src2 & 63;
181 return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
182}
183
184__device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
185 uint32_t offset = src2 & 31;
186 uint32_t width = src3 & 31;
187 uint32_t mask = (1 << width) - 1;
188 return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
189}
190
191__device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
192 uint64_t offset = src2 & 63;
193 uint64_t width = src3 & 63;
194 uint64_t mask = (1ULL << width) - 1;
195 return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
196}
197
198__device__ inline unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift)
199{
200 uint32_t mask_shift = shift & 31;
201 return mask_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - mask_shift);
202}
203
204__device__ inline unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift)
205{
206 uint32_t min_shift = shift >= 32 ? 32 : shift;
207 return min_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - min_shift);
208}
209
210__device__ inline unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift)
211{
212 return __builtin_amdgcn_alignbit(hi, lo, shift);
213}
214
215__device__ inline unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift)
216{
217 return shift >= 32 ? hi : __builtin_amdgcn_alignbit(hi, lo, shift);
218}
219
220__device__ static unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s);
221__device__ static unsigned int __hadd(int x, int y);
222__device__ static int __mul24(int x, int y);
223__device__ static long long int __mul64hi(long long int x, long long int y);
224__device__ static int __mulhi(int x, int y);
225__device__ static int __rhadd(int x, int y);
226__device__ static unsigned int __sad(int x, int y,unsigned int z);
227__device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
228__device__ static int __umul24(unsigned int x, unsigned int y);
229__device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
230__device__ static unsigned int __umulhi(unsigned int x, unsigned int y);
231__device__ static unsigned int __urhadd(unsigned int x, unsigned int y);
232__device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z);
233
// Byte-addressable view of one 32-bit word (the selector of __byte_perm).
// NOTE(review): the "struct ucharHolder {" header line was lost in
// extraction; the name is reconstructed from its use sites.
struct ucharHolder {
  union {
    unsigned char c[4];
    unsigned int ui;
  };
} __attribute__((aligned(4)));
240
// Byte-addressable view of two 32-bit words (the 8 source bytes of
// __byte_perm). NOTE(review): the "struct uchar2Holder {" header line was
// lost in extraction; the name is reconstructed from its use sites.
struct uchar2Holder {
  union {
    unsigned int ui[2];
    unsigned char c[8];
  };
} __attribute__((aligned(8)));
247
248__device__
249static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
250 struct uchar2Holder cHoldVal;
251 struct ucharHolder cHoldKey;
252 cHoldKey.ui = s;
253 cHoldVal.ui[0] = x;
254 cHoldVal.ui[1] = y;
255 unsigned int result;
256 result = cHoldVal.c[cHoldKey.c[0] & 0x07];
257 result += (cHoldVal.c[(cHoldKey.c[0] & 0x70) >> 4] << 8);
258 result += (cHoldVal.c[cHoldKey.c[1] & 0x07] << 16);
259 result += (cHoldVal.c[(cHoldKey.c[1] & 0x70) >> 4] << 24);
260 return result;
261}
262
263__device__ static inline unsigned int __hadd(int x, int y) {
264 int z = x + y;
265 int sign = z & 0x8000000;
266 int value = z & 0x7FFFFFFF;
267 return ((value) >> 1 || sign);
268}
269
270__device__ static inline int __mul24(int x, int y) {
271 return __ockl_mul24_i32(x, y);
272}
273
274__device__ static inline long long __mul64hi(long long int x, long long int y) {
275 unsigned long long x0 = (unsigned long long)x & 0xffffffffUL;
276 long long x1 = x >> 32;
277 unsigned long long y0 = (unsigned long long)y & 0xffffffffUL;
278 long long y1 = y >> 32;
279 unsigned long long z0 = x0*y0;
280 long long t = x1*y0 + (z0 >> 32);
281 long long z1 = t & 0xffffffffL;
282 long long z2 = t >> 32;
283 z1 = x0*y1 + z1;
284 return x1*y1 + z2 + (z1 >> 32);
285}
286
287__device__ static inline int __mulhi(int x, int y) {
288 return __ockl_mul_hi_i32(x, y);
289}
290
291__device__ static inline int __rhadd(int x, int y) {
292 int z = x + y + 1;
293 int sign = z & 0x8000000;
294 int value = z & 0x7FFFFFFF;
295 return ((value) >> 1 || sign);
296}
297__device__ static inline unsigned int __sad(int x, int y, unsigned int z) {
298 return x > y ? x - y + z : y - x + z;
299}
300__device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
301 return (x + y) >> 1;
302}
303__device__ static inline int __umul24(unsigned int x, unsigned int y) {
304 return __ockl_mul24_u32(x, y);
305}
306
307__device__
308static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
309 unsigned long long x0 = x & 0xffffffffUL;
310 unsigned long long x1 = x >> 32;
311 unsigned long long y0 = y & 0xffffffffUL;
312 unsigned long long y1 = y >> 32;
313 unsigned long long z0 = x0*y0;
314 unsigned long long t = x1*y0 + (z0 >> 32);
315 unsigned long long z1 = t & 0xffffffffUL;
316 unsigned long long z2 = t >> 32;
317 z1 = x0*y1 + z1;
318 return x1*y1 + z2 + (z1 >> 32);
319}
320
321__device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
322 return __ockl_mul_hi_u32(x, y);
323}
324__device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
325 return (x + y + 1) >> 1;
326}
327__device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
328 return __ockl_sadd_u32(x, y, z);
329}
330
331__device__
332static inline unsigned int __mbcnt_lo(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_lo(x,y);};
333
334__device__
335static inline unsigned int __mbcnt_hi(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_hi(x,y);};
336
337/*
338HIP specific device functions
339*/
340
341#if !defined(__HIPCC_RTC__)
342#include "amd_warp_functions.h"
343#include "amd_warp_sync_functions.h"
344#endif
345
346#define MASK1 0x00ff00ff
347#define MASK2 0xff00ff00
348
349__device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
350 char4 out;
351 unsigned one1 = in1.w & MASK1;
352 unsigned one2 = in2.w & MASK1;
353 out.w = (one1 + one2) & MASK1;
354 one1 = in1.w & MASK2;
355 one2 = in2.w & MASK2;
356 out.w = out.w | ((one1 + one2) & MASK2);
357 return out;
358}
359
360__device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
361 char4 out;
362 unsigned one1 = in1.w & MASK1;
363 unsigned one2 = in2.w & MASK1;
364 out.w = (one1 - one2) & MASK1;
365 one1 = in1.w & MASK2;
366 one2 = in2.w & MASK2;
367 out.w = out.w | ((one1 - one2) & MASK2);
368 return out;
369}
370
371__device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
372 char4 out;
373 unsigned one1 = in1.w & MASK1;
374 unsigned one2 = in2.w & MASK1;
375 out.w = (one1 * one2) & MASK1;
376 one1 = in1.w & MASK2;
377 one2 = in2.w & MASK2;
378 out.w = out.w | ((one1 * one2) & MASK2);
379 return out;
380}
381
382__device__ static inline float __double2float_rd(double x) {
383 return __ocml_cvtrtn_f32_f64(x);
384}
385__device__ static inline float __double2float_rn(double x) { return x; }
386__device__ static inline float __double2float_ru(double x) {
387 return __ocml_cvtrtp_f32_f64(x);
388}
389__device__ static inline float __double2float_rz(double x) {
390 return __ocml_cvtrtz_f32_f64(x);
391}
392
393__device__ static inline int __double2hiint(double x) {
394 static_assert(sizeof(double) == 2 * sizeof(int), "");
395
396 int tmp[2];
397 __builtin_memcpy(tmp, &x, sizeof(tmp));
398
399 return tmp[1];
400}
401__device__ static inline int __double2loint(double x) {
402 static_assert(sizeof(double) == 2 * sizeof(int), "");
403
404 int tmp[2];
405 __builtin_memcpy(tmp, &x, sizeof(tmp));
406
407 return tmp[0];
408}
409
410__device__ static inline int __double2int_rd(double x) { return (int)__ocml_floor_f64(x); }
411__device__ static inline int __double2int_rn(double x) { return (int)__ocml_rint_f64(x); }
412__device__ static inline int __double2int_ru(double x) { return (int)__ocml_ceil_f64(x); }
413__device__ static inline int __double2int_rz(double x) { return (int)x; }
414
415__device__ static inline long long int __double2ll_rd(double x) {
416 return (long long)__ocml_floor_f64(x);
417}
418__device__ static inline long long int __double2ll_rn(double x) {
419 return (long long)__ocml_rint_f64(x);
420}
421__device__ static inline long long int __double2ll_ru(double x) {
422 return (long long)__ocml_ceil_f64(x);
423}
424__device__ static inline long long int __double2ll_rz(double x) { return (long long)x; }
425
426__device__ static inline unsigned int __double2uint_rd(double x) {
427 return (unsigned int)__ocml_floor_f64(x);
428}
429__device__ static inline unsigned int __double2uint_rn(double x) {
430 return (unsigned int)__ocml_rint_f64(x);
431}
432__device__ static inline unsigned int __double2uint_ru(double x) {
433 return (unsigned int)__ocml_ceil_f64(x);
434}
435__device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }
436
437__device__ static inline unsigned long long int __double2ull_rd(double x) {
438 return (unsigned long long int)__ocml_floor_f64(x);
439}
440__device__ static inline unsigned long long int __double2ull_rn(double x) {
441 return (unsigned long long int)__ocml_rint_f64(x);
442}
443__device__ static inline unsigned long long int __double2ull_ru(double x) {
444 return (unsigned long long int)__ocml_ceil_f64(x);
445}
446__device__ static inline unsigned long long int __double2ull_rz(double x) {
447 return (unsigned long long int)x;
448}
449__device__ static inline long long int __double_as_longlong(double x) {
450 static_assert(sizeof(long long) == sizeof(double), "");
451
452 long long tmp;
453 __builtin_memcpy(&tmp, &x, sizeof(tmp));
454
455 return tmp;
456}
457
458/*
459__device__ unsigned short __float2half_rn(float x);
460__device__ float __half2float(unsigned short);
461
462The above device function are not a valid .
463Use
464__device__ __half __float2half_rn(float x);
465__device__ float __half2float(__half);
466from hip_fp16.h
467
468CUDA implements half as unsigned short whereas, HIP doesn't.
469
470*/
471
472__device__ static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
473__device__ static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
474__device__ static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
475__device__ static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }
476
477__device__ static inline long long int __float2ll_rd(float x) {
478 return (long long int)__ocml_floor_f32(x);
479}
480__device__ static inline long long int __float2ll_rn(float x) {
481 return (long long int)__ocml_rint_f32(x);
482}
483__device__ static inline long long int __float2ll_ru(float x) {
484 return (long long int)__ocml_ceil_f32(x);
485}
486__device__ static inline long long int __float2ll_rz(float x) { return (long long int)x; }
487
488__device__ static inline unsigned int __float2uint_rd(float x) {
489 return (unsigned int)__ocml_floor_f32(x);
490}
491__device__ static inline unsigned int __float2uint_rn(float x) {
492 return (unsigned int)__ocml_rint_f32(x);
493}
494__device__ static inline unsigned int __float2uint_ru(float x) {
495 return (unsigned int)__ocml_ceil_f32(x);
496}
497__device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }
498
499__device__ static inline unsigned long long int __float2ull_rd(float x) {
500 return (unsigned long long int)__ocml_floor_f32(x);
501}
502__device__ static inline unsigned long long int __float2ull_rn(float x) {
503 return (unsigned long long int)__ocml_rint_f32(x);
504}
505__device__ static inline unsigned long long int __float2ull_ru(float x) {
506 return (unsigned long long int)__ocml_ceil_f32(x);
507}
508__device__ static inline unsigned long long int __float2ull_rz(float x) {
509 return (unsigned long long int)x;
510}
511
512__device__ static inline int __float_as_int(float x) {
513 static_assert(sizeof(int) == sizeof(float), "");
514
515 int tmp;
516 __builtin_memcpy(&tmp, &x, sizeof(tmp));
517
518 return tmp;
519}
520
521__device__ static inline unsigned int __float_as_uint(float x) {
522 static_assert(sizeof(unsigned int) == sizeof(float), "");
523
524 unsigned int tmp;
525 __builtin_memcpy(&tmp, &x, sizeof(tmp));
526
527 return tmp;
528}
529
530__device__ static inline double __hiloint2double(int hi, int lo) {
531 static_assert(sizeof(double) == sizeof(uint64_t), "");
532
533 uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
534 double tmp1;
535 __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
536
537 return tmp1;
538}
539
540__device__ static inline double __int2double_rn(int x) { return (double)x; }
541
542__device__ static inline float __int2float_rd(int x) {
543 return __ocml_cvtrtn_f32_s32(x);
544}
545__device__ static inline float __int2float_rn(int x) { return (float)x; }
546__device__ static inline float __int2float_ru(int x) {
547 return __ocml_cvtrtp_f32_s32(x);
548}
549__device__ static inline float __int2float_rz(int x) {
550 return __ocml_cvtrtz_f32_s32(x);
551}
552
553__device__ static inline float __int_as_float(int x) {
554 static_assert(sizeof(float) == sizeof(int), "");
555
556 float tmp;
557 __builtin_memcpy(&tmp, &x, sizeof(tmp));
558
559 return tmp;
560}
561
562__device__ static inline double __ll2double_rd(long long int x) {
563 return __ocml_cvtrtn_f64_s64(x);
564}
565__device__ static inline double __ll2double_rn(long long int x) { return (double)x; }
566__device__ static inline double __ll2double_ru(long long int x) {
567 return __ocml_cvtrtp_f64_s64(x);
568}
569__device__ static inline double __ll2double_rz(long long int x) {
570 return __ocml_cvtrtz_f64_s64(x);
571}
572
573__device__ static inline float __ll2float_rd(long long int x) {
574 return __ocml_cvtrtn_f32_s64(x);
575}
576__device__ static inline float __ll2float_rn(long long int x) { return (float)x; }
577__device__ static inline float __ll2float_ru(long long int x) {
578 return __ocml_cvtrtp_f32_s64(x);
579}
580__device__ static inline float __ll2float_rz(long long int x) {
581 return __ocml_cvtrtz_f32_s64(x);
582}
583
584__device__ static inline double __longlong_as_double(long long int x) {
585 static_assert(sizeof(double) == sizeof(long long), "");
586
587 double tmp;
588 __builtin_memcpy(&tmp, &x, sizeof(tmp));
589
590 return tmp;
591}
592
593__device__ static inline double __uint2double_rn(unsigned int x) { return (double)x; }
594
595__device__ static inline float __uint2float_rd(unsigned int x) {
596 return __ocml_cvtrtn_f32_u32(x);
597}
598__device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; }
599__device__ static inline float __uint2float_ru(unsigned int x) {
600 return __ocml_cvtrtp_f32_u32(x);
601}
602__device__ static inline float __uint2float_rz(unsigned int x) {
603 return __ocml_cvtrtz_f32_u32(x);
604}
605
606__device__ static inline float __uint_as_float(unsigned int x) {
607 static_assert(sizeof(float) == sizeof(unsigned int), "");
608
609 float tmp;
610 __builtin_memcpy(&tmp, &x, sizeof(tmp));
611
612 return tmp;
613}
614
615__device__ static inline double __ull2double_rd(unsigned long long int x) {
616 return __ocml_cvtrtn_f64_u64(x);
617}
618__device__ static inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
619__device__ static inline double __ull2double_ru(unsigned long long int x) {
620 return __ocml_cvtrtp_f64_u64(x);
621}
622__device__ static inline double __ull2double_rz(unsigned long long int x) {
623 return __ocml_cvtrtz_f64_u64(x);
624}
625
626__device__ static inline float __ull2float_rd(unsigned long long int x) {
627 return __ocml_cvtrtn_f32_u64(x);
628}
629__device__ static inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
630__device__ static inline float __ull2float_ru(unsigned long long int x) {
631 return __ocml_cvtrtp_f32_u64(x);
632}
633__device__ static inline float __ull2float_rz(unsigned long long int x) {
634 return __ocml_cvtrtz_f32_u64(x);
635}
636
637#if defined(__clang__) && defined(__HIP__)
638
639// Clock functions
640__device__ long long int __clock64();
641__device__ long long int __clock();
642__device__ long long int clock64();
643__device__ long long int clock();
644__device__ long long int wall_clock64();
645// hip.amdgcn.bc - named sync
646__device__ void __named_sync();
647
648#ifdef __HIP_DEVICE_COMPILE__
649
650// Clock function to return GPU core cycle count.
651// GPU can change its core clock frequency at runtime. The maximum frequency can be queried
652// through hipDeviceAttributeClockRate attribute.
653__device__
654inline __attribute((always_inline))
655long long int __clock64() {
656#if __has_builtin(__builtin_amdgcn_s_memtime)
657 // Exists on gfx8, gfx9, gfx10.1, gfx10.2, gfx10.3
658 return (long long int) __builtin_amdgcn_s_memtime();
659#else
660 // Subject to change when better solution available
661 return (long long int) __builtin_readcyclecounter();
662#endif
663}
664
665__device__
666inline __attribute((always_inline))
667long long int __clock() { return __clock64(); }
668
669// Clock function to return wall clock count at a constant frequency that can be queried
670// through hipDeviceAttributeWallClockRate attribute.
671__device__
672inline __attribute__((always_inline))
673long long int wall_clock64() {
674 return (long long int) __ockl_steadyctr_u64();
675}
676
677__device__
678inline __attribute__((always_inline))
679long long int clock64() { return __clock64(); }
680
681__device__
682inline __attribute__((always_inline))
683long long int clock() { return __clock(); }
684
685// hip.amdgcn.bc - named sync
686__device__
687inline
688void __named_sync() { __builtin_amdgcn_s_barrier(); }
689
690#endif // __HIP_DEVICE_COMPILE__
691
692// hip.amdgcn.bc - lanemask
693__device__
694inline
695uint64_t __lanemask_gt()
696{
697 uint32_t lane = __ockl_lane_u32();
698 if (lane == 63)
699 return 0;
700 uint64_t ballot = __ballot64(1);
701 uint64_t mask = (~((uint64_t)0)) << (lane + 1);
702 return mask & ballot;
703}
704
705__device__
706inline
707uint64_t __lanemask_lt()
708{
709 uint32_t lane = __ockl_lane_u32();
710 int64_t ballot = __ballot64(1);
711 uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
712 return mask & ballot;
713}
714
715__device__
716inline
717uint64_t __lanemask_eq()
718{
719 uint32_t lane = __ockl_lane_u32();
720 int64_t mask = ((uint64_t)1 << lane);
721 return mask;
722}
723
724
725__device__ inline void* __local_to_generic(void* p) { return p; }
726
727#ifdef __HIP_DEVICE_COMPILE__
728__device__
729inline
730void* __get_dynamicgroupbaseptr()
731{
732 // Get group segment base pointer.
733 return (char*)__local_to_generic((void*)__to_local(__builtin_amdgcn_groupstaticsize()));
734}
735#else
736__device__
737void* __get_dynamicgroupbaseptr();
738#endif // __HIP_DEVICE_COMPILE__
739
740__device__
741inline
742void *__amdgcn_get_dynamicgroupbaseptr() {
743 return __get_dynamicgroupbaseptr();
744}
745
746// Memory Fence Functions
747__device__
748inline
749static void __threadfence()
750{
751 __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
752}
753
754__device__
755inline
756static void __threadfence_block()
757{
758 __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
759}
760
761__device__
762inline
763static void __threadfence_system()
764{
765 __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
766}
767__device__ inline static void __work_group_barrier(__cl_mem_fence_flags flags) {
768 if (flags) {
769 __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
770 __builtin_amdgcn_s_barrier();
771 __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
772 } else {
773 __builtin_amdgcn_s_barrier();
774 }
775}
776
777__device__
778inline
779static void __barrier(int n)
780{
781 __work_group_barrier((__cl_mem_fence_flags)n);
782}
783
784__device__
785inline
786__attribute__((convergent))
787void __syncthreads()
788{
789 __barrier(__CLK_LOCAL_MEM_FENCE);
790}
791
792__device__
793inline
794__attribute__((convergent))
795int __syncthreads_count(int predicate)
796{
797 return __ockl_wgred_add_i32(!!predicate);
798}
799
800__device__
801inline
802__attribute__((convergent))
803int __syncthreads_and(int predicate)
804{
805 return __ockl_wgred_and_i32(!!predicate);
806}
807
808__device__
809inline
810__attribute__((convergent))
811int __syncthreads_or(int predicate)
812{
813 return __ockl_wgred_or_i32(!!predicate);
814}
815
816// hip.amdgcn.bc - device routine
817/*
818 HW_ID Register bit structure for RDNA2 & RDNA3
819 WAVE_ID 4:0 Wave id within the SIMD.
820 SIMD_ID 9:8 SIMD_ID within the WGP: [0] = row, [1] = column.
821 WGP_ID 13:10 Physical WGP ID.
822 SA_ID 16 Shader Array ID
823 SE_ID 20:18 Shader Engine the wave is assigned to for gfx11
824 SE_ID 19:18 Shader Engine the wave is assigned to for gfx10
825 DP_RATE 31:29 Number of double-precision float units per SIMD
826
827 HW_ID Register bit structure for GCN and CDNA
828 WAVE_ID 3:0 Wave buffer slot number. 0-9.
829 SIMD_ID 5:4 SIMD which the wave is assigned to within the CU.
830 PIPE_ID 7:6 Pipeline from which the wave was dispatched.
831 CU_ID 11:8 Compute Unit the wave is assigned to.
832 SH_ID 12 Shader Array (within an SE) the wave is assigned to.
833 SE_ID 15:13 Shader Engine the wave is assigned to for gfx908, gfx90a
834 14:13 Shader Engine the wave is assigned to for gfx940-942
835 TG_ID 19:16 Thread-group ID
836 VM_ID 23:20 Virtual Memory ID
837 QUEUE_ID 26:24 Queue from which this wave was dispatched.
838 STATE_ID 29:27 State ID (graphics only, not compute).
839 ME_ID 31:30 Micro-engine ID.
840
841 XCC_ID Register bit structure for gfx940
842 XCC_ID 3:0 XCC the wave is assigned to.
843 */
844
845#if (defined (__GFX10__) || defined (__GFX11__))
846 #define HW_ID 23
847#else
848 #define HW_ID 4
849#endif
850
851#if (defined(__GFX10__) || defined(__GFX11__))
852 #define HW_ID_WGP_ID_SIZE 4
853 #define HW_ID_WGP_ID_OFFSET 10
854 #if (defined(__AMDGCN_CUMODE__))
855 #define HW_ID_CU_ID_SIZE 1
856 #define HW_ID_CU_ID_OFFSET 8
857 #endif
858#else
859 #define HW_ID_CU_ID_SIZE 4
860 #define HW_ID_CU_ID_OFFSET 8
861#endif
862
863#if (defined(__gfx908__) || defined(__gfx90a__) || \
864 defined(__GFX11__))
865 #define HW_ID_SE_ID_SIZE 3
866#else //4 SEs/XCC for gfx940-942
867 #define HW_ID_SE_ID_SIZE 2
868#endif
869#if (defined(__GFX10__) || defined(__GFX11__))
870 #define HW_ID_SE_ID_OFFSET 18
871 #define HW_ID_SA_ID_OFFSET 16
872 #define HW_ID_SA_ID_SIZE 1
873#else
874 #define HW_ID_SE_ID_OFFSET 13
875#endif
876
877#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
878 #define XCC_ID 20
879 #define XCC_ID_XCC_ID_SIZE 4
880 #define XCC_ID_XCC_ID_OFFSET 0
881#endif
882
883#if (!defined(__HIP_NO_IMAGE_SUPPORT) && \
884 (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)))
885 #define __HIP_NO_IMAGE_SUPPORT 1
886#endif
887
888/*
889 Encoding of parameter bitmask
890 HW_ID 5:0 HW_ID
891 OFFSET 10:6 Range: 0..31
892 SIZE 15:11 Range: 1..32
893 */
894
895#define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG))
896
897/*
898 __smid returns the wave's assigned Compute Unit and Shader Engine.
899 The Compute Unit, CU_ID returned in bits 3:0, and Shader Engine, SE_ID in bits 5:4.
900 Note: the results vary over time.
901 SZ minus 1 since SIZE is 1-based.
902*/
// Returns an identifier for the hardware unit executing the calling wave,
// packed from HW_ID register fields (and XCC_ID on gfx940-942). The packing
// differs per target; see the field tables above. Results can vary over time.
__device__
inline
unsigned __smid(void)
{
    // GETREG_IMMED's SIZE operand is 1-based, hence the "- 1" on every size.
    unsigned se_id = __builtin_amdgcn_s_getreg(
            GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));
    #if (defined(__GFX10__) || defined(__GFX11__))
    unsigned wgp_id = __builtin_amdgcn_s_getreg(
            GETREG_IMMED(HW_ID_WGP_ID_SIZE - 1, HW_ID_WGP_ID_OFFSET, HW_ID));
    unsigned sa_id = __builtin_amdgcn_s_getreg(
            GETREG_IMMED(HW_ID_SA_ID_SIZE - 1, HW_ID_SA_ID_OFFSET, HW_ID));
    #if (defined(__AMDGCN_CUMODE__))
    unsigned cu_id = __builtin_amdgcn_s_getreg(
            GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
    #endif
    #else
    #if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
    unsigned xcc_id = __builtin_amdgcn_s_getreg(
            GETREG_IMMED(XCC_ID_XCC_ID_SIZE - 1, XCC_ID_XCC_ID_OFFSET, XCC_ID));
    #endif
    unsigned cu_id = __builtin_amdgcn_s_getreg(
            GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
    #endif
    #if (defined(__GFX10__) || defined(__GFX11__))
    // gfx10/11: pack SE | SA | WGP (| CU when compiled in CU mode).
    unsigned temp = se_id;
    temp = (temp << HW_ID_SA_ID_SIZE) | sa_id;
    temp = (temp << HW_ID_WGP_ID_SIZE) | wgp_id;
    #if (defined(__AMDGCN_CUMODE__))
    temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
    #endif
    return temp;
    //TODO : CU Mode impl
    #elif (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
    // gfx940-942: pack XCC | SE | CU.
    unsigned temp = xcc_id;
    temp = (temp << HW_ID_SE_ID_SIZE) | se_id;
    temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
    return temp;
    #else
    // GCN/CDNA default: SE in the upper bits, CU in the lower bits.
    return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
    #endif
}
944
949#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
950#define HIP_DYNAMIC_SHARED_ATTRIBUTE
951
952#endif //defined(__clang__) && defined(__HIP__)
953
954
955// loop unrolling
956static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
957 auto dstPtr = static_cast<unsigned char*>(dst);
958 auto srcPtr = static_cast<const unsigned char*>(src);
959
960 while (size >= 4u) {
961 dstPtr[0] = srcPtr[0];
962 dstPtr[1] = srcPtr[1];
963 dstPtr[2] = srcPtr[2];
964 dstPtr[3] = srcPtr[3];
965
966 size -= 4u;
967 srcPtr += 4u;
968 dstPtr += 4u;
969 }
970 switch (size) {
971 case 3:
972 dstPtr[2] = srcPtr[2];
973 case 2:
974 dstPtr[1] = srcPtr[1];
975 case 1:
976 dstPtr[0] = srcPtr[0];
977 }
978
979 return dst;
980}
981
982static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
983 auto dstPtr = static_cast<unsigned char*>(dst);
984
985 while (size >= 4u) {
986 dstPtr[0] = val;
987 dstPtr[1] = val;
988 dstPtr[2] = val;
989 dstPtr[3] = val;
990
991 size -= 4u;
992 dstPtr += 4u;
993 }
994 switch (size) {
995 case 3:
996 dstPtr[2] = val;
997 case 2:
998 dstPtr[1] = val;
999 case 1:
1000 dstPtr[0] = val;
1001 }
1002
1003 return dst;
1004}
1005#ifndef __OPENMP_AMDGCN__
1006static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
1007 return __hip_hc_memcpy(dst, src, size);
1008}
1009
1010static inline __device__ void* memset(void* ptr, int val, size_t size) {
1011 unsigned char val8 = static_cast<unsigned char>(val);
1012 return __hip_hc_memset(ptr, val8, size);
1013}
1014#endif // !__OPENMP_AMDGCN__
1015
1016#endif
Contains declarations for types and functions in device library. Uses int64_t and uint64_t instead of...
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Definition hip_fp16_math_fwd.h:57
Definition amd_device_functions.h:234
Definition amd_device_functions.h:241
Definition amd_hip_vector_types.h:1672