#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H

#if !defined(__HIPCC_RTC__)
#include <hip/amd_detail/amd_hip_common.h>
#include <hip/amd_detail/hip_assert.h>
#include <hip/hip_runtime_api.h>
#include <hip/hip_vector_types.h>
#endif // !defined(__HIPCC_RTC__)
#if defined(__clang__) && defined(__HIP__)
extern "C" __device__ int printf(const char* fmt, ...);
#else
template <typename... All>
static inline __device__ void printf(const char* format, All... all) {}
#endif // defined(__clang__) && defined(__HIP__)
extern "C" __device__ unsigned long long __ockl_steadyctr_u64();
__device__
static inline unsigned int __popc(unsigned int input) {
    return __builtin_popcount(input);
}

__device__
static inline unsigned int __popcll(unsigned long long int input) {
    return __builtin_popcountll(input);
}

__device__
static inline int __clz(int input) {
    return __ockl_clz_u32((uint)input);
}

__device__
static inline int __clzll(long long int input) {
    return __ockl_clz_u64((uint64_t)input);
}

__device__
static inline int __ffs(unsigned int input) {
    return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
}

__device__
static inline int __ffsll(unsigned long long int input) {
    return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
}

__device__
static inline int __ffs(int input) {
    return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
}

__device__
static inline int __ffsll(long long int input) {
    return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
}
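// __fns32/__fns64 implement "find n-th set": starting from bit position
// 'base', return the position of the offset-th set bit of 'mask' (offset > 0
// searches upward, offset < 0 searches downward, offset == 0 tests the base
// bit itself), or -1 if no such bit exists.  For example,
// __fns32(0b10110100u, 0, 2) is 4: the second set bit at or above bit 0.
// The loops below narrow the candidate window by popcount, halving it on
// each iteration.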
__device__
static int32_t __fns64(uint64_t mask, uint32_t base, int32_t offset) {
    uint64_t temp_mask = mask;
    int32_t temp_offset = offset;
    if (offset == 0) {
        temp_mask &= (1ull << base);  // 64-bit shift: base may be up to 63
        temp_offset = 1;
    } else if (offset < 0) {
        temp_mask = __builtin_bitreverse64(mask);
        base = 63 - base;
        temp_offset = -offset;
    }
    temp_mask = temp_mask & ((~0ULL) << base);
    if (__builtin_popcountll(temp_mask) < temp_offset)
        return -1;
    int32_t total = 0;
    for (int i = 0x20; i > 0; i >>= 1) {
        uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
        int32_t pcnt = __builtin_popcountll(temp_mask_lo);
        if (pcnt < temp_offset) {
            temp_mask = temp_mask >> i;
            temp_offset -= pcnt;
            total += i;
        } else {
            temp_mask = temp_mask_lo;
        }
    }
    return offset < 0 ? 63 - total : total;
}
__device__
static int32_t __fns32(uint32_t mask, uint32_t base, int32_t offset) {
    uint32_t temp_mask = mask;
    int32_t temp_offset = offset;
    if (offset == 0) {
        temp_mask &= (1u << base);
        temp_offset = 1;
    } else if (offset < 0) {
        temp_mask = __builtin_bitreverse32(mask);
        base = 31 - base;
        temp_offset = -offset;
    }
    temp_mask = temp_mask & ((~0U) << base);
    if (__builtin_popcount(temp_mask) < temp_offset)
        return -1;
    int32_t total = 0;
    for (int i = 0x10; i > 0; i >>= 1) {
        uint32_t temp_mask_lo = temp_mask & ((1U << i) - 1);
        int32_t pcnt = __builtin_popcount(temp_mask_lo);
        if (pcnt < temp_offset) {
            temp_mask = temp_mask >> i;
            temp_offset -= pcnt;
            total += i;
        } else {
            temp_mask = temp_mask_lo;
        }
    }
    return offset < 0 ? 31 - total : total;
}
__device__
static int32_t __fns(unsigned int mask, unsigned int base, int offset) {
    return __fns32(mask, base, offset);
}
__device__
static inline unsigned int __brev(unsigned int input) {
    return __builtin_bitreverse32(input);
}

__device__
static inline unsigned long long int __brevll(unsigned long long int input) {
    return __builtin_bitreverse64(input);
}

__device__
static inline unsigned int __lastbit_u32_u64(uint64_t input) {
    return input == 0 ? -1 : __builtin_ctzl(input);
}
__device__
static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
    uint32_t offset = src1 & 31;
    uint32_t width = src2 & 31;
    return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
}

__device__
static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
    uint64_t offset = src1 & 63;
    uint64_t width = src2 & 63;
    return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
}

__device__
static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
    uint32_t offset = src2 & 31;
    uint32_t width = src3 & 31;
    uint32_t mask = (1 << width) - 1;
    return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
}

__device__
static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
    uint64_t offset = src2 & 63;
    uint64_t width = src3 & 63;
    uint64_t mask = (1ULL << width) - 1;
    return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
}
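// Worked example of the extract/insert pair:
//   __bitextract_u32(0xAABBCCDDu, 8, 8)      -> 0xCC        (bits [8,16))
//   __bitinsert_u32(0xAABBCCDDu, 0xEE, 8, 8) -> 0xAABBEEDD
// A width of 0 extracts 0 and leaves src0 unchanged on insert.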
__device__
inline unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift) {
    uint32_t mask_shift = shift & 31;
    return mask_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - mask_shift);
}

__device__
inline unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift) {
    uint32_t min_shift = shift >= 32 ? 32 : shift;
    return min_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - min_shift);
}

__device__
inline unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift) {
    return __builtin_amdgcn_alignbit(hi, lo, shift);
}

__device__
inline unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift) {
    return shift >= 32 ? hi : __builtin_amdgcn_alignbit(hi, lo, shift);
}
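// Funnel shifts operate on the 64-bit value hi:lo.  __funnelshift_l shifts it
// left by (shift & 31) and returns the upper 32 bits; __funnelshift_r shifts
// it right by (shift & 31) and returns the lower 32 bits.  The *_lc/*_rc
// variants clamp the shift amount to 32 instead of wrapping it.  The alignbit
// builtin used above computes ((uint64_t(hi) << 32 | lo) >> s) & 0xffffffff
// with s taken mod 32, which covers all four cases.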
__device__ static unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s);
__device__ static unsigned int __hadd(int x, int y);
__device__ static int __mul24(int x, int y);
__device__ static long long int __mul64hi(long long int x, long long int y);
__device__ static int __mulhi(int x, int y);
__device__ static int __rhadd(int x, int y);
__device__ static unsigned int __sad(int x, int y, unsigned int z);
__device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
__device__ static int __umul24(unsigned int x, unsigned int y);
__device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
__device__ static unsigned int __umulhi(unsigned int x, unsigned int y);
__device__ static unsigned int __urhadd(unsigned int x, unsigned int y);
__device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z);
struct ucharHolder { union { unsigned char c[4]; unsigned int ui; }; } __attribute__((aligned(4)));
struct uchar2Holder { union { unsigned int ui[2]; unsigned char c[8]; }; } __attribute__((aligned(8)));
__device__
static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
    struct uchar2Holder cHoldVal;
    struct ucharHolder cHoldKey;
    cHoldKey.ui = s;
    cHoldVal.ui[0] = x;
    cHoldVal.ui[1] = y;
    unsigned int result = cHoldVal.c[cHoldKey.c[0] & 0x07];
    result += (cHoldVal.c[(cHoldKey.c[0] & 0x70) >> 4] << 8);
    result += (cHoldVal.c[cHoldKey.c[1] & 0x07] << 16);
    result += (cHoldVal.c[(cHoldKey.c[1] & 0x70) >> 4] << 24);
    return result;
}
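// The low 16 bits of the selector pick the four result bytes, one hex digit
// per byte, indexing the 8-byte pool {x.b0..x.b3, y.b0..y.b3}:
//   __byte_perm(x, y, 0x3210) == x
//   __byte_perm(x, y, 0x7654) == y
//   __byte_perm(x, y, 0x5410) == (x & 0xFFFF) | (y << 16)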
__device__
static inline unsigned int __hadd(int x, int y) {
    // Signed average rounded toward negative infinity; the carry-free form
    // (x & y) + ((x ^ y) >> 1) avoids overflowing the intermediate sum.
    return (x & y) + ((x ^ y) >> 1);
}
__device__
static inline int __mul24(int x, int y) {
    return __ockl_mul24_i32(x, y);
}
__device__
static inline long long __mul64hi(long long int x, long long int y) {
    unsigned long long x0 = (unsigned long long)x & 0xffffffffUL;
    long long x1 = x >> 32;
    unsigned long long y0 = (unsigned long long)y & 0xffffffffUL;
    long long y1 = y >> 32;
    unsigned long long z0 = x0*y0;
    long long t = x1*y0 + (z0 >> 32);
    long long z1 = t & 0xffffffffL;
    long long z2 = t >> 32;
    z1 = x0*y1 + z1;
    return x1*y1 + z2 + (z1 >> 32);
}
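// __mul64hi forms the upper 64 bits of the full 128-bit product by splitting
// each operand into 32-bit halves (x = x1*2^32 + x0, y = y1*2^32 + y0) and
// propagating the carries of the partial products x0*y0, x1*y0 and x0*y1 into
// x1*y1.  __umul64hi further down is the unsigned variant of the same
// decomposition.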
__device__
static inline int __mulhi(int x, int y) {
    return __ockl_mul_hi_i32(x, y);
}
__device__
static inline int __rhadd(int x, int y) {
    // Signed average rounded toward positive infinity, again without
    // overflowing the intermediate sum.
    return (x | y) - ((x ^ y) >> 1);
}
__device__
static inline unsigned int __sad(int x, int y, unsigned int z) {
    return x > y ? x - y + z : y - x + z;
}
__device__
static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
    return (x + y) >> 1;
}
__device__
static inline int __umul24(unsigned int x, unsigned int y) {
    return __ockl_mul24_u32(x, y);
}
__device__
static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
    unsigned long long x0 = x & 0xffffffffUL;
    unsigned long long x1 = x >> 32;
    unsigned long long y0 = y & 0xffffffffUL;
    unsigned long long y1 = y >> 32;
    unsigned long long z0 = x0*y0;
    unsigned long long t = x1*y0 + (z0 >> 32);
    unsigned long long z1 = t & 0xffffffffUL;
    unsigned long long z2 = t >> 32;
    z1 = x0*y1 + z1;
    return x1*y1 + z2 + (z1 >> 32);
}
__device__
static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
    return __ockl_mul_hi_u32(x, y);
}

__device__
static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
    return (x + y + 1) >> 1;
}

__device__
static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
    return __ockl_sadd_u32(x, y, z);
}
__device__
static inline unsigned int __mbcnt_lo(unsigned int x, unsigned int y) {
    return __builtin_amdgcn_mbcnt_lo(x, y);
}

__device__
static inline unsigned int __mbcnt_hi(unsigned int x, unsigned int y) {
    return __builtin_amdgcn_mbcnt_hi(x, y);
}
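// __mbcnt_lo/__mbcnt_hi count the bits of a 64-bit mask that belong to lanes
// below the calling lane, accumulating into the second operand.  The usual
// idiom for a lane id within the wavefront is:
//   unsigned lane = __mbcnt_hi(~0u, __mbcnt_lo(~0u, 0u));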
#if !defined(__HIPCC_RTC__)
#include "amd_warp_functions.h"
#include "amd_warp_sync_functions.h"
#endif // !defined(__HIPCC_RTC__)
#define MASK1 0x00ff00ff
#define MASK2 0xff00ff00

__device__
static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 + one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 + one2) & MASK2);
    return out;
}

__device__
static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 - one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 - one2) & MASK2);
    return out;
}

__device__
static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 * one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 * one2) & MASK2);
    return out;
}
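// The MASK1/MASK2 trick processes the even bytes (0 and 2) and the odd bytes
// (1 and 3) of the packed word in two passes.  Within each pass the
// neighbouring byte lane is zero, so a carry or borrow out of one byte spills
// into a lane that is masked off afterwards instead of corrupting the
// adjacent result byte.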
__device__
static inline float __double2float_rd(double x) {
    return __ocml_cvtrtn_f32_f64(x);
}

__device__
static inline float __double2float_rn(double x) { return x; }

__device__
static inline float __double2float_ru(double x) {
    return __ocml_cvtrtp_f32_f64(x);
}

__device__
static inline float __double2float_rz(double x) {
    return __ocml_cvtrtz_f32_f64(x);
}
__device__
static inline int __double2hiint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    int tmp[2];
    __builtin_memcpy(tmp, &x, sizeof(tmp));

    return tmp[1];
}

__device__
static inline int __double2loint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    int tmp[2];
    __builtin_memcpy(tmp, &x, sizeof(tmp));

    return tmp[0];
}
__device__
static inline int __double2int_rd(double x) { return (int)__ocml_floor_f64(x); }

__device__
static inline int __double2int_rn(double x) { return (int)__ocml_rint_f64(x); }

__device__
static inline int __double2int_ru(double x) { return (int)__ocml_ceil_f64(x); }

__device__
static inline int __double2int_rz(double x) { return (int)x; }
__device__
static inline long long int __double2ll_rd(double x) {
    return (long long)__ocml_floor_f64(x);
}

__device__
static inline long long int __double2ll_rn(double x) {
    return (long long)__ocml_rint_f64(x);
}

__device__
static inline long long int __double2ll_ru(double x) {
    return (long long)__ocml_ceil_f64(x);
}

__device__
static inline long long int __double2ll_rz(double x) { return (long long)x; }
__device__
static inline unsigned int __double2uint_rd(double x) {
    return (unsigned int)__ocml_floor_f64(x);
}

__device__
static inline unsigned int __double2uint_rn(double x) {
    return (unsigned int)__ocml_rint_f64(x);
}

__device__
static inline unsigned int __double2uint_ru(double x) {
    return (unsigned int)__ocml_ceil_f64(x);
}

__device__
static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }
__device__
static inline unsigned long long int __double2ull_rd(double x) {
    return (unsigned long long int)__ocml_floor_f64(x);
}

__device__
static inline unsigned long long int __double2ull_rn(double x) {
    return (unsigned long long int)__ocml_rint_f64(x);
}

__device__
static inline unsigned long long int __double2ull_ru(double x) {
    return (unsigned long long int)__ocml_ceil_f64(x);
}

__device__
static inline unsigned long long int __double2ull_rz(double x) {
    return (unsigned long long int)x;
}
__device__
static inline long long int __double_as_longlong(double x) {
    static_assert(sizeof(long long) == sizeof(double), "");

    long long tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
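// The *_as_* and hi/lo conversions rely on __builtin_memcpy for the
// reinterpretation, which is the well-defined way to type-pun (equivalent in
// spirit to std::bit_cast); the hi/lo word indexing assumes the little-endian
// layout used by AMD GPUs.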
__device__
static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }

__device__
static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }

__device__
static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }

__device__
static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }
__device__
static inline long long int __float2ll_rd(float x) {
    return (long long int)__ocml_floor_f32(x);
}

__device__
static inline long long int __float2ll_rn(float x) {
    return (long long int)__ocml_rint_f32(x);
}

__device__
static inline long long int __float2ll_ru(float x) {
    return (long long int)__ocml_ceil_f32(x);
}

__device__
static inline long long int __float2ll_rz(float x) { return (long long int)x; }
__device__
static inline unsigned int __float2uint_rd(float x) {
    return (unsigned int)__ocml_floor_f32(x);
}

__device__
static inline unsigned int __float2uint_rn(float x) {
    return (unsigned int)__ocml_rint_f32(x);
}

__device__
static inline unsigned int __float2uint_ru(float x) {
    return (unsigned int)__ocml_ceil_f32(x);
}

__device__
static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }
__device__
static inline unsigned long long int __float2ull_rd(float x) {
    return (unsigned long long int)__ocml_floor_f32(x);
}

__device__
static inline unsigned long long int __float2ull_rn(float x) {
    return (unsigned long long int)__ocml_rint_f32(x);
}

__device__
static inline unsigned long long int __float2ull_ru(float x) {
    return (unsigned long long int)__ocml_ceil_f32(x);
}

__device__
static inline unsigned long long int __float2ull_rz(float x) {
    return (unsigned long long int)x;
}
__device__
static inline int __float_as_int(float x) {
    static_assert(sizeof(int) == sizeof(float), "");

    int tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

__device__
static inline unsigned int __float_as_uint(float x) {
    static_assert(sizeof(unsigned int) == sizeof(float), "");

    unsigned int tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
__device__
static inline double __hiloint2double(int hi, int lo) {
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    double tmp1;
    __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));

    return tmp1;
}
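// __hiloint2double is the inverse of the __double2hiint/__double2loint pair,
// so the round trip
//   __hiloint2double(__double2hiint(d), __double2loint(d))
// reproduces the original bit pattern of d.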
__device__
static inline double __int2double_rn(int x) { return (double)x; }

__device__
static inline float __int2float_rd(int x) {
    return __ocml_cvtrtn_f32_s32(x);
}

__device__
static inline float __int2float_rn(int x) { return (float)x; }

__device__
static inline float __int2float_ru(int x) {
    return __ocml_cvtrtp_f32_s32(x);
}

__device__
static inline float __int2float_rz(int x) {
    return __ocml_cvtrtz_f32_s32(x);
}
__device__
static inline float __int_as_float(int x) {
    static_assert(sizeof(float) == sizeof(int), "");

    float tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
__device__
static inline double __ll2double_rd(long long int x) {
    return __ocml_cvtrtn_f64_s64(x);
}

__device__
static inline double __ll2double_rn(long long int x) { return (double)x; }

__device__
static inline double __ll2double_ru(long long int x) {
    return __ocml_cvtrtp_f64_s64(x);
}

__device__
static inline double __ll2double_rz(long long int x) {
    return __ocml_cvtrtz_f64_s64(x);
}

__device__
static inline float __ll2float_rd(long long int x) {
    return __ocml_cvtrtn_f32_s64(x);
}

__device__
static inline float __ll2float_rn(long long int x) { return (float)x; }

__device__
static inline float __ll2float_ru(long long int x) {
    return __ocml_cvtrtp_f32_s64(x);
}

__device__
static inline float __ll2float_rz(long long int x) {
    return __ocml_cvtrtz_f32_s64(x);
}
__device__
static inline double __longlong_as_double(long long int x) {
    static_assert(sizeof(double) == sizeof(long long), "");

    double tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
__device__
static inline double __uint2double_rn(unsigned int x) { return (double)x; }

__device__
static inline float __uint2float_rd(unsigned int x) {
    return __ocml_cvtrtn_f32_u32(x);
}

__device__
static inline float __uint2float_rn(unsigned int x) { return (float)x; }

__device__
static inline float __uint2float_ru(unsigned int x) {
    return __ocml_cvtrtp_f32_u32(x);
}

__device__
static inline float __uint2float_rz(unsigned int x) {
    return __ocml_cvtrtz_f32_u32(x);
}
__device__
static inline float __uint_as_float(unsigned int x) {
    static_assert(sizeof(float) == sizeof(unsigned int), "");

    float tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
__device__
static inline double __ull2double_rd(unsigned long long int x) {
    return __ocml_cvtrtn_f64_u64(x);
}

__device__
static inline double __ull2double_rn(unsigned long long int x) { return (double)x; }

__device__
static inline double __ull2double_ru(unsigned long long int x) {
    return __ocml_cvtrtp_f64_u64(x);
}

__device__
static inline double __ull2double_rz(unsigned long long int x) {
    return __ocml_cvtrtz_f64_u64(x);
}

__device__
static inline float __ull2float_rd(unsigned long long int x) {
    return __ocml_cvtrtn_f32_u64(x);
}

__device__
static inline float __ull2float_rn(unsigned long long int x) { return (float)x; }

__device__
static inline float __ull2float_ru(unsigned long long int x) {
    return __ocml_cvtrtp_f32_u64(x);
}

__device__
static inline float __ull2float_rz(unsigned long long int x) {
    return __ocml_cvtrtz_f32_u64(x);
}
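// Naming convention for the conversions above: the suffix selects the
// rounding mode, _rd = toward negative infinity, _rn = to nearest even,
// _ru = toward positive infinity, _rz = toward zero, implemented with the
// __ocml_cvtrtn/__ocml_cvtrtp/__ocml_cvtrtz helpers, floor/rint/ceil/trunc,
// or plain casts as appropriate.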
#if defined(__clang__) && defined(__HIP__)

// Clock functions
__device__ long long int __clock64();
__device__ long long int __clock();
__device__ long long int clock64();
__device__ long long int clock();
__device__ long long int wall_clock64();
// Named sync
__device__ void __named_sync();

#ifdef __HIP_DEVICE_COMPILE__

__device__
inline __attribute((always_inline))
long long int __clock64() {
#if __has_builtin(__builtin_amdgcn_s_memtime)
  return (long long int) __builtin_amdgcn_s_memtime();
#else
  return (long long int) __builtin_readcyclecounter();
#endif
}

__device__
inline __attribute((always_inline))
long long int __clock() { return __clock64(); }

__device__
inline __attribute((always_inline))
long long int wall_clock64() {
  return (long long int) __ockl_steadyctr_u64();
}

__device__
inline __attribute((always_inline))
long long int clock64() { return __clock64(); }

__device__
inline __attribute((always_inline))
long long int clock() { return __clock(); }

__device__
inline
void __named_sync() { __builtin_amdgcn_s_barrier(); }

#endif // __HIP_DEVICE_COMPILE__
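// __clock64/clock64 count GPU clock cycles (s_memtime or the cycle counter),
// whose frequency varies with power management; wall_clock64 reads the
// fixed-frequency steady counter instead.  The steady counter's rate can be
// queried from the host, e.g. via hipDeviceGetAttribute with
// hipDeviceAttributeWallClockRate (reported in kHz).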
__device__
inline uint64_t __lanemask_gt() {
    uint32_t lane = __ockl_lane_u32();
    if (lane == 63)
        return 0;  // avoid shifting by 64 below
    uint64_t ballot = __ballot64(1);
    uint64_t mask = (~((uint64_t)0)) << (lane + 1);
    return mask & ballot;
}

__device__
inline uint64_t __lanemask_lt() {
    uint32_t lane = __ockl_lane_u32();
    uint64_t ballot = __ballot64(1);
    uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
    return mask & ballot;
}

__device__
inline uint64_t __lanemask_eq() {
    uint32_t lane = __ockl_lane_u32();
    uint64_t mask = ((uint64_t)1 << lane);
    return mask;
}
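// These mirror the CUDA %lanemask_* registers: __lanemask_eq has only the
// caller's bit set, while __lanemask_lt/__lanemask_gt cover the active lanes
// (per the __ballot64 of 1) below and above the calling lane.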
__device__
inline void* __local_to_generic(void* p) { return p; }

#ifdef __HIP_DEVICE_COMPILE__
__device__
inline void* __get_dynamicgroupbaseptr() {
    // Dynamic LDS starts right after the statically allocated group segment.
    return (char*)__local_to_generic((void*)__to_local(__builtin_amdgcn_groupstaticsize()));
}
#else
__device__
void* __get_dynamicgroupbaseptr();
#endif // __HIP_DEVICE_COMPILE__

__device__
inline void* __amdgcn_get_dynamicgroupbaseptr() {
    return __get_dynamicgroupbaseptr();
}
// Memory fences.  The scope string of __builtin_amdgcn_fence selects how far
// the ordering reaches: "workgroup" = block, "agent" = device, "" = system
// (device plus host-visible memory).
__device__
inline static void __threadfence() {
    __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
}

__device__
inline static void __threadfence_block() {
    __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
}

__device__
inline static void __threadfence_system() {
    __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
}
__device__
inline static void __work_group_barrier(__cl_mem_fence_flags flags) {
    if (flags) {
        __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
        __builtin_amdgcn_s_barrier();
        __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
    } else {
        __builtin_amdgcn_s_barrier();
    }
}

__device__
inline static void __barrier(int n) {
    __work_group_barrier((__cl_mem_fence_flags)n);
}

__device__
inline __attribute__((convergent))
void __syncthreads() {
    __barrier(__CLK_LOCAL_MEM_FENCE);
}

__device__
inline __attribute__((convergent))
int __syncthreads_count(int predicate) {
    return __ockl_wgred_add_i32(!!predicate);
}

__device__
inline __attribute__((convergent))
int __syncthreads_and(int predicate) {
    return __ockl_wgred_and_i32(!!predicate);
}

__device__
inline __attribute__((convergent))
int __syncthreads_or(int predicate) {
    return __ockl_wgred_or_i32(!!predicate);
}
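// __syncthreads_count/_and/_or combine the barrier with a workgroup-wide
// reduction of the predicate; !!predicate normalizes any nonzero value to 1
// before the __ockl_wgred_* helpers sum, and, or or it across the block.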
// s_getreg hardware register indices and bit fields used by __smid below.
#if (defined (__GFX10__) || defined (__GFX11__))
  #define HW_ID               23
#else
  #define HW_ID               4
#endif

#if (defined(__GFX10__) || defined(__GFX11__))
  #define HW_ID_WGP_ID_SIZE   4
  #define HW_ID_WGP_ID_OFFSET 10
  #if (defined(__AMDGCN_CUMODE__))
    #define HW_ID_CU_ID_SIZE   1
    #define HW_ID_CU_ID_OFFSET 8
  #endif
#else
  #define HW_ID_CU_ID_SIZE    4
  #define HW_ID_CU_ID_OFFSET  8
#endif

#if (defined(__gfx908__) || defined(__gfx90a__) || \
     defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
  #define HW_ID_SE_ID_SIZE    3
#else
  #define HW_ID_SE_ID_SIZE    2
#endif
#if (defined(__GFX10__) || defined(__GFX11__))
  #define HW_ID_SE_ID_OFFSET  18
  #define HW_ID_SA_ID_OFFSET  16
  #define HW_ID_SA_ID_SIZE    1
#else
  #define HW_ID_SE_ID_OFFSET  13
#endif

#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
  #define XCC_ID              20
  #define XCC_ID_XCC_ID_SIZE  4
  #define XCC_ID_XCC_ID_OFFSET 0
#endif

// gfx94x has no image instruction support.
#if (!defined(__HIP_NO_IMAGE_SUPPORT) && \
     (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)))
  #define __HIP_NO_IMAGE_SUPPORT 1
#endif

// Encodes the SIZE/OFFSET/REGISTER triple expected by s_getreg_b32; SIZE is
// passed biased by -1 because the field is 1-based.
#define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG))

// __smid returns a packed identifier of the shader engine and compute unit
// (or WGP) the calling wave is currently scheduled on.
__device__
static inline unsigned __smid(void) {
  unsigned se_id = __builtin_amdgcn_s_getreg(
      GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));
  #if (defined(__GFX10__) || defined(__GFX11__))
    unsigned wgp_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_WGP_ID_SIZE - 1, HW_ID_WGP_ID_OFFSET, HW_ID));
    unsigned sa_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_SA_ID_SIZE - 1, HW_ID_SA_ID_OFFSET, HW_ID));
    #if (defined(__AMDGCN_CUMODE__))
      unsigned cu_id = __builtin_amdgcn_s_getreg(
          GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
    #endif
  #else
    #if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
      unsigned xcc_id = __builtin_amdgcn_s_getreg(
          GETREG_IMMED(XCC_ID_XCC_ID_SIZE - 1, XCC_ID_XCC_ID_OFFSET, XCC_ID));
    #endif
    unsigned cu_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
  #endif

  #if (defined(__GFX10__) || defined(__GFX11__))
    unsigned temp = se_id;
    temp = (temp << HW_ID_SA_ID_SIZE) | sa_id;
    temp = (temp << HW_ID_WGP_ID_SIZE) | wgp_id;
    #if (defined(__AMDGCN_CUMODE__))
      temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
    #endif
    return temp;
  #elif (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
    unsigned temp = xcc_id;
    temp = (temp << HW_ID_SE_ID_SIZE) | se_id;
    temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
    return temp;
  #else
    return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
  #endif
}
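// The result of __smid is therefore a packed ID: on GFX10/GFX11 it is
// (SE, SA, WGP[, CU in CU mode]) concatenated field by field, on gfx94x it is
// (XCC, SE, CU), and on older targets simply (SE << 4) + CU.  It reflects
// where the wave currently runs and may differ between waves of one kernel.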
#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
#define HIP_DYNAMIC_SHARED_ATTRIBUTE
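// HIP_DYNAMIC_SHARED declares the dynamically sized LDS array whose byte size
// is supplied as the third launch parameter.  A minimal usage sketch (kernel
// and variable names are illustrative only):
//   __global__ void reverse(float* d) {
//     HIP_DYNAMIC_SHARED(float, smem)
//     smem[threadIdx.x] = d[threadIdx.x];
//     __syncthreads();
//     d[threadIdx.x] = smem[blockDim.x - 1 - threadIdx.x];
//   }
//   // launch: reverse<<<grid, block, block.x * sizeof(float)>>>(ptr);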
static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
    auto dstPtr = static_cast<unsigned char*>(dst);
    auto srcPtr = static_cast<const unsigned char*>(src);

    // Copy four bytes per iteration, then finish the 1-3 byte tail with a
    // fall-through switch.
    while (size >= 4u) {
        dstPtr[0] = srcPtr[0];
        dstPtr[1] = srcPtr[1];
        dstPtr[2] = srcPtr[2];
        dstPtr[3] = srcPtr[3];

        size -= 4u;
        srcPtr += 4u;
        dstPtr += 4u;
    }
    switch (size) {
        case 3:
            dstPtr[2] = srcPtr[2];
        case 2:
            dstPtr[1] = srcPtr[1];
        case 1:
            dstPtr[0] = srcPtr[0];
    }

    return dst;
}
static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
    auto dstPtr = static_cast<unsigned char*>(dst);
    // Same shape as __hip_hc_memcpy: unrolled 4-byte stores plus a tail switch.
    while (size >= 4u) {
        dstPtr[0] = val; dstPtr[1] = val; dstPtr[2] = val; dstPtr[3] = val;
        size -= 4u;
        dstPtr += 4u;
    }
    switch (size) {
        case 3: dstPtr[2] = val;
        case 2: dstPtr[1] = val;
        case 1: dstPtr[0] = val;
    }
    return dst;
}
#ifndef __OPENMP_AMDGCN__
static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
    return __hip_hc_memcpy(dst, src, size);
}

static inline __device__ void* memset(void* ptr, int val, size_t size) {
    unsigned char val8 = static_cast<unsigned char>(val);
    return __hip_hc_memset(ptr, val8, size);
}
#endif // !__OPENMP_AMDGCN__