diff --git a/sycl/include/sycl/builtins.hpp b/sycl/include/sycl/builtins.hpp index 42ed4b062e6b4..c9bb5b837361b 100644 --- a/sycl/include/sycl/builtins.hpp +++ b/sycl/include/sycl/builtins.hpp @@ -29,618 +29,6 @@ extern __DPCPP_SYCL_EXTERNAL_LIBC int strncmp(const char *s1, const char *s2, size_t n); extern __DPCPP_SYCL_EXTERNAL_LIBC int rand(); extern __DPCPP_SYCL_EXTERNAL_LIBC void srand(unsigned int seed); -extern __DPCPP_SYCL_EXTERNAL long long int __imf_llmax(long long int x, - long long int y); -extern __DPCPP_SYCL_EXTERNAL long long int __imf_llmin(long long int x, - long long int y); -extern __DPCPP_SYCL_EXTERNAL int __imf_max(int x, int y); -extern __DPCPP_SYCL_EXTERNAL int __imf_min(int x, int y); -extern __DPCPP_SYCL_EXTERNAL unsigned long long int -__imf_ullmax(unsigned long long int x, unsigned long long int y); -extern __DPCPP_SYCL_EXTERNAL unsigned long long int -__imf_ullmin(unsigned long long int x, unsigned long long int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_umax(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_umin(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_brev(unsigned int x); -extern __DPCPP_SYCL_EXTERNAL unsigned long long int -__imf_brevll(unsigned long long int x); -extern __DPCPP_SYCL_EXTERNAL unsigned int -__imf_byte_perm(unsigned int x, unsigned int y, unsigned int s); -extern __DPCPP_SYCL_EXTERNAL int __imf_ffs(int x); -extern __DPCPP_SYCL_EXTERNAL int __imf_ffsll(long long int x); -extern __DPCPP_SYCL_EXTERNAL int __imf_clz(int x); -extern __DPCPP_SYCL_EXTERNAL int __imf_clzll(long long int x); -extern __DPCPP_SYCL_EXTERNAL int __imf_popc(unsigned int x); -extern __DPCPP_SYCL_EXTERNAL int __imf_popcll(unsigned long long int x); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_sad(int x, int y, - unsigned int z); -extern __DPCPP_SYCL_EXTERNAL unsigned int -__imf_usad(unsigned int x, unsigned int y, unsigned int z); -extern __DPCPP_SYCL_EXTERNAL int __imf_rhadd(int x, int y); -extern __DPCPP_SYCL_EXTERNAL int __imf_hadd(int x, int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_urhadd(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_uhadd(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL int __imf_mul24(int x, int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_umul24(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL int __imf_mulhi(int x, int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_umulhi(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL long long int __imf_mul64hi(long long int x, - long long int y); -extern __DPCPP_SYCL_EXTERNAL unsigned long long int -__imf_umul64hi(unsigned long long int x, unsigned long long int y); -extern __DPCPP_SYCL_EXTERNAL int __imf_abs(int x); -extern __DPCPP_SYCL_EXTERNAL long long int __imf_llabs(long long int x); -extern __DPCPP_SYCL_EXTERNAL float __imf_saturatef(float x); -extern __DPCPP_SYCL_EXTERNAL float __imf_fmaf(float x, float y, float z); -extern __DPCPP_SYCL_EXTERNAL float __imf_fabsf(float x); -extern __DPCPP_SYCL_EXTERNAL float __imf_floorf(float x); -extern __DPCPP_SYCL_EXTERNAL float __imf_ceilf(float x); -extern __DPCPP_SYCL_EXTERNAL float __imf_truncf(float x); -extern __DPCPP_SYCL_EXTERNAL float __imf_rintf(float x); -extern __DPCPP_SYCL_EXTERNAL float __imf_nearbyintf(float x); -extern __DPCPP_SYCL_EXTERNAL float __imf_sqrtf(float x); -extern __DPCPP_SYCL_EXTERNAL float __imf_rsqrtf(float x); -extern __DPCPP_SYCL_EXTERNAL float __imf_invf(float x); -extern __DPCPP_SYCL_EXTERNAL float __imf_fmaxf(float x, float y); -extern __DPCPP_SYCL_EXTERNAL float __imf_fminf(float x, float y); -extern __DPCPP_SYCL_EXTERNAL float __imf_copysignf(float x, float y); -extern __DPCPP_SYCL_EXTERNAL float __imf_fast_exp10f(float x); -extern __DPCPP_SYCL_EXTERNAL float __imf_fast_expf(float x); -extern __DPCPP_SYCL_EXTERNAL float __imf_fast_logf(float x); -extern __DPCPP_SYCL_EXTERNAL float __imf_fast_log2f(float x); -extern __DPCPP_SYCL_EXTERNAL float __imf_fast_log10f(float x); -extern __DPCPP_SYCL_EXTERNAL float __imf_fast_powf(float x, float y); -extern __DPCPP_SYCL_EXTERNAL float __imf_fast_fdividef(float x, float y); -extern __DPCPP_SYCL_EXTERNAL float __imf_fadd_rd(float x, float y); -extern __DPCPP_SYCL_EXTERNAL float __imf_fadd_rn(float x, float y); -extern __DPCPP_SYCL_EXTERNAL float __imf_fadd_ru(float x, float y); -extern __DPCPP_SYCL_EXTERNAL float __imf_fadd_rz(float x, float y); -extern __DPCPP_SYCL_EXTERNAL float __imf_fsub_rd(float x, float y); -extern __DPCPP_SYCL_EXTERNAL float __imf_fsub_rn(float x, float y); -extern __DPCPP_SYCL_EXTERNAL float __imf_fsub_ru(float x, float y); -extern __DPCPP_SYCL_EXTERNAL float __imf_fsub_rz(float x, float y); -extern __DPCPP_SYCL_EXTERNAL float __imf_fmul_rd(float x, float y); -extern __DPCPP_SYCL_EXTERNAL float __imf_fmul_rn(float x, float y); -extern __DPCPP_SYCL_EXTERNAL float __imf_fmul_ru(float x, float y); -extern __DPCPP_SYCL_EXTERNAL float __imf_fmul_rz(float x, float y); -extern __DPCPP_SYCL_EXTERNAL float __imf_fdiv_rd(float x, float y); -extern __DPCPP_SYCL_EXTERNAL float __imf_fdiv_rn(float x, float y); -extern __DPCPP_SYCL_EXTERNAL float __imf_fdiv_ru(float x, float y); -extern __DPCPP_SYCL_EXTERNAL float __imf_fdiv_rz(float x, float y); -extern __DPCPP_SYCL_EXTERNAL float __imf_frcp_rd(float x); -extern __DPCPP_SYCL_EXTERNAL float __imf_frcp_rn(float x); -extern __DPCPP_SYCL_EXTERNAL float __imf_frcp_ru(float x); -extern __DPCPP_SYCL_EXTERNAL float __imf_frcp_rz(float x); -extern __DPCPP_SYCL_EXTERNAL float __imf_fsigmf(float x); -extern __DPCPP_SYCL_EXTERNAL float __imf_fmaf_rd(float x, float y, float z); -extern __DPCPP_SYCL_EXTERNAL float __imf_fmaf_rn(float x, float y, float z); -extern __DPCPP_SYCL_EXTERNAL float __imf_fmaf_ru(float x, float y, float z); -extern __DPCPP_SYCL_EXTERNAL float __imf_fmaf_rz(float x, float y, float z); -extern __DPCPP_SYCL_EXTERNAL float __imf_sqrtf_rd(float x); -extern __DPCPP_SYCL_EXTERNAL float __imf_sqrtf_rn(float x); -extern __DPCPP_SYCL_EXTERNAL float __imf_sqrtf_ru(float x); -extern __DPCPP_SYCL_EXTERNAL float __imf_sqrtf_rz(float x); -extern __DPCPP_SYCL_EXTERNAL int __imf_float2int_rd(float x); -extern __DPCPP_SYCL_EXTERNAL int __imf_float2int_rn(float x); -extern __DPCPP_SYCL_EXTERNAL int __imf_float2int_ru(float x); -extern __DPCPP_SYCL_EXTERNAL int __imf_float2int_rz(float x); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_float2uint_rd(float x); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_float2uint_rn(float x); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_float2uint_ru(float x); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_float2uint_rz(float x); -extern __DPCPP_SYCL_EXTERNAL long long int __imf_float2ll_rd(float x); -extern __DPCPP_SYCL_EXTERNAL long long int __imf_float2ll_rn(float x); -extern __DPCPP_SYCL_EXTERNAL long long int __imf_float2ll_ru(float x); -extern __DPCPP_SYCL_EXTERNAL long long int __imf_float2ll_rz(float x); -extern __DPCPP_SYCL_EXTERNAL unsigned long long int __imf_float2ull_rd(float x); -extern __DPCPP_SYCL_EXTERNAL unsigned long long int __imf_float2ull_rn(float x); -extern __DPCPP_SYCL_EXTERNAL unsigned long long int __imf_float2ull_ru(float x); -extern __DPCPP_SYCL_EXTERNAL unsigned long long int __imf_float2ull_rz(float x); -extern __DPCPP_SYCL_EXTERNAL int __imf_float_as_int(float x); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_float_as_uint(float x); -extern __DPCPP_SYCL_EXTERNAL float __imf_int2float_rd(int x); -extern __DPCPP_SYCL_EXTERNAL float __imf_int2float_rn(int x); -extern __DPCPP_SYCL_EXTERNAL float __imf_int2float_ru(int x); -extern __DPCPP_SYCL_EXTERNAL float __imf_int2float_rz(int x); -extern __DPCPP_SYCL_EXTERNAL float __imf_int_as_float(int x); -extern __DPCPP_SYCL_EXTERNAL float __imf_ll2float_rd(long long int x); -extern __DPCPP_SYCL_EXTERNAL float __imf_ll2float_rn(long long int x); -extern __DPCPP_SYCL_EXTERNAL float __imf_ll2float_ru(long long int x); -extern __DPCPP_SYCL_EXTERNAL float __imf_ll2float_rz(long long int x); -extern __DPCPP_SYCL_EXTERNAL float __imf_uint2float_rd(unsigned int x); -extern __DPCPP_SYCL_EXTERNAL float __imf_uint2float_rn(unsigned int x); -extern __DPCPP_SYCL_EXTERNAL float __imf_uint2float_ru(unsigned int x); -extern __DPCPP_SYCL_EXTERNAL float __imf_uint2float_rz(unsigned int x); -extern __DPCPP_SYCL_EXTERNAL float __imf_uint_as_float(unsigned int x); -extern __DPCPP_SYCL_EXTERNAL float __imf_ull2float_rd(unsigned long long int x); -extern __DPCPP_SYCL_EXTERNAL float __imf_ull2float_rn(unsigned long long int x); -extern __DPCPP_SYCL_EXTERNAL float __imf_ull2float_ru(unsigned long long int x); -extern __DPCPP_SYCL_EXTERNAL float __imf_ull2float_rz(unsigned long long int x); -extern __DPCPP_SYCL_EXTERNAL float __imf_half2float(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_float2half_rd(float x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_float2half_rn(float x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_float2half_ru(float x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_float2half_rz(float x); -extern __DPCPP_SYCL_EXTERNAL int __imf_half2int_rd(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL int __imf_half2int_rn(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL int __imf_half2int_ru(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL int __imf_half2int_rz(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL long long __imf_half2ll_rd(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL long long __imf_half2ll_rn(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL long long __imf_half2ll_ru(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL long long __imf_half2ll_rz(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL short __imf_half2short_rd(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL short __imf_half2short_rn(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL short __imf_half2short_ru(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL short __imf_half2short_rz(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_half2uint_rd(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_half2uint_rn(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_half2uint_ru(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_half2uint_rz(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL unsigned long long __imf_half2ull_rd(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL unsigned long long __imf_half2ull_rn(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL unsigned long long __imf_half2ull_ru(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL unsigned long long __imf_half2ull_rz(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL unsigned short __imf_half2ushort_rd(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL unsigned short __imf_half2ushort_rn(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL unsigned short __imf_half2ushort_ru(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL unsigned short __imf_half2ushort_rz(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL short __imf_half_as_short(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL unsigned short __imf_half_as_ushort(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_int2half_rd(int x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_int2half_rn(int x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_int2half_ru(int x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_int2half_rz(int x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_ll2half_rd(long long x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_ll2half_rn(long long x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_ll2half_ru(long long x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_ll2half_rz(long long x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_short2half_rd(short x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_short2half_rn(short x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_short2half_ru(short x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_short2half_rz(short x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_short_as_half(short x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_uint2half_rd(unsigned int x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_uint2half_rn(unsigned int x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_uint2half_ru(unsigned int x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_uint2half_rz(unsigned int x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_ull2half_rd(unsigned long long x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_ull2half_rn(unsigned long long x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_ull2half_ru(unsigned long long x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_ull2half_rz(unsigned long long x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_ushort2half_rd(unsigned short x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_ushort2half_rn(unsigned short x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_ushort2half_ru(unsigned short x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_ushort2half_rz(unsigned short x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_ushort_as_half(unsigned short x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_double2half(double x); - -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_fmaf16(_Float16 x, _Float16 y, - _Float16 z); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_fabsf16(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_floorf16(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_ceilf16(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_truncf16(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_rintf16(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_nearbyintf16(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_sqrtf16(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_rsqrtf16(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_invf16(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_fmaxf16(_Float16 x, _Float16 y); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_fminf16(_Float16 x, _Float16 y); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_fsigmf16(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL _Float16 __imf_copysignf16(_Float16 x, _Float16 y); -extern __DPCPP_SYCL_EXTERNAL float __imf_half2float(_Float16 x); -extern __DPCPP_SYCL_EXTERNAL float __imf_bfloat162float(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_bfloat162uint_rd(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_bfloat162uint_rn(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_bfloat162uint_ru(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_bfloat162uint_rz(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL unsigned short -__imf_bfloat162ushort_rd(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL unsigned short -__imf_bfloat162ushort_rn(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL unsigned short -__imf_bfloat162ushort_ru(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL unsigned short -__imf_bfloat162ushort_rz(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL unsigned long long -__imf_bfloat162ull_rd(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL unsigned long long -__imf_bfloat162ull_rn(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL unsigned long long -__imf_bfloat162ull_ru(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL unsigned long long -__imf_bfloat162ull_rz(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL int __imf_bfloat162int_rd(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL int __imf_bfloat162int_rn(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL int __imf_bfloat162int_ru(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL int __imf_bfloat162int_rz(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL short __imf_bfloat162short_rd(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL short __imf_bfloat162short_rn(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL short __imf_bfloat162short_ru(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL short __imf_bfloat162short_rz(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL long long __imf_bfloat162ll_rd(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL long long __imf_bfloat162ll_rn(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL long long __imf_bfloat162ll_ru(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL long long __imf_bfloat162ll_rz(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_float2bfloat16(float x); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_float2bfloat16_rd(float x); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_float2bfloat16_rn(float x); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_float2bfloat16_ru(float x); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_float2bfloat16_rz(float x); -extern __DPCPP_SYCL_EXTERNAL uint16_t -__imf_ushort2bfloat16_rd(unsigned short x); -extern __DPCPP_SYCL_EXTERNAL uint16_t -__imf_ushort2bfloat16_rn(unsigned short x); -extern __DPCPP_SYCL_EXTERNAL uint16_t -__imf_ushort2bfloat16_ru(unsigned short x); -extern __DPCPP_SYCL_EXTERNAL uint16_t -__imf_ushort2bfloat16_rz(unsigned short x); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_uint2bfloat16_rd(unsigned int x); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_uint2bfloat16_rn(unsigned int x); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_uint2bfloat16_ru(unsigned int x); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_uint2bfloat16_rz(unsigned int x); -extern __DPCPP_SYCL_EXTERNAL uint16_t -__imf_ull2bfloat16_rd(unsigned long long x); -extern __DPCPP_SYCL_EXTERNAL uint16_t -__imf_ull2bfloat16_rn(unsigned long long x); -extern __DPCPP_SYCL_EXTERNAL uint16_t -__imf_ull2bfloat16_ru(unsigned long long x); -extern __DPCPP_SYCL_EXTERNAL uint16_t -__imf_ull2bfloat16_rz(unsigned long long x); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_short2bfloat16_rd(short x); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_short2bfloat16_rn(short x); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_short2bfloat16_ru(short x); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_short2bfloat16_rz(short x); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_int2bfloat16_rd(int x); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_int2bfloat16_rn(int x); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_int2bfloat16_ru(int x); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_int2bfloat16_rz(int x); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_ll2bfloat16_rd(long long x); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_ll2bfloat16_rn(long long x); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_ll2bfloat16_ru(long long x); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_ll2bfloat16_rz(long long x); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_double2bfloat16(double x); -extern __DPCPP_SYCL_EXTERNAL short __imf_bfloat16_as_short(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL unsigned short -__imf_bfloat16_as_ushort(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_short_as_bfloat16(short x); -extern __DPCPP_SYCL_EXTERNAL uint16_t -__imf_ushort_as_bfloat16(unsigned short x); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_fmabf16(uint16_t x, uint16_t y, - uint16_t z); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_fmaxbf16(uint16_t x, uint16_t y); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_fminbf16(uint16_t x, uint16_t y); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_fabsbf16(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_rintbf16(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_floorbf16(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_ceilbf16(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_truncbf16(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_copysignbf16(uint16_t x, - uint16_t y); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_fsigmbf16(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_sqrtbf16(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_rsqrtbf16(uint16_t x); -extern __DPCPP_SYCL_EXTERNAL double __imf_fma(double x, double y, double z); -extern __DPCPP_SYCL_EXTERNAL double __imf_fma_rd(double x, double y, double z); -extern __DPCPP_SYCL_EXTERNAL double __imf_fma_rn(double x, double y, double z); -extern __DPCPP_SYCL_EXTERNAL double __imf_fma_ru(double x, double y, double z); -extern __DPCPP_SYCL_EXTERNAL double __imf_fma_rz(double x, double y, double z); -extern __DPCPP_SYCL_EXTERNAL double __imf_fabs(double x); -extern __DPCPP_SYCL_EXTERNAL double __imf_floor(double x); -extern __DPCPP_SYCL_EXTERNAL double __imf_ceil(double x); -extern __DPCPP_SYCL_EXTERNAL double __imf_trunc(double x); -extern __DPCPP_SYCL_EXTERNAL double __imf_rint(double x); -extern __DPCPP_SYCL_EXTERNAL double __imf_rcp64h(double x); -extern __DPCPP_SYCL_EXTERNAL double __imf_nearbyint(double x); -extern __DPCPP_SYCL_EXTERNAL double __imf_sqrt(double x); -extern __DPCPP_SYCL_EXTERNAL double __imf_rsqrt(double x); -extern __DPCPP_SYCL_EXTERNAL double __imf_inv(double x); -extern __DPCPP_SYCL_EXTERNAL double __imf_fmax(double x, double y); -extern __DPCPP_SYCL_EXTERNAL double __imf_fmin(double x, double y); -extern __DPCPP_SYCL_EXTERNAL double __imf_copysign(double x, double y); -extern __DPCPP_SYCL_EXTERNAL double __imf_dadd_rd(double x, double y); -extern __DPCPP_SYCL_EXTERNAL double __imf_dadd_rn(double x, double y); -extern __DPCPP_SYCL_EXTERNAL double __imf_dadd_ru(double x, double y); -extern __DPCPP_SYCL_EXTERNAL double __imf_dadd_rz(double x, double y); -extern __DPCPP_SYCL_EXTERNAL double __imf_dsub_rd(double x, double y); -extern __DPCPP_SYCL_EXTERNAL double __imf_dsub_rn(double x, double y); -extern __DPCPP_SYCL_EXTERNAL double __imf_dsub_ru(double x, double y); -extern __DPCPP_SYCL_EXTERNAL double __imf_dsub_rz(double x, double y); -extern __DPCPP_SYCL_EXTERNAL double __imf_dmul_rd(double x, double y); -extern __DPCPP_SYCL_EXTERNAL double __imf_dmul_rn(double x, double y); -extern __DPCPP_SYCL_EXTERNAL double __imf_dmul_ru(double x, double y); -extern __DPCPP_SYCL_EXTERNAL double __imf_dmul_rz(double x, double y); -extern __DPCPP_SYCL_EXTERNAL double __imf_ddiv_rd(double x, double y); -extern __DPCPP_SYCL_EXTERNAL double __imf_ddiv_rn(double x, double y); -extern __DPCPP_SYCL_EXTERNAL double __imf_ddiv_ru(double x, double y); -extern __DPCPP_SYCL_EXTERNAL double __imf_ddiv_rz(double x, double y); -extern __DPCPP_SYCL_EXTERNAL double __imf_drcp_rd(double x); -extern __DPCPP_SYCL_EXTERNAL double __imf_drcp_rn(double x); -extern __DPCPP_SYCL_EXTERNAL double __imf_drcp_ru(double x); -extern __DPCPP_SYCL_EXTERNAL double __imf_drcp_rz(double x); -extern __DPCPP_SYCL_EXTERNAL double __imf_sqrt_rd(double x); -extern __DPCPP_SYCL_EXTERNAL double __imf_sqrt_rn(double x); -extern __DPCPP_SYCL_EXTERNAL double __imf_sqrt_ru(double x); -extern __DPCPP_SYCL_EXTERNAL double __imf_sqrt_rz(double x); -extern __DPCPP_SYCL_EXTERNAL float __imf_double2float_rd(double x); -extern __DPCPP_SYCL_EXTERNAL float __imf_double2float_rn(double x); -extern __DPCPP_SYCL_EXTERNAL float __imf_double2float_ru(double x); -extern __DPCPP_SYCL_EXTERNAL float __imf_double2float_rz(double x); -extern __DPCPP_SYCL_EXTERNAL int __imf_double2hiint(double x); -extern __DPCPP_SYCL_EXTERNAL int __imf_double2loint(double x); -extern __DPCPP_SYCL_EXTERNAL int __imf_double2int_rd(double x); -extern __DPCPP_SYCL_EXTERNAL int __imf_double2int_rn(double x); -extern __DPCPP_SYCL_EXTERNAL int __imf_double2int_ru(double x); -extern __DPCPP_SYCL_EXTERNAL int __imf_double2int_rz(double x); -extern __DPCPP_SYCL_EXTERNAL double __imf_int2double_rn(int x); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_double2uint_rd(double x); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_double2uint_rn(double x); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_double2uint_ru(double x); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_double2uint_rz(double x); -extern __DPCPP_SYCL_EXTERNAL long long int __imf_double2ll_rd(double x); -extern __DPCPP_SYCL_EXTERNAL long long int __imf_double2ll_rn(double x); -extern __DPCPP_SYCL_EXTERNAL long long int __imf_double2ll_ru(double x); -extern __DPCPP_SYCL_EXTERNAL long long int __imf_double2ll_rz(double x); -extern __DPCPP_SYCL_EXTERNAL double __imf_ll2double_rd(long long int x); -extern __DPCPP_SYCL_EXTERNAL double __imf_ll2double_rn(long long int x); -extern __DPCPP_SYCL_EXTERNAL double __imf_ll2double_ru(long long int x); -extern __DPCPP_SYCL_EXTERNAL double __imf_ll2double_rz(long long int x); -extern __DPCPP_SYCL_EXTERNAL double -__imf_ull2double_rd(unsigned long long int x); -extern __DPCPP_SYCL_EXTERNAL double -__imf_ull2double_rn(unsigned long long int x); -extern __DPCPP_SYCL_EXTERNAL double -__imf_ull2double_ru(unsigned long long int x); -extern __DPCPP_SYCL_EXTERNAL double -__imf_ull2double_rz(unsigned long long int x); -extern __DPCPP_SYCL_EXTERNAL unsigned long long int -__imf_double2ull_rd(double x); -extern __DPCPP_SYCL_EXTERNAL unsigned long long int -__imf_double2ull_rn(double x); -extern __DPCPP_SYCL_EXTERNAL unsigned long long int -__imf_double2ull_ru(double x); -extern __DPCPP_SYCL_EXTERNAL unsigned long long int -__imf_double2ull_rz(double x); -extern __DPCPP_SYCL_EXTERNAL long long int __imf_double_as_longlong(double x); -extern __DPCPP_SYCL_EXTERNAL double __imf_longlong_as_double(long long int x); -extern __DPCPP_SYCL_EXTERNAL double __imf_uint2double_rd(unsigned int x); -extern __DPCPP_SYCL_EXTERNAL double __imf_uint2double_rn(unsigned int x); -extern __DPCPP_SYCL_EXTERNAL double __imf_uint2double_ru(unsigned int x); -extern __DPCPP_SYCL_EXTERNAL double __imf_uint2double_rz(unsigned int x); -extern __DPCPP_SYCL_EXTERNAL double __imf_hiloint2double(int hi, int lo); - -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vabs2(unsigned int x); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vabs4(unsigned int x); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vabsss2(unsigned int x); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vabsss4(unsigned int x); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vneg2(unsigned int x); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vneg4(unsigned int x); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vnegss2(unsigned int x); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vnegss4(unsigned int x); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vabsdiffs2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vabsdiffs4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vabsdiffu2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vabsdiffu4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vadd2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vadd4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vaddss2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vaddss4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vaddus2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vaddus4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vsub2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vsub4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vsubss2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vsubss4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vsubus2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vsubus4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vavgs2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vavgs4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vavgu2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vavgu4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vhaddu2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vhaddu4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmpeq2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmpeq4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmpne2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmpne4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmpges2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmpges4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmpgeu2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmpgeu4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmpgts2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmpgts4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmpgtu2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmpgtu4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmples2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmples4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmpleu2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmpleu4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmplts2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmplts4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmpltu2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmpltu4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vmaxs2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vmaxs4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vmaxu2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vmaxu4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vmins2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vmins4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vminu2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vminu4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vseteq2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vseteq4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetne2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetne4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetges2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetges4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetgeu2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetgeu4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetgts2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetgts4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetgtu2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetgtu4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetles2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetles4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetleu2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetleu4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetlts2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetlts4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetltu2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetltu4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vsads2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vsads4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vsadu2(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vsadu4(unsigned int x, - unsigned int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int -__imf_viaddmax_s16x2(unsigned int x, unsigned int y, unsigned int z); -extern __DPCPP_SYCL_EXTERNAL unsigned int -__imf_viaddmax_s16x2_relu(unsigned int x, unsigned int y, unsigned int z); -extern __DPCPP_SYCL_EXTERNAL int __imf_viaddmax_s32(int x, int y, int z); -extern __DPCPP_SYCL_EXTERNAL int __imf_viaddmax_s32_relu(int x, int y, int z); -extern __DPCPP_SYCL_EXTERNAL unsigned int -__imf_viaddmax_u16x2(unsigned int x, unsigned int y, unsigned int z); -extern __DPCPP_SYCL_EXTERNAL unsigned int -__imf_viaddmax_u32(unsigned int x, unsigned int y, unsigned int z); -extern __DPCPP_SYCL_EXTERNAL unsigned int -__imf_viaddmin_s16x2(unsigned int x, unsigned int y, unsigned int z); -extern __DPCPP_SYCL_EXTERNAL unsigned int -__imf_viaddmin_s16x2_relu(unsigned int x, unsigned int y, unsigned int z); -extern __DPCPP_SYCL_EXTERNAL int __imf_viaddmin_s32(int x, int y, int z); -extern __DPCPP_SYCL_EXTERNAL int __imf_viaddmin_s32_relu(int x, int y, int z); -extern __DPCPP_SYCL_EXTERNAL unsigned int -__imf_viaddmin_u16x2(unsigned int x, unsigned int y, unsigned int z); -extern __DPCPP_SYCL_EXTERNAL unsigned int -__imf_viaddmin_u32(unsigned int x, unsigned int y, unsigned int z); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vibmax_s16x2(unsigned int x, - unsigned int y, - bool *pred_hi, - bool *pred_lo); -extern __DPCPP_SYCL_EXTERNAL int __imf_vibmax_s32(int x, int y, bool *pred); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vibmax_u16x2(unsigned int x, - unsigned int y, - bool *pred_hi, - bool *pred_lo); -extern __DPCPP_SYCL_EXTERNAL unsigned int -__imf_vibmax_u32(unsigned int x, unsigned int y, bool *pred); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vibmin_s16x2(unsigned int x, - unsigned int y, - bool *pred_hi, - bool *pred_lo); -extern __DPCPP_SYCL_EXTERNAL int __imf_vibmin_s32(int x, int y, bool *pred); -extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_vibmin_u16x2(unsigned int x, - unsigned int y, - bool *pred_hi, - bool *pred_lo); -extern __DPCPP_SYCL_EXTERNAL unsigned int -__imf_vibmin_u32(unsigned int x, unsigned int y, bool *pred); -extern __DPCPP_SYCL_EXTERNAL unsigned int -__imf_vimax3_s16x2(unsigned int x, unsigned int y, unsigned int z); -extern __DPCPP_SYCL_EXTERNAL unsigned int -__imf_vimax3_s16x2_relu(unsigned int x, unsigned int y, unsigned int z); -extern __DPCPP_SYCL_EXTERNAL unsigned int -__imf_vimin3_s16x2(unsigned int x, unsigned int y, unsigned int z); -extern __DPCPP_SYCL_EXTERNAL unsigned int -__imf_vimin3_s16x2_relu(unsigned int x, unsigned int y, unsigned int z); -extern __DPCPP_SYCL_EXTERNAL int __imf_vimax3_s32(int x, int y, int z); -extern __DPCPP_SYCL_EXTERNAL int __imf_vimax3_s32_relu(int x, int y, int z); -extern __DPCPP_SYCL_EXTERNAL int __imf_vimin3_s32(int x, int y, int z); -extern __DPCPP_SYCL_EXTERNAL int __imf_vimin3_s32_relu(int x, int y, int z); -extern __DPCPP_SYCL_EXTERNAL unsigned int -__imf_vimax3_u16x2(unsigned int x, unsigned int y, unsigned int z); -extern __DPCPP_SYCL_EXTERNAL unsigned int -__imf_vimax3_u32(unsigned int x, unsigned int y, unsigned int z); -extern __DPCPP_SYCL_EXTERNAL unsigned int -__imf_vimin3_u16x2(unsigned int x, unsigned int y, unsigned int z); -extern __DPCPP_SYCL_EXTERNAL unsigned int -__imf_vimin3_u32(unsigned int x, unsigned int y, unsigned int z); -extern __DPCPP_SYCL_EXTERNAL unsigned int -__imf_vimax_s16x2_relu(unsigned int x, unsigned int y); -extern __DPCPP_SYCL_EXTERNAL int __imf_vimax_s32_relu(int x, int y); -extern __DPCPP_SYCL_EXTERNAL unsigned int -__imf_vimin_s16x2_relu(unsigned int x, unsigned int y); -extern __DPCPP_SYCL_EXTERNAL int __imf_vimin_s32_relu(int x, int y); } #ifdef __GLIBC__ namespace std { diff --git a/sycl/include/sycl/ext/intel/math.hpp b/sycl/include/sycl/ext/intel/math.hpp index d652cdccb4ff3..479310ec7181b 100644 --- a/sycl/include/sycl/ext/intel/math.hpp +++ b/sycl/include/sycl/ext/intel/math.hpp @@ -24,7 +24,7 @@ using _iml_half_internal = uint16_t; using _iml_bf16_internal = uint16_t; #include -#include +#include #include #include #include @@ -34,38 +34,6 @@ using _iml_bf16_internal = uint16_t; #include #include -extern "C" { -float __imf_saturatef(float); -float __imf_copysignf(float, float); -double __imf_copysign(double, double); -_iml_half_internal __imf_copysignf16(_iml_half_internal, _iml_half_internal); -float __imf_ceilf(float); -double __imf_ceil(double); -_iml_half_internal __imf_ceilf16(_iml_half_internal); -float __imf_floorf(float); -double __imf_floor(double); -_iml_half_internal __imf_floorf16(_iml_half_internal); -float __imf_fsigmf(float); -_iml_half_internal __imf_fsigmf16(_iml_half_internal); -_iml_bf16_internal __imf_fsigmbf16(_iml_bf16_internal); -float __imf_rintf(float); -double __imf_rint(double); -_iml_half_internal __imf_invf16(_iml_half_internal); -float __imf_invf(float); -double __imf_inv(double); -_iml_half_internal __imf_rintf16(_iml_half_internal); -float __imf_sqrtf(float); -double __imf_sqrt(double); -_iml_half_internal __imf_sqrtf16(_iml_half_internal); -float __imf_rsqrtf(float); -double __imf_rsqrt(double); -_iml_half_internal __imf_rsqrtf16(_iml_half_internal); -float __imf_truncf(float); -double __imf_trunc(double); -_iml_half_internal __imf_truncf16(_iml_half_internal); -double __imf_rcp64h(double); -}; - namespace sycl { inline namespace _V1 { namespace ext::intel::math { @@ -73,11 +41,30 @@ namespace ext::intel::math { static_assert(sizeof(sycl::half) == sizeof(_iml_half_internal), "sycl::half is not compatible with _iml_half_internal."); +/// -------------------------------------------------------------------------- +/// saturate(x) function +/// Clamps the float input to [+0.0, 1.0]. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL float __imf_saturatef(float); +}; + template std::enable_if_t, float> saturate(Tp x) { return __imf_saturatef(x); } +/// -------------------------------------------------------------------------- +/// copysign(x) function +/// Creates value with given magnitude, copying sign of second input. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL float __imf_copysignf(float, float); +__DPCPP_SYCL_EXTERNAL double __imf_copysign(double, double); +__DPCPP_SYCL_EXTERNAL +_iml_half_internal __imf_copysignf16(_iml_half_internal, _iml_half_internal); +}; + template std::enable_if_t, float> copysign(Tp x, Tp y) { return __imf_copysignf(x, y); @@ -96,6 +83,16 @@ std::enable_if_t, sycl::half> copysign(Tp x, return sycl::bit_cast(__imf_copysignf16(xi, yi)); } +/// -------------------------------------------------------------------------- +/// ceil(x) function +/// Returns ceiling value of the input. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL float __imf_ceilf(float); +__DPCPP_SYCL_EXTERNAL double __imf_ceil(double); +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_ceilf16(_iml_half_internal); +}; + template std::enable_if_t, float> ceil(Tp x) { return __imf_ceilf(x); @@ -117,6 +114,16 @@ std::enable_if_t, sycl::half2> ceil(Tp x) { return sycl::half2{ceil(x.s0()), ceil(x.s1())}; } +/// -------------------------------------------------------------------------- +/// floor(x) function +/// Returns the largest integral value less than or equal to input. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL float __imf_floorf(float); +__DPCPP_SYCL_EXTERNAL double __imf_floor(double); +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_floorf16(_iml_half_internal); +}; + template std::enable_if_t, float> floor(Tp x) { return __imf_floorf(x); @@ -138,6 +145,16 @@ std::enable_if_t, sycl::half2> floor(Tp x) { return sycl::half2{floor(x.s0()), floor(x.s1())}; } +/// -------------------------------------------------------------------------- +/// inv(x) function +/// Returns 1.0 / x. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL float __imf_invf(float); +__DPCPP_SYCL_EXTERNAL double __imf_inv(double); +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_invf16(_iml_half_internal); +}; + template std::enable_if_t, float> inv(Tp x) { return __imf_invf(x); @@ -159,6 +176,16 @@ std::enable_if_t, sycl::half2> inv(Tp x) { return sycl::half2{inv(x.s0()), inv(x.s1())}; } +/// -------------------------------------------------------------------------- +/// rint(x) function +/// Rounds a floating-point value to the nearest integer value. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL float __imf_rintf(float); +__DPCPP_SYCL_EXTERNAL double __imf_rint(double); +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_rintf16(_iml_half_internal); +}; + template std::enable_if_t, float> rint(Tp x) { return __imf_rintf(x); @@ -180,6 +207,16 @@ std::enable_if_t, sycl::half2> rint(Tp x) { return sycl::half2{rint(x.s0()), rint(x.s1())}; } +/// -------------------------------------------------------------------------- +/// sqrt(x) function +/// Returns square root of input. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL float __imf_sqrtf(float); +__DPCPP_SYCL_EXTERNAL double __imf_sqrt(double); +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_sqrtf16(_iml_half_internal); +}; + template std::enable_if_t, float> sqrt(Tp x) { return __imf_sqrtf(x); @@ -201,6 +238,16 @@ std::enable_if_t, sycl::half2> sqrt(Tp x) { return sycl::half2{sqrt(x.s0()), sqrt(x.s1())}; } +/// -------------------------------------------------------------------------- +/// rsqrt(x) function +/// Returns 1.0 / sqrt(x). +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL float __imf_rsqrtf(float); +__DPCPP_SYCL_EXTERNAL double __imf_rsqrt(double); +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_rsqrtf16(_iml_half_internal); +}; + template std::enable_if_t, float> rsqrt(Tp x) { return __imf_rsqrtf(x); @@ -222,6 +269,16 @@ std::enable_if_t, sycl::half2> rsqrt(Tp x) { return sycl::half2{rsqrt(x.s0()), rsqrt(x.s1())}; } +/// -------------------------------------------------------------------------- +/// trunc(x) function +/// Truncates input to the integral part. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL float __imf_truncf(float); +__DPCPP_SYCL_EXTERNAL double __imf_trunc(double); +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_truncf16(_iml_half_internal); +}; + template std::enable_if_t, float> trunc(Tp x) { return __imf_truncf(x); @@ -243,17 +300,26 @@ std::enable_if_t, sycl::half2> trunc(Tp x) { return sycl::half2{trunc(x.s0()), trunc(x.s1())}; } +/// -------------------------------------------------------------------------- +/// rcp64h(x) function +/// Provides high 32 bits of 1.0 / x. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL double __imf_rcp64h(double); +}; + template std::enable_if_t, double> rcp64h(Tp x) { return __imf_rcp64h(x); } + /// -------------------------------------------------------------------------- /// sigmoid(x) function /// -------------------------------------------------------------------------- extern "C" { -_iml_bf16_internal __imf_fsigmbf16(_iml_bf16_internal x); -_iml_half_internal __imf_fsigmf16(_iml_half_internal x); -float __imf_fsigmf(float x); +__DPCPP_SYCL_EXTERNAL _iml_bf16_internal __imf_fsigmbf16(_iml_bf16_internal x); +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_fsigmf16(_iml_half_internal x); +__DPCPP_SYCL_EXTERNAL float __imf_fsigmf(float x); }; template diff --git a/sycl/include/sycl/ext/intel/math/imf_fp_conversions.hpp b/sycl/include/sycl/ext/intel/math/imf_fp_conversions.hpp index 50c1e35225a6a..1a4ce4b47b2eb 100644 --- a/sycl/include/sycl/ext/intel/math/imf_fp_conversions.hpp +++ b/sycl/include/sycl/ext/intel/math/imf_fp_conversions.hpp @@ -15,201 +15,33 @@ #include #include -extern "C" { -int __imf_float2int_rd(float); -int __imf_float2int_rn(float); -int __imf_float2int_ru(float); -int __imf_float2int_rz(float); -unsigned int __imf_float2uint_rd(float); -unsigned int __imf_float2uint_rn(float); -unsigned int __imf_float2uint_ru(float); -unsigned int __imf_float2uint_rz(float); -long long int __imf_float2ll_rd(float); -long long int __imf_float2ll_rn(float); -long long int __imf_float2ll_ru(float); -long long int __imf_float2ll_rz(float); -unsigned long long int __imf_float2ull_rd(float); -unsigned long long int __imf_float2ull_rn(float); -unsigned long long int __imf_float2ull_ru(float); -unsigned long long int __imf_float2ull_rz(float); -int __imf_float_as_int(float); -unsigned int __imf_float_as_uint(float); -float __imf_int2float_rd(int); -float __imf_int2float_rn(int); -float __imf_int2float_ru(int); -float __imf_int2float_rz(int); -float __imf_int_as_float(int); -float __imf_ll2float_rd(long long int); -float __imf_ll2float_rn(long long int); -float __imf_ll2float_ru(long long int); -float __imf_ll2float_rz(long long int); -float __imf_uint2float_rd(unsigned int); -float __imf_uint2float_rn(unsigned int); -float __imf_uint2float_ru(unsigned int); -float __imf_uint2float_rz(unsigned int); -float __imf_uint_as_float(unsigned int); -float __imf_ull2float_rd(unsigned long long int); -float __imf_ull2float_rn(unsigned long long int); -float __imf_ull2float_ru(unsigned long long int); -float __imf_ull2float_rz(unsigned long long int); -float __imf_double2float_rd(double); -float __imf_double2float_rn(double); -float __imf_double2float_ru(double); -float __imf_double2float_rz(double); -int __imf_double2hiint(double); -int __imf_double2loint(double); -int __imf_double2int_rd(double); -int __imf_double2int_rn(double); -int __imf_double2int_ru(double); -int __imf_double2int_rz(double); -long long __imf_double2ll_rd(double); -long long __imf_double2ll_rn(double); -long long __imf_double2ll_ru(double); -long long __imf_double2ll_rz(double); -unsigned int __imf_double2uint_rd(double); -unsigned int __imf_double2uint_rn(double); -unsigned int __imf_double2uint_ru(double); -unsigned int __imf_double2uint_rz(double); -unsigned long long __imf_double2ull_rd(double); -unsigned long long __imf_double2ull_rn(double); -unsigned long long __imf_double2ull_ru(double); -unsigned long long __imf_double2ull_rz(double); -long long __imf_double_as_longlong(double); -double __imf_hiloint2double(int, int); -double __imf_int2double_rn(int); -double __imf_ll2double_rd(long long); -double __imf_ll2double_rn(long long); -double __imf_ll2double_ru(long long); -double __imf_ll2double_rz(long long); -double __imf_longlong_as_double(long long); -double __imf_uint2double_rn(unsigned); -double __imf_ull2double_rd(unsigned long long); -double __imf_ull2double_rn(unsigned long long); -double __imf_ull2double_ru(unsigned long long); -double __imf_ull2double_rz(unsigned long long); -float __imf_half2float(_iml_half_internal); -_iml_half_internal __imf_float2half_rd(float); -_iml_half_internal __imf_float2half_rn(float); -_iml_half_internal __imf_float2half_ru(float); -_iml_half_internal __imf_float2half_rz(float); -int __imf_half2int_rd(_iml_half_internal); -int __imf_half2int_rn(_iml_half_internal); -int __imf_half2int_ru(_iml_half_internal); -int __imf_half2int_rz(_iml_half_internal); -long long __imf_half2ll_rd(_iml_half_internal); -long long __imf_half2ll_rn(_iml_half_internal); -long long __imf_half2ll_ru(_iml_half_internal); -long long __imf_half2ll_rz(_iml_half_internal); -short __imf_half2short_rd(_iml_half_internal); -short __imf_half2short_rn(_iml_half_internal); -short __imf_half2short_ru(_iml_half_internal); -short __imf_half2short_rz(_iml_half_internal); -unsigned int __imf_half2uint_rd(_iml_half_internal); -unsigned int __imf_half2uint_rn(_iml_half_internal); -unsigned int __imf_half2uint_ru(_iml_half_internal); -unsigned int __imf_half2uint_rz(_iml_half_internal); -unsigned long long __imf_half2ull_rd(_iml_half_internal); -unsigned long long __imf_half2ull_rn(_iml_half_internal); -unsigned long long __imf_half2ull_ru(_iml_half_internal); -unsigned long long __imf_half2ull_rz(_iml_half_internal); -unsigned short __imf_half2ushort_rd(_iml_half_internal); -unsigned short __imf_half2ushort_rn(_iml_half_internal); -unsigned short __imf_half2ushort_ru(_iml_half_internal); -unsigned short __imf_half2ushort_rz(_iml_half_internal); -short __imf_half_as_short(_iml_half_internal); -unsigned short __imf_half_as_ushort(_iml_half_internal); -_iml_half_internal __imf_int2half_rd(int); -_iml_half_internal __imf_int2half_rn(int); -_iml_half_internal __imf_int2half_ru(int); -_iml_half_internal __imf_int2half_rz(int); -_iml_half_internal __imf_ll2half_rd(long long); -_iml_half_internal __imf_ll2half_rn(long long); -_iml_half_internal __imf_ll2half_ru(long long); -_iml_half_internal __imf_ll2half_rz(long long); -_iml_half_internal __imf_short2half_rd(short); -_iml_half_internal __imf_short2half_rn(short); -_iml_half_internal __imf_short2half_ru(short); -_iml_half_internal __imf_short2half_rz(short); -_iml_half_internal __imf_short_as_half(short); -_iml_half_internal __imf_uint2half_rd(unsigned int); -_iml_half_internal __imf_uint2half_rn(unsigned int); -_iml_half_internal __imf_uint2half_ru(unsigned int); -_iml_half_internal __imf_uint2half_rz(unsigned int); -_iml_half_internal __imf_ull2half_rd(unsigned long long); -_iml_half_internal __imf_ull2half_rn(unsigned long long); -_iml_half_internal __imf_ull2half_ru(unsigned long long); -_iml_half_internal __imf_ull2half_rz(unsigned long long); -_iml_half_internal __imf_ushort2half_rd(unsigned short); -_iml_half_internal __imf_ushort2half_rn(unsigned short); -_iml_half_internal __imf_ushort2half_ru(unsigned short); -_iml_half_internal __imf_ushort2half_rz(unsigned short); -_iml_half_internal __imf_ushort_as_half(unsigned short); -_iml_half_internal __imf_double2half(double); -unsigned short __imf_bfloat162ushort_rd(uint16_t); -unsigned short __imf_bfloat162ushort_rn(uint16_t); -unsigned short __imf_bfloat162ushort_ru(uint16_t); -unsigned short __imf_bfloat162ushort_rz(uint16_t); -short __imf_bfloat162short_rd(uint16_t); -short __imf_bfloat162short_rn(uint16_t); -short __imf_bfloat162short_ru(uint16_t); -short __imf_bfloat162short_rz(uint16_t); -unsigned int __imf_bfloat162uint_rd(uint16_t); -unsigned int __imf_bfloat162uint_rn(uint16_t); -unsigned int __imf_bfloat162uint_ru(uint16_t); -unsigned int __imf_bfloat162uint_rz(uint16_t); -int __imf_bfloat162int_rd(uint16_t); -int __imf_bfloat162int_rn(uint16_t); -int __imf_bfloat162int_ru(uint16_t); -int __imf_bfloat162int_rz(uint16_t); -unsigned long long __imf_bfloat162ull_rd(uint16_t); -unsigned long long __imf_bfloat162ull_rn(uint16_t); -unsigned long long __imf_bfloat162ull_ru(uint16_t); -unsigned long long __imf_bfloat162ull_rz(uint16_t); -long long __imf_bfloat162ll_rd(uint16_t); -long long __imf_bfloat162ll_rn(uint16_t); -long long __imf_bfloat162ll_ru(uint16_t); -long long __imf_bfloat162ll_rz(uint16_t); -float __imf_bfloat162float(uint16_t); -uint16_t __imf_float2bfloat16(float); -uint16_t __imf_float2bfloat16_rd(float); -uint16_t __imf_float2bfloat16_rn(float); -uint16_t __imf_float2bfloat16_ru(float); -uint16_t __imf_float2bfloat16_rz(float); -uint16_t __imf_ushort2bfloat16_rd(unsigned short); -uint16_t __imf_ushort2bfloat16_rn(unsigned short); -uint16_t __imf_ushort2bfloat16_ru(unsigned short); -uint16_t __imf_ushort2bfloat16_rz(unsigned short); -uint16_t __imf_uint2bfloat16_rd(unsigned int); -uint16_t __imf_uint2bfloat16_rn(unsigned int); -uint16_t __imf_uint2bfloat16_ru(unsigned int); -uint16_t __imf_uint2bfloat16_rz(unsigned int); -uint16_t __imf_ull2bfloat16_rd(unsigned long long); -uint16_t __imf_ull2bfloat16_rn(unsigned long long); -uint16_t __imf_ull2bfloat16_ru(unsigned long long); -uint16_t __imf_ull2bfloat16_rz(unsigned long long); -uint16_t __imf_short2bfloat16_rd(short); -uint16_t __imf_short2bfloat16_rn(short); -uint16_t __imf_short2bfloat16_ru(short); -uint16_t __imf_short2bfloat16_rz(short); -uint16_t __imf_int2bfloat16_rd(int); -uint16_t __imf_int2bfloat16_rn(int); -uint16_t __imf_int2bfloat16_ru(int); -uint16_t __imf_int2bfloat16_rz(int); -uint16_t __imf_ll2bfloat16_rd(long long); -uint16_t __imf_ll2bfloat16_rn(long long); -uint16_t __imf_ll2bfloat16_ru(long long); -uint16_t __imf_ll2bfloat16_rz(long long); -uint16_t __imf_double2bfloat16(double); -short __imf_bfloat16_as_short(uint16_t); -unsigned short __imf_bfloat16_as_ushort(uint16_t); -uint16_t __imf_short_as_bfloat16(short); -uint16_t __imf_ushort_as_bfloat16(unsigned short); -}; - namespace sycl { inline namespace _V1 { namespace ext::intel::math { +/// -------------------------------------------------------------------------- +/// Float to integral conversions +/// Supports rounding down/up/to-nearest-even/towards-zero +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL int __imf_float2int_rd(float); +__DPCPP_SYCL_EXTERNAL int __imf_float2int_rn(float); +__DPCPP_SYCL_EXTERNAL int __imf_float2int_ru(float); +__DPCPP_SYCL_EXTERNAL int __imf_float2int_rz(float); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_float2uint_rd(float); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_float2uint_rn(float); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_float2uint_ru(float); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_float2uint_rz(float); +__DPCPP_SYCL_EXTERNAL long long int __imf_float2ll_rd(float); +__DPCPP_SYCL_EXTERNAL long long int __imf_float2ll_rn(float); +__DPCPP_SYCL_EXTERNAL long long int __imf_float2ll_ru(float); +__DPCPP_SYCL_EXTERNAL long long int __imf_float2ll_rz(float); +__DPCPP_SYCL_EXTERNAL unsigned long long int __imf_float2ull_rd(float); +__DPCPP_SYCL_EXTERNAL unsigned long long int __imf_float2ull_rn(float); +__DPCPP_SYCL_EXTERNAL unsigned long long int __imf_float2ull_ru(float); +__DPCPP_SYCL_EXTERNAL unsigned long long int __imf_float2ull_rz(float); +}; + template To float2int_rd(From x) { return __imf_float2int_rd(x); } @@ -286,6 +118,29 @@ To float2ull_rz(From x) { return __imf_float2ull_rz(x); } +/// -------------------------------------------------------------------------- +/// Integral to float conversions +/// Supports rounding down/up/to-nearest-even/towards-zero +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL float __imf_int2float_rd(int); +__DPCPP_SYCL_EXTERNAL float __imf_int2float_rn(int); +__DPCPP_SYCL_EXTERNAL float __imf_int2float_ru(int); +__DPCPP_SYCL_EXTERNAL float __imf_int2float_rz(int); +__DPCPP_SYCL_EXTERNAL float __imf_ll2float_rd(long long int); +__DPCPP_SYCL_EXTERNAL float __imf_ll2float_rn(long long int); +__DPCPP_SYCL_EXTERNAL float __imf_ll2float_ru(long long int); +__DPCPP_SYCL_EXTERNAL float __imf_ll2float_rz(long long int); +__DPCPP_SYCL_EXTERNAL float __imf_uint2float_rd(unsigned int); +__DPCPP_SYCL_EXTERNAL float __imf_uint2float_rn(unsigned int); +__DPCPP_SYCL_EXTERNAL float __imf_uint2float_ru(unsigned int); +__DPCPP_SYCL_EXTERNAL float __imf_uint2float_rz(unsigned int); +__DPCPP_SYCL_EXTERNAL float __imf_ull2float_rd(unsigned long long int); +__DPCPP_SYCL_EXTERNAL float __imf_ull2float_rn(unsigned long long int); +__DPCPP_SYCL_EXTERNAL float __imf_ull2float_ru(unsigned long long int); +__DPCPP_SYCL_EXTERNAL float __imf_ull2float_rz(unsigned long long int); +}; + template To ll2float_rd(From x) { return __imf_ll2float_rd(x); @@ -362,6 +217,16 @@ To uint2float_rz(From x) { return __imf_uint2float_rz(x); } +/// -------------------------------------------------------------------------- +/// Reinterprets bits in float type as int/unsigned int type and vice versa. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL int __imf_float_as_int(float); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_float_as_uint(float); +__DPCPP_SYCL_EXTERNAL float __imf_int_as_float(int); +__DPCPP_SYCL_EXTERNAL float __imf_uint_as_float(unsigned int); +}; + template To float_as_int(From x) { return __imf_float_as_int(x); } @@ -380,6 +245,17 @@ To uint_as_float(From x) { return __imf_uint_as_float(x); } +/// -------------------------------------------------------------------------- +/// Double to float conversions +/// Supports rounding down/up/to-nearest-even/towards-zero +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL float __imf_double2float_rd(double); +__DPCPP_SYCL_EXTERNAL float __imf_double2float_rn(double); +__DPCPP_SYCL_EXTERNAL float __imf_double2float_ru(double); +__DPCPP_SYCL_EXTERNAL float __imf_double2float_rz(double); +}; + template To double2float_rd(From x) { return __imf_double2float_rd(x); @@ -400,6 +276,14 @@ To double2float_rz(From x) { return __imf_double2float_rz(x); } +/// -------------------------------------------------------------------------- +/// Reinterprets high/low 32 bits in a double as a signed integer. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL int __imf_double2hiint(double); +__DPCPP_SYCL_EXTERNAL int __imf_double2loint(double); +}; + template To double2hiint(From x) { return __imf_double2hiint(x); } @@ -408,6 +292,29 @@ template To double2loint(From x) { return __imf_double2loint(x); } +/// -------------------------------------------------------------------------- +/// Double to integral conversions +/// Supports rounding down/up/to-nearest-even/towards-zero +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL int __imf_double2int_rd(double); +__DPCPP_SYCL_EXTERNAL int __imf_double2int_rn(double); +__DPCPP_SYCL_EXTERNAL int __imf_double2int_ru(double); +__DPCPP_SYCL_EXTERNAL int __imf_double2int_rz(double); +__DPCPP_SYCL_EXTERNAL long long __imf_double2ll_rd(double); +__DPCPP_SYCL_EXTERNAL long long __imf_double2ll_rn(double); +__DPCPP_SYCL_EXTERNAL long long __imf_double2ll_ru(double); +__DPCPP_SYCL_EXTERNAL long long __imf_double2ll_rz(double); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_double2uint_rd(double); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_double2uint_rn(double); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_double2uint_ru(double); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_double2uint_rz(double); +__DPCPP_SYCL_EXTERNAL unsigned long long __imf_double2ull_rd(double); +__DPCPP_SYCL_EXTERNAL unsigned long long __imf_double2ull_rn(double); +__DPCPP_SYCL_EXTERNAL unsigned long long __imf_double2ull_ru(double); +__DPCPP_SYCL_EXTERNAL unsigned long long __imf_double2ull_rz(double); +}; + template To double2int_rd(From x) { return __imf_double2int_rd(x); } @@ -484,6 +391,14 @@ To double2ull_rz(From x) { return __imf_double2ull_rz(x); } +/// -------------------------------------------------------------------------- +/// Reinterprets bits in double as long long and vice versa. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL long long __imf_double_as_longlong(double); +__DPCPP_SYCL_EXTERNAL double __imf_longlong_as_double(long long); +}; + template To double_as_longlong(From x) { return __imf_double_as_longlong(x); @@ -494,11 +409,26 @@ To longlong_as_double(From x) { return __imf_longlong_as_double(x); } +/// -------------------------------------------------------------------------- +/// Reinterprets 2 32-bit integers as high and low 32-bits in a double value. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL double __imf_hiloint2double(int, int); +}; + template To hiloint2double(From x, From y) { return __imf_hiloint2double(x, y); } +/// -------------------------------------------------------------------------- +/// Signed and unsigned int type to double conversions in rounding towards-zero +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL double __imf_int2double_rn(int); +__DPCPP_SYCL_EXTERNAL double __imf_uint2double_rn(unsigned); +}; + template To int2double_rn(From x) { return __imf_int2double_rn(x); } @@ -508,6 +438,21 @@ To uint2double_rn(From x) { return __imf_uint2double_rn(x); } +/// -------------------------------------------------------------------------- +/// Signed and unsigned long long type to double conversions +/// Supports rounding down/up/to-nearest-even/towards-zero +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL double __imf_ll2double_rd(long long); +__DPCPP_SYCL_EXTERNAL double __imf_ll2double_rn(long long); +__DPCPP_SYCL_EXTERNAL double __imf_ll2double_ru(long long); +__DPCPP_SYCL_EXTERNAL double __imf_ll2double_rz(long long); +__DPCPP_SYCL_EXTERNAL double __imf_ull2double_rd(unsigned long long); +__DPCPP_SYCL_EXTERNAL double __imf_ull2double_rn(unsigned long long); +__DPCPP_SYCL_EXTERNAL double __imf_ull2double_ru(unsigned long long); +__DPCPP_SYCL_EXTERNAL double __imf_ull2double_rz(unsigned long long); +}; + template To ll2double_rd(From x) { return __imf_ll2double_rd(x); @@ -548,11 +493,29 @@ To ull2double_rz(From x) { return __imf_ull2double_rz(x); } +/// -------------------------------------------------------------------------- +/// Half to float conversions +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL float __imf_half2float(_iml_half_internal); +}; + template To half2float(From x) { return __imf_half2float(sycl::bit_cast<_iml_half_internal>(x)); } +/// -------------------------------------------------------------------------- +/// Float to half conversions +/// Supports rounding down/up/to-nearest-even/towards-zero +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_float2half_rd(float); +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_float2half_rn(float); +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_float2half_ru(float); +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_float2half_rz(float); +}; + template To float2half_rn(From x) { return sycl::bit_cast(__imf_float2half_rn(x)); @@ -573,11 +536,49 @@ To float2half_rz(From x) { return sycl::bit_cast(__imf_float2half_rz(x)); } +/// -------------------------------------------------------------------------- +/// Double to half conversions in rounding to-nearest-even. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_double2half(double); +}; + template To double2half(From x) { return sycl::bit_cast(__imf_double2half(x)); } +/// -------------------------------------------------------------------------- +/// Half to integral type conversions +/// Supports rounding down/up/to-nearest-even/towards-zero +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL int __imf_half2int_rd(_iml_half_internal); +__DPCPP_SYCL_EXTERNAL int __imf_half2int_rn(_iml_half_internal); +__DPCPP_SYCL_EXTERNAL int __imf_half2int_ru(_iml_half_internal); +__DPCPP_SYCL_EXTERNAL int __imf_half2int_rz(_iml_half_internal); +__DPCPP_SYCL_EXTERNAL long long __imf_half2ll_rd(_iml_half_internal); +__DPCPP_SYCL_EXTERNAL long long __imf_half2ll_rn(_iml_half_internal); +__DPCPP_SYCL_EXTERNAL long long __imf_half2ll_ru(_iml_half_internal); +__DPCPP_SYCL_EXTERNAL long long __imf_half2ll_rz(_iml_half_internal); +__DPCPP_SYCL_EXTERNAL short __imf_half2short_rd(_iml_half_internal); +__DPCPP_SYCL_EXTERNAL short __imf_half2short_rn(_iml_half_internal); +__DPCPP_SYCL_EXTERNAL short __imf_half2short_ru(_iml_half_internal); +__DPCPP_SYCL_EXTERNAL short __imf_half2short_rz(_iml_half_internal); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_half2uint_rd(_iml_half_internal); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_half2uint_rn(_iml_half_internal); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_half2uint_ru(_iml_half_internal); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_half2uint_rz(_iml_half_internal); +__DPCPP_SYCL_EXTERNAL unsigned long long __imf_half2ull_rd(_iml_half_internal); +__DPCPP_SYCL_EXTERNAL unsigned long long __imf_half2ull_rn(_iml_half_internal); +__DPCPP_SYCL_EXTERNAL unsigned long long __imf_half2ull_ru(_iml_half_internal); +__DPCPP_SYCL_EXTERNAL unsigned long long __imf_half2ull_rz(_iml_half_internal); +__DPCPP_SYCL_EXTERNAL unsigned short __imf_half2ushort_rd(_iml_half_internal); +__DPCPP_SYCL_EXTERNAL unsigned short __imf_half2ushort_rn(_iml_half_internal); +__DPCPP_SYCL_EXTERNAL unsigned short __imf_half2ushort_ru(_iml_half_internal); +__DPCPP_SYCL_EXTERNAL unsigned short __imf_half2ushort_rz(_iml_half_internal); +}; + template To half2int_rn(From x) { return __imf_half2int_rn(sycl::bit_cast<_iml_half_internal>(x)); @@ -698,6 +699,37 @@ To half2ull_rz(From x) { return __imf_half2ull_rz(sycl::bit_cast<_iml_half_internal>(x)); } +/// -------------------------------------------------------------------------- +/// Integral type to half conversions +/// Supports rounding down/up/to-nearest-even/towards-zero +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_int2half_rd(int); +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_int2half_rn(int); +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_int2half_ru(int); +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_int2half_rz(int); +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_ll2half_rd(long long); +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_ll2half_rn(long long); +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_ll2half_ru(long long); +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_ll2half_rz(long long); +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_short2half_rd(short); +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_short2half_rn(short); +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_short2half_ru(short); +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_short2half_rz(short); +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_uint2half_rd(unsigned int); +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_uint2half_rn(unsigned int); +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_uint2half_ru(unsigned int); +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_uint2half_rz(unsigned int); +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_ull2half_rd(unsigned long long); +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_ull2half_rn(unsigned long long); +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_ull2half_ru(unsigned long long); +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_ull2half_rz(unsigned long long); +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_ushort2half_rd(unsigned short); +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_ushort2half_rn(unsigned short); +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_ushort2half_ru(unsigned short); +__DPCPP_SYCL_EXTERNAL _iml_half_internal __imf_ushort2half_rz(unsigned short); +}; + template To int2half_rn(From x) { return sycl::bit_cast(__imf_int2half_rn(x)); @@ -818,6 +850,16 @@ To ull2half_rz(From x) { return sycl::bit_cast(__imf_ull2half_rz(x)); } +/// -------------------------------------------------------------------------- +/// Reinterprets bits in bfloat16 type as short/unsigned short and vice versa. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL short __imf_bfloat16_as_short(uint16_t); +__DPCPP_SYCL_EXTERNAL unsigned short __imf_bfloat16_as_ushort(uint16_t); +__DPCPP_SYCL_EXTERNAL uint16_t __imf_short_as_bfloat16(short); +__DPCPP_SYCL_EXTERNAL uint16_t __imf_ushort_as_bfloat16(unsigned short); +}; + template To bfloat16_as_ushort(From x) { @@ -842,11 +884,29 @@ To short_as_bfloat16(From x) { __imf_short_as_bfloat16(x)); } +/// -------------------------------------------------------------------------- +/// Bfloat16 to float conversions +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL float __imf_bfloat162float(uint16_t); +}; + template To bfloat162float(From x) { return __imf_bfloat162float(sycl::bit_cast(x)); } +/// -------------------------------------------------------------------------- +/// Float to bfloat16 conversions +/// Supports rounding down/up/to-nearest-even/towards-zero +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL uint16_t __imf_float2bfloat16_rd(float); +__DPCPP_SYCL_EXTERNAL uint16_t __imf_float2bfloat16_rn(float); +__DPCPP_SYCL_EXTERNAL uint16_t __imf_float2bfloat16_ru(float); +__DPCPP_SYCL_EXTERNAL uint16_t __imf_float2bfloat16_rz(float); +}; + template To float2bfloat16(From x) { return sycl::bit_cast( @@ -877,6 +937,37 @@ To float2bfloat16_rz(From x) { __imf_float2bfloat16_rz(x)); } +/// -------------------------------------------------------------------------- +/// Integral to bfloat16 conversions +/// Supports rounding down/up/to-nearest-even/towards-zero +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL uint16_t __imf_ushort2bfloat16_rd(unsigned short); +__DPCPP_SYCL_EXTERNAL uint16_t __imf_ushort2bfloat16_rn(unsigned short); +__DPCPP_SYCL_EXTERNAL uint16_t __imf_ushort2bfloat16_ru(unsigned short); +__DPCPP_SYCL_EXTERNAL uint16_t __imf_ushort2bfloat16_rz(unsigned short); +__DPCPP_SYCL_EXTERNAL uint16_t __imf_uint2bfloat16_rd(unsigned int); +__DPCPP_SYCL_EXTERNAL uint16_t __imf_uint2bfloat16_rn(unsigned int); +__DPCPP_SYCL_EXTERNAL uint16_t __imf_uint2bfloat16_ru(unsigned int); +__DPCPP_SYCL_EXTERNAL uint16_t __imf_uint2bfloat16_rz(unsigned int); +__DPCPP_SYCL_EXTERNAL uint16_t __imf_ull2bfloat16_rd(unsigned long long); +__DPCPP_SYCL_EXTERNAL uint16_t __imf_ull2bfloat16_rn(unsigned long long); +__DPCPP_SYCL_EXTERNAL uint16_t __imf_ull2bfloat16_ru(unsigned long long); +__DPCPP_SYCL_EXTERNAL uint16_t __imf_ull2bfloat16_rz(unsigned long long); +__DPCPP_SYCL_EXTERNAL uint16_t __imf_short2bfloat16_rd(short); +__DPCPP_SYCL_EXTERNAL uint16_t __imf_short2bfloat16_rn(short); +__DPCPP_SYCL_EXTERNAL uint16_t __imf_short2bfloat16_ru(short); +__DPCPP_SYCL_EXTERNAL uint16_t __imf_short2bfloat16_rz(short); +__DPCPP_SYCL_EXTERNAL uint16_t __imf_int2bfloat16_rd(int); +__DPCPP_SYCL_EXTERNAL uint16_t __imf_int2bfloat16_rn(int); +__DPCPP_SYCL_EXTERNAL uint16_t __imf_int2bfloat16_ru(int); +__DPCPP_SYCL_EXTERNAL uint16_t __imf_int2bfloat16_rz(int); +__DPCPP_SYCL_EXTERNAL uint16_t __imf_ll2bfloat16_rd(long long); +__DPCPP_SYCL_EXTERNAL uint16_t __imf_ll2bfloat16_rn(long long); +__DPCPP_SYCL_EXTERNAL uint16_t __imf_ll2bfloat16_ru(long long); +__DPCPP_SYCL_EXTERNAL uint16_t __imf_ll2bfloat16_rz(long long); +}; + template To ushort2bfloat16_rd(From x) { @@ -1017,11 +1108,49 @@ To ll2bfloat16_rz(From x) { return sycl::bit_cast(__imf_ll2bfloat16_rz(x)); } +/// -------------------------------------------------------------------------- +/// Double to bfloat16 conversions in rounding to-nearest-even. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL uint16_t __imf_double2bfloat16(double); +}; + template To double2bfloat16(From x) { return sycl::bit_cast(__imf_double2bfloat16(x)); } +/// -------------------------------------------------------------------------- +/// Bfloat16 to integral conversions +/// Supports rounding down/up/to-nearest-even/towards-zero +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL unsigned short __imf_bfloat162ushort_rd(uint16_t); +__DPCPP_SYCL_EXTERNAL unsigned short __imf_bfloat162ushort_rn(uint16_t); +__DPCPP_SYCL_EXTERNAL unsigned short __imf_bfloat162ushort_ru(uint16_t); +__DPCPP_SYCL_EXTERNAL unsigned short __imf_bfloat162ushort_rz(uint16_t); +__DPCPP_SYCL_EXTERNAL short __imf_bfloat162short_rd(uint16_t); +__DPCPP_SYCL_EXTERNAL short __imf_bfloat162short_rn(uint16_t); +__DPCPP_SYCL_EXTERNAL short __imf_bfloat162short_ru(uint16_t); +__DPCPP_SYCL_EXTERNAL short __imf_bfloat162short_rz(uint16_t); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_bfloat162uint_rd(uint16_t); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_bfloat162uint_rn(uint16_t); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_bfloat162uint_ru(uint16_t); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_bfloat162uint_rz(uint16_t); +__DPCPP_SYCL_EXTERNAL int __imf_bfloat162int_rd(uint16_t); +__DPCPP_SYCL_EXTERNAL int __imf_bfloat162int_rn(uint16_t); +__DPCPP_SYCL_EXTERNAL int __imf_bfloat162int_ru(uint16_t); +__DPCPP_SYCL_EXTERNAL int __imf_bfloat162int_rz(uint16_t); +__DPCPP_SYCL_EXTERNAL unsigned long long __imf_bfloat162ull_rd(uint16_t); +__DPCPP_SYCL_EXTERNAL unsigned long long __imf_bfloat162ull_rn(uint16_t); +__DPCPP_SYCL_EXTERNAL unsigned long long __imf_bfloat162ull_ru(uint16_t); +__DPCPP_SYCL_EXTERNAL unsigned long long __imf_bfloat162ull_rz(uint16_t); +__DPCPP_SYCL_EXTERNAL long long __imf_bfloat162ll_rd(uint16_t); +__DPCPP_SYCL_EXTERNAL long long __imf_bfloat162ll_rn(uint16_t); +__DPCPP_SYCL_EXTERNAL long long __imf_bfloat162ll_ru(uint16_t); +__DPCPP_SYCL_EXTERNAL long long __imf_bfloat162ll_rz(uint16_t); +}; + template To bfloat162uint_rd(From x) { diff --git a/sycl/include/sycl/ext/intel/math/imf_integer_utils.hpp b/sycl/include/sycl/ext/intel/math/imf_integer_utils.hpp index 0a72281e1ea54..cd5f635ca56b4 100644 --- a/sycl/include/sycl/ext/intel/math/imf_integer_utils.hpp +++ b/sycl/include/sycl/ext/intel/math/imf_integer_utils.hpp @@ -10,52 +10,52 @@ #pragma once -extern "C" { -unsigned __imf_brev(unsigned); -unsigned long long __imf_brevll(unsigned long long); -unsigned __imf_byte_perm(unsigned, unsigned, unsigned); -long long __imf_llmax(long long x, long long y); -long long __imf_llmin(long long x, long long y); -int __imf_max(int x, int y); -int __imf_min(int x, int y); -unsigned long long __imf_ullmax(unsigned long long x, unsigned long long y); -unsigned long long __imf_ullmin(unsigned long long x, unsigned long long y); -unsigned __imf_umax(unsigned x, unsigned y); -unsigned __imf_umin(unsigned x, unsigned y); -int __imf_clz(int); -int __imf_clzll(long long); -int __imf_ffs(int); -int __imf_ffsll(long long); -int __imf_mul24(int, int); -int __imf_mulhi(int, int); -long long __imf_mul64hi(long long, long long); -int __imf_popc(unsigned); -int __imf_popcll(unsigned long long); -int __imf_rhadd(int, int); -int __imf_hadd(int, int); -unsigned __imf_sad(int, int, unsigned); -unsigned __imf_uhadd(unsigned, unsigned); -unsigned __imf_umul24(unsigned, unsigned); -unsigned __imf_umulhi(unsigned, unsigned); -unsigned long long __imf_umul64hi(unsigned long long, unsigned long long); -unsigned __imf_urhadd(unsigned, unsigned); -unsigned __imf_usad(unsigned, unsigned, unsigned); -} - +#include namespace sycl { inline namespace _V1 { namespace ext::intel::math { +/// -------------------------------------------------------------------------- +/// Reverses the bit order of unsigned integral type. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL unsigned __imf_brev(unsigned); +__DPCPP_SYCL_EXTERNAL unsigned long long __imf_brevll(unsigned long long); +}; + template unsigned brev(Tp x) { return __imf_brev(x); } template unsigned long long brevll(Tp x) { return __imf_brevll(x); } +/// -------------------------------------------------------------------------- +/// Returns selected bytes from two 32-bit unsigned integers. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL unsigned __imf_byte_perm(unsigned, unsigned, unsigned); +}; + template unsigned byte_perm(Tp x, Tp y, Tp z) { return __imf_byte_perm(x, y, z); } +/// -------------------------------------------------------------------------- +/// Returns maximum/minimum of the integral type input values. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL long long __imf_llmax(long long x, long long y); +__DPCPP_SYCL_EXTERNAL long long __imf_llmin(long long x, long long y); +__DPCPP_SYCL_EXTERNAL int __imf_max(int x, int y); +__DPCPP_SYCL_EXTERNAL int __imf_min(int x, int y); +__DPCPP_SYCL_EXTERNAL unsigned long long __imf_ullmax(unsigned long long x, + unsigned long long y); +__DPCPP_SYCL_EXTERNAL unsigned long long __imf_ullmin(unsigned long long x, + unsigned long long y); +__DPCPP_SYCL_EXTERNAL unsigned __imf_umax(unsigned x, unsigned y); +__DPCPP_SYCL_EXTERNAL unsigned __imf_umin(unsigned x, unsigned y); +}; + template int max(Tp x, Tp y) { return __imf_max(x, y); } template int min(Tp x, Tp y) { return __imf_min(x, y); } @@ -86,14 +86,45 @@ unsigned long long ullmin(Tp x, Tp y) { return __imf_ullmin(x, y); } +/// -------------------------------------------------------------------------- +/// Returns the number of consecutive leading 0 bits in 32/64-bit integer +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL int __imf_clz(int); +__DPCPP_SYCL_EXTERNAL int __imf_clzll(long long); +}; + template int clz(Tp x) { return __imf_clz(x); } template int clzll(Tp x) { return __imf_clzll(x); } +/// -------------------------------------------------------------------------- +/// Finds the position of the LSB set to 1 in a 32/64-bit integer +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL int __imf_ffs(int); +__DPCPP_SYCL_EXTERNAL int __imf_ffsll(long long); +}; + template int ffs(Tp x) { return __imf_ffs(x); } template int ffsll(Tp x) { return __imf_ffsll(x); } +/// -------------------------------------------------------------------------- +/// hadd(x), uhadd(x) +/// Returns average of signed/unsigned int type, avoiding overflow in +/// intermediate sum. +/// rhadd(x), urhadd(x) +/// Returns rounded average of signed/unsigned int type, avoiding overflow in +/// intermediate sum. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL int __imf_rhadd(int, int); +__DPCPP_SYCL_EXTERNAL int __imf_hadd(int, int); +__DPCPP_SYCL_EXTERNAL unsigned __imf_uhadd(unsigned, unsigned); +__DPCPP_SYCL_EXTERNAL unsigned __imf_urhadd(unsigned, unsigned); +}; + template int hadd(Tp x, Tp y) { return __imf_hadd(x, y); } template int rhadd(Tp x, Tp y) { return __imf_rhadd(x, y); } @@ -102,6 +133,31 @@ template unsigned urhadd(Tp x, Tp y) { return __imf_urhadd(x, y); } +template unsigned uhadd(Tp x, Tp y) { + return __imf_uhadd(x, y); +} + +/// -------------------------------------------------------------------------- +/// mul24(x), umul24(x) +/// Returns the least significant 32 bits of the product of the least +/// significant 24 bits of two signed/unsigned integers. +/// mulhi(x), umulhi(x) +/// Returns the most significant 32 bits of the product of the two 32-bit +/// signed/unsigned integers. +/// mul64hi(x), umul64hi(x) +/// Returns the most significant 64 bits of the product of the two 64-bit +/// signed/unsigned integers. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL int __imf_mul24(int, int); +__DPCPP_SYCL_EXTERNAL int __imf_mulhi(int, int); +__DPCPP_SYCL_EXTERNAL long long __imf_mul64hi(long long, long long); +__DPCPP_SYCL_EXTERNAL unsigned __imf_umul24(unsigned, unsigned); +__DPCPP_SYCL_EXTERNAL unsigned __imf_umulhi(unsigned, unsigned); +__DPCPP_SYCL_EXTERNAL unsigned long long __imf_umul64hi(unsigned long long, + unsigned long long); +}; + template int mul24(Tp x, Tp y) { return __imf_mul24(x, y); } template unsigned umul24(Tp x, Tp y) { @@ -123,12 +179,28 @@ unsigned long long umul64hi(Tp x, Tp y) { return __imf_umul64hi(x, y); } +/// -------------------------------------------------------------------------- +/// Counts the number of bits that are set to 1 in a 32-bit integer. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL int __imf_popc(unsigned); +__DPCPP_SYCL_EXTERNAL int __imf_popcll(unsigned long long); +}; + template int popc(Tp x) { return __imf_popc(x); } template int popcll(Tp x) { return __imf_popcll(x); } +/// -------------------------------------------------------------------------- +/// Returns |x - y| + z for unsigned/signed integers +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL unsigned __imf_sad(int, int, unsigned); +__DPCPP_SYCL_EXTERNAL unsigned __imf_usad(unsigned, unsigned, unsigned); +}; + template unsigned sad(Tp1 x, Tp1 y, Tp2 z) { return __imf_sad(x, y, z); @@ -137,11 +209,6 @@ unsigned sad(Tp1 x, Tp1 y, Tp2 z) { template unsigned usad(Tp x, Tp y, Tp z) { return __imf_usad(x, y, z); } - -template unsigned uhadd(Tp x, Tp y) { - return __imf_uhadd(x, y); -} - } // namespace ext::intel::math } // namespace _V1 } // namespace sycl diff --git a/sycl/include/sycl/ext/intel/math/imf_rounding_math.hpp b/sycl/include/sycl/ext/intel/math/imf_rounding_math.hpp index d895d587fc987..11558cdfbb541 100644 --- a/sycl/include/sycl/ext/intel/math/imf_rounding_math.hpp +++ b/sycl/include/sycl/ext/intel/math/imf_rounding_math.hpp @@ -10,70 +10,22 @@ #pragma once -extern "C" { -float __imf_fadd_rz(float, float); -float __imf_fadd_rn(float, float); -float __imf_fadd_ru(float, float); -float __imf_fadd_rd(float, float); -float __imf_fsub_rz(float, float); -float __imf_fsub_rn(float, float); -float __imf_fsub_ru(float, float); -float __imf_fsub_rd(float, float); -float __imf_fmul_rz(float, float); -float __imf_fmul_rn(float, float); -float __imf_fmul_ru(float, float); -float __imf_fmul_rd(float, float); -float __imf_fdiv_rz(float, float); -float __imf_fdiv_rn(float, float); -float __imf_fdiv_ru(float, float); -float __imf_fdiv_rd(float, float); -float __imf_frcp_rz(float); -float __imf_frcp_rn(float); -float __imf_frcp_ru(float); -float __imf_frcp_rd(float); -float __imf_fmaf_rz(float, float, float); -float __imf_fmaf_rn(float, float, float); -float __imf_fmaf_ru(float, float, float); -float __imf_fmaf_rd(float, float, float); -float __imf_sqrtf_rz(float); -float __imf_sqrtf_rn(float); -float __imf_sqrtf_ru(float); -float __imf_sqrtf_rd(float); - -double __imf_dadd_rz(double, double); -double __imf_dadd_rn(double, double); -double __imf_dadd_ru(double, double); -double __imf_dadd_rd(double, double); -double __imf_dsub_rz(double, double); -double __imf_dsub_rn(double, double); -double __imf_dsub_ru(double, double); -double __imf_dsub_rd(double, double); -double __imf_dmul_rz(double, double); -double __imf_dmul_rn(double, double); -double __imf_dmul_ru(double, double); -double __imf_dmul_rd(double, double); -double __imf_ddiv_rz(double, double); -double __imf_ddiv_rn(double, double); -double __imf_ddiv_ru(double, double); -double __imf_ddiv_rd(double, double); -double __imf_drcp_rz(double); -double __imf_drcp_rn(double); -double __imf_drcp_ru(double); -double __imf_drcp_rd(double); -double __imf_fma_rz(double, double, double); -double __imf_fma_rn(double, double, double); -double __imf_fma_ru(double, double, double); -double __imf_fma_rd(double, double, double); -double __imf_sqrt_rz(double); -double __imf_sqrt_rn(double); -double __imf_sqrt_ru(double); -double __imf_sqrt_rd(double); -}; - +#include namespace sycl { inline namespace _V1 { namespace ext::intel::math { +/// -------------------------------------------------------------------------- +/// Adds 2 float values +/// Supports rounding down/up/to-nearest-even/towards-zero +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL float __imf_fadd_rz(float, float); +__DPCPP_SYCL_EXTERNAL float __imf_fadd_rn(float, float); +__DPCPP_SYCL_EXTERNAL float __imf_fadd_ru(float, float); +__DPCPP_SYCL_EXTERNAL float __imf_fadd_rd(float, float); +}; + template Tp fadd_rd(Tp x, Tp y) { return __imf_fadd_rd(x, y); } @@ -90,6 +42,17 @@ template Tp fadd_rz(Tp x, Tp y) { return __imf_fadd_rz(x, y); } +/// -------------------------------------------------------------------------- +/// Substracts 2 float values +/// Supports rounding down/up/to-nearest-even/towards-zero +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL float __imf_fsub_rz(float, float); +__DPCPP_SYCL_EXTERNAL float __imf_fsub_rn(float, float); +__DPCPP_SYCL_EXTERNAL float __imf_fsub_ru(float, float); +__DPCPP_SYCL_EXTERNAL float __imf_fsub_rd(float, float); +}; + template Tp fsub_rd(Tp x, Tp y) { return __imf_fsub_rd(x, y); } @@ -106,6 +69,17 @@ template Tp fsub_rz(Tp x, Tp y) { return __imf_fsub_rz(x, y); } +/// -------------------------------------------------------------------------- +/// Multiplies 2 float values +/// Supports rounding down/up/to-nearest-even/towards-zero +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL float __imf_fmul_rz(float, float); +__DPCPP_SYCL_EXTERNAL float __imf_fmul_rn(float, float); +__DPCPP_SYCL_EXTERNAL float __imf_fmul_ru(float, float); +__DPCPP_SYCL_EXTERNAL float __imf_fmul_rd(float, float); +}; + template Tp fmul_rd(Tp x, Tp y) { return __imf_fmul_rd(x, y); } @@ -122,6 +96,17 @@ template Tp fmul_rz(Tp x, Tp y) { return __imf_fmul_rz(x, y); } +/// -------------------------------------------------------------------------- +/// Divides 2 float values +/// Supports rounding down/up/to-nearest-even/towards-zero +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL float __imf_fdiv_rz(float, float); +__DPCPP_SYCL_EXTERNAL float __imf_fdiv_rn(float, float); +__DPCPP_SYCL_EXTERNAL float __imf_fdiv_ru(float, float); +__DPCPP_SYCL_EXTERNAL float __imf_fdiv_rd(float, float); +}; + template Tp fdiv_rd(Tp x, Tp y) { return __imf_fdiv_rd(x, y); } @@ -138,6 +123,17 @@ template Tp fdiv_rz(Tp x, Tp y) { return __imf_fdiv_rz(x, y); } +/// -------------------------------------------------------------------------- +/// Reciprocal of a float value +/// Supports rounding down/up/to-nearest-even/towards-zero +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL float __imf_frcp_rz(float); +__DPCPP_SYCL_EXTERNAL float __imf_frcp_rn(float); +__DPCPP_SYCL_EXTERNAL float __imf_frcp_ru(float); +__DPCPP_SYCL_EXTERNAL float __imf_frcp_rd(float); +}; + template Tp frcp_rd(Tp x) { return __imf_frcp_rd(x); } template Tp frcp_rn(Tp x) { return __imf_frcp_rn(x); } @@ -146,6 +142,17 @@ template Tp frcp_ru(Tp x) { return __imf_frcp_ru(x); } template Tp frcp_rz(Tp x) { return __imf_frcp_rz(x); } +/// -------------------------------------------------------------------------- +/// Returns result of 'x * y + z' for float values +/// Supports rounding down/up/to-nearest-even/towards-zero +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL float __imf_fmaf_rz(float, float, float); +__DPCPP_SYCL_EXTERNAL float __imf_fmaf_rn(float, float, float); +__DPCPP_SYCL_EXTERNAL float __imf_fmaf_ru(float, float, float); +__DPCPP_SYCL_EXTERNAL float __imf_fmaf_rd(float, float, float); +}; + template Tp fmaf_rd(Tp x, Tp y, Tp z) { return __imf_fmaf_rd(x, y, z); } @@ -162,6 +169,17 @@ template Tp fmaf_rz(Tp x, Tp y, Tp z) { return __imf_fmaf_rz(x, y, z); } +/// -------------------------------------------------------------------------- +/// Square root of a float value +/// Supports rounding down/up/to-nearest-even/towards-zero +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL float __imf_sqrtf_rz(float); +__DPCPP_SYCL_EXTERNAL float __imf_sqrtf_rn(float); +__DPCPP_SYCL_EXTERNAL float __imf_sqrtf_ru(float); +__DPCPP_SYCL_EXTERNAL float __imf_sqrtf_rd(float); +}; + template Tp fsqrt_rd(Tp x) { return __imf_sqrtf_rd(x); } template Tp fsqrt_rn(Tp x) { return __imf_sqrtf_rn(x); } @@ -170,6 +188,17 @@ template Tp fsqrt_ru(Tp x) { return __imf_sqrtf_ru(x); } template Tp fsqrt_rz(Tp x) { return __imf_sqrtf_rz(x); } +/// -------------------------------------------------------------------------- +/// Adds 2 double values +/// Supports rounding down/up/to-nearest-even/towards-zero +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL double __imf_dadd_rz(double, double); +__DPCPP_SYCL_EXTERNAL double __imf_dadd_rn(double, double); +__DPCPP_SYCL_EXTERNAL double __imf_dadd_ru(double, double); +__DPCPP_SYCL_EXTERNAL double __imf_dadd_rd(double, double); +}; + template Tp dadd_rd(Tp x, Tp y) { return __imf_dadd_rd(x, y); } @@ -186,6 +215,17 @@ template Tp dadd_rz(Tp x, Tp y) { return __imf_dadd_rz(x, y); } +/// -------------------------------------------------------------------------- +/// Substracts 2 double values +/// Supports rounding down/up/to-nearest-even/towards-zero +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL double __imf_dsub_rz(double, double); +__DPCPP_SYCL_EXTERNAL double __imf_dsub_rn(double, double); +__DPCPP_SYCL_EXTERNAL double __imf_dsub_ru(double, double); +__DPCPP_SYCL_EXTERNAL double __imf_dsub_rd(double, double); +}; + template Tp dsub_rd(Tp x, Tp y) { return __imf_dsub_rd(x, y); } @@ -202,6 +242,17 @@ template Tp dsub_rz(Tp x, Tp y) { return __imf_dsub_rz(x, y); } +/// -------------------------------------------------------------------------- +/// Multiplies 2 double values +/// Supports rounding down/up/to-nearest-even/towards-zero +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL double __imf_dmul_rz(double, double); +__DPCPP_SYCL_EXTERNAL double __imf_dmul_rn(double, double); +__DPCPP_SYCL_EXTERNAL double __imf_dmul_ru(double, double); +__DPCPP_SYCL_EXTERNAL double __imf_dmul_rd(double, double); +}; + template Tp dmul_rd(Tp x, Tp y) { return __imf_dmul_rd(x, y); } @@ -218,6 +269,17 @@ template Tp dmul_rz(Tp x, Tp y) { return __imf_dmul_rz(x, y); } +/// -------------------------------------------------------------------------- +/// Divides 2 double values +/// Supports rounding down/up/to-nearest-even/towards-zero +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL double __imf_ddiv_rz(double, double); +__DPCPP_SYCL_EXTERNAL double __imf_ddiv_rn(double, double); +__DPCPP_SYCL_EXTERNAL double __imf_ddiv_ru(double, double); +__DPCPP_SYCL_EXTERNAL double __imf_ddiv_rd(double, double); +}; + template Tp ddiv_rd(Tp x, Tp y) { return __imf_ddiv_rd(x, y); } @@ -234,6 +296,17 @@ template Tp ddiv_rz(Tp x, Tp y) { return __imf_ddiv_rz(x, y); } +/// -------------------------------------------------------------------------- +/// Returns reciprocal of a double value +/// Supports rounding down/up/to-nearest-even/towards-zero +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL double __imf_drcp_rz(double); +__DPCPP_SYCL_EXTERNAL double __imf_drcp_rn(double); +__DPCPP_SYCL_EXTERNAL double __imf_drcp_ru(double); +__DPCPP_SYCL_EXTERNAL double __imf_drcp_rd(double); +}; + template Tp drcp_rd(Tp x) { return __imf_drcp_rd(x); } template Tp drcp_rn(Tp x) { return __imf_drcp_rn(x); } @@ -242,6 +315,17 @@ template Tp drcp_ru(Tp x) { return __imf_drcp_ru(x); } template Tp drcp_rz(Tp x) { return __imf_drcp_rz(x); } +/// -------------------------------------------------------------------------- +/// Returns result of 'x * y + z' for double values +/// Supports rounding down/up/to-nearest-even/towards-zero +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL double __imf_fma_rz(double, double, double); +__DPCPP_SYCL_EXTERNAL double __imf_fma_rn(double, double, double); +__DPCPP_SYCL_EXTERNAL double __imf_fma_ru(double, double, double); +__DPCPP_SYCL_EXTERNAL double __imf_fma_rd(double, double, double); +}; + template Tp fma_rd(Tp x, Tp y, Tp z) { return __imf_fma_rd(x, y, z); } @@ -258,6 +342,17 @@ template Tp fma_rz(Tp x, Tp y, Tp z) { return __imf_fma_rz(x, y, z); } +/// -------------------------------------------------------------------------- +/// Square root of a double value +/// Supports rounding down/up/to-nearest-even/towards-zero +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL double __imf_sqrt_rz(double); +__DPCPP_SYCL_EXTERNAL double __imf_sqrt_rn(double); +__DPCPP_SYCL_EXTERNAL double __imf_sqrt_ru(double); +__DPCPP_SYCL_EXTERNAL double __imf_sqrt_rd(double); +}; + template Tp dsqrt_rd(Tp x) { return __imf_sqrt_rd(x); } template Tp dsqrt_rn(Tp x) { return __imf_sqrt_rn(x); } diff --git a/sycl/include/sycl/ext/intel/math/imf_simd.hpp b/sycl/include/sycl/ext/intel/math/imf_simd.hpp index effcf460b88b8..2bccf5e2fbf15 100644 --- a/sycl/include/sycl/ext/intel/math/imf_simd.hpp +++ b/sycl/include/sycl/ext/intel/math/imf_simd.hpp @@ -10,136 +10,24 @@ #pragma once +#include #include -extern "C" { -unsigned int __imf_vabs2(unsigned int); -unsigned int __imf_vabs4(unsigned int); -unsigned int __imf_vneg2(unsigned int); -unsigned int __imf_vneg4(unsigned int); -unsigned int __imf_vnegss2(unsigned int); -unsigned int __imf_vnegss4(unsigned int); -unsigned int __imf_vabsdiffs2(unsigned int, unsigned int); -unsigned int __imf_vabsdiffs4(unsigned int, unsigned int); -unsigned int __imf_vabsdiffu2(unsigned int, unsigned int); -unsigned int __imf_vabsdiffu4(unsigned int, unsigned int); -unsigned int __imf_vabsss2(unsigned int); -unsigned int __imf_vabsss4(unsigned int); -unsigned int __imf_vadd2(unsigned int, unsigned int); -unsigned int __imf_vadd4(unsigned int, unsigned int); -unsigned int __imf_vaddss2(unsigned int, unsigned int); -unsigned int __imf_vaddss4(unsigned int, unsigned int); -unsigned int __imf_vaddus2(unsigned int, unsigned int); -unsigned int __imf_vaddus4(unsigned int, unsigned int); -unsigned int __imf_vsub2(unsigned int, unsigned int); -unsigned int __imf_vsub4(unsigned int, unsigned int); -unsigned int __imf_vsubss2(unsigned int, unsigned int); -unsigned int __imf_vsubss4(unsigned int, unsigned int); -unsigned int __imf_vsubus2(unsigned int, unsigned int); -unsigned int __imf_vsubus4(unsigned int, unsigned int); -unsigned int __imf_vhaddu2(unsigned int, unsigned int); -unsigned int __imf_vhaddu4(unsigned int, unsigned int); -unsigned int __imf_vavgs2(unsigned int, unsigned int); -unsigned int __imf_vavgs4(unsigned int, unsigned int); -unsigned int __imf_vavgu2(unsigned int, unsigned int); -unsigned int __imf_vavgu4(unsigned int, unsigned int); -unsigned int __imf_vcmpeq2(unsigned int, unsigned int); -unsigned int __imf_vcmpeq4(unsigned int, unsigned int); -unsigned int __imf_vcmpges2(unsigned int, unsigned int); -unsigned int __imf_vcmpges4(unsigned int, unsigned int); -unsigned int __imf_vcmpgeu2(unsigned int, unsigned int); -unsigned int __imf_vcmpgeu4(unsigned int, unsigned int); -unsigned int __imf_vcmpgts2(unsigned int, unsigned int); -unsigned int __imf_vcmpgts4(unsigned int, unsigned int); -unsigned int __imf_vcmpgtu2(unsigned int, unsigned int); -unsigned int __imf_vcmpgtu4(unsigned int, unsigned int); -unsigned int __imf_vcmples2(unsigned int, unsigned int); -unsigned int __imf_vcmples4(unsigned int, unsigned int); -unsigned int __imf_vcmpleu2(unsigned int, unsigned int); -unsigned int __imf_vcmpleu4(unsigned int, unsigned int); -unsigned int __imf_vcmplts2(unsigned int, unsigned int); -unsigned int __imf_vcmplts4(unsigned int, unsigned int); -unsigned int __imf_vcmpltu2(unsigned int, unsigned int); -unsigned int __imf_vcmpltu4(unsigned int, unsigned int); -unsigned int __imf_vcmpne2(unsigned int, unsigned int); -unsigned int __imf_vcmpne4(unsigned int, unsigned int); -unsigned int __imf_vmaxs2(unsigned int, unsigned int); -unsigned int __imf_vmaxs4(unsigned int, unsigned int); -unsigned int __imf_vmaxu2(unsigned int, unsigned int); -unsigned int __imf_vmaxu4(unsigned int, unsigned int); -unsigned int __imf_vmins2(unsigned int, unsigned int); -unsigned int __imf_vmins4(unsigned int, unsigned int); -unsigned int __imf_vminu2(unsigned int, unsigned int); -unsigned int __imf_vminu4(unsigned int, unsigned int); -unsigned int __imf_vseteq2(unsigned int, unsigned int); -unsigned int __imf_vseteq4(unsigned int, unsigned int); -unsigned int __imf_vsetne2(unsigned int, unsigned int); -unsigned int __imf_vsetne4(unsigned int, unsigned int); -unsigned int __imf_vsetges2(unsigned int, unsigned int); -unsigned int __imf_vsetges4(unsigned int, unsigned int); -unsigned int __imf_vsetgeu2(unsigned int, unsigned int); -unsigned int __imf_vsetgeu4(unsigned int, unsigned int); -unsigned int __imf_vsetgts2(unsigned int, unsigned int); -unsigned int __imf_vsetgts4(unsigned int, unsigned int); -unsigned int __imf_vsetgtu2(unsigned int, unsigned int); -unsigned int __imf_vsetgtu4(unsigned int, unsigned int); -unsigned int __imf_vsetles2(unsigned int, unsigned int); -unsigned int __imf_vsetles4(unsigned int, unsigned int); -unsigned int __imf_vsetleu2(unsigned int, unsigned int); -unsigned int __imf_vsetleu4(unsigned int, unsigned int); -unsigned int __imf_vsetlts2(unsigned int, unsigned int); -unsigned int __imf_vsetlts4(unsigned int, unsigned int); -unsigned int __imf_vsetltu2(unsigned int, unsigned int); -unsigned int __imf_vsetltu4(unsigned int, unsigned int); -unsigned int __imf_vsads2(unsigned int, unsigned int); -unsigned int __imf_vsads4(unsigned int, unsigned int); -unsigned int __imf_vsadu2(unsigned int, unsigned int); -unsigned int __imf_vsadu4(unsigned int, unsigned int); -unsigned int __imf_viaddmax_s16x2(unsigned int, unsigned int, unsigned int); -unsigned int __imf_viaddmax_s16x2_relu(unsigned int, unsigned int, - unsigned int); -int __imf_viaddmax_s32(int, int, int); -int __imf_viaddmax_s32_relu(int, int, int); -unsigned int __imf_viaddmax_u16x2(unsigned int, unsigned int, unsigned int); -unsigned int __imf_viaddmax_u32(unsigned int, unsigned int, unsigned int); - -unsigned int __imf_viaddmin_s16x2(unsigned int, unsigned int, unsigned int); -unsigned int __imf_viaddmin_s16x2_relu(unsigned int, unsigned int, - unsigned int); -int __imf_viaddmin_s32(int, int, int); -int __imf_viaddmin_s32_relu(int, int, int); -unsigned int __imf_viaddmin_u16x2(unsigned int, unsigned int, unsigned int); -unsigned int __imf_viaddmin_u32(unsigned int, unsigned int, unsigned int); -unsigned int __imf_vibmax_s16x2(unsigned int, unsigned int, bool *, bool *); -int __imf_vibmax_s32(int, int, bool *); -unsigned int __imf_vibmax_u16x2(unsigned int, unsigned int, bool *, bool *); -unsigned int __imf_vibmax_u32(unsigned int, unsigned int, bool *); -unsigned int __imf_vibmin_s16x2(unsigned int, unsigned int, bool *, bool *); -int __imf_vibmin_s32(int, int, bool *); -unsigned int __imf_vibmin_u16x2(unsigned int, unsigned int, bool *, bool *); -unsigned int __imf_vibmin_u32(unsigned int, unsigned int, bool *); -unsigned int __imf_vimax3_s16x2(unsigned int, unsigned int, unsigned int); -unsigned int __imf_vimax3_s16x2_relu(unsigned int, unsigned int, unsigned int); -unsigned int __imf_vimin3_s16x2(unsigned int, unsigned int, unsigned int); -unsigned int __imf_vimin3_s16x2_relu(unsigned int, unsigned int, unsigned int); -int __imf_vimax3_s32(int, int, int); -int __imf_vimax3_s32_relu(int, int, int); -int __imf_vimin3_s32(int, int, int); -int __imf_vimin3_s32_relu(int, int, int); -unsigned int __imf_vimax3_u16x2(unsigned int, unsigned int, unsigned int); -unsigned int __imf_vimax3_u32(unsigned int, unsigned int, unsigned int); -unsigned int __imf_vimin3_u16x2(unsigned int, unsigned int, unsigned int); -unsigned int __imf_vimin3_u32(unsigned int, unsigned int, unsigned int); -unsigned int __imf_vimax_s16x2_relu(unsigned int, unsigned int); -int __imf_vimax_s32_relu(int, int); -unsigned int __imf_vimin_s16x2_relu(unsigned int, unsigned int); -int __imf_vimin_s32_relu(int, int); -}; - namespace sycl { inline namespace _V1 { namespace ext::intel::math { +/// -------------------------------------------------------------------------- +/// vabs2(x) +/// Returns per-halfword absolute value. +/// vabs4(x) +/// Returns per-byte absolute value. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vabs2(unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vabs4(unsigned int); +}; + template std::enable_if_t, unsigned int> vabs2(Tp x) { return __imf_vabs2(x); @@ -150,6 +38,17 @@ std::enable_if_t, unsigned int> vabs4(Tp x) { return __imf_vabs4(x); } +/// -------------------------------------------------------------------------- +/// vneg2(x) +/// Returns per-halfword negation. +/// vneg4(x) +/// Returns per-byte negation. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vneg2(unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vneg4(unsigned int); +}; + template std::enable_if_t, unsigned int> vneg2(Tp x) { return __imf_vneg2(x); @@ -160,6 +59,17 @@ std::enable_if_t, unsigned int> vneg4(Tp x) { return __imf_vneg4(x); } +/// -------------------------------------------------------------------------- +/// vnegss2(x) +/// Returns per-halfword negation with signed saturation. +/// vnegss4(x) +/// Returns per-byte negation with signed saturation. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vnegss2(unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vnegss4(unsigned int); +}; + template std::enable_if_t, unsigned int> vnegss2(Tp x) { return __imf_vnegss2(x); @@ -170,6 +80,19 @@ std::enable_if_t, unsigned int> vnegss4(Tp x) { return __imf_vnegss4(x); } +/// -------------------------------------------------------------------------- +/// vabsdiffs2(x), vabsdiffu2(x) +/// Returns per-halfword absolute difference of signed/unsigned integer. +/// vabsdiffs4(x), vabsdiffu4(x) +/// Returns per-byte absolute difference of signed/unsigned integer. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vabsdiffs2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vabsdiffs4(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vabsdiffu2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vabsdiffu4(unsigned int, unsigned int); +}; + template std::enable_if_t, unsigned int> vabsdiffs2(Tp x, Tp y) { @@ -194,6 +117,17 @@ vabsdiffu4(Tp x, Tp y) { return __imf_vabsdiffu4(x, y); } +/// -------------------------------------------------------------------------- +/// vabsss2(x) +/// Returns per-halfword absolute value with signed saturation. +/// vabsss4(x) +/// Returns per-byte absolute value with signed saturation. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vabsss2(unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vabsss4(unsigned int); +}; + template std::enable_if_t, unsigned int> vabsss2(Tp x) { return __imf_vabsss2(x); @@ -204,6 +138,17 @@ std::enable_if_t, unsigned int> vabsss4(Tp x) { return __imf_vabsss4(x); } +/// -------------------------------------------------------------------------- +/// vadd2(x, y) +/// Returns per-halfword unsigned addition ignoring overflow. +/// vadd4(x, y) +/// Returns per-byte unsigned addition ignoring overflow. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vadd2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vadd4(unsigned int, unsigned int); +}; + template std::enable_if_t, unsigned int> vadd2(Tp x, Tp y) { @@ -216,6 +161,17 @@ std::enable_if_t, unsigned int> vadd4(Tp x, return __imf_vadd4(x, y); } +/// -------------------------------------------------------------------------- +/// vaddss2(x, y) +/// Returns per-halfword signed addition with signed saturation. +/// vaddss4(x, y) +/// Returns per-byte signed addition with signed saturation. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vaddss2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vaddss4(unsigned int, unsigned int); +}; + template std::enable_if_t, unsigned int> vaddss2(Tp x, Tp y) { @@ -228,6 +184,17 @@ std::enable_if_t, unsigned int> vaddss4(Tp x, return __imf_vaddss4(x, y); } +/// -------------------------------------------------------------------------- +/// vaddus2(x, y) +/// Returns per-halfword unsigned addition with unsigned saturation. +/// vaddus4(x, y) +/// Returns per-byte unsigned addition with unsigned saturation. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vaddus2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vaddus4(unsigned int, unsigned int); +}; + template std::enable_if_t, unsigned int> vaddus2(Tp x, Tp y) { @@ -240,6 +207,17 @@ std::enable_if_t, unsigned int> vaddus4(Tp x, return __imf_vaddus4(x, y); } +/// -------------------------------------------------------------------------- +/// vsub2(x, y) +/// Returns per-halfword unsigned subtraction ignoring overflow. +/// vsub4(x, y) +/// Returns per-byte unsigned subtraction ignoring overflow. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vsub2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vsub4(unsigned int, unsigned int); +}; + template std::enable_if_t, unsigned int> vsub2(Tp x, Tp y) { @@ -252,6 +230,17 @@ std::enable_if_t, unsigned int> vsub4(Tp x, return __imf_vsub4(x, y); } +/// -------------------------------------------------------------------------- +/// vsubss2(x, y) +/// Returns per-halfword signed subtraction with signed saturation. +/// vsubss4(x, y) +/// Returns per-byte signed subtraction with signed saturation. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vsubss2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vsubss4(unsigned int, unsigned int); +}; + template std::enable_if_t, unsigned int> vsubss2(Tp x, Tp y) { @@ -264,6 +253,17 @@ std::enable_if_t, unsigned int> vsubss4(Tp x, return __imf_vsubss4(x, y); } +/// -------------------------------------------------------------------------- +/// vsubus2(x, y) +/// Returns per-halfword unsigned subtraction with unsigned saturation. +/// vsubus4(x, y) +/// Returns per-byte unsigned subtraction with unsigned saturation. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vsubus2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vsubus4(unsigned int, unsigned int); +}; + template std::enable_if_t, unsigned int> vsubus2(Tp x, Tp y) { @@ -276,6 +276,17 @@ std::enable_if_t, unsigned int> vsubus4(Tp x, return __imf_vsubus4(x, y); } +/// -------------------------------------------------------------------------- +/// vhaddu2(x, y) +/// Returns per-halfword unsigned average. +/// vhaddu4(x, y) +/// Returns per-byte unsigned average. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vhaddu2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vhaddu4(unsigned int, unsigned int); +}; + template std::enable_if_t, unsigned int> vhaddu2(Tp x, Tp y) { @@ -288,6 +299,23 @@ std::enable_if_t, unsigned int> vhaddu4(Tp x, return __imf_vhaddu4(x, y); } +/// -------------------------------------------------------------------------- +/// vavgs2(x, y) +/// Returns per-halfword signed rounded average. +/// vavgs4(x, y) +/// Returns per-byte signed rounded average. +/// vavgu2(x, y) +/// Returns per-halfword unsigned rounded average. +/// vavgu4(x, y) +/// Returns per-byte unsigned rounded average. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vavgs2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vavgs4(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vavgu2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vavgu4(unsigned int, unsigned int); +}; + template std::enable_if_t, unsigned int> vavgs2(Tp x, Tp y) { @@ -312,6 +340,38 @@ std::enable_if_t, unsigned int> vavgu4(Tp x, return __imf_vavgu4(x, y); } +/// -------------------------------------------------------------------------- +/// Performs per-halfword/byte comparison and returns the combined result. +/// Splits input unsigned integer into 2 0r 4 parts, compares corresponding +/// parts. Returns 0xFFFF/0xFF if corresponding comparison succeeds, otherwise +/// returns 0. The supported comparison includes: +/// eq(==), ges(>= for signed), geu(>= for unsigned), gts(> for signed), +/// gtu(> for unsigned), les(<= for signed), leu(<= for unsigned), +/// lts(< for signed), ltu(< for unsigned), ne(!=). +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmpeq2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmpeq4(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmpges2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmpges4(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmpgeu2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmpgeu4(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmpgts2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmpgts4(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmpgtu2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmpgtu4(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmples2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmples4(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmpleu2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmpleu4(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmplts2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmplts4(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmpltu2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmpltu4(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmpne2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vcmpne4(unsigned int, unsigned int); +}; + template std::enable_if_t, unsigned int> vcmpeq2(Tp x, Tp y) { @@ -432,6 +492,27 @@ std::enable_if_t, unsigned int> vcmpne4(Tp x, return __imf_vcmpne4(x, y); } +/// -------------------------------------------------------------------------- +/// vmaxs2(x, y), vmaxu2(x, y) +/// Returns per-halfword signed/unsigned maximum value. +/// vmaxs4(x, y), vmaxu4(x, y) +/// Returns per-byte signed/unsigned maximum value. +/// vmins2(x, y), vminu2(x, y) +/// Returns per-halfword signed/unsigned minimum value. +/// vmins4(x, y), vminu4(x, y) +/// Returns per-byte signed/unsigned minimum value. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vmaxs2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vmaxs4(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vmaxu2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vmaxu4(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vmins2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vmins4(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vminu2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vminu4(unsigned int, unsigned int); +}; + template std::enable_if_t, unsigned int> vmaxs2(Tp x, Tp y) { @@ -480,6 +561,33 @@ std::enable_if_t, unsigned int> vminu4(Tp x, return __imf_vminu4(x, y); } +/// -------------------------------------------------------------------------- +/// Similar to vcmp* functions but returns 1 if corresponding comparison +/// succeeds. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vseteq2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vseteq4(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetne2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetne4(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetges2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetges4(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetgeu2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetgeu4(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetgts2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetgts4(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetgtu2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetgtu4(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetles2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetles4(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetleu2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetleu4(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetlts2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetlts4(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetltu2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vsetltu4(unsigned int, unsigned int); +}; + template std::enable_if_t, unsigned int> vseteq2(Tp x, Tp y) { @@ -600,6 +708,19 @@ vsetltu4(Tp x, Tp y) { return __imf_vsetltu4(x, y); } +/// -------------------------------------------------------------------------- +/// vsads2(x, y), vsadu2(x, y) +/// Returns per-halfword sum of absolute difference of signed/unsigned. +/// vsads4(x, y), vsadu4(x, y) +/// Returns per-byte sum of absolute difference of signed/unsigned. +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vsads2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vsads4(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vsadu2(unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vsadu4(unsigned int, unsigned int); +}; + template std::enable_if_t, unsigned int> vsads2(Tp x, Tp y) { @@ -624,6 +745,59 @@ std::enable_if_t, unsigned int> vsadu4(Tp x, return __imf_vsadu4(x, y); } +/// -------------------------------------------------------------------------- +/// viaddmax_s16x2(x, y, z), viaddmax_u16x2(x, y, z) +/// Splits 4 bytes of each input into 2 half-word values which are interpreted +/// as signed/unsigned shorts. For corresponding parts function performs: +/// max(x_part + y_part), z_part). Partial results are recombined and returned +/// as unsigned integer. +/// viaddmax_s32(x, y, z), viaddmax_u32(x, y, z) +/// Returns max(x + y, z). +/// viaddmax_s16x2_relu(x, y, z) +/// Splits 4 bytes of each input into 2 half-word values which are interpreted +/// as signed shorts. For corresponding parts function performs: +/// max(max(x_part + y_part, z_part), 0). Partial results are recombined and +/// returned as unsigned integer. +/// viaddmax_s32_relu(x, y, z) +/// Returns max(max(x + y, z), 0). +/// viaddmin_s16x2(x, y, z), viaddmin_u16x2(x, y, z) +/// Splits 4 bytes of each input into 2 half-word values which are interpreted +/// as signed/unsigned shorts. For corresponding parts function performs: +/// min(x_part + y_part), z_part). Partial results are recombined and returned +/// as unsigned integer. +/// viaddmin_s32(x, y, z), viaddmin_u32(x, y, z) +/// Returns min(x + y, z). +/// viaddmin_s16x2_relu(x, y, z) +/// Splits 4 bytes of each input into 2 half-word values which are interpreted +/// as signed shorts. For corresponding parts function performs: +/// max(min(x_part + y_part, z_part), 0). Partial results are recombined and +/// returned as unsigned integer. +/// viaddmin_s32_relu(x, y, z) +/// Returns max(min(x + y, z), 0). +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL unsigned int +__imf_viaddmax_s16x2(unsigned int, unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int +__imf_viaddmax_s16x2_relu(unsigned int, unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL int __imf_viaddmax_s32(int, int, int); +__DPCPP_SYCL_EXTERNAL int __imf_viaddmax_s32_relu(int, int, int); +__DPCPP_SYCL_EXTERNAL unsigned int +__imf_viaddmax_u16x2(unsigned int, unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int +__imf_viaddmax_u32(unsigned int, unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int +__imf_viaddmin_s16x2(unsigned int, unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int +__imf_viaddmin_s16x2_relu(unsigned int, unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL int __imf_viaddmin_s32(int, int, int); +__DPCPP_SYCL_EXTERNAL int __imf_viaddmin_s32_relu(int, int, int); +__DPCPP_SYCL_EXTERNAL unsigned int +__imf_viaddmin_u16x2(unsigned int, unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int +__imf_viaddmin_u32(unsigned int, unsigned int, unsigned int); +}; + template std::enable_if_t, unsigned int> viaddmax_s16x2(Tp x, Tp y, Tp z) { @@ -694,6 +868,41 @@ viaddmin_u32(Tp x, Tp y, Tp z) { return __imf_viaddmin_u32(x, y, z); } +/// -------------------------------------------------------------------------- +/// vibmax_s16x2(x, y, p_hi, p_lo), vibmax_u16x2(x, y, p_hi, p_lo) +/// Splits 4 bytes of each input into 2 half-word values which are interpreted +/// as signed/unsigned shorts. For corresponding parts, return max(x_part, +/// y_part). Partial results are recombined and returned as unsigned int. +/// 'p_hi' and 'p_lo' are 2 bool pointers, set the result: +/// *p_hi = (x_high >= y_high), *p_lo = (x_low >= y_low) +/// vibmax_s32(x, y, p), vibmax_u32(x, y, p) +/// Returns max(x, y), also sets: *p = (x >= y). +/// vibmin_s16x2(x, y, p_hi, p_lo), vibmin_u16x2(x, y, p_hi, p_lo) +/// Splits 4 bytes of each input into 2 half-word values which are interpreted +/// as signed/unsigned shorts. For corresponding parts, return min(x_part, +/// y_part). Partial results are recombined and returned as unsigned int. +/// 'p_hi' and 'p_lo' are 2 bool pointers, set the result: +/// *p_hi = (x_high <= y_high), *p_lo = (x_low <= y_low) +/// vibmin_s32(x, y, p), vibmin_u32(x, y, p) +/// Returns min(x, y), also sets: *p = (x <= y). +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL unsigned int +__imf_vibmax_s16x2(unsigned int, unsigned int, bool *, bool *); +__DPCPP_SYCL_EXTERNAL int __imf_vibmax_s32(int, int, bool *); +__DPCPP_SYCL_EXTERNAL unsigned int +__imf_vibmax_u16x2(unsigned int, unsigned int, bool *, bool *); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vibmax_u32(unsigned int, unsigned int, + bool *); +__DPCPP_SYCL_EXTERNAL unsigned int +__imf_vibmin_s16x2(unsigned int, unsigned int, bool *, bool *); +__DPCPP_SYCL_EXTERNAL int __imf_vibmin_s32(int, int, bool *); +__DPCPP_SYCL_EXTERNAL unsigned int +__imf_vibmin_u16x2(unsigned int, unsigned int, bool *, bool *); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vibmin_u32(unsigned int, unsigned int, + bool *); +}; + template std::enable_if_t, unsigned int> vibmax_s16x2(Tp x, Tp y, bool *p_hi, bool *p_lo) { @@ -740,6 +949,44 @@ vibmin_u32(Tp x, Tp y, bool *p) { return __imf_vibmin_u32(x, y, p); } +/// -------------------------------------------------------------------------- +/// vimax3_s16x2(x, y, z), vimax3_u16x2(x, y, z) +/// Splits 4 bytes of each input into 2 half-word values which are interpreted +/// as signed/unsigned shorts. For corresponding parts, returns +/// max(x_part, y_part, z_part). Partial results are recombined and returned as +/// unsigned int. +/// vimax3_s32(x, y, z), vimax3_u32(x, y, z) +/// Reinterpret input as signed int and returns max(x, y, z). +/// vimin3_s16x2(x, y, z), vimin3_u16x2(x, y, z) +/// Splits 4 bytes of each input into 2 half-word values which are interpreted +/// as signed/unsigned shorts. Returns min(x_part, y_part, z_part) for each +/// part. Partial results are recombined and returned as unsigned int. +/// vimin3_s32(x, y, z), vimin3_u32(x, y, z) +/// Reinterpret input as signed int and returns min(x, y, z). +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL unsigned int +__imf_vimax3_s16x2(unsigned int, unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int +__imf_vimax3_s16x2_relu(unsigned int, unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int +__imf_vimin3_s16x2(unsigned int, unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int +__imf_vimin3_s16x2_relu(unsigned int, unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL int __imf_vimax3_s32(int, int, int); +__DPCPP_SYCL_EXTERNAL int __imf_vimax3_s32_relu(int, int, int); +__DPCPP_SYCL_EXTERNAL int __imf_vimin3_s32(int, int, int); +__DPCPP_SYCL_EXTERNAL int __imf_vimin3_s32_relu(int, int, int); +__DPCPP_SYCL_EXTERNAL unsigned int +__imf_vimax3_u16x2(unsigned int, unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vimax3_u32(unsigned int, unsigned int, + unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int +__imf_vimin3_u16x2(unsigned int, unsigned int, unsigned int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vimin3_u32(unsigned int, unsigned int, + unsigned int); +}; + template std::enable_if_t, unsigned int> vimax3_s16x2(Tp x, Tp y, Tp z) { @@ -810,6 +1057,29 @@ vimin3_u32(Tp x, Tp y, Tp z) { return __imf_vimin3_u32(x, y, z); } +/// -------------------------------------------------------------------------- +/// vimax_s16x2_relu(x, y) +/// Splits 4 bytes of each input into 2 half-word values which are interpreted +/// as signed shorts. Returns max(x_part, y_part, 0) for each part. Partial +/// results are recombined and returned as unsigned int. +/// vimax_s32_relu(x, y) +/// Reinterpret input as signed int and returns max(x, y, 0). +/// vimin_s16x2_relu(x, y) +/// Splits 4 bytes of each input into 2 half-word values which are interpreted +/// as signed shorts. Returns max(min(x_part, y_part), 0) for each part. Partial +/// results are recombined and returned as unsigned int. +/// vimin_s32_relu(x, y) +/// Reinterpret input as signed int and returns max(min(x, y), 0). +/// -------------------------------------------------------------------------- +extern "C" { +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vimax_s16x2_relu(unsigned int, + unsigned int); +__DPCPP_SYCL_EXTERNAL int __imf_vimax_s32_relu(int, int); +__DPCPP_SYCL_EXTERNAL unsigned int __imf_vimin_s16x2_relu(unsigned int, + unsigned int); +__DPCPP_SYCL_EXTERNAL int __imf_vimin_s32_relu(int, int); +}; + template std::enable_if_t, int> vimax_s32_relu(Tp x, Tp y) { return __imf_vimax_s32_relu(x, y);