| device_functions.h | | device_functions.h | |
| | | | |
| skipping to change at line 139 | | skipping to change at line 139 | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __log10f(float) __THROW; | | extern __device__ float __log10f(float) __THROW; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __logf(float) __THROW; | | extern __device__ float __logf(float) __THROW; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __powf(float, float) __THROW; | | extern __device__ float __powf(float, float) __THROW; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| | | extern __device__ int __float2int_rn(float); | |
| | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __float2int_rz(float); | | extern __device__ int __float2int_rz(float); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __float2int_ru(float); | | extern __device__ int __float2int_ru(float); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __float2int_rd(float); | | extern __device__ int __float2int_rd(float); | |
|
| /*DEVICE_BUILTIN*/ | | | |
| extern __device__ int __float2int_rn(float); | | | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| | | extern __device__ unsigned int __float2uint_rn(float); | |
| | | /*DEVICE_BUILTIN*/ | |
| extern __device__ unsigned int __float2uint_rz(float); | | extern __device__ unsigned int __float2uint_rz(float); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ unsigned int __float2uint_ru(float); | | extern __device__ unsigned int __float2uint_ru(float); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ unsigned int __float2uint_rd(float); | | extern __device__ unsigned int __float2uint_rd(float); | |
|
| /*DEVICE_BUILTIN*/ | | | |
| extern __device__ unsigned int __float2uint_rn(float); | | | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| | | extern __device__ float __int2float_rn(int); | |
| | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __int2float_rz(int); | | extern __device__ float __int2float_rz(int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __int2float_ru(int); | | extern __device__ float __int2float_ru(int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __int2float_rd(int); | | extern __device__ float __int2float_rd(int); | |
|
| /*DEVICE_BUILTIN*/ | | | |
| extern __device__ float __int2float_rn(int); | | | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| | | extern __device__ float __uint2float_rn(unsigned int); | |
| | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __uint2float_rz(unsigned int); | | extern __device__ float __uint2float_rz(unsigned int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __uint2float_ru(unsigned int); | | extern __device__ float __uint2float_ru(unsigned int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __uint2float_rd(unsigned int); | | extern __device__ float __uint2float_rd(unsigned int); | |
|
| /*DEVICE_BUILTIN*/ | | | |
| extern __device__ float __uint2float_rn(unsigned int); | | | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| | | extern __device__ long long int __float2ll_rn(float); | |
| | | /*DEVICE_BUILTIN*/ | |
| extern __device__ long long int __float2ll_rz(float); | | extern __device__ long long int __float2ll_rz(float); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| extern __device__ long long int __float2ll_rn(float); | | extern __device__ long long int __float2ll_ru(float); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ long long int __float2ll_rd(float); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| | | extern __device__ unsigned long long int __float2ull_rn(float); | |
| | | /*DEVICE_BUILTIN*/ | |
| extern __device__ unsigned long long int __float2ull_rz(float); | | extern __device__ unsigned long long int __float2ull_rz(float); | |
|
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ unsigned long long int __float2ull_ru(float); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ unsigned long long int __float2ull_rd(float); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __ll2float_rn(long long int); | | extern __device__ float __ll2float_rn(long long int); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __ull2float_rn(unsigned long long int); | | extern __device__ float __ull2float_rn(unsigned long long int); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| | | extern __device__ unsigned short __float2half_rn(float); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __half2float(unsigned short); | |
| | | | |
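The newly added __float2int_rn (together with the _rz/_ru/_rd variants around it) exposes all four IEEE rounding directions for float-to-int conversion. A minimal sketch of how they differ, assuming a CUDA toolchain (kernel and buffer names are illustrative):

    /* Hedged sketch: the four rounding directions on the same input.
       For x = 2.5f the results are {2, 2, 3, 2}: round-to-nearest-even
       picks the even neighbor, _rz truncates, _ru/_rd round up/down. */
    __global__ void float2int_demo(float x, int *out)
    {
        out[0] = __float2int_rn(x);  /* nearest-even */
        out[1] = __float2int_rz(x);  /* toward zero  */
        out[2] = __float2int_ru(x);  /* toward +inf  */
        out[3] = __float2int_rd(x);  /* toward -inf  */
    }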
| | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __fadd_rn(float, float); | | extern __device__ float __fadd_rn(float, float); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __fadd_rz(float, float); | | extern __device__ float __fadd_rz(float, float); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __fadd_ru(float, float); | | extern __device__ float __fadd_ru(float, float); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __fadd_rd(float, float); | | extern __device__ float __fadd_rd(float, float); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __fmul_rn(float, float); | | extern __device__ float __fmul_rn(float, float); | |
| | | | |
| skipping to change at line 260 | | skipping to change at line 275 | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __clzll(long long int); | | extern __device__ int __clzll(long long int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __ffsll(long long int); | | extern __device__ int __ffsll(long long int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __popcll(unsigned long long int); | | extern __device__ int __popcll(unsigned long long int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ unsigned long long int __brevll(unsigned long long int); | | extern __device__ unsigned long long int __brevll(unsigned long long int); | |
| | | | |
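The 64-bit bit-manipulation intrinsics declared here mirror their 32-bit counterparts. A hedged usage sketch (kernel name illustrative):

    /* Hedged sketch: for v = 1ULL the results are clz = 63, ffs = 1
       (1-based index of the least significant set bit), popc = 1, and
       __brevll mirrors the word, giving 0x8000000000000000. */
    __global__ void bits64_demo(unsigned long long v, int *out,
                                unsigned long long *rev)
    {
        out[0] = __clzll((long long)v);   /* count leading zeros */
        out[1] = __ffsll((long long)v);   /* find first set bit  */
        out[2] = __popcll(v);             /* population count    */
        *rev   = __brevll(v);             /* bit reversal        */
    }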
|
| #if !defined(CUDA_NO_SM_13_DOUBLE_INTRINSICS) | | #if (__CUDA_ARCH__ >= 130) | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __double2int_rz(double); | | extern __device__ int __double2int_rz(double); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ unsigned int __double2uint_rz(double); | | extern __device__ unsigned int __double2uint_rz(double); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ long long int __double2ll_rz(double); | | extern __device__ long long int __double2ll_rz(double); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ unsigned long long int __double2ull_rz(double); | | extern __device__ unsigned long long int __double2ull_rz(double); | |
| | | | |
|
| #endif /* ! CUDA_NO_SM_13_DOUBLE_INTRINSICS */ | | #endif /* __CUDA_ARCH__ >= 130 */ | |
| | | | |
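The guard change above replaces the user-controlled opt-out macro with an architecture test: the double-precision intrinsics are now visible only when compiling for sm_13 or newer. A hedged sketch of what this means for device code (the fallback branch is illustrative):

    /* Hedged sketch: __double2int_rz is available only under the new
       guard; on older targets doubles are demoted to float anyway. */
    __global__ void d2i_demo(double d, int *out)
    {
    #if (__CUDA_ARCH__ >= 130)
        *out = __double2int_rz(d);   /* true double -> int, toward zero */
    #else
        *out = (int)(float)d;        /* pre-sm_13 fallback (illustrative) */
    #endif
    }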
| } | | } | |
| | | | |
| /******************************************************************************* | | /******************************************************************************* | |
| *                                                                             * | | *                                                                             * | |
| *                                                                             * | | *                                                                             * | |
| *                                                                             * | | *                                                                             * | |
| *******************************************************************************/ | | *******************************************************************************/ | |
| | | | |
| static __inline__ __device__ int mulhi(int a, int b) | | static __inline__ __device__ int mulhi(int a, int b) | |
| | | | |
| skipping to change at line 751 | | skipping to change at line 766 | |
| inc = abs(rem0) < abs(rem1); | | inc = abs(rem0) < abs(rem1); | |
| resi = ((expo_res << 23) + r + inc); | | resi = ((expo_res << 23) + r + inc); | |
| if (resi != 0x00800000) resi = 0; | | if (resi != 0x00800000) resi = 0; | |
| return __int_as_float(sign | resi); | | return __int_as_float(sign | resi); | |
| } | | } | |
| } | | } | |
| if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | | if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | |
| divisor *= 0.25f; | | divisor *= 0.25f; | |
| dividend *= 0.25f; | | dividend *= 0.25f; | |
| } | | } | |
|
| return dividend / divisor; | | return __fdividef (dividend, divisor); | |
| } | | } | |
| | | | |
| __device_func__(float __fdiv_rz (float dividend, float divisor)) | | __device_func__(float __fdiv_rz (float dividend, float divisor)) | |
| { | | { | |
| unsigned long long prod; | | unsigned long long prod; | |
| unsigned r, f, x, y, expox, expoy, sign; | | unsigned r, f, x, y, expox, expoy, sign; | |
| unsigned expo_res; | | unsigned expo_res; | |
| unsigned resi, cvtxi, cvtyi; | | unsigned resi, cvtxi, cvtyi; | |
| float t; | | float t; | |
| | | | |
| | | | |
| skipping to change at line 819 | | skipping to change at line 834 | |
| if (rem1 < 0) r--; | | if (rem1 < 0) r--; | |
| resi = ((expo_res << 23) + r); | | resi = ((expo_res << 23) + r); | |
| if (resi != 0x00800000) resi = 0; | | if (resi != 0x00800000) resi = 0; | |
| return __int_as_float(sign | resi); | | return __int_as_float(sign | resi); | |
| } | | } | |
| } | | } | |
| if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | | if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | |
| divisor *= 0.25f; | | divisor *= 0.25f; | |
| dividend *= 0.25f; | | dividend *= 0.25f; | |
| } | | } | |
|
| return dividend / divisor; | | return __fdividef (dividend, divisor); | |
| } | | } | |
| | | | |
| __device_func__(float __fdiv_ru (float dividend, float divisor)) | | __device_func__(float __fdiv_ru (float dividend, float divisor)) | |
| { | | { | |
| unsigned long long prod; | | unsigned long long prod; | |
| unsigned r, f, x, y, expox, expoy, sign; | | unsigned r, f, x, y, expox, expoy, sign; | |
| unsigned expo_res; | | unsigned expo_res; | |
| unsigned resi, cvtxi, cvtyi; | | unsigned resi, cvtxi, cvtyi; | |
| float t; | | float t; | |
| | | | |
| | | | |
| skipping to change at line 889 | | skipping to change at line 904 | |
| if ((rem1 > 0) && (!sign)) r++; | | if ((rem1 > 0) && (!sign)) r++; | |
| resi = ((expo_res << 23) + r); | | resi = ((expo_res << 23) + r); | |
| if (resi != 0x00800000) resi = 0; | | if (resi != 0x00800000) resi = 0; | |
| return __int_as_float(sign | resi); | | return __int_as_float(sign | resi); | |
| } | | } | |
| } | | } | |
| if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | | if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | |
| divisor *= 0.25f; | | divisor *= 0.25f; | |
| dividend *= 0.25f; | | dividend *= 0.25f; | |
| } | | } | |
|
| return dividend / divisor; | | return __fdividef (dividend, divisor); | |
| } | | } | |
| | | | |
| __device_func__(float __fdiv_rd (float dividend, float divisor)) | | __device_func__(float __fdiv_rd (float dividend, float divisor)) | |
| { | | { | |
| unsigned long long prod; | | unsigned long long prod; | |
| unsigned r, f, x, y, expox, expoy, sign; | | unsigned r, f, x, y, expox, expoy, sign; | |
| unsigned expo_res; | | unsigned expo_res; | |
| unsigned resi, cvtxi, cvtyi; | | unsigned resi, cvtxi, cvtyi; | |
| float t; | | float t; | |
| | | | |
| | | | |
| skipping to change at line 959 | | skipping to change at line 974 | |
| if ((rem1 > 0) && (sign)) r++; | | if ((rem1 > 0) && (sign)) r++; | |
| resi = ((expo_res << 23) + r); | | resi = ((expo_res << 23) + r); | |
| if (resi != 0x00800000) resi = 0; | | if (resi != 0x00800000) resi = 0; | |
| return __int_as_float(sign | resi); | | return __int_as_float(sign | resi); | |
| } | | } | |
| } | | } | |
| if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | | if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | |
| divisor *= 0.25f; | | divisor *= 0.25f; | |
| dividend *= 0.25f; | | dividend *= 0.25f; | |
| } | | } | |
|
| return dividend / divisor; | | return __fdividef (dividend, divisor); | |
| } | | } | |
| | | | |
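Each of the __fdiv_* kernels above ends with the same slow path: when the divisor's magnitude exceeds 2^126, both operands are pre-scaled by 0.25f before the fall-through divide (now routed through __fdividef). The scale is exact because it only decrements exponents, so the quotient is unchanged while the divisor is pulled back into the range where a reciprocal cannot underflow. A minimal sketch of the shared idea, with illustrative names:

    /* Hedged sketch of the slow-path scaling trick used above: a 0.25f
       pre-scale is exact in binary FP, so dividend/divisor is unchanged,
       but the divisor leaves the range |b| > 2^126 where a reciprocal-
       based divide such as __fdividef would underflow to zero. */
    static __device__ float scaled_div(float dividend, float divisor)
    {
        if (fabsf(divisor) > CUDART_TWO_TO_126_F) {
            divisor  *= 0.25f;
            dividend *= 0.25f;
        }
        return __fdividef(dividend, divisor);
    }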
| __device_func__(float __fadd_ru (float a, float b)) | | __device_func__(float __fadd_ru (float a, float b)) | |
| { | | { | |
| unsigned int expo_x, expo_y; | | unsigned int expo_x, expo_y; | |
| unsigned int xxi, yyi, temp; | | unsigned int xxi, yyi, temp; | |
| | | | |
| xxi = __float_as_int(a); | | xxi = __float_as_int(a); | |
| yyi = __float_as_int(b); | | yyi = __float_as_int(b); | |
| | | | |
| | | | |
| skipping to change at line 2921 | | skipping to change at line 2937 | |
| while ((int)cvty.i >= 0) { | | while ((int)cvty.i >= 0) { | |
| expoy--; | | expoy--; | |
| cvty.i = cvty.i + cvty.i; | | cvty.i = cvty.i + cvty.i; | |
| } | | } | |
| cvty.i = cvty.i >> 8; | | cvty.i = cvty.i >> 8; | |
| } | | } | |
| goto divide; | | goto divide; | |
| } | | } | |
| } | | } | |
| | | | |
|
| __device_func__(float __internal_fmul_kernel2 (float a, float b, | | __device_func__(float __internal_fmul_kernel (float a, float b, | |
| enum cudaRoundMode mode)) | | enum cudaRoundMode mode)) | |
| { | | { | |
| unsigned long long product; | | unsigned long long product; | |
| volatile union __cudart_FloatUintCvt xx, yy; | | volatile union __cudart_FloatUintCvt xx, yy; | |
|
| | | | |
| unsigned expo_x, expo_y; | | unsigned expo_x, expo_y; | |
| | | | |
| xx.f = a; | | xx.f = a; | |
| yy.f = b; | | yy.f = b; | |
| | | | |
| expo_y = 0xFF; | | expo_y = 0xFF; | |
| expo_x = expo_y & (xx.i >> 23); | | expo_x = expo_y & (xx.i >> 23); | |
| expo_x = expo_x - 1; | | expo_x = expo_x - 1; | |
| expo_y = expo_y & (yy.i >> 23); | | expo_y = expo_y & (yy.i >> 23); | |
| expo_y = expo_y - 1; | | expo_y = expo_y - 1; | |
| | | | |
| skipping to change at line 3418 | | skipping to change at line 3433 | |
| } else if (mode == cudaRoundPosInf) { | | } else if (mode == cudaRoundPosInf) { | |
| xx = xx + (!expo_y && temp); | | xx = xx + (!expo_y && temp); | |
| } else if (mode == cudaRoundMinInf) { | | } else if (mode == cudaRoundMinInf) { | |
| xx = xx + (expo_y && temp); | | xx = xx + (expo_y && temp); | |
| } | | } | |
| xx = expo_y + xx; /* add in sign bit */ | | xx = expo_y + xx; /* add in sign bit */ | |
| cvt.i = xx; | | cvt.i = xx; | |
| return cvt.f; | | return cvt.f; | |
| } | | } | |
| | | | |
|
| /* NOTE: Does not currently support round-to-nearest, round-to-zero */ | | | |
| __device_func__(float __internal_fadd_kernel2 (float a, float b, | | __device_func__(float __internal_fadd_kernel (float a, float b, | |
| enum cudaRoundMode mode)) | | enum cudaRoundMode mode)) | |
| { | | { | |
| volatile union __cudart_FloatUintCvt xx, yy; | | volatile union __cudart_FloatUintCvt xx, yy; | |
| unsigned int expo_x; | | unsigned int expo_x; | |
| unsigned int expo_y; | | unsigned int expo_y; | |
| unsigned int temp; | | unsigned int temp; | |
| | | | |
| xx.f = a; | | xx.f = a; | |
| yy.f = b; | | yy.f = b; | |
| | | | |
| /* make bigger operand the augend */ | | /* make bigger operand the augend */ | |
| | | | |
| skipping to change at line 3466 | | skipping to change at line 3480 | |
| | | | |
| if ((int)temp < 0) { | | if ((int)temp < 0) { | |
| /* signs differ, effective subtraction */ | | /* signs differ, effective subtraction */ | |
| temp = 32 - expo_y; | | temp = 32 - expo_y; | |
| temp = (expo_y) ? (yy.i << temp) : 0; | | temp = (expo_y) ? (yy.i << temp) : 0; | |
| temp = (unsigned)(-((int)temp)); | | temp = (unsigned)(-((int)temp)); | |
| xx.i = xx.i - (yy.i >> expo_y) - (temp ? 1 : 0); | | xx.i = xx.i - (yy.i >> expo_y) - (temp ? 1 : 0); | |
| if (xx.i & 0x00800000) { | | if (xx.i & 0x00800000) { | |
| if (expo_x <= 0xFD) { | | if (expo_x <= 0xFD) { | |
| xx.i = xx.i + (expo_x << 23); | | xx.i = xx.i + (expo_x << 23); | |
|
| if (mode == cudaRoundMinInf) { | | if (mode == cudaRoundNearest) { | |
| xx.i += (temp && (xx.i & 0x80000000)); | | if (temp < 0x80000000) return xx.f; | |
| | | xx.i += ((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31)); | |
| | | } else if (mode == cudaRoundZero) { | |
| } else if (mode == cudaRoundPosInf) { | | } else if (mode == cudaRoundPosInf) { | |
| xx.i += (temp && !(xx.i & 0x80000000)); | | xx.i += (temp && !(xx.i & 0x80000000)); | |
|
| | | } else if (mode == cudaRoundMinInf) { | |
| | | xx.i += (temp && (xx.i & 0x80000000)); | |
| } | | } | |
| return xx.f; | | return xx.f; | |
| } | | } | |
| } else { | | } else { | |
| if ((temp | (xx.i << 1)) == 0) { | | if ((temp | (xx.i << 1)) == 0) { | |
| /* operands cancelled, resulting in a clean zero */ | | /* operands cancelled, resulting in a clean zero */ | |
| if (mode == cudaRoundMinInf) { | | if (mode == cudaRoundMinInf) { | |
| xx.i = 0x80000000; | | xx.i = 0x80000000; | |
|
| } else if (mode == cudaRoundPosInf) { | | } else { | |
| xx.i = 0; | | xx.i = 0; | |
| } | | } | |
| return xx.f; | | return xx.f; | |
| } | | } | |
| /* normalize result */ | | /* normalize result */ | |
| yy.i = xx.i & 0x80000000; | | yy.i = xx.i & 0x80000000; | |
| do { | | do { | |
| xx.i = (xx.i << 1) | (temp >> 31); | | xx.i = (xx.i << 1) | (temp >> 31); | |
| temp <<= 1; | | temp <<= 1; | |
| expo_x--; | | expo_x--; | |
| | | | |
| skipping to change at line 3500 | | skipping to change at line 3518 | |
| xx.i = xx.i | yy.i; | | xx.i = xx.i | yy.i; | |
| } | | } | |
| } else { | | } else { | |
| /* signs are the same, effective addition */ | | /* signs are the same, effective addition */ | |
| temp = 32 - expo_y; | | temp = 32 - expo_y; | |
| temp = (expo_y) ? (yy.i << temp) : 0; | | temp = (expo_y) ? (yy.i << temp) : 0; | |
| xx.i = xx.i + (yy.i >> expo_y); | | xx.i = xx.i + (yy.i >> expo_y); | |
| if (!(xx.i & 0x01000000)) { | | if (!(xx.i & 0x01000000)) { | |
| if (expo_x <= 0xFD) { | | if (expo_x <= 0xFD) { | |
| xx.i = xx.i + (expo_x << 23); | | xx.i = xx.i + (expo_x << 23); | |
|
| if (mode == cudaRoundMinInf) { | | if (mode == cudaRoundNearest) { | |
| xx.i += (temp && (xx.i & 0x80000000)); | | if (temp < 0x80000000) return xx.f; | |
| | | xx.i += ((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31)); | |
| | | } else if (mode == cudaRoundZero) { | |
| } else if (mode == cudaRoundPosInf) { | | } else if (mode == cudaRoundPosInf) { | |
| xx.i += (temp && !(xx.i & 0x80000000)); | | xx.i += (temp && !(xx.i & 0x80000000)); | |
|
| | | } else if (mode == cudaRoundMinInf) { | |
| | | xx.i += (temp && (xx.i & 0x80000000)); | |
| } | | } | |
| return xx.f; | | return xx.f; | |
| } | | } | |
| } else { | | } else { | |
| /* normalize result */ | | /* normalize result */ | |
| temp = (xx.i << 31) | (temp >> 1); | | temp = (xx.i << 31) | (temp >> 1); | |
| xx.i = ((xx.i & 0x80000000) | (xx.i >> 1)) & ~0x40000000; | | xx.i = ((xx.i & 0x80000000) | (xx.i >> 1)) & ~0x40000000; | |
| expo_x++; | | expo_x++; | |
| } | | } | |
| } | | } | |
| if (expo_x <= 0xFD) { | | if (expo_x <= 0xFD) { | |
|
| if (mode == cudaRoundMinInf) { | | xx.i = xx.i + (expo_x << 23); | |
| xx.i += (temp && (xx.i & 0x80000000)); | | if (mode == cudaRoundNearest) { | |
| | | if (temp < 0x80000000) return xx.f; | |
| | | xx.i += ((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31)); | |
| | | } else if (mode == cudaRoundZero) { | |
| } else if (mode == cudaRoundPosInf) { | | } else if (mode == cudaRoundPosInf) { | |
| xx.i += (temp && !(xx.i & 0x80000000)); | | xx.i += (temp && !(xx.i & 0x80000000)); | |
|
| | | } else if (mode == cudaRoundMinInf) { | |
| | | xx.i += (temp && (xx.i & 0x80000000)); | |
| } | | } | |
|
| xx.i = xx.i + (expo_x << 23); | | | |
| return xx.f; | | return xx.f; | |
| } | | } | |
| if ((int)expo_x >= 254) { | | if ((int)expo_x >= 254) { | |
| /* overflow: return infinity or largest normal */ | | /* overflow: return infinity or largest normal */ | |
| temp = xx.i & 0x80000000; | | temp = xx.i & 0x80000000; | |
|
| if (mode == cudaRoundMinInf) { | | if (mode == cudaRoundNearest) { | |
| | | xx.i = (temp) | 0x7f800000; | |
| | | } else if (mode == cudaRoundZero) { | |
| | | xx.i = (temp) | 0x7f7fffff; | |
| | | } else if (mode == cudaRoundMinInf) { | |
| xx.i = (temp ? 0xFF800000 : 0x7f7fffff); | | xx.i = (temp ? 0xFF800000 : 0x7f7fffff); | |
| } else if (mode == cudaRoundPosInf) { | | } else if (mode == cudaRoundPosInf) { | |
| xx.i = (temp ? 0xff7fffff : 0x7F800000); | | xx.i = (temp ? 0xff7fffff : 0x7F800000); | |
| } | | } | |
| return xx.f; | | return xx.f; | |
| } | | } | |
| /* underflow: denormal, or smallest normal */ | | /* underflow: denormal, or smallest normal */ | |
| expo_y = expo_x + 32; | | expo_y = expo_x + 32; | |
| yy.i = xx.i & 0x80000000; | | yy.i = xx.i & 0x80000000; | |
| xx.i = xx.i & ~0xff000000; | | xx.i = xx.i & ~0xff000000; | |
| expo_x = (unsigned)(-((int)expo_x)); | | expo_x = (unsigned)(-((int)expo_x)); | |
| temp = xx.i << expo_y | ((temp) ? 1 : 0); | | temp = xx.i << expo_y | ((temp) ? 1 : 0); | |
| xx.i = yy.i | (xx.i >> expo_x); | | xx.i = yy.i | (xx.i >> expo_x); | |
|
| if (mode == cudaRoundMinInf) { | | if (mode == cudaRoundNearest) { | |
| xx.i += (temp && yy.i); | | xx.i += (temp == 0x80000000) ? (xx.i & 1) : (temp >> 31); | |
| | | } else if (mode == cudaRoundZero) { | |
| } else if (mode == cudaRoundPosInf) { | | } else if (mode == cudaRoundPosInf) { | |
| xx.i += (temp && !yy.i); | | xx.i += (temp && !yy.i); | |
|
| | | } else if (mode == cudaRoundMinInf) { | |
| | | xx.i += (temp && yy.i); | |
| } | | } | |
| return xx.f; | | return xx.f; | |
| } else { | | } else { | |
| /* handle special cases separately */ | | /* handle special cases separately */ | |
| if (!(yy.i << 1)) { | | if (!(yy.i << 1)) { | |
| if (mode == cudaRoundMinInf) { | | if (mode == cudaRoundMinInf) { | |
| if (!(xx.i << 1)) { | | if (!(xx.i << 1)) { | |
| xx.i = xx.i | yy.i; | | xx.i = xx.i | yy.i; | |
| } | | } | |
|
| } else if (mode == cudaRoundPosInf) { | | } else { | |
| if (xx.i == 0x80000000) { | | if (xx.i == 0x80000000) { | |
| xx.i = yy.i; | | xx.i = yy.i; | |
| } | | } | |
| } | | } | |
| if ((xx.i << 1) > 0xff000000) { | | if ((xx.i << 1) > 0xff000000) { | |
| xx.i |= 0x00400000; | | xx.i |= 0x00400000; | |
| } | | } | |
| return xx.f; | | return xx.f; | |
| } | | } | |
| if ((expo_y != 254) && (expo_x != 254)) { | | if ((expo_y != 254) && (expo_x != 254)) { | |
| | | | |
| skipping to change at line 3680 | | skipping to change at line 3713 | |
| return __internal_fdiv_kernel (a, b, cudaRoundMinInf); | | return __internal_fdiv_kernel (a, b, cudaRoundMinInf); | |
| } | | } | |
| | | | |
| __device_func__(float __fdiv_ru (float a, float b)) | | __device_func__(float __fdiv_ru (float a, float b)) | |
| { | | { | |
| return __internal_fdiv_kernel (a, b, cudaRoundPosInf); | | return __internal_fdiv_kernel (a, b, cudaRoundPosInf); | |
| } | | } | |
| | | | |
| __device_func__(float __fadd_rd (float a, float b)) | | __device_func__(float __fadd_rd (float a, float b)) | |
| { | | { | |
|
| return __internal_fadd_kernel2 (a, b, cudaRoundMinInf); | | return __internal_fadd_kernel (a, b, cudaRoundMinInf); | |
| } | | } | |
| | | | |
| __device_func__(float __fadd_ru (float a, float b)) | | __device_func__(float __fadd_ru (float a, float b)) | |
| { | | { | |
|
| return __internal_fadd_kernel2 (a, b, cudaRoundPosInf); | | return __internal_fadd_kernel (a, b, cudaRoundPosInf); | |
| } | | } | |
| | | | |
| __device_func__(float __fmul_rd (float a, float b)) | | __device_func__(float __fmul_rd (float a, float b)) | |
| { | | { | |
|
| return __internal_fmul_kernel2 (a, b, cudaRoundMinInf); | | return __internal_fmul_kernel (a, b, cudaRoundMinInf); | |
| } | | } | |
| | | | |
| __device_func__(float __fmul_ru (float a, float b)) | | __device_func__(float __fmul_ru (float a, float b)) | |
| { | | { | |
|
| return __internal_fmul_kernel2 (a, b, cudaRoundPosInf); | | return __internal_fmul_kernel (a, b, cudaRoundPosInf); | |
| } | | } | |
| | | | |
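With the kernels renamed and driven by cudaRoundMode, the __fadd_rd/__fadd_ru and __fmul_rd/__fmul_ru wrappers now select true directed rounding. A hedged sketch of the observable effect:

    /* Hedged sketch: directed rounding brackets the exact result. With
       b = 2^-24 (half an ulp of 1.0f) the exact sum 1 + 2^-24 is not
       representable, so the two modes return adjacent floats. */
    __global__ void dirround_demo(float *out)
    {
        float a = 1.0f;
        float b = 1.0f / 16777216.0f;   /* exactly 2^-24 */
        out[0] = __fadd_rd(a, b);       /* 1.0f          (toward -inf) */
        out[1] = __fadd_ru(a, b);       /* 1.0f + 2^-23  (toward +inf) */
    }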
| __device_func__(float __fmaf_rn (float a, float b, float c)) | | __device_func__(float __fmaf_rn (float a, float b, float c)) | |
| { | | { | |
| return __internal_fmaf_kernel (a, b, c, cudaRoundNearest); | | return __internal_fmaf_kernel (a, b, c, cudaRoundNearest); | |
| } | | } | |
| | | | |
| __device_func__(float __fmaf_rz (float a, float b, float c)) | | __device_func__(float __fmaf_rz (float a, float b, float c)) | |
| { | | { | |
| return __internal_fmaf_kernel (a, b, c, cudaRoundZero); | | return __internal_fmaf_kernel (a, b, c, cudaRoundZero); | |
| | | | |
| skipping to change at line 3777 | | skipping to change at line 3810 | |
| { | | { | |
| long long int res; | | long long int res; | |
| res = __umul64hi(a, b); | | res = __umul64hi(a, b); | |
| if (a < 0LL) res = res - b; | | if (a < 0LL) res = res - b; | |
| if (b < 0LL) res = res - a; | | if (b < 0LL) res = res - a; | |
| return res; | | return res; | |
| } | | } | |
| | | | |
| __device_func__(float __saturatef(float a)) | | __device_func__(float __saturatef(float a)) | |
| { | | { | |
|
| if (__cuda___isnanf(a)) return 0.0f; // update of PTX spec 10/15/2008 | | if (__cuda___isnanf(a)) return 0.0f; /* update of PTX spec 10/15/2008 */ | |
| return a >= 1.0f ? 1.0f : a <= 0.0f ? 0.0f : a; | | return a >= 1.0f ? 1.0f : a <= 0.0f ? 0.0f : a; | |
| } | | } | |
| | | | |
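__saturatef clamps its argument to [0,1]; per the PTX-spec note carried in the comment above, NaN inputs now map to 0.0f. A hedged sketch of the contract:

    /* Hedged sketch of the __saturatef behavior shown above. */
    __global__ void saturate_demo(float *out)
    {
        out[0] = __saturatef(-0.5f);                       /* 0.0f  */
        out[1] = __saturatef(0.25f);                       /* 0.25f */
        out[2] = __saturatef(3.0f);                        /* 1.0f  */
        out[3] = __saturatef(__int_as_float(0x7fffffff));  /* NaN -> 0.0f */
    }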
| __device_func__(unsigned int __sad(int a, int b, unsigned int c)) | | __device_func__(unsigned int __sad(int a, int b, unsigned int c)) | |
| { | | { | |
| long long int diff = (long long int)a - (long long int)b; | | long long int diff = (long long int)a - (long long int)b; | |
| | | | |
| return (unsigned int)(__cuda_llabs(diff) + (long long int)c); | | return (unsigned int)(__cuda_llabs(diff) + (long long int)c); | |
| } | | } | |
| | | | |
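__sad computes |a - b| + c; the emulation above widens to 64 bits so the difference itself cannot overflow. A hedged example:

    /* Hedged sketch: |(-3) - 5| + 10 = 18. The 64-bit intermediate keeps
       INT_MIN - INT_MAX style differences from wrapping. */
    __global__ void sad_demo(unsigned int *out)
    {
        *out = __sad(-3, 5, 10u);   /* 18 */
    }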
| | | | |
| skipping to change at line 4106 | | skipping to change at line 4139 | |
| { | | { | |
| volatile union __cudart_FloatUintCvt res; | | volatile union __cudart_FloatUintCvt res; | |
| int shift; | | int shift; | |
| unsigned int t; | | unsigned int t; | |
| res.i = a; | | res.i = a; | |
| if (a == 0) return res.f; | | if (a == 0) return res.f; | |
| shift = __internal_normalize((unsigned int*)&res.i); | | shift = __internal_normalize((unsigned int*)&res.i); | |
| t = res.i << 24; | | t = res.i << 24; | |
| res.i = (res.i >> 8); | | res.i = (res.i >> 8); | |
| res.i += (127 + 30 - shift) << 23; | | res.i += (127 + 30 - shift) << 23; | |
|
| if ((rndMode == cudaRoundNearest) && (t >= 0x80000000)) { | | if (rndMode == cudaRoundNearest) { | |
| res.i += (t == 0x80000000) ? (res.i & 1) : (t >> 31); | | res.i += (t == 0x80000000) ? (res.i & 1) : (t >> 31); | |
| } | | } | |
| else if ((rndMode == cudaRoundPosInf) && t) { | | else if ((rndMode == cudaRoundPosInf) && t) { | |
| res.i++; | | res.i++; | |
| } | | } | |
| return res.f; | | return res.f; | |
| } | | } | |
| | | | |
| __device_func__(float __uint2float_rz(unsigned int a)) | | __device_func__(float __uint2float_rz(unsigned int a)) | |
| { | | { | |
| | | | |
| skipping to change at line 4144 | | skipping to change at line 4177 | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
| return __internal_uint2float_kernel(a, cudaRoundNearest); | | return __internal_uint2float_kernel(a, cudaRoundNearest); | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
| __device_func__(float __ll2float_rn(long long int a)) | | __device_func__(float __ll2float_rn(long long int a)) | |
| { | | { | |
| return (float)a; | | return (float)a; | |
| } | | } | |
| | | | |
|
| __device_func__(float __ull2float_rn(unsigned long long int a)) | | __device_func__(float __internal_ull2float_kernel(unsigned long long int a, enum cudaRoundMode rndMode)) | |
| { | | { | |
|
| #if defined(__MULTI_CORE__) | | | |
| return (float)a; | | | |
| #else /* __MULTI_CORE__ */ | | | |
| unsigned long long int temp; | | unsigned long long int temp; | |
| unsigned int res, t; | | unsigned int res, t; | |
| int shift; | | int shift; | |
| if (a == 0ULL) return 0.0f; | | if (a == 0ULL) return 0.0f; | |
| temp = a; | | temp = a; | |
| shift = __internal_normalize64(&temp); | | shift = __internal_normalize64(&temp); | |
| temp = (temp >> 8) | ((temp & 0xffULL) ? 1ULL : 0ULL); | | temp = (temp >> 8) | ((temp & 0xffULL) ? 1ULL : 0ULL); | |
| res = (unsigned int)(temp >> 32); | | res = (unsigned int)(temp >> 32); | |
| t = (unsigned int)temp; | | t = (unsigned int)temp; | |
| res += (127 + 62 - shift) << 23; /* add in exponent */ | | res += (127 + 62 - shift) << 23; /* add in exponent */ | |
|
| res += t == 0x80000000 ? res & 1 : t >> 31; | | if (rndMode == cudaRoundNearest) { | |
| | | res += (t == 0x80000000) ? (res & 1) : (t >> 31); | |
| | | } else if (rndMode == cudaRoundPosInf) { | |
| | | res += (t != 0); | |
| | | } | |
| return __int_as_float(res); | | return __int_as_float(res); | |
|
| | | } | |
| | | | |
| | | __device_func__(float __ull2float_rn(unsigned long long int a)) | |
| | | { | |
| | | #if defined(__MULTI_CORE__) | |
| | | return (float)a; | |
| | | #else /* __MULTI_CORE__ */ | |
| | | return __internal_ull2float_kernel(a, cudaRoundNearest); | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
|
| __device_func__(float __internal_fmul_kernel(float a, float b, int rndNearest)) | | __device_func__(unsigned short __float2half_rn(float f)) | |
| { | | { | |
|
| unsigned long long product; | | unsigned int x = __float_as_int (f); | |
| volatile union __cudart_FloatUintCvt xx, yy; | | unsigned int u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1; | |
| unsigned expo_x, expo_y; | | unsigned int sign, exponent, mantissa; | |
| | | | |
|
| xx.f = a; | | /* Get rid of +NaN/-NaN case first. */ | |
| yy.f = b; | | if (u > 0x7f800000) { | |
| | | return 0x7fff; | |
| | | } | |
| | | | |
|
| expo_y = 0xFF; | | sign = ((x >> 16) & 0x8000); | |
| expo_x = expo_y & (xx.i >> 23); | | | |
| expo_x = expo_x - 1; | | | |
| expo_y = expo_y & (yy.i >> 23); | | | |
| expo_y = expo_y - 1; | | | |
| | | | |
|
| if ((expo_x <= 0xFD) && | | /* Get rid of +Inf/-Inf, +0/-0. */ | |
| (expo_y <= 0xFD)) { | | if (u > 0x477fefff) { | |
| multiply: | | return sign | 0x7c00; | |
| expo_x = expo_x + expo_y; | | } | |
| expo_y = xx.i ^ yy.i; | | if (u < 0x33000001) { | |
| xx.i = xx.i & 0x00ffffff; | | return sign | 0x0000; | |
| yy.i = yy.i << 8; | | | |
| xx.i = xx.i | 0x00800000; | | | |
| yy.i = yy.i | 0x80000000; | | | |
| /* compute product */ | | | |
| product = ((unsigned long long)xx.i) * yy.i; | | | |
| expo_x = expo_x - 127 + 2; | | | |
| expo_y = expo_y & 0x80000000; | | | |
| xx.i = (unsigned int)(product >> 32); | | | |
| yy.i = (unsigned int)(product & 0xffffffff); | | | |
| /* normalize mantissa */ | | | |
| if (xx.i < 0x00800000) { | | | |
| xx.i = (xx.i << 1) | (yy.i >> 31); | | | |
| yy.i = (yy.i << 1); | | | |
| expo_x--; | | | |
| } | | | |
| if (expo_x <= 0xFD) { | | | |
| xx.i = xx.i | expo_y; /* OR in sign bit */ | | | |
| xx.i = xx.i + (expo_x << 23); /* add in exponent */ | | | |
| /* round result to nearest or even */ | | | |
| if (yy.i < 0x80000000) return xx.f; | | | |
| xx.i += (((yy.i == 0x80000000) ? (xx.i & 1) : (yy.i >> 31)) | | | |
| && rndNearest); | | | |
| return xx.f; | | | |
| } else if ((int)expo_x >= 254) { | | | |
| /* overflow: return infinity */ | | | |
| xx.i = (expo_y | 0x7F800000) - (!rndNearest); | | | |
| return xx.f; | | | |
| } else { | | | |
| /* zero, denormal, or smallest normal */ | | | |
| expo_x = ((unsigned int)-((int)expo_x)); | | | |
| if (expo_x > 25) { | | | |
| /* massive underflow: return 0 */ | | | |
| xx.i = expo_y; | | | |
| return xx.f; | | | |
| } else { | | | |
| yy.i = (xx.i << (32 - expo_x)) | ((yy.i) ? 1 : 0); | | | |
| xx.i = expo_y + (xx.i >> expo_x); | | | |
| xx.i += (((yy.i == 0x80000000) ? (xx.i & 1) : (yy.i >> 31)) | | | |
| && rndNearest); | | | |
| return xx.f; | | | |
| } | | | |
| } | | | |
| } else { | | | |
| product = xx.i ^ yy.i; | | | |
| product = product & 0x80000000; | | | |
| if (!(xx.i & 0x7fffffff)) { | | | |
| if (expo_y != 254) { | | | |
| xx.i = (unsigned int)product; | | | |
| return xx.f; | | | |
| } | | | |
| expo_y = yy.i << 1; | | | |
| if (expo_y == 0xFF000000) { | | | |
| xx.i = expo_y | 0x00C00000; | | | |
| } else { | | | |
| xx.i = yy.i | 0x00400000; | | | |
| } | | | |
| return xx.f; | | | |
| } | | | |
| if (!(yy.i & 0x7fffffff)) { | | | |
| if (expo_x != 254) { | | | |
| xx.i = (unsigned int)product; | | | |
| return xx.f; | | | |
| } | | | |
| expo_x = xx.i << 1; | | | |
| if (expo_x == 0xFF000000) { | | | |
| xx.i = expo_x | 0x00C00000; | | | |
| } else { | | | |
| xx.i = xx.i | 0x00400000; | | | |
| } | | | |
| return xx.f; | | | |
| } | | | |
| if ((expo_y != 254) && (expo_x != 254)) { | | | |
| expo_y++; | | | |
| expo_x++; | | | |
| if (expo_x == 0) { | | | |
| expo_y |= xx.i & 0x80000000; | | | |
| /* | | | |
| * If both operands are denormals, we only need to normalize | | | |
| * one of them as the result will be either a denormal or zero. | | | |
| */ | | | |
| xx.i = xx.i << 8; | | | |
| while (!(xx.i & 0x80000000)) { | | | |
| xx.i <<= 1; | | | |
| expo_x--; | | | |
| } | | | |
| xx.i = (xx.i >> 8) | (expo_y & 0x80000000); | | | |
| expo_y &= ~0x80000000; | | | |
| expo_y--; | | | |
| goto multiply; | | | |
| } | | | |
| if (expo_y == 0) { | | | |
| expo_x |= yy.i & 0x80000000; | | | |
| yy.i = yy.i << 8; | | | |
| while (!(yy.i & 0x80000000)) { | | | |
| yy.i <<= 1; | | | |
| expo_y--; | | | |
| } | | | |
| yy.i = (yy.i >> 8) | (expo_x & 0x80000000); | | | |
| expo_x &= ~0x80000000; | | | |
| expo_x--; | | | |
| goto multiply; | | | |
| } | | | |
| } | | | |
| expo_x = xx.i << 1; | | | |
| expo_y = yy.i << 1; | | | |
| /* if x is NaN, return x */ | | | |
| if (expo_x > 0xFF000000) { | | | |
| /* cvt any SNaNs to QNaNs */ | | | |
| xx.i = xx.i | 0x00400000; | | | |
| return xx.f; | | | |
| } | | | |
| /* if y is NaN, return y */ | | | |
| if (expo_y > 0xFF000000) { | | | |
| /* cvt any SNaNs to QNaNs */ | | | |
| xx.i = yy.i | 0x00400000; | | | |
| return xx.f; | | | |
| } | | | |
| xx.i = (unsigned int)product | 0x7f800000; | | | |
| return xx.f; | | | |
| } | | } | |
|
| } | | | |
| | | | |
| __device_func__(float __internal_fadd_kernel(float a, float b, int rndNearest)) | | | |
| { | | | |
| volatile union __cudart_FloatUintCvt xx, yy; | | | |
| unsigned int expo_x; | | | |
| unsigned int expo_y; | | | |
| unsigned int temp; | | | |
| | | | |
|
| xx.f = a; | | exponent = ((u >> 23) & 0xff); | |
| yy.f = b; | | mantissa = (u & 0x7fffff); | |
| | | | |
|
| /* make bigger operand the augend */ | | if (exponent > 0x70) { | |
| expo_y = yy.i << 1; | | shift = 13; | |
| if (expo_y > (xx.i << 1)) { | | exponent -= 0x70; | |
| expo_y = xx.i; | | } else { | |
| xx.i = yy.i; | | shift = 0x7e - exponent; | |
| yy.i = expo_y; | | exponent = 0; | |
| | | mantissa |= 0x800000; | |
| } | | } | |
|
| | | lsb = (1 << shift); | |
| | | lsb_s1 = (lsb >> 1); | |
| | | lsb_m1 = (lsb - 1); | |
| | | | |
|
| temp = 0xff; | | /* Round to nearest even. */ | |
| expo_x = temp & (xx.i >> 23); | | remainder = (mantissa & lsb_m1); | |
| expo_x = expo_x - 1; | | mantissa >>= shift; | |
| expo_y = temp & (yy.i >> 23); | | if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) { | |
| expo_y = expo_y - 1; | | ++mantissa; | |
| | | if (!(mantissa & 0x3ff)) { | |
| | | ++exponent; | |
| | | mantissa = 0; | |
| | | } | |
| | | } | |
| | | | |
|
| if ((expo_x <= 0xFD) && | | return sign | (exponent << 10) | mantissa; | |
| (expo_y <= 0xFD)) { | | } | |
| | | | |
|
| add: | | __device_func__(float __half2float(unsigned short h)) | |
| expo_y = expo_x - expo_y; | | { | |
| if (expo_y > 25) { | | unsigned int sign = ((h >> 15) & 1); | |
| expo_y = 31; | | unsigned int exponent = ((h >> 10) & 0x1f); | |
| } | | unsigned int mantissa = ((h & 0x3ff) << 13); | |
| temp = xx.i ^ yy.i; | | | |
| xx.i = xx.i & ~0x7f000000; | | | |
| xx.i = xx.i | 0x00800000; | | | |
| yy.i = yy.i & ~0xff000000; | | | |
| yy.i = yy.i | 0x00800000; | | | |
| | | | |
|
| if ((int)temp < 0) { | | if (exponent == 0x1f) { /* NaN or Inf */ | |
| /* signs differ, effective subtraction */ | | mantissa = (mantissa | |
| temp = 32 - expo_y; | | ? (sign = 0, 0x7fffff) | |
| temp = (expo_y) ? (yy.i << temp) : 0; | | : 0); | |
| temp = (unsigned int)(-((int)temp)); | | exponent = 0xff; | |
| xx.i = xx.i - (yy.i >> expo_y) - (temp ? 1 : 0); | | } else if (!exponent) { /* Denorm or Zero */ | |
| if (xx.i & 0x00800000) { | | if (mantissa) { | |
| if (expo_x <= 0xFD) { | | unsigned int msb; | |
| xx.i = xx.i & ~0x00800000; /* lop off integer bit */ | | exponent = 0x71; | |
| xx.i = (xx.i + (expo_x << 23)) + 0x00800000; | | do { | |
| if (temp < 0x80000000) return xx.f; | | msb = (mantissa & 0x400000); | |
| xx.i += (((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31)) | | mantissa <<= 1; /* normalize */ | |
| && rndNearest); | | --exponent; | |
| return xx.f; | | } while (!msb); | |
| } | | mantissa &= 0x7fffff; /* 1.mantissa is implicit */ | |
| } else { | | | |
| if ((temp | (xx.i << 1)) == 0) { | | | |
| /* operands cancelled, resulting in a clean zero */ | | | |
| xx.i = 0; | | | |
| return xx.f; | | | |
| } | | | |
| /* normalize result */ | | | |
| yy.i = xx.i & 0x80000000; | | | |
| do { | | | |
| xx.i = (xx.i << 1) | (temp >> 31); | | | |
| temp <<= 1; | | | |
| expo_x--; | | | |
| } while (!(xx.i & 0x00800000)); | | | |
| xx.i = xx.i | yy.i; | | | |
| } | | | |
| } else { | | | |
| /* signs are the same, effective addition */ | | | |
| temp = 32 - expo_y; | | | |
| temp = (expo_y) ? (yy.i << temp) : 0; | | | |
| xx.i = xx.i + (yy.i >> expo_y); | | | |
| if (!(xx.i & 0x01000000)) { | | | |
| if (expo_x <= 0xFD) { | | | |
| expo_y = xx.i & 1; | | | |
| xx.i = xx.i + (expo_x << 23); | | | |
| if (temp < 0x80000000) return xx.f; | | | |
| xx.i += (((temp == 0x80000000) ? expo_y : (temp >> 31)) | | | |
| && rndNearest); | | | |
| return xx.f; | | | |
| } | | | |
| } else { | | | |
| /* normalize result */ | | | |
| temp = (xx.i << 31) | (temp >> 1); | | | |
| /* not ANSI compliant: xx.i = (((int)xx.i)>>1) & ~0x40000000 */ | | | |
| xx.i = ((xx.i & 0x80000000) | (xx.i >> 1)) & ~0x40000000; | | | |
| expo_x++; | | | |
| } | | | |
| } | | | |
| if (expo_x <= 0xFD) { | | | |
| expo_y = xx.i & 1; | | | |
| xx.i += (((temp == 0x80000000) ? expo_y : (temp >> 31)) | | | |
| && rndNearest); | | | |
| xx.i = xx.i + (expo_x << 23); | | | |
| return xx.f; | | | |
| } | | | |
| if ((int)expo_x >= 254) { | | | |
| /* overflow: return infinity */ | | | |
| xx.i = ((xx.i & 0x80000000) | 0x7f800000) - (!rndNearest); | | | |
| return xx.f; | | | |
| } | | } | |
|
| /* underflow: denormal, or smallest normal */ | | | |
| expo_y = expo_x + 32; | | | |
| yy.i = xx.i & 0x80000000; | | | |
| xx.i = xx.i & ~0xff000000; | | | |
| | | | |
| expo_x = (unsigned int)(-((int)expo_x)); | | | |
| temp = xx.i << expo_y | ((temp) ? 1 : 0); | | | |
| xx.i = yy.i | (xx.i >> expo_x); | | | |
| xx.i += (((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31)) | | | |
| && rndNearest); | | | |
| return xx.f; | | | |
| } else { | | } else { | |
|
| /* handle special cases separately */ | | exponent += 0x70; | |
| if (!(yy.i << 1)) { | | | |
| if (xx.i == 0x80000000) { | | | |
| xx.i = yy.i; | | | |
| } | | | |
| if ((xx.i << 1) > 0xff000000) { | | | |
| xx.i |= 0x00400000; | | | |
| } | | | |
| return xx.f; | | | |
| } | | | |
| if ((expo_y != 254) && (expo_x != 254)) { | | | |
| /* remove sign bits */ | | | |
| if (expo_x == (unsigned int) -1) { | | | |
| temp = xx.i & 0x80000000; | | | |
| xx.i = xx.i << 8; | | | |
| while (!(xx.i & 0x80000000)) { | | | |
| xx.i <<= 1; | | | |
| expo_x--; | | | |
| } | | | |
| expo_x++; | | | |
| xx.i = (xx.i >> 8) | temp; | | | |
| } | | | |
| if (expo_y == (unsigned int) -1) { | | | |
| temp = yy.i & 0x80000000; | | | |
| yy.i = yy.i << 8; | | | |
| while (!(yy.i & 0x80000000)) { | | | |
| yy.i <<= 1; | | | |
| expo_y--; | | | |
| } | | | |
| expo_y++; | | | |
| yy.i = (yy.i >> 8) | temp; | | | |
| } | | | |
| goto add; | | | |
| } | | | |
| expo_x = xx.i << 1; | | | |
| expo_y = yy.i << 1; | | | |
| /* if x is NaN, return x */ | | | |
| if (expo_x > 0xff000000) { | | | |
| /* cvt any SNaNs to QNaNs */ | | | |
| xx.i = xx.i | 0x00400000; | | | |
| return xx.f; | | | |
| } | | | |
| /* if y is NaN, return y */ | | | |
| if (expo_y > 0xff000000) { | | | |
| /* cvt any SNaNs to QNaNs */ | | | |
| xx.i = yy.i | 0x00400000; | | | |
| return xx.f; | | | |
| } | | | |
| if ((expo_x == 0xff000000) && (expo_y == 0xff000000)) { | | | |
| /* | | | |
| * subtraction of infinities with the same sign, and addition of | | | |
| * infinities of unlike sign is undefined: return NaN INDEFINITE | | | |
| */ | | | |
| expo_x = xx.i ^ yy.i; | | | |
| xx.i = xx.i | ((expo_x) ? 0xffc00000 : 0); | | | |
| return xx.f; | | | |
| } | | | |
| /* handle infinities */ | | | |
| if (expo_y == 0xff000000) { | | | |
| xx.i = yy.i; | | | |
| } | | | |
| return xx.f; | | | |
| } | | } | |
|
| | | | |
| | | return __int_as_float ((sign << 31) | (exponent << 23) | mantissa); | |
| } | | } | |
| | | | |
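The new __float2half_rn/__half2float pair implements IEEE 754 half precision in software: 1 sign bit, 5 exponent bits (bias 15), 10 mantissa bits, with round-to-nearest-even and denormal handling as coded above. A hedged round-trip sketch:

    /* Hedged sketch: fp16 keeps only 10 mantissa bits, so 0.1f does not
       survive a round trip. 0.1f encodes as 0x2E66, which decodes to
       0.0999755859375f. */
    __global__ void half_demo(float *out)
    {
        unsigned short h = __float2half_rn(0.1f);  /* 0x2E66 */
        out[0] = __half2float(h);                  /* 0.0999755859375f */
    }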
| __device_func__(float __fadd_rz(float a, float b)) | | __device_func__(float __fadd_rz(float a, float b)) | |
| { | | { | |
|
| return __internal_fadd_kernel(a, b, 0); | | return __internal_fadd_kernel(a, b, cudaRoundZero); | |
| } | | } | |
| | | | |
| __device_func__(float __fmul_rz(float a, float b)) | | __device_func__(float __fmul_rz(float a, float b)) | |
| { | | { | |
|
| return __internal_fmul_kernel(a, b, 0); | | return __internal_fmul_kernel(a, b, cudaRoundZero); | |
| } | | } | |
| | | | |
| __device_func__(float __fadd_rn(float a, float b)) | | __device_func__(float __fadd_rn(float a, float b)) | |
| { | | { | |
|
| return __internal_fadd_kernel(a, b, 1); | | return __internal_fadd_kernel(a, b, cudaRoundNearest); | |
| } | | } | |
| | | | |
| __device_func__(float __fmul_rn(float a, float b)) | | __device_func__(float __fmul_rn(float a, float b)) | |
| { | | { | |
|
| return __internal_fmul_kernel(a, b, 1); | | return __internal_fmul_kernel(a, b, cudaRoundNearest); | |
| } | | } | |
| | | | |
| __device_func__(void __brkpt(int c)) | | __device_func__(void __brkpt(int c)) | |
| { | | { | |
| /* TODO */ | | /* TODO */ | |
| } | | } | |
| | | | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| | | | |
| #define __syncthreads() \ | | #define __syncthreads() \ | |
| | | | |
| skipping to change at line 4543 | | skipping to change at line 4342 | |
| #endif /* __GNUC__ */ | | #endif /* __GNUC__ */ | |
| | | | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| | | | |
| __device_func__(void __prof_trigger(int a)) | | __device_func__(void __prof_trigger(int a)) | |
| { | | { | |
| } | | } | |
| | | | |
| __device_func__(void __threadfence(void)) | | __device_func__(void __threadfence(void)) | |
| { | | { | |
|
| | | __syncthreads(); | |
| } | | } | |
| | | | |
| __device_func__(void __threadfence_block(void)) | | __device_func__(void __threadfence_block(void)) | |
| { | | { | |
|
| | | __syncthreads(); | |
| } | | } | |
| | | | |
| #if defined(__GNUC__) | | #if defined(__GNUC__) | |
| | | | |
| __device_func__(void __trap(void)) | | __device_func__(void __trap(void)) | |
| { | | { | |
| __builtin_trap(); | | __builtin_trap(); | |
| } | | } | |
| | | | |
| #elif defined(_WIN32) | | #elif defined(_WIN32) | |
| | | | |
| skipping to change at line 4572 | | skipping to change at line 4373 | |
| | | | |
| #endif /* __GNUC__ */ | | #endif /* __GNUC__ */ | |
| | | | |
| #endif /* __CUDABE__ */ | | #endif /* __CUDABE__ */ | |
| | | | |
| /******************************************************************************* | | /******************************************************************************* | |
| *                                                                             * | | *                                                                             * | |
| * DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS      * | | * DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS      * | |
| *                                                                             * | | *                                                                             * | |
| *******************************************************************************/ | | *******************************************************************************/ | |
|
| | | #if !defined(__CUDABE__) | |
| __device_func__(float __fdividef(float a, float b)) | | __device_func__(float __fdividef(float a, float b)) | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| return a / b; | | return a / b; | |
|
| #elif defined(__CUDABE__) | | | |
| return a / b; | | | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
|
| | | volatile float aa = a; | |
| | | volatile float bb = b; | |
| /* match range restrictions of the device function */ | | /* match range restrictions of the device function */ | |
|
| if (__cuda_fabsf(b) > CUDART_TWO_TO_126_F) { | | if (__cuda_fabsf(bb) > CUDART_TWO_TO_126_F) { | |
| if (__cuda_fabsf(a) <= CUDART_NORM_HUGE_F) { | | if (__cuda_fabsf(aa) <= CUDART_NORM_HUGE_F) { | |
| return ((a / b) / CUDART_NORM_HUGE_F) / CUDART_NORM_HUGE_F; | | return ((aa / bb) / CUDART_NORM_HUGE_F) / CUDART_NORM_HUGE_F; | |
| } else { | | } else { | |
|
| return __int_as_float(0xffc00000); | | bb = 1.0f / bb; | |
| | | bb = bb / CUDART_NORM_HUGE_F; | |
| | | return aa * bb; | |
| } | | } | |
| } else { | | } else { | |
|
| return a / b; | | return aa / bb; | |
| } | | } | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
|
| | | #endif /* !defined(__CUDABE__) */ | |
| | | | |
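The reworked emulation of __fdividef above reproduces the device fast path's range restriction: for finite numerators with |b| > 2^126 the quotient is deliberately flushed toward zero (via the double division by CUDART_NORM_HUGE_F), instead of unconditionally returning NaN as before. A hedged sketch of the visible behavior:

    /* Hedged sketch of the fast-divide range restriction emulated above:
       once |b| exceeds 2^126 the quotient flushes to zero for finite a. */
    __global__ void fdividef_demo(float *out)
    {
        out[0] = __fdividef(1.0f, 2.0f);        /* 0.5f                     */
        out[1] = __fdividef(1.0e30f, 1.0e38f);  /* 0.0f: 1e38 > 2^126       */
    }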
| __device_func__(float __sinf(float a)) | | __device_func__(float __sinf(float a)) | |
| { | | { | |
|
| | | #if !defined(__CUDABE__) | |
| | | if ((__float_as_int(a) << 1) == 0xff000000) { | |
| | | return __fadd_rn (a, -a); /* return NaN */ | |
| | | } | |
| | | #endif /* !defined(__CUDABE__) */ | |
| return sinf(a); | | return sinf(a); | |
| } | | } | |
| | | | |
| __device_func__(float __cosf(float a)) | | __device_func__(float __cosf(float a)) | |
| { | | { | |
|
| | | #if !defined(__CUDABE__) | |
| | | if ((__float_as_int(a) << 1) == 0xff000000) { | |
| | | return __fadd_rn (a, -a); /* return NaN */ | |
| | | } | |
| | | #endif /* !defined(__CUDABE__) */ | |
| return cosf(a); | | return cosf(a); | |
| } | | } | |
| | | | |
| __device_func__(float __log2f(float a)) | | __device_func__(float __log2f(float a)) | |
| { | | { | |
| return log2f(a); | | return log2f(a); | |
| } | | } | |
| | | | |
| /******************************************************************************* | | /******************************************************************************* | |
| *                                                                             * | | *                                                                             * | |
| * SHARED HOST AND DEVICE IMPLEMENTATIONS                                      * | | * SHARED HOST AND DEVICE IMPLEMENTATIONS                                      * | |
| *                                                                             * | | *                                                                             * | |
| *******************************************************************************/ | | *******************************************************************************/ | |
| | | | |
| __device_func__(float __internal_accurate_fdividef(float a, float b)) | | __device_func__(float __internal_accurate_fdividef(float a, float b)) | |
| { | | { | |
|
| if (__cuda_fabsf(b) > CUDART_TWO_TO_126_F) { | | return a / b; | |
| a *= .25f; | | | |
| b *= .25f; | | | |
| } | | | |
| return __fdividef(a, b); | | | |
| } | | } | |
| | | | |
| __device_func__(float __tanf(float a)) | | __device_func__(float __tanf(float a)) | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| return tanf(a); | | return tanf(a); | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
| return __fdividef (__sinf(a), __cosf(a)); | | return __fdividef (__sinf(a), __cosf(a)); | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
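__tanf remains the quotient of the two fast-math kernels, so its accuracy degrades where __cosf approaches zero. A hedged sketch:

    /* Hedged sketch: __tanf(a) == __fdividef(__sinf(a), __cosf(a)), so
       error grows near odd multiples of pi/2 where the denominator
       vanishes. */
    __global__ void tanf_demo(float *out)
    {
        out[0] = __tanf(0.785398163f);  /* ~1.0f at pi/4 */
        out[1] = __tanf(1.57f);         /* large, low accuracy near pi/2 */
    }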
End of changes. 76 change blocks. 367 lines changed or deleted, 175 lines changed or added.
| device_runtime.h | | device_runtime.h | |
| /* | | /* | |
|
| * Copyright 1993-2009 NVIDIA Corporation. All rights reserved. | | * Copyright 1993-2008 NVIDIA Corporation. All rights reserved. | |
| * | | * | |
| * NOTICE TO USER: | | * NOTICE TO USER: | |
| * | | * | |
| * This source code is subject to NVIDIA ownership rights under U.S. and | | * This source code is subject to NVIDIA ownership rights under U.S. and | |
| * international Copyright laws. Users and possessors of this source code | | * international Copyright laws. Users and possessors of this source code | |
| * are hereby granted a nonexclusive, royalty-free license to use this code | | * are hereby granted a nonexclusive, royalty-free license to use this code | |
| * in individual and commercial software. | | * in individual and commercial software. | |
| * | | * | |
| * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | |
| * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | |
| | | | |
| skipping to change at line 56 | | skipping to change at line 56 | |
| s | | s | |
| #define __unsized_shared_data(name, type_post) \ | | #define __unsized_shared_data(name, type_post) \ | |
| __unsized##name __unsized##type_post | | __unsized##name __unsized##type_post | |
| #define __sized_shared_data(name, type) \ | | #define __sized_shared_data(name, type) \ | |
| __sized##name type | | __sized##name type | |
| #define __sized__shared_var(name, s, type) \ | | #define __sized__shared_var(name, s, type) \ | |
| name | | name | |
| | | | |
| /*TEXTURE_TYPE*/ | | /*TEXTURE_TYPE*/ | |
| typedef const void *__texture_type__; | | typedef const void *__texture_type__; | |
|
| | | /*SURFACE_TYPE*/ | |
| | | typedef const void *__surface_type__; | |
| | | | |
| #if defined(__CUDABE__) /* cudabe compiler */ | | #if defined(__CUDABE__) /* cudabe compiler */ | |
| | | | |
| #define __pad__(f) | | #define __pad__(f) | |
| #define __text__ \ | | #define __text__ \ | |
| __attribute__((__texture__)) | | __attribute__((__texture__)) | |
|
| | | #define __surf__ \ | |
| | | __attribute__((__surface__)) | |
| #define ___device__(sc) \ | | #define ___device__(sc) \ | |
| static | | static | |
| #define __in__(cdecl, decl) \ | | #define __in__(cdecl, decl) \ | |
| __shared__ cdecl | | __shared__ cdecl | |
| #define __in_type__(cdecl, decl) \ | | #define __in_type__(cdecl, decl) \ | |
| cdecl | | cdecl | |
| #define __texture_var(name) \ | | #define __texture_var(name) \ | |
| name | | name | |
| #define __shared_var(name, s, type) \ | | #define __shared_var(name, s, type) \ | |
| name | | name | |
| | | | |
| skipping to change at line 86 | | skipping to change at line 90 | |
| #define __copy_param(local_decl, param) \ | | #define __copy_param(local_decl, param) \ | |
| local_decl = param | | local_decl = param | |
| #define __unsized_array_size \ | | #define __unsized_array_size \ | |
| [] | | [] | |
| #define __unsized__shared_var(name, s, type) \ | | #define __unsized__shared_var(name, s, type) \ | |
| name | | name | |
| #define __unsized__empty_array(s) \ | | #define __unsized__empty_array(s) \ | |
| s | | s | |
| #define __var_used__ \ | | #define __var_used__ \ | |
| __attribute__((__used__)) | | __attribute__((__used__)) | |
|
| | | #define __storage_extern_unsized__shared__ \ | |
| | | extern | |
| | | | |
| #undef __cdecl | | #undef __cdecl | |
| #define __cdecl | | #define __cdecl | |
| #undef __w64 | | #undef __w64 | |
| #define __w64 | | #define __w64 | |
| | | | |
| #elif defined(__CUDACC__) /* cudafe compiler */ | | #elif defined(__CUDACC__) /* cudafe compiler */ | |
| | | | |
|
| #define __loc_sc__(loc, sc) \ | | #define __loc_sc__(loc, size, sc) \ | |
| sc loc | | sc loc | |
| #define __pad__(f) | | #define __pad__(f) | |
| #define __text__ | | #define __text__ | |
|
| | | #define __surf__ | |
| #define ___device__(sc) \ | | #define ___device__(sc) \ | |
| sc __device__ | | sc __device__ | |
| #define __in__(cdecl, decl) \ | | #define __in__(cdecl, decl) \ | |
| decl | | decl | |
| #define __in_type__(cdecl, decl) \ | | #define __in_type__(cdecl, decl) \ | |
| decl | | decl | |
| #define __texture_var(name) \ | | #define __texture_var(name) \ | |
| name | | name | |
| #define __shared_var(name, s, type) \ | | #define __shared_var(name, s, type) \ | |
| name | | name | |
| | | | |
| skipping to change at line 127 | | skipping to change at line 134 | |
| #define __unsized__empty_array(s) \ | | #define __unsized__empty_array(s) \ | |
| s | | s | |
| | | | |
| #else /* host compiler (cl, gcc, open64, ...) */ | | #else /* host compiler (cl, gcc, open64, ...) */ | |
| | | | |
| #if defined (__MULTI_CORE__) || defined(__multi_core__) | | #if defined (__MULTI_CORE__) || defined(__multi_core__) | |
| | | | |
| struct uint3; | | struct uint3; | |
| extern struct uint3* CUDARTAPI __cudaGetBlockIdxPtr(void); | | extern struct uint3* CUDARTAPI __cudaGetBlockIdxPtr(void); | |
| extern void* CUDARTAPI __cudaGetSharedMem(void*); | | extern void* CUDARTAPI __cudaGetSharedMem(void*); | |
|
| | | extern void* CUDARTAPI __cudaCmcHostMalloc(size_t); | |
| | | extern size_t CUDARTAPI __cudaCmcGetStackSize(void); | |
| | | | |
| #endif /* __MULTI_CORE__ || __multi_core__ */ | | #endif /* __MULTI_CORE__ || __multi_core__ */ | |
| | | | |
| #if defined (__multi_core__) | | #if defined (__multi_core__) | |
| | | | |
| #if defined(__GNUC__) | | #if defined(__GNUC__) | |
| | | | |
| #if defined(__cplusplus) | | #if defined(__cplusplus) | |
| | | | |
| extern void *alloca(size_t) throw(); | | extern void *alloca(size_t) throw(); | |
| | | | |
| #else /* __cplusplus */ | | #else /* __cplusplus */ | |
| | | | |
| extern void *alloca(size_t); | | extern void *alloca(size_t); | |
| | | | |
| #endif /* __cplusplus */ | | #endif /* __cplusplus */ | |
| | | | |
|
| #define __cuda_alloc(s) \ | | #define __cuda_alloca(s) \ | |
| alloca(s) | | alloca(s) | |
| | | | |
| #else /* __GNUC__ */ | | #else /* __GNUC__ */ | |
| | | | |
| extern void *_alloca(size_t); | | extern void *_alloca(size_t); | |
| | | | |
|
| #define __cuda_alloc(s) \ | | #define __cuda_alloca(s) \ | |
| _alloca(s) | | _alloca(s) | |
| | | | |
| #endif /* __GNUC__ */ | | #endif /* __GNUC__ */ | |
| | | | |
|
| | | /* check if enough stack size remains for alloca to succeed. If so, | |
| | | use faster alloca() to do the allocation. Otherwise, allocate memory | |
| | | using the __cudaCmcHostMalloc() runtime function, which uses the slower | |
| | | malloc path to allocate memory | |
| | | */ | |
| | | #define __cudaCmcTargAlloc(num_bytes, max_stacksize, ptr_counter_stacksize, ptr_ret_sym) \ | |
| | | do { \ | |
| | | if (*(ptr_counter_stacksize) + (num_bytes) >= (max_stacksize)) { \ | |
| | | *(ptr_ret_sym) = __cudaCmcHostMalloc((num_bytes)); \ | |
| | | } else { \ | |
| | | *(ptr_ret_sym) = __cuda_alloca((num_bytes)); \ | |
| | | *(ptr_counter_stacksize) = *(ptr_counter_stacksize) + (num_bytes); \ | |
| | | } \ | |
| | | } while(0) | |
| | | | |
| #endif /* __multi_core__ */ | | #endif /* __multi_core__ */ | |
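The new __cudaCmcTargAlloc macro implements a hybrid allocator for multi-core emulation: it tracks cumulative alloca() usage against a stack budget and diverts to the slower __cudaCmcHostMalloc path once that budget would be exceeded. A hedged, hypothetical usage sketch (the names stack_used, budget, and buf are illustrative, not from the headers):

    /* Hedged sketch: hypothetical caller of the macro defined above. */
    void demo_alloc(size_t nbytes)
    {
        size_t stack_used = 0;                    /* per-thread running total */
        size_t budget = __cudaCmcGetStackSize();  /* runtime's stack limit    */
        void  *buf;

        __cudaCmcTargAlloc(nbytes, budget, &stack_used, &buf);
        /* buf now points at alloca()'d stack space, or at heap memory
           from __cudaCmcHostMalloc() if the stack budget was exhausted. */
    }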
| | | | |
| #if defined (__MULTI_CORE__) | | #if defined (__MULTI_CORE__) | |
| | | | |
| #define ___device__(sc) \ | | #define ___device__(sc) \ | |
| static | | static | |
| #define __pad__(f) \ | | #define __pad__(f) \ | |
| f | | f | |
| #define __text__ | | #define __text__ | |
|
| | | #define __surf__ | |
| #define __cudaGet_blockIdx() \ | | #define __cudaGet_blockIdx() \ | |
| (*__cudaGetBlockIdxPtr()) | | (*__cudaGetBlockIdxPtr()) | |
| #define __shared_var(name, s, type) \ | | #define __shared_var(name, s, type) \ | |
| (s type __cudaGetSharedMem((void*)(&(name)))) | | (s type __cudaGetSharedMem((void*)(&(name)))) | |
| #define __var_used__ \ | | #define __var_used__ \ | |
| __attribute__((__used__)) | | __attribute__((__used__)) | |
| #define __storage_auto__shared__ \ | | #define __storage_auto__shared__ \ | |
| auto | | auto | |
| | | | |
| #undef __cdecl | | #undef __cdecl | |
| | | | |
| skipping to change at line 201 | | skipping to change at line 226 | |
| #elif defined(__GNUC__) | | #elif defined(__GNUC__) | |
| | | | |
| #define __STORAGE__ \ | | #define __STORAGE__ \ | |
| __attribute__((__common__)) | | __attribute__((__common__)) | |
| | | | |
| #elif defined(__cplusplus) | | #elif defined(__cplusplus) | |
| | | | |
| #define __STORAGE__ \ | | #define __STORAGE__ \ | |
| __declspec(selectany) | | __declspec(selectany) | |
| | | | |
|
| #pragma warning(disable: 4099 4190) | | | |
| | | | |
| #else /* __APPLE__ || __ICC */ | | #else /* __APPLE__ || __ICC */ | |
| | | | |
| #define __STORAGE__ | | #define __STORAGE__ | |
| | | | |
| #endif /* __APPLE__ || __ICC */ | | #endif /* __APPLE__ || __ICC */ | |
| | | | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| | | | |
| #define __in__(cdecl, decl) \ | | #define __in__(cdecl, decl) \ | |
| decl | | decl | |
| | | | |
| skipping to change at line 230 | | skipping to change at line 253 | |
| name | | name | |
| #define __copy_param(local_decl, param) | | #define __copy_param(local_decl, param) | |
| #define __unsized_array_size | | #define __unsized_array_size | |
| #define __unsized__shared_var(name, s, type) \ | | #define __unsized__shared_var(name, s, type) \ | |
| (*name) | | (*name) | |
| #define __unsized__empty_array(s) | | #define __unsized__empty_array(s) | |
| | | | |
| /* this is compiled with a host compiler for device emulation */ | | /* this is compiled with a host compiler for device emulation */ | |
| #define __device_emulation | | #define __device_emulation | |
| | | | |
|
| | | #if defined(__cplusplus) | |
| | | | |
| | | #undef __VECTOR_TYPES_H__ | |
| | | | |
| | | #if defined(_WIN32) | |
| | | | |
| | | #pragma warning(disable: 4190 4522) | |
| | | | |
| | | #endif /* _WIN32 */ | |
| | | | |
| | | #endif /* __cplusplus */ | |
| | | | |
| #endif /* __CUDABE__ */ | | #endif /* __CUDABE__ */ | |
| | | | |
|
| | | #if defined(__cplusplus) | |
| | | | |
| | | static void *__cuda_memcpy(void*, const void*, size_t); | |
| | | | |
| | | /* for C++ compilation of lowered C++ (i.e. C) code */ | |
| | | #define __cuda_assign_operators(tag) \ | |
| | | tag& operator=( tag& a) { __cuda_memcpy(this, &a, sizeof(tag)); return *this; } \ | |
| | | tag& operator=(volatile tag& a) volatile { return *(tag*)this = (tag&)a; } \ | |
| | | tag& operator=( const tag& a) { return *(tag*)this = (tag&)a; } | |
| | | | |
| | | #endif /* __cplusplus */ | |
| | | | |
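Written out by hand, the macro above turns a lowered C struct into an assignable C++ type without relying on compiler-generated copy assignment, which is unavailable for the volatile-qualified uses the emulation path needs. An illustrative expansion for a hypothetical two-field tag:

    /* Hand expansion of __cuda_assign_operators(foo2) for an assumed
       struct foo2 { int x, y; ... }; illustrative only. */
    foo2& operator=( foo2& a) { __cuda_memcpy(this, &a, sizeof(foo2)); return *this; }
    foo2& operator=(volatile foo2& a) volatile { return *(foo2*)this = (foo2&)a; }
    foo2& operator=( const foo2& a) { return *(foo2*)this = (foo2&)a; }

The volatile and const overloads cast the qualifiers away and funnel into the byte-wise __cuda_memcpy copy.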
| #include "builtin_types.h" | | #include "builtin_types.h" | |
| #include "device_launch_parameters.h" | | #include "device_launch_parameters.h" | |
| #include "storage_class.h" | | #include "storage_class.h" | |
| | | | |
End of changes. 14 change blocks. |
| 6 lines changed or deleted | | 58 lines changed or added | |
|
| host_runtime.h | | host_runtime.h | |
| /* | | /* | |
|
| * Copyright 1993-2009 NVIDIA Corporation. All rights reserved. | | * Copyright 1993-2008 NVIDIA Corporation. All rights reserved. | |
| * | | * | |
| * NOTICE TO USER: | | * NOTICE TO USER: | |
| * | | * | |
| * This source code is subject to NVIDIA ownership rights under U.S. and | | * This source code is subject to NVIDIA ownership rights under U.S. and | |
| * international Copyright laws. Users and possessors of this source code | | * international Copyright laws. Users and possessors of this source code | |
| * are hereby granted a nonexclusive, royalty-free license to use this code | | * are hereby granted a nonexclusive, royalty-free license to use this code | |
| * in individual and commercial software. | | * in individual and commercial software. | |
| * | | * | |
| * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | |
| * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | |
| | | | |
| skipping to change at line 43 | | skipping to change at line 43 | |
| * the above Disclaimer and U.S. Government End Users Notice. | | * the above Disclaimer and U.S. Government End Users Notice. | |
| */ | | */ | |
| | | | |
| #if !defined(__CUDA_INTERNAL_COMPILATION__) | | #if !defined(__CUDA_INTERNAL_COMPILATION__) | |
| | | | |
| #define __CUDA_INTERNAL_COMPILATION__ | | #define __CUDA_INTERNAL_COMPILATION__ | |
| #define __glob_pref_var(var) \ | | #define __glob_pref_var(var) \ | |
| __global_##var | | __global_##var | |
| #define __global_var(var) \ | | #define __global_var(var) \ | |
| (*__glob_pref_var(var)) | | (*__glob_pref_var(var)) | |
|
| #define __shadow_pref_var(var) \ | | #define __shadow_var(c, cpp) \ | |
| __shadow_##var | | __shadow_pref_var(c, cpp) | |
| #define __shadow_var(var) \ | | | |
| __shadow_pref_var(var) | | | |
| #define __pad__(f) \ | | | |
| f | | | |
| #define __text__ | | #define __text__ | |
|
| | | #define __surf__ | |
| #define __dv(v) | | #define __dv(v) | |
| | | | |
|
| | | #if defined(_WIN32) && !defined(_WIN64) | |
| | | | |
| | | #define __pad__(f) \ | |
| | | f | |
| | | | |
| | | #else /* _WIN32 && !_WIN64 */ | |
| | | | |
| | | #define __pad__(f) | |
| | | | |
| | | #endif /* _WIN32 && !_WIN64 */ | |
| | | | |
| #if defined(__APPLE__) | | #if defined(__APPLE__) | |
| | | | |
| #define __extern_weak__ \ | | #define __extern_weak__ \ | |
| __weak_import__, | | __weak_import__, | |
| | | | |
| #elif defined(__GNUC__) | | #elif defined(__GNUC__) | |
| | | | |
| #define __extern_weak__ | | #define __extern_weak__ | |
| | | | |
| #endif /* __APPLE__ */ | | #endif /* __APPLE__ */ | |
| | | | |
| #if defined(__cplusplus) | | #if defined(__cplusplus) | |
| | | | |
|
| | | #define __shadow_pref_var(c, cpp) \ | |
| | | cpp##__cuda_shadow_variable__ | |
| #define __device_stub_name(c, cpp) \ | | #define __device_stub_name(c, cpp) \ | |
| cpp | | cpp | |
|
| | | #define __text_var(c, cpp) \ | |
| | | cpp | |
| #define __cppref__ \ | | #define __cppref__ \ | |
| & | | & | |
| | | | |
| #else /* __cplusplus */ | | #else /* __cplusplus */ | |
| | | | |
|
| | | #define __shadow_pref_var(c, cpp) \ | |
| | | c##__cuda_shadow_variable__ | |
| #define __device_stub_name(c, cpp) \ | | #define __device_stub_name(c, cpp) \ | |
| c | | c | |
|
| | | #define __text_var(c, cpp) \ | |
| | | c | |
| #define __cppref__ | | #define __cppref__ | |
| | | | |
| typedef char bool; | | typedef char bool; | |
| | | | |
| #endif /* __cplusplus */ | | #endif /* __cplusplus */ | |
| | | | |
|
| | | #if !defined(__GNUC__) || __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3) | |
| | | | |
| | | #define __specialization_static \ | |
| | | static | |
| | | | |
| | | #else /* !__GNUC__ || __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3) */ | |
| | | | |
| | | #define __specialization_static | |
| | | | |
| | | #endif /* !__GNUC__ || __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3) */ | |
| | | | |
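For context, GCC 4.3 began rejecting storage-class specifiers on explicit template specializations, which is what the version check above guards against. A minimal hedged illustration; the template itself is hypothetical, not from this header:

    /* Hypothetical template; 'static' gives it internal linkage. */
    template <typename T> static void __copy_elem(T *dst, const T *src);

    /* Pre-4.3 GCC accepted 'static' repeated on the specialization;
       GCC >= 4.3 makes that an error, so the macro expands to nothing there. */
    template <> __specialization_static void __copy_elem<int>(int *dst, const int *src)
    {
        *dst = *src;
    }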
| #include "cuda_runtime_api.h" | | #include "cuda_runtime_api.h" | |
| #include "storage_class.h" | | #include "storage_class.h" | |
| | | | |
| #else /* !__CUDA_INTERNAL_COMPILATION__ */ | | #else /* !__CUDA_INTERNAL_COMPILATION__ */ | |
| | | | |
| #include "host_defines.h" | | #include "host_defines.h" | |
| | | | |
| #define __cudaRegisterBinary() \ | | #define __cudaRegisterBinary() \ | |
| __cudaFatCubinHandle = __cudaRegisterFatBinary((void*)__cudaFatCubin); \ | | __cudaFatCubinHandle = __cudaRegisterFatBinary((void*)__cudaFatCubin); \ | |
| atexit(__cudaUnregisterBinaryUtil) | | atexit(__cudaUnregisterBinaryUtil) | |
| #define __cudaRegisterVariable(var, ext, size, constant, global) \ | | #define __cudaRegisterVariable(var, ext, size, constant, global) \ | |
| __cudaRegisterVar(__cudaFatCubinHandle, (char*)&__host##var, (char*)__device##var, __name##var, ext, size, constant, global) | | __cudaRegisterVar(__cudaFatCubinHandle, (char*)&__host##var, (char*)__device##var, __name##var, ext, size, constant, global) | |
| #define __cudaRegisterGlobalTexture(tex, dim, norm, ext) \ | | #define __cudaRegisterGlobalTexture(tex, dim, norm, ext) \ | |
| __cudaRegisterTexture(__cudaFatCubinHandle, (const struct textureReference*)&tex, __tex_var(tex), #tex, dim, norm, ext) | | __cudaRegisterTexture(__cudaFatCubinHandle, (const struct textureReference*)&tex, __tex_var(tex), __name##tex, dim, norm, ext) | |
| | | #define __cudaRegisterGlobalSurface(surf, dim, ext) \ | |
| | | __cudaRegisterSurface(__cudaFatCubinHandle, (const struct surfaceReference*)&surf, __tex_var(surf), __name##surf, dim, ext) | |
| #define __cudaRegisterUnsizedShared(var) \ | | #define __cudaRegisterUnsizedShared(var) \ | |
| __cudaRegisterShared(__cudaFatCubinHandle, (void**)__device_var(var)) | | __cudaRegisterShared(__cudaFatCubinHandle, (void**)__device_var(var)) | |
| #define __cudaRegisterSharedVariable(var, size, align, sc) \ | | #define __cudaRegisterSharedVariable(var, size, align, sc) \ | |
| __cudaRegisterSharedVar(__cudaFatCubinHandle, (void**)__device_var(var), size, align, sc) | | __cudaRegisterSharedVar(__cudaFatCubinHandle, (void**)__device_var(var), size, align, sc) | |
| #define __cudaRegisterEntry(funptr, fun, thread_limit) \ | | #define __cudaRegisterEntry(funptr, fun, thread_limit) \ | |
| __cudaRegisterFunction(__cudaFatCubinHandle, (const char*)funptr, (char*)__device_fun(fun), #fun, thread_limit, __ids) | | __cudaRegisterFunction(__cudaFatCubinHandle, (const char*)funptr, (char*)__device_fun(fun), #fun, thread_limit, __ids) | |
| | | | |
| #define __cudaInitArgBlock(arg) \ | | #define __cudaInitArgBlock(arg) \ | |
| char __[256]; \ | | *(void**)(void*)&arg = (void*)0 | |
| *(char**)&arg = __ | | | |
| #define __cudaSetupArg(arg, offset) \ | | #define __cudaSetupArg(arg, offset) \ | |
| if (cudaSetupArgument((void*)(char*)&arg, sizeof(arg), (size_t)&offset->arg - (size_t)offset) != cudaSuccess) \ | | if (cudaSetupArgument((void*)(char*)&arg, sizeof(arg), (size_t)&offset->arg) != cudaSuccess) \ | |
| return | | return | |
| #define __cudaLaunch(fun) \ | | #define __cudaLaunch(fun) \ | |
| { volatile static char *__f; __f = fun; (void)cudaLaunch(fun); } | | { volatile static char *__f; __f = fun; (void)cudaLaunch(fun); } | |
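Taken together, the macros above are the building blocks of the host stub nvcc generates per kernel. A condensed, hedged sketch for an assumed kernel __global__ void k(int n); the stub body and names follow the macro pattern but are normally written by the compiler, not by hand:

    /* Illustrative host stub for an assumed kernel 'k(int n)',
       using the newer macro forms above. */
    void __device_stub_k(int n)
    {
        struct { int n; } *offset = 0;  /* null pointer, used only for offsetof-style math */
        void *arg;
        __cudaInitArgBlock(arg);
        __cudaSetupArg(n, offset);      /* cudaSetupArgument(&n, sizeof(n), 0) */
        __cudaLaunch((char *)__device_stub_k);
    }

    static void __cudaRegisterAll(void)
    {
        __cudaRegisterBinary();         /* assumes nvcc emitted __cudaFatCubin */
        __cudaRegisterEntry(__device_stub_k, k, -1);
    }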
| | | | |
| #if defined(__cplusplus) | | #if defined(__cplusplus) | |
| extern "C" { | | extern "C" { | |
| #endif /* __cplusplus */ | | #endif /* __cplusplus */ | |
| | | | |
| extern void** CUDARTAPI __cudaRegisterFatBinary( | | extern void** CUDARTAPI __cudaRegisterFatBinary( | |
| void *fatCubin | | void *fatCubin | |
| | | | |
| skipping to change at line 143 | | skipping to change at line 171 | |
| extern void CUDARTAPI __cudaRegisterTexture( | | extern void CUDARTAPI __cudaRegisterTexture( | |
| void **fatCubinHandle, | | void **fatCubinHandle, | |
| const struct textureReference *hostVar, | | const struct textureReference *hostVar, | |
| const void **deviceAddress, | | const void **deviceAddress, | |
| const char *deviceName, | | const char *deviceName, | |
| int dim, | | int dim, | |
| int norm, | | int norm, | |
| int ext | | int ext | |
| ); | | ); | |
| | | | |
|
| | | extern void CUDARTAPI __cudaRegisterSurface( | |
| | | void **fatCubinHandle, | |
| | | const struct surfaceReference *hostVar, | |
| | | const void **deviceAddress, | |
| | | const char *deviceName, | |
| | | int dim, | |
| | | int ext | |
| | | ); | |
| | | | |
| extern void CUDARTAPI __cudaRegisterShared( | | extern void CUDARTAPI __cudaRegisterShared( | |
| void **fatCubinHandle, | | void **fatCubinHandle, | |
| void **devicePtr | | void **devicePtr | |
| ); | | ); | |
| | | | |
| extern void CUDARTAPI __cudaRegisterSharedVar( | | extern void CUDARTAPI __cudaRegisterSharedVar( | |
| void **fatCubinHandle, | | void **fatCubinHandle, | |
| void **devicePtr, | | void **devicePtr, | |
| size_t size, | | size_t size, | |
| size_t alignment, | | size_t alignment, | |
| | | | |
| skipping to change at line 192 | | skipping to change at line 229 | |
| | | | |
| static void **__cudaFatCubinHandle; | | static void **__cudaFatCubinHandle; | |
| | | | |
| static void __cudaUnregisterBinaryUtil(void) | | static void __cudaUnregisterBinaryUtil(void) | |
| { | | { | |
| __cudaUnregisterFatBinary(__cudaFatCubinHandle); | | __cudaUnregisterFatBinary(__cudaFatCubinHandle); | |
| } | | } | |
| | | | |
| #if defined(__device_emulation) | | #if defined(__device_emulation) | |
| | | | |
|
| | | #if defined(__cplusplus) && !defined(__multi_core__) | |
| | | | |
| | | #define __cuda_emu__ \ | |
| | | __cuda_emu:: | |
| | | | |
| | | #else /* __cplusplus */ | |
| | | | |
| | | #define __cuda_emu__ | |
| | | | |
| | | #endif /* __cplusplus */ | |
| | | | |
| #define __device_fun(fun) \ | | #define __device_fun(fun) \ | |
|
| __device_wrapper_##fun | | __cuda_emu__ __device_wrapper_##fun | |
| #define __device_var(var) \ | | #define __device_var(var) \ | |
|
| (char*)&var | | (char*)&__cuda_emu__ var | |
| #define __tex_var(var) \ | | #define __tex_var(var) \ | |
|
| &__texture_var(var) | | &__cuda_emu__ __texture_var(var) | |
| #define __cudaFatCubin \ | | #define __cudaFatCubin \ | |
| 0 | | 0 | |
| | | | |
| #if defined(__multi_core__) | | #if defined(__multi_core__) | |
| | | | |
| #define __ids \ | | #define __ids \ | |
| (uint3*)0, (uint3*)0, &blockDim, &gridDim, &warpSize | | (uint3*)0, (uint3*)0, &blockDim, &gridDim, &warpSize | |
| | | | |
| #else /* __multi_core__ */ | | #else /* __multi_core__ */ | |
| | | | |
| #define __ids \ | | #define __ids \ | |
|
| &threadIdx, &blockIdx, &blockDim, &gridDim, &warpSize | | (uint3*)&__cuda_emu__ threadIdx, (uint3*)&__cuda_emu__ blockIdx, (dim3*)&__cuda_emu__ blockDim, (dim3*)&__cuda_emu__ gridDim, &__cuda_emu__ warpSize | |
| | | | |
| #endif /* __multi_core__ */ | | #endif /* __multi_core__ */ | |
| | | | |
| #else /* __device_emulation */ | | #else /* __device_emulation */ | |
| | | | |
| #define __device_fun(fun) \ | | #define __device_fun(fun) \ | |
| #fun | | #fun | |
| #define __device_var(var) \ | | #define __device_var(var) \ | |
| #var | | #var | |
| #define __tex_var(var) \ | | #define __tex_var(var) \ | |
| | | | |
| skipping to change at line 237 | | skipping to change at line 285 | |
| | | | |
| #endif /* __device_emulation */ | | #endif /* __device_emulation */ | |
| | | | |
| /* UTILITY MACROS */ | | /* UTILITY MACROS */ | |
| #define __device__global_var(var) \ | | #define __device__global_var(var) \ | |
| __device_var(var) | | __device_var(var) | |
| #define __name__global_var(var) \ | | #define __name__global_var(var) \ | |
| #var | | #var | |
| #define __host__global_var(var) \ | | #define __host__global_var(var) \ | |
| __glob_pref_var(var) | | __glob_pref_var(var) | |
|
| #define __device__shadow_var(var) \ | | #define __device__shadow_var(c, cpp) \ | |
| __device_var(var) | | __device_var(c) | |
| #define __name__shadow_var(var) \ | | #define __name__shadow_var(c, cpp) \ | |
| #var | | #c | |
| #define __host__shadow_var(var) \ | | #define __name__text_var(c, cpp) \ | |
| __shadow_pref_var(var) | | #c | |
| | | #define __host__shadow_var(c, cpp) \ | |
| | | __shadow_pref_var(c, cpp) | |
| | | | |
| | | #if defined(_WIN32) && defined(__cplusplus) | |
| | | | |
| | | #pragma warning(disable: 4099) | |
| | | | |
| | | #endif /* _WIN32 && __cplusplus */ | |
| | | | |
| #endif /* !__CUDA_INTERNAL_COMPILATION__ */ | | #endif /* !__CUDA_INTERNAL_COMPILATION__ */ | |
| | | | |
End of changes. 19 change blocks. |
| 22 lines changed or deleted | | 82 lines changed or added | |
|
| math_functions_dbl_ptx3.h | | math_functions_dbl_ptx3.h | |
| | | | |
| skipping to change at line 497 | | skipping to change at line 497 | |
| if (i & 2) { | | if (i & 2) { | |
| z = -z; | | z = -z; | |
| } | | } | |
| return z; | | return z; | |
| } | | } | |
| | | | |
| __device_func__(void __cuda_sincos(double a, double *sptr, double *cptr)) | | __device_func__(void __cuda_sincos(double a, double *sptr, double *cptr)) | |
| { | | { | |
| double t, u, s, c; | | double t, u, s, c; | |
| int i; | | int i; | |
|
| if (__cuda___isinf(a)) { | | t = __cuda_fabs(a); | |
| *sptr = CUDART_NAN; | | if ((t == CUDART_INF) || (t == CUDART_ZERO)) { | |
| *cptr = CUDART_NAN; | | s = __dmul_rn (a, CUDART_ZERO); /* generate NaN, zero */ | |
| return; | | c = 1.0 + s; /* generate NaN, one */ | |
| } | | *sptr = s; | |
| if (a == CUDART_ZERO) { | | *cptr = c; | |
| *sptr = a; | | | |
| *cptr = 1.0; | | | |
| return; | | return; | |
| } | | } | |
| t = __internal_trig_reduction_kerneld(a, &i); | | t = __internal_trig_reduction_kerneld(a, &i); | |
| u = __internal_cos_kerneld(t); | | u = __internal_cos_kerneld(t); | |
| t = __internal_sin_kerneld(t); | | t = __internal_sin_kerneld(t); | |
| if (i & 1) { | | if (i & 1) { | |
| s = u; | | s = u; | |
| c = t; | | c = t; | |
| } else { | | } else { | |
| s = t; | | s = t; | |
| | | | |
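The rewritten prologue above folds the explicit isinf/zero branches into one arithmetic idiom: a * 0.0 is NaN for an infinite a and a correctly signed zero otherwise, and adding 1.0 then yields the matching cosine. The same IEEE-754 trick in a standalone host sketch:

    /* IEEE-754: inf * 0 -> NaN, +/-0 * 0 -> +/-0; NaN + 1 -> NaN, +/-0 + 1 -> 1. */
    static void special_sincos(double a, double *s, double *c)
    {
        double z = a * 0.0;  /* NaN at +/-inf, signed zero at +/-0 */
        *s = z;              /* sin: NaN at inf, +/-0 at +/-0      */
        *c = 1.0 + z;        /* cos: NaN at inf, exactly 1.0 at 0  */
    }

This assumes strict IEEE semantics; an optimizer allowed to fold a * 0.0 to 0.0 would break it, which is one reason the device code spells it __dmul_rn.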
| skipping to change at line 561 | | skipping to change at line 559 | |
| /* normalize denormals */ | | /* normalize denormals */ | |
| if ((unsigned)ihi < (unsigned)0x00100000) { | | if ((unsigned)ihi < (unsigned)0x00100000) { | |
| a = a * CUDART_TWO_TO_54; | | a = a * CUDART_TWO_TO_54; | |
| e -= 54; | | e -= 54; | |
| ihi = __double2hiint(a); | | ihi = __double2hiint(a); | |
| ilo = __double2loint(a); | | ilo = __double2loint(a); | |
| } | | } | |
| /* a = m * 2^e. m <= sqrt(2): log2(a) = log2(m) + e. | | /* a = m * 2^e. m <= sqrt(2): log2(a) = log2(m) + e. | |
| * m > sqrt(2): log2(a) = log2(m/2) + (e+1) | | * m > sqrt(2): log2(a) = log2(m/2) + (e+1) | |
| */ | | */ | |
|
| e += ((ihi >> 20) & 0x7ff); | | e += (ihi >> 20); | |
| ihi = (ihi & 0x800fffff) | 0x3ff00000; | | ihi = (ihi & 0x800fffff) | 0x3ff00000; | |
| m = __hiloint2double (ihi, ilo); | | m = __hiloint2double (ihi, ilo); | |
| if ((unsigned)ihi > (unsigned)0x3ff6a09e) { | | if ((unsigned)ihi > (unsigned)0x3ff6a09e) { | |
| m = __internal_half(m); | | m = __internal_half(m); | |
| e = e + 1; | | e = e + 1; | |
| } | | } | |
| /* log((1+m)/(1-m)) = 2*atanh(m). log(m) = 2*atanh ((m-1)/(m+1)) */ | | /* log((1+m)/(1-m)) = 2*atanh(m). log(m) = 2*atanh ((m-1)/(m+1)) */ | |
| f = m - 1.0; | | f = m - 1.0; | |
| g = m + 1.0; | | g = m + 1.0; | |
| g = 1.0 / g; | | g = 1.0 / g; | |
| | | | |
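In the hunk above the & 0x7ff mask was dropped: the inputs reaching this path are positive and finite after denormal normalization, so the high word's sign bit is clear and ihi >> 20 already equals the biased exponent. A hedged sketch of the same hi/lo-word decomposition as a standalone helper built on the __double2hiint family (simplified, not the header's exact code):

    /* Split a positive, normalized double into m in [1,2) and e with
       a = m * 2^e. */
    __device__ double split_pos_double(double a, int *e)
    {
        int ihi = __double2hiint(a);
        int ilo = __double2loint(a);
        *e  = (ihi >> 20) - 1023;               /* sign bit is 0, so no mask needed */
        ihi = (ihi & 0x800fffff) | 0x3ff00000;  /* overwrite exponent field with the bias */
        return __hiloint2double(ihi, ilo);
    }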
| skipping to change at line 1590 | | skipping to change at line 1588 | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| | | | |
| __device_func__(double __cuda_erfc(double a)) | | __device_func__(double __cuda_erfc(double a)) | |
| { | | { | |
| double p, q, h, l; | | double p, q, h, l; | |
| int ahi; | | int ahi; | |
| | | | |
| ahi = __double2hiint(a); | | ahi = __double2hiint(a); | |
|
| if (ahi < (int)0x3fe80000) { | | if (ahi < (int)0x3fe80000) { /* 0.75 */ | |
| return 1.0 - __cuda_erf(a); | | return 1.0 - __cuda_erf(a); | |
| } | | } | |
| if (a > 27.3) { | | if (a > 27.3) { | |
| return 0.0; | | return 0.0; | |
| } | | } | |
|
| if (ahi < (int)0x40140000) { | | if (ahi < (int)0x40140000) { /* 5.0 */ | |
| p = 5.64189549785304440E-001; | | /* max error 7 ulps on [0.75, 5.0] */ | |
| p = __fma_rn (p, a, 8.17405083437083490E+000); | | p = 5.6418956292134603E-001; | |
| p = __fma_rn (p, a, 5.68958722557864720E+001); | | p = __fma_rn (p, a, 7.9573512229784757E+000); | |
| p = __fma_rn (p, a, 2.42568747802647010E+002); | | p = __fma_rn (p, a, 5.4297984550299049E+001); | |
| p = __fma_rn (p, a, 6.80381374390412930E+002); | | p = __fma_rn (p, a, 2.2775657465890461E+002); | |
| p = __fma_rn (p, a, 1.25873132236024590E+003); | | p = __fma_rn (p, a, 6.2995529536738172E+002); | |
| p = __fma_rn (p, a, 1.43925353963809330E+003); | | p = __fma_rn (p, a, 1.1508293767713837E+003); | |
| p = __fma_rn (p, a, 8.15949420587659230E+002); | | p = __fma_rn (p, a, 1.3002167301542784E+003); | |
| q = a+ 1.44881247113239940E+001; | | p = __fma_rn (p, a, 7.2716547570180592E+002); | |
| q = __fma_rn (q, a, 1.01345387970210510E+002); | | q = a+ 1.4104035812651274E+001; | |
| q = __fma_rn (q, a, 4.37184684964009650E+002); | | q = __fma_rn (q, a, 9.6740724349422138E+001); | |
| q = __fma_rn (q, a, 1.25588209225251330E+003); | | q = __fma_rn (q, a, 4.1073916054746462E+002); | |
| q = __fma_rn (q, a, 2.43864015012647630E+003); | | q = __fma_rn (q, a, 1.1641974580374074E+003); | |
| q = __fma_rn (q, a, 3.10570469008816280E+003); | | q = __fma_rn (q, a, 2.2344896486798129E+003); | |
| q = __fma_rn (q, a, 2.35995386578953550E+003); | | q = __fma_rn (q, a, 2.8166572432808462E+003); | |
| q = __fma_rn (q, a, 8.15949420692539320E+002); | | q = __fma_rn (q, a, 2.1207350981593036E+003); | |
| | | q = __fma_rn (q, a, 7.2716547619708967E+002); | |
| | | p = p / q; | |
| | | h = a * a; | |
| | | l = __fma_rn (a, a, -h); | |
| | | q = __internal_exp_kernel(-h, 0); | |
| | | q = __fma_rn (l, -q, q); | |
| | | p = p * q; | |
| } else { | | } else { | |
|
| p = 5.64189583545675280E-001; | | /* max error 4 ulps on [5, 27.3] */ | |
| p = __fma_rn (p, a, 2.04728556066513970E+000); | | double ooa, ooasq; | |
| p = __fma_rn (p, a, 6.75128409662943610E+000); | | | |
| p = __fma_rn (p, a, 1.10459345071747900E+001); | | ooa = 1.0 / a; | |
| p = __fma_rn (p, a, 1.22570382896313600E+001); | | ooasq = ooa * ooa; | |
| p = __fma_rn (p, a, 6.01884641114116460E+000); | | p = -4.0025406686930527E+005; | |
| q = a+ 3.62871917534986780E+000; | | p = __fma_rn (p, ooasq, 1.4420582543942123E+005); | |
| q = __fma_rn (q, a, 1.24663395327043550E+001); | | p = __fma_rn (p, ooasq, -2.7664185780951841E+004); | |
| q = __fma_rn (q, a, 2.13927672803974790E+001); | | p = __fma_rn (p, ooasq, 4.1144611644767283E+003); | |
| q = __fma_rn (q, a, 2.72082423532866070E+001); | | p = __fma_rn (p, ooasq, -5.8706000519209351E+002); | |
| q = __fma_rn (q, a, 1.86422906830006700E+001); | | p = __fma_rn (p, ooasq, 9.1490086446323375E+001); | |
| q = __fma_rn (q, a, 6.13809834548870550E+000); | | p = __fma_rn (p, ooasq, -1.6659491387740221E+001); | |
| | | p = __fma_rn (p, ooasq, 3.7024804085481784E+000); | |
| | | p = __fma_rn (p, ooasq, -1.0578553994424316E+000); | |
| | | p = __fma_rn (p, ooasq, 4.2314218745087778E-001); | |
| | | p = __fma_rn (p, ooasq, -2.8209479177354962E-001); | |
| | | p = __fma_rn (p, ooasq, 5.6418958354775606E-001); | |
| | | h = a * a; | |
| | | l = __fma_rn (a, a, -h); | |
| | | q = __internal_exp_kernel(-h, 0); | |
| | | q = __fma_rn (l, -q, q); | |
| | | p = p * ooa; | |
| | | p = p * q; | |
| } | | } | |
|
| p = p / q; | | | |
| h = a * a; | | | |
| l = __fma_rn (a, a, -h); | | | |
| q = __internal_exp_kernel(-h, 0); | | | |
| q = __fma_rn (l, -q, q); | | | |
| p = p * q; | | | |
| return p; | | return p; | |
| } | | } | |
| | | | |
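Both branches of the new __cuda_erfc end with the same compensated-squaring idiom: h = a * a rounds, l = __fma_rn(a, a, -h) recovers the rounding error exactly, and __fma_rn(l, -q, q) then corrects exp(-h) by the first-order factor (1 - l), since exp(-(h + l)) is approximately exp(-h) * (1 - l) for tiny l. A host-side rendering with C99 fma(); the CUDA code uses __internal_exp_kernel where exp() appears here:

    #include <math.h>

    /* exp(-a*a) with the error of squaring 'a' compensated. */
    static double exp_neg_square(double a)
    {
        double h = a * a;          /* rounded square       */
        double l = fma(a, a, -h);  /* exact rounding error */
        double q = exp(-h);
        return fma(l, -q, q);      /* q - l*q ~ exp(-(h+l)) */
    }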
| /* approximate 1.0/(a*gamma(a)) on [-0.5,0.5] */ | | /* approximate 1.0/(a*gamma(a)) on [-0.5,0.5] */ | |
| __device_func__(double __internal_tgamma_kernel(double a)) | | __device_func__(double __internal_tgamma_kernel(double a)) | |
| { | | { | |
| double t; | | double t; | |
| t = -4.42689340712524750E-010; | | t = -4.42689340712524750E-010; | |
| t = __fma_rn (t, a, -2.02665918466589540E-007); | | t = __fma_rn (t, a, -2.02665918466589540E-007); | |
| t = __fma_rn (t, a, 1.13812117211195270E-006); | | t = __fma_rn (t, a, 1.13812117211195270E-006); | |
| | | | |
End of changes. 6 change blocks. |
| 45 lines changed or deleted | | 55 lines changed or added | |
|
| vector_types.h | | vector_types.h | |
| | | | |
| skipping to change at line 53 | | skipping to change at line 53 | |
| ***************************************************************************/ | | ***************************************************************************/ | |
| | | | |
| #include "host_defines.h" | | #include "host_defines.h" | |
| | | | |
| /*************************************************************************** | | /*************************************************************************** | |
| *                                                                         * | | *                                                                         * | |
| *                                                                         * | | *                                                                         * | |
| *                                                                         * | | *                                                                         * | |
| ***************************************************************************/ | | ***************************************************************************/ | |
| | | | |
|
| | | #if !defined(__cuda_assign_operators) | |
| | | | |
| | | #define __cuda_assign_operators(tag) | |
| | | | |
| | | #endif /* !__cuda_assign_operators */ | |
| | | | |
| | | #if !defined(__CUDACC__) && !defined(__CUDABE__) && \ | |
| | | !defined (__MULTI_CORE__) && defined(_WIN32) && !defined(_WIN64) | |
| | | | |
| | | #define __cuda_builtin_vector_align8(tag, ...) \ | |
| | | struct tag { \ | |
| | | union { \ | |
| | | struct { __VA_ARGS__; }; \ | |
| | | struct { long long int :1,:0; }; \ | |
| | | }; \ | |
| | | __cuda_assign_operators(tag) \ | |
| | | } | |
| | | | |
| | | #else /* !__CUDACC__ && !__CUDABE__ && !__MULTI_CORE__ && _WIN32 && !_WIN64 */ | |
| | | | |
| | | #define __cuda_builtin_vector_align8(tag, ...) \ | |
| | | struct __align__(8) tag { \ | |
| | | __VA_ARGS__; \ | |
| | | __cuda_assign_operators(tag) \ | |
| | | } | |
| | | | |
| | | #endif /* !__CUDACC__ && !__CUDABE__ && !__MULTI_CORE__ && _WIN32 && !_WIN64 */ | |
| | | | |
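The Win32 branch above avoids __align__(8) on plain host builds, presumably because MSVC refuses to pass __declspec(align(...)) types by value (error C2719); the anonymous union member of type long long pins the same 8-byte layout while keeping the type passable. Expanded by hand for one tag (illustrative; the two definitions are alternatives, never both live):

    /* __cuda_builtin_vector_align8(int2, int x, y) on 32-bit Windows host: */
    struct int2 {
        union {
            struct { int x, y; };            /* the real members        */
            struct { long long int :1,:0; }; /* forces 8-byte alignment */
        };
        __cuda_assign_operators(int2)
    };

    /* ...and everywhere else: */
    struct __align__(8) int2 {
        int x, y;
        __cuda_assign_operators(int2)
    };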
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct char1 | | struct char1 | |
| { | | { | |
| signed char x; | | signed char x; | |
|
| | | __cuda_assign_operators(char1) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct uchar1 | | struct uchar1 | |
| { | | { | |
| unsigned char x; | | unsigned char x; | |
|
| | | __cuda_assign_operators(uchar1) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| struct __builtin_align__(2) char2 | | struct __align__(2) char2 | |
| { | | { | |
| signed char x, y; | | signed char x, y; | |
|
| | | __cuda_assign_operators(char2) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| struct __builtin_align__(2) uchar2 | | struct __align__(2) uchar2 | |
| { | | { | |
| unsigned char x, y; | | unsigned char x, y; | |
|
| | | __cuda_assign_operators(uchar2) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct char3 | | struct char3 | |
| { | | { | |
| signed char x, y, z; | | signed char x, y, z; | |
|
| | | __cuda_assign_operators(char3) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct uchar3 | | struct uchar3 | |
| { | | { | |
| unsigned char x, y, z; | | unsigned char x, y, z; | |
|
| | | __cuda_assign_operators(uchar3) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| struct __builtin_align__(4) char4 | | struct __align__(4) char4 | |
| { | | { | |
| signed char x, y, z, w; | | signed char x, y, z, w; | |
|
| | | __cuda_assign_operators(char4) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| struct __builtin_align__(4) uchar4 | | struct __align__(4) uchar4 | |
| { | | { | |
| unsigned char x, y, z, w; | | unsigned char x, y, z, w; | |
|
| | | __cuda_assign_operators(uchar4) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct short1 | | struct short1 | |
| { | | { | |
| short x; | | short x; | |
|
| | | __cuda_assign_operators(short1) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct ushort1 | | struct ushort1 | |
| { | | { | |
| unsigned short x; | | unsigned short x; | |
|
| | | __cuda_assign_operators(ushort1) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| struct __builtin_align__(4) short2 | | struct __align__(4) short2 | |
| { | | { | |
| short x, y; | | short x, y; | |
|
| | | __cuda_assign_operators(short2) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| struct __builtin_align__(4) ushort2 | | struct __align__(4) ushort2 | |
| { | | { | |
| unsigned short x, y; | | unsigned short x, y; | |
|
| | | __cuda_assign_operators(ushort2) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct short3 | | struct short3 | |
| { | | { | |
| short x, y, z; | | short x, y, z; | |
|
| | | __cuda_assign_operators(short3) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct ushort3 | | struct ushort3 | |
| { | | { | |
| unsigned short x, y, z; | | unsigned short x, y, z; | |
|
| | | __cuda_assign_operators(ushort3) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| struct __builtin_align__(8) short4 | | __cuda_builtin_vector_align8(short4, short x, y, z, w); | |
| { | | | |
| short x, y, z, w; | | | |
| }; | | | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| struct __builtin_align__(8) ushort4 | | __cuda_builtin_vector_align8(ushort4, unsigned short x, y, z, w); | |
| { | | | |
| unsigned short x, y, z, w; | | | |
| }; | | | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct int1 | | struct int1 | |
| { | | { | |
| int x; | | int x; | |
|
| | | __cuda_assign_operators(int1) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct uint1 | | struct uint1 | |
| { | | { | |
| unsigned int x; | | unsigned int x; | |
|
| | | __cuda_assign_operators(uint1) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| struct __builtin_align__(8) int2 | | __cuda_builtin_vector_align8(int2, int x, y); | |
| { | | | |
| int x, y; | | | |
| }; | | | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| struct __builtin_align__(8) uint2 | | __cuda_builtin_vector_align8(uint2, unsigned int x, y); | |
| { | | | |
| unsigned int x, y; | | | |
| }; | | | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct int3 | | struct int3 | |
| { | | { | |
| int x, y, z; | | int x, y, z; | |
|
| | | __cuda_assign_operators(int3) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct uint3 | | struct uint3 | |
| { | | { | |
| unsigned int x, y, z; | | unsigned int x, y, z; | |
|
| | | __cuda_assign_operators(uint3) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct __builtin_align__(16) int4 | | struct __builtin_align__(16) int4 | |
| { | | { | |
| int x, y, z, w; | | int x, y, z, w; | |
|
| | | __cuda_assign_operators(int4) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct __builtin_align__(16) uint4 | | struct __builtin_align__(16) uint4 | |
| { | | { | |
| unsigned int x, y, z, w; | | unsigned int x, y, z, w; | |
|
| | | __cuda_assign_operators(uint4) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct long1 | | struct long1 | |
| { | | { | |
| long int x; | | long int x; | |
|
| | | __cuda_assign_operators(long1) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct ulong1 | | struct ulong1 | |
| { | | { | |
| unsigned long x; | | unsigned long x; | |
|
| | | __cuda_assign_operators(ulong1) | |
| }; | | }; | |
| | | | |
|
| /*DEVICE_BUILTIN*/ | | | |
| struct | | | |
| #if defined (_WIN32) | | #if defined (_WIN32) | |
|
| __builtin_align__(8) | | | |
| | | /*DEVICE_BUILTIN*/ | |
| | | __cuda_builtin_vector_align8(long2, long int x, y); | |
| | | | |
| | | /*DEVICE_BUILTIN*/ | |
| | | __cuda_builtin_vector_align8(ulong2, unsigned long int x, y); | |
| | | | |
| #else /* _WIN32 */ | | #else /* _WIN32 */ | |
|
| __builtin_align__(2*sizeof(long int)) | | | |
| #endif /* _WIN32 */ | | /*DEVICE_BUILTIN*/ | |
| long2 | | struct __align__(2*sizeof(long int)) long2 | |
| { | | { | |
| long int x, y; | | long int x, y; | |
|
| | | __cuda_assign_operators(long2) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| struct | | struct __align__(2*sizeof(unsigned long int)) ulong2 | |
| #if defined (_WIN32) | | | |
| __builtin_align__(8) | | | |
| #else /* _WIN32 */ | | | |
| __builtin_align__(2*sizeof(unsigned long int)) | | | |
| #endif /* _WIN32 */ | | | |
| ulong2 | | | |
| { | | { | |
| unsigned long int x, y; | | unsigned long int x, y; | |
|
| | | __cuda_assign_operators(ulong2) | |
| }; | | }; | |
| | | | |
|
| | | #endif /* _WIN32 */ | |
| | | | |
| #if !defined(__LP64__) | | #if !defined(__LP64__) | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct long3 | | struct long3 | |
| { | | { | |
| long int x, y, z; | | long int x, y, z; | |
|
| | | __cuda_assign_operators(long3) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct ulong3 | | struct ulong3 | |
| { | | { | |
| unsigned long int x, y, z; | | unsigned long int x, y, z; | |
|
| | | __cuda_assign_operators(ulong3) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct __builtin_align__(16) long4 | | struct __builtin_align__(16) long4 | |
| { | | { | |
| long int x, y, z, w; | | long int x, y, z, w; | |
|
| | | __cuda_assign_operators(long4) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct __builtin_align__(16) ulong4 | | struct __builtin_align__(16) ulong4 | |
| { | | { | |
| unsigned long int x, y, z, w; | | unsigned long int x, y, z, w; | |
|
| | | __cuda_assign_operators(ulong4) | |
| }; | | }; | |
| | | | |
| #endif /* !__LP64__ */ | | #endif /* !__LP64__ */ | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct float1 | | struct float1 | |
| { | | { | |
| float x; | | float x; | |
|
| | | __cuda_assign_operators(float1) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| struct __builtin_align__(8) float2 | | __cuda_builtin_vector_align8(float2, float x, y); | |
| { | | | |
| float x, y; | | | |
| }; | | | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct float3 | | struct float3 | |
| { | | { | |
| float x, y, z; | | float x, y, z; | |
|
| | | __cuda_assign_operators(float3) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct __builtin_align__(16) float4 | | struct __builtin_align__(16) float4 | |
| { | | { | |
| float x, y, z, w; | | float x, y, z, w; | |
|
| | | __cuda_assign_operators(float4) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct longlong1 | | struct longlong1 | |
| { | | { | |
| long long int x; | | long long int x; | |
|
| | | __cuda_assign_operators(longlong1) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct ulonglong1 | | struct ulonglong1 | |
| { | | { | |
| unsigned long long int x; | | unsigned long long int x; | |
|
| | | __cuda_assign_operators(ulonglong1) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct __builtin_align__(16) longlong2 | | struct __builtin_align__(16) longlong2 | |
| { | | { | |
| long long int x, y; | | long long int x, y; | |
|
| | | __cuda_assign_operators(longlong2) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct __builtin_align__(16) ulonglong2 | | struct __builtin_align__(16) ulonglong2 | |
| { | | { | |
| unsigned long long int x, y; | | unsigned long long int x, y; | |
|
| | | __cuda_assign_operators(ulonglong2) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct double1 | | struct double1 | |
| { | | { | |
| double x; | | double x; | |
|
| | | __cuda_assign_operators(double1) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct __builtin_align__(16) double2 | | struct __builtin_align__(16) double2 | |
| { | | { | |
| double x, y; | | double x, y; | |
|
| | | __cuda_assign_operators(double2) | |
| }; | | }; | |
| | | | |
| /*************************************************************************** | | /*************************************************************************** | |
| *                                                                         * | | *                                                                         * | |
| *                                                                         * | | *                                                                         * | |
| *                                                                         * | | *                                                                         * | |
| ***************************************************************************/ | | ***************************************************************************/ | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| typedef struct char1 char1; | | typedef struct char1 char1; | |
| | | | |
| skipping to change at line 419 | | skipping to change at line 469 | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| typedef struct double2 double2; | | typedef struct double2 double2; | |
| | | | |
| /*************************************************************************** | | /*************************************************************************** | |
| *                                                                         * | | *                                                                         * | |
| *                                                                         * | | *                                                                         * | |
| *                                                                         * | | *                                                                         * | |
| ***************************************************************************/ | | ***************************************************************************/ | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| typedef struct dim3 dim3; | | | |
| | | | |
| /*DEVICE_BUILTIN*/ | | | |
| struct dim3 | | struct dim3 | |
| { | | { | |
| unsigned int x, y, z; | | unsigned int x, y, z; | |
| #if defined(__cplusplus) | | #if defined(__cplusplus) | |
|
| dim3(unsigned int x = 1, unsigned int y = 1, unsigned int z = 1) : x(x), y(y), z(z) {} | | __host__ __device__ dim3(unsigned int x = 1, unsigned int y = 1, unsigned int z = 1) : x(x), y(y), z(z) {} | |
| dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {} | | __host__ __device__ dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {} | |
| operator uint3(void) { uint3 t; t.x = x; t.y = y; t.z = z; return t; } | | __host__ __device__ operator uint3(void) { uint3 t; t.x = x; t.y = y; t.z = z; return t; } | |
| #endif /* __cplusplus */ | | #endif /* __cplusplus */ | |
| }; | | }; | |
| | | | |
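The defaulted dim3 constructor above is what lets a launch configuration spell a 1-D shape as a bare integer. A brief usage sketch with a hypothetical kernel:

    /* 'scale' is a hypothetical kernel, not from these headers. */
    __global__ void scale(float *p, float s)
    {
        p[blockIdx.x * blockDim.x + threadIdx.x] *= s;
    }

    void launch_scale(float *dev_p)
    {
        dim3 grid(64);    /* y and z default to 1 */
        dim3 block(256);  /* likewise             */
        scale<<<grid, block>>>(dev_p, 2.0f);  /* 64 * 256 threads */
    }

The uint3 conversion operator goes the other way, letting a dim3 value initialize variables of the built-in index type.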
|
| | | /*DEVICE_BUILTIN*/ | |
| | | typedef struct dim3 dim3; | |
| | | | |
| | | #undef __cuda_assign_operators | |
| | | #undef __cuda_builtin_vector_align8 | |
| | | | |
| #endif /* !__VECTOR_TYPES_H__ */ | | #endif /* !__VECTOR_TYPES_H__ */ | |
| | | | |
End of changes. 57 change blocks. |
| 46 lines changed or deleted | | 102 lines changed or added | |
|