| cublas.h | | cublas.h | |
| | | | |
| skipping to change at line 94 | | skipping to change at line 94 | |
| #define CUBLASAPI __stdcall | | #define CUBLASAPI __stdcall | |
| #else | | #else | |
| #define CUBLASAPI | | #define CUBLASAPI | |
| #endif | | #endif | |
| #endif | | #endif | |
| | | | |
| #if defined(__cplusplus) | | #if defined(__cplusplus) | |
| extern "C" { | | extern "C" { | |
| #endif /* __cplusplus */ | | #endif /* __cplusplus */ | |
| | | | |
|
| | | #include "driver_types.h" | |
| #include "cuComplex.h" /* import complex data type */ | | #include "cuComplex.h" /* import complex data type */ | |
| | | | |
| /* CUBLAS status returns */ | | /* CUBLAS status returns */ | |
| #define CUBLAS_STATUS_SUCCESS 0x00000000 | | #define CUBLAS_STATUS_SUCCESS 0x00000000 | |
| #define CUBLAS_STATUS_NOT_INITIALIZED 0x00000001 | | #define CUBLAS_STATUS_NOT_INITIALIZED 0x00000001 | |
| #define CUBLAS_STATUS_ALLOC_FAILED 0x00000003 | | #define CUBLAS_STATUS_ALLOC_FAILED 0x00000003 | |
| #define CUBLAS_STATUS_INVALID_VALUE 0x00000007 | | #define CUBLAS_STATUS_INVALID_VALUE 0x00000007 | |
| #define CUBLAS_STATUS_ARCH_MISMATCH 0x00000008 | | #define CUBLAS_STATUS_ARCH_MISMATCH 0x00000008 | |
| #define CUBLAS_STATUS_MAPPING_ERROR 0x0000000B | | #define CUBLAS_STATUS_MAPPING_ERROR 0x0000000B | |
| #define CUBLAS_STATUS_EXECUTION_FAILED 0x0000000D | | #define CUBLAS_STATUS_EXECUTION_FAILED 0x0000000D | |
| | | | |
| skipping to change at line 289 | | skipping to change at line 290 | |
| * ------------- | | * ------------- | |
 | * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized | | * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized | |
 | * CUBLAS_STATUS_INVALID_VALUE if rows, cols, elemSize, lda, or ldb <= 0 | | * CUBLAS_STATUS_INVALID_VALUE if rows, cols, elemSize, lda, or ldb <= 0 | |
| * CUBLAS_STATUS_MAPPING_ERROR if error occurred accessing GPU memory | | * CUBLAS_STATUS_MAPPING_ERROR if error occurred accessing GPU memory | |
| * CUBLAS_STATUS_SUCCESS if the operation completed successfully | | * CUBLAS_STATUS_SUCCESS if the operation completed successfully | |
| */ | | */ | |
| cublasStatus CUBLASAPI cublasGetMatrix (int rows, int cols, int elemSize, | | cublasStatus CUBLASAPI cublasGetMatrix (int rows, int cols, int elemSize, | |
| const void *A, int lda, void *B, | | const void *A, int lda, void *B, | |
| int ldb); | | int ldb); | |
| | | | |
|
| | | /* | |
| | | * cublasStatus | |
| | | * cublasSetKernelStream ( cudaStream_t stream ) | |
| | | * | |
 | | | * set the CUBLAS stream in which all subsequent CUBLAS kernel launches will run. | |
 | | | * By default, if the CUBLAS stream is not set, all kernels will use the NULL | |
 | | | * stream. This routine can be used to change the stream between kernel launches | |
 | | | * and can also be used to set the CUBLAS stream back to NULL. | |
 | | | * | |
 | | | * Return Values | |
 | | | * ------------- | |
 | | | * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized | |
| | | * CUBLAS_STATUS_SUCCESS if stream set successfully | |
| | | */ | |
| | | cublasStatus CUBLASAPI cublasSetKernelStream (cudaStream_t stream); | |
| | | | |
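A minimal usage sketch for the routine above (assumed setup: cublasInit() has
already succeeded, and d_x, d_y are device vectors of length n; the Saxpy call
is only an illustrative kernel launch):

    cudaStream_t stream;
    cudaStreamCreate(&stream);
    cublasSetKernelStream(stream);         /* BLAS kernels now launch in 'stream' */
    cublasSaxpy(n, 2.0f, d_x, 1, d_y, 1);  /* y = 2*x + y, queued asynchronously  */
    cudaStreamSynchronize(stream);         /* wait for the queued BLAS work       */
    cublasSetKernelStream(NULL);           /* restore the default NULL stream     */
    cudaStreamDestroy(stream);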
| | | /* | |
| | | * cublasStatus | |
| | | * cublasSetVectorAsync ( int n, int elemSize, const void *x, int incx, | |
| | | * void *y, int incy, cudaStream_t stream ); | |
| | | * | |
 | | | * cublasSetVectorAsync has the same functionality as cublasSetVector, | |
 | | | * but the transfer is done asynchronously within the CUDA stream passed | |
 | | | * as a parameter. | |
| | | * | |
| | | * Return Values | |
| | | * ------------- | |
 | | | * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized | |
| | | * CUBLAS_STATUS_INVALID_VALUE if incx, incy, or elemSize <= 0 | |
| | | * CUBLAS_STATUS_MAPPING_ERROR if an error occurred accessing GPU memory | |
| | | * CUBLAS_STATUS_SUCCESS if the operation completed successfully | |
| | | */ | |
| | | cublasStatus CUBLASAPI cublasSetVectorAsync (int n, int elemSize, | |
| | | const void *hostPtr, int incx, | |
| | | void *devicePtr, int incy, | |
| | | cudaStream_t stream); | |
| | | /* | |
| | | * cublasStatus | |
| | | * cublasGetVectorAsync( int n, int elemSize, const void *x, int incx, | |
| | | * void *y, int incy, cudaStream_t stream) | |
| | | * | |
 | | | * cublasGetVectorAsync has the same functionality as cublasGetVector, | |
 | | | * but the transfer is done asynchronously within the CUDA stream passed | |
 | | | * as a parameter. | |
| | | * | |
| | | * Return Values | |
| | | * ------------- | |
 | | | * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized | |
| | | * CUBLAS_STATUS_INVALID_VALUE if incx, incy, or elemSize <= 0 | |
| | | * CUBLAS_STATUS_MAPPING_ERROR if an error occurred accessing GPU memory | |
| | | * CUBLAS_STATUS_SUCCESS if the operation completed successfully | |
| | | */ | |
| | | cublasStatus CUBLASAPI cublasGetVectorAsync (int n, int elemSize, | |
 | | | const void *devicePtr, int incx, | |
| | | void *hostPtr, int incy, | |
| | | cudaStream_t stream); | |
| | | | |
| | | /* | |
| | | * cublasStatus | |
| | | * cublasSetMatrixAsync (int rows, int cols, int elemSize, const void *A, | |
| | | * int lda, void *B, int ldb, cudaStream_t stream) | |
| | | * | |
 | | | * cublasSetMatrixAsync has the same functionality as cublasSetMatrix, | |
 | | | * but the transfer is done asynchronously within the CUDA stream passed | |
 | | | * as a parameter. | |
| | | * | |
| | | * Return Values | |
| | | * ------------- | |
 | | | * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized | |
| | | * CUBLAS_STATUS_INVALID_VALUE if rows or cols < 0, or elemSize, lda, or | |
| | | * ldb <= 0 | |
| | | * CUBLAS_STATUS_MAPPING_ERROR if error occurred accessing GPU memory | |
| | | * CUBLAS_STATUS_SUCCESS if the operation completed successfully | |
| | | */ | |
 | | | cublasStatus CUBLASAPI cublasSetMatrixAsync (int rows, int cols, int elemSize, | |
 | | | const void *A, int lda, void *B, | |
| | | int ldb, cudaStream_t stream); | |
| | | | |
| | | /* | |
| | | * cublasStatus | |
| | | * cublasGetMatrixAsync (int rows, int cols, int elemSize, const void *A, | |
| | | * int lda, void *B, int ldb, cudaStream_t stream) | |
| | | * | |
 | | | * cublasGetMatrixAsync has the same functionality as cublasGetMatrix, | |
 | | | * but the transfer is done asynchronously within the CUDA stream passed | |
 | | | * as a parameter. | |
| | | * | |
| | | * Return Values | |
| | | * ------------- | |
 | | | * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized | |
 | | | * CUBLAS_STATUS_INVALID_VALUE if rows, cols, elemSize, lda, or ldb <= 0 | |
| | | * CUBLAS_STATUS_MAPPING_ERROR if error occurred accessing GPU memory | |
| | | * CUBLAS_STATUS_SUCCESS if the operation completed successfully | |
| | | */ | |
 | | | cublasStatus CUBLASAPI cublasGetMatrixAsync (int rows, int cols, int elemSize, | |
 | | | const void *A, int lda, void *B, | |
| | | int ldb, cudaStream_t stream); | |
| | | | |
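A transfer sketch for the async routines above (assumed setup: hostA was
allocated with cudaMallocHost so the copy can truly overlap, devA is a device
matrix with leading dimension lda, and stream is an already-created stream):

    cublasStatus s;
    s = cublasSetMatrixAsync(rows, cols, sizeof(float),
                             hostA, lda, devA, lda, stream);
    if (s == CUBLAS_STATUS_SUCCESS) {
        /* ... enqueue kernels in 'stream' or do host work here ... */
        cudaStreamSynchronize(stream);  /* hostA may be reused after this */
    }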
 | /* ---------------- CUBLAS single-precision BLAS1 functions ---------------- */ | | /* ---------------- CUBLAS single-precision BLAS1 functions ---------------- */ | |
| | | | |
| /* | | /* | |
| * int | | * int | |
| * cublasIsamax (int n, const float *x, int incx) | | * cublasIsamax (int n, const float *x, int incx) | |
| * | | * | |
| * finds the smallest index of the maximum magnitude element of single | | * finds the smallest index of the maximum magnitude element of single | |
| * precision vector x; that is, the result is the first i, i = 0 to n - 1, | | * precision vector x; that is, the result is the first i, i = 0 to n - 1, | |
 | * that maximizes abs(x[1 + i * incx]). | | * that maximizes abs(x[1 + i * incx]). | |
| * | | * | |
| | | | |
| skipping to change at line 574 | | skipping to change at line 673 | |
| * The quantity r = (+/-) sqrt (sa^2 + sb^2) overwrites sa in storage. The | | * The quantity r = (+/-) sqrt (sa^2 + sb^2) overwrites sa in storage. The | |
| * value of sb is overwritten by a value z which allows sc and ss to be | | * value of sb is overwritten by a value z which allows sc and ss to be | |
| * recovered by the following algorithm: | | * recovered by the following algorithm: | |
| * | | * | |
| * if z=1 set sc = 0.0 and ss = 1.0 | | * if z=1 set sc = 0.0 and ss = 1.0 | |
| * if abs(z) < 1 set sc = sqrt(1-z^2) and ss = z | | * if abs(z) < 1 set sc = sqrt(1-z^2) and ss = z | |
| * if abs(z) > 1 set sc = 1/z and ss = sqrt(1-sc^2) | | * if abs(z) > 1 set sc = 1/z and ss = sqrt(1-sc^2) | |
| * | | * | |
| * The function srot (n, x, incx, y, incy, sc, ss) normally is called next | | * The function srot (n, x, incx, y, incy, sc, ss) normally is called next | |
| * to apply the transformation to a 2 x n matrix. | | * to apply the transformation to a 2 x n matrix. | |
|
| * Note that is function is provided for completeness and run exclusively | | * Note that this function is provided for completeness and run exclusively | |
| * on the Host. | | * on the Host. | |
| * | | * | |
| * Input | | * Input | |
| * ----- | | * ----- | |
| * sa single precision scalar | | * sa single precision scalar | |
| * sb single precision scalar | | * sb single precision scalar | |
| * | | * | |
| * Output | | * Output | |
| * ------ | | * ------ | |
| * sa single precision r | | * sa single precision r | |
| | | | |
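The z-to-(sc, ss) reconstruction above transcribes directly into C; a small
host-side sketch:

    #include <math.h>

    static void srotg_recover(float z, float *sc, float *ss)
    {
        if (z == 1.0f) {               /* z = 1      */
            *sc = 0.0f;  *ss = 1.0f;
        } else if (fabsf(z) < 1.0f) {  /* abs(z) < 1 */
            *sc = sqrtf(1.0f - z * z);  *ss = z;
        } else {                       /* abs(z) > 1 */
            *sc = 1.0f / z;  *ss = sqrtf(1.0f - *sc * *sc);
        }
    }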
| skipping to change at line 662 | | skipping to change at line 761 | |
| * | | * | |
| * sflag = -1.0f sflag = 0.0f sflag = 1.0f sflag = -2.0f | | * sflag = -1.0f sflag = 0.0f sflag = 1.0f sflag = -2.0f | |
| * | | * | |
| * (sh00 sh01) (1.0f sh01) (sh00 1.0f) (1.0f 0.0f) | | * (sh00 sh01) (1.0f sh01) (sh00 1.0f) (1.0f 0.0f) | |
| * h = ( ) ( ) ( ) ( ) | | * h = ( ) ( ) ( ) ( ) | |
| * (sh10 sh11) (sh10 1.0f) (-1.0f sh11) (0.0f 1.0f) | | * (sh10 sh11) (sh10 1.0f) (-1.0f sh11) (0.0f 1.0f) | |
| * | | * | |
| * sparam[1] through sparam[4] contain sh00, sh10, sh01, sh11, | | * sparam[1] through sparam[4] contain sh00, sh10, sh01, sh11, | |
| * respectively. Values of 1.0f, -1.0f, or 0.0f implied by the value | | * respectively. Values of 1.0f, -1.0f, or 0.0f implied by the value | |
| * of sflag are not stored in sparam. | | * of sflag are not stored in sparam. | |
|
| * Note that is function is provided for completeness and run exclusively | | * Note that this function is provided for completeness and run exclusively | |
| * on the Host. | | * on the Host. | |
| * | | * | |
| * Input | | * Input | |
| * ----- | | * ----- | |
| * sd1 single precision scalar | | * sd1 single precision scalar | |
| * sd2 single precision scalar | | * sd2 single precision scalar | |
| * sx1 single precision scalar | | * sx1 single precision scalar | |
| * sy1 single precision scalar | | * sy1 single precision scalar | |
| * | | * | |
| * Output | | * Output | |
| | | | |
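Unpacking sparam into the full 2 x 2 matrix h is mechanical; a sketch that
fills in the 1.0f, -1.0f, and 0.0f entries implied by sflag:

    static void srotm_unpack(const float sparam[5], float h[2][2])
    {
        float sflag = sparam[0];
        h[0][0] = sparam[1];  h[1][0] = sparam[2];  /* sh00, sh10 */
        h[0][1] = sparam[3];  h[1][1] = sparam[4];  /* sh01, sh11 */
        if (sflag ==  0.0f) { h[0][0] = 1.0f;  h[1][1] = 1.0f; }
        if (sflag ==  1.0f) { h[0][1] = 1.0f;  h[1][0] = -1.0f; }
        if (sflag == -2.0f) { h[0][0] = 1.0f;  h[0][1] = 0.0f;
                              h[1][0] = 0.0f;  h[1][1] = 1.0f; }
    }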
| skipping to change at line 899 | | skipping to change at line 998 | |
| * | | * | |
| * ( sc cs ) | | * ( sc cs ) | |
| * G = ( ) , sc^2 + cabs(cs)^2 = 1, | | * G = ( ) , sc^2 + cabs(cs)^2 = 1, | |
| * (-cs sc ) | | * (-cs sc ) | |
| * | | * | |
| * which zeros the second entry of the complex 2-vector transpose(ca, cb). | | * which zeros the second entry of the complex 2-vector transpose(ca, cb). | |
| * | | * | |
| * The quantity ca/cabs(ca)*norm(ca,cb) overwrites ca in storage. The | | * The quantity ca/cabs(ca)*norm(ca,cb) overwrites ca in storage. The | |
| * function crot (n, x, incx, y, incy, sc, cs) is normally called next | | * function crot (n, x, incx, y, incy, sc, cs) is normally called next | |
| * to apply the transformation to a 2 x n matrix. | | * to apply the transformation to a 2 x n matrix. | |
|
| * Note that is function is provided for completeness and run exclusively | | * Note that this function is provided for completeness and run exclusively | |
| * on the Host. | | * on the Host. | |
| * | | * | |
| * Input | | * Input | |
| * ----- | | * ----- | |
 | * ca single-precision complex scalar | | * ca single-precision complex scalar | |
| * cb single-precision complex scalar | | * cb single-precision complex scalar | |
| * | | * | |
| * Output | | * Output | |
| * ------ | | * ------ | |
| * ca single-precision complex ca/cabs(ca)*norm(ca,cb) | | * ca single-precision complex ca/cabs(ca)*norm(ca,cb) | |
| | | | |
| skipping to change at line 1498 | | skipping to change at line 1597 | |
| * | | * | |
| * ( sc cs ) | | * ( sc cs ) | |
| * G = ( ) , sc^2 + cabs(cs)^2 = 1, | | * G = ( ) , sc^2 + cabs(cs)^2 = 1, | |
| * (-cs sc ) | | * (-cs sc ) | |
| * | | * | |
| * which zeros the second entry of the complex 2-vector transpose(ca, cb). | | * which zeros the second entry of the complex 2-vector transpose(ca, cb). | |
| * | | * | |
| * The quantity ca/cabs(ca)*norm(ca,cb) overwrites ca in storage. The | | * The quantity ca/cabs(ca)*norm(ca,cb) overwrites ca in storage. The | |
| * function crot (n, x, incx, y, incy, sc, cs) is normally called next | | * function crot (n, x, incx, y, incy, sc, cs) is normally called next | |
| * to apply the transformation to a 2 x n matrix. | | * to apply the transformation to a 2 x n matrix. | |
|
| * Note that is function is provided for completeness and run exclusively | | * Note that this function is provided for completeness and run exclusively | |
| * on the Host. | | * on the Host. | |
| * | | * | |
| * Input | | * Input | |
| * ----- | | * ----- | |
 | * ca double-precision complex scalar | | * ca double-precision complex scalar | |
| * cb double-precision complex scalar | | * cb double-precision complex scalar | |
| * | | * | |
| * Output | | * Output | |
| * ------ | | * ------ | |
| * ca double-precision complex ca/cabs(ca)*norm(ca,cb) | | * ca double-precision complex ca/cabs(ca)*norm(ca,cb) | |
| | | | |
| skipping to change at line 3398 | | skipping to change at line 3497 | |
| * | | * | |
| * Error Status | | * Error Status | |
| * ------------ | | * ------------ | |
 | * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized | | * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized | |
| * CUBLAS_STATUS_INVALID_VALUE if k or n < 0, or if incx or incy == 0 | | * CUBLAS_STATUS_INVALID_VALUE if k or n < 0, or if incx or incy == 0 | |
| * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU | | * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU | |
| */ | | */ | |
| void CUBLASAPI cublasChbmv (char uplo, int n, int k, cuComplex alpha, | | void CUBLASAPI cublasChbmv (char uplo, int n, int k, cuComplex alpha, | |
 | const cuComplex *A, int lda, const cuComplex *x, | | const cuComplex *A, int lda, const cuComplex *x, | |
 | int incx, cuComplex beta, cuComplex *y, int incy); | | int incx, cuComplex beta, cuComplex *y, int incy); | |
|
| | | | |
 | | | /* | |
 | | | * cublasStatus | |
 | | | * cublasChpmv (char uplo, int n, cuComplex alpha, const cuComplex *AP, const cuComplex *x, | |
 | | | * int incx, cuComplex beta, cuComplex *y, int incy) | |
 | | | * | |
 | | | * performs the matrix-vector operation | |
 | | | * | |
 | | | * y = alpha * A * x + beta * y | |
 | | | * | |
 | | | * Alpha and beta are single precision complex scalars, and x and y are single | |
 | | | * precision complex vectors with n elements. A is an hermitian n x n matrix | |
 | | | * consisting of single precision complex elements that is supplied in packed form. | |
 | | | * | |
 | | | * Input | |
 | | | * ----- | |
 | | | * uplo specifies whether the matrix data is stored in the upper or the lower | |
 | | | * triangular part of array AP. If uplo == 'U' or 'u', then the upper | |
 | | | * triangular part of A is supplied in AP. If uplo == 'L' or 'l', then | |
 | | | * the lower triangular part of A is supplied in AP. | |
 | | | * n specifies the number of rows and columns of the matrix A. It must be | |
 | | | * at least zero. | |
 | | | * alpha single precision complex scalar multiplier applied to A*x. | |
 | | | * AP single precision complex array with at least ((n * (n + 1)) / 2) elements. If | |
 | | | * uplo == 'U' or 'u', the array AP contains the upper triangular part | |
 | | | * of the hermitian matrix A, packed sequentially, column by column; | |
 | | | * that is, if i <= j, then A[i,j] is stored in AP[i+(j*(j+1)/2)]. If | |
 | | | * uplo == 'L' or 'l', the array AP contains the lower triangular part | |
 | | | * of the hermitian matrix A, packed sequentially, column by column; | |
 | | | * that is, if i >= j, then A[i,j] is stored in AP[i+((2*n-j+1)*j)/2]. | |
 | | | * The imaginary parts of the diagonal elements need not be set; they | |
 | | | * are assumed to be zero. | |
 | | | * x single precision complex array of length at least (1 + (n - 1) * abs(incx)). | |
 | | | * incx storage spacing between elements of x. incx must not be zero. | |
 | | | * beta single precision complex scalar multiplier applied to vector y. | |
 | | | * y single precision array of length at least (1 + (n - 1) * abs(incy)). | |
 | | | * If beta is zero, y is not read. | |
 | | | * incy storage spacing between elements of y. incy must not be zero. | |
 | | | * | |
 | | | * Output | |
 | | | * ------ | |
 | | | * y updated according to y = alpha*A*x + beta*y | |
 | | | * | |
 | | | * Reference: http://www.netlib.org/blas/chpmv.f | |
 | | | * | |
 | | | * Error status for this function can be retrieved via cublasGetError(). | |
 | | | * | |
 | | | * Error Status | |
 | | | * ------------ | |
 | | | * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized | |
 | | | * CUBLAS_STATUS_INVALID_VALUE if n < 0, or if incx or incy == 0 | |
 | | | * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU | |
 | | | */ | |
| void CUBLASAPI cublasChpmv (char uplo, int n, cuComplex alpha, | | void CUBLASAPI cublasChpmv (char uplo, int n, cuComplex alpha, | |
 | const cuComplex *AP, const cuComplex *x, int incx, | | const cuComplex *AP, const cuComplex *x, int incx, | |
| cuComplex beta, cuComplex *y, int incy); | | cuComplex beta, cuComplex *y, int incy); | |
| | | | |
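The two packed-storage formulas quoted above translate into index helpers
like the following sketch (i, j, and n follow the comment's conventions):

    static int ap_index_upper(int i, int j)        /* uplo = 'U', i <= j */
    {
        return i + (j * (j + 1)) / 2;
    }

    static int ap_index_lower(int i, int j, int n) /* uplo = 'L', i >= j */
    {
        return i + ((2 * n - j + 1) * j) / 2;
    }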
| /* | | /* | |
| * | | * | |
 | * cublasCtrmv (char uplo, char trans, char diag, int n, const cuComplex *A, | | * cublasCtrmv (char uplo, char trans, char diag, int n, const cuComplex *A, | |
| * int lda, cuComplex *x, int incx); | | * int lda, cuComplex *x, int incx); | |
| * | | * | |
| * performs one of the matrix-vector operations x = op(A) * x, | | * performs one of the matrix-vector operations x = op(A) * x, | |
| | | | |
| skipping to change at line 5494 | | skipping to change at line 5646 | |
| * The quantity r = (+/-) sqrt (sa^2 + sb^2) overwrites sa in storage. The | | * The quantity r = (+/-) sqrt (sa^2 + sb^2) overwrites sa in storage. The | |
| * value of sb is overwritten by a value z which allows sc and ss to be | | * value of sb is overwritten by a value z which allows sc and ss to be | |
| * recovered by the following algorithm: | | * recovered by the following algorithm: | |
| * | | * | |
| * if z=1 set sc = 0.0 and ss = 1.0 | | * if z=1 set sc = 0.0 and ss = 1.0 | |
| * if abs(z) < 1 set sc = sqrt(1-z^2) and ss = z | | * if abs(z) < 1 set sc = sqrt(1-z^2) and ss = z | |
| * if abs(z) > 1 set sc = 1/z and ss = sqrt(1-sc^2) | | * if abs(z) > 1 set sc = 1/z and ss = sqrt(1-sc^2) | |
| * | | * | |
| * The function drot (n, x, incx, y, incy, sc, ss) normally is called next | | * The function drot (n, x, incx, y, incy, sc, ss) normally is called next | |
| * to apply the transformation to a 2 x n matrix. | | * to apply the transformation to a 2 x n matrix. | |
|
| * Note that is function is provided for completeness and run exclusively | | * Note that this function is provided for completeness and run exclusively | |
| * on the Host. | | * on the Host. | |
| * | | * | |
| * Input | | * Input | |
| * ----- | | * ----- | |
| * sa double-precision scalar | | * sa double-precision scalar | |
| * sb double-precision scalar | | * sb double-precision scalar | |
| * | | * | |
| * Output | | * Output | |
| * ------ | | * ------ | |
| * sa double-precision r | | * sa double-precision r | |
| | | | |
| skipping to change at line 5535 | | skipping to change at line 5687 | |
| * The elements of x are in x[lx + i * incx], i = 0 to n-1, where lx = 1 if | | * The elements of x are in x[lx + i * incx], i = 0 to n-1, where lx = 1 if | |
 | * incx >= 0, else lx = 1 + (1 - n) * incx, and similarly for y using ly and | | * incx >= 0, else lx = 1 + (1 - n) * incx, and similarly for y using ly and | |
| * incy. With sparam[0] = sflag, h has one of the following forms: | | * incy. With sparam[0] = sflag, h has one of the following forms: | |
| * | | * | |
| * sflag = -1.0 sflag = 0.0 sflag = 1.0 sflag = -2.0 | | * sflag = -1.0 sflag = 0.0 sflag = 1.0 sflag = -2.0 | |
| * | | * | |
| * (sh00 sh01) (1.0 sh01) (sh00 1.0) (1.0 0.0) | | * (sh00 sh01) (1.0 sh01) (sh00 1.0) (1.0 0.0) | |
| * h = ( ) ( ) ( ) ( ) | | * h = ( ) ( ) ( ) ( ) | |
| * (sh10 sh11) (sh10 1.0) (-1.0 sh11) (0.0 1.0) | | * (sh10 sh11) (sh10 1.0) (-1.0 sh11) (0.0 1.0) | |
| * | | * | |
|
| * Note that is function is provided for completeness and run exclusively | | * Note that this function is provided for completeness and run exclusively | |
| * on the Host. | | * on the Host. | |
| * | | * | |
| * Input | | * Input | |
| * ----- | | * ----- | |
| * n number of elements in input vectors | | * n number of elements in input vectors | |
| * x double-precision vector with n elements | | * x double-precision vector with n elements | |
| * incx storage spacing between elements of x | | * incx storage spacing between elements of x | |
| * y double-precision vector with n elements | | * y double-precision vector with n elements | |
| * incy storage spacing between elements of y | | * incy storage spacing between elements of y | |
| * sparam 5-element vector. sparam[0] is sflag described above. sparam[1] | | * sparam 5-element vector. sparam[0] is sflag described above. sparam[1] | |
| | | | |
| skipping to change at line 5586 | | skipping to change at line 5738 | |
| * | | * | |
| * sflag = -1.0 sflag = 0.0 sflag = 1.0 sflag = -2.0 | | * sflag = -1.0 sflag = 0.0 sflag = 1.0 sflag = -2.0 | |
| * | | * | |
| * (sh00 sh01) (1.0 sh01) (sh00 1.0) (1.0 0.0) | | * (sh00 sh01) (1.0 sh01) (sh00 1.0) (1.0 0.0) | |
| * h = ( ) ( ) ( ) ( ) | | * h = ( ) ( ) ( ) ( ) | |
| * (sh10 sh11) (sh10 1.0) (-1.0 sh11) (0.0 1.0) | | * (sh10 sh11) (sh10 1.0) (-1.0 sh11) (0.0 1.0) | |
| * | | * | |
| * sparam[1] through sparam[4] contain sh00, sh10, sh01, sh11, | | * sparam[1] through sparam[4] contain sh00, sh10, sh01, sh11, | |
| * respectively. Values of 1.0, -1.0, or 0.0 implied by the value | | * respectively. Values of 1.0, -1.0, or 0.0 implied by the value | |
| * of sflag are not stored in sparam. | | * of sflag are not stored in sparam. | |
|
| * Note that is function is provided for completeness and run exclusively | | * Note that this function is provided for completeness and run exclusively | |
| * on the Host. | | * on the Host. | |
| * | | * | |
| * Input | | * Input | |
| * ----- | | * ----- | |
 | * sd1 double-precision scalar | | * sd1 double-precision scalar | |
 | * sd2 double-precision scalar | | * sd2 double-precision scalar | |
 | * sx1 double-precision scalar | | * sx1 double-precision scalar | |
 | * sy1 double-precision scalar | | * sy1 double-precision scalar | |
| * | | * | |
| * Output | | * Output | |
| | | | |
End of changes. 10 change blocks. |
| 7 lines changed or deleted | | 187 lines changed or added | |
|
| cuda.h | | cuda.h | |
| | | | |
| skipping to change at line 57 | | skipping to change at line 57 | |
| | | | |
| /** | | /** | |
| * \defgroup CUDA_TYPES Data types used by CUDA driver | | * \defgroup CUDA_TYPES Data types used by CUDA driver | |
| * \ingroup CUDA_DRIVER | | * \ingroup CUDA_DRIVER | |
| * @{ | | * @{ | |
| */ | | */ | |
| | | | |
| /** | | /** | |
| * CUDA API version number | | * CUDA API version number | |
| */ | | */ | |
|
| #define CUDA_VERSION 3000 /* 3.0 */ | | #define CUDA_VERSION 3010 /* 3.1 */ | |
| | | | |
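The bumped version macro is the usual compile-time switch for 3.1-only
features; for example:

    #include <cuda.h>

    #if CUDA_VERSION >= 3010
    /* CUlimit, cuCtxSetLimit, cuSurfRefSetArray, ... may be referenced */
    #endif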
| #ifdef __cplusplus | | #ifdef __cplusplus | |
| extern "C" { | | extern "C" { | |
| #endif | | #endif | |
| typedef unsigned int CUdeviceptr; ///< CUDA device pointer | | typedef unsigned int CUdeviceptr; ///< CUDA device pointer | |
| | | | |
| typedef int CUdevice; ///< CUDA device | | typedef int CUdevice; ///< CUDA device | |
| typedef struct CUctx_st *CUcontext; ///< CUDA context | | typedef struct CUctx_st *CUcontext; ///< CUDA context | |
| typedef struct CUmod_st *CUmodule; ///< CUDA module | | typedef struct CUmod_st *CUmodule; ///< CUDA module | |
| typedef struct CUfunc_st *CUfunction; ///< CUDA function | | typedef struct CUfunc_st *CUfunction; ///< CUDA function | |
| typedef struct CUarray_st *CUarray; ///< CUDA array | | typedef struct CUarray_st *CUarray; ///< CUDA array | |
| typedef struct CUtexref_st *CUtexref; ///< CUDA texture reference | | typedef struct CUtexref_st *CUtexref; ///< CUDA texture reference | |
|
| | | typedef struct CUsurfref_st *CUsurfref; ///< CUDA surface reference | |
| typedef struct CUevent_st *CUevent; ///< CUDA event | | typedef struct CUevent_st *CUevent; ///< CUDA event | |
| typedef struct CUstream_st *CUstream; ///< CUDA stream | | typedef struct CUstream_st *CUstream; ///< CUDA stream | |
 | typedef struct CUgraphicsResource_st *CUgraphicsResource; ///< CUDA graphics interop resource | | typedef struct CUgraphicsResource_st *CUgraphicsResource; ///< CUDA graphics interop resource | |
| | | | |
| typedef struct CUuuid_st { ///< CUDA definition of UUID | | typedef struct CUuuid_st { ///< CUDA definition of UUID | |
| char bytes[16]; | | char bytes[16]; | |
| } CUuuid; | | } CUuuid; | |
| | | | |
| /************************************ | | /************************************ | |
| ** | | ** | |
| | | | |
| skipping to change at line 175 | | skipping to change at line 176 | |
 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, ///< Maximum 2D texture width | | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, ///< Maximum 2D texture width | |
 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, ///< Maximum 2D texture height | | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, ///< Maximum 2D texture height | |
 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, ///< Maximum 3D texture width | | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, ///< Maximum 3D texture width | |
 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, ///< Maximum 3D texture height | | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, ///< Maximum 3D texture height | |
 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, ///< Maximum 3D texture depth | | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, ///< Maximum 3D texture depth | |
 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, ///< Maximum texture array width | | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, ///< Maximum texture array width | |
 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28, ///< Maximum texture array height | | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28, ///< Maximum texture array height | |
 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, ///< Maximum slices in a texture array | | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, ///< Maximum slices in a texture array | |
 | CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, ///< Alignment requirement for surfaces | | CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, ///< Alignment requirement for surfaces | |
 | CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, ///< Device can possibly execute multiple kernels concurrently | | CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, ///< Device can possibly execute multiple kernels concurrently | |
|
 | CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32 ///< Device has ECC support enabled | | CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, ///< Device has ECC support enabled | |
 | | | CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, ///< PCI bus ID of the device | |
 | | | CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34 ///< PCI device ID of the device | |
| } CUdevice_attribute; | | } CUdevice_attribute; | |
| | | | |
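A query sketch for the two new PCI attributes (assumes cuInit(0) has run and
device ordinal 0 exists):

    int bus = 0, slot = 0;
    CUdevice dev;
    cuDeviceGet(&dev, 0);
    cuDeviceGetAttribute(&bus,  CU_DEVICE_ATTRIBUTE_PCI_BUS_ID,    dev);
    cuDeviceGetAttribute(&slot, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev);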
| /** | | /** | |
| * Legacy device properties | | * Legacy device properties | |
| */ | | */ | |
| typedef struct CUdevprop_st { | | typedef struct CUdevprop_st { | |
| int maxThreadsPerBlock; ///< Maximum number of threads per block | | int maxThreadsPerBlock; ///< Maximum number of threads per block | |
 | int maxThreadsDim[3]; ///< Maximum size of each dimension of a block | | int maxThreadsDim[3]; ///< Maximum size of each dimension of a block | |
 | int maxGridSize[3]; ///< Maximum size of each dimension of a grid | | int maxGridSize[3]; ///< Maximum size of each dimension of a grid | |
 | int sharedMemPerBlock; ///< Shared memory available per block in bytes | | int sharedMemPerBlock; ///< Shared memory available per block in bytes | |
| | | | |
| skipping to change at line 413 | | skipping to change at line 416 | |
| */ | | */ | |
| typedef enum CUarray_cubemap_face_enum { | | typedef enum CUarray_cubemap_face_enum { | |
| CU_CUBEMAP_FACE_POSITIVE_X = 0x00, ///< Positive X face of cubemap | | CU_CUBEMAP_FACE_POSITIVE_X = 0x00, ///< Positive X face of cubemap | |
| CU_CUBEMAP_FACE_NEGATIVE_X = 0x01, ///< Negative X face of cubemap | | CU_CUBEMAP_FACE_NEGATIVE_X = 0x01, ///< Negative X face of cubemap | |
| CU_CUBEMAP_FACE_POSITIVE_Y = 0x02, ///< Positive Y face of cubemap | | CU_CUBEMAP_FACE_POSITIVE_Y = 0x02, ///< Positive Y face of cubemap | |
| CU_CUBEMAP_FACE_NEGATIVE_Y = 0x03, ///< Negative Y face of cubemap | | CU_CUBEMAP_FACE_NEGATIVE_Y = 0x03, ///< Negative Y face of cubemap | |
| CU_CUBEMAP_FACE_POSITIVE_Z = 0x04, ///< Positive Z face of cubemap | | CU_CUBEMAP_FACE_POSITIVE_Z = 0x04, ///< Positive Z face of cubemap | |
| CU_CUBEMAP_FACE_NEGATIVE_Z = 0x05 ///< Negative Z face of cubemap | | CU_CUBEMAP_FACE_NEGATIVE_Z = 0x05 ///< Negative Z face of cubemap | |
| } CUarray_cubemap_face; | | } CUarray_cubemap_face; | |
| | | | |
|
| | | /** | |
| | | * Limits | |
| | | */ | |
| | | typedef enum CUlimit_enum { | |
| | | CU_LIMIT_STACK_SIZE = 0x00, ///< GPU thread stack size | |
| | | CU_LIMIT_PRINTF_FIFO_SIZE = 0x01 ///< GPU printf FIFO size | |
| | | } CUlimit; | |
| | | | |
| /************************************ | | /************************************ | |
| ** | | ** | |
| ** Error codes | | ** Error codes | |
| ** | | ** | |
| ***********************************/ | | ***********************************/ | |
| | | | |
| /** | | /** | |
| * Error codes | | * Error codes | |
| */ | | */ | |
| typedef enum cudaError_enum { | | typedef enum cudaError_enum { | |
| | | | |
|
| CUDA_SUCCESS = 0, ///< No errors | | CUDA_SUCCESS = 0, ///< No errors | |
| CUDA_ERROR_INVALID_VALUE = 1, ///< Invalid value | | CUDA_ERROR_INVALID_VALUE = 1, ///< Invalid value | |
| CUDA_ERROR_OUT_OF_MEMORY = 2, ///< Out of memory | | CUDA_ERROR_OUT_OF_MEMORY = 2, ///< Out of memory | |
 | CUDA_ERROR_NOT_INITIALIZED = 3, ///< Driver not initialized | | CUDA_ERROR_NOT_INITIALIZED = 3, ///< Driver not initialized | |
 | CUDA_ERROR_DEINITIALIZED = 4, ///< Driver deinitialized | | CUDA_ERROR_DEINITIALIZED = 4, ///< Driver deinitialized | |
| | | | |
|
 | CUDA_ERROR_NO_DEVICE = 100, ///< No CUDA-capable device available | | CUDA_ERROR_NO_DEVICE = 100, ///< No CUDA-capable device available | |
| CUDA_ERROR_INVALID_DEVICE = 101, ///< Invalid device | | CUDA_ERROR_INVALID_DEVICE = 101, ///< Invalid device | |
| | | | |
|
 | CUDA_ERROR_INVALID_IMAGE = 200, ///< Invalid kernel image | | CUDA_ERROR_INVALID_IMAGE = 200, ///< Invalid kernel image | |
 | CUDA_ERROR_INVALID_CONTEXT = 201, ///< Invalid context | | CUDA_ERROR_INVALID_CONTEXT = 201, ///< Invalid context | |
 | CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202, ///< Context already current | | CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202, ///< Context already current | |
 | CUDA_ERROR_MAP_FAILED = 205, ///< Map failed | | CUDA_ERROR_MAP_FAILED = 205, ///< Map failed | |
 | CUDA_ERROR_UNMAP_FAILED = 206, ///< Unmap failed | | CUDA_ERROR_UNMAP_FAILED = 206, ///< Unmap failed | |
 | CUDA_ERROR_ARRAY_IS_MAPPED = 207, ///< Array is mapped | | CUDA_ERROR_ARRAY_IS_MAPPED = 207, ///< Array is mapped | |
 | CUDA_ERROR_ALREADY_MAPPED = 208, ///< Already mapped | | CUDA_ERROR_ALREADY_MAPPED = 208, ///< Already mapped | |
 | CUDA_ERROR_NO_BINARY_FOR_GPU = 209, ///< No binary for GPU | | CUDA_ERROR_NO_BINARY_FOR_GPU = 209, ///< No binary for GPU | |
 | CUDA_ERROR_ALREADY_ACQUIRED = 210, ///< Already acquired | | CUDA_ERROR_ALREADY_ACQUIRED = 210, ///< Already acquired | |
 | CUDA_ERROR_NOT_MAPPED = 211, ///< Not mapped | | CUDA_ERROR_NOT_MAPPED = 211, ///< Not mapped | |
 | CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212, ///< Mapped resource not available for access as an array | | CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212, ///< Mapped resource not available for access as an array | |
 | CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213, ///< Mapped resource not available for access as a pointer | | CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213, ///< Mapped resource not available for access as a pointer | |
 | CUDA_ERROR_ECC_UNCORRECTABLE = 214, ///< Uncorrectable ECC error detected | | CUDA_ERROR_ECC_UNCORRECTABLE = 214, ///< Uncorrectable ECC error detected | |
 | | | CUDA_ERROR_UNSUPPORTED_LIMIT = 215, ///< CUlimit not supported by device | |
| | | | |
|
| CUDA_ERROR_INVALID_SOURCE = 300, ///< Invalid source | | CUDA_ERROR_INVALID_SOURCE = 300, ///< Invalid source | |
| CUDA_ERROR_FILE_NOT_FOUND = 301, ///< File not found | | CUDA_ERROR_FILE_NOT_FOUND = 301, ///< File not found | |
 | | | CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302, ///< Link to a shared object failed to resolve | |
 | | | CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303, ///< Shared object initialization failed | |
| | | | |
|
| CUDA_ERROR_INVALID_HANDLE = 400, ///< Invalid handle | | CUDA_ERROR_INVALID_HANDLE = 400, ///< Invalid handle | |
| | | | |
|
| CUDA_ERROR_NOT_FOUND = 500, ///< Not found | | CUDA_ERROR_NOT_FOUND = 500, ///< Not found | |
| | | | |
|
| CUDA_ERROR_NOT_READY = 600, ///< CUDA not ready | | CUDA_ERROR_NOT_READY = 600, ///< CUDA not ready | |
| | | | |
|
| CUDA_ERROR_LAUNCH_FAILED = 700, ///< Launch failed | | CUDA_ERROR_LAUNCH_FAILED = 700, ///< Launch failed | |
 | CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701, ///< Launch exceeded resources | | CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701, ///< Launch exceeded resources | |
 | CUDA_ERROR_LAUNCH_TIMEOUT = 702, ///< Launch exceeded timeout | | CUDA_ERROR_LAUNCH_TIMEOUT = 702, ///< Launch exceeded timeout | |
 | CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703, ///< Launch with incompatible texturing | | CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703, ///< Launch with incompatible texturing | |
| | | | |
|
 | CUDA_ERROR_POINTER_IS_64BIT = 800, ///< Attempted to retrieve 64-bit pointer via 32-bit API function | | CUDA_ERROR_POINTER_IS_64BIT = 800, ///< Attempted to retrieve 64-bit pointer via 32-bit API function | |
 | CUDA_ERROR_SIZE_IS_64BIT = 801, ///< Attempted to retrieve 64-bit size via 32-bit API function | | CUDA_ERROR_SIZE_IS_64BIT = 801, ///< Attempted to retrieve 64-bit size via 32-bit API function | |
| | | | |
|
| CUDA_ERROR_UNKNOWN = 999 ///< Unknown error | | CUDA_ERROR_UNKNOWN = 999 ///< Unknown error | |
| } CUresult; | | } CUresult; | |
| | | | |
| /** | | /** | |
| * If set, host memory is portable between CUDA contexts. | | * If set, host memory is portable between CUDA contexts. | |
| * Flag for ::cuMemHostAlloc() | | * Flag for ::cuMemHostAlloc() | |
| */ | | */ | |
| #define CU_MEMHOSTALLOC_PORTABLE 0x01 | | #define CU_MEMHOSTALLOC_PORTABLE 0x01 | |
| | | | |
| /** | | /** | |
| * If set, host memory is mapped into CUDA address space and | | * If set, host memory is mapped into CUDA address space and | |
| | | | |
| skipping to change at line 579 | | skipping to change at line 593 | |
| CUarray_format Format; ///< Array format | | CUarray_format Format; ///< Array format | |
| | | | |
| unsigned int NumChannels; ///< Channels per array element | | unsigned int NumChannels; ///< Channels per array element | |
| | | | |
| unsigned int Flags; ///< Flags | | unsigned int Flags; ///< Flags | |
| } CUDA_ARRAY3D_DESCRIPTOR; | | } CUDA_ARRAY3D_DESCRIPTOR; | |
| | | | |
| // if set, the CUDA array contains an array of 2D slices | | // if set, the CUDA array contains an array of 2D slices | |
| // and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies | | // and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies | |
| // the number of slices, not the depth of a 3D array. | | // the number of slices, not the depth of a 3D array. | |
|
| #define CUDA_ARRAY3D_2DARRAY 0x01 | | #define CUDA_ARRAY3D_2DARRAY 0x01 | |
| | | | |
| | | // this flag must be set in order to bind a surface reference | |
| | | // to the CUDA array | |
| | | #define CUDA_ARRAY3D_SURFACE_LDST 0x02 | |
| | | | |
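A creation sketch for a surface-capable array (the sizes and format are
arbitrary placeholders):

    CUDA_ARRAY3D_DESCRIPTOR desc = {0};
    CUarray arr;
    desc.Width       = 512;
    desc.Height      = 512;
    desc.Depth       = 0;
    desc.Format      = CU_AD_FORMAT_UNSIGNED_INT8;
    desc.NumChannels = 4;
    desc.Flags       = CUDA_ARRAY3D_SURFACE_LDST;  /* required for surface binding */
    cuArray3DCreate(&arr, &desc);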
| /** | | /** | |
| * Override the texref format with a format inferred from the array. | | * Override the texref format with a format inferred from the array. | |
| * Flag for ::cuTexRefSetArray() | | * Flag for ::cuTexRefSetArray() | |
| */ | | */ | |
| #define CU_TRSA_OVERRIDE_FORMAT 0x01 | | #define CU_TRSA_OVERRIDE_FORMAT 0x01 | |
| | | | |
| /** | | /** | |
| * Read the texture as integers rather than promoting the values to floats | | * Read the texture as integers rather than promoting the values to floats | |
| * in the range [0,1]. | | * in the range [0,1]. | |
| | | | |
| skipping to change at line 668 | | skipping to change at line 686 | |
| ***********************************/ | | ***********************************/ | |
| | | | |
| CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname); | | CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname); | |
 | CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image); | | CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image); | |
 | CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues); | | CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues); | |
 | CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin); | | CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin); | |
 | CUresult CUDAAPI cuModuleUnload(CUmodule hmod); | | CUresult CUDAAPI cuModuleUnload(CUmodule hmod); | |
 | CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name); | | CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name); | |
 | CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, unsigned int *bytes, CUmodule hmod, const char *name); | | CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, unsigned int *bytes, CUmodule hmod, const char *name); | |
 | CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name); | | CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name); | |
 | | | CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name); | |
| | | | |
| /************************************ | | /************************************ | |
| ** | | ** | |
| ** Memory management | | ** Memory management | |
| ** | | ** | |
| ***********************************/ | | ***********************************/ | |
| | | | |
| CUresult CUDAAPI cuMemGetInfo(unsigned int *free, unsigned int *total); | | CUresult CUDAAPI cuMemGetInfo(unsigned int *free, unsigned int *total); | |
| | | | |
| CUresult CUDAAPI cuMemAlloc( CUdeviceptr *dptr, unsigned int bytesize); | | CUresult CUDAAPI cuMemAlloc( CUdeviceptr *dptr, unsigned int bytesize); | |
| | | | |
| skipping to change at line 715 | | skipping to change at line 734 | |
| | | | |
| // 1D functions | | // 1D functions | |
| // system <-> device memory | | // system <-> device memory | |
 | CUresult CUDAAPI cuMemcpyHtoD (CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount ); | | CUresult CUDAAPI cuMemcpyHtoD (CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount ); | |
 | CUresult CUDAAPI cuMemcpyDtoH (void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount ); | | CUresult CUDAAPI cuMemcpyDtoH (void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount ); | |
| | | | |
| // device <-> device memory | | // device <-> device memory | |
 | CUresult CUDAAPI cuMemcpyDtoD (CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount ); | | CUresult CUDAAPI cuMemcpyDtoD (CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount ); | |
| | | | |
| // device <-> array memory | | // device <-> array memory | |
|
 | CUresult CUDAAPI cuMemcpyDtoA ( CUarray dstArray, unsigned int dstIndex, CUdeviceptr srcDevice, unsigned int ByteCount ); | | CUresult CUDAAPI cuMemcpyDtoA ( CUarray dstArray, unsigned int dstOffset, CUdeviceptr srcDevice, unsigned int ByteCount ); | |
 | CUresult CUDAAPI cuMemcpyAtoD ( CUdeviceptr dstDevice, CUarray hSrc, unsigned int SrcIndex, unsigned int ByteCount ); | | CUresult CUDAAPI cuMemcpyAtoD ( CUdeviceptr dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount ); | |
| | | | |
| // system <-> array memory | | // system <-> array memory | |
|
 | CUresult CUDAAPI cuMemcpyHtoA( CUarray dstArray, unsigned int dstIndex, const void *pSrc, unsigned int ByteCount ); | | CUresult CUDAAPI cuMemcpyHtoA( CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount ); | |
 | CUresult CUDAAPI cuMemcpyAtoH( void *dstHost, CUarray srcArray, unsigned int srcIndex, unsigned int ByteCount ); | | CUresult CUDAAPI cuMemcpyAtoH( void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount ); | |
| | | | |
| // array <-> array memory | | // array <-> array memory | |
|
 | CUresult CUDAAPI cuMemcpyAtoA( CUarray dstArray, unsigned int dstIndex, CUarray srcArray, unsigned int srcIndex, unsigned int ByteCount ); | | CUresult CUDAAPI cuMemcpyAtoA( CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount ); | |
| | | | |
| // 2D memcpy | | // 2D memcpy | |
| | | | |
| CUresult CUDAAPI cuMemcpy2D( const CUDA_MEMCPY2D *pCopy ); | | CUresult CUDAAPI cuMemcpy2D( const CUDA_MEMCPY2D *pCopy ); | |
 | CUresult CUDAAPI cuMemcpy2DUnaligned( const CUDA_MEMCPY2D *pCopy ); | | CUresult CUDAAPI cuMemcpy2DUnaligned( const CUDA_MEMCPY2D *pCopy ); | |
| | | | |
| // 3D memcpy | | // 3D memcpy | |
| | | | |
| CUresult CUDAAPI cuMemcpy3D( const CUDA_MEMCPY3D *pCopy ); | | CUresult CUDAAPI cuMemcpy3D( const CUDA_MEMCPY3D *pCopy ); | |
| | | | |
| | | | |
| skipping to change at line 757 | | skipping to change at line 776 | |
| CUresult CUDAAPI cuMemcpyHtoDAsync (CUdeviceptr dstDevice, | | CUresult CUDAAPI cuMemcpyHtoDAsync (CUdeviceptr dstDevice, | |
 | const void *srcHost, unsigned int ByteCount, CUstream hStream ); | | const void *srcHost, unsigned int ByteCount, CUstream hStream ); | |
| CUresult CUDAAPI cuMemcpyDtoHAsync (void *dstHost, | | CUresult CUDAAPI cuMemcpyDtoHAsync (void *dstHost, | |
 | CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream ); | | CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream ); | |
| | | | |
| // device <-> device memory | | // device <-> device memory | |
| CUresult CUDAAPI cuMemcpyDtoDAsync (CUdeviceptr dstDevice, | | CUresult CUDAAPI cuMemcpyDtoDAsync (CUdeviceptr dstDevice, | |
 | CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream ); | | CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream ); | |
| | | | |
| // system <-> array memory | | // system <-> array memory | |
|
 | CUresult CUDAAPI cuMemcpyHtoAAsync( CUarray dstArray, unsigned int dstIndex, | | CUresult CUDAAPI cuMemcpyHtoAAsync( CUarray dstArray, unsigned int dstOffset, | |
 | const void *pSrc, unsigned int ByteCount, CUstream hStream ); | | const void *srcHost, unsigned int ByteCount, CUstream hStream ); | |
 | CUresult CUDAAPI cuMemcpyAtoHAsync( void *dstHost, CUarray srcArray, unsigned int srcIndex, | | CUresult CUDAAPI cuMemcpyAtoHAsync( void *dstHost, CUarray srcArray, unsigned int srcOffset, | |
 | unsigned int ByteCount, CUstream hStream ); | | unsigned int ByteCount, CUstream hStream ); | |
| | | | |
| // 2D memcpy | | // 2D memcpy | |
 | CUresult CUDAAPI cuMemcpy2DAsync( const CUDA_MEMCPY2D *pCopy, CUstream hStream ); | | CUresult CUDAAPI cuMemcpy2DAsync( const CUDA_MEMCPY2D *pCopy, CUstream hStream ); | |
| | | | |
| // 3D memcpy | | // 3D memcpy | |
 | CUresult CUDAAPI cuMemcpy3DAsync( const CUDA_MEMCPY3D *pCopy, CUstream hStream ); | | CUresult CUDAAPI cuMemcpy3DAsync( const CUDA_MEMCPY3D *pCopy, CUstream hStream ); | |
| | | | |
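The async copies return before the transfer completes; a typical overlap
sketch (srcHost is assumed page-locked, e.g. from cuMemHostAlloc):

    cuMemcpyHtoDAsync(dstDevice, srcHost, ByteCount, hStream);
    /* ... independent host work runs while the DMA is in flight ... */
    cuStreamSynchronize(hStream);   /* srcHost may be reused after this */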
| /************************************ | | /************************************ | |
| ** | | ** | |
| | | | |
| skipping to change at line 830 | | skipping to change at line 849 | |
| | | | |
 | CUresult CUDAAPI cuTexRefGetAddress( CUdeviceptr *pdptr, CUtexref hTexRef ); | | CUresult CUDAAPI cuTexRefGetAddress( CUdeviceptr *pdptr, CUtexref hTexRef ); | |
 | CUresult CUDAAPI cuTexRefGetArray( CUarray *phArray, CUtexref hTexRef ); | | CUresult CUDAAPI cuTexRefGetArray( CUarray *phArray, CUtexref hTexRef ); | |
 | CUresult CUDAAPI cuTexRefGetAddressMode( CUaddress_mode *pam, CUtexref hTexRef, int dim ); | | CUresult CUDAAPI cuTexRefGetAddressMode( CUaddress_mode *pam, CUtexref hTexRef, int dim ); | |
 | CUresult CUDAAPI cuTexRefGetFilterMode( CUfilter_mode *pfm, CUtexref hTexRef ); | | CUresult CUDAAPI cuTexRefGetFilterMode( CUfilter_mode *pfm, CUtexref hTexRef ); | |
 | CUresult CUDAAPI cuTexRefGetFormat( CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef ); | | CUresult CUDAAPI cuTexRefGetFormat( CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef ); | |
 | CUresult CUDAAPI cuTexRefGetFlags( unsigned int *pFlags, CUtexref hTexRef ); | | CUresult CUDAAPI cuTexRefGetFlags( unsigned int *pFlags, CUtexref hTexRef ); | |
| | | | |
| /************************************ | | /************************************ | |
| ** | | ** | |
|
| | | ** Surface reference management | |
| | | ** | |
| | | ***********************************/ | |
| | | | |
 | | | CUresult CUDAAPI cuSurfRefSetArray( CUsurfref hSurfRef, CUarray hArray, unsigned int Flags ); | |
 | | | CUresult CUDAAPI cuSurfRefGetArray( CUarray *phArray, CUsurfref hSurfRef ); | |
| | | | |
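A binding sketch for the new surface API (assumed setup: hmod is a loaded
module declaring a surface named "surf", arr was created with
CUDA_ARRAY3D_SURFACE_LDST, and the Flags argument is assumed to be 0):

    CUsurfref surfRef;
    cuModuleGetSurfRef(&surfRef, hmod, "surf");
    cuSurfRefSetArray(surfRef, arr, 0);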
| | | /************************************ | |
| | | ** | |
| ** Parameter management | | ** Parameter management | |
| ** | | ** | |
| ***********************************/ | | ***********************************/ | |
| | | | |
 | CUresult CUDAAPI cuParamSetSize (CUfunction hfunc, unsigned int numbytes); | | CUresult CUDAAPI cuParamSetSize (CUfunction hfunc, unsigned int numbytes); | |
 | CUresult CUDAAPI cuParamSeti (CUfunction hfunc, int offset, unsigned int value); | | CUresult CUDAAPI cuParamSeti (CUfunction hfunc, int offset, unsigned int value); | |
 | CUresult CUDAAPI cuParamSetf (CUfunction hfunc, int offset, float value); | | CUresult CUDAAPI cuParamSetf (CUfunction hfunc, int offset, float value); | |
 | CUresult CUDAAPI cuParamSetv (CUfunction hfunc, int offset, void *ptr, unsigned int numbytes); | | CUresult CUDAAPI cuParamSetv (CUfunction hfunc, int offset, void *ptr, unsigned int numbytes); | |
 | CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef); | | CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef); | |
| | | | |
| | | | |
| skipping to change at line 891 | | skipping to change at line 919 | |
 | CUresult CUDAAPI cuGraphicsMapResources( unsigned int count, CUgraphicsResource *resources, CUstream hStream ); | | CUresult CUDAAPI cuGraphicsMapResources( unsigned int count, CUgraphicsResource *resources, CUstream hStream ); | |
 | CUresult CUDAAPI cuGraphicsUnmapResources( unsigned int count, CUgraphicsResource *resources, CUstream hStream ); | | CUresult CUDAAPI cuGraphicsUnmapResources( unsigned int count, CUgraphicsResource *resources, CUstream hStream ); | |
| | | | |
| /************************************ | | /************************************ | |
| ** | | ** | |
| ** Export tables | | ** Export tables | |
| ** | | ** | |
| ***********************************/ | | ***********************************/ | |
 | CUresult CUDAAPI cuGetExportTable( const void **ppExportTable, const CUuuid *pExportTableId ); | | CUresult CUDAAPI cuGetExportTable( const void **ppExportTable, const CUuuid *pExportTableId ); | |
| | | | |
|
| | | /************************************ | |
| | | ** | |
| | | ** Limits | |
| | | ** | |
| | | ***********************************/ | |
| | | | |
| | | CUresult CUDAAPI cuCtxSetLimit(CUlimit limit, size_t value); | |
| | | CUresult CUDAAPI cuCtxGetLimit(size_t *pvalue, CUlimit limit); | |
| | | | |
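A limits sketch (assumes a current context on a device that supports these
limits; unsupported combinations return CUDA_ERROR_UNSUPPORTED_LIMIT):

    size_t stack = 0;
    cuCtxSetLimit(CU_LIMIT_PRINTF_FIFO_SIZE, 4 * 1024 * 1024);
    cuCtxGetLimit(&stack, CU_LIMIT_STACK_SIZE);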
| #ifdef __cplusplus | | #ifdef __cplusplus | |
| } | | } | |
| #endif | | #endif | |
| | | | |
| #endif /* __cuda_cuda_h__ */ | | #endif /* __cuda_cuda_h__ */ | |
| | | | |
End of changes. 22 change blocks. |
| 60 lines changed or deleted | | 107 lines changed or added | |
|
| device_functions.h | | device_functions.h | |
| | | | |
| skipping to change at line 47 | | skipping to change at line 47 | |
| #define __DEVICE_FUNCTIONS_H__ | | #define __DEVICE_FUNCTIONS_H__ | |
| | | | |
 | /******************************************************************************* | | /******************************************************************************* | |
 | * * | | * * | |
 | * * | | * * | |
 | * * | | * * | |
 | *******************************************************************************/ | | *******************************************************************************/ | |
| | | | |
| #if defined(__cplusplus) && defined(__CUDACC__) | | #if defined(__cplusplus) && defined(__CUDACC__) | |
| | | | |
|
| | | #include "builtin_types.h" | |
| #include "device_types.h" | | #include "device_types.h" | |
| #include "host_defines.h" | | #include "host_defines.h" | |
| | | | |
 | /******************************************************************************* | | /******************************************************************************* | |
 | * * | | * * | |
 | * * | | * * | |
 | * * | | * * | |
 | *******************************************************************************/ | | *******************************************************************************/ | |
| | | | |
| extern "C" | | extern "C" | |
| | | | |
| skipping to change at line 75 | | skipping to change at line 76 | |
 | extern __device__ long long int __mul64hi(long long int, long long int); | | extern __device__ long long int __mul64hi(long long int, long long int); | |
 | /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
 | extern __device__ unsigned long long int __umul64hi(unsigned long long int, unsigned long long int); | | extern __device__ unsigned long long int __umul64hi(unsigned long long int, unsigned long long int); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __int_as_float(int); | | extern __device__ float __int_as_float(int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __float_as_int(float); | | extern __device__ int __float_as_int(float); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| | | extern __device__ void __synchronous_start(int); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ void __synchronous_end(void); | |
| | | /*DEVICE_BUILTIN*/ | |
| extern __device__ void __syncthreads(void); | | extern __device__ void __syncthreads(void); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ void __prof_trigger(int); | | extern __device__ void __prof_trigger(int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ void __threadfence(void); | | extern __device__ void __threadfence(void); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ void __threadfence_block(void); | | extern __device__ void __threadfence_block(void); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ void __trap(void); | | extern __device__ void __trap(void); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| | | | |
| skipping to change at line 287 | | skipping to change at line 292 | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __clzll(long long int); | | extern __device__ int __clzll(long long int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __ffsll(long long int); | | extern __device__ int __ffsll(long long int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __popcll(unsigned long long int); | | extern __device__ int __popcll(unsigned long long int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ unsigned long long int __brevll(unsigned long long int); | | extern __device__ unsigned long long int __brevll(unsigned long long int); | |
| | | | |
|
| | | /*DEVICE_BUILTIN*/ | |
 | | | extern __device__ unsigned int __byte_perm(unsigned int, unsigned int, unsigned int); | |
| | | | |
| #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 130 | | #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 130 | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __double2int_rz(double); | | extern __device__ int __double2int_rz(double); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ unsigned int __double2uint_rz(double); | | extern __device__ unsigned int __double2uint_rz(double); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ long long int __double2ll_rz(double); | | extern __device__ long long int __double2ll_rz(double); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ unsigned long long int __double2ull_rz(double); | | extern __device__ unsigned long long int __double2ull_rz(double); | |
| | | | |
|
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ unsigned int __pm0(void); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ unsigned int __pm1(void); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ unsigned int __pm2(void); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ unsigned int __pm3(void); | |
| | | | |
| #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 130 */ | | #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 130 */ | |
| | | | |
| } | | } | |
| | | | |
 | /******************************************************************************* | | /******************************************************************************* | |
 | * * | | * * | |
 | * * | | * * | |
 | * * | | * * | |
 | *******************************************************************************/ | | *******************************************************************************/ | |
| | | | |
| | | | |
| skipping to change at line 448 | | skipping to change at line 465 | |
| } | | } | |
| | | | |
 | static __inline__ __device__ float uint2float(unsigned int a, enum cudaRoundMode mode = cudaRoundNearest) | | static __inline__ __device__ float uint2float(unsigned int a, enum cudaRoundMode mode = cudaRoundNearest) | |
| { | | { | |
| return mode == cudaRoundZero ? __uint2float_rz(a) : | | return mode == cudaRoundZero ? __uint2float_rz(a) : | |
| mode == cudaRoundPosInf ? __uint2float_ru(a) : | | mode == cudaRoundPosInf ? __uint2float_ru(a) : | |
| mode == cudaRoundMinInf ? __uint2float_rd(a) : | | mode == cudaRoundMinInf ? __uint2float_rd(a) : | |
| __uint2float_rn(a); | | __uint2float_rn(a); | |
| } | | } | |
| | | | |
|
| #elif !defined(__CUDACC__) | | #elif defined(__CUDABE__) | |
| | | | |
|
| #include "crt/func_macro.h" | | /************************************************************************** | |
| | | ***** | |
| | | * | |
| | | * | |
| | | * DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS | |
| | | * | |
| | | * | |
| | | * | |
| | | *************************************************************************** | |
| | | ****/ | |
| | | | |
|
| #include "host_defines.h" | | static __forceinline__ float __sinf(float a) | |
| #include "math_constants.h" | | { | |
| | | return __builtin_sinf(a); | |
| | | } | |
| | | | |
|
| #if defined(__CUDABE__) | | static __forceinline__ float __cosf(float a) | |
| | | { | |
| | | return __builtin_cosf(a); | |
| | | } | |
| | | | |
|
| #if (__CUDA_ARCH__ < 200) | | static __forceinline__ float __log2f(float a) | |
| | | { | |
| | | return __builtin_log2f(a); | |
| | | } | |
| | | | |
|
| __device_func__(float __frcp_rn (float x)) | | /*************************************************************************** | |
| | | *                                                                           * | |
| | | * DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITHOUT BUILTIN NVOPENCC OPERATIONS * | |
| | | *                                                                           * | |
| | | ***************************************************************************/ | |
| | | | |
| | | static __forceinline__ float __tanf(float a) | |
| | | { | |
| | | return __fdividef (__sinf(a), __cosf(a)); | |
| | | } | |
| | | | |
| | | static __forceinline__ void __sincosf(float a, float *sptr, float *cptr) | |
| | | { | |
| | | *sptr = __sinf(a); | |
| | | *cptr = __cosf(a); | |
| | | } | |
| | | | |
| | | static __forceinline__ float __expf(float a) | |
| | | { | |
| | | return exp2f(a * CUDART_L2E_F); | |
| | | } | |
| | | | |
| | | static __forceinline__ float __exp10f(float a) | |
| | | { | |
| | | return exp2f(a * CUDART_L2T_F); | |
| | | } | |
| | | | |
| | | static __forceinline__ float __log10f(float a) | |
| | | { | |
| | | return CUDART_LG2_F * __log2f(a); | |
| | | } | |
| | | | |
| | | static __forceinline__ float __logf(float a) | |
| | | { | |
| | | return CUDART_LN2_F * __log2f(a); | |
| | | } | |
| | | | |
| | | static __forceinline__ float __powf(float a, float b) | |
| | | { | |
| | | return exp2f(b * __log2f(a)); | |
| | | } | |
| | | | |
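All of these reductions are change-of-base identities built on the hardware's exp2f/log2f: e^a = 2^(a*log2 e), 10^a = 2^(a*log2 10), log10 x = log10(2)*log2 x, ln x = ln(2)*log2 x, and a^b = 2^(b*log2 a) (the last only meaningful for a > 0, the usual __powf caveat). The CUDART_L2E_F, CUDART_L2T_F, CUDART_LG2_F and CUDART_LN2_F factors are just those constants in float. A host-side C mirror of the same reductions, with libm's exp2f/log2f standing in for the special-function unit:

    #include <math.h>
    #include <stdio.h>

    #define L2E_F 1.442695040888963f    /* log2(e),   cf. CUDART_L2E_F */
    #define L2T_F 3.321928094887362f    /* log2(10),  cf. CUDART_L2T_F */
    #define LG2_F 0.3010299956639812f   /* log10(2),  cf. CUDART_LG2_F */
    #define LN2_F 0.6931471805599453f   /* ln(2),     cf. CUDART_LN2_F */

    static float my_expf(float a)          { return exp2f(a * L2E_F); }
    static float my_exp10f(float a)        { return exp2f(a * L2T_F); }
    static float my_log10f(float a)        { return LG2_F * log2f(a); }
    static float my_logf(float a)          { return LN2_F * log2f(a); }
    static float my_powf(float a, float b) { return exp2f(b * log2f(a)); }

    int main(void)
    {
        printf("%g vs %g\n", my_expf(1.0f),         expf(1.0f));         /* ~2.71828 */
        printf("%g vs %g\n", my_exp10f(2.0f),       100.0f);             /* ~100     */
        printf("%g vs %g\n", my_logf(10.0f),        logf(10.0f));        /* ~2.30259 */
        printf("%g vs %g\n", my_log10f(1000.0f),    3.0f);               /* ~3       */
        printf("%g vs %g\n", my_powf(2.0f, 10.0f),  powf(2.0f, 10.0f));  /* ~1024    */
        return 0;
    }

The fast device versions trade a few ulps of accuracy for the single-instruction exp2/log2.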
| | | static __forceinline__ float fdividef(float a, float b) | |
| | | { | |
| | | #if defined(__USE_FAST_MATH__) && !defined(__CUDA_PREC_DIV) | |
| | | return __fdividef(a, b); | |
| | | #else /* __USE_FAST_MATH__ && !__CUDA_PREC_DIV */ | |
| | | return a / b; | |
| | | #endif /* __USE_FAST_MATH__ && !__CUDA_PREC_DIV */ | |
| | | } | |
| | | | |
| | | #if defined(CUDA_FLOAT_MATH_FUNCTIONS) | |
| | | | |
| | | static __forceinline__ double fdivide(double a, double b) | |
| | | { | |
| | | return (double)fdividef((float)a, (float)b); | |
| | | } | |
| | | | |
| | | #endif /* CUDA_FLOAT_MATH_FUNCTIONS */ | |
| | | | |
| | | #if defined(CUDA_DOUBLE_MATH_FUNCTIONS) | |
| | | | |
| | | static __forceinline__ double fdivide(double a, double b) | |
| | | { | |
| | | return a / b; | |
| | | } | |
| | | | |
| | | #endif /* CUDA_DOUBLE_MATH_FUNCTIONS */ | |
| | | | |
| | | #if __CUDA_ARCH__ < 200 | |
| | | | |
| | | static __forceinline__ float __frcp_rn (float x) | |
| { | | { | |
| unsigned int expo; | | unsigned int expo; | |
| unsigned f, y; | | unsigned f, y; | |
| unsigned int argi; | | unsigned int argi; | |
| float t; | | float t; | |
| | | | |
| argi = __float_as_int(x); | | argi = __float_as_int(x); | |
| expo = (argi >> 23); | | expo = (argi >> 23); | |
| expo = expo & 0xff; | | expo = expo & 0xff; | |
| f = expo - 1; | | f = expo - 1; | |
| | | | |
| skipping to change at line 497 | | skipping to change at line 597 | |
| f = (unsigned)(-(int)f); | | f = (unsigned)(-(int)f); | |
| if (expo < f) { | | if (expo < f) { | |
| t = __int_as_float(__float_as_int(t)+1); | | t = __int_as_float(__float_as_int(t)+1); | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| } | | } | |
| return 1.0f / x; | | return 1.0f / x; | |
| } | | } | |
| | | | |
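The tail of __frcp_rn re-multiplies the candidate against the input mantissa and inspects the residual to decide the final ulp, which is the round-to-nearest test; a tie is impossible because 1/x is exactly representable only when x is a power of two. For the same reason, dividing in double and rounding once is already correctly rounded (53 bits of quotient accuracy dwarf the 24-bit float ulp), which makes a convenient host-side oracle for these kernels; a sketch for the nearest case:

    #include <stdio.h>

    /* oracle: one double divide plus one double->float rounding
       reproduces the correctly rounded float reciprocal */
    static float frcp_rn_ref(float x)
    {
        return (float)(1.0 / (double)x);
    }

    int main(void)
    {
        printf("%.9g\n", frcp_rn_ref(3.0f));   /* 0.333333343, nearest float to 1/3 */
        return 0;
    }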
|
| __device_func__(float __frcp_rz (float x)) | | static __forceinline__ float __frcp_rz (float x) | |
| { | | { | |
| unsigned int expo; | | unsigned int expo; | |
| unsigned f, y; | | unsigned f, y; | |
| unsigned int argi; | | unsigned int argi; | |
| float t; | | float t; | |
| | | | |
| argi = __float_as_int(x); | | argi = __float_as_int(x); | |
| expo = (argi >> 23); | | expo = (argi >> 23); | |
| expo = expo & 0xff; | | expo = expo & 0xff; | |
| f = expo - 1; | | f = expo - 1; | |
| | | | |
| skipping to change at line 525 | | skipping to change at line 625 | |
| f = __umul24(y, argi); | | f = __umul24(y, argi); | |
| if ((int)f > 0) { | | if ((int)f > 0) { | |
| t = __int_as_float(__float_as_int(t)-1); | | t = __int_as_float(__float_as_int(t)-1); | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| } | | } | |
| return 1.0f / x; | | return 1.0f / x; | |
| } | | } | |
| | | | |
|
| __device_func__(float __frcp_rd (float x)) | | static __forceinline__ float __frcp_rd (float x) | |
| { | | { | |
| unsigned int expo; | | unsigned int expo; | |
| unsigned f, y; | | unsigned f, y; | |
| unsigned int argi; | | unsigned int argi; | |
| float t; | | float t; | |
| | | | |
| argi = __float_as_int(x); | | argi = __float_as_int(x); | |
| expo = (argi >> 23); | | expo = (argi >> 23); | |
| expo = expo & 0xff; | | expo = expo & 0xff; | |
| f = expo - 1; | | f = expo - 1; | |
| | | | |
| skipping to change at line 556 | | skipping to change at line 656 | |
| } | | } | |
| if (((int)f < 0) && (x < 0.0f)) { | | if (((int)f < 0) && (x < 0.0f)) { | |
| t = __int_as_float(__float_as_int(t)+1); | | t = __int_as_float(__float_as_int(t)+1); | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| } | | } | |
| return 1.0f / x; | | return 1.0f / x; | |
| } | | } | |
| | | | |
|
| __device_func__(float __frcp_ru (float x)) | | static __forceinline__ float __frcp_ru (float x) | |
| { | | { | |
| unsigned int expo; | | unsigned int expo; | |
| unsigned f, y; | | unsigned f, y; | |
| unsigned int argi; | | unsigned int argi; | |
| float t; | | float t; | |
| | | | |
| argi = __float_as_int(x); | | argi = __float_as_int(x); | |
| expo = (argi >> 23); | | expo = (argi >> 23); | |
| expo = expo & 0xff; | | expo = expo & 0xff; | |
| f = expo - 1; | | f = expo - 1; | |
| | | | |
| skipping to change at line 587 | | skipping to change at line 687 | |
| } | | } | |
| if (((int)f < 0) && (x > 0.0f)) { | | if (((int)f < 0) && (x > 0.0f)) { | |
| t = __int_as_float(__float_as_int(t)+1); | | t = __int_as_float(__float_as_int(t)+1); | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| } | | } | |
| return 1.0f / x; | | return 1.0f / x; | |
| } | | } | |
| | | | |
|
| __device_func__(float __fsqrt_rn (float radicand)) | | static __forceinline__ float __fsqrt_rn (float radicand) | |
| { | | { | |
| unsigned int expo, argi; | | unsigned int expo, argi; | |
| unsigned int s, f, x; | | unsigned int s, f, x; | |
| | | | |
| argi = __float_as_int(radicand); | | argi = __float_as_int(radicand); | |
| expo = argi >> 23; | | expo = argi >> 23; | |
| expo = expo & 0xff; | | expo = expo & 0xff; | |
| f = expo - 1; | | f = expo - 1; | |
| | | | |
| if ((argi <= 0x80000000) && (f <= 0xFD)) { | | if ((argi <= 0x80000000) && (f <= 0xFD)) { | |
| | | | |
| skipping to change at line 621 | | skipping to change at line 721 | |
| f = x - (2 * argi + 1); | | f = x - (2 * argi + 1); | |
| if ((int)f < 0) f = (unsigned)(-(int)f); | | if ((int)f < 0) f = (unsigned)(-(int)f); | |
| if ((int)x < 0) x = (unsigned)(-(int)x); | | if ((int)x < 0) x = (unsigned)(-(int)x); | |
| if (f < x) argi ++; | | if (f < x) argi ++; | |
| argi = argi + (((expo + 125) & ~0x1) << 22); | | argi = argi + (((expo + 125) & ~0x1) << 22); | |
| return __int_as_float(argi); | | return __int_as_float(argi); | |
| } | | } | |
| return sqrtf(radicand); | | return sqrtf(radicand); | |
| } | | } | |
| | | | |
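The increment test in __fsqrt_rn (f = x - (2 * argi + 1), taken after x has been reduced to the remainder x*2^16 - argi*argi) is the standard integer midpoint test for square roots. Writing M for the scaled radicand, q for the truncated candidate, and r = M - q^2 for the remainder, a math sketch of why it rounds to nearest:

    \sqrt{M} > q + \tfrac{1}{2}
      \iff M > \left(q + \tfrac{1}{2}\right)^{2} = q^{2} + q + \tfrac{1}{4}
      \iff r > q + \tfrac{1}{2}

Comparing |r - (2q+1)| against |r| is exactly the test r > q + 1/2, and since r and q are integers, r can never equal q + 1/2, so no tie-breaking is needed.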
|
| __device_func__(float __fsqrt_rz (float radicand)) | | static __forceinline__ float __fsqrt_rz (float radicand) | |
| { | | { | |
| unsigned int expo, argi; | | unsigned int expo, argi; | |
| unsigned int s, f, x; | | unsigned int s, f, x; | |
| | | | |
| argi = __float_as_int(radicand); | | argi = __float_as_int(radicand); | |
| expo = argi >> 23; | | expo = argi >> 23; | |
| expo = expo & 0xff; | | expo = expo & 0xff; | |
| f = expo - 1; | | f = expo - 1; | |
| | | | |
| if ((argi <= 0x80000000) && (f <= 0xFD)) { | | if ((argi <= 0x80000000) && (f <= 0xFD)) { | |
| | | | |
| skipping to change at line 652 | | skipping to change at line 752 | |
| /* compute truncated result */ | | /* compute truncated result */ | |
| argi = (argi + 4) >> 3; | | argi = (argi + 4) >> 3; | |
| x = (x << 16) - (argi * argi); | | x = (x << 16) - (argi * argi); | |
| if ((int)x < 0) argi--; | | if ((int)x < 0) argi--; | |
| argi = argi + (((expo + 125) & ~0x1) << 22); | | argi = argi + (((expo + 125) & ~0x1) << 22); | |
| return __int_as_float(argi); | | return __int_as_float(argi); | |
| } | | } | |
| return sqrtf(radicand); | | return sqrtf(radicand); | |
| } | | } | |
| | | | |
|
| __device_func__(float __fsqrt_ru (float radicand)) | | static __forceinline__ float __fsqrt_ru (float radicand) | |
| { | | { | |
| unsigned int expo, argi; | | unsigned int expo, argi; | |
| unsigned int s, f, x; | | unsigned int s, f, x; | |
| | | | |
| argi = __float_as_int(radicand); | | argi = __float_as_int(radicand); | |
| expo = argi >> 23; | | expo = argi >> 23; | |
| expo = expo & 0xff; | | expo = expo & 0xff; | |
| f = expo - 1; | | f = expo - 1; | |
| | | | |
| if ((argi <= 0x80000000) && (f <= 0xFD)) { | | if ((argi <= 0x80000000) && (f <= 0xFD)) { | |
| | | | |
| skipping to change at line 682 | | skipping to change at line 782 | |
| argi = __umulhi(x,argi); | | argi = __umulhi(x,argi); | |
| argi = (argi + 4) >> 3; | | argi = (argi + 4) >> 3; | |
| x = (x << 16) - (argi * argi); | | x = (x << 16) - (argi * argi); | |
| if ((int)x > 0) argi++; | | if ((int)x > 0) argi++; | |
| argi = argi + (((expo + 125) & ~0x1) << 22); | | argi = argi + (((expo + 125) & ~0x1) << 22); | |
| return __int_as_float(argi); | | return __int_as_float(argi); | |
| } | | } | |
| return sqrtf(radicand); | | return sqrtf(radicand); | |
| } | | } | |
| | | | |
|
| __device_func__(float __fsqrt_rd (float radicand)) | | static __forceinline__ float __fsqrt_rd (float radicand) | |
| { | | { | |
| unsigned int expo, argi; | | unsigned int expo, argi; | |
| unsigned int s, f, x; | | unsigned int s, f, x; | |
| | | | |
| argi = __float_as_int(radicand); | | argi = __float_as_int(radicand); | |
| expo = argi >> 23; | | expo = argi >> 23; | |
| expo = expo & 0xff; | | expo = expo & 0xff; | |
| f = expo - 1; | | f = expo - 1; | |
| | | | |
| if ((argi <= 0x80000000) && (f <= 0xFD)) { | | if ((argi <= 0x80000000) && (f <= 0xFD)) { | |
| | | | |
| skipping to change at line 713 | | skipping to change at line 813 | |
| /* compute truncated result */ | | /* compute truncated result */ | |
| argi = (argi + 4) >> 3; | | argi = (argi + 4) >> 3; | |
| x = (x << 16) - (argi * argi); | | x = (x << 16) - (argi * argi); | |
| if ((int)x < 0) argi--; | | if ((int)x < 0) argi--; | |
| argi = argi + (((expo + 125) & ~0x1) << 22); | | argi = argi + (((expo + 125) & ~0x1) << 22); | |
| return __int_as_float(argi); | | return __int_as_float(argi); | |
| } | | } | |
| return sqrtf(radicand); | | return sqrtf(radicand); | |
| } | | } | |
| | | | |
|
| __device_func__(float __fdiv_rn (float dividend, float divisor)) | | static __forceinline__ float __fdiv_rn (float dividend, float divisor) | |
| { | | { | |
| unsigned long long prod; | | unsigned long long prod; | |
| unsigned r, f, x, y, expox, expoy, sign; | | unsigned r, f, x, y, expox, expoy, sign; | |
| unsigned expo_res; | | unsigned expo_res; | |
| unsigned resi, cvtxi, cvtyi; | | unsigned resi, cvtxi, cvtyi; | |
| float t; | | float t; | |
| | | | |
| cvtxi = __float_as_int(dividend); | | cvtxi = __float_as_int(dividend); | |
| cvtyi = __float_as_int(divisor); | | cvtyi = __float_as_int(divisor); | |
| expox = (cvtxi >> 23) & 0xff; | | expox = (cvtxi >> 23) & 0xff; | |
| | | | |
| skipping to change at line 776 | | skipping to change at line 876 | |
| prod = ((unsigned long long)y) * r; | | prod = ((unsigned long long)y) * r; | |
| x = x << (23 + ((prod >> 32) >> 15)); | | x = x << (23 + ((prod >> 32) >> 15)); | |
| rem1 = x - (unsigned)(prod & 0xffffffff); | | rem1 = x - (unsigned)(prod & 0xffffffff); | |
| rem0 = rem1 - y; | | rem0 = rem1 - y; | |
| inc = abs(rem0) < abs(rem1); | | inc = abs(rem0) < abs(rem1); | |
| resi = ((expo_res << 23) + r + inc); | | resi = ((expo_res << 23) + r + inc); | |
| if (resi != 0x00800000) resi = 0; | | if (resi != 0x00800000) resi = 0; | |
| return __int_as_float(sign | resi); | | return __int_as_float(sign | resi); | |
| } | | } | |
| } | | } | |
|
| if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | | if (fabsf(divisor) > CUDART_TWO_TO_126_F) { | |
| divisor *= 0.25f; | | divisor *= 0.25f; | |
| dividend *= 0.25f; | | dividend *= 0.25f; | |
| } | | } | |
| return __fdividef (dividend, divisor); | | return __fdividef (dividend, divisor); | |
| } | | } | |
| | | | |
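The fabsf(divisor) > CUDART_TWO_TO_126_F guard before the __fdividef fallback appears to be a scaling trick: __fdividef works through a reciprocal, and for |y| > 2^126 the intermediate 1/y drops below 2^-126 and flushes to zero on this hardware. Multiplying both operands by 0.25 only adjusts exponents (exact as long as the scaled dividend stays normal) and leaves the quotient unchanged:

    \frac{x/4}{y/4} = \frac{x}{y}, \qquad
    |y| > 2^{126} \;\Rightarrow\; \left|\tfrac{1}{y}\right| < 2^{-126},

while 1/(y/4) is back in normal range.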
|
| __device_func__(float __fdiv_rz (float dividend, float divisor)) | | static __forceinline__ float __fdiv_rz (float dividend, float divisor) | |
| { | | { | |
| unsigned long long prod; | | unsigned long long prod; | |
| unsigned r, f, x, y, expox, expoy, sign; | | unsigned r, f, x, y, expox, expoy, sign; | |
| unsigned expo_res; | | unsigned expo_res; | |
| unsigned resi, cvtxi, cvtyi; | | unsigned resi, cvtxi, cvtyi; | |
| float t; | | float t; | |
| | | | |
| cvtxi = __float_as_int(dividend); | | cvtxi = __float_as_int(dividend); | |
| cvtyi = __float_as_int(divisor); | | cvtyi = __float_as_int(divisor); | |
| expox = (cvtxi >> 23) & 0xff; | | expox = (cvtxi >> 23) & 0xff; | |
| | | | |
| skipping to change at line 844 | | skipping to change at line 944 | |
| int rem1; | | int rem1; | |
| prod = ((unsigned long long)y) * r; | | prod = ((unsigned long long)y) * r; | |
| x = x << (23 + ((prod >> 32) >> 15)); | | x = x << (23 + ((prod >> 32) >> 15)); | |
| rem1 = x - (unsigned)(prod & 0xffffffff); | | rem1 = x - (unsigned)(prod & 0xffffffff); | |
| if (rem1 < 0) r--; | | if (rem1 < 0) r--; | |
| resi = ((expo_res << 23) + r); | | resi = ((expo_res << 23) + r); | |
| if (resi != 0x00800000) resi = 0; | | if (resi != 0x00800000) resi = 0; | |
| return __int_as_float(sign | resi); | | return __int_as_float(sign | resi); | |
| } | | } | |
| } | | } | |
|
| if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | | if (fabsf(divisor) > CUDART_TWO_TO_126_F) { | |
| divisor *= 0.25f; | | divisor *= 0.25f; | |
| dividend *= 0.25f; | | dividend *= 0.25f; | |
| } | | } | |
| return __fdividef (dividend, divisor); | | return __fdividef (dividend, divisor); | |
| } | | } | |
| | | | |
|
| __device_func__(float __fdiv_ru (float dividend, float divisor)) | | static __forceinline__ float __fdiv_ru (float dividend, float divisor) | |
| { | | { | |
| unsigned long long prod; | | unsigned long long prod; | |
| unsigned r, f, x, y, expox, expoy, sign; | | unsigned r, f, x, y, expox, expoy, sign; | |
| unsigned expo_res; | | unsigned expo_res; | |
| unsigned resi, cvtxi, cvtyi; | | unsigned resi, cvtxi, cvtyi; | |
| float t; | | float t; | |
| | | | |
| cvtxi = __float_as_int(dividend); | | cvtxi = __float_as_int(dividend); | |
| cvtyi = __float_as_int(divisor); | | cvtyi = __float_as_int(divisor); | |
| expox = (cvtxi >> 23) & 0xff; | | expox = (cvtxi >> 23) & 0xff; | |
| | | | |
| skipping to change at line 914 | | skipping to change at line 1014 | |
| prod = ((unsigned long long)y) * r; | | prod = ((unsigned long long)y) * r; | |
| x = x << (23 + ((prod >> 32) >> 15)); | | x = x << (23 + ((prod >> 32) >> 15)); | |
| rem1 = x - (unsigned)(prod & 0xffffffff); | | rem1 = x - (unsigned)(prod & 0xffffffff); | |
| if ((rem1 < 0) && (sign)) r--; | | if ((rem1 < 0) && (sign)) r--; | |
| if ((rem1 > 0) && (!sign)) r++; | | if ((rem1 > 0) && (!sign)) r++; | |
| resi = ((expo_res << 23) + r); | | resi = ((expo_res << 23) + r); | |
| if (resi != 0x00800000) resi = 0; | | if (resi != 0x00800000) resi = 0; | |
| return __int_as_float(sign | resi); | | return __int_as_float(sign | resi); | |
| } | | } | |
| } | | } | |
|
| if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | | if (fabsf(divisor) > CUDART_TWO_TO_126_F) { | |
| divisor *= 0.25f; | | divisor *= 0.25f; | |
| dividend *= 0.25f; | | dividend *= 0.25f; | |
| } | | } | |
| return __fdividef (dividend, divisor); | | return __fdividef (dividend, divisor); | |
| } | | } | |
| | | | |
|
| __device_func__(float __fdiv_rd (float dividend, float divisor)) | | static __forceinline__ float __fdiv_rd (float dividend, float divisor) | |
| { | | { | |
| unsigned long long prod; | | unsigned long long prod; | |
| unsigned r, f, x, y, expox, expoy, sign; | | unsigned r, f, x, y, expox, expoy, sign; | |
| unsigned expo_res; | | unsigned expo_res; | |
| unsigned resi, cvtxi, cvtyi; | | unsigned resi, cvtxi, cvtyi; | |
| float t; | | float t; | |
| | | | |
| cvtxi = __float_as_int(dividend); | | cvtxi = __float_as_int(dividend); | |
| cvtyi = __float_as_int(divisor); | | cvtyi = __float_as_int(divisor); | |
| expox = (cvtxi >> 23) & 0xff; | | expox = (cvtxi >> 23) & 0xff; | |
| | | | |
| skipping to change at line 984 | | skipping to change at line 1084 | |
| prod = ((unsigned long long)y) * r; | | prod = ((unsigned long long)y) * r; | |
| x = x << (23 + ((prod >> 32) >> 15)); | | x = x << (23 + ((prod >> 32) >> 15)); | |
| rem1 = x - (unsigned)(prod & 0xffffffff); | | rem1 = x - (unsigned)(prod & 0xffffffff); | |
| if ((rem1 < 0) && (!sign)) r--; | | if ((rem1 < 0) && (!sign)) r--; | |
| if ((rem1 > 0) && (sign)) r++; | | if ((rem1 > 0) && (sign)) r++; | |
| resi = ((expo_res << 23) + r); | | resi = ((expo_res << 23) + r); | |
| if (resi != 0x00800000) resi = 0; | | if (resi != 0x00800000) resi = 0; | |
| return __int_as_float(sign | resi); | | return __int_as_float(sign | resi); | |
| } | | } | |
| } | | } | |
|
| if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | | if (fabsf(divisor) > CUDART_TWO_TO_126_F) { | |
| divisor *= 0.25f; | | divisor *= 0.25f; | |
| dividend *= 0.25f; | | dividend *= 0.25f; | |
| } | | } | |
| return __fdividef (dividend, divisor); | | return __fdividef (dividend, divisor); | |
| } | | } | |
| | | | |
|
| __device_func__(float __fadd_ru (float a, float b)) | | static __forceinline__ float __fadd_ru (float a, float b) | |
| { | | { | |
| unsigned int expo_x, expo_y; | | unsigned int expo_x, expo_y; | |
| unsigned int xxi, yyi, temp; | | unsigned int xxi, yyi, temp; | |
| | | | |
| xxi = __float_as_int(a); | | xxi = __float_as_int(a); | |
| yyi = __float_as_int(b); | | yyi = __float_as_int(b); | |
| | | | |
| /* make bigger operand the augend */ | | /* make bigger operand the augend */ | |
| expo_y = yyi << 1; | | expo_y = yyi << 1; | |
| if (expo_y > (xxi << 1)) { | | if (expo_y > (xxi << 1)) { | |
| | | | |
| skipping to change at line 1094 | | skipping to change at line 1194 | |
| xxi = xxi & ~0xff000000; | | xxi = xxi & ~0xff000000; | |
| expo_x = (unsigned int)(-((int)expo_x)); | | expo_x = (unsigned int)(-((int)expo_x)); | |
| xxi = (xxi >> expo_x); | | xxi = (xxi >> expo_x); | |
| if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0; | | if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0; | |
| return __int_as_float(yyi | xxi); | | return __int_as_float(yyi | xxi); | |
| } else { | | } else { | |
| return a + b; | | return a + b; | |
| } | | } | |
| } | | } | |
| | | | |
|
| __device_func__(float __fadd_rd (float a, float b)) | | static __forceinline__ float __fadd_rd (float a, float b) | |
| { | | { | |
| unsigned int expo_x, expo_y; | | unsigned int expo_x, expo_y; | |
| unsigned int xxi, yyi, temp; | | unsigned int xxi, yyi, temp; | |
| | | | |
| xxi = __float_as_int(a); | | xxi = __float_as_int(a); | |
| yyi = __float_as_int(b); | | yyi = __float_as_int(b); | |
| | | | |
| /* make bigger operand the augend */ | | /* make bigger operand the augend */ | |
| expo_y = yyi << 1; | | expo_y = yyi << 1; | |
| if (expo_y > (xxi << 1)) { | | if (expo_y > (xxi << 1)) { | |
| | | | |
| skipping to change at line 1202 | | skipping to change at line 1302 | |
| if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0; | | if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0; | |
| return __int_as_float(yyi | xxi); | | return __int_as_float(yyi | xxi); | |
| } else { | | } else { | |
| a = a + b; | | a = a + b; | |
| xxi = xxi ^ yyi; | | xxi = xxi ^ yyi; | |
| if ((a == 0.0f) && ((int)xxi < 0)) a = __int_as_float(0x80000000); | | if ((a == 0.0f) && ((int)xxi < 0)) a = __int_as_float(0x80000000); | |
| return a; | | return a; | |
| } | | } | |
| } | | } | |
| | | | |
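A concrete pair where the directed-rounding adds differ from ordinary float addition: 1.0f + 2^-25 lies below the rounding midpoint, so round-to-nearest and round-down return exactly 1.0f, while round-up must step to the next representable float, 1 + 2^-23. On the host, C99's fesetround can stand in for the intrinsics (a sketch only; constant folding can defeat it unless the compiler honors FENV_ACCESS or an option such as -frounding-math):

    #include <fenv.h>
    #include <stdio.h>

    int main(void)
    {
        volatile float a = 1.0f, b = 0x1.0p-25f;

        fesetround(FE_UPWARD);          /* mimics __fadd_ru */
        printf("ru: %.8e\n", a + b);    /* 1.00000012e+00 = 1 + 2^-23 */

        fesetround(FE_DOWNWARD);        /* mimics __fadd_rd */
        printf("rd: %.8e\n", a + b);    /* 1.00000000e+00 */

        fesetround(FE_TONEAREST);
        return 0;
    }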
|
| __device_func__(float __fmul_ru (float a, float b)) | | static __forceinline__ float __fmul_ru (float a, float b) | |
| { | | { | |
| unsigned long long product; | | unsigned long long product; | |
| unsigned int expo_x, expo_y; | | unsigned int expo_x, expo_y; | |
| unsigned int xxi, yyi; | | unsigned int xxi, yyi; | |
| | | | |
| xxi = __float_as_int(a); | | xxi = __float_as_int(a); | |
| yyi = __float_as_int(b); | | yyi = __float_as_int(b); | |
| | | | |
| expo_y = 0xFF; | | expo_y = 0xFF; | |
| expo_x = expo_y & (xxi >> 23); | | expo_x = expo_y & (xxi >> 23); | |
| | | | |
| skipping to change at line 1260 | | skipping to change at line 1360 | |
| xxi += (yyi && !expo_y); | | xxi += (yyi && !expo_y); | |
| xxi = (xxi >> expo_x); | | xxi = (xxi >> expo_x); | |
| if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0; | | if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0; | |
| return __int_as_float(expo_y | xxi); | | return __int_as_float(expo_y | xxi); | |
| } | | } | |
| } else { | | } else { | |
| return a * b; | | return a * b; | |
| } | | } | |
| } | | } | |
| | | | |
|
| __device_func__(float __fmul_rd (float a, float b)) | | static __forceinline__ float __fmul_rd (float a, float b) | |
| { | | { | |
| unsigned long long product; | | unsigned long long product; | |
| unsigned int expo_x, expo_y; | | unsigned int expo_x, expo_y; | |
| unsigned int xxi, yyi; | | unsigned int xxi, yyi; | |
| | | | |
| xxi = __float_as_int(a); | | xxi = __float_as_int(a); | |
| yyi = __float_as_int(b); | | yyi = __float_as_int(b); | |
| | | | |
| expo_y = 0xFF; | | expo_y = 0xFF; | |
| expo_x = expo_y & (xxi >> 23); | | expo_x = expo_y & (xxi >> 23); | |
| | | | |
| skipping to change at line 1318 | | skipping to change at line 1418 | |
| xxi += (yyi && expo_y); | | xxi += (yyi && expo_y); | |
| xxi = (xxi >> expo_x); | | xxi = (xxi >> expo_x); | |
| if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0; | | if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0; | |
| return __int_as_float(expo_y | xxi); | | return __int_as_float(expo_y | xxi); | |
| } | | } | |
| } else { | | } else { | |
| return a * b; | | return a * b; | |
| } | | } | |
| } | | } | |
| | | | |
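Directed-rounded multiplication is what makes float interval arithmetic sound: a product enclosure keeps its guarantee only if the lower bound is computed rounding toward -inf and the upper bound toward +inf. A device-side sketch for intervals known to be positive (the general case takes min/max over all four corner products; the interval layout here is ours, for illustration):

    /* enclosure of [ax,bx] * [ay,by], all endpoints > 0: the true
       product interval is contained in [*lo, *hi] by construction */
    __device__ void imul_pos(float ax, float bx, float ay, float by,
                             float *lo, float *hi)
    {
        *lo = __fmul_rd(ax, ay);   /* round toward -inf */
        *hi = __fmul_ru(bx, by);   /* round toward +inf */
    }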
|
| __device_func__(float __fmaf_rn (float a, float b, float c)) | | static __forceinline__ float __fmaf_rn (float a, float b, float c) | |
| { | | { | |
| unsigned long long product; | | unsigned long long product; | |
| unsigned int xx, yy, zz, ww; | | unsigned int xx, yy, zz, ww; | |
| unsigned int temp, s, u; | | unsigned int temp, s, u; | |
| unsigned int expo_x, expo_y, expo_z; | | unsigned int expo_x, expo_y, expo_z; | |
| | | | |
| xx = __float_as_int(a); | | xx = __float_as_int(a); | |
| yy = __float_as_int(b); | | yy = __float_as_int(b); | |
| zz = __float_as_int(c); | | zz = __float_as_int(c); | |
| | | | |
| | | | |
| skipping to change at line 1593 | | skipping to change at line 1693 | |
| xx += (temp >= 0x80000000); | | xx += (temp >= 0x80000000); | |
| if (xx >= 0x01000000) { | | if (xx >= 0x01000000) { | |
| xx = xx >> 1; | | xx = xx >> 1; | |
| expo_x--; | | expo_x--; | |
| } | | } | |
| if (expo_x > 0) xx = 0; | | if (expo_x > 0) xx = 0; | |
| xx = expo_y | xx; | | xx = expo_y | xx; | |
| return __int_as_float(xx); | | return __int_as_float(xx); | |
| } | | } | |
| | | | |
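Because __fmaf_rn rounds only once, it can recover the exact rounding error of a float product: with p the rounded product, e = __fmaf_rn(a, b, -p) satisfies a*b = p + e exactly as long as no overflow or underflow intervenes (the classic twoProd transformation). A device-side sketch:

    /* error-free transformation: a*b == *p + *e exactly, barring
       overflow/underflow, thanks to the single rounding in FMA */
    __device__ void two_prod(float a, float b, float *p, float *e)
    {
        *p = a * b;                   /* rounded product         */
        *e = __fmaf_rn(a, b, -*p);    /* exact residual a*b - *p */
    }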
|
| __device_func__(float __fmaf_rz (float a, float b, float c)) | | static __forceinline__ float __fmaf_rz (float a, float b, float c) | |
| { | | { | |
| unsigned long long product; | | unsigned long long product; | |
| unsigned int xx, yy, zz, ww; | | unsigned int xx, yy, zz, ww; | |
| unsigned int temp, s, u; | | unsigned int temp, s, u; | |
| unsigned int expo_x, expo_y, expo_z; | | unsigned int expo_x, expo_y, expo_z; | |
| | | | |
| xx = __float_as_int(a); | | xx = __float_as_int(a); | |
| yy = __float_as_int(b); | | yy = __float_as_int(b); | |
| zz = __float_as_int(c); | | zz = __float_as_int(c); | |
| | | | |
| | | | |
| skipping to change at line 1857 | | skipping to change at line 1957 | |
| return __int_as_float(xx); | | return __int_as_float(xx); | |
| } else if ((int)expo_x >= 126) { | | } else if ((int)expo_x >= 126) { | |
| /* overflow */ | | /* overflow */ | |
| xx = expo_y | 0x7f7fffff; | | xx = expo_y | 0x7f7fffff; | |
| return __int_as_float(xx); | | return __int_as_float(xx); | |
| } | | } | |
| /* subnormal */ | | /* subnormal */ | |
| return __int_as_float(expo_y); | | return __int_as_float(expo_y); | |
| } | | } | |
| | | | |
|
| __device_func__(float __fmaf_ru (float a, float b, float c)) | | static __forceinline__ float __fmaf_ru (float a, float b, float c) | |
| { | | { | |
| unsigned long long product; | | unsigned long long product; | |
| unsigned int xx, yy, zz, ww; | | unsigned int xx, yy, zz, ww; | |
| unsigned int temp, s, u; | | unsigned int temp, s, u; | |
| unsigned int expo_x, expo_y, expo_z; | | unsigned int expo_x, expo_y, expo_z; | |
| | | | |
| xx = __float_as_int(a); | | xx = __float_as_int(a); | |
| yy = __float_as_int(b); | | yy = __float_as_int(b); | |
| zz = __float_as_int(c); | | zz = __float_as_int(c); | |
| | | | |
| | | | |
| skipping to change at line 2126 | | skipping to change at line 2226 | |
| return __int_as_float(xx); | | return __int_as_float(xx); | |
| } | | } | |
| /* subnormal */ | | /* subnormal */ | |
| expo_x = ((unsigned int)-((int)expo_x)); | | expo_x = ((unsigned int)-((int)expo_x)); | |
| xx += (temp && !expo_y); | | xx += (temp && !expo_y); | |
| xx = (xx >> expo_x); | | xx = (xx >> expo_x); | |
| if ((expo_x > 25) || (xx != 0x00800000)) xx = 0; | | if ((expo_x > 25) || (xx != 0x00800000)) xx = 0; | |
| return __int_as_float(expo_y | xx); | | return __int_as_float(expo_y | xx); | |
| } | | } | |
| | | | |
|
| __device_func__(float __fmaf_rd (float a, float b, float c)) | | static __forceinline__ float __fmaf_rd (float a, float b, float c) | |
| { | | { | |
| unsigned long long product; | | unsigned long long product; | |
| unsigned int xx, yy, zz, ww; | | unsigned int xx, yy, zz, ww; | |
| unsigned int temp, s, u; | | unsigned int temp, s, u; | |
| unsigned int expo_x, expo_y, expo_z; | | unsigned int expo_x, expo_y, expo_z; | |
| | | | |
| xx = __float_as_int(a); | | xx = __float_as_int(a); | |
| yy = __float_as_int(b); | | yy = __float_as_int(b); | |
| zz = __float_as_int(c); | | zz = __float_as_int(c); | |
| | | | |
| | | | |
| skipping to change at line 2395 | | skipping to change at line 2495 | |
| return __int_as_float(xx); | | return __int_as_float(xx); | |
| } | | } | |
| /* subnormal */ | | /* subnormal */ | |
| expo_x = ((unsigned int)-((int)expo_x)); | | expo_x = ((unsigned int)-((int)expo_x)); | |
| xx += (temp && expo_y); | | xx += (temp && expo_y); | |
| xx = (xx >> expo_x); | | xx = (xx >> expo_x); | |
| if ((expo_x > 25) || (xx != 0x00800000)) xx = 0; | | if ((expo_x > 25) || (xx != 0x00800000)) xx = 0; | |
| return __int_as_float(expo_y | xx); | | return __int_as_float(expo_y | xx); | |
| } | | } | |
| | | | |
|
| #endif /* __CUDA_ARCH__ < 200 */ | | static __forceinline__ int __clz(int a) | |
| | | | |
| #else /* defined(__CUDABE__) */ | | | |
| | | | |
| #include "common_types.h" | | | |
| | | | |
| static __device__ const unsigned char __internal_rcpTab[128] = | | | |
| { | | | |
| 0xff, 0xfd, 0xfb, 0xf9, 0xf7, 0xf5, 0xf4, 0xf2, | | | |
| 0xf0, 0xee, 0xed, 0xeb, 0xe9, 0xe8, 0xe6, 0xe4, | | | |
| 0xe3, 0xe1, 0xe0, 0xde, 0xdd, 0xdb, 0xda, 0xd8, | | | |
| 0xd7, 0xd5, 0xd4, 0xd3, 0xd1, 0xd0, 0xcf, 0xcd, | | | |
| 0xcc, 0xcb, 0xca, 0xc8, 0xc7, 0xc6, 0xc5, 0xc4, | | | |
| 0xc2, 0xc1, 0xc0, 0xbf, 0xbe, 0xbd, 0xbc, 0xbb, | | | |
| 0xba, 0xb9, 0xb8, 0xb7, 0xb6, 0xb5, 0xb4, 0xb3, | | | |
| 0xb2, 0xb1, 0xb0, 0xaf, 0xae, 0xad, 0xac, 0xab, | | | |
| 0xaa, 0xa9, 0xa8, 0xa8, 0xa7, 0xa6, 0xa5, 0xa4, | | | |
| 0xa3, 0xa3, 0xa2, 0xa1, 0xa0, 0x9f, 0x9f, 0x9e, | | | |
| 0x9d, 0x9c, 0x9c, 0x9b, 0x9a, 0x99, 0x99, 0x98, | | | |
| 0x97, 0x97, 0x96, 0x95, 0x95, 0x94, 0x93, 0x93, | | | |
| 0x92, 0x91, 0x91, 0x90, 0x8f, 0x8f, 0x8e, 0x8e, | | | |
| 0x8d, 0x8c, 0x8c, 0x8b, 0x8b, 0x8a, 0x89, 0x89, | | | |
| 0x88, 0x88, 0x87, 0x87, 0x86, 0x85, 0x85, 0x84, | | | |
| 0x84, 0x83, 0x83, 0x82, 0x82, 0x81, 0x81, 0x80 | | | |
| }; | | | |
| | | | |
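The 128 entries of __internal_rcpTab look machine-generated by a midpoint rule: entry i is the integer nearest to 2^15/(128 + i + 1/2), i.e. a scaled reciprocal of the midpoint of the i-th mantissa bucket, landing in [0x80, 0xff]. That construction is inferred from the values, not documented in the header, but this host-side sketch reproduces the table above, doubled entries (such as 0xa8 and 0xa3) included:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        /* conjectured generator: round(2^15 / (128 + i + 0.5)) */
        for (int i = 0; i < 128; i++) {
            unsigned v = (unsigned)floor(32768.0 / (128.5 + i) + 0.5);
            printf("0x%02x%s", v, (i % 8 == 7) ? ",\n" : ", ");
        }
        return 0;
    }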
| static __device__ const unsigned int __internal_invSqrtCubeTab[96] = | | | |
| { | | | |
| 0xfa0bf8fe, 0xee6b28fa, 0xe5f024f7, 0xdaf268f3, | | | |
| 0xd2f000f0, 0xc890c0ec, 0xc10378e9, 0xb9a758e6, | | | |
| 0xb4da40e4, 0xadcea0e1, 0xa6f278de, 0xa279c0dc, | | | |
| 0x9beb48d9, 0x97a5c4d7, 0x916340d4, 0x8d4fc8d2, | | | |
| 0x895000d0, 0x8563b8ce, 0x818ac0cc, 0x7dc4e8ca, | | | |
| 0x7a1200c8, 0x7671d8c6, 0x72e440c4, 0x6f6908c2, | | | |
| 0x6db240c1, 0x6a523cbf, 0x670424bd, 0x6563c0bc, | | | |
| 0x623028ba, 0x609ce8b9, 0x5d8364b7, 0x5bfd18b6, | | | |
| 0x58fd40b4, 0x5783a8b3, 0x560e48b2, 0x533000b0, | | | |
| 0x51c70caf, 0x506238ae, 0x4da4c0ac, 0x4c4c10ab, | | | |
| 0x4af768aa, 0x49a6b8a9, 0x485a00a8, 0x471134a7, | | | |
| 0x45cc58a6, 0x434e40a4, 0x4214f8a3, 0x40df88a2, | | | |
| 0x3fade0a1, 0x3e8000a0, 0x3d55dc9f, 0x3c2f789e, | | | |
| 0x3c2f789e, 0x3b0cc49d, 0x39edc09c, 0x38d2609b, | | | |
| 0x37baa89a, 0x36a68899, 0x35960098, 0x34890497, | | | |
| 0x34890497, 0x337f9896, 0x3279ac95, 0x31774094, | | | |
| 0x30784893, 0x30784893, 0x2f7cc892, 0x2e84b091, | | | |
| 0x2d900090, 0x2d900090, 0x2c9eac8f, 0x2bb0b88e, | | | |
| 0x2bb0b88e, 0x2ac6148d, 0x29dec08c, 0x29dec08c, | | | |
| 0x28fab08b, 0x2819e88a, 0x2819e88a, 0x273c5889, | | | |
| 0x273c5889, 0x26620088, 0x258ad487, 0x258ad487, | | | |
| 0x24b6d886, 0x24b6d886, 0x23e5fc85, 0x23184084, | | | |
| 0x23184084, 0x224d9883, 0x224d9883, 0x21860882, | | | |
| 0x21860882, 0x20c18081, 0x20c18081, 0x20000080 | | | |
| }; | | | |
| | | | |
| __device_func__(float __internal_frcp_kernel (float x,enum cudaRoundMode mode)) | | | |
| { | | | |
| unsigned long long prod; | | | |
| volatile union __cudart_FloatUintCvt arg; | | | |
| unsigned int expo; | | | |
| unsigned int sign; | | | |
| unsigned f, y; | | | |
| | | | |
| arg.f = x; | | | |
| sign = arg.i & 0x80000000; | | | |
| expo = (arg.i >> 23); | | | |
| expo = expo & 0xff; | | | |
| f = expo - 1; | | | |
| | | | |
| if (f <= 0xFD) { | | | |
| y = (arg.i << 8); | | | |
| y = y | 0x80000000; | | | |
| /* initial approximation */ | | | |
| arg.i = __internal_rcpTab[(y >> 24) - 128]; | | | |
| /* first NR iteration */ | | | |
| f = arg.i * arg.i; | | | |
| f = f << 16; | | | |
| prod = ((unsigned long long)y) * f; | | | |
| arg.i = (arg.i << 24) - (unsigned)(prod >> 32); | | | |
| /* second NR iteration */ | | | |
| f = arg.i + arg.i; | | | |
| prod = ((unsigned long long)y) * f; | | | |
| f = (unsigned)(-(int)(prod >> 32)); | | | |
| prod = ((unsigned long long)arg.i) * f; | | | |
| y = y >> 8; | | | |
| /* compute exponent */ | | | |
| expo = (2 * 127) - expo - 2; | | | |
| arg.i = (unsigned)(prod >> 32); | | | |
| if (mode == cudaRoundNearest) { | | | |
| arg.i = arg.i >> 6; | | | |
| } else { | | | |
| arg.i = (arg.i + 32) >> 6; | | | |
| } | | | |
| if ((int)expo >= 0) { | | | |
| f = y * arg.i; | | | |
| arg.i = ((expo << 23) + arg.i) | sign; | | | |
| } else { | | | |
| /* result is a denormal */ | | | |
| expo = -(int)expo; | | | |
| arg.i = arg.i >> expo; | | | |
| f = y * arg.i; | | | |
| arg.i = arg.i | sign; | | | |
| } | | | |
| if (mode == cudaRoundNearest) { | | | |
| expo = f + y; | | | |
| if ((int)f < 0) f = (unsigned)(-(int)f); | | | |
| if ((int)expo < 0) expo = (unsigned)(-(int)expo); | | | |
| if (expo < f) arg.i++; | | | |
| } else if (mode == cudaRoundZero) { | | | |
| if ((int)f > 0) arg.i = arg.i - 1; | | | |
| } else if (mode == cudaRoundPosInf) { | | | |
| if (((int)f > 0) && sign) arg.i = arg.i - 1; | | | |
| if (((int)f < 0) && !sign) arg.i = arg.i + 1; | | | |
| } else { /* mode == cudaRoundMinInf */ | | | |
| if (((int)f > 0) && !sign) arg.i = arg.i - 1; | | | |
| if (((int)f < 0) && sign) arg.i = arg.i + 1; | | | |
| } | | | |
| return arg.f; | | | |
| } else { | | | |
| /* zero returns infinity. Must handle negative zero as well */ | | | |
| if (!(arg.i << 1)) { | | | |
| arg.i = 0x7F800000 | arg.i; | | | |
| return arg.f; | | | |
| } | | | |
| /* infinity returns zero of like sign */ | | | |
| if ((arg.i << 1) == 0xff000000) { | | | |
| arg.i &= 0x80000000; | | | |
| return arg.f; | | | |
| } | | | |
| /* convert SNaNs to QNaNs */ | | | |
| if ((arg.i << 1) > 0xff000000) { | | | |
| arg.i |= 0x00400000; | | | |
| return arg.f; | | | |
| } | | | |
| /* denormals */ | | | |
| f = 0; | | | |
| arg.i <<= 8; | | | |
| do { | | | |
| f++; | | | |
| arg.i <<= 1; | | | |
| } while ((int)arg.i > 0); | | | |
| arg.i >>= 8; | | | |
| arg.i |= sign; | | | |
| arg.f = __internal_frcp_kernel (arg.f, mode); | | | |
| expo = ((arg.i << 1) >> 24); | | | |
| if ((expo + f) < 255) { | | | |
| arg.i = (arg.i + (f << 23)); | | | |
| return arg.f; | | | |
| } | | | |
| if (mode == cudaRoundNearest) { | | | |
| arg.i = (arg.i & 0x80000000) | 0x7f800000; | | | |
| } else if (mode == cudaRoundZero) { | | | |
| arg.i = (arg.i & 0x80000000) | 0x7f7fffff; | | | |
| } else if (mode == cudaRoundPosInf) { | | | |
| arg.i = (arg.i & 0x80000000) | ((sign) ? 0x7f7fffff : 0x7f800000); | | | |
| } else { /* mode == cudaRoundMinInf */ | | | |
| arg.i = (arg.i & 0x80000000) | ((sign) ? 0x7f800000 : 0x7f7fffff); | | | |
| } | | | |
| return arg.f; | | | |
| } | | | |
| } | | | |
| | | | |
| __device_func__(float __internal_fsqrt_kernel (float radicand, | | | |
| enum cudaRoundMode mode)) | | | |
| { | | | |
| unsigned long long prod; | | | |
| volatile union __cudart_FloatUintCvt arg; | | | |
| unsigned int expo; | | | |
| unsigned int s, f, x; | | | |
| | | | |
| arg.f = radicand; | | | |
| expo = arg.i >> 23; | | | |
| expo = expo & 0xff; | | | |
| f = expo - 1; | | | |
| | | | |
| if ((arg.i <= 0x80000000) && (f <= 0xFD)) { | | | |
| /* normalize input argument */ | | | |
| x = (arg.i << 8) | 0x80000000; | | | |
| x = x >> (expo & 1); | | | |
| /* initial approximation */ | | | |
| arg.i = f = __internal_invSqrtCubeTab[((unsigned)x >> 25) - 32]; | | | |
| /* first NR iteration */ | | | |
| prod = ((unsigned long long)x) * f; | | | |
| arg.i = ((arg.i * 3) << 22) - (unsigned)(prod >> 32); | | | |
| /* second NR iteration */ | | | |
| prod = ((unsigned long long)arg.i) * arg.i; | | | |
| s = (unsigned)(prod >> 32); | | | |
| prod = ((unsigned long long)x) * s; | | | |
| f = 0x30000000 - (unsigned)(prod >> 32); | | | |
| prod = ((unsigned long long)f) * arg.i; | | | |
| arg.i = (unsigned)(prod >> 32); | | | |
| /* compute sqrt(x) as x * 1/sqrt(x) */ | | | |
| prod = ((unsigned long long)x) * arg.i; | | | |
| arg.i = (unsigned)(prod >> 32); | | | |
| if (mode == cudaRoundNearest) { | | | |
| arg.i = arg.i >> 3; | | | |
| } else { | | | |
| arg.i = (arg.i + 4) >> 3; | | | |
| } | | | |
| x = (x << 16) - (arg.i * arg.i); | | | |
| /* round to nearest based on remainder; tie case impossible */ | | | |
| if (mode == cudaRoundNearest) { | | | |
| f = x - (2 * arg.i + 1); | | | |
| if ((int)f < 0) f = (unsigned)(-(int)f); | | | |
| if ((int)x < 0) x = (unsigned)(-(int)x); | | | |
| if (f < x) arg.i ++; | | | |
| } else if ((mode == cudaRoundZero) || (mode == cudaRoundMinInf)) { | | | |
| if ((int)x < 0) arg.i--; | | | |
| } else if (mode == cudaRoundPosInf) { | | | |
| if ((int)x > 0) arg.i++; | | | |
| } | | | |
| arg.i = arg.i + (((expo + 125) & ~0x1) << 22); | | | |
| return arg.f; | | | |
| } else { | | | |
| /* if zero, or positive infinity, return argument */ | | | |
| if (!(arg.i << 1) || (arg.i == 0x7F800000)) { | | | |
| return arg.f; | | | |
| } | | | |
| /* if NaN, return argument, possibly converted to QNaN */ | | | |
| if ((arg.i << 1) > 0xFF000000) { | | | |
| arg.i |= 0x00400000; | | | |
| return arg.f; | | | |
| } | | | |
| /* if negative, return NaN: INDEFINITE */ | | | |
| if (arg.i & 0x80000000) { | | | |
| arg.i = 0xFFC00000; | | | |
| return arg.f; | | | |
| } | | | |
| /* denormal, normalize it before computing square root */ | | | |
| x = 0; | | | |
| arg.i <<= 8; | | | |
| do { | | | |
| x++; | | | |
| arg.i <<= 1; | | | |
| } while ((int)arg.i > 0); | | | |
| arg.i >>= 8; | | | |
| arg.i += (x & 1) << 23; | | | |
| x += (x & 1); | | | |
| arg.f = __internal_fsqrt_kernel (arg.f, mode); | | | |
| arg.i -= ((x >> 1) << 23); | | | |
| return arg.f; | | | |
| } | | | |
| } | | | |
| | | | |
| __device_func__(float __internal_fdiv_kernel (float dividend, float divisor, | | | |
| enum cudaRoundMode mode)) | | | |
| { | | | |
| unsigned long long prod; | | | |
| unsigned r, f, x, y, expox, expoy, sign; | | | |
| volatile union __cudart_FloatUintCvt cvtx, cvty, res; | | | |
| | | | |
| cvtx.f = dividend; | | | |
| cvty.f = divisor; | | | |
| expox = ((cvtx.i >> 23) & 0xff) - 1; | | | |
| expoy = ((cvty.i >> 23) & 0xff) - 1; | | | |
| sign = ((cvtx.i ^ cvty.i) & 0x80000000); | | | |
| | | | |
| if ((expox <= 0xFD) && (expoy <= 0xFD)) { | | | |
| divide: | | | |
| expox = expox - expoy + 127 - 1; | | | |
| expoy = expox; | | | |
| /* extract mantissas */ | | | |
| y = (cvty.i << 8) | 0x80000000; | | | |
| x = (cvtx.i & 0x00ffffff) | 0x00800000; | | | |
| /* initial approximation */ | | | |
| r = __internal_rcpTab[(y >> 24) - 128]; | | | |
| /* first NR iteration */ | | | |
| f = r * r; | | | |
| prod = ((unsigned long long)y) * (f << 16); | | | |
| r = (r << 24) - (unsigned)(prod >> 32); | | | |
| /* second NR iteration */ | | | |
| prod = ((unsigned long long)y) * (r << 1); | | | |
| f = (unsigned)-(int)(prod >> 32); | | | |
| prod = ((unsigned long long)f) * (r << 1); | | | |
| r = (unsigned)(prod >> 32); | | | |
| /* produce quotient */ | | | |
| prod = ((unsigned long long)x) * (r << 1); | | | |
| /* normalize mantissa */ | | | |
| if (((int)((prod >> 32) << 8)) > 0) { | | | |
| expox--; | | | |
| prod = prod + prod; | | | |
| } | | | |
| if (mode == cudaRoundNearest) { | | | |
| /* preliminary mantissa */ | | | |
| r = (unsigned)(prod >> 32); | | | |
| y = y >> 8; | | | |
| /* result is a normal */ | | | |
| if (expox <= 0xFD) { | | | |
| int rem0, rem1, inc; | | | |
| /* round mantissa to nearest even */ | | | |
| prod = ((unsigned long long)y) * r; | | | |
| x = x << (23 + ((prod >> 32) >> 15)); | | | |
| rem1 = x - (unsigned)(prod & 0xffffffff); | | | |
| rem0 = rem1 - y; | | | |
| inc = abs(rem0) < abs(rem1); | | | |
| /* merge sign, mantissa, exponent for final result */ | | | |
| res.i = sign | ((expox << 23) + r + inc); | | | |
| return res.f; | | | |
| } else if ((int)expox >= 254) { | | | |
| /* overflow: return infinity */ | | | |
| res.i = sign | 0x7f800000; | | | |
| return res.f; | | | |
| } else { | | | |
| /* underflow: result is zero, denormal, or smallest normal */ | | | |
| int shift = -(int)expox; | | | |
| if (shift > 23) { | | | |
| /* result is zero or smallest denormal */ | | | |
| r = (shift < 25) && ((x != y) || (r > 0x00ff0000)); | | | |
| res.i = sign | r; | | | |
| return res.f; | | | |
| } | | | |
| if (x == y) { | | | |
| /* result is denormal */ | | | |
| shift = -(int)expoy; | | | |
| r = 0x00800000 >> shift; | | | |
| res.i = sign | r; | | | |
| return res.f; | | | |
| } | | | |
| { | | | |
| unsigned long long tempx; | | | |
| long long remlo, remhi; | | | |
| /* result is denormal or smallest normal */ | | | |
| r = r >> shift; | | | |
| prod = ((unsigned long long)y) * r; | | | |
| tempx = ((unsigned long long)x) << (23 - shift); | | | |
| remlo = 2 * tempx - 2 * prod - y; | | | |
| remhi = remlo + 2 * tempx; | | | |
| if (remlo < 0) remlo = -remlo; | | | |
| if (remhi < 0) remhi = -remhi; | | | |
| if (remhi < remlo) tempx = 2 * tempx; | | | |
| remlo = tempx - prod; | | | |
| remhi = remlo - y; | | | |
| if (remlo < 0) remlo = -remlo; | | | |
| if (remhi < 0) remhi = -remhi; | | | |
| if ((remhi < remlo) || ((remhi == remlo) && (r & 1))) r++; | | | |
| res.i = sign | r; | | | |
| return res.f; | | | |
| } | | | |
| } | | | |
| } else if (mode == cudaRoundZero) { | | | |
| /* preliminary mantissa */ | | | |
| prod += 0x0000000080000000ULL; | | | |
| r = (unsigned)(prod >> 32); | | | |
| y = y >> 8; | | | |
| /* result is a normal */ | | | |
| if (expox <= 0xFD) { | | | |
| int rem1; | | | |
| prod = ((unsigned long long)y) * r; | | | |
| x = x << (23 + ((prod >> 32) >> 15)); | | | |
| rem1 = x - (unsigned)(prod & 0xffffffff); | | | |
| if (rem1 < 0) r--; | | | |
| r = (expox << 23) + r; | | | |
| if (r == 0x7f800000) r = 0x7f7fffff; | | | |
| res.i = sign | r; | | | |
| return res.f; | | | |
| } else if ((int)expox >= 254) { | | | |
| /* overflow: return largest normal */ | | | |
| res.i = sign | 0x7f7fffff; | | | |
| return res.f; | | | |
| } else { | | | |
| /* underflow: result is zero, denormal, or smallest normal */ | | | |
| int shift = -(int)expox; | | | |
| if ((x == y) && (shift < 31)) { | | | |
| shift = -(int)expoy; | | | |
| r = 0x00800000 >> shift; | | | |
| res.i = sign | r; | | | |
| return res.f; | | | |
| } | | | |
| if (shift > 23) { | | | |
| r = 0; | | | |
| res.i = sign | r; | | | |
| return res.f; | | | |
| } | | | |
| { | | | |
| unsigned long long tempx; | | | |
| long long remlo, remhi; | | | |
| /* result is denormal or smallest normal */ | | | |
| r = r >> shift; | | | |
| prod = ((unsigned long long)y) * r; | | | |
| tempx = ((unsigned long long)x) << (23 - shift); | | | |
| remlo = 2 * tempx - 2 * prod - y; | | | |
| remhi = remlo + 2 * tempx; | | | |
| if (remlo < 0) remlo = -remlo; | | | |
| if (remhi < 0) remhi = -remhi; | | | |
| if (remhi < remlo) tempx = 2 * tempx; | | | |
| remlo = tempx - prod; | | | |
| if ((remlo < 0) & (r != 0)) r--; | | | |
| res.i = sign | r; | | | |
| return res.f; | | | |
| } | | | |
| } | | | |
| } else if (mode == cudaRoundPosInf) { | | | |
| /* preliminary mantissa */ | | | |
| prod += 0x0000000080000000ULL; | | | |
| r = (unsigned)(prod >> 32); | | | |
| y = y >> 8; | | | |
| /* result is a normal */ | | | |
| if (expox <= 0xFD) { | | | |
| int rem1; | | | |
| prod = ((unsigned long long)y) * r; | | | |
| x = x << (23 + ((prod >> 32) >> 15)); | | | |
| rem1 = x - (unsigned)(prod & 0xffffffff); | | | |
| if ((rem1 < 0) && (sign)) r--; | | | |
| if ((rem1 > 0) && (!sign)) r++; | | | |
| r = (expox << 23) + r; | | | |
| if ((r == 0x7f800000) && (sign)) r = 0x7f7fffff; | | | |
| res.i = sign | r; | | | |
| return res.f; | | | |
| } else if ((int)expox >= 254) { | | | |
| /* overflow: return largest normal, or infinity */ | | | |
| r = sign ? 0x7f7fffff : 0x7f800000; | | | |
| res.i = sign | r; | | | |
| return res.f; | | | |
| } else { | | | |
| /* underflow: result is zero, denormal, or smallest normal */ | | | |
| int shift = -(int)expox; | | | |
| if ((x == y) && (shift <= 24)) { | | | |
| shift = -(int)expoy; | | | |
| r = 0x00800000 >> shift; | | | |
| if (r == 0) r = !sign; | | | |
| res.i = sign | r; | | | |
| return res.f; | | | |
| } | | | |
| if (shift > 23) { | | | |
| r = !sign; | | | |
| res.i = sign | r; | | | |
| return res.f; | | | |
| } | | | |
| { | | | |
| unsigned long long tempx; | | | |
| long long remlo, remhi; | | | |
| /* result is denormal or smallest normal */ | | | |
| r = r >> shift; | | | |
| prod = ((unsigned long long)y) * r; | | | |
| tempx = ((unsigned long long)x) << (23 - shift); | | | |
| remlo = 2 * tempx - 2 * prod - y; | | | |
| remhi = remlo + 2 * tempx; | | | |
| if (remlo < 0) remlo = -remlo; | | | |
| if (remhi < 0) remhi = -remhi; | | | |
| if (remhi < remlo) tempx = 2 * tempx; | | | |
| remlo = tempx - prod; | | | |
| if ((remlo < 0) && (r != 0) && (sign)) r--; | | | |
| if ((remlo > 0) && (!sign)) r++; | | | |
| res.i = sign | r; | | | |
| return res.f; | | | |
| } | | | |
| } | | | |
| } else if (mode == cudaRoundMinInf) { | | | |
| /* preliminary mantissa */ | | | |
| prod += 0x0000000080000000ULL; | | | |
| r = (unsigned)(prod >> 32); | | | |
| y = y >> 8; | | | |
| /* result is a normal */ | | | |
| if (expox <= 0xFD) { | | | |
| int rem1; | | | |
| prod = ((unsigned long long)y) * r; | | | |
| x = x << (23 + ((prod >> 32) >> 15)); | | | |
| rem1 = x - (unsigned)(prod & 0xffffffff); | | | |
| if ((rem1 < 0) && (!sign)) r--; | | | |
| if ((rem1 > 0) && (sign)) r++; | | | |
| r = (expox << 23) + r; | | | |
| if ((r == 0x7f800000) && (!sign)) r = 0x7f7fffff; | | | |
| res.i = sign | r; | | | |
| return res.f; | | | |
| } else if ((int)expox >= 254) { | | | |
| /* overflow: return largest normal, or infinity */ | | | |
| r = sign ? 0x7f800000 : 0x7f7fffff; | | | |
| res.i = sign | r; | | | |
| return res.f; | | | |
| } else { | | | |
| /* underflow: result is zero, denormal, or smallest normal */ | | | |
| int shift = -(int)expox; | | | |
| if ((x == y) && (shift <= 24)) { | | | |
| shift = -(int)expoy; | | | |
| r = 0x00800000 >> shift; | | | |
| if (r == 0) r = !!sign; | | | |
| res.i = sign | r; | | | |
| return res.f; | | | |
| } | | | |
| if (shift > 23) { | | | |
| r = !!sign; | | | |
| res.i = sign | r; | | | |
| return res.f; | | | |
| } | | | |
| { | | | |
| unsigned long long tempx; | | | |
| long long remlo, remhi; | | | |
| /* result is denormal or smallest normal */ | | | |
| r = r >> shift; | | | |
| prod = ((unsigned long long)y) * r; | | | |
| tempx = ((unsigned long long)x) << (23 - shift); | | | |
| remlo = 2 * tempx - 2 * prod - y; | | | |
| remhi = remlo + 2 * tempx; | | | |
| if (remlo < 0) remlo = -remlo; | | | |
| if (remhi < 0) remhi = -remhi; | | | |
| if (remhi < remlo) tempx = 2 * tempx; | | | |
| remlo = tempx - prod; | | | |
| if ((remlo < 0) && (r != 0) && (!sign)) r--; | | | |
| if ((remlo > 0) && (sign)) r++; | | | |
| res.i = sign | r; | | | |
| return res.f; | | | |
| } | | | |
| } | | | |
| } | | | |
| } | | | |
| { | | | |
| int xzero, yzero, xinf, yinf, xnan, ynan; | | | |
| | | | |
| xnan = (cvtx.i << 1) > 0xff000000; | | | |
| ynan = (cvty.i << 1) > 0xff000000; | | | |
| /* handle NaNs. Convert SNaNs to QNaNs */ | | | |
| if (xnan) { | | | |
| res.i = cvtx.i | 0x00400000; | | | |
| return res.f; | | | |
| } | | | |
| if (ynan) { | | | |
| res.i = cvty.i | 0x00400000; | | | |
| return res.f; | | | |
| } | | | |
| xzero = (cvtx.i << 1) == 0x00000000; | | | |
| yzero = (cvty.i << 1) == 0x00000000; | | | |
| xinf = (cvtx.i << 1) == 0xff000000; | | | |
| yinf = (cvty.i << 1) == 0xff000000; | | | |
| /* 0/0 and INF/INF are invalid operations. Return INDEFINITE */ | | | |
| if ((xzero & yzero) | (xinf & yinf)) { | | | |
| res.i = 0xffc00000; | | | |
| return res.f; | | | |
| } | | | |
| /* x/INF and 0/y -> 0 */ | | | |
| if (xzero | yinf) { | | | |
| res.i = sign; | | | |
| return res.f; | | | |
| } | | | |
| /* x/0 and INF/y -> INF */ | | | |
| if (yzero | xinf) { | | | |
| res.i = sign | 0x7f800000; | | | |
| return res.f; | | | |
| } | | | |
| /* normalize denormals */ | | | |
| if ((int)expox < 0) { | | | |
| cvtx.i = cvtx.i << 9; | | | |
| while ((int)cvtx.i >= 0) { | | | |
| expox--; | | | |
| cvtx.i = cvtx.i + cvtx.i; | | | |
| } | | | |
| cvtx.i = cvtx.i >> 8; | | | |
| } | | | |
| if ((int)expoy < 0) { | | | |
| cvty.i = cvty.i << 9; | | | |
| while ((int)cvty.i >= 0) { | | | |
| expoy--; | | | |
| cvty.i = cvty.i + cvty.i; | | | |
| } | | | |
| cvty.i = cvty.i >> 8; | | | |
| } | | | |
| goto divide; | | | |
| } | | | |
| } | | | |
| | | | |
| __device_func__(float __internal_fmul_kernel (float a, float b, | | | |
| enum cudaRoundMode mode)) | | | |
| { | | | |
| unsigned long long product; | | | |
| volatile union __cudart_FloatUintCvt xx, yy; | | | |
| unsigned expo_x, expo_y; | | | |
| | | | |
| xx.f = a; | | | |
| yy.f = b; | | | |
| | | | |
| expo_y = 0xFF; | | | |
| expo_x = expo_y & (xx.i >> 23); | | | |
| expo_x = expo_x - 1; | | | |
| expo_y = expo_y & (yy.i >> 23); | | | |
| expo_y = expo_y - 1; | | | |
| | | | |
| if ((expo_x <= 0xFD) && | | | |
| (expo_y <= 0xFD)) { | | | |
| multiply: | | | |
| expo_x = expo_x + expo_y; | | | |
| expo_y = xx.i ^ yy.i; | | | |
| xx.i = xx.i & 0x00ffffff; | | | |
| yy.i = yy.i << 8; | | | |
| xx.i = xx.i | 0x00800000; | | | |
| yy.i = yy.i | 0x80000000; | | | |
| /* compute product */ | | | |
| product = ((unsigned long long)xx.i) * yy.i; | | | |
| expo_x = expo_x - 127 + 2; | | | |
| expo_y = expo_y & 0x80000000; | | | |
| xx.i = (unsigned int)(product >> 32); | | | |
| yy.i = (unsigned int)(product & 0xffffffff); | | | |
| /* normalize mantissa */ | | | |
| if (xx.i < 0x00800000) { | | | |
| xx.i = (xx.i << 1) | (yy.i >> 31); | | | |
| yy.i = (yy.i << 1); | | | |
| expo_x--; | | | |
| } | | | |
| if (expo_x <= 0xFD) { | | | |
| xx.i = xx.i | expo_y; /* OR in sign bit */ | | | |
| xx.i = xx.i + (expo_x << 23); /* add in exponent */ | | | |
| /* round result to nearest or even */ | | | |
| if (mode == cudaRoundNearest) { | | | |
| if (yy.i < 0x80000000) return xx.f; | | | |
| xx.i += ((yy.i == 0x80000000) ? (xx.i & 1) : (yy.i >> 31)); | | | |
| } else if (mode == cudaRoundZero) { | | | |
| } else if (mode == cudaRoundPosInf) { | | | |
| xx.i += (yy.i && !expo_y); | | | |
| } else if (mode == cudaRoundMinInf) { | | | |
| xx.i += (yy.i && expo_y); | | | |
| } | | | |
| return xx.f; | | | |
| } else if ((int)expo_x >= 254) { | | | |
| /* overflow: return infinity or largest normal */ | | | |
| if (mode == cudaRoundNearest) { | | | |
| xx.i = expo_y | 0x7F800000; | | | |
| } else if (mode == cudaRoundZero) { | | | |
| xx.i = expo_y | 0x7F7FFFFF; | | | |
| } else if (mode == cudaRoundPosInf) { | | | |
| xx.i = (expo_y ? 0xff7fffff : 0x7F800000); | | | |
| } else { /* (mode == cudaRoundMinInf) */ | | | |
| xx.i = (expo_y ? 0xFF800000 : 0x7f7fffff); | | | |
| } | | | |
| return xx.f; | | | |
| } else { | | | |
| /* zero, denormal, or smallest normal */ | | | |
| expo_x = ((unsigned int)-((int)expo_x)); | | | |
| if (mode == cudaRoundNearest) { | | | |
| if (expo_x > 25) { | | | |
| /* massive underflow: return 0 */ | | | |
| xx.i = expo_y; | | | |
| return xx.f; | | | |
| } else { | | | |
| yy.i = (xx.i << (32 - expo_x)) | ((yy.i) ? 1 : 0); | | | |
| xx.i = expo_y + (xx.i >> expo_x); | | | |
| xx.i += ((yy.i == 0x80000000) ? (xx.i & 1) : (yy.i >> 31)); | | | |
| return xx.f; | | | |
| } | | | |
| } else if (mode == cudaRoundZero) { | | | |
| if (expo_x > 25) expo_x = 25; | | | |
| xx.i = expo_y + (xx.i >> expo_x); | | | |
| return xx.f; | | | |
| } else if (mode == cudaRoundPosInf) { | | | |
| if (expo_x > 25) expo_x = 25; | | | |
| yy.i = (xx.i << (32 - expo_x)) | ((yy.i) ? 1 : 0); | | | |
| xx.i = expo_y + (xx.i >> expo_x); | | | |
| xx.i += (yy.i && !expo_y); | | | |
| return xx.f; | | | |
| } else { /* (mode == cudaRoundMinInf) */ | | | |
| if (expo_x > 25) expo_x = 25; | | | |
| yy.i = (xx.i << (32 - expo_x)) | ((yy.i) ? 1 : 0); | | | |
| xx.i = expo_y + (xx.i >> expo_x); | | | |
| xx.i += (yy.i && expo_y); | | | |
| return xx.f; | | | |
| } | | | |
| } | | | |
| } else { | | | |
| product = xx.i ^ yy.i; | | | |
| product = product & 0x80000000; | | | |
| if (!(xx.i & 0x7fffffff)) { | | | |
| if (expo_y != 254) { | | | |
| xx.i = (unsigned int)product; | | | |
| return xx.f; | | | |
| } | | | |
| expo_y = yy.i << 1; | | | |
| if (expo_y == 0xFF000000) { | | | |
| xx.i = expo_y | 0x00C00000; | | | |
| } else { | | | |
| xx.i = yy.i | 0x00400000; | | | |
| } | | | |
| return xx.f; | | | |
| } | | | |
| if (!(yy.i & 0x7fffffff)) { | | | |
| if (expo_x != 254) { | | | |
| xx.i = (unsigned int)product; | | | |
| return xx.f; | | | |
| } | | | |
| expo_x = xx.i << 1; | | | |
| if (expo_x == 0xFF000000) { | | | |
| xx.i = expo_x | 0x00C00000; | | | |
| } else { | | | |
| xx.i = xx.i | 0x00400000; | | | |
| } | | | |
| return xx.f; | | | |
| } | | | |
| if ((expo_y != 254) && (expo_x != 254)) { | | | |
| expo_y++; | | | |
| expo_x++; | | | |
| if (expo_x == 0) { | | | |
| expo_y |= xx.i & 0x80000000; | | | |
| /* | | | |
| * If both operands are denormals, we only need to normalize | | | |
| * one of them as the result will be either a denormal or zero. | | | |
| */ | | | |
| xx.i = xx.i << 8; | | | |
| while (!(xx.i & 0x80000000)) { | | | |
| xx.i <<= 1; | | | |
| expo_x--; | | | |
| } | | | |
| xx.i = (xx.i >> 8) | (expo_y & 0x80000000); | | | |
| expo_y &= ~0x80000000; | | | |
| expo_y--; | | | |
| goto multiply; | | | |
| } | | | |
| if (expo_y == 0) { | | | |
| expo_x |= yy.i & 0x80000000; | | | |
| yy.i = yy.i << 8; | | | |
| while (!(yy.i & 0x80000000)) { | | | |
| yy.i <<= 1; | | | |
| expo_y--; | | | |
| } | | | |
| yy.i = (yy.i >> 8) | (expo_x & 0x80000000); | | | |
| expo_x &= ~0x80000000; | | | |
| expo_x--; | | | |
| goto multiply; | | | |
| } | | | |
| } | | | |
| expo_x = xx.i << 1; | | | |
| expo_y = yy.i << 1; | | | |
| /* if x is NaN, return x */ | | | |
| if (expo_x > 0xFF000000) { | | | |
| /* cvt any SNaNs to QNaNs */ | | | |
| xx.i = xx.i | 0x00400000; | | | |
| return xx.f; | | | |
| } | | | |
| /* if y is NaN, return y */ | | | |
| if (expo_y > 0xFF000000) { | | | |
| /* cvt any SNaNs to QNaNs */ | | | |
| xx.i = yy.i | 0x00400000; | | | |
| return xx.f; | | | |
| } | | | |
| xx.i = (unsigned int)product | 0x7f800000; | | | |
| return xx.f; | | | |
| } | | | |
| } | | | |
| | | | |
| __device_func__(float __internal_fmaf_kernel (float a, float b, float c, | | | |
| enum cudaRoundMode mode)) | | | |
| { | | | |
| unsigned long long product; | | | |
| unsigned int xx, yy, zz, ww; | | | |
| unsigned int temp, s, u; | | | |
| unsigned int expo_x, expo_y, expo_z; | | | |
| volatile union __cudart_FloatUintCvt cvt; | | | |
| | | | |
| cvt.f = a; | | | |
| xx = cvt.i; | | | |
| cvt.f = b; | | | |
| yy = cvt.i; | | | |
| cvt.f = c; | | | |
| zz = cvt.i; | | | |
| | | | |
| temp = 0xff; | | | |
| expo_x = temp & (xx >> 23); | | | |
| expo_x = expo_x - 1; | | | |
| expo_y = temp & (yy >> 23); | | | |
| expo_y = expo_y - 1; | | | |
| expo_z = temp & (zz >> 23); | | | |
| expo_z = expo_z - 1; | | | |
| | | | |
| if (!((expo_x <= 0xFD) && | | | |
| (expo_y <= 0xFD) && | | | |
| (expo_z <= 0xFD))) { | | | |
| /* fmad (nan, y, z) --> nan | | | |
| fmad (x, nan, z) --> nan | | | |
| fmad (x, y, nan) --> nan | | | |
| */ | | | |
| if ((yy << 1) > 0xff000000) { | | | |
| return b + b; | | | |
| } | | | |
| if ((zz << 1) > 0xff000000) { | | | |
| return c + c; | | | |
| } | | | |
| if ((xx << 1) > 0xff000000) { | | | |
| return a + a; | | | |
| } | | | |
| /* fmad (0, inf, z) --> NaN | | | |
| fmad (inf, 0, z) --> NaN | | | |
| fmad (-inf,+y,+inf) --> NaN | | | |
| fmad (+x,-inf,+inf) --> NaN | | | |
| fmad (+inf,-y,+inf) --> NaN | | | |
| fmad (-x,+inf,+inf) --> NaN | | | |
| fmad (-inf,-y,-inf) --> NaN | | | |
| fmad (-x,-inf,-inf) --> NaN | | | |
| fmad (+inf,+y,-inf) --> NaN | | | |
| fmad (+x,+inf,-inf) --> NaN | | | |
| */ | | | |
| if ((((xx << 1) == 0) && ((yy << 1) == 0xff000000)) || | | | |
| (((yy << 1) == 0) && ((xx << 1) == 0xff000000))) { | | | |
| cvt.i = 0xffc00000; | | | |
| return cvt.f; | | | |
| } | | | |
| if ((zz << 1) == 0xff000000) { | | | |
| if (((yy << 1) == 0xff000000) || ((xx << 1) == 0xff000000)) { | | | |
| if ((int)(xx ^ yy ^ zz) < 0) { | | | |
| cvt.i = 0xffc00000; | | | |
| return cvt.f; | | | |
| } | | | |
| } | | | |
| } | | | |
| /* fmad (inf, y, z) --> inf | | | |
| fmad (x, inf, z) --> inf | | | |
| fmad (x, y, inf) --> inf | | | |
| */ | | | |
| if ((xx << 1) == 0xff000000) { | | | |
| xx = xx ^ (yy & 0x80000000); | | | |
| cvt.i = xx; | | | |
| return cvt.f; | | | |
| } | | | |
| if ((yy << 1) == 0xff000000) { | | | |
| yy = yy ^ (xx & 0x80000000); | | | |
| cvt.i = yy; | | | |
| return cvt.f; | | | |
| } | | | |
| if ((zz << 1) == 0xff000000) { | | | |
| cvt.i = zz; | | | |
| return cvt.f; | | | |
| } | | | |
| /* fmad (+0, -y, -0) --> -0 | | | |
| fmad (-0, +y, -0) --> -0 | | | |
| fmad (+x, -0, -0) --> -0 | | | |
| fmad (-x, +0, -0) --> -0 | | | |
| */ | | | |
| if (zz == 0x80000000) { | | | |
| if (((xx << 1) == 0) || ((yy << 1) == 0)) { | | | |
| if ((int)(xx ^ yy) < 0) { | | | |
| cvt.i = zz; | | | |
| return cvt.f; | | | |
| } | | | |
| } | | | |
| } | | | |
| /* fmad (0, y, 0) --> +0 | | | |
| fmad (x, 0, 0) --> +0 | | | |
| */ | | | |
| if (((zz << 1) == 0) && | | | |
| (((xx << 1) == 0) || ((yy << 1) == 0))) { | | | |
| if (mode == cudaRoundMinInf) { | | | |
| zz = 0x80000000 & (xx ^ yy ^ zz); | | | |
| } else { | | | |
| zz &= 0x7fffffff; | | | |
| } | | | |
| cvt.i = zz; | | | |
| return cvt.f; | | | |
| } | | | |
| /* fmad (0, y, z) --> z | | | |
| fmad (x, 0, z) --> z | | | |
| */ | | | |
| if (((xx << 1) == 0) || ((yy << 1) == 0)) { | | | |
| cvt.i = zz; | | | |
| return cvt.f; | | | |
| } | | | |
| /* normalize x, if denormal */ | | | |
| if (expo_x == (unsigned)-1) { | | | |
| temp = xx & 0x80000000; | | | |
| xx = xx << 8; | | | |
| while (!(xx & 0x80000000)) { | | | |
| xx <<= 1; | | | |
| expo_x--; | | | |
| } | | | |
| expo_x++; | | | |
| xx = (xx >> 8) | temp; | | | |
| } | | | |
| /* normalize y, if denormal */ | | | |
| if (expo_y == (unsigned)-1) { | | | |
| temp = yy & 0x80000000; | | | |
| yy = yy << 8; | | | |
| while (!(yy & 0x80000000)) { | | | |
| yy <<= 1; | | | |
| expo_y--; | | | |
| } | | | |
| expo_y++; | | | |
| yy = (yy >> 8) | temp; | | | |
| } | | | |
| /* normalize z, if denormal */ | | | |
| if ((expo_z == (unsigned)-1) && ((zz << 1) != 0)) { | | | |
| temp = zz & 0x80000000; | | | |
| zz = zz << 8; | | | |
| while (!(zz & 0x80000000)) { | | | |
| zz <<= 1; | | | |
| expo_z--; | | | |
| } | | | |
| expo_z++; | | | |
| zz = (zz >> 8) | temp; | | | |
| } | | | |
| } | | | |
| | | | |
| expo_x = expo_x + expo_y; | | | |
| expo_y = xx ^ yy; | | | |
| xx = xx & 0x00ffffff; | | | |
| yy = yy << 8; | | | |
| xx = xx | 0x00800000; | | | |
| yy = yy | 0x80000000; | | | |
| | | | |
| product = ((unsigned long long)xx) * yy; | | | |
| xx = (unsigned)(product >> 32); | | | |
| yy = (unsigned)(product & 0xffffffff); | | | |
| | | | |
| expo_x = expo_x - 127 + 2; | | | |
| expo_y = expo_y & 0x80000000; | | | |
| /* normalize mantissa */ | | | |
| if (xx < 0x00800000) { | | | |
| xx = (xx << 1) | (yy >> 31); | | | |
| yy = (yy << 1); | | | |
| expo_x--; | | | |
| } | | | |
| temp = 0; | | | |
| | | | |
| if ((zz << 1) != 0) { /* z is not zero */ | | | |
| s = zz & 0x80000000; | | | |
| zz &= 0x00ffffff; | | | |
| zz |= 0x00800000; | | | |
| ww = 0; | | | |
| /* compare and swap. put augend into xx:yy */ | | | |
| if ((int)expo_z > (int)expo_x) { | | | |
| temp = expo_z; | | | |
| expo_z = expo_x; | | | |
| expo_x = temp; | | | |
| temp = zz; | | | |
| zz = xx; | | | |
| xx = temp; | | | |
| temp = ww; | | | |
| ww = yy; | | | |
| yy = temp; | | | |
| temp = expo_y; | | | |
| expo_y = s; | | | |
| s = temp; | | | |
| } | | | |
| /* augend_sign = expo_y, augend_mant = xx:yy, augend_expo = expo_x */ | | | |
| /* addend_sign = s, addend_mant = zz:ww, addend_expo = expo_z */ | | | |
| expo_z = expo_x - expo_z; | | | |
| u = expo_y ^ s; | | | |
| if (expo_z <= 49) { | | | |
| /* denormalize addend */ | | | |
| temp = 0; | | | |
| while (expo_z >= 32) { | | | |
| temp = ww | (temp != 0); | | | |
| ww = zz; | | | |
| zz = 0; | | | |
| expo_z -= 32; | | | |
| } | | | |
| if (expo_z) { | | | |
| temp = ((temp >> expo_z) | (ww << (32 - expo_z)) | | | | |
| ((temp << (32 - expo_z)) != 0)); | | | |
| ww = (ww >> expo_z) | (zz << (32 - expo_z)); | | | |
| zz = (zz >> expo_z); | | | |
| } | | | |
| | | | |
| } else { | | | |
| temp = 1; | | | |
| ww = 0; | | | |
| zz = 0; | | | |
| } | | | |
| if ((int)u < 0) { | | | |
| /* signs differ, effective subtraction */ | | | |
| temp = (unsigned)(-(int)temp); | | | |
| s = (temp != 0); | | | |
| u = yy - s; | | | |
| s = u > yy; | | | |
| yy = u - ww; | | | |
| s += yy > u; | | | |
| xx = (xx - zz) - s; | | | |
| if (!(xx | yy | temp)) { | | | |
| /* complete cancellation, return 0 */ | | | |
| if (mode == cudaRoundMinInf) { | | | |
| xx = 0x80000000; | | | |
| } | | | |
| cvt.i = xx; | | | |
| return cvt.f; | | | |
| } | | | |
| if ((int)xx < 0) { | | | |
| /* oops, augend had smaller mantissa. Negate mantissa and flip | | | |
| sign of result */ | | | |
| temp = ~temp; | | | |
| yy = ~yy; | | | |
| xx = ~xx; | | | |
| if (++temp == 0) { | | | |
| if (++yy == 0) { | | | |
| ++xx; | | | |
| } | | | |
| } | | | |
| expo_y ^= 0x80000000; | | | |
| } | | | |
| /* normalize mantissa, if necessary */ | | | |
| while (!(xx & 0x00800000)) { | | | |
| xx = (xx << 1) | (yy >> 31); | | | |
| yy = (yy << 1); | | | |
| expo_x--; | | | |
| } | | | |
| } else { | | | |
| /* signs are the same, effective addition */ | | | |
| yy = yy + ww; | | | |
| s = yy < ww; | | | |
| xx = xx + zz + s; | | | |
| if (xx & 0x01000000) { | | | |
| temp = temp | (yy << 31); | | | |
| yy = (yy >> 1) | (xx << 31); | | | |
| xx = ((xx & 0x80000000) | (xx >> 1)) & ~0x40000000; | | | |
| expo_x++; | | | |
| } | | | |
| } | | | |
| } | | | |
| temp = yy | (temp != 0); | | | |
| if (expo_x <= 0xFD) { | | | |
| /* normal */ | | | |
| xx |= expo_y; /* or in sign bit */ | | | |
| if (mode == cudaRoundNearest) { | | | |
| s = xx & 1; /* mantissa lsb */ | | | |
| xx += (temp == 0x80000000) ? s : (temp >> 31); | | | |
| } else if (mode == cudaRoundPosInf) { | | | |
| xx += temp && !expo_y; | | | |
| } else if (mode == cudaRoundMinInf) { | | | |
| xx += temp && expo_y; | | | |
| } | | | |
| xx = xx + (expo_x << 23); /* add in exponent */ | | | |
| cvt.i = xx; | | | |
| return cvt.f; | | | |
| } else if ((int)expo_x >= 126) { | | | |
| /* overflow */ | | | |
| if (mode == cudaRoundNearest) { | | | |
| xx = expo_y | 0x7f800000; | | | |
| } else if (mode == cudaRoundZero) { | | | |
| xx = expo_y | 0x7F7FFFFF; | | | |
| } else if (mode == cudaRoundPosInf) { | | | |
| xx = expo_y ? 0xFF7FFFFF : 0x7f800000; | | | |
| } else if (mode == cudaRoundMinInf) { | | | |
| xx = expo_y ? 0xff800000 : 0x7f7fffff; | | | |
| } | | | |
| cvt.i = xx; | | | |
| return cvt.f; | | | |
| } | | | |
| /* subnormal */ | | | |
| expo_x = (unsigned int)(-(int)expo_x); | | | |
| if (expo_x > 25) { | | | |
| /* massive underflow: return 0, or smallest denormal */ | | | |
| xx = 0; | | | |
| if (mode == cudaRoundPosInf) { | | | |
| xx += !expo_y; | | | |
| } else if (mode == cudaRoundMinInf) { | | | |
| xx += !!expo_y; | | | |
| } | | | |
| cvt.i = expo_y | xx; | | | |
| return cvt.f; | | | |
| } | | | |
| temp = (xx << (32 - expo_x)) | ((temp) ? 1 : 0); | | | |
| xx = xx >> expo_x; | | | |
| if (mode == cudaRoundNearest) { | | | |
| xx = xx + ((temp == 0x80000000) ? (xx & 1) : (temp >> 31)); | | | |
| } else if (mode == cudaRoundPosInf) { | | | |
| xx = xx + (!expo_y && temp); | | | |
| } else if (mode == cudaRoundMinInf) { | | | |
| xx = xx + (expo_y && temp); | | | |
| } | | | |
| xx = expo_y + xx; /* add in sign bit */ | | | |
| cvt.i = xx; | | | |
| return cvt.f; | | | |
| } | | | |
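
The kernel above carries the full 48-bit product in xx:yy and rounds only once at the end, which is precisely what distinguishes a fused multiply-add from a multiply followed by an add. A minimal host-side sketch in plain C99 (using the standard fmaf from math.h; compile with contraction disabled, e.g. -ffp-contract=off, so the unfused line is not itself turned into an FMA) makes the difference observable:

    #include <stdio.h>
    #include <math.h>
    #include <float.h>

    int main(void)
    {
        float a = 1.0f + FLT_EPSILON;   /* 1 + 2^-23 */
        float b = 1.0f - FLT_EPSILON;   /* 1 - 2^-23 */
        float c = -1.0f;
        float unfused = a * b + c;      /* product rounds to 1.0f first, so this is 0 */
        float fused   = fmaf(a, b, c);  /* the exact product survives: -2^-46 */
        printf("unfused = %g, fused = %g\n", unfused, fused);
        return 0;
    }
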
| | | | |
| __device_func__(float __internal_fadd_kernel (float a, float b, | | | |
| enum cudaRoundMode mode)) | | | |
| { | | | |
| volatile union __cudart_FloatUintCvt xx, yy; | | | |
| unsigned int expo_x; | | | |
| unsigned int expo_y; | | | |
| unsigned int temp; | | | |
| | | | |
| xx.f = a; | | | |
| yy.f = b; | | | |
| | | | |
| /* make bigger operand the augend */ | | | |
| expo_y = yy.i << 1; | | | |
| if (expo_y > (xx.i << 1)) { | | | |
| expo_y = xx.i; | | | |
| xx.i = yy.i; | | | |
| yy.i = expo_y; | | | |
| } | | | |
| | | | |
| temp = 0xff; | | | |
| expo_x = temp & (xx.i >> 23); | | | |
| expo_x = expo_x - 1; | | | |
| expo_y = temp & (yy.i >> 23); | | | |
| expo_y = expo_y - 1; | | | |
| | | | |
| if ((expo_x <= 0xFD) && | | | |
| (expo_y <= 0xFD)) { | | | |
| add: | | | |
| expo_y = expo_x - expo_y; | | | |
| if (expo_y > 25) { | | | |
| expo_y = 31; | | | |
| } | | | |
| temp = xx.i ^ yy.i; | | | |
| xx.i = xx.i & ~0x7f000000; | | | |
| xx.i = xx.i | 0x00800000; | | | |
| yy.i = yy.i & ~0xff000000; | | | |
| yy.i = yy.i | 0x00800000; | | | |
| | | | |
| if ((int)temp < 0) { | | | |
| /* signs differ, effective subtraction */ | | | |
| temp = 32 - expo_y; | | | |
| temp = (expo_y) ? (yy.i << temp) : 0; | | | |
| temp = (unsigned)(-((int)temp)); | | | |
| xx.i = xx.i - (yy.i >> expo_y) - (temp ? 1 : 0); | | | |
| if (xx.i & 0x00800000) { | | | |
| if (expo_x <= 0xFD) { | | | |
| xx.i = xx.i + (expo_x << 23); | | | |
| if (mode == cudaRoundNearest) { | | | |
| if (temp < 0x80000000) return xx.f; | | | |
| xx.i += ((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31)); | | | |
| } else if (mode == cudaRoundZero) { | | | |
| } else if (mode == cudaRoundPosInf) { | | | |
| xx.i += (temp && !(xx.i & 0x80000000)); | | | |
| } else if (mode == cudaRoundMinInf) { | | | |
| xx.i += (temp && (xx.i & 0x80000000)); | | | |
| } | | | |
| return xx.f; | | | |
| } | | | |
| } else { | | | |
| if ((temp | (xx.i << 1)) == 0) { | | | |
| /* operands cancelled, resulting in a clean zero */ | | | |
| if (mode == cudaRoundMinInf) { | | | |
| xx.i = 0x80000000; | | | |
| } else { | | | |
| xx.i = 0; | | | |
| } | | | |
| return xx.f; | | | |
| } | | | |
| /* normalize result */ | | | |
| yy.i = xx.i & 0x80000000; | | | |
| do { | | | |
| xx.i = (xx.i << 1) | (temp >> 31); | | | |
| temp <<= 1; | | | |
| expo_x--; | | | |
| } while (!(xx.i & 0x00800000)); | | | |
| xx.i = xx.i | yy.i; | | | |
| } | | | |
| } else { | | | |
| /* signs are the same, effective addition */ | | | |
| temp = 32 - expo_y; | | | |
| temp = (expo_y) ? (yy.i << temp) : 0; | | | |
| xx.i = xx.i + (yy.i >> expo_y); | | | |
| if (!(xx.i & 0x01000000)) { | | | |
| if (expo_x <= 0xFD) { | | | |
| xx.i = xx.i + (expo_x << 23); | | | |
| if (mode == cudaRoundNearest) { | | | |
| if (temp < 0x80000000) return xx.f; | | | |
| xx.i += ((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31)); | | | |
| } else if (mode == cudaRoundZero) { | | | |
| } else if (mode == cudaRoundPosInf) { | | | |
| xx.i += (temp && !(xx.i & 0x80000000)); | | | |
| } else if (mode == cudaRoundMinInf) { | | | |
| xx.i += (temp && (xx.i & 0x80000000)); | | | |
| } | | | |
| return xx.f; | | | |
| } | | | |
| } else { | | | |
| /* normalize result */ | | | |
| temp = (xx.i << 31) | (temp >> 1); | | | |
| xx.i = ((xx.i & 0x80000000) | (xx.i >> 1)) & ~0x40000000; | | | |
| expo_x++; | | | |
| } | | | |
| } | | | |
| if (expo_x <= 0xFD) { | | | |
| xx.i = xx.i + (expo_x << 23); | | | |
| if (mode == cudaRoundNearest) { | | | |
| if (temp < 0x80000000) return xx.f; | | | |
| xx.i += ((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31)); | | | |
| } else if (mode == cudaRoundZero) { | | | |
| } else if (mode == cudaRoundPosInf) { | | | |
| xx.i += (temp && !(xx.i & 0x80000000)); | | | |
| } else if (mode == cudaRoundMinInf) { | | | |
| xx.i += (temp && (xx.i & 0x80000000)); | | | |
| } | | | |
| return xx.f; | | | |
| } | | | |
| if ((int)expo_x >= 254) { | | | |
| /* overflow: return infinity or largest normal */ | | | |
| temp = xx.i & 0x80000000; | | | |
| if (mode == cudaRoundNearest) { | | | |
| xx.i = (temp) | 0x7f800000; | | | |
| } else if (mode == cudaRoundZero) { | | | |
| xx.i = (temp) | 0x7f7fffff; | | | |
| } else if (mode == cudaRoundMinInf) { | | | |
| xx.i = (temp ? 0xFF800000 : 0x7f7fffff); | | | |
| } else if (mode == cudaRoundPosInf) { | | | |
| xx.i = (temp ? 0xff7fffff : 0x7F800000); | | | |
| } | | | |
| return xx.f; | | | |
| } | | | |
| /* underflow: denormal, or smallest normal */ | | | |
| expo_y = expo_x + 32; | | | |
| yy.i = xx.i & 0x80000000; | | | |
| xx.i = xx.i & ~0xff000000; | | | |
| expo_x = (unsigned)(-((int)expo_x)); | | | |
| temp = xx.i << expo_y | ((temp) ? 1 : 0); | | | |
| xx.i = yy.i | (xx.i >> expo_x); | | | |
| if (mode == cudaRoundNearest) { | | | |
| xx.i += (temp == 0x80000000) ? (xx.i & 1) : (temp >> 31); | | | |
| } else if (mode == cudaRoundZero) { | | | |
| } else if (mode == cudaRoundPosInf) { | | | |
| xx.i += (temp && !yy.i); | | | |
| } else if (mode == cudaRoundMinInf) { | | | |
| xx.i += (temp && yy.i); | | | |
| } | | | |
| return xx.f; | | | |
| } else { | | | |
| /* handle special cases separately */ | | | |
| if (!(yy.i << 1)) { | | | |
| if (mode == cudaRoundMinInf) { | | | |
| if (!(xx.i << 1)) { | | | |
| xx.i = xx.i | yy.i; | | | |
| } | | | |
| } else { | | | |
| if (xx.i == 0x80000000) { | | | |
| xx.i = yy.i; | | | |
| } | | | |
| } | | | |
| if ((xx.i << 1) > 0xff000000) { | | | |
| xx.i |= 0x00400000; | | | |
| } | | | |
| return xx.f; | | | |
| } | | | |
| if ((expo_y != 254) && (expo_x != 254)) { | | | |
| /* remove sign bits */ | | | |
| if (expo_x == (unsigned int) -1) { | | | |
| temp = xx.i & 0x80000000; | | | |
| xx.i = xx.i << 8; | | | |
| while (!(xx.i & 0x80000000)) { | | | |
| xx.i <<= 1; | | | |
| expo_x--; | | | |
| } | | | |
| expo_x++; | | | |
| xx.i = (xx.i >> 8) | temp; | | | |
| } | | | |
| if (expo_y == (unsigned int) -1) { | | | |
| temp = yy.i & 0x80000000; | | | |
| yy.i = yy.i << 8; | | | |
| while (!(yy.i & 0x80000000)) { | | | |
| yy.i <<= 1; | | | |
| expo_y--; | | | |
| } | | | |
| expo_y++; | | | |
| yy.i = (yy.i >> 8) | temp; | | | |
| } | | | |
| goto add; | | | |
| } | | | |
| expo_x = xx.i << 1; | | | |
| expo_y = yy.i << 1; | | | |
| /* if x is NaN, return x */ | | | |
| if (expo_x > 0xff000000) { | | | |
| /* cvt any SNaNs to QNaNs */ | | | |
| xx.i = xx.i | 0x00400000; | | | |
| return xx.f; | | | |
| } | | | |
| /* if y is NaN, return y */ | | | |
| if (expo_y > 0xff000000) { | | | |
| /* cvt any SNaNs to QNaNs */ | | | |
| xx.i = yy.i | 0x00400000; | | | |
| return xx.f; | | | |
| } | | | |
| if ((expo_x == 0xff000000) && (expo_y == 0xff000000)) { | | | |
| /* | | | |
| * subtraction of infinities with the same sign, and addition of | | | |
| * infinities of unlike sign is undefined: return NaN INDEFINITE | | | |
| */ | | | |
| expo_x = xx.i ^ yy.i; | | | |
| xx.i = xx.i | ((expo_x) ? 0xffc00000 : 0); | | | |
| return xx.f; | | | |
| } | | | |
| /* handle infinities */ | | | |
| if (expo_y == 0xff000000) { | | | |
| xx.i = yy.i; | | | |
| } | | | |
| return xx.f; | | | |
| } | | | |
| } | | | |
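
For intuition about the cudaRoundPosInf and cudaRoundMinInf branches above, the same directed roundings can be reproduced on the host with C99 fenv.h (a sketch; honouring fesetround at run time typically needs a flag such as GCC's -frounding-math):

    #include <stdio.h>
    #include <fenv.h>

    int main(void)
    {
        volatile float a = 1.0f, b = 1e-10f;  /* b is far below one ulp of a */
        float up, down;
        fesetround(FE_UPWARD);
        up = a + b;                           /* bumps to the next float above 1.0f */
        fesetround(FE_DOWNWARD);
        down = a + b;                         /* stays at exactly 1.0f */
        fesetround(FE_TONEAREST);
        printf("up = %.9g, down = %.9g\n", up, down);
        return 0;
    }
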
| | | | |
| __device_func__(float __frcp_rn (float a)) | | | |
| { | | | |
| return __internal_frcp_kernel (a, cudaRoundNearest); | | | |
| } | | | |
| | | | |
| __device_func__(float __frcp_rz (float a)) | | | |
| { | | | |
| return __internal_frcp_kernel (a, cudaRoundZero); | | | |
| } | | | |
| | | | |
| __device_func__(float __frcp_rd (float a)) | | | |
| { | | | |
| return __internal_frcp_kernel (a, cudaRoundMinInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __frcp_ru (float a)) | | | |
| { | | | |
| return __internal_frcp_kernel (a, cudaRoundPosInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __fsqrt_rn (float a)) | | | |
| { | | | |
| return __internal_fsqrt_kernel (a, cudaRoundNearest); | | | |
| } | | | |
| | | | |
| __device_func__(float __fsqrt_rz (float a)) | | | |
| { | | | |
| return __internal_fsqrt_kernel (a, cudaRoundZero); | | | |
| } | | | |
| | | | |
| __device_func__(float __fsqrt_rd (float a)) | | | |
| { | | | |
| return __internal_fsqrt_kernel (a, cudaRoundMinInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __fsqrt_ru (float a)) | | | |
| { | | | |
| return __internal_fsqrt_kernel (a, cudaRoundPosInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __fdiv_rn (float a, float b)) | | | |
| { | | | |
| return __internal_fdiv_kernel (a, b, cudaRoundNearest); | | | |
| } | | | |
| | | | |
| __device_func__(float __fdiv_rz (float a, float b)) | | | |
| { | | | |
| return __internal_fdiv_kernel (a, b, cudaRoundZero); | | | |
| } | | | |
| | | | |
| __device_func__(float __fdiv_rd (float a, float b)) | | | |
| { | | | |
| return __internal_fdiv_kernel (a, b, cudaRoundMinInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __fdiv_ru (float a, float b)) | | | |
| { | | | |
| return __internal_fdiv_kernel (a, b, cudaRoundPosInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __fadd_rd (float a, float b)) | | | |
| { | | | |
| return __internal_fadd_kernel (a, b, cudaRoundMinInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __fadd_ru (float a, float b)) | | | |
| { | | | |
| return __internal_fadd_kernel (a, b, cudaRoundPosInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __fmul_rd (float a, float b)) | | | |
| { | | | |
| return __internal_fmul_kernel (a, b, cudaRoundMinInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __fmul_ru (float a, float b)) | | | |
| { | | | |
| return __internal_fmul_kernel (a, b, cudaRoundPosInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __fmaf_rn (float a, float b, float c)) | | | |
| { | | | |
| return __internal_fmaf_kernel (a, b, c, cudaRoundNearest); | | | |
| } | | | |
| | | | |
| __device_func__(float __fmaf_rz (float a, float b, float c)) | | | |
| { | | | |
| return __internal_fmaf_kernel (a, b, c, cudaRoundZero); | | | |
| } | | | |
| | | | |
| __device_func__(float __fmaf_ru (float a, float b, float c)) | | | |
| { | | | |
| return __internal_fmaf_kernel (a, b, c, cudaRoundPosInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __fmaf_rd (float a, float b, float c)) | | | |
| { | | | |
| return __internal_fmaf_kernel (a, b, c, cudaRoundMinInf); | | | |
| } | | | |
| | | | |
| __device_func__(int __cuda___isnan(double a)); | | | |
| __device_func__(int __cuda___isnanf(float a)); | | | |
| __device_func__(int __double2int_rz(double)); | | | |
| __device_func__(unsigned int __double2uint_rz(double)); | | | |
| __device_func__(long long int __double2ll_rz(double)); | | | |
| __device_func__(unsigned long long int __double2ull_rz(double)); | | | |
| | | | |
| #define __internal_clamp(val, max, min, nan)                                    \ | | | |
|     if (sizeof(val) == sizeof(double) && __cuda___isnan((double)val)) return nan; \ | | | |
|     if (sizeof(val) == sizeof(float) && __cuda___isnanf((float)val)) return nan;  \ | | | |
|     if (val >= max) return max;                                                  \ | | | |
|     if (val <= min) return min | | | |
| | | | |
| /***************************************************************************** | | | |
| *                                                                           * | | | |
| * HOST IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS      * | | | |
| *                                                                           * | | | |
| *****************************************************************************/ | | | |
| | | | |
| __device_func__(int __mulhi(int a, int b)) | | | |
| { | | | |
| long long int c = (long long int)a * (long long int)b; | | | |
| | | | |
| return (int)(c >> 32); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned int __umulhi(unsigned int a, unsigned int b)) | | | |
| { | | | |
| unsigned long long int c = (unsigned long long int)a * (unsigned long long int)b; | | | |
| | | | |
| return (unsigned int)(c >> 32); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned long long int __umul64hi(unsigned long long int a, | | | |
| unsigned long long int b)) | | | |
| { | | | |
| unsigned int a_lo = (unsigned int)a; | | | |
| unsigned long long int a_hi = a >> 32; | | | |
| unsigned int b_lo = (unsigned int)b; | | | |
| unsigned long long int b_hi = b >> 32; | | | |
| unsigned long long int m1 = a_lo * b_hi; | | | |
| unsigned long long int m2 = a_hi * b_lo; | | | |
| unsigned int carry; | | | |
| | | | |
| carry = (0ULL + __umulhi(a_lo, b_lo) + (unsigned int)m1 + (unsigned int)m2) >> 32; | | | |
| | | | |
| return a_hi * b_hi + (m1 >> 32) + (m2 >> 32) + carry; | | | |
| } | | | |
| | | | |
| __device_func__(long long int __mul64hi(long long int a, long long int b)) | | | |
| { | | | |
| long long int res; | | | |
| res = __umul64hi(a, b); | | | |
| if (a < 0LL) res = res - b; | | | |
| if (b < 0LL) res = res - a; | | | |
| return res; | | | |
| } | | | |
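
The subtle part of __mul64hi/__umul64hi above is the carry: the high half of the low product and the low halves of the two cross products are summed in 64 bits so that their overflow lands in the top word. A host-side cross-check of the same splitting (a sketch; assumes GCC/Clang's unsigned __int128 extension for the reference value):

    #include <stdio.h>
    #include <stdint.h>

    static uint64_t umul64hi_split(uint64_t a, uint64_t b)
    {
        uint32_t a_lo = (uint32_t)a, b_lo = (uint32_t)b;
        uint64_t a_hi = a >> 32, b_hi = b >> 32;
        uint64_t m1 = (uint64_t)a_lo * b_hi;
        uint64_t m2 = a_hi * b_lo;
        uint64_t lo_hi = ((uint64_t)a_lo * b_lo) >> 32;  /* plays the role of __umulhi(a_lo, b_lo) */
        uint64_t carry = (lo_hi + (uint32_t)m1 + (uint32_t)m2) >> 32;
        return a_hi * b_hi + (m1 >> 32) + (m2 >> 32) + carry;
    }

    int main(void)
    {
        uint64_t a = 0xfedcba9876543210ULL, b = 0x0123456789abcdefULL;
        uint64_t ref = (uint64_t)(((unsigned __int128)a * b) >> 64);
        printf("split = %016llx, ref = %016llx\n",
               (unsigned long long)umul64hi_split(a, b), (unsigned long long)ref);
        return 0;
    }
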
| | | | |
| __device_func__(float __saturatef(float a)) | | | |
| { | | | |
| if (__cuda___isnanf(a)) return 0.0f; /* update of PTX spec 10/15/2008 */ | | | |
| return a >= 1.0f ? 1.0f : a <= 0.0f ? 0.0f : a; | | | |
| } | | | |
| | | | |
| __device_func__(unsigned int __sad(int a, int b, unsigned int c)) | | | |
| { | | | |
| long long int diff = (long long int)a - (long long int)b; | | | |
| | | | |
| return (unsigned int)(__cuda_llabs(diff) + (long long int)c); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned int __usad(unsigned int a, unsigned int b, unsigned int c)) | | | |
| { | | | |
| long long int diff = (long long int)a - (long long int)b; | | | |
| | | | |
| return (unsigned int)(__cuda_llabs(diff) + (long long int)c); | | | |
| } | | | |
| | | | |
| __device_func__(int __mul24(int a, int b)) | | | |
| { | | | |
| a &= 0xffffff; | | | |
| a = (a & 0x800000) != 0 ? a | ~0xffffff : a; | | | |
| b &= 0xffffff; | | | |
| b = (b & 0x800000) != 0 ? b | ~0xffffff : b; | | | |
| | | | |
| return a * b; | | | |
| } | | | |
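
The masking above gives __mul24 its 24-bit semantics: the top eight bits of each operand are ignored and bit 23 is treated as a sign bit. A small host illustration (a sketch that simply restates the function so it runs standalone):

    #include <stdio.h>

    static int mul24_emul(int a, int b)   /* same logic as __mul24 above */
    {
        a &= 0xffffff; a = (a & 0x800000) != 0 ? a | ~0xffffff : a;
        b &= 0xffffff; b = (b & 0x800000) != 0 ? b | ~0xffffff : b;
        return a * b;
    }

    int main(void)
    {
        int a = 0x01000003, b = 2;        /* the 0x01 in the top byte of a is dropped */
        printf("mul24 = %d, full 32-bit = %d\n", mul24_emul(a, b), a * b);
        return 0;
    }
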
| | | | |
| __device_func__(unsigned int __umul24(unsigned int a, unsigned int b)) | | | |
| { | | | |
| a &= 0xffffff; | | | |
| b &= 0xffffff; | | | |
| | | | |
| return a * b; | | | |
| } | | | |
| | | | |
| __device_func__(float __int_as_float(int a)) | | | |
| { | | | |
| volatile union __cudart_FloatIntCvt u; | | | |
| | | | |
| u.i = a; | | | |
| return u.f; | | | |
| } | | | |
| | | | |
| __device_func__(int __float_as_int(float a)) | | | |
| { | | | |
| volatile union __cudart_FloatIntCvt u; | | | |
| | | | |
| u.f = a; | | | |
| return u.i; | | | |
| } | | | |
| | | | |
| __device_func__(long long int __internal_float2ll_kernel(float a, long long int max, long long int min, long long int nan, enum cudaRoundMode rndMode)) | | | |
| { | | | |
| unsigned long long int res, t = 0ULL; | | | |
| int shift; | | | |
| unsigned int ia; | | | |
| | | | |
| __internal_clamp(a, max, min, nan); | | | |
| ia = __float_as_int(a); | | | |
| shift = 189 - ((ia >> 23) & 0xff); | | | |
| res = (unsigned long long int)(((ia << 8) | 0x80000000) >> 1) << 32; | | | |
| if (shift >= 64) { | | | |
| t = res; | | | |
| res = 0; | | | |
| } else if (shift) { | | | |
| t = res << (64 - shift); | | | |
| res = res >> shift; | | | |
| } | | | |
| if (rndMode == cudaRoundNearest && (long long int)t < 0LL) { | | | |
| res += t == 0x8000000000000000ULL ? res & 1ULL : 1ULL; | | | |
| } | | | |
| else if (rndMode == cudaRoundMinInf && t != 0ULL && ia > 0x80000000) { | | | |
| res++; | | | |
| } | | | |
| else if (rndMode == cudaRoundPosInf && t != 0ULL && (int)ia > 0) { | | | |
| res++; | | | |
| } | | | |
| if ((int)ia < 0) res = (unsigned long long int)-(long long int)res; | | | |
| return (long long int)res; | | | |
| } | | | |
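
The four rounding modes only disagree when t, the shifted-out fraction, is nonzero; 2.5 is the classic probe value, since round-to-nearest then picks the even neighbour. A host analogue using C99 fenv.h and llrintf, which honours the current rounding mode (a sketch):

    #include <stdio.h>
    #include <fenv.h>
    #include <math.h>

    int main(void)
    {
        const int   modes[] = { FE_TOWARDZERO, FE_TONEAREST, FE_UPWARD, FE_DOWNWARD };
        const char *names[] = { "rz", "rn", "ru", "rd" };
        for (int i = 0; i < 4; i++) {
            fesetround(modes[i]);
            printf("%s:  2.5f -> %lld   -2.5f -> %lld\n",
                   names[i], llrintf(2.5f), llrintf(-2.5f));
        }
        fesetround(FE_TONEAREST);
        return 0;
    }
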
| | | | |
| __device_func__(int __internal_float2int(float a, enum cudaRoundMode rndMode)) | | | |
| { | | | |
| return (int)__internal_float2ll_kernel(a, 2147483647LL, -2147483648LL, 0LL, rndMode); | | | |
| } | | | |
| | | | |
| __device_func__(int __float2int_rz(float a)) | | | |
| { | | | |
| return __internal_float2int(a, cudaRoundZero); | | | |
| } | | | |
| | | | |
| __device_func__(int __float2int_ru(float a)) | | | |
| { | | | |
| return __internal_float2int(a, cudaRoundPosInf); | | | |
| } | | | |
| | | | |
| __device_func__(int __float2int_rd(float a)) | | | |
| { | | | |
| return __internal_float2int(a, cudaRoundMinInf); | | | |
| } | | | |
| | | | |
| __device_func__(int __float2int_rn(float a)) | | | |
| { | | | |
| return __internal_float2int(a, cudaRoundNearest); | | | |
| } | | | |
| | | | |
| __device_func__(long long int __internal_float2ll(float a, enum cudaRoundMode rndMode)) | | | |
| { | | | |
| return __internal_float2ll_kernel(a, 9223372036854775807LL, -9223372036854775807LL -1LL, -9223372036854775807LL -1LL, rndMode); | | | |
| } | | | |
| | | | |
| __device_func__(long long int __float2ll_rz(float a)) | | | |
| { | | | |
| return __internal_float2ll(a, cudaRoundZero); | | | |
| } | | | |
| | | | |
| __device_func__(long long int __float2ll_ru(float a)) | | | |
| { | | | |
| return __internal_float2ll(a, cudaRoundPosInf); | | | |
| } | | | |
| | | | |
| __device_func__(long long int __float2ll_rd(float a)) | | | |
| { | | | |
| return __internal_float2ll(a, cudaRoundMinInf); | | | |
| } | | | |
| | | | |
| __device_func__(long long int __float2ll_rn(float a)) | | | |
| { | | | |
| return __internal_float2ll(a, cudaRoundNearest); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned long long int __internal_float2ull_kernel(float a, unsigned long long int max, unsigned long long int nan, enum cudaRoundMode rndMode)) | | | |
| { | | | |
| unsigned long long int res, t = 0ULL; | | | |
| int shift; | | | |
| unsigned int ia; | | | |
| | | | |
| __internal_clamp(a, max, 0LL, nan); | | | |
| ia = __float_as_int(a); | | | |
| shift = 190 - ((ia >> 23) & 0xff); | | | |
| res = (unsigned long long int)((ia << 8) | 0x80000000) << 32; | | | |
| if (shift >= 64) { | | | |
| t = res >> (int)(shift > 64); | | | |
| res = 0; | | | |
| } else if (shift) { | | | |
| t = res << (64 - shift); | | | |
| res = res >> shift; | | | |
| } | | | |
| if (rndMode == cudaRoundNearest && (long long int)t < 0LL) { | | | |
| res += t == 0x8000000000000000ULL ? res & 1ULL : 1ULL; | | | |
| } | | | |
| else if (rndMode == cudaRoundPosInf && t != 0ULL) { | | | |
| res++; | | | |
| } | | | |
| return res; | | | |
| } | | | |
| | | | |
| __device_func__(unsigned int __internal_float2uint(float a, enum cudaRoundMode rndMode)) | | | |
| { | | | |
| return (unsigned int)__internal_float2ull_kernel(a, 4294967295U, 0U, rndMode); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned int __float2uint_rz(float a)) | | | |
| { | | | |
| return __internal_float2uint(a, cudaRoundZero); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned int __float2uint_ru(float a)) | | | |
| { | | | |
| return __internal_float2uint(a, cudaRoundPosInf); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned int __float2uint_rd(float a)) | | | |
| { | | | |
| return __internal_float2uint(a, cudaRoundMinInf); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned int __float2uint_rn(float a)) | | | |
| { | | | |
| return __internal_float2uint(a, cudaRoundNearest); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned long long int __internal_float2ull(float a, enum cudaRoundMode rndMode)) | | | |
| { | | | |
| return __internal_float2ull_kernel(a, 18446744073709551615ULL, 9223372036854775808ULL, rndMode); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned long long int __float2ull_rz(float a)) | | | |
| { | | | |
| return __internal_float2ull(a, cudaRoundZero); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned long long int __float2ull_ru(float a)) | | | |
| { | | | |
| return __internal_float2ull(a, cudaRoundPosInf); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned long long int __float2ull_rd(float a)) | | | |
| { | | | |
| return __internal_float2ull(a, cudaRoundMinInf); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned long long int __float2ull_rn(float a)) | | | |
| { | | | |
| return __internal_float2ull(a, cudaRoundNearest); | | | |
| } | | | |
| | | | |
| __device_func__(int __internal_normalize64(unsigned long long int *a)) | | | |
| { | | | |
| int lz = 0; | | | |
| | | | |
| if ((*a & 0xffffffff00000000ULL) == 0ULL) { | | | |
| *a <<= 32; | | | |
| lz += 32; | | | |
| } | | | |
| if ((*a & 0xffff000000000000ULL) == 0ULL) { | | | |
| *a <<= 16; | | | |
| lz += 16; | | | |
| } | | | |
| if ((*a & 0xff00000000000000ULL) == 0ULL) { | | | |
| *a <<= 8; | | | |
| lz += 8; | | | |
| } | | | |
| if ((*a & 0xf000000000000000ULL) == 0ULL) { | | | |
| *a <<= 4; | | | |
| lz += 4; | | | |
| } | | | |
| if ((*a & 0xC000000000000000ULL) == 0ULL) { | | | |
| *a <<= 2; | | | |
| lz += 2; | | | |
| } | | | |
| if ((*a & 0x8000000000000000ULL) == 0ULL) { | | | |
| *a <<= 1; | | | |
| lz += 1; | | | |
| } | | | |
| return lz; | | | |
| } | | | |
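
__internal_normalize64 is a binary search: each test halves the window that can still contain the leading one bit and accumulates the shift in lz. The same idea in miniature, for a single byte (a self-contained sketch):

    #include <stdio.h>

    static int clz8(unsigned char x)   /* leading zeros in 8 bits, same halving scheme */
    {
        unsigned int v = x;
        int lz = 0;
        if (!(v & 0xf0)) { v <<= 4; lz += 4; }
        if (!(v & 0xc0)) { v <<= 2; lz += 2; }
        if (!(v & 0x80)) {          lz += 1; }
        return x ? lz : 8;
    }

    int main(void)
    {
        printf("clz8(0x01) = %d, clz8(0x80) = %d, clz8(0x00) = %d\n",
               clz8(0x01), clz8(0x80), clz8(0x00));
        return 0;
    }
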
| | | | |
| __device_func__(int __internal_normalize(unsigned int *a)) | | | |
| { | | | |
| unsigned long long int t = (unsigned long long int)*a; | | | |
| int lz = __internal_normalize64(&t); | | | |
| | | | |
| *a = (unsigned int)(t >> 32); | | | |
| | | | |
| return lz - 32; | | | |
| } | | | |
| | | | |
| __device_func__(float __internal_int2float_kernel(int a, enum cudaRoundMode rndMode)) | | | |
| { | | | |
| volatile union __cudart_FloatUintCvt res; | | | |
| int shift; | | | |
| unsigned int t; | | | |
| res.i = a; | | | |
| if (a == 0) return res.f; | | | |
| if (a < 0) res.i = (unsigned int)-a; | | | |
| shift = __internal_normalize((unsigned int*)&res.i); | | | |
| t = res.i << 24; | | | |
| res.i = (res.i >> 8); | | | |
| res.i += (127 + 30 - shift) << 23; | | | |
| if (a < 0) res.i |= 0x80000000; | | | |
| if ((rndMode == cudaRoundNearest) && (t >= 0x80000000)) { | | | |
| res.i += (t == 0x80000000) ? (res.i & 1) : (t >> 31); | | | |
| } | | | |
| else if ((rndMode == cudaRoundMinInf) && t && (a < 0)) { | | | |
| res.i++; | | | |
| } | | | |
| else if ((rndMode == cudaRoundPosInf) && t && (a > 0)) { | | | |
| res.i++; | | | |
| } | | | |
| return res.f; | | | |
| } | | | |
| | | | |
| __device_func__(float __int2float_rz(int a)) | | | |
| { | | | |
| return __internal_int2float_kernel(a, cudaRoundZero); | | | |
| } | | | |
| | | | |
| __device_func__(float __int2float_ru(int a)) | | | |
| { | | | |
| return __internal_int2float_kernel(a, cudaRoundPosInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __int2float_rd(int a)) | | | |
| { | | | |
| return __internal_int2float_kernel(a, cudaRoundMinInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __int2float_rn(int a)) | | | |
| { | | | |
| return __internal_int2float_kernel(a, cudaRoundNearest); | | | |
| } | | | |
| | | | |
| __device_func__(float __internal_uint2float_kernel(unsigned int a, enum cudaRoundMode rndMode)) | | | |
| { | | | |
| volatile union __cudart_FloatUintCvt res; | | | |
| int shift; | | | |
| unsigned int t; | | | |
| res.i = a; | | | |
| if (a == 0) return res.f; | | | |
| shift = __internal_normalize((unsigned int*)&res.i); | | | |
| t = res.i << 24; | | | |
| res.i = (res.i >> 8); | | | |
| res.i += (127 + 30 - shift) << 23; | | | |
| if (rndMode == cudaRoundNearest) { | | | |
| res.i += (t == 0x80000000) ? (res.i & 1) : (t >> 31); | | | |
| } | | | |
| else if ((rndMode == cudaRoundPosInf) && t) { | | | |
| res.i++; | | | |
| } | | | |
| return res.f; | | | |
| } | | | |
| | | | |
| __device_func__(float __uint2float_rz(unsigned int a)) | | | |
| { | | | |
| return __internal_uint2float_kernel(a, cudaRoundZero); | | | |
| } | | | |
| | | | |
| __device_func__(float __uint2float_ru(unsigned int a)) | | | |
| { | | | |
| return __internal_uint2float_kernel(a, cudaRoundPosInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __uint2float_rd(unsigned int a)) | | | |
| { | | | |
| return __internal_uint2float_kernel(a, cudaRoundMinInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __uint2float_rn(unsigned int a)) | | | |
| { | | | |
| return __internal_uint2float_kernel(a, cudaRoundNearest); | | | |
| } | | | |
| | | | |
| __device_func__(float __internal_ull2float_kernel(unsigned long long int a, enum cudaRoundMode rndMode)) | | | |
| { | | | |
| unsigned long long int temp; | | | |
| unsigned int res, t; | | | |
| int shift; | | | |
| if (a == 0ULL) return 0.0f; | | | |
| temp = a; | | | |
| shift = __internal_normalize64(&temp); | | | |
| temp = (temp >> 8) | ((temp & 0xffULL) ? 1ULL : 0ULL); | | | |
| res = (unsigned int)(temp >> 32); | | | |
| t = (unsigned int)temp; | | | |
| res += (127 + 62 - shift) << 23; /* add in exponent */ | | | |
| if (rndMode == cudaRoundNearest) { | | | |
| res += (t == 0x80000000) ? (res & 1) : (t >> 31); | | | |
| } else if (rndMode == cudaRoundPosInf) { | | | |
| res += (t != 0); | | | |
| } | | | |
| return __int_as_float(res); | | | |
| } | | | |
| | | | |
| __device_func__(float __internal_ll2float_kernel(long long int a, enum cudaRoundMode rndMode)) | | | |
| { | | | |
| unsigned long long int temp; | | | |
| volatile float res = 0.0f; | | | |
| | | | |
| if (a < 0LL) { | | | |
| temp = (~((unsigned long long int)a)) + 1ULL; | | | |
| if (rndMode == cudaRoundPosInf) { | | | |
| rndMode = cudaRoundMinInf; | | | |
| } else if (rndMode == cudaRoundMinInf) { | | | |
| rndMode = cudaRoundPosInf; | | | |
| } | | | |
| } else { | | | |
| temp = (unsigned long long int)a; | | | |
| } | | | |
| res = __internal_ull2float_kernel (temp, rndMode); | | | |
| if (a < 0LL) { | | | |
| res = -res; | | | |
| } | | | |
| return res; | | | |
| } | | | |
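
The rounding-mode swap is the whole trick here: the conversion runs on |a|, and rounding |a| toward minus infinity and then negating yields the same float as rounding a toward plus infinity (and vice versa). Worked by hand: a = -(2^60 + 1) is not exactly representable, so round-up must choose the neighbour closer to zero, -2^60; converting 2^60 + 1 with cudaRoundMinInf gives 2^60, and the final negation produces exactly that -2^60.
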
| | | | |
| __device_func__(float __ll2float_rn(long long int a)) | | | |
| { | | | |
| return __internal_ll2float_kernel(a, cudaRoundNearest); | | | |
| } | | | |
| | | | |
| __device_func__(float __ll2float_rz(long long int a)) | | | |
| { | | | |
| return __internal_ll2float_kernel(a, cudaRoundZero); | | | |
| } | | | |
| | | | |
| __device_func__(float __ll2float_ru(long long int a)) | | | |
| { | | | |
| return __internal_ll2float_kernel(a, cudaRoundPosInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __ll2float_rd(long long int a)) | | | |
| { | | | |
| return __internal_ll2float_kernel(a, cudaRoundMinInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __ull2float_rn(unsigned long long int a)) | | | |
| { | | | |
| return __internal_ull2float_kernel(a, cudaRoundNearest); | | | |
| } | | | |
| | | | |
| __device_func__(float __ull2float_rz(unsigned long long int a)) | | | |
| { | | | |
| return __internal_ull2float_kernel(a, cudaRoundZero); | | | |
| } | | | |
| | | | |
| __device_func__(float __ull2float_ru(unsigned long long int a)) | | | |
| { | | | |
| return __internal_ull2float_kernel(a, cudaRoundPosInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __ull2float_rd(unsigned long long int a)) | | | |
| { | | | |
| return __internal_ull2float_kernel(a, cudaRoundMinInf); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned short __float2half_rn(float f)) | | | |
| { | | | |
| unsigned int x = __float_as_int (f); | | | |
| unsigned int u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1; | | | |
| unsigned int sign, exponent, mantissa; | | | |
| | | | |
| /* Get rid of +NaN/-NaN case first. */ | | | |
| if (u > 0x7f800000) { | | | |
| return 0x7fff; | | | |
| } | | | |
| | | | |
| sign = ((x >> 16) & 0x8000); | | | |
| | | | |
| /* Get rid of +Inf/-Inf, +0/-0. */ | | | |
| if (u > 0x477fefff) { | | | |
| return sign | 0x7c00; | | | |
| } | | | |
| if (u < 0x33000001) { | | | |
| return sign | 0x0000; | | | |
| } | | | |
| | | | |
| exponent = ((u >> 23) & 0xff); | | | |
| mantissa = (u & 0x7fffff); | | | |
| | | | |
| if (exponent > 0x70) { | | | |
| shift = 13; | | | |
| exponent -= 0x70; | | | |
| } else { | | | |
| shift = 0x7e - exponent; | | | |
| exponent = 0; | | | |
| mantissa |= 0x800000; | | | |
| } | | | |
| lsb = (1 << shift); | | | |
| lsb_s1 = (lsb >> 1); | | | |
| lsb_m1 = (lsb - 1); | | | |
| | | | |
| /* Round to nearest even. */ | | | |
| remainder = (mantissa & lsb_m1); | | | |
| mantissa >>= shift; | | | |
| if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) { | | | |
| ++mantissa; | | | |
| if (!(mantissa & 0x3ff)) { | | | |
| ++exponent; | | | |
| mantissa = 0; | | | |
| } | | | |
| } | | | |
| | | | |
| return sign | (exponent << 10) | mantissa; | | | |
| } | | | |
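
Tracing the packing for a simple input shows the half-precision field layout (worked by hand, not produced by running the code):

    f = 1.5f gives x = 0x3FC00000: sign 0, biased exponent 0x7F, mantissa 0x400000.
    Since the exponent 0x7F exceeds 0x70, shift = 13 and the exponent becomes 0x0F.
    remainder = mantissa & 0x1fff = 0, and mantissa >>= 13 leaves 0x200.
    Result: (0x0F << 10) | 0x200 = 0x3E00, the IEEE half-precision encoding of 1.5.
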
| | | | |
| __device_func__(float __half2float(unsigned short h)) | | | |
| { | | | |
| unsigned int sign = ((h >> 15) & 1); | | | |
| unsigned int exponent = ((h >> 10) & 0x1f); | | | |
| unsigned int mantissa = ((h & 0x3ff) << 13); | | | |
| | | | |
| if (exponent == 0x1f) { /* NaN or Inf */ | | | |
| mantissa = (mantissa | | | |
| ? (sign = 0, 0x7fffff) | | | |
| : 0); | | | |
| exponent = 0xff; | | | |
| } else if (!exponent) { /* Denorm or Zero */ | | | |
| if (mantissa) { | | | |
| unsigned int msb; | | | |
| exponent = 0x71; | | | |
| do { | | | |
| msb = (mantissa & 0x400000); | | | |
| mantissa <<= 1; /* normalize */ | | | |
| --exponent; | | | |
| } while (!msb); | | | |
| mantissa &= 0x7fffff; /* 1.mantissa is implicit */ | | | |
| } | | | |
| } else { | | | |
| exponent += 0x70; | | | |
| } | | | |
| | | | |
| return __int_as_float ((sign << 31) | (exponent << 23) | mantissa); | | | |
| } | | | |
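
Decoding the same value recovers the original float bit pattern (again worked by hand): h = 0x3E00 splits into sign 0, exponent 0x0F, mantissa 0x200 << 13 = 0x400000; this is the normal case, so the exponent is rebiased to 0x0F + 0x70 = 0x7F, and (0x7F << 23) | 0x400000 = 0x3FC00000, which is 1.5f.
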
| | | | |
| __device_func__(float __fadd_rz(float a, float b)) | | | |
| { | | | |
| return __internal_fadd_kernel(a, b, cudaRoundZero); | | | |
| } | | | |
| | | | |
| __device_func__(float __fmul_rz(float a, float b)) | | | |
| { | | | |
| return __internal_fmul_kernel(a, b, cudaRoundZero); | | | |
| } | | | |
| | | | |
| __device_func__(float __fadd_rn(float a, float b)) | | | |
| { | | | |
| return __internal_fadd_kernel(a, b, cudaRoundNearest); | | | |
| } | | | |
| | | | |
| __device_func__(float __fmul_rn(float a, float b)) | | | |
| { | | | |
| return __internal_fmul_kernel(a, b, cudaRoundNearest); | | | |
| } | | | |
| | | | |
| __device_func__(void __brkpt(int c)) | | | |
| { | | | |
| /* TODO */ | | | |
| } | | | |
| | | | |
| #if defined(__cplusplus) | | | |
| extern "C" { | | | |
| #endif /* __cplusplus */ | | | |
| | | | |
| extern int CUDARTAPI __cudaSynchronizeThreads(void**, void*); | | | |
| | | | |
| #if defined(__cplusplus) | | | |
| } | | | |
| #endif /* __cplusplus */ | | | |
| | | | |
| #if defined(__GNUC__) | | | |
| | | | |
| __device_func__(inline __attribute__((always_inline)) void __syncthreads(void)) | | | |
| { | | | |
| volatile int _ = 0; | | | |
| L: if (__cudaSynchronizeThreads((void**)&&L, (void*)&_)) goto L; | | | |
| } | | | |
| | | | |
| #elif defined(_WIN32) | | | |
| | | | |
| #define __syncthreads() \ | | | |
| (void)__cudaSynchronizeThreads((void**)0, (void*)0) | | | |
| | | | |
| #endif /* __GNUC__ */ | | | |
| | | | |
| __device_func__(void __prof_trigger(int a)) | | | |
| { | | | |
| } | | | |
| | | | |
| __device_func__(void __threadfence(void)) | | | |
| { | | | |
| __syncthreads(); | | | |
| } | | | |
| | | | |
| __device_func__(void __threadfence_block(void)) | | | |
| { | | | |
| __syncthreads(); | | | |
| } | | | |
| | | | |
| #if defined(__GNUC__) | | | |
| | | | |
| __device_func__(void __trap(void)) | | | |
| { | | | |
| __builtin_trap(); | | | |
| } | | | |
| | | | |
| #elif defined(_WIN32) | | | |
| | | | |
| __device_func__(void __trap(void)) | | | |
| { | | | |
| __debugbreak(); | | | |
| } | | | |
| | | | |
| #endif /* __GNUC__ */ | | | |
| | | | |
| #endif /* __CUDABE__ */ | | | |
| | | | |
| /***************************************************************************** | | | |
| *                                                                           * | | | |
| * DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS    * | | | |
| *                                                                           * | | | |
| *****************************************************************************/ | | | |
| #if !defined(__CUDABE__) | | | |
| __device_func__(float __fdividef(float a, float b)) | | | |
| { | | | |
| volatile float aa = a; | | | |
| volatile float bb = b; | | | |
| /* match range restrictions of the device function */ | | | |
| if (__cuda_fabsf(bb) > CUDART_TWO_TO_126_F) { | | | |
| if (__cuda_fabsf(aa) <= CUDART_NORM_HUGE_F) { | | | |
| return ((aa / bb) / CUDART_NORM_HUGE_F) / CUDART_NORM_HUGE_F; | | | |
| } else { | | | |
| bb = 1.0f / bb; | | | |
| bb = bb / CUDART_NORM_HUGE_F; | | | |
| return aa * bb; | | | |
| } | | | |
| } else { | | | |
| return aa / bb; | | | |
| } | | | |
| } | | | |
| #endif /* !defined(__CUDABE__) */ | | | |
| | | | |
| __device_func__(float __sinf(float a)) | | | |
| { | | | |
| #if !defined(__CUDABE__) | | | |
| if ((__float_as_int(a) << 1) == 0xff000000) { | | | |
| return __fadd_rn (a, -a); /* return NaN */ | | | |
| } | | | |
| #endif /* !defined(__CUDABE__) */ | | | |
| return sinf(a); | | | |
| } | | | |
| | | | |
| __device_func__(float __cosf(float a)) | | | |
| { | | | |
| #if !defined(__CUDABE__) | | | |
| if ((__float_as_int(a) << 1) == 0xff000000) { | | | |
| return __fadd_rn (a, -a); /* return NaN */ | | | |
| } | | | |
| #endif /* !defined(__CUDABE__) */ | | | |
| return cosf(a); | | | |
| } | | | |
| | | | |
| __device_func__(float __log2f(float a)) | | | |
| { | | | |
| return log2f(a); | | | |
| } | | | |
| | | | |
| /***************************************************************************** | | | |
| *                                                                           * | | | |
| * SHARED HOST AND DEVICE IMPLEMENTATIONS                                    * | | | |
| *                                                                           * | | | |
| *****************************************************************************/ | | | |
| __device_func__(float __tanf(float a)) | | | |
| { | | | |
| return __fdividef (__sinf(a), __cosf(a)); | | | |
| } | | | |
| | | | |
| __device_func__(void __sincosf(float a, float *sptr, float *cptr)) | | | |
| { | | | |
| *sptr = __sinf(a); | | | |
| *cptr = __cosf(a); | | | |
| } | | | |
| | | | |
| __device_func__(float __expf(float a)) | | | |
| { | | | |
| return __cuda_exp2f(a * CUDART_L2E_F); | | | |
| } | | | |
| | | | |
| __device_func__(float __exp10f(float a)) | | | |
| { | | | |
| return __cuda_exp2f(a * CUDART_L2T_F); | | | |
| } | | | |
| | | | |
| __device_func__(float __log10f(float a)) | | | |
| { | | | |
| return CUDART_LG2_F * __log2f(a); | | | |
| } | | | |
| | | | |
| __device_func__(float __logf(float a)) | | | |
| { | | | |
| return CUDART_LN2_F * __log2f(a); | | | |
| } | | | |
| | | | |
| __device_func__(float __powf(float a, float b)) | | | |
| { | | | |
| return __cuda_exp2f(b * __log2f(a)); | | | |
| } | | | |
| | | | |
| __device_func__(float fdividef(float a, float b)) | | | |
| { | | | |
| #if defined(__USE_FAST_MATH__) && !defined(__CUDA_PREC_DIV) | | | |
| return __fdividef(a, b); | | | |
| #else /* __USE_FAST_MATH__ && !__CUDA_PREC_DIV */ | | | |
| return a / b; | | | |
| #endif /* __USE_FAST_MATH__ && !__CUDA_PREC_DIV */ | | | |
| } | | | |
| | | | |
| #if !defined(__CUDABE__) || (__CUDA_ARCH__ < 200) | | | |
| | | | |
| __device_func__(int __clz(int a)) | | static __forceinline__ int __clz(int a) | |
| { | | { | |
| return (a)?(158-(__float_as_int(__uint2float_rz((unsigned int)a))>>23)):32; | | return (a)?(158-(__float_as_int(__uint2float_rz((unsigned int)a))>>23)):32; | |
| } | | } | |
| | | | |
| __device_func__(int __clzll(long long int a)) | | static __forceinline__ int __clzll(long long int a) | |
| { | | { | |
| int ahi = ((int)((unsigned long long)a >> 32)); | | int ahi = ((int)((unsigned long long)a >> 32)); | |
| int alo = ((int)((unsigned long long)a & 0xffffffffULL)); | | int alo = ((int)((unsigned long long)a & 0xffffffffULL)); | |
| int res; | | int res; | |
| if (ahi) { | | if (ahi) { | |
| res = 0; | | res = 0; | |
| } else { | | } else { | |
| res = 32; | | res = 32; | |
| ahi = alo; | | ahi = alo; | |
| } | | } | |
| res = res + __clz(ahi); | | res = res + __clz(ahi); | |
| return res; | | return res; | |
| } | | } | |
| | | | |
| __device_func__(int __popc(unsigned int a)) | | static __forceinline__ int __popc(unsigned int a) | |
| { | | { | |
| a = a - ((a >> 1) & 0x55555555); | | a = a - ((a >> 1) & 0x55555555); | |
| a = (a & 0x33333333) + ((a >> 2) & 0x33333333); | | a = (a & 0x33333333) + ((a >> 2) & 0x33333333); | |
| a = (a + (a >> 4)) & 0x0f0f0f0f; | | a = (a + (a >> 4)) & 0x0f0f0f0f; | |
| a = ((__umul24(a, 0x808080) << 1) + a) >> 24; | | a = ((__umul24(a, 0x808080) << 1) + a) >> 24; | |
| return a; | | return a; | |
| } | | } | |
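
The first three lines are the classic SWAR reduction: 2-bit, then 4-bit, then 8-bit field sums; the __umul24 at the end merely adds the four byte counts into the top byte. A host-side check against a naive loop (a sketch; an ordinary 32-bit multiply by 0x01010101 stands in for __umul24):

    #include <stdio.h>

    static int popc_swar(unsigned int a)
    {
        a = a - ((a >> 1) & 0x55555555);
        a = (a & 0x33333333) + ((a >> 2) & 0x33333333);
        a = (a + (a >> 4)) & 0x0f0f0f0f;
        return (int)((a * 0x01010101u) >> 24);   /* sums the four byte counts */
    }

    int main(void)
    {
        unsigned int x = 0xdeadbeefu;
        int naive = 0;
        for (unsigned int t = x; t; t >>= 1) naive += (int)(t & 1u);
        printf("swar = %d, naive = %d\n", popc_swar(x), naive);
        return 0;
    }
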
| | | | |
| __device_func__(int __popcll(unsigned long long int a)) | | static __forceinline__ int __popcll(unsigned long long int a) | |
| { | | { | |
| unsigned int ahi = ((unsigned int)(a >> 32)); | | unsigned int ahi = ((unsigned int)(a >> 32)); | |
| unsigned int alo = ((unsigned int)(a & 0xffffffffULL)); | | unsigned int alo = ((unsigned int)(a & 0xffffffffULL)); | |
| alo = alo - ((alo >> 1) & 0x55555555); | | alo = alo - ((alo >> 1) & 0x55555555); | |
| alo = (alo & 0x33333333) + ((alo >> 2) & 0x33333333); | | alo = (alo & 0x33333333) + ((alo >> 2) & 0x33333333); | |
| ahi = ahi - ((ahi >> 1) & 0x55555555); | | ahi = ahi - ((ahi >> 1) & 0x55555555); | |
| ahi = (ahi & 0x33333333) + ((ahi >> 2) & 0x33333333); | | ahi = (ahi & 0x33333333) + ((ahi >> 2) & 0x33333333); | |
| alo = alo + ahi; | | alo = alo + ahi; | |
| alo = (alo & 0x0f0f0f0f) + ((alo >> 4) & 0x0f0f0f0f); | | alo = (alo & 0x0f0f0f0f) + ((alo >> 4) & 0x0f0f0f0f); | |
| alo = ((__umul24(alo, 0x808080) << 1) + alo) >> 24; | | alo = ((__umul24(alo, 0x808080) << 1) + alo) >> 24; | |
| return alo; | | return alo; | |
| } | | } | |
| | | | |
| __device_func__(unsigned int __brev(unsigned int a)) | | static __forceinline__ unsigned int __brev(unsigned int a) | |
| { | | { | |
| /* Use Knuth's algorithm from http://www.hackersdelight.org/revisions.pdf */ | | /* Use Knuth's algorithm from http://www.hackersdelight.org/revisions.pdf */ | |
| unsigned int t; | | unsigned int t; | |
| a = (a << 15) | (a >> 17); | | a = (a << 15) | (a >> 17); | |
| t = (a ^ (a >> 10)) & 0x003f801f; | | t = (a ^ (a >> 10)) & 0x003f801f; | |
| a = (t + (t << 10)) ^ a; | | a = (t + (t << 10)) ^ a; | |
| t = (a ^ (a >> 4)) & 0x0e038421; | | t = (a ^ (a >> 4)) & 0x0e038421; | |
| a = (t + (t << 4)) ^ a; | | a = (t + (t << 4)) ^ a; | |
| t = (a ^ (a >> 2)) & 0x22488842; | | t = (a ^ (a >> 2)) & 0x22488842; | |
| a = (t + (t << 2)) ^ a; | | a = (t + (t << 2)) ^ a; | |
| return a; | | return a; | |
| } | | } | |
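
A quick host-side check of the Knuth reversal against a naive bit loop (a sketch reusing the masks above verbatim):

    #include <stdio.h>

    static unsigned int brev_knuth(unsigned int a)
    {
        unsigned int t;
        a = (a << 15) | (a >> 17);
        t = (a ^ (a >> 10)) & 0x003f801f;  a = (t + (t << 10)) ^ a;
        t = (a ^ (a >>  4)) & 0x0e038421;  a = (t + (t <<  4)) ^ a;
        t = (a ^ (a >>  2)) & 0x22488842;  a = (t + (t <<  2)) ^ a;
        return a;
    }

    int main(void)
    {
        unsigned int x = 0x12345678u, naive = 0;
        for (int i = 0; i < 32; i++) naive |= ((x >> i) & 1u) << (31 - i);
        printf("knuth = %08x, naive = %08x\n", brev_knuth(x), naive);
        return 0;
    }
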
| | | | |
| __device_func__(unsigned long long int __brevll(unsigned long long int a)) | | static __forceinline__ unsigned long long int __brevll(unsigned long long int a) | |
| { | | { | |
| unsigned int hi = (unsigned int)(a >> 32); | | unsigned int hi = (unsigned int)(a >> 32); | |
| unsigned int lo = (unsigned int)(a & 0xffffffffULL); | | unsigned int lo = (unsigned int)(a & 0xffffffffULL); | |
| unsigned int t; | | unsigned int t; | |
| t = __brev(lo); | | t = __brev(lo); | |
| lo = __brev(hi); | | lo = __brev(hi); | |
| return ((unsigned long long int)t << 32) + (unsigned long long int)lo; | | return ((unsigned long long int)t << 32) + (unsigned long long int)lo; | |
| } | | } | |
| | | | |
| #endif /* __CUDABE__ || __CUDA_ARCH__ < 200 */ | | | |
| | | | |
| __device_func__(int __ffs(int a)) | | | |
| { | | | |
| return 32 - __clz (a & -a); | | | |
| } | | | |
| | | | |
| __device_func__(int __ffsll(long long int a)) | | | |
| { | | | |
| return 64 - __clzll (a & -a); | | | |
| } | | | |
| | | | |
| #if defined(CUDA_DOUBLE_MATH_FUNCTIONS) && defined(CUDA_FLOAT_MATH_FUNCTIONS) | | | |
| | | | |
| #error -- conflicting mode for double math routines | | | |
| | | | |
| #endif /* CUDA_DOUBLE_MATH_FUNCTIONS && CUDA_FLOAT_MATH_FUNCTIONS */ | | | |
| | | | |
| #if defined(CUDA_FLOAT_MATH_FUNCTIONS) | | | |
| | | | |
| __device_func__(double fdivide(double a, double b)) | | | |
| { | | | |
| return (double)fdividef((float)a, (float)b); | | | |
| } | | | |
| | | | |
| #if !defined(__CUDABE__) | | | |
| | | | |
| __device_func__(int __double2int_rz(double a)) | | | |
| { | | | |
| return __float2int_rz((float)a); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned int __double2uint_rz(double a)) | | | |
| { | | | |
| return __float2uint_rz((float)a); | | | |
| } | | | |
| | | | |
| __device_func__(long long int __double2ll_rz(double a)) | | | |
| { | | | |
| return __float2ll_rz((float)a); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned long long int __double2ull_rz(double a)) | | | |
| { | | | |
| return __float2ull_rz((float)a); | | | |
| } | | | |
| | | | |
| #endif /* !__CUDABE__ */ | | | |
| | | | |
| #endif /* CUDA_FLOAT_MATH_FUNCTIONS */ | | | |
| | | | |
| #if defined(CUDA_DOUBLE_MATH_FUNCTIONS) | | | |
| | | | |
| __device_func__(double fdivide(double a, double b)) | | | |
| { | | | |
| return a / b; | | | |
| } | | | |
| | | | |
| #if !defined(__CUDABE__) | | | |
| | | | |
| __device_func__(int __internal_double2int(double a, enum cudaRoundMode rndMode)); | | | |
| __device_func__(unsigned int __internal_double2uint(double a, enum cudaRoundMode rndMode)); | | | |
| __device_func__(long long int __internal_double2ll(double a, enum cudaRoundMode rndMode)); | | | |
| __device_func__(unsigned long long int __internal_double2ull(double a, enum cudaRoundMode rndMode)); | | | |
| | | | |
| __device_func__(int __double2int_rz(double a)) | | | |
| { | | | |
| return __internal_double2int(a, cudaRoundZero); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned int __double2uint_rz(double a)) | | | |
| { | | | |
| return __internal_double2uint(a, cudaRoundZero); | | | |
| } | | | |
| | | | |
| __device_func__(long long int __double2ll_rz(double a)) | | | |
| { | | | |
| return __internal_double2ll(a, cudaRoundZero); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned long long int __double2ull_rz(double a)) | | | |
| { | | | |
| return __internal_double2ull(a, cudaRoundZero); | | | |
| } | | | |
| | | | |
| #endif /* !__CUDABE__ */ | | | |
| | | | |
| #endif /* CUDA_DOUBLE_MATH_FUNCTIONS */ | | | |
| | | | |
| | | static __forceinline__ unsigned int __byte_perm(unsigned int a, unsigned int b, unsigned int slct) | |
| | | { | |
| | | unsigned int i0 = (slct >> 0) & 0x7; | |
| | | unsigned int i1 = (slct >> 4) & 0x7; | |
| | | unsigned int i2 = (slct >> 8) & 0x7; | |
| | | unsigned int i3 = (slct >> 12) & 0x7; | |
| | | return (((((i0 < 4) ? (a >> (i0*8)) : (b >> ((i0-4)*8))) & 0xff) << 0) + | |
| | | ((((i1 < 4) ? (a >> (i1*8)) : (b >> ((i1-4)*8))) & 0xff) << 8) + | |
| | | ((((i2 < 4) ? (a >> (i2*8)) : (b >> ((i2-4)*8))) & 0xff) << 16) + | |
| | | ((((i3 < 4) ? (a >> (i3*8)) : (b >> ((i3-4)*8))) & 0xff) << 24)); | |
| | | } | |
| | | | |
| | | #endif /* __CUDA_ARCH__ < 200 */ | |
| | | | |
| | | static __forceinline__ int __ffs(int a) | |
| | | { | |
| | | return 32 - __clz(a & -a); | |
| | | } | |
| | | | |
| | | static __forceinline__ int __ffsll(long long int a) | |
| | | { | |
| | | return 64 - __clzll(a & -a); | |
| | | } | |
| | | | |
| #endif /* __cplusplus && __CUDACC__ */ | | #endif /* __cplusplus && __CUDACC__ */ | |
| | | | |
| /***************************************************************************** | | /***************************************************************************** | |
| *                                                                           * | | *                                                                           * | |
| *                                                                           * | | *                                                                           * | |
| *                                                                           * | | *                                                                           * | |
| *****************************************************************************/ | | *****************************************************************************/ | |
| | | | |
| #include "sm_11_atomic_functions.h" | | #include "sm_11_atomic_functions.h" | |
| #include "sm_12_atomic_functions.h" | | #include "sm_12_atomic_functions.h" | |
| #include "sm_13_double_functions.h" | | #include "sm_13_double_functions.h" | |
| #include "sm_20_atomic_functions.h" | | #include "sm_20_atomic_functions.h" | |
| #include "sm_20_intrinsics.h" | | #include "sm_20_intrinsics.h" | |
| | | #include "surface_functions.h" | |
| #include "texture_fetch_functions.h" | | #include "texture_fetch_functions.h" | |
| | | | |
| #endif /* !__DEVICE_FUNCTIONS_H__ */ | | #endif /* !__DEVICE_FUNCTIONS_H__ */ | |
| | | | |
End of changes. 49 change blocks. 2271 lines changed or deleted, 163 lines changed or added.

| math_functions_dbl_ptx3.h | | math_functions_dbl_ptx3.h | |
| | | | |
| skipping to change at line 41 | | skipping to change at line 41 | |
| * Any use of this source code in individual and commercial software must | | * Any use of this source code in individual and commercial software must | |
| * include, in the user documentation and internal comments to the code, | | * include, in the user documentation and internal comments to the code, | |
| * the above Disclaimer and U.S. Government End Users Notice. | | * the above Disclaimer and U.S. Government End Users Notice. | |
| */ | | */ | |
| | | | |
| #if !defined(__MATH_FUNCTIONS_DBL_PTX3_H__) | | #if !defined(__MATH_FUNCTIONS_DBL_PTX3_H__) | |
| #define __MATH_FUNCTIONS_DBL_PTX3_H__ | | #define __MATH_FUNCTIONS_DBL_PTX3_H__ | |
| | | | |
| /* True double precision implementations, since native double support */ | | /* True double precision implementations, since native double support */ | |
| | | | |
| #if defined(__cplusplus) && defined(__CUDACC__) | | #if defined(__CUDABE__) | |
| | | | |
| #elif !defined(__CUDACC__) | | | |
| | | | |
| #include "crt/func_macro.h" | | | |
| | | | |
| #define INT_MAX \ | | | |
| ((int)((unsigned int)-1 >> 1)) | | | |
| | | | |
| #include "device_functions.h" | | | |
| #include "math_constants.h" | | | |
| #if !defined(__CUDABE__) | | | |
| #include "common_types.h" | | | |
| #endif | | | |
| /***************************************************************************** | | /***************************************************************************** | |
| *                                                                           * | | *                                                                           * | |
| * DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPREATIONS    * | | * DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS    * | |
| *                                                                           * | | *                                                                           * | |
| *****************************************************************************/ | | *****************************************************************************/ | |
****/ | |
| __device_func__(double __cuda_fabs(double a)) | | | |
| { | | | |
| return fabs(a); | | | |
| } | | | |
| | | | |
| __device_func__(double __cuda_fmax(double a, double b)) | | | |
| { | | | |
| #if !defined(__CUDABE__) | | | |
| volatile union __cudart_DoubleUlonglongCvt cvta, cvtb; | | | |
| int nana, nanb; | | | |
| | | | |
| cvta.d = a; | | | |
| cvtb.d = b; | | | |
| nana = ((cvta.i << 1) > 0xffe0000000000000ULL); | | | |
| nanb = ((cvtb.i << 1) > 0xffe0000000000000ULL); | | | |
| if (nana && nanb) return a + b; | | | |
| if (nana) return b; | | | |
| if (nanb) return a; | | | |
| if ((cvta.d == 0.0) && (cvtb.d == 0.0)) { | | | |
| cvta.i &= cvtb.i; | | | |
| return cvta.d; | | | |
| } | | | |
| return a > b ? a : b; | | | |
| #else | | | |
| return fmax(a, b); | | | |
| #endif /* !defined(__CUDABE__) */ | | | |
| } | | | |
| | | | |
| __device_func__(double __cuda_fmin(double a, double b)) | | | |
| { | | | |
| #if !defined(__CUDABE__) | | | |
| volatile union __cudart_DoubleUlonglongCvt cvta, cvtb; | | | |
| int nana, nanb; | | | |
| | | | |
| cvta.d = a; | | | |
| cvtb.d = b; | | | |
| nana = ((cvta.i << 1) > 0xffe0000000000000ULL); | | | |
| nanb = ((cvtb.i << 1) > 0xffe0000000000000ULL); | | | |
| if (nana && nanb) return a + b; | | | |
| if (nana) return b; | | | |
| if (nanb) return a; | | | |
| if ((cvta.i | cvtb.i) == 0x8000000000000000ULL) { | | | |
| return CUDART_NEG_ZERO; | | | |
| } | | | |
| return a < b ? a : b; | | | |
| #else | | | |
| return fmin(a, b); | | | |
| #endif /* !defined(__CUDABE__) */ | | | |
| } | | | |
| | | | |
| __device_func__(double __cuda_ceil(double a)) | | | |
| { | | | |
| return ceil(a); | | | |
| } | | | |
| | | | |
| __device_func__(double __cuda_floor(double a)) | | | |
| { | | | |
| return floor(a); | | | |
| } | | | |
| | | | |
| __device_func__(double __cuda_trunc(double a)) | | | |
| { | | | |
| return trunc(a); | | | |
| } | | | |
| | | | |
| __device_func__(double __cuda_nearbyint(double a)) | | | |
| { | | | |
| #if defined(__CUDABE__) | | | |
| return round(a); | | | |
| #else /* __CUDABE__ */ | | | |
| double res = nearbyint(a); | | | |
| #if defined(__APPLE__) | | | |
| if ((a != 0.0) && (__cuda_fabs(a) <= 0.5)) { | | | |
| res = fabs(res) * ((a < 0.0) ? -3e-324 : 3e-324); | | | |
| } | | | |
| #endif /* __APPLE__ */ | | | |
| return res; | | | |
| #endif /* __CUDABE__ */ | | | |
| } | | | |
| | | | |
| | | static __forceinline__ double rint(double a) | |
| | | { | |
| | | return __builtin_round(a); | |
| | | } | |
| | | | |
| | | static __forceinline__ long int lrint(double a) | |
| | | { | |
| | | #if defined(__LP64__) | |
| | | return (long int)__double2ll_rn(a); | |
| | | #else /* __LP64__ */ | |
| | | return (long int)__double2int_rn(a); | |
| | | #endif /* __LP64__ */ | |
| | | } | |
| | | | |
| | | static __forceinline__ long long int llrint(double a) | |
| | | { | |
| | | return __double2ll_rn(a); | |
| | | } | |
| | | | |
| | | static __forceinline__ double nearbyint(double a) | |
| | | { | |
| | | return __builtin_round(a); | |
| | | } | |
| | | | |
| /***************************************************************************** | | /***************************************************************************** | |
| *                                                                           * | | *                                                                           * | |
| * DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITHOUT BUILTIN NVOPENCC OPERATIONS * | | * DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITHOUT BUILTIN NVOPENCC OPERATIONS * | |
| *                                                                           * | | *                                                                           * | |
| *****************************************************************************/ | | *****************************************************************************/ | |
| | | | |
| __device_func__(double __cuda_rint(double a)) | | | |
| { | | | |
| return __cuda_nearbyint(a); | | | |
| } | | | |
| | | | |
| __device_func__(long int __cuda_lrint(double a)) | | | |
| { | | | |
| #if defined(__LP64__) | | | |
| return (long int)__double2ll_rn(a); | | | |
| #else /* __LP64__ */ | | | |
| return (long int)__double2int_rn(a); | | | |
| #endif /* __LP64__ */ | | | |
| } | | | |
| | | | |
| __device_func__(long long int __cuda_llrint(double a)) | | | |
| { | | | |
| return __double2ll_rn(a); | | | |
| } | | | |
| | | | |
| __device_func__(int __cuda___signbit(double a)) | | static __forceinline__ int __signbit(double a) | |
| { | | { | |
| return (int)((unsigned int)__double2hiint(a) >> 31); | | return (int)((unsigned int)__double2hiint(a) >> 31); | |
| } | | } | |
| | | | |
| __device_func__(int __cuda___finite(double a)) | | static __forceinline__ int __finite(double a) | |
| { | | { | |
| return __cuda_fabs(a) < CUDART_INF; | | return fabs(a) < CUDART_INF; | |
| } | | } | |
| | | | |
| __device_func__(int __cuda___isinf(double a)) | | static __forceinline__ int __isinf(double a) | |
| { | | { | |
| return __cuda_fabs(a) == CUDART_INF; | | return fabs(a) == CUDART_INF; | |
| } | | } | |
| | | | |
|
| __device_func__(int __cuda___isnan(double a)) | | static __forceinline__ int __isnan(double a) | |
| { | | { | |
|
| return !(__cuda_fabs(a) <= CUDART_INF); | | return !(fabs(a) <= CUDART_INF); | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_copysign(double a, double b)) | | static __forceinline__ double copysign(double a, double b) | |
| { | | { | |
| int alo, ahi, bhi; | | int alo, ahi, bhi; | |
| | | | |
| bhi = __double2hiint(b); | | bhi = __double2hiint(b); | |
| alo = __double2loint(a); | | alo = __double2loint(a); | |
| ahi = __double2hiint(a); | | ahi = __double2hiint(a); | |
| ahi = (bhi & 0x80000000) | (ahi & ~0x80000000); | | ahi = (bhi & 0x80000000) | (ahi & ~0x80000000); | |
| return __hiloint2double(ahi, alo); | | return __hiloint2double(ahi, alo); | |
| } | | } | |
| | | | |
| /* like copysign, but requires that argument a is positive */ | | /* like copysign, but requires that argument a is positive */ | |
|
| __device_func__(double __internal_copysign_pos(double a, double b)) | | static __forceinline__ double __internal_copysign_pos(double a, double b) | |
| { | | { | |
| int alo, ahi, bhi; | | int alo, ahi, bhi; | |
| | | | |
| bhi = __double2hiint(b); | | bhi = __double2hiint(b); | |
| alo = __double2loint(a); | | alo = __double2loint(a); | |
| ahi = __double2hiint(a); | | ahi = __double2hiint(a); | |
| ahi = (bhi & 0x80000000) | ahi; | | ahi = (bhi & 0x80000000) | ahi; | |
| return __hiloint2double(ahi, alo); | | return __hiloint2double(ahi, alo); | |
| } | | } | |
| | | | |
|
| | | static __forceinline__ double __internal_fast_rcp(double a) | |
| | | { | |
| | | double e, y; | |
| | | float x; | |
| | | x = __double2float_rn(a); | |
| | | y = (double)(1.0f/x); | |
| | | e = __fma_rn (-a, y, 1.0); | |
| | | e = __fma_rn ( e, e, e); | |
| | | y = __fma_rn ( e, y, y); | |
| | | return y; | |
| | | } | |
| | | | |
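
The newly added __internal_fast_rcp seeds a reciprocal with a single-precision divide and sharpens it with three fused operations. Writing e = 1 - a*y for the seed's residual, the FMA chain computes, under exact arithmetic (a restatement of the code following the standard Newton-Raphson analysis, not additional header content):

\[ y' \;=\; y\,(1 + e + e^2) \;=\; \frac{1}{a}\,\bigl(1 - e^3\bigr), \]

so one pass cubes the relative error: roughly 2^-24 from the float divide drops to about 2^-72, comfortably below the 2^-53 needed at double precision.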
| /* 1152 bits of 2/PI for Payne-Hanek style argument reduction. */ | | /* 1152 bits of 2/PI for Payne-Hanek style argument reduction. */ | |
| static __constant__ unsigned long long int __cudart_i2opi_d [] = { | | static __constant__ unsigned long long int __cudart_i2opi_d [] = { | |
| 0x6bfb5fb11f8d5d08ULL, | | 0x6bfb5fb11f8d5d08ULL, | |
| 0x3d0739f78a5292eaULL, | | 0x3d0739f78a5292eaULL, | |
| 0x7527bac7ebe5f17bULL, | | 0x7527bac7ebe5f17bULL, | |
| 0x4f463f669e5fea2dULL, | | 0x4f463f669e5fea2dULL, | |
| 0x6d367ecf27cb09b7ULL, | | 0x6d367ecf27cb09b7ULL, | |
| 0xef2f118b5a0a6d1fULL, | | 0xef2f118b5a0a6d1fULL, | |
| 0x1ff897ffde05980fULL, | | 0x1ff897ffde05980fULL, | |
| 0x9c845f8bbdf9283bULL, | | 0x9c845f8bbdf9283bULL, | |
| | | | |
| skipping to change at line 231 | | skipping to change at line 156 | |
| 0xe88235f52ebb4484ULL, | | 0xe88235f52ebb4484ULL, | |
| 0xfe1deb1cb129a73eULL, | | 0xfe1deb1cb129a73eULL, | |
| 0x06492eea09d1921cULL, | | 0x06492eea09d1921cULL, | |
| 0xb7246e3a424dd2e0ULL, | | 0xb7246e3a424dd2e0ULL, | |
| 0xfe5163abdebbc561ULL, | | 0xfe5163abdebbc561ULL, | |
| 0xdb6295993c439041ULL, | | 0xdb6295993c439041ULL, | |
| 0xfc2757d1f534ddc0ULL, | | 0xfc2757d1f534ddc0ULL, | |
| 0xa2f9836e4e441529ULL, | | 0xa2f9836e4e441529ULL, | |
| }; | | }; | |
| | | | |
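
The table stores 2/pi in 64-bit chunks so that the reduction below can form the product a * (2/pi) to far more bits than a double carries. Only the bits adjacent to the binary point of that product matter; in outline (a summary of the standard Payne-Hanek scheme the code implements):

\[ a \cdot \tfrac{2}{\pi} \;=\; N + f, \qquad q = N \bmod 4, \qquad a \bmod \tfrac{\pi}{2} \;=\; f \cdot \tfrac{\pi}{2}, \]

where q is the quadrant returned through *quadrant. The index arithmetic below picks out at most four table words, the only ones whose partial products can contribute bits near the binary point for the given exponent.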
|
| __device_func__(double __internal_trig_reduction_kerneld(double a, int *quadrant)) | | static __forceinline__ double __internal_trig_reduction_kerneld(double a, int *quadrant) | |
| { | | { | |
| double j; | | double j; | |
| int q; | | int q; | |
|
| if (__cuda_fabs(a) > CUDART_TRIG_PLOSS) { | | if (fabs(a) > CUDART_TRIG_PLOSS) { | |
| /* Payne-Hanek style argument reduction. */ | | /* Payne-Hanek style argument reduction. */ | |
| unsigned long long int ia; | | unsigned long long int ia; | |
| unsigned long long int s; | | unsigned long long int s; | |
| unsigned long long int result[5]; | | unsigned long long int result[5]; | |
| unsigned long long int phi, plo; | | unsigned long long int phi, plo; | |
| unsigned long long int hi, lo; | | unsigned long long int hi, lo; | |
| unsigned int e; | | unsigned int e; | |
| int idx; | | int idx; | |
| | | | |
| ia = __double_as_longlong(a); | | ia = __double_as_longlong(a); | |
| s = ia & 0x8000000000000000ULL; | | s = ia & 0x8000000000000000ULL; | |
| e = (unsigned int)(((ia >> 52) & 0x7ff) - 1024); | | e = (unsigned int)(((ia >> 52) & 0x7ff) - 1024); | |
| ia = (ia << 11) | 0x8000000000000000ULL; | | ia = (ia << 11) | 0x8000000000000000ULL; | |
| /* compute x * 2/pi */ | | /* compute x * 2/pi */ | |
| idx = 16 - (e >> 6); | | idx = 16 - (e >> 6); | |
| hi = 0; | | hi = 0; | |
|
| #if defined(__CUDABE__) | | | |
| #pragma unroll 1 | | #pragma unroll 1 | |
|
| #endif /* __CUDABE__ */ | | | |
| for (q = (idx-1); q < min(18,idx+3); q++) { | | for (q = (idx-1); q < min(18,idx+3); q++) { | |
| plo = __cudart_i2opi_d[q] * ia; | | plo = __cudart_i2opi_d[q] * ia; | |
| phi = __umul64hi (__cudart_i2opi_d[q], ia); | | phi = __umul64hi (__cudart_i2opi_d[q], ia); | |
| lo = hi + plo; | | lo = hi + plo; | |
| hi = phi + (lo < plo); | | hi = phi + (lo < plo); | |
| result[q-(idx-1)] = lo; | | result[q-(idx-1)] = lo; | |
| } | | } | |
| result[q-(idx-1)] = hi; | | result[q-(idx-1)] = hi; | |
| e = e & 63; | | e = e & 63; | |
| /* shift result such that hi:lo<127:126> are the least significant | | /* shift result such that hi:lo<127:126> are the least significant | |
| | | | |
| skipping to change at line 324 | | skipping to change at line 247 | |
| * http://arxiv.org/PS_cache/arxiv/pdf/0708/0708.3722v1.pdf | | * http://arxiv.org/PS_cache/arxiv/pdf/0708/0708.3722v1.pdf | |
| */ | | */ | |
| a = __fma_rn (-j, 1.5707963267948966e+000, a); | | a = __fma_rn (-j, 1.5707963267948966e+000, a); | |
| a = __fma_rn (-j, 6.1232339957367574e-017, a); | | a = __fma_rn (-j, 6.1232339957367574e-017, a); | |
| a = __fma_rn (-j, 8.4784276603688985e-032, a); | | a = __fma_rn (-j, 8.4784276603688985e-032, a); | |
| *quadrant = q; | | *quadrant = q; | |
| return a; | | return a; | |
| } | | } | |
| | | | |
| /* approximate sine on -pi/4...+pi/4 */ | | /* approximate sine on -pi/4...+pi/4 */ | |
|
| __device_func__(double __internal_sin_kerneld(double x)) | | static __forceinline__ double __internal_sin_kerneld(double x) | |
| { | | { | |
| double x2, z; | | double x2, z; | |
| x2 = x * x; | | x2 = x * x; | |
| z = 1.5896230157221844E-010; | | z = 1.5896230157221844E-010; | |
| z = __fma_rn (z, x2, -2.5050747762850355E-008); | | z = __fma_rn (z, x2, -2.5050747762850355E-008); | |
| z = __fma_rn (z, x2, 2.7557313621385676E-006); | | z = __fma_rn (z, x2, 2.7557313621385676E-006); | |
| z = __fma_rn (z, x2, -1.9841269829589539E-004); | | z = __fma_rn (z, x2, -1.9841269829589539E-004); | |
| z = __fma_rn (z, x2, 8.3333333333221182E-003); | | z = __fma_rn (z, x2, 8.3333333333221182E-003); | |
| z = __fma_rn (z, x2, -1.6666666666666630E-001); | | z = __fma_rn (z, x2, -1.6666666666666630E-001); | |
| z = z * x2; | | z = z * x2; | |
| z = __fma_rn (z, x, x); | | z = __fma_rn (z, x, x); | |
| return z; | | return z; | |
| } | | } | |
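
The kernel is a minimax polynomial in x^2 evaluated by Horner's rule, one __fma_rn per coefficient. Schematically (restating the code above):

\[ \sin x \;\approx\; x + x^3\,\bigl(c_0 + c_1 x^2 + \cdots + c_5 x^{10}\bigr), \qquad c_0 \approx -\tfrac{1}{6}, \]

with the final __fma_rn(z, x, x) folding in the dominant term x last, so rounding of the small correction cannot disturb it.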
| | | | |
| /* approximate cosine on -pi/4...+pi/4 */ | | /* approximate cosine on -pi/4...+pi/4 */ | |
|
| __device_func__(double __internal_cos_kerneld(double x)) | | static __forceinline__ double __internal_cos_kerneld(double x) | |
| { | | { | |
| double x2, z; | | double x2, z; | |
| x2 = x * x; | | x2 = x * x; | |
| z = -1.136788825395985E-011; | | z = -1.136788825395985E-011; | |
| z = __fma_rn (z, x2, 2.087588480545065E-009); | | z = __fma_rn (z, x2, 2.087588480545065E-009); | |
| z = __fma_rn (z, x2, -2.755731555403950E-007); | | z = __fma_rn (z, x2, -2.755731555403950E-007); | |
| z = __fma_rn (z, x2, 2.480158729365970E-005); | | z = __fma_rn (z, x2, 2.480158729365970E-005); | |
| z = __fma_rn (z, x2, -1.388888888888074E-003); | | z = __fma_rn (z, x2, -1.388888888888074E-003); | |
| z = __fma_rn (z, x2, 4.166666666666664E-002); | | z = __fma_rn (z, x2, 4.166666666666664E-002); | |
| z = __fma_rn (z, x2, -5.000000000000000E-001); | | z = __fma_rn (z, x2, -5.000000000000000E-001); | |
| z = __fma_rn (z, x2, 1.000000000000000E+000); | | z = __fma_rn (z, x2, 1.000000000000000E+000); | |
| return z; | | return z; | |
| } | | } | |
| | | | |
| /* approximate tangent on -pi/4...+pi/4 */ | | /* approximate tangent on -pi/4...+pi/4 */ | |
|
| __device_func__(double __internal_tan_kerneld(double x, int i)) | | static __forceinline__ double __internal_tan_kerneld(double x, int i) | |
| { | | { | |
| double x2, z, q; | | double x2, z, q; | |
| x2 = x * x; | | x2 = x * x; | |
| z = 9.8006287203286300E-006; | | z = 9.8006287203286300E-006; | |
| z = __fma_rn (z, x2, -2.4279526494179897E-005); | | z = __fma_rn (z, x2, -2.4279526494179897E-005); | |
| z = __fma_rn (z, x2, 4.8644173130937162E-005); | | z = __fma_rn (z, x2, 4.8644173130937162E-005); | |
| z = __fma_rn (z, x2, -2.5640012693782273E-005); | | z = __fma_rn (z, x2, -2.5640012693782273E-005); | |
| z = __fma_rn (z, x2, 6.7223984330880073E-005); | | z = __fma_rn (z, x2, 6.7223984330880073E-005); | |
| z = __fma_rn (z, x2, 8.3559287318211639E-005); | | z = __fma_rn (z, x2, 8.3559287318211639E-005); | |
| z = __fma_rn (z, x2, 2.4375039850848564E-004); | | z = __fma_rn (z, x2, 2.4375039850848564E-004); | |
| | | | |
| skipping to change at line 388 | | skipping to change at line 311 | |
| double s = q - x; | | double s = q - x; | |
| double w = __fma_rn (z, x, -s); // tail of q | | double w = __fma_rn (z, x, -s); // tail of q | |
| z = 1.0 / q; | | z = 1.0 / q; | |
| z = -z; | | z = -z; | |
| s = __fma_rn (q, z, 1.0); | | s = __fma_rn (q, z, 1.0); | |
| q = __fma_rn (z, __fma_rn (z, w, s), z); | | q = __fma_rn (z, __fma_rn (z, w, s), z); | |
| } | | } | |
| return q; | | return q; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_sqrt(double a)) | | | |
| { | | | |
| return sqrt(a); | | | |
| } | | | |
| | | | |
| __device_func__(double __cuda_rsqrt(double a)) | | | |
| { | | | |
| #if !defined(__CUDABE__) | | | |
| return 1.0 / sqrt(a); | | | |
| #else | | | |
| return rsqrt(a); | | | |
| #endif | | | |
| } | | | |
| | | | |
| /* approximates exp(a)-1 on [-log(1.5),log(1.5)] accurate to 1 ulp */ | | /* approximates exp(a)-1 on [-log(1.5),log(1.5)] accurate to 1 ulp */ | |
|
| __device_func__(double __internal_expm1_kernel (double a)) | | static __forceinline__ double __internal_expm1_kernel (double a) | |
| { | | { | |
| double t; | | double t; | |
| t = 2.08842685477913050E-009; | | t = 2.08842685477913050E-009; | |
| t = __fma_rn (t, a, 2.51366409033551950E-008); | | t = __fma_rn (t, a, 2.51366409033551950E-008); | |
| t = __fma_rn (t, a, 2.75574612072447230E-007); | | t = __fma_rn (t, a, 2.75574612072447230E-007); | |
| t = __fma_rn (t, a, 2.75571539284473460E-006); | | t = __fma_rn (t, a, 2.75571539284473460E-006); | |
| t = __fma_rn (t, a, 2.48015869443077950E-005); | | t = __fma_rn (t, a, 2.48015869443077950E-005); | |
| t = __fma_rn (t, a, 1.98412699878799470E-004); | | t = __fma_rn (t, a, 1.98412699878799470E-004); | |
| t = __fma_rn (t, a, 1.38888888892029890E-003); | | t = __fma_rn (t, a, 1.38888888892029890E-003); | |
| t = __fma_rn (t, a, 8.33333333327662860E-003); | | t = __fma_rn (t, a, 8.33333333327662860E-003); | |
| t = __fma_rn (t, a, 4.16666666666656370E-002); | | t = __fma_rn (t, a, 4.16666666666656370E-002); | |
| t = __fma_rn (t, a, 1.66666666666667380E-001); | | t = __fma_rn (t, a, 1.66666666666667380E-001); | |
| t = __fma_rn (t, a, 5.00000000000000000E-001); | | t = __fma_rn (t, a, 5.00000000000000000E-001); | |
| t = t * a; | | t = t * a; | |
| t = __fma_rn (t, a, a); | | t = __fma_rn (t, a, a); | |
| return t; | | return t; | |
| } | | } | |
| | | | |
| /* approximate 2*atanh(0.5*a) on [-0.25,0.25] */ | | /* approximate 2*atanh(0.5*a) on [-0.25,0.25] */ | |
|
| __device_func__(double __internal_atanh_kernel (double a_1, double a_2)) | | static __forceinline__ double __internal_atanh_kernel (double a_1, double a_2) | |
| { | | { | |
| double a, a2, t; | | double a, a2, t; | |
| | | | |
| a = a_1 + a_2; | | a = a_1 + a_2; | |
| a2 = a * a; | | a2 = a * a; | |
| t = 7.597322383488143E-002/65536.0; | | t = 7.597322383488143E-002/65536.0; | |
| t = __fma_rn (t, a2, 6.457518383364042E-002/16384.0); | | t = __fma_rn (t, a2, 6.457518383364042E-002/16384.0); | |
| t = __fma_rn (t, a2, 7.705685707267146E-002/4096.0); | | t = __fma_rn (t, a2, 7.705685707267146E-002/4096.0); | |
| t = __fma_rn (t, a2, 9.090417561104036E-002/1024.0); | | t = __fma_rn (t, a2, 9.090417561104036E-002/1024.0); | |
| t = __fma_rn (t, a2, 1.111112158368149E-001/256.0); | | t = __fma_rn (t, a2, 1.111112158368149E-001/256.0); | |
| t = __fma_rn (t, a2, 1.428571416261528E-001/64.0); | | t = __fma_rn (t, a2, 1.428571416261528E-001/64.0); | |
| t = __fma_rn (t, a2, 2.000000000069858E-001/16.0); | | t = __fma_rn (t, a2, 2.000000000069858E-001/16.0); | |
| t = __fma_rn (t, a2, 3.333333333333198E-001/4.0); | | t = __fma_rn (t, a2, 3.333333333333198E-001/4.0); | |
| t = t * a2; | | t = t * a2; | |
| t = __fma_rn (t, a, a_2); | | t = __fma_rn (t, a, a_2); | |
| t = t + a_1; | | t = t + a_1; | |
| return t; | | return t; | |
| } | | } | |
| | | | |
|
| __device_func__(double __internal_exp2i_kernel(int b)) | | static __forceinline__ double __internal_exp2i_kernel(int b) | |
| { | | { | |
| return __hiloint2double((b + 1023) << 20, 0); | | return __hiloint2double((b + 1023) << 20, 0); | |
| } | | } | |
| | | | |
|
| __device_func__(double __internal_half(double a)) | | static __forceinline__ double __internal_half(double a) | |
| { | | { | |
| unsigned int ihi, ilo; | | unsigned int ihi, ilo; | |
| ilo = __double2loint(a); | | ilo = __double2loint(a); | |
| ihi = __double2hiint(a); | | ihi = __double2hiint(a); | |
| return __hiloint2double(ihi - 0x00100000, ilo); | | return __hiloint2double(ihi - 0x00100000, ilo); | |
| } | | } | |
| | | | |
|
| __device_func__(double __internal_twice(double a)) | | static __forceinline__ double __internal_twice(double a) | |
| { | | { | |
| unsigned int ihi, ilo; | | unsigned int ihi, ilo; | |
| ilo = __double2loint(a); | | ilo = __double2loint(a); | |
| ihi = __double2hiint(a); | | ihi = __double2hiint(a); | |
| return __hiloint2double(ihi + 0x00100000, ilo); | | return __hiloint2double(ihi + 0x00100000, ilo); | |
| } | | } | |
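
__internal_half and __internal_twice scale by a power of two simply by adding to the biased exponent field: 0x00100000 is a one in bit 20 of the high word, the lowest exponent bit. The shortcut is exact for normal numbers but wrong for zeros, denormals, infinities and NaNs, so callers apply it only to values already known to be in range. A host-side sketch of the same idea, assuming IEEE-754 doubles; scale_by_two is a name invented here:

#include <stdio.h>
#include <string.h>

/* Double a normal value by incrementing the exponent field directly
   (bit 52 of the 64-bit encoding is the low exponent bit). */
static double scale_by_two(double a)
{
    unsigned long long i;
    memcpy(&i, &a, sizeof i);
    i += 1ULL << 52;
    memcpy(&a, &i, sizeof a);
    return a;
}

int main(void)
{
    printf("%g\n", scale_by_two(3.0));   /* prints 6 */
    return 0;
}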
| | | | |
|
| __device_func__(double __cuda_sin(double a)) | | static __forceinline__ double sin(double a) | |
| { | | { | |
| double z; | | double z; | |
| int i; | | int i; | |
|
| if (__cuda___isinf(a) || (a == CUDART_ZERO)) { | | if (__isinf(a) || (a == CUDART_ZERO)) { | |
| return __dmul_rn(a, CUDART_ZERO); | | return __dmul_rn(a, CUDART_ZERO); | |
| } | | } | |
| z = __internal_trig_reduction_kerneld(a, &i); | | z = __internal_trig_reduction_kerneld(a, &i); | |
| /* here, abs(z) <= pi/4, and i has the quadrant */ | | /* here, abs(z) <= pi/4, and i has the quadrant */ | |
| if (i & 1) { | | if (i & 1) { | |
| z = __internal_cos_kerneld(z); | | z = __internal_cos_kerneld(z); | |
| } else { | | } else { | |
| z = __internal_sin_kerneld(z); | | z = __internal_sin_kerneld(z); | |
| } | | } | |
| if (i & 2) { | | if (i & 2) { | |
| z = -z; | | z = -z; | |
| } | | } | |
| return z; | | return z; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_sinpi(double a)) | | static __forceinline__ double sinpi(double a) | |
| { | | { | |
| double z; | | double z; | |
| double fi; | | double fi; | |
| int i; | | int i; | |
| | | | |
|
| if (__cuda___isinf(a) || (a == CUDART_ZERO)) { | | if (__isinf(a) || (a == CUDART_ZERO)) { | |
| return __dmul_rn(a, CUDART_ZERO); | | return __dmul_rn(a, CUDART_ZERO); | |
| } | | } | |
|   /* IEEE-754: sinPi(+n) is +0 and sinPi(-n) is -0 for positive integers n. */ | |   /* IEEE-754: sinPi(+n) is +0 and sinPi(-n) is -0 for positive integers n. */ | |
|
| if (a == __cuda_trunc(a)) { | | if (a == trunc(a)) { | |
|     return __longlong_as_double(__double_as_longlong(a)&0x8000000000000000ULL); | |     return __longlong_as_double(__double_as_longlong(a)&0x8000000000000000ULL); | |
| } | | } | |
|
| fi = __cuda_rint (a * 2.0); | | fi = rint (a * 2.0); | |
| z = __fma_rn (fi, -0.5, a); | | z = __fma_rn (fi, -0.5, a); | |
| z = __fma_rn (z, CUDART_PI_HI, z * CUDART_PI_LO); | | z = __fma_rn (z, CUDART_PI_HI, z * CUDART_PI_LO); | |
| i = (int)(((long long)fi) & 3); | | i = (int)(((long long)fi) & 3); | |
| if (i & 1) { | | if (i & 1) { | |
| z = __internal_cos_kerneld(z); | | z = __internal_cos_kerneld(z); | |
| } else { | | } else { | |
| z = __internal_sin_kerneld(z); | | z = __internal_sin_kerneld(z); | |
| } | | } | |
| if (i & 2) { | | if (i & 2) { | |
| z = -z; | | z = -z; | |
| } | | } | |
| return z; | | return z; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_cos(double a)) | | static __forceinline__ double cos(double a) | |
| { | | { | |
| double z; | | double z; | |
| int i; | | int i; | |
|
| if (__cuda___isinf(a)) { | | if (__isinf(a)) { | |
| return CUDART_NAN; | | return CUDART_NAN; | |
| } | | } | |
| z = __internal_trig_reduction_kerneld(a, &i); | | z = __internal_trig_reduction_kerneld(a, &i); | |
| /* here, abs(z) <= pi/4, and i has the quadrant */ | | /* here, abs(z) <= pi/4, and i has the quadrant */ | |
| i++; | | i++; | |
| if (i & 1) { | | if (i & 1) { | |
| z = __internal_cos_kerneld(z); | | z = __internal_cos_kerneld(z); | |
| } else { | | } else { | |
| z = __internal_sin_kerneld(z); | | z = __internal_sin_kerneld(z); | |
| } | | } | |
| if (i & 2) { | | if (i & 2) { | |
| z = -z; | | z = -z; | |
| } | | } | |
| return z; | | return z; | |
| } | | } | |
| | | | |
|
| __device_func__(void __cuda_sincos(double a, double *sptr, double *cptr)) | | static __forceinline__ void sincos(double a, double *sptr, double *cptr) | |
| { | | { | |
| double t, u, s, c; | | double t, u, s, c; | |
| int i; | | int i; | |
|
| t = __cuda_fabs(a); | | t = fabs(a); | |
| if ((t == CUDART_INF) || (t == CUDART_ZERO)) { | | if ((t == CUDART_INF) || (t == CUDART_ZERO)) { | |
| s = __dmul_rn (a, CUDART_ZERO); /* generate NaN, zero */ | | s = __dmul_rn (a, CUDART_ZERO); /* generate NaN, zero */ | |
| c = 1.0 + s; /* generate NaN, one */ | | c = 1.0 + s; /* generate NaN, one */ | |
| *sptr = s; | | *sptr = s; | |
| *cptr = c; | | *cptr = c; | |
| return; | | return; | |
| } | | } | |
| t = __internal_trig_reduction_kerneld(a, &i); | | t = __internal_trig_reduction_kerneld(a, &i); | |
| u = __internal_cos_kerneld(t); | | u = __internal_cos_kerneld(t); | |
| t = __internal_sin_kerneld(t); | | t = __internal_sin_kerneld(t); | |
| | | | |
| skipping to change at line 566 | | skipping to change at line 475 | |
| s = -s; | | s = -s; | |
| } | | } | |
| i++; | | i++; | |
| if (i & 2) { | | if (i & 2) { | |
| c = -c; | | c = -c; | |
| } | | } | |
| *sptr = s; | | *sptr = s; | |
| *cptr = c; | | *cptr = c; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_tan(double a)) | | static __forceinline__ double tan(double a) | |
| { | | { | |
| double z; | | double z; | |
| int i; | | int i; | |
|
| if (__cuda___isinf(a)) { | | if (__isinf(a)) { | |
| return __dadd_rn (a, -a); /* return NaN */ | | return __dadd_rn (a, -a); /* return NaN */ | |
| } | | } | |
| z = __internal_trig_reduction_kerneld(a, &i); | | z = __internal_trig_reduction_kerneld(a, &i); | |
| /* here, abs(z) <= pi/4, and i has the quadrant */ | | /* here, abs(z) <= pi/4, and i has the quadrant */ | |
| z = __internal_tan_kerneld(z, i & 1); | | z = __internal_tan_kerneld(z, i & 1); | |
| return z; | | return z; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_log(double a)) | | static __forceinline__ double log(double a) | |
| { | | { | |
| double m, f, g, u, v, tmp, q, ulo, log_lo, log_hi; | | double m, f, g, u, v, tmp, q, ulo, log_lo, log_hi; | |
| int ihi, ilo; | | int ihi, ilo; | |
| | | | |
| ihi = __double2hiint(a); | | ihi = __double2hiint(a); | |
| ilo = __double2loint(a); | | ilo = __double2loint(a); | |
| | | | |
| if ((a > CUDART_ZERO) && (a < CUDART_INF)) { | | if ((a > CUDART_ZERO) && (a < CUDART_INF)) { | |
| int e = -1023; | | int e = -1023; | |
| /* normalize denormals */ | | /* normalize denormals */ | |
| | | | |
| skipping to change at line 609 | | skipping to change at line 518 | |
| e += (ihi >> 20); | | e += (ihi >> 20); | |
| ihi = (ihi & 0x800fffff) | 0x3ff00000; | | ihi = (ihi & 0x800fffff) | 0x3ff00000; | |
| m = __hiloint2double (ihi, ilo); | | m = __hiloint2double (ihi, ilo); | |
| if ((unsigned)ihi > (unsigned)0x3ff6a09e) { | | if ((unsigned)ihi > (unsigned)0x3ff6a09e) { | |
| m = __internal_half(m); | | m = __internal_half(m); | |
| e = e + 1; | | e = e + 1; | |
| } | | } | |
| /* log((1+m)/(1-m)) = 2*atanh(m). log(m) = 2*atanh ((m-1)/(m+1)) */ | | /* log((1+m)/(1-m)) = 2*atanh(m). log(m) = 2*atanh ((m-1)/(m+1)) */ | |
| f = m - 1.0; | | f = m - 1.0; | |
| g = m + 1.0; | | g = m + 1.0; | |
|
| g = 1.0 / g; | | g = __internal_fast_rcp(g); | |
| u = f * g; | | u = f * g; | |
| u = u + u; | | u = u + u; | |
| /* u = 2.0 * (m - 1.0) / (m + 1.0) */ | | /* u = 2.0 * (m - 1.0) / (m + 1.0) */ | |
| v = u * u; | | v = u * u; | |
| q = 6.7261411553826339E-2/65536.0; | | q = 6.7261411553826339E-2/65536.0; | |
| q = __fma_rn (q, v, 6.6133829643643394E-2/16384.0); | | q = __fma_rn (q, v, 6.6133829643643394E-2/16384.0); | |
| q = __fma_rn (q, v, 7.6940931149150890E-2/4096.0); | | q = __fma_rn (q, v, 7.6940931149150890E-2/4096.0); | |
| q = __fma_rn (q, v, 9.0908745692137444E-2/1024.0); | | q = __fma_rn (q, v, 9.0908745692137444E-2/1024.0); | |
| q = __fma_rn (q, v, 1.1111111499059706E-1/256.0); | | q = __fma_rn (q, v, 1.1111111499059706E-1/256.0); | |
| q = __fma_rn (q, v, 1.4285714283305975E-1/64.0); | | q = __fma_rn (q, v, 1.4285714283305975E-1/64.0); | |
| | | | |
| skipping to change at line 640 | | skipping to change at line 549 | |
| log_lo = ulo + q; | | log_lo = ulo + q; | |
|     /* log_hi + log_lo = log(m)+e*log(2)=log(a) to more than double precision*/ | |     /* log_hi + log_lo = log(m)+e*log(2)=log(a) to more than double precision*/ | |
| q = __fma_rn ( e, CUDART_LN2_HI, log_hi); | | q = __fma_rn ( e, CUDART_LN2_HI, log_hi); | |
| tmp = __fma_rn (-e, CUDART_LN2_HI, q); | | tmp = __fma_rn (-e, CUDART_LN2_HI, q); | |
| tmp = tmp - log_hi; | | tmp = tmp - log_hi; | |
| log_hi = q; | | log_hi = q; | |
| log_lo = log_lo - tmp; | | log_lo = log_lo - tmp; | |
| log_lo = __fma_rn (e, CUDART_LN2_LO, log_lo); | | log_lo = __fma_rn (e, CUDART_LN2_LO, log_lo); | |
| return log_hi + log_lo; | | return log_hi + log_lo; | |
| } else { | | } else { | |
|
| if (__cuda___isnan(a)) { | | if (__isnan(a)) { | |
| return a + a; | | return a + a; | |
| } | | } | |
| /* log(0) = -INF */ | | /* log(0) = -INF */ | |
| if (a == 0) { | | if (a == 0) { | |
| return -CUDART_INF; | | return -CUDART_INF; | |
| } | | } | |
| /* log(INF) = INF */ | | /* log(INF) = INF */ | |
| if (a == CUDART_INF) { | | if (a == CUDART_INF) { | |
| return a; | | return a; | |
| } | | } | |
| /* log(x) is undefined for x < 0.0, return INDEFINITE */ | | /* log(x) is undefined for x < 0.0, return INDEFINITE */ | |
| return CUDART_NAN; | | return CUDART_NAN; | |
| } | | } | |
| } | | } | |
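
The normal path above rests on the atanh identity noted in the code. With a = m * 2^e and u = 2*(m - 1)/(m + 1), it evaluates (restated here for orientation, not additional header content):

\[ \ln a \;=\; \ln m + e \ln 2, \qquad \ln m \;=\; 2\,\operatorname{atanh}\!\Bigl(\frac{m-1}{m+1}\Bigr) \;=\; u + \frac{u^3}{12} + \frac{u^5}{80} + \cdots, \]

with the polynomial q supplying the series tail and e*ln(2) added in hi/lo pieces (CUDART_LN2_HI/CUDART_LN2_LO), so cancellation near powers of two does not cost accuracy.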
| | | | |
| /* Requires |x.y| > |y.y|. 8 DP operations */ | | /* Requires |x.y| > |y.y|. 8 DP operations */ | |
|
| __device_func__(double2 __internal_ddadd_xgty (double2 x, double2 y)) | | static __forceinline__ double2 __internal_ddadd_xgty (double2 x, double2 y) | |
| { | | { | |
| double2 z; | | double2 z; | |
|
| #if defined(__GNUC__) && !defined(__CUDABE__) | | | |
| volatile | | | |
| #endif | | | |
| double r, s, e; | | double r, s, e; | |
| r = x.y + y.y; | | r = x.y + y.y; | |
| e = x.y - r; | | e = x.y - r; | |
| s = ((e + y.y) + y.x) + x.x; | | s = ((e + y.y) + y.x) + x.x; | |
| z.y = e = r + s; | | z.y = e = r + s; | |
| z.x = (r - e) + s; | | z.x = (r - e) + s; | |
| return z; | | return z; | |
| } | | } | |
| | | | |
| /* Take full advantage of FMA. Only 8 DP operations */ | | /* Take full advantage of FMA. Only 8 DP operations */ | |
|
| __device_func__(double2 __internal_ddmul (double2 x, double2 y)) | | static __forceinline__ double2 __internal_ddmul (double2 x, double2 y) | |
| { | | { | |
|
| #if defined(__GNUC__) && !defined(__CUDABE__) | | | |
| volatile | | | |
| #endif | | | |
| double e; | | double e; | |
| double2 t, z; | | double2 t, z; | |
| t.y = x.y * y.y; | | t.y = x.y * y.y; | |
| t.x = __fma_rn (x.y, y.y, -t.y); | | t.x = __fma_rn (x.y, y.y, -t.y); | |
| t.x = __fma_rn (x.x, y.x, t.x); | | t.x = __fma_rn (x.x, y.x, t.x); | |
| t.x = __fma_rn (x.y, y.x, t.x); | | t.x = __fma_rn (x.y, y.x, t.x); | |
| t.x = __fma_rn (x.x, y.y, t.x); | | t.x = __fma_rn (x.x, y.y, t.x); | |
| z.y = e = t.y + t.x; | | z.y = e = t.y + t.x; | |
| z.x = (t.y - e) + t.x; | | z.x = (t.y - e) + t.x; | |
| return z; | | return z; | |
| } | | } | |
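
These two helpers implement double-double arithmetic: a value is carried as an unevaluated sum head + tail (fields .y and .x), worth roughly 106 significand bits, and as the comments note each operation costs only 8 double-precision instructions. The multiply relies on the FMA to recover the head product's rounding error exactly: __fma_rn(x.y, y.y, -t.y) is x.y*y.y - t.y with no intermediate rounding. The same extraction works on any host with a correctly rounded fma; a sketch, not header code:

#include <math.h>
#include <stdio.h>

int main(void)
{
    /* Split a product into head + exact tail: x*y == head + tail. */
    double x = 1.0 + 1e-8, y = 1.0 - 1e-8;
    double head = x * y;
    double tail = fma(x, y, -head);
    printf("head = %.17g, tail = %.17g\n", head, tail);
    return 0;
}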
| | | | |
|
| __device_func__(double2 __internal_log_ext_prec(double a)) | | static __forceinline__ double2 __internal_log_ext_prec(double a) | |
| { | | { | |
| double2 res; | | double2 res; | |
| double2 qq, cc, uu, tt; | | double2 qq, cc, uu, tt; | |
| double f, g, u, v, q, ulo, tmp, m; | | double f, g, u, v, q, ulo, tmp, m; | |
| int ilo, ihi, expo; | | int ilo, ihi, expo; | |
| | | | |
| ihi = __double2hiint(a); | | ihi = __double2hiint(a); | |
| ilo = __double2loint(a); | | ilo = __double2loint(a); | |
| expo = (ihi >> 20) & 0x7ff; | | expo = (ihi >> 20) & 0x7ff; | |
| /* convert denormals to normals for computation of log(a) */ | | /* convert denormals to normals for computation of log(a) */ | |
| | | | |
| skipping to change at line 726 | | skipping to change at line 629 | |
| m = __internal_half(m); | | m = __internal_half(m); | |
| expo = expo + 1; | | expo = expo + 1; | |
| } | | } | |
| /* compute log(m) with extended precision using an algorithm derived from | | /* compute log(m) with extended precision using an algorithm derived from | |
| * P.T.P. Tang, "Table Driven Implementation of the Logarithm Function", | | * P.T.P. Tang, "Table Driven Implementation of the Logarithm Function", | |
|    * TOMS, Vol. 16., No. 4, December 1990, pp. 378-400. A modified polynomial | |    * TOMS, Vol. 16., No. 4, December 1990, pp. 378-400. A modified polynomial | |
|    * approximation to atanh(x) on the interval [-0.1716, 0.1716] is utilized. | |    * approximation to atanh(x) on the interval [-0.1716, 0.1716] is utilized. | |
| */ | | */ | |
| f = m - 1.0; | | f = m - 1.0; | |
| g = m + 1.0; | | g = m + 1.0; | |
|
| g = 1.0 / g; | | g = __internal_fast_rcp(g); | |
| u = f * g; | | u = f * g; | |
| u = u + u; | | u = u + u; | |
| /* u = 2.0 * (m - 1.0) / (m + 1.0) */ | | /* u = 2.0 * (m - 1.0) / (m + 1.0) */ | |
| v = u * u; | | v = u * u; | |
| q = 6.6253631649203309E-2/65536.0; | | q = 6.6253631649203309E-2/65536.0; | |
| q = __fma_rn (q, v, 6.6250935587260612E-2/16384.0); | | q = __fma_rn (q, v, 6.6250935587260612E-2/16384.0); | |
| q = __fma_rn (q, v, 7.6935437806732829E-2/4096.0); | | q = __fma_rn (q, v, 7.6935437806732829E-2/4096.0); | |
| q = __fma_rn (q, v, 9.0908878711093280E-2/1024.0); | | q = __fma_rn (q, v, 9.0908878711093280E-2/1024.0); | |
| q = __fma_rn (q, v, 1.1111111322892790E-1/256.0); | | q = __fma_rn (q, v, 1.1111111322892790E-1/256.0); | |
| q = __fma_rn (q, v, 1.4285714284546502E-1/64.0); | | q = __fma_rn (q, v, 1.4285714284546502E-1/64.0); | |
| | | | |
| skipping to change at line 766 | | skipping to change at line 669 | |
| u = uu.y; | | u = uu.y; | |
| ulo = uu.x; | | ulo = uu.x; | |
| /* log(2)*expo in double-double format */ | | /* log(2)*expo in double-double format */ | |
| tt.y = expo * 6.9314718055966296e-001; /* multiplication is exact */ | | tt.y = expo * 6.9314718055966296e-001; /* multiplication is exact */ | |
| tt.x = expo * 2.8235290563031577e-013; | | tt.x = expo * 2.8235290563031577e-013; | |
|   /* log(a) = log(m) + log(2)*expo; if expo != 0, |log(2)*expo| > |log(m)| */ | |   /* log(a) = log(m) + log(2)*expo; if expo != 0, |log(2)*expo| > |log(m)| */ | |
| res = __internal_ddadd_xgty (tt, uu); | | res = __internal_ddadd_xgty (tt, uu); | |
| return res; | | return res; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_log2(double a)) | | static __forceinline__ double log2(double a) | |
| { | | { | |
| double t; | | double t; | |
|
| t = __cuda_log(a); | | t = log(a); | |
| return __fma_rn (t, CUDART_L2E_HI, t * CUDART_L2E_LO); | | return __fma_rn (t, CUDART_L2E_HI, t * CUDART_L2E_LO); | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_log10(double a)) | | static __forceinline__ double log10(double a) | |
| { | | { | |
| double t; | | double t; | |
|
| t = __cuda_log(a); | | t = log(a); | |
| return __fma_rn (t, CUDART_LGE_HI, t * CUDART_LGE_LO); | | return __fma_rn (t, CUDART_LGE_HI, t * CUDART_LGE_LO); | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_log1p(double a)) | | static __forceinline__ double log1p(double a) | |
| { | | { | |
| double t; | | double t; | |
| int i; | | int i; | |
| | | | |
| i = __double2hiint(a); | | i = __double2hiint(a); | |
| if (((unsigned)i < (unsigned)0x3fe55555) || ((int)i < (int)0xbfd99999)) { | | if (((unsigned)i < (unsigned)0x3fe55555) || ((int)i < (int)0xbfd99999)) { | |
| /* Compute log2(a+1) = 2*atanh(a/(a+2)) */ | | /* Compute log2(a+1) = 2*atanh(a/(a+2)) */ | |
| t = a + 2.0; | | t = a + 2.0; | |
| t = a / t; | | t = a / t; | |
| t = -a * t; | | t = -a * t; | |
| t = __internal_atanh_kernel(a, t); | | t = __internal_atanh_kernel(a, t); | |
| return t; | | return t; | |
| } | | } | |
|
| return __cuda_log (a + CUDART_ONE); | | return log (a + CUDART_ONE); | |
| } | | } | |
| | | | |
|
| __device_func__(double __internal_exp_kernel(double a, int scale)) | | static __forceinline__ double __internal_exp_kernel(double a, int scale) | |
| { | | { | |
| double t, fac, z; | | double t, fac, z; | |
|
| int i; | | int i, k; | |
| /* exp(a) = 2^(rint(a/log(2)) + z) = 2^(i + z) */ | | /* exp(a) = 2^(rint(a/log(2)) + z) = 2^(i + z) */ | |
|
| t = __cuda_rint (a * CUDART_L2E); | | t = rint (a * CUDART_L2E); | |
| i = (int)t; | | i = (int)t; | |
| z = __fma_rn (t, -CUDART_LN2_HI, a); | | z = __fma_rn (t, -CUDART_LN2_HI, a); | |
| z = __fma_rn (t, -CUDART_LN2_LO, z); | | z = __fma_rn (t, -CUDART_LN2_LO, z); | |
|
| fac = 2.0; | | k = 0x40000000; | |
| if (i <= -1021) { | | if (i <= -1021) { | |
| i += 55; | | i += 55; | |
|
| fac = CUDART_TWO_TO_M54; | | k -= 55 << 20; | |
| } | | } | |
|
| | | fac = __hiloint2double(k, 0); /* 2^-54 if a is denormal, 2.0 otherwise */ | |
| /* exp(a) = 2^i * e^z */ | | /* exp(a) = 2^i * e^z */ | |
| t = __internal_expm1_kernel(z); | | t = __internal_expm1_kernel(z); | |
|
| z = __internal_exp2i_kernel(i + scale - 1); | | z = __hiloint2double(((i + scale) << 20) + ((-1 + 1023) << 20), 0); | |
| t = __fma_rn (t, z, z); | | t = __fma_rn (t, z, z); | |
| t = t * fac; | | t = t * fac; | |
| return t; | | return t; | |
| } | | } | |
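
The kernel follows the two-step scheme restated from its comments:

\[ e^{a} \;=\; 2^{\,i}\, e^{z}, \qquad i = \operatorname{rint}(a \log_2 e), \qquad z = a - i \ln 2, \]

where ln(2) is subtracted in hi/lo parts so z is accurate well beyond working precision and |z| <= (ln 2)/2 keeps the expm1 kernel converging fast. The i <= -1021 branch shifts 55 units of scaling into the final multiply (2^-54 in place of 2.0) so the intermediate power of two stays normal instead of flushing to zero.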
| | | | |
|
| __device_func__(double __cuda_exp(double a)) | | static __forceinline__ double exp(double a) | |
| { | | { | |
| double t; | | double t; | |
| int i; | | int i; | |
| i = __double2hiint(a); | | i = __double2hiint(a); | |
| if (((unsigned)i < (unsigned)0x40862e43) || ((int)i < (int)0xC0874911)) { | | if (((unsigned)i < (unsigned)0x40862e43) || ((int)i < (int)0xC0874911)) { | |
| t = __internal_exp_kernel(a, 0); | | t = __internal_exp_kernel(a, 0); | |
| return t; | | return t; | |
| } | | } | |
|
| t = ((unsigned int)i >> 31) ? CUDART_ZERO : CUDART_INF; | | t = (i < 0) ? CUDART_ZERO : CUDART_INF; | |
| if (__cuda___isnan(a)) { | | if (__isnan(a)) { | |
| t = a + a; | | t = a + a; | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_exp2(double a)) | | static __forceinline__ double exp2(double a) | |
| { | | { | |
| double z; | | double z; | |
| double t; | | double t; | |
| double fac; | | double fac; | |
| int i; | | int i; | |
| | | | |
| i = __double2hiint(a); | | i = __double2hiint(a); | |
| if (((unsigned)i < (unsigned)0x40900000) || ((int)i < (int)0xc090cc00)) { | | if (((unsigned)i < (unsigned)0x40900000) || ((int)i < (int)0xc090cc00)) { | |
|
| t = __cuda_rint (a); | | t = rint (a); | |
| z = a - t; | | z = a - t; | |
| i = (int)t; | | i = (int)t; | |
| fac = 2.0; | | fac = 2.0; | |
| if (i <= -1021) { | | if (i <= -1021) { | |
| i += 55; | | i += 55; | |
| fac = CUDART_TWO_TO_M54; | | fac = CUDART_TWO_TO_M54; | |
| } | | } | |
| /* 2^z = exp(log(2)*z) */ | | /* 2^z = exp(log(2)*z) */ | |
| z = __fma_rn (z, CUDART_LN2_HI, z * CUDART_LN2_LO); | | z = __fma_rn (z, CUDART_LN2_HI, z * CUDART_LN2_LO); | |
| t = __internal_expm1_kernel(z); | | t = __internal_expm1_kernel(z); | |
| z = __internal_exp2i_kernel(i - 1); | | z = __internal_exp2i_kernel(i - 1); | |
| t = __fma_rn (t, z, z); | | t = __fma_rn (t, z, z); | |
| t = t * fac; | | t = t * fac; | |
| return t; | | return t; | |
| } | | } | |
|
| t = ((unsigned int)i >> 31) ? CUDART_ZERO : CUDART_INF; | | t = (i < 0) ? CUDART_ZERO : CUDART_INF; | |
| if (__cuda___isnan(a)) { | | if (__isnan(a)) { | |
| t = a + a; | | t = a + a; | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_exp10(double a)) | | static __forceinline__ double exp10(double a) | |
| { | | { | |
| double z; | | double z; | |
| double t; | | double t; | |
| double fac; | | double fac; | |
| int i; | | int i; | |
| | | | |
| i = __double2hiint(a); | | i = __double2hiint(a); | |
| if (((unsigned)i < (unsigned)0x40734414) || ((int)i < (int)0xc07439b8)) { | | if (((unsigned)i < (unsigned)0x40734414) || ((int)i < (int)0xc07439b8)) { | |
|
| t = __cuda_rint (a * CUDART_L2T); | | t = rint (a * CUDART_L2T); | |
| i = (int)t; | | i = (int)t; | |
| z = __fma_rn (t, -CUDART_LG2_HI, a); | | z = __fma_rn (t, -CUDART_LG2_HI, a); | |
| z = __fma_rn (t, -CUDART_LG2_LO, z); | | z = __fma_rn (t, -CUDART_LG2_LO, z); | |
| fac = 2.0; | | fac = 2.0; | |
| if (i <= -1021) { | | if (i <= -1021) { | |
| i += 55; | | i += 55; | |
| fac = CUDART_TWO_TO_M54; | | fac = CUDART_TWO_TO_M54; | |
| } | | } | |
| /* 2^z = exp(log(10)*z) */ | | /* 2^z = exp(log(10)*z) */ | |
| z = __fma_rn (z, CUDART_LNT_HI, z * CUDART_LNT_LO); | | z = __fma_rn (z, CUDART_LNT_HI, z * CUDART_LNT_LO); | |
| t = __internal_expm1_kernel(z); | | t = __internal_expm1_kernel(z); | |
| z = __internal_exp2i_kernel(i - 1); | | z = __internal_exp2i_kernel(i - 1); | |
| t = __fma_rn (t, z, z); | | t = __fma_rn (t, z, z); | |
| t = t * fac; | | t = t * fac; | |
| return t; | | return t; | |
| } | | } | |
|
| t = ((unsigned int)i >> 31) ? CUDART_ZERO : CUDART_INF; | | t = (i < 0) ? CUDART_ZERO : CUDART_INF; | |
| if (__cuda___isnan(a)) { | | if (__isnan(a)) { | |
| t = a + a; | | t = a + a; | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_expm1(double a)) | | static __forceinline__ double expm1(double a) | |
| { | | { | |
| double t, z, u; | | double t, z, u; | |
| int i, j, k; | | int i, j, k; | |
| | | | |
| k = __double2hiint(a); | | k = __double2hiint(a); | |
| if (((unsigned)k < (unsigned)0x40862e43) || ((int)k < (int)0xc04a8000)) { | | if (((unsigned)k < (unsigned)0x40862e43) || ((int)k < (int)0xc04a8000)) { | |
|
| t = __cuda_rint (a * CUDART_L2E); | | t = rint (a * CUDART_L2E); | |
| i = (int)t; | | i = (int)t; | |
| z = __fma_rn (t, -CUDART_LN2_HI, a); | | z = __fma_rn (t, -CUDART_LN2_HI, a); | |
| z = __fma_rn (t, -CUDART_LN2_LO, z); | | z = __fma_rn (t, -CUDART_LN2_LO, z); | |
| k = k + k; | | k = k + k; | |
| if ((unsigned)k < (unsigned)0x7fb3e647) { | | if ((unsigned)k < (unsigned)0x7fb3e647) { | |
| z = a; | | z = a; | |
| i = 0; | | i = 0; | |
| } | | } | |
| t = __internal_expm1_kernel(z); | | t = __internal_expm1_kernel(z); | |
| j = i; | | j = i; | |
| if (i == 1024) j--; | | if (i == 1024) j--; | |
| u = __internal_exp2i_kernel(j); | | u = __internal_exp2i_kernel(j); | |
| a = u - 1.0; | | a = u - 1.0; | |
| t = __fma_rn (t, u, a); | | t = __fma_rn (t, u, a); | |
| if (i == 1024) t = t + t; | | if (i == 1024) t = t + t; | |
| if (k == 0) t = z; /* preserve -0 */ | | if (k == 0) t = z; /* preserve -0 */ | |
| return t; | | return t; | |
| } | | } | |
|
| t = ((unsigned int)k >> 31) ? -CUDART_ONE : CUDART_INF; | | t = (k < 0) ? -CUDART_ONE : CUDART_INF; | |
| if (__cuda___isnan(a)) { | | if (__isnan(a)) { | |
| t = a + a; | | t = a + a; | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_cosh(double a)) | | static __forceinline__ double cosh(double a) | |
| { | | { | |
| double z; | | double z; | |
| int i; | | int i; | |
| | | | |
|
| z = __cuda_fabs(a); | | z = fabs(a); | |
| i = __double2hiint(z); | | i = __double2hiint(z); | |
| if ((unsigned)i < (unsigned)0x408633cf) { | | if ((unsigned)i < (unsigned)0x408633cf) { | |
| z = __internal_exp_kernel(z, -2); | | z = __internal_exp_kernel(z, -2); | |
| z = __fma_rn(2.0, z, 0.125 / z); | | z = __fma_rn(2.0, z, 0.125 / z); | |
| return z; | | return z; | |
| } else { | | } else { | |
| if (z > 0.0) a = CUDART_INF_F; | | if (z > 0.0) a = CUDART_INF_F; | |
| return a + a; | | return a + a; | |
| } | | } | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_sinh(double a)) | | static __forceinline__ double sinh(double a) | |
| { | | { | |
| double s, z; | | double s, z; | |
| s = a; | | s = a; | |
|
| a = __cuda_fabs(a); | | a = fabs(a); | |
| if (a < 1.0) { /* danger of catastrophic cancellation */ | | if (a < 1.0) { /* danger of catastrophic cancellation */ | |
| double a2 = a * a; | | double a2 = a * a; | |
| /* approximate sinh(x) on [0,1] with a polynomial */ | | /* approximate sinh(x) on [0,1] with a polynomial */ | |
| z = 1.632386098183803E-010; | | z = 1.632386098183803E-010; | |
| z = __fma_rn (z, a2, 2.504854501385687E-008); | | z = __fma_rn (z, a2, 2.504854501385687E-008); | |
| z = __fma_rn (z, a2, 2.755734274788706E-006); | | z = __fma_rn (z, a2, 2.755734274788706E-006); | |
| z = __fma_rn (z, a2, 1.984126976294102E-004); | | z = __fma_rn (z, a2, 1.984126976294102E-004); | |
| z = __fma_rn (z, a2, 8.333333333452911E-003); | | z = __fma_rn (z, a2, 8.333333333452911E-003); | |
| z = __fma_rn (z, a2, 1.666666666666606E-001); | | z = __fma_rn (z, a2, 1.666666666666606E-001); | |
| z = z * a2; | | z = z * a2; | |
| z = __fma_rn (z, a, a); | | z = __fma_rn (z, a, a); | |
|   } else if (a < 2.0) {   /* work around accuracy issue in vicinity of 1.4 */ | |   } else if (a < 2.0) {   /* work around accuracy issue in vicinity of 1.4 */ | |
|
| z = __cuda_expm1(a); | | z = expm1(a); | |
| z = __internal_half (z + z / (z + 1.0)); | | z = __internal_half (z + z / (z + 1.0)); | |
| } else { | | } else { | |
| z = __internal_exp_kernel(a, -1); | | z = __internal_exp_kernel(a, -1); | |
| z = z + (1.0 / (-4.0 * z)); | | z = z + (1.0 / (-4.0 * z)); | |
| if (a >= CUDART_LN2_X_1025) { | | if (a >= CUDART_LN2_X_1025) { | |
| z = CUDART_INF; /* overflow -> infinity */ | | z = CUDART_INF; /* overflow -> infinity */ | |
| } | | } | |
| } | | } | |
| z = __internal_copysign_pos(z, s); | | z = __internal_copysign_pos(z, s); | |
| return z; | | return z; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_tanh(double a)) | | static __forceinline__ double tanh(double a) | |
| { | | { | |
| double t; | | double t; | |
|
| t = __cuda_fabs(a); | | t = fabs(a); | |
| if (t >= 0.55) { | | if (t >= 0.55) { | |
| double s; | | double s; | |
| s = 1.0 - 2.0 / (__internal_exp_kernel(2.0 * t, 0) + 1.0); | | s = 1.0 - 2.0 / (__internal_exp_kernel(2.0 * t, 0) + 1.0); | |
| if (t > 350.0) { | | if (t > 350.0) { | |
| s = 1.0; /* overflow -> 1.0 */ | | s = 1.0; /* overflow -> 1.0 */ | |
| } | | } | |
| a = __internal_copysign_pos(s, a); | | a = __internal_copysign_pos(s, a); | |
| } else { | | } else { | |
| double a2; | | double a2; | |
| a2 = a * a; | | a2 = a * a; | |
| | | | |
| skipping to change at line 1011 | | skipping to change at line 915 | |
| t = __fma_rn (t, a2, -5.396825387607743E-002); | | t = __fma_rn (t, a2, -5.396825387607743E-002); | |
| t = __fma_rn (t, a2, 1.333333333316870E-001); | | t = __fma_rn (t, a2, 1.333333333316870E-001); | |
| t = __fma_rn (t, a2, -3.333333333333232E-001); | | t = __fma_rn (t, a2, -3.333333333333232E-001); | |
| t = t * a2; | | t = t * a2; | |
| t = __fma_rn (t, a, a); | | t = __fma_rn (t, a, a); | |
| a = __internal_copysign_pos(t, a); | | a = __internal_copysign_pos(t, a); | |
| } | | } | |
| return a; | | return a; | |
| } | | } | |
| | | | |
|
| __device_func__(double __internal_atan_kernel(double a)) | | static __forceinline__ double __internal_atan_kernel(double a) | |
| { | | { | |
| double t, a2; | | double t, a2; | |
| a2 = a * a; | | a2 = a * a; | |
| t = -2.0258553044438358E-005 ; | | t = -2.0258553044438358E-005 ; | |
| t = __fma_rn (t, a2, 2.2302240345758510E-004); | | t = __fma_rn (t, a2, 2.2302240345758510E-004); | |
| t = __fma_rn (t, a2, -1.1640717779930576E-003); | | t = __fma_rn (t, a2, -1.1640717779930576E-003); | |
| t = __fma_rn (t, a2, 3.8559749383629918E-003); | | t = __fma_rn (t, a2, 3.8559749383629918E-003); | |
| t = __fma_rn (t, a2, -9.1845592187165485E-003); | | t = __fma_rn (t, a2, -9.1845592187165485E-003); | |
| t = __fma_rn (t, a2, 1.6978035834597331E-002); | | t = __fma_rn (t, a2, 1.6978035834597331E-002); | |
| t = __fma_rn (t, a2, -2.5826796814495994E-002); | | t = __fma_rn (t, a2, -2.5826796814495994E-002); | |
| | | | |
| skipping to change at line 1039 | | skipping to change at line 943 | |
| t = __fma_rn (t, a2, -9.0909012354005225E-002); | | t = __fma_rn (t, a2, -9.0909012354005225E-002); | |
| t = __fma_rn (t, a2, 1.1111110678749424E-001); | | t = __fma_rn (t, a2, 1.1111110678749424E-001); | |
| t = __fma_rn (t, a2, -1.4285714271334815E-001); | | t = __fma_rn (t, a2, -1.4285714271334815E-001); | |
| t = __fma_rn (t, a2, 1.9999999999755019E-001); | | t = __fma_rn (t, a2, 1.9999999999755019E-001); | |
| t = __fma_rn (t, a2, -3.3333333333331860E-001); | | t = __fma_rn (t, a2, -3.3333333333331860E-001); | |
| t = t * a2; | | t = t * a2; | |
| t = __fma_rn (t, a, a); | | t = __fma_rn (t, a, a); | |
| return t; | | return t; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_atan2(double a, double b)) | | static __forceinline__ double atan2(double a, double b) | |
| { | | { | |
| double t0, t1, t3; | | double t0, t1, t3; | |
|
| if (__cuda___isnan(a) || __cuda___isnan(b)) { | | if (__isnan(a) || __isnan(b)) { | |
| return a + b; | | return a + b; | |
| } | | } | |
| /* reduce arguments to first octant */ | | /* reduce arguments to first octant */ | |
| /* r = (|x| < |y|) ? (|x| / |y|) : (|y| / |x|) */ | | /* r = (|x| < |y|) ? (|x| / |y|) : (|y| / |x|) */ | |
|
| t3 = __cuda_fabs(b); | | t3 = fabs(b); | |
| t1 = __cuda_fabs(a); | | t1 = fabs(a); | |
| if (t3 == 0.0 && t1 == 0.0) { | | if (t3 == 0.0 && t1 == 0.0) { | |
|
| t3 = __cuda___signbit(b) ? CUDART_PI : 0; | | t3 = (__double2hiint(b) < 0) ? CUDART_PI : 0; | |
| } else if (__cuda___isinf(t3) && __cuda___isinf(t1)) { | | } else if (__isinf(t3) && __isinf(t1)) { | |
| t3 = __cuda___signbit(b) ? CUDART_3PIO4 : CUDART_PIO4; | | t3 = (__double2hiint(b) < 0) ? CUDART_3PIO4 : CUDART_PIO4; | |
| } else { | | } else { | |
|
| t0 = __cuda_fmax (t1, t3); | | t0 = fmax (t1, t3); | |
| t1 = __cuda_fmin (t1, t3); | | t1 = fmin (t1, t3); | |
| t3 = t1 / t0; | | t3 = t1 / t0; | |
| t3 = __internal_atan_kernel(t3); | | t3 = __internal_atan_kernel(t3); | |
| /* Map result according to octant. */ | | /* Map result according to octant. */ | |
|
| if (__cuda_fabs(a) > __cuda_fabs(b)) t3 = CUDART_PIO2 - t3; | | if (fabs(a) > fabs(b)) t3 = CUDART_PIO2 - t3; | |
| if (b < 0.0) t3 = CUDART_PI - t3; | | if (b < 0.0) t3 = CUDART_PI - t3; | |
| } | | } | |
| t3 = __internal_copysign_pos(t3, a); | | t3 = __internal_copysign_pos(t3, a); | |
| return t3; | | return t3; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_atan(double a)) | | static __forceinline__ double atan(double a) | |
| { | | { | |
| double t0, t1; | | double t0, t1; | |
| /* reduce argument to first octant */ | | /* reduce argument to first octant */ | |
|
| t0 = __cuda_fabs(a); | | t0 = fabs(a); | |
| t1 = t0; | | t1 = t0; | |
| if (t0 > 1.0) { | | if (t0 > 1.0) { | |
| t1 = 1.0 / t1; | | t1 = 1.0 / t1; | |
| } | | } | |
| /* approximate atan(r) in first octant */ | | /* approximate atan(r) in first octant */ | |
| t1 = __internal_atan_kernel(t1); | | t1 = __internal_atan_kernel(t1); | |
| /* map result according to octant. */ | | /* map result according to octant. */ | |
| if (t0 > 1.0) { | | if (t0 > 1.0) { | |
| t1 = CUDART_PIO2 - t1; | | t1 = CUDART_PIO2 - t1; | |
| } | | } | |
| return __internal_copysign_pos(t1, a); | | return __internal_copysign_pos(t1, a); | |
| } | | } | |
| | | | |
| /* b should be the square of a */ | | /* b should be the square of a */ | |
|
| __device_func__(double __internal_asin_kernel(double a, double b)) | | static __forceinline__ double __internal_asin_kernel(double a, double b) | |
| { | | { | |
| double r; | | double r; | |
| r = 6.259798167646803E-002; | | r = 6.259798167646803E-002; | |
| r = __fma_rn (r, b, -7.620591484676952E-002); | | r = __fma_rn (r, b, -7.620591484676952E-002); | |
| r = __fma_rn (r, b, 6.686894879337643E-002); | | r = __fma_rn (r, b, 6.686894879337643E-002); | |
| r = __fma_rn (r, b, -1.787828218369301E-002); | | r = __fma_rn (r, b, -1.787828218369301E-002); | |
| r = __fma_rn (r, b, 1.745227928732326E-002); | | r = __fma_rn (r, b, 1.745227928732326E-002); | |
| r = __fma_rn (r, b, 1.000422754245580E-002); | | r = __fma_rn (r, b, 1.000422754245580E-002); | |
| r = __fma_rn (r, b, 1.418108777515123E-002); | | r = __fma_rn (r, b, 1.418108777515123E-002); | |
| r = __fma_rn (r, b, 1.733194598980628E-002); | | r = __fma_rn (r, b, 1.733194598980628E-002); | |
| r = __fma_rn (r, b, 2.237350511593569E-002); | | r = __fma_rn (r, b, 2.237350511593569E-002); | |
| r = __fma_rn (r, b, 3.038188875134962E-002); | | r = __fma_rn (r, b, 3.038188875134962E-002); | |
| r = __fma_rn (r, b, 4.464285849810986E-002); | | r = __fma_rn (r, b, 4.464285849810986E-002); | |
| r = __fma_rn (r, b, 7.499999998342270E-002); | | r = __fma_rn (r, b, 7.499999998342270E-002); | |
| r = __fma_rn (r, b, 1.666666666667375E-001); | | r = __fma_rn (r, b, 1.666666666667375E-001); | |
| r = r * b; | | r = r * b; | |
| return r; | | return r; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_asin(double a)) | | static __forceinline__ double asin(double a) | |
| { | | { | |
| double fa, t0, t1; | | double fa, t0, t1; | |
| int ihi, ahi; | | int ihi, ahi; | |
| ahi = __double2hiint(a); | | ahi = __double2hiint(a); | |
|
| fa = __cuda_fabs(a); | | fa = fabs(a); | |
| ihi = __double2hiint(fa); | | ihi = __double2hiint(fa); | |
| if (ihi < 0x3fe26666) { | | if (ihi < 0x3fe26666) { | |
| t1 = fa * fa; | | t1 = fa * fa; | |
| t1 = __internal_asin_kernel (fa, t1); | | t1 = __internal_asin_kernel (fa, t1); | |
| t1 = __fma_rn (t1, fa, fa); | | t1 = __fma_rn (t1, fa, fa); | |
| t1 = __internal_copysign_pos(t1, a); | | t1 = __internal_copysign_pos(t1, a); | |
| } else { | | } else { | |
| t1 = __fma_rn (-0.5, fa, 0.5); | | t1 = __fma_rn (-0.5, fa, 0.5); | |
|
| t0 = __cuda_sqrt (t1); | | t0 = sqrt (t1); | |
| t1 = __internal_asin_kernel (t0, t1); | | t1 = __internal_asin_kernel (t0, t1); | |
| t0 = -2.0 * t0; | | t0 = -2.0 * t0; | |
| t1 = __fma_rn (t0, t1, CUDART_PIO2_LO); | | t1 = __fma_rn (t0, t1, CUDART_PIO2_LO); | |
| t0 = t0 + CUDART_PIO4_HI; | | t0 = t0 + CUDART_PIO4_HI; | |
| t1 = t0 + t1; | | t1 = t0 + t1; | |
| t1 = t1 + CUDART_PIO4_HI; | | t1 = t1 + CUDART_PIO4_HI; | |
| if (ahi < 0x3ff00000) { | | if (ahi < 0x3ff00000) { | |
| t1 = __internal_copysign_pos(t1, a); | | t1 = __internal_copysign_pos(t1, a); | |
| } | | } | |
| } | | } | |
| return t1; | | return t1; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_acos(double a)) | | static __forceinline__ double acos(double a) | |
| { | | { | |
| double t0, t1; | | double t0, t1; | |
| int ihi, ahi; | | int ihi, ahi; | |
| | | | |
|
| #if !defined(__CUDABE__) | | | |
| if (__cuda___isnan(a)) { | | | |
| return a + a; | | | |
| } | | | |
| #endif | | | |
| ahi = __double2hiint(a); | | ahi = __double2hiint(a); | |
|
| t0 = __cuda_fabs (a); | | t0 = fabs (a); | |
| ihi = __double2hiint(t0); | | ihi = __double2hiint(t0); | |
| if (ihi < 0x3fe26666) { | | if (ihi < 0x3fe26666) { | |
| t1 = t0 * t0; | | t1 = t0 * t0; | |
| t1 = __internal_asin_kernel (t0, t1); | | t1 = __internal_asin_kernel (t0, t1); | |
| t0 = __fma_rn (t1, t0, t0); | | t0 = __fma_rn (t1, t0, t0); | |
| if ((unsigned)ahi >= (unsigned)0x80000000) { | | if ((unsigned)ahi >= (unsigned)0x80000000) { | |
| t0 = __fma_rn (1.0, t0, +CUDART_PIO2_LO); | | t0 = __fma_rn (1.0, t0, +CUDART_PIO2_LO); | |
| t0 = CUDART_PIO2_HI + t0; | | t0 = CUDART_PIO2_HI + t0; | |
| } else { | | } else { | |
| t0 = __fma_rn (1.0, t0, -CUDART_PIO2_LO); | | t0 = __fma_rn (1.0, t0, -CUDART_PIO2_LO); | |
| t0 = CUDART_PIO2_HI - t0; | | t0 = CUDART_PIO2_HI - t0; | |
| } | | } | |
| } else { | | } else { | |
| t1 = __fma_rn (-0.5, t0, 0.5); | | t1 = __fma_rn (-0.5, t0, 0.5); | |
|
| t0 = __cuda_sqrt(t1); | | t0 = sqrt(t1); | |
| t1 = __internal_asin_kernel (t0, t1); | | t1 = __internal_asin_kernel (t0, t1); | |
| t0 = __fma_rn (t1, t0, t0); | | t0 = __fma_rn (t1, t0, t0); | |
| t0 = 2.0 * t0; | | t0 = 2.0 * t0; | |
| if ((unsigned)ahi >= (unsigned)0x80000000) { | | if ((unsigned)ahi >= (unsigned)0x80000000) { | |
| t0 = __fma_rn (1.0, t0, -CUDART_PI_LO); | | t0 = __fma_rn (1.0, t0, -CUDART_PI_LO); | |
| t0 = CUDART_PI_HI - t0; | | t0 = CUDART_PI_HI - t0; | |
| } | | } | |
| } | | } | |
| return t0; | | return t0; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_acosh(double a)) | | static __forceinline__ double acosh(double a) | |
| { | | { | |
| double t; | | double t; | |
|
| #if !defined(__CUDABE__) | | | |
| if (__cuda___isnan(a)) { | | | |
| return a + a; | | | |
| } | | | |
| #endif | | | |
| t = a - 1.0; | | t = a - 1.0; | |
|
| if (__cuda_fabs(t) > CUDART_TWO_TO_52) { | | if (fabs(t) > CUDART_TWO_TO_52) { | |
| /* for large a, acosh = log(2*a) */ | | /* for large a, acosh = log(2*a) */ | |
|
| return CUDART_LN2 + __cuda_log(a); | | return CUDART_LN2 + log(a); | |
| } else { | | } else { | |
|
| t = t + __cuda_sqrt(__fma_rn(a, t, t)); | | t = t + sqrt(__fma_rn(a, t, t)); | |
| return __cuda_log1p(t); | | return log1p(t); | |
| } | | } | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_asinh(double a)) | | static __forceinline__ double asinh(double a) | |
| { | | { | |
| double fa, t; | | double fa, t; | |
|
| fa = __cuda_fabs(a); | | fa = fabs(a); | |
|   if (__double2hiint(fa) >= 0x5ff00000) { /* prevent intermediate overflow */ | |   if (__double2hiint(fa) >= 0x5ff00000) { /* prevent intermediate overflow */ | |
|
| t = CUDART_LN2 + __cuda_log(fa); | | t = CUDART_LN2 + log(fa); | |
| } else { | | } else { | |
| t = fa * fa; | | t = fa * fa; | |
|
| t = __cuda_log1p (fa + t / (1.0 + __cuda_sqrt(1.0 + t))); | | t = log1p (fa + t / (1.0 + sqrt(1.0 + t))); | |
| } | | } | |
| return __internal_copysign_pos(t, a); | | return __internal_copysign_pos(t, a); | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_atanh(double a)) | | static __forceinline__ double atanh(double a) | |
| { | | { | |
| double fa, t; | | double fa, t; | |
|
| #if !defined(__CUDABE__) | | fa = fabs(a); | |
| if (__cuda___isnan(a)) { | | | |
| return a + a; | | | |
| } | | | |
| #endif | | | |
| fa = __cuda_fabs(a); | | | |
| t = (2.0 * fa) / (1.0 - fa); | | t = (2.0 * fa) / (1.0 - fa); | |
|
| t = 0.5 * __cuda_log1p(t); | | t = 0.5 * log1p(t); | |
| #if !defined(__CUDABE__) | | if (__double2hiint(a) < 0) { | |
| if (__cuda___isnan(t)) { | | | |
| return t; | | | |
| } | | | |
| #endif | | | |
| if (__cuda___signbit(a)) { | | | |
| t = -t; | | t = -t; | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_hypot(double a, double b)) | | static __forceinline__ double hypot(double a, double b) | |
| { | | { | |
| double v, w, t, fa, fb; | | double v, w, t, fa, fb; | |
| | | | |
|
| fa = __cuda_fabs(a); | | fa = fabs(a); | |
| fb = __cuda_fabs(b); | | fb = fabs(b); | |
| v = __cuda_fmax(fa, fb); | | v = fmax(fa, fb); | |
| w = __cuda_fmin(fa, fb); | | w = fmin(fa, fb); | |
| t = w / v; | | t = w / v; | |
| t = __fma_rn (t, t, 1.0); | | t = __fma_rn (t, t, 1.0); | |
|
| t = v * __cuda_sqrt(t); | | t = v * sqrt(t); | |
| if (v == 0.0) { | | if (v == 0.0) { | |
| t = v + w; /* fixup for zero divide */ | | t = v + w; /* fixup for zero divide */ | |
| } | | } | |
| if ((!(fa <= CUDART_INF)) || (!(fb <= CUDART_INF))) { | | if ((!(fa <= CUDART_INF)) || (!(fb <= CUDART_INF))) { | |
| t = a + b; /* fixup for NaNs */ | | t = a + b; /* fixup for NaNs */ | |
| } | | } | |
| if (v == CUDART_INF) { | | if (v == CUDART_INF) { | |
| t = v + w; /* fixup for infinities */ | | t = v + w; /* fixup for infinities */ | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
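
hypot avoids spurious overflow and underflow in a*a + b*b by factoring out the larger magnitude, as the code's algebra shows:

\[ \sqrt{a^2 + b^2} \;=\; v\,\sqrt{1 + (w/v)^2}, \qquad v = \max(|a|,|b|), \quad w = \min(|a|,|b|), \]

so the quantity under the square root lies in [1, 2] and the only overflow left is the unavoidable one in the final product; the three trailing fixups then restore the IEEE special cases for zero, NaN and infinite inputs.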
| | | | |
|
| __device_func__(double __cuda_cbrt(double a)) | | static __forceinline__ double cbrt(double a) | |
| { | | { | |
| float s; | | float s; | |
| double t, r; | | double t, r; | |
| int ilo, ihi, expo, nexpo, denorm; | | int ilo, ihi, expo, nexpo, denorm; | |
|
| if ((a == 0.0) || !(__cuda___finite(a))) { | | if ((a == 0.0) || !(__finite(a))) { | |
| return a + a; | | return a + a; | |
| } | | } | |
|
| t = __cuda_fabs(a); | | t = fabs(a); | |
| ilo = __double2loint(t); | | ilo = __double2loint(t); | |
| ihi = __double2hiint(t); | | ihi = __double2hiint(t); | |
| expo = ((int)((unsigned int)ihi >> 20) & 0x7ff); | | expo = ((int)((unsigned int)ihi >> 20) & 0x7ff); | |
| denorm = 0; | | denorm = 0; | |
| if (expo == 0) { | | if (expo == 0) { | |
| /* denormal */ | | /* denormal */ | |
| t = t * CUDART_TWO_TO_54; | | t = t * CUDART_TWO_TO_54; | |
| denorm = 18; | | denorm = 18; | |
| ilo = __double2loint(t); | | ilo = __double2loint(t); | |
| ihi = __double2hiint(t); | | ihi = __double2hiint(t); | |
| expo = ((int)((unsigned int)ihi >> 20) & 0x7ff); | | expo = ((int)((unsigned int)ihi >> 20) & 0x7ff); | |
| } | | } | |
| /* scale into float range */ | | /* scale into float range */ | |
| nexpo = __float2int_rn(CUDART_THIRD_F * (float)(expo - 1022)); | | nexpo = __float2int_rn(CUDART_THIRD_F * (float)(expo - 1022)); | |
| ihi -= (3 * nexpo) << 20; | | ihi -= (3 * nexpo) << 20; | |
| r = __hiloint2double(ihi, ilo); | | r = __hiloint2double(ihi, ilo); | |
| /* initial approximation */ | | /* initial approximation */ | |
| s = (float)r; | | s = (float)r; | |
|
|   t = __cuda_exp2f(-CUDART_THIRD_F * __log2f(s));          /* approximate invcbrt */ | |   t = exp2f(-CUDART_THIRD_F * __log2f(s));                 /* approximate invcbrt */ | |
|   t = __fma_rn(__fma_rn(t*t,-r*t,1.0), CUDART_THIRD*t, t); /* refine invcbrt */ | |   t = __fma_rn(__fma_rn(t*t,-r*t,1.0), CUDART_THIRD*t, t); /* refine invcbrt */ | |
|   t = r * t * t;                                           /* approximate cbrt */ | |   t = r * t * t;                                           /* approximate cbrt */ | |
|   t = __fma_rn(t - (r / (t * t)), -CUDART_THIRD, t);       /* refine cbrt */ | |   t = __fma_rn(t - (r / (t * t)), -CUDART_THIRD, t);       /* refine cbrt */ | |
| /* scale result back into double range */ | | /* scale result back into double range */ | |
| ilo = __double2loint(t); | | ilo = __double2loint(t); | |
| ihi = __double2hiint(t); | | ihi = __double2hiint(t); | |
| ihi += (nexpo - denorm) << 20; | | ihi += (nexpo - denorm) << 20; | |
| t = __hiloint2double(ihi, ilo); | | t = __hiloint2double(ihi, ilo); | |
|
| if (__cuda___signbit(a)) { | | if (__double2hiint(a) < 0) { | |
| t = -t; | | t = -t; | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
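
Both refinement lines are Newton steps, as their comments say. Restated under exact arithmetic:

\[ t \;\leftarrow\; t + \tfrac{t}{3}\,(1 - r t^3) \quad \text{(inverse cube root, } f(t) = t^{-3} - r\text{)}, \qquad t \;\leftarrow\; t - \tfrac{1}{3}\Bigl(t - \frac{r}{t^2}\Bigr) \quad \text{(cube root, } f(t) = t^3 - r\text{)}, \]

each step roughly doubling the number of correct bits, so the float-precision exp2f/__log2f seed reaches full double accuracy after the two passes.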
| | | | |
|
| __device_func__(double __cuda_rcbrt(double a)) | | static __forceinline__ double rcbrt(double a) | |
| { | | { | |
| float s; | | float s; | |
| double t, r; | | double t, r; | |
| int ilo, ihi, expo, nexpo, denorm; | | int ilo, ihi, expo, nexpo, denorm; | |
|
| if ((a == 0.0) || !(__cuda___finite(a))) { | | if ((a == 0.0) || !(__finite(a))) { | |
| return 1.0 / a; | | return 1.0 / a; | |
| } | | } | |
|
| t = __cuda_fabs(a); | | t = fabs(a); | |
| ilo = __double2loint(t); | | ilo = __double2loint(t); | |
| ihi = __double2hiint(t); | | ihi = __double2hiint(t); | |
| expo = ((int)((unsigned int)ihi >> 20) & 0x7ff); | | expo = ((int)((unsigned int)ihi >> 20) & 0x7ff); | |
| denorm = 0; | | denorm = 0; | |
| if (expo == 0) { | | if (expo == 0) { | |
| /* denormal */ | | /* denormal */ | |
| t = t * CUDART_TWO_TO_54; | | t = t * CUDART_TWO_TO_54; | |
| denorm = 18; | | denorm = 18; | |
| ilo = __double2loint(t); | | ilo = __double2loint(t); | |
| ihi = __double2hiint(t); | | ihi = __double2hiint(t); | |
| expo = ((int)((unsigned int)ihi >> 20) & 0x7ff); | | expo = ((int)((unsigned int)ihi >> 20) & 0x7ff); | |
| } | | } | |
| /* scale into float range */ | | /* scale into float range */ | |
| nexpo = __float2int_rn(CUDART_THIRD_F * (float)(expo - 1022)); | | nexpo = __float2int_rn(CUDART_THIRD_F * (float)(expo - 1022)); | |
| ihi -= (3 * nexpo) << 20; | | ihi -= (3 * nexpo) << 20; | |
| r = __hiloint2double(ihi, ilo); | | r = __hiloint2double(ihi, ilo); | |
| /* initial approximation */ | | /* initial approximation */ | |
| s = (float)r; | | s = (float)r; | |
|
| t = __cuda_exp2f(-CUDART_THIRD_F * __log2f(s)); /* approximate invcbrt */ | | t = exp2f(-CUDART_THIRD_F * __log2f(s)); /* approximate invcbrt */ | |
| t = __fma_rn(__fma_rn(t*t,-r*t,1.0), CUDART_THIRD*t, t);/* refine invcbrt */ | | t = __fma_rn(__fma_rn(t*t,-r*t,1.0), CUDART_THIRD*t, t);/* refine invcbrt */ | |
| t = __fma_rn(__fma_rn(t*t,-r*t,1.0), CUDART_THIRD*t, t);/* refine invcbrt */ | | t = __fma_rn(__fma_rn(t*t,-r*t,1.0), CUDART_THIRD*t, t);/* refine invcbrt */ | |
| /* scale result back into double range */ | | /* scale result back into double range */ | |
| ilo = __double2loint(t); | | ilo = __double2loint(t); | |
| ihi = __double2hiint(t); | | ihi = __double2hiint(t); | |
| ihi += (-(nexpo - denorm)) << 20; | | ihi += (-(nexpo - denorm)) << 20; | |
| t = __hiloint2double(ihi, ilo); | | t = __hiloint2double(ihi, ilo); | |
|
| if (__cuda___signbit(a)) { | | if (__double2hiint(a) < 0) { | |
| t = -t; | | t = -t; | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| | | | |
|
| __device_func__(double __internal_accurate_pow(double a, double b)) | | static __forceinline__ double __internal_accurate_pow(double a, double b) | |
| { | | { | |
| double2 loga; | | double2 loga; | |
| double2 prod; | | double2 prod; | |
| double t_hi, t_lo; | | double t_hi, t_lo; | |
| double tmp; | | double tmp; | |
|
| #if !defined(__CUDABE__) && defined(__linux__) && !defined(__LP64__) | | | |
| volatile | | | |
| #endif | | | |
| double e; | | double e; | |
| | | | |
| /* compute log(a) in double-double format*/ | | /* compute log(a) in double-double format*/ | |
| loga = __internal_log_ext_prec(a); | | loga = __internal_log_ext_prec(a); | |
| | | | |
| /* prevent overflow during extended precision multiply */ | | /* prevent overflow during extended precision multiply */ | |
|
| if (__cuda_fabs(b) > 1e304) b *= 1.220703125e-4; | | if (fabs(b) > 1e304) b *= 1.220703125e-4; | |
| /* compute b * log(a) in double-double format */ | | /* compute b * log(a) in double-double format */ | |
| t_hi = loga.y * b; | | t_hi = loga.y * b; | |
| t_lo = __fma_rn (loga.y, b, -t_hi); | | t_lo = __fma_rn (loga.y, b, -t_hi); | |
| t_lo = __fma_rn (loga.x, b, t_lo); | | t_lo = __fma_rn (loga.x, b, t_lo); | |
| prod.y = e = t_hi + t_lo; | | prod.y = e = t_hi + t_lo; | |
| prod.x = (t_hi - e) + t_lo; | | prod.x = (t_hi - e) + t_lo; | |
| | | | |
| /* compute pow(a,b) = exp(b*log(a)) */ | | /* compute pow(a,b) = exp(b*log(a)) */ | |
|
| tmp = __cuda_exp(prod.y); | | tmp = exp(prod.y); | |
| /* prevent -INF + INF = NaN */ | | /* prevent -INF + INF = NaN */ | |
|
| if (!__cuda___isinf(tmp)) { | | if (!__isinf(tmp)) { | |
| /* if prod.x is much smaller than prod.y, then exp(prod.y + prod.x) ~= | | /* if prod.x is much smaller than prod.y, then exp(prod.y + prod.x) ~= | |
| * exp(prod.y) + prod.x * exp(prod.y) | | * exp(prod.y) + prod.x * exp(prod.y) | |
| */ | | */ | |
| tmp = __fma_rn (tmp, prod.x, tmp); | | tmp = __fma_rn (tmp, prod.x, tmp); | |
| } | | } | |
| return tmp; | | return tmp; | |
| } | | } | |
| | | | |
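
The two __fma_rn lines above are the standard trick for splitting a product into a rounded head and an exact error term: t_lo = __fma_rn(loga.y, b, -t_hi) recovers exactly the rounding error of t_hi = loga.y * b, so the pair (t_hi, t_lo) carries b*log(a) to roughly double-double precision. A small self-contained C illustration of the splitting (the input values are arbitrary examples):

    #include <math.h>
    #include <stdio.h>

    /* Sketch: exact product splitting with fma, the pattern used above
       to form b*log(a) in head/tail (double-double) arithmetic. */
    int main(void)
    {
        double y = 0.6931471805599453;  /* example: head of log(a) */
        double b = 3.141592653589793;   /* example exponent */
        double hi = y * b;              /* rounded product */
        double lo = fma(y, b, -hi);     /* exact rounding error of hi */
        /* hi + lo equals y*b exactly; lo carries the bits hi lost */
        printf("hi=%.17g lo=%.17g\n", hi, lo);
        return 0;
    }
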
|
| __device_func__(double __cuda_pow(double a, double b)) | | static __forceinline__ double pow(double a, double b) | |
| { | | { | |
| int bIsOddInteger; | | int bIsOddInteger; | |
| double t; | | double t; | |
| | | | |
| if (a == 1.0 || b == 0.0) { | | if (a == 1.0 || b == 0.0) { | |
| return 1.0; | | return 1.0; | |
| } | | } | |
|
| if (__cuda___isnan(a) || __cuda___isnan(b)) { | | if (__isnan(a) || __isnan(b)) { | |
| return a + b; | | return a + b; | |
| } | | } | |
| if (a == CUDART_INF) { | | if (a == CUDART_INF) { | |
|
| return __cuda___signbit(b) ? CUDART_ZERO : CUDART_INF; | | return (__double2hiint(b) < 0) ? CUDART_ZERO : CUDART_INF; | |
| } | | } | |
|
| if (__cuda___isinf(b)) { | | if (__isinf(b)) { | |
| if (a == -1.0) { | | if (a == -1.0) { | |
| return 1.0; | | return 1.0; | |
| } | | } | |
|
| t = __cuda_fabs(a) > 1.0 ? CUDART_INF : CUDART_ZERO; | | t = fabs(a) > 1.0 ? CUDART_INF : CUDART_ZERO; | |
| if (b < CUDART_ZERO) { | | if (b < CUDART_ZERO) { | |
| t = 1.0 / t; | | t = 1.0 / t; | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
|
| bIsOddInteger = __cuda_fabs(b - (2.0f * __cuda_trunc(0.5 * b))) == 1.0; | | bIsOddInteger = fabs(b - (2.0f * trunc(0.5 * b))) == 1.0; | |
| if (a == CUDART_ZERO) { | | if (a == CUDART_ZERO) { | |
| t = bIsOddInteger ? a : CUDART_ZERO; | | t = bIsOddInteger ? a : CUDART_ZERO; | |
| if (b < CUDART_ZERO) { | | if (b < CUDART_ZERO) { | |
| t = 1.0 / t; | | t = 1.0 / t; | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| if (a == -CUDART_INF) { | | if (a == -CUDART_INF) { | |
| t = (b < CUDART_ZERO) ? -1.0/a : -a; | | t = (b < CUDART_ZERO) ? -1.0/a : -a; | |
| if (bIsOddInteger) { | | if (bIsOddInteger) { | |
| t = __longlong_as_double(__double_as_longlong(t)^0x8000000000000000ULL); | | t = __longlong_as_double(__double_as_longlong(t)^0x8000000000000000ULL); | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
|
| if ((a < CUDART_ZERO) && (b != __cuda_trunc(b))) { | | if ((a < CUDART_ZERO) && (b != trunc(b))) { | |
| return CUDART_NAN; | | return CUDART_NAN; | |
| } | | } | |
|
| t = __cuda_fabs(a); | | t = fabs(a); | |
| t = __internal_accurate_pow(t, b); | | t = __internal_accurate_pow(t, b); | |
| if ((a < CUDART_ZERO) && bIsOddInteger) { | | if ((a < CUDART_ZERO) && bIsOddInteger) { | |
| t = __longlong_as_double(__double_as_longlong(t) ^ 0x8000000000000000ULL); | | t = __longlong_as_double(__double_as_longlong(t) ^ 0x8000000000000000ULL); | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| | | | |
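
The ladder of early returns above encodes the IEEE-754/C99 special cases of pow before the accurate path runs. A hypothetical host-side check of those same cases against libm (my test, not part of the header; it assumes an IEEE-754 platform implementing C99 Annex F):

    #include <assert.h>
    #include <math.h>

    /* Sketch: the special cases handled by the ladder above. */
    int main(void)
    {
        assert(pow(1.0, NAN) == 1.0);        /* a == 1.0 always yields 1.0 */
        assert(pow(NAN, 0.0) == 1.0);        /* b == 0.0 always yields 1.0 */
        assert(pow(-1.0, INFINITY) == 1.0);  /* |a| == 1 with infinite b */
        assert(pow(0.0, -3.0) == INFINITY);  /* 1/t with t a signed zero */
        assert(isnan(pow(-2.0, 0.5)));       /* negative base, non-integer b */
        return 0;
    }
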
|
| __device_func__(double __cuda_erf(double a)) | | static __forceinline__ double erf(double a) | |
| { | | { | |
| double t, r, q; | | double t, r, q; | |
| | | | |
|
| t = __cuda_fabs(a); | | t = fabs(a); | |
| if (t >= 1.0) { | | if (t >= 1.0) { | |
| r = -1.28836351230756500E-019; | | r = -1.28836351230756500E-019; | |
| r = __fma_rn (r, t, 1.30597472161093370E-017); | | r = __fma_rn (r, t, 1.30597472161093370E-017); | |
| r = __fma_rn (r, t, -6.33924401259620500E-016); | | r = __fma_rn (r, t, -6.33924401259620500E-016); | |
| r = __fma_rn (r, t, 1.96231865908940140E-014); | | r = __fma_rn (r, t, 1.96231865908940140E-014); | |
| r = __fma_rn (r, t, -4.35272243559990750E-013); | | r = __fma_rn (r, t, -4.35272243559990750E-013); | |
| r = __fma_rn (r, t, 7.37083927929352150E-012); | | r = __fma_rn (r, t, 7.37083927929352150E-012); | |
| r = __fma_rn (r, t, -9.91402142550461630E-011); | | r = __fma_rn (r, t, -9.91402142550461630E-011); | |
| r = __fma_rn (r, t, 1.08817017167760820E-009); | | r = __fma_rn (r, t, 1.08817017167760820E-009); | |
| r = __fma_rn (r, t, -9.93918713097634620E-009); | | r = __fma_rn (r, t, -9.93918713097634620E-009); | |
| | | | |
| skipping to change at line 1474 | | skipping to change at line 1355 | |
| r = __fma_rn (r, q, 5.22397760611847340E-003); | | r = __fma_rn (r, q, 5.22397760611847340E-003); | |
| r = __fma_rn (r, q, -2.68661706431114690E-002); | | r = __fma_rn (r, q, -2.68661706431114690E-002); | |
| r = __fma_rn (r, q, 1.12837916709441850E-001); | | r = __fma_rn (r, q, 1.12837916709441850E-001); | |
| r = __fma_rn (r, q, -3.76126389031835210E-001); | | r = __fma_rn (r, q, -3.76126389031835210E-001); | |
| r = __fma_rn (r, q, 1.12837916709551260E+000); | | r = __fma_rn (r, q, 1.12837916709551260E+000); | |
| a = r * a; | | a = r * a; | |
| } | | } | |
| return a; | | return a; | |
| } | | } | |
| | | | |
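
The long __fma_rn chains in erf above (and in the functions that follow) are Horner evaluations of minimax polynomials: one fused multiply-add per coefficient, so each step incurs a single rounding. A generic sketch of the pattern, with a hypothetical helper name:

    #include <math.h>

    /* Sketch: Horner polynomial evaluation with fused multiply-add,
       the pattern behind the __fma_rn chains above. coeffs[0] is the
       coefficient of the highest power. */
    static double horner_fma(const double *coeffs, int n, double x)
    {
        double r = coeffs[0];
        for (int i = 1; i < n; i++) {
            r = fma(r, x, coeffs[i]);  /* r = r*x + coeffs[i], one rounding */
        }
        return r;
    }
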
|
| __device_func__(double __cuda_erfinv(double a)) | | static __forceinline__ double erfinv(double a) | |
| { | | { | |
| double fa, t; | | double fa, t; | |
| | | | |
|
| fa = __cuda_fabs(a); | | fa = fabs(a); | |
| if (fa >= 1.0) { | | if (fa >= 1.0) { | |
| t = CUDART_NAN; /* NaN */ | | t = CUDART_NAN; /* NaN */ | |
| if (fa == 1.0) { | | if (fa == 1.0) { | |
| t = a * CUDART_INF; /* Infinity */ | | t = a * CUDART_INF; /* Infinity */ | |
| } | | } | |
| } else if (fa >= 0.9375) { | | } else if (fa >= 0.9375) { | |
| /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev | | /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev | |
| Approximations for the Inverse of the Error Function. Mathematics of | | Approximations for the Inverse of the Error Function. Mathematics of | |
| Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 59 | | Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 59 | |
| */ | | */ | |
| double p, q; | | double p, q; | |
| | | | |
|
| t = __cuda_log1p(-fa); | | t = log1p(-fa); | |
| t = __cuda_rsqrt(-t); | | t = rsqrt(-t); | |
| p = 2.7834010353747001060e-3; | | p = 2.7834010353747001060e-3; | |
| p = __fma_rn (p, t, 8.6030097526280260580e-1); | | p = __fma_rn (p, t, 8.6030097526280260580e-1); | |
| p = __fma_rn (p, t, 2.1371214997265515515e+0); | | p = __fma_rn (p, t, 2.1371214997265515515e+0); | |
| p = __fma_rn (p, t, 3.1598519601132090206e+0); | | p = __fma_rn (p, t, 3.1598519601132090206e+0); | |
| p = __fma_rn (p, t, 3.5780402569085996758e+0); | | p = __fma_rn (p, t, 3.5780402569085996758e+0); | |
| p = __fma_rn (p, t, 1.5335297523989890804e+0); | | p = __fma_rn (p, t, 1.5335297523989890804e+0); | |
| p = __fma_rn (p, t, 3.4839207139657522572e-1); | | p = __fma_rn (p, t, 3.4839207139657522572e-1); | |
| p = __fma_rn (p, t, 5.3644861147153648366e-2); | | p = __fma_rn (p, t, 5.3644861147153648366e-2); | |
| p = __fma_rn (p, t, 4.3836709877126095665e-3); | | p = __fma_rn (p, t, 4.3836709877126095665e-3); | |
| p = __fma_rn (p, t, 1.3858518113496718808e-4); | | p = __fma_rn (p, t, 1.3858518113496718808e-4); | |
| | | | |
| skipping to change at line 1571 | | skipping to change at line 1452 | |
| q = __fma_rn (q, t, .59039348134843665626e+4); | | q = __fma_rn (q, t, .59039348134843665626e+4); | |
| q = __fma_rn (q, t, -.48481635430048872102e+4); | | q = __fma_rn (q, t, -.48481635430048872102e+4); | |
| q = __fma_rn (q, t, .18997769186453057810e+4); | | q = __fma_rn (q, t, .18997769186453057810e+4); | |
| q = __fma_rn (q, t, -.28386514725366621129e+3); | | q = __fma_rn (q, t, -.28386514725366621129e+3); | |
| p = p / q; | | p = p / q; | |
| t = a * p; | | t = a * p; | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| | | | |
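
In the tail branch above, the approximation variable is derived from the logarithmic distance to the singularity at |a| = 1; per the cited Blair, Edwards and Johnson scheme, the computation has (sketching in LaTeX) the shape

    t = \frac{1}{\sqrt{-\ln(1-\lvert a\rvert)}}, \qquad
    \operatorname{erfinv}(a) \approx a\,\frac{p(t)}{q(t)},

where p and q are the fixed-degree polynomials evaluated by the fused multiply-add chains.
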
|
| __device_func__(double __cuda_erfcinv(double a)) | | static __forceinline__ double erfcinv(double a) | |
| { | | { | |
| double t; | | double t; | |
|
| #if !defined(__CUDABE__) | | | |
| if (__cuda___isnan(a)) return a + a; | | | |
| #endif | | | |
| if (a <= CUDART_ZERO) { | | if (a <= CUDART_ZERO) { | |
| t = CUDART_NAN; | | t = CUDART_NAN; | |
| if (a == CUDART_ZERO) { | | if (a == CUDART_ZERO) { | |
| t = (1.0 - a) * CUDART_INF; | | t = (1.0 - a) * CUDART_INF; | |
| } | | } | |
| } | | } | |
| else if (a >= 0.0625) { | | else if (a >= 0.0625) { | |
|
| t = __cuda_erfinv (1.0 - a); | | t = erfinv (1.0 - a); | |
| } | | } | |
| else if (a >= 1e-100) { | | else if (a >= 1e-100) { | |
| /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev | | /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev | |
| Approximations for the Inverse of the Error Function. Mathematics of | | Approximations for the Inverse of the Error Function. Mathematics of | |
| Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 59 | | Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 59 | |
| */ | | */ | |
| double p, q; | | double p, q; | |
|
| t = __cuda_log(a); | | t = log(a); | |
| t = __cuda_rsqrt(-t); | | t = rsqrt(-t); | |
| p = 2.7834010353747001060e-3; | | p = 2.7834010353747001060e-3; | |
| p = __fma_rn (p, t, 8.6030097526280260580e-1); | | p = __fma_rn (p, t, 8.6030097526280260580e-1); | |
| p = __fma_rn (p, t, 2.1371214997265515515e+0); | | p = __fma_rn (p, t, 2.1371214997265515515e+0); | |
| p = __fma_rn (p, t, 3.1598519601132090206e+0); | | p = __fma_rn (p, t, 3.1598519601132090206e+0); | |
| p = __fma_rn (p, t, 3.5780402569085996758e+0); | | p = __fma_rn (p, t, 3.5780402569085996758e+0); | |
| p = __fma_rn (p, t, 1.5335297523989890804e+0); | | p = __fma_rn (p, t, 1.5335297523989890804e+0); | |
| p = __fma_rn (p, t, 3.4839207139657522572e-1); | | p = __fma_rn (p, t, 3.4839207139657522572e-1); | |
| p = __fma_rn (p, t, 5.3644861147153648366e-2); | | p = __fma_rn (p, t, 5.3644861147153648366e-2); | |
| p = __fma_rn (p, t, 4.3836709877126095665e-3); | | p = __fma_rn (p, t, 4.3836709877126095665e-3); | |
| p = __fma_rn (p, t, 1.3858518113496718808e-4); | | p = __fma_rn (p, t, 1.3858518113496718808e-4); | |
| | | | |
| skipping to change at line 1623 | | skipping to change at line 1501 | |
| q = __fma_rn (q, t, 1.3858762165532246059e-4); | | q = __fma_rn (q, t, 1.3858762165532246059e-4); | |
| q = __fma_rn (q, t, 1.1738313872397777529e-6); | | q = __fma_rn (q, t, 1.1738313872397777529e-6); | |
| t = p / (q * t); | | t = p / (q * t); | |
| } | | } | |
| else { | | else { | |
| /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev | | /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev | |
| Approximations for the Inverse of the Error Function. Mathematics of | | Approximations for the Inverse of the Error Function. Mathematics of | |
| Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 82 | | Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 82 | |
| */ | | */ | |
| double p, q; | | double p, q; | |
|
| t = __cuda_log(a); | | t = log(a); | |
| t = __cuda_rsqrt(-t); | | t = rsqrt(-t); | |
| p = 6.9952990607058154858e-1; | | p = 6.9952990607058154858e-1; | |
| p = __fma_rn (p, t, 1.9507620287580568829e+0); | | p = __fma_rn (p, t, 1.9507620287580568829e+0); | |
| p = __fma_rn (p, t, 8.2810030904462690216e-1); | | p = __fma_rn (p, t, 8.2810030904462690216e-1); | |
| p = __fma_rn (p, t, 1.1279046353630280005e-1); | | p = __fma_rn (p, t, 1.1279046353630280005e-1); | |
| p = __fma_rn (p, t, 6.0537914739162189689e-3); | | p = __fma_rn (p, t, 6.0537914739162189689e-3); | |
| p = __fma_rn (p, t, 1.3714329569665128933e-4); | | p = __fma_rn (p, t, 1.3714329569665128933e-4); | |
| p = __fma_rn (p, t, 1.2964481560643197452e-6); | | p = __fma_rn (p, t, 1.2964481560643197452e-6); | |
| p = __fma_rn (p, t, 4.6156006321345332510e-9); | | p = __fma_rn (p, t, 4.6156006321345332510e-9); | |
| p = __fma_rn (p, t, 4.5344689563209398450e-12); | | p = __fma_rn (p, t, 4.5344689563209398450e-12); | |
| q = t+ 1.5771922386662040546e+0; | | q = t+ 1.5771922386662040546e+0; | |
| | | | |
| skipping to change at line 1648 | | skipping to change at line 1526 | |
| q = __fma_rn (q, t, 6.0574830550097140404e-3); | | q = __fma_rn (q, t, 6.0574830550097140404e-3); | |
| q = __fma_rn (q, t, 1.3715891988350205065e-4); | | q = __fma_rn (q, t, 1.3715891988350205065e-4); | |
| q = __fma_rn (q, t, 1.2964671850944981713e-6); | | q = __fma_rn (q, t, 1.2964671850944981713e-6); | |
| q = __fma_rn (q, t, 4.6156017600933592558e-9); | | q = __fma_rn (q, t, 4.6156017600933592558e-9); | |
| q = __fma_rn (q, t, 4.5344687377088206783e-12); | | q = __fma_rn (q, t, 4.5344687377088206783e-12); | |
| t = p / (q * t); | | t = p / (q * t); | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_erfc(double a)) | | static __forceinline__ double erfc(double a) | |
| { | | { | |
| double p, q, h, l; | | double p, q, h, l; | |
| int ahi; | | int ahi; | |
| | | | |
| ahi = __double2hiint(a); | | ahi = __double2hiint(a); | |
| if (ahi < (int)0x3fea0400) { /* 1665/2048 */ | | if (ahi < (int)0x3fea0400) { /* 1665/2048 */ | |
|
| return 1.0 - __cuda_erf(a); | | return 1.0 - erf(a); | |
| } | | } | |
| if (ahi < (int)0x40140000) { /* 5.0 */ | | if (ahi < (int)0x40140000) { /* 5.0 */ | |
| /* On the interval [1665/2048, 5.0] the following approximation is used
: | | /* On the interval [1665/2048, 5.0] the following approximation is used
: | |
| erfc(a) = (1.0 + 1/a * r(1/a)) * 1/a * 0.5 * exp(-a*a), where the ra
nge | | erfc(a) = (1.0 + 1/a * r(1/a)) * 1/a * 0.5 * exp(-a*a), where the ra
nge | |
| of r(1/a) is approximately [-0.17, 0.11]. r(1/a) is computed by rati
onal | | of r(1/a) is approximately [-0.17, 0.11]. r(1/a) is computed by rati
onal | |
| approximation. | | approximation. | |
| */ | | */ | |
| double t; | | double t; | |
| | | | |
|
| t = 1.0 / a; | | t = __internal_fast_rcp(a); | |
| p = -1.0000000252849461E+000; | | p = -1.0000000252849461E+000; | |
| p = __fma_rn (p, t, -7.3398971987771156E-001); | | p = __fma_rn (p, t, -7.3398971987771156E-001); | |
| p = __fma_rn (p, t, -1.4685633784433072E-001); | | p = __fma_rn (p, t, -1.4685633784433072E-001); | |
| p = __fma_rn (p, t, 1.2963557011001836E-001); | | p = __fma_rn (p, t, 1.2963557011001836E-001); | |
| p = __fma_rn (p, t, 1.0901177826674287E-001); | | p = __fma_rn (p, t, 1.0901177826674287E-001); | |
| p = __fma_rn (p, t, 3.9250612663155882E-002); | | p = __fma_rn (p, t, 3.9250612663155882E-002); | |
| p = __fma_rn (p, t, 7.5883167167654269E-003); | | p = __fma_rn (p, t, 7.5883167167654269E-003); | |
| p = __fma_rn (p, t, 6.6438196820856965E-004); | | p = __fma_rn (p, t, 6.6438196820856965E-004); | |
| q = t + 2.7339900293714838E+000; | | q = t + 2.7339900293714838E+000; | |
| q = __fma_rn (q, t, 3.3580762542361291E+000); | | q = __fma_rn (q, t, 3.3580762542361291E+000); | |
| q = __fma_rn (q, t, 2.4165688909166021E+000); | | q = __fma_rn (q, t, 2.4165688909166021E+000); | |
| q = __fma_rn (q, t, 1.1092158770004934E+000); | | q = __fma_rn (q, t, 1.1092158770004934E+000); | |
| q = __fma_rn (q, t, 3.2845571970789467E-001); | | q = __fma_rn (q, t, 3.2845571970789467E-001); | |
| q = __fma_rn (q, t, 5.9110343116276186E-002); | | q = __fma_rn (q, t, 5.9110343116276186E-002); | |
| q = __fma_rn (q, t, 5.1750858802842702E-003); | | q = __fma_rn (q, t, 5.1750858802842702E-003); | |
| q = __fma_rn (q, t, 1.2937416364002241E-009); | | q = __fma_rn (q, t, 1.2937416364002241E-009); | |
|
| q = 1.0 / q; | | q = __internal_fast_rcp(q); | |
| p = p * q; | | p = p * q; | |
| p = p * t; | | p = p * t; | |
| h = a * a; | | h = a * a; | |
| l = __fma_rn (a, a, -h); | | l = __fma_rn (a, a, -h); | |
| q = __internal_exp_kernel(-h, -1); | | q = __internal_exp_kernel(-h, -1); | |
| q = __fma_rn (l, -q, q); | | q = __fma_rn (l, -q, q); | |
| p = __fma_rn (p, q, q); | | p = __fma_rn (p, q, q); | |
| p = p * t; | | p = p * t; | |
| } else { | | } else { | |
| /* max error 4 ulps on [5, 27.3] */ | | /* max error 4 ulps on [5, 27.3] */ | |
| double ooa, ooasq; | | double ooa, ooasq; | |
| | | | |
|
| ooa = 1.0 / a; | | ooa = __internal_fast_rcp(a); | |
| ooasq = ooa * ooa; | | ooasq = ooa * ooa; | |
| p = -4.0025406686930527E+005; | | p = -4.0025406686930527E+005; | |
| p = __fma_rn (p, ooasq, 1.4420582543942123E+005); | | p = __fma_rn (p, ooasq, 1.4420582543942123E+005); | |
| p = __fma_rn (p, ooasq, -2.7664185780951841E+004); | | p = __fma_rn (p, ooasq, -2.7664185780951841E+004); | |
| p = __fma_rn (p, ooasq, 4.1144611644767283E+003); | | p = __fma_rn (p, ooasq, 4.1144611644767283E+003); | |
| p = __fma_rn (p, ooasq, -5.8706000519209351E+002); | | p = __fma_rn (p, ooasq, -5.8706000519209351E+002); | |
| p = __fma_rn (p, ooasq, 9.1490086446323375E+001); | | p = __fma_rn (p, ooasq, 9.1490086446323375E+001); | |
| p = __fma_rn (p, ooasq, -1.6659491387740221E+001); | | p = __fma_rn (p, ooasq, -1.6659491387740221E+001); | |
| p = __fma_rn (p, ooasq, 3.7024804085481784E+000); | | p = __fma_rn (p, ooasq, 3.7024804085481784E+000); | |
| p = __fma_rn (p, ooasq, -1.0578553994424316E+000); | | p = __fma_rn (p, ooasq, -1.0578553994424316E+000); | |
| | | | |
| skipping to change at line 1723 | | skipping to change at line 1601 | |
| p = p * ooa; | | p = p * ooa; | |
| p = p * q; | | p = p * q; | |
| if (a > 27.3) { | | if (a > 27.3) { | |
| p = 0.0; | | p = 0.0; | |
| } | | } | |
| } | | } | |
| return p; | | return p; | |
| } | | } | |
| | | | |
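
One detail worth noting in the mid-range branch above: exp(-a*a) is computed with a compensation term, because the rounding error of squaring a would otherwise be amplified by the exponential. A minimal C sketch of that sub-step (the helper name is mine):

    #include <math.h>

    /* Sketch: exp(-a*a) with compensation for the squaring error,
       as in the h/l/q sequence above. */
    static double exp_neg_square(double a)
    {
        double h = a * a;          /* rounded square */
        double l = fma(a, a, -h);  /* exact error of the square */
        double q = exp(-h);
        /* exp(-(h+l)) ~= exp(-h)*(1 - l) = q - l*q for tiny l */
        return fma(l, -q, q);
    }
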
| /* approximate 1.0/(a*gamma(a)) on [-0.5,0.5] */ | | /* approximate 1.0/(a*gamma(a)) on [-0.5,0.5] */ | |
|
| __device_func__(double __internal_tgamma_kernel(double a)) | | static __forceinline__ double __internal_tgamma_kernel(double a) | |
| { | | { | |
| double t; | | double t; | |
| t = -4.42689340712524750E-010; | | t = -4.42689340712524750E-010; | |
| t = __fma_rn (t, a, -2.02665918466589540E-007); | | t = __fma_rn (t, a, -2.02665918466589540E-007); | |
| t = __fma_rn (t, a, 1.13812117211195270E-006); | | t = __fma_rn (t, a, 1.13812117211195270E-006); | |
| t = __fma_rn (t, a, -1.25077348166307480E-006); | | t = __fma_rn (t, a, -1.25077348166307480E-006); | |
| t = __fma_rn (t, a, -2.01365017404087710E-005); | | t = __fma_rn (t, a, -2.01365017404087710E-005); | |
| t = __fma_rn (t, a, 1.28050126073544860E-004); | | t = __fma_rn (t, a, 1.28050126073544860E-004); | |
| t = __fma_rn (t, a, -2.15241408115274180E-004); | | t = __fma_rn (t, a, -2.15241408115274180E-004); | |
| t = __fma_rn (t, a, -1.16516754597046040E-003); | | t = __fma_rn (t, a, -1.16516754597046040E-003); | |
| | | | |
| skipping to change at line 1746 | | skipping to change at line 1624 | |
| t = __fma_rn (t, a, -4.21977345547223940E-002); | | t = __fma_rn (t, a, -4.21977345547223940E-002); | |
| t = __fma_rn (t, a, 1.66538611382503560E-001); | | t = __fma_rn (t, a, 1.66538611382503560E-001); | |
| t = __fma_rn (t, a, -4.20026350341054440E-002); | | t = __fma_rn (t, a, -4.20026350341054440E-002); | |
| t = __fma_rn (t, a, -6.55878071520257120E-001); | | t = __fma_rn (t, a, -6.55878071520257120E-001); | |
| t = __fma_rn (t, a, 5.77215664901532870E-001); | | t = __fma_rn (t, a, 5.77215664901532870E-001); | |
| t = __fma_rn (t, a, 1.00000000000000000E+000); | | t = __fma_rn (t, a, 1.00000000000000000E+000); | |
| return t; | | return t; | |
| } | | } | |
| | | | |
| /* Stirling approximation for gamma(a), a > 20 */ | | /* Stirling approximation for gamma(a), a > 20 */ | |
|
| __device_func__(double __internal_stirling_poly(double a)) | | static __forceinline__ double __internal_stirling_poly(double a) | |
| { | | { | |
|
| double x = 1.0 / a; | | double x = __internal_fast_rcp(a); | |
| double z = 0.0; | | double z = 0.0; | |
| z = __fma_rn (z, x, 8.3949872067208726e-004); | | z = __fma_rn (z, x, 8.3949872067208726e-004); | |
| z = __fma_rn (z, x, -5.1717909082605919e-005); | | z = __fma_rn (z, x, -5.1717909082605919e-005); | |
| z = __fma_rn (z, x, -5.9216643735369393e-004); | | z = __fma_rn (z, x, -5.9216643735369393e-004); | |
| z = __fma_rn (z, x, 6.9728137583658571e-005); | | z = __fma_rn (z, x, 6.9728137583658571e-005); | |
| z = __fma_rn (z, x, 7.8403922172006662e-004); | | z = __fma_rn (z, x, 7.8403922172006662e-004); | |
| z = __fma_rn (z, x, -2.2947209362139917e-004); | | z = __fma_rn (z, x, -2.2947209362139917e-004); | |
| z = __fma_rn (z, x, -2.6813271604938273e-003); | | z = __fma_rn (z, x, -2.6813271604938273e-003); | |
| z = __fma_rn (z, x, 3.4722222222222220e-003); | | z = __fma_rn (z, x, 3.4722222222222220e-003); | |
| z = __fma_rn (z, x, 8.3333333333333329e-002); | | z = __fma_rn (z, x, 8.3333333333333329e-002); | |
| z = __fma_rn (z, x, 1.0000000000000000e+000); | | z = __fma_rn (z, x, 1.0000000000000000e+000); | |
| return z; | | return z; | |
| } | | } | |
| | | | |
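
The polynomial in 1/a above is the correction factor of Stirling's formula; its trailing coefficients 1/12 = 8.3333...e-2 and 1/288 = 3.4722...e-3 are visible among the constants. In LaTeX:

    \Gamma(a) \approx \sqrt{2\pi}\; a^{\,a-\frac{1}{2}}\, e^{-a}
    \left(1 + \frac{1}{12a} + \frac{1}{288a^{2}} - \frac{139}{51840a^{3}} - \cdots\right),

with __internal_stirling_poly returning the parenthesized series.
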
|
| __device_func__(double __internal_tgamma_stirling(double a)) | | static __forceinline__ double __internal_tgamma_stirling(double a) | |
| { | | { | |
| if (a < 1.7162437695630274e+002) { | | if (a < 1.7162437695630274e+002) { | |
|
| #if defined(__GNUC__) && !defined(__CUDABE__) | | | |
| volatile | | | |
| #endif | | | |
| double t_hi, t_lo, e; | | double t_hi, t_lo, e; | |
| | | | |
| double2 loga, prod; | | double2 loga, prod; | |
| double z = __internal_stirling_poly (a); | | double z = __internal_stirling_poly (a); | |
| double b = a - 0.5; | | double b = a - 0.5; | |
| | | | |
| /* compute log(a) in double-double format*/ | | /* compute log(a) in double-double format*/ | |
| loga = __internal_log_ext_prec(a); | | loga = __internal_log_ext_prec(a); | |
| | | | |
| /* compute (a - 0.5) * log(a) in double-double format */ | | /* compute (a - 0.5) * log(a) in double-double format */ | |
| | | | |
| skipping to change at line 1791 | | skipping to change at line 1666 | |
| t_lo = __fma_rn (loga.x, b, t_lo); | | t_lo = __fma_rn (loga.x, b, t_lo); | |
| prod.y = e = t_hi + t_lo; | | prod.y = e = t_hi + t_lo; | |
| prod.x = (t_hi - e) + t_lo; | | prod.x = (t_hi - e) + t_lo; | |
| | | | |
| /* compute (a - 0.5) * log(a) - a in double-double format */ | | /* compute (a - 0.5) * log(a) - a in double-double format */ | |
| loga.y = -a; | | loga.y = -a; | |
| loga.x = 0.0; | | loga.x = 0.0; | |
| prod = __internal_ddadd_xgty (prod, loga); | | prod = __internal_ddadd_xgty (prod, loga); | |
| | | | |
| /* compute pow(a,b) = exp(b*log(a)) */ | | /* compute pow(a,b) = exp(b*log(a)) */ | |
|
| a = __cuda_exp(prod.y); | | a = exp(prod.y); | |
| /* prevent -INF + INF = NaN */ | | /* prevent -INF + INF = NaN */ | |
|
| if (!__cuda___isinf(a)) { | | if (!__isinf(a)) { | |
| /* if prod.x is much smaller than prod.y, then exp(prod.y + prod.x) ~
= | | /* if prod.x is much smaller than prod.y, then exp(prod.y + prod.x) ~
= | |
| * exp(prod.y) + prod.x * exp(prod.y) | | * exp(prod.y) + prod.x * exp(prod.y) | |
| */ | | */ | |
| a = __fma_rn (a, prod.x, a); | | a = __fma_rn (a, prod.x, a); | |
| } | | } | |
| a = __fma_rn (a, CUDART_SQRT_2PI_HI, a * CUDART_SQRT_2PI_LO); | | a = __fma_rn (a, CUDART_SQRT_2PI_HI, a * CUDART_SQRT_2PI_LO); | |
| return a * z; | | return a * z; | |
| } else { | | } else { | |
| return CUDART_INF; | | return CUDART_INF; | |
| } | | } | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_tgamma(double a)) | | static __forceinline__ double tgamma(double a) | |
| { | | { | |
| double s, xx, x = a; | | double s, xx, x = a; | |
|
| if (__cuda___isnan(a)) { | | if (__isnan(a)) { | |
| return a + a; | | return a + a; | |
| } | | } | |
|
| if (__cuda_fabs(x) < 15.0) { | | if (fabs(x) < 15.0) { | |
| /* Based on: Kraemer, W.: "Berechnung der Gammafunktion G(x) fuer reelle | | /* Based on: Kraemer, W.: "Berechnung der Gammafunktion G(x) fuer reelle | |
| * Punkt- und Intervallargumente". Zeitschrift fuer angewandte Mathematik | | * Punkt- und Intervallargumente". Zeitschrift fuer angewandte Mathematik | |
| * und Mechanik, Vol. 70 (1990), No. 6, pp. 581-584 | | * und Mechanik, Vol. 70 (1990), No. 6, pp. 581-584 | |
| */ | | */ | |
| if (x >= 0.0) { | | if (x >= 0.0) { | |
| s = 1.0; | | s = 1.0; | |
| xx = x; | | xx = x; | |
| while (xx > 1.5) { | | while (xx > 1.5) { | |
| s = __fma_rn(s, xx, -s); | | s = __fma_rn(s, xx, -s); | |
| xx = xx - 1.0; | | xx = xx - 1.0; | |
| | | | |
| skipping to change at line 1835 | | skipping to change at line 1710 | |
| xx = xx - 1.0; | | xx = xx - 1.0; | |
| } | | } | |
| xx = __internal_tgamma_kernel (xx); | | xx = __internal_tgamma_kernel (xx); | |
| if (x < 0.5) { | | if (x < 0.5) { | |
| xx = xx * x; | | xx = xx * x; | |
| } | | } | |
| s = s / xx; | | s = s / xx; | |
| } else { | | } else { | |
| xx = x; | | xx = x; | |
| s = xx; | | s = xx; | |
|
| if (x == __cuda_trunc(x)) { | | if (x == trunc(x)) { | |
| return CUDART_NAN; | | return CUDART_NAN; | |
| } | | } | |
| while (xx < -0.5) { | | while (xx < -0.5) { | |
| s = __fma_rn (s, xx, s); | | s = __fma_rn (s, xx, s); | |
| xx = xx + 1.0; | | xx = xx + 1.0; | |
| } | | } | |
| xx = __internal_tgamma_kernel (xx); | | xx = __internal_tgamma_kernel (xx); | |
| s = s * xx; | | s = s * xx; | |
| s = 1.0 / s; | | s = 1.0 / s; | |
| } | | } | |
| return s; | | return s; | |
| } else { | | } else { | |
| if (x >= 0.0) { | | if (x >= 0.0) { | |
| return __internal_tgamma_stirling (x); | | return __internal_tgamma_stirling (x); | |
| } else { | | } else { | |
| double t; | | double t; | |
| int quot; | | int quot; | |
|
| if (x == __cuda_trunc(x)) { | | if (x == trunc(x)) { | |
| return CUDART_NAN; | | return CUDART_NAN; | |
| } | | } | |
| if (x < -185.0) { | | if (x < -185.0) { | |
| int negative; | | int negative; | |
|
| x = __cuda_floor(x); | | x = floor(x); | |
| negative = ((x - (2.0 * __cuda_floor(0.5 * x))) == 1.0); | | negative = ((x - (2.0 * floor(0.5 * x))) == 1.0); | |
| return negative ? CUDART_NEG_ZERO : CUDART_ZERO; | | return negative ? CUDART_NEG_ZERO : CUDART_ZERO; | |
| } | | } | |
| /* compute sin(pi*x) accurately */ | | /* compute sin(pi*x) accurately */ | |
|
| xx = __cuda_rint (__internal_twice(x)); | | xx = rint (__internal_twice(x)); | |
| quot = (int)xx; | | quot = (int)xx; | |
| xx = __fma_rn (-0.5, xx, x); | | xx = __fma_rn (-0.5, xx, x); | |
| xx = xx * CUDART_PI; | | xx = xx * CUDART_PI; | |
| if (quot & 1) { | | if (quot & 1) { | |
| xx = __internal_cos_kerneld (xx); | | xx = __internal_cos_kerneld (xx); | |
| } else { | | } else { | |
| xx = __internal_sin_kerneld (xx); | | xx = __internal_sin_kerneld (xx); | |
| } | | } | |
| if (quot & 2) { | | if (quot & 2) { | |
| xx = -xx; | | xx = -xx; | |
| } | | } | |
|
| x = __cuda_fabs (x); | | x = fabs (x); | |
| s = __cuda_exp (-x); | | s = exp (-x); | |
| t = x - 0.5; | | t = x - 0.5; | |
| if (x > 140.0) t = __internal_half(t); | | if (x > 140.0) t = __internal_half(t); | |
|
| t = __cuda_pow (x, t); | | t = pow (x, t); | |
| if (x > 140.0) s = s * t; | | if (x > 140.0) s = s * t; | |
| s = s * __internal_stirling_poly (x); | | s = s * __internal_stirling_poly (x); | |
| s = s * x; | | s = s * x; | |
| s = s * xx; | | s = s * xx; | |
| s = 1.0 / s; | | s = 1.0 / s; | |
| s = __fma_rn (s, CUDART_SQRT_PIO2_HI, CUDART_SQRT_PIO2_LO * s); | | s = __fma_rn (s, CUDART_SQRT_PIO2_HI, CUDART_SQRT_PIO2_LO * s); | |
| s = s / t; | | s = s / t; | |
| return s; | | return s; | |
| } | | } | |
| } | | } | |
| } | | } | |
| | | | |
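
The negative branch of tgamma above rests on the reflection formula

    \Gamma(x)\,\Gamma(1-x) = \frac{\pi}{\sin(\pi x)},

with sin(pi*x) computed from a reduced argument (the quot/xx logic) so that accuracy is preserved near the poles at the negative integers.
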
|
| __device_func__(double __internal_lgamma_pos(double a)) | | static __forceinline__ double __internal_lgamma_pos(double a) | |
| { | | { | |
| double sum; | | double sum; | |
| double s, t; | | double s, t; | |
| | | | |
| if (a == CUDART_INF) { | | if (a == CUDART_INF) { | |
| return a; | | return a; | |
| } | | } | |
| if (a >= 3.0) { | | if (a >= 3.0) { | |
| if (a >= 8.0) { | | if (a >= 8.0) { | |
| /* Stirling approximation; coefficients from Hart et al, "Computer | | /* Stirling approximation; coefficients from Hart et al, "Computer | |
| * Approximations", Wiley 1968. Approximation 5404. | | * Approximations", Wiley 1968. Approximation 5404. | |
| */ | | */ | |
|
| s = 1.0 / a; | | s = __internal_fast_rcp(a); | |
| t = s * s; | | t = s * s; | |
| sum = -0.1633436431e-2; | | sum = -0.1633436431e-2; | |
| sum = __fma_rn (sum, t, 0.83645878922e-3); | | sum = __fma_rn (sum, t, 0.83645878922e-3); | |
| sum = __fma_rn (sum, t, -0.5951896861197e-3); | | sum = __fma_rn (sum, t, -0.5951896861197e-3); | |
| sum = __fma_rn (sum, t, 0.793650576493454e-3); | | sum = __fma_rn (sum, t, 0.793650576493454e-3); | |
| sum = __fma_rn (sum, t, -0.277777777735865004e-2); | | sum = __fma_rn (sum, t, -0.277777777735865004e-2); | |
| sum = __fma_rn (sum, t, 0.833333333333331018375e-1); | | sum = __fma_rn (sum, t, 0.833333333333331018375e-1); | |
| sum = __fma_rn (sum, s, 0.918938533204672); | | sum = __fma_rn (sum, s, 0.918938533204672); | |
|
| s = __internal_half(__cuda_log (a)); | | s = __internal_half(log (a)); | |
| t = a - 0.5; | | t = a - 0.5; | |
| s = s * t; | | s = s * t; | |
| t = s - a; | | t = s - a; | |
| s = s + sum; | | s = s + sum; | |
| t = t + s; | | t = t + s; | |
| return t; | | return t; | |
| } else { | | } else { | |
| a = a - 3.0; | | a = a - 3.0; | |
| s = -4.02412642744125560E+003; | | s = -4.02412642744125560E+003; | |
| s = __fma_rn (s, a, -2.97693796998962000E+005); | | s = __fma_rn (s, a, -2.97693796998962000E+005); | |
| | | | |
| skipping to change at line 2009 | | skipping to change at line 1884 | |
| t = __fma_rn (t, a, -1.16484324388538480E-003); | | t = __fma_rn (t, a, -1.16484324388538480E-003); | |
| t = __fma_rn (t, a, 7.21883433044470670E-003); | | t = __fma_rn (t, a, 7.21883433044470670E-003); | |
| t = __fma_rn (t, a, -9.62194579514229560E-003); | | t = __fma_rn (t, a, -9.62194579514229560E-003); | |
| t = __fma_rn (t, a, -4.21977386992884450E-002); | | t = __fma_rn (t, a, -4.21977386992884450E-002); | |
| t = __fma_rn (t, a, 1.66538611813682460E-001); | | t = __fma_rn (t, a, 1.66538611813682460E-001); | |
| t = __fma_rn (t, a, -4.20026350606819980E-002); | | t = __fma_rn (t, a, -4.20026350606819980E-002); | |
| t = __fma_rn (t, a, -6.55878071519427450E-001); | | t = __fma_rn (t, a, -6.55878071519427450E-001); | |
| t = __fma_rn (t, a, 5.77215664901523870E-001); | | t = __fma_rn (t, a, 5.77215664901523870E-001); | |
| t = t * a; | | t = t * a; | |
| t = __fma_rn (t, a, a); | | t = __fma_rn (t, a, a); | |
|
| return -__cuda_log (t); | | return -log (t); | |
| } | | } | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_lgamma(double a)) | | static __forceinline__ double lgamma(double a) | |
| { | | { | |
| double t; | | double t; | |
| double i; | | double i; | |
| long long int quot; | | long long int quot; | |
|
| if (__cuda___isnan(a)) { | | if (__isnan(a)) { | |
| return a + a; | | return a + a; | |
| } | | } | |
|
| t = __internal_lgamma_pos(__cuda_fabs(a)); | | t = __internal_lgamma_pos(fabs(a)); | |
| if (a >= 0.0) return t; | | if (a >= 0.0) return t; | |
|
| a = __cuda_fabs(a); | | a = fabs(a); | |
| i = __cuda_trunc(a); | | i = trunc(a); | |
| if (a == i) return CUDART_INF; /* a is an integer: return infinity */ | | if (a == i) return CUDART_INF; /* a is an integer: return infinity */ | |
|
| if (a < 1e-19) return -__cuda_log(a); | | if (a < 1e-19) return -log(a); | |
| i = __cuda_rint (2.0 * a); | | i = rint (2.0 * a); | |
| quot = (long long int)i; | | quot = (long long int)i; | |
| i = __fma_rn (-0.5, i, a); | | i = __fma_rn (-0.5, i, a); | |
| i = i * CUDART_PI; | | i = i * CUDART_PI; | |
| if (quot & 1) { | | if (quot & 1) { | |
| i = __internal_cos_kerneld(i); | | i = __internal_cos_kerneld(i); | |
| } else { | | } else { | |
| i = __internal_sin_kerneld(i); | | i = __internal_sin_kerneld(i); | |
| } | | } | |
|
| i = __cuda_fabs(i); | | i = fabs(i); | |
| t = __cuda_log(CUDART_PI / (i * a)) - t; | | t = log(CUDART_PI / (i * a)) - t; | |
| return t; | | return t; | |
| } | | } | |
| | | | |
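
For negative arguments, lgamma above uses the logarithmic form of the same reflection identity: writing a > 0 for the magnitude, the final lines compute

    \ln\bigl|\Gamma(-a)\bigr| = \ln\frac{\pi}{a\,\lvert\sin(\pi a)\rvert} - \ln\Gamma(a),

which is exactly t = log(CUDART_PI / (i * a)) - t with i = |sin(pi*a)| and t the positive-argument result.
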
|
| __device_func__(double __cuda_ldexp(double a, int b)) | | static __forceinline__ double ldexp(double a, int b) | |
| { | | { | |
|
| double fa = __cuda_fabs (a); | | double fa = fabs (a); | |
| if ((fa == CUDART_ZERO) || (fa == CUDART_INF) || (!(fa <= CUDART_INF))) { | | if ((fa == CUDART_ZERO) || (fa == CUDART_INF) || (!(fa <= CUDART_INF))) { | |
| return a + a; | | return a + a; | |
| } | | } | |
| if (b == 0) { | | if (b == 0) { | |
| return a; | | return a; | |
| } | | } | |
| if (b > 2200) b = 2200; | | if (b > 2200) b = 2200; | |
| if (b < -2200) b = -2200; | | if (b < -2200) b = -2200; | |
|
| if (__cuda_abs (b) < 1022) { | | if (abs (b) < 1022) { | |
| return a * __internal_exp2i_kernel(b); | | return a * __internal_exp2i_kernel(b); | |
| } | | } | |
|
| if (__cuda_abs (b) < 2044) { | | if (abs (b) < 2044) { | |
| int bhalf = b / 2; | | int bhalf = b / 2; | |
| return a * __internal_exp2i_kernel (bhalf) * | | return a * __internal_exp2i_kernel (bhalf) * | |
| __internal_exp2i_kernel (b - bhalf); | | __internal_exp2i_kernel (b - bhalf); | |
| } else { | | } else { | |
| int bquarter = b / 4; | | int bquarter = b / 4; | |
| double t = __internal_exp2i_kernel(bquarter); | | double t = __internal_exp2i_kernel(bquarter); | |
| return a * t * t * t *__internal_exp2i_kernel (b - 3 * bquarter); | | return a * t * t * t *__internal_exp2i_kernel (b - 3 * bquarter); | |
| } | | } | |
| } | | } | |
| | | | |
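
Because a single power of two near the extremes of the exponent range is not safely representable, ldexp above splits large b into two halves (for |b| < 2044) or three quarters plus a remainder, multiplying stepwise so no intermediate factor overflows or underflows. A host-side C sketch of the two-factor case; the helper names are mine, and libm's ldexp stands in for the header's __internal_exp2i_kernel:

    #include <math.h>

    /* Sketch: hypothetical stand-in for __internal_exp2i_kernel. */
    static double my_exp2i(int e) { return ldexp(1.0, e); }

    /* Sketch: scale a by 2^b in pieces, assuming |b| <= 2044 so that
       each half is a representable power of two. */
    static double scale_by_pow2(double a, int b)
    {
        if (b > -1022 && b < 1022)       /* one factor is safe */
            return a * my_exp2i(b);
        int h = b / 2;                   /* split to keep factors finite */
        return a * my_exp2i(h) * my_exp2i(b - h);
    }
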
|
| __device_func__(double __cuda_scalbn(double a, int b)) | | static __forceinline__ double scalbn(double a, int b) | |
| { | | { | |
| /* On binary systems, ldexp(x,exp) is equivalent to scalbn(x,exp) */ | | /* On binary systems, ldexp(x,exp) is equivalent to scalbn(x,exp) */ | |
|
| return __cuda_ldexp(a, b); | | return ldexp(a, b); | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_scalbln(double a, long int b)) | | static __forceinline__ double scalbln(double a, long int b) | |
| { | | { | |
| #if defined(__LP64__) | | #if defined(__LP64__) | |
| /* clamp to integer range prior to conversion */ | | /* clamp to integer range prior to conversion */ | |
| if (b < -2147483648L) b = -2147483648L; | | if (b < -2147483648L) b = -2147483648L; | |
| if (b > 2147483647L) b = 2147483647L; | | if (b > 2147483647L) b = 2147483647L; | |
|
| #endif | | #endif /* __LP64__ */ | |
| return __cuda_scalbn(a, (int)b); | | return scalbn(a, (int)b); | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_frexp(double a, int *b)) | | static __forceinline__ double frexp(double a, int *b) | |
| { | | { | |
|
| double fa = __cuda_fabs(a); | | double fa = fabs(a); | |
| unsigned int expo; | | unsigned int expo; | |
| unsigned int denorm; | | unsigned int denorm; | |
| | | | |
| if (fa < CUDART_TWO_TO_M1022) { | | if (fa < CUDART_TWO_TO_M1022) { | |
| a *= CUDART_TWO_TO_54; | | a *= CUDART_TWO_TO_54; | |
| denorm = 54; | | denorm = 54; | |
| } else { | | } else { | |
| denorm = 0; | | denorm = 0; | |
| } | | } | |
| expo = (__double2hiint(a) >> 20) & 0x7ff; | | expo = (__double2hiint(a) >> 20) & 0x7ff; | |
| | | | |
| skipping to change at line 2107 | | skipping to change at line 1982 | |
| a = a + a; | | a = a + a; | |
| } else { | | } else { | |
| expo = expo - denorm - 1022; | | expo = expo - denorm - 1022; | |
| a = __longlong_as_double((__double_as_longlong(a) & 0x800fffffffffffffULL)| | | a = __longlong_as_double((__double_as_longlong(a) & 0x800fffffffffffffULL)| | |
| 0x3fe0000000000000ULL); | | 0x3fe0000000000000ULL); | |
| } | | } | |
| *b = expo; | | *b = expo; | |
| return a; | | return a; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_modf(double a, double *b)) | | static __forceinline__ double modf(double a, double *b) | |
| { | | { | |
| double t; | | double t; | |
|
| if (__cuda___finite(a)) { | | if (__finite(a)) { | |
| t = __cuda_trunc(a); | | t = trunc(a); | |
| *b = t; | | *b = t; | |
| t = a - t; | | t = a - t; | |
| return __internal_copysign_pos(t, a); | | return __internal_copysign_pos(t, a); | |
|
| } else if (__cuda___isinf(a)) { | | } else if (__isinf(a)) { | |
| t = 0.0; | | t = 0.0; | |
| *b = a; | | *b = a; | |
| return __internal_copysign_pos(t, a); | | return __internal_copysign_pos(t, a); | |
| } else { | | } else { | |
| *b = a + a; | | *b = a + a; | |
| return a + a; | | return a + a; | |
| } | | } | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_fmod(double a, double b)) | | static __forceinline__ double fmod(double a, double b) | |
| { | | { | |
| double orig_a = a; | | double orig_a = a; | |
| double orig_b = b; | | double orig_b = b; | |
|
| a = __cuda_fabs(a); | | a = fabs(a); | |
| b = __cuda_fabs(b); | | b = fabs(b); | |
| if (!((a <= CUDART_INF) && (b <= CUDART_INF))) { | | if (!((a <= CUDART_INF) && (b <= CUDART_INF))) { | |
| return orig_a + orig_b; | | return orig_a + orig_b; | |
| } | | } | |
| if (a == CUDART_INF || b == 0.0) { | | if (a == CUDART_INF || b == 0.0) { | |
| return CUDART_NAN; | | return CUDART_NAN; | |
| } else if (a >= b) { | | } else if (a >= b) { | |
| int bhi = __double2hiint(b); | | int bhi = __double2hiint(b); | |
| int blo = __double2loint(b); | | int blo = __double2loint(b); | |
| int ahi = __double2hiint(a); | | int ahi = __double2hiint(a); | |
| double scaled_b = 0.0; | | double scaled_b = 0.0; | |
| | | | |
| skipping to change at line 2168 | | skipping to change at line 2043 | |
| a -= scaled_b; | | a -= scaled_b; | |
| } | | } | |
| scaled_b *= 0.5; | | scaled_b *= 0.5; | |
| } | | } | |
| return __internal_copysign_pos(a, orig_a); | | return __internal_copysign_pos(a, orig_a); | |
| } else { | | } else { | |
| return orig_a; | | return orig_a; | |
| } | | } | |
| } | | } | |
| | | | |
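
The loop structure of fmod above is binary long division on magnitudes: scale |b| up by powers of two until just below |a|, then halve it back down, subtracting whenever it still fits. The header performs the scaling on the exponent field directly; the plain-C sketch below (my simplification, assuming finite inputs with a >= b > 0) uses ordinary multiplies:

    /* Sketch: fmod core as restoring binary long division.
       Assumes a and b are finite with a >= b > 0. */
    static double fmod_core(double a, double b)
    {
        double scaled_b = b;
        while (scaled_b * 2.0 <= a)   /* scale b up by powers of two */
            scaled_b *= 2.0;
        while (scaled_b >= b) {       /* walk back down, subtracting */
            if (a >= scaled_b)
                a -= scaled_b;
            scaled_b *= 0.5;
        }
        return a;                     /* remainder in [0, b) */
    }
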
|
| __device_func__(double __cuda_remainder(double a, double b)) | | static __forceinline__ double remainder(double a, double b) | |
| { | | { | |
| double orig_a; | | double orig_a; | |
| double twoa = 0.0; | | double twoa = 0.0; | |
| unsigned int quot0 = 0; /* quotient bit 0 */ | | unsigned int quot0 = 0; /* quotient bit 0 */ | |
| int bhi; | | int bhi; | |
| int blo; | | int blo; | |
| int ahi; | | int ahi; | |
|
| if (__cuda___isnan(a) || __cuda___isnan(b)) { | | if (__isnan(a) || __isnan(b)) { | |
| return a + b; | | return a + b; | |
| } | | } | |
| orig_a = a; | | orig_a = a; | |
|
| a = __cuda_fabs(a); | | a = fabs(a); | |
| b = __cuda_fabs(b); | | b = fabs(b); | |
| if (a == CUDART_INF || b == 0.0) { | | if (a == CUDART_INF || b == 0.0) { | |
| return CUDART_NAN; | | return CUDART_NAN; | |
| } else if (a >= b) { | | } else if (a >= b) { | |
| double scaled_b = 0.0; | | double scaled_b = 0.0; | |
| bhi = __double2hiint(b); | | bhi = __double2hiint(b); | |
| blo = __double2loint(b); | | blo = __double2loint(b); | |
| ahi = __double2hiint(a); | | ahi = __double2hiint(a); | |
| if (b < CUDART_TWO_TO_M1022) { | | if (b < CUDART_TWO_TO_M1022) { | |
| double t = b; | | double t = b; | |
| while ((t < a) && (t < CUDART_TWO_TO_M1022)) { | | while ((t < a) && (t < CUDART_TWO_TO_M1022)) { | |
| | | | |
| skipping to change at line 2225 | | skipping to change at line 2100 | |
| if ((twoa > b) || ((twoa == b) && quot0)) { | | if ((twoa > b) || ((twoa == b) && quot0)) { | |
| a -= b; | | a -= b; | |
| } | | } | |
| bhi = __double2hiint(a); | | bhi = __double2hiint(a); | |
| blo = __double2loint(a); | | blo = __double2loint(a); | |
| ahi = __double2hiint(orig_a); | | ahi = __double2hiint(orig_a); | |
| a = __hiloint2double((ahi & 0x80000000) ^ bhi, blo); | | a = __hiloint2double((ahi & 0x80000000) ^ bhi, blo); | |
| return a; | | return a; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_remquo(double a, double b, int *c)) | | static __forceinline__ double remquo(double a, double b, int *c) | |
| { | | { | |
| double orig_a; | | double orig_a; | |
| double twoa = 0.0; | | double twoa = 0.0; | |
| unsigned int quot = 0; /* trailing quotient bits */ | | unsigned int quot = 0; /* trailing quotient bits */ | |
| unsigned int sign; | | unsigned int sign; | |
| int bhi; | | int bhi; | |
| int blo; | | int blo; | |
| int ahi; | | int ahi; | |
|
| if (__cuda___isnan(a) || __cuda___isnan(b)) { | | if (__isnan(a) || __isnan(b)) { | |
| *c = quot; | | *c = quot; | |
| return a + b; | | return a + b; | |
| } | | } | |
| orig_a = a; | | orig_a = a; | |
|
| sign = 0 - (__cuda___signbit(a) != __cuda___signbit(b)); | | sign = 0 - ((__double2hiint(a) ^ __double2hiint(b)) < 0); | |
| a = __cuda_fabs(a); | | a = fabs(a); | |
| b = __cuda_fabs(b); | | b = fabs(b); | |
| if (a == CUDART_INF || b == 0.0) { | | if (a == CUDART_INF || b == 0.0) { | |
| *c = quot; | | *c = quot; | |
| return CUDART_NAN; | | return CUDART_NAN; | |
| } else if (a >= b) { | | } else if (a >= b) { | |
| double scaled_b = 0.0; | | double scaled_b = 0.0; | |
| bhi = __double2hiint(b); | | bhi = __double2hiint(b); | |
| blo = __double2loint(b); | | blo = __double2loint(b); | |
| ahi = __double2hiint(a); | | ahi = __double2hiint(a); | |
| if (b < CUDART_TWO_TO_M1022) { | | if (b < CUDART_TWO_TO_M1022) { | |
| double t = b; | | double t = b; | |
| | | | |
| skipping to change at line 2291 | | skipping to change at line 2166 | |
| blo = __double2loint(a); | | blo = __double2loint(a); | |
| ahi = __double2hiint(orig_a); | | ahi = __double2hiint(orig_a); | |
| a = __hiloint2double((ahi & 0x80000000) ^ bhi, blo); | | a = __hiloint2double((ahi & 0x80000000) ^ bhi, blo); | |
| quot = quot & CUDART_REMQUO_MASK_F; | | quot = quot & CUDART_REMQUO_MASK_F; | |
| quot = quot ^ sign; | | quot = quot ^ sign; | |
| quot = quot - sign; | | quot = quot - sign; | |
| *c = quot; | | *c = quot; | |
| return a; | | return a; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_nextafter(double a, double b)) | | static __forceinline__ double nextafter(double a, double b) | |
| { | | { | |
| unsigned long long int ia; | | unsigned long long int ia; | |
| unsigned long long int ib; | | unsigned long long int ib; | |
| ia = __double_as_longlong(a); | | ia = __double_as_longlong(a); | |
| ib = __double_as_longlong(b); | | ib = __double_as_longlong(b); | |
|
| if (__cuda___isnan(a) || __cuda___isnan(b)) return a + b; /* NaN */ | | if (__isnan(a) || __isnan(b)) return a + b; /* NaN */ | |
| if (((ia | ib) << 1) == 0ULL) return b; | | if (((ia | ib) << 1) == 0ULL) return b; | |
| if ((ia + ia) == 0ULL) { | | if ((ia + ia) == 0ULL) { | |
| return __internal_copysign_pos(CUDART_MIN_DENORM, b); /* crossover */ | | return __internal_copysign_pos(CUDART_MIN_DENORM, b); /* crossover */ | |
| } | | } | |
| if ((a < b) && (a < 0.0)) ia--; | | if ((a < b) && (a < 0.0)) ia--; | |
| if ((a < b) && (a > 0.0)) ia++; | | if ((a < b) && (a > 0.0)) ia++; | |
| if ((a > b) && (a < 0.0)) ia++; | | if ((a > b) && (a < 0.0)) ia++; | |
| if ((a > b) && (a > 0.0)) ia--; | | if ((a > b) && (a > 0.0)) ia--; | |
| a = __longlong_as_double(ia); | | a = __longlong_as_double(ia); | |
| return a; | | return a; | |
| } | | } | |
| | | | |
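
nextafter above exploits the fact that finite IEEE-754 doubles of one sign, viewed as integers, are ordered the same way as the values they encode, so the adjacent representable number is one integer step away in the bit pattern. A minimal C sketch of the upward step for positive finite a (the function name is mine):

    #include <stdint.h>
    #include <string.h>

    /* Sketch: next representable double above a positive finite a,
       the idea behind the ia++/ia-- cases above. */
    static double next_up_positive(double a)
    {
        uint64_t ia;
        memcpy(&ia, &a, sizeof ia);   /* portable bit cast */
        ia += 1;                      /* adjacent pattern = adjacent value */
        memcpy(&a, &ia, sizeof a);
        return a;
    }
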
|
| __device_func__(double __cuda_nan(const char *tagp)) | | static __forceinline__ double nan(const char *tagp) | |
| { | | { | |
| unsigned long long int i; | | unsigned long long int i; | |
| | | | |
| i = __internal_nan_kernel (tagp); | | i = __internal_nan_kernel (tagp); | |
| i = (i & 0x000fffffffffffffULL) | 0x7ff8000000000000ULL; | | i = (i & 0x000fffffffffffffULL) | 0x7ff8000000000000ULL; | |
| return __longlong_as_double(i); | | return __longlong_as_double(i); | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_round(double a)) | | static __forceinline__ double round(double a) | |
| { | | { | |
|
| double fa = __cuda_fabs(a); | | double fa = fabs(a); | |
| if (fa >= CUDART_TWO_TO_52) { | | if (fa >= CUDART_TWO_TO_52) { | |
| return a; | | return a; | |
| } else { | | } else { | |
| double u; | | double u; | |
|
| u = __cuda_trunc(fa + 0.5); | | u = trunc(fa + 0.5); | |
| if (fa < 0.5) u = 0; | | if (fa < 0.5) u = 0; | |
| u = __internal_copysign_pos(u, a); | | u = __internal_copysign_pos(u, a); | |
| return u; | | return u; | |
| } | | } | |
| } | | } | |
| | | | |
|
| __device_func__(long long int __cuda_llround(double a)) | | static __forceinline__ long long int llround(double a) | |
| { | | { | |
|
| #if !defined(__CUDABE__) | | return (long long int)round(a); | |
| if (a >= 9223372036854775807.0) return 0x7fffffffffffffffLL; | | | |
| if (a <= -9223372036854775808.0) return 0x8000000000000000LL; | | | |
| #endif /* !__CUDABE__ */ | | | |
| return (long long int)(__cuda_round(a)); | | | |
| } | | } | |
| | | | |
|
| __device_func__(long int __cuda_lround(double a)) | | static __forceinline__ long int lround(double a) | |
| { | | { | |
| #if defined(__LP64__) | | #if defined(__LP64__) | |
|
| return (long int)(__cuda_llround(a)); | | return (long int)llround(a); | |
| #else /* __LP64__ */ | | #else /* __LP64__ */ | |
|
| #if !defined(__CUDABE__) | | return (long int)round(a); | |
| if (__cuda___isnan(a)) return 0x80000000L; | | | |
| if (a >= 2147483647.0) return 0x7fffffffL; | | | |
| if (a <= -2147483648.0) return 0x80000000L; | | | |
| #endif /* !__CUDABE__ */ | | | |
| return (long int)(__cuda_round(a)); | | | |
| #endif /* __LP64__ */ | | #endif /* __LP64__ */ | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_fdim(double a, double b)) | | static __forceinline__ double fdim(double a, double b) | |
| { | | { | |
| double t; | | double t; | |
| t = a - b; /* default also takes care of NaNs */ | | t = a - b; /* default also takes care of NaNs */ | |
| if (a <= b) { | | if (a <= b) { | |
| t = 0.0; | | t = 0.0; | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| | | | |
|
| __device_func__(int __cuda_ilogb(double a)) | | static __forceinline__ int ilogb(double a) | |
| { | | { | |
| unsigned long long int i; | | unsigned long long int i; | |
| unsigned int ihi; | | unsigned int ihi; | |
| unsigned int ilo; | | unsigned int ilo; | |
|
| if (__cuda___isnan(a)) return -INT_MAX-1; | | if (__isnan(a)) return -__cuda_INT_MAX-1; | |
| if (__cuda___isinf(a)) return INT_MAX; | | if (__isinf(a)) return __cuda_INT_MAX; | |
| if (a == 0.0) return -INT_MAX-1; | | if (a == 0.0) return -__cuda_INT_MAX-1; | |
| a = __cuda_fabs(a); | | a = fabs(a); | |
| ilo = __double2loint(a); | | ilo = __double2loint(a); | |
| ihi = __double2hiint(a); | | ihi = __double2hiint(a); | |
| i = ((unsigned long long int)ihi) << 32 | (unsigned long long int)ilo; | | i = ((unsigned long long int)ihi) << 32 | (unsigned long long int)ilo; | |
| if (a >= CUDART_TWO_TO_M1022) { | | if (a >= CUDART_TWO_TO_M1022) { | |
| return ((int)((ihi >> 20) & 0x7ff)) - 1023; | | return ((int)((ihi >> 20) & 0x7ff)) - 1023; | |
| } else { | | } else { | |
| return -1011 - __clzll(i); | | return -1011 - __clzll(i); | |
| } | | } | |
| } | | } | |
| | | | |
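
For normal numbers, ilogb above just reads the 11-bit exponent field and removes the bias of 1023; only denormals need the count-leading-zeros path. A small C sketch of the normal-number case (the helper name is mine):

    #include <stdint.h>
    #include <string.h>

    /* Sketch: unbiased exponent of a normal double, read straight
       from the bit pattern as in the fast path above. */
    static int exponent_of(double a)
    {
        uint64_t i;
        memcpy(&i, &a, sizeof i);
        return (int)((i >> 52) & 0x7ff) - 1023;  /* biased field - 1023 */
    }
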
|
| __device_func__(double __cuda_logb(double a)) | | static __forceinline__ double logb(double a) | |
| { | | { | |
| unsigned long long int i; | | unsigned long long int i; | |
| unsigned int ihi; | | unsigned int ihi; | |
| unsigned int ilo; | | unsigned int ilo; | |
|
| if (__cuda___isnan(a)) return a + a; | | if (__isnan(a)) return a + a; | |
| a = __cuda_fabs(a); | | a = fabs(a); | |
| if (a == CUDART_INF) return a; | | if (a == CUDART_INF) return a; | |
| if (a == 0.0) return -CUDART_INF; | | if (a == 0.0) return -CUDART_INF; | |
| ilo = __double2loint(a); | | ilo = __double2loint(a); | |
| ihi = __double2hiint(a); | | ihi = __double2hiint(a); | |
| i = ((unsigned long long int)ihi) << 32 | (unsigned long long int)ilo; | | i = ((unsigned long long int)ihi) << 32 | (unsigned long long int)ilo; | |
| if (a >= CUDART_TWO_TO_M1022) { | | if (a >= CUDART_TWO_TO_M1022) { | |
| return (double)((int)((ihi >> 20) & 0x7ff)) - 1023; | | return (double)((int)((ihi >> 20) & 0x7ff)) - 1023; | |
| } else { | | } else { | |
| int expo = -1011 - __clzll(i); | | int expo = -1011 - __clzll(i); | |
| return (double)expo; | | return (double)expo; | |
| } | | } | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_fma(double a, double b, double c)) | | static __forceinline__ double fma(double a, double b, double c) | |
| { | | { | |
| return __fma_rn(a, b, c); | | return __fma_rn(a, b, c); | |
| } | | } | |
| | | | |
|
| #if __APPLE__ | | #if defined(__APPLE__) | |
| __device_func__(int __cuda___isfinited(double a)) | | | |
| | | static __forceinline__ int __isfinited(double a) | |
| { | | { | |
|
| return __cuda___finite(a); | | return __finite(a); | |
| } | | } | |
| | | | |
|
| __device_func__(int __cuda___signbitd(double a)) | | static __forceinline__ int __signbitd(double a) | |
| { | | { | |
|
| return __cuda___signbit(a); | | return __signbit(a); | |
| } | | } | |
|
| #endif | | | |
| | | | |
|
| #endif /* __cplusplus && __CUDACC__ */ | | #endif /* __APPLE__ */ | |
| | | | |
| | | #endif /* __CUDABE__ */ | |
| | | | |
| #endif /* __MATH_FUNCTIONS_DBL_PTX3_H__ */ | | #endif /* __MATH_FUNCTIONS_DBL_PTX3_H__ */ | |
| | | | |
End of changes. 226 change blocks. 396 lines changed or deleted, 264 lines changed or added.