cublas.h

/*
 * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws.  Users and possessors of this source code
 * are hereby granted a nonexclusive, royalty-free license to use this code
 * in individual and commercial software.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR

skipping to change at line 1209

 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
float CUBLASAPI cublasScnrm2 (int n, const cuComplex *x, int incx);
/* ----------------- CUBLAS double-complex BLAS1 functions ------------------ */

/*
 * cuDoubleComplex
 * zdotu (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, int incy)
 *
 * computes the dot product of two double-complex vectors. It returns the
 * dot product of the double-complex vectors x and y if successful, and
 * double-complex zero otherwise. It computes the sum for i = 0 to n - 1 of
 * x[lx + i * incx] * y[ly + i * incy], where lx = 1 if incx >= 0, else
 * lx = 1 + (1 - n) * incx; ly is defined in a similar way using incy.
 *
 * Input
 * -----
 * n      number of elements in input vectors
 * x      double-complex vector with n elements
 * incx   storage spacing between elements of x
 * y      double-complex vector with n elements
 * incy   storage spacing between elements of y
 *
 * Output
 * ------
 * returns double-complex dot product (zero if n <= 0)
 *
 * Reference: http://www.netlib.org/blas/zdotu.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to execute on GPU
 */
cuDoubleComplex CUBLASAPI cublasZdotu (int n, const cuDoubleComplex *x, int incx,
                                       const cuDoubleComplex *y, int incy);
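As a usage illustration (not part of the header), a minimal host program
calling cublasZdotu might look like the sketch below. It assumes the legacy
CUBLAS helper routines (cublasInit, cublasAlloc, cublasSetVector, cublasFree,
cublasShutdown) declared elsewhere in this header, and the cuDoubleComplex
helpers (make_cuDoubleComplex, cuCreal, cuCimag) from cuComplex.h.

    #include <stdio.h>
    #include "cublas.h"

    int main (void)
    {
        const int n = 4;
        cuDoubleComplex hx[4], hy[4], dot;
        cuDoubleComplex *dx = 0, *dy = 0;
        int i;

        for (i = 0; i < n; i++) {
            hx[i] = make_cuDoubleComplex ((double)(i + 1), 0.0); /* 1,2,3,4 */
            hy[i] = make_cuDoubleComplex (1.0, 1.0);             /* 1 + 1i  */
        }

        cublasInit ();
        cublasAlloc (n, sizeof(cuDoubleComplex), (void**)&dx);
        cublasAlloc (n, sizeof(cuDoubleComplex), (void**)&dy);
        cublasSetVector (n, sizeof(cuDoubleComplex), hx, 1, dx, 1);
        cublasSetVector (n, sizeof(cuDoubleComplex), hy, 1, dy, 1);

        dot = cublasZdotu (n, dx, 1, dy, 1);   /* unconjugated dot product */
        if (cublasGetError () != CUBLAS_STATUS_SUCCESS)
            fprintf (stderr, "cublasZdotu failed\n");
        else                                   /* (1+2+3+4)*(1+1i) = 10+10i */
            printf ("dot = %g + %gi\n", cuCreal (dot), cuCimag (dot));

        cublasFree (dx);
        cublasFree (dy);
        cublasShutdown ();
        return 0;
    }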
/*
 * void
 * cublasZscal (int n, cuDoubleComplex alpha, cuDoubleComplex *x, int incx)
 *
 * replaces double-complex vector x with double-complex alpha * x. For i
 * = 0 to n - 1, it replaces x[ix + i * incx] with alpha * x[ix + i * incx],
 * where ix = 1 if incx >= 0, else ix = 1 + (1 - n) * incx.
 *
 * Input
 * -----
 * n      number of elements in input vectors
 * alpha  double-complex scalar multiplier
 * x      double-complex vector with n elements
 * incx   storage spacing between elements of x
 *
 * Output
 * ------
 * x      double-complex result (unchanged if n <= 0 or incx <= 0)
 *
 * Reference: http://www.netlib.org/blas/zscal.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasZscal (int n, cuDoubleComplex alpha, cuDoubleComplex *x, int incx);
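Continuing the sketch above, scaling the device vector dx in place by a
double-complex scalar is a one-liner (again an illustration, not header
content):

    cuDoubleComplex alpha = make_cuDoubleComplex (2.0, -1.0);
    cublasZscal (n, alpha, dx, 1);         /* dx = (2 - 1i) * dx, in place */
    if (cublasGetError () != CUBLAS_STATUS_SUCCESS)
        fprintf (stderr, "cublasZscal failed\n");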
/* --------------- CUBLAS single precision BLAS2 functions ----------------- */

/*
 * void
 * cublasSgbmv (char trans, int m, int n, int kl, int ku, float alpha,
 *              const float *A, int lda, const float *x, int incx, float beta,
 *              float *y, int incy)
 *
 * performs one of the matrix-vector operations
 *

skipping to change at line 2114 (old) / 2180 (new)

 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if incx == 0 or if n < 0 or n > 4070
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasStrsv (char uplo, char trans, char diag, int n,
                            const float *A, int lda, float *x, int incx);
/* ----------------- CUBLAS double-complex BLAS2 functions ------------------ */

/*
 * cublasZgemv (char trans, int m, int n, cuDoubleComplex alpha, const cuDoubleComplex *A,
 *              int lda, const cuDoubleComplex *x, int incx, cuDoubleComplex beta,
 *              cuDoubleComplex *y, int incy)
 *
 * performs one of the matrix-vector operations
 *
 *    y = alpha * op(A) * x + beta * y,
 *
 * where op(A) is one of
 *
 *    op(A) = A   or   op(A) = transpose(A)
 *
 * where alpha and beta are double-complex scalars, x and y are double-complex
 * vectors, and A is an m x n matrix consisting of double-complex elements.
 * Matrix A is stored in column major format, and lda is the leading
 * dimension of the two-dimensional array in which A is stored.
 *
 * Input
 * -----
 * trans  specifies op(A). If trans = 'n' or 'N', op(A) = A. If trans =
 *        't', 'T', 'c', or 'C', op(A) = transpose(A)
 * m      specifies the number of rows of the matrix A. m must be at least
 *        zero.
 * n      specifies the number of columns of the matrix A. n must be at least
 *        zero.
 * alpha  double-complex scalar multiplier applied to op(A).
 * A      double-complex array of dimensions (lda, n) if trans = 'n' or
 *        'N', and of dimensions (lda, m) otherwise. lda must be at least
 *        max(1, m) in the first case and at least max(1, n) otherwise.
 * lda    leading dimension of two-dimensional array used to store matrix A
 * x      double-complex array of length at least (1 + (n - 1) * abs(incx))
 *        when trans = 'N' or 'n' and at least (1 + (m - 1) * abs(incx))
 *        otherwise.
 * incx   specifies the storage spacing between elements of x. incx must not
 *        be zero.
 * beta   double-complex scalar multiplier applied to vector y. If beta
 *        is zero, y is not read.
 * y      double-complex array of length at least (1 + (m - 1) * abs(incy))
 *        when trans = 'N' or 'n' and at least (1 + (n - 1) * abs(incy))
 *        otherwise.
 * incy   specifies the storage spacing between elements of y. incy must not
 *        be zero.
 *
 * Output
 * ------
 * y      updated according to alpha * op(A) * x + beta * y
 *
 * Reference: http://www.netlib.org/blas/zgemv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if m or n are < 0, or if incx or incy == 0
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasZgemv (char trans, int m, int n, cuDoubleComplex alpha,
                            const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
                            cuDoubleComplex beta, cuDoubleComplex *y, int incy);
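A hedged usage sketch for cublasZgemv (a standalone fragment, not header
content): hA and hx below are assumed host arrays already filled by the
caller, and cublasSetMatrix is the legacy CUBLAS helper for uploading
column-major matrices. With beta = 0, dY need not be initialized.

    /* y = A * x for an m x n double-complex matrix A (column major, lda = m) */
    int m = 3, n = 2;
    cuDoubleComplex *dA, *dX, *dY;
    cuDoubleComplex one  = make_cuDoubleComplex (1.0, 0.0);
    cuDoubleComplex zero = make_cuDoubleComplex (0.0, 0.0);

    cublasAlloc (m * n, sizeof(cuDoubleComplex), (void**)&dA);
    cublasAlloc (n,     sizeof(cuDoubleComplex), (void**)&dX);
    cublasAlloc (m,     sizeof(cuDoubleComplex), (void**)&dY);
    cublasSetMatrix (m, n, sizeof(cuDoubleComplex), hA, m, dA, m);
    cublasSetVector (n, sizeof(cuDoubleComplex), hx, 1, dX, 1);

    cublasZgemv ('n', m, n, one, dA, m, dX, 1, zero, dY, 1);  /* y = A * x */
    if (cublasGetError () != CUBLAS_STATUS_SUCCESS)
        fprintf (stderr, "cublasZgemv failed\n");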
/* ----------------- CUBLAS single complex BLAS2 functions ------------------ */
void CUBLASAPI cublasCgemv (char trans, int m, int n, cuComplex alpha,
                            const cuComplex *A, int lda, const cuComplex *x,
                            int incx, cuComplex beta, cuComplex *y, int incy);
void CUBLASAPI cublasCgbmv (char trans, int m, int n, int kl, int ku,
                            cuComplex alpha, const cuComplex *A, int lda,
                            const cuComplex *x, int incx, cuComplex beta,
                            cuComplex *y, int incy);
void CUBLASAPI cublasChemv (char uplo, int n, cuComplex alpha,
                            const cuComplex *A, int lda, const cuComplex *x,

skipping to change at line 3577 (old) / 3706 (new)

 * CUBLAS_STATUS_ARCH_MISMATCH    if invoked on device without DP support
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasDtrsm (char side, char uplo, char transa,
                            char diag, int m, int n, double alpha,
                            const double *A, int lda, double *B,
                            int ldb);
/*
 * void
 * cublasZtrsm (char side, char uplo, char transa, char diag, int m, int n,
 *              cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
 *              cuDoubleComplex *B, int ldb)
 *
 * solves one of the matrix equations
 *
 *    op(A) * X = alpha * B,   or   X * op(A) = alpha * B,
 *
 * where alpha is a double precision complex scalar, and X and B are m x n
 * matrices that are composed of double precision complex elements. A is a
 * unit or non-unit, upper or lower triangular matrix, and op(A) is one of
 *
 *    op(A) = A  or  op(A) = transpose(A)  or  op(A) = conj(transpose(A)).
 *
 * The result matrix X overwrites input matrix B; that is, on exit the result
 * is stored in B. Matrices A and B are stored in column major format, and
 * lda and ldb are the leading dimensions of the two-dimensional arrays that
 * contain A and B, respectively.
 *
 * Input
 * -----
 * side   specifies whether op(A) appears on the left or right of X as
 *        follows: side = 'L' or 'l' indicates solve op(A) * X = alpha * B.
 *        side = 'R' or 'r' indicates solve X * op(A) = alpha * B.
 * uplo   specifies whether the matrix A is an upper or lower triangular
 *        matrix as follows: uplo = 'U' or 'u' indicates A is an upper
 *        triangular matrix. uplo = 'L' or 'l' indicates A is a lower
 *        triangular matrix.
 * transa specifies the form of op(A) to be used in matrix multiplication
 *        as follows: If transa = 'N' or 'n', then op(A) = A. If transa =
 *        'T', 't', 'C', or 'c', then op(A) = transpose(A).
 * diag   specifies whether or not A is a unit triangular matrix like so:
 *        if diag = 'U' or 'u', A is assumed to be unit triangular. If
 *        diag = 'N' or 'n', then A is not assumed to be unit triangular.
 * m      specifies the number of rows of B. m must be at least zero.
 * n      specifies the number of columns of B. n must be at least zero.
 * alpha  is a double precision complex scalar to be multiplied with B. When
 *        alpha is zero, then A is not referenced and B need not be set
 *        before entry.
 * A      is a double precision complex array of dimensions (lda, k), where
 *        k is m when side = 'L' or 'l', and is n when side = 'R' or 'r'. If
 *        uplo = 'U' or 'u', the leading k x k upper triangular part of
 *        the array A must contain the upper triangular matrix and the
 *        strictly lower triangular part of A is not referenced. When
 *        uplo = 'L' or 'l', the leading k x k lower triangular part of
 *        the array A must contain the lower triangular matrix and the
 *        strictly upper triangular part of A is not referenced. Note that
 *        when diag = 'U' or 'u', the diagonal elements of A are not
 *        referenced, and are assumed to be unity.
 * lda    is the leading dimension of the two dimensional array containing A.
 *        When side = 'L' or 'l' then lda must be at least max(1, m), when
 *        side = 'R' or 'r' then lda must be at least max(1, n).
 * B      is a double precision complex array of dimensions (ldb, n). ldb must
 *        be at least max(1, m). The leading m x n part of the array B must
 *        contain the right-hand side matrix B. On exit B is overwritten
 *        by the solution matrix X.
 * ldb    is the leading dimension of the two dimensional array containing B.
 *        ldb must be at least max(1, m).
 *
 * Output
 * ------
 * B      contains the solution matrix X satisfying op(A) * X = alpha * B,
 *        or X * op(A) = alpha * B
 *
 * Reference: http://www.netlib.org/blas/ztrsm.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if m or n < 0
 * CUBLAS_STATUS_ARCH_MISMATCH    if invoked on device without DP support
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasZtrsm (char side, char uplo, char transa,
                            char diag, int m, int n, cuDoubleComplex alpha,
                            const cuDoubleComplex *A, int lda,
                            cuDoubleComplex *B, int ldb);
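For illustration (not header content): solving A * X = B for X with an m x m
upper-triangular, non-unit-diagonal A, where dA and dB are assumed device
arrays previously uploaded with cublasSetMatrix.

    cuDoubleComplex one = make_cuDoubleComplex (1.0, 0.0);
    cublasZtrsm ('L', 'U', 'N', 'N', m, n, one, dA, m, dB, m); /* B <- X */
    if (cublasGetError () != CUBLAS_STATUS_SUCCESS)
        fprintf (stderr, "cublasZtrsm failed\n");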
/*
 * void
 * cublasDtrmm (char side, char uplo, char transa, char diag, int m, int n,
 *              double alpha, const double *A, int lda, const double *B, int ldb)
 *
 * performs one of the matrix-matrix operations
 *
 *    B = alpha * op(A) * B,  or  B = alpha * B * op(A)
 *
 * where alpha is a double-precision scalar, B is an m x n matrix composed
 * of double precision elements, and A is a unit or non-unit, upper or lower,
 * triangular matrix composed of double precision elements. op(A) is one of

skipping to change at line 3813 (old) / 4023 (new)

 * CUBLAS_STATUS_INVALID_VALUE    if n < 0 or k < 0
 * CUBLAS_STATUS_ARCH_MISMATCH    if invoked on device without DP support
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasDsyrk (char uplo, char trans, int n, int k,
                            double alpha, const double *A, int lda,
                            double beta, double *C, int ldc);
/*
 * void
 * cublasZsyrk (char uplo, char trans, int n, int k, cuDoubleComplex alpha,
 *              const cuDoubleComplex *A, int lda, cuDoubleComplex beta,
 *              cuDoubleComplex *C, int ldc)
 *
 * performs one of the symmetric rank k operations
 *
 *    C = alpha * A * transpose(A) + beta * C, or
 *    C = alpha * transpose(A) * A + beta * C.
 *
 * Alpha and beta are double precision complex scalars. C is an n x n
 * symmetric matrix consisting of double precision complex elements and
 * stored in either lower or upper storage mode. A is a matrix consisting of
 * double precision complex elements with dimension of n x k in the first
 * case, and k x n in the second case.
 *
 * Input
 * -----
 * uplo   specifies whether the symmetric matrix C is stored in upper or lower
 *        storage mode as follows. If uplo == 'U' or 'u', only the upper
 *        triangular part of the symmetric matrix is to be referenced, and the
 *        elements of the strictly lower triangular part are to be inferred
 *        from those in the upper triangular part. If uplo == 'L' or 'l', only
 *        the lower triangular part of the symmetric matrix is to be
 *        referenced, and the elements of the strictly upper triangular part
 *        are to be inferred from those in the lower triangular part.
 * trans  specifies the operation to be performed. If trans == 'N' or 'n',
 *        C = alpha * A * transpose(A) + beta * C. If trans == 'T', 't', 'C',
 *        or 'c', C = alpha * transpose(A) * A + beta * C.
 * n      specifies the number of rows and the number of columns of matrix C.
 *        If trans == 'N' or 'n', n specifies the number of rows of matrix A.
 *        If trans == 'T', 't', 'C', or 'c', n specifies the number of columns
 *        of matrix A. n must be at least zero.
 * k      If trans == 'N' or 'n', k specifies the number of columns of matrix
 *        A. If trans == 'T', 't', 'C', or 'c', k specifies the number of rows
 *        of matrix A. k must be at least zero.
 * alpha  double precision complex scalar multiplier applied to
 *        A * transpose(A) or transpose(A) * A.
 * A      double precision complex array of dimensions (lda, ka), where ka is
 *        k when trans == 'N' or 'n', and is n otherwise. When trans == 'N'
 *        or 'n', the leading n x k part of array A must contain the matrix A,
 *        otherwise the leading k x n part of the array must contain the
 *        matrix A.
 * lda    leading dimension of A. When trans == 'N' or 'n' then lda must be at
 *        least max(1, n). Otherwise lda must be at least max(1, k).
 * beta   double precision complex scalar multiplier applied to C. If beta is
 *        zero, C does not have to be a valid input.
 * C      double precision complex array of dimensions (ldc, n). If uplo =
 *        'U' or 'u', the leading n x n triangular part of the array C must
 *        contain the upper triangular part of the symmetric matrix C and the
 *        strictly lower triangular part of C is not referenced. On exit, the
 *        upper triangular part of C is overwritten by the upper triangular
 *        part of the updated matrix. If uplo = 'L' or 'l', the leading n x n
 *        triangular part of the array C must contain the lower triangular
 *        part of the symmetric matrix C and the strictly upper triangular
 *        part of C is not referenced. On exit, the lower triangular part of C
 *        is overwritten by the lower triangular part of the updated matrix.
 * ldc    leading dimension of C. It must be at least max(1, n).
 *
 * Output
 * ------
 * C      updated according to C = alpha * A * transpose(A) + beta * C, or
 *        C = alpha * transpose(A) * A + beta * C
 *
 * Reference: http://www.netlib.org/blas/zsyrk.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if n < 0 or k < 0
 * CUBLAS_STATUS_ARCH_MISMATCH    if invoked on device without DP support
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasZsyrk (char uplo, char trans, int n, int k,
                            cuDoubleComplex alpha,
                            const cuDoubleComplex *A, int lda,
                            cuDoubleComplex beta,
                            cuDoubleComplex *C, int ldc);
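An illustrative call (again a fragment; dA is an assumed n x k device array
and dC an assumed n x n device array, both uploaded beforehand): a rank-k
update accumulating into the upper triangle of C.

    cuDoubleComplex one  = make_cuDoubleComplex (1.0, 0.0);
    cuDoubleComplex zero = make_cuDoubleComplex (0.0, 0.0);
    cublasZsyrk ('U', 'n', n, k, one, dA, n, zero, dC, n);
    if (cublasGetError () != CUBLAS_STATUS_SUCCESS)      /* C = A * A^T */
        fprintf (stderr, "cublasZsyrk failed\n");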
/*
 * void
 * cublasDsyr2k (char uplo, char trans, int n, int k, double alpha,
 *               const double *A, int lda, const double *B, int ldb,
 *               double beta, double *C, int ldc)
 *
 * performs one of the symmetric rank 2k operations
 *
 *    C = alpha * A * transpose(B) + alpha * B * transpose(A) + beta * C, or
 *    C = alpha * transpose(A) * B + alpha * transpose(B) * A + beta * C.
 *
 * Alpha and beta are double precision scalars. C is an n x n symmetric matrix

End of changes. 5 change blocks. 1 line changed or deleted, 354 lines changed or added.
cuda.h

/*
 * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws.  Users and possessors of this source code
 * are hereby granted a nonexclusive, royalty-free license to use this code
 * in individual and commercial software.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR

skipping to change at line 36

 * and is provided to the U.S. Government only as a commercial end item.
 * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
 * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
 * source code with only those rights set forth herein.
 *
 * Any use of this source code in individual and commercial software must
 * include, in the user documentation and internal comments to the code,
 * the above Disclaimer and U.S. Government End Users Notice.
 */
#ifndef __cuda_cuda_h__
#define __cuda_cuda_h__

#include <stdlib.h>

/**
 * \file
 * \name Data types used by CUDA driver
 * \author NVIDIA Corporation
 * \brief Data types used by CUDA driver
 */

/**
 * \defgroup CUDA_TYPES Data types used by CUDA driver
 * \ingroup CUDA_DRIVER
 * @{
 */

/**
 * CUDA API version number
 */
#define CUDA_VERSION 2020 /* 2.2 */

#ifdef __cplusplus
extern "C" {
#endif

typedef unsigned int CUdeviceptr;       ///< CUDA device pointer

typedef int CUdevice;                   ///< CUDA device
typedef struct CUctx_st *CUcontext;     ///< CUDA context
typedef struct CUmod_st *CUmodule;      ///< CUDA module
typedef struct CUfunc_st *CUfunction;   ///< CUDA function
typedef struct CUarray_st *CUarray;     ///< CUDA array
typedef struct CUtexref_st *CUtexref;   ///< CUDA texture reference
typedef struct CUevent_st *CUevent;     ///< CUDA event
typedef struct CUstream_st *CUstream;   ///< CUDA stream

/************************************
 **
 **    Enums
 **
 ***********************************/

/**
 * Context creation flags
 */
typedef enum CUctx_flags_enum {
    CU_CTX_SCHED_AUTO    = 0,   ///< Automatic scheduling
    CU_CTX_SCHED_SPIN    = 1,   ///< Set spin as default scheduling
    CU_CTX_SCHED_YIELD   = 2,   ///< Set yield as default scheduling
    CU_CTX_SCHED_MASK    = 0x3,
    CU_CTX_BLOCKING_SYNC = 4,   ///< Use blocking synchronization
    CU_CTX_MAP_HOST      = 8,   ///< Support mapped pinned allocations
    CU_CTX_FLAGS_MASK    = 0xf,
} CUctx_flags;
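As an illustration of these flags (a sketch only; cuCtxCreate, cuCtxDetach
and the device-management entry points are declared further down in this
header): creating a context that yields the CPU while waiting and supports
mapped pinned allocations.

    CUdevice  dev;
    CUcontext ctx;
    cuInit (0);                                   /* must be called first */
    cuDeviceGet (&dev, 0);                        /* first CUDA device    */
    cuCtxCreate (&ctx, CU_CTX_SCHED_YIELD | CU_CTX_MAP_HOST, dev);
    /* ... work ... */
    cuCtxDetach (ctx);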
/**
 * Event creation flags
 */
typedef enum CUevent_flags_enum {
    CU_EVENT_DEFAULT       = 0, ///< Default event flag
    CU_EVENT_BLOCKING_SYNC = 1, ///< Event uses blocking synchronization
} CUevent_flags;

/**
 * Array formats
 */
typedef enum CUarray_format_enum {
    CU_AD_FORMAT_UNSIGNED_INT8  = 0x01, ///< Unsigned 8-bit integers
    CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, ///< Unsigned 16-bit integers
    CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, ///< Unsigned 32-bit integers
    CU_AD_FORMAT_SIGNED_INT8    = 0x08, ///< Signed 8-bit integers
    CU_AD_FORMAT_SIGNED_INT16   = 0x09, ///< Signed 16-bit integers
    CU_AD_FORMAT_SIGNED_INT32   = 0x0a, ///< Signed 32-bit integers
    CU_AD_FORMAT_HALF           = 0x10, ///< 16-bit floating point
    CU_AD_FORMAT_FLOAT          = 0x20  ///< 32-bit floating point
} CUarray_format;

/**
 * Texture reference addressing modes
 */
typedef enum CUaddress_mode_enum {
    CU_TR_ADDRESS_MODE_WRAP   = 0,  ///< Wrapping address mode
    CU_TR_ADDRESS_MODE_CLAMP  = 1,  ///< Clamp to edge address mode
    CU_TR_ADDRESS_MODE_MIRROR = 2,  ///< Mirror address mode
} CUaddress_mode;

/**
 * Texture reference filtering modes
 */
typedef enum CUfilter_mode_enum {
    CU_TR_FILTER_MODE_POINT  = 0,   ///< Point filter mode
    CU_TR_FILTER_MODE_LINEAR = 1    ///< Linear filter mode
} CUfilter_mode;

/**
 * Device properties
 */
typedef enum CUdevice_attribute_enum {
    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1,        ///< Maximum number of threads per block
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,              ///< Maximum block dimension X
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3,              ///< Maximum block dimension Y
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4,              ///< Maximum block dimension Z
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5,               ///< Maximum grid dimension X
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6,               ///< Maximum grid dimension Y
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7,               ///< Maximum grid dimension Z
    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8,  ///< Maximum shared memory available per block in bytes
    CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8,      ///< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK
    CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9,        ///< Memory available on device for __constant__ variables in a CUDA C kernel in bytes
    CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,                   ///< Warp size in threads
    CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11,                   ///< Maximum pitch in bytes allowed by memory copies
    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12,     ///< Maximum number of 32-bit registers available per block
    CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12,         ///< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK
    CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,                  ///< Peak clock frequency in kilohertz
    CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14,           ///< Alignment requirement for textures

    CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,                 ///< Device can possibly copy memory and execute a kernel concurrently
    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16,        ///< Number of multiprocessors on device
    CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17,         ///< Specifies whether there is a run time limit on kernels
    CU_DEVICE_ATTRIBUTE_INTEGRATED = 18,                  ///< Device is integrated with host memory
    CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19,         ///< Device can map host memory into CUDA address space
    CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20                 ///< Compute mode (See ::CUcomputemode for details)
} CUdevice_attribute;
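For example (an illustrative fragment; cuDeviceGetAttribute is declared with
the other device-management functions further down in this header):

    int canMapHost = 0;
    CUdevice dev;
    cuDeviceGet (&dev, 0);                          /* first CUDA device */
    cuDeviceGetAttribute (&canMapHost,
                          CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
    if (canMapHost) {
        /* device supports CU_CTX_MAP_HOST / cuMemHostGetDevicePointer */
    }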
/**
 * Legacy device properties
 */
typedef struct CUdevprop_st {
    int maxThreadsPerBlock;     ///< Maximum number of threads per block
    int maxThreadsDim[3];       ///< Maximum size of each dimension of a block
    int maxGridSize[3];         ///< Maximum size of each dimension of a grid
    int sharedMemPerBlock;      ///< Shared memory available per block in bytes
    int totalConstantMemory;    ///< Constant memory available on device in bytes
    int SIMDWidth;              ///< Warp size in threads
    int memPitch;               ///< Maximum pitch in bytes allowed by memory copies
    int regsPerBlock;           ///< 32-bit registers available per block
    int clockRate;              ///< Clock frequency in kilohertz
    int textureAlign;           ///< Alignment requirement for textures
} CUdevprop;

/**
 * Function properties
 */
typedef enum CUfunction_attribute_enum {
    /**
     * The number of threads beyond which a launch of the function would fail.
     * This number depends on both the function and the device on which the
     * function is currently loaded.
     */
    CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,

    /**
     * The size in bytes of statically-allocated shared memory required by
     * this function. This does not include dynamically-allocated shared
     * memory requested by the user at runtime.
     */
    CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1,

    /**
     * The size in bytes of user-allocated constant memory required by this
     * function.
     */
    CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2,

    /**
     * The size in bytes of thread local memory used by this function.
     */
    CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3,

    /**
     * The number of registers used by each thread of this function.
     */
    CU_FUNC_ATTRIBUTE_NUM_REGS = 4,

    CU_FUNC_ATTRIBUTE_MAX
} CUfunction_attribute;
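An illustrative query (hfunc is an assumed CUfunction handle obtained via
cuModuleGetFunction, declared elsewhere in this header):

    int maxThreads = 0, numRegs = 0;
    cuFuncGetAttribute (&maxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, hfunc);
    cuFuncGetAttribute (&numRegs,    CU_FUNC_ATTRIBUTE_NUM_REGS,              hfunc);
    printf ("up to %d threads/block, %d regs/thread\n", maxThreads, numRegs);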
/**
 * Memory types
 */
typedef enum CUmemorytype_enum {
    CU_MEMORYTYPE_HOST   = 0x01,    ///< Host memory
    CU_MEMORYTYPE_DEVICE = 0x02,    ///< Device memory
    CU_MEMORYTYPE_ARRAY  = 0x03     ///< Array memory
} CUmemorytype;

/**
 * Compute Modes
 */
typedef enum CUcomputemode_enum {
    CU_COMPUTEMODE_DEFAULT    = 0,  ///< Default compute mode (Multiple contexts allowed per device)
    CU_COMPUTEMODE_EXCLUSIVE  = 1,  ///< Compute-exclusive mode (Only one context can be present on this device at a time)
    CU_COMPUTEMODE_PROHIBITED = 2   ///< Compute-prohibited mode (No contexts can be created on this device at this time)
} CUcomputemode;

/**
 * Online compiler options
 */
typedef enum CUjit_option_enum
{
    /**
     * Max number of registers that a thread may use.
     */
    CU_JIT_MAX_REGISTERS = 0,

    /**
     * IN: Specifies minimum number of threads per block to target compilation
     * for\n
     * OUT: Returns the number of threads the compiler actually targeted.
     * This restricts the resource utilization of the compiler (e.g. max
     * registers) such that a block with the given number of threads should be
     * able to launch based on register limitations. Note, this option does not
     * currently take into account any other resource limitations, such as
     * shared memory utilization.
     */
    CU_JIT_THREADS_PER_BLOCK,

    /**
     * Returns a float value in the option of the wall clock time, in
     * milliseconds, spent creating the cubin
     */
    CU_JIT_WALL_TIME,

    /**
     * Pointer to a buffer in which to print any log messages from PTXAS
     * that are informational in nature
     */
    CU_JIT_INFO_LOG_BUFFER,

    /**
     * IN: Log buffer size in bytes.  Log messages will be capped at this size
     * (including null terminator)\n
     * OUT: Amount of log buffer filled with messages
     */
    CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,

    /**
     * Pointer to a buffer in which to print any log messages from PTXAS that
     * reflect errors
     */
    CU_JIT_ERROR_LOG_BUFFER,

    /**
     * IN: Log buffer size in bytes.  Log messages will be capped at this size
     * (including null terminator)\n
     * OUT: Amount of log buffer filled with messages
     */
    CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,

    /**
     * Level of optimizations to apply to generated code (0 - 4), with 4
     * being the default and highest level of optimizations.
     */
    CU_JIT_OPTIMIZATION_LEVEL,

    /**
     * No option value required. Determines the target based on the current
     * attached context (default)
     */
    CU_JIT_TARGET_FROM_CUCONTEXT,

    /**
     * Target is chosen based on supplied CUjit_target_enum.
     */
    CU_JIT_TARGET,

    /**
     * Specifies choice of fallback strategy if matching cubin is not found.
     * Choice is based on supplied CUjit_fallback_enum.
     */
    CU_JIT_FALLBACK_STRATEGY

} CUjit_option;
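A sketch of passing JIT options when loading PTX. cuModuleLoadDataEx is
declared later in this header (not shown in this excerpt), and ptxImage is
an assumed null-terminated PTX string; numeric option values are passed by
value through the void* array.

    char         logBuf[4096];
    CUjit_option opts[] = { CU_JIT_ERROR_LOG_BUFFER,
                            CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES };
    void        *vals[] = { logBuf, (void*)(size_t)sizeof(logBuf) };
    CUmodule     mod;
    if (cuModuleLoadDataEx (&mod, ptxImage, 2, opts, vals) != CUDA_SUCCESS)
        fprintf (stderr, "JIT compilation failed:\n%s\n", logBuf);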
/**
 * Online compilation targets
 */
typedef enum CUjit_target_enum
{
    CU_TARGET_COMPUTE_10 = 0,   ///< Compute device class 1.0
    CU_TARGET_COMPUTE_11,       ///< Compute device class 1.1
    CU_TARGET_COMPUTE_12,       ///< Compute device class 1.2
    CU_TARGET_COMPUTE_13        ///< Compute device class 1.3
} CUjit_target;

/**
 * Cubin matching fallback strategies
 */
typedef enum CUjit_fallback_enum
{
    /** Prefer to compile ptx */
    CU_PREFER_PTX = 0,

    /** Prefer to fall back to compatible binary code */
    CU_PREFER_BINARY

} CUjit_fallback;

/************************************
 **
 **    Error codes
 **
 ***********************************/

/**
 * Error codes
 */
typedef enum cudaError_enum {

    CUDA_SUCCESS                    = 0,        ///< No errors
    CUDA_ERROR_INVALID_VALUE        = 1,        ///< Invalid value
    CUDA_ERROR_OUT_OF_MEMORY        = 2,        ///< Out of memory
    CUDA_ERROR_NOT_INITIALIZED      = 3,        ///< Driver not initialized
    CUDA_ERROR_DEINITIALIZED        = 4,        ///< Driver deinitialized

    CUDA_ERROR_NO_DEVICE            = 100,      ///< No CUDA-capable device available
    CUDA_ERROR_INVALID_DEVICE       = 101,      ///< Invalid device

    CUDA_ERROR_INVALID_IMAGE        = 200,      ///< Invalid kernel image
    CUDA_ERROR_INVALID_CONTEXT      = 201,      ///< Invalid context
    CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202,   ///< Context already current
    CUDA_ERROR_MAP_FAILED           = 205,      ///< Map failed
    CUDA_ERROR_UNMAP_FAILED         = 206,      ///< Unmap failed
    CUDA_ERROR_ARRAY_IS_MAPPED      = 207,      ///< Array is mapped
    CUDA_ERROR_ALREADY_MAPPED       = 208,      ///< Already mapped
    CUDA_ERROR_NO_BINARY_FOR_GPU    = 209,      ///< No binary for GPU
    CUDA_ERROR_ALREADY_ACQUIRED     = 210,      ///< Already acquired
    CUDA_ERROR_NOT_MAPPED           = 211,      ///< Not mapped

    CUDA_ERROR_INVALID_SOURCE       = 300,      ///< Invalid source
    CUDA_ERROR_FILE_NOT_FOUND       = 301,      ///< File not found

    CUDA_ERROR_INVALID_HANDLE       = 400,      ///< Invalid handle

    CUDA_ERROR_NOT_FOUND            = 500,      ///< Not found

    CUDA_ERROR_NOT_READY            = 600,      ///< CUDA not ready

    CUDA_ERROR_LAUNCH_FAILED        = 700,      ///< Launch failed
    CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701,   ///< Launch exceeded resources
    CUDA_ERROR_LAUNCH_TIMEOUT       = 702,      ///< Launch exceeded timeout
    CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703, ///< Launch with incompatible texturing

    CUDA_ERROR_UNKNOWN              = 999       ///< Unknown error
} CUresult;
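Every driver API entry point returns a CUresult, so host code typically wraps
calls in a checking helper. The macro below is an editorial sketch (not part
of the header) and is reused by the fragments that follow.

    #include <stdio.h>
    #include <stdlib.h>
    #define CHECK_CU(call)                                            \
        do {                                                          \
            CUresult err_ = (call);                                   \
            if (err_ != CUDA_SUCCESS) {                               \
                fprintf (stderr, "%s failed (CUresult %d)\n",         \
                         #call, (int)err_);                           \
                exit (EXIT_FAILURE);                                  \
            }                                                         \
        } while (0)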
/**
 * If set, host memory is portable between CUDA contexts.
 * Flag for ::cuMemHostAlloc()
 */
#define CU_MEMHOSTALLOC_PORTABLE        0x01

/**
 * If set, host memory is mapped into CUDA address space and
 * ::cuMemHostGetDevicePointer() may be called on the host pointer.
 * Flag for ::cuMemHostAlloc()
 */
#define CU_MEMHOSTALLOC_DEVICEMAP       0x02

/**
 * If set, host memory is allocated as write-combined - fast to write,
 * faster to DMA, slow to read except via SSE4 streaming load instruction
 * (MOVNTDQA).
 * Flag for ::cuMemHostAlloc()
 */
#define CU_MEMHOSTALLOC_WRITECOMBINED   0x04
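A sketch of the mapped-pinned-memory path: it requires a context created with
CU_CTX_MAP_HOST on a device reporting CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY.
cuMemHostAlloc, cuMemHostGetDevicePointer and cuMemFreeHost are declared later
in this header; CHECK_CU is the helper macro sketched above.

    float      *hbuf = 0;
    CUdeviceptr dbuf = 0;
    CHECK_CU (cuMemHostAlloc ((void**)&hbuf, 1024 * sizeof(float),
                              CU_MEMHOSTALLOC_DEVICEMAP));
    CHECK_CU (cuMemHostGetDevicePointer (&dbuf, hbuf, 0));
    /* Kernels can now access dbuf; the host sees the same bytes via hbuf. */
    CHECK_CU (cuMemFreeHost (hbuf));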
/**
 * 2D memory copy parameters
 */
typedef struct CUDA_MEMCPY2D_st {

    unsigned int srcXInBytes,   ///< Source X in bytes
                 srcY;          ///< Source Y
    CUmemorytype srcMemoryType; ///< Source memory type (host, device, array)
    const void *srcHost;        ///< Source host pointer
    CUdeviceptr srcDevice;      ///< Source device pointer
    CUarray srcArray;           ///< Source array reference
    unsigned int srcPitch;      ///< Source pitch (ignored when src is array)

    unsigned int dstXInBytes,   ///< Destination X in bytes
                 dstY;          ///< Destination Y
    CUmemorytype dstMemoryType; ///< Destination memory type (host, device, array)
    void *dstHost;              ///< Destination host pointer
    CUdeviceptr dstDevice;      ///< Destination device pointer
    CUarray dstArray;           ///< Destination array reference
    unsigned int dstPitch;      ///< Destination pitch (ignored when dst is array)

    unsigned int WidthInBytes;  ///< Width of 2D memory copy in bytes
    unsigned int Height;        ///< Height of 2D memory copy
} CUDA_MEMCPY2D;
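Filling out this struct for a host-to-device copy of a W x H float image into
a pitched allocation (an illustrative fragment: dptr and pitch are assumed to
come from cuMemAllocPitch, hostImage is an assumed tightly packed host buffer,
and cuMemcpy2D is declared later in this header; memset needs <string.h>).

    CUDA_MEMCPY2D cp;
    memset (&cp, 0, sizeof(cp));                /* zero all unused fields */
    cp.srcMemoryType = CU_MEMORYTYPE_HOST;
    cp.srcHost       = hostImage;
    cp.srcPitch      = W * sizeof(float);       /* tightly packed rows */
    cp.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    cp.dstDevice     = dptr;
    cp.dstPitch      = pitch;                   /* from cuMemAllocPitch */
    cp.WidthInBytes  = W * sizeof(float);
    cp.Height        = H;
    CHECK_CU (cuMemcpy2D (&cp));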
/**
 * 3D memory copy parameters
 */
typedef struct CUDA_MEMCPY3D_st {

    unsigned int srcXInBytes,   ///< Source X in bytes
                 srcY,          ///< Source Y
                 srcZ;          ///< Source Z
    unsigned int srcLOD;        ///< Source LOD
    CUmemorytype srcMemoryType; ///< Source memory type (host, device, array)
    const void *srcHost;        ///< Source host pointer
    CUdeviceptr srcDevice;      ///< Source device pointer
    CUarray srcArray;           ///< Source array reference
    void *reserved0;            ///< Must be NULL
    unsigned int srcPitch;      ///< Source pitch (ignored when src is array)
    unsigned int srcHeight;     ///< Source height (ignored when src is array; may be 0 if Depth==1)

    unsigned int dstXInBytes,   ///< Destination X in bytes
                 dstY,          ///< Destination Y
                 dstZ;          ///< Destination Z
    unsigned int dstLOD;        ///< Destination LOD
    CUmemorytype dstMemoryType; ///< Destination memory type (host, device, array)
    void *dstHost;              ///< Destination host pointer
    CUdeviceptr dstDevice;      ///< Destination device pointer
    CUarray dstArray;           ///< Destination array reference
    void *reserved1;            ///< Must be NULL
    unsigned int dstPitch;      ///< Destination pitch (ignored when dst is array)
    unsigned int dstHeight;     ///< Destination height (ignored when dst is array; may be 0 if Depth==1)

    unsigned int WidthInBytes;  ///< Width of 3D memory copy in bytes
    unsigned int Height;        ///< Height of 3D memory copy
    unsigned int Depth;         ///< Depth of 3D memory copy
} CUDA_MEMCPY3D;

/**
 * Array descriptor
 */
typedef struct
{
    unsigned int Width;         ///< Width of array
    unsigned int Height;        ///< Height of array

    CUarray_format Format;      ///< Array format

    unsigned int NumChannels;   ///< Channels per array element
} CUDA_ARRAY_DESCRIPTOR;

/**
 * 3D array descriptor
 */
typedef struct
{
    unsigned int Width;         ///< Width of 3D array
    unsigned int Height;        ///< Height of 3D array
    unsigned int Depth;         ///< Depth of 3D array

    CUarray_format Format;      ///< Array format

    unsigned int NumChannels;   ///< Channels per array element

    unsigned int Flags;         ///< Flags
} CUDA_ARRAY3D_DESCRIPTOR;

/**
 * Override the texref format with a format inferred from the array.
 * Flag for ::cuTexRefSetArray()
 */
#define CU_TRSA_OVERRIDE_FORMAT 0x01

/**
 * Read the texture as integers rather than promoting the values to floats
 * in the range [0,1].
 * Flag for ::cuTexRefSetFlags()
 */
#define CU_TRSF_READ_AS_INTEGER         0x01

/**
 * Use normalized texture coordinates in the range [0,1) instead of [0,dim).
 * Flag for ::cuTexRefSetFlags()
 */
#define CU_TRSF_NORMALIZED_COORDINATES  0x02

/**
 * For texture references loaded into the module, use default texunit from
 * texture reference.
 */
#define CU_PARAM_TR_DEFAULT -1

/** @} */
/** @} */ /* END CUDA_TYPES */

#ifdef _WIN32
#define CUDAAPI __stdcall
#else
#define CUDAAPI
#endif

/*********************************
 ** Initialization
 *********************************/
CUresult CUDAAPI cuInit(unsigned int Flags);
/*********************************
 ** Driver Version Query
 *********************************/
CUresult CUDAAPI cuDriverGetVersion(int *driverVersion);

/************************************
 **
 **    Device management
 **
 ***********************************/

CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal);
CUresult CUDAAPI cuDeviceGetCount(int *count);
CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev);
CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev);
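Putting the device-management entry points together (an illustrative fragment
using the CHECK_CU macro sketched earlier; printf needs <stdio.h>):

    int count = 0, i, major, minor;
    char name[256];
    CHECK_CU (cuInit (0));
    CHECK_CU (cuDeviceGetCount (&count));
    for (i = 0; i < count; i++) {
        CUdevice d;
        CHECK_CU (cuDeviceGet (&d, i));
        CHECK_CU (cuDeviceGetName (name, sizeof(name), d));
        CHECK_CU (cuDeviceComputeCapability (&major, &minor, d));
        printf ("device %d: %s (compute %d.%d)\n", i, name, major, minor);
    }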
| skipping to change at line 367 | | skipping to change at line 602 | |
| // size of biggest r/w to be performe
d by kernels on this memory | | // size of biggest r/w to be performe
d by kernels on this memory | |
| // 4, 8 or 16 bytes | | // 4, 8 or 16 bytes | |
| unsigned int ElementSizeBytes | | unsigned int ElementSizeBytes | |
| ); | | ); | |
| CUresult CUDAAPI cuMemFree(CUdeviceptr dptr); | | CUresult CUDAAPI cuMemFree(CUdeviceptr dptr); | |
| CUresult CUDAAPI cuMemGetAddressRange( CUdeviceptr *pbase, unsigned int
*psize, CUdeviceptr dptr ); | | CUresult CUDAAPI cuMemGetAddressRange( CUdeviceptr *pbase, unsigned int
*psize, CUdeviceptr dptr ); | |
| | | | |
| CUresult CUDAAPI cuMemAllocHost(void **pp, unsigned int bytesize); | | CUresult CUDAAPI cuMemAllocHost(void **pp, unsigned int bytesize); | |
| CUresult CUDAAPI cuMemFreeHost(void *p); | | CUresult CUDAAPI cuMemFreeHost(void *p); | |
| | | | |
|
| | | CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize, unsigned in | |
| | | t Flags ); | |
| | | | |
| | | CUresult CUDAAPI cuMemHostGetDevicePointer( CUdeviceptr *pdptr, void *p | |
| | | , unsigned int Flags ); | |
| | | | |
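cuMemHostAlloc and cuMemHostGetDevicePointer are the new zero-copy entry points: page-locked host memory can be aliased into the device address space. A sketch, assuming the CU_MEMHOSTALLOC_DEVICEMAP flag introduced alongside these calls and a context created with host mapping enabled:

    #include <cuda.h>

    /* Allocate n floats of mapped, page-locked host memory; also returns the
     * device-side alias in *d_buf. Free with cuMemFreeHost(). */
    static float *alloc_mapped(size_t n, CUdeviceptr *d_buf)
    {
        void *h_buf = NULL;
        if (cuMemHostAlloc(&h_buf, n * sizeof(float),
                           CU_MEMHOSTALLOC_DEVICEMAP) != CUDA_SUCCESS)
            return NULL;
        if (cuMemHostGetDevicePointer(d_buf, h_buf, 0) != CUDA_SUCCESS) {
            cuMemFreeHost(h_buf);               /* Flags above must be 0 */
            return NULL;
        }
        return (float *)h_buf;   /* kernel writes via *d_buf land here */
    }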
| /************************************ | | /************************************ | |
| ** | | ** | |
| ** Synchronous Memcpy | | ** Synchronous Memcpy | |
| ** | | ** | |
| ** Intra-device memcpy's done with these functions may execute in para
llel with the CPU, | | ** Intra-device memcpy's done with these functions may execute in para
llel with the CPU, | |
| ** but if host memory is involved, they wait until the copy is done be
fore returning. | | ** but if host memory is involved, they wait until the copy is done be
fore returning. | |
| ** | | ** | |
| ***********************************/ | | ***********************************/ | |
| | | | |
| // 1D functions | | // 1D functions | |
| | | | |
| skipping to change at line 397 | | skipping to change at line 636 | |
| | | | |
| // system <-> array memory | | // system <-> array memory | |
| CUresult CUDAAPI cuMemcpyHtoA( CUarray dstArray, unsigned int dstI
ndex, const void *pSrc, unsigned int ByteCount ); | | CUresult CUDAAPI cuMemcpyHtoA( CUarray dstArray, unsigned int dstI
ndex, const void *pSrc, unsigned int ByteCount ); | |
| CUresult CUDAAPI cuMemcpyAtoH( void *dstHost, CUarray srcArray, un
signed int srcIndex, unsigned int ByteCount ); | | CUresult CUDAAPI cuMemcpyAtoH( void *dstHost, CUarray srcArray, un
signed int srcIndex, unsigned int ByteCount ); | |
| | | | |
| // array <-> array memory | | // array <-> array memory | |
| CUresult CUDAAPI cuMemcpyAtoA( CUarray dstArray, unsigned int dstI
ndex, CUarray srcArray, unsigned int srcIndex, unsigned int ByteCount ); | | CUresult CUDAAPI cuMemcpyAtoA( CUarray dstArray, unsigned int dstI
ndex, CUarray srcArray, unsigned int srcIndex, unsigned int ByteCount ); | |
| | | | |
| // 2D memcpy | | // 2D memcpy | |
| | | | |
|
| typedef struct CUDA_MEMCPY2D_st { | | | |
| | | | |
| unsigned int srcXInBytes, srcY; | | | |
| CUmemorytype srcMemoryType; | | | |
| const void *srcHost; | | | |
| CUdeviceptr srcDevice; | | | |
| CUarray srcArray; | | | |
| unsigned int srcPitch; // ignored when src is array | | | |
| | | | |
| unsigned int dstXInBytes, dstY; | | | |
| CUmemorytype dstMemoryType; | | | |
| void *dstHost; | | | |
| CUdeviceptr dstDevice; | | | |
| CUarray dstArray; | | | |
| unsigned int dstPitch; // ignored when dst is array | | | |
| | | | |
| unsigned int WidthInBytes; | | | |
| unsigned int Height; | | | |
| } CUDA_MEMCPY2D; | | | |
| CUresult CUDAAPI cuMemcpy2D( const CUDA_MEMCPY2D *pCopy ); | | CUresult CUDAAPI cuMemcpy2D( const CUDA_MEMCPY2D *pCopy ); | |
| CUresult CUDAAPI cuMemcpy2DUnaligned( const CUDA_MEMCPY2D *pCopy )
; | | CUresult CUDAAPI cuMemcpy2DUnaligned( const CUDA_MEMCPY2D *pCopy )
; | |
| | | | |
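The descriptor struct moved out of this spot (see the removed block in the left column), but cuMemcpy2D is still driven entirely by it. A sketch of a host-to-array copy using the fields shown there:

    #include <string.h>
    #include <cuda.h>

    /* Copy a width x height float image from pitched host memory into a CUDA
     * array assumed to have a matching format. */
    static CUresult copy_host_to_array(CUarray hArray, const float *src,
                                       unsigned int width, unsigned int height,
                                       unsigned int srcPitchBytes)
    {
        CUDA_MEMCPY2D cp;
        memset(&cp, 0, sizeof(cp));         /* zeroes srcXInBytes, srcY, ... */

        cp.srcMemoryType = CU_MEMORYTYPE_HOST;
        cp.srcHost       = src;
        cp.srcPitch      = srcPitchBytes;

        cp.dstMemoryType = CU_MEMORYTYPE_ARRAY;
        cp.dstArray      = hArray;          /* dstPitch ignored for arrays */

        cp.WidthInBytes  = width * (unsigned int)sizeof(float);
        cp.Height        = height;
        return cuMemcpy2D(&cp);
    }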
| // 3D memcpy | | // 3D memcpy | |
| | | | |
|
| typedef struct CUDA_MEMCPY3D_st { | | | |
| | | | |
| unsigned int srcXInBytes, srcY, srcZ; | | | |
| unsigned int srcLOD; | | | |
| CUmemorytype srcMemoryType; | | | |
| const void *srcHost; | | | |
| CUdeviceptr srcDevice; | | | |
| CUarray srcArray; | | | |
| void *reserved0; // must be NULL | | | |
| unsigned int srcPitch; // ignored when src is array | | | |
| unsigned int srcHeight; // ignored when src is array; may b | | | |
| e 0 if Depth==1 | | | |
| | | | |
| unsigned int dstXInBytes, dstY, dstZ; | | | |
| unsigned int dstLOD; | | | |
| CUmemorytype dstMemoryType; | | | |
| void *dstHost; | | | |
| CUdeviceptr dstDevice; | | | |
| CUarray dstArray; | | | |
| void *reserved1; // must be NULL | | | |
| unsigned int dstPitch; // ignored when dst is array | | | |
| unsigned int dstHeight; // ignored when dst is array; may b | | | |
| e 0 if Depth==1 | | | |
| | | | |
| unsigned int WidthInBytes; | | | |
| unsigned int Height; | | | |
| unsigned int Depth; | | | |
| } CUDA_MEMCPY3D; | | | |
| CUresult CUDAAPI cuMemcpy3D( const CUDA_MEMCPY3D *pCopy ); | | CUresult CUDAAPI cuMemcpy3D( const CUDA_MEMCPY3D *pCopy ); | |
| | | | |
| /************************************ | | /************************************ | |
| ** | | ** | |
| ** Asynchronous Memcpy | | ** Asynchronous Memcpy | |
| ** | | ** | |
| ** Any host memory involved must be DMA'able (e.g., allocated with cuM
emAllocHost). | | ** Any host memory involved must be DMA'able (e.g., allocated with cuM
emAllocHost). | |
| ** memcpy's done with these functions execute in parallel with the CPU
and, if | | ** memcpy's done with these functions execute in parallel with the CPU
and, if | |
| ** the hardware is available, may execute in parallel with the GPU. | | ** the hardware is available, may execute in parallel with the GPU. | |
| ** Asynchronous memcpy must be accompanied by appropriate stream synch
ronization. | | ** Asynchronous memcpy must be accompanied by appropriate stream synch
ronization. | |
| | | | |
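A sketch of the discipline the comment above asks for: the host buffer comes from one of the page-locked allocators, and the stream is synchronized before the buffer is reused. cuMemcpyHtoDAsync and cuStreamSynchronize are assumed from the elided 1D and stream sections of this header:

    #include <cuda.h>

    /* Queue an async host-to-device copy, then wait on the stream. The host
     * pointer must come from cuMemAllocHost()/cuMemHostAlloc(). */
    static CUresult stage_async(CUdeviceptr dst, void *pinned,
                                unsigned int bytes, CUstream hStream)
    {
        CUresult r = cuMemcpyHtoDAsync(dst, pinned, bytes, hStream);
        if (r != CUDA_SUCCESS)
            return r;
        /* ... launch kernels on hStream that consume dst ... */
        return cuStreamSynchronize(hStream);
    }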
| skipping to change at line 500 | | skipping to change at line 694 | |
| CUresult CUDAAPI cuMemsetD2D32( CUdeviceptr dstDevice, unsigned in
t dstPitch, unsigned int ui, unsigned int Width, unsigned int Height ); | | CUresult CUDAAPI cuMemsetD2D32( CUdeviceptr dstDevice, unsigned in
t dstPitch, unsigned int ui, unsigned int Width, unsigned int Height ); | |
| | | | |
| /************************************ | | /************************************ | |
| ** | | ** | |
| ** Function management | | ** Function management | |
| ** | | ** | |
| ***********************************/ | | ***********************************/ | |
| | | | |
| CUresult CUDAAPI cuFuncSetBlockShape (CUfunction hfunc, int x, int y, i
nt z); | | CUresult CUDAAPI cuFuncSetBlockShape (CUfunction hfunc, int x, int y, i
nt z); | |
| CUresult CUDAAPI cuFuncSetSharedSize (CUfunction hfunc, unsigned int by
tes); | | CUresult CUDAAPI cuFuncSetSharedSize (CUfunction hfunc, unsigned int by
tes); | |
|
| | | CUresult CUDAAPI cuFuncGetAttribute (int *pi, CUfunction_attribute attr
ib, CUfunction hfunc); | |
| | | | |
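The new cuFuncGetAttribute lets a loader size its launches against a kernel's actual limits. A sketch, assuming the CU_FUNC_ATTRIBUTE_* enumerants that accompany this function in the type section:

    #include <cuda.h>

    /* Clamp the block width to what the kernel can actually run with. */
    static CUresult shape_for(CUfunction hfunc)
    {
        int maxThreads = 0;
        CUresult r = cuFuncGetAttribute(&maxThreads,
                                        CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                                        hfunc);
        if (r != CUDA_SUCCESS)
            return r;
        return cuFuncSetBlockShape(hfunc, maxThreads < 256 ? maxThreads : 256, 1, 1);
    }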
| /************************************ | | /************************************ | |
| ** | | ** | |
| ** Array management | | ** Array management | |
| ** | | ** | |
| ***********************************/ | | ***********************************/ | |
| | | | |
|
| typedef struct | | | |
| { | | | |
| // | | | |
| // dimensions | | | |
| // | | | |
| unsigned int Width; | | | |
| unsigned int Height; | | | |
| | | | |
| // | | | |
| // format | | | |
| // | | | |
| CUarray_format Format; | | | |
| | | | |
| // channels per array element | | | |
| unsigned int NumChannels; | | | |
| } CUDA_ARRAY_DESCRIPTOR; | | | |
| | | | |
| CUresult CUDAAPI cuArrayCreate( CUarray *pHandle, const CUDA_ARRAY_DES
CRIPTOR *pAllocateArray ); | | CUresult CUDAAPI cuArrayCreate( CUarray *pHandle, const CUDA_ARRAY_DES
CRIPTOR *pAllocateArray ); | |
| CUresult CUDAAPI cuArrayGetDescriptor( CUDA_ARRAY_DESCRIPTOR *pArrayDe
scriptor, CUarray hArray ); | | CUresult CUDAAPI cuArrayGetDescriptor( CUDA_ARRAY_DESCRIPTOR *pArrayDe
scriptor, CUarray hArray ); | |
| CUresult CUDAAPI cuArrayDestroy( CUarray hArray ); | | CUresult CUDAAPI cuArrayDestroy( CUarray hArray ); | |
| | | | |
|
| typedef struct | | | |
| { | | | |
| // | | | |
| // dimensions | | | |
| // | | | |
| unsigned int Width; | | | |
| unsigned int Height; | | | |
| unsigned int Depth; | | | |
| // | | | |
| // format | | | |
| // | | | |
| CUarray_format Format; | | | |
| | | | |
| // channels per array element | | | |
| unsigned int NumChannels; | | | |
| // | | | |
| // flags | | | |
| // | | | |
| unsigned int Flags; | | | |
| | | | |
| } CUDA_ARRAY3D_DESCRIPTOR; | | | |
| CUresult CUDAAPI cuArray3DCreate( CUarray *pHandle, const CUDA_ARRAY3D
_DESCRIPTOR *pAllocateArray ); | | CUresult CUDAAPI cuArray3DCreate( CUarray *pHandle, const CUDA_ARRAY3D
_DESCRIPTOR *pAllocateArray ); | |
| CUresult CUDAAPI cuArray3DGetDescriptor( CUDA_ARRAY3D_DESCRIPTOR *pArr
ayDescriptor, CUarray hArray ); | | CUresult CUDAAPI cuArray3DGetDescriptor( CUDA_ARRAY3D_DESCRIPTOR *pArr
ayDescriptor, CUarray hArray ); | |
| | | | |
| /************************************ | | /************************************ | |
| ** | | ** | |
| ** Texture reference management | | ** Texture reference management | |
| ** | | ** | |
| ***********************************/ | | ***********************************/ | |
| CUresult CUDAAPI cuTexRefCreate( CUtexref *pTexRef ); | | CUresult CUDAAPI cuTexRefCreate( CUtexref *pTexRef ); | |
| CUresult CUDAAPI cuTexRefDestroy( CUtexref hTexRef ); | | CUresult CUDAAPI cuTexRefDestroy( CUtexref hTexRef ); | |
| | | | |
| CUresult CUDAAPI cuTexRefSetArray( CUtexref hTexRef, CUarray hArray, u
nsigned int Flags ); | | CUresult CUDAAPI cuTexRefSetArray( CUtexref hTexRef, CUarray hArray, u
nsigned int Flags ); | |
|
| // override the texref format with a format inferred from the array | | | |
| #define CU_TRSA_OVERRIDE_FORMAT 0x01 | | | |
| CUresult CUDAAPI cuTexRefSetAddress( unsigned int *ByteOffset, CUtexre
f hTexRef, CUdeviceptr dptr, unsigned int bytes ); | | CUresult CUDAAPI cuTexRefSetAddress( unsigned int *ByteOffset, CUtexre
f hTexRef, CUdeviceptr dptr, unsigned int bytes ); | |
|
| | | CUresult CUDAAPI cuTexRefSetAddress2D( CUtexref hTexRef, const CUDA_AR
RAY_DESCRIPTOR *desc, CUdeviceptr dptr, unsigned int Pitch); | |
| CUresult CUDAAPI cuTexRefSetFormat( CUtexref hTexRef, CUarray_format f
mt, int NumPackedComponents ); | | CUresult CUDAAPI cuTexRefSetFormat( CUtexref hTexRef, CUarray_format f
mt, int NumPackedComponents ); | |
|
| | | | |
| CUresult CUDAAPI cuTexRefSetAddressMode( CUtexref hTexRef, int dim, CU
address_mode am ); | | CUresult CUDAAPI cuTexRefSetAddressMode( CUtexref hTexRef, int dim, CU
address_mode am ); | |
| CUresult CUDAAPI cuTexRefSetFilterMode( CUtexref hTexRef, CUfilter_mod
e fm ); | | CUresult CUDAAPI cuTexRefSetFilterMode( CUtexref hTexRef, CUfilter_mod
e fm ); | |
| CUresult CUDAAPI cuTexRefSetFlags( CUtexref hTexRef, unsigned int Flag
s ); | | CUresult CUDAAPI cuTexRefSetFlags( CUtexref hTexRef, unsigned int Flag
s ); | |
|
| // read the texture as integers rather than promoting the values | | | |
| // to floats in the range [0,1] | | | |
| #define CU_TRSF_READ_AS_INTEGER 0x01 | | | |
| | | | |
| // use normalized texture coordinates in the range [0,1) instead of | | | |
| [0,dim) | | | |
| #define CU_TRSF_NORMALIZED_COORDINATES 0x02 | | | |
| | | | |
| CUresult CUDAAPI cuTexRefGetAddress( CUdeviceptr *pdptr, CUtexref hTex
Ref ); | | CUresult CUDAAPI cuTexRefGetAddress( CUdeviceptr *pdptr, CUtexref hTex
Ref ); | |
| CUresult CUDAAPI cuTexRefGetArray( CUarray *phArray, CUtexref hTexRef
); | | CUresult CUDAAPI cuTexRefGetArray( CUarray *phArray, CUtexref hTexRef
); | |
| CUresult CUDAAPI cuTexRefGetAddressMode( CUaddress_mode *pam, CUtexref
hTexRef, int dim ); | | CUresult CUDAAPI cuTexRefGetAddressMode( CUaddress_mode *pam, CUtexref
hTexRef, int dim ); | |
| CUresult CUDAAPI cuTexRefGetFilterMode( CUfilter_mode *pfm, CUtexref h
TexRef ); | | CUresult CUDAAPI cuTexRefGetFilterMode( CUfilter_mode *pfm, CUtexref h
TexRef ); | |
| CUresult CUDAAPI cuTexRefGetFormat( CUarray_format *pFormat, int *pNum
Channels, CUtexref hTexRef ); | | CUresult CUDAAPI cuTexRefGetFormat( CUarray_format *pFormat, int *pNum
Channels, CUtexref hTexRef ); | |
| CUresult CUDAAPI cuTexRefGetFlags( unsigned int *pFlags, CUtexref hTex
Ref ); | | CUresult CUDAAPI cuTexRefGetFlags( unsigned int *pFlags, CUtexref hTex
Ref ); | |
| | | | |
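Taken together with the CU_TRSA_*/CU_TRSF_* flags that moved to the type section, a texture reference is configured in one short run of calls. A sketch, assuming cuModuleGetTexRef from the module-management section and a texref named "tex" (hypothetical) inside the loaded cubin:

    #include <cuda.h>

    static CUresult bind_array_texture(CUmodule hMod, CUarray hArray)
    {
        CUtexref hTexRef;
        CUresult r = cuModuleGetTexRef(&hTexRef, hMod, "tex");
        if (r != CUDA_SUCCESS)
            return r;
        /* Let the array dictate the texel format, then set sampling state. */
        cuTexRefSetArray(hTexRef, hArray, CU_TRSA_OVERRIDE_FORMAT);
        cuTexRefSetAddressMode(hTexRef, 0, CU_TR_ADDRESS_MODE_CLAMP);
        cuTexRefSetFilterMode(hTexRef, CU_TR_FILTER_MODE_LINEAR);
        return cuTexRefSetFlags(hTexRef, CU_TRSF_NORMALIZED_COORDINATES);
    }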
| /************************************ | | /************************************ | |
| ** | | ** | |
| ** Parameter management | | ** Parameter management | |
| ** | | ** | |
| ***********************************/ | | ***********************************/ | |
| | | | |
| CUresult CUDAAPI cuParamSetSize (CUfunction hfunc, unsigned int numbyt
es); | | CUresult CUDAAPI cuParamSetSize (CUfunction hfunc, unsigned int numbyt
es); | |
| CUresult CUDAAPI cuParamSeti (CUfunction hfunc, int offset, unsigne
d int value); | | CUresult CUDAAPI cuParamSeti (CUfunction hfunc, int offset, unsigne
d int value); | |
| CUresult CUDAAPI cuParamSetf (CUfunction hfunc, int offset, float v
alue); | | CUresult CUDAAPI cuParamSetf (CUfunction hfunc, int offset, float v
alue); | |
| CUresult CUDAAPI cuParamSetv (CUfunction hfunc, int offset, void *
ptr, unsigned int numbytes); | | CUresult CUDAAPI cuParamSetv (CUfunction hfunc, int offset, void *
ptr, unsigned int numbytes); | |
| CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtex
ref hTexRef); | | CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtex
ref hTexRef); | |
|
| // for texture references loaded into the module, | | | |
| // use default texunit from texture reference | | | |
| #define CU_PARAM_TR_DEFAULT -1 | | | |
| | | | |
| /************************************ | | /************************************ | |
| ** | | ** | |
| ** Launch functions | | ** Launch functions | |
| ** | | ** | |
| ***********************************/ | | ***********************************/ | |
| | | | |
| CUresult CUDAAPI cuLaunch ( CUfunction f ); | | CUresult CUDAAPI cuLaunch ( CUfunction f ); | |
| CUresult CUDAAPI cuLaunchGrid (CUfunction f, int grid_width, int grid_h
eight); | | CUresult CUDAAPI cuLaunchGrid (CUfunction f, int grid_width, int grid_h
eight); | |
| CUresult CUDAAPI cuLaunchGridAsync( CUfunction f, int grid_width, int g
rid_height, CUstream hStream ); | | CUresult CUDAAPI cuLaunchGridAsync( CUfunction f, int grid_width, int g
rid_height, CUstream hStream ); | |
| | | | |
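The parameter and launch calls compose as a manual calling convention: push each argument at an explicit offset, declare the total size, then launch. A sketch for a kernel taking (CUdeviceptr, int):

    #include <cuda.h>

    static CUresult launch_fill(CUfunction hfunc, CUdeviceptr dst, int n)
    {
        int offset = 0;
        /* Offsets are advanced by hand; keep each argument aligned. */
        cuParamSetv(hfunc, offset, &dst, sizeof(dst)); offset += sizeof(dst);
        cuParamSeti(hfunc, offset, n);                 offset += sizeof(int);
        cuParamSetSize(hfunc, offset);

        cuFuncSetBlockShape(hfunc, 256, 1, 1);
        return cuLaunchGrid(hfunc, (n + 255) / 256, 1);
    }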
End of changes. 61 change blocks. |
| 278 lines changed or deleted | | 457 lines changed or added | |
|
| cuda_runtime.h | | cuda_runtime.h | |
| /* | | /* | |
|
| * Copyright 1993-2008 NVIDIA Corporation. All rights reserved. | | * Copyright 1993-2009 NVIDIA Corporation. All rights reserved. | |
| * | | * | |
| * NOTICE TO USER: | | * NOTICE TO USER: | |
| * | | * | |
| * This source code is subject to NVIDIA ownership rights under U.S. and | | * This source code is subject to NVIDIA ownership rights under U.S. and | |
| * international Copyright laws. Users and possessors of this source code | | * international Copyright laws. Users and possessors of this source code | |
| * are hereby granted a nonexclusive, royalty-free license to use this code | | * are hereby granted a nonexclusive, royalty-free license to use this code | |
| * in individual and commercial software. | | * in individual and commercial software. | |
| * | | * | |
| * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | |
| * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | |
| | | | |
| skipping to change at line 77 | | skipping to change at line 77 | |
| #endif /* __CUDACC__ */ | | #endif /* __CUDACC__ */ | |
| | | | |
| #if defined(__cplusplus) | | #if defined(__cplusplus) | |
| | | | |
| /**************************************************************************
***** | | /**************************************************************************
***** | |
| *
* | | *
* | |
| *
* | | *
* | |
| *
* | | *
* | |
| ***************************************************************************
****/ | | ***************************************************************************
****/ | |
| | | | |
|
| | | /** | |
| | | * \ingroup CUDART_HIGHLEVEL | |
| | | * \brief \hl Push an argument onto the execution stack | |
| | | * | |
| | | * Pushes sizeof(T) bytes of the argument \p arg at \p offset | |
| | | * bytes from the start of the parameter passing area, which starts at | |
| | | * offset 0. The arguments are stored in the top of the execution stack. | |
| | | * \ref ::cudaSetupArgument(T,size_t) "cudaSetupArgument()" must be precede | |
| | | d | |
| | | * by a call to ::cudaConfigureCall(). | |
| | | * | |
| | | * \param arg - Argument to push for a kernel launch | |
| | | * \param offset - Offset in argument stack to push new arg | |
| | | * | |
| | | * \return | |
| | | * ::cudaSuccess | |
| | | * \notefnerr | |
| | | * | |
| | | * \sa \ref ::cudaLaunch(T*) "cudaLaunch (C++ API)", | |
| | | * \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument | |
| | | (C API)" | |
| | | * ::cudaConfigureCall | |
| | | */ | |
| template<class T> | | template<class T> | |
| __inline__ __host__ cudaError_t cudaSetupArgument( | | __inline__ __host__ cudaError_t cudaSetupArgument( | |
| T arg, | | T arg, | |
| size_t offset | | size_t offset | |
| ) | | ) | |
| { | | { | |
| return cudaSetupArgument((const void*)&arg, sizeof(T), offset); | | return cudaSetupArgument((const void*)&arg, sizeof(T), offset); | |
| } | | } | |
| | | | |
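The template above is one third of the hand-rolled form of a <<<...>>> launch. An illustrative sketch (the kernel name and signature are hypothetical):

    __global__ void kernel(float *p, int n);

    cudaError_t launch_by_hand(float *devPtr, int n, dim3 grid, dim3 block)
    {
        cudaError_t err = cudaConfigureCall(grid, block);
        if (err != cudaSuccess) return err;
        cudaSetupArgument(devPtr, 0);           // pointer at offset 0
        cudaSetupArgument(n, sizeof(devPtr));   // int packed after it
        return cudaLaunch(kernel);              // C++ overload, see below
    }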
| #if defined(__CUDACC__) | | #if defined(__CUDACC__) | |
| | | | |
| /**************************************************************************
***** | | /**************************************************************************
***** | |
| *
* | | *
* | |
| *
* | | *
* | |
| *
* | | *
* | |
| ***************************************************************************
****/ | | ***************************************************************************
****/ | |
| | | | |
|
| | | /** | |
| | | * \addtogroup CUDART_HIGHLEVEL | |
| | | * @{ | |
| | | */ | |
| | | | |
| static __inline__ __host__ cudaError_t cudaMemcpyToSymbol( | | static __inline__ __host__ cudaError_t cudaMemcpyToSymbol( | |
| char *symbol, | | char *symbol, | |
| const void *src, | | const void *src, | |
| size_t count, | | size_t count, | |
| size_t offset = 0, | | size_t offset = 0, | |
| enum cudaMemcpyKind kind = cudaMemcpyHostToDevice | | enum cudaMemcpyKind kind = cudaMemcpyHostToDevice | |
| ) | | ) | |
| { | | { | |
| return cudaMemcpyToSymbol((const char*)symbol, src, count, offset, kind); | | return cudaMemcpyToSymbol((const char*)symbol, src, count, offset, kind); | |
| } | | } | |
| | | | |
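A sketch of the overload in use: the address of a __device__ variable is passed through as the symbol, with offset and kind left at their defaults (the variable name is hypothetical):

    __device__ float scale;

    cudaError_t set_scale(float s)
    {
        // Defaults: offset = 0, kind = cudaMemcpyHostToDevice.
        return cudaMemcpyToSymbol((char *)&scale, &s, sizeof(s));
    }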
| skipping to change at line 204 | | skipping to change at line 230 | |
| } | | } | |
| | | | |
| static __inline__ __host__ cudaError_t cudaGetSymbolAddress( | | static __inline__ __host__ cudaError_t cudaGetSymbolAddress( | |
| void **devPtr, | | void **devPtr, | |
| char *symbol | | char *symbol | |
| ) | | ) | |
| { | | { | |
| return cudaGetSymbolAddress(devPtr, (const char*)symbol); | | return cudaGetSymbolAddress(devPtr, (const char*)symbol); | |
| } | | } | |
| | | | |
|
| | | /** | |
| | | * \brief \hl Finds the address associated with a CUDA symbol | |
| | | * | |
| | | * Returns in \p *devPtr the address of symbol \p symbol on the device. | |
| | | * \p symbol can either be a variable that resides in global memory space, | |
| | | or | |
| | | * it can be a character string, naming a variable that resides in global | |
| | | * memory space. If \p symbol cannot be found, or if \p symbol is not decla | |
| | | red | |
| | | * in the global memory space, \p *devPtr is unchanged and the error | |
| | | * ::cudaErrorInvalidSymbol is returned. | |
| | | * | |
| | | * \param devPtr - Return device pointer associated with symbol | |
| | | * \param symbol - Global variable or string symbol to search for | |
| | | * | |
| | | * \return | |
| | | * ::cudaSuccess, | |
| | | * ::cudaErrorInvalidSymbol, | |
| | | * ::cudaErrorAddressOfConstant | |
| | | * \notefnerr | |
| | | * | |
| | | * \sa \ref ::cudaGetSymbolAddress(void**, const char*) "cudaGetSymbolAddre | |
| | | ss (C API)" | |
| | | * \ref ::cudaGetSymbolSize(size_t*, const T&) "cudaGetSymbolSize (C++ API) | |
| | | " | |
| | | */ | |
| template<class T> | | template<class T> | |
| __inline__ __host__ cudaError_t cudaGetSymbolAddress( | | __inline__ __host__ cudaError_t cudaGetSymbolAddress( | |
| void **devPtr, | | void **devPtr, | |
| const T &symbol | | const T &symbol | |
| ) | | ) | |
| { | | { | |
| return cudaGetSymbolAddress(devPtr, (const char*)&symbol); | | return cudaGetSymbolAddress(devPtr, (const char*)&symbol); | |
| } | | } | |
| | | | |
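A sketch of the templated lookup: once the device address is out, the symbol behaves like any other device pointer (variable name hypothetical; cudaMemset assumed from the memory section):

    __device__ float table[256];

    cudaError_t zero_table(void)
    {
        void *d_table = 0;
        cudaError_t err = cudaGetSymbolAddress(&d_table, table);
        if (err != cudaSuccess) return err;
        return cudaMemset(d_table, 0, sizeof(table));   // all 256 floats
    }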
| /**************************************************************************
***** | | /**************************************************************************
***** | |
| | | | |
| skipping to change at line 227 | | skipping to change at line 275 | |
| ***************************************************************************
****/ | | ***************************************************************************
****/ | |
| | | | |
| static __inline__ __host__ cudaError_t cudaGetSymbolSize( | | static __inline__ __host__ cudaError_t cudaGetSymbolSize( | |
| size_t *size, | | size_t *size, | |
| char *symbol | | char *symbol | |
| ) | | ) | |
| { | | { | |
| return cudaGetSymbolSize(size, (const char*)symbol); | | return cudaGetSymbolSize(size, (const char*)symbol); | |
| } | | } | |
| | | | |
|
| | | /** | |
| | | * \brief \hl Finds the size of the object associated with a CUDA symbol | |
| | | * | |
| | | * Returns in \p *size the size of symbol \p symbol. \p symbol can either b | |
| | | e a | |
| | | * variable that resides in global or constant memory space, or it can be a | |
| | | * character string, naming a variable that resides in global or constant | |
| | | * memory space. If \p symbol cannot be found, or if \p symbol is not decla | |
| | | red | |
| | | * in global or constant memory space, \p *size is unchanged and the error | |
| | | * ::cudaErrorInvalidSymbol is returned. | |
| | | * | |
| | | * \param size - Size of object associated with symbol | |
| | | * \param symbol - Global variable or string symbol to find size of | |
| | | * | |
| | | * \return | |
| | | * ::cudaSuccess, | |
| | | * ::cudaErrorInvalidSymbol | |
| | | * \notefnerr | |
| | | * | |
| | | * \sa \ref ::cudaGetSymbolAddress(void**, const T&) "cudaGetSymbolAddress | |
| | | (C++ API)" | |
| | | * \ref ::cudaGetSymbolSize(size_t*, const char*) "cudaGetSymbolSize (C API | |
| | | )" | |
| | | */ | |
| template<class T> | | template<class T> | |
| __inline__ __host__ cudaError_t cudaGetSymbolSize( | | __inline__ __host__ cudaError_t cudaGetSymbolSize( | |
| size_t *size, | | size_t *size, | |
| const T &symbol | | const T &symbol | |
| ) | | ) | |
| { | | { | |
| return cudaGetSymbolSize(size, (const char*)&symbol); | | return cudaGetSymbolSize(size, (const char*)&symbol); | |
| } | | } | |
| | | | |
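And the matching size query, which pairs with the address lookup so host code can treat a symbol as an opaque (pointer, size) buffer (variable name hypothetical):

    __constant__ float coeffs[32];

    size_t coeffs_bytes(void)
    {
        size_t n = 0;
        return cudaGetSymbolSize(&n, coeffs) == cudaSuccess ? n : 0;
    }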
|
| | | /** @} */ /* END CUDART_HIGHLEVEL */ | |
| | | | |
| /**************************************************************************
***** | | /**************************************************************************
***** | |
| *
* | | *
* | |
| *
* | | *
* | |
| *
* | | *
* | |
| ***************************************************************************
****/ | | ***************************************************************************
****/ | |
| | | | |
|
| | | /** | |
| | | * \addtogroup CUDART_HIGHLEVEL | |
| | | * | |
| | | * @{ | |
| | | */ | |
| | | | |
| | | /** | |
| | | * \brief \hl Binds a memory area to a texture | |
| | | * | |
| | | * Binds \p size bytes of the memory area pointed to by \p devPtr to textur | |
| | | e | |
| | | * reference \p tex. \p desc describes how the memory is interpreted when | |
| | | * fetching values from the texture. The \p offset parameter is an optional | |
| | | * byte offset as with the low-level | |
| | | * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const vo | |
| | | id*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture()" | |
| | | * function. Any memory previously bound to \p tex is unbound. | |
| | | * | |
| | | * \param offset - Offset in bytes | |
| | | * \param tex - Texture to bind | |
| | | * \param devPtr - Memory area on device | |
| | | * \param desc - Channel format | |
| | | * \param size - Size of the memory area pointed to by devPtr | |
| | | * | |
| | | * \return | |
| | | * ::cudaSuccess, | |
| | | * ::cudaErrorInvalidValue, | |
| | | * ::cudaErrorInvalidDevicePointer, | |
| | | * ::cudaErrorInvalidTexture | |
| | | * \notefnerr | |
| | | * | |
| | | * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)" | |
| | | , | |
| | | * ::cudaGetChannelDesc, ::cudaGetTextureReference, | |
| | | * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const vo | |
| | | id*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)" | |
| | | , | |
| | | * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>& | |
| | | , const void*, size_t) "cudaBindTexture (C++ API, inherited channel descrip | |
| | | tor)", | |
| | | * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode | |
| | | >&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_ | |
| | | t) "cudaBindTexture2D (C++ API)", | |
| | | * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, | |
| | | const struct cudaArray*, const struct cudaChannelFormatDesc&) "cudaBindText | |
| | | ureToArray (C++ API)", | |
| | | * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, | |
| | | const struct cudaArray*) "cudaBindTextureToArray (C++ API, inherited channe | |
| | | l descriptor)", | |
| | | * \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cuda | |
| | | UnbindTexture (C++ API)", | |
| | | * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, d | |
| | | im, readMode >&) "cudaGetTextureAlignmentOffset (C++ API)" | |
| | | */ | |
| template<class T, int dim, enum cudaTextureReadMode readMode> | | template<class T, int dim, enum cudaTextureReadMode readMode> | |
| __inline__ __host__ cudaError_t cudaBindTexture( | | __inline__ __host__ cudaError_t cudaBindTexture( | |
| size_t *offset, | | size_t *offset, | |
| const struct texture<T, dim, readMode> &tex, | | const struct texture<T, dim, readMode> &tex, | |
| const void *devPtr, | | const void *devPtr, | |
| const struct cudaChannelFormatDesc &desc, | | const struct cudaChannelFormatDesc &desc, | |
| size_t size = UINT_MAX | | size_t size = UINT_MAX | |
| ) | | ) | |
| { | | { | |
| return cudaBindTexture(offset, &tex, devPtr, &desc, size); | | return cudaBindTexture(offset, &tex, devPtr, &desc, size); | |
| } | | } | |
| | | | |
|
| | | /** | |
| | | * \brief \hl Binds a memory area to a texture | |
| | | * | |
| | | * Binds \p size bytes of the memory area pointed to by \p devPtr to textur | |
| | | e | |
| | | * reference \p tex. The channel descriptor is inherited from the texture | |
| | | * reference type. The \p offset parameter is an optional byte offset as wi | |
| | | th | |
| | | * the low-level | |
| | | * ::cudaBindTexture(size_t*, const struct textureReference*, const void*, | |
| | | const struct cudaChannelFormatDesc*, size_t) | |
| | | * function. Any memory previously bound to \p tex is unbound. | |
| | | * | |
| | | * \param offset - Offset in bytes | |
| | | * \param tex - Texture to bind | |
| | | * \param devPtr - Memory area on device | |
| | | * \param size - Size of the memory area pointed to by devPtr | |
| | | * | |
| | | * \return | |
| | | * ::cudaSuccess, | |
| | | * ::cudaErrorInvalidValue, | |
| | | * ::cudaErrorInvalidDevicePointer, | |
| | | * ::cudaErrorInvalidTexture | |
| | | * \notefnerr | |
| | | * | |
| | | * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API), | |
| | | * ::cudaGetChannelDesc, ::cudaGetTextureReference, | |
| | | * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const vo | |
| | | id*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)" | |
| | | , | |
| | | * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>& | |
| | | , const void*, size_t) "cudaBindTexture (C++ API, inherited channel descrip | |
| | | tor)", | |
| | | * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode | |
| | | >&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_ | |
| | | t) "cudaBindTexture2D (C++ API)", | |
| | | * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, | |
| | | const struct cudaArray*, const struct cudaChannelFormatDesc&) "cudaBindText | |
| | | ureToArray (C++ API)", | |
| | | * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, | |
| | | const struct cudaArray*) "cudaBindTextureToArray (C++ API, inherited channe | |
| | | l descriptor)", | |
| | | * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaU | |
| | | nbindTexture (C++ API)", | |
| | | * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, d | |
| | | im, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)" | |
| | | */ | |
| template<class T, int dim, enum cudaTextureReadMode readMode> | | template<class T, int dim, enum cudaTextureReadMode readMode> | |
| __inline__ __host__ cudaError_t cudaBindTexture( | | __inline__ __host__ cudaError_t cudaBindTexture( | |
| size_t *offset, | | size_t *offset, | |
| const struct texture<T, dim, readMode> &tex, | | const struct texture<T, dim, readMode> &tex, | |
| const void *devPtr, | | const void *devPtr, | |
| size_t size = UINT_MAX | | size_t size = UINT_MAX | |
| ) | | ) | |
| { | | { | |
| return cudaBindTexture(offset, tex, devPtr, tex.channelDesc, size); | | return cudaBindTexture(offset, tex, devPtr, tex.channelDesc, size); | |
| } | | } | |
| | | | |
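A sketch of the inherited-descriptor overload just above, binding a cudaMalloc'd buffer to a file-scope texture (names hypothetical; per the C API doc, the returned offset is 0 for cudaMalloc pointers):

    texture<float, 1, cudaReadModeElementType> texRef;

    cudaError_t bind_linear(const float *devPtr, size_t n)
    {
        size_t offset = 0;
        return cudaBindTexture(&offset, texRef, devPtr, n * sizeof(float));
    }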
|
| | | /** | |
| | | * \brief \hl Binds a 2D memory area to a texture | |
| | | * | |
| | | * Binds the 2D memory area pointed to by \p devPtr to the | |
| | | * texture reference \p tex. The size of the area is constrained by | |
| | | * \p width in texel units, \p height in texel units, and \p pitch in byte | |
| | | * units. \p desc describes how the memory is interpreted when fetching val | |
| | | ues | |
| | | * from the texture. Any memory previously bound to \p tex is unbound. | |
| | | * | |
| | | * Since the hardware enforces an alignment requirement on texture base | |
| | | * addresses, | |
| | | * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode | |
| | | >&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_ | |
| | | t) "cudaBindTexture2D()" | |
| | | * returns in \p *offset a byte offset that | |
| | | * must be applied to texture fetches in order to read from the desired mem | |
| | | ory. | |
| | | * This offset must be divided by the texel size and passed to kernels that | |
| | | * read from the texture so it can be applied to the ::tex2D() function. | |
| | | * If the device memory pointer was returned from ::cudaMalloc(), the offse | |
| | | t is | |
| | | * guaranteed to be 0 and NULL may be passed as the \p offset parameter. | |
| | | * | |
| | | * \param offset - Offset in bytes | |
| | | * \param tex - Texture reference to bind | |
| | | * \param devPtr - 2D memory area on device | |
| | | * \param desc - Channel format | |
| | | * \param width - Width in texel units | |
| | | * \param height - Height in texel units | |
| | | * \param pitch - Pitch in bytes | |
| | | * | |
| | | * \return | |
| | | * ::cudaSuccess, | |
| | | * ::cudaErrorInvalidValue, | |
| | | * ::cudaErrorInvalidDevicePointer, | |
| | | * ::cudaErrorInvalidTexture | |
| | | * \notefnerr | |
| | | * | |
| | | * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API), | |
| | | * ::cudaGetChannelDesc, ::cudaGetTextureReference, | |
| | | * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>& | |
| | | , const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTextur | |
| | | e (C++ API)", | |
| | | * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>& | |
| | | , const void*, size_t) "cudaBindTexture (C++ API, inherited channel descrip | |
| | | tor)", | |
| | | * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const | |
| | | void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBi | |
| | | ndTexture2D (C API)", | |
| | | * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, | |
| | | const struct cudaArray*, const struct cudaChannelFormatDesc&) "cudaBindText | |
| | | ureToArray (C++ API)", | |
| | | * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, | |
| | | const struct cudaArray*) "cudaBindTextureToArray (C++ API, inherited channe | |
| | | l descriptor)", | |
| | | * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaU | |
| | | nbindTexture (C++ API)", | |
| | | * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, d | |
| | | im, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)" | |
| | | */ | |
| | | template<class T, int dim, enum cudaTextureReadMode readMode> | |
| | | __inline__ __host__ cudaError_t cudaBindTexture2D( | |
| | | size_t *offset, | |
| | | const struct texture<T, dim, readMode> &tex, | |
| | | const void *devPtr, | |
| | | const struct cudaChannelFormatDesc &desc, | |
| | | size_t width, | |
| | | size_t height, | |
| | | size_t pitch | |
| | | ) | |
| | | { | |
| | | return cudaBindTexture2D( offset, &tex, devPtr, &desc, width, height, pit | |
| | | ch); | |
| | | } | |
| | | | |
| | | /** | |
| | | * \brief \hl Binds an array to a texture | |
| | | * | |
| | | * Binds the CUDA array \p array to the texture reference \p tex. | |
| | | * \p desc describes how the memory is interpreted when fetching values fro | |
| | | m | |
| | | * the texture. Any CUDA array previously bound to \p tex is unbound. | |
| | | * | |
| | | * \param tex - Texture to bind | |
| | | * \param array - Memory array on device | |
| | | * \param desc - Channel format | |
| | | * | |
| | | * \return | |
| | | * ::cudaSuccess, | |
| | | * ::cudaErrorInvalidValue, | |
| | | * ::cudaErrorInvalidDevicePointer, | |
| | | * ::cudaErrorInvalidTexture | |
| | | * \notefnerr | |
| | | * | |
| | | * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)" | |
| | | , | |
| | | * ::cudaGetChannelDesc, ::cudaGetTextureReference, | |
| | | * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>& | |
| | | , const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTextur | |
| | | e (C++ API)", | |
| | | * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>& | |
| | | , const void*, size_t) "cudaBindTexture (C++ API, inherited channel descrip | |
| | | tor)", | |
| | | * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode | |
| | | >&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_ | |
| | | t) "cudaBindTexture2D (C++ API)", | |
| | | * \ref ::cudaBindTextureToArray(const struct textureReference*, const stru | |
| | | ct cudaArray*, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray | |
| | | (C API)", | |
| | | * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, | |
| | | const struct cudaArray*) "cudaBindTextureToArray (C++ API, inherited channe | |
| | | l descriptor)", | |
| | | * \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cuda | |
| | | UnbindTexture (C++ API)", | |
| | | * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, d | |
| | | im, readMode >&) "cudaGetTextureAlignmentOffset (C++ API)" | |
| | | */ | |
| template<class T, int dim, enum cudaTextureReadMode readMode> | | template<class T, int dim, enum cudaTextureReadMode readMode> | |
| __inline__ __host__ cudaError_t cudaBindTextureToArray( | | __inline__ __host__ cudaError_t cudaBindTextureToArray( | |
| const struct texture<T, dim, readMode> &tex, | | const struct texture<T, dim, readMode> &tex, | |
| const struct cudaArray *array, | | const struct cudaArray *array, | |
| const struct cudaChannelFormatDesc &desc | | const struct cudaChannelFormatDesc &desc | |
| ) | | ) | |
| { | | { | |
| return cudaBindTextureToArray(&tex, array, &desc); | | return cudaBindTextureToArray(&tex, array, &desc); | |
| } | | } | |
| | | | |
|
| | | /** | |
| | | * \brief \hl Binds an array to a texture | |
| | | * | |
| | | * Binds the CUDA array \p array to the texture reference \p tex. | |
| | | * The channel descriptor is inherited from the CUDA array. Any CUDA array | |
| | | * previously bound to \p tex is unbound. | |
| | | * | |
| | | * \param tex - Texture to bind | |
| | | * \param array - Memory array on device | |
| | | * | |
| | | * \return | |
| | | * ::cudaSuccess, | |
| | | * ::cudaErrorInvalidValue, | |
| | | * ::cudaErrorInvalidDevicePointer, | |
| | | * ::cudaErrorInvalidTexture | |
| | | * \notefnerr | |
| | | * | |
| | | * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)" | |
| | | , | |
| | | * ::cudaGetChannelDesc, ::cudaGetTextureReference, | |
| | | * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>& | |
| | | , const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTextur | |
| | | e (C++ API)", | |
| | | * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>& | |
| | | , const void*, size_t) "cudaBindTexture (C++ API, inherited channel descrip | |
| | | tor)", | |
| | | * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode | |
| | | >&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_ | |
| | | t) "cudaBindTexture2D (C++ API)", | |
| | | * \ref ::cudaBindTextureToArray(const struct textureReference*, const stru | |
| | | ct cudaArray*, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray | |
| | | (C API)", | |
| | | * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, | |
| | | const struct cudaArray*, const struct cudaChannelFormatDesc&) "cudaBindText | |
| | | ureToArray (C++ API)", | |
| | | * \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cuda | |
| | | UnbindTexture (C++ API)", | |
| | | * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, d | |
| | | im, readMode >&) "cudaGetTextureAlignmentOffset (C++ API)" | |
| | | */ | |
| template<class T, int dim, enum cudaTextureReadMode readMode> | | template<class T, int dim, enum cudaTextureReadMode readMode> | |
| __inline__ __host__ cudaError_t cudaBindTextureToArray( | | __inline__ __host__ cudaError_t cudaBindTextureToArray( | |
| const struct texture<T, dim, readMode> &tex, | | const struct texture<T, dim, readMode> &tex, | |
| const struct cudaArray *array | | const struct cudaArray *array | |
| ) | | ) | |
| { | | { | |
| struct cudaChannelFormatDesc desc; | | struct cudaChannelFormatDesc desc; | |
| cudaError_t err = cudaGetChannelDesc(&desc, array); | | cudaError_t err = cudaGetChannelDesc(&desc, array); | |
| | | | |
| return err == cudaSuccess ? cudaBindTextureToArray(tex, array, desc) : er
r; | | return err == cudaSuccess ? cudaBindTextureToArray(tex, array, desc) : er
r; | |
| } | | } | |
| | | | |
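A sketch of the inherited-descriptor array bind: allocate the array with an explicit descriptor, then let the two-argument overload above read it back from the array (names hypothetical; cudaMallocArray assumed from the memory section):

    texture<float, 2, cudaReadModeElementType> texArr;

    cudaError_t make_and_bind(cudaArray **arr, size_t w, size_t h)
    {
        cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
        cudaError_t err = cudaMallocArray(arr, &desc, w, h);
        if (err != cudaSuccess) return err;
        return cudaBindTextureToArray(texArr, *arr);  // desc read from *arr
    }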
| /**************************************************************************
***** | | /**************************************************************************
***** | |
| *
* | | *
* | |
| *
* | | *
* | |
| *
* | | *
* | |
| ***************************************************************************
****/ | | ***************************************************************************
****/ | |
| | | | |
|
| | | /** | |
| | | * \brief \hl Unbinds a texture | |
| | | * | |
| | | * Unbinds the texture bound to \p tex. | |
| | | * | |
| | | * \param tex - Texture to unbind | |
| | | * | |
| | | * \return ::cudaSuccess | |
| | | * \notefnerr | |
| | | * | |
| | | * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)" | |
| | | , | |
| | | * ::cudaGetChannelDesc, ::cudaGetTextureReference, | |
| | | * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>& | |
| | | , const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTextur | |
| | | e (C++ API)", | |
| | | * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>& | |
| | | , const void*, size_t) "cudaBindTexture (C++ API, inherited channel descrip | |
| | | tor)", | |
| | | * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode | |
| | | >&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_ | |
| | | t) "cudaBindTexture2D (C++ API)", | |
| | | * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, | |
| | | const struct cudaArray*, const struct cudaChannelFormatDesc&) "cudaBindText | |
| | | ureToArray (C++ API)", | |
| | | * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, | |
| | | const struct cudaArray*) "cudaBindTextureToArray (C++ API, inherited channe | |
| | | l descriptor)", | |
| | | * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindText | |
| | | ure (C API)", | |
| | | * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, d | |
| | | im, readMode >&) "cudaGetTextureAlignmentOffset (C++ API)" | |
| | | */ | |
| template<class T, int dim, enum cudaTextureReadMode readMode> | | template<class T, int dim, enum cudaTextureReadMode readMode> | |
| __inline__ __host__ cudaError_t cudaUnbindTexture( | | __inline__ __host__ cudaError_t cudaUnbindTexture( | |
| const struct texture<T, dim, readMode> &tex | | const struct texture<T, dim, readMode> &tex | |
| ) | | ) | |
| { | | { | |
| return cudaUnbindTexture(&tex); | | return cudaUnbindTexture(&tex); | |
| } | | } | |
| | | | |
| /**************************************************************************
***** | | /**************************************************************************
***** | |
| *
* | | *
* | |
| *
* | | *
* | |
| *
* | | *
* | |
| ***************************************************************************
****/ | | ***************************************************************************
****/ | |
| | | | |
|
| | | /** | |
| | | * \brief \hl Get the alignment offset of a texture | |
| | | * | |
| | | * Returns in \p *offset the offset that was returned when texture referenc | |
| | | e | |
| | | * \p tex was bound. | |
| | | * | |
| | | * \param offset - Offset of texture reference in bytes | |
| | | * \param tex - Texture to get offset of | |
| | | * | |
| | | * \return | |
| | | * ::cudaSuccess, | |
| | | * ::cudaErrorInvalidTexture, | |
| | | * ::cudaErrorInvalidTextureBinding | |
| | | * \notefnerr | |
| | | * | |
| | | * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)" | |
| | | , | |
| | | * ::cudaGetChannelDesc, ::cudaGetTextureReference, | |
| | | * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>& | |
| | | , const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTextur | |
| | | e (C++ API)", | |
| | | * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>& | |
| | | , const void*, size_t) "cudaBindTexture (C++ API, inherited channel descrip | |
| | | tor)", | |
| | | * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode | |
| | | >&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_ | |
| | | t) "cudaBindTexture2D (C++ API)", | |
| | | * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, | |
| | | const struct cudaArray*, const struct cudaChannelFormatDesc&) "cudaBindText | |
| | | ureToArray (C++ API)", | |
| | | * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, | |
| | | const struct cudaArray*) "cudaBindTextureToArray (C++ API, inherited channe | |
| | | l descriptor)", | |
| | | * \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cuda | |
| | | UnbindTexture (C++ API)", | |
| | | * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureRefere | |
| | | nce*) "cudaGetTextureAlignmentOffset (C API)" | |
| | | */ | |
| template<class T, int dim, enum cudaTextureReadMode readMode> | | template<class T, int dim, enum cudaTextureReadMode readMode> | |
| __inline__ __host__ cudaError_t cudaGetTextureAlignmentOffset( | | __inline__ __host__ cudaError_t cudaGetTextureAlignmentOffset( | |
| size_t *offset, | | size_t *offset, | |
| const struct texture<T, dim, readMode> &tex | | const struct texture<T, dim, readMode> &tex | |
| ) | | ) | |
| { | | { | |
| return cudaGetTextureAlignmentOffset(offset, &tex); | | return cudaGetTextureAlignmentOffset(offset, &tex); | |
| } | | } | |
| | | | |
|
| | | /** @} */ /* END CUDART_HIGHLEVEL */ | |
| | | | |
| /**************************************************************************
***** | | /**************************************************************************
***** | |
| *
* | | *
* | |
| *
* | | *
* | |
| *
* | | *
* | |
| ***************************************************************************
****/ | | ***************************************************************************
****/ | |
| | | | |
|
| | | /** | |
| | | * \ingroup CUDART_HIGHLEVEL | |
| | | * \brief \hl Launches a device function | |
| | | * | |
| | | * Launches the function \p entry on the device. \p entry can either be a | |
| | | * function that executes on the device, or it can be a character string, | |
| | | * naming a function that executes on the device. \p entry must be declared | |
| | | as | |
| | | * a \p __global__ function. | |
| | | * \ref ::cudaLaunch(T*) "cudaLaunch()" must be preceded by a call to | |
| | | * ::cudaConfigureCall() since it pops the data that was pushed by | |
| | | * ::cudaConfigureCall() from the execution stack. | |
| | | * | |
| | | * \param entry - Device function pointer or char string naming device func | |
| | | tion | |
| | | * to execute | |
| | | * | |
| | | * \return | |
| | | * ::cudaSuccess, | |
| | | * ::cudaErrorInvalidDeviceFunction, | |
| | | * ::cudaErrorInvalidConfiguration | |
| | | * \notefnerr | |
| | | * | |
| | | * \sa ::cudaConfigureCall, | |
| | | * \ref ::cudaSetupArgument(T,size_t) "cudaSetupArgument (C++ API)", | |
| | | * \ref ::cudaLaunch(const char*) "cudaLaunch (C API)" | |
| | | */ | |
| template<class T> | | template<class T> | |
| __inline__ __host__ cudaError_t cudaLaunch( | | __inline__ __host__ cudaError_t cudaLaunch( | |
|
| T *symbol | | T *entry | |
| ) | | ) | |
| { | | { | |
|
| return cudaLaunch((const char*)symbol); | | return cudaLaunch((const char*)entry); | |
| } | | } | |
| | | | |
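The doc block above also permits launching by name; a sketch of that form for an extern "C" kernel, whose unmangled name is what the underlying C entry point expects (kernel name hypothetical):

    extern "C" __global__ void step(float *state);

    cudaError_t run_step_by_name(float *d_state, dim3 grid, dim3 block)
    {
        cudaError_t err = cudaConfigureCall(grid, block);
        if (err != cudaSuccess) return err;
        cudaSetupArgument(d_state, 0);
        return cudaLaunch("step");   // string literal naming the kernel
    }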
| #endif /* __CUDACC__ */ | | #endif /* __CUDACC__ */ | |
| | | | |
| #endif /* __cplusplus */ | | #endif /* __cplusplus */ | |
| | | | |
| #endif /* !__CUDA_RUNTIME_H__ */ | | #endif /* !__CUDA_RUNTIME_H__ */ | |
| | | | |
End of changes. 16 change blocks. |
| 3 lines changed or deleted | | 444 lines changed or added | |
|
| device_functions.h | | device_functions.h | |
| /* | | /* | |
|
| * Copyright 1993-2008 NVIDIA Corporation. All rights reserved. | | * Copyright 1993-2009 NVIDIA Corporation. All rights reserved. | |
| * | | * | |
| * NOTICE TO USER: | | * NOTICE TO USER: | |
| * | | * | |
| * This source code is subject to NVIDIA ownership rights under U.S. and | | * This source code is subject to NVIDIA ownership rights under U.S. and | |
| * international Copyright laws. Users and possessors of this source code | | * international Copyright laws. Users and possessors of this source code | |
| * are hereby granted a nonexclusive, royalty-free license to use this code | | * are hereby granted a nonexclusive, royalty-free license to use this code | |
| * in individual and commercial software. | | * in individual and commercial software. | |
| * | | * | |
| * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | |
| * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | |
| | | | |
| skipping to change at line 77 | | skipping to change at line 77 | |
| extern __device__ unsigned long long int __umul64hi(unsigned long long int,
unsigned long long int); | | extern __device__ unsigned long long int __umul64hi(unsigned long long int,
unsigned long long int); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __int_as_float(int); | | extern __device__ float __int_as_float(int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __float_as_int(float); | | extern __device__ int __float_as_int(float); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ void __syncthreads(void); | | extern __device__ void __syncthreads(void); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| | | extern __device__ void __prof_trigger(int); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ void __threadfence(void); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ void __threadfence_block(void); | |
| | | /*DEVICE_BUILTIN*/ | |
| extern __device__ void __trap(void); | | extern __device__ void __trap(void); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ void __brkpt(int); | | extern __device__ void __brkpt(int); | |
| | | | |
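A sketch of what the new fences are for: ordering a data write before a flag write, so another block can poll the flag and then safely read the data (variable and kernel names hypothetical):

    __device__ volatile int ready;
    __device__ float result;

    __global__ void producer(float v)
    {
        if (threadIdx.x == 0) {
            result = v;
            __threadfence();   // data visible device-wide before the flag
            ready = 1;
        }
    }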
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __saturatef(float); | | extern __device__ float __saturatef(float); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ unsigned int __sad(int, int, unsigned int); | | extern __device__ unsigned int __sad(int, int, unsigned int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| | | | |
| skipping to change at line 183 | | skipping to change at line 189 | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ unsigned long long int __float2ull_rz(float); | | extern __device__ unsigned long long int __float2ull_rz(float); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __ll2float_rn(long long int); | | extern __device__ float __ll2float_rn(long long int); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __ull2float_rn(unsigned long long
int); | | extern __device__ float __ull2float_rn(unsigned long long
int); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| | | extern __device__ float __fadd_rn(float, float); | |
| | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __fadd_rz(float, float); | | extern __device__ float __fadd_rz(float, float); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| extern __device__ float __fmul_rz(float, float); | | extern __device__ float __fadd_ru(float, float); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| extern __device__ float __fadd_rn(float, float); | | extern __device__ float __fadd_rd(float, float); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __fmul_rn(float, float); | | extern __device__ float __fmul_rn(float, float); | |
|
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __fmul_rz(float, float); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __fmul_ru(float, float); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __fmul_rd(float, float); | |
| | | | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __fmaf_rn(float, float, float); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __fmaf_rz(float, float, float); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __fmaf_ru(float, float, float); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __fmaf_rd(float, float, float); | |
| | | | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __frcp_rn(float); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __frcp_rz(float); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __frcp_ru(float); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __frcp_rd(float); | |
| | | | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __fsqrt_rn(float); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __fsqrt_rz(float); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __fsqrt_ru(float); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __fsqrt_rd(float); | |
| | | | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __fdiv_rn(float, float); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __fdiv_rz(float, float); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __fdiv_ru(float, float); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __fdiv_rd(float, float); | |
| | | | |
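One use the directed-rounding additions enable is interval arithmetic in device code: accumulate a lower bound rounding down and an upper bound rounding up, so the true sum stays bracketed. A sketch:

    __device__ void interval_sum(const float *x, int n, float *lo, float *hi)
    {
        float l = 0.0f, h = 0.0f;
        for (int i = 0; i < n; ++i) {
            l = __fadd_rd(l, x[i]);   // round toward -infinity
            h = __fadd_ru(h, x[i]);   // round toward +infinity
        }
        *lo = l;
        *hi = h;
    }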
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __clz(int); | | extern __device__ int __clz(int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __ffs(int); | | extern __device__ int __ffs(int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __popc(unsigned int); | | extern __device__ int __popc(unsigned int); | |
|
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ unsigned int __brev(unsigned int); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __clzll(long long int); | | extern __device__ int __clzll(long long int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __ffsll(long long int); | | extern __device__ int __ffsll(long long int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __popcll(unsigned long long int); | | extern __device__ int __popcll(unsigned long long int); | |
|
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ unsigned long long int __brevll(unsigned long long int); | |
| | | | |
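__brev() reverses all 32 bits of its argument; to bit-reverse only a log2n-bit index (the classic FFT reordering case), shift the reversal back down. A sketch, assuming 1 <= log2n <= 32:

    __device__ unsigned int rev_index(unsigned int i, unsigned int log2n)
    {
        return __brev(i) >> (32u - log2n);   // keep only the low log2n bits
    }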
| #if !defined(CUDA_NO_SM_13_DOUBLE_INTRINSICS) | | #if !defined(CUDA_NO_SM_13_DOUBLE_INTRINSICS) | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __double2int_rz(double); | | extern __device__ int __double2int_rz(double); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ unsigned int __double2uint_rz(double); | | extern __device__ unsigned int __double2uint_rz(double); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| | | | |
| skipping to change at line 309 | | skipping to change at line 364 | |
| static __inline__ __device__ void brkpt(int c) | | static __inline__ __device__ void brkpt(int c) | |
| { | | { | |
| __brkpt(c); | | __brkpt(c); | |
| } | | } | |
| | | | |
| static __inline__ __device__ void syncthreads(void) | | static __inline__ __device__ void syncthreads(void) | |
| { | | { | |
| __syncthreads(); | | __syncthreads(); | |
| } | | } | |
| | | | |
|
| | | static __inline__ __device__ void prof_trigger(int e) | |
| | | { | |
| | | if (e == 0) __prof_trigger( 0); | |
| | | else if (e == 1) __prof_trigger( 1); | |
| | | else if (e == 2) __prof_trigger( 2); | |
| | | else if (e == 3) __prof_trigger( 3); | |
| | | else if (e == 4) __prof_trigger( 4); | |
| | | else if (e == 5) __prof_trigger( 5); | |
| | | else if (e == 6) __prof_trigger( 6); | |
| | | else if (e == 7) __prof_trigger( 7); | |
| | | else if (e == 8) __prof_trigger( 8); | |
| | | else if (e == 9) __prof_trigger( 9); | |
| | | else if (e == 10) __prof_trigger(10); | |
| | | else if (e == 11) __prof_trigger(11); | |
| | | else if (e == 12) __prof_trigger(12); | |
| | | else if (e == 13) __prof_trigger(13); | |
| | | else if (e == 14) __prof_trigger(14); | |
| | | else if (e == 15) __prof_trigger(15); | |
| | | } | |
| | | | |
| | | static __inline__ __device__ void threadfence(bool global = true) | |
| | | { | |
| | | global ? __threadfence() : __threadfence_block(); | |
| | | } | |
| | | | |
| static __inline__ __device__ int float2int(float a, enum cudaRoundMode mode
= cudaRoundZero) | | static __inline__ __device__ int float2int(float a, enum cudaRoundMode mode
= cudaRoundZero) | |
| { | | { | |
| return mode == cudaRoundNearest ? __float2int_rn(a) : | | return mode == cudaRoundNearest ? __float2int_rn(a) : | |
| mode == cudaRoundPosInf ? __float2int_ru(a) : | | mode == cudaRoundPosInf ? __float2int_ru(a) : | |
| mode == cudaRoundMinInf ? __float2int_rd(a) : | | mode == cudaRoundMinInf ? __float2int_rd(a) : | |
| __float2int_rz(a); | | __float2int_rz(a); | |
| } | | } | |
| | | | |
| static __inline__ __device__ unsigned int float2uint(float a, enum cudaRoun
dMode mode = cudaRoundZero) | | static __inline__ __device__ unsigned int float2uint(float a, enum cudaRoun
dMode mode = cudaRoundZero) | |
| { | | { | |
| | | | |
| skipping to change at line 348 | | skipping to change at line 428 | |
| __uint2float_rn(a); | | __uint2float_rn(a); | |
| } | | } | |
| | | | |
| #elif !defined(__CUDACC__) | | #elif !defined(__CUDACC__) | |
| | | | |
| #include "crt/func_macro.h" | | #include "crt/func_macro.h" | |
| | | | |
| #include "host_defines.h" | | #include "host_defines.h" | |
| #include "math_constants.h" | | #include "math_constants.h" | |
| | | | |
|
| #if !defined(__CUDABE__) | | #if defined(__CUDABE__) | |
| | | | |
| | | __device_func__(float __frcp_rn (float x)) | |
| | | { | |
| | | unsigned int expo; | |
| | | unsigned f, y; | |
| | | unsigned int argi; | |
| | | float t; | |
| | | | |
| | | argi = __float_as_int(x); | |
| | | expo = (argi >> 23); | |
| | | expo = expo & 0xff; | |
| | | f = expo - 1; | |
| | | if (f <= 0xFD) { | |
| | | y = (argi & 0x00ffffff) | 0x00800000; | |
| | | expo = (2 * 127) - expo - 2; | |
| | | t = 1.0f / x; | |
| | | argi = __float_as_int(t); | |
| | | argi = (argi & 0x00ffffff) | 0x00800000; | |
| | | if ((int)expo >= 0) { | |
| | | /* compute remainder1 */ | |
| | | f = __umul24(y, argi); | |
| | | /* remainder1 must be negative. Fix if necessary */ | |
| | | if ((int)f > 0) { | |
| | | t = __int_as_float(__float_as_int(t)-1); | |
| | | f -= y; | |
| | | } | |
| | | /* compute remainder2 */ | |
| | | expo = f + y; | |
| | | /* round result based on which remainder is smaller in magnitude */ | |
| | | f = (unsigned)(-(int)f); | |
| | | if (expo < f) { | |
| | | t = __int_as_float(__float_as_int(t)+1); | |
| | | } | |
| | | return t; | |
| | | } | |
| | | } | |
| | | return 1.0f / x; | |
| | | } | |
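| | | /* All four __frcp_* variants share one scheme: start from the | |
| | | hardware 1.0f/x estimate, multiply the two 24-bit mantissas, and | |
| | | use the sign (and, for _rn, the magnitude) of the resulting | |
| | | integer remainder to nudge the estimate by one ulp in the chosen | |
| | | rounding direction. Non-normal inputs fall through to 1.0f / x. */ | |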
| | | | |
| | | __device_func__(float __frcp_rz (float x)) | |
| | | { | |
| | | unsigned int expo; | |
| | | unsigned f, y; | |
| | | unsigned int argi; | |
| | | float t; | |
| | | | |
| | | argi = __float_as_int(x); | |
| | | expo = (argi >> 23); | |
| | | expo = expo & 0xff; | |
| | | f = expo - 1; | |
| | | if (f <= 0xFD) { | |
| | | y = (argi & 0x00ffffff) | 0x00800000; | |
| | | expo = (2 * 127) - expo - 2; | |
| | | t = 1.0f / x; | |
| | | argi = __float_as_int(t); | |
| | | argi = (argi & 0x00ffffff) | 0x00800000; | |
| | | if ((int)expo >= 0) { | |
| | | f = __umul24(y, argi); | |
| | | if ((int)f > 0) { | |
| | | t = __int_as_float(__float_as_int(t)-1); | |
| | | } | |
| | | return t; | |
| | | } | |
| | | } | |
| | | return 1.0f / x; | |
| | | } | |
| | | | |
| | | __device_func__(float __frcp_rd (float x)) | |
| | | { | |
| | | unsigned int expo; | |
| | | unsigned f, y; | |
| | | unsigned int argi; | |
| | | float t; | |
| | | | |
| | | argi = __float_as_int(x); | |
| | | expo = (argi >> 23); | |
| | | expo = expo & 0xff; | |
| | | f = expo - 1; | |
| | | if (f <= 0xFD) { | |
| | | y = (argi & 0x00ffffff) | 0x00800000; | |
| | | expo = (2 * 127) - expo - 2; | |
| | | t = 1.0f / x; | |
| | | argi = __float_as_int(t); | |
| | | argi = (argi & 0x00ffffff) | 0x00800000; | |
| | | if ((int)expo >= 0) { | |
| | | f = __umul24(y, argi); | |
| | | if (((int)f > 0) && (x > 0.0f)) { | |
| | | t = __int_as_float(__float_as_int(t)-1); | |
| | | } | |
| | | if (((int)f < 0) && (x < 0.0f)) { | |
| | | t = __int_as_float(__float_as_int(t)+1); | |
| | | } | |
| | | return t; | |
| | | } | |
| | | } | |
| | | return 1.0f / x; | |
| | | } | |
| | | | |
| | | __device_func__(float __frcp_ru (float x)) | |
| | | { | |
| | | unsigned int expo; | |
| | | unsigned f, y; | |
| | | unsigned int argi; | |
| | | float t; | |
| | | | |
| | | argi = __float_as_int(x); | |
| | | expo = (argi >> 23); | |
| | | expo = expo & 0xff; | |
| | | f = expo - 1; | |
| | | if (f <= 0xFD) { | |
| | | y = (argi & 0x00ffffff) | 0x00800000; | |
| | | expo = (2 * 127) - expo - 2; | |
| | | t = 1.0f / x; | |
| | | argi = __float_as_int(t); | |
| | | argi = (argi & 0x00ffffff) | 0x00800000; | |
| | | if ((int)expo >= 0) { | |
| | | f = __umul24(y, argi); | |
| | | if (((int)f > 0) && (x < 0.0f)) { | |
| | | t = __int_as_float(__float_as_int(t)-1); | |
| | | } | |
| | | if (((int)f < 0) && (x > 0.0f)) { | |
| | | t = __int_as_float(__float_as_int(t)+1); | |
| | | } | |
| | | return t; | |
| | | } | |
| | | } | |
| | | return 1.0f / x; | |
| | | } | |
| | | | |
| | | __device_func__(float __fsqrt_rn (float radicand)) | |
| | | { | |
| | | unsigned int expo, argi; | |
| | | unsigned int s, f, x; | |
| | | | |
| | | argi = __float_as_int(radicand); | |
| | | expo = argi >> 23; | |
| | | expo = expo & 0xff; | |
| | | f = expo - 1; | |
| | | | |
| | | if ((argi <= 0x80000000) && (f <= 0xFD)) { | |
| | | x = (argi << 8) | 0x80000000; | |
| | | x = x >> (expo & 1); | |
| | | argi = (((__float_as_int(rsqrtf(__int_as_float( | |
| | | __float_as_int(radicand)|1)))&0x00ffffff)|0x00800000)<<7); | |
| | | /* second NR iteration */ | |
| | | s = __umulhi(argi,argi); | |
| | | f = 0x30000000 - __umulhi(x,s); | |
| | | argi = __umulhi(f,argi); | |
| | | /* compute sqrt_rn(x) as x * 1/sqrt_rn(x) */ | |
| | | argi = __umulhi(x,argi); | |
| | | argi = argi >> 3; | |
| | | x = (x << 16) - (argi * argi); | |
| | | /* round to nearest based on remainder; tie case impossible */ | |
| | | f = x - (2 * argi + 1); | |
| | | if ((int)f < 0) f = (unsigned)(-(int)f); | |
| | | if ((int)x < 0) x = (unsigned)(-(int)x); | |
| | | if (f < x) argi ++; | |
| | | argi = argi + (((expo + 125) & ~0x1) << 22); | |
| | | return __int_as_float(argi); | |
| | | } | |
| | | return sqrtf(radicand); | |
| | | } | |
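| | | /* The __fsqrt_* variants all refine the hardware rsqrtf() estimate | |
| | | with one Newton-Raphson step in fixed point, form sqrt(x) as | |
| | | x * rsqrt(x), and fix the last bit from the sign of the remainder | |
| | | (x << 16) - q*q. Zero, denormals, negatives, infinities and NaNs | |
| | | take the sqrtf() fallback. */ | |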
| | | | |
| | | __device_func__(float __fsqrt_rz (float radicand)) | |
| | | { | |
| | | unsigned int expo, argi; | |
| | | unsigned int s, f, x; | |
| | | | |
| | | argi = __float_as_int(radicand); | |
| | | expo = argi >> 23; | |
| | | expo = expo & 0xff; | |
| | | f = expo - 1; | |
| | | | |
| | | if ((argi <= 0x80000000) && (f <= 0xFD)) { | |
| | | x = (argi << 8) | 0x80000000; | |
| | | x = x >> (expo & 1); | |
| | | argi = (((__float_as_int(rsqrtf(__int_as_float( | |
| | | __float_as_int(radicand)|1)))&0x00ffffff)|0x00800000)<<7); | |
| | | /* NR iteration */ | |
| | | s = __umulhi(argi,argi); | |
| | | f = 0x30000000 - __umulhi(x,s); | |
| | | argi = __umulhi(f,argi); | |
| | | /* compute sqrt_rz(x) as x * 1/sqrt_rz(x) */ | |
| | | argi = __umulhi(x,argi); | |
| | | /* compute truncated result */ | |
| | | argi = (argi + 4) >> 3; | |
| | | x = (x << 16) - (argi * argi); | |
| | | if ((int)x < 0) argi--; | |
| | | argi = argi + (((expo + 125) & ~0x1) << 22); | |
| | | return __int_as_float(argi); | |
| | | } | |
| | | return sqrtf(radicand); | |
| | | } | |
| | | | |
| | | __device_func__(float __fsqrt_ru (float radicand)) | |
| | | { | |
| | | unsigned int expo, argi; | |
| | | unsigned int s, f, x; | |
| | | | |
| | | argi = __float_as_int(radicand); | |
| | | expo = argi >> 23; | |
| | | expo = expo & 0xff; | |
| | | f = expo - 1; | |
| | | | |
| | | if ((argi <= 0x80000000) && (f <= 0xFD)) { | |
| | | x = (argi << 8) | 0x80000000; | |
| | | x = x >> (expo & 1); | |
| | | argi = (((__float_as_int(rsqrtf(__int_as_float( | |
| | | __float_as_int(radicand)|1)))&0x00ffffff)|0x00800000)<<7); | |
| | | /* NR iteration */ | |
| | | s = __umulhi(argi,argi); | |
| | | f = 0x30000000 - __umulhi(x,s); | |
| | | argi = __umulhi(f,argi); | |
| | | /* compute sqrt_ru(x) as x * 1/sqrt_ru(x) */ | |
| | | argi = __umulhi(x,argi); | |
| | | argi = (argi + 4) >> 3; | |
| | | x = (x << 16) - (argi * argi); | |
| | | if ((int)x > 0) argi++; | |
| | | argi = argi + (((expo + 125) & ~0x1) << 22); | |
| | | return __int_as_float(argi); | |
| | | } | |
| | | return sqrtf(radicand); | |
| | | } | |
| | | | |
| | | __device_func__(float __fsqrt_rd (float radicand)) | |
| | | { | |
| | | unsigned int expo, argi; | |
| | | unsigned int s, f, x; | |
| | | | |
| | | argi = __float_as_int(radicand); | |
| | | expo = argi >> 23; | |
| | | expo = expo & 0xff; | |
| | | f = expo - 1; | |
| | | | |
| | | if ((argi <= 0x80000000) && (f <= 0xFD)) { | |
| | | x = (argi << 8) | 0x80000000; | |
| | | x = x >> (expo & 1); | |
| | | argi = (((__float_as_int(rsqrtf(__int_as_float( | |
| | | __float_as_int(radicand)|1)))&0x00ffffff)|0x00800000)<<7); | |
| | | /* NR iteration */ | |
| | | s = __umulhi(argi,argi); | |
| | | f = 0x30000000 - __umulhi(x,s); | |
| | | argi = __umulhi(f,argi); | |
| | | /* compute sqrt_rd(x) as x * 1/sqrt_rd(x) */ | |
| | | argi = __umulhi(x,argi); | |
| | | /* compute truncated result */ | |
| | | argi = (argi + 4) >> 3; | |
| | | x = (x << 16) - (argi * argi); | |
| | | if ((int)x < 0) argi--; | |
| | | argi = argi + (((expo + 125) & ~0x1) << 22); | |
| | | return __int_as_float(argi); | |
| | | } | |
| | | return sqrtf(radicand); | |
| | | } | |
| | | | |
| | | __device_func__(float __fdiv_rn (float dividend, float divisor)) | |
| | | { | |
| | | unsigned long long prod; | |
| | | unsigned r, f, x, y, expox, expoy, sign; | |
| | | unsigned expo_res; | |
| | | unsigned resi, cvtxi, cvtyi; | |
| | | float t; | |
| | | | |
| | | cvtxi = __float_as_int(dividend); | |
| | | cvtyi = __float_as_int(divisor); | |
| | | expox = (cvtxi >> 23) & 0xff; | |
| | | expoy = (cvtyi >> 23) & 0xff; | |
| | | sign = ((cvtxi ^ cvtyi) & 0x80000000); | |
| | | | |
| | | if (((expox - 1) <= 0xFD) && ((expoy - 1) <= 0xFD)) { | |
| | | expo_res = expox - expoy + 127 - 1; | |
| | | /* extract mantissas */ | |
| | | y = (cvtyi << 8) | 0x80000000; | |
| | | x = (cvtxi & 0x00ffffff) | 0x00800000; | |
| | | t = __int_as_float((cvtyi & 0x00ffffff) | 0x3f800001); | |
| | | r = ((__float_as_int(1.0f / t) & 0x00ffffff) | 0x00800000) << 7; | |
| | | /* NR iteration */ | |
| | | f = (unsigned)-(int)__umulhi (y, r << 1); | |
| | | r = __umulhi (f, r << 1); | |
| | | /* produce quotient */ | |
| | | prod = ((unsigned long long)x) * (r << 1); | |
| | | /* normalize mantissa */ | |
| | | if (((int)((prod >> 32) << 8)) > 0) { | |
| | | expo_res--; | |
| | | prod = prod + prod; | |
| | | } | |
| | | /* preliminary mantissa */ | |
| | | r = (unsigned)(prod >> 32); | |
| | | y = y >> 8; | |
| | | /* result is a normal */ | |
| | | if (expo_res <= 0xFD) { | |
| | | int rem0, rem1, inc; | |
| | | /* round mantissa to nearest even */ | |
| | | prod = ((unsigned long long)y) * r; | |
| | | x = x << (23 + ((prod >> 32) >> 15)); | |
| | | rem1 = x - (unsigned)(prod & 0xffffffff); | |
| | | rem0 = rem1 - y; | |
| | | inc = abs(rem0) < abs(rem1); | |
| | | /* merge sign, mantissa, exponent for final result */ | |
| | | resi = sign | ((expo_res << 23) + r + inc); | |
| | | return __int_as_float(resi); | |
| | | } else if ((int)expo_res >= 254) { | |
| | | /* overflow: return infinity */ | |
| | | resi = sign | 0x7f800000; | |
| | | return __int_as_float(resi); | |
| | | } else { | |
| | | /* underflow, may still round to normal */ | |
| | | int rem0, rem1, inc; | |
| | | prod = ((unsigned long long)y) * r; | |
| | | x = x << (23 + ((prod >> 32) >> 15)); | |
| | | rem1 = x - (unsigned)(prod & 0xffffffff); | |
| | | rem0 = rem1 - y; | |
| | | inc = abs(rem0) < abs(rem1); | |
| | | resi = ((expo_res << 23) + r + inc); | |
| | | if (resi != 0x00800000) resi = 0; | |
| | | return __int_as_float(sign | resi); | |
| | | } | |
| | | } | |
| | | if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | |
| | | divisor *= 0.25f; | |
| | | dividend *= 0.25f; | |
| | | } | |
| | | return dividend / divisor; | |
| | | } | |
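| | | /* Division follows the same reciprocal scheme: a seeded 1/divisor | |
| | | estimate is refined by one Newton-Raphson iteration, multiplied | |
| | | by the dividend, and rounded from exact integer remainders (two | |
| | | of them, rem0/rem1, in the nearest case). The closing 0.25f | |
| | | rescale leaves the quotient unchanged while pulling a huge | |
| | | divisor back into the range where its reciprocal is a normal. */ | |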
| | | | |
| | | __device_func__(float __fdiv_rz (float dividend, float divisor)) | |
| | | { | |
| | | unsigned long long prod; | |
| | | unsigned r, f, x, y, expox, expoy, sign; | |
| | | unsigned expo_res; | |
| | | unsigned resi, cvtxi, cvtyi; | |
| | | float t; | |
| | | | |
| | | cvtxi = __float_as_int(dividend); | |
| | | cvtyi = __float_as_int(divisor); | |
| | | expox = (cvtxi >> 23) & 0xff; | |
| | | expoy = (cvtyi >> 23) & 0xff; | |
| | | sign = ((cvtxi ^ cvtyi) & 0x80000000); | |
| | | | |
| | | if (((expox - 1) <= 0xFD) && ((expoy - 1) <= 0xFD)) { | |
| | | expo_res = expox - expoy + 127 - 1; | |
| | | /* extract mantissas */ | |
| | | y = (cvtyi << 8) | 0x80000000; | |
| | | x = (cvtxi & 0x00ffffff) | 0x00800000; | |
| | | t = __int_as_float((cvtyi & 0x00ffffff) | 0x3f800001); | |
| | | r = ((__float_as_int(1.0f / t) & 0x00ffffff) | 0x00800000) << 7; | |
| | | /* NR iteration */ | |
| | | f = (unsigned)-(int)__umulhi (y, r << 1); | |
| | | r = __umulhi (f, r << 1); | |
| | | /* produce quotient */ | |
| | | prod = ((unsigned long long)x) * (r << 1); | |
| | | /* normalize mantissa */ | |
| | | if (((int)((prod >> 32) << 8)) > 0) { | |
| | | expo_res--; | |
| | | prod = prod + prod; | |
| | | } | |
| | | /* preliminary mantissa */ | |
| | | prod += 0x0000000080000000ULL; | |
| | | r = (unsigned)(prod >> 32); | |
| | | y = y >> 8; | |
| | | if (expo_res <= 0xFD) { | |
| | | /* result is a normal */ | |
| | | int rem1; | |
| | | prod = ((unsigned long long)y) * r; | |
| | | x = x << (23 + ((prod >> 32) >> 15)); | |
| | | rem1 = x - (unsigned)(prod & 0xffffffff); | |
| | | if (rem1 < 0) r--; | |
| | | resi = (expo_res << 23) + r; | |
| | | if (resi == 0x7f800000) resi = 0x7f7fffff; | |
| | | return __int_as_float(sign | resi); | |
| | | } else if ((int)expo_res >= 254) { | |
| | | /* overflow: return largest normal */ | |
| | | resi = 0x7f7fffff; | |
| | | return __int_as_float(sign | resi); | |
| | | } else { | |
| | | /* underflow: result is smallest normal or zero */ | |
| | | int rem1; | |
| | | prod = ((unsigned long long)y) * r; | |
| | | x = x << (23 + ((prod >> 32) >> 15)); | |
| | | rem1 = x - (unsigned)(prod & 0xffffffff); | |
| | | if (rem1 < 0) r--; | |
| | | resi = ((expo_res << 23) + r); | |
| | | if (resi != 0x00800000) resi = 0; | |
| | | return __int_as_float(sign | resi); | |
| | | } | |
| | | } | |
| | | if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | |
| | | divisor *= 0.25f; | |
| | | dividend *= 0.25f; | |
| | | } | |
| | | return dividend / divisor; | |
| | | } | |
| | | | |
| | | __device_func__(float __fdiv_ru (float dividend, float divisor)) | |
| | | { | |
| | | unsigned long long prod; | |
| | | unsigned r, f, x, y, expox, expoy, sign; | |
| | | unsigned expo_res; | |
| | | unsigned resi, cvtxi, cvtyi; | |
| | | float t; | |
| | | | |
| | | cvtxi = __float_as_int(dividend); | |
| | | cvtyi = __float_as_int(divisor); | |
| | | expox = (cvtxi >> 23) & 0xff; | |
| | | expoy = (cvtyi >> 23) & 0xff; | |
| | | sign = ((cvtxi ^ cvtyi) & 0x80000000); | |
| | | | |
| | | if (((expox - 1) <= 0xFD) && ((expoy - 1) <= 0xFD)) { | |
| | | expo_res = expox - expoy + 127 - 1; | |
| | | /* extract mantissas */ | |
| | | y = (cvtyi << 8) | 0x80000000; | |
| | | x = (cvtxi & 0x00ffffff) | 0x00800000; | |
| | | t = __int_as_float((cvtyi & 0x00ffffff) | 0x3f800001); | |
| | | r = ((__float_as_int(1.0f / t) & 0x00ffffff) | 0x00800000) << 7; | |
| | | /* NR iteration */ | |
| | | f = (unsigned)-(int)__umulhi (y, r << 1); | |
| | | r = __umulhi (f, r << 1); | |
| | | /* produce quotient */ | |
| | | prod = ((unsigned long long)x) * (r << 1); | |
| | | /* normalize mantissa */ | |
| | | if (((int)((prod >> 32) << 8)) > 0) { | |
| | | expo_res--; | |
| | | prod = prod + prod; | |
| | | } | |
| | | /* preliminary mantissa */ | |
| | | prod += 0x0000000080000000ULL; | |
| | | r = (unsigned)(prod >> 32); | |
| | | y = y >> 8; | |
| | | if (expo_res <= 0xFD) { | |
| | | /* result is a normal */ | |
| | | int rem1; | |
| | | prod = ((unsigned long long)y) * r; | |
| | | x = x << (23 + ((prod >> 32) >> 15)); | |
| | | rem1 = x - (unsigned)(prod & 0xffffffff); | |
| | | if ((rem1 < 0) && (sign)) r--; | |
| | | if ((rem1 > 0) && (!sign)) r++; | |
| | | resi = (expo_res << 23) + r; | |
| | | if ((resi == 0x7f800000) && (sign)) resi = 0x7f7fffff; | |
| | | return __int_as_float(sign | resi); | |
| | | } else if ((int)expo_res >= 254) { | |
| | | /* overflow: return largest normal */ | |
| | | resi = sign ? 0x7f7fffff : 0x7f800000; | |
| | | return __int_as_float(sign | resi); | |
| | | } else { | |
| | | /* underflow: result is smallest normal or zero */ | |
| | | int rem1; | |
| | | prod = ((unsigned long long)y) * r; | |
| | | x = x << (23 + ((prod >> 32) >> 15)); | |
| | | rem1 = x - (unsigned)(prod & 0xffffffff); | |
| | | if ((rem1 < 0) && (sign)) r--; | |
| | | if ((rem1 > 0) && (!sign)) r++; | |
| | | resi = ((expo_res << 23) + r); | |
| | | if (resi != 0x00800000) resi = 0; | |
| | | return __int_as_float(sign | resi); | |
| | | } | |
| | | } | |
| | | if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | |
| | | divisor *= 0.25f; | |
| | | dividend *= 0.25f; | |
| | | } | |
| | | return dividend / divisor; | |
| | | } | |
| | | | |
| | | __device_func__(float __fdiv_rd (float dividend, float divisor)) | |
| | | { | |
| | | unsigned long long prod; | |
| | | unsigned r, f, x, y, expox, expoy, sign; | |
| | | unsigned expo_res; | |
| | | unsigned resi, cvtxi, cvtyi; | |
| | | float t; | |
| | | | |
| | | cvtxi = __float_as_int(dividend); | |
| | | cvtyi = __float_as_int(divisor); | |
| | | expox = (cvtxi >> 23) & 0xff; | |
| | | expoy = (cvtyi >> 23) & 0xff; | |
| | | sign = ((cvtxi ^ cvtyi) & 0x80000000); | |
| | | | |
| | | if (((expox - 1) <= 0xFD) && ((expoy - 1) <= 0xFD)) { | |
| | | expo_res = expox - expoy + 127 - 1; | |
| | | /* extract mantissas */ | |
| | | y = (cvtyi << 8) | 0x80000000; | |
| | | x = (cvtxi & 0x00ffffff) | 0x00800000; | |
| | | t = __int_as_float((cvtyi & 0x00ffffff) | 0x3f800001); | |
| | | r = ((__float_as_int(1.0f / t) & 0x00ffffff) | 0x00800000) << 7; | |
| | | /* NR iteration */ | |
| | | f = (unsigned)-(int)__umulhi (y, r << 1); | |
| | | r = __umulhi (f, r << 1); | |
| | | /* produce quotient */ | |
| | | prod = ((unsigned long long)x) * (r << 1); | |
| | | /* normalize mantissa */ | |
| | | if (((int)((prod >> 32) << 8)) > 0) { | |
| | | expo_res--; | |
| | | prod = prod + prod; | |
| | | } | |
| | | /* preliminary mantissa */ | |
| | | prod += 0x0000000080000000ULL; | |
| | | r = (unsigned)(prod >> 32); | |
| | | y = y >> 8; | |
| | | if (expo_res <= 0xFD) { | |
| | | /* result is a normal */ | |
| | | int rem1; | |
| | | prod = ((unsigned long long)y) * r; | |
| | | x = x << (23 + ((prod >> 32) >> 15)); | |
| | | rem1 = x - (unsigned)(prod & 0xffffffff); | |
| | | if ((rem1 < 0) && (!sign)) r--; | |
| | | if ((rem1 > 0) && (sign)) r++; | |
| | | resi = (expo_res << 23) + r; | |
| | | if ((resi == 0x7f800000) && (!sign)) resi = 0x7f7fffff; | |
| | | return __int_as_float(sign | resi); | |
| | | } else if ((int)expo_res >= 254) { | |
| | | /* overflow: return largest normal */ | |
| | | resi = sign ? 0x7f800000 : 0x7f7fffff; | |
| | | return __int_as_float(sign | resi); | |
| | | } else { | |
| | | /* underflow: result is smallest normal or zero */ | |
| | | int rem1; | |
| | | prod = ((unsigned long long)y) * r; | |
| | | x = x << (23 + ((prod >> 32) >> 15)); | |
| | | rem1 = x - (unsigned)(prod & 0xffffffff); | |
| | | if ((rem1 < 0) && (!sign)) r--; | |
| | | if ((rem1 > 0) && (sign)) r++; | |
| | | resi = ((expo_res << 23) + r); | |
| | | if (resi != 0x00800000) resi = 0; | |
| | | return __int_as_float(sign | resi); | |
| | | } | |
| | | } | |
| | | if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | |
| | | divisor *= 0.25f; | |
| | | dividend *= 0.25f; | |
| | | } | |
| | | return dividend / divisor; | |
| | | } | |
| | | | |
| | | __device_func__(float __fadd_ru (float a, float b)) | |
| | | { | |
| | | unsigned int expo_x, expo_y; | |
| | | unsigned int xxi, yyi, temp; | |
| | | | |
| | | xxi = __float_as_int(a); | |
| | | yyi = __float_as_int(b); | |
| | | | |
| | | /* make bigger operand the augend */ | |
| | | expo_y = yyi << 1; | |
| | | if (expo_y > (xxi << 1)) { | |
| | | expo_y = xxi; | |
| | | xxi = yyi; | |
| | | yyi = expo_y; | |
| | | } | |
| | | | |
| | | temp = 0xff; | |
| | | expo_x = temp & (xxi >> 23); | |
| | | expo_x = expo_x - 1; | |
| | | expo_y = temp & (yyi >> 23); | |
| | | expo_y = expo_y - 1; | |
| | | | |
| | | if ((expo_x <= 0xFD) && | |
| | | (expo_y <= 0xFD)) { | |
| | | | |
| | | expo_y = expo_x - expo_y; | |
| | | if (expo_y > 25) { | |
| | | expo_y = 31; | |
| | | } | |
| | | temp = xxi ^ yyi; | |
| | | xxi = xxi & ~0x7f000000; | |
| | | xxi = xxi | 0x00800000; | |
| | | yyi = yyi & ~0xff000000; | |
| | | yyi = yyi | 0x00800000; | |
| | | | |
| | | if ((int)temp < 0) { | |
| | | /* signs differ, effective subtraction */ | |
| | | temp = 32 - expo_y; | |
| | | temp = (expo_y) ? (yyi << temp) : 0; | |
| | | temp = (unsigned int)(-((int)temp)); | |
| | | xxi = xxi - (yyi >> expo_y) - (temp ? 1 : 0); | |
| | | if (xxi & 0x00800000) { | |
| | | if (expo_x <= 0xFD) { | |
| | | xxi = (xxi + (expo_x << 23)); | |
| | | xxi += (temp && !(xxi & 0x80000000)); | |
| | | return __int_as_float(xxi); | |
| | | } | |
| | | } else { | |
| | | if ((temp | (xxi << 1)) == 0) { | |
| | | /* operands cancelled, resulting in a clean zero */ | |
| | | xxi = 0; | |
| | | return __int_as_float(xxi); | |
| | | } | |
| | | /* normalize result */ | |
| | | yyi = xxi & 0x80000000; | |
| | | do { | |
| | | xxi = (xxi << 1) | (temp >> 31); | |
| | | temp <<= 1; | |
| | | expo_x--; | |
| | | } while (!(xxi & 0x00800000)); | |
| | | xxi = xxi | yyi; | |
| | | } | |
| | | } else { | |
| | | /* signs are the same, effective addition */ | |
| | | temp = 32 - expo_y; | |
| | | temp = (expo_y) ? (yyi << temp) : 0; | |
| | | xxi = xxi + (yyi >> expo_y); | |
| | | if (!(xxi & 0x01000000)) { | |
| | | if (expo_x <= 0xFD) { | |
| | | xxi = xxi + (expo_x << 23); | |
| | | xxi += (temp && !(xxi & 0x80000000)); | |
| | | return __int_as_float(xxi); | |
| | | } | |
| | | } else { | |
| | | /* normalize result */ | |
| | | temp = (xxi << 31) | (temp >> 1); | |
| | | xxi = ((xxi & 0x80000000) | (xxi >> 1)) & ~0x40000000; | |
| | | expo_x++; | |
| | | } | |
| | | } | |
| | | if (expo_x <= 0xFD) { | |
| | | xxi += (temp && !(xxi & 0x80000000)); | |
| | | xxi = xxi + (expo_x << 23); | |
| | | return __int_as_float(xxi); | |
| | | } | |
| | | if ((int)expo_x >= 254) { | |
| | | /* overflow: return infinity or largest normal */ | |
| | | temp = xxi & 0x80000000; | |
| | | xxi = (temp ? 0xff7fffff : 0x7F800000); | |
| | | return __int_as_float(xxi); | |
| | | } | |
| | | /* underflow: zero or smallest normal */ | |
| | | yyi = xxi & 0x80000000; | |
| | | xxi = xxi & ~0xff000000; | |
| | | expo_x = (unsigned int)(-((int)expo_x)); | |
| | | xxi = (xxi >> expo_x); | |
| | | if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0; | |
| | | return __int_as_float(yyi | xxi); | |
| | | } else { | |
| | | return a + b; | |
| | | } | |
| | | } | |
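| | | /* Directed-rounding add: the smaller operand is aligned and the | |
| | | bits shifted out are kept in 'temp' as a sticky value. Round-up | |
| | | adds one ulp exactly when sticky bits remain and the result is | |
| | | positive, i.e. (temp && !(xxi & 0x80000000)). */ | |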
| | | | |
| | | __device_func__(float __fadd_rd (float a, float b)) | |
| | | { | |
| | | unsigned int expo_x, expo_y; | |
| | | unsigned int xxi, yyi, temp; | |
| | | | |
| | | xxi = __float_as_int(a); | |
| | | yyi = __float_as_int(b); | |
| | | | |
| | | /* make bigger operand the augend */ | |
| | | expo_y = yyi << 1; | |
| | | if (expo_y > (xxi << 1)) { | |
| | | expo_y = xxi; | |
| | | xxi = yyi; | |
| | | yyi = expo_y; | |
| | | } | |
| | | | |
| | | temp = 0xff; | |
| | | expo_x = temp & (xxi >> 23); | |
| | | expo_x = expo_x - 1; | |
| | | expo_y = temp & (yyi >> 23); | |
| | | expo_y = expo_y - 1; | |
| | | | |
| | | if ((expo_x <= 0xFD) && | |
| | | (expo_y <= 0xFD)) { | |
| | | | |
| | | expo_y = expo_x - expo_y; | |
| | | if (expo_y > 25) { | |
| | | expo_y = 31; | |
| | | } | |
| | | temp = xxi ^ yyi; | |
| | | xxi = xxi & ~0x7f000000; | |
| | | xxi = xxi | 0x00800000; | |
| | | yyi = yyi & ~0xff000000; | |
| | | yyi = yyi | 0x00800000; | |
| | | | |
| | | if ((int)temp < 0) { | |
| | | /* signs differ, effective subtraction */ | |
| | | temp = 32 - expo_y; | |
| | | temp = (expo_y) ? (yyi << temp) : 0; | |
| | | temp = (unsigned int)(-((int)temp)); | |
| | | xxi = xxi - (yyi >> expo_y) - (temp ? 1 : 0); | |
| | | if (xxi & 0x00800000) { | |
| | | if (expo_x <= 0xFD) { | |
| | | xxi = xxi & ~0x00800000; /* lop off integer bit */ | |
| | | xxi = (xxi + (expo_x << 23)) + 0x00800000; | |
| | | xxi += (temp && (xxi & 0x80000000)); | |
| | | return __int_as_float(xxi); | |
| | | } | |
| | | } else { | |
| | | if ((temp | (xxi << 1)) == 0) { | |
| | | /* operands cancelled, resulting in a clean zero */ | |
| | | xxi = 0x80000000; | |
| | | return __int_as_float(xxi); | |
| | | } | |
| | | /* normalize result */ | |
| | | yyi = xxi & 0x80000000; | |
| | | do { | |
| | | xxi = (xxi << 1) | (temp >> 31); | |
| | | temp <<= 1; | |
| | | expo_x--; | |
| | | } while (!(xxi & 0x00800000)); | |
| | | xxi = xxi | yyi; | |
| | | } | |
| | | } else { | |
| | | /* signs are the same, effective addition */ | |
| | | temp = 32 - expo_y; | |
| | | temp = (expo_y) ? (yyi << temp) : 0; | |
| | | xxi = xxi + (yyi >> expo_y); | |
| | | if (!(xxi & 0x01000000)) { | |
| | | if (expo_x <= 0xFD) { | |
| | | expo_y = xxi & 1; | |
| | | xxi = xxi + (expo_x << 23); | |
| | | xxi += (temp && (xxi & 0x80000000)); | |
| | | return __int_as_float(xxi); | |
| | | } | |
| | | } else { | |
| | | /* normalize result */ | |
| | | temp = (xxi << 31) | (temp >> 1); | |
| | | xxi = ((xxi & 0x80000000) | (xxi >> 1)) & ~0x40000000; | |
| | | expo_x++; | |
| | | } | |
| | | } | |
| | | if (expo_x <= 0xFD) { | |
| | | xxi += (temp && (xxi & 0x80000000)); | |
| | | xxi = xxi + (expo_x << 23); | |
| | | return __int_as_float(xxi); | |
| | | } | |
| | | if ((int)expo_x >= 254) { | |
| | | /* overflow: return infinity or largest normal */ | |
| | | temp = xxi & 0x80000000; | |
| | | xxi = (temp ? 0xFF800000 : 0x7f7fffff); | |
| | | return __int_as_float(xxi); | |
| | | } | |
| | | /* underflow: zero or smallest normal */ | |
| | | yyi = xxi & 0x80000000; | |
| | | xxi = xxi & ~0xff000000; | |
| | | expo_x = (unsigned int)(-((int)expo_x)); | |
| | | xxi = (xxi >> expo_x); | |
| | | if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0; | |
| | | return __int_as_float(yyi | xxi); | |
| | | } else { | |
| | | a = a + b; | |
| | | xxi = xxi ^ yyi; | |
| | | if ((a == 0.0f) && ((int)xxi < 0)) a = __int_as_float(0x80000000); | |
| | | return a; | |
| | | } | |
| | | } | |
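| | | /* The round-down variant mirrors this with the opposite sign test | |
| | | and returns -0.0f (0x80000000) on exact cancellation, as IEEE 754 | |
| | | requires when rounding toward minus infinity. */ | |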
| | | | |
| | | __device_func__(float __fmul_ru (float a, float b)) | |
| | | { | |
| | | unsigned long long product; | |
| | | unsigned int expo_x, expo_y; | |
| | | unsigned int xxi, yyi; | |
| | | | |
| | | xxi = __float_as_int(a); | |
| | | yyi = __float_as_int(b); | |
| | | | |
| | | expo_y = 0xFF; | |
| | | expo_x = expo_y & (xxi >> 23); | |
| | | expo_x = expo_x - 1; | |
| | | expo_y = expo_y & (yyi >> 23); | |
| | | expo_y = expo_y - 1; | |
| | | | |
| | | if ((expo_x <= 0xFD) && | |
| | | (expo_y <= 0xFD)) { | |
| | | expo_x = expo_x + expo_y; | |
| | | expo_y = xxi ^ yyi; | |
| | | xxi = xxi & 0x00ffffff; | |
| | | yyi = yyi << 8; | |
| | | xxi = xxi | 0x00800000; | |
| | | yyi = yyi | 0x80000000; | |
| | | /* compute product */ | |
| | | product = ((unsigned long long)xxi) * yyi; | |
| | | expo_x = expo_x - 127 + 2; | |
| | | expo_y = expo_y & 0x80000000; | |
| | | xxi = (unsigned int)(product >> 32); | |
| | | yyi = (unsigned int)(product & 0xffffffff); | |
| | | /* normalize mantissa */ | |
| | | if (xxi < 0x00800000) { | |
| | | xxi = (xxi << 1) | (yyi >> 31); | |
| | | yyi = (yyi << 1); | |
| | | expo_x--; | |
| | | } | |
| | | if (expo_x <= 0xFD) { | |
| | | xxi = xxi | expo_y; /* OR in sign bit */ | |
| | | xxi = xxi + (expo_x << 23); /* add in exponent */ | |
| | | /* round result */ | |
| | | xxi += (yyi && !expo_y); | |
| | | return __int_as_float(xxi); | |
| | | } else if ((int)expo_x >= 254) { | |
| | | /* overflow: return infinity or largest normal */ | |
| | | xxi = (expo_y ? 0xff7fffff : 0x7F800000); | |
| | | return __int_as_float(xxi); | |
| | | } else { | |
| | | /* underflow: zero, or smallest normal */ | |
| | | expo_x = ((unsigned int)-((int)expo_x)); | |
| | | xxi += (yyi && !expo_y); | |
| | | xxi = (xxi >> expo_x); | |
| | | if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0; | |
| | | return __int_as_float(expo_y | xxi); | |
| | | } | |
| | | } else { | |
| | | return a * b; | |
| | | } | |
| | | } | |
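| | | /* Multiply with directed rounding: the exact 48-bit significand | |
| | | product lands in xxi:yyi; a nonzero low word yyi is the sticky | |
| | | indicator, and one ulp is added only when the discarded bits and | |
| | | the result's sign point in the rounding direction. */ | |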
| | | | |
| | | __device_func__(float __fmul_rd (float a, float b)) | |
| | | { | |
| | | unsigned long long product; | |
| | | unsigned int expo_x, expo_y; | |
| | | unsigned int xxi, yyi; | |
| | | | |
| | | xxi = __float_as_int(a); | |
| | | yyi = __float_as_int(b); | |
| | | | |
| | | expo_y = 0xFF; | |
| | | expo_x = expo_y & (xxi >> 23); | |
| | | expo_x = expo_x - 1; | |
| | | expo_y = expo_y & (yyi >> 23); | |
| | | expo_y = expo_y - 1; | |
| | | | |
| | | if ((expo_x <= 0xFD) && | |
| | | (expo_y <= 0xFD)) { | |
| | | expo_x = expo_x + expo_y; | |
| | | expo_y = xxi ^ yyi; | |
| | | xxi = xxi & 0x00ffffff; | |
| | | yyi = yyi << 8; | |
| | | xxi = xxi | 0x00800000; | |
| | | yyi = yyi | 0x80000000; | |
| | | /* compute product */ | |
| | | product = ((unsigned long long)xxi) * yyi; | |
| | | expo_x = expo_x - 127 + 2; | |
| | | expo_y = expo_y & 0x80000000; | |
| | | xxi = (unsigned int)(product >> 32); | |
| | | yyi = (unsigned int)(product & 0xffffffff); | |
| | | /* normalize mantissa */ | |
| | | if (xxi < 0x00800000) { | |
| | | xxi = (xxi << 1) | (yyi >> 31); | |
| | | yyi = (yyi << 1); | |
| | | expo_x--; | |
| | | } | |
| | | if (expo_x <= 0xFD) { | |
| | | xxi = xxi | expo_y; /* OR in sign bit */ | |
| | | xxi = xxi + (expo_x << 23); /* add in exponent */ | |
| | | /* round result */ | |
| | | xxi += (yyi && expo_y); | |
| | | return __int_as_float(xxi); | |
| | | } else if ((int)expo_x >= 254) { | |
| | | /* overflow: return infinity or largest normal */ | |
| | | xxi = expo_y | (expo_y ?0x7F800000 : 0x7f7fffff); | |
| | | return __int_as_float(xxi); | |
| | | } else { | |
| | | /* underflow: zero, or smallest normal */ | |
| | | expo_x = ((unsigned int)-((int)expo_x)); | |
| | | xxi += (yyi && expo_y); | |
| | | xxi = (xxi >> expo_x); | |
| | | if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0; | |
| | | return __int_as_float(expo_y | xxi); | |
| | | } | |
| | | } else { | |
| | | return a * b; | |
| | | } | |
| | | } | |
| | | | |
| | | __device_func__(float __fmaf_rn (float a, float b, float c)) | |
| | | { | |
| | | unsigned long long product; | |
| | | unsigned int xx, yy, zz, ww; | |
| | | unsigned int temp, s, u; | |
| | | unsigned int expo_x, expo_y, expo_z; | |
| | | | |
| | | xx = __float_as_int(a); | |
| | | yy = __float_as_int(b); | |
| | | zz = __float_as_int(c); | |
| | | | |
| | | /* Match 'denormals are zero' behavior of the GPU */ | |
| | | if ((xx << 1) < 0x01000000) xx &= 0x80000000; | |
| | | if ((yy << 1) < 0x01000000) yy &= 0x80000000; | |
| | | if ((zz << 1) < 0x01000000) zz &= 0x80000000; | |
| | | | |
| | | temp = 0xff; | |
| | | expo_x = temp & (xx >> 23); | |
| | | expo_x = expo_x - 1; | |
| | | expo_y = temp & (yy >> 23); | |
| | | expo_y = expo_y - 1; | |
| | | expo_z = temp & (zz >> 23); | |
| | | expo_z = expo_z - 1; | |
| | | | |
| | | if (!((expo_x <= 0xFD) && | |
| | | (expo_y <= 0xFD) && | |
| | | (expo_z <= 0xFD))) { | |
| | | /* fmad (nan, y, z) --> nan | |
| | | fmad (x, nan, z) --> nan | |
| | | fmad (x, y, nan) --> nan | |
| | | */ | |
| | | if ((yy << 1) > 0xff000000) { | |
| | | return rsqrtf(b); | |
| | | } | |
| | | if ((zz << 1) > 0xff000000) { | |
| | | return rsqrtf(c); | |
| | | } | |
| | | if ((xx << 1) > 0xff000000) { | |
| | | return rsqrtf(a); | |
| | | } | |
| | | /* fmad (0, inf, z) --> NaN | |
| | | fmad (inf, 0, z) --> NaN | |
| | | fmad (-inf,+y,+inf) --> NaN | |
| | | fmad (+x,-inf,+inf) --> NaN | |
| | | fmad (+inf,-y,+inf) --> NaN | |
| | | fmad (-x,+inf,+inf) --> NaN | |
| | | fmad (-inf,-y,-inf) --> NaN | |
| | | fmad (-x,-inf,-inf) --> NaN | |
| | | fmad (+inf,+y,-inf) --> NaN | |
| | | fmad (+x,+inf,-inf) --> NaN | |
| | | */ | |
| | | if ((((xx << 1) == 0) && ((yy << 1) == 0xff000000)) || | |
| | | (((yy << 1) == 0) && ((xx << 1) == 0xff000000))) { | |
| | | return rsqrtf(__int_as_float(0xffc00000)); | |
| | | } | |
| | | if ((zz << 1) == 0xff000000) { | |
| | | if (((yy << 1) == 0xff000000) || ((xx << 1) == 0xff000000)) { | |
| | | if ((int)(xx ^ yy ^ zz) < 0) { | |
| | | return rsqrtf(__int_as_float(0xffc00000)); | |
| | | } | |
| | | } | |
| | | } | |
| | | /* fmad (inf, y, z) --> inf | |
| | | fmad (x, inf, z) --> inf | |
| | | fmad (x, y, inf) --> inf | |
| | | */ | |
| | | if ((xx << 1) == 0xff000000) { | |
| | | xx = xx ^ (yy & 0x80000000); | |
| | | return __int_as_float(xx); | |
| | | } | |
| | | if ((yy << 1) == 0xff000000) { | |
| | | yy = yy ^ (xx & 0x80000000); | |
| | | return __int_as_float(yy); | |
| | | } | |
| | | if ((zz << 1) == 0xff000000) { | |
| | | return __int_as_float(zz); | |
| | | } | |
| | | /* fmad (+0, -y, -0) --> -0 | |
| | | fmad (-0, +y, -0) --> -0 | |
| | | fmad (+x, -0, -0) --> -0 | |
| | | fmad (-x, +0, -0) --> -0 | |
| | | */ | |
| | | if (zz == 0x80000000) { | |
| | | if (((xx << 1) == 0) || ((yy << 1) == 0)) { | |
| | | if ((int)(xx ^ yy) < 0) { | |
| | | return __int_as_float(zz); | |
| | | } | |
| | | } | |
| | | } | |
| | | /* fmad (0, y, 0) --> +0 | |
| | | fmad (x, 0, 0) --> +0 | |
| | | */ | |
| | | if (((zz << 1) == 0) && | |
| | | (((xx << 1) == 0) || ((yy << 1) == 0))) { | |
| | | zz &= 0x7fffffff; | |
| | | return __int_as_float(zz); | |
| | | } | |
| | | /* fmad (0, y, z) --> z | |
| | | fmad (x, 0, z) --> z | |
| | | */ | |
| | | if (((xx << 1) == 0) || ((yy << 1) == 0)) { | |
| | | return __int_as_float(zz); | |
| | | } | |
| | | /* normalize x, if denormal */ | |
| | | if (expo_x == (unsigned)-1) { | |
| | | temp = xx & 0x80000000; | |
| | | xx = xx << 8; | |
| | | while (!(xx & 0x80000000)) { | |
| | | xx <<= 1; | |
| | | expo_x--; | |
| | | } | |
| | | expo_x++; | |
| | | xx = (xx >> 8) | temp; | |
| | | } | |
| | | /* normalize y, if denormal */ | |
| | | if (expo_y == (unsigned)-1) { | |
| | | temp = yy & 0x80000000; | |
| | | yy = yy << 8; | |
| | | while (!(yy & 0x80000000)) { | |
| | | yy <<= 1; | |
| | | expo_y--; | |
| | | } | |
| | | expo_y++; | |
| | | yy = (yy >> 8) | temp; | |
| | | } | |
| | | /* normalize z, if denormal */ | |
| | | if ((expo_z == (unsigned)-1) && ((zz << 1) != 0)) { | |
| | | temp = zz & 0x80000000; | |
| | | zz = zz << 8; | |
| | | while (!(zz & 0x80000000)) { | |
| | | zz <<= 1; | |
| | | expo_z--; | |
| | | } | |
| | | expo_z++; | |
| | | zz = (zz >> 8) | temp; | |
| | | } | |
| | | } | |
| | | | |
| | | expo_x = expo_x + expo_y; | |
| | | expo_y = xx ^ yy; | |
| | | xx = xx & 0x00ffffff; | |
| | | yy = yy << 8; | |
| | | xx = xx | 0x00800000; | |
| | | yy = yy | 0x80000000; | |
| | | | |
| | | product = ((unsigned long long)xx) * yy; | |
| | | xx = (unsigned)(product >> 32); | |
| | | yy = (unsigned)(product & 0xffffffff); | |
| | | | |
| | | expo_x = expo_x - 127 + 2; | |
| | | expo_y = expo_y & 0x80000000; | |
| | | /* normalize mantissa */ | |
| | | if (xx < 0x00800000) { | |
| | | xx = (xx << 1) | (yy >> 31); | |
| | | yy = (yy << 1); | |
| | | expo_x--; | |
| | | } | |
| | | temp = 0; | |
| | | | |
| | | if ((zz << 1) != 0) { /* z is not zero */ | |
| | | s = zz & 0x80000000; | |
| | | zz &= 0x00ffffff; | |
| | | zz |= 0x00800000; | |
| | | ww = 0; | |
| | | /* compare and swap. put augend into xx:yy */ | |
| | | if ((int)expo_z > (int)expo_x) { | |
| | | temp = expo_z; | |
| | | expo_z = expo_x; | |
| | | expo_x = temp; | |
| | | temp = zz; | |
| | | zz = xx; | |
| | | xx = temp; | |
| | | temp = ww; | |
| | | ww = yy; | |
| | | yy = temp; | |
| | | temp = expo_y; | |
| | | expo_y = s; | |
| | | s = temp; | |
| | | } | |
| | | /* augend_sign = expo_y, augend_mant = xx:yy, augend_expo = expo_x */ | |
| | | /* addend_sign = s, addend_mant = zz:ww, addend_expo = expo_z */ | |
| | | expo_z = expo_x - expo_z; | |
| | | u = expo_y ^ s; | |
| | | if (expo_z <= 49) { | |
| | | /* denormalize addend */ | |
| | | temp = 0; | |
| | | while (expo_z >= 32) { | |
| | | temp = ww | (temp != 0); | |
| | | ww = zz; | |
| | | zz = 0; | |
| | | expo_z -= 32; | |
| | | } | |
| | | if (expo_z) { | |
| | | temp = ((temp >> expo_z) | (ww << (32 - expo_z)) | | |
| | | ((temp << (32 - expo_z)) != 0)); | |
| | | ww = (ww >> expo_z) | (zz << (32 - expo_z)); | |
| | | zz = (zz >> expo_z); | |
| | | } | |
| | | | |
| | | } else { | |
| | | temp = 1; | |
| | | ww = 0; | |
| | | zz = 0; | |
| | | } | |
| | | if ((int)u < 0) { | |
| | | /* signs differ, effective subtraction */ | |
| | | temp = (unsigned)(-(int)temp); | |
| | | s = (temp != 0); | |
| | | u = yy - s; | |
| | | s = u > yy; | |
| | | yy = u - ww; | |
| | | s += yy > u; | |
| | | xx = (xx - zz) - s; | |
| | | if (!(xx | yy | temp)) { | |
| | | /* complete cancellation, return 0 */ | |
| | | return __int_as_float(xx); | |
| | | } | |
| | | if ((int)xx < 0) { | |
| | | /* oops, augend had smaller mantissa. Negate mantissa and flip | |
| | | sign of result. */ | |
| | | temp = ~temp; | |
| | | yy = ~yy; | |
| | | xx = ~xx; | |
| | | if (++temp == 0) { | |
| | | if (++yy == 0) { | |
| | | ++xx; | |
| | | } | |
| | | } | |
| | | expo_y ^= 0x80000000; | |
| | | } | |
| | | /* normalize mantissa, if necessary */ | |
| | | while (!(xx & 0x00800000)) { | |
| | | xx = (xx << 1) | (yy >> 31); | |
| | | yy = (yy << 1); | |
| | | expo_x--; | |
| | | } | |
| | | } else { | |
| | | /* signs are the same, effective addition */ | |
| | | yy = yy + ww; | |
| | | s = yy < ww; | |
| | | xx = xx + zz + s; | |
| | | if (xx & 0x01000000) { | |
| | | temp = temp | (yy << 31); | |
| | | yy = (yy >> 1) | (xx << 31); | |
| | | xx = ((xx & 0x80000000) | (xx >> 1)) & ~0x40000000; | |
| | | expo_x++; | |
| | | } | |
| | | } | |
| | | } | |
| | | temp = yy | (temp != 0); | |
| | | if (expo_x <= 0xFD) { | |
| | | /* normal */ | |
| | | xx |= expo_y; /* or in sign bit */ | |
| | | s = xx & 1; /* mantissa lsb */ | |
| | | xx += (temp == 0x80000000) ? s : (temp >> 31); | |
| | | xx = xx + (expo_x << 23); /* add in exponent */ | |
| | | return __int_as_float(xx); | |
| | | } else if ((int)expo_x >= 126) { | |
| | | /* overflow */ | |
| | | xx = expo_y | 0x7f800000; | |
| | | return __int_as_float(xx); | |
| | | } | |
| | | /* subnormal */ | |
| | | expo_x = (unsigned int)(-(int)expo_x); | |
| | | /* Match 'flush to zero' response of the GPU */ | |
| | | xx += (temp >= 0x80000000); | |
| | | if (xx >= 0x01000000) { | |
| | | xx = xx >> 1; | |
| | | expo_x--; | |
| | | } | |
| | | if (expo_x > 0) xx = 0; | |
| | | xx = expo_y | xx; | |
| | | return __int_as_float(xx); | |
| | | } | |
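| | | /* Fused multiply-add, round to nearest: the full 48-bit product is | |
| | | kept in xx:yy, the addend is aligned against it with shifted-out | |
| | | bits collapsed into the sticky word 'temp', and the final | |
| | | increment (temp == 0x80000000) ? s : (temp >> 31) implements | |
| | | round-to-nearest-even, s being the mantissa lsb used on ties. */ | |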
| | | | |
| | | __device_func__(float __fmaf_rz (float a, float b, float c)) | |
| | | { | |
| | | unsigned long long product; | |
| | | unsigned int xx, yy, zz, ww; | |
| | | unsigned int temp, s, u; | |
| | | unsigned int expo_x, expo_y, expo_z; | |
| | | | |
| | | xx = __float_as_int(a); | |
| | | yy = __float_as_int(b); | |
| | | zz = __float_as_int(c); | |
| | | | |
| | | /* Match 'denormals are zero' behavior of the GPU */ | |
| | | if ((xx << 1) < 0x01000000) xx &= 0x80000000; | |
| | | if ((yy << 1) < 0x01000000) yy &= 0x80000000; | |
| | | if ((zz << 1) < 0x01000000) zz &= 0x80000000; | |
| | | | |
| | | temp = 0xff; | |
| | | expo_x = temp & (xx >> 23); | |
| | | expo_x = expo_x - 1; | |
| | | expo_y = temp & (yy >> 23); | |
| | | expo_y = expo_y - 1; | |
| | | expo_z = temp & (zz >> 23); | |
| | | expo_z = expo_z - 1; | |
| | | | |
| | | if (!((expo_x <= 0xFD) && | |
| | | (expo_y <= 0xFD) && | |
| | | (expo_z <= 0xFD))) { | |
| | | /* fmad (nan, y, z) --> nan | |
| | | fmad (x, nan, z) --> nan | |
| | | fmad (x, y, nan) --> nan | |
| | | */ | |
| | | if ((yy << 1) > 0xff000000) { | |
| | | return rsqrtf(b); | |
| | | } | |
| | | if ((zz << 1) > 0xff000000) { | |
| | | return rsqrtf(c); | |
| | | } | |
| | | if ((xx << 1) > 0xff000000) { | |
| | | return rsqrtf(a); | |
| | | } | |
| | | /* fmad (0, inf, z) --> NaN | |
| | | fmad (inf, 0, z) --> NaN | |
| | | fmad (-inf,+y,+inf) --> NaN | |
| | | fmad (+x,-inf,+inf) --> NaN | |
| | | fmad (+inf,-y,+inf) --> NaN | |
| | | fmad (-x,+inf,+inf) --> NaN | |
| | | fmad (-inf,-y,-inf) --> NaN | |
| | | fmad (-x,-inf,-inf) --> NaN | |
| | | fmad (+inf,+y,-inf) --> NaN | |
| | | fmad (+x,+inf,-inf) --> NaN | |
| | | */ | |
| | | if ((((xx << 1) == 0) && ((yy << 1) == 0xff000000)) || | |
| | | (((yy << 1) == 0) && ((xx << 1) == 0xff000000))) { | |
| | | return rsqrtf(__int_as_float(0xffc00000)); | |
| | | } | |
| | | if ((zz << 1) == 0xff000000) { | |
| | | if (((yy << 1) == 0xff000000) || ((xx << 1) == 0xff000000)) { | |
| | | if ((int)(xx ^ yy ^ zz) < 0) { | |
| | | return rsqrtf(__int_as_float(0xffc00000)); | |
| | | } | |
| | | } | |
| | | } | |
| | | /* fmad (inf, y, z) --> inf | |
| | | fmad (x, inf, z) --> inf | |
| | | fmad (x, y, inf) --> inf | |
| | | */ | |
| | | if ((xx << 1) == 0xff000000) { | |
| | | xx = xx ^ (yy & 0x80000000); | |
| | | return __int_as_float(xx); | |
| | | } | |
| | | if ((yy << 1) == 0xff000000) { | |
| | | yy = yy ^ (xx & 0x80000000); | |
| | | return __int_as_float(yy); | |
| | | } | |
| | | if ((zz << 1) == 0xff000000) { | |
| | | return __int_as_float(zz); | |
| | | } | |
| | | /* fmad (+0, -y, -0) --> -0 | |
| | | fmad (-0, +y, -0) --> -0 | |
| | | fmad (+x, -0, -0) --> -0 | |
| | | fmad (-x, +0, -0) --> -0 | |
| | | */ | |
| | | if (zz == 0x80000000) { | |
| | | if (((xx << 1) == 0) || ((yy << 1) == 0)) { | |
| | | if ((int)(xx ^ yy) < 0) { | |
| | | return __int_as_float(zz); | |
| | | } | |
| | | } | |
| | | } | |
| | | /* fmad (0, y, 0) --> +0 | |
| | | fmad (x, 0, 0) --> +0 | |
| | | */ | |
| | | if (((zz << 1) == 0) && | |
| | | (((xx << 1) == 0) || ((yy << 1) == 0))) { | |
| | | zz &= 0x7fffffff; | |
| | | return __int_as_float(zz); | |
| | | } | |
| | | /* fmad (0, y, z) --> z | |
| | | fmad (x, 0, z) --> z | |
| | | */ | |
| | | if (((xx << 1) == 0) || ((yy << 1) == 0)) { | |
| | | return __int_as_float(zz); | |
| | | } | |
| | | /* normalize x, if denormal */ | |
| | | if (expo_x == (unsigned)-1) { | |
| | | temp = xx & 0x80000000; | |
| | | xx = xx << 8; | |
| | | while (!(xx & 0x80000000)) { | |
| | | xx <<= 1; | |
| | | expo_x--; | |
| | | } | |
| | | expo_x++; | |
| | | xx = (xx >> 8) | temp; | |
| | | } | |
| | | /* normalize y, if denormal */ | |
| | | if (expo_y == (unsigned)-1) { | |
| | | temp = yy & 0x80000000; | |
| | | yy = yy << 8; | |
| | | while (!(yy & 0x80000000)) { | |
| | | yy <<= 1; | |
| | | expo_y--; | |
| | | } | |
| | | expo_y++; | |
| | | yy = (yy >> 8) | temp; | |
| | | } | |
| | | /* normalize z, if denormal */ | |
| | | if ((expo_z == (unsigned)-1) && ((zz << 1) != 0)) { | |
| | | temp = zz & 0x80000000; | |
| | | zz = zz << 8; | |
| | | while (!(zz & 0x80000000)) { | |
| | | zz <<= 1; | |
| | | expo_z--; | |
| | | } | |
| | | expo_z++; | |
| | | zz = (zz >> 8) | temp; | |
| | | } | |
| | | } | |
| | | | |
| | | expo_x = expo_x + expo_y; | |
| | | expo_y = xx ^ yy; | |
| | | xx = xx & 0x00ffffff; | |
| | | yy = yy << 8; | |
| | | xx = xx | 0x00800000; | |
| | | yy = yy | 0x80000000; | |
| | | | |
| | | product = ((unsigned long long)xx) * yy; | |
| | | xx = (unsigned)(product >> 32); | |
| | | yy = (unsigned)(product & 0xffffffff); | |
| | | | |
| | | expo_x = expo_x - 127 + 2; | |
| | | expo_y = expo_y & 0x80000000; | |
| | | /* normalize mantissa */ | |
| | | if (xx < 0x00800000) { | |
| | | xx = (xx << 1) | (yy >> 31); | |
| | | yy = (yy << 1); | |
| | | expo_x--; | |
| | | } | |
| | | temp = 0; | |
| | | | |
| | | if ((zz << 1) != 0) { /* z is not zero */ | |
| | | s = zz & 0x80000000; | |
| | | zz &= 0x00ffffff; | |
| | | zz |= 0x00800000; | |
| | | ww = 0; | |
| | | /* compare and swap. put augend into xx:yy */ | |
| | | if ((int)expo_z > (int)expo_x) { | |
| | | temp = expo_z; | |
| | | expo_z = expo_x; | |
| | | expo_x = temp; | |
| | | temp = zz; | |
| | | zz = xx; | |
| | | xx = temp; | |
| | | temp = ww; | |
| | | ww = yy; | |
| | | yy = temp; | |
| | | temp = expo_y; | |
| | | expo_y = s; | |
| | | s = temp; | |
| | | } | |
| | | /* augend_sign = expo_y, augend_mant = xx:yy, augend_expo = expo_x */ | |
| | | /* addend_sign = s, addend_mant = zz:ww, addend_expo = expo_z */ | |
| | | expo_z = expo_x - expo_z; | |
| | | u = expo_y ^ s; | |
| | | if (expo_z <= 49) { | |
| | | /* denormalize addend */ | |
| | | temp = 0; | |
| | | while (expo_z >= 32) { | |
| | | temp = ww | (temp != 0); | |
| | | ww = zz; | |
| | | zz = 0; | |
| | | expo_z -= 32; | |
| | | } | |
| | | if (expo_z) { | |
| | | temp = ((temp >> expo_z) | (ww << (32 - expo_z)) | | |
| | | ((temp << (32 - expo_z)) != 0)); | |
| | | ww = (ww >> expo_z) | (zz << (32 - expo_z)); | |
| | | zz = (zz >> expo_z); | |
| | | } | |
| | | | |
| | | } else { | |
| | | temp = 1; | |
| | | ww = 0; | |
| | | zz = 0; | |
| | | } | |
| | | if ((int)u < 0) { | |
| | | /* signs differ, effective subtraction */ | |
| | | temp = (unsigned)(-(int)temp); | |
| | | s = (temp != 0); | |
| | | u = yy - s; | |
| | | s = u > yy; | |
| | | yy = u - ww; | |
| | | s += yy > u; | |
| | | xx = (xx - zz) - s; | |
| | | if (!(xx | yy | temp)) { | |
| | | /* complete cancellation, return 0 */ | |
| | | return __int_as_float(xx); | |
| | | } | |
| | | if ((int)xx < 0) { | |
| | | /* oops, augend had smaller mantissa. Negate mantissa and flip | |
| | | sign of result. */ | |
| | | temp = ~temp; | |
| | | yy = ~yy; | |
| | | xx = ~xx; | |
| | | if (++temp == 0) { | |
| | | if (++yy == 0) { | |
| | | ++xx; | |
| | | } | |
| | | } | |
| | | expo_y ^= 0x80000000; | |
| | | } | |
| | | /* normalize mantissa, if necessary */ | |
| | | while (!(xx & 0x00800000)) { | |
| | | xx = (xx << 1) | (yy >> 31); | |
| | | yy = (yy << 1); | |
| | | expo_x--; | |
| | | } | |
| | | } else { | |
| | | /* signs are the same, effective addition */ | |
| | | yy = yy + ww; | |
| | | s = yy < ww; | |
| | | xx = xx + zz + s; | |
| | | if (xx & 0x01000000) { | |
| | | temp = temp | (yy << 31); | |
| | | yy = (yy >> 1) | (xx << 31); | |
| | | xx = ((xx & 0x80000000) | (xx >> 1)) & ~0x40000000; | |
| | | expo_x++; | |
| | | } | |
| | | } | |
| | | } | |
| | | temp = yy | (temp != 0); | |
| | | if (expo_x <= 0xFD) { | |
| | | /* normal */ | |
| | | xx |= expo_y; /* or in sign bit */ | |
| | | xx = xx + (expo_x << 23); /* add in exponent */ | |
| | | return __int_as_float(xx); | |
| | | } else if ((int)expo_x >= 126) { | |
| | | /* overflow */ | |
| | | xx = expo_y | 0x7f7fffff; | |
| | | return __int_as_float(xx); | |
| | | } | |
| | | /* subnormal */ | |
| | | return __int_as_float(expo_y); | |
| | | } | |
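| | | /* The round-toward-zero variant simply drops the sticky bits (no | |
| | | increment), clamps overflow to the largest normal, and flushes a | |
| | | subnormal result to a signed zero. */ | |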
| | | | |
| | | __device_func__(float __fmaf_ru (float a, float b, float c)) | |
| | | { | |
| | | unsigned long long product; | |
| | | unsigned int xx, yy, zz, ww; | |
| | | unsigned int temp, s, u; | |
| | | unsigned int expo_x, expo_y, expo_z; | |
| | | | |
| | | xx = __float_as_int(a); | |
| | | yy = __float_as_int(b); | |
| | | zz = __float_as_int(c); | |
| | | | |
| | | /* Match 'denormals are zero' behavior of the GPU */ | |
| | | if ((xx << 1) < 0x01000000) xx &= 0x80000000; | |
| | | if ((yy << 1) < 0x01000000) yy &= 0x80000000; | |
| | | if ((zz << 1) < 0x01000000) zz &= 0x80000000; | |
| | | | |
| | | temp = 0xff; | |
| | | expo_x = temp & (xx >> 23); | |
| | | expo_x = expo_x - 1; | |
| | | expo_y = temp & (yy >> 23); | |
| | | expo_y = expo_y - 1; | |
| | | expo_z = temp & (zz >> 23); | |
| | | expo_z = expo_z - 1; | |
| | | | |
| | | if (!((expo_x <= 0xFD) && | |
| | | (expo_y <= 0xFD) && | |
| | | (expo_z <= 0xFD))) { | |
| | | /* fmad (nan, y, z) --> nan | |
| | | fmad (x, nan, z) --> nan | |
| | | fmad (x, y, nan) --> nan | |
| | | */ | |
| | | if ((yy << 1) > 0xff000000) { | |
| | | return rsqrtf(b); | |
| | | } | |
| | | if ((zz << 1) > 0xff000000) { | |
| | | return rsqrtf(c); | |
| | | } | |
| | | if ((xx << 1) > 0xff000000) { | |
| | | return rsqrtf(a); | |
| | | } | |
| | | /* fmad (0, inf, z) --> NaN | |
| | | fmad (inf, 0, z) --> NaN | |
| | | fmad (-inf,+y,+inf) --> NaN | |
| | | fmad (+x,-inf,+inf) --> NaN | |
| | | fmad (+inf,-y,+inf) --> NaN | |
| | | fmad (-x,+inf,+inf) --> NaN | |
| | | fmad (-inf,-y,-inf) --> NaN | |
| | | fmad (-x,-inf,-inf) --> NaN | |
| | | fmad (+inf,+y,-inf) --> NaN | |
| | | fmad (+x,+inf,-inf) --> NaN | |
| | | */ | |
| | | if ((((xx << 1) == 0) && ((yy << 1) == 0xff000000)) || | |
| | | (((yy << 1) == 0) && ((xx << 1) == 0xff000000))) { | |
| | | return rsqrtf(__int_as_float(0xffc00000)); | |
| | | } | |
| | | if ((zz << 1) == 0xff000000) { | |
| | | if (((yy << 1) == 0xff000000) || ((xx << 1) == 0xff000000)) { | |
| | | if ((int)(xx ^ yy ^ zz) < 0) { | |
| | | return rsqrtf(__int_as_float(0xffc00000)); | |
| | | } | |
| | | } | |
| | | } | |
| | | /* fmad (inf, y, z) --> inf | |
| | | fmad (x, inf, z) --> inf | |
| | | fmad (x, y, inf) --> inf | |
| | | */ | |
| | | if ((xx << 1) == 0xff000000) { | |
| | | xx = xx ^ (yy & 0x80000000); | |
| | | return __int_as_float(xx); | |
| | | } | |
| | | if ((yy << 1) == 0xff000000) { | |
| | | yy = yy ^ (xx & 0x80000000); | |
| | | return __int_as_float(yy); | |
| | | } | |
| | | if ((zz << 1) == 0xff000000) { | |
| | | return __int_as_float(zz); | |
| | | } | |
| | | /* fmad (+0, -y, -0) --> -0 | |
| | | fmad (-0, +y, -0) --> -0 | |
| | | fmad (+x, -0, -0) --> -0 | |
| | | fmad (-x, +0, -0) --> -0 | |
| | | */ | |
| | | if (zz == 0x80000000) { | |
| | | if (((xx << 1) == 0) || ((yy << 1) == 0)) { | |
| | | if ((int)(xx ^ yy) < 0) { | |
| | | return __int_as_float(zz); | |
| | | } | |
| | | } | |
| | | } | |
| | | /* fmad (0, y, 0) --> +0 | |
| | | fmad (x, 0, 0) --> +0 | |
| | | */ | |
| | | if (((zz << 1) == 0) && | |
| | | (((xx << 1) == 0) || ((yy << 1) == 0))) { | |
| | | zz &= 0x7fffffff; | |
| | | return __int_as_float(zz); | |
| | | } | |
| | | /* fmad (0, y, z) --> z | |
| | | fmad (x, 0, z) --> z | |
| | | */ | |
| | | if (((xx << 1) == 0) || ((yy << 1) == 0)) { | |
| | | return __int_as_float(zz); | |
| | | } | |
| | | /* normalize x, if denormal */ | |
| | | if (expo_x == (unsigned)-1) { | |
| | | temp = xx & 0x80000000; | |
| | | xx = xx << 8; | |
| | | while (!(xx & 0x80000000)) { | |
| | | xx <<= 1; | |
| | | expo_x--; | |
| | | } | |
| | | expo_x++; | |
| | | xx = (xx >> 8) | temp; | |
| | | } | |
| | | /* normalize y, if denormal */ | |
| | | if (expo_y == (unsigned)-1) { | |
| | | temp = yy & 0x80000000; | |
| | | yy = yy << 8; | |
| | | while (!(yy & 0x80000000)) { | |
| | | yy <<= 1; | |
| | | expo_y--; | |
| | | } | |
| | | expo_y++; | |
| | | yy = (yy >> 8) | temp; | |
| | | } | |
| | | /* normalize z, if denormal */ | |
| | | if ((expo_z == (unsigned)-1) && ((zz << 1) != 0)) { | |
| | | temp = zz & 0x80000000; | |
| | | zz = zz << 8; | |
| | | while (!(zz & 0x80000000)) { | |
| | | zz <<= 1; | |
| | | expo_z--; | |
| | | } | |
| | | expo_z++; | |
| | | zz = (zz >> 8) | temp; | |
| | | } | |
| | | } | |
| | | | |
| | | expo_x = expo_x + expo_y; | |
| | | expo_y = xx ^ yy; | |
| | | xx = xx & 0x00ffffff; | |
| | | yy = yy << 8; | |
| | | xx = xx | 0x00800000; | |
| | | yy = yy | 0x80000000; | |
| | | | |
| | | product = ((unsigned long long)xx) * yy; | |
| | | xx = (unsigned)(product >> 32); | |
| | | yy = (unsigned)(product & 0xffffffff); | |
| | | | |
| | | expo_x = expo_x - 127 + 2; | |
| | | expo_y = expo_y & 0x80000000; | |
| | | /* normalize mantissa */ | |
| | | if (xx < 0x00800000) { | |
| | | xx = (xx << 1) | (yy >> 31); | |
| | | yy = (yy << 1); | |
| | | expo_x--; | |
| | | } | |
| | | temp = 0; | |
| | | | |
| | | if ((zz << 1) != 0) { /* z is not zero */ | |
| | | s = zz & 0x80000000; | |
| | | zz &= 0x00ffffff; | |
| | | zz |= 0x00800000; | |
| | | ww = 0; | |
| | | /* compare and swap. put augend into xx:yy */ | |
| | | if ((int)expo_z > (int)expo_x) { | |
| | | temp = expo_z; | |
| | | expo_z = expo_x; | |
| | | expo_x = temp; | |
| | | temp = zz; | |
| | | zz = xx; | |
| | | xx = temp; | |
| | | temp = ww; | |
| | | ww = yy; | |
| | | yy = temp; | |
| | | temp = expo_y; | |
| | | expo_y = s; | |
| | | s = temp; | |
| | | } | |
| | | /* augend_sign = expo_y, augend_mant = xx:yy, augend_expo = expo_x */ | |
| | | /* addend_sign = s, addend_mant = zz:ww, addend_expo = expo_z */ | |
| | | expo_z = expo_x - expo_z; | |
| | | u = expo_y ^ s; | |
| | | if (expo_z <= 49) { | |
| | | /* denormalize addend */ | |
| | | temp = 0; | |
| | | while (expo_z >= 32) { | |
| | | temp = ww | (temp != 0); | |
| | | ww = zz; | |
| | | zz = 0; | |
| | | expo_z -= 32; | |
| | | } | |
| | | if (expo_z) { | |
| | | temp = ((temp >> expo_z) | (ww << (32 - expo_z)) | | |
| | | ((temp << (32 - expo_z)) != 0)); | |
| | | ww = (ww >> expo_z) | (zz << (32 - expo_z)); | |
| | | zz = (zz >> expo_z); | |
| | | } | |
| | | | |
| | | } else { | |
| | | temp = 1; | |
| | | ww = 0; | |
| | | zz = 0; | |
| | | } | |
| | | if ((int)u < 0) { | |
| | | /* signs differ, effective subtraction */ | |
| | | temp = (unsigned)(-(int)temp); | |
| | | s = (temp != 0); | |
| | | u = yy - s; | |
| | | s = u > yy; | |
| | | yy = u - ww; | |
| | | s += yy > u; | |
| | | xx = (xx - zz) - s; | |
| | | if (!(xx | yy | temp)) { | |
| | | /* complete cancellation, return 0 */ | |
| | | return __int_as_float(xx); | |
| | | } | |
| | | if ((int)xx < 0) { | |
| | | /* oops, augend had smaller mantissa. Negate mantissa and flip | |
| | | sign of result. */ | |
| | | temp = ~temp; | |
| | | yy = ~yy; | |
| | | xx = ~xx; | |
| | | if (++temp == 0) { | |
| | | if (++yy == 0) { | |
| | | ++xx; | |
| | | } | |
| | | } | |
| | | expo_y ^= 0x80000000; | |
| | | } | |
| | | /* normalize mantissa, if necessary */ | |
| | | while (!(xx & 0x00800000)) { | |
| | | xx = (xx << 1) | (yy >> 31); | |
| | | yy = (yy << 1); | |
| | | expo_x--; | |
| | | } | |
| | | } else { | |
| | | /* signs are the same, effective addition */ | |
| | | yy = yy + ww; | |
| | | s = yy < ww; | |
| | | xx = xx + zz + s; | |
| | | if (xx & 0x01000000) { | |
| | | temp = temp | (yy << 31); | |
| | | yy = (yy >> 1) | (xx << 31); | |
| | | xx = ((xx & 0x80000000) | (xx >> 1)) & ~0x40000000; | |
| | | expo_x++; | |
| | | } | |
| | | } | |
| | | } | |
| | | temp = yy | (temp != 0); | |
| | | if (expo_x <= 0xFD) { | |
| | | /* normal */ | |
| | | xx |= expo_y; /* or in sign bit */ | |
| | | xx += (temp && !expo_y); /* round result */ | |
| | | xx = xx + (expo_x << 23); /* add in exponent */ | |
| | | return __int_as_float(xx); | |
| | | } else if ((int)expo_x >= 126) { | |
| | | /* overflow */ | |
| | | xx = expo_y | (expo_y ? 0x7f7fffff : 0x7F800000); | |
| | | return __int_as_float(xx); | |
| | | } | |
| | | /* subnormal */ | |
| | | expo_x = ((unsigned int)-((int)expo_x)); | |
| | | xx += (temp && !expo_y); | |
| | | xx = (xx >> expo_x); | |
| | | if ((expo_x > 25) || (xx != 0x00800000)) xx = 0; | |
| | | return __int_as_float(expo_y | xx); | |
| | | } | |
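| | | /* Round-up again keys the increment off the sticky bits and the | |
| | | sign, (temp && !expo_y): only positive results are bumped, | |
| | | positive overflow goes to +inf, and negative overflow saturates | |
| | | at -FLT_MAX (0xff7fffff). */ | |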
| | | | |
| | | __device_func__(float __fmaf_rd (float a, float b, float c)) | |
| | | { | |
| | | unsigned long long product; | |
| | | unsigned int xx, yy, zz, ww; | |
| | | unsigned int temp, s, u; | |
| | | unsigned int expo_x, expo_y, expo_z; | |
| | | | |
| | | xx = __float_as_int(a); | |
| | | yy = __float_as_int(b); | |
| | | zz = __float_as_int(c); | |
| | | | |
| | | /* Match 'denormals are zero' behavior of the GPU */ | |
| | | if ((xx << 1) < 0x01000000) xx &= 0x80000000; | |
| | | if ((yy << 1) < 0x01000000) yy &= 0x80000000; | |
| | | if ((zz << 1) < 0x01000000) zz &= 0x80000000; | |
| | | | |
| | | temp = 0xff; | |
| | | expo_x = temp & (xx >> 23); | |
| | | expo_x = expo_x - 1; | |
| | | expo_y = temp & (yy >> 23); | |
| | | expo_y = expo_y - 1; | |
| | | expo_z = temp & (zz >> 23); | |
| | | expo_z = expo_z - 1; | |
| | | | |
| | | if (!((expo_x <= 0xFD) && | |
| | | (expo_y <= 0xFD) && | |
| | | (expo_z <= 0xFD))) { | |
| | | /* fmad (nan, y, z) --> nan | |
| | | fmad (x, nan, z) --> nan | |
| | | fmad (x, y, nan) --> nan | |
| | | */ | |
| | | if ((yy << 1) > 0xff000000) { | |
| | | return rsqrtf(b); | |
| | | } | |
| | | if ((zz << 1) > 0xff000000) { | |
| | | return rsqrtf(c); | |
| | | } | |
| | | if ((xx << 1) > 0xff000000) { | |
| | | return rsqrtf(a); | |
| | | } | |
| | | /* fmad (0, inf, z) --> NaN | |
| | | fmad (inf, 0, z) --> NaN | |
| | | fmad (-inf,+y,+inf) --> NaN | |
| | | fmad (+x,-inf,+inf) --> NaN | |
| | | fmad (+inf,-y,+inf) --> NaN | |
| | | fmad (-x,+inf,+inf) --> NaN | |
| | | fmad (-inf,-y,-inf) --> NaN | |
| | | fmad (-x,-inf,-inf) --> NaN | |
| | | fmad (+inf,+y,-inf) --> NaN | |
| | | fmad (+x,+inf,-inf) --> NaN | |
| | | */ | |
| | | if ((((xx << 1) == 0) && ((yy << 1) == 0xff000000)) || | |
| | | (((yy << 1) == 0) && ((xx << 1) == 0xff000000))) { | |
| | | return rsqrtf(__int_as_float(0xffc00000)); | |
| | | } | |
| | | if ((zz << 1) == 0xff000000) { | |
| | | if (((yy << 1) == 0xff000000) || ((xx << 1) == 0xff000000)) { | |
| | | if ((int)(xx ^ yy ^ zz) < 0) { | |
| | | return rsqrtf(__int_as_float(0xffc00000)); | |
| | | } | |
| | | } | |
| | | } | |
| | | /* fmad (inf, y, z) --> inf | |
| | | fmad (x, inf, z) --> inf | |
| | | fmad (x, y, inf) --> inf | |
| | | */ | |
| | | if ((xx << 1) == 0xff000000) { | |
| | | xx = xx ^ (yy & 0x80000000); | |
| | | return __int_as_float(xx); | |
| | | } | |
| | | if ((yy << 1) == 0xff000000) { | |
| | | yy = yy ^ (xx & 0x80000000); | |
| | | return __int_as_float(yy); | |
| | | } | |
| | | if ((zz << 1) == 0xff000000) { | |
| | | return __int_as_float(zz); | |
| | | } | |
| | | /* fmad (+0, -y, -0) --> -0 | |
| | | fmad (-0, +y, -0) --> -0 | |
| | | fmad (+x, -0, -0) --> -0 | |
| | | fmad (-x, +0, -0) --> -0 | |
| | | */ | |
| | | if (zz == 0x80000000) { | |
| | | if (((xx << 1) == 0) || ((yy << 1) == 0)) { | |
| | | if ((int)(xx ^ yy) < 0) { | |
| | | return __int_as_float(zz); | |
| | | } | |
| | | } | |
| | | } | |
| | | /* fmad (0, y, 0) --> +0 | |
| | | fmad (x, 0, 0) --> +0 | |
| | | */ | |
| | | if (((zz << 1) == 0) && | |
| | | (((xx << 1) == 0) || ((yy << 1) == 0))) { | |
| | | zz = (xx ^ yy ^ zz) & 0x80000000; | |
| | | return __int_as_float(zz); | |
| | | } | |
| | | /* fmad (0, y, z) --> z | |
| | | fmad (x, 0, z) --> z | |
| | | */ | |
| | | if (((xx << 1) == 0) || ((yy << 1) == 0)) { | |
| | | return __int_as_float(zz); | |
| | | } | |
| | | /* normalize x, if denormal */ | |
| | | if (expo_x == (unsigned)-1) { | |
| | | temp = xx & 0x80000000; | |
| | | xx = xx << 8; | |
| | | while (!(xx & 0x80000000)) { | |
| | | xx <<= 1; | |
| | | expo_x--; | |
| | | } | |
| | | expo_x++; | |
| | | xx = (xx >> 8) | temp; | |
| | | } | |
| | | /* normalize y, if denormal */ | |
| | | if (expo_y == (unsigned)-1) { | |
| | | temp = yy & 0x80000000; | |
| | | yy = yy << 8; | |
| | | while (!(yy & 0x80000000)) { | |
| | | yy <<= 1; | |
| | | expo_y--; | |
| | | } | |
| | | expo_y++; | |
| | | yy = (yy >> 8) | temp; | |
| | | } | |
| | | /* normalize z, if denormal */ | |
| | | if ((expo_z == (unsigned)-1) && ((zz << 1) != 0)) { | |
| | | temp = zz & 0x80000000; | |
| | | zz = zz << 8; | |
| | | while (!(zz & 0x80000000)) { | |
| | | zz <<= 1; | |
| | | expo_z--; | |
| | | } | |
| | | expo_z++; | |
| | | zz = (zz >> 8) | temp; | |
| | | } | |
| | | } | |
| | | | |
| | | expo_x = expo_x + expo_y; | |
| | | expo_y = xx ^ yy; | |
| | | xx = xx & 0x00ffffff; | |
| | | yy = yy << 8; | |
| | | xx = xx | 0x00800000; | |
| | | yy = yy | 0x80000000; | |
| | | | |
| | | product = ((unsigned long long)xx) * yy; | |
| | | xx = (unsigned)(product >> 32); | |
| | | yy = (unsigned)(product & 0xffffffff); | |
| | | | |
| | | expo_x = expo_x - 127 + 2; | |
| | | expo_y = expo_y & 0x80000000; | |
| | | /* normalize mantissa */ | |
| | | if (xx < 0x00800000) { | |
| | | xx = (xx << 1) | (yy >> 31); | |
| | | yy = (yy << 1); | |
| | | expo_x--; | |
| | | } | |
| | | temp = 0; | |
| | | | |
| | | if ((zz << 1) != 0) { /* z is not zero */ | |
| | | s = zz & 0x80000000; | |
| | | zz &= 0x00ffffff; | |
| | | zz |= 0x00800000; | |
| | | ww = 0; | |
| | | /* compare and swap. put augend into xx:yy */ | |
| | | if ((int)expo_z > (int)expo_x) { | |
| | | temp = expo_z; | |
| | | expo_z = expo_x; | |
| | | expo_x = temp; | |
| | | temp = zz; | |
| | | zz = xx; | |
| | | xx = temp; | |
| | | temp = ww; | |
| | | ww = yy; | |
| | | yy = temp; | |
| | | temp = expo_y; | |
| | | expo_y = s; | |
| | | s = temp; | |
| | | } | |
| | | /* augend_sign = expo_y, augend_mant = xx:yy, augend_expo = expo_x */ | |
| | | /* addend_sign = s, addend_mant = zz:ww, addend_expo = expo_z */ | |
| | | expo_z = expo_x - expo_z; | |
| | | u = expo_y ^ s; | |
| | | if (expo_z <= 49) { | |
| | | /* denormalize addend */ | |
| | | temp = 0; | |
| | | while (expo_z >= 32) { | |
| | | temp = ww | (temp != 0); | |
| | | ww = zz; | |
| | | zz = 0; | |
| | | expo_z -= 32; | |
| | | } | |
| | | if (expo_z) { | |
| | | temp = ((temp >> expo_z) | (ww << (32 - expo_z)) | | |
| | | ((temp << (32 - expo_z)) != 0)); | |
| | | ww = (ww >> expo_z) | (zz << (32 - expo_z)); | |
| | | zz = (zz >> expo_z); | |
| | | } | |
| | | | |
| | | } else { | |
| | | temp = 1; | |
| | | ww = 0; | |
| | | zz = 0; | |
| | | } | |
| | | if ((int)u < 0) { | |
| | | /* signs differ, effective subtraction */ | |
| | | temp = (unsigned)(-(int)temp); | |
| | | s = (temp != 0); | |
| | | u = yy - s; | |
| | | s = u > yy; | |
| | | yy = u - ww; | |
| | | s += yy > u; | |
| | | xx = (xx - zz) - s; | |
| | | if (!(xx | yy | temp)) { | |
| | | /* complete cancellation, return -0 */ | |
| | | return __int_as_float(0x80000000); | |
| | | } | |
| | | if ((int)xx < 0) { | |
| | | /* oops, augend had smaller mantissa. Negate mantissa and flip | |
| | | sign of result */ | |
| | | temp = ~temp; | |
| | | yy = ~yy; | |
| | | xx = ~xx; | |
| | | if (++temp == 0) { | |
| | | if (++yy == 0) { | |
| | | ++xx; | |
| | | } | |
| | | } | |
| | | expo_y ^= 0x80000000; | |
| | | } | |
| | | /* normalize mantissa, if necessary */ | |
| | | while (!(xx & 0x00800000)) { | |
| | | xx = (xx << 1) | (yy >> 31); | |
| | | yy = (yy << 1); | |
| | | expo_x--; | |
| | | } | |
| | | } else { | |
| | | /* signs are the same, effective addition */ | |
| | | yy = yy + ww; | |
| | | s = yy < ww; | |
| | | xx = xx + zz + s; | |
| | | if (xx & 0x01000000) { | |
| | | temp = temp | (yy << 31); | |
| | | yy = (yy >> 1) | (xx << 31); | |
| | | xx = ((xx & 0x80000000) | (xx >> 1)) & ~0x40000000; | |
| | | expo_x++; | |
| | | } | |
| | | } | |
| | | } | |
| | | temp = yy | (temp != 0); | |
| | | if (expo_x <= 0xFD) { | |
| | | /* normal */ | |
| | | xx |= expo_y; /* or in sign bit */ | |
| | | xx += (temp && expo_y); /* round result */ | |
| | | xx = xx + (expo_x << 23); /* add in exponent */ | |
| | | return __int_as_float(xx); | |
| | | } else if ((int)expo_x >= 126) { | |
| | | /* overflow */ | |
| | | xx = expo_y | (expo_y ? 0x7f800000 : 0x7F7FFFFF); | |
| | | return __int_as_float(xx); | |
| | | } | |
| | | /* subnormal */ | |
| | | expo_x = ((unsigned int)-((int)expo_x)); | |
| | | xx += (temp && expo_y); | |
| | | xx = (xx >> expo_x); | |
| | | if ((expo_x > 25) || (xx != 0x00800000)) xx = 0; | |
| | | return __int_as_float(expo_y | xx); | |
| | | } | |
| | | | |
| | | #else /* defined(__CUDABE__) */ | |
| | | #include "common_types.h" | |
| | | | |
| | | static __device__ const unsigned char __internal_rcpTab[128] = | |
| | | { | |
| | | 0xff, 0xfd, 0xfb, 0xf9, 0xf7, 0xf5, 0xf4, 0xf2, | |
| | | 0xf0, 0xee, 0xed, 0xeb, 0xe9, 0xe8, 0xe6, 0xe4, | |
| | | 0xe3, 0xe1, 0xe0, 0xde, 0xdd, 0xdb, 0xda, 0xd8, | |
| | | 0xd7, 0xd5, 0xd4, 0xd3, 0xd1, 0xd0, 0xcf, 0xcd, | |
| | | 0xcc, 0xcb, 0xca, 0xc8, 0xc7, 0xc6, 0xc5, 0xc4, | |
| | | 0xc2, 0xc1, 0xc0, 0xbf, 0xbe, 0xbd, 0xbc, 0xbb, | |
| | | 0xba, 0xb9, 0xb8, 0xb7, 0xb6, 0xb5, 0xb4, 0xb3, | |
| | | 0xb2, 0xb1, 0xb0, 0xaf, 0xae, 0xad, 0xac, 0xab, | |
| | | 0xaa, 0xa9, 0xa8, 0xa8, 0xa7, 0xa6, 0xa5, 0xa4, | |
| | | 0xa3, 0xa3, 0xa2, 0xa1, 0xa0, 0x9f, 0x9f, 0x9e, | |
| | | 0x9d, 0x9c, 0x9c, 0x9b, 0x9a, 0x99, 0x99, 0x98, | |
| | | 0x97, 0x97, 0x96, 0x95, 0x95, 0x94, 0x93, 0x93, | |
| | | 0x92, 0x91, 0x91, 0x90, 0x8f, 0x8f, 0x8e, 0x8e, | |
| | | 0x8d, 0x8c, 0x8c, 0x8b, 0x8b, 0x8a, 0x89, 0x89, | |
| | | 0x88, 0x88, 0x87, 0x87, 0x86, 0x85, 0x85, 0x84, | |
| | | 0x84, 0x83, 0x83, 0x82, 0x82, 0x81, 0x81, 0x80 | |
| | | }; | |
| | | | |
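| | | /* An illustrative host-side sketch (not part of the original header): | |
| | |    each entry above appears to be the 8-bit reciprocal of the midpoint | |
| | |    of the i-th mantissa interval [1 + i/128, 1 + (i+1)/128), i.e. | |
| | |    round(2^16 / (257 + 2*i)); the program below regenerates the table | |
| | |    under that assumption. */ | |
| | | #include <stdio.h> | |
| | | int main(void) | |
| | | { | |
| | |   int i; | |
| | |   for (i = 0; i < 128; i++) { | |
| | |     unsigned int entry = (131072u / (257u + 2u * (unsigned int)i) + 1u) / 2u; | |
| | |     printf("0x%02x%s", entry, (i % 8 == 7) ? ",\n" : ", "); | |
| | |   } | |
| | |   return 0; | |
| | | } | |
| | | | |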
| | | static __device__ const unsigned int __internal_invSqrtCubeTab[96] = | |
| | | { | |
| | | 0xfa0bf8fe, 0xee6b28fa, 0xe5f024f7, 0xdaf268f3, | |
| | | 0xd2f000f0, 0xc890c0ec, 0xc10378e9, 0xb9a758e6, | |
| | | 0xb4da40e4, 0xadcea0e1, 0xa6f278de, 0xa279c0dc, | |
| | | 0x9beb48d9, 0x97a5c4d7, 0x916340d4, 0x8d4fc8d2, | |
| | | 0x895000d0, 0x8563b8ce, 0x818ac0cc, 0x7dc4e8ca, | |
| | | 0x7a1200c8, 0x7671d8c6, 0x72e440c4, 0x6f6908c2, | |
| | | 0x6db240c1, 0x6a523cbf, 0x670424bd, 0x6563c0bc, | |
| | | 0x623028ba, 0x609ce8b9, 0x5d8364b7, 0x5bfd18b6, | |
| | | 0x58fd40b4, 0x5783a8b3, 0x560e48b2, 0x533000b0, | |
| | | 0x51c70caf, 0x506238ae, 0x4da4c0ac, 0x4c4c10ab, | |
| | | 0x4af768aa, 0x49a6b8a9, 0x485a00a8, 0x471134a7, | |
| | | 0x45cc58a6, 0x434e40a4, 0x4214f8a3, 0x40df88a2, | |
| | | 0x3fade0a1, 0x3e8000a0, 0x3d55dc9f, 0x3c2f789e, | |
| | | 0x3c2f789e, 0x3b0cc49d, 0x39edc09c, 0x38d2609b, | |
| | | 0x37baa89a, 0x36a68899, 0x35960098, 0x34890497, | |
| | | 0x34890497, 0x337f9896, 0x3279ac95, 0x31774094, | |
| | | 0x30784893, 0x30784893, 0x2f7cc892, 0x2e84b091, | |
| | | 0x2d900090, 0x2d900090, 0x2c9eac8f, 0x2bb0b88e, | |
| | | 0x2bb0b88e, 0x2ac6148d, 0x29dec08c, 0x29dec08c, | |
| | | 0x28fab08b, 0x2819e88a, 0x2819e88a, 0x273c5889, | |
| | | 0x273c5889, 0x26620088, 0x258ad487, 0x258ad487, | |
| | | 0x24b6d886, 0x24b6d886, 0x23e5fc85, 0x23184084, | |
| | | 0x23184084, 0x224d9883, 0x224d9883, 0x21860882, | |
| | | 0x21860882, 0x20c18081, 0x20c18081, 0x20000080 | |
| | | }; | |
| | | | |
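| | | /* The packing above appears to serve the first Newton-Raphson step in | |
| | |    __internal_fsqrt_kernel below: the low byte holds an 8-bit seed | |
| | |    y0 ~ 1/sqrt(m) (with bits 8-9 kept zero), and the high bits hold a | |
| | |    scaled y0^3, so ((entry*3) << 22) reduces to (3*y0) << 22 while the | |
| | |    single multiply x*entry supplies the x*y0^3 term of the update | |
| | |    y1 = (3*y0 - x*y0^3) / 2. */ | |
| | | | |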
| | | __device_func__(float __internal_frcp_kernel (float x, enum cudaRoundMode mode)) | |
| | | { | |
| | | unsigned long long prod; | |
| | | volatile union __cudart_FloatUintCvt arg; | |
| | | unsigned int expo; | |
| | | unsigned int sign; | |
| | | unsigned f, y; | |
| | | | |
| | | arg.f = x; | |
| | | sign = arg.i & 0x80000000; | |
| | | expo = (arg.i >> 23); | |
| | | expo = expo & 0xff; | |
| | | f = expo - 1; | |
| | | | |
| | | if (f <= 0xFD) { | |
| | | y = (arg.i << 8); | |
| | | y = y | 0x80000000; | |
| | | /* initial approximation */ | |
| | | arg.i = __internal_rcpTab[(y >> 24) - 128]; | |
| | | /* first NR iteration */ | |
| | | f = arg.i * arg.i; | |
| | | f = f << 16; | |
| | | prod = ((unsigned long long)y) * f; | |
| | | arg.i = (arg.i << 24) - (unsigned)(prod >> 32); | |
| | | /* second NR iteration */ | |
| | | f = arg.i + arg.i; | |
| | | prod = ((unsigned long long)y) * f; | |
| | | f = (unsigned)(-(int)(prod >> 32)); | |
| | | prod = ((unsigned long long)arg.i) * f; | |
| | | y = y >> 8; | |
| | | /* compute exponent */ | |
| | | expo = (2 * 127) - expo - 2; | |
| | | arg.i = (unsigned)(prod >> 32); | |
| | | if (mode == cudaRoundNearest) { | |
| | | arg.i = arg.i >> 6; | |
| | | } else { | |
| | | arg.i = (arg.i + 32) >> 6; | |
| | | } | |
| | | if ((int)expo >= 0) { | |
| | | f = y * arg.i; | |
| | | arg.i = ((expo << 23) + arg.i) | sign; | |
| | | } else { | |
| | | /* result is a denormal */ | |
| | | expo = -(int)expo; | |
| | | arg.i = arg.i >> expo; | |
| | | f = y * arg.i; | |
| | | arg.i = arg.i | sign; | |
| | | } | |
| | | if (mode == cudaRoundNearest) { | |
| | | expo = f + y; | |
| | | if ((int)f < 0) f = (unsigned)(-(int)f); | |
| | | if ((int)expo < 0) expo = (unsigned)(-(int)expo); | |
| | | if (expo < f) arg.i++; | |
| | | } else if (mode == cudaRoundZero) { | |
| | | if ((int)f > 0) arg.i = arg.i - 1; | |
| | | } else if (mode == cudaRoundPosInf) { | |
| | | if (((int)f > 0) && sign) arg.i = arg.i - 1; | |
| | | if (((int)f < 0) && !sign) arg.i = arg.i + 1; | |
| | | } else { /* mode == cudaRoundMinInf */ | |
| | | if (((int)f > 0) && !sign) arg.i = arg.i - 1; | |
| | | if (((int)f < 0) && sign) arg.i = arg.i + 1; | |
| | | } | |
| | | return arg.f; | |
| | | } else { | |
| | | /* zero returns infinity. Must handle negative zero as well */ | |
| | | if (!(arg.i << 1)) { | |
| | | arg.i = 0x7F800000 | arg.i; | |
| | | return arg.f; | |
| | | } | |
| | | /* infinity returns zero of like sign */ | |
| | | if ((arg.i << 1) == 0xff000000) { | |
| | | arg.i &= 0x80000000; | |
| | | return arg.f; | |
| | | } | |
| | | /* convert SNaNs to QNaNs */ | |
| | | if ((arg.i << 1) > 0xff000000) { | |
| | | arg.i |= 0x00400000; | |
| | | return arg.f; | |
| | | } | |
| | | /* denormals */ | |
| | | f = 0; | |
| | | arg.i <<= 8; | |
| | | do { | |
| | | f++; | |
| | | arg.i <<= 1; | |
| | | } while ((int)arg.i > 0); | |
| | | arg.i >>= 8; | |
| | | arg.i |= sign; | |
| | | arg.f = __internal_frcp_kernel (arg.f, mode); | |
| | | expo = ((arg.i << 1) >> 24); | |
| | | if ((expo + f) < 255) { | |
| | | arg.i = (arg.i + (f << 23)); | |
| | | return arg.f; | |
| | | } | |
| | | if (mode == cudaRoundNearest) { | |
| | | arg.i = (arg.i & 0x80000000) | 0x7f800000; | |
| | | } else if (mode == cudaRoundZero) { | |
| | | arg.i = (arg.i & 0x80000000) | 0x7f7fffff; | |
| | | } else if (mode == cudaRoundPosInf) { | |
| | | arg.i = (arg.i & 0x80000000) | ((sign) ? 0x7f7fffff : 0x7f800000); | |
| | | } else { /* mode == cudaRoundMinInf */ | |
| | | arg.i = (arg.i & 0x80000000) | ((sign) ? 0x7f800000 : 0x7f7fffff); | |
| | | } | |
| | | return arg.f; | |
| | | } | |
| | | } | |
| | | | |
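| | | /* A minimal usage sketch (hypothetical helper; assumes this emulation | |
| | |    path is compiled for the host): 1/3 is inexact in binary, so the | |
| | |    four rounding modes split into two results one ulp apart, with | |
| | |    rd <= 1/3 <= ru. */ | |
| | | static void __example_frcp_modes(void) | |
| | | { | |
| | |   float rn = __internal_frcp_kernel(3.0f, cudaRoundNearest); /* 0x3eaaaaab */ | |
| | |   float rz = __internal_frcp_kernel(3.0f, cudaRoundZero);    /* 0x3eaaaaaa */ | |
| | |   float rd = __internal_frcp_kernel(3.0f, cudaRoundMinInf);  /* 0x3eaaaaaa */ | |
| | |   float ru = __internal_frcp_kernel(3.0f, cudaRoundPosInf);  /* 0x3eaaaaab */ | |
| | |   (void)rn; (void)rz; (void)rd; (void)ru; | |
| | | } | |
| | | | |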
| | | __device_func__(float __internal_fsqrt_kernel (float radicand, | |
| | | enum cudaRoundMode mode)) | |
| | | { | |
| | | unsigned long long prod; | |
| | | volatile union __cudart_FloatUintCvt arg; | |
| | | unsigned int expo; | |
| | | unsigned int s, f, x; | |
| | | | |
| | | arg.f = radicand; | |
| | | expo = arg.i >> 23; | |
| | | expo = expo & 0xff; | |
| | | f = expo - 1; | |
| | | | |
| | | if ((arg.i <= 0x80000000) && (f <= 0xFD)) { | |
| | | /* normalize input argument */ | |
| | | x = (arg.i << 8) | 0x80000000; | |
| | | x = x >> (expo & 1); | |
| | | /* initial approximation */ | |
| | | arg.i = f = __internal_invSqrtCubeTab[((unsigned)x >> 25) - 32]; | |
| | | /* first NR iteration */ | |
| | | prod = ((unsigned long long)x) * f; | |
| | | arg.i = ((arg.i * 3) << 22) - (unsigned)(prod >> 32); | |
| | | /* second NR iteration */ | |
| | | prod = ((unsigned long long)arg.i) * arg.i; | |
| | | s = (unsigned)(prod >> 32); | |
| | | prod = ((unsigned long long)x) * s; | |
| | | f = 0x30000000 - (unsigned)(prod >> 32); | |
| | | prod = ((unsigned long long)f) * arg.i; | |
| | | arg.i = (unsigned)(prod >> 32); | |
| | | /* compute sqrt(x) as x * 1/sqrt(x) */ | |
| | | prod = ((unsigned long long)x) * arg.i; | |
| | | arg.i = (unsigned)(prod >> 32); | |
| | | if (mode == cudaRoundNearest) { | |
| | | arg.i = arg.i >> 3; | |
| | | } else { | |
| | | arg.i = (arg.i + 4) >> 3; | |
| | | } | |
| | | x = (x << 16) - (arg.i * arg.i); | |
| | | /* round to nearest based on remainder; tie case impossible */ | |
| | | if (mode == cudaRoundNearest) { | |
| | | f = x - (2 * arg.i + 1); | |
| | | if ((int)f < 0) f = (unsigned)(-(int)f); | |
| | | if ((int)x < 0) x = (unsigned)(-(int)x); | |
| | | if (f < x) arg.i ++; | |
| | | } else if ((mode == cudaRoundZero) || (mode == cudaRoundMinInf)) { | |
| | | if ((int)x < 0) arg.i--; | |
| | | } else if (mode == cudaRoundPosInf) { | |
| | | if ((int)x > 0) arg.i++; | |
| | | } | |
| | | arg.i = arg.i + (((expo + 125) & ~0x1) << 22); | |
| | | return arg.f; | |
| | | } else { | |
| | | /* if zero, or positive infinity, return argument */ | |
| | | if (!(arg.i << 1) || (arg.i == 0x7F800000)) { | |
| | | return arg.f; | |
| | | } | |
| | | /* if NaN, return argument, possibly converted to QNaN */ | |
| | | if ((arg.i << 1) > 0xFF000000) { | |
| | | arg.i |= 0x00400000; | |
| | | return arg.f; | |
| | | } | |
| | | /* if negative, return NaN: INDEFINITE */ | |
| | | if (arg.i & 0x80000000) { | |
| | | arg.i = 0xFFC00000; | |
| | | return arg.f; | |
| | | } | |
| | | /* denormal, normalize it before computing square root */ | |
| | | x = 0; | |
| | | arg.i <<= 8; | |
| | | do { | |
| | | x++; | |
| | | arg.i <<= 1; | |
| | | } while ((int)arg.i > 0); | |
| | | arg.i >>= 8; | |
| | | arg.i += (x & 1) << 23; | |
| | | x += (x & 1); | |
| | | arg.f = __internal_fsqrt_kernel (arg.f, mode); | |
| | | arg.i -= ((x >> 1) << 23); | |
| | | return arg.f; | |
| | | } | |
| | | } | |
| | | | |
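| | | /* A minimal usage sketch (hypothetical helper): sqrt(2) falls strictly | |
| | |    between two floats, closer to the lower one, so only round-up lands | |
| | |    one ulp higher. */ | |
| | | static void __example_fsqrt_modes(void) | |
| | | { | |
| | |   float rn = __internal_fsqrt_kernel(2.0f, cudaRoundNearest); /* 0x3fb504f3 */ | |
| | |   float ru = __internal_fsqrt_kernel(2.0f, cudaRoundPosInf);  /* 0x3fb504f4 */ | |
| | |   (void)rn; (void)ru; | |
| | | } | |
| | | | |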
| | | __device_func__(float __internal_fdiv_kernel (float dividend, float divisor, | |
| | | enum cudaRoundMode mode)) | |
| | | { | |
| | | unsigned long long prod; | |
| | | unsigned r, f, x, y, expox, expoy, sign; | |
| | | volatile union __cudart_FloatUintCvt cvtx, cvty, res; | |
| | | | |
| | | cvtx.f = dividend; | |
| | | cvty.f = divisor; | |
| | | expox = ((cvtx.i >> 23) & 0xff) - 1; | |
| | | expoy = ((cvty.i >> 23) & 0xff) - 1; | |
| | | sign = ((cvtx.i ^ cvty.i) & 0x80000000); | |
| | | | |
| | | if ((expox <= 0xFD) && (expoy <= 0xFD)) { | |
| | | divide: | |
| | | expox = expox - expoy + 127 - 1; | |
| | | expoy = expox; | |
| | | /* extract mantissas */ | |
| | | y = (cvty.i << 8) | 0x80000000; | |
| | | x = (cvtx.i & 0x00ffffff) | 0x00800000; | |
| | | /* initial approximation */ | |
| | | r = __internal_rcpTab[(y >> 24) - 128]; | |
| | | /* first NR iteration */ | |
| | | f = r * r; | |
| | | prod = ((unsigned long long)y) * (f << 16); | |
| | | r = (r << 24) - (unsigned)(prod >> 32); | |
| | | /* second NR iteration */ | |
| | | prod = ((unsigned long long)y) * (r << 1); | |
| | | f = (unsigned)-(int)(prod >> 32); | |
| | | prod = ((unsigned long long)f) * (r << 1); | |
| | | r = (unsigned)(prod >> 32); | |
| | | /* produce quotient */ | |
| | | prod = ((unsigned long long)x) * (r << 1); | |
| | | /* normalize mantissa */ | |
| | | if (((int)((prod >> 32) << 8)) > 0) { | |
| | | expox--; | |
| | | prod = prod + prod; | |
| | | } | |
| | | if (mode == cudaRoundNearest) { | |
| | | /* preliminary mantissa */ | |
| | | r = (unsigned)(prod >> 32); | |
| | | y = y >> 8; | |
| | | /* result is a normal */ | |
| | | if (expox <= 0xFD) { | |
| | | int rem0, rem1, inc; | |
| | | /* round mantissa to nearest even */ | |
| | | prod = ((unsigned long long)y) * r; | |
| | | x = x << (23 + ((prod >> 32) >> 15)); | |
| | | rem1 = x - (unsigned)(prod & 0xffffffff); | |
| | | rem0 = rem1 - y; | |
| | | inc = abs(rem0) < abs(rem1); | |
| | | /* merge sign, mantissa, exponent for final result */ | |
| | | res.i = sign | ((expox << 23) + r + inc); | |
| | | return res.f; | |
| | | } else if ((int)expox >= 254) { | |
| | | /* overflow: return infinity */ | |
| | | res.i = sign | 0x7f800000; | |
| | | return res.f; | |
| | | } else { | |
| | | /* underflow: result is zero, denormal, or smallest normal */ | |
| | | int shift = -(int)expox; | |
| | | if (shift > 23) { | |
| | | /* result is zero or smallest denormal */ | |
| | | r = (shift < 25) && ((x != y) || (r > 0x00ff0000)); | |
| | | res.i = sign | r; | |
| | | return res.f; | |
| | | } | |
| | | if (x == y) { | |
| | | /* result is denormal */ | |
| | | shift = -(int)expoy; | |
| | | r = 0x00800000 >> shift; | |
| | | res.i = sign | r; | |
| | | return res.f; | |
| | | } | |
| | | { | |
| | | unsigned long long tempx; | |
| | | long long remlo, remhi; | |
| | | /* result is denormal or smallest normal */ | |
| | | r = r >> shift; | |
| | | prod = ((unsigned long long)y) * r; | |
| | | tempx = ((unsigned long long)x) << (23 - shift); | |
| | | remlo = 2 * tempx - 2 * prod - y; | |
| | | remhi = remlo + 2 * tempx; | |
| | | if (remlo < 0) remlo = -remlo; | |
| | | if (remhi < 0) remhi = -remhi; | |
| | | if (remhi < remlo) tempx = 2 * tempx; | |
| | | remlo = tempx - prod; | |
| | | remhi = remlo - y; | |
| | | if (remlo < 0) remlo = -remlo; | |
| | | if (remhi < 0) remhi = -remhi; | |
| | | if ((remhi < remlo) || ((remhi == remlo) && (r & 1))) r++; | |
| | | res.i = sign | r; | |
| | | return res.f; | |
| | | } | |
| | | } | |
| | | } else if (mode == cudaRoundZero) { | |
| | | /* preliminary mantissa */ | |
| | | prod += 0x0000000080000000ULL; | |
| | | r = (unsigned)(prod >> 32); | |
| | | y = y >> 8; | |
| | | /* result is a normal */ | |
| | | if (expox <= 0xFD) { | |
| | | int rem1; | |
| | | prod = ((unsigned long long)y) * r; | |
| | | x = x << (23 + ((prod >> 32) >> 15)); | |
| | | rem1 = x - (unsigned)(prod & 0xffffffff); | |
| | | if (rem1 < 0) r--; | |
| | | r = (expox << 23) + r; | |
| | | if (r == 0x7f800000) r = 0x7f7fffff; | |
| | | res.i = sign | r; | |
| | | return res.f; | |
| | | } else if ((int)expox >= 254) { | |
| | | /* overflow: return largest normal */ | |
| | | res.i = sign | 0x7f7fffff; | |
| | | return res.f; | |
| | | } else { | |
| | | /* underflow: result is zero, denormal, or smallest normal */ | |
| | | int shift = -(int)expox; | |
| | | if ((x == y) && (shift < 31)) { | |
| | | shift = -(int)expoy; | |
| | | r = 0x00800000 >> shift; | |
| | | res.i = sign | r; | |
| | | return res.f; | |
| | | } | |
| | | if (shift > 23) { | |
| | | r = 0; | |
| | | res.i = sign | r; | |
| | | return res.f; | |
| | | } | |
| | | { | |
| | | unsigned long long tempx; | |
| | | long long remlo, remhi; | |
| | | /* result is denormal or smallest normal */ | |
| | | r = r >> shift; | |
| | | prod = ((unsigned long long)y) * r; | |
| | | tempx = ((unsigned long long)x) << (23 - shift); | |
| | | remlo = 2 * tempx - 2 * prod - y; | |
| | | remhi = remlo + 2 * tempx; | |
| | | if (remlo < 0) remlo = -remlo; | |
| | | if (remhi < 0) remhi = -remhi; | |
| | | if (remhi < remlo) tempx = 2 * tempx; | |
| | | remlo = tempx - prod; | |
| | | if ((remlo < 0) && (r != 0)) r--; | |
| | | res.i = sign | r; | |
| | | return res.f; | |
| | | } | |
| | | } | |
| | | } else if (mode == cudaRoundPosInf) { | |
| | | /* preliminary mantissa */ | |
| | | prod += 0x0000000080000000ULL; | |
| | | r = (unsigned)(prod >> 32); | |
| | | y = y >> 8; | |
| | | /* result is a normal */ | |
| | | if (expox <= 0xFD) { | |
| | | int rem1; | |
| | | prod = ((unsigned long long)y) * r; | |
| | | x = x << (23 + ((prod >> 32) >> 15)); | |
| | | rem1 = x - (unsigned)(prod & 0xffffffff); | |
| | | if ((rem1 < 0) && (sign)) r--; | |
| | | if ((rem1 > 0) && (!sign)) r++; | |
| | | r = (expox << 23) + r; | |
| | | if ((r == 0x7f800000) && (sign)) r = 0x7f7fffff; | |
| | | res.i = sign | r; | |
| | | return res.f; | |
| | | } else if ((int)expox >= 254) { | |
| | | /* overflow: return largest normal, or infinity */ | |
| | | r = sign ? 0x7f7fffff : 0x7f800000; | |
| | | res.i = sign | r; | |
| | | return res.f; | |
| | | } else { | |
| | | /* underflow: result is zero, denormal, or smallest normal */ | |
| | | int shift = -(int)expox; | |
| | | if ((x == y) && (shift <= 24)) { | |
| | | shift = -(int)expoy; | |
| | | r = 0x00800000 >> shift; | |
| | | if (r == 0) r = !sign; | |
| | | res.i = sign | r; | |
| | | return res.f; | |
| | | } | |
| | | if (shift > 23) { | |
| | | r = !sign; | |
| | | res.i = sign | r; | |
| | | return res.f; | |
| | | } | |
| | | { | |
| | | unsigned long long tempx; | |
| | | long long remlo, remhi; | |
| | | /* result is denormal or smallest normal */ | |
| | | r = r >> shift; | |
| | | prod = ((unsigned long long)y) * r; | |
| | | tempx = ((unsigned long long)x) << (23 - shift); | |
| | | remlo = 2 * tempx - 2 * prod - y; | |
| | | remhi = remlo + 2 * tempx; | |
| | | if (remlo < 0) remlo = -remlo; | |
| | | if (remhi < 0) remhi = -remhi; | |
| | | if (remhi < remlo) tempx = 2 * tempx; | |
| | | remlo = tempx - prod; | |
| | | if ((remlo < 0) && (r != 0) && (sign)) r--; | |
| | | if ((remlo > 0) && (!sign)) r++; | |
| | | res.i = sign | r; | |
| | | return res.f; | |
| | | } | |
| | | } | |
| | | } else if (mode == cudaRoundMinInf) { | |
| | | /* preliminary mantissa */ | |
| | | prod += 0x0000000080000000ULL; | |
| | | r = (unsigned)(prod >> 32); | |
| | | y = y >> 8; | |
| | | /* result is a normal */ | |
| | | if (expox <= 0xFD) { | |
| | | int rem1; | |
| | | prod = ((unsigned long long)y) * r; | |
| | | x = x << (23 + ((prod >> 32) >> 15)); | |
| | | rem1 = x - (unsigned)(prod & 0xffffffff); | |
| | | if ((rem1 < 0) && (!sign)) r--; | |
| | | if ((rem1 > 0) && (sign)) r++; | |
| | | r = (expox << 23) + r; | |
| | | if ((r == 0x7f800000) && (!sign)) r = 0x7f7fffff; | |
| | | res.i = sign | r; | |
| | | return res.f; | |
| | | } else if ((int)expox >= 254) { | |
| | | /* overflow: return largest normal, or infinity */ | |
| | | r = sign ? 0x7f800000 : 0x7f7fffff; | |
| | | res.i = sign | r; | |
| | | return res.f; | |
| | | } else { | |
| | | /* underflow: result is zero, denormal, or smallest normal */ | |
| | | int shift = -(int)expox; | |
| | | if ((x == y) && (shift <= 24)) { | |
| | | shift = -(int)expoy; | |
| | | r = 0x00800000 >> shift; | |
| | | if (r == 0) r = !!sign; | |
| | | res.i = sign | r; | |
| | | return res.f; | |
| | | } | |
| | | if (shift > 23) { | |
| | | r = !!sign; | |
| | | res.i = sign | r; | |
| | | return res.f; | |
| | | } | |
| | | { | |
| | | unsigned long long tempx; | |
| | | long long remlo, remhi; | |
| | | /* result is denormal or smallest normal */ | |
| | | r = r >> shift; | |
| | | prod = ((unsigned long long)y) * r; | |
| | | tempx = ((unsigned long long)x) << (23 - shift); | |
| | | remlo = 2 * tempx - 2 * prod - y; | |
| | | remhi = remlo + 2 * tempx; | |
| | | if (remlo < 0) remlo = -remlo; | |
| | | if (remhi < 0) remhi = -remhi; | |
| | | if (remhi < remlo) tempx = 2 * tempx; | |
| | | remlo = tempx - prod; | |
| | | if ((remlo < 0) && (r != 0) && (!sign)) r--; | |
| | | if ((remlo > 0) && (sign)) r++; | |
| | | res.i = sign | r; | |
| | | return res.f; | |
| | | } | |
| | | } | |
| | | } | |
| | | } | |
| | | { | |
| | | int xzero, yzero, xinf, yinf, xnan, ynan; | |
| | | | |
| | | xnan = (cvtx.i << 1) > 0xff000000; | |
| | | ynan = (cvty.i << 1) > 0xff000000; | |
| | | /* handle NaNs. Convert SNaNs to QNaNs */ | |
| | | if (xnan) { | |
| | | res.i = cvtx.i | 0x00400000; | |
| | | return res.f; | |
| | | } | |
| | | if (ynan) { | |
| | | res.i = cvty.i | 0x00400000; | |
| | | return res.f; | |
| | | } | |
| | | xzero = (cvtx.i << 1) == 0x00000000; | |
| | | yzero = (cvty.i << 1) == 0x00000000; | |
| | | xinf = (cvtx.i << 1) == 0xff000000; | |
| | | yinf = (cvty.i << 1) == 0xff000000; | |
| | | /* 0/0 and INF/INF are invalid operations. Return INDEFINITE */ | |
| | | if ((xzero & yzero) | (xinf & yinf)) { | |
| | | res.i = 0xffc00000; | |
| | | return res.f; | |
| | | } | |
| | | /* x/INF and 0/y -> 0 */ | |
| | | if (xzero | yinf) { | |
| | | res.i = sign; | |
| | | return res.f; | |
| | | } | |
| | | /* x/0 and INF/y -> INF */ | |
| | | if (yzero | xinf) { | |
| | | res.i = sign | 0x7f800000; | |
| | | return res.f; | |
| | | } | |
| | | /* normalize denormals */ | |
| | | if ((int)expox < 0) { | |
| | | cvtx.i = cvtx.i << 9; | |
| | | while ((int)cvtx.i >= 0) { | |
| | | expox--; | |
| | | cvtx.i = cvtx.i + cvtx.i; | |
| | | } | |
| | | cvtx.i = cvtx.i >> 8; | |
| | | } | |
| | | if ((int)expoy < 0) { | |
| | | cvty.i = cvty.i << 9; | |
| | | while ((int)cvty.i >= 0) { | |
| | | expoy--; | |
| | | cvty.i = cvty.i + cvty.i; | |
| | | } | |
| | | cvty.i = cvty.i >> 8; | |
| | | } | |
| | | goto divide; | |
| | | } | |
| | | } | |
| | | | |
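| | | /* A minimal usage sketch (hypothetical helper): 1/10 has no exact | |
| | |    binary representation; nearest rounds up, truncation rounds down. */ | |
| | | static void __example_fdiv_modes(void) | |
| | | { | |
| | |   float rn = __internal_fdiv_kernel(1.0f, 10.0f, cudaRoundNearest); /* 0x3dcccccd */ | |
| | |   float rz = __internal_fdiv_kernel(1.0f, 10.0f, cudaRoundZero);    /* 0x3dcccccc */ | |
| | |   (void)rn; (void)rz; | |
| | | } | |
| | | | |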
| | | __device_func__(float __internal_fmul_kernel2 (float a, float b, | |
| | | enum cudaRoundMode mode)) | |
| | | { | |
| | | unsigned long long product; | |
| | | volatile union __cudart_FloatUintCvt xx, yy; | |
| | | | |
| | | unsigned expo_x, expo_y; | |
| | | | |
| | | xx.f = a; | |
| | | yy.f = b; | |
| | | | |
| | | expo_y = 0xFF; | |
| | | expo_x = expo_y & (xx.i >> 23); | |
| | | expo_x = expo_x - 1; | |
| | | expo_y = expo_y & (yy.i >> 23); | |
| | | expo_y = expo_y - 1; | |
| | | | |
| | | if ((expo_x <= 0xFD) && | |
| | | (expo_y <= 0xFD)) { | |
| | | multiply: | |
| | | expo_x = expo_x + expo_y; | |
| | | expo_y = xx.i ^ yy.i; | |
| | | xx.i = xx.i & 0x00ffffff; | |
| | | yy.i = yy.i << 8; | |
| | | xx.i = xx.i | 0x00800000; | |
| | | yy.i = yy.i | 0x80000000; | |
| | | /* compute product */ | |
| | | product = ((unsigned long long)xx.i) * yy.i; | |
| | | expo_x = expo_x - 127 + 2; | |
| | | expo_y = expo_y & 0x80000000; | |
| | | xx.i = (unsigned int)(product >> 32); | |
| | | yy.i = (unsigned int)(product & 0xffffffff); | |
| | | /* normalize mantissa */ | |
| | | if (xx.i < 0x00800000) { | |
| | | xx.i = (xx.i << 1) | (yy.i >> 31); | |
| | | yy.i = (yy.i << 1); | |
| | | expo_x--; | |
| | | } | |
| | | if (expo_x <= 0xFD) { | |
| | | xx.i = xx.i | expo_y; /* OR in sign bit */ | |
| | | xx.i = xx.i + (expo_x << 23); /* add in exponent */ | |
| | | /* round result to nearest or even */ | |
| | | if (mode == cudaRoundNearest) { | |
| | | if (yy.i < 0x80000000) return xx.f; | |
| | | xx.i += ((yy.i == 0x80000000) ? (xx.i & 1) : (yy.i >> 31)); | |
| | | } else if (mode == cudaRoundZero) { | |
| | | } else if (mode == cudaRoundPosInf) { | |
| | | xx.i += (yy.i && !expo_y); | |
| | | } else if (mode == cudaRoundMinInf) { | |
| | | xx.i += (yy.i && expo_y); | |
| | | } | |
| | | return xx.f; | |
| | | } else if ((int)expo_x >= 254) { | |
| | | /* overflow: return infinity or largest normal */ | |
| | | if (mode == cudaRoundNearest) { | |
| | | xx.i = expo_y | 0x7F800000; | |
| | | } else if (mode == cudaRoundZero) { | |
| | | xx.i = expo_y | 0x7F7FFFFF; | |
| | | } else if (mode == cudaRoundPosInf) { | |
| | | xx.i = (expo_y ? 0xff7fffff : 0x7F800000); | |
| | | } else { /* (mode == cudaRoundMinInf) */ | |
| | | xx.i = (expo_y ? 0xFF800000 : 0x7f7fffff); | |
| | | } | |
| | | return xx.f; | |
| | | } else { | |
| | | /* zero, denormal, or smallest normal */ | |
| | | expo_x = ((unsigned int)-((int)expo_x)); | |
| | | if (mode == cudaRoundNearest) { | |
| | | if (expo_x > 25) { | |
| | | /* massive underflow: return 0 */ | |
| | | xx.i = expo_y; | |
| | | return xx.f; | |
| | | } else { | |
| | | yy.i = (xx.i << (32 - expo_x)) | ((yy.i) ? 1 : 0); | |
| | | xx.i = expo_y + (xx.i >> expo_x); | |
| | | xx.i += ((yy.i == 0x80000000) ? (xx.i & 1) : (yy.i >> 31)); | |
| | | return xx.f; | |
| | | } | |
| | | } else if (mode == cudaRoundZero) { | |
| | | if (expo_x > 25) expo_x = 25; | |
| | | xx.i = expo_y + (xx.i >> expo_x); | |
| | | return xx.f; | |
| | | } else if (mode == cudaRoundPosInf) { | |
| | | if (expo_x > 25) expo_x = 25; | |
| | | yy.i = (xx.i << (32 - expo_x)) | ((yy.i) ? 1 : 0); | |
| | | xx.i = expo_y + (xx.i >> expo_x); | |
| | | xx.i += (yy.i && !expo_y); | |
| | | return xx.f; | |
| | | } else { /* (mode == cudaRoundMinInf) */ | |
| | | if (expo_x > 25) expo_x = 25; | |
| | | yy.i = (xx.i << (32 - expo_x)) | ((yy.i) ? 1 : 0); | |
| | | xx.i = expo_y + (xx.i >> expo_x); | |
| | | xx.i += (yy.i && expo_y); | |
| | | return xx.f; | |
| | | } | |
| | | } | |
| | | } else { | |
| | | product = xx.i ^ yy.i; | |
| | | product = product & 0x80000000; | |
| | | if (!(xx.i & 0x7fffffff)) { | |
| | | if (expo_y != 254) { | |
| | | xx.i = (unsigned int)product; | |
| | | return xx.f; | |
| | | } | |
| | | expo_y = yy.i << 1; | |
| | | if (expo_y == 0xFF000000) { | |
| | | xx.i = expo_y | 0x00C00000; | |
| | | } else { | |
| | | xx.i = yy.i | 0x00400000; | |
| | | } | |
| | | return xx.f; | |
| | | } | |
| | | if (!(yy.i & 0x7fffffff)) { | |
| | | if (expo_x != 254) { | |
| | | xx.i = (unsigned int)product; | |
| | | return xx.f; | |
| | | } | |
| | | expo_x = xx.i << 1; | |
| | | if (expo_x == 0xFF000000) { | |
| | | xx.i = expo_x | 0x00C00000; | |
| | | } else { | |
| | | xx.i = xx.i | 0x00400000; | |
| | | } | |
| | | return xx.f; | |
| | | } | |
| | | if ((expo_y != 254) && (expo_x != 254)) { | |
| | | expo_y++; | |
| | | expo_x++; | |
| | | if (expo_x == 0) { | |
| | | expo_y |= xx.i & 0x80000000; | |
| | | /* | |
| | | * If both operands are denormals, we only need to normalize | |
| | | * one of them as the result will be either a denormal or zero. | |
| | | */ | |
| | | xx.i = xx.i << 8; | |
| | | while (!(xx.i & 0x80000000)) { | |
| | | xx.i <<= 1; | |
| | | expo_x--; | |
| | | } | |
| | | xx.i = (xx.i >> 8) | (expo_y & 0x80000000); | |
| | | expo_y &= ~0x80000000; | |
| | | expo_y--; | |
| | | goto multiply; | |
| | | } | |
| | | if (expo_y == 0) { | |
| | | expo_x |= yy.i & 0x80000000; | |
| | | yy.i = yy.i << 8; | |
| | | while (!(yy.i & 0x80000000)) { | |
| | | yy.i <<= 1; | |
| | | expo_y--; | |
| | | } | |
| | | yy.i = (yy.i >> 8) | (expo_x & 0x80000000); | |
| | | expo_x &= ~0x80000000; | |
| | | expo_x--; | |
| | | goto multiply; | |
| | | } | |
| | | } | |
| | | expo_x = xx.i << 1; | |
| | | expo_y = yy.i << 1; | |
| | | /* if x is NaN, return x */ | |
| | | if (expo_x > 0xFF000000) { | |
| | | /* cvt any SNaNs to QNaNs */ | |
| | | xx.i = xx.i | 0x00400000; | |
| | | return xx.f; | |
| | | } | |
| | | /* if y is NaN, return y */ | |
| | | if (expo_y > 0xFF000000) { | |
| | | /* cvt any SNaNs to QNaNs */ | |
| | | xx.i = yy.i | 0x00400000; | |
| | | return xx.f; | |
| | | } | |
| | | xx.i = (unsigned int)product | 0x7f800000; | |
| | | return xx.f; | |
| | | } | |
| | | } | |
| | | | |
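| | | /* A minimal usage sketch (hypothetical helper): the two directed modes | |
| | |    bracket the exact product, which is the basic building block of | |
| | |    interval arithmetic. */ | |
| | | static void __example_fmul_interval(float a, float b, float *lo, float *hi) | |
| | | { | |
| | |   *lo = __internal_fmul_kernel2(a, b, cudaRoundMinInf); /* *lo <= a*b */ | |
| | |   *hi = __internal_fmul_kernel2(a, b, cudaRoundPosInf); /* a*b <= *hi */ | |
| | | } | |
| | | | |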
| | | __device_func__(float __internal_fmaf_kernel (float a, float b, float c, | |
| | | enum cudaRoundMode mode)) | |
| | | { | |
| | | unsigned long long product; | |
| | | unsigned int xx, yy, zz, ww; | |
| | | unsigned int temp, s, u; | |
| | | unsigned int expo_x, expo_y, expo_z; | |
| | | volatile union __cudart_FloatUintCvt cvt; | |
| | | | |
| | | cvt.f = a; | |
| | | xx = cvt.i; | |
| | | cvt.f = b; | |
| | | yy = cvt.i; | |
| | | cvt.f = c; | |
| | | zz = cvt.i; | |
| | | | |
| | | temp = 0xff; | |
| | | expo_x = temp & (xx >> 23); | |
| | | expo_x = expo_x - 1; | |
| | | expo_y = temp & (yy >> 23); | |
| | | expo_y = expo_y - 1; | |
| | | expo_z = temp & (zz >> 23); | |
| | | expo_z = expo_z - 1; | |
| | | | |
| | | if (!((expo_x <= 0xFD) && | |
| | | (expo_y <= 0xFD) && | |
| | | (expo_z <= 0xFD))) { | |
| | | /* fmad (nan, y, z) --> nan | |
| | | fmad (x, nan, z) --> nan | |
| | | fmad (x, y, nan) --> nan | |
| | | */ | |
| | | if ((yy << 1) > 0xff000000) { | |
| | | return b + b; | |
| | | } | |
| | | if ((zz << 1) > 0xff000000) { | |
| | | return c + c; | |
| | | } | |
| | | if ((xx << 1) > 0xff000000) { | |
| | | return a + a; | |
| | | } | |
| | | /* fmad (0, inf, z) --> NaN | |
| | | fmad (inf, 0, z) --> NaN | |
| | | fmad (-inf,+y,+inf) --> NaN | |
| | | fmad (+x,-inf,+inf) --> NaN | |
| | | fmad (+inf,-y,+inf) --> NaN | |
| | | fmad (-x,+inf,+inf) --> NaN | |
| | | fmad (-inf,-y,-inf) --> NaN | |
| | | fmad (-x,-inf,-inf) --> NaN | |
| | | fmad (+inf,+y,-inf) --> NaN | |
| | | fmad (+x,+inf,-inf) --> NaN | |
| | | */ | |
| | | if ((((xx << 1) == 0) && ((yy << 1) == 0xff000000)) || | |
| | | (((yy << 1) == 0) && ((xx << 1) == 0xff000000))) { | |
| | | cvt.i = 0xffc00000; | |
| | | return cvt.f; | |
| | | } | |
| | | if ((zz << 1) == 0xff000000) { | |
| | | if (((yy << 1) == 0xff000000) || ((xx << 1) == 0xff000000)) { | |
| | | if ((int)(xx ^ yy ^ zz) < 0) { | |
| | | cvt.i = 0xffc00000; | |
| | | return cvt.f; | |
| | | } | |
| | | } | |
| | | } | |
| | | /* fmad (inf, y, z) --> inf | |
| | | fmad (x, inf, z) --> inf | |
| | | fmad (x, y, inf) --> inf | |
| | | */ | |
| | | if ((xx << 1) == 0xff000000) { | |
| | | xx = xx ^ (yy & 0x80000000); | |
| | | cvt.i = xx; | |
| | | return cvt.f; | |
| | | } | |
| | | if ((yy << 1) == 0xff000000) { | |
| | | yy = yy ^ (xx & 0x80000000); | |
| | | cvt.i = yy; | |
| | | return cvt.f; | |
| | | } | |
| | | if ((zz << 1) == 0xff000000) { | |
| | | cvt.i = zz; | |
| | | return cvt.f; | |
| | | } | |
| | | /* fmad (+0, -y, -0) --> -0 | |
| | | fmad (-0, +y, -0) --> -0 | |
| | | fmad (+x, -0, -0) --> -0 | |
| | | fmad (-x, +0, -0) --> -0 | |
| | | */ | |
| | | if (zz == 0x80000000) { | |
| | | if (((xx << 1) == 0) || ((yy << 1) == 0)) { | |
| | | if ((int)(xx ^ yy) < 0) { | |
| | | cvt.i = zz; | |
| | | return cvt.f; | |
| | | } | |
| | | } | |
| | | } | |
| | | /* fmad (0, y, 0) --> +0 | |
| | | fmad (x, 0, 0) --> +0 | |
| | | */ | |
| | | if (((zz << 1) == 0) && | |
| | | (((xx << 1) == 0) || ((yy << 1) == 0))) { | |
| | | if (mode == cudaRoundMinInf) { | |
| | | zz = 0x80000000 & (xx ^ yy ^ zz); | |
| | | } else { | |
| | | zz &= 0x7fffffff; | |
| | | } | |
| | | cvt.i = zz; | |
| | | return cvt.f; | |
| | | } | |
| | | /* fmad (0, y, z) --> z | |
| | | fmad (x, 0, z) --> z | |
| | | */ | |
| | | if (((xx << 1) == 0) || ((yy << 1) == 0)) { | |
| | | cvt.i = zz; | |
| | | return cvt.f; | |
| | | } | |
| | | /* normalize x, if denormal */ | |
| | | if (expo_x == (unsigned)-1) { | |
| | | temp = xx & 0x80000000; | |
| | | xx = xx << 8; | |
| | | while (!(xx & 0x80000000)) { | |
| | | xx <<= 1; | |
| | | expo_x--; | |
| | | } | |
| | | expo_x++; | |
| | | xx = (xx >> 8) | temp; | |
| | | } | |
| | | /* normalize y, if denormal */ | |
| | | if (expo_y == (unsigned)-1) { | |
| | | temp = yy & 0x80000000; | |
| | | yy = yy << 8; | |
| | | while (!(yy & 0x80000000)) { | |
| | | yy <<= 1; | |
| | | expo_y--; | |
| | | } | |
| | | expo_y++; | |
| | | yy = (yy >> 8) | temp; | |
| | | } | |
| | | /* normalize z, if denormal */ | |
| | | if ((expo_z == (unsigned)-1) && ((zz << 1) != 0)) { | |
| | | temp = zz & 0x80000000; | |
| | | zz = zz << 8; | |
| | | while (!(zz & 0x80000000)) { | |
| | | zz <<= 1; | |
| | | expo_z--; | |
| | | } | |
| | | expo_z++; | |
| | | zz = (zz >> 8) | temp; | |
| | | } | |
| | | } | |
| | | | |
| | | expo_x = expo_x + expo_y; | |
| | | expo_y = xx ^ yy; | |
| | | xx = xx & 0x00ffffff; | |
| | | yy = yy << 8; | |
| | | xx = xx | 0x00800000; | |
| | | yy = yy | 0x80000000; | |
| | | | |
| | | product = ((unsigned long long)xx) * yy; | |
| | | xx = (unsigned)(product >> 32); | |
| | | yy = (unsigned)(product & 0xffffffff); | |
| | | | |
| | | expo_x = expo_x - 127 + 2; | |
| | | expo_y = expo_y & 0x80000000; | |
| | | /* normalize mantissa */ | |
| | | if (xx < 0x00800000) { | |
| | | xx = (xx << 1) | (yy >> 31); | |
| | | yy = (yy << 1); | |
| | | expo_x--; | |
| | | } | |
| | | temp = 0; | |
| | | | |
| | | if ((zz << 1) != 0) { /* z is not zero */ | |
| | | s = zz & 0x80000000; | |
| | | zz &= 0x00ffffff; | |
| | | zz |= 0x00800000; | |
| | | ww = 0; | |
| | | /* compare and swap. put augend into xx:yy */ | |
| | | if ((int)expo_z > (int)expo_x) { | |
| | | temp = expo_z; | |
| | | expo_z = expo_x; | |
| | | expo_x = temp; | |
| | | temp = zz; | |
| | | zz = xx; | |
| | | xx = temp; | |
| | | temp = ww; | |
| | | ww = yy; | |
| | | yy = temp; | |
| | | temp = expo_y; | |
| | | expo_y = s; | |
| | | s = temp; | |
| | | } | |
| | | /* augend_sign = expo_y, augend_mant = xx:yy, augend_expo = expo_x */ | |
| | | /* addend_sign = s, addend_mant = zz:ww, addend_expo = expo_z */ | |
| | | expo_z = expo_x - expo_z; | |
| | | u = expo_y ^ s; | |
| | | if (expo_z <= 49) { | |
| | | /* denormalize addend */ | |
| | | temp = 0; | |
| | | while (expo_z >= 32) { | |
| | | temp = ww | (temp != 0); | |
| | | ww = zz; | |
| | | zz = 0; | |
| | | expo_z -= 32; | |
| | | } | |
| | | if (expo_z) { | |
| | | temp = ((temp >> expo_z) | (ww << (32 - expo_z)) | | |
| | | ((temp << (32 - expo_z)) != 0)); | |
| | | ww = (ww >> expo_z) | (zz << (32 - expo_z)); | |
| | | zz = (zz >> expo_z); | |
| | | } | |
| | | | |
| | | } else { | |
| | | temp = 1; | |
| | | ww = 0; | |
| | | zz = 0; | |
| | | } | |
| | | if ((int)u < 0) { | |
| | | /* signs differ, effective subtraction */ | |
| | | temp = (unsigned)(-(int)temp); | |
| | | s = (temp != 0); | |
| | | u = yy - s; | |
| | | s = u > yy; | |
| | | yy = u - ww; | |
| | | s += yy > u; | |
| | | xx = (xx - zz) - s; | |
| | | if (!(xx | yy | temp)) { | |
| | | /* complete cancellation, return 0 */ | |
| | | if (mode == cudaRoundMinInf) { | |
| | | xx = 0x80000000; | |
| | | } | |
| | | cvt.i = xx; | |
| | | return cvt.f; | |
| | | } | |
| | | if ((int)xx < 0) { | |
| | | /* oops, augend had smaller mantissa. Negate mantissa and flip | |
| | | sign of result */ | |
| | | temp = ~temp; | |
| | | yy = ~yy; | |
| | | xx = ~xx; | |
| | | if (++temp == 0) { | |
| | | if (++yy == 0) { | |
| | | ++xx; | |
| | | } | |
| | | } | |
| | | expo_y ^= 0x80000000; | |
| | | } | |
| | | /* normalize mantissa, if necessary */ | |
| | | while (!(xx & 0x00800000)) { | |
| | | xx = (xx << 1) | (yy >> 31); | |
| | | yy = (yy << 1); | |
| | | expo_x--; | |
| | | } | |
| | | } else { | |
| | | /* signs are the same, effective addition */ | |
| | | yy = yy + ww; | |
| | | s = yy < ww; | |
| | | xx = xx + zz + s; | |
| | | if (xx & 0x01000000) { | |
| | | temp = temp | (yy << 31); | |
| | | yy = (yy >> 1) | (xx << 31); | |
| | | xx = ((xx & 0x80000000) | (xx >> 1)) & ~0x40000000; | |
| | | expo_x++; | |
| | | } | |
| | | } | |
| | | } | |
| | | temp = yy | (temp != 0); | |
| | | if (expo_x <= 0xFD) { | |
| | | /* normal */ | |
| | | xx |= expo_y; /* or in sign bit */ | |
| | | if (mode == cudaRoundNearest) { | |
| | | s = xx & 1; /* mantissa lsb */ | |
| | | xx += (temp == 0x80000000) ? s : (temp >> 31); | |
| | | } else if (mode == cudaRoundPosInf) { | |
| | | xx += temp && !expo_y; | |
| | | } else if (mode == cudaRoundMinInf) { | |
| | | xx += temp && expo_y; | |
| | | } | |
| | | xx = xx + (expo_x << 23); /* add in exponent */ | |
| | | cvt.i = xx; | |
| | | return cvt.f; | |
| | | } else if ((int)expo_x >= 126) { | |
| | | /* overflow */ | |
| | | if (mode == cudaRoundNearest) { | |
| | | xx = expo_y | 0x7f800000; | |
| | | } else if (mode == cudaRoundZero) { | |
| | | xx = expo_y | 0x7F7FFFFF; | |
| | | } else if (mode == cudaRoundPosInf) { | |
| | | xx = expo_y ? 0xFF7FFFFF : 0x7f800000; | |
| | | } else if (mode == cudaRoundMinInf) { | |
| | | xx = expo_y ? 0xff800000 : 0x7f7fffff; | |
| | | } | |
| | | cvt.i = xx; | |
| | | return cvt.f; | |
| | | } | |
| | | /* subnormal */ | |
| | | expo_x = (unsigned int)(-(int)expo_x); | |
| | | if (expo_x > 25) { | |
| | | /* massive underflow: return 0, or smallest denormal */ | |
| | | xx = 0; | |
| | | if (mode == cudaRoundPosInf) { | |
| | | xx += !expo_y; | |
| | | } else if (mode == cudaRoundMinInf) { | |
| | | xx += !!expo_y; | |
| | | } | |
| | | cvt.i = expo_y | xx; | |
| | | return cvt.f; | |
| | | } | |
| | | temp = (xx << (32 - expo_x)) | ((temp) ? 1 : 0); | |
| | | xx = xx >> expo_x; | |
| | | if (mode == cudaRoundNearest) { | |
| | | xx = xx + ((temp == 0x80000000) ? (xx & 1) : (temp >> 31)); | |
| | | } else if (mode == cudaRoundPosInf) { | |
| | | xx = xx + (!expo_y && temp); | |
| | | } else if (mode == cudaRoundMinInf) { | |
| | | xx = xx + (expo_y && temp); | |
| | | } | |
| | | xx = expo_y + xx; /* add in sign bit */ | |
| | | cvt.i = xx; | |
| | | return cvt.f; | |
| | | } | |
| | | | |
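| | | /* A minimal usage sketch (hypothetical helper): since the product feeds | |
| | |    the add unrounded, a fused multiply-add can recover the exact | |
| | |    rounding error of a multiplication (an error-free transformation), | |
| | |    barring overflow and underflow. */ | |
| | | static float __example_fmul_error(float a, float b) | |
| | | { | |
| | |   float p = __internal_fmul_kernel2(a, b, cudaRoundNearest); | |
| | |   return __internal_fmaf_kernel(a, b, -p, cudaRoundNearest); /* a*b - p */ | |
| | | } | |
| | | | |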
| | | /* NOTE: Does not currently support round-to-nearest, round-to-zero */ | |
| | | __device_func__(float __internal_fadd_kernel2 (float a, float b, | |
| | | enum cudaRoundMode mode)) | |
| | | { | |
| | | volatile union __cudart_FloatUintCvt xx, yy; | |
| | | unsigned int expo_x; | |
| | | unsigned int expo_y; | |
| | | unsigned int temp; | |
| | | | |
| | | xx.f = a; | |
| | | yy.f = b; | |
| | | | |
| | | /* make bigger operand the augend */ | |
| | | expo_y = yy.i << 1; | |
| | | if (expo_y > (xx.i << 1)) { | |
| | | expo_y = xx.i; | |
| | | xx.i = yy.i; | |
| | | yy.i = expo_y; | |
| | | } | |
| | | | |
| | | temp = 0xff; | |
| | | expo_x = temp & (xx.i >> 23); | |
| | | expo_x = expo_x - 1; | |
| | | expo_y = temp & (yy.i >> 23); | |
| | | expo_y = expo_y - 1; | |
| | | | |
| | | if ((expo_x <= 0xFD) && | |
| | | (expo_y <= 0xFD)) { | |
| | | add: | |
| | | expo_y = expo_x - expo_y; | |
| | | if (expo_y > 25) { | |
| | | expo_y = 31; | |
| | | } | |
| | | temp = xx.i ^ yy.i; | |
| | | xx.i = xx.i & ~0x7f000000; | |
| | | xx.i = xx.i | 0x00800000; | |
| | | yy.i = yy.i & ~0xff000000; | |
| | | yy.i = yy.i | 0x00800000; | |
| | | | |
| | | if ((int)temp < 0) { | |
| | | /* signs differ, effective subtraction */ | |
| | | temp = 32 - expo_y; | |
| | | temp = (expo_y) ? (yy.i << temp) : 0; | |
| | | temp = (unsigned)(-((int)temp)); | |
| | | xx.i = xx.i - (yy.i >> expo_y) - (temp ? 1 : 0); | |
| | | if (xx.i & 0x00800000) { | |
| | | if (expo_x <= 0xFD) { | |
| | | xx.i = xx.i + (expo_x << 23); | |
| | | if (mode == cudaRoundMinInf) { | |
| | | xx.i += (temp && (xx.i & 0x80000000)); | |
| | | } else if (mode == cudaRoundPosInf) { | |
| | | xx.i += (temp && !(xx.i & 0x80000000)); | |
| | | } | |
| | | return xx.f; | |
| | | } | |
| | | } else { | |
| | | if ((temp | (xx.i << 1)) == 0) { | |
| | | /* operands cancelled, resulting in a clean zero */ | |
| | | if (mode == cudaRoundMinInf) { | |
| | | xx.i = 0x80000000; | |
| | | } else if (mode == cudaRoundPosInf) { | |
| | | xx.i = 0; | |
| | | } | |
| | | return xx.f; | |
| | | } | |
| | | /* normalize result */ | |
| | | yy.i = xx.i & 0x80000000; | |
| | | do { | |
| | | xx.i = (xx.i << 1) | (temp >> 31); | |
| | | temp <<= 1; | |
| | | expo_x--; | |
| | | } while (!(xx.i & 0x00800000)); | |
| | | xx.i = xx.i | yy.i; | |
| | | } | |
| | | } else { | |
| | | /* signs are the same, effective addition */ | |
| | | temp = 32 - expo_y; | |
| | | temp = (expo_y) ? (yy.i << temp) : 0; | |
| | | xx.i = xx.i + (yy.i >> expo_y); | |
| | | if (!(xx.i & 0x01000000)) { | |
| | | if (expo_x <= 0xFD) { | |
| | | xx.i = xx.i + (expo_x << 23); | |
| | | if (mode == cudaRoundMinInf) { | |
| | | xx.i += (temp && (xx.i & 0x80000000)); | |
| | | } else if (mode == cudaRoundPosInf) { | |
| | | xx.i += (temp && !(xx.i & 0x80000000)); | |
| | | } | |
| | | return xx.f; | |
| | | } | |
| | | } else { | |
| | | /* normalize result */ | |
| | | temp = (xx.i << 31) | (temp >> 1); | |
| | | xx.i = ((xx.i & 0x80000000) | (xx.i >> 1)) & ~0x40000000; | |
| | | expo_x++; | |
| | | } | |
| | | } | |
| | | if (expo_x <= 0xFD) { | |
| | | if (mode == cudaRoundMinInf) { | |
| | | xx.i += (temp && (xx.i & 0x80000000)); | |
| | | } else if (mode == cudaRoundPosInf) { | |
| | | xx.i += (temp && !(xx.i & 0x80000000)); | |
| | | } | |
| | | xx.i = xx.i + (expo_x << 23); | |
| | | return xx.f; | |
| | | } | |
| | | if ((int)expo_x >= 254) { | |
| | | /* overflow: return infinity or largest normal */ | |
| | | temp = xx.i & 0x80000000; | |
| | | if (mode == cudaRoundMinInf) { | |
| | | xx.i = (temp ? 0xFF800000 : 0x7f7fffff); | |
| | | } else if (mode == cudaRoundPosInf) { | |
| | | xx.i = (temp ? 0xff7fffff : 0x7F800000); | |
| | | } | |
| | | return xx.f; | |
| | | } | |
| | | /* underflow: denormal, or smallest normal */ | |
| | | expo_y = expo_x + 32; | |
| | | yy.i = xx.i & 0x80000000; | |
| | | xx.i = xx.i & ~0xff000000; | |
| | | expo_x = (unsigned)(-((int)expo_x)); | |
| | | temp = xx.i << expo_y | ((temp) ? 1 : 0); | |
| | | xx.i = yy.i | (xx.i >> expo_x); | |
| | | if (mode == cudaRoundMinInf) { | |
| | | xx.i += (temp && yy.i); | |
| | | } else if (mode == cudaRoundPosInf) { | |
| | | xx.i += (temp && !yy.i); | |
| | | } | |
| | | return xx.f; | |
| | | } else { | |
| | | /* handle special cases separately */ | |
| | | if (!(yy.i << 1)) { | |
| | | if (mode == cudaRoundMinInf) { | |
| | | if (!(xx.i << 1)) { | |
| | | xx.i = xx.i | yy.i; | |
| | | } | |
| | | } else if (mode == cudaRoundPosInf) { | |
| | | if (xx.i == 0x80000000) { | |
| | | xx.i = yy.i; | |
| | | } | |
| | | } | |
| | | if ((xx.i << 1) > 0xff000000) { | |
| | | xx.i |= 0x00400000; | |
| | | } | |
| | | return xx.f; | |
| | | } | |
| | | if ((expo_y != 254) && (expo_x != 254)) { | |
| | | /* remove sign bits */ | |
| | | if (expo_x == (unsigned int) -1) { | |
| | | temp = xx.i & 0x80000000; | |
| | | xx.i = xx.i << 8; | |
| | | while (!(xx.i & 0x80000000)) { | |
| | | xx.i <<= 1; | |
| | | expo_x--; | |
| | | } | |
| | | expo_x++; | |
| | | xx.i = (xx.i >> 8) | temp; | |
| | | } | |
| | | if (expo_y == (unsigned int) -1) { | |
| | | temp = yy.i & 0x80000000; | |
| | | yy.i = yy.i << 8; | |
| | | while (!(yy.i & 0x80000000)) { | |
| | | yy.i <<= 1; | |
| | | expo_y--; | |
| | | } | |
| | | expo_y++; | |
| | | yy.i = (yy.i >> 8) | temp; | |
| | | } | |
| | | goto add; | |
| | | } | |
| | | expo_x = xx.i << 1; | |
| | | expo_y = yy.i << 1; | |
| | | /* if x is NaN, return x */ | |
| | | if (expo_x > 0xff000000) { | |
| | | /* cvt any SNaNs to QNaNs */ | |
| | | xx.i = xx.i | 0x00400000; | |
| | | return xx.f; | |
| | | } | |
| | | /* if y is NaN, return y */ | |
| | | if (expo_y > 0xff000000) { | |
| | | /* cvt any SNaNs to QNaNs */ | |
| | | xx.i = yy.i | 0x00400000; | |
| | | return xx.f; | |
| | | } | |
| | | if ((expo_x == 0xff000000) && (expo_y == 0xff000000)) { | |
| | | /* | |
| | | * subtraction of infinities with the same sign, and addition of | |
| | | * infinities of unlike sign is undefined: return NaN INDEFINITE | |
| | | */ | |
| | | expo_x = xx.i ^ yy.i; | |
| | | xx.i = xx.i | ((expo_x) ? 0xffc00000 : 0); | |
| | | return xx.f; | |
| | | } | |
| | | /* handle infinities */ | |
| | | if (expo_y == 0xff000000) { | |
| | | xx.i = yy.i; | |
| | | } | |
| | | return xx.f; | |
| | | } | |
| | | } | |
| | | | |
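| | | /* A minimal usage sketch (hypothetical helper): a term far below one | |
| | |    ulp of the larger operand is absorbed when rounding down, but still | |
| | |    bumps the result by one ulp when rounding up. */ | |
| | | static void __example_fadd_modes(void) | |
| | | { | |
| | |   float rd = __internal_fadd_kernel2(1.0f, 1.0e-10f, cudaRoundMinInf); /* 1.0f */ | |
| | |   float ru = __internal_fadd_kernel2(1.0f, 1.0e-10f, cudaRoundPosInf); /* 0x3f800001 */ | |
| | |   (void)rd; (void)ru; | |
| | | } | |
| | | | |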
| | | __device_func__(float __frcp_rn (float a)) | |
| | | { | |
| | | return __internal_frcp_kernel (a, cudaRoundNearest); | |
| | | } | |
| | | | |
| | | __device_func__(float __frcp_rz (float a)) | |
| | | { | |
| | | return __internal_frcp_kernel (a, cudaRoundZero); | |
| | | } | |
| | | | |
| | | __device_func__(float __frcp_rd (float a)) | |
| | | { | |
| | | return __internal_frcp_kernel (a, cudaRoundMinInf); | |
| | | } | |
| | | | |
| | | __device_func__(float __frcp_ru (float a)) | |
| | | { | |
| | | return __internal_frcp_kernel (a, cudaRoundPosInf); | |
| | | } | |
| | | | |
| | | __device_func__(float __fsqrt_rn (float a)) | |
| | | { | |
| | | return __internal_fsqrt_kernel (a, cudaRoundNearest); | |
| | | } | |
| | | | |
| | | __device_func__(float __fsqrt_rz (float a)) | |
| | | { | |
| | | return __internal_fsqrt_kernel (a, cudaRoundZero); | |
| | | } | |
| | | | |
| | | __device_func__(float __fsqrt_rd (float a)) | |
| | | { | |
| | | return __internal_fsqrt_kernel (a, cudaRoundMinInf); | |
| | | } | |
| | | | |
| | | __device_func__(float __fsqrt_ru (float a)) | |
| | | { | |
| | | return __internal_fsqrt_kernel (a, cudaRoundPosInf); | |
| | | } | |
| | | | |
| | | __device_func__(float __fdiv_rn (float a, float b)) | |
| | | { | |
| | | return __internal_fdiv_kernel (a, b, cudaRoundNearest); | |
| | | } | |
| | | | |
| | | __device_func__(float __fdiv_rz (float a, float b)) | |
| | | { | |
| | | return __internal_fdiv_kernel (a, b, cudaRoundZero); | |
| | | } | |
| | | | |
| | | __device_func__(float __fdiv_rd (float a, float b)) | |
| | | { | |
| | | return __internal_fdiv_kernel (a, b, cudaRoundMinInf); | |
| | | } | |
| | | | |
| | | __device_func__(float __fdiv_ru (float a, float b)) | |
| | | { | |
| | | return __internal_fdiv_kernel (a, b, cudaRoundPosInf); | |
| | | } | |
| | | | |
| | | __device_func__(float __fadd_rd (float a, float b)) | |
| | | { | |
| | | return __internal_fadd_kernel2 (a, b, cudaRoundMinInf); | |
| | | } | |
| | | | |
| | | __device_func__(float __fadd_ru (float a, float b)) | |
| | | { | |
| | | return __internal_fadd_kernel2 (a, b, cudaRoundPosInf); | |
| | | } | |
| | | | |
| | | __device_func__(float __fmul_rd (float a, float b)) | |
| | | { | |
| | | return __internal_fmul_kernel2 (a, b, cudaRoundMinInf); | |
| | | } | |
| | | | |
| | | __device_func__(float __fmul_ru (float a, float b)) | |
| | | { | |
| | | return __internal_fmul_kernel2 (a, b, cudaRoundPosInf); | |
| | | } | |
| | | | |
| | | __device_func__(float __fmaf_rn (float a, float b, float c)) | |
| | | { | |
| | | return __internal_fmaf_kernel (a, b, c, cudaRoundNearest); | |
| | | } | |
| | | | |
| | | __device_func__(float __fmaf_rz (float a, float b, float c)) | |
| | | { | |
| | | return __internal_fmaf_kernel (a, b, c, cudaRoundZero); | |
| | | } | |
| | | | |
| | | __device_func__(float __fmaf_ru (float a, float b, float c)) | |
| | | { | |
| | | return __internal_fmaf_kernel (a, b, c, cudaRoundPosInf); | |
| | | } | |
| | | | |
| | | __device_func__(float __fmaf_rd (float a, float b, float c)) | |
| | | { | |
| | | return __internal_fmaf_kernel (a, b, c, cudaRoundMinInf); | |
| | | } | |
| | | | |
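| | | /* A minimal usage sketch (hypothetical helper): the _rd/_ru intrinsics | |
| | |    above are the natural primitives for interval arithmetic; summing | |
| | |    with outward rounding keeps the true sum enclosed in [*lo, *hi]. */ | |
| | | static void __example_interval_sum(const float *v, int n, float *lo, float *hi) | |
| | | { | |
| | |   int i; | |
| | |   *lo = 0.0f; | |
| | |   *hi = 0.0f; | |
| | |   for (i = 0; i < n; i++) { | |
| | |     *lo = __fadd_rd(*lo, v[i]); | |
| | |     *hi = __fadd_ru(*hi, v[i]); | |
| | |   } | |
| | | } | |
| | | | |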
| __device_func__(int __cuda___isnan(double a)); | | __device_func__(int __cuda___isnan(double a)); | |
| __device_func__(int __cuda___isnanf(float a)); | | __device_func__(int __cuda___isnanf(float a)); | |
| __device_func__(int __double2int_rz(double)); | | __device_func__(int __double2int_rz(double)); | |
| __device_func__(unsigned int __double2uint_rz(double)); | | __device_func__(unsigned int __double2uint_rz(double)); | |
| __device_func__(long long int __double2ll_rz(double)); | | __device_func__(long long int __double2ll_rz(double)); | |
| __device_func__(unsigned long long int __double2ull_rz(double)); | | __device_func__(unsigned long long int __double2ull_rz(double)); | |
| | | | |
| #define __internal_clamp(val, max, min, nan) \ | | #define __internal_clamp(val, max, min, nan) \ | |
| if (sizeof(val) == sizeof(double) && __cuda___isnan((double)val)) return nan; \ | | if (sizeof(val) == sizeof(double) && __cuda___isnan((double)val)) return nan; \ | |
| | | | |
| skipping to change at line 409 | | skipping to change at line 3777 | |
| { | | { | |
| long long int res; | | long long int res; | |
| res = __umul64hi(a, b); | | res = __umul64hi(a, b); | |
| if (a < 0LL) res = res - b; | | if (a < 0LL) res = res - b; | |
| if (b < 0LL) res = res - a; | | if (b < 0LL) res = res - a; | |
| return res; | | return res; | |
| } | | } | |
| | | | |
| __device_func__(float __saturatef(float a)) | | __device_func__(float __saturatef(float a)) | |
| { | | { | |
| | | if (__cuda___isnanf(a)) return 0.0f; // update of PTX spec 10/15/2008 | |
| return a >= 1.0f ? 1.0f : a <= 0.0f ? 0.0f : a; | | return a >= 1.0f ? 1.0f : a <= 0.0f ? 0.0f : a; | |
| } | | } | |
| | | | |
| __device_func__(unsigned int __sad(int a, int b, unsigned int c)) | | __device_func__(unsigned int __sad(int a, int b, unsigned int c)) | |
| { | | { | |
| long long int diff = (long long int)a - (long long int)b; | | long long int diff = (long long int)a - (long long int)b; | |
| | | | |
| return (unsigned int)(__cuda_llabs(diff) + (long long int)c); | | return (unsigned int)(__cuda_llabs(diff) + (long long int)c); | |
| } | | } | |
| | | | |
| | | | |
| skipping to change at line 450 | | skipping to change at line 3819 | |
| #if !defined(__MULTI_CORE__) | | #if !defined(__MULTI_CORE__) | |
| a &= 0xffffff; | | a &= 0xffffff; | |
| b &= 0xffffff; | | b &= 0xffffff; | |
| #endif /* !__MULTI_CORE__ */ | | #endif /* !__MULTI_CORE__ */ | |
| | | | |
| return a * b; | | return a * b; | |
| } | | } | |
| | | | |
| __device_func__(float __int_as_float(int a)) | | __device_func__(float __int_as_float(int a)) | |
| { | | { | |
| volatile union {int a; float b;} u; | | volatile union __cudart_FloatIntCvt u; | |
| | | | |
| u.a = a; | | | |
| | | | |
| return u.b; | | u.i = a; | |
| | | return u.f; | |
| } | | } | |
| | | | |
| __device_func__(int __float_as_int(float a)) | | __device_func__(int __float_as_int(float a)) | |
| { | | { | |
| volatile union {float a; int b;} u; | | volatile union __cudart_FloatIntCvt u; | |
| | | | |
| u.a = a; | | | |
| | | | |
| return u.b; | | u.f = a; | |
| | | return u.i; | |
| } | | } | |
| | | | |
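| | | /* Note: type punning in the two helpers above goes through a volatile | |
| | |    union rather than a pointer cast; on an optimizing host compiler | |
| | |    this sidesteps strict-aliasing assumptions so the emulation path | |
| | |    reads back exactly the bits just stored. */ | |
| | | | |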
| __device_func__(long long int __internal_float2ll_kernel(float a, long long int max, long long int min, long long int nan, enum cudaRoundMode rndMode)) | | __device_func__(long long int __internal_float2ll_kernel(float a, long long int max, long long int min, long long int nan, enum cudaRoundMode rndMode)) | |
| { | | { | |
| unsigned long long int res, t = 0ULL; | | unsigned long long int res, t = 0ULL; | |
| int shift; | | int shift; | |
| unsigned int ia; | | unsigned int ia; | |
| | | | |
| __internal_clamp(a, max, min, nan); | | __internal_clamp(a, max, min, nan); | |
| ia = __float_as_int(a); | | ia = __float_as_int(a); | |
| | | | |
| skipping to change at line 681 | | skipping to change at line 4048 | |
| unsigned long long int t = (unsigned long long int)*a; | | unsigned long long int t = (unsigned long long int)*a; | |
| int lz = __internal_normalize64(&t); | | int lz = __internal_normalize64(&t); | |
| | | | |
| *a = (unsigned int)(t >> 32); | | *a = (unsigned int)(t >> 32); | |
| | | | |
| return lz - 32; | | return lz - 32; | |
| } | | } | |
| | | | |
| __device_func__(float __internal_int2float_kernel(int a, enum cudaRoundMode rndMode)) | | __device_func__(float __internal_int2float_kernel(int a, enum cudaRoundMode rndMode)) | |
| { | | { | |
| volatile union { | | volatile union __cudart_FloatUintCvt res; | |
| float f; | | | |
| unsigned int i; | | | |
| } res; | | | |
| int shift; | | int shift; | |
| unsigned int t; | | unsigned int t; | |
| res.i = a; | | res.i = a; | |
| if (a == 0) return res.f; | | if (a == 0) return res.f; | |
| if (a < 0) res.i = (unsigned int)-a; | | if (a < 0) res.i = (unsigned int)-a; | |
| shift = __internal_normalize((unsigned int*)&res.i); | | shift = __internal_normalize((unsigned int*)&res.i); | |
| t = res.i << 24; | | t = res.i << 24; | |
| res.i = (res.i >> 8); | | res.i = (res.i >> 8); | |
| res.i += (127 + 30 - shift) << 23; | | res.i += (127 + 30 - shift) << 23; | |
| if (a < 0) res.i |= 0x80000000; | | if (a < 0) res.i |= 0x80000000; | |
| | | | |
| skipping to change at line 733 | | skipping to change at line 4097 | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| return (float)a; | | return (float)a; | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
| return __internal_int2float_kernel(a, cudaRoundNearest); | | return __internal_int2float_kernel(a, cudaRoundNearest); | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
| __device_func__(float __internal_uint2float_kernel(unsigned int a, enum cudaRoundMode rndMode)) | | __device_func__(float __internal_uint2float_kernel(unsigned int a, enum cudaRoundMode rndMode)) | |
| { | | { | |
| volatile union { | | volatile union __cudart_FloatUintCvt res; | |
| float f; | | | |
| unsigned int i; | | | |
| } res; | | | |
| int shift; | | int shift; | |
| unsigned int t; | | unsigned int t; | |
| res.i = a; | | res.i = a; | |
| if (a == 0) return res.f; | | if (a == 0) return res.f; | |
| shift = __internal_normalize((unsigned int*)&res.i); | | shift = __internal_normalize((unsigned int*)&res.i); | |
| t = res.i << 24; | | t = res.i << 24; | |
| res.i = (res.i >> 8); | | res.i = (res.i >> 8); | |
| res.i += (127 + 30 - shift) << 23; | | res.i += (127 + 30 - shift) << 23; | |
| if ((rndMode == cudaRoundNearest) && (t >= 0x80000000)) { | | if ((rndMode == cudaRoundNearest) && (t >= 0x80000000)) { | |
| res.i += (t == 0x80000000) ? (res.i & 1) : (t >> 31); | | res.i += (t == 0x80000000) ? (res.i & 1) : (t >> 31); | |
| | | | |
| skipping to change at line 806 | | skipping to change at line 4167 | |
| t = (unsigned int)temp; | | t = (unsigned int)temp; | |
| res += (127 + 62 - shift) << 23; /* add in exponent */ | | res += (127 + 62 - shift) << 23; /* add in exponent */ | |
| res += t == 0x80000000 ? res & 1 : t >> 31; | | res += t == 0x80000000 ? res & 1 : t >> 31; | |
| return __int_as_float(res); | | return __int_as_float(res); | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
| __device_func__(float __internal_fmul_kernel(float a, float b, int rndNearest)) | | __device_func__(float __internal_fmul_kernel(float a, float b, int rndNearest)) | |
| { | | { | |
| unsigned long long product; | | unsigned long long product; | |
|
| volatile union { | | volatile union __cudart_FloatUintCvt xx, yy; | |
| float f; | | | |
| unsigned int i; | | | |
| } xx, yy; | | | |
| unsigned expo_x, expo_y; | | unsigned expo_x, expo_y; | |
| | | | |
| xx.f = a; | | xx.f = a; | |
| yy.f = b; | | yy.f = b; | |
| | | | |
| expo_y = 0xFF; | | expo_y = 0xFF; | |
| expo_x = expo_y & (xx.i >> 23); | | expo_x = expo_y & (xx.i >> 23); | |
| expo_x = expo_x - 1; | | expo_x = expo_x - 1; | |
| expo_y = expo_y & (yy.i >> 23); | | expo_y = expo_y & (yy.i >> 23); | |
| expo_y = expo_y - 1; | | expo_y = expo_y - 1; | |
| | | | |
| skipping to change at line 951 | | skipping to change at line 4309 | |
| xx.i = yy.i | 0x00400000; | | xx.i = yy.i | 0x00400000; | |
| return xx.f; | | return xx.f; | |
| } | | } | |
| xx.i = (unsigned int)product | 0x7f800000; | | xx.i = (unsigned int)product | 0x7f800000; | |
| return xx.f; | | return xx.f; | |
| } | | } | |
| } | | } | |
| | | | |
| __device_func__(float __internal_fadd_kernel(float a, float b, int rndNearest)) | | __device_func__(float __internal_fadd_kernel(float a, float b, int rndNearest)) | |
| { | | { | |
|
| volatile union { | | volatile union __cudart_FloatUintCvt xx, yy; | |
| float f; | | | |
| unsigned int i; | | | |
| } xx, yy; | | | |
| unsigned int expo_x; | | unsigned int expo_x; | |
| unsigned int expo_y; | | unsigned int expo_y; | |
| unsigned int temp; | | unsigned int temp; | |
| | | | |
| xx.f = a; | | xx.f = a; | |
| yy.f = b; | | yy.f = b; | |
| | | | |
| /* make bigger operand the augend */ | | /* make bigger operand the augend */ | |
| expo_y = yy.i << 1; | | expo_y = yy.i << 1; | |
| if (expo_y > (xx.i << 1)) { | | if (expo_y > (xx.i << 1)) { | |
| | | | |
| skipping to change at line 1069 | | skipping to change at line 4424 | |
| expo_x = (unsigned int)(-((int)expo_x)); | | expo_x = (unsigned int)(-((int)expo_x)); | |
| temp = xx.i << expo_y | ((temp) ? 1 : 0); | | temp = xx.i << expo_y | ((temp) ? 1 : 0); | |
| xx.i = yy.i | (xx.i >> expo_x); | | xx.i = yy.i | (xx.i >> expo_x); | |
| xx.i += (((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31)) | | xx.i += (((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31)) | |
| && rndNearest); | | && rndNearest); | |
| return xx.f; | | return xx.f; | |
| } else { | | } else { | |
| /* handle special cases separately */ | | /* handle special cases separately */ | |
| if (!(yy.i << 1)) { | | if (!(yy.i << 1)) { | |
| if (xx.i == 0x80000000) { | | if (xx.i == 0x80000000) { | |
|
| xx.i = yy.i; | | xx.i = yy.i; | |
| } | | } | |
| if ((xx.i << 1) > 0xff000000) { | | if ((xx.i << 1) > 0xff000000) { | |
|
| xx.i |= 0x00400000; | | xx.i |= 0x00400000; | |
| } | | } | |
| return xx.f; | | return xx.f; | |
| } | | } | |
| if ((expo_y != 254) && (expo_x != 254)) { | | if ((expo_y != 254) && (expo_x != 254)) { | |
| /* remove sign bits */ | | /* remove sign bits */ | |
| if (expo_x == (unsigned int) -1) { | | if (expo_x == (unsigned int) -1) { | |
| temp = xx.i & 0x80000000; | | temp = xx.i & 0x80000000; | |
| xx.i = xx.i << 8; | | xx.i = xx.i << 8; | |
| while (!(xx.i & 0x80000000)) { | | while (!(xx.i & 0x80000000)) { | |
| xx.i <<= 1; | | xx.i <<= 1; | |
| | | | |
| skipping to change at line 1182 | | skipping to change at line 4537 | |
| | | | |
| #elif defined(_WIN32) | | #elif defined(_WIN32) | |
| | | | |
| #define __syncthreads() \ | | #define __syncthreads() \ | |
| (void)__cudaSynchronizeThreads((void**)0, (void*)0) | | (void)__cudaSynchronizeThreads((void**)0, (void*)0) | |
| | | | |
| #endif /* __GNUC__ */ | | #endif /* __GNUC__ */ | |
| | | | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| | | | |
|
| | | __device_func__(void __prof_trigger(int a)) | |
| | | { | |
| | | } | |
| | | | |
| | | __device_func__(void __threadfence(void)) | |
| | | { | |
| | | } | |
| | | | |
| | | __device_func__(void __threadfence_block(void)) | |
| | | { | |
| | | } | |
| | | | |
| #if defined(__GNUC__) | | #if defined(__GNUC__) | |
| | | | |
| __device_func__(void __trap(void)) | | __device_func__(void __trap(void)) | |
| { | | { | |
| __builtin_trap(); | | __builtin_trap(); | |
| } | | } | |
| | | | |
| #elif defined(_WIN32) | | #elif defined(_WIN32) | |
| | | | |
| __device_func__(void __trap(void)) | | __device_func__(void __trap(void)) | |
| { | | { | |
| __debugbreak(); | | __debugbreak(); | |
| } | | } | |
| | | | |
| #endif /* __GNUC__ */ | | #endif /* __GNUC__ */ | |
| | | | |
|
| #endif /* !__CUDABE__ */ | | #endif /* __CUDABE__ */ | |
| | | | |
| /******************************************************************************* | | /******************************************************************************* | |
| *                                                                             * | | *                                                                             * | |
| * DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS      * | | * DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS      * | |
| *                                                                             * | | *                                                                             * | |
| *******************************************************************************/ | | *******************************************************************************/ | |
| | | | |
| __device_func__(float __fdividef(float a, float b)) | | __device_func__(float __fdividef(float a, float b)) | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| return a / b; | | return a / b; | |
| #elif defined(__CUDABE__) | | #elif defined(__CUDABE__) | |
| return a / b; | | return a / b; | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
| /* match range restrictions of the device function */ | | /* match range restrictions of the device function */ | |
| if (__cuda_fabsf(b) > CUDART_TWO_TO_126_F) { | | if (__cuda_fabsf(b) > CUDART_TWO_TO_126_F) { | |
| if (__cuda_fabsf(a) <= CUDART_NORM_HUGE_F) { | | if (__cuda_fabsf(a) <= CUDART_NORM_HUGE_F) { | |
| return ((a / b) / CUDART_NORM_HUGE_F) / CUDART_NORM_HUGE_F; | | return ((a / b) / CUDART_NORM_HUGE_F) / CUDART_NORM_HUGE_F; | |
| } else { | | } else { | |
|
| return CUDART_NAN_F; | | return __int_as_float(0xffc00000); | |
| } | | } | |
| } else { | | } else { | |
| return a / b; | | return a / b; | |
| } | | } | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
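For context, a hedged host-side illustration of the range restriction emulated above: on the device, __fdividef(a, b) is computed as a * (1.0f / b), so for |b| > 2^126 the reciprocal underflows and finite quotients flush to zero (the NaN branch covers infinite a). A sketch, assuming FLT_MAX stands in for CUDART_NORM_HUGE_F:

    #include <float.h>
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        float a = 1.0f;
        float b = ldexpf(1.0f, 127);             /* |b| > 2^126 */
        float q = ((a / b) / FLT_MAX) / FLT_MAX; /* underflows to 0.0f */
        printf("%g\n", q);                       /* matches fast device divide */
        return 0;
    }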
| __device_func__(float __sinf(float a)) | | __device_func__(float __sinf(float a)) | |
| { | | { | |
| return sinf(a); | | return sinf(a); | |
| | | | |
| skipping to change at line 1261 | | skipping to change at line 4628 | |
| b *= .25f; | | b *= .25f; | |
| } | | } | |
| return __fdividef(a, b); | | return __fdividef(a, b); | |
| } | | } | |
| | | | |
| __device_func__(float __tanf(float a)) | | __device_func__(float __tanf(float a)) | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| return tanf(a); | | return tanf(a); | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
|
| return __sinf(a) / __cosf(a); | | return __fdividef (__sinf(a), __cosf(a)); | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
| __device_func__(void __sincosf(float a, float *sptr, float *cptr)) | | __device_func__(void __sincosf(float a, float *sptr, float *cptr)) | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| sincosf(a, sptr, cptr); | | sincosf(a, sptr, cptr); | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
| *sptr = __sinf(a); | | *sptr = __sinf(a); | |
| *cptr = __cosf(a); | | *cptr = __cosf(a); | |
| | | | |
| skipping to change at line 1336 | | skipping to change at line 4703 | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
| return __internal_accurate_fdividef(a, b); | | return __internal_accurate_fdividef(a, b); | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
| __device_func__(int __clz(int a)) | | __device_func__(int __clz(int a)) | |
| { | | { | |
| return (a)?(158-(__float_as_int(__uint2float_rz((unsigned int)a))>>23)):32; | | return (a)?(158-(__float_as_int(__uint2float_rz((unsigned int)a))>>23)):32; | |
| } | | } | |
| | | | |
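How the __clz one-liner above works: converting a to float with round-toward-zero places 127 + floor(log2(a)) in the exponent field, so 158 minus the biased exponent is 31 - msb(a), i.e. the leading-zero count. A host sketch (exact-conversion inputs only, since a plain C cast rounds to nearest):

    #include <stdio.h>
    #include <string.h>

    static int clz32(unsigned int a)
    {
        float f = (float)a;
        unsigned int bits;
        if (a == 0) return 32;
        memcpy(&bits, &f, sizeof(bits));
        return 158 - (int)(bits >> 23);  /* 158 = 127 bias + 31 */
    }

    int main(void)
    {
        /* prints 31 8 0 */
        printf("%d %d %d\n", clz32(1u), clz32(0x00800000u), clz32(0x80000000u));
        return 0;
    }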
|
| __device_func__(int __ffs(int a)) | | | |
| { | | | |
| return 32 - __clz (a & -a); | | | |
| } | | | |
| | | | |
| __device_func__(int __popc(unsigned int a)) | | | |
| { | | | |
| a = a - ((a >> 1) & 0x55555555); | | | |
| a = (a & 0x33333333) + ((a >> 2) & 0x33333333); | | | |
| a = (a + (a >> 4)) & 0x0f0f0f0f; | | | |
| a = ((__umul24(a, 0x808080) << 1) + a) >> 24; | | | |
| return a; | | | |
| } | | | |
| | | | |
| __device_func__(int __clzll(long long int a)) | | __device_func__(int __clzll(long long int a)) | |
| { | | { | |
| int ahi = ((int)((unsigned long long)a >> 32)); | | int ahi = ((int)((unsigned long long)a >> 32)); | |
| int alo = ((int)((unsigned long long)a & 0xffffffffULL)); | | int alo = ((int)((unsigned long long)a & 0xffffffffULL)); | |
| int res; | | int res; | |
| if (ahi) { | | if (ahi) { | |
|
| res = 0; | | res = 0; | |
| } else { | | } else { | |
|
| res = 32; | | res = 32; | |
| ahi = alo; | | ahi = alo; | |
| } | | } | |
| res = res + __clz(ahi); | | res = res + __clz(ahi); | |
| return res; | | return res; | |
| } | | } | |
| | | | |
|
| __device_func__(int __ffsll(long long int a)) | | __device_func__(int __popc(unsigned int a)) | |
| { | | { | |
|
| return 64 - __clzll (a & -a); | | a = a - ((a >> 1) & 0x55555555); | |
| | | a = (a & 0x33333333) + ((a >> 2) & 0x33333333); | |
| | | a = (a + (a >> 4)) & 0x0f0f0f0f; | |
| | | a = ((__umul24(a, 0x808080) << 1) + a) >> 24; | |
| | | return a; | |
| } | | } | |
| | | | |
| __device_func__(int __popcll(unsigned long long int a)) | | __device_func__(int __popcll(unsigned long long int a)) | |
| { | | { | |
| unsigned int ahi = ((unsigned int)(a >> 32)); | | unsigned int ahi = ((unsigned int)(a >> 32)); | |
| unsigned int alo = ((unsigned int)(a & 0xffffffffULL)); | | unsigned int alo = ((unsigned int)(a & 0xffffffffULL)); | |
| alo = alo - ((alo >> 1) & 0x55555555); | | alo = alo - ((alo >> 1) & 0x55555555); | |
| alo = (alo & 0x33333333) + ((alo >> 2) & 0x33333333); | | alo = (alo & 0x33333333) + ((alo >> 2) & 0x33333333); | |
| ahi = ahi - ((ahi >> 1) & 0x55555555); | | ahi = ahi - ((ahi >> 1) & 0x55555555); | |
| ahi = (ahi & 0x33333333) + ((ahi >> 2) & 0x33333333); | | ahi = (ahi & 0x33333333) + ((ahi >> 2) & 0x33333333); | |
| alo = alo + ahi; | | alo = alo + ahi; | |
| alo = (alo & 0x0f0f0f0f) + ((alo >> 4) & 0x0f0f0f0f); | | alo = (alo & 0x0f0f0f0f) + ((alo >> 4) & 0x0f0f0f0f); | |
| alo = ((__umul24(alo, 0x808080) << 1) + alo) >> 24; | | alo = ((__umul24(alo, 0x808080) << 1) + alo) >> 24; | |
| return alo; | | return alo; | |
| } | | } | |
| | | | |
|
| | | __device_func__(unsigned int __brev(unsigned int a)) | |
| | | { | |
| | | a = ((a >> 1) & 0x55555555) + ((a & 0x55555555) << 1); | |
| | | a = ((a >> 2) & 0x33333333) + ((a & 0x33333333) << 2); | |
| | | a = ((a >> 4) & 0x0F0F0F0F) + ((a & 0x0F0F0F0F) << 4); | |
| | | a = ((a >> 8) & 0x00FF00FF) + ((a & 0x00FF00FF) << 8); | |
| | | a = ( a >> 16 ) + ( a << 16); | |
| | | return a; | |
| | | } | |
| | | | |
| | | __device_func__(unsigned long long int __brevll(unsigned long long int a)) | |
| | | { | |
| | | unsigned int hi = (unsigned int)(a >> 32); | |
| | | unsigned int lo = (unsigned int)(a & 0xffffffffULL); | |
| | | unsigned int t; | |
| | | t = __brev(lo); | |
| | | lo = __brev(hi); | |
| | | return ((unsigned long long int)t << 32) + (unsigned long long int)lo; | |
| | | } | |
| | | | |
| | | __device_func__(int __ffs(int a)) | |
| | | { | |
| | | return 32 - __clz (a & -a); | |
| | | } | |
| | | | |
| | | __device_func__(int __ffsll(long long int a)) | |
| | | { | |
| | | return 64 - __clzll (a & -a); | |
| | | } | |
| | | | |
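The routines above lean on two classic identities: a & -a isolates the lowest set bit (so __ffs reduces to __clz), and the __popc ladder is a SWAR byte-count whose final __umul24 step just gathers the four byte counts into the top byte ((a * 0x808080) << 1, plus a, equals a * 0x01010101). A host sketch, using the GCC/Clang __builtin_clz for brevity:

    #include <stdio.h>

    static int popc(unsigned int a)
    {
        a = a - ((a >> 1) & 0x55555555u);
        a = (a & 0x33333333u) + ((a >> 2) & 0x33333333u);
        a = (a + (a >> 4)) & 0x0f0f0f0fu;
        return (int)((a * 0x01010101u) >> 24); /* byte sums land on top */
    }

    int main(void)
    {
        unsigned int a = 0x00f00000u;               /* bits 20..23 set */
        int ffs = 32 - __builtin_clz(a & (0u - a)); /* a != 0 here */
        printf("popc=%d ffs=%d\n", popc(a), ffs);   /* popc=4 ffs=21 */
        return 0;
    }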
| #if defined(CUDA_DOUBLE_MATH_FUNCTIONS) && defined(CUDA_FLOAT_MATH_FUNCTIONS) | | #if defined(CUDA_DOUBLE_MATH_FUNCTIONS) && defined(CUDA_FLOAT_MATH_FUNCTIONS) | |
| | | | |
| #error -- conflicting mode for double math routines | | #error -- conflicting mode for double math routines | |
| | | | |
| #endif /* CUDA_DOUBLE_MATH_FUNCTIONS && CUDA_FLOAT_MATH_FUNCTIONS */ | | #endif /* CUDA_DOUBLE_MATH_FUNCTIONS && CUDA_FLOAT_MATH_FUNCTIONS */ | |
| | | | |
| #if defined(CUDA_FLOAT_MATH_FUNCTIONS) | | #if defined(CUDA_FLOAT_MATH_FUNCTIONS) | |
| | | | |
| __device_func__(double fdivide(double a, double b)) | | __device_func__(double fdivide(double a, double b)) | |
| { | | { | |
| | | | |
End of changes. 31 change blocks. |
| 52 lines changed or deleted | | 3441 lines changed or added | |
|
| math_functions.h | | math_functions.h | |
| /* | | /* | |
|
| * Copyright 1993-2008 NVIDIA Corporation. All rights reserved. | | * Copyright 1993-2009 NVIDIA Corporation. All rights reserved. | |
| * | | * | |
| * NOTICE TO USER: | | * NOTICE TO USER: | |
| * | | * | |
| * This source code is subject to NVIDIA ownership rights under U.S. and | | * This source code is subject to NVIDIA ownership rights under U.S. and | |
| * international Copyright laws. Users and possessors of this source code | | * international Copyright laws. Users and possessors of this source code | |
| * are hereby granted a nonexclusive, royalty-free license to use this code | | * are hereby granted a nonexclusive, royalty-free license to use this code | |
| * in individual and commercial software. | | * in individual and commercial software. | |
| * | | * | |
| * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | |
| * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | |
| | | | |
| skipping to change at line 339 | | skipping to change at line 339 | |
| extern __host__ __device__ double remquo(double, double, int*) __THROW; | | extern __host__ __device__ double remquo(double, double, int*) __THROW; | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __host__ __device__ float remquof(float, float, int*) __THROW; | | extern __host__ __device__ float remquof(float, float, int*) __THROW; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __host__ __device__ double erf(double) __THROW; | | extern __host__ __device__ double erf(double) __THROW; | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __host__ __device__ float erff(float) __THROW; | | extern __host__ __device__ float erff(float) __THROW; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| | | extern __host__ __device__ double erfinv(double) __THROW; | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __host__ __device__ float erfinvf(float) __THROW; | |
| | | | |
| | | /*DEVICE_BUILTIN*/ | |
| extern __host__ __device__ double erfc(double) __THROW; | | extern __host__ __device__ double erfc(double) __THROW; | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __host__ __device__ float erfcf(float) __THROW; | | extern __host__ __device__ float erfcf(float) __THROW; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| | | extern __host__ __device__ double erfcinv(double) __THROW; | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __host__ __device__ float erfcinvf(float) __THROW; | |
| | | | |
| | | /*DEVICE_BUILTIN*/ | |
| extern __host__ __device__ double lgamma(double) __THROW; | | extern __host__ __device__ double lgamma(double) __THROW; | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __host__ __device__ float lgammaf(float) __THROW; | | extern __host__ __device__ float lgammaf(float) __THROW; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __host__ __device__ double tgamma(double) __THROW; | | extern __host__ __device__ double tgamma(double) __THROW; | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __host__ __device__ float tgammaf(float) __THROW; | | extern __host__ __device__ float tgammaf(float) __THROW; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| | | | |
| skipping to change at line 670 | | skipping to change at line 680 | |
| static __inline__ __host__ __device__ void sincos(float a, float *sptr, float *cptr) | | static __inline__ __host__ __device__ void sincos(float a, float *sptr, float *cptr) | |
| { | | { | |
| sincosf(a, sptr, cptr); | | sincosf(a, sptr, cptr); | |
| } | | } | |
| | | | |
| static __inline__ __host__ __device__ float erf(float a) | | static __inline__ __host__ __device__ float erf(float a) | |
| { | | { | |
| return erff(a); | | return erff(a); | |
| } | | } | |
| | | | |
|
| | | static __inline__ __host__ __device__ float erfinv(float a) | |
| | | { | |
| | | return erfinvf(a); | |
| | | } | |
| | | | |
| static __inline__ __host__ __device__ float erfc(float a) | | static __inline__ __host__ __device__ float erfc(float a) | |
| { | | { | |
| return erfcf(a); | | return erfcf(a); | |
| } | | } | |
| | | | |
|
| | | static __inline__ __host__ __device__ float erfcinv(float a) | |
| | | { | |
| | | return erfcinvf(a); | |
| | | } | |
| | | | |
| static __inline__ __host__ __device__ float lgamma(float a) | | static __inline__ __host__ __device__ float lgamma(float a) | |
| { | | { | |
| return lgammaf(a); | | return lgammaf(a); | |
| } | | } | |
| | | | |
| static __inline__ __host__ __device__ float tgamma(float a) | | static __inline__ __host__ __device__ float tgamma(float a) | |
| { | | { | |
| return tgammaf(a); | | return tgammaf(a); | |
| } | | } | |
| | | | |
| | | | |
| skipping to change at line 1055 | | skipping to change at line 1075 | |
| return copysignf(u, a); | | return copysignf(u, a); | |
| } | | } | |
| } | | } | |
| | | | |
| __device_func__(float __internal_fminf(float a, float b)) | | __device_func__(float __internal_fminf(float a, float b)) | |
| { | | { | |
| volatile union { | | volatile union { | |
| float f; | | float f; | |
| unsigned int i; | | unsigned int i; | |
| } cvta, cvtb; | | } cvta, cvtb; | |
|
| | | int nana, nanb; | |
| | | | |
| cvta.f = a; | | cvta.f = a; | |
| cvtb.f = b; | | cvtb.f = b; | |
|
| if ((cvta.i << 1) > 0xff000000) return b; | | nana = ((cvta.i << 1) > 0xff000000); | |
| if ((cvtb.i << 1) > 0xff000000) return a; | | nanb = ((cvtb.i << 1) > 0xff000000); | |
| | | if (nana && nanb) return a + b; | |
| | | if (nana) return b; | |
| | | if (nanb) return a; | |
| if ((cvta.i | cvtb.i) == 0x80000000) { | | if ((cvta.i | cvtb.i) == 0x80000000) { | |
| return CUDART_NEG_ZERO_F; | | return CUDART_NEG_ZERO_F; | |
| } | | } | |
| return a < b ? a : b; | | return a < b ? a : b; | |
| } | | } | |
| | | | |
| __device_func__(float __internal_fmaxf(float a, float b)) | | __device_func__(float __internal_fmaxf(float a, float b)) | |
| { | | { | |
| volatile union { | | volatile union { | |
| float f; | | float f; | |
| unsigned int i; | | unsigned int i; | |
| } cvta, cvtb; | | } cvta, cvtb; | |
|
| | | int nana, nanb; | |
| | | | |
| cvta.f = a; | | cvta.f = a; | |
| cvtb.f = b; | | cvtb.f = b; | |
|
| if ((cvta.i << 1) > 0xff000000) return b; | | nana = ((cvta.i << 1) > 0xff000000); | |
| if ((cvtb.i << 1) > 0xff000000) return a; | | nanb = ((cvtb.i << 1) > 0xff000000); | |
| | | if (nana && nanb) return a + b; | |
| | | if (nana) return b; | |
| | | if (nanb) return a; | |
| if ((cvta.f == 0.0f) && (cvtb.f == 0.0f)) { | | if ((cvta.f == 0.0f) && (cvtb.f == 0.0f)) { | |
| cvta.i &= cvtb.i; | | cvta.i &= cvtb.i; | |
| return cvta.f; | | return cvta.f; | |
| } | | } | |
| return a > b ? a : b; | | return a > b ? a : b; | |
| } | | } | |
| | | | |
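The rewritten min/max kernels above match the C99 fminf/fmaxf rules: one NaN operand yields the other operand, two NaN operands yield NaN (via a + b), and signed zeros are ordered. A host check of the same contract:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        float qnan = nanf("");
        printf("%g %g %d\n",
               fminf(qnan, 3.0f),               /* 3 */
               fmaxf(-2.0f, qnan),              /* -2 */
               isnan(fminf(qnan, qnan)) != 0);  /* 1 */
        return 0;
    }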
| #if defined(_WIN32) | | #if defined(_WIN32) | |
| | | | |
| __func__(double trunc(double a)) | | __func__(double trunc(double a)) | |
| | | | |
| skipping to change at line 1140 | | skipping to change at line 1168 | |
| return ceilf(a); | | return ceilf(a); | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_floorf(float a)) | | __device_func__(float __cuda_floorf(float a)) | |
| { | | { | |
| return floorf(a); | | return floorf(a); | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_sqrtf(float a)) | | __device_func__(float __cuda_sqrtf(float a)) | |
| { | | { | |
|
| return sqrtf(a); | | return sqrtf(a); | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_rsqrtf(float a)) | | __device_func__(float __cuda_rsqrtf(float a)) | |
| { | | { | |
|
| return 1.0f / sqrtf(a); | | return 1.0f / sqrtf(a); | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_truncf(float a)) | | __device_func__(float __cuda_truncf(float a)) | |
| { | | { | |
| return truncf(a); | | return truncf(a); | |
| } | | } | |
| | | | |
| __device_func__(int __cuda_max(int a, int b)) | | __device_func__(int __cuda_max(int a, int b)) | |
| { | | { | |
| return max(a, b); | | return max(a, b); | |
| | | | |
| skipping to change at line 1308 | | skipping to change at line 1336 | |
| #endif /* __CUDABE__ */ | | #endif /* __CUDABE__ */ | |
| return a; | | return a; | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_nanf(const char *tagp)) | | __device_func__(float __cuda_nanf(const char *tagp)) | |
| { | | { | |
| /* the GPU only has one canonical QNaN, so return that */ | | /* the GPU only has one canonical QNaN, so return that */ | |
| return CUDART_NAN_F; | | return CUDART_NAN_F; | |
| } | | } | |
| | | | |
|
| | | __device_func__(float __internal_fmad(float a, float b, float c)) | |
| | | { | |
| | | return a * b + c; | |
| | | } | |
| | | | |
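__internal_fmad makes every polynomial step below an explicit multiply-add, which the compiler can contract into a single FMAD; the chains that follow are plain Horner evaluations. An illustrative host analogue using C99 fmaf, evaluating 2x^3 + 3x^2 + 4x + 5 at x = 2:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        float x = 2.0f;
        float t = 2.0f;              /* leading coefficient */
        t = fmaf(t, x, 3.0f);        /* one Horner step per fused op */
        t = fmaf(t, x, 4.0f);
        t = fmaf(t, x, 5.0f);
        printf("%g\n", t);           /* 41 */
        return 0;
    }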
| /* approximate 2*atanh(a/2) for |a| < 0.245 */ | | /* approximate 2*atanh(a/2) for |a| < 0.245 */ | |
| __device_func__(float __internal_atanhf_kernel(float a_1, float a_2)) | | __device_func__(float __internal_atanhf_kernel(float a_1, float a_2)) | |
| { | | { | |
| float a, a2, t; | | float a, a2, t; | |
| | | | |
| a = a_1 + a_2; | | a = a_1 + a_2; | |
| a2 = a * a; | | a2 = a * a; | |
|
| t = 1.566305595598990E-001f/64.0f; | | t = 1.566305595598990E-001f/64.0f; | |
| t = t * a2 + 1.995081856004762E-001f/16.0f; | | t = __internal_fmad (t, a2, 1.995081856004762E-001f/16.0f); | |
| t = t * a2 + 3.333382699617026E-001f/4.0f; | | t = __internal_fmad (t, a2, 3.333382699617026E-001f/4.0f); | |
| t = t * a2; | | t = t * a2; | |
|
| t = t * a + a_2; | | t = __internal_fmad (t, a, a_2); | |
| t = t + a_1; | | t = t + a_1; | |
| return t; | | return t; | |
| } | | } | |
| | | | |
| /* compute atan(r) in first octant, i.e. 0 <= r <= 1 | | /* compute atan(r) in first octant, i.e. 0 <= r <= 1 | |
| * eps ~= 2.16e-7 | | * eps ~= 2.16e-7 | |
| */ | | */ | |
| __device_func__(float __internal_atanf_kernel(float a)) | | __device_func__(float __internal_atanf_kernel(float a)) | |
| { | | { | |
| float t4, t0, t1; | | float t4, t0, t1; | |
| | | | |
| t4 = a * a; | | t4 = a * a; | |
|
| t0 = - 5.674867153f; | | t0 = -5.674867153f; | |
| t0 = t4 * - 0.823362947f + t0; | | t0 = __internal_fmad (t4, -0.823362947f, t0); | |
| t0 = t0 * t4 - 6.565555096f; | | t0 = __internal_fmad (t0, t4, -6.565555096f); | |
| t0 = t0 * t4; | | t0 = t0 * t4; | |
| t0 = t0 * a; | | t0 = t0 * a; | |
|
| t1 = t4 + 11.33538818f; | | t1 = t4 + 11.33538818f; | |
| t1 = t1 * t4 + 28.84246826f; | | t1 = __internal_fmad (t1, t4, 28.84246826f); | |
| t1 = t1 * t4 + 19.69667053f; | | t1 = __internal_fmad (t1, t4, 19.69667053f); | |
| t1 = 1.0f / t1; | | t1 = 1.0f / t1; | |
|
| a = t0 * t1 + a; | | a = __internal_fmad (t0, t1, a); | |
| return a; | | return a; | |
| } | | } | |
| | | | |
| /* approximate tangent on -pi/4...+pi/4 */ | | /* approximate tangent on -pi/4...+pi/4 */ | |
| __device_func__(float __internal_tan_kernel(float a)) | | __device_func__(float __internal_tan_kernel(float a)) | |
| { | | { | |
| float a2, s, t; | | float a2, s, t; | |
| | | | |
| a2 = a * a; | | a2 = a * a; | |
|
| t = 4.114678393115178E-003f * a2 - 8.231194034909670E-001f; | | t = __internal_fmad (4.114678393115178E-003f, a2, -8.231194034909670E-001f); | |
| s = a2 - 2.469348886157666E+000f; | | s = a2 - 2.469348886157666E+000f; | |
| s = 1.0f / s; | | s = 1.0f / s; | |
| t = t * s; | | t = t * s; | |
| t = t * a2; | | t = t * a2; | |
|
| t = t * a + a; | | t = __internal_fmad (t, a, a); | |
| return t; | | return t; | |
| } | | } | |
| | | | |
| __device_func__(float __internal_accurate_logf(float a)) | | __device_func__(float __internal_accurate_logf(float a)) | |
| { | | { | |
| float t; | | float t; | |
| float z; | | float z; | |
| float m; | | float m; | |
| int ia, e; | | int ia, e; | |
| ia = __float_as_int(a); | | ia = __float_as_int(a); | |
| | | | |
| skipping to change at line 1380 | | skipping to change at line 1413 | |
| } | | } | |
| /* log(a) = 2 * atanh((a-1)/(a+1)) */ | | /* log(a) = 2 * atanh((a-1)/(a+1)) */ | |
| m = __int_as_float((ia & 0x807fffff) | 0x3f800000); | | m = __int_as_float((ia & 0x807fffff) | 0x3f800000); | |
| e = ((unsigned)ia >> 23) - 127; | | e = ((unsigned)ia >> 23) - 127; | |
| if (m > CUDART_SQRT_TWO_F) { | | if (m > CUDART_SQRT_TWO_F) { | |
| m = m * 0.5f; | | m = m * 0.5f; | |
| e = e + 1; | | e = e + 1; | |
| } | | } | |
| t = m - 1.0f; | | t = m - 1.0f; | |
| z = m + 1.0f; | | z = m + 1.0f; | |
|
| z = t / z; | | z = __fdividef (t, z); | |
| z = -t * z; | | z = -t * z; | |
| z = __internal_atanhf_kernel(t, z); | | z = __internal_atanhf_kernel(t, z); | |
|
| z = (float)e * CUDART_LN2_F + z; | | z = __internal_fmad ((float)e, CUDART_LN2_F, z); | |
| return z; | | return z; | |
| } | | } | |
| | | | |
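The identity behind __internal_accurate_logf, sketched on the host: write a = m * 2^e and use log(a) = e*ln(2) + 2*atanh((m-1)/(m+1)). (frexpf normalizes m into [0.5, 1) rather than the kernel's [sqrt(2)/2, sqrt(2)], but the identity is unchanged.)

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        float a = 10.0f;
        int e;
        float m = frexpf(a, &e);               /* a = m * 2^e */
        float t = (m - 1.0f) / (m + 1.0f);
        float r = (float)e * 0.69314718f + 2.0f * atanhf(t);
        printf("%f %f\n", r, logf(a));         /* both ~2.302585 */
        return 0;
    }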
| __device_func__(float2 __internal_log_ep(float a)) | | __device_func__(float2 __internal_log_ep(float a)) | |
| { | | { | |
| float2 res; | | float2 res; | |
| int expo; | | int expo; | |
| float m; | | float m; | |
| float log_hi, log_lo; | | float log_hi, log_lo; | |
| float t_hi, t_lo; | | float t_hi, t_lo; | |
| | | | |
| skipping to change at line 1426 | | skipping to change at line 1459 | |
| /* compute log(m) with extended precision using an algorithm from P.T.P. | | /* compute log(m) with extended precision using an algorithm from P.T.P. | |
| * Tang, "Table Driven Implementation of the Logarithm Function", TOMS, | | * Tang, "Table Driven Implementation of the Logarithm Function", TOMS, | |
| * Vol. 16., No. 4, December 1990, pp. 378-400. A modified polynomial | | * Vol. 16., No. 4, December 1990, pp. 378-400. A modified polynomial | |
| * approximation to atanh(x) on the interval [-0.1716, 0.1716] is utilized. | | * approximation to atanh(x) on the interval [-0.1716, 0.1716] is utilized. | |
| */ | | */ | |
| f = m - 1.0f; | | f = m - 1.0f; | |
| g = m + 1.0f; | | g = m + 1.0f; | |
| g = 1.0f / g; | | g = 1.0f / g; | |
| u = 2.0f * f * g; | | u = 2.0f * f * g; | |
| v = u * u; | | v = u * u; | |
|
| q = 1.49356810919559350E-001f/64.0f; | | q = 1.49356810919559350E-001f/64.0f; | |
| q = q * v + 1.99887797540072460E-001f/16.0f; | | q = __internal_fmad (q, v, 1.99887797540072460E-001f/16.0f); | |
| q = q * v + 3.33333880955515580E-001f/4.0f; | | q = __internal_fmad (q, v, 3.33333880955515580E-001f/4.0f); | |
| q = q * v; | | q = q * v; | |
| q = q * u; | | q = q * u; | |
| log_hi = __int_as_float(__float_as_int(u) & 0xfffff000); | | log_hi = __int_as_float(__float_as_int(u) & 0xfffff000); | |
| v = __int_as_float(__float_as_int(f) & 0xfffff000); | | v = __int_as_float(__float_as_int(f) & 0xfffff000); | |
| u = 2.0f * (f - log_hi); | | u = 2.0f * (f - log_hi); | |
| f = f - v; | | f = f - v; | |
|
| u = u - log_hi * v; | | u = __internal_fmad (-log_hi, v, u); | |
| u = u - log_hi * f; | | u = __internal_fmad (-log_hi, f, u); | |
| u = g * u; | | u = g * u; | |
| /* compute log(m) = log_hi + u + q in double-single format*/ | | /* compute log(m) = log_hi + u + q in double-single format*/ | |
| | | | |
| /* log += u; |log| > |u| */ | | /* log += u; |log| > |u| */ | |
| r = log_hi + u; | | r = log_hi + u; | |
| s = u - (r - log_hi); | | s = u - (r - log_hi); | |
| log_hi = r; | | log_hi = r; | |
| log_lo = s; | | log_lo = s; | |
| /* log += q; |log| > |q| */ | | /* log += q; |log| > |q| */ | |
| r = log_hi + q; | | r = log_hi + q; | |
| | | | |
| skipping to change at line 1541 | | skipping to change at line 1574 | |
| } | | } | |
| result[q] = hi; | | result[q] = hi; | |
| e = e & 31; | | e = e & 31; | |
| /* shift result such that hi:lo<63:62> are the least significant | | /* shift result such that hi:lo<63:62> are the least significant | |
| integer bits, and hi:lo<61:0> are the fractional bits of the result | | integer bits, and hi:lo<61:0> are the fractional bits of the result | |
| */ | | */ | |
| hi = result[idx+2]; | | hi = result[idx+2]; | |
| lo = result[idx+1]; | | lo = result[idx+1]; | |
| if (e) { | | if (e) { | |
| q = 32 - e; | | q = 32 - e; | |
|
| hi = (hi << e) | (lo >> q); | | hi = (hi << e) + (lo >> q); | |
| lo = (lo << e) | (result[idx] >> q); | | lo = (lo << e) + (result[idx] >> q); | |
| } | | } | |
| q = hi >> 30; | | q = hi >> 30; | |
| /* fraction */ | | /* fraction */ | |
|
| hi = (hi << 2) | (lo >> 30); | | hi = (hi << 2) + (lo >> 30); | |
| lo = (lo << 2); | | lo = (lo << 2); | |
| e = (hi + (lo > 0)) > 0x80000000; /* fraction >= 0.5 */ | | e = (hi + (lo > 0)) > 0x80000000; /* fraction >= 0.5 */ | |
| q += e; | | q += e; | |
| if (s) q = -q; | | if (s) q = -q; | |
| if (e) { | | if (e) { | |
| unsigned int t; | | unsigned int t; | |
| hi = ~hi; | | hi = ~hi; | |
| lo = -(int)lo; | | lo = -(int)lo; | |
| t = (lo == 0); | | t = (lo == 0); | |
| hi += t; | | hi += t; | |
| s = s ^ 0x80000000; | | s = s ^ 0x80000000; | |
| } | | } | |
| *quadrant = q; | | *quadrant = q; | |
| /* normalize fraction */ | | /* normalize fraction */ | |
| e = 0; | | e = 0; | |
| while ((int)hi > 0) { | | while ((int)hi > 0) { | |
|
| hi = (hi << 1) | (lo >> 31); | | hi = (hi << 1) + (lo >> 31); | |
| lo = (lo << 1); | | lo = (lo << 1); | |
| e--; | | e--; | |
| } | | } | |
| lo = hi * 0xc90fdaa2; | | lo = hi * 0xc90fdaa2; | |
| hi = __umulhi(hi, 0xc90fdaa2); | | hi = __umulhi(hi, 0xc90fdaa2); | |
| if ((int)hi > 0) { | | if ((int)hi > 0) { | |
|
| hi = (hi << 1) | (lo >> 31); | | hi = (hi << 1) + (lo >> 31); | |
| lo = (lo << 1); | | lo = (lo << 1); | |
| e--; | | e--; | |
| } | | } | |
| hi = hi + (lo > 0); | | hi = hi + (lo > 0); | |
| ia = s | (((e + 126) << 23) + (hi >> 8) + ((hi << 24) >= 0x80000000)); | | ia = s | (((e + 126) << 23) + (hi >> 8) + ((hi << 24) >= 0x80000000)); | |
| return __int_as_float(ia); | | return __int_as_float(ia); | |
| } | | } | |
|
| q = __float2int_rn(a * CUDART_2_OVER_PI_F); | | q = __float2int_rn (a * CUDART_2_OVER_PI_F); | |
| j = (float)q; | | j = (float)q; | |
|
| a = a - j * 1.5703125000000000e+000f; | | a = __internal_fmad (-j, 1.5703125000000000e+000f, a); | |
| a = a - j * 4.8351287841796875e-004f; | | a = __internal_fmad (-j, 4.8351287841796875e-004f, a); | |
| a = a - j * 3.1385570764541626e-007f; | | a = __internal_fmad (-j, 3.1385570764541626e-007f, a); | |
| a = a - j * 6.0771005065061922e-011f; | | a = __internal_fmad (-j, 6.0771005065061922e-011f, a); | |
| *quadrant = q; | | *quadrant = q; | |
| return a; | | return a; | |
| } | | } | |
| | | | |
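The tail of the reduction above is a Cody-Waite step: pi/2 is split into four constants with trailing zero bits, so each product with the small integer quotient q is exact and the remainder keeps full precision. A host sketch with the kernel's constants:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        float a = 100.0f;
        float q = rintf(a * 0.636619772f);       /* nearest multiple of pi/2 */
        float r = a;
        r -= q * 1.5703125000000000e+000f;       /* leading bits of pi/2 */
        r -= q * 4.8351287841796875e-004f;
        r -= q * 3.1385570764541626e-007f;
        r -= q * 6.0771005065061922e-011f;
        printf("q=%g r=%f\n", q, r);             /* q=64, r ~ -0.530965 */
        return 0;
    }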
| /* High quality implementation of expf(). A naive implementation, expf(x) = | | /* High quality implementation of expf(). A naive implementation, expf(x) = | |
| * exp2f (x * log2(e)), loses significant accuracy for large arguments, and | | * exp2f (x * log2(e)), loses significant accuracy for large arguments, and | |
| * may return results with only 15 to 16 good bits (out of 24). The present | | * may return results with only 15 to 16 good bits (out of 24). The present | |
| * implementation limits the error to about 2 ulps across the entire argument | | * implementation limits the error to about 2 ulps across the entire argument | |
| * range. It does so by employing an extended precision representation for | | * range. It does so by employing an extended precision representation for | |
| * ln(2) which is composited from ln2_hi = 0.6931457519f which provides the | | * ln(2) which is composited from ln2_hi = 0.6931457519f which provides the | |
| * most significant 16-bit of ln(2), and ln2_lo = 1.4286067653e-6f, which | | * most significant 16-bit of ln(2), and ln2_lo = 1.4286067653e-6f, which | |
| * provides the least significant 24 bits. | | * provides the least significant 24 bits. | |
| */ | | */ | |
| __device_func__(float __internal_expf_kernel(float a, float scale)) | | __device_func__(float __internal_expf_kernel(float a, float scale)) | |
| { | | { | |
| float j, z; | | float j, z; | |
| | | | |
| j = __cuda_truncf(a * CUDART_L2E_F); | | j = __cuda_truncf(a * CUDART_L2E_F); | |
|
| z = a - j * 0.6931457519f; | | z = __internal_fmad (j, -0.6931457519f, a); | |
| z = z - j * 1.4286067653e-6f; | | z = __internal_fmad (j, -1.4286067653e-6f, z); | |
| z = z * CUDART_L2E_F; | | z = z * CUDART_L2E_F; | |
| z = __cuda_exp2f(z) * __cuda_exp2f(j + scale); | | z = __cuda_exp2f(z) * __cuda_exp2f(j + scale); | |
| return z; | | return z; | |
| } | | } | |
| | | | |
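The comment above in code form, as a host sketch: after subtracting j*ln2_hi exactly and then j*ln2_lo, the residual z is small enough that the exp2f step loses no significant bits, and exp(a) = 2^j * exp(z).

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        float a = 20.0f;
        float j = truncf(a * 1.442695041f);  /* log2(e) */
        float z = a - j * 0.6931457519f;     /* ln2_hi: top bits of ln(2) */
        z = z - j * 1.4286067653e-6f;        /* ln2_lo: the low-order rest */
        printf("%g %g\n", exp2f(j) * expf(z), expf(a)); /* both ~4.85165e+08 */
        return 0;
    }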
| __device_func__(float __internal_accurate_expf(float a)) | | __device_func__(float __internal_accurate_expf(float a)) | |
| { | | { | |
| float z; | | float z; | |
|
| | | | |
| z = __internal_expf_kernel(a, 0.0f); | | z = __internal_expf_kernel(a, 0.0f); | |
| if (a < -105.0f) z = 0.0f; | | if (a < -105.0f) z = 0.0f; | |
| if (a > 105.0f) z = CUDART_INF_F; | | if (a > 105.0f) z = CUDART_INF_F; | |
| return z; | | return z; | |
| } | | } | |
| | | | |
| __device_func__(float __internal_accurate_exp10f(float a)) | | __device_func__(float __internal_accurate_exp10f(float a)) | |
| { | | { | |
| float j, z; | | float j, z; | |
|
| | | | |
| j = __cuda_truncf(a * CUDART_L2T_F); | | j = __cuda_truncf(a * CUDART_L2T_F); | |
|
| z = a - j * 3.0102920532226563e-001f; | | z = __internal_fmad (j, -3.0102920532226563e-001f, a); | |
| z = z - j * 7.9034171557301747e-007f; | | z = __internal_fmad (j, -7.9034171557301747e-007f, z); | |
| z = z * CUDART_L2T_F; | | z = z * CUDART_L2T_F; | |
| z = __cuda_exp2f(z) * __cuda_exp2f(j); | | z = __cuda_exp2f(z) * __cuda_exp2f(j); | |
| if (a < -46.0f) z = 0.0f; | | if (a < -46.0f) z = 0.0f; | |
| if (a > 46.0f) z = CUDART_INF_F; | | if (a > 46.0f) z = CUDART_INF_F; | |
| return z; | | return z; | |
| } | | } | |
| | | | |
| __device_func__(float __internal_lgammaf_pos(float a)) | | __device_func__(float __internal_lgammaf_pos(float a)) | |
| { | | { | |
| float sum; | | float sum; | |
| | | | |
| skipping to change at line 1646 | | skipping to change at line 1681 | |
| if (a == CUDART_INF_F) { | | if (a == CUDART_INF_F) { | |
| return a; | | return a; | |
| } | | } | |
| if (a >= 3.0f) { | | if (a >= 3.0f) { | |
| if (a >= 7.8f) { | | if (a >= 7.8f) { | |
| /* Stirling approximation for a >= 8; coefficients from Hart et al, | | /* Stirling approximation for a >= 8; coefficients from Hart et al, | |
| * "Computer Approximations", Wiley 1968. Approximation 5401 | | * "Computer Approximations", Wiley 1968. Approximation 5401 | |
| */ | | */ | |
| s = 1.0f / a; | | s = 1.0f / a; | |
| t = s * s; | | t = s * s; | |
|
| sum = 0.77783067e-3f; | | sum = 0.77783067e-3f; | |
| sum = sum * t - 0.2777655457e-2f; | | sum = __internal_fmad (sum, t, -0.2777655457e-2f); | |
| sum = sum * t + 0.83333273853e-1f; | | sum = __internal_fmad (sum, t, 0.83333273853e-1f); | |
| sum = sum * s + 0.918938533204672f; | | sum = __internal_fmad (sum, s, 0.918938533204672f); | |
| s = 0.5f * __internal_accurate_logf(a); | | s = 0.5f * __internal_accurate_logf(a); | |
| t = a - 0.5f; | | t = a - 0.5f; | |
| s = s * t; | | s = s * t; | |
| t = s - a; | | t = s - a; | |
| s = __fadd_rn(s, sum); /* prevent FMAD merging */ | | s = __fadd_rn(s, sum); /* prevent FMAD merging */ | |
| t = t + s; | | t = t + s; | |
| return t; | | return t; | |
| } else { | | } else { | |
| a = a - 3.0f; | | a = a - 3.0f; | |
|
| s = - 7.488903254816711E+002f; | | s = -7.488903254816711E+002f; | |
| s = s * a - 1.234974215949363E+004f; | | s = __internal_fmad (s, a, -1.234974215949363E+004f); | |
| s = s * a - 4.106137688064877E+004f; | | s = __internal_fmad (s, a, -4.106137688064877E+004f); | |
| s = s * a - 4.831066242492429E+004f; | | s = __internal_fmad (s, a, -4.831066242492429E+004f); | |
| s = s * a - 1.430333998207429E+005f; | | s = __internal_fmad (s, a, -1.430333998207429E+005f); | |
| t = a - 2.592509840117874E+002f; | | t = a - 2.592509840117874E+002f; | |
| t = t * a - 1.077717972228532E+004f; | | t = __internal_fmad (t, a, -1.077717972228532E+004f); | |
| t = t * a - 9.268505031444956E+004f; | | t = __internal_fmad (t, a, -9.268505031444956E+004f); | |
| t = t * a - 2.063535768623558E+005f; | | t = __internal_fmad (t, a, -2.063535768623558E+005f); | |
| t = s / t; | | t = __fdividef (s, t); | |
| t = t + a; | | t = t + a; | |
| return t; | | return t; | |
| } | | } | |
| } else if (a >= 1.5f) { | | } else if (a >= 1.5f) { | |
| a = a - 2.0f; | | a = a - 2.0f; | |
|
| t = + 4.959849168282574E-005f; | | t = 4.959849168282574E-005f; | |
| t = t * a - 2.208948403848352E-004f; | | t = __internal_fmad (t, a, -2.208948403848352E-004f); | |
| t = t * a + 5.413142447864599E-004f; | | t = __internal_fmad (t, a, 5.413142447864599E-004f); | |
| t = t * a - 1.204516976842832E-003f; | | t = __internal_fmad (t, a, -1.204516976842832E-003f); | |
| t = t * a + 2.884251838546602E-003f; | | t = __internal_fmad (t, a, 2.884251838546602E-003f); | |
| t = t * a - 7.382757963931180E-003f; | | t = __internal_fmad (t, a, -7.382757963931180E-003f); | |
| t = t * a + 2.058131963026755E-002f; | | t = __internal_fmad (t, a, 2.058131963026755E-002f); | |
| t = t * a - 6.735248600734503E-002f; | | t = __internal_fmad (t, a, -6.735248600734503E-002f); | |
| t = t * a + 3.224670187176319E-001f; | | t = __internal_fmad (t, a, 3.224670187176319E-001f); | |
| t = t * a + 4.227843368636472E-001f; | | t = __internal_fmad (t, a, 4.227843368636472E-001f); | |
| t = t * a; | | t = t * a; | |
| return t; | | return t; | |
| } else if (a >= 0.7f) { | | } else if (a >= 0.7f) { | |
| a = 1.0f - a; | | a = 1.0f - a; | |
|
| t = + 4.588266515364258E-002f; | | t = 4.588266515364258E-002f; | |
| t = t * a + 1.037396712740616E-001f; | | t = __internal_fmad (t, a, 1.037396712740616E-001f); | |
| t = t * a + 1.228036339653591E-001f; | | t = __internal_fmad (t, a, 1.228036339653591E-001f); | |
| t = t * a + 1.275242157462838E-001f; | | t = __internal_fmad (t, a, 1.275242157462838E-001f); | |
| t = t * a + 1.432166835245778E-001f; | | t = __internal_fmad (t, a, 1.432166835245778E-001f); | |
| t = t * a + 1.693435824224152E-001f; | | t = __internal_fmad (t, a, 1.693435824224152E-001f); | |
| t = t * a + 2.074079329483975E-001f; | | t = __internal_fmad (t, a, 2.074079329483975E-001f); | |
| t = t * a + 2.705875136435339E-001f; | | t = __internal_fmad (t, a, 2.705875136435339E-001f); | |
| t = t * a + 4.006854436743395E-001f; | | t = __internal_fmad (t, a, 4.006854436743395E-001f); | |
| t = t * a + 8.224669796332661E-001f; | | t = __internal_fmad (t, a, 8.224669796332661E-001f); | |
| t = t * a + 5.772156651487230E-001f; | | t = __internal_fmad (t, a, 5.772156651487230E-001f); | |
| t = t * a; | | t = t * a; | |
| return t; | | return t; | |
| } else { | | } else { | |
|
| t = + 3.587515669447039E-003f; | | t = 3.587515669447039E-003f; | |
| t = t * a - 5.471285428060787E-003f; | | t = __internal_fmad (t, a, -5.471285428060787E-003f); | |
| t = t * a - 4.462712795343244E-002f; | | t = __internal_fmad (t, a, -4.462712795343244E-002f); | |
| t = t * a + 1.673177015593242E-001f; | | t = __internal_fmad (t, a, 1.673177015593242E-001f); | |
| t = t * a - 4.213597883575600E-002f; | | t = __internal_fmad (t, a, -4.213597883575600E-002f); | |
| t = t * a - 6.558672843439567E-001f; | | t = __internal_fmad (t, a, -6.558672843439567E-001f); | |
| t = t * a + 5.772153712885004E-001f; | | t = __internal_fmad (t, a, 5.772153712885004E-001f); | |
| t = t * a; | | t = t * a; | |
|
| t = t * a + a; | | t = __internal_fmad (t, a, a); | |
| return -__internal_accurate_logf(t); | | return -__internal_accurate_logf(t); | |
| } | | } | |
| } | | } | |
| | | | |
| /* approximate sine on -pi/4...+pi/4 */ | | /* approximate sine on -pi/4...+pi/4 */ | |
| __device_func__(float __internal_sin_kernel(float x)) | | __device_func__(float __internal_sin_kernel(float x)) | |
| { | | { | |
| float x2, z; | | float x2, z; | |
| | | | |
| x2 = x * x; | | x2 = x * x; | |
|
| z = - 1.95152959e-4f; | | z = -1.95152959e-4f; | |
| z = z * x2 + 8.33216087e-3f; | | z = __internal_fmad (z, x2, 8.33216087e-3f); | |
| z = z * x2 - 1.66666546e-1f; | | z = __internal_fmad (z, x2, -1.66666546e-1f); | |
| z = z * x2; | | z = z * x2; | |
|
| z = z * x + x; | | z = __internal_fmad (z, x, x); | |
| | | | |
| return z; | | return z; | |
| } | | } | |
| | | | |
| /* approximate cosine on -pi/4...+pi/4 */ | | /* approximate cosine on -pi/4...+pi/4 */ | |
| __device_func__(float __internal_cos_kernel(float x)) | | __device_func__(float __internal_cos_kernel(float x)) | |
| { | | { | |
| float x2, z; | | float x2, z; | |
| | | | |
| x2 = x * x; | | x2 = x * x; | |
|
| z = 2.44331571e-5f; | | z = 2.44331571e-5f; | |
| z = z * x2 - 1.38873163e-3f; | | z = __internal_fmad (z, x2, -1.38873163e-3f); | |
| z = z * x2 + 4.16666457e-2f; | | z = __internal_fmad (z, x2, 4.16666457e-2f); | |
| z = z * x2 - 5.00000000e-1f; | | z = __internal_fmad (z, x2, -5.00000000e-1f); | |
| z = z * x2 + 1.00000000e+0f; | | z = __internal_fmad (z, x2, 1.00000000e+0f); | |
| return z; | | return z; | |
| } | | } | |
| | | | |
| __device_func__(float __internal_accurate_sinf(float a)) | | __device_func__(float __internal_accurate_sinf(float a)) | |
| { | | { | |
| float z; | | float z; | |
| int i; | | int i; | |
| | | | |
| if ((__cuda___isinff(a)) || (a == CUDART_ZERO_F)) { | | if ((__cuda___isinff(a)) || (a == CUDART_ZERO_F)) { | |
| return __fmul_rn (a, CUDART_ZERO_F); | | return __fmul_rn (a, CUDART_ZERO_F); | |
| | | | |
| skipping to change at line 1802 | | skipping to change at line 1836 | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| return cosf(a); | | return cosf(a); | |
| #elif defined(__USE_FAST_MATH__) | | #elif defined(__USE_FAST_MATH__) | |
| return __cosf(a); | | return __cosf(a); | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
| float z; | | float z; | |
| int i; | | int i; | |
| | | | |
| if (__cuda___isinff(a)) { | | if (__cuda___isinff(a)) { | |
|
| return CUDART_NAN_F; | | return __fadd_rn (a, -a); /* return NaN */ | |
| } | | } | |
| z = __internal_trig_reduction_kernel(a, &i); | | z = __internal_trig_reduction_kernel(a, &i); | |
| /* here, abs(z) <= pi/4, and i has the quadrant */ | | /* here, abs(z) <= pi/4, and i has the quadrant */ | |
| i++; | | i++; | |
| if (i & 1) { | | if (i & 1) { | |
| z = __internal_cos_kernel(z); | | z = __internal_cos_kernel(z); | |
| } else { | | } else { | |
| z = __internal_sin_kernel(z); | | z = __internal_sin_kernel(z); | |
| } | | } | |
| if (i & 2) { | | if (i & 2) { | |
| | | | |
| skipping to change at line 1836 | | skipping to change at line 1870 | |
| float z; | | float z; | |
| int i; | | int i; | |
| | | | |
| if (__cuda___isinff(a)) { | | if (__cuda___isinff(a)) { | |
| return CUDART_NAN_F; | | return CUDART_NAN_F; | |
| } | | } | |
| z = __internal_trig_reduction_kernel(a, &i); | | z = __internal_trig_reduction_kernel(a, &i); | |
| /* here, abs(z) <= pi/4, and i has the quadrant */ | | /* here, abs(z) <= pi/4, and i has the quadrant */ | |
| z = __internal_tan_kernel(z); | | z = __internal_tan_kernel(z); | |
| if (i & 1) { | | if (i & 1) { | |
|
| z = -1.0f / z; | | z = - (1.0f / z); | |
| } | | } | |
| return z; | | return z; | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_log2f(float a)) | | __device_func__(float __cuda_log2f(float a)) | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| return log2f(a); | | return log2f(a); | |
| #elif defined(__USE_FAST_MATH__) | | #elif defined(__USE_FAST_MATH__) | |
| | | | |
| skipping to change at line 1881 | | skipping to change at line 1915 | |
| return __internal_accurate_exp10f(a); | | return __internal_accurate_exp10f(a); | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_coshf(float a)) | | __device_func__(float __cuda_coshf(float a)) | |
| { | | { | |
| float z; | | float z; | |
| | | | |
| a = __cuda_fabsf(a); | | a = __cuda_fabsf(a); | |
| z = __internal_expf_kernel(a, -2.0f); | | z = __internal_expf_kernel(a, -2.0f); | |
|
| z = 2.0f * z + 0.125f / z; | | z = __internal_fmad (2.0f, z, __fdividef (0.125f, z)); | |
| if (a >= 90.0f) { | | if (a >= 90.0f) { | |
| z = CUDART_INF_F; /* overflow -> infinity */ | | z = CUDART_INF_F; /* overflow -> infinity */ | |
| } | | } | |
| return z; | | return z; | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_sinhf(float a)) | | __device_func__(float __cuda_sinhf(float a)) | |
| { | | { | |
| float s, z; | | float s, z; | |
| | | | |
| s = a; | | s = a; | |
| a = __cuda_fabsf(a); | | a = __cuda_fabsf(a); | |
| if (a < 1.0f) { /* danger of catastrophic cancellation */ | | if (a < 1.0f) { /* danger of catastrophic cancellation */ | |
| float a2 = a * a; | | float a2 = a * a; | |
| /* approximate sinh(x) on [0,1] with a polynomial */ | | /* approximate sinh(x) on [0,1] with a polynomial */ | |
|
| z = 2.816951222e-6f; | | z = 2.816951222e-6f; | |
| z = z * a2 + 1.983615978e-4f; | | z = __internal_fmad (z, a2, 1.983615978e-4f); | |
| z = z * a2 + 8.333350058e-3f; | | z = __internal_fmad (z, a2, 8.333350058e-3f); | |
| z = z * a2 + 1.666666650e-1f; | | z = __internal_fmad (z, a2, 1.666666650e-1f); | |
| z = z * a2; | | z = z * a2; | |
|
| z = z * a + a; | | z = __internal_fmad (z, a, a); | |
| } else { | | } else { | |
| z = __internal_expf_kernel(a, -2.0f); | | z = __internal_expf_kernel(a, -2.0f); | |
|
| z = 2.0f * z - 0.125f / z; | | z = __internal_fmad (2.0f, z, -__fdividef (0.125f, z)); | |
| if (a >= 90.0f) { | | if (a >= 90.0f) { | |
| z = CUDART_INF_F; /* overflow -> infinity */ | | z = CUDART_INF_F; /* overflow -> infinity */ | |
| } | | } | |
| } | | } | |
| return __cuda_copysignf(z, s); | | return __cuda_copysignf(z, s); | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_tanhf(float a)) | | __device_func__(float __cuda_tanhf(float a)) | |
| { | | { | |
| float s, t; | | float s, t; | |
| | | | |
| t = __cuda_fabsf(a); | | t = __cuda_fabsf(a); | |
| if (t < 0.55f) { | | if (t < 0.55f) { | |
| float z, z2; | | float z, z2; | |
| z = t; | | z = t; | |
| z2 = z * z; | | z2 = z * z; | |
|
| t = 1.643758066599993e-2f; | | t = 1.643758066599993e-2f; | |
| t = t * z2 - 5.267181327760551e-2f; | | t = __internal_fmad (t, z2, -5.267181327760551e-2f); | |
| t = t * z2 + 1.332072505223051e-1f; | | t = __internal_fmad (t, z2, 1.332072505223051e-1f); | |
| t = t * z2 - 3.333294663641083e-1f; | | t = __internal_fmad (t, z2, -3.333294663641083e-1f); | |
| t = t * z2; | | t = t * z2; | |
|
| s = t * z + z; | | s = __internal_fmad (t, z, z); | |
| } else { | | } else { | |
|
| s = 1.0f - 2.0f / (__internal_expf_kernel(2.0f * t, 0.0f) + 1.0f); | | s = 1.0f - __fdividef(2.0f, (__internal_expf_kernel(2.0f * t, 0.0f) + 1.0f)); | |
| if (t >= 88.0f) { | | if (t >= 88.0f) { | |
| s = 1.0f; | | s = 1.0f; | |
| } | | } | |
| } | | } | |
| return __cuda_copysignf(s, a); | | return __cuda_copysignf(s, a); | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_atan2f(float a, float b)) | | __device_func__(float __cuda_atan2f(float a, float b)) | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| | | | |
| skipping to change at line 1999 | | skipping to change at line 2033 | |
| } | | } | |
| return __cuda_copysignf(t1, a); | | return __cuda_copysignf(t1, a); | |
| } | | } | |
| | | | |
| /* approximate asin(a) on [0, 0.575] */ | | /* approximate asin(a) on [0, 0.575] */ | |
| __device_func__(float __internal_asinf_kernel(float a)) | | __device_func__(float __internal_asinf_kernel(float a)) | |
| { | | { | |
| float t2, t3, t4; | | float t2, t3, t4; | |
| | | | |
| t2 = a * a; | | t2 = a * a; | |
|
| t3 = - 0.501162291f; | | t3 = -0.501162291f; | |
| t3 = t3 * t2 + 0.915201485f; | | t3 = __internal_fmad (t3, t2, 0.915201485f); | |
| t3 = t3 * t2; | | t3 = t3 * t2; | |
| t3 = t3 * a; | | t3 = t3 * a; | |
|
| t4 = t2 - 5.478654385f; | | t4 = t2 - 5.478654385f; | |
| t4 = t4 * t2 + 5.491230488f; | | t4 = __internal_fmad (t4, t2, 5.491230488f); | |
| t4 = 1.0f / t4; | | t4 = 1.0f / t4; | |
|
| a = t3 * t4 + a; | | a = __internal_fmad (t3, t4, a); | |
| return a; | | return a; | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_asinf(float a)) | | __device_func__(float __cuda_asinf(float a)) | |
| { | | { | |
| float t0, t1, t2; | | float t0, t1, t2; | |
| | | | |
| t0 = __cuda_fabsf(a); | | t0 = __cuda_fabsf(a); | |
| t2 = 1.0f - t0; | | t2 = 1.0f - t0; | |
| t2 = 0.5f * t2; | | t2 = 0.5f * t2; | |
| t2 = __cuda_sqrtf(t2); | | t2 = __cuda_sqrtf(t2); | |
| t1 = t0 > 0.575f ? t2 : t0; | | t1 = t0 > 0.575f ? t2 : t0; | |
| t1 = __internal_asinf_kernel(t1); | | t1 = __internal_asinf_kernel(t1); | |
|
| t2 = -2.0f * t1 + CUDART_PIO2_F; | | t2 = __internal_fmad (-2.0f, t1, CUDART_PIO2_F); | |
| if (t0 > 0.575f) { | | if (t0 > 0.575f) { | |
| t1 = t2; | | t1 = t2; | |
| } | | } | |
| return __cuda_copysignf(t1, a); | | return __cuda_copysignf(t1, a); | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_acosf(float a)) | | __device_func__(float __cuda_acosf(float a)) | |
| { | | { | |
| float t0, t1, t2; | | float t0, t1, t2; | |
| | | | |
| | | | |
| skipping to change at line 2079 | | skipping to change at line 2113 | |
| return log1pf(a); | | return log1pf(a); | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
| float t; | | float t; | |
| #if !defined(__CUDABE__) && defined(_WIN32) | | #if !defined(__CUDABE__) && defined(_WIN32) | |
| /* MSVC doesn't handle negative zero correctly, so handle it separately */ | | /* MSVC doesn't handle negative zero correctly, so handle it separately */ | |
| if (a == 0.0f) return a; | | if (a == 0.0f) return a; | |
| #endif /* !__CUDABE__ && _WIN32 */ | | #endif /* !__CUDABE__ && _WIN32 */ | |
| if (a >= -0.394f && a <= 0.65f) { | | if (a >= -0.394f && a <= 0.65f) { | |
| /* log(a+1) = 2*atanh(a/(a+2)) */ | | /* log(a+1) = 2*atanh(a/(a+2)) */ | |
| t = a + 2.0f; | | t = a + 2.0f; | |
|
| t = a / t; | | t = __fdividef (a, t); | |
| t = -a * t; | | t = -a * t; | |
| t = __internal_atanhf_kernel (a, t); | | t = __internal_atanhf_kernel (a, t); | |
| } else { | | } else { | |
| t = __internal_accurate_logf (CUDART_ONE_F + a); | | t = __internal_accurate_logf (CUDART_ONE_F + a); | |
| } | | } | |
| return t; | | return t; | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_acoshf(float a)) | | __device_func__(float __cuda_acoshf(float a)) | |
| | | | |
| skipping to change at line 2101 | | skipping to change at line 2135 | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| return acoshf(a); | | return acoshf(a); | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
| float t; | | float t; | |
| | | | |
| t = a - 1.0f; | | t = a - 1.0f; | |
| if (__cuda_fabsf(t) > CUDART_TWO_TO_23_F) { | | if (__cuda_fabsf(t) > CUDART_TWO_TO_23_F) { | |
| /* for large a, acosh = log(2*a) */ | | /* for large a, acosh = log(2*a) */ | |
| return CUDART_LN2_F + __internal_accurate_logf(a); | | return CUDART_LN2_F + __internal_accurate_logf(a); | |
| } else { | | } else { | |
|
| t = t + __cuda_sqrtf(a * t + t); | | t = t + __cuda_sqrtf(__internal_fmad (a, t, t)); | |
| return __cuda_log1pf(t); | | return __cuda_log1pf(t); | |
| } | | } | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_asinhf(float a)) | | __device_func__(float __cuda_asinhf(float a)) | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| return asinhf(a); | | return asinhf(a); | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
| float fa, oofa, t; | | float fa, oofa, t; | |
| | | | |
| fa = __cuda_fabsf(a); | | fa = __cuda_fabsf(a); | |
| if (fa > CUDART_TWO_TO_126_F) { /* prevent intermediate underflow */ | | if (fa > CUDART_TWO_TO_126_F) { /* prevent intermediate underflow */ | |
| t = CUDART_LN2_F + __logf(fa); /* fast version is safe here */ | | t = CUDART_LN2_F + __logf(fa); /* fast version is safe here */ | |
| } else { | | } else { | |
| oofa = 1.0f / fa; | | oofa = 1.0f / fa; | |
|
| t = fa + fa / (oofa + __cuda_sqrtf(1.0f + oofa * oofa)); | | t = fa + __fdividef (fa, (oofa + __cuda_sqrtf(__internal_fmad(oofa, oofa, 1.0f)))); | |
| t = __cuda_log1pf(t); | | t = __cuda_log1pf(t); | |
| } | | } | |
| return __cuda_copysignf(t, a); | | return __cuda_copysignf(t, a); | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_atanhf(float a)) | | __device_func__(float __cuda_atanhf(float a)) | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| return atanhf(a); | | return atanhf(a); | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
| float fa, t; | | float fa, t; | |
| | | | |
| fa = __cuda_fabsf(a); | | fa = __cuda_fabsf(a); | |
|
| t = (2.0f * fa) / (1.0f - fa); | | t = __fdividef ((2.0f * fa), (1.0f - fa)); | |
| t = 0.5f * __cuda_log1pf(t); | | t = 0.5f * __cuda_log1pf(t); | |
| return __cuda_copysignf(t, a); | | return __cuda_copysignf(t, a); | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_expm1f(float a)) | | __device_func__(float __cuda_expm1f(float a)) | |
| { | | { | |
| float t, z, j, u; | | float t, z, j, u; | |
| /* expm1(a) = 2^t*(expm1(z)+1)-1 */ | | /* expm1(a) = 2^t*(expm1(z)+1)-1 */ | |
| t = __cuda_rintf (a * CUDART_L2E_F); | | t = __cuda_rintf (a * CUDART_L2E_F); | |
|
| z = a - t * 0.6931457519f; | | z = __internal_fmad (-t, 0.6931457519f, a); | |
| z = z - t * 1.4286067653e-6f; | | z = __internal_fmad (-t, 1.4286067653e-6f, z); | |
| /* prevent loss of accuracy for args a tad outside [-0.5*log(2),0.5*log(2)]*/ | | /* prevent loss of accuracy for args a tad outside [-0.5*log(2),0.5*log(2)]*/ | |
| if (__cuda_fabsf(a) < 0.41f) { | | if (__cuda_fabsf(a) < 0.41f) { | |
| z = a; | | z = a; | |
| t = 0.0f; | | t = 0.0f; | |
| } | | } | |
| /* prevent intermediate overflow */ | | /* prevent intermediate overflow */ | |
| j = t; | | j = t; | |
| if (t == 128.0f) j = j - 1.0f; | | if (t == 128.0f) j = j - 1.0f; | |
| /* expm1(z) on [log(2/3), log(3/2)] */ | | /* expm1(z) on [log(2/3), log(3/2)] */ | |
|
| u = 1.38795078474044430E-003f; | | u = 1.38795078474044430E-003f; | |
| u = u * z + 8.38241261853264930E-003f; | | u = __internal_fmad (u, z, 8.38241261853264930E-003f); | |
| u = u * z + 4.16678317762833940E-002f; | | u = __internal_fmad (u, z, 4.16678317762833940E-002f); | |
| u = u * z + 1.66663978874356580E-001f; | | u = __internal_fmad (u, z, 1.66663978874356580E-001f); | |
| u = u * z + 4.99999940395997040E-001f; | | u = __internal_fmad (u, z, 4.99999940395997040E-001f); | |
| u = u * z; | | u = u * z; | |
|
| u = u * z + z; | | u = __internal_fmad (u, z, z); | |
| if (a == 0.0f) u = a; // preserve input of -0 | | if (a == 0.0f) u = a; // preserve input of -0 | |
| /* 2^j*[expm1(z)+1]-1 = 2^j*expm1(z)+2^j-1 */ | | /* 2^j*[expm1(z)+1]-1 = 2^j*expm1(z)+2^j-1 */ | |
| z = __cuda_exp2f (j); | | z = __cuda_exp2f (j); | |
| a = z - 1.0f; | | a = z - 1.0f; | |
|
| if (a != 0.0f) u = u * z + a; // preserve -0 generated by FTZ | | if (a != 0.0f) u = __internal_fmad (u, z, a); // preserve -0 generated by FTZ | |
| if (t == 128.0f) u = u + u; // work around intermediate overflow | | if (t == 128.0f) u = u + u; // work around intermediate overflow | |
| /* handle massive overflow and underflow */ | | /* handle massive overflow and underflow */ | |
| if (j > 128.0f) u = CUDART_INF_F; | | if (j > 128.0f) u = CUDART_INF_F; | |
| if (j < -25.0f) u = -1.0f; | | if (j < -25.0f) u = -1.0f; | |
| return u; | | return u; | |
| } | | } | |
| | | | |
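The routine above follows the reduction expm1(a) = 2^j*expm1(z) + (2^j - 1): t = rint(a*log2(e)) picks the power of two, ln(2) is subtracted as a high/low pair so z stays accurate, and a minimax polynomial covers z on roughly [log(2/3), log(3/2)]. A minimal host-side C sketch of the same reduction, with the standard expm1f standing in for the device polynomial (the function name is illustrative, not part of the header):

    #include <math.h>

    /* Sketch: expm1(a) = 2^t * expm1(z) + (2^t - 1), z = a - t*ln(2). */
    static float expm1_reduction_sketch(float a)
    {
      float t = rintf(a * 1.442695041f);   /* t = rint(a / ln 2)              */
      float z = a - t * 0.6931457519f;     /* subtract the high part of ln 2  */
      z = z - t * 1.4286067653e-6f;        /* then the low part, for accuracy */
      float u = expm1f(z);                 /* stands in for the polynomial    */
      float s = exp2f(t);                  /* 2^t                             */
      return u * s + (s - 1.0f);           /* reassemble 2^t*expm1(z)+2^t-1   */
    }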
| __device_func__(float __cuda_hypotf(float a, float b)) | | __device_func__(float __cuda_hypotf(float a, float b)) | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| return hypotf(a, b); | | return hypotf(a, b); | |
| | | | |
| skipping to change at line 2193 | | skipping to change at line 2227 | |
| b = __cuda_fabsf(b); | | b = __cuda_fabsf(b); | |
| /* can't use min, max because they do not propagate NaNs */ | | /* can't use min, max because they do not propagate NaNs */ | |
| if (a > b) { | | if (a > b) { | |
| v = a; | | v = a; | |
| w = b; | | w = b; | |
| } else { | | } else { | |
| v = b; | | v = b; | |
| w = a; | | w = a; | |
| } | | } | |
| t = __internal_accurate_fdividef(w, v); | | t = __internal_accurate_fdividef(w, v); | |
|
| t = 1.0f + t * t; | | t = __internal_fmad (t, t, 1.0f); | |
| t = v * __cuda_sqrtf(t); | | t = v * __cuda_sqrtf(t); | |
| if (v == 0.0f) { | | if (v == 0.0f) { | |
| t = v + w; | | t = v + w; | |
| } | | } | |
| if ((v == CUDART_INF_F) || (w == CUDART_INF_F)) { | | if ((v == CUDART_INF_F) || (w == CUDART_INF_F)) { | |
| t = CUDART_INF_F; | | t = CUDART_INF_F; | |
| } | | } | |
| return t; | | return t; | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
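Both versions of hypotf rest on the same scaling identity: with v = max(|a|,|b|) and w = min(|a|,|b|), hypot(a,b) = v*sqrt(1 + (w/v)^2), so the squared term never exceeds 2 and cannot overflow. A host-side sketch of just that identity (the NaN and infinity handling the listing keeps is omitted here):

    #include <math.h>

    /* Sketch: hypot(a,b) = v * sqrt(1 + (w/v)^2) with v >= w >= 0. */
    static float hypot_sketch(float a, float b)
    {
      float v = fabsf(a), w = fabsf(b), t;
      if (w > v) { t = v; v = w; w = t; }  /* ensure v is the larger one    */
      if (v == 0.0f) return 0.0f;          /* avoid 0/0 when both are zero  */
      t = w / v;                           /* 0 <= t <= 1                   */
      return v * sqrtf(1.0f + t * t);      /* t*t <= 1, so no overflow here */
    }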
| skipping to change at line 2216 | | skipping to change at line 2250 | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| return cbrtf(a); | | return cbrtf(a); | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
| float s, t; | | float s, t; | |
| | | | |
| s = __cuda_fabsf(a); | | s = __cuda_fabsf(a); | |
| if ((a == 0.0f) || (s == CUDART_INF_F)) { | | if ((a == 0.0f) || (s == CUDART_INF_F)) { | |
| return a; | | return a; | |
| } | | } | |
|
| t = __cuda_exp2f(CUDART_THIRD_F * __log2f(s)); /* initial approximation */ | | t = __cuda_exp2f(CUDART_THIRD_F * __log2f(s)); /* initial approximation */ | |
| t = t - (t - (s / (t * t))) * CUDART_THIRD_F; /* refine approximation */ | | t = t - (t - (__fdividef(s, (t * t)))) * CUDART_THIRD_F; /* refine approximation */ | |
| if (__cuda___signbitf(a)) { | | if (__cuda___signbitf(a)) { | |
|
| t = -t; | | t = -t; | |
| } | | } | |
| return t; | | return t; | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
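The refinement step in cbrtf is one Newton iteration on f(t) = t^3 - s, rewritten as t - (t - s/t^2)/3 so it needs only a single division. A sketch of the two-step scheme, with exp2f/log2f replacing the fast device intrinsics:

    #include <math.h>

    /* Sketch: cbrt(s) for s > 0 via exp2(log2(s)/3) plus one Newton step. */
    static float cbrt_sketch(float s)
    {
      float t = exp2f(log2f(s) / 3.0f);           /* initial approximation       */
      t = t - (t - s / (t * t)) * (1.0f / 3.0f);  /* Newton: t -= (t^3-s)/(3t^2) */
      return t;
    }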
| __device_func__(float __cuda_erff(float a)) | | __device_func__(float __cuda_erff(float a)) | |
| { | | { | |
| float t, r, q; | | float t, r, q; | |
| | | | |
| t = __cuda_fabsf(a); | | t = __cuda_fabsf(a); | |
| if (t < 1.0f) { | | if (t < 1.0f) { | |
| t = t * t; | | t = t * t; | |
|
| r = -5.58510127926029810E-004f; | | r = -5.58510127926029810E-004f; | |
| r = r * t + 4.90688891415893070E-003f; | | r = __internal_fmad (r, t, 4.90688891415893070E-003f); | |
| r = r * t - 2.67027980930150640E-002f; | | r = __internal_fmad (r, t, -2.67027980930150640E-002f); | |
| r = r * t + 1.12799056505903940E-001f; | | r = __internal_fmad (r, t, 1.12799056505903940E-001f); | |
| r = r * t - 3.76122956138427440E-001f; | | r = __internal_fmad (r, t, -3.76122956138427440E-001f); | |
| r = r * t + 1.12837911712623450E+000f; | | r = __internal_fmad (r, t, 1.12837911712623450E+000f); | |
| a = a * r; | | a = a * r; | |
| } else if (t <= CUDART_INF_F) { | | } else if (t <= CUDART_INF_F) { | |
| /* coefficients from Hastings, "Approximations for Digital Computers", | | /* coefficients from Hastings, "Approximations for Digital Computers", | |
| * Princeton University Press 1955. Sheet 45. | | * Princeton University Press 1955. Sheet 45. | |
| */ | | */ | |
|
| q = 0.3275911f * t + 1.0f; | | q = __internal_fmad (t, 0.3275911f, 1.0f); | |
| q = 1.0f / q; | | q = 1.0f / q; | |
|
| r = 1.061405429f; | | r = 1.061405429f; | |
| r = r * q - 1.453152027f; | | r = __internal_fmad (r, q, -1.453152027f); | |
| r = r * q + 1.421413741f; | | r = __internal_fmad (r, q, 1.421413741f); | |
| r = r * q - 0.284496736f; | | r = __internal_fmad (r, q, -0.284496736f); | |
| r = r * q + 0.254829592f; | | r = __internal_fmad (r, q, 0.254829592f); | |
| r = r * q; | | r = r * q; | |
| q = __internal_expf_kernel(-a * a, 0.0f); | | q = __internal_expf_kernel(-a * a, 0.0f); | |
|
| r = 1.0f - q * r; | | r = __internal_fmad (-q, r, 1.0f); | |
| if (t >= 5.5f) { | | if (t >= 5.5f) { | |
| r = 1.0f; | | r = 1.0f; | |
| } | | } | |
| a = __int_as_float (__float_as_int(r) | (__float_as_int(a) & 0x80000000)); | | a = __int_as_float (__float_as_int(r) | (__float_as_int(a) & 0x80000000)); | |
| } | | } | |
| return a; | | return a; | |
| } | | } | |
| | | | |
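For t >= 1 the code evaluates the classic Hastings formula erf(t) ~ 1 - (a1*q + ... + a5*q^5)*exp(-t*t) with q = 1/(1 + 0.3275911*t); the coefficients below are copied from the listing. A host-side sketch of that branch:

    #include <math.h>

    /* Sketch of the Hastings tail branch above (valid for t >= 1). */
    static float erf_tail_sketch(float t)
    {
      float q = 1.0f / (1.0f + 0.3275911f * t);
      float r = 1.061405429f;              /* Horner evaluation of the poly */
      r = r * q - 1.453152027f;
      r = r * q + 1.421413741f;
      r = r * q - 0.284496736f;
      r = r * q + 0.254829592f;
      r = r * q;
      return 1.0f - r * expf(-t * t);
    }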
|
| | | __device_func__(float __cuda_erfinvf (float a)) | |
| | | { | |
| | | float fa, t; | |
| | | | |
| | | fa = fabsf(a); | |
| | | if (fa >= 1.0f) { | |
| | | t = __cuda_rsqrtf (__int_as_float (0xffc00000)); /* NaN */ | |
| | | if (fa == 1.0f) { | |
| | | t = a * CUDART_INF_F; /* Infinity */ | |
| | | } | |
| | | } else if (fa > 0.9375f) { | |
| | | /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev | |
| | | Approximations for the Inverse of the Error Function. Mathematics of | |
| | | Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 50 | |
| | | */ | |
| | | float p, q; | |
| | | | |
| | | t = __cuda_log1pf(-fa); | |
| | | t = __cuda_rsqrtf(-t); | |
| | | p = -1.64441567910e-1f; | |
| | | p = __internal_fmad (p, t, 6.80544246825e-1f); | |
| | | p = __internal_fmad (p, t, -1.12808139162e+0f); | |
| | | p = __internal_fmad (p, t, 6.90969348887e-1f); | |
| | | p = __internal_fmad (p, t, 1.38271964963e+0f); | |
| | | p = __internal_fmad (p, t, 1.55047000312e-1f); | |
| | | q = t + 1.38522814199e+0f; | |
| | | q = __internal_fmad (q, t, 1.55024849822e-1f); | |
| | | q = q * t; | |
| | | t = __fdividef (p, q); | |
| | | if (a < 0.0f) t = -t; | |
| | | } else if (fa > 0.75f) { | |
| | | /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev | |
| | | Approximations for the Inverse of the Error Function. Mathematics of | |
| | | Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 30 | |
| | | */ | |
| | | float p, q; | |
| | | | |
| | | t = __internal_fmad (a, a, -0.87890625f); | |
| | | p = -7.1986748896e+0f; | |
| | | p = __internal_fmad (p, t, +1.3411974175e+1f); | |
| | | p = __internal_fmad (p, t, -5.1381573203e+0f); | |
| | | p = __internal_fmad (p, t, 4.9633374831e-1f); | |
| | | q = t -1.1436535838e+1f; | |
| | | q = __internal_fmad (q, t, 1.3568885572e+1f); | |
| | | q = __internal_fmad (q, t, -4.1747509256e+0f); | |
| | | q = __internal_fmad (q, t, 3.5327242323e-1f); | |
| | | p = __fdividef (p, q); | |
| | | t = a * p; | |
| | | } else { /* polynomial approximation on [0, 0.75], max error 2 ulps */ | |
| | | float a2; | |
| | | | |
| | | a2 = a * a; | |
| | | t = 6.1046168794766742E-001f; | |
| | | t = __internal_fmad (t, a2, -8.9504882462753121E-001f); | |
| | | t = __internal_fmad (t, a2, 7.0224162369928511E-001f); | |
| | | t = __internal_fmad (t, a2, -1.9993784895823222E-001f); | |
| | | t = __internal_fmad (t, a2, 1.1920613463949599E-001f); | |
| | | t = __internal_fmad (t, a2, 8.0131492246997685E-002f); | |
| | | t = __internal_fmad (t, a2, 1.2793154958377403E-001f); | |
| | | t = __internal_fmad (t, a2, 2.3200529172828793E-001f); | |
| | | t = __internal_fmad (t, a2, 8.8622695604694379E-001f); | |
| | | t = t * a; | |
| | | } | |
| | | return t; | |
| | | } | |
| | | | |
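Since erfinv is an odd function, the central branch of the new erfinvf evaluates a polynomial in a^2 and scales by a; the two tail branches substitute t = 1/sqrt(-log1p(-|a|)) into a rational approximation from Blair et al. A sketch of the central-branch pattern, with C99 fmaf standing in for __internal_fmad and the coefficients copied from the listing:

    #include <math.h>

    /* Sketch: odd-polynomial branch of erfinv, t = a * P(a*a), |a| <= 0.75. */
    static float erfinv_center_sketch(float a)
    {
      float a2 = a * a;
      float t = 6.1046168794766742e-1f;
      t = fmaf(t, a2, -8.9504882462753121e-1f);
      t = fmaf(t, a2,  7.0224162369928511e-1f);
      t = fmaf(t, a2, -1.9993784895823222e-1f);
      t = fmaf(t, a2,  1.1920613463949599e-1f);
      t = fmaf(t, a2,  8.0131492246997685e-2f);
      t = fmaf(t, a2,  1.2793154958377403e-1f);
      t = fmaf(t, a2,  2.3200529172828793e-1f);
      t = fmaf(t, a2,  8.8622695604694379e-1f);
      return t * a;
    }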
| __device_func__(float __cuda_erfcf(float a)) | | __device_func__(float __cuda_erfcf(float a)) | |
| { | | { | |
| if (a <= 0.55f) { | | if (a <= 0.55f) { | |
| return 1.0f - __cuda_erff(a); | | return 1.0f - __cuda_erff(a); | |
| } else if (a > 10.0f) { | | } else if (a > 10.0f) { | |
| return 0.0f; | | return 0.0f; | |
| } else { | | } else { | |
| float p; | | float p; | |
| float q; | | float q; | |
| float h; | | float h; | |
| float l; | | float l; | |
| /* This rational approximation has a slight accuracy issue since all the | | /* This rational approximation has a slight accuracy issue since all the | |
| * coefficients have the same sign, so error accumulates when this is computed | | * coefficients have the same sign, so error accumulates when this is computed | |
| * in single precision. Also the division at the end isn't IEEE compliant. | | * in single precision. Also the division at the end isn't IEEE compliant. | |
| */ | | */ | |
|
| p = + 4.014893410762552E-006f; | | p = 4.014893410762552E-006f; | |
| p = p * a + 5.640401259462436E-001f; | | p = __internal_fmad (p, a, 5.640401259462436E-001f); | |
| p = p * a + 2.626649872281140E+000f; | | p = __internal_fmad (p, a, 2.626649872281140E+000f); | |
| p = p * a + 5.486372652389673E+000f; | | p = __internal_fmad (p, a, 5.486372652389673E+000f); | |
| p = p * a + 5.250714831459401E+000f; | | p = __internal_fmad (p, a, 5.250714831459401E+000f); | |
| q = a + 4.651376250488319E+000f; | | q = a + 4.651376250488319E+000f; | |
| q = q * a + 1.026302828878470E+001f; | | q = __internal_fmad (q, a, 1.026302828878470E+001f); | |
| q = q * a + 1.140762166021288E+001f; | | q = __internal_fmad (q, a, 1.140762166021288E+001f); | |
| q = q * a + 5.251211619089947E+000f; | | q = __internal_fmad (q, a, 5.251211619089947E+000f); | |
| /* Use reciprocal plus NR refinement for division */ | | /* Use reciprocal plus NR refinement for division */ | |
| h = 1.0f / q; | | h = 1.0f / q; | |
|
| q = 2.0f * h - q * h * h; | | q = __internal_fmad (-q * h, h, 2.0f * h); | |
| p = p * q; | | p = p * q; | |
| /* compute exp(-a*a) with extended precision to avoid error magnification */ | | /* compute exp(-a*a) with extended precision to avoid error magnification */ | |
| h = __int_as_float(__float_as_int(a) & 0xfffff000); /* upper 12 bits */ | | h = __int_as_float(__float_as_int(a) & 0xfffff000); /* upper 12 bits */ | |
| l = __fadd_rn (a, -h); /* lower 12 bits */ | | l = __fadd_rn (a, -h); /* lower 12 bits */ | |
| q = __fmul_rn (-h, h); /* this product is error free */ | | q = __fmul_rn (-h, h); /* this product is error free */ | |
| q = __internal_expf_kernel(q, 0.0f); | | q = __internal_expf_kernel(q, 0.0f); | |
| a = a + h; | | a = a + h; | |
| l = l * a; | | l = l * a; | |
| h = __internal_expf_kernel(-l, 0.0f); | | h = __internal_expf_kernel(-l, 0.0f); | |
| q = q * h; | | q = q * h; | |
| p = p * q; | | p = p * q; | |
| return p; | | return p; | |
| } | | } | |
| } | | } | |
| | | | |
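The heart of erfcf is computing exp(-a*a) without magnifying the rounding error of a*a: a is split into a head h holding the upper 12 mantissa bits and a tail l = a - h, so -a*a decomposes exactly into -h*h - (a+h)*l, and the two exponentials are multiplied. A host-side sketch of that splitting, with memcpy replacing the device bit-cast intrinsics:

    #include <math.h>
    #include <string.h>

    /* Sketch: exp(-a*a) in extended precision. With a = h + l and h*h
       exact, a*a = h*h + (a + h)*l, so the error term factors out. */
    static float exp_neg_square_sketch(float a)
    {
      unsigned int ia;
      float h, l;
      memcpy(&ia, &a, sizeof ia);
      ia &= 0xfffff000u;                   /* keep the upper 12 mantissa bits  */
      memcpy(&h, &ia, sizeof h);
      l = a - h;                           /* exact: h and a share an exponent */
      return expf(-h * h) * expf(-(a + h) * l);
    }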
|
| | | __device_func__(float __cuda_erfcinvf (float a)) | |
| | | { | |
| | | float t; | |
| | | if (a <= 0.0f) { | |
| | | t = CUDART_NAN_F; | |
| | | if (a == 0.0f) { | |
| | | t = (1.0f - a) * CUDART_INF_F; | |
| | | } | |
| | | } | |
| | | else if (a >= 0.0625f) { | |
| | | t = __cuda_erfinvf (1.0f - a); | |
| | | } | |
| | | else { | |
| | | float p, q; | |
| | | t = __cuda_logf(a); | |
| | | t = __cuda_rsqrtf(-t); | |
| | | p = -1.64441567910e-1f; | |
| | | p = __internal_fmad (p, t, 6.80544246825e-1f); | |
| | | p = __internal_fmad (p, t, -1.12808139162e+0f); | |
| | | p = __internal_fmad (p, t, 6.90969348887e-1f); | |
| | | p = __internal_fmad (p, t, 1.38271964963e+0f); | |
| | | p = __internal_fmad (p, t, 1.55047000312e-1f); | |
| | | q = t + 1.38522814199e+0f; | |
| | | q = __internal_fmad (q, t, 1.55024849822e-1f); | |
| | | q = q * t; | |
| | | t = __fdividef (p, q); | |
| | | } | |
| | | return t; | |
| | | } | |
| | | | |
| __device_func__(float __cuda_lgammaf(float a)) | | __device_func__(float __cuda_lgammaf(float a)) | |
| { | | { | |
| float t; | | float t; | |
| float i; | | float i; | |
| int quot; | | int quot; | |
| t = __internal_lgammaf_pos(__cuda_fabsf(a)); | | t = __internal_lgammaf_pos(__cuda_fabsf(a)); | |
| if (a >= 0.0f) return t; | | if (a >= 0.0f) return t; | |
| a = __cuda_fabsf(a); | | a = __cuda_fabsf(a); | |
| i = __cuda_floorf(a); | | i = __cuda_floorf(a); | |
| if (a == i) return CUDART_INF_F; /* a is an integer: return infinity */ | | if (a == i) return CUDART_INF_F; /* a is an integer: return infinity */ | |
| if (a < 1e-19f) return -__internal_accurate_logf(a); | | if (a < 1e-19f) return -__internal_accurate_logf(a); | |
| i = __cuda_rintf (2.0f * a); | | i = __cuda_rintf (2.0f * a); | |
| quot = (int)i; | | quot = (int)i; | |
|
| i = a - 0.5f * i; | | i = __internal_fmad (-i, 0.5f, a); | |
| i = i * CUDART_PI_F; | | i = i * CUDART_PI_F; | |
| if (quot & 1) { | | if (quot & 1) { | |
| i = __internal_cos_kernel(i); | | i = __internal_cos_kernel(i); | |
| } else { | | } else { | |
| i = __internal_sin_kernel(i); | | i = __internal_sin_kernel(i); | |
| } | | } | |
| i = __cuda_fabsf(i); | | i = __cuda_fabsf(i); | |
| t = CUDART_LNPI_F - __internal_accurate_logf(i * a) - t; | | t = CUDART_LNPI_F - __internal_accurate_logf(i * a) - t; | |
| return t; | | return t; | |
| } | | } | |
| | | | |
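For negative non-integer arguments the function applies the reflection formula: from gamma(x)*gamma(1-x) = pi/sin(pi*x) one gets lgamma(-a) = log(pi) - log(a*|sin(pi*a)|) - lgamma(a) for a > 0, which is exactly the final line above (the listing first reduces a modulo 1 so the sine stays accurate). A direct host-side check of the identity:

    #include <math.h>

    /* Sketch: reflection formula for lgamma at negative non-integers.
       Accurate only for modest a, since sinf(pi*a) is used unreduced. */
    static float lgamma_negative_sketch(float a)   /* a > 0, a not integral */
    {
      const float pi = 3.14159265f;
      float s = fabsf(sinf(pi * a));
      return logf(pi) - logf(a * s) - lgammaf(a);  /* = lgamma(-a) */
    }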
| __device_func__(float __cuda_ldexpf(float a, int b)) | | __device_func__(float __cuda_ldexpf(float a, int b)) | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| return ldexpf(a, b); | | return ldexpf(a, b); | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
| float fa = __cuda_fabsf(a); | | float fa = __cuda_fabsf(a); | |
| | | | |
| if ((fa == CUDART_ZERO_F) || (fa == CUDART_INF_F) || (b == 0)) { | | if ((fa == CUDART_ZERO_F) || (fa == CUDART_INF_F) || (b == 0)) { | |
|
| | | if (!(fa > CUDART_ZERO_F)) a = a + a; | |
| return a; | | return a; | |
|
| } | | } else if (__cuda_abs(b) < 126) { | |
| else if (__cuda_abs(b) < 126) { | | | |
| return a * __cuda_exp2f((float)b); | | return a * __cuda_exp2f((float)b); | |
|
| } | | } else if (__cuda_abs(b) < 252) { | |
| else if (__cuda_abs(b) < 252) { | | | |
| int bhalf = b / 2; | | int bhalf = b / 2; | |
| return a * __cuda_exp2f((float)bhalf) * __cuda_exp2f((float)(b - bhalf)); | | return a * __cuda_exp2f((float)bhalf) * __cuda_exp2f((float)(b - bhalf)); | |
|
| } | | } else { | |
| else { | | | |
| int bquarter = b / 4; | | int bquarter = b / 4; | |
| float t = __cuda_exp2f((float)bquarter); | | float t = __cuda_exp2f((float)bquarter); | |
| return a * t * t * t * __cuda_exp2f((float)(b - 3 * bquarter)); | | return a * t * t * t * __cuda_exp2f((float)(b - 3 * bquarter)); | |
| } | | } | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
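The cascaded branches exist because __cuda_exp2f over- or underflows for exponents outside roughly [-126, 128]; splitting b into halves (and quarters) keeps each factor in range while the product still equals a*2^b. A sketch of the two-way split (the three-way case for |b| >= 252 follows the same pattern):

    #include <math.h>
    #include <stdlib.h>

    /* Sketch: a * 2^b via split powers of two so no single exp2f call
       over- or underflows prematurely. Covers |b| < 252. */
    static float ldexp_sketch(float a, int b)
    {
      if (abs(b) < 126) return a * exp2f((float)b);
      int h = b / 2;                       /* both halves stay within range */
      return a * exp2f((float)h) * exp2f((float)(b - h));
    }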
| __device_func__(float __cuda_scalbnf(float a, int b)) | | __device_func__(float __cuda_scalbnf(float a, int b)) | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| | | | |
| skipping to change at line 2439 | | skipping to change at line 2567 | |
| return fmodf(a, b); | | return fmodf(a, b); | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
| float orig_a = a; | | float orig_a = a; | |
| float orig_b = b; | | float orig_b = b; | |
| a = __cuda_fabsf(a); | | a = __cuda_fabsf(a); | |
| b = __cuda_fabsf(b); | | b = __cuda_fabsf(b); | |
| if (!((a <= CUDART_INF_F) && (b <= CUDART_INF_F))) { | | if (!((a <= CUDART_INF_F) && (b <= CUDART_INF_F))) { | |
| return orig_a + orig_b; | | return orig_a + orig_b; | |
| } | | } | |
| if ((a == CUDART_INF_F) || (b == CUDART_ZERO_F)) { | | if ((a == CUDART_INF_F) || (b == CUDART_ZERO_F)) { | |
|
| return CUDART_NAN_F; | | return __cuda_rsqrtf (__int_as_float (0xffc00000)); | |
| } else if (a >= b) { | | } else if (a >= b) { | |
| #if !defined(__CUDABE__) | | #if !defined(__CUDABE__) | |
| /* Need to be able to handle denormals correctly */ | | /* Need to be able to handle denormals correctly */ | |
| int expoa = (a < CUDART_TWO_TO_M126_F) ? | | int expoa = (a < CUDART_TWO_TO_M126_F) ? | |
| ((int)__log2f(a)) : (((__float_as_int(a) >> 23) & 0xff) - 127); | | ((int)__log2f(a)) : (((__float_as_int(a) >> 23) & 0xff) - 127); | |
| int expob = (b < CUDART_TWO_TO_M126_F) ? | | int expob = (b < CUDART_TWO_TO_M126_F) ? | |
| ((int)__log2f(b)) : (((__float_as_int(b) >> 23) & 0xff) - 127); | | ((int)__log2f(b)) : (((__float_as_int(b) >> 23) & 0xff) - 127); | |
| int scale = expoa - expob; | | int scale = expoa - expob; | |
| float scaled_b = __cuda_ldexpf(b, scale); | | float scaled_b = __cuda_ldexpf(b, scale); | |
| if (scaled_b <= 0.5f * a) { | | if (scaled_b <= 0.5f * a) { | |
| | | | |
| skipping to change at line 2467 | | skipping to change at line 2595 | |
| } | | } | |
| #endif /* !__CUDABE__ */ | | #endif /* !__CUDABE__ */ | |
| while (scaled_b >= b) { | | while (scaled_b >= b) { | |
| if (a >= scaled_b) { | | if (a >= scaled_b) { | |
| a -= scaled_b; | | a -= scaled_b; | |
| } | | } | |
| scaled_b *= 0.5f; | | scaled_b *= 0.5f; | |
| } | | } | |
| return __cuda_copysignf(a, orig_a); | | return __cuda_copysignf(a, orig_a); | |
| } else { | | } else { | |
|
| | | if (!(a > CUDART_ZERO_F)) orig_a = orig_a + orig_a; | |
| return orig_a; | | return orig_a; | |
| } | | } | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
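The loop in fmodf is binary long division in disguise: the divisor is scaled up by powers of two until it passes |a|, then halved step by step, subtracting wherever it still fits; what remains is the remainder, given the sign of a. A compact host-side sketch (finite, nonzero, normal operands assumed; the denormal-aware exponent handling of the listing is omitted):

    #include <math.h>

    /* Sketch: fmod(a, b) by shift-and-subtract, one quotient bit per step. */
    static float fmod_sketch(float a, float b)
    {
      float r = fabsf(a), d = fabsf(b), s = d;
      while (s <= 0.5f * r) s *= 2.0f;     /* scale the divisor up toward r */
      while (s >= d) {                     /* then walk it back down        */
        if (r >= s) r -= s;                /* subtract where it fits        */
        s *= 0.5f;
      }
      return copysignf(r, a);              /* remainder carries a's sign    */
    }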
| __device_func__(float __cuda_remainderf(float a, float b)) | | __device_func__(float __cuda_remainderf(float a, float b)) | |
| { | | { | |
| | | | |
| float twoa = 0.0f; | | float twoa = 0.0f; | |
| unsigned int quot0 = 0; /* quotient bit 0 */ | | unsigned int quot0 = 0; /* quotient bit 0 */ | |
| float orig_a = a; | | float orig_a = a; | |
| float orig_b = b; | | float orig_b = b; | |
| | | | |
| a = __cuda_fabsf(a); | | a = __cuda_fabsf(a); | |
| b = __cuda_fabsf(b); | | b = __cuda_fabsf(b); | |
| if (!((a <= CUDART_INF_F) && (b <= CUDART_INF_F))) { | | if (!((a <= CUDART_INF_F) && (b <= CUDART_INF_F))) { | |
| return orig_a + orig_b; | | return orig_a + orig_b; | |
| } | | } | |
| if ((a == CUDART_INF_F) || (b == CUDART_ZERO_F)) { | | if ((a == CUDART_INF_F) || (b == CUDART_ZERO_F)) { | |
|
| return CUDART_NAN_F; | | return __cuda_rsqrtf (__int_as_float (0xffc00000)); | |
| } else if (a >= b) { | | } else if (a >= b) { | |
| #if !defined(__CUDABE__) | | #if !defined(__CUDABE__) | |
| int expoa = (a < CUDART_TWO_TO_M126_F) ? | | int expoa = (a < CUDART_TWO_TO_M126_F) ? | |
| ((int)__log2f(a)) : (((__float_as_int(a) >> 23) & 0xff) - 127); | | ((int)__log2f(a)) : (((__float_as_int(a) >> 23) & 0xff) - 127); | |
| int expob = (b < CUDART_TWO_TO_M126_F) ? | | int expob = (b < CUDART_TWO_TO_M126_F) ? | |
| ((int)__log2f(b)) : (((__float_as_int(b) >> 23) & 0xff) - 127); | | ((int)__log2f(b)) : (((__float_as_int(b) >> 23) & 0xff) - 127); | |
| int scale = expoa - expob; | | int scale = expoa - expob; | |
| float scaled_b = __cuda_ldexpf(b, scale); | | float scaled_b = __cuda_ldexpf(b, scale); | |
| if (scaled_b <= 0.5f * a) { | | if (scaled_b <= 0.5f * a) { | |
| scaled_b *= 2.0f; | | scaled_b *= 2.0f; | |
| | | | |
| skipping to change at line 2564 | | skipping to change at line 2693 | |
| /* quo has a value whose sign is the sign of x/y */ | | /* quo has a value whose sign is the sign of x/y */ | |
| sign = 0 - (__cuda___signbitf(a) != __cuda___signbitf(b)); | | sign = 0 - (__cuda___signbitf(a) != __cuda___signbitf(b)); | |
| a = __cuda_fabsf(a); | | a = __cuda_fabsf(a); | |
| b = __cuda_fabsf(b); | | b = __cuda_fabsf(b); | |
| if (!((a <= CUDART_INF_F) && (b <= CUDART_INF_F))) { | | if (!((a <= CUDART_INF_F) && (b <= CUDART_INF_F))) { | |
| *quo = quot; | | *quo = quot; | |
| return orig_a + orig_b; | | return orig_a + orig_b; | |
| } | | } | |
| if ((a == CUDART_INF_F) || (b == CUDART_ZERO_F)) { | | if ((a == CUDART_INF_F) || (b == CUDART_ZERO_F)) { | |
| *quo = quot; | | *quo = quot; | |
|
| return CUDART_NAN_F; | | return __cuda_rsqrtf (__int_as_float (0xffc00000)); | |
| } else if (a >= b) { | | } else if (a >= b) { | |
| #if !defined(__CUDABE__) | | #if !defined(__CUDABE__) | |
| /* Need to be able to handle denormals correctly */ | | /* Need to be able to handle denormals correctly */ | |
| int expoa = (a < CUDART_TWO_TO_M126_F) ? | | int expoa = (a < CUDART_TWO_TO_M126_F) ? | |
| ((int)__log2f(a)) : (((__float_as_int(a) >> 23) & 0xff) - 127); | | ((int)__log2f(a)) : (((__float_as_int(a) >> 23) & 0xff) - 127); | |
| int expob = (b < CUDART_TWO_TO_M126_F) ? | | int expob = (b < CUDART_TWO_TO_M126_F) ? | |
| ((int)__log2f(b)) : (((__float_as_int(b) >> 23) & 0xff) - 127); | | ((int)__log2f(b)) : (((__float_as_int(b) >> 23) & 0xff) - 127); | |
| int scale = expoa - expob; | | int scale = expoa - expob; | |
| float scaled_b = __cuda_ldexpf(b, scale); | | float scaled_b = __cuda_ldexpf(b, scale); | |
| if (scaled_b <= 0.5f * a) { | | if (scaled_b <= 0.5f * a) { | |
| | | | |
| skipping to change at line 2640 | | skipping to change at line 2769 | |
| __float_as_int(a)); | | __float_as_int(a)); | |
| quot = quot & CUDART_REMQUO_MASK_F; | | quot = quot & CUDART_REMQUO_MASK_F; | |
| quot = quot ^ sign; | | quot = quot ^ sign; | |
| quot = quot - sign; | | quot = quot - sign; | |
| *quo = quot; | | *quo = quot; | |
| return a; | | return a; | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_fmaf(float a, float b, float c)) | | __device_func__(float __cuda_fmaf(float a, float b, float c)) | |
| { | | { | |
|
| unsigned int xx, yy, zz, ww; | | return __fmaf_rn(a, b, c); | |
| unsigned int temp, s, u; | | | |
| unsigned int expo_x, expo_y, expo_z; | | | |
| | | | |
| xx = __float_as_int(a); | | | |
| yy = __float_as_int(b); | | | |
| zz = __float_as_int(c); | | | |
| | | | |
| #if defined(__CUDABE__) | | | |
| /* Match 'denormals are zero' behavior of the GPU */ | | | |
| if ((xx << 1) < 0x01000000) xx &= 0x80000000; | | | |
| if ((yy << 1) < 0x01000000) yy &= 0x80000000; | | | |
| if ((zz << 1) < 0x01000000) zz &= 0x80000000; | | | |
| #endif /* __CUDABE__ */ | | | |
| | | | |
| temp = 0xff; | | | |
| expo_x = temp & (xx >> 23); | | | |
| expo_x = expo_x - 1; | | | |
| expo_y = temp & (yy >> 23); | | | |
| expo_y = expo_y - 1; | | | |
| expo_z = temp & (zz >> 23); | | | |
| expo_z = expo_z - 1; | | | |
| | | | |
| if (!((expo_x <= 0xFD) && | | | |
| (expo_y <= 0xFD) && | | | |
| (expo_z <= 0xFD))) { | | | |
| /* fma (nan, y, z) --> nan | | | |
| fma (x, nan, z) --> nan | | | |
| fma (x, y, nan) --> nan | | | |
| */ | | | |
| if ((yy << 1) > 0xff000000) { | | | |
| return CUDART_NAN_F; | | | |
| } | | | |
| if ((zz << 1) > 0xff000000) { | | | |
| return CUDART_NAN_F; | | | |
| } | | | |
| if ((xx << 1) > 0xff000000) { | | | |
| return CUDART_NAN_F; | | | |
| } | | | |
| /* fma (0, inf, z) --> NaN | | | |
| fma (inf, 0, z) --> NaN | | | |
| fma (-inf,+y,+inf) --> NaN | | | |
| fma (+x,-inf,+inf) --> NaN | | | |
| fma (+inf,-y,+inf) --> NaN | | | |
| fma (-x,+inf,+inf) --> NaN | | | |
| fma (-inf,-y,-inf) --> NaN | | | |
| fma (-x,-inf,-inf) --> NaN | | | |
| fma (+inf,+y,-inf) --> NaN | | | |
| fma (+x,+inf,-inf) --> NaN | | | |
| */ | | | |
| if ((((xx << 1) == 0) && ((yy << 1) == 0xff000000)) || | | | |
| (((yy << 1) == 0) && ((xx << 1) == 0xff000000))) { | | | |
| return CUDART_NAN_F; | | | |
| } | | | |
| if ((zz << 1) == 0xff000000) { | | | |
| if (((yy << 1) == 0xff000000) || ((xx << 1) == 0xff000000)) { | | | |
| if ((int)(xx ^ yy ^ zz) < 0) { | | | |
| return CUDART_NAN_F; | | | |
| } | | | |
| } | | | |
| } | | | |
| /* fma (inf, y, z) --> inf | | | |
| fma (x, inf, z) --> inf | | | |
| fma (x, y, inf) --> inf | | | |
| */ | | | |
| if ((xx << 1) == 0xff000000) { | | | |
| xx = xx ^ (yy & 0x80000000); | | | |
| return __int_as_float(xx); | | | |
| } | | | |
| if ((yy << 1) == 0xff000000) { | | | |
| yy = yy ^ (xx & 0x80000000); | | | |
| return __int_as_float(yy); | | | |
| } | | | |
| if ((zz << 1) == 0xff000000) { | | | |
| return __int_as_float(zz); | | | |
| } | | | |
| /* fma (+0, -y, -0) --> -0 | | | |
| fma (-0, +y, -0) --> -0 | | | |
| fma (+x, -0, -0) --> -0 | | | |
| fma (-x, +0, -0) --> -0 | | | |
| */ | | | |
| if (zz == 0x80000000) { | | | |
| if (((xx << 1) == 0) || ((yy << 1) == 0)) { | | | |
| if ((int)(xx ^ yy) < 0) { | | | |
| return __int_as_float(zz); | | | |
| } | | | |
| } | | | |
| } | | | |
| /* fma (0, y, 0) --> +0 | | | |
| fma (x, 0, 0) --> +0 | | | |
| */ | | | |
| if (((zz << 1) == 0) && | | | |
| (((xx << 1) == 0) || ((yy << 1) == 0))) { | | | |
| zz &= 0x7fffffff; | | | |
| return __int_as_float(zz); | | | |
| } | | | |
| /* fma (0, y, z) --> z | | | |
| fma (x, 0, z) --> z | | | |
| */ | | | |
| if (((xx << 1) == 0) || ((yy << 1) == 0)) { | | | |
| return __int_as_float(zz); | | | |
| } | | | |
| /* normalize x, if denormal */ | | | |
| if (expo_x == (unsigned int)-1) { | | | |
| temp = xx & 0x80000000; | | | |
| xx = xx << 8; | | | |
| while (!(xx & 0x80000000)) { | | | |
| xx <<= 1; | | | |
| expo_x--; | | | |
| } | | | |
| expo_x++; | | | |
| xx = (xx >> 8) | temp; | | | |
| } | | | |
| /* normalize y, if denormal */ | | | |
| if (expo_y == (unsigned int)-1) { | | | |
| temp = yy & 0x80000000; | | | |
| yy = yy << 8; | | | |
| while (!(yy & 0x80000000)) { | | | |
| yy <<= 1; | | | |
| expo_y--; | | | |
| } | | | |
| expo_y++; | | | |
| yy = (yy >> 8) | temp; | | | |
| } | | | |
| /* normalize z, if denormal */ | | | |
| if ((expo_z == (unsigned int)-1) && ((zz << 1) != 0)) { | | | |
| temp = zz & 0x80000000; | | | |
| zz = zz << 8; | | | |
| while (!(zz & 0x80000000)) { | | | |
| zz <<= 1; | | | |
| expo_z--; | | | |
| } | | | |
| expo_z++; | | | |
| zz = (zz >> 8) | temp; | | | |
| } | | | |
| } | | | |
| | | | |
| expo_x = expo_x + expo_y; | | | |
| expo_y = xx ^ yy; | | | |
| xx = xx & 0x00ffffff; | | | |
| yy = yy << 8; | | | |
| xx = xx | 0x00800000; | | | |
| yy = yy | 0x80000000; | | | |
| | | | |
| s = __umulhi(xx, yy); | | | |
| yy = xx * yy; | | | |
| xx = s; | | | |
| expo_x = expo_x - 127 + 2; | | | |
| expo_y = expo_y & 0x80000000; | | | |
| | | | |
| /* normalize mantissa */ | | | |
| if (xx < 0x00800000) { | | | |
| xx = (xx << 1) | (yy >> 31); | | | |
| yy = (yy << 1); | | | |
| expo_x--; | | | |
| } | | | |
| temp = 0; | | | |
| if ((zz << 1) != 0) { /* z is not zero */ | | | |
| s = zz & 0x80000000; | | | |
| zz &= 0x00ffffff; | | | |
| zz |= 0x00800000; | | | |
| ww = 0; | | | |
| /* compare and swap. put augend into xx:yy */ | | | |
| if ((int)expo_z > (int)expo_x) { | | | |
| temp = expo_z; | | | |
| expo_z = expo_x; | | | |
| expo_x = temp; | | | |
| temp = zz; | | | |
| zz = xx; | | | |
| xx = temp; | | | |
| temp = ww; | | | |
| ww = yy; | | | |
| yy = temp; | | | |
| temp = expo_y; | | | |
| expo_y = s; | | | |
| s = temp; | | | |
| } | | | |
| /* augend_sign = expo_y, augend_mant = xx:yy, augend_expo = expo_x */ | | | |
| /* addend_sign = s, addend_mant = zz:ww, addend_expo = expo_z */ | | | |
| expo_z = expo_x - expo_z; | | | |
| u = expo_y ^ s; | | | |
| if (expo_z <= 49) { | | | |
| /* denormalize addend */ | | | |
| temp = 0; | | | |
| while (expo_z >= 32) { | | | |
| temp = ww | (temp != 0); | | | |
| ww = zz; | | | |
| zz = 0; | | | |
| expo_z -= 32; | | | |
| } | | | |
| if (expo_z) { | | | |
| temp = ((temp >> expo_z) | (ww << (32 - expo_z)) | | | | |
| ((temp << (32 - expo_z)) != 0)); | | | |
| ww = (ww >> expo_z) | (zz << (32 - expo_z)); | | | |
| zz = (zz >> expo_z); | | | |
| } | | | |
| } else { | | | |
| temp = 1; | | | |
| ww = 0; | | | |
| zz = 0; | | | |
| } | | | |
| if ((int)u < 0) { | | | |
| /* signs differ, effective subtraction */ | | | |
| temp = (unsigned)(-(int)temp); | | | |
| s = (temp != 0); | | | |
| u = yy - s; | | | |
| s = u > yy; | | | |
| yy = u - ww; | | | |
| s += yy > u; | | | |
| xx = (xx - zz) - s; | | | |
| if (!(xx | yy | temp)) { | | | |
| /* complete cancelation, return 0 */ | | | |
| return __int_as_float(xx); | | | |
| } | | | |
| if ((int)xx < 0) { | | | |
| /* Oops, augend had smaller mantissa. Negate mantissa and flip | | | |
| sign of result | | | |
| */ | | | |
| temp = ~temp; | | | |
| yy = ~yy; | | | |
| xx = ~xx; | | | |
| if (++temp == 0) { | | | |
| if (++yy == 0) { | | | |
| ++xx; | | | |
| } | | | |
| } | | | |
| expo_y ^= 0x80000000; | | | |
| } | | | |
| /* normalize mantissa, if necessary */ | | | |
| while (!(xx & 0x00800000)) { | | | |
| xx = (xx << 1) | (yy >> 31); | | | |
| yy = (yy << 1); | | | |
| expo_x--; | | | |
| } | | | |
| } else { | | | |
| /* signs are the same, effective addition */ | | | |
| yy = yy + ww; | | | |
| s = yy < ww; | | | |
| xx = xx + zz + s; | | | |
| if (xx & 0x01000000) { | | | |
| temp = temp | (yy << 31); | | | |
| yy = (yy >> 1) | (xx << 31); | | | |
| xx = ((xx & 0x80000000) | (xx >> 1)) & ~0x40000000; | | | |
| expo_x++; | | | |
| } | | | |
| } | | | |
| } | | | |
| temp = yy | (temp != 0); | | | |
| if (expo_x <= 0xFD) { | | | |
| /* normal */ | | | |
| xx |= expo_y; /* or in sign bit */ | | | |
| s = xx & 1; /* mantissa lsb */ | | | |
| xx += (temp == 0x80000000) ? s : (temp >> 31); | | | |
| xx = xx + (expo_x << 23); /* add in exponent */ | | | |
| return __int_as_float(xx); | | | |
| } else if ((int)expo_x >= 126) { | | | |
| /* overflow */ | | | |
| xx = expo_y | 0x7f800000; | | | |
| return __int_as_float(xx); | | | |
| } | | | |
| /* subnormal */ | | | |
| expo_x = (unsigned int)(-(int)expo_x); | | | |
| if (expo_x > 25) { | | | |
| /* massive underflow: return 0 */ | | | |
| return __int_as_float(expo_y); | | | |
| } | | | |
| yy = (xx << (32 - expo_x)) | ((yy) ? 1 : 0); | | | |
| xx = expo_y + (xx >> expo_x); | | | |
| xx = xx + ((yy==0x80000000) ? (xx & 1) : (yy >> 31)); | | | |
| xx |= expo_y; /* or in sign bit */ | | | |
| #if defined(__CUDABE__) | | | |
| /* Match 'flush to zero' response of the GPU */ | | | |
| if ((xx << 1) < 0x01000000) xx = expo_y; | | | |
| #endif /* __CUDABE__ */ | | | |
| return __int_as_float(xx); | | | |
| } | | } | |
| | | | |
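The several hundred lines of bit-level emulation on the left collapse into a single __fmaf_rn call on the right, presumably because the targets of this version expose a correctly rounded single-precision FMA. What the fused operation buys is one rounding instead of two; the snippet below makes that visible on a typical binary32 host with a hardware fmaf (the printed values are what IEEE-754 arithmetic predicts; extended-precision evaluation modes would differ):

    #include <math.h>
    #include <stdio.h>

    /* Sketch: fmaf rounds a*b+c once, so it can recover the rounding
       error of a product that plain multiply-add has already discarded. */
    int main(void)
    {
      float a = 1.0f + 0x1.0p-12f;         /* a*a is not exactly representable */
      float c = -(a * a);                  /* negated rounded product          */
      printf("%a\n", fmaf(a, a, c));       /* 0x1p-24: the rounding error      */
      printf("%a\n", a * a + c);           /* 0x0p+0: error already lost       */
      return 0;
    }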
| __device_func__(float __internal_accurate_powf(float a, float b)) | | __device_func__(float __internal_accurate_powf(float a, float b)) | |
| { | | { | |
| float2 loga, prod; | | float2 loga, prod; | |
| #if !defined(__CUDABE__) && defined(_MSC_VER) && !defined(_WIN64) | | #if !defined(__CUDABE__) && defined(_MSC_VER) && !defined(_WIN64) | |
| volatile float t; | | volatile float t; | |
| #else | | #else | |
| float t; | | float t; | |
| #endif | | #endif | |
| | | | |
| skipping to change at line 2948 | | skipping to change at line 2803 | |
| prod.x = prod.x + __int_as_float(0x37000000); | | prod.x = prod.x + __int_as_float(0x37000000); | |
| } | | } | |
| | | | |
| /* compute pow(a,b) = exp(b*log(a)) */ | | /* compute pow(a,b) = exp(b*log(a)) */ | |
| t = __cuda_expf(prod.y); | | t = __cuda_expf(prod.y); | |
| /* prevent -INF + INF = NaN */ | | /* prevent -INF + INF = NaN */ | |
| if (t != CUDART_INF_F) { | | if (t != CUDART_INF_F) { | |
| /* if prod.x is much smaller than prod.y, then exp(prod.y+prod.x) ~= | | /* if prod.x is much smaller than prod.y, then exp(prod.y+prod.x) ~= | |
| * exp(prod.y) + prod.x * exp(prod.y) | | * exp(prod.y) + prod.x * exp(prod.y) | |
| */ | | */ | |
|
| t = t * prod.x + t; | | t = __internal_fmad (t, prod.x, t); | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| | | | |
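__internal_accurate_powf computes pow(a,b) as exp(b*log(a)), carrying log(a) as a head/tail pair (the float2 loga and prod) so that the product b*log(a) holds more than single precision; the final fmad folds the tail back into the result. A collapsed host-side sketch in which one double stands in for the head/tail pair:

    #include <math.h>

    /* Sketch: pow(a, b) = exp(b * log(a)) with extra precision in the
       middle; a single double replaces the float2 head/tail used above. */
    static float pow_core_sketch(float a, float b)   /* assumes a > 0 */
    {
      double l = log((double)a);           /* log(a) beyond float precision */
      return (float)exp((double)b * l);    /* one rounding back to float    */
    }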
| __device_func__(float __cuda_powif(float a, int b)) | | __device_func__(float __cuda_powif(float a, int b)) | |
| { | | { | |
| unsigned int e = __cuda_abs(b); | | unsigned int e = __cuda_abs(b); | |
| float r = 1.0f; | | float r = 1.0f; | |
| | | | |
| while (1) { | | while (1) { | |
| if ((e & 1) != 0) { | | if ((e & 1) != 0) { | |
| r = r * a; | | r = r * a; | |
| } | | } | |
| e = e >> 1; | | e = e >> 1; | |
| if (e == 0) { | | if (e == 0) { | |
|
| return b < 0 ? 1.0f/r : r; | | return b < 0 ? 1.0f / r : r; | |
| } | | } | |
| a = a * a; | | a = a * a; | |
| } | | } | |
| } | | } | |
| | | | |
| __device_func__(double __cuda_powi(double a, int b)) | | __device_func__(double __cuda_powi(double a, int b)) | |
| { | | { | |
| unsigned int e = __cuda_abs(b); | | unsigned int e = __cuda_abs(b); | |
| double r = 1.0; | | double r = 1.0; | |
| | | | |
| while (1) { | | while (1) { | |
| if ((e & 1) != 0) { | | if ((e & 1) != 0) { | |
| r = r * a; | | r = r * a; | |
| } | | } | |
| e = e >> 1; | | e = e >> 1; | |
| if (e == 0) { | | if (e == 0) { | |
|
| return b < 0 ? 1.0/r : r; | | return b < 0 ? 1.0 / r : r; | |
| } | | } | |
| a = a * a; | | a = a * a; | |
| } | | } | |
| } | | } | |
| | | | |
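Both powif and powi are square-and-multiply: each iteration consumes one bit of the exponent while squaring the base, so the loop runs O(log |b|) times, and a negative exponent costs one reciprocal at the end. The same loop as a standalone host function, with a usage example:

    #include <stdio.h>

    /* Sketch: binary exponentiation, mirroring __cuda_powi above. */
    static double powi_sketch(double a, int b)
    {
      unsigned int e = (b < 0) ? (unsigned int)(-(long long)b) : (unsigned int)b;
      double r = 1.0;
      while (e) {
        if (e & 1) r *= a;                 /* multiply in the current bit */
        e >>= 1;
        a *= a;                            /* square the base each round  */
      }
      return (b < 0) ? 1.0 / r : r;
    }

    int main(void)
    {
      printf("%g\n", powi_sketch(2.0, -10));   /* prints 0.000976562 (1/1024) */
      return 0;
    }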
| __device_func__(float __cuda_powf(float a, float b)) | | __device_func__(float __cuda_powf(float a, float b)) | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| return powf(a, b); | | return powf(a, b); | |
| #elif defined(__USE_FAST_MATH__) | | #elif defined(__USE_FAST_MATH__) | |
| | | | |
| skipping to change at line 3024 | | skipping to change at line 2879 | |
| } | | } | |
| bIsOddInteger = (b - (2.0f * floorf(0.5f * b))) == 1.0f; | | bIsOddInteger = (b - (2.0f * floorf(0.5f * b))) == 1.0f; | |
| if (a == CUDART_ZERO_F) { | | if (a == CUDART_ZERO_F) { | |
| t = bIsOddInteger ? a : CUDART_ZERO_F; | | t = bIsOddInteger ? a : CUDART_ZERO_F; | |
| if (b < CUDART_ZERO_F) { | | if (b < CUDART_ZERO_F) { | |
| t = 1.0f / t; | | t = 1.0f / t; | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| if (a == -CUDART_INF_F) { | | if (a == -CUDART_INF_F) { | |
|
| t = (b < CUDART_ZERO_F) ? -1.0f/a : -a; | | t = - ((b < CUDART_ZERO_F) ? (1.0f / a) : a); | |
| if (bIsOddInteger) { | | if (bIsOddInteger) { | |
| t = __int_as_float(__float_as_int(t) ^ 0x80000000); | | t = __int_as_float(__float_as_int(t) ^ 0x80000000); | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| if ((a < CUDART_ZERO_F) && (b != __cuda_truncf(b))) { | | if ((a < CUDART_ZERO_F) && (b != __cuda_truncf(b))) { | |
|
| return CUDART_NAN_F; | | return __cuda_rsqrtf(__int_as_float(0xffc00000)); | |
| } | | } | |
| t = __cuda_fabsf(a); | | t = __cuda_fabsf(a); | |
| t = __internal_accurate_powf(t, b); | | t = __internal_accurate_powf(t, b); | |
| if ((a < CUDART_ZERO_F) && bIsOddInteger) { | | if ((a < CUDART_ZERO_F) && bIsOddInteger) { | |
| t = __int_as_float(__float_as_int(t) ^ 0x80000000); | | t = __int_as_float(__float_as_int(t) ^ 0x80000000); | |
| } | | } | |
| return t; | | return t; | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
| /* approximate 1.0/(x*gamma(x)) on [-0.5,0.5] */ | | /* approximate 1.0/(x*gamma(x)) on [-0.5,0.5] */ | |
| __device_func__(float __internal_tgammaf_kernel(float a)) | | __device_func__(float __internal_tgammaf_kernel(float a)) | |
| { | | { | |
| float t; | | float t; | |
|
| t = - 1.05767296987211380E-003f; | | t = -1.05767296987211380E-003f; | |
| t = t * a + 7.09279059435508670E-003f; | | t = __internal_fmad (t, a, 7.09279059435508670E-003f); | |
| t = t * a - 9.65347121958557050E-003f; | | t = __internal_fmad (t, a, -9.65347121958557050E-003f); | |
| t = t * a - 4.21736613253687960E-002f; | | t = __internal_fmad (t, a, -4.21736613253687960E-002f); | |
| t = t * a + 1.66542401247154280E-001f; | | t = __internal_fmad (t, a, 1.66542401247154280E-001f); | |
| t = t * a - 4.20043267827838460E-002f; | | t = __internal_fmad (t, a, -4.20043267827838460E-002f); | |
| t = t * a - 6.55878234051332940E-001f; | | t = __internal_fmad (t, a, -6.55878234051332940E-001f); | |
| t = t * a + 5.77215696929794240E-001f; | | t = __internal_fmad (t, a, 5.77215696929794240E-001f); | |
| t = t * a + 1.00000000000000000E+000f; | | t = __internal_fmad (t, a, 1.00000000000000000E+000f); | |
| return t; | | return t; | |
| } | | } | |
| | | | |
| /* Based on: Kraemer, W.: "Berechnung der Gammafunktion G(x) fuer reelle Punkt- | | /* Based on: Kraemer, W.: "Berechnung der Gammafunktion G(x) fuer reelle Punkt- | |
| und Intervallargumente". Zeitschrift fuer angewandte Mathematik und | | und Intervallargumente". Zeitschrift fuer angewandte Mathematik und | |
| Mechanik, Vol. 70 (1990), No. 6, pp. 581-584 | | Mechanik, Vol. 70 (1990), No. 6, pp. 581-584 | |
| */ | | */ | |
| __device_func__(float __cuda_tgammaf(float a)) | | __device_func__(float __cuda_tgammaf(float a)) | |
| { | | { | |
| float s, xx, x=a; | | float s, xx, x=a; | |
| | | | |
| skipping to change at line 3083 | | skipping to change at line 2938 | |
| xx = xx - 1.0f; | | xx = xx - 1.0f; | |
| s = s * xx; | | s = s * xx; | |
| } | | } | |
| if (x >= 0.5f) { | | if (x >= 0.5f) { | |
| xx = xx - 1.0f; | | xx = xx - 1.0f; | |
| } | | } | |
| xx = __internal_tgammaf_kernel(xx); | | xx = __internal_tgammaf_kernel(xx); | |
| if (x < 0.5f) { | | if (x < 0.5f) { | |
| xx = xx * x; | | xx = xx * x; | |
| } | | } | |
|
| s = s / xx; | | s = __fdividef(s, xx); | |
| if (x > 34.03f) { | | if (x > 34.03f) { | |
| /* Cannot use s = s * x - s due to intermediate overflow! */ | | /* Cannot use s = s * x - s due to intermediate overflow! */ | |
| xx = x - 1.0f; | | xx = x - 1.0f; | |
| s = s * xx; | | s = s * xx; | |
| } | | } | |
| return s; | | return s; | |
| } else { | | } else { | |
| if (x == __cuda_floorf(x)) { /* x is negative integer */ | | if (x == __cuda_floorf(x)) { /* x is negative integer */ | |
| x = CUDART_NAN_F; /* NaN, propagates through on device */ | | x = CUDART_NAN_F; /* NaN, propagates through on device */ | |
| #if !defined(__CUDABE__) | | #if !defined(__CUDABE__) | |
| | | | |
| skipping to change at line 3133 | | skipping to change at line 2988 | |
| return s; | | return s; | |
| } | | } | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_roundf(float a)) | | __device_func__(float __cuda_roundf(float a)) | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| return roundf(a); | | return roundf(a); | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
| float fa = __cuda_fabsf(a); | | float fa = __cuda_fabsf(a); | |
|
| if (fa > CUDART_TWO_TO_23_F) { | | float u = __cuda_copysignf (0.5f, a); | |
| return a; | | u = __cuda_truncf (a + u); | |
| } else { | | if (fa > CUDART_TWO_TO_23_F) u = a; | |
| float u = __cuda_floorf(fa + 0.5f); | | if (fa < 0.5f) u = __cuda_truncf (a); | |
| if (fa < 0.5f) u = 0.0f; | | return u; | |
| return __cuda_copysignf(u, a); | | | |
| } | | | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
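The rewritten roundf trades the old branch on the magnitude for trunc(a + copysign(0.5, a)) plus two patch-ups: at |a| > 2^23 the value is already integral (and the +0.5 addition could round up), and at |a| < 0.5 the result must be a correctly signed zero. A host-side rendering of the same sequence:

    #include <math.h>

    /* Sketch: round-half-away-from-zero as trunc(a + copysign(0.5, a)),
       with the two end cases patched afterwards. */
    static float round_sketch(float a)
    {
      float fa = fabsf(a);
      float u = truncf(a + copysignf(0.5f, a));
      if (fa > 8388608.0f) u = a;          /* 2^23: already an integer    */
      if (fa < 0.5f) u = truncf(a);        /* keeps the sign of -0 intact */
      return u;
    }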
| __device_func__(long long int __internal_llroundf_kernel(float a)) | | __device_func__(long long int __internal_llroundf_kernel(float a)) | |
| { | | { | |
| unsigned long long int res, t = 0LL; | | unsigned long long int res, t = 0LL; | |
| int shift; | | int shift; | |
| unsigned int ia = __float_as_int(a); | | unsigned int ia = __float_as_int(a); | |
| if ((ia << 1) > 0xff000000) return 0LL; | | if ((ia << 1) > 0xff000000) return 0LL; | |
| if ((int)ia >= 0x5f000000) return 0x7fffffffffffffffLL; | | if ((int)ia >= 0x5f000000) return 0x7fffffffffffffffLL; | |
| | | | |
| skipping to change at line 3161 | | skipping to change at line 3014 | |
| shift = 189 - ((ia >> 23) & 0xff); | | shift = 189 - ((ia >> 23) & 0xff); | |
| res = ((long long int)(((ia << 8) | 0x80000000) >> 1)) << 32; | | res = ((long long int)(((ia << 8) | 0x80000000) >> 1)) << 32; | |
| if (shift >= 64) { | | if (shift >= 64) { | |
| t = res; | | t = res; | |
| res = 0; | | res = 0; | |
| } else if (shift) { | | } else if (shift) { | |
| t = res << (64 - shift); | | t = res << (64 - shift); | |
| res = res >> shift; | | res = res >> shift; | |
| } | | } | |
| if (t >= 0x8000000000000000LL) { | | if (t >= 0x8000000000000000LL) { | |
|
| res++; | | res++; | |
| } | | } | |
| if ((int)ia < 0) res = (unsigned long long int)(-(long long int)res); | | if ((int)ia < 0) res = (unsigned long long int)(-(long long int)res); | |
| return (long long int)res; | | return (long long int)res; | |
| } | | } | |
| | | | |
| __device_func__(long long int __cuda_llroundf(float a)) | | __device_func__(long long int __cuda_llroundf(float a)) | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| return llroundf(a); | | return llroundf(a); | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
| | | | |
| skipping to change at line 3749 | | skipping to change at line 3602 | |
| __func__(int ilogbf(float a)) | | __func__(int ilogbf(float a)) | |
| { | | { | |
| return ilogb((double)a); | | return ilogb((double)a); | |
| } | | } | |
| | | | |
| __func__(float erff(float a)) | | __func__(float erff(float a)) | |
| { | | { | |
| return (float)erf((double)a); | | return (float)erf((double)a); | |
| } | | } | |
| | | | |
|
| | | __func__(float erfinvf(float a)) | |
| | | { | |
| | | return (float)erfinv((double)a); | |
| | | } | |
| | | | |
| __func__(float erfcf(float a)) | | __func__(float erfcf(float a)) | |
| { | | { | |
| return (float)erfc((double)a); | | return (float)erfc((double)a); | |
| } | | } | |
| | | | |
|
| | | __func__(float erfcinvf(float a)) | |
| | | { | |
| | | return (float)erfcinv((double)a); | |
| | | } | |
| | | | |
| __func__(float lgammaf(float a)) | | __func__(float lgammaf(float a)) | |
| { | | { | |
| return (float)lgamma((double)a); | | return (float)lgamma((double)a); | |
| } | | } | |
| | | | |
| __func__(float tgammaf(float a)) | | __func__(float tgammaf(float a)) | |
| { | | { | |
| return (float)tgamma((double)a); | | return (float)tgamma((double)a); | |
| } | | } | |
| | | | |
| | | | |
| skipping to change at line 3821 | | skipping to change at line 3684 | |
| __func__(double tgamma(double a)) | | __func__(double tgamma(double a)) | |
| { | | { | |
| return (double)__cuda_tgammaf((float)a); | | return (double)__cuda_tgammaf((float)a); | |
| } | | } | |
| | | | |
| __func__(double erf(double a)) | | __func__(double erf(double a)) | |
| { | | { | |
| return (double)__cuda_erff((float)a); | | return (double)__cuda_erff((float)a); | |
| } | | } | |
| | | | |
|
| | | __func__(double erfinv(double a)) | |
| | | { | |
| | | return (double)__cuda_erfinvf((float)a); | |
| | | } | |
| | | | |
| __func__(double erfc(double a)) | | __func__(double erfc(double a)) | |
| { | | { | |
| return (double)__cuda_erfcf((float)a); | | return (double)__cuda_erfcf((float)a); | |
| } | | } | |
| | | | |
|
| | | __func__(double erfcinv(double a)) | |
| | | { | |
| | | return (double)__cuda_erfcinvf((float)a); | |
| | | } | |
| | | | |
| __func__(double remquo(double a, double b, int *quo)) | | __func__(double remquo(double a, double b, int *quo)) | |
| { | | { | |
| return (double)__cuda_remquof((float)a, (float)b, quo); | | return (double)__cuda_remquof((float)a, (float)b, quo); | |
| } | | } | |
| | | | |
| __func__(double remainder(double a, double b)) | | __func__(double remainder(double a, double b)) | |
| { | | { | |
| return (double)__cuda_remainderf((float)a, (float)b); | | return (double)__cuda_remainderf((float)a, (float)b); | |
| } | | } | |
| | | | |
| | | | |
End of changes. 97 change blocks. |
| 466 lines changed or deleted | | 341 lines changed or added | |
|
| math_functions_dbl_ptx3.h | | math_functions_dbl_ptx3.h | |
| /* | | /* | |
|
| * Copyright 1993-2008 NVIDIA Corporation. All rights reserved. | | * Copyright 1993-2009 NVIDIA Corporation. All rights reserved. | |
| * | | * | |
| * NOTICE TO USER: | | * NOTICE TO USER: | |
| * | | * | |
| * This source code is subject to NVIDIA ownership rights under U.S. and | | * This source code is subject to NVIDIA ownership rights under U.S. and | |
| * international Copyright laws. Users and possessors of this source code | | * international Copyright laws. Users and possessors of this source code | |
| * are hereby granted a nonexclusive, royalty-free license to use this code | | * are hereby granted a nonexclusive, royalty-free license to use this code | |
| * in individual and commercial software. | | * in individual and commercial software. | |
| * | | * | |
| * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | |
| * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | |
| | | | |
| skipping to change at line 52 | | skipping to change at line 52 | |
| | | | |
| #elif !defined(__CUDACC__) | | #elif !defined(__CUDACC__) | |
| | | | |
| #include "crt/func_macro.h" | | #include "crt/func_macro.h" | |
| | | | |
| #define INT_MAX \ | | #define INT_MAX \ | |
| ((int)((unsigned int)-1 >> 1)) | | ((int)((unsigned int)-1 >> 1)) | |
| | | | |
| #include "device_functions.h" | | #include "device_functions.h" | |
| #include "math_constants.h" | | #include "math_constants.h" | |
|
| | | #if !defined(__CUDABE__) | |
| | | #include "common_types.h" | |
| | | #endif | |
| /***************************************************************************** | | /***************************************************************************** | |
| *                                                                           * | | *                                                                           * | |
| * DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS    * | | * DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS    * | |
| *                                                                           * | | *                                                                           * | |
| *****************************************************************************/ | | *****************************************************************************/ | |
| | | | |
| __device_func__(double __cuda_fabs(double a)) | | __device_func__(double __cuda_fabs(double a)) | |
| { | | { | |
| return fabs(a); | | return fabs(a); | |
| } | | } | |
| | | | |
| __device_func__(double __cuda_fmax(double a, double b)) | | __device_func__(double __cuda_fmax(double a, double b)) | |
| { | | { | |
| #if !defined(__CUDABE__) | | #if !defined(__CUDABE__) | |
|
| volatile union { | | volatile union __cudart_DoubleUlonglongCvt cvta, cvtb; | |
| double d; | | int nana, nanb; | |
| unsigned long long int l; | | | |
| } cvta, cvtb; | | | |
| cvta.d = a; | | cvta.d = a; | |
| cvtb.d = b; | | cvtb.d = b; | |
|
| if ((cvtb.l << 1) > 0xffe0000000000000ULL) return a; | | nana = ((cvta.i << 1) > 0xffe0000000000000ULL); | |
| if ((cvta.l << 1) > 0xffe0000000000000ULL) return b; | | nanb = ((cvtb.i << 1) > 0xffe0000000000000ULL); | |
| | | if (nana && nanb) return a + b; | |
| | | if (nana) return b; | |
| | | if (nanb) return a; | |
| if ((cvta.d == 0.0) && (cvtb.d == 0.0)) { | | if ((cvta.d == 0.0) && (cvtb.d == 0.0)) { | |
|
| cvta.l &= cvtb.l; | | cvta.i &= cvtb.i; | |
| return cvta.d; | | return cvta.d; | |
| } | | } | |
| return a > b ? a : b; | | return a > b ? a : b; | |
| #else | | #else | |
| return fmax(a, b); | | return fmax(a, b); | |
| #endif /* !defined(__CUDABE__) */ | | #endif /* !defined(__CUDABE__) */ | |
| } | | } | |
| | | | |
| __device_func__(double __cuda_fmin(double a, double b)) | | __device_func__(double __cuda_fmin(double a, double b)) | |
| { | | { | |
| #if !defined(__CUDABE__) | | #if !defined(__CUDABE__) | |
|
| volatile union { | | volatile union __cudart_DoubleUlonglongCvt cvta, cvtb; | |
| double d; | | int nana, nanb; | |
| unsigned long long int l; | | | |
| } cvta, cvtb; | | | |
| cvta.d = a; | | cvta.d = a; | |
| cvtb.d = b; | | cvtb.d = b; | |
|
| if ((cvtb.l << 1) > 0xffe0000000000000ULL) return a; | | nana = ((cvta.i << 1) > 0xffe0000000000000ULL); | |
| if ((cvta.l << 1) > 0xffe0000000000000ULL) return b; | | nanb = ((cvtb.i << 1) > 0xffe0000000000000ULL); | |
| if ((cvta.l | cvtb.l) == 0x8000000000000000ULL) { | | if (nana && nanb) return a + b; | |
| | | if (nana) return b; | |
| | | if (nanb) return a; | |
| | | if ((cvta.i | cvtb.i) == 0x8000000000000000ULL) { | |
| return CUDART_NEG_ZERO ; | | return CUDART_NEG_ZERO ; | |
| } | | } | |
| return a < b ? a : b; | | return a < b ? a : b; | |
| #else | | #else | |
| return fmin(a, b); | | return fmin(a, b); | |
| #endif /* !defined(__CUDABE__) */ | | #endif /* !defined(__CUDABE__) */ | |
| } | | } | |
| | | | |
| __device_func__(double __cuda_ceil(double a)) | | __device_func__(double __cuda_ceil(double a)) | |
| { | | { | |
| | | | |
| skipping to change at line 539 | | skipping to change at line 545 | |
| z = __internal_trig_reduction_kerneld(a, &i); | | z = __internal_trig_reduction_kerneld(a, &i); | |
| /* here, abs(z) <= pi/4, and i has the quadrant */ | | /* here, abs(z) <= pi/4, and i has the quadrant */ | |
| z = __internal_tan_kerneld(z, i & 1); | | z = __internal_tan_kerneld(z, i & 1); | |
| return z; | | return z; | |
| } | | } | |
| | | | |
| __device_func__(double __cuda_log(double a)) | | __device_func__(double __cuda_log(double a)) | |
| { | | { | |
| double m, f, g, u, v, tmp, q, ulo, log_lo, log_hi; | | double m, f, g, u, v, tmp, q, ulo, log_lo, log_hi; | |
| int ihi, ilo; | | int ihi, ilo; | |
|
| int e = 0; | | | |
| | | | |
| ihi = __double2hiint(a); | | ihi = __double2hiint(a); | |
| ilo = __double2loint(a); | | ilo = __double2loint(a); | |
|
| if (__cuda___isnan(a)) { | | | |
| return a + a; | | if ((a > CUDART_ZERO) && (a < CUDART_INF)) { | |
| } | | int e = -1023; | |
| /* log(x) is undefined for x < 0.0, return INDEFINITE */ | | /* normalize denormals */ | |
| if (a < 0.0) { | | if ((unsigned)ihi < (unsigned)0x00100000) { | |
| | | a = a * CUDART_TWO_TO_54; | |
| | | e -= 54; | |
| | | ihi = __double2hiint(a); | |
| | | ilo = __double2loint(a); | |
| | | } | |
| | | /* a = m * 2^e. m <= sqrt(2): log2(a) = log2(m) + e. | |
| | | * m > sqrt(2): log2(a) = log2(m/2) + (e+1) | |
| | | */ | |
| | | e += ((ihi >> 20) & 0x7ff); | |
| | | ihi = (ihi & 0x800fffff) | 0x3ff00000; | |
| | | m = __hiloint2double (ihi, ilo); | |
| | | if ((unsigned)ihi > (unsigned)0x3ff6a09e) { | |
| | | m = __internal_half(m); | |
| | | e = e + 1; | |
| | | } | |
| | | /* log((1+m)/(1-m)) = 2*atanh(m). log(m) = 2*atanh ((m-1)/(m+1)) */ | |
| | | f = m - 1.0; | |
| | | g = m + 1.0; | |
| | | g = 1.0 / g; | |
| | | u = f * g; | |
| | | u = u + u; | |
| | | /* u = 2.0 * (m - 1.0) / (m + 1.0) */ | |
| | | v = u * u; | |
| | | q = 6.7261411553826339E-2/65536.0; | |
| | | q = __fma_rn (q, v, 6.6133829643643394E-2/16384.0); | |
| | | q = __fma_rn (q, v, 7.6940931149150890E-2/4096.0); | |
| | | q = __fma_rn (q, v, 9.0908745692137444E-2/1024.0); | |
| | | q = __fma_rn (q, v, 1.1111111499059706E-1/256.0); | |
| | | q = __fma_rn (q, v, 1.4285714283305975E-1/64.0); | |
| | | q = __fma_rn (q, v, 2.0000000000007223E-1/16.0); | |
| | | q = __fma_rn (q, v, 3.3333333333333326E-1/4.0); | |
| | | tmp = __internal_twice (f - u); | |
| | | tmp = __fma_rn (-u, f, tmp); // tmp = remainder of division | |
| | | ulo = g * tmp; // less significant quotient bits | |
| | | /* u + ulo = 2.0 * (m - 1.0) / (m + 1.0) to more than double precision */ | |
| | | q = q * v; | |
| | | q = q * u; | |
| | | /* log_hi + log_lo = log(m) to more than double precision */ | |
| | | log_hi = u; | |
| | | log_lo = ulo + q; | |
| | | /* log_hi + log_lo = log(m)+e*log(2)=log(a) to more than double precision */ | |
| | | q = __fma_rn ( e, CUDART_LN2_HI, log_hi); | |
| | | tmp = __fma_rn (-e, CUDART_LN2_HI, q); | |
| | | tmp = tmp - log_hi; | |
| | | log_hi = q; | |
| | | log_lo = log_lo - tmp; | |
| | | log_lo = __fma_rn (e, CUDART_LN2_LO, log_lo); | |
| | | return log_hi + log_lo; | |
| | | } else { | |
| | | if (__cuda___isnan(a)) { | |
| | | return a + a; | |
| | | } | |
| | | /* log(0) = -INF */ | |
| | | if (a == 0) { | |
| | | return -CUDART_INF; | |
| | | } | |
| | | /* log(INF) = INF */ | |
| | | if (a == CUDART_INF) { | |
| | | return a; | |
| | | } | |
| | | /* log(x) is undefined for x < 0.0, return INDEFINITE */ | |
| return CUDART_NAN; | | return CUDART_NAN; | |
| } | | } | |
|
| /* log(0) = -INF */ | | | |
| if (a == 0) { | | | |
| return -CUDART_INF; | | | |
| } | | | |
| /* log(INF) = INF */ | | | |
| if (__cuda___isinf(a)) { | | | |
| return a; | | | |
| } | | | |
| /* normalize denormals */ | | | |
| if (a < CUDART_TWO_TO_M1022) { | | | |
| a = a * CUDART_TWO_TO_54; | | | |
| e = -54; | | | |
| ihi = __double2hiint(a); | | | |
| ilo = __double2loint(a); | | | |
| } | | | |
| /* a = m * 2^e. m <= sqrt(2): log2(a) = log2(m) + e. | | | |
| * m > sqrt(2): log2(a) = log2(m/2) + (e+1) | | | |
| */ | | | |
| e += ((ihi >> 20) & 0x7ff) - 1023; | | | |
| m = __hiloint2double ((ihi & 0x800fffff) | 0x3ff00000, ilo); | | | |
| if (m > CUDART_SQRT_TWO) { | | | |
| m = __internal_half(m); | | | |
| e = e + 1; | | | |
| } | | | |
| /* log((1+m)/(1-m)) = 2*atanh(m). log(m) = 2*atanh ((m-1)/(m+1)) */ | | | |
| f = m - 1.0; | | | |
| g = m + 1.0; | | | |
| g = 1.0 / g; | | | |
| u = f * g; | | | |
| u = u + u; | | | |
| /* u = 2.0 * (m - 1.0) / (m + 1.0) */ | | | |
| v = u * u; | | | |
| q = 6.7261411553826339E-2/65536.0; | | | |
| q = __fma_rn (q, v, 6.6133829643643394E-2/16384.0); | | | |
| q = __fma_rn (q, v, 7.6940931149150890E-2/4096.0); | | | |
| q = __fma_rn (q, v, 9.0908745692137444E-2/1024.0); | | | |
| q = __fma_rn (q, v, 1.1111111499059706E-1/256.0); | | | |
| q = __fma_rn (q, v, 1.4285714283305975E-1/64.0); | | | |
| q = __fma_rn (q, v, 2.0000000000007223E-1/16.0); | | | |
| q = __fma_rn (q, v, 3.3333333333333326E-1/4.0); | | | |
| tmp = __internal_twice (f - u); | | | |
| tmp = __fma_rn (-u, f, tmp); // tmp = remainder of division | | | |
| ulo = g * tmp; // less significant quotient bits | | | |
| /* u + ulo = 2.0 * (m - 1.0) / (m + 1.0) to more than double precision */ | | | |
| q = q * v; | | | |
| q = q * u; | | | |
| /* log_hi + log_lo = log(m) to more than double precision */ | | | |
| log_hi = u; | | | |
| log_lo = ulo + q; | | | |
| /* log_hi + log_lo = log(m)+e*log(2)=log(a) to more than double precision */ | | | |
| q = __fma_rn ( e, CUDART_LN2_HI, log_hi); | | | |
| tmp = __fma_rn (-e, CUDART_LN2_HI, q); | | | |
| tmp = tmp - log_hi; | | | |
| log_hi = q; | | | |
| log_lo = log_lo - tmp; | | | |
| log_lo = __fma_rn (e, CUDART_LN2_LO, log_lo); | | | |
| return log_hi + log_lo; | | | |
| } | | } | |
| | | | |
| /* Requires |x.y| > |y.y|. 8 DP operations */ | | /* Requires |x.y| > |y.y|. 8 DP operations */ | |
| __device_func__(double2 __internal_ddadd_xgty (double2 x, double2 y)) | | __device_func__(double2 __internal_ddadd_xgty (double2 x, double2 y)) | |
| { | | { | |
|
| double2 z; | | double2 z; | |
| #if defined(__GNUC__) && !defined(__CUDABE__) | | #if defined(__GNUC__) && !defined(__CUDABE__) | |
|
| volatile double r, s, e; | | volatile | |
| #else | | | |
| double r, s, e; | | | |
| #endif | | #endif | |
|
| r = x.y + y.y; | | double r, s, e; | |
| e = x.y - r; | | r = x.y + y.y; | |
| s = ((e + y.y) + y.x) + x.x; | | e = x.y - r; | |
| z.y = e = r + s; | | s = ((e + y.y) + y.x) + x.x; | |
| z.x = (r - e) + s; | | z.y = e = r + s; | |
| return z; | | z.x = (r - e) + s; | |
| | | return z; | |
| } | | } | |
| | | | |
| /* Take full advantage of FMA. Only 8 DP operations */ | | /* Take full advantage of FMA. Only 8 DP operations */ | |
| __device_func__(double2 __internal_ddmul (double2 x, double2 y)) | | __device_func__(double2 __internal_ddmul (double2 x, double2 y)) | |
| { | | { | |
| #if defined(__GNUC__) && !defined(__CUDABE__) | | #if defined(__GNUC__) && !defined(__CUDABE__) | |
|
| volatile double e; | | volatile | |
| #else | | | |
| double e; | | | |
| #endif | | #endif | |
|
| double2 t, z; | | double e; | |
| t.y = x.y * y.y; | | double2 t, z; | |
| t.x = __fma_rn (x.y, y.y, -t.y); | | t.y = x.y * y.y; | |
| t.x = __fma_rn (x.x, y.x, t.x); | | t.x = __fma_rn (x.y, y.y, -t.y); | |
| t.x = __fma_rn (x.y, y.x, t.x); | | t.x = __fma_rn (x.x, y.x, t.x); | |
| t.x = __fma_rn (x.x, y.y, t.x); | | t.x = __fma_rn (x.y, y.x, t.x); | |
| z.y = e = t.y + t.x; | | t.x = __fma_rn (x.x, y.y, t.x); | |
| z.x = (t.y - e) + t.x; | | z.y = e = t.y + t.x; | |
| return z; | | z.x = (t.y - e) + t.x; | |
| | | return z; | |
| } | | } | |
| | | | |
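These two helpers implement double-double arithmetic: a value is carried as an unevaluated sum with head .y and tail .x, and the multiply leans on FMA, since __fma_rn(x.y, y.y, -t.y) yields exactly the rounding error of the head product. The same product routine as host C, with C99 fma replacing __fma_rn and the fields renamed hi/lo for readability:

    #include <math.h>

    typedef struct { double hi, lo; } dd;  /* value = hi + lo */

    /* Sketch: double-double multiply, mirroring __internal_ddmul. */
    static dd dd_mul_sketch(dd x, dd y)
    {
      dd z;
      double p = x.hi * y.hi;
      double e = fma(x.hi, y.hi, -p);      /* exact error of the head product  */
      e = fma(x.lo, y.lo, e);              /* fold in the cross and tail terms */
      e = fma(x.hi, y.lo, e);
      e = fma(x.lo, y.hi, e);
      z.hi = p + e;
      z.lo = (p - z.hi) + e;               /* renormalize into head + tail     */
      return z;
    }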
| __device_func__(double2 __internal_log_ext_prec(double a)) | | __device_func__(double2 __internal_log_ext_prec(double a)) | |
| { | | { | |
| double2 res; | | double2 res; | |
| double2 qq, cc, uu, tt; | | double2 qq, cc, uu, tt; | |
| double f, g, u, v, q, ulo, tmp, m; | | double f, g, u, v, q, ulo, tmp, m; | |
| int ilo, ihi, expo; | | int ilo, ihi, expo; | |
| | | | |
| ihi = __double2hiint(a); | | ihi = __double2hiint(a); | |
| | | | |
| skipping to change at line 668 | | skipping to change at line 675 | |
| ihi = __double2hiint(a); | | ihi = __double2hiint(a); | |
| ilo = __double2loint(a); | | ilo = __double2loint(a); | |
| expo = (ihi >> 20) & 0x7ff; | | expo = (ihi >> 20) & 0x7ff; | |
| expo -= 54; | | expo -= 54; | |
| } | | } | |
| expo -= 1023; | | expo -= 1023; | |
| /* log(a) = log(m*2^expo) = | | /* log(a) = log(m*2^expo) = | |
| log(m) + log(2)*expo, if m < sqrt(2), | | log(m) + log(2)*expo, if m < sqrt(2), | |
| log(m*0.5) + log(2)*(expo+1), if m >= sqrt(2) | | log(m*0.5) + log(2)*(expo+1), if m >= sqrt(2) | |
| */ | | */ | |
|
| m = __hiloint2double((ihi & 0x800fffff) | 0x3ff00000, ilo); | | ihi = (ihi & 0x800fffff) | 0x3ff00000; | |
| if (m > CUDART_SQRT_TWO) { | | m = __hiloint2double (ihi, ilo); | |
| | | if ((unsigned)ihi > (unsigned)0x3ff6a09e) { | |
| m = __internal_half(m); | | m = __internal_half(m); | |
| expo = expo + 1; | | expo = expo + 1; | |
| } | | } | |
| /* compute log(m) with extended precision using an algorithm derived from | | /* compute log(m) with extended precision using an algorithm derived from | |
| * P.T.P. Tang, "Table Driven Implementation of the Logarithm Function", | | * P.T.P. Tang, "Table Driven Implementation of the Logarithm Function", | |
| * TOMS, Vol. 16, No. 4, December 1990, pp. 378-400. A modified polynomial | | * TOMS, Vol. 16, No. 4, December 1990, pp. 378-400. A modified polynomial | |
| * approximation to atanh(x) on the interval [-0.1716, 0.1716] is utilized. | | * approximation to atanh(x) on the interval [-0.1716, 0.1716] is utilized. | |
| */ | | */ | |
| f = m - 1.0; | | f = m - 1.0; | |
| g = m + 1.0; | | g = m + 1.0; | |
| | | | |
| skipping to change at line 737 | | skipping to change at line 745 | |
| __device_func__(double __cuda_log10(double a)) | | __device_func__(double __cuda_log10(double a)) | |
| { | | { | |
| double t; | | double t; | |
| t = __cuda_log(a); | | t = __cuda_log(a); | |
| return __fma_rn (t, CUDART_LGE_HI, t * CUDART_LGE_LO); | | return __fma_rn (t, CUDART_LGE_HI, t * CUDART_LGE_LO); | |
| } | | } | |
| | | | |
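__cuda_log10 multiplies ln(a) by log10(e) carried as a head/tail pair, so the constant effectively contributes well beyond 53 bits: fma(t, LGE_HI, t * LGE_LO). A sketch of the idea, deriving the split from long double at runtime purely for illustration (the device code uses the fixed CUDART_LGE_HI/CUDART_LGE_LO constants; on targets where long double is no wider than double the tail degenerates to zero):

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        long double lge = log10l(expl(1.0L));   /* log10(e) in extended precision */
        double lge_hi = (double)lge;            /* head: nearest double */
        double lge_lo = (double)(lge - lge_hi); /* tail: what the head dropped */
        double a = 987.654321;
        double t = log(a);
        double r = fma(t, lge_hi, t * lge_lo);  /* t*log10(e), tail folded in */
        printf("%.17g vs %.17g\n", r, log10(a));
        return 0;
    }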
| __device_func__(double __cuda_log1p(double a)) | | __device_func__(double __cuda_log1p(double a)) | |
| { | | { | |
| double t; | | double t; | |
| #if !defined(__CUDABE__) | | int i; | |
| if (__cuda___isnan(a)) { | | | |
| return a + a; | | i = __double2hiint(a); | |
| } | | if (((unsigned)i < (unsigned)0x3fe55555) || ((int)i < (int)0xbfd99999)) { | |
| #endif | | /* Compute log(a+1) = 2*atanh(a/(a+2)) */ | |
| if ((a < -0.4) || (a > CUDART_TWOTHIRD)) { | | t = a + 2.0; | |
| return __cuda_log (a + 1.0); | | t = a / t; | |
| | | t = -a * t; | |
| | | t = __internal_atanh_kernel(a, t); | |
| | | return t; | |
| } | | } | |
| /* Compute log(a+1) = 2*atanh(a/(a+2)) */ | | return __cuda_log (a + CUDART_ONE); | |
| t = a + 2.0; | | | |
| t = a / t; | | | |
| t = -a * t; | | | |
| t = __internal_atanh_kernel(a, t); | | | |
| return t; | | | |
| } | | } | |
| | | | |
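The fast path rests on the identity log(1+a) = 2*atanh(a/(a+2)), which never forms 1+a and so avoids cancellation for small |a|. A quick C illustration of why the identity helps (libm atanh stands in for the device kernel):

    #include <math.h>
    #include <stdio.h>

    static double log1p_via_atanh(double a)
    {
        return 2.0 * atanh(a / (a + 2.0));
    }

    int main(void)
    {
        double a = 1e-9;
        printf("identity: %.17g\n", log1p_via_atanh(a)); /* keeps the -a*a/2 term */
        printf("naive:    %.17g\n", log(1.0 + a));       /* 1+a already rounded */
        return 0;
    }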
| __device_func__(double __internal_exp_kernel(double a, int scale)) | | __device_func__(double __internal_exp_kernel(double a, int scale)) | |
| { | | { | |
| double t, fac, z; | | double t, fac, z; | |
| int i; | | int i; | |
| /* exp(a) = 2^(rint(a/log(2)) + z) = 2^(i + z) */ | | /* exp(a) = 2^(rint(a/log(2)) + z) = 2^(i + z) */ | |
| t = __cuda_rint (a * CUDART_L2E); | | t = __cuda_rint (a * CUDART_L2E); | |
| i = (int)t; | | i = (int)t; | |
| z = __fma_rn (t, -CUDART_LN2_HI, a); | | z = __fma_rn (t, -CUDART_LN2_HI, a); | |
| z = __fma_rn (t, -CUDART_LN2_LO, z); | | z = __fma_rn (t, -CUDART_LN2_LO, z); | |
| fac = 2.0; | | fac = 2.0; | |
| if (i <= -1021) { | | if (i <= -1021) { | |
| i += 55; | | i += 55; | |
| fac = CUDART_TWO_TO_M54; | | fac = CUDART_TWO_TO_M54; | |
| } | | } | |
| | | /* exp(a) = 2^i * e^z */ | |
| t = __internal_expm1_kernel(z); | | t = __internal_expm1_kernel(z); | |
| /* exp(a) = 2^i * 2^z */ | | z = __internal_exp2i_kernel(i + scale - 1); | |
| z = __hiloint2double((1022 + i + scale) << 20, 0); | | | |
| t = __fma_rn (t, z, z); | | t = __fma_rn (t, z, z); | |
| t = t * fac; | | t = t * fac; | |
| return t; | | return t; | |
| } | | } | |
| | | | |
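__internal_exp_kernel reduces via exp(a) = 2^i * exp(z), with i = rint(a*log2(e)) and z = a - i*log(2) computed against a head/tail split of log(2) in two FMAs; the fac/i += 55 business then avoids building a 2^(i-1) scale factor below the normal exponent range. A host sketch of the core reduction, with ldexp() and expm1() as stand-ins for the device helpers and the split derived from long double for illustration:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        long double ln2 = logl(2.0L);
        double ln2_hi = (double)ln2;
        double ln2_lo = (double)(ln2 - ln2_hi);
        double a = 10.0;
        double t = rint(a * 1.4426950408889634);  /* a * log2(e) */
        int    i = (int)t;
        double z = fma(t, -ln2_hi, a);            /* a - i*ln2, head */
        z = fma(t, -ln2_lo, z);                   /* fold in the tail */
        double r = ldexp(expm1(z) + 1.0, i);      /* 2^i * exp(z) */
        printf("%.17g vs %.17g\n", r, exp(a));
        return 0;
    }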
| __device_func__(double __cuda_exp(double a)) | | __device_func__(double __cuda_exp(double a)) | |
| { | | { | |
| if (a > CUDART_LN2_X_1024) { | | double t; | |
| return CUDART_INF; | | int i; | |
| | | i = __double2hiint(a); | |
| | | if (((unsigned)i < (unsigned)0x40862e43) || ((int)i < (int)0xC0874911)) { | |
| | | t = __internal_exp_kernel(a, 0); | |
| | | return t; | |
| } | | } | |
| if (a <= -CUDART_LN2_X_1075) { | | t = ((unsigned int)i >> 31) ? CUDART_ZERO : CUDART_INF; | |
| return CUDART_ZERO; | | if (__cuda___isnan(a)) { | |
| | | t = a + a; | |
| } | | } | |
| a = __internal_exp_kernel(a, 0); | | return t; | |
| return a; | | | |
| } | | } | |
| | | | |
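The rewritten __cuda_exp classifies the argument with one look at the high 32 bits: 0x40862e43 is the high word of ~709.78 (about log(DBL_MAX)) and 0xC0874911 that of ~-745.13, beyond which exp underflows to zero; NaN high words fail both compares and fall through to the special-case tail. A host sketch of the test, where double2hiint is a hypothetical stand-in for __double2hiint (link with -lm):

    #include <math.h>
    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    static int double2hiint(double a)        /* stand-in for __double2hiint */
    {
        uint64_t bits;
        memcpy(&bits, &a, sizeof bits);
        return (int)(uint32_t)(bits >> 32);
    }

    int main(void)
    {
        double vals[5] = { 1.0, 700.0, 800.0, -800.0, 0.0 };
        vals[4] = nan("");
        for (int k = 0; k < 5; k++) {
            int i = double2hiint(vals[k]);
            int in_range = ((unsigned)i < 0x40862e43u) || (i < (int)0xC0874911u);
            printf("%-8g hi=0x%08x in_range=%d\n", vals[k], (unsigned)i, in_range);
        }
        return 0;
    }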
| __device_func__(double __cuda_exp2(double a)) | | __device_func__(double __cuda_exp2(double a)) | |
| { | | { | |
| double z; | | double z; | |
| double t; | | double t; | |
| double fac; | | double fac; | |
| int i; | | int i; | |
| | | | |
| if (a >= 1024.0) { | | i = __double2hiint(a); | |
| return CUDART_INF; | | if (((unsigned)i < (unsigned)0x40900000) || ((int)i < (int)0xc090cc00)) { | |
| } | | t = __cuda_rint (a); | |
| if (a < -1075.0) { | | z = a - t; | |
| return CUDART_ZERO; | | i = (int)t; | |
| | | fac = 2.0; | |
| | | if (i <= -1021) { | |
| | | i += 55; | |
| | | fac = CUDART_TWO_TO_M54; | |
| | | } | |
| | | /* 2^z = exp(log(2)*z) */ | |
| | | z = __fma_rn (z, CUDART_LN2_HI, z * CUDART_LN2_LO); | |
| | | t = __internal_expm1_kernel(z); | |
| | | z = __internal_exp2i_kernel(i - 1); | |
| | | t = __fma_rn (t, z, z); | |
| | | t = t * fac; | |
| | | return t; | |
| } | | } | |
| t = __cuda_rint (a); | | t = ((unsigned int)i >> 31) ? CUDART_ZERO : CUDART_INF; | |
| z = a - t; | | if (__cuda___isnan(a)) { | |
| i = (int)t; | | t = a + a; | |
| fac = 2.0; | | | |
| if (i <= -1021) { | | | |
| i += 55; | | | |
| fac = CUDART_TWO_TO_M54; | | | |
| } | | } | |
| /* 2^z = exp(log(2)*z) */ | | | |
| z = __fma_rn (z, CUDART_LN2_HI, z * CUDART_LN2_LO); | | | |
| t = __internal_expm1_kernel(z); | | | |
| z = __hiloint2double((1022 + i) << 20, 0); | | | |
| t = __fma_rn (t, z, z); | | | |
| t = t * fac; | | | |
| return t; | | return t; | |
| } | | } | |
| | | | |
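Both versions build the 2^k scale factor directly from bits: the old code writes (1022 + i) << 20 into the high word via __hiloint2double, and __internal_exp2i_kernel packages the same trick. Viewed as a flat 64-bit word the shift is by 52. A sketch, assuming k stays in the normal exponent range:

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    static double exp2i_sketch(int k)     /* assumes -1022 <= k <= 1023 */
    {
        uint64_t bits = (uint64_t)(1023 + k) << 52;  /* biased exponent field */
        double d;
        memcpy(&d, &bits, sizeof d);      /* 2^k, no FP arithmetic involved */
        return d;
    }

    int main(void)
    {
        printf("%g %g %g\n", exp2i_sketch(0), exp2i_sketch(10), exp2i_sketch(-10));
        return 0;
    }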
| __device_func__(double __cuda_exp10(double a)) | | __device_func__(double __cuda_exp10(double a)) | |
| { | | { | |
| double z; | | double z; | |
| double t; | | double t; | |
| double fac; | | double fac; | |
| int i; | | int i; | |
| | | | |
| if (a >= CUDART_LG2_X_1024) { | | i = __double2hiint(a); | |
| return CUDART_INF; | | if (((unsigned)i < (unsigned)0x40734414) || ((int)i < (int)0xc07439b8)) { | |
| } | | t = __cuda_rint (a * CUDART_L2T); | |
| if (a < -CUDART_LG2_X_1075) { | | i = (int)t; | |
| return CUDART_ZERO; | | z = __fma_rn (t, -CUDART_LG2_HI, a); | |
| | | z = __fma_rn (t, -CUDART_LG2_LO, z); | |
| | | fac = 2.0; | |
| | | if (i <= -1021) { | |
| | | i += 55; | |
| | | fac = CUDART_TWO_TO_M54; | |
| | | } | |
| | | /* 10^z = exp(log(10)*z) */ | |
| | | z = __fma_rn (z, CUDART_LNT_HI, z * CUDART_LNT_LO); | |
| | | t = __internal_expm1_kernel(z); | |
| | | z = __internal_exp2i_kernel(i - 1); | |
| | | t = __fma_rn (t, z, z); | |
| | | t = t * fac; | |
| | | return t; | |
| } | | } | |
| t = __cuda_rint (a * CUDART_L2T); | | t = ((unsigned int)i >> 31) ? CUDART_ZERO : CUDART_INF; | |
| i = (int)t; | | if (__cuda___isnan(a)) { | |
| z = __fma_rn (t, -CUDART_LG2_HI, a); | | t = a + a; | |
| z = __fma_rn (t, -CUDART_LG2_LO, z); | | | |
| fac = 2.0; | | | |
| if (i <= -1021) { | | | |
| i += 55; | | | |
| fac = CUDART_TWO_TO_M54; | | | |
| } | | } | |
| /* 10^z = exp(log(10)*z) */ | | | |
| z = __fma_rn (z, CUDART_LNT_HI, z * CUDART_LNT_LO); | | | |
| t = __internal_expm1_kernel(z); | | | |
| /* exp10(a) = 2^i * 10^z */ | | | |
| z = __hiloint2double((1022 + i) << 20, 0); | | | |
| t = __fma_rn (t, z, z); | | | |
| t = t * fac; | | | |
| return t; | | return t; | |
| } | | } | |
| | | | |
| __device_func__(double __cuda_expm1(double a)) | | __device_func__(double __cuda_expm1(double a)) | |
| { | | { | |
| double t, z, u; | | double t, z, u; | |
| int i, j; | | int i, j, k; | |
| if (a > CUDART_LN2_X_1024) { | | | |
| return CUDART_INF; | | k = __double2hiint(a); | |
| } | | if (((unsigned)k < (unsigned)0x40862e43) || ((int)k < (int)0xc04a8000)) { | |
| if (a < -53.0) { | | t = __cuda_rint (a * CUDART_L2E); | |
| return -1.0; | | i = (int)t; | |
| | | z = __fma_rn (t, -CUDART_LN2_HI, a); | |
| | | z = __fma_rn (t, -CUDART_LN2_LO, z); | |
| | | k = k + k; | |
| | | if ((unsigned)k < (unsigned)0x7fb3e647) { | |
| | | z = a; | |
| | | i = 0; | |
| | | } | |
| | | t = __internal_expm1_kernel(z); | |
| | | j = i; | |
| | | if (i == 1024) j--; | |
| | | u = __internal_exp2i_kernel(j); | |
| | | a = u - 1.0; | |
| | | t = __fma_rn (t, u, a); | |
| | | if (i == 1024) t = t + t; | |
| | | if (k == 0) t = z; /* preserve -0 */ | |
| | | return t; | |
| } | | } | |
| t = __cuda_rint (a * CUDART_L2E); | | t = ((unsigned int)k >> 31) ? -CUDART_ONE : CUDART_INF; | |
| i = (int)t; | | if (__cuda___isnan(a)) { | |
| z = __fma_rn (t, -CUDART_LN2_HI, a); | | t = a + a; | |
| z = __fma_rn (t, -CUDART_LN2_LO, z); | | | |
| if (__cuda_fabs(a) < 0.405465108) { | | | |
| z = a; | | | |
| i = 0; | | | |
| } | | } | |
| j = (i == 1024) ? (i - 1) : i; | | | |
| t = __internal_expm1_kernel(z); | | | |
| u = __hiloint2double((1023 + j) << 20, 0); | | | |
| a = u - 1.0; | | | |
| t = __fma_rn (t, u, a); | | | |
| if (z == 0.0) t = z; /* preserve -0 */ | | | |
| if (i == 1024) t = t + t; | | | |
| return t; | | return t; | |
| } | | } | |
| | | | |
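The reconstruction step is the part worth noting: with a = i*log(2) + z, expm1(a) = 2^i*expm1(z) + (2^i - 1), and evaluating it as fma(t, u, u - 1.0) keeps the cancelling +/-1 terms accurate (the i == 1024 shuffle handles the very top of the range, and the k == 0 test preserves -0). A host sketch of the main path, with libm expm1() in the kernel's role:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        long double ln2 = logl(2.0L);
        double ln2_hi = (double)ln2, ln2_lo = (double)(ln2 - ln2_hi);
        double a = 3.5;
        double t = rint(a * 1.4426950408889634);     /* i = rint(a/log 2) */
        int    i = (int)t;
        double z = fma(t, -ln2_hi, a);
        z = fma(t, -ln2_lo, z);
        if (fabs(a) < 0.405465108) { z = a; i = 0; } /* small a: skip reduction */
        double u = ldexp(1.0, i);                    /* u = 2^i */
        double r = fma(expm1(z), u, u - 1.0);        /* 2^i*expm1(z) + (2^i - 1) */
        printf("%.17g vs %.17g\n", r, expm1(a));
        return 0;
    }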
| __device_func__(double __cuda_cosh(double a)) | | __device_func__(double __cuda_cosh(double a)) | |
| { | | { | |
| double z; | | double z; | |
| if (__cuda___isnan(a)) { | | int i; | |
| | | | |
| | | z = __cuda_fabs(a); | |
| | | i = __double2hiint(z); | |
| | | if ((unsigned)i < (unsigned)0x408633cf) { | |
| | | z = __internal_exp_kernel(z, -2); | |
| | | z = __fma_rn(2.0, z, 0.125 / z); | |
| | | return z; | |
| | | } else { | |
| | | if (z > 0.0) a = CUDART_INF_F; | |
| return a + a; | | return a + a; | |
| } | | } | |
| a = __cuda_fabs(a); | | | |
| z = __internal_exp_kernel(a, -2); | | | |
| z = __fma_rn(2.0, z, 0.125 / z); | | | |
| if (a >= CUDART_LN2_X_1025) { | | | |
| z = CUDART_INF_F; /* overflow -> infinity */ | | | |
| } | | | |
| return z; | | | |
| } | | } | |
| | | | |
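The cosh path evaluates z = exp(|a|)/4 (the -2 scale argument folds the division into the exponent inside the kernel, so the intermediate stays finite a little longer) and then 2*z + 0.125/z = (e^|a| + e^-|a|)/2. A direct C rendering of that identity, valid away from the overflow cutoff:

    #include <math.h>
    #include <stdio.h>

    static double cosh_sketch(double a)
    {
        double z = exp(fabs(a)) * 0.25;   /* device code scales inside exp */
        return fma(2.0, z, 0.125 / z);    /* 2z + 1/(8z) = (e^|a| + e^-|a|)/2 */
    }

    int main(void)
    {
        printf("%.17g vs %.17g\n", cosh_sketch(2.0), cosh(2.0));
        return 0;
    }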
| __device_func__(double __cuda_sinh(double a)) | | __device_func__(double __cuda_sinh(double a)) | |
| { | | { | |
| double s, z; | | double s, z; | |
| s = a; | | s = a; | |
| a = __cuda_fabs(a); | | a = __cuda_fabs(a); | |
| if (a < 1.0) { /* danger of catastrophic cancellation */ | | if (a < 1.0) { /* danger of catastrophic cancellation */ | |
| double a2 = a * a; | | double a2 = a * a; | |
| /* approximate sinh(x) on [0,1] with a polynomial */ | | /* approximate sinh(x) on [0,1] with a polynomial */ | |
| | | | |
| skipping to change at line 957 | | skipping to change at line 977 | |
| t = __fma_rn (t, a, a); | | t = __fma_rn (t, a, a); | |
| a = __cuda_copysign(t, a); | | a = __cuda_copysign(t, a); | |
| } | | } | |
| return a; | | return a; | |
| } | | } | |
| | | | |
| __device_func__(double __internal_atan_kernel(double a)) | | __device_func__(double __internal_atan_kernel(double a)) | |
| { | | { | |
| double t, a2; | | double t, a2; | |
| a2 = a * a; | | a2 = a * a; | |
| t = -2.0258553044438358E-005 ; | | t = -2.0258553044438358E-005 ; | |
| t = __fma_rn (t, a2, 2.2302240345758510E-004); | | t = __fma_rn (t, a2, 2.2302240345758510E-004); | |
| t = __fma_rn (t, a2, -1.1640717779930576E-003); | | t = __fma_rn (t, a2, -1.1640717779930576E-003); | |
| t = __fma_rn (t, a2, 3.8559749383629918E-003); | | t = __fma_rn (t, a2, 3.8559749383629918E-003); | |
| t = __fma_rn (t, a2, -9.1845592187165485E-003); | | t = __fma_rn (t, a2, -9.1845592187165485E-003); | |
| t = __fma_rn (t, a2, 1.6978035834597331E-002); | | t = __fma_rn (t, a2, 1.6978035834597331E-002); | |
| t = __fma_rn (t, a2, -2.5826796814495994E-002); | | t = __fma_rn (t, a2, -2.5826796814495994E-002); | |
| t = __fma_rn (t, a2, 3.4067811082715123E-002); | | t = __fma_rn (t, a2, 3.4067811082715123E-002); | |
| t = __fma_rn (t, a2, -4.0926382420509971E-002); | | t = __fma_rn (t, a2, -4.0926382420509971E-002); | |
| t = __fma_rn (t, a2, 4.6739496199157994E-002); | | t = __fma_rn (t, a2, 4.6739496199157994E-002); | |
| t = __fma_rn (t, a2, -5.2392330054601317E-002); | | t = __fma_rn (t, a2, -5.2392330054601317E-002); | |
| | | | |
| skipping to change at line 1049 | | skipping to change at line 1069 | |
| r = __fma_rn (r, b, 3.038188875134962E-002); | | r = __fma_rn (r, b, 3.038188875134962E-002); | |
| r = __fma_rn (r, b, 4.464285849810986E-002); | | r = __fma_rn (r, b, 4.464285849810986E-002); | |
| r = __fma_rn (r, b, 7.499999998342270E-002); | | r = __fma_rn (r, b, 7.499999998342270E-002); | |
| r = __fma_rn (r, b, 1.666666666667375E-001); | | r = __fma_rn (r, b, 1.666666666667375E-001); | |
| r = r * b; | | r = r * b; | |
| return r; | | return r; | |
| } | | } | |
| | | | |
| __device_func__(double __cuda_asin(double a)) | | __device_func__(double __cuda_asin(double a)) | |
| { | | { | |
| double t0, t1; | | double fa, t0, t1; | |
| t0 = __cuda_fabs(a); | | int ihi, ahi; | |
| if (t0 > 1.0) { | | ahi = __double2hiint(a); | |
| return CUDART_NAN; | | fa = __cuda_fabs(a); | |
| } | | ihi = __double2hiint(fa); | |
| if (t0 > 0.575) { | | if (ihi < 0x3fe26666) { | |
| t1 = __fma_rn (-0.5, t0, 0.5); | | t1 = fa * fa; | |
| | | t1 = __internal_asin_kernel (fa, t1); | |
| | | t1 = __fma_rn (t1, fa, fa); | |
| | | t1 = __cuda_copysign(t1, a); | |
| | | } else { | |
| | | t1 = __fma_rn (-0.5, fa, 0.5); | |
| t0 = __cuda_sqrt (t1); | | t0 = __cuda_sqrt (t1); | |
| t1 = __internal_asin_kernel (t0, t1); | | t1 = __internal_asin_kernel (t0, t1); | |
| t0 = -2.0 * t0; | | t0 = -2.0 * t0; | |
| t1 = __fma_rn (t0, t1, CUDART_PIO2_LO); | | t1 = __fma_rn (t0, t1, CUDART_PIO2_LO); | |
| t0 = t0 + CUDART_PIO4_HI; | | t0 = t0 + CUDART_PIO4_HI; | |
| t1 = t0 + t1; | | t1 = t0 + t1; | |
| t1 = t1 + CUDART_PIO4_HI; | | t1 = t1 + CUDART_PIO4_HI; | |
| } else { | | if (ahi < 0x3ff00000) { | |
| t1 = t0 * t0; | | t1 = __cuda_copysign(t1, a); | |
| t1 = __internal_asin_kernel (t0, t1); | | } | |
| t1 = __fma_rn (t1, t0, t0); | | | |
| } | | } | |
| return __cuda_copysign(t1, a); | | return t1; | |
| } | | } | |
| | | | |
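Both branches feed the same polynomial kernel; the 0x3fe26666 cut (about 0.575) picks between the direct series and the reflection asin(x) = pi/2 - 2*asin(sqrt((1-x)/2)), with pi/2 applied as PIO4_HI + PIO4_HI + PIO2_LO to keep the constant accurate. A sketch of the reflection, with libm asin() standing in for __internal_asin_kernel:

    #include <math.h>
    #include <stdio.h>

    static const double pi_2 = 1.5707963267948966; /* pi/2 rounded to double */

    static double asin_large(double x)   /* assumes 0.575 < x <= 1.0 */
    {
        double t = sqrt(fma(-0.5, x, 0.5));  /* sqrt((1-x)/2) */
        return fma(-2.0, asin(t), pi_2);     /* pi/2 - 2*asin(t) */
    }

    int main(void)
    {
        printf("%.17g vs %.17g\n", asin_large(0.9), asin(0.9));
        return 0;
    }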
| __device_func__(double __cuda_acos(double a)) | | __device_func__(double __cuda_acos(double a)) | |
| { | | { | |
| double t0, t1; | | double t0, t1; | |
| | | int ihi, ahi; | |
| | | | |
| #if !defined(__CUDABE__) | | #if !defined(__CUDABE__) | |
| if (__cuda___isnan(a)) { | | if (__cuda___isnan(a)) { | |
| return a + a; | | return a + a; | |
| } | | } | |
| #endif | | #endif | |
| | | ahi = __double2hiint(a); | |
| t0 = __cuda_fabs (a); | | t0 = __cuda_fabs (a); | |
|
| t1 = __fma_rn (-0.5, t0, 0.5); | | if (ihi < 0x3fe26666) { | |
| t0 = __cuda_sqrt(t1); | | | |
| t1 = __internal_asin_kernel (t0, t1); | | | |
| t0 = __fma_rn (t1, t0, t0); | | | |
| t0 = 2.0 * t0; | | | |
| if (__cuda___signbit(a)) { | | | |
| t0 = __fma_rn (1.0, t0, -CUDART_PI_LO); | | | |
| t0 = CUDART_PI_HI - t0; | | | |
| } | | | |
| } else { | | | |
| t1 = t0 * t0; | | t1 = t0 * t0; | |
| t1 = __internal_asin_kernel (t0, t1); | | t1 = __internal_asin_kernel (t0, t1); | |
| t0 = __fma_rn (t1, t0, t0); | | t0 = __fma_rn (t1, t0, t0); | |
| if (__cuda___signbit(a)) { | | if ((unsigned)ahi >= (unsigned)0x80000000) { | |
| t0 = __fma_rn (1.0, t0, +CUDART_PIO2_LO); | | t0 = __fma_rn (1.0, t0, +CUDART_PIO2_LO); | |
| t0 = CUDART_PIO2_HI + t0; | | t0 = CUDART_PIO2_HI + t0; | |
| } else { | | } else { | |
| t0 = __fma_rn (1.0, t0, -CUDART_PIO2_LO); | | t0 = __fma_rn (1.0, t0, -CUDART_PIO2_LO); | |
| t0 = CUDART_PIO2_HI - t0; | | t0 = CUDART_PIO2_HI - t0; | |
| } | | } | |
| | | } else { | |
| | | t1 = __fma_rn (-0.5, t0, 0.5); | |
| | | t0 = __cuda_sqrt(t1); | |
| | | t1 = __internal_asin_kernel (t0, t1); | |
| | | t0 = __fma_rn (t1, t0, t0); | |
| | | t0 = 2.0 * t0; | |
| | | if ((unsigned)ahi >= (unsigned)0x80000000) { | |
| | | t0 = __fma_rn (1.0, t0, -CUDART_PI_LO); | |
| | | t0 = CUDART_PI_HI - t0; | |
| | | } | |
| } | | } | |
| return t0; | | return t0; | |
| } | | } | |
| | | | |
| __device_func__(double __cuda_acosh(double a)) | | __device_func__(double __cuda_acosh(double a)) | |
| { | | { | |
| double t; | | double t; | |
| #if !defined(__CUDABE__) | | #if !defined(__CUDABE__) | |
| if (__cuda___isnan(a)) { | | if (__cuda___isnan(a)) { | |
| return a + a; | | return a + a; | |
| | | | |
| skipping to change at line 1382 | | skipping to change at line 1410 | |
| r = __fma_rn (r, q, 5.22397760611847340E-003); | | r = __fma_rn (r, q, 5.22397760611847340E-003); | |
| r = __fma_rn (r, q, -2.68661706431114690E-002); | | r = __fma_rn (r, q, -2.68661706431114690E-002); | |
| r = __fma_rn (r, q, 1.12837916709441850E-001); | | r = __fma_rn (r, q, 1.12837916709441850E-001); | |
| r = __fma_rn (r, q, -3.76126389031835210E-001); | | r = __fma_rn (r, q, -3.76126389031835210E-001); | |
| r = __fma_rn (r, q, 1.12837916709551260E+000); | | r = __fma_rn (r, q, 1.12837916709551260E+000); | |
| a = r * a; | | a = r * a; | |
| } | | } | |
| return a; | | return a; | |
| } | | } | |
| | | | |
| | | __device_func__(double __cuda_erfinv(double a)) | |
| | | { | |
| | | double fa, t; | |
| | | | |
| | | fa = fabs(a); | |
| | | if (fa >= 1.0) { | |
| | | t = CUDART_NAN; /* NaN */ | |
| | | if (fa == 1.0) { | |
| | | t = a * CUDART_INF; /* Infinity */ | |
| | | } | |
| | | } else if (fa >= 0.9375) { | |
| | | /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev | |
| | | Approximations for the Inverse of the Error Function. Mathematics of | |
| | | Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 59 | |
| | | */ | |
| | | double p, q; | |
| | | | |
| | | t = __cuda_log1p(-fa); | |
| | | t = __cuda_rsqrt(-t); | |
| | | p = 2.7834010353747001060e-3; | |
| | | p = __fma_rn (p, t, 8.6030097526280260580e-1); | |
| | | p = __fma_rn (p, t, 2.1371214997265515515e+0); | |
| | | p = __fma_rn (p, t, 3.1598519601132090206e+0); | |
| | | p = __fma_rn (p, t, 3.5780402569085996758e+0); | |
| | | p = __fma_rn (p, t, 1.5335297523989890804e+0); | |
| | | p = __fma_rn (p, t, 3.4839207139657522572e-1); | |
| | | p = __fma_rn (p, t, 5.3644861147153648366e-2); | |
| | | p = __fma_rn (p, t, 4.3836709877126095665e-3); | |
| | | p = __fma_rn (p, t, 1.3858518113496718808e-4); | |
| | | p = __fma_rn (p, t, 1.1738352509991666680e-6); | |
| | | q = t+ 2.2859981272422905412e+0; | |
| | | q = __fma_rn (q, t, 4.3859045256449554654e+0); | |
| | | q = __fma_rn (q, t, 4.6632960348736635331e+0); | |
| | | q = __fma_rn (q, t, 3.9846608184671757296e+0); | |
| | | q = __fma_rn (q, t, 1.6068377709719017609e+0); | |
| | | q = __fma_rn (q, t, 3.5609087305900265560e-1); | |
| | | q = __fma_rn (q, t, 5.3963550303200816744e-2); | |
| | | q = __fma_rn (q, t, 4.3873424022706935023e-3); | |
| | | q = __fma_rn (q, t, 1.3858762165532246059e-4); | |
| | | q = __fma_rn (q, t, 1.1738313872397777529e-6); | |
| | | t = p / (q * t); | |
| | | if (a < 0.0) t = -t; | |
| | | } else if (fa >= 0.75) { | |
| | | /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev | |
| | | Approximations for the Inverse of the Error Function. Mathematics of | |
| | | Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 39 | |
| | | */ | |
| | | double p, q; | |
| | | | |
| | | t = __fma_rn (a, a, -.87890625); | |
| | | p = .21489185007307062000e+0; | |
| | | p = __fma_rn (p, t, -.64200071507209448655e+1); | |
| | | p = __fma_rn (p, t, .29631331505876308123e+2); | |
| | | p = __fma_rn (p, t, -.47644367129787181803e+2); | |
| | | p = __fma_rn (p, t, .34810057749357500873e+2); | |
| | | p = __fma_rn (p, t, -.12954198980646771502e+2); | |
| | | p = __fma_rn (p, t, .25349389220714893917e+1); | |
| | | p = __fma_rn (p, t, -.24758242362823355486e+0); | |
| | | p = __fma_rn (p, t, .94897362808681080020e-2); | |
| | | q = t -.12831383833953226499e+2; | |
| | | q = __fma_rn (q, t, .41409991778428888716e+2); | |
| | | q = __fma_rn (q, t, -.53715373448862143349e+2); | |
| | | q = __fma_rn (q, t, .33880176779595142685e+2); | |
| | | q = __fma_rn (q, t, -.11315360624238054876e+2); | |
| | | q = __fma_rn (q, t, .20369295047216351160e+1); | |
| | | q = __fma_rn (q, t, -.18611650627372178511e+0); | |
| | | q = __fma_rn (q, t, .67544512778850945940e-2); | |
| | | p = p / q; | |
| | | t = a * p; | |
| | | } else { | |
| | | /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev | |
| | | Approximations for the Inverse of the Error Function. Mathematics of | |
| | | Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 18 | |
| | | */ | |
| | | double p, q; | |
| | | | |
| | | t = __fma_rn (a, a, -.5625); | |
| | | p = -.23886240104308755900e+2; | |
| | | p = __fma_rn (p, t, .45560204272689128170e+3); | |
| | | p = __fma_rn (p, t, -.22977467176607144887e+4); | |
| | | p = __fma_rn (p, t, .46631433533434331287e+4); | |
| | | p = __fma_rn (p, t, -.43799652308386926161e+4); | |
| | | p = __fma_rn (p, t, .19007153590528134753e+4); | |
| | | p = __fma_rn (p, t, -.30786872642313695280e+3); | |
| | | q = t -.83288327901936570000e+2; | |
| | | q = __fma_rn (q, t, .92741319160935318800e+3); | |
| | | q = __fma_rn (q, t, -.35088976383877264098e+4); | |
| | | q = __fma_rn (q, t, .59039348134843665626e+4); | |
| | | q = __fma_rn (q, t, -.48481635430048872102e+4); | |
| | | q = __fma_rn (q, t, .18997769186453057810e+4); | |
| | | q = __fma_rn (q, t, -.28386514725366621129e+3); | |
| | | p = p / q; | |
| | | t = a * p; | |
| | | } | |
| | | return t; | |
| | | } | |
| | | | |
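The new __cuda_erfinv covers (-1, 1) with three rational minimax approximations from Blair, Edwards and Johnson (Tables 18, 39 and 59), switching at 0.75 and 0.9375 and using t = rsqrt(-log1p(-|a|)) as the variable in the tail region. As an independent cross-check only (not the device method), one can invert libm's erf with Newton's method, using d/dx erf(x) = 2/sqrt(pi)*exp(-x*x):

    #include <math.h>
    #include <stdio.h>

    static double erfinv_newton(double y)     /* assumes -1 < y < 1 */
    {
        double x = 0.0;
        for (int k = 0; k < 50; k++) {
            double err  = erf(x) - y;
            double step = err / (1.1283791670955126 * exp(-x * x));
            x -= step;
            if (fabs(step) < 1e-15 * (fabs(x) + 1e-300)) break;
        }
        return x;
    }

    int main(void)
    {
        double y = 0.5;
        double x = erfinv_newton(y);
        printf("erfinv(%.3f) ~= %.17g, erf back = %.17g\n", y, x, erf(x));
        return 0;
    }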
| | | __device_func__(double __cuda_erfcinv(double a)) | |
| | | { | |
| | | double t; | |
| | | #if !defined(__CUDABE__) | |
| | | if (__cuda___isnan(a)) return a + a; | |
| | | #endif | |
| | | if (a <= CUDART_ZERO) { | |
| | | t = CUDART_NAN; | |
| | | if (a == CUDART_ZERO) { | |
| | | t = (1.0 - a) * CUDART_INF; | |
| | | } | |
| | | } | |
| | | else if (a >= 0.0625) { | |
| | | t = __cuda_erfinv (1.0 - a); | |
| | | } | |
| | | else if (a >= 1e-100) { | |
| | | /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev | |
| | | Approximations for the Inverse of the Error Function. Mathematics of | |
| | | Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 59 | |
| | | */ | |
| | | double p, q; | |
| | | t = __cuda_log(a); | |
| | | t = __cuda_rsqrt(-t); | |
| | | p = 2.7834010353747001060e-3; | |
| | | p = __fma_rn (p, t, 8.6030097526280260580e-1); | |
| | | p = __fma_rn (p, t, 2.1371214997265515515e+0); | |
| | | p = __fma_rn (p, t, 3.1598519601132090206e+0); | |
| | | p = __fma_rn (p, t, 3.5780402569085996758e+0); | |
| | | p = __fma_rn (p, t, 1.5335297523989890804e+0); | |
| | | p = __fma_rn (p, t, 3.4839207139657522572e-1); | |
| | | p = __fma_rn (p, t, 5.3644861147153648366e-2); | |
| | | p = __fma_rn (p, t, 4.3836709877126095665e-3); | |
| | | p = __fma_rn (p, t, 1.3858518113496718808e-4); | |
| | | p = __fma_rn (p, t, 1.1738352509991666680e-6); | |
| | | q = t+ 2.2859981272422905412e+0; | |
| | | q = __fma_rn (q, t, 4.3859045256449554654e+0); | |
| | | q = __fma_rn (q, t, 4.6632960348736635331e+0); | |
| | | q = __fma_rn (q, t, 3.9846608184671757296e+0); | |
| | | q = __fma_rn (q, t, 1.6068377709719017609e+0); | |
| | | q = __fma_rn (q, t, 3.5609087305900265560e-1); | |
| | | q = __fma_rn (q, t, 5.3963550303200816744e-2); | |
| | | q = __fma_rn (q, t, 4.3873424022706935023e-3); | |
| | | q = __fma_rn (q, t, 1.3858762165532246059e-4); | |
| | | q = __fma_rn (q, t, 1.1738313872397777529e-6); | |
| | | t = p / (q * t); | |
| | | } | |
| | | else { | |
| | | /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev | |
| | | Approximations for the Inverse of the Error Function. Mathematics of | |
| | | Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 82 | |
| | | */ | |
| | | double p, q; | |
| | | t = __cuda_log(a); | |
| | | t = __cuda_rsqrt(-t); | |
| | | p = 6.9952990607058154858e-1; | |
| | | p = __fma_rn (p, t, 1.9507620287580568829e+0); | |
| | | p = __fma_rn (p, t, 8.2810030904462690216e-1); | |
| | | p = __fma_rn (p, t, 1.1279046353630280005e-1); | |
| | | p = __fma_rn (p, t, 6.0537914739162189689e-3); | |
| | | p = __fma_rn (p, t, 1.3714329569665128933e-4); | |
| | | p = __fma_rn (p, t, 1.2964481560643197452e-6); | |
| | | p = __fma_rn (p, t, 4.6156006321345332510e-9); | |
| | | p = __fma_rn (p, t, 4.5344689563209398450e-12); | |
| | | q = t+ 1.5771922386662040546e+0; | |
| | | q = __fma_rn (q, t, 2.1238242087454993542e+0); | |
| | | q = __fma_rn (q, t, 8.4001814918178042919e-1); | |
| | | q = __fma_rn (q, t, 1.1311889334355782065e-1); | |
| | | q = __fma_rn (q, t, 6.0574830550097140404e-3); | |
| | | q = __fma_rn (q, t, 1.3715891988350205065e-4); | |
| | | q = __fma_rn (q, t, 1.2964671850944981713e-6); | |
| | | q = __fma_rn (q, t, 4.6156017600933592558e-9); | |
| | | q = __fma_rn (q, t, 4.5344687377088206783e-12); | |
| | | t = p / (q * t); | |
| | | } | |
| | | return t; | |
| | | } | |
| | | | |
| __device_func__(double __cuda_erfc(double a)) | | __device_func__(double __cuda_erfc(double a)) | |
| { | | { | |
| double p, q, h, l; | | double p, q, h, l; | |
| if (__cuda___isnan(a)) { | | int ahi; | |
| return a + a; | | | |
| } | | ahi = __double2hiint(a); | |
| if (a <= 0.55) { | | if (ahi < (int)0x3fe80000) { | |
| return 1.0 - __cuda_erf(a); | | return 1.0 - __cuda_erf(a); | |
| } | | } | |
| if (a > 27.3) { | | if (a > 27.3) { | |
| return 0.0; | | return 0.0; | |
| } | | } | |
| if (a <= 5.0) { | | if (ahi < (int)0x40140000) { | |
| p = 5.64189549785304440E-001; | | p = 5.64189549785304440E-001; | |
| p = __fma_rn (p, a, 8.17405083437083490E+000); | | p = __fma_rn (p, a, 8.17405083437083490E+000); | |
| p = __fma_rn (p, a, 5.68958722557864720E+001); | | p = __fma_rn (p, a, 5.68958722557864720E+001); | |
| p = __fma_rn (p, a, 2.42568747802647010E+002); | | p = __fma_rn (p, a, 2.42568747802647010E+002); | |
| p = __fma_rn (p, a, 6.80381374390412930E+002); | | p = __fma_rn (p, a, 6.80381374390412930E+002); | |
| p = __fma_rn (p, a, 1.25873132236024590E+003); | | p = __fma_rn (p, a, 1.25873132236024590E+003); | |
| p = __fma_rn (p, a, 1.43925353963809330E+003); | | p = __fma_rn (p, a, 1.43925353963809330E+003); | |
| p = __fma_rn (p, a, 8.15949420587659230E+002); | | p = __fma_rn (p, a, 8.15949420587659230E+002); | |
| q = a+ 1.44881247113239940E+001; | | q = a+ 1.44881247113239940E+001; | |
| q = __fma_rn (q, a, 1.01345387970210510E+002); | | q = __fma_rn (q, a, 1.01345387970210510E+002); | |
| | | | |
| skipping to change at line 1426 | | skipping to change at line 1628 | |
| p = __fma_rn (p, a, 1.22570382896313600E+001); | | p = __fma_rn (p, a, 1.22570382896313600E+001); | |
| p = __fma_rn (p, a, 6.01884641114116460E+000); | | p = __fma_rn (p, a, 6.01884641114116460E+000); | |
| q = a+ 3.62871917534986780E+000; | | q = a+ 3.62871917534986780E+000; | |
| q = __fma_rn (q, a, 1.24663395327043550E+001); | | q = __fma_rn (q, a, 1.24663395327043550E+001); | |
| q = __fma_rn (q, a, 2.13927672803974790E+001); | | q = __fma_rn (q, a, 2.13927672803974790E+001); | |
| q = __fma_rn (q, a, 2.72082423532866070E+001); | | q = __fma_rn (q, a, 2.72082423532866070E+001); | |
| q = __fma_rn (q, a, 1.86422906830006700E+001); | | q = __fma_rn (q, a, 1.86422906830006700E+001); | |
| q = __fma_rn (q, a, 6.13809834548870550E+000); | | q = __fma_rn (q, a, 6.13809834548870550E+000); | |
| } | | } | |
| p = p / q; | | p = p / q; | |
| h = -a * a; | | h = a * a; | |
| l = __fma_rn (-a, a, -h); | | l = __fma_rn (a, a, -h); | |
| q = __internal_exp_kernel(h, 0); | | q = __internal_exp_kernel(-h, 0); | |
| q = __fma_rn (q, l, q); | | q = __fma_rn (l, -q, q); | |
| p = p * q; | | p = p * q; | |
| return p; | | return p; | |
| } | | } | |
| | | | |
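The tail of __cuda_erfc shows a compensated exp(-a*a): h = a*a discards low-order bits of the square, l = fma(a, a, -h) recovers them exactly, and exp(-a*a) = exp(-h)*exp(-l) is approximated by exp(-h)*(1 - l) with one FMA; the sign rearrangement in the diff applies the same correction either way. A host sketch:

    #include <math.h>
    #include <stdio.h>

    static double exp_neg_sq(double a)
    {
        double h = a * a;
        double l = fma(a, a, -h);     /* exact residual of the squaring */
        double q = exp(-h);
        return fma(l, -q, q);         /* q * (1 - l) */
    }

    int main(void)
    {
        double a = 5.25;
        printf("%.17g vs naive %.17g\n", exp_neg_sq(a), exp(-a * a));
        return 0;
    }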
| /* approximate 1.0/(a*gamma(a)) on [-0.5,0.5] */ | | /* approximate 1.0/(a*gamma(a)) on [-0.5,0.5] */ | |
| __device_func__(double __internal_tgamma_kernel(double a)) | | __device_func__(double __internal_tgamma_kernel(double a)) | |
| { | | { | |
| double t; | | double t; | |
| t = -4.42689340712524750E-010; | | t = -4.42689340712524750E-010; | |
| t = __fma_rn (t, a, -2.02665918466589540E-007); | | t = __fma_rn (t, a, -2.02665918466589540E-007); | |
| | | | |
End of changes. 47 change blocks. 225 lines changed or deleted, 428 lines changed or added.