 __cudaFatFormat.h

/*
- * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
+ * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws. Users and possessors of this source code
 * are hereby granted a nonexclusive, royalty-free license to use this code
 * in individual and commercial software.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR

skipping to change at line 230
void fatGetCubinForGpuWithPolicy( __cudaFatCudaBinary *binary, __cudaFatCompilationPolicy policy, char* gpuName, char* *cubin, char* *dbgInfoFile );

#define fatGetCubinForGpu(binary,gpuName,cubin,dbgInfoFile) \
        fatGetCubinForGpuWithPolicy(binary,__cudaFatAvoidPTX,gpuName,cubin,dbgInfoFile)

/*
 * Function        : Check if a binary will be JITed for the specified target architecture
 * Parameters      : binary  (I) Fat binary
 *                   policy  (I) Compilation policy, as described by fatGetCubinForGpuWithPolicy
 *                   gpuName (I) Name of target GPU
- *                   ptx     (O) PTX string to be JITed
+ *                   ptx     (O) PTX string to be JITed. Must be freed by caller.
 * Function Result : True if the given binary will be JITed; otherwise, False
 */
unsigned char fatCheckJitForGpuWithPolicy( __cudaFatCudaBinary *binary, __cudaFatCompilationPolicy policy, char* gpuName, char* *ptx );

#define fatCheckJitForGpu(binary,gpuName,ptx) \
        fatCheckJitForGpuWithPolicy(binary,__cudaFatAvoidPTX,gpuName,ptx)

/*
 * Function        : Free information previously obtained via function fatGetCubinForGpu.
 * Parameters      : cubin   (I) Cubin text string to free
 *                   dbgInfo (I) Debug info filename to free, or NULL
 */
void fatFreeCubin( char* cubin, char* dbgInfoFile );
+ /*
+  * Function        : Free information previously obtained via function fatCheckJitForGpuWithPolicy.
+  * Parameters      : ptx (I) PTX text string to free
+  */
+ void __cudaFatFreePTX( char* ptx );
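The new free routine pairs with the JIT query above. A minimal sketch of the intended call sequence (not part of the header; `bin` stands for a hypothetical, already-populated fat binary):

#include <stdio.h>
#include <string.h>
#include "__cudaFatFormat.h"

/* Ask whether `bin` would be JIT-compiled for an sm_20 target, then
 * release the returned PTX string, which the 3.1 comment says the
 * caller must free. */
void reportJit( __cudaFatCudaBinary *bin )
{
    char *ptx = NULL;
    if (fatCheckJitForGpu( bin, "sm_20", &ptx )) {
        printf( "will be JITed from %u bytes of PTX\n", (unsigned)strlen(ptx) );
        __cudaFatFreePTX( ptx );
    }
}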
#ifdef __cplusplus
}
#endif

#endif

 End of changes. 3 change blocks; 2 lines changed or deleted, 9 lines changed or added.

 builtin_types.h

skipping to change at line 44
 */

/*******************************************************************************
*                                                                              *
*                                                                              *
*                                                                              *
*******************************************************************************/

#include "device_types.h"
#include "driver_types.h"
#include "surface_types.h"
#include "texture_types.h" #include "texture_types.h"
#include "vector_types.h" #include "vector_types.h"
 End of changes. 1 change blocks. 
0 lines changed or deleted 1 lines changed or added


 common_functions.h

skipping to change at line 47
#define __COMMON_FUNCTIONS_H__

/*******************************************************************************
*                                                                              *
*                                                                              *
*                                                                              *
*******************************************************************************/

#if defined(__cplusplus) && defined(__CUDACC__)
#include "builtin_types.h"
#include "host_defines.h" #include "host_defines.h"
#include <time.h>
#include <string.h> #include <string.h>
#include <time.h>
extern "C" extern "C"
{ {
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern _CRTIMP __host__ __device__ clock_t __cdecl clock(void) __THROW; extern _CRTIMP __host__ __device__ clock_t __cdecl clock(void) __THROW;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __host__ __device__ void * __cdecl memset(void *s, int c, s extern __host__ __device__ void* __cdecl memset(void*, int, size_
ize_t n) __THROW; t) __THROW;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __host__ __device__ void * __cdecl memcpy(void *d, const vo id *s, size_t n) __THROW; extern __host__ __device__ void* __cdecl memcpy(void*, const void *, size_t) __THROW;
} }
- #elif !defined(__CUDACC__)
-
- #include "crt/func_macro.h"
-
- __device_func__(clock_t __cuda_clock(void))
- {
-   return clock();
- }
-
- __device_func__(void *__cuda_memset(void *s, int c, size_t n))
- {
-   char *p = (char*)s;
-
-   while (n--) *p++ = (char)c;
-
-   return s;
- }
-
- __device_func__(void *__cuda_memcpy(void *d, const void *s, size_t n))
- {
-   char       *p = (char*)d;
-   const char *r = (const char*)s;
-
-   while (n--) *p++ = *r++;
-
-   return d;
- }
+ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 200
+
+ #include <stdio.h>
+
+ extern "C"
+ {
+
+ /*DEVICE_BUILTIN*/
+ extern __host__ __device__ int __cdecl printf(const char*, ...);
+
+ }
+
+ #endif /* __CUDA_ARCH__ && __CUDA_ARCH__ >= 200 */

#endif /* __cplusplus && __CUDACC__ */
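The added block above is what surfaces device-side printf on compute capability 2.0 (Fermi) hardware. A minimal sketch of its use, assuming compilation with nvcc -arch=sm_20; the kernel name is illustrative:

#include <cstdio>

__global__ void hello(void)
{
    // Resolves to the device-side printf declared for __CUDA_ARCH__ >= 200.
    printf("hello from thread %d\n", threadIdx.x);
}

int main(void)
{
    hello<<<1, 4>>>();
    cudaThreadSynchronize();   // drain the device printf FIFO before exit
    return 0;
}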
/*******************************************************************************
*                                                                              *
*                                                                              *
*                                                                              *
*******************************************************************************/

#include "math_functions.h"

 End of changes. 13 change blocks; 26 lines changed or deleted, 12 lines changed or added.

 cublas.h

skipping to change at line 94
#define CUBLASAPI __stdcall
#else
#define CUBLASAPI
#endif
#endif

#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
#include "driver_types.h"
#include "cuComplex.h" /* import complex data type */ #include "cuComplex.h" /* import complex data type */
/* CUBLAS status returns */ /* CUBLAS status returns */
#define CUBLAS_STATUS_SUCCESS 0x00000000 #define CUBLAS_STATUS_SUCCESS 0x00000000
#define CUBLAS_STATUS_NOT_INITIALIZED 0x00000001 #define CUBLAS_STATUS_NOT_INITIALIZED 0x00000001
#define CUBLAS_STATUS_ALLOC_FAILED 0x00000003 #define CUBLAS_STATUS_ALLOC_FAILED 0x00000003
#define CUBLAS_STATUS_INVALID_VALUE 0x00000007 #define CUBLAS_STATUS_INVALID_VALUE 0x00000007
#define CUBLAS_STATUS_ARCH_MISMATCH 0x00000008 #define CUBLAS_STATUS_ARCH_MISMATCH 0x00000008
#define CUBLAS_STATUS_MAPPING_ERROR 0x0000000B #define CUBLAS_STATUS_MAPPING_ERROR 0x0000000B
#define CUBLAS_STATUS_EXECUTION_FAILED 0x0000000D #define CUBLAS_STATUS_EXECUTION_FAILED 0x0000000D
skipping to change at line 289 skipping to change at line 290
 * -------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if rows, cols, eleSize, lda, or ldb <= 0
 * CUBLAS_STATUS_MAPPING_ERROR    if error occurred accessing GPU memory
 * CUBLAS_STATUS_SUCCESS          if the operation completed successfully
 */
cublasStatus CUBLASAPI cublasGetMatrix (int rows, int cols, int elemSize,
                                        const void *A, int lda, void *B,
                                        int ldb);
+ /*
+  * cublasStatus
+  * cublasSetKernelStream ( cudaStream_t stream )
+  *
+  * sets the CUBLAS stream in which all subsequent CUBLAS kernel launches will run.
+  * By default, if the CUBLAS stream is not set, all kernels use the NULL stream.
+  * This routine can be used to change the stream between kernel launches, and can
+  * also be used to set the CUBLAS stream back to NULL.
+  *
+  * Return Values
+  * -------------
+  * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
+  * CUBLAS_STATUS_SUCCESS          if stream set successfully
+  */
+ cublasStatus CUBLASAPI cublasSetKernelStream (cudaStream_t stream);
+
+ /*
+  * cublasStatus
+  * cublasSetVectorAsync ( int n, int elemSize, const void *x, int incx,
+  *                        void *y, int incy, cudaStream_t stream );
+  *
+  * cublasSetVectorAsync has the same functionality as cublasSetVector
+  * but the transfer is done asynchronously within the CUDA stream passed
+  * as a parameter.
+  *
+  * Return Values
+  * -------------
+  * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
+  * CUBLAS_STATUS_INVALID_VALUE    if incx, incy, or elemSize <= 0
+  * CUBLAS_STATUS_MAPPING_ERROR    if an error occurred accessing GPU memory
+  * CUBLAS_STATUS_SUCCESS          if the operation completed successfully
+  */
+ cublasStatus CUBLASAPI cublasSetVectorAsync (int n, int elemSize,
+                                              const void *hostPtr, int incx,
+                                              void *devicePtr, int incy,
+                                              cudaStream_t stream);
+
+ /*
+  * cublasStatus
+  * cublasGetVectorAsync ( int n, int elemSize, const void *x, int incx,
+  *                        void *y, int incy, cudaStream_t stream )
+  *
+  * cublasGetVectorAsync has the same functionality as cublasGetVector
+  * but the transfer is done asynchronously within the CUDA stream passed
+  * as a parameter.
+  *
+  * Return Values
+  * -------------
+  * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
+  * CUBLAS_STATUS_INVALID_VALUE    if incx, incy, or elemSize <= 0
+  * CUBLAS_STATUS_MAPPING_ERROR    if an error occurred accessing GPU memory
+  * CUBLAS_STATUS_SUCCESS          if the operation completed successfully
+  */
+ cublasStatus CUBLASAPI cublasGetVectorAsync (int n, int elemSize,
+                                              const void *devicePtr, int incx,
+                                              void *hostPtr, int incy,
+                                              cudaStream_t stream);
+
+ /*
+  * cublasStatus
+  * cublasSetMatrixAsync (int rows, int cols, int elemSize, const void *A,
+  *                       int lda, void *B, int ldb, cudaStream_t stream)
+  *
+  * cublasSetMatrixAsync has the same functionality as cublasSetMatrix
+  * but the transfer is done asynchronously within the CUDA stream passed
+  * as a parameter.
+  *
+  * Return Values
+  * -------------
+  * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
+  * CUBLAS_STATUS_INVALID_VALUE    if rows or cols < 0, or elemSize, lda, or
+  *                                ldb <= 0
+  * CUBLAS_STATUS_MAPPING_ERROR    if error occurred accessing GPU memory
+  * CUBLAS_STATUS_SUCCESS          if the operation completed successfully
+  */
+ cublasStatus CUBLASAPI cublasSetMatrixAsync (int rows, int cols, int elemSize,
+                                              const void *A, int lda, void *B,
+                                              int ldb, cudaStream_t stream);
+
+ /*
+  * cublasStatus
+  * cublasGetMatrixAsync (int rows, int cols, int elemSize, const void *A,
+  *                       int lda, void *B, int ldb, cudaStream_t stream)
+  *
+  * cublasGetMatrixAsync has the same functionality as cublasGetMatrix
+  * but the transfer is done asynchronously within the CUDA stream passed
+  * as a parameter.
+  *
+  * Return Values
+  * -------------
+  * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
+  * CUBLAS_STATUS_INVALID_VALUE    if rows, cols, eleSize, lda, or ldb <= 0
+  * CUBLAS_STATUS_MAPPING_ERROR    if error occurred accessing GPU memory
+  * CUBLAS_STATUS_SUCCESS          if the operation completed successfully
+  */
+ cublasStatus CUBLASAPI cublasGetMatrixAsync (int rows, int cols, int elemSize,
+                                              const void *A, int lda, void *B,
+                                              int ldb, cudaStream_t stream);
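A minimal sketch combining the two additions: stage a matrix to the device asynchronously, then point subsequent CUBLAS kernels at the same stream. The function name and buffers are illustrative; h_A should be page-locked (e.g. via cudaMallocHost) for the copy to actually overlap:

#include <cuda_runtime_api.h>
#include "cublas.h"

/* Upload an n x n float matrix and run later CUBLAS kernels in `stream`. */
cublasStatus uploadAndBind(const float *h_A, float *d_A, int n,
                           cudaStream_t stream)
{
    cublasStatus s = cublasSetMatrixAsync(n, n, sizeof(float),
                                          h_A, n, d_A, n, stream);
    if (s != CUBLAS_STATUS_SUCCESS)
        return s;
    return cublasSetKernelStream(stream);  /* pass 0 to revert to the NULL stream */
}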
/* ---------------- CUBLAS single-precision BLAS1 functions ---------------- */

/*
 * int
 * cublasIsamax (int n, const float *x, int incx)
 *
 * finds the smallest index of the maximum magnitude element of single
 * precision vector x; that is, the result is the first i, i = 0 to n - 1,
 * that maximizes abs(x[1 + i * incx])).
 *

skipping to change at line 673
 * The quantity r = (+/-) sqrt (sa^2 + sb^2) overwrites sa in storage. The
 * value of sb is overwritten by a value z which allows sc and ss to be
 * recovered by the following algorithm:
 *
 *    if z=1          set sc = 0.0 and ss = 1.0
 *    if abs(z) < 1   set sc = sqrt(1-z^2) and ss = z
 *    if abs(z) > 1   set sc = 1/z and ss = sqrt(1-sc^2)
 *
 * The function srot (n, x, incx, y, incy, sc, ss) normally is called next
 * to apply the transformation to a 2 x n matrix.
- * Note that is function is provided for completeness and run exclusively
+ * Note that this function is provided for completeness and runs exclusively
 * on the Host.
 *
 * Input
 * -----
 * sa     single precision scalar
 * sb     single precision scalar
 *
 * Output
 * ------
 * sa     single precision r

skipping to change at line 761
 *
 *    sflag = -1.0f     sflag = 0.0f     sflag = 1.0f     sflag = -2.0f
 *
 *        (sh00 sh01)      (1.0f sh01)      (sh00 1.0f)      (1.0f 0.0f)
 *    h = (         )      (          )     (           )    (         )
 *        (sh10 sh11)      (sh10 1.0f)      (-1.0f sh11)     (0.0f 1.0f)
 *
 * sparam[1] through sparam[4] contain sh00, sh10, sh01, sh11,
 * respectively. Values of 1.0f, -1.0f, or 0.0f implied by the value
 * of sflag are not stored in sparam.
- * Note that is function is provided for completeness and run exclusively
+ * Note that this function is provided for completeness and runs exclusively
 * on the Host.
 *
 * Input
 * -----
 * sd1    single precision scalar
 * sd2    single precision scalar
 * sx1    single precision scalar
 * sy1    single precision scalar
 *
 * Output

skipping to change at line 998
 *
 *        (  sc cs )
 *    G = (        ) ,  sc^2 + cabs(cs)^2 = 1,
 *        ( -cs sc )
 *
 * which zeros the second entry of the complex 2-vector transpose(ca, cb).
 *
 * The quantity ca/cabs(ca)*norm(ca,cb) overwrites ca in storage. The
 * function crot (n, x, incx, y, incy, sc, cs) is normally called next
 * to apply the transformation to a 2 x n matrix.
- * Note that is function is provided for completeness and run exclusively
+ * Note that this function is provided for completeness and runs exclusively
 * on the Host.
 *
 * Input
 * -----
 * ca     single-precision complex scalar
 * cb     single-precision complex scalar
 *
 * Output
 * ------
 * ca     single-precision complex ca/cabs(ca)*norm(ca,cb)

skipping to change at line 1597
 *
 *        (  sc cs )
 *    G = (        ) ,  sc^2 + cabs(cs)^2 = 1,
 *        ( -cs sc )
 *
 * which zeros the second entry of the complex 2-vector transpose(ca, cb).
 *
 * The quantity ca/cabs(ca)*norm(ca,cb) overwrites ca in storage. The
 * function crot (n, x, incx, y, incy, sc, cs) is normally called next
 * to apply the transformation to a 2 x n matrix.
- * Note that is function is provided for completeness and run exclusively
+ * Note that this function is provided for completeness and runs exclusively
 * on the Host.
 *
 * Input
 * -----
 * ca     double-precision complex scalar
 * cb     double-precision complex scalar
 *
 * Output
 * ------
 * ca     double-precision complex ca/cabs(ca)*norm(ca,cb)

skipping to change at line 3497
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if k or n < 0, or if incx or incy == 0
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasChbmv (char uplo, int n, int k, cuComplex alpha,
                            const cuComplex *A, int lda, const cuComplex *x,
                            int incx, cuComplex beta, cuComplex *y, int incy);
+ /*
+  * void
+  * cublasChpmv (char uplo, int n, cuComplex alpha, const cuComplex *AP, const cuComplex *x,
+  *              int incx, cuComplex beta, cuComplex *y, int incy)
+  *
+  * performs the matrix-vector operation
+  *
+  *    y = alpha * A * x + beta * y
+  *
+  * Alpha and beta are single precision complex scalars, and x and y are single
+  * precision complex vectors with n elements. A is an hermitian n x n matrix
+  * consisting of single precision complex elements that is supplied in packed form.
+  *
+  * Input
+  * -----
+  * uplo   specifies whether the matrix data is stored in the upper or the lower
+  *        triangular part of array AP. If uplo == 'U' or 'u', then the upper
+  *        triangular part of A is supplied in AP. If uplo == 'L' or 'l', then
+  *        the lower triangular part of A is supplied in AP.
+  * n      specifies the number of rows and columns of the matrix A. It must be
+  *        at least zero.
+  * alpha  single precision complex scalar multiplier applied to A*x.
+  * AP     single precision complex array with at least ((n * (n + 1)) / 2) elements.
+  *        If uplo == 'U' or 'u', the array AP contains the upper triangular part
+  *        of the hermitian matrix A, packed sequentially, column by column;
+  *        that is, if i <= j, then A[i,j] is stored in AP[i+(j*(j+1)/2)]. If
+  *        uplo == 'L' or 'l', the array AP contains the lower triangular part
+  *        of the hermitian matrix A, packed sequentially, column by column;
+  *        that is, if i >= j, then A[i,j] is stored in AP[i+((2*n-j+1)*j)/2].
+  *        The imaginary parts of the diagonal elements need not be set, they
+  *        are assumed to be zero.
+  * x      single precision complex array of length at least (1 + (n - 1) * abs(incx)).
+  * incx   storage spacing between elements of x. incx must not be zero.
+  * beta   single precision complex scalar multiplier applied to vector y;
+  * y      single precision array of length at least (1 + (n - 1) * abs(incy)).
+  *        If beta is zero, y is not read.
+  * incy   storage spacing between elements of y. incy must not be zero.
+  *
+  * Output
+  * ------
+  * y      updated according to y = alpha*A*x + beta*y
+  *
+  * Reference: http://www.netlib.org/blas/chpmv.f
+  *
+  * Error status for this function can be retrieved via cublasGetError().
+  *
+  * Error Status
+  * ------------
+  * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
+  * CUBLAS_STATUS_INVALID_VALUE    if n < 0, or if incx or incy == 0
+  * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
+  */
void CUBLASAPI cublasChpmv (char uplo, int n, cuComplex alpha,
                            const cuComplex *AP, const cuComplex *x, int incx,
                            cuComplex beta, cuComplex *y, int incy);
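A minimal sketch of the packed hermitian call, assuming cublasInit() has already succeeded; d_AP (n*(n+1)/2 packed upper-triangle elements), d_x and d_y (n elements each) are illustrative device buffers:

#include "cublas.h"

/* y = alpha*A*x + beta*y with A hermitian, upper triangle packed in AP. */
void hermitianMV(int n, const cuComplex *d_AP,
                 const cuComplex *d_x, cuComplex *d_y)
{
    cuComplex alpha = make_cuComplex(1.0f, 0.0f);
    cuComplex beta  = make_cuComplex(0.0f, 0.0f);   /* beta == 0: y is not read */
    cublasChpmv('U', n, alpha, d_AP, d_x, 1, beta, d_y, 1);
    if (cublasGetError() != CUBLAS_STATUS_SUCCESS) {
        /* handle launch failure */
    }
}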
/*
 *
 * cublasCtrmv (char uplo, char trans, char diag, int n, const cuComplex *A,
 *              int lda, cuComplex *x, int incx);
 *
 * performs one of the matrix-vector operations x = op(A) * x,

skipping to change at line 5646
 * The quantity r = (+/-) sqrt (sa^2 + sb^2) overwrites sa in storage. The
 * value of sb is overwritten by a value z which allows sc and ss to be
 * recovered by the following algorithm:
 *
 *    if z=1          set sc = 0.0 and ss = 1.0
 *    if abs(z) < 1   set sc = sqrt(1-z^2) and ss = z
 *    if abs(z) > 1   set sc = 1/z and ss = sqrt(1-sc^2)
 *
 * The function drot (n, x, incx, y, incy, sc, ss) normally is called next
 * to apply the transformation to a 2 x n matrix.
- * Note that is function is provided for completeness and run exclusively
+ * Note that this function is provided for completeness and runs exclusively
 * on the Host.
 *
 * Input
 * -----
 * sa     double-precision scalar
 * sb     double-precision scalar
 *
 * Output
 * ------
 * sa     double-precision r

skipping to change at line 5687
 * The elements of x are in x[lx + i * incx], i = 0 to n-1, where lx = 1 if
 * incx >= 0, else lx = 1 + (1 - n) * incx, and similarly for y using ly and
 * incy. With sparam[0] = sflag, h has one of the following forms:
 *
 *    sflag = -1.0      sflag = 0.0      sflag = 1.0      sflag = -2.0
 *
 *        (sh00 sh01)      (1.0 sh01)       (sh00 1.0)       (1.0 0.0)
 *    h = (         )      (        )       (          )     (       )
 *        (sh10 sh11)      (sh10 1.0)       (-1.0 sh11)      (0.0 1.0)
 *
- * Note that is function is provided for completeness and run exclusively
+ * Note that this function is provided for completeness and runs exclusively
 * on the Host.
 *
 * Input
 * -----
 * n      number of elements in input vectors
 * x      double-precision vector with n elements
 * incx   storage spacing between elements of x
 * y      double-precision vector with n elements
 * incy   storage spacing between elements of y
 * sparam 5-element vector. sparam[0] is sflag described above. sparam[1]

skipping to change at line 5738
 *
 *    sflag = -1.0      sflag = 0.0      sflag = 1.0      sflag = -2.0
 *
 *        (sh00 sh01)      (1.0 sh01)       (sh00 1.0)       (1.0 0.0)
 *    h = (         )      (        )       (          )     (       )
 *        (sh10 sh11)      (sh10 1.0)       (-1.0 sh11)      (0.0 1.0)
 *
 * sparam[1] through sparam[4] contain sh00, sh10, sh01, sh11,
 * respectively. Values of 1.0, -1.0, or 0.0 implied by the value
 * of sflag are not stored in sparam.
- * Note that is function is provided for completeness and run exclusively
+ * Note that this function is provided for completeness and runs exclusively
 * on the Host.
 *
 * Input
 * -----
 * sd1    double-precision scalar
 * sd2    double-precision scalar
 * sx1    double-precision scalar
 * sy1    double-precision scalar
 *
 * Output

 End of changes. 10 change blocks; 7 lines changed or deleted, 187 lines changed or added.

 cuda.h

skipping to change at line 57
/**
 * \defgroup CUDA_TYPES Data types used by CUDA driver
 * \ingroup CUDA_DRIVER
 * @{
 */

/**
 * CUDA API version number
 */
- #define CUDA_VERSION 3000 /* 3.0 */
+ #define CUDA_VERSION 3010 /* 3.1 */
#ifdef __cplusplus
extern "C" {
#endif

typedef unsigned int CUdeviceptr;       ///< CUDA device pointer

typedef int CUdevice;                   ///< CUDA device
typedef struct CUctx_st *CUcontext;     ///< CUDA context
typedef struct CUmod_st *CUmodule;      ///< CUDA module
typedef struct CUfunc_st *CUfunction;   ///< CUDA function
typedef struct CUarray_st *CUarray;     ///< CUDA array
typedef struct CUtexref_st *CUtexref;   ///< CUDA texture reference
+ typedef struct CUsurfref_st *CUsurfref; ///< CUDA surface reference
typedef struct CUevent_st *CUevent;     ///< CUDA event
typedef struct CUstream_st *CUstream;   ///< CUDA stream
typedef struct CUgraphicsResource_st *CUgraphicsResource;  ///< CUDA graphics interop resource

typedef struct CUuuid_st {              ///< CUDA definition of UUID
    char bytes[16];
} CUuuid;

/************************************
**
skipping to change at line 176
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22,        ///< Maximum 2D texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23,       ///< Maximum 2D texture height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24,        ///< Maximum 3D texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25,       ///< Maximum 3D texture height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26,        ///< Maximum 3D texture depth
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27,  ///< Maximum texture array width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28, ///< Maximum texture array height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, ///< Maximum slices in a texture array
    CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30,              ///< Alignment requirement for surfaces
    CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31,             ///< Device can possibly execute multiple kernels concurrently
-   CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32                     ///< Device has ECC support enabled
+   CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32,                    ///< Device has ECC support enabled
+   CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33,                     ///< PCI bus ID of the device
+   CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34                   ///< PCI device ID of the device
} CUdevice_attribute;

/**
 * Legacy device properties
 */
typedef struct CUdevprop_st {
    int maxThreadsPerBlock;     ///< Maximum number of threads per block
    int maxThreadsDim[3];       ///< Maximum size of each dimension of a block
    int maxGridSize[3];         ///< Maximum size of each dimension of a grid
    int sharedMemPerBlock;      ///< Shared memory available per block in bytes

skipping to change at line 416
 */
typedef enum CUarray_cubemap_face_enum {
    CU_CUBEMAP_FACE_POSITIVE_X  = 0x00, ///< Positive X face of cubemap
    CU_CUBEMAP_FACE_NEGATIVE_X  = 0x01, ///< Negative X face of cubemap
    CU_CUBEMAP_FACE_POSITIVE_Y  = 0x02, ///< Positive Y face of cubemap
    CU_CUBEMAP_FACE_NEGATIVE_Y  = 0x03, ///< Negative Y face of cubemap
    CU_CUBEMAP_FACE_POSITIVE_Z  = 0x04, ///< Positive Z face of cubemap
    CU_CUBEMAP_FACE_NEGATIVE_Z  = 0x05  ///< Negative Z face of cubemap
} CUarray_cubemap_face;
+ /**
+  * Limits
+  */
+ typedef enum CUlimit_enum {
+     CU_LIMIT_STACK_SIZE       = 0x00, ///< GPU thread stack size
+     CU_LIMIT_PRINTF_FIFO_SIZE = 0x01  ///< GPU printf FIFO size
+ } CUlimit;
/************************************
**
**    Error codes
**
***********************************/

/**
 * Error codes
 */
typedef enum cudaError_enum {
    CUDA_SUCCESS                              = 0,    ///< No errors
    CUDA_ERROR_INVALID_VALUE                  = 1,    ///< Invalid value
    CUDA_ERROR_OUT_OF_MEMORY                  = 2,    ///< Out of memory
    CUDA_ERROR_NOT_INITIALIZED                = 3,    ///< Driver not initialized
    CUDA_ERROR_DEINITIALIZED                  = 4,    ///< Driver deinitialized

    CUDA_ERROR_NO_DEVICE                      = 100,  ///< No CUDA-capable device available
    CUDA_ERROR_INVALID_DEVICE                 = 101,  ///< Invalid device

    CUDA_ERROR_INVALID_IMAGE                  = 200,  ///< Invalid kernel image
    CUDA_ERROR_INVALID_CONTEXT                = 201,  ///< Invalid context
    CUDA_ERROR_CONTEXT_ALREADY_CURRENT        = 202,  ///< Context already current
    CUDA_ERROR_MAP_FAILED                     = 205,  ///< Map failed
    CUDA_ERROR_UNMAP_FAILED                   = 206,  ///< Unmap failed
    CUDA_ERROR_ARRAY_IS_MAPPED                = 207,  ///< Array is mapped
    CUDA_ERROR_ALREADY_MAPPED                 = 208,  ///< Already mapped
    CUDA_ERROR_NO_BINARY_FOR_GPU              = 209,  ///< No binary for GPU
    CUDA_ERROR_ALREADY_ACQUIRED               = 210,  ///< Already acquired
    CUDA_ERROR_NOT_MAPPED                     = 211,  ///< Not mapped
    CUDA_ERROR_NOT_MAPPED_AS_ARRAY            = 212,  ///< Mapped resource not available for access as an array
    CUDA_ERROR_NOT_MAPPED_AS_POINTER          = 213,  ///< Mapped resource not available for access as a pointer
    CUDA_ERROR_ECC_UNCORRECTABLE              = 214,  ///< Uncorrectable ECC error detected
+   CUDA_ERROR_UNSUPPORTED_LIMIT              = 215,  ///< CUlimit not supported by device

    CUDA_ERROR_INVALID_SOURCE                 = 300,  ///< Invalid source
    CUDA_ERROR_FILE_NOT_FOUND                 = 301,  ///< File not found
+   CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302,  ///< Link to a shared object failed to resolve
+   CUDA_ERROR_SHARED_OBJECT_INIT_FAILED      = 303,  ///< Shared object initialization failed

    CUDA_ERROR_INVALID_HANDLE                 = 400,  ///< Invalid handle

    CUDA_ERROR_NOT_FOUND                      = 500,  ///< Not found

    CUDA_ERROR_NOT_READY                      = 600,  ///< CUDA not ready

    CUDA_ERROR_LAUNCH_FAILED                  = 700,  ///< Launch failed
    CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES        = 701,  ///< Launch exceeded resources
    CUDA_ERROR_LAUNCH_TIMEOUT                 = 702,  ///< Launch exceeded timeout
    CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING  = 703,  ///< Launch with incompatible texturing

    CUDA_ERROR_POINTER_IS_64BIT               = 800,  ///< Attempted to retrieve 64-bit pointer via 32-bit API function
    CUDA_ERROR_SIZE_IS_64BIT                  = 801,  ///< Attempted to retrieve 64-bit size via 32-bit API function

    CUDA_ERROR_UNKNOWN                        = 999   ///< Unknown error
} CUresult;
/**
 * If set, host memory is portable between CUDA contexts.
 * Flag for ::cuMemHostAlloc()
 */
#define CU_MEMHOSTALLOC_PORTABLE    0x01

/**
 * If set, host memory is mapped into CUDA address space and

skipping to change at line 593
    CUarray_format Format;      ///< Array format
    unsigned int NumChannels;   ///< Channels per array element
    unsigned int Flags;         ///< Flags
} CUDA_ARRAY3D_DESCRIPTOR;

// if set, the CUDA array contains an array of 2D slices
// and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies
// the number of slices, not the depth of a 3D array.
#define CUDA_ARRAY3D_2DARRAY    0x01
+ // this flag must be set in order to bind a surface reference
+ // to the CUDA array
+ #define CUDA_ARRAY3D_SURFACE_LDST 0x02
/**
 * Override the texref format with a format inferred from the array.
 * Flag for ::cuTexRefSetArray()
 */
#define CU_TRSA_OVERRIDE_FORMAT 0x01

/**
 * Read the texture as integers rather than promoting the values to floats
 * in the range [0,1].

skipping to change at line 686
***********************************/
    CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname);
    CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image);
    CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
    CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin);
    CUresult CUDAAPI cuModuleUnload(CUmodule hmod);
    CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
    CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, unsigned int *bytes, CUmodule hmod, const char *name);
    CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name);
+     CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name);
/************************************
**
**    Memory management
**
***********************************/
    CUresult CUDAAPI cuMemGetInfo(unsigned int *free, unsigned int *total);
    CUresult CUDAAPI cuMemAlloc( CUdeviceptr *dptr, unsigned int bytesize);

skipping to change at line 734
    // 1D functions
        // system <-> device memory
        CUresult CUDAAPI cuMemcpyHtoD (CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount );
        CUresult CUDAAPI cuMemcpyDtoH (void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount );

        // device <-> device memory
        CUresult CUDAAPI cuMemcpyDtoD (CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount );

        // device <-> array memory
-       CUresult CUDAAPI cuMemcpyDtoA ( CUarray dstArray, unsigned int dstIndex, CUdeviceptr srcDevice, unsigned int ByteCount );
-       CUresult CUDAAPI cuMemcpyAtoD ( CUdeviceptr dstDevice, CUarray hSrc, unsigned int SrcIndex, unsigned int ByteCount );
+       CUresult CUDAAPI cuMemcpyDtoA ( CUarray dstArray, unsigned int dstOffset, CUdeviceptr srcDevice, unsigned int ByteCount );
+       CUresult CUDAAPI cuMemcpyAtoD ( CUdeviceptr dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount );

        // system <-> array memory
-       CUresult CUDAAPI cuMemcpyHtoA( CUarray dstArray, unsigned int dstIndex, const void *pSrc, unsigned int ByteCount );
-       CUresult CUDAAPI cuMemcpyAtoH( void *dstHost, CUarray srcArray, unsigned int srcIndex, unsigned int ByteCount );
+       CUresult CUDAAPI cuMemcpyHtoA( CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount );
+       CUresult CUDAAPI cuMemcpyAtoH( void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount );

        // array <-> array memory
-       CUresult CUDAAPI cuMemcpyAtoA( CUarray dstArray, unsigned int dstIndex, CUarray srcArray, unsigned int srcIndex, unsigned int ByteCount );
+       CUresult CUDAAPI cuMemcpyAtoA( CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount );
        // 2D memcpy
        CUresult CUDAAPI cuMemcpy2D( const CUDA_MEMCPY2D *pCopy );
        CUresult CUDAAPI cuMemcpy2DUnaligned( const CUDA_MEMCPY2D *pCopy );

        // 3D memcpy
        CUresult CUDAAPI cuMemcpy3D( const CUDA_MEMCPY3D *pCopy );
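The rename from *Index to *Offset in this hunk makes the units explicit: these parameters are byte offsets into the array, not element indices. A minimal sketch (hArray is a hypothetical 1D CUarray of floats):

#include "cuda.h"

/* Copy n floats from host memory into hArray, starting 16 floats in.
 * The destination offset is expressed in bytes. */
CUresult copyIn(CUarray hArray, const float *src, unsigned int n)
{
    return cuMemcpyHtoA(hArray, 16 * sizeof(float), src, n * sizeof(float));
}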
skipping to change at line 776
        CUresult CUDAAPI cuMemcpyHtoDAsync (CUdeviceptr dstDevice,
            const void *srcHost, unsigned int ByteCount, CUstream hStream );
        CUresult CUDAAPI cuMemcpyDtoHAsync (void *dstHost,
            CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream );

        // device <-> device memory
        CUresult CUDAAPI cuMemcpyDtoDAsync (CUdeviceptr dstDevice,
            CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream );

        // system <-> array memory
-       CUresult CUDAAPI cuMemcpyHtoAAsync( CUarray dstArray, unsigned int dstIndex,
-           const void *pSrc, unsigned int ByteCount, CUstream hStream );
-       CUresult CUDAAPI cuMemcpyAtoHAsync( void *dstHost, CUarray srcArray, unsigned int srcIndex,
-           unsigned int ByteCount, CUstream hStream );
+       CUresult CUDAAPI cuMemcpyHtoAAsync( CUarray dstArray, unsigned int dstOffset,
+           const void *srcHost, unsigned int ByteCount, CUstream hStream );
+       CUresult CUDAAPI cuMemcpyAtoHAsync( void *dstHost, CUarray srcArray, unsigned int srcOffset,
+           unsigned int ByteCount, CUstream hStream );
        // 2D memcpy
        CUresult CUDAAPI cuMemcpy2DAsync( const CUDA_MEMCPY2D *pCopy, CUstream hStream );

        // 3D memcpy
        CUresult CUDAAPI cuMemcpy3DAsync( const CUDA_MEMCPY3D *pCopy, CUstream hStream );

/************************************
**
skipping to change at line 849
    CUresult CUDAAPI cuTexRefGetAddress( CUdeviceptr *pdptr, CUtexref hTexRef );
    CUresult CUDAAPI cuTexRefGetArray( CUarray *phArray, CUtexref hTexRef );
    CUresult CUDAAPI cuTexRefGetAddressMode( CUaddress_mode *pam, CUtexref hTexRef, int dim );
    CUresult CUDAAPI cuTexRefGetFilterMode( CUfilter_mode *pfm, CUtexref hTexRef );
    CUresult CUDAAPI cuTexRefGetFormat( CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef );
    CUresult CUDAAPI cuTexRefGetFlags( unsigned int *pFlags, CUtexref hTexRef );
/************************************
**
+ **    Surface reference management
+ **
+ ***********************************/
+     CUresult CUDAAPI cuSurfRefSetArray( CUsurfref hSurfRef, CUarray hArray, unsigned int Flags );
+     CUresult CUDAAPI cuSurfRefGetArray( CUarray *phArray, CUsurfref hSurfRef );
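A minimal sketch of the new surface plumbing, assuming a loaded module `mod` whose .cu source declares a surface reference named "outputSurf", and a CUarray created with the CUDA_ARRAY3D_SURFACE_LDST flag from earlier in this diff:

#include "cuda.h"

CUresult bindSurface(CUmodule mod, CUarray array)
{
    CUsurfref surf;
    CUresult rc = cuModuleGetSurfRef(&surf, mod, "outputSurf");
    if (rc != CUDA_SUCCESS)
        return rc;
    /* Flags must be 0; the array must have been created with
     * the CUDA_ARRAY3D_SURFACE_LDST flag. */
    return cuSurfRefSetArray(surf, array, 0);
}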
+ /************************************
+ **
**    Parameter management
**
***********************************/
    CUresult CUDAAPI cuParamSetSize (CUfunction hfunc, unsigned int numbytes);
    CUresult CUDAAPI cuParamSeti    (CUfunction hfunc, int offset, unsigned int value);
    CUresult CUDAAPI cuParamSetf    (CUfunction hfunc, int offset, float value);
    CUresult CUDAAPI cuParamSetv    (CUfunction hfunc, int offset, void *ptr, unsigned int numbytes);
    CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef);

skipping to change at line 919
    CUresult CUDAAPI cuGraphicsMapResources( unsigned int count, CUgraphicsResource *resources, CUstream hStream );
    CUresult CUDAAPI cuGraphicsUnmapResources( unsigned int count, CUgraphicsResource *resources, CUstream hStream );

/************************************
**
**    Export tables
**
***********************************/
    CUresult CUDAAPI cuGetExportTable( const void **ppExportTable, const CUuuid *pExportTableId );
+ /************************************
+ **
+ **    Limits
+ **
+ ***********************************/
+     CUresult CUDAAPI cuCtxSetLimit(CUlimit limit, size_t value);
+     CUresult CUDAAPI cuCtxGetLimit(size_t *pvalue, CUlimit limit);
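A minimal sketch of the limit calls, enlarging the printf FIFO before launching a chatty kernel; the 8 MB figure is illustrative:

#include "cuda.h"

CUresult growPrintfFifo(void)
{
    size_t bytes = 0;
    /* On devices without this limit, cuCtxSetLimit reports
     * CUDA_ERROR_UNSUPPORTED_LIMIT (new error 215 above). */
    CUresult rc = cuCtxSetLimit(CU_LIMIT_PRINTF_FIFO_SIZE, 8u << 20);
    if (rc != CUDA_SUCCESS)
        return rc;
    return cuCtxGetLimit(&bytes, CU_LIMIT_PRINTF_FIFO_SIZE);  /* reads back 8 MB */
}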
#ifdef __cplusplus
}
#endif

#endif /* __cuda_cuda_h__ */

 End of changes. 22 change blocks; 60 lines changed or deleted, 107 lines changed or added.

 cuda_gl_interop.h

skipping to change at line 45
#if !defined(__CUDA_GL_INTEROP_H__)
#define __CUDA_GL_INTEROP_H__

/*******************************************************************************
*                                                                              *
*                                                                              *
*                                                                              *
*******************************************************************************/
#include "builtin_types.h"
#include "host_defines.h" #include "host_defines.h"
#if defined(__APPLE__) #if defined(__APPLE__)
#include <OpenGL/gl.h> #include <OpenGL/gl.h>
#else /* __APPLE__ */ #else /* __APPLE__ */
#include <GL/gl.h> #include <GL/gl.h>
 End of changes. 1 change blocks. 
0 lines changed or deleted 1 lines changed or added


 cuda_runtime.h

skipping to change at line 63
#include "builtin_types.h"
#include "channel_descriptor.h"
#include "cuda_runtime_api.h"
#include "driver_functions.h"
#include "host_defines.h"
#include "vector_functions.h"

#if defined(__CUDACC__)

#include "common_functions.h"
#include "cuda_surface_types.h"
#include "cuda_texture_types.h" #include "cuda_texture_types.h"
#include "device_functions.h" #include "device_functions.h"
#include "device_launch_parameters.h" #include "device_launch_parameters.h"
#endif /* __CUDACC__ */ #endif /* __CUDACC__ */
#if defined(__cplusplus) #if defined(__cplusplus)
/************************************************************************** ***** /************************************************************************** *****
* * * *
skipping to change at line 286 skipping to change at line 287
    char *symbol
)
{
    return cudaGetSymbolAddress(devPtr, (const char*)symbol);
}

/**
 * \brief \hl Finds the address associated with a CUDA symbol
 *
 * Returns in \p *devPtr the address of symbol \p symbol on the device.
- * \p symbol can either be a variable that resides in global memory space, or
- * it can be a character string, naming a variable that resides in global
+ * \p symbol can either be a variable that resides in global or constant memory space, or
+ * it can be a character string, naming a variable that resides in global or constant
 * memory space. If \p symbol cannot be found, or if \p symbol is not declared
- * in the global memory space, \p *devPtr is unchanged and the error
- * ::cudaErrorInvalidSymbol is returned.
+ * in the global or constant memory space, \p *devPtr is unchanged and the error
+ * ::cudaErrorInvalidSymbol is returned. If there are multiple global or constant
+ * variables with the same string name (from separate files) and the lookup
+ * is done via character string, ::cudaErrorDuplicateVariableName is
+ * returned.
 *
 * \param devPtr - Return device pointer associated with symbol
- * \param symbol - Global variable or string symbol to search for
+ * \param symbol - Global/constant variable or string symbol to search for
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorInvalidSymbol,
- * ::cudaErrorAddressOfConstant
+ * ::cudaErrorDuplicateVariableName
 * \notefnerr
 *
 * \sa \ref ::cudaGetSymbolAddress(void**, const char*) "cudaGetSymbolAddress (C API)"
 * \ref ::cudaGetSymbolSize(size_t*, const T&) "cudaGetSymbolSize (C++ API)"
 */
template<class T>
__inline__ __host__ cudaError_t cudaGetSymbolAddress(
    void  **devPtr,
    const T &symbol
)
skipping to change at line 339
}
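A minimal sketch of the lookup, with devCounter a hypothetical __device__ variable in the same translation unit; passing the variable itself rather than its string name sidesteps the new cudaErrorDuplicateVariableName case entirely:

#include <cuda_runtime.h>

__device__ int devCounter;

cudaError_t resetCounter(void)
{
    void  *ptr  = 0;
    size_t size = 0;
    cudaError_t err = cudaGetSymbolAddress(&ptr, devCounter);
    if (err != cudaSuccess) return err;
    err = cudaGetSymbolSize(&size, devCounter);
    if (err != cudaSuccess) return err;
    return cudaMemset(ptr, 0, size);   // zero the device-side counter
}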
/**
 * \brief \hl Finds the size of the object associated with a CUDA symbol
 *
 * Returns in \p *size the size of symbol \p symbol. \p symbol can either be a
 * variable that resides in global or constant memory space, or it can be a
 * character string, naming a variable that resides in global or constant
 * memory space. If \p symbol cannot be found, or if \p symbol is not declared
 * in global or constant memory space, \p *size is unchanged and the error
- * ::cudaErrorInvalidSymbol is returned.
+ * ::cudaErrorInvalidSymbol is returned. If there are multiple global
+ * variables with the same string name (from separate files) and the lookup
+ * is done via character string, ::cudaErrorDuplicateVariableName is
+ * returned.
 *
 * \param size   - Size of object associated with symbol
 * \param symbol - Global variable or string symbol to find size of
 *
 * \return
 * ::cudaSuccess,
- * ::cudaErrorInvalidSymbol
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorDuplicateVariableName
 * \notefnerr
 *
 * \sa \ref ::cudaGetSymbolAddress(void**, const T&) "cudaGetSymbolAddress (C++ API)"
 * \ref ::cudaGetSymbolSize(size_t*, const char*) "cudaGetSymbolSize (C API)"
 */
template<class T>
__inline__ __host__ cudaError_t cudaGetSymbolSize(
    size_t *size,
    const T &symbol
)
skipping to change at line 755
 * \param entry - Device function pointer or char string naming device function
 *                to execute
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorInvalidDeviceFunction,
 * ::cudaErrorInvalidConfiguration,
 * ::cudaErrorLaunchFailure,
 * ::cudaErrorPriorLaunchFailure,
 * ::cudaErrorLaunchTimeout,
- * ::cudaErrorLaunchOutOfResources
+ * ::cudaErrorLaunchOutOfResources,
+ * ::cudaErrorSharedObjectSymbolNotFound,
+ * ::cudaErrorSharedObjectInitFailed
* \notefnerr * \notefnerr
* *
* \sa ::cudaConfigureCall, * \sa ::cudaConfigureCall,
* \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)", * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
* \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGetAttributes (C++ API)", * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGetAttributes (C++ API)",
* \ref ::cudaLaunch(const char*) "cudaLaunch (C API)", * \ref ::cudaLaunch(const char*) "cudaLaunch (C API)",
* ::cudaSetDoubleForDevice, * ::cudaSetDoubleForDevice,
* ::cudaSetDoubleForHost, * ::cudaSetDoubleForHost,
* \ref ::cudaSetupArgument(T,size_t) "cudaSetupArgument (C++ API)" * \ref ::cudaSetupArgument(T,size_t) "cudaSetupArgument (C++ API)"
*/ */
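For orientation, a sketch of the configure/setup/launch sequence these entry points form; myKernel and its argument are assumptions, and in practice the <<<...>>> launch syntax expands to this same pattern:

__global__ void myKernel(int n);

void launchByHand(int n)
{
    cudaConfigureCall(dim3(16), dim3(256)); /* grid and block dimensions */
    cudaSetupArgument(n, 0);                /* push the argument at offset 0 */
    cudaLaunch(myKernel);                   /* C++ overload takes the function itself */
}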
skipping to change at line 804 skipping to change at line 814
*/ */
template<class T> template<class T>
__inline__ __host__ cudaError_t cudaFuncGetAttributes( __inline__ __host__ cudaError_t cudaFuncGetAttributes(
struct cudaFuncAttributes *attr, struct cudaFuncAttributes *attr,
T *entry T *entry
) )
{ {
return cudaFuncGetAttributes(attr, (const char*)entry); return cudaFuncGetAttributes(attr, (const char*)entry);
} }
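A short usage sketch of the wrapper above; myKernel is again hypothetical:

__global__ void myKernel(int n);

void queryKernelAttributes(void)
{
    struct cudaFuncAttributes attr;
    if (cudaFuncGetAttributes(&attr, myKernel) == cudaSuccess) {
        /* attr.numRegs, attr.sharedSizeBytes, attr.maxThreadsPerBlock, ... */
    }
}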
/**
* \ingroup CUDART_HIGHLEVEL
* \brief \hl Binds an array to a surface
*
* Binds the CUDA array \p array to the surface reference \p surf.
* \p desc describes how the memory is interpreted when dealing with
* the surface. Any CUDA array previously bound to \p surf is unbound.
*
* \param surf - Surface to bind
* \param array - Memory array on device
* \param desc - Channel format
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidSurface
* \notefnerr
*
* \sa \ref ::cudaBindSurfaceToArray(const struct surfaceReference*, const struct cudaArray*, const struct cudaChannelFormatDesc*) "cudaBindSurfaceToArray (C API)",
* \ref ::cudaBindSurfaceToArray(const struct surface< T, dim>&, const struct cudaArray*) "cudaBindSurfaceToArray (C++ API, inherited channel descriptor)"
*/
template<class T, int dim>
__inline__ __host__ cudaError_t cudaBindSurfaceToArray(
const struct surface<T, dim> &surf,
const struct cudaArray *array,
const struct cudaChannelFormatDesc &desc
)
{
return cudaBindSurfaceToArray(&surf, array, &desc);
}
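A sketch of this overload, assuming surfRef is a hypothetical file-scope surface reference and arr was allocated with the cudaArraySurfaceLoadStore flag:

surface<void, 2> surfRef;

void bindWithExplicitDesc(struct cudaArray *arr)
{
    struct cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
    cudaBindSurfaceToArray(surfRef, arr, desc); /* explicit channel format */
}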
/**
* \ingroup CUDART_HIGHLEVEL
* \brief \hl Binds an array to a surface
*
* Binds the CUDA array \p array to the surface reference \p surf.
* The channel descriptor is inherited from the CUDA array. Any CUDA array
* previously bound to \p surf is unbound.
*
* \param surf - Surface to bind
* \param array - Memory array on device
*
* \return
* ::cudaSuccess,
* ::cudaErrorInvalidValue,
* ::cudaErrorInvalidSurface
* \notefnerr
*
* \sa \ref ::cudaBindSurfaceToArray(const struct surfaceReference*, const struct cudaArray*, const struct cudaChannelFormatDesc*) "cudaBindSurfaceToArray (C API)",
* \ref ::cudaBindSurfaceToArray(const struct surface< T, dim>&, const struct cudaArray*, const struct cudaChannelFormatDesc&) "cudaBindSurfaceToArray (C++ API)"
*/
template<class T, int dim>
__inline__ __host__ cudaError_t cudaBindSurfaceToArray(
const struct surface<T, dim> &surf,
const struct cudaArray *array
)
{
struct cudaChannelFormatDesc desc;
cudaError_t err = cudaGetChannelDesc(&desc, array);
return err == cudaSuccess ? cudaBindSurfaceToArray(surf, array, desc) : err;
}
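With the overload above, the same binding reduces to one call that pulls the descriptor from the array itself (same assumptions as the previous sketch):

void bindWithInheritedDesc(struct cudaArray *arr)
{
    cudaBindSurfaceToArray(surfRef, arr); /* channel format read via cudaGetChannelDesc */
}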
#endif /* __CUDACC__ */ #endif /* __CUDACC__ */
#endif /* __cplusplus */ #endif /* __cplusplus */
#endif /* !__CUDA_RUNTIME_H__ */ #endif /* !__CUDA_RUNTIME_H__ */
 End of changes. 9 change blocks. 
10 lines changed or deleted 95 lines changed or added


 cuda_runtime_api.h   cuda_runtime_api.h 
skipping to change at line 41 skipping to change at line 41
* Any use of this source code in individual and commercial software must * Any use of this source code in individual and commercial software must
* include, in the user documentation and internal comments to the code, * include, in the user documentation and internal comments to the code,
* the above Disclaimer and U.S. Government End Users Notice. * the above Disclaimer and U.S. Government End Users Notice.
*/ */
#if !defined(__CUDA_RUNTIME_API_H__) #if !defined(__CUDA_RUNTIME_API_H__)
#define __CUDA_RUNTIME_API_H__ #define __CUDA_RUNTIME_API_H__
/************************************************************************** ***** /************************************************************************** *****
* * * *
* CUDA Runtime API Version 3.0 * * CUDA Runtime API Version 3.1 *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#define CUDART_VERSION 3000 #define CUDART_VERSION 3010
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#include "host_defines.h" #include "host_defines.h"
#include "builtin_types.h" #include "builtin_types.h"
skipping to change at line 90 skipping to change at line 90
extern "C" { extern "C" {
#endif /* __cplusplus */ #endif /* __cplusplus */
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaMalloc3D(struct cudaPitchedPtr* pitchedDevPtr, struct cudaExtent extent); extern __host__ cudaError_t CUDARTAPI cudaMalloc3D(struct cudaPitchedPtr* pitchedDevPtr, struct cudaExtent extent);
extern __host__ cudaError_t CUDARTAPI cudaMalloc3DArray(struct cudaArray** arrayPtr, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent); extern __host__ cudaError_t CUDARTAPI cudaMalloc3DArray(struct cudaArray** arrayPtr, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int flags __dv(0));
extern __host__ cudaError_t CUDARTAPI cudaMemset3D(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent); extern __host__ cudaError_t CUDARTAPI cudaMemset3D(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent);
extern __host__ cudaError_t CUDARTAPI cudaMemcpy3D(const struct cudaMemcpy3DParms *p); extern __host__ cudaError_t CUDARTAPI cudaMemcpy3D(const struct cudaMemcpy3DParms *p);
extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream __dv(0)); extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream __dv(0));
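A hedged sketch of the 3D allocation path declared above (the width passed to cudaMalloc3D is in bytes); the dimensions are arbitrary:

void alloc3DVolume(void)
{
    struct cudaExtent ext = make_cudaExtent(64 * sizeof(float), 64, 64);
    struct cudaPitchedPtr vol;
    if (cudaMalloc3D(&vol, ext) == cudaSuccess) {
        cudaMemset3D(vol, 0, ext); /* zero the whole pitched volume */
        cudaFree(vol.ptr);
    }
}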
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size); extern __host__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size);
extern __host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t size); extern __host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t size);
extern __host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t height); extern __host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t height);
extern __host__ cudaError_t CUDARTAPI cudaMallocArray(struct cudaArray **array, const struct cudaChannelFormatDesc *desc, size_t width, size_t height __dv(1)); extern __host__ cudaError_t CUDARTAPI cudaMallocArray(struct cudaArray **array, const struct cudaChannelFormatDesc *desc, size_t width, size_t height __dv(0), unsigned int flags __dv(0));
extern __host__ cudaError_t CUDARTAPI cudaFree(void *devPtr); extern __host__ cudaError_t CUDARTAPI cudaFree(void *devPtr);
extern __host__ cudaError_t CUDARTAPI cudaFreeHost(void *ptr); extern __host__ cudaError_t CUDARTAPI cudaFreeHost(void *ptr);
extern __host__ cudaError_t CUDARTAPI cudaFreeArray(struct cudaArray *array); extern __host__ cudaError_t CUDARTAPI cudaFreeArray(struct cudaArray *array);
extern __host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t bytes, unsigned int flags); extern __host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t bytes, unsigned int flags);
extern __host__ cudaError_t CUDARTAPI cudaHostGetDevicePointer(void **pDevice, void *pHost, unsigned int flags); extern __host__ cudaError_t CUDARTAPI cudaHostGetDevicePointer(void **pDevice, void *pHost, unsigned int flags);
extern __host__ cudaError_t CUDARTAPI cudaHostGetFlags(unsigned int *pFlags, void *pHost); extern __host__ cudaError_t CUDARTAPI cudaHostGetFlags(unsigned int *pFlags, void *pHost);
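A zero-copy sketch built on the declarations above; it assumes a device that supports mapped pinned memory and that cudaSetDeviceFlags(cudaDeviceMapHost) was called before context creation:

void mapHostBuffer(size_t bytes)
{
    void *h = 0, *d = 0;
    if (cudaHostAlloc(&h, bytes, cudaHostAllocMapped) == cudaSuccess) {
        cudaHostGetDevicePointer(&d, h, 0); /* device alias of the host buffer */
        cudaFreeHost(h);
    }
}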
/************************************************************************** ***** /************************************************************************** *****
* * * *
skipping to change at line 197 skipping to change at line 197
extern __host__ cudaError_t CUDARTAPI cudaUnbindTexture(const struct textureReference *texref); extern __host__ cudaError_t CUDARTAPI cudaUnbindTexture(const struct textureReference *texref);
extern __host__ cudaError_t CUDARTAPI cudaGetTextureAlignmentOffset(size_t *offset, const struct textureReference *texref); extern __host__ cudaError_t CUDARTAPI cudaGetTextureAlignmentOffset(size_t *offset, const struct textureReference *texref);
extern __host__ cudaError_t CUDARTAPI cudaGetTextureReference(const struct textureReference **texref, const char *symbol); extern __host__ cudaError_t CUDARTAPI cudaGetTextureReference(const struct textureReference **texref, const char *symbol);
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaBindSurfaceToArray(const struct surfaceReference *surfref, const struct cudaArray *array, const struct cudaChannelFormatDesc *desc);
extern __host__ cudaError_t CUDARTAPI cudaGetSurfaceAlignmentOffset(size_t *offset, const struct surfaceReference *surfref);
extern __host__ cudaError_t CUDARTAPI cudaGetSurfaceReference(const struct surfaceReference **surfref, const char *symbol);
/*******************************************************************************
*
*
*
*******************************************************************************/
extern __host__ cudaError_t CUDARTAPI cudaGetChannelDesc(struct cudaChannelFormatDesc *desc, const struct cudaArray *array); extern __host__ cudaError_t CUDARTAPI cudaGetChannelDesc(struct cudaChannelFormatDesc *desc, const struct cudaArray *array);
extern __host__ struct cudaChannelFormatDesc CUDARTAPI cudaCreateChannelDesc(int x, int y, int z, int w, enum cudaChannelFormatKind f); extern __host__ struct cudaChannelFormatDesc CUDARTAPI cudaCreateChannelDesc(int x, int y, int z, int w, enum cudaChannelFormatKind f);
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaGetLastError(void); extern __host__ cudaError_t CUDARTAPI cudaGetLastError(void);
extern __host__ cudaError_t CUDARTAPI cudaPeekAtLastError(void);
extern __host__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error); extern __host__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error);
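The newly added cudaPeekAtLastError reads the sticky error state without clearing it, whereas cudaGetLastError also resets it to cudaSuccess; a minimal sketch:

void reportLastError(void)
{
    cudaError_t e = cudaPeekAtLastError();  /* non-destructive read */
    if (e != cudaSuccess) {
        const char *msg = cudaGetErrorString(e);
        (void)msg;
        (void)cudaGetLastError();           /* now clear the error state */
    }
}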
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem __dv(0), cudaStream_t stream __dv(0)); extern __host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem __dv(0), cudaStream_t stream __dv(0));
extern __host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg, size_t size, size_t offset); extern __host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg, size_t size, size_t offset);
skipping to change at line 263 skipping to change at line 274
extern __host__ cudaError_t CUDARTAPI cudaSetDoubleForHost(double *d); extern __host__ cudaError_t CUDARTAPI cudaSetDoubleForHost(double *d);
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaThreadExit(void); extern __host__ cudaError_t CUDARTAPI cudaThreadExit(void);
extern __host__ cudaError_t CUDARTAPI cudaThreadSynchronize(void); extern __host__ cudaError_t CUDARTAPI cudaThreadSynchronize(void);
extern __host__ cudaError_t CUDARTAPI cudaThreadSetLimit(enum cudaLimit limit, size_t value);
extern __host__ cudaError_t CUDARTAPI cudaThreadGetLimit(size_t *pValue, enum cudaLimit limit);
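A sketch of the new per-thread limit API; cudaLimitStackSize is assumed to be among the cudaLimit values available in this release:

void doubleStackLimit(void)
{
    size_t cur = 0;
    if (cudaThreadGetLimit(&cur, cudaLimitStackSize) == cudaSuccess) {
        cudaThreadSetLimit(cudaLimitStackSize, 2 * cur);
    }
}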
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersion); extern __host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersion);
extern __host__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion); extern __host__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion);
extern __host__ cudaError_t CUDARTAPI cudaGetExportTable(const void **ppExportTable, const cudaUUID_t *pExportTableId);
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaGraphicsUnregisterResource(struct cudaGraphicsResource *resource); extern __host__ cudaError_t CUDARTAPI cudaGraphicsUnregisterResource(struct cudaGraphicsResource *resource);
extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceSetMapFlags(struct cudaGraphicsResource *resource, unsigned int flags); extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceSetMapFlags(struct cudaGraphicsResource *resource, unsigned int flags);
extern __host__ cudaError_t CUDARTAPI cudaGraphicsMapResources(int count, struct cudaGraphicsResource **resources, cudaStream_t stream __dv(0)); extern __host__ cudaError_t CUDARTAPI cudaGraphicsMapResources(int count, struct cudaGraphicsResource **resources, cudaStream_t stream __dv(0));
 End of changes. 8 change blocks. 
4 lines changed or deleted 29 lines changed or added


 cuda_texture_types.h   cuda_texture_types.h 
skipping to change at line 47 skipping to change at line 47
#define __CUDA_TEXTURE_TYPES_H__ #define __CUDA_TEXTURE_TYPES_H__
#if defined(__cplusplus) && defined(__CUDACC__) #if defined(__cplusplus) && defined(__CUDACC__)
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#include "builtin_types.h"
#include "channel_descriptor.h" #include "channel_descriptor.h"
#include "driver_types.h" #include "driver_types.h"
#include "host_defines.h" #include "host_defines.h"
#include "texture_types.h" #include "texture_types.h"
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
 End of changes. 1 change blocks. 
0 lines changed or deleted 1 lines changed or added


 cufft.h   cufft.h 
skipping to change at line 102 skipping to change at line 102
// CUFFT supports the following transform types // CUFFT supports the following transform types
typedef enum cufftType_t { typedef enum cufftType_t {
CUFFT_R2C = 0x2a, // Real to Complex (interleaved) CUFFT_R2C = 0x2a, // Real to Complex (interleaved)
CUFFT_C2R = 0x2c, // Complex (interleaved) to Real CUFFT_C2R = 0x2c, // Complex (interleaved) to Real
CUFFT_C2C = 0x29, // Complex to Complex, interleaved CUFFT_C2C = 0x29, // Complex to Complex, interleaved
CUFFT_D2Z = 0x6a, // Double to Double-Complex CUFFT_D2Z = 0x6a, // Double to Double-Complex
CUFFT_Z2D = 0x6c, // Double-Complex to Double CUFFT_Z2D = 0x6c, // Double-Complex to Double
CUFFT_Z2Z = 0x69 // Double-Complex to Double-Complex CUFFT_Z2Z = 0x69 // Double-Complex to Double-Complex
} cufftType; } cufftType;
// Certain R2C and C2R transforms go much more slowly when FFTW memory
// layout and behaviour is required. The default is "best performance",
// which means not-compatible-with-fftw. Use the cufftSetCompatibilityMode
// API to enable exact FFTW-like behaviour.
//
// These flags can be ORed together to select precise FFTW compatibility
// behaviour. The two levels presently supported are:
//
// CUFFT_COMPATIBILITY_FFTW_PADDING
// Inserts extra padding between packed in-place transforms for
// batched transforms with power-of-2 size.
//
// CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC
// Guarantees FFTW-compatible output for non-symmetric complex inputs
// for transforms with power-of-2 size. This is only useful for
// artificial (i.e. random) datasets as actual data will always be
// symmetric if it has come from the real plane. If you don't
// understand what this means, you probably don't have to use it.
//
// CUFFT_COMPATIBILITY_FFTW_ALL
// For convenience, enables all FFTW compatibility modes at once.
//
typedef enum cufftCompatibility_t {
CUFFT_COMPATIBILITY_NATIVE = 0x00,
CUFFT_COMPATIBILITY_FFTW_PADDING = 0x01, // The default value
CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC = 0x02,
CUFFT_COMPATIBILITY_FFTW_ALL = 0x03
} cufftCompatibility;
#define CUFFT_COMPATIBILITY_DEFAULT CUFFT_COMPATIBILITY_FFTW_PADDING
cufftResult CUFFTAPI cufftPlan1d(cufftHandle *plan, cufftResult CUFFTAPI cufftPlan1d(cufftHandle *plan,
int nx, int nx,
cufftType type, cufftType type,
int batch /* deprecated - use cufftPlanMany */); int batch /* deprecated - use cufftPlanMany */);
cufftResult CUFFTAPI cufftPlan2d(cufftHandle *plan, cufftResult CUFFTAPI cufftPlan2d(cufftHandle *plan,
int nx, int ny, int nx, int ny,
cufftType type); cufftType type);
cufftResult CUFFTAPI cufftPlan3d(cufftHandle *plan, cufftResult CUFFTAPI cufftPlan3d(cufftHandle *plan,
skipping to change at line 154 skipping to change at line 185
cufftDoubleReal *idata, cufftDoubleReal *idata,
cufftDoubleComplex *odata); cufftDoubleComplex *odata);
cufftResult CUFFTAPI cufftExecZ2D(cufftHandle plan, cufftResult CUFFTAPI cufftExecZ2D(cufftHandle plan,
cufftDoubleComplex *idata, cufftDoubleComplex *idata,
cufftDoubleReal *odata); cufftDoubleReal *odata);
cufftResult CUFFTAPI cufftSetStream(cufftHandle plan, cufftResult CUFFTAPI cufftSetStream(cufftHandle plan,
cudaStream_t stream); cudaStream_t stream);
cufftResult CUFFTAPI cufftSetCompatibilityMode(cufftHandle plan,
cufftCompatibility mode);
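An end-to-end sketch tying the new compatibility mode into the usual plan/execute flow; cufftExecC2C, cufftDestroy, CUFFT_FORWARD, and cufftComplex come from the unchanged parts of the header and are assumed available:

void runCompatFFT(cufftComplex *devData, int nx)
{
    cufftHandle plan;
    cufftPlan1d(&plan, nx, CUFFT_C2C, 1);
    cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_FFTW_ALL);
    cufftExecC2C(plan, devData, devData, CUFFT_FORWARD);
    cufftDestroy(plan);
}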
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
#endif /* _CUFFT_H_ */ #endif /* _CUFFT_H_ */
 End of changes. 2 change blocks. 
0 lines changed or deleted 33 lines changed or added


 device_functions.h   device_functions.h 
skipping to change at line 47 skipping to change at line 47
#define __DEVICE_FUNCTIONS_H__ #define __DEVICE_FUNCTIONS_H__
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#if defined(__cplusplus) && defined(__CUDACC__) #if defined(__cplusplus) && defined(__CUDACC__)
#include "builtin_types.h"
#include "device_types.h" #include "device_types.h"
#include "host_defines.h" #include "host_defines.h"
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern "C" extern "C"
skipping to change at line 75 skipping to change at line 76
extern __device__ long long int __mul64hi(long long int, long long int); extern __device__ long long int __mul64hi(long long int, long long int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ unsigned long long int __umul64hi(unsigned long long int, unsigned long long int); extern __device__ unsigned long long int __umul64hi(unsigned long long int, unsigned long long int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __int_as_float(int); extern __device__ float __int_as_float(int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __float_as_int(float); extern __device__ int __float_as_int(float);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ void __synchronous_start(int);
/*DEVICE_BUILTIN*/
extern __device__ void __synchronous_end(void);
/*DEVICE_BUILTIN*/
extern __device__ void __syncthreads(void); extern __device__ void __syncthreads(void);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ void __prof_trigger(int); extern __device__ void __prof_trigger(int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ void __threadfence(void); extern __device__ void __threadfence(void);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ void __threadfence_block(void); extern __device__ void __threadfence_block(void);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ void __trap(void); extern __device__ void __trap(void);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
skipping to change at line 287 skipping to change at line 292
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __clzll(long long int); extern __device__ int __clzll(long long int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __ffsll(long long int); extern __device__ int __ffsll(long long int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __popcll(unsigned long long int); extern __device__ int __popcll(unsigned long long int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ unsigned long long int __brevll(unsigned long long int); extern __device__ unsigned long long int __brevll(unsigned long long int);
/*DEVICE_BUILTIN*/
extern __device__ unsigned int __byte_perm(unsigned int, unsigned int, unsigned int);
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 130 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 130
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __double2int_rz(double); extern __device__ int __double2int_rz(double);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ unsigned int __double2uint_rz(double); extern __device__ unsigned int __double2uint_rz(double);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ long long int __double2ll_rz(double); extern __device__ long long int __double2ll_rz(double);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ unsigned long long int __double2ull_rz(double); extern __device__ unsigned long long int __double2ull_rz(double);
/*DEVICE_BUILTIN*/
extern __device__ unsigned int __pm0(void);
/*DEVICE_BUILTIN*/
extern __device__ unsigned int __pm1(void);
/*DEVICE_BUILTIN*/
extern __device__ unsigned int __pm2(void);
/*DEVICE_BUILTIN*/
extern __device__ unsigned int __pm3(void);
#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 130 */ #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 130 */
} }
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
skipping to change at line 448 skipping to change at line 465
} }
static __inline__ __device__ float uint2float(unsigned int a, enum cudaRoundMode mode = cudaRoundNearest) static __inline__ __device__ float uint2float(unsigned int a, enum cudaRoundMode mode = cudaRoundNearest)
{ {
return mode == cudaRoundZero ? __uint2float_rz(a) : return mode == cudaRoundZero ? __uint2float_rz(a) :
mode == cudaRoundPosInf ? __uint2float_ru(a) : mode == cudaRoundPosInf ? __uint2float_ru(a) :
mode == cudaRoundMinInf ? __uint2float_rd(a) : mode == cudaRoundMinInf ? __uint2float_rd(a) :
__uint2float_rn(a); __uint2float_rn(a);
} }
#elif !defined(__CUDACC__) #elif defined(__CUDABE__)
#include "crt/func_macro.h" /**************************************************************************
*****
*
*
* DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS
*
*
*
***************************************************************************
****/
#include "host_defines.h" static __forceinline__ float __sinf(float a)
#include "math_constants.h" {
return __builtin_sinf(a);
}
#if defined(__CUDABE__) static __forceinline__ float __cosf(float a)
{
return __builtin_cosf(a);
}
#if (__CUDA_ARCH__ < 200) static __forceinline__ float __log2f(float a)
{
return __builtin_log2f(a);
}
__device_func__(float __frcp_rn (float x)) /*******************************************************************************
*
*
* DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITHOUT BUILTIN NVOPENCC OPERATIONS
*
*
*
*******************************************************************************/
static __forceinline__ float __tanf(float a)
{
return __fdividef (__sinf(a), __cosf(a));
}
static __forceinline__ void __sincosf(float a, float *sptr, float *cptr)
{
*sptr = __sinf(a);
*cptr = __cosf(a);
}
static __forceinline__ float __expf(float a)
{
return exp2f(a * CUDART_L2E_F);
}
static __forceinline__ float __exp10f(float a)
{
return exp2f(a * CUDART_L2T_F);
}
static __forceinline__ float __log10f(float a)
{
return CUDART_LG2_F * __log2f(a);
}
static __forceinline__ float __logf(float a)
{
return CUDART_LN2_F * __log2f(a);
}
static __forceinline__ float __powf(float a, float b)
{
return exp2f(b * __log2f(a));
}
static __forceinline__ float fdividef(float a, float b)
{
#if defined(__USE_FAST_MATH__) && !defined(__CUDA_PREC_DIV)
return __fdividef(a, b);
#else /* __USE_FAST_MATH__ && !__CUDA_PREC_DIV */
return a / b;
#endif /* __USE_FAST_MATH__ && !__CUDA_PREC_DIV */
}
#if defined(CUDA_FLOAT_MATH_FUNCTIONS)
static __forceinline__ double fdivide(double a, double b)
{
return (double)fdividef((float)a, (float)b);
}
#endif /* CUDA_FLOAT_MATH_FUNCTIONS */
#if defined(CUDA_DOUBLE_MATH_FUNCTIONS)
static __forceinline__ double fdivide(double a, double b)
{
return a / b;
}
#endif /* CUDA_DOUBLE_MATH_FUNCTIONS */
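An illustrative kernel exercising the derived fast-math intrinsics above; the reduced accuracy relative to sinf/expf is the intended trade-off:

__global__ void fastEval(float *out, const float *in, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        out[i] = __expf(__sinf(in[i])); /* exp2f-based exp of the fast sine */
    }
}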
#if __CUDA_ARCH__ < 200
static __forceinline__ float __frcp_rn (float x)
{ {
unsigned int expo; unsigned int expo;
unsigned f, y; unsigned f, y;
unsigned int argi; unsigned int argi;
float t; float t;
argi = __float_as_int(x); argi = __float_as_int(x);
expo = (argi >> 23); expo = (argi >> 23);
expo = expo & 0xff; expo = expo & 0xff;
f = expo - 1; f = expo - 1;
skipping to change at line 497 skipping to change at line 597
f = (unsigned)(-(int)f); f = (unsigned)(-(int)f);
if (expo < f) { if (expo < f) {
t = __int_as_float(__float_as_int(t)+1); t = __int_as_float(__float_as_int(t)+1);
} }
return t; return t;
} }
} }
return 1.0f / x; return 1.0f / x;
} }
__device_func__(float __frcp_rz (float x)) static __forceinline__ float __frcp_rz (float x)
{ {
unsigned int expo; unsigned int expo;
unsigned f, y; unsigned f, y;
unsigned int argi; unsigned int argi;
float t; float t;
argi = __float_as_int(x); argi = __float_as_int(x);
expo = (argi >> 23); expo = (argi >> 23);
expo = expo & 0xff; expo = expo & 0xff;
f = expo - 1; f = expo - 1;
skipping to change at line 525 skipping to change at line 625
f = __umul24(y, argi); f = __umul24(y, argi);
if ((int)f > 0) { if ((int)f > 0) {
t = __int_as_float(__float_as_int(t)-1); t = __int_as_float(__float_as_int(t)-1);
} }
return t; return t;
} }
} }
return 1.0f / x; return 1.0f / x;
} }
__device_func__(float __frcp_rd (float x)) static __forceinline__ float __frcp_rd (float x)
{ {
unsigned int expo; unsigned int expo;
unsigned f, y; unsigned f, y;
unsigned int argi; unsigned int argi;
float t; float t;
argi = __float_as_int(x); argi = __float_as_int(x);
expo = (argi >> 23); expo = (argi >> 23);
expo = expo & 0xff; expo = expo & 0xff;
f = expo - 1; f = expo - 1;
skipping to change at line 556 skipping to change at line 656
} }
if (((int)f < 0) && (x < 0.0f)) { if (((int)f < 0) && (x < 0.0f)) {
t = __int_as_float(__float_as_int(t)+1); t = __int_as_float(__float_as_int(t)+1);
} }
return t; return t;
} }
} }
return 1.0f / x; return 1.0f / x;
} }
__device_func__(float __frcp_ru (float x)) static __forceinline__ float __frcp_ru (float x)
{ {
unsigned int expo; unsigned int expo;
unsigned f, y; unsigned f, y;
unsigned int argi; unsigned int argi;
float t; float t;
argi = __float_as_int(x); argi = __float_as_int(x);
expo = (argi >> 23); expo = (argi >> 23);
expo = expo & 0xff; expo = expo & 0xff;
f = expo - 1; f = expo - 1;
skipping to change at line 587 skipping to change at line 687
} }
if (((int)f < 0) && (x > 0.0f)) { if (((int)f < 0) && (x > 0.0f)) {
t = __int_as_float(__float_as_int(t)+1); t = __int_as_float(__float_as_int(t)+1);
} }
return t; return t;
} }
} }
return 1.0f / x; return 1.0f / x;
} }
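Since the _rd/_ru variants round toward minus and plus infinity respectively, a pair of calls brackets the exact reciprocal, the usual interval-arithmetic building block; a sketch using the emulations above:

__device__ void rcpInterval(float x, float *lo, float *hi)
{
    *lo = __frcp_rd(x); /* rounds down: lower bound of 1/x */
    *hi = __frcp_ru(x); /* rounds up: upper bound of 1/x */
}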
__device_func__(float __fsqrt_rn (float radicand)) static __forceinline__ float __fsqrt_rn (float radicand)
{ {
unsigned int expo, argi; unsigned int expo, argi;
unsigned int s, f, x; unsigned int s, f, x;
argi = __float_as_int(radicand); argi = __float_as_int(radicand);
expo = argi >> 23; expo = argi >> 23;
expo = expo & 0xff; expo = expo & 0xff;
f = expo - 1; f = expo - 1;
if ((argi <= 0x80000000) && (f <= 0xFD)) { if ((argi <= 0x80000000) && (f <= 0xFD)) {
skipping to change at line 621 skipping to change at line 721
f = x - (2 * argi + 1); f = x - (2 * argi + 1);
if ((int)f < 0) f = (unsigned)(-(int)f); if ((int)f < 0) f = (unsigned)(-(int)f);
if ((int)x < 0) x = (unsigned)(-(int)x); if ((int)x < 0) x = (unsigned)(-(int)x);
if (f < x) argi ++; if (f < x) argi ++;
argi = argi + (((expo + 125) & ~0x1) << 22); argi = argi + (((expo + 125) & ~0x1) << 22);
return __int_as_float(argi); return __int_as_float(argi);
} }
return sqrtf(radicand); return sqrtf(radicand);
} }
__device_func__(float __fsqrt_rz (float radicand)) static __forceinline__ float __fsqrt_rz (float radicand)
{ {
unsigned int expo, argi; unsigned int expo, argi;
unsigned int s, f, x; unsigned int s, f, x;
argi = __float_as_int(radicand); argi = __float_as_int(radicand);
expo = argi >> 23; expo = argi >> 23;
expo = expo & 0xff; expo = expo & 0xff;
f = expo - 1; f = expo - 1;
if ((argi <= 0x80000000) && (f <= 0xFD)) { if ((argi <= 0x80000000) && (f <= 0xFD)) {
skipping to change at line 652 skipping to change at line 752
/* compute truncated result */ /* compute truncated result */
argi = (argi + 4) >> 3; argi = (argi + 4) >> 3;
x = (x << 16) - (argi * argi); x = (x << 16) - (argi * argi);
if ((int)x < 0) argi--; if ((int)x < 0) argi--;
argi = argi + (((expo + 125) & ~0x1) << 22); argi = argi + (((expo + 125) & ~0x1) << 22);
return __int_as_float(argi); return __int_as_float(argi);
} }
return sqrtf(radicand); return sqrtf(radicand);
} }
__device_func__(float __fsqrt_ru (float radicand)) static __forceinline__ float __fsqrt_ru (float radicand)
{ {
unsigned int expo, argi; unsigned int expo, argi;
unsigned int s, f, x; unsigned int s, f, x;
argi = __float_as_int(radicand); argi = __float_as_int(radicand);
expo = argi >> 23; expo = argi >> 23;
expo = expo & 0xff; expo = expo & 0xff;
f = expo - 1; f = expo - 1;
if ((argi <= 0x80000000) && (f <= 0xFD)) { if ((argi <= 0x80000000) && (f <= 0xFD)) {
skipping to change at line 682 skipping to change at line 782
argi = __umulhi(x,argi); argi = __umulhi(x,argi);
argi = (argi + 4) >> 3; argi = (argi + 4) >> 3;
x = (x << 16) - (argi * argi); x = (x << 16) - (argi * argi);
if ((int)x > 0) argi++; if ((int)x > 0) argi++;
argi = argi + (((expo + 125) & ~0x1) << 22); argi = argi + (((expo + 125) & ~0x1) << 22);
return __int_as_float(argi); return __int_as_float(argi);
} }
return sqrtf(radicand); return sqrtf(radicand);
} }
__device_func__(float __fsqrt_rd (float radicand)) static __forceinline__ float __fsqrt_rd (float radicand)
{ {
unsigned int expo, argi; unsigned int expo, argi;
unsigned int s, f, x; unsigned int s, f, x;
argi = __float_as_int(radicand); argi = __float_as_int(radicand);
expo = argi >> 23; expo = argi >> 23;
expo = expo & 0xff; expo = expo & 0xff;
f = expo - 1; f = expo - 1;
if ((argi <= 0x80000000) && (f <= 0xFD)) { if ((argi <= 0x80000000) && (f <= 0xFD)) {
skipping to change at line 713 skipping to change at line 813
/* compute truncated result */ /* compute truncated result */
argi = (argi + 4) >> 3; argi = (argi + 4) >> 3;
x = (x << 16) - (argi * argi); x = (x << 16) - (argi * argi);
if ((int)x < 0) argi--; if ((int)x < 0) argi--;
argi = argi + (((expo + 125) & ~0x1) << 22); argi = argi + (((expo + 125) & ~0x1) << 22);
return __int_as_float(argi); return __int_as_float(argi);
} }
return sqrtf(radicand); return sqrtf(radicand);
} }
__device_func__(float __fdiv_rn (float dividend, float divisor)) static __forceinline__ float __fdiv_rn (float dividend, float divisor)
{ {
unsigned long long prod; unsigned long long prod;
unsigned r, f, x, y, expox, expoy, sign; unsigned r, f, x, y, expox, expoy, sign;
unsigned expo_res; unsigned expo_res;
unsigned resi, cvtxi, cvtyi; unsigned resi, cvtxi, cvtyi;
float t; float t;
cvtxi = __float_as_int(dividend); cvtxi = __float_as_int(dividend);
cvtyi = __float_as_int(divisor); cvtyi = __float_as_int(divisor);
expox = (cvtxi >> 23) & 0xff; expox = (cvtxi >> 23) & 0xff;
skipping to change at line 776 skipping to change at line 876
prod = ((unsigned long long)y) * r; prod = ((unsigned long long)y) * r;
x = x << (23 + ((prod >> 32) >> 15)); x = x << (23 + ((prod >> 32) >> 15));
rem1 = x - (unsigned)(prod & 0xffffffff); rem1 = x - (unsigned)(prod & 0xffffffff);
rem0 = rem1 - y; rem0 = rem1 - y;
inc = abs(rem0) < abs(rem1); inc = abs(rem0) < abs(rem1);
resi = ((expo_res << 23) + r + inc); resi = ((expo_res << 23) + r + inc);
if (resi != 0x00800000) resi = 0; if (resi != 0x00800000) resi = 0;
return __int_as_float(sign | resi); return __int_as_float(sign | resi);
} }
} }
if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { if (fabsf(divisor) > CUDART_TWO_TO_126_F) {
divisor *= 0.25f; divisor *= 0.25f;
dividend *= 0.25f; dividend *= 0.25f;
} }
return __fdividef (dividend, divisor); return __fdividef (dividend, divisor);
} }
__device_func__(float __fdiv_rz (float dividend, float divisor)) static __forceinline__ float __fdiv_rz (float dividend, float divisor)
{ {
unsigned long long prod; unsigned long long prod;
unsigned r, f, x, y, expox, expoy, sign; unsigned r, f, x, y, expox, expoy, sign;
unsigned expo_res; unsigned expo_res;
unsigned resi, cvtxi, cvtyi; unsigned resi, cvtxi, cvtyi;
float t; float t;
cvtxi = __float_as_int(dividend); cvtxi = __float_as_int(dividend);
cvtyi = __float_as_int(divisor); cvtyi = __float_as_int(divisor);
expox = (cvtxi >> 23) & 0xff; expox = (cvtxi >> 23) & 0xff;
skipping to change at line 844 skipping to change at line 944
int rem1; int rem1;
prod = ((unsigned long long)y) * r; prod = ((unsigned long long)y) * r;
x = x << (23 + ((prod >> 32) >> 15)); x = x << (23 + ((prod >> 32) >> 15));
rem1 = x - (unsigned)(prod & 0xffffffff); rem1 = x - (unsigned)(prod & 0xffffffff);
if (rem1 < 0) r--; if (rem1 < 0) r--;
resi = ((expo_res << 23) + r); resi = ((expo_res << 23) + r);
if (resi != 0x00800000) resi = 0; if (resi != 0x00800000) resi = 0;
return __int_as_float(sign | resi); return __int_as_float(sign | resi);
} }
} }
if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { if (fabsf(divisor) > CUDART_TWO_TO_126_F) {
divisor *= 0.25f; divisor *= 0.25f;
dividend *= 0.25f; dividend *= 0.25f;
} }
return __fdividef (dividend, divisor); return __fdividef (dividend, divisor);
} }
__device_func__(float __fdiv_ru (float dividend, float divisor)) static __forceinline__ float __fdiv_ru (float dividend, float divisor)
{ {
unsigned long long prod; unsigned long long prod;
unsigned r, f, x, y, expox, expoy, sign; unsigned r, f, x, y, expox, expoy, sign;
unsigned expo_res; unsigned expo_res;
unsigned resi, cvtxi, cvtyi; unsigned resi, cvtxi, cvtyi;
float t; float t;
cvtxi = __float_as_int(dividend); cvtxi = __float_as_int(dividend);
cvtyi = __float_as_int(divisor); cvtyi = __float_as_int(divisor);
expox = (cvtxi >> 23) & 0xff; expox = (cvtxi >> 23) & 0xff;
skipping to change at line 914 skipping to change at line 1014
prod = ((unsigned long long)y) * r; prod = ((unsigned long long)y) * r;
x = x << (23 + ((prod >> 32) >> 15)); x = x << (23 + ((prod >> 32) >> 15));
rem1 = x - (unsigned)(prod & 0xffffffff); rem1 = x - (unsigned)(prod & 0xffffffff);
if ((rem1 < 0) && (sign)) r--; if ((rem1 < 0) && (sign)) r--;
if ((rem1 > 0) && (!sign)) r++; if ((rem1 > 0) && (!sign)) r++;
resi = ((expo_res << 23) + r); resi = ((expo_res << 23) + r);
if (resi != 0x00800000) resi = 0; if (resi != 0x00800000) resi = 0;
return __int_as_float(sign | resi); return __int_as_float(sign | resi);
} }
} }
if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { if (fabsf(divisor) > CUDART_TWO_TO_126_F) {
divisor *= 0.25f; divisor *= 0.25f;
dividend *= 0.25f; dividend *= 0.25f;
} }
return __fdividef (dividend, divisor); return __fdividef (dividend, divisor);
} }
__device_func__(float __fdiv_rd (float dividend, float divisor)) static __forceinline__ float __fdiv_rd (float dividend, float divisor)
{ {
unsigned long long prod; unsigned long long prod;
unsigned r, f, x, y, expox, expoy, sign; unsigned r, f, x, y, expox, expoy, sign;
unsigned expo_res; unsigned expo_res;
unsigned resi, cvtxi, cvtyi; unsigned resi, cvtxi, cvtyi;
float t; float t;
cvtxi = __float_as_int(dividend); cvtxi = __float_as_int(dividend);
cvtyi = __float_as_int(divisor); cvtyi = __float_as_int(divisor);
expox = (cvtxi >> 23) & 0xff; expox = (cvtxi >> 23) & 0xff;
skipping to change at line 984 skipping to change at line 1084
prod = ((unsigned long long)y) * r; prod = ((unsigned long long)y) * r;
x = x << (23 + ((prod >> 32) >> 15)); x = x << (23 + ((prod >> 32) >> 15));
rem1 = x - (unsigned)(prod & 0xffffffff); rem1 = x - (unsigned)(prod & 0xffffffff);
if ((rem1 < 0) && (!sign)) r--; if ((rem1 < 0) && (!sign)) r--;
if ((rem1 > 0) && (sign)) r++; if ((rem1 > 0) && (sign)) r++;
resi = ((expo_res << 23) + r); resi = ((expo_res << 23) + r);
if (resi != 0x00800000) resi = 0; if (resi != 0x00800000) resi = 0;
return __int_as_float(sign | resi); return __int_as_float(sign | resi);
} }
} }
if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { if (fabsf(divisor) > CUDART_TWO_TO_126_F) {
divisor *= 0.25f; divisor *= 0.25f;
dividend *= 0.25f; dividend *= 0.25f;
} }
return __fdividef (dividend, divisor); return __fdividef (dividend, divisor);
} }
__device_func__(float __fadd_ru (float a, float b)) static __forceinline__ float __fadd_ru (float a, float b)
{ {
unsigned int expo_x, expo_y; unsigned int expo_x, expo_y;
unsigned int xxi, yyi, temp; unsigned int xxi, yyi, temp;
xxi = __float_as_int(a); xxi = __float_as_int(a);
yyi = __float_as_int(b); yyi = __float_as_int(b);
/* make bigger operand the augend */ /* make bigger operand the augend */
expo_y = yyi << 1; expo_y = yyi << 1;
if (expo_y > (xxi << 1)) { if (expo_y > (xxi << 1)) {
skipping to change at line 1094 skipping to change at line 1194
xxi = xxi & ~0xff000000; xxi = xxi & ~0xff000000;
expo_x = (unsigned int)(-((int)expo_x)); expo_x = (unsigned int)(-((int)expo_x));
xxi = (xxi >> expo_x); xxi = (xxi >> expo_x);
if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0; if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0;
return __int_as_float(yyi | xxi); return __int_as_float(yyi | xxi);
} else { } else {
return a + b; return a + b;
} }
} }
__device_func__(float __fadd_rd (float a, float b)) static __forceinline__ float __fadd_rd (float a, float b)
{ {
unsigned int expo_x, expo_y; unsigned int expo_x, expo_y;
unsigned int xxi, yyi, temp; unsigned int xxi, yyi, temp;
xxi = __float_as_int(a); xxi = __float_as_int(a);
yyi = __float_as_int(b); yyi = __float_as_int(b);
/* make bigger operand the augend */ /* make bigger operand the augend */
expo_y = yyi << 1; expo_y = yyi << 1;
if (expo_y > (xxi << 1)) { if (expo_y > (xxi << 1)) {
skipping to change at line 1202 skipping to change at line 1302
if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0; if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0;
return __int_as_float(yyi | xxi); return __int_as_float(yyi | xxi);
} else { } else {
a = a + b; a = a + b;
xxi = xxi ^ yyi; xxi = xxi ^ yyi;
if ((a == 0.0f) && ((int)xxi < 0)) a = __int_as_float(0x80000000); if ((a == 0.0f) && ((int)xxi < 0)) a = __int_as_float(0x80000000);
return a; return a;
} }
} }
__device_func__(float __fmul_ru (float a, float b)) static __forceinline__ float __fmul_ru (float a, float b)
{ {
unsigned long long product; unsigned long long product;
unsigned int expo_x, expo_y; unsigned int expo_x, expo_y;
unsigned int xxi, yyi; unsigned int xxi, yyi;
xxi = __float_as_int(a); xxi = __float_as_int(a);
yyi = __float_as_int(b); yyi = __float_as_int(b);
expo_y = 0xFF; expo_y = 0xFF;
expo_x = expo_y & (xxi >> 23); expo_x = expo_y & (xxi >> 23);
skipping to change at line 1260 skipping to change at line 1360
xxi += (yyi && !expo_y); xxi += (yyi && !expo_y);
xxi = (xxi >> expo_x); xxi = (xxi >> expo_x);
if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0; if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0;
return __int_as_float(expo_y | xxi); return __int_as_float(expo_y | xxi);
} }
} else { } else {
return a * b; return a * b;
} }
} }
__device_func__(float __fmul_rd (float a, float b)) static __forceinline__ float __fmul_rd (float a, float b)
{ {
unsigned long long product; unsigned long long product;
unsigned int expo_x, expo_y; unsigned int expo_x, expo_y;
unsigned int xxi, yyi; unsigned int xxi, yyi;
xxi = __float_as_int(a); xxi = __float_as_int(a);
yyi = __float_as_int(b); yyi = __float_as_int(b);
expo_y = 0xFF; expo_y = 0xFF;
expo_x = expo_y & (xxi >> 23); expo_x = expo_y & (xxi >> 23);
skipping to change at line 1318 skipping to change at line 1418
xxi += (yyi && expo_y); xxi += (yyi && expo_y);
xxi = (xxi >> expo_x); xxi = (xxi >> expo_x);
if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0; if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0;
return __int_as_float(expo_y | xxi); return __int_as_float(expo_y | xxi);
} }
} else { } else {
return a * b; return a * b;
} }
} }
__device_func__(float __fmaf_rn (float a, float b, float c)) static __forceinline__ float __fmaf_rn (float a, float b, float c)
{ {
unsigned long long product; unsigned long long product;
unsigned int xx, yy, zz, ww; unsigned int xx, yy, zz, ww;
unsigned int temp, s, u; unsigned int temp, s, u;
unsigned int expo_x, expo_y, expo_z; unsigned int expo_x, expo_y, expo_z;
xx = __float_as_int(a); xx = __float_as_int(a);
yy = __float_as_int(b); yy = __float_as_int(b);
zz = __float_as_int(c); zz = __float_as_int(c);
skipping to change at line 1593 skipping to change at line 1693
xx += (temp >= 0x80000000); xx += (temp >= 0x80000000);
if (xx >= 0x01000000) { if (xx >= 0x01000000) {
xx = xx >> 1; xx = xx >> 1;
expo_x--; expo_x--;
} }
if (expo_x > 0) xx = 0; if (expo_x > 0) xx = 0;
xx = expo_y | xx; xx = expo_y | xx;
return __int_as_float(xx); return __int_as_float(xx);
} }
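Because __fmaf_rn rounds only once, it can recover the exact rounding error of a product, the classic two-product transformation; a sketch:

__device__ void twoProd(float a, float b, float *p, float *err)
{
    *p   = a * b;                /* rounded product */
    *err = __fmaf_rn(a, b, -*p); /* exact residual a*b - *p */
}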
__device_func__(float __fmaf_rz (float a, float b, float c)) static __forceinline__ float __fmaf_rz (float a, float b, float c)
{ {
unsigned long long product; unsigned long long product;
unsigned int xx, yy, zz, ww; unsigned int xx, yy, zz, ww;
unsigned int temp, s, u; unsigned int temp, s, u;
unsigned int expo_x, expo_y, expo_z; unsigned int expo_x, expo_y, expo_z;
xx = __float_as_int(a); xx = __float_as_int(a);
yy = __float_as_int(b); yy = __float_as_int(b);
zz = __float_as_int(c); zz = __float_as_int(c);
skipping to change at line 1857 skipping to change at line 1957
return __int_as_float(xx); return __int_as_float(xx);
} else if ((int)expo_x >= 126) { } else if ((int)expo_x >= 126) {
/* overflow */ /* overflow */
xx = expo_y | 0x7f7fffff; xx = expo_y | 0x7f7fffff;
return __int_as_float(xx); return __int_as_float(xx);
} }
/* subnormal */ /* subnormal */
return __int_as_float(expo_y); return __int_as_float(expo_y);
} }
__device_func__(float __fmaf_ru (float a, float b, float c)) static __forceinline__ float __fmaf_ru (float a, float b, float c)
{ {
unsigned long long product; unsigned long long product;
unsigned int xx, yy, zz, ww; unsigned int xx, yy, zz, ww;
unsigned int temp, s, u; unsigned int temp, s, u;
unsigned int expo_x, expo_y, expo_z; unsigned int expo_x, expo_y, expo_z;
xx = __float_as_int(a); xx = __float_as_int(a);
yy = __float_as_int(b); yy = __float_as_int(b);
zz = __float_as_int(c); zz = __float_as_int(c);
skipping to change at line 2126 skipping to change at line 2226
return __int_as_float(xx); return __int_as_float(xx);
} }
/* subnormal */ /* subnormal */
expo_x = ((unsigned int)-((int)expo_x)); expo_x = ((unsigned int)-((int)expo_x));
xx += (temp && !expo_y); xx += (temp && !expo_y);
xx = (xx >> expo_x); xx = (xx >> expo_x);
if ((expo_x > 25) || (xx != 0x00800000)) xx = 0; if ((expo_x > 25) || (xx != 0x00800000)) xx = 0;
return __int_as_float(expo_y | xx); return __int_as_float(expo_y | xx);
} }
__device_func__(float __fmaf_rd (float a, float b, float c)) static __forceinline__ float __fmaf_rd (float a, float b, float c)
{ {
unsigned long long product; unsigned long long product;
unsigned int xx, yy, zz, ww; unsigned int xx, yy, zz, ww;
unsigned int temp, s, u; unsigned int temp, s, u;
unsigned int expo_x, expo_y, expo_z; unsigned int expo_x, expo_y, expo_z;
xx = __float_as_int(a); xx = __float_as_int(a);
yy = __float_as_int(b); yy = __float_as_int(b);
zz = __float_as_int(c); zz = __float_as_int(c);
skipping to change at line 2395 skipping to change at line 2495
return __int_as_float(xx); return __int_as_float(xx);
} }
/* subnormal */ /* subnormal */
expo_x = ((unsigned int)-((int)expo_x)); expo_x = ((unsigned int)-((int)expo_x));
xx += (temp && expo_y); xx += (temp && expo_y);
xx = (xx >> expo_x); xx = (xx >> expo_x);
if ((expo_x > 25) || (xx != 0x00800000)) xx = 0; if ((expo_x > 25) || (xx != 0x00800000)) xx = 0;
return __int_as_float(expo_y | xx); return __int_as_float(expo_y | xx);
} }
#endif /* __CUDA_ARCH__ < 200 */ static __forceinline__ int __clz(int a)
#else /* defined(__CUDABE__) */
#include "common_types.h"
static __device__ const unsigned char __internal_rcpTab[128] =
{
0xff, 0xfd, 0xfb, 0xf9, 0xf7, 0xf5, 0xf4, 0xf2,
0xf0, 0xee, 0xed, 0xeb, 0xe9, 0xe8, 0xe6, 0xe4,
0xe3, 0xe1, 0xe0, 0xde, 0xdd, 0xdb, 0xda, 0xd8,
0xd7, 0xd5, 0xd4, 0xd3, 0xd1, 0xd0, 0xcf, 0xcd,
0xcc, 0xcb, 0xca, 0xc8, 0xc7, 0xc6, 0xc5, 0xc4,
0xc2, 0xc1, 0xc0, 0xbf, 0xbe, 0xbd, 0xbc, 0xbb,
0xba, 0xb9, 0xb8, 0xb7, 0xb6, 0xb5, 0xb4, 0xb3,
0xb2, 0xb1, 0xb0, 0xaf, 0xae, 0xad, 0xac, 0xab,
0xaa, 0xa9, 0xa8, 0xa8, 0xa7, 0xa6, 0xa5, 0xa4,
0xa3, 0xa3, 0xa2, 0xa1, 0xa0, 0x9f, 0x9f, 0x9e,
0x9d, 0x9c, 0x9c, 0x9b, 0x9a, 0x99, 0x99, 0x98,
0x97, 0x97, 0x96, 0x95, 0x95, 0x94, 0x93, 0x93,
0x92, 0x91, 0x91, 0x90, 0x8f, 0x8f, 0x8e, 0x8e,
0x8d, 0x8c, 0x8c, 0x8b, 0x8b, 0x8a, 0x89, 0x89,
0x88, 0x88, 0x87, 0x87, 0x86, 0x85, 0x85, 0x84,
0x84, 0x83, 0x83, 0x82, 0x82, 0x81, 0x81, 0x80
};
static __device__ const unsigned int __internal_invSqrtCubeTab[96] =
{
0xfa0bf8fe, 0xee6b28fa, 0xe5f024f7, 0xdaf268f3,
0xd2f000f0, 0xc890c0ec, 0xc10378e9, 0xb9a758e6,
0xb4da40e4, 0xadcea0e1, 0xa6f278de, 0xa279c0dc,
0x9beb48d9, 0x97a5c4d7, 0x916340d4, 0x8d4fc8d2,
0x895000d0, 0x8563b8ce, 0x818ac0cc, 0x7dc4e8ca,
0x7a1200c8, 0x7671d8c6, 0x72e440c4, 0x6f6908c2,
0x6db240c1, 0x6a523cbf, 0x670424bd, 0x6563c0bc,
0x623028ba, 0x609ce8b9, 0x5d8364b7, 0x5bfd18b6,
0x58fd40b4, 0x5783a8b3, 0x560e48b2, 0x533000b0,
0x51c70caf, 0x506238ae, 0x4da4c0ac, 0x4c4c10ab,
0x4af768aa, 0x49a6b8a9, 0x485a00a8, 0x471134a7,
0x45cc58a6, 0x434e40a4, 0x4214f8a3, 0x40df88a2,
0x3fade0a1, 0x3e8000a0, 0x3d55dc9f, 0x3c2f789e,
0x3c2f789e, 0x3b0cc49d, 0x39edc09c, 0x38d2609b,
0x37baa89a, 0x36a68899, 0x35960098, 0x34890497,
0x34890497, 0x337f9896, 0x3279ac95, 0x31774094,
0x30784893, 0x30784893, 0x2f7cc892, 0x2e84b091,
0x2d900090, 0x2d900090, 0x2c9eac8f, 0x2bb0b88e,
0x2bb0b88e, 0x2ac6148d, 0x29dec08c, 0x29dec08c,
0x28fab08b, 0x2819e88a, 0x2819e88a, 0x273c5889,
0x273c5889, 0x26620088, 0x258ad487, 0x258ad487,
0x24b6d886, 0x24b6d886, 0x23e5fc85, 0x23184084,
0x23184084, 0x224d9883, 0x224d9883, 0x21860882,
0x21860882, 0x20c18081, 0x20c18081, 0x20000080
};
__device_func__(float __internal_frcp_kernel (float x, enum cudaRoundMode mode))
{
unsigned long long prod;
volatile union __cudart_FloatUintCvt arg;
unsigned int expo;
unsigned int sign;
unsigned f, y;
arg.f = x;
sign = arg.i & 0x80000000;
expo = (arg.i >> 23);
expo = expo & 0xff;
f = expo - 1;
if (f <= 0xFD) {
y = (arg.i << 8);
y = y | 0x80000000;
/* initial approximation */
arg.i = __internal_rcpTab[(y >> 24) - 128];
/* first NR iteration */
f = arg.i * arg.i;
f = f << 16;
prod = ((unsigned long long)y) * f;
arg.i = (arg.i << 24) - (unsigned)(prod >> 32);
/* second NR iteration */
f = arg.i + arg.i;
prod = ((unsigned long long)y) * f;
f = (unsigned)(-(int)(prod >> 32));
prod = ((unsigned long long)arg.i) * f;
y = y >> 8;
/* compute exponent */
expo = (2 * 127) - expo - 2;
arg.i = (unsigned)(prod >> 32);
if (mode == cudaRoundNearest) {
arg.i = arg.i >> 6;
} else {
arg.i = (arg.i + 32) >> 6;
}
if ((int)expo >= 0) {
f = y * arg.i;
arg.i = ((expo << 23) + arg.i) | sign;
} else {
/* result is a denormal */
expo = -(int)expo;
arg.i = arg.i >> expo;
f = y * arg.i;
arg.i = arg.i | sign;
}
if (mode == cudaRoundNearest) {
expo = f + y;
if ((int)f < 0) f = (unsigned)(-(int)f);
if ((int)expo < 0) expo = (unsigned)(-(int)expo);
if (expo < f) arg.i++;
} else if (mode == cudaRoundZero) {
if ((int)f > 0) arg.i = arg.i - 1;
} else if (mode == cudaRoundPosInf) {
if (((int)f > 0) && sign) arg.i = arg.i - 1;
if (((int)f < 0) && !sign) arg.i = arg.i + 1;
} else { /* mode == cudaRoundMinInf */
if (((int)f > 0) && !sign) arg.i = arg.i - 1;
if (((int)f < 0) && sign) arg.i = arg.i + 1;
}
return arg.f;
} else {
/* zero returns infinity. Must handle negative zero as well */
if (!(arg.i << 1)) {
arg.i = 0x7F800000 | arg.i;
return arg.f;
}
/* infinity returns zero of like sign */
if ((arg.i << 1) == 0xff000000) {
arg.i &= 0x80000000;
return arg.f;
}
/* convert SNaNs to QNaNs */
if ((arg.i << 1) > 0xff000000) {
arg.i |= 0x00400000;
return arg.f;
}
/* denormals */
f = 0;
arg.i <<= 8;
do {
f++;
arg.i <<= 1;
} while ((int)arg.i > 0);
arg.i >>= 8;
arg.i |= sign;
arg.f = __internal_frcp_kernel (arg.f, mode);
expo = ((arg.i << 1) >> 24);
if ((expo + f) < 255) {
arg.i = (arg.i + (f << 23));
return arg.f;
}
if (mode == cudaRoundNearest) {
arg.i = (arg.i & 0x80000000) | 0x7f800000;
} else if (mode == cudaRoundZero) {
arg.i = (arg.i & 0x80000000) | 0x7f7fffff;
} else if (mode == cudaRoundPosInf) {
arg.i = (arg.i & 0x80000000) | ((sign) ? 0x7f7fffff : 0x7f800000);
} else { /* mode == cudaRoundMinInf */
arg.i = (arg.i & 0x80000000) | ((sign) ? 0x7f800000 : 0x7f7fffff);
}
return arg.f;
}
}
__device_func__(float __internal_fsqrt_kernel (float radicand,
enum cudaRoundMode mode))
{
unsigned long long prod;
volatile union __cudart_FloatUintCvt arg;
unsigned int expo;
unsigned int s, f, x;
arg.f = radicand;
expo = arg.i >> 23;
expo = expo & 0xff;
f = expo - 1;
if ((arg.i <= 0x80000000) && (f <= 0xFD)) {
/* normalize input argument */
x = (arg.i << 8) | 0x80000000;
x = x >> (expo & 1);
/* initial approximation */
arg.i = f = __internal_invSqrtCubeTab[((unsigned)x >> 25) - 32];
/* first NR iteration */
prod = ((unsigned long long)x) * f;
arg.i = ((arg.i * 3) << 22) - (unsigned)(prod >> 32);
/* second NR iteration */
prod = ((unsigned long long)arg.i) * arg.i;
s = (unsigned)(prod >> 32);
prod = ((unsigned long long)x) * s;
f = 0x30000000 - (unsigned)(prod >> 32);
prod = ((unsigned long long)f) * arg.i;
arg.i = (unsigned)(prod >> 32);
/* compute sqrt(x) as x * 1/sqrt(x) */
prod = ((unsigned long long)x) * arg.i;
arg.i = (unsigned)(prod >> 32);
if (mode == cudaRoundNearest) {
arg.i = arg.i >> 3;
} else {
arg.i = (arg.i + 4) >> 3;
}
x = (x << 16) - (arg.i * arg.i);
/* round to nearest based on remainder; tie case impossible */
if (mode == cudaRoundNearest) {
f = x - (2 * arg.i + 1);
if ((int)f < 0) f = (unsigned)(-(int)f);
if ((int)x < 0) x = (unsigned)(-(int)x);
if (f < x) arg.i ++;
} else if ((mode == cudaRoundZero) || (mode == cudaRoundMinInf)) {
if ((int)x < 0) arg.i--;
} else if (mode == cudaRoundPosInf) {
if ((int)x > 0) arg.i++;
}
arg.i = arg.i + (((expo + 125) & ~0x1) << 22);
return arg.f;
} else {
/* if zero, or positive infinity, return argument */
if (!(arg.i << 1) || (arg.i == 0x7F800000)) {
return arg.f;
}
/* if NaN, return argument, possibly converted to QNaN */
if ((arg.i << 1) > 0xFF000000) {
arg.i |= 0x00400000;
return arg.f;
}
/* if negative, return NaN: INDEFINITE */
if (arg.i & 0x80000000) {
arg.i = 0xFFC00000;
return arg.f;
}
/* denormal, normalize it before computing square root */
x = 0;
arg.i <<= 8;
do {
x++;
arg.i <<= 1;
} while ((int)arg.i > 0);
arg.i >>= 8;
arg.i += (x & 1) << 23;
x += (x & 1);
arg.f = __internal_fsqrt_kernel (arg.f, mode);
arg.i -= ((x >> 1) << 23);
return arg.f;
}
}
__device_func__(float __internal_fdiv_kernel (float dividend, float divisor,
                                              enum cudaRoundMode mode))
{
unsigned long long prod;
unsigned r, f, x, y, expox, expoy, sign;
volatile union __cudart_FloatUintCvt cvtx, cvty, res;
cvtx.f = dividend;
cvty.f = divisor;
expox = ((cvtx.i >> 23) & 0xff) - 1;
expoy = ((cvty.i >> 23) & 0xff) - 1;
sign = ((cvtx.i ^ cvty.i) & 0x80000000);
if ((expox <= 0xFD) && (expoy <= 0xFD)) {
divide:
expox = expox - expoy + 127 - 1;
expoy = expox;
/* extract mantissas */
y = (cvty.i << 8) | 0x80000000;
x = (cvtx.i & 0x00ffffff) | 0x00800000;
/* initial approximation */
r = __internal_rcpTab[(y >> 24) - 128];
/* first NR iteration */
f = r * r;
prod = ((unsigned long long)y) * (f << 16);
r = (r << 24) - (unsigned)(prod >> 32);
/* second NR iteration */
prod = ((unsigned long long)y) * (r << 1);
f = (unsigned)-(int)(prod >> 32);
prod = ((unsigned long long)f) * (r << 1);
r = (unsigned)(prod >> 32);
/* produce quotient */
prod = ((unsigned long long)x) * (r << 1);
/* normalize mantissa */
if (((int)((prod >> 32) << 8)) > 0) {
expox--;
prod = prod + prod;
}
if (mode == cudaRoundNearest) {
/* preliminary mantissa */
r = (unsigned)(prod >> 32);
y = y >> 8;
/* result is a normal */
if (expox <= 0xFD) {
int rem0, rem1, inc;
/* round mantissa to nearest even */
prod = ((unsigned long long)y) * r;
x = x << (23 + ((prod >> 32) >> 15));
rem1 = x - (unsigned)(prod & 0xffffffff);
rem0 = rem1 - y;
inc = abs(rem0) < abs(rem1);
/* merge sign, mantissa, exponent for final result */
res.i = sign | ((expox << 23) + r + inc);
return res.f;
} else if ((int)expox >= 254) {
/* overflow: return infinity */
res.i = sign | 0x7f800000;
return res.f;
} else {
/* underflow: result is zero, denormal, or smallest normal */
int shift = -(int)expox;
if (shift > 23) {
/* result is zero or smallest denormal */
r = (shift < 25) && ((x != y) || (r > 0x00ff0000));
res.i = sign | r;
return res.f;
}
if (x == y) {
/* result is denormal */
shift = -(int)expoy;
r = 0x00800000 >> shift;
res.i = sign | r;
return res.f;
}
{
unsigned long long tempx;
long long remlo, remhi;
/* result is denormal or smallest normal */
r = r >> shift;
prod = ((unsigned long long)y) * r;
tempx = ((unsigned long long)x) << (23 - shift);
remlo = 2 * tempx - 2 * prod - y;
remhi = remlo + 2 * tempx;
if (remlo < 0) remlo = -remlo;
if (remhi < 0) remhi = -remhi;
if (remhi < remlo) tempx = 2 * tempx;
remlo = tempx - prod;
remhi = remlo - y;
if (remlo < 0) remlo = -remlo;
if (remhi < 0) remhi = -remhi;
if ((remhi < remlo) || ((remhi == remlo) && (r & 1))) r++;
res.i = sign | r;
return res.f;
}
}
} else if (mode == cudaRoundZero) {
/* preliminary mantissa */
prod += 0x0000000080000000ULL;
r = (unsigned)(prod >> 32);
y = y >> 8;
/* result is a normal */
if (expox <= 0xFD) {
int rem1;
prod = ((unsigned long long)y) * r;
x = x << (23 + ((prod >> 32) >> 15));
rem1 = x - (unsigned)(prod & 0xffffffff);
if (rem1 < 0) r--;
r = (expox << 23) + r;
if (r == 0x7f800000) r = 0x7f7fffff;
res.i = sign | r;
return res.f;
} else if ((int)expox >= 254) {
/* overflow: return largest normal */
res.i = sign | 0x7f7fffff;
return res.f;
} else {
/* underflow: result is zero, denormal, or smallest normal */
int shift = -(int)expox;
if ((x == y) && (shift < 31)) {
shift = -(int)expoy;
r = 0x00800000 >> shift;
res.i = sign | r;
return res.f;
}
if (shift > 23) {
r = 0;
res.i = sign | r;
return res.f;
}
{
unsigned long long tempx;
long long remlo, remhi;
/* result is denormal or smallest normal */
r = r >> shift;
prod = ((unsigned long long)y) * r;
tempx = ((unsigned long long)x) << (23 - shift);
remlo = 2 * tempx - 2 * prod - y;
remhi = remlo + 2 * tempx;
if (remlo < 0) remlo = -remlo;
if (remhi < 0) remhi = -remhi;
if (remhi < remlo) tempx = 2 * tempx;
remlo = tempx - prod;
if ((remlo < 0) & (r != 0)) r--;
res.i = sign | r;
return res.f;
}
}
} else if (mode == cudaRoundPosInf) {
/* preliminary mantissa */
prod += 0x0000000080000000ULL;
r = (unsigned)(prod >> 32);
y = y >> 8;
/* result is a normal */
if (expox <= 0xFD) {
int rem1;
prod = ((unsigned long long)y) * r;
x = x << (23 + ((prod >> 32) >> 15));
rem1 = x - (unsigned)(prod & 0xffffffff);
if ((rem1 < 0) && (sign)) r--;
if ((rem1 > 0) && (!sign)) r++;
r = (expox << 23) + r;
if ((r == 0x7f800000) && (sign)) r = 0x7f7fffff;
res.i = sign | r;
return res.f;
} else if ((int)expox >= 254) {
/* overflow: return largest normal, or infinity */
r = sign ? 0x7f7fffff : 0x7f800000;
res.i = sign | r;
return res.f;
} else {
/* underflow: result is zero, denormal, or smallest normal */
int shift = -(int)expox;
if ((x == y) && (shift <= 24)) {
shift = -(int)expoy;
r = 0x00800000 >> shift;
if (r == 0) r = !sign;
res.i = sign | r;
return res.f;
}
if (shift > 23) {
r = !sign;
res.i = sign | r;
return res.f;
}
{
unsigned long long tempx;
long long remlo, remhi;
/* result is denormal or smallest normal */
r = r >> shift;
prod = ((unsigned long long)y) * r;
tempx = ((unsigned long long)x) << (23 - shift);
remlo = 2 * tempx - 2 * prod - y;
remhi = remlo + 2 * tempx;
if (remlo < 0) remlo = -remlo;
if (remhi < 0) remhi = -remhi;
if (remhi < remlo) tempx = 2 * tempx;
remlo = tempx - prod;
if ((remlo < 0) && (r != 0) && (sign)) r--;
if ((remlo > 0) && (!sign)) r++;
res.i = sign | r;
return res.f;
}
}
} else if (mode == cudaRoundMinInf) {
/* preliminary mantissa */
prod += 0x0000000080000000ULL;
r = (unsigned)(prod >> 32);
y = y >> 8;
/* result is a normal */
if (expox <= 0xFD) {
int rem1;
prod = ((unsigned long long)y) * r;
x = x << (23 + ((prod >> 32) >> 15));
rem1 = x - (unsigned)(prod & 0xffffffff);
if ((rem1 < 0) && (!sign)) r--;
if ((rem1 > 0) && (sign)) r++;
r = (expox << 23) + r;
if ((r == 0x7f800000) && (!sign)) r = 0x7f7fffff;
res.i = sign | r;
return res.f;
} else if ((int)expox >= 254) {
/* overflow: return largest normal, or infinity */
r = sign ? 0x7f800000 : 0x7f7fffff;
res.i = sign | r;
return res.f;
} else {
/* underflow: result is zero, denormal, or smallest normal */
int shift = -(int)expox;
if ((x == y) && (shift <= 24)) {
shift = -(int)expoy;
r = 0x00800000 >> shift;
if (r == 0) r = !!sign;
res.i = sign | r;
return res.f;
}
if (shift > 23) {
r = !!sign;
res.i = sign | r;
return res.f;
}
{
unsigned long long tempx;
long long remlo, remhi;
/* result is denormal or smallest normal */
r = r >> shift;
prod = ((unsigned long long)y) * r;
tempx = ((unsigned long long)x) << (23 - shift);
remlo = 2 * tempx - 2 * prod - y;
remhi = remlo + 2 * tempx;
if (remlo < 0) remlo = -remlo;
if (remhi < 0) remhi = -remhi;
if (remhi < remlo) tempx = 2 * tempx;
remlo = tempx - prod;
if ((remlo < 0) && (r != 0) && (!sign)) r--;
if ((remlo > 0) && (sign)) r++;
res.i = sign | r;
return res.f;
}
}
}
}
{
int xzero, yzero, xinf, yinf, xnan, ynan;
xnan = (cvtx.i << 1) > 0xff000000;
ynan = (cvty.i << 1) > 0xff000000;
/* handle NaNs. Convert SNaNs to QNaNs */
if (xnan) {
res.i = cvtx.i | 0x00400000;
return res.f;
}
if (ynan) {
res.i = cvty.i | 0x00400000;
return res.f;
}
xzero = (cvtx.i << 1) == 0x00000000;
yzero = (cvty.i << 1) == 0x00000000;
xinf = (cvtx.i << 1) == 0xff000000;
yinf = (cvty.i << 1) == 0xff000000;
/* 0/0 and INF/INF are invalid operations. Return INDEFINITE */
if ((xzero & yzero) | (xinf & yinf)) {
res.i = 0xffc00000;
return res.f;
}
/* x/INF and 0/y -> 0 */
if (xzero | yinf) {
res.i = sign;
return res.f;
}
/* x/0 and INF/y -> INF */
if (yzero | xinf) {
res.i = sign | 0x7f800000;
return res.f;
}
/* normalize denormals */
if ((int)expox < 0) {
cvtx.i = cvtx.i << 9;
while ((int)cvtx.i >= 0) {
expox--;
cvtx.i = cvtx.i + cvtx.i;
}
cvtx.i = cvtx.i >> 8;
}
if ((int)expoy < 0) {
cvty.i = cvty.i << 9;
while ((int)cvty.i >= 0) {
expoy--;
cvty.i = cvty.i + cvty.i;
}
cvty.i = cvty.i >> 8;
}
goto divide;
}
}
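/*
 * Aside (not part of the original header): the "initial approximation" /
 * "NR iteration" steps above refine a table-based reciprocal estimate
 * r ~= 1/y with the recurrence r' = r * (2 - y*r), then form the quotient
 * as x * (1/y). A floating-point sketch of that recurrence (illustration
 * only):
 */
#if 0
#include <stdio.h>
int main(void)
{
  double y = 3.0;
  double r = 0.3;               /* crude seed for 1/3                     */
  r = r * (2.0 - y * r);        /* relative error squares: 1e-1 -> 1e-2   */
  r = r * (2.0 - y * r);        /* and again: 1e-2 -> 1e-4                */
  printf("1/3 ~= %.9f\n", r);
  return 0;
}
#endif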
__device_func__(float __internal_fmul_kernel (float a, float b,
enum cudaRoundMode mode))
{
unsigned long long product;
volatile union __cudart_FloatUintCvt xx, yy;
unsigned expo_x, expo_y;
xx.f = a;
yy.f = b;
expo_y = 0xFF;
expo_x = expo_y & (xx.i >> 23);
expo_x = expo_x - 1;
expo_y = expo_y & (yy.i >> 23);
expo_y = expo_y - 1;
if ((expo_x <= 0xFD) &&
(expo_y <= 0xFD)) {
multiply:
expo_x = expo_x + expo_y;
expo_y = xx.i ^ yy.i;
xx.i = xx.i & 0x00ffffff;
yy.i = yy.i << 8;
xx.i = xx.i | 0x00800000;
yy.i = yy.i | 0x80000000;
/* compute product */
product = ((unsigned long long)xx.i) * yy.i;
expo_x = expo_x - 127 + 2;
expo_y = expo_y & 0x80000000;
xx.i = (unsigned int)(product >> 32);
yy.i = (unsigned int)(product & 0xffffffff);
/* normalize mantissa */
if (xx.i < 0x00800000) {
xx.i = (xx.i << 1) | (yy.i >> 31);
yy.i = (yy.i << 1);
expo_x--;
}
if (expo_x <= 0xFD) {
xx.i = xx.i | expo_y; /* OR in sign bit */
xx.i = xx.i + (expo_x << 23); /* add in exponent */
/* round result to nearest or even */
if (mode == cudaRoundNearest) {
if (yy.i < 0x80000000) return xx.f;
xx.i += ((yy.i == 0x80000000) ? (xx.i & 1) : (yy.i >> 31));
} else if (mode == cudaRoundZero) {
} else if (mode == cudaRoundPosInf) {
xx.i += (yy.i && !expo_y);
} else if (mode == cudaRoundMinInf) {
xx.i += (yy.i && expo_y);
}
return xx.f;
} else if ((int)expo_x >= 254) {
/* overflow: return infinity or largest normal */
if (mode == cudaRoundNearest) {
xx.i = expo_y | 0x7F800000;
} else if (mode == cudaRoundZero) {
xx.i = expo_y | 0x7F7FFFFF;
} else if (mode == cudaRoundPosInf) {
xx.i = (expo_y ? 0xff7fffff : 0x7F800000);
} else { /* (mode == cudaRoundMinInf) */
xx.i = (expo_y ? 0xFF800000 : 0x7f7fffff);
}
return xx.f;
} else {
/* zero, denormal, or smallest normal */
expo_x = ((unsigned int)-((int)expo_x));
if (mode == cudaRoundNearest) {
if (expo_x > 25) {
/* massive underflow: return 0 */
xx.i = expo_y;
return xx.f;
} else {
yy.i = (xx.i << (32 - expo_x)) | ((yy.i) ? 1 : 0);
xx.i = expo_y + (xx.i >> expo_x);
xx.i += ((yy.i == 0x80000000) ? (xx.i & 1) : (yy.i >> 31));
return xx.f;
}
} else if (mode == cudaRoundZero) {
if (expo_x > 25) expo_x = 25;
xx.i = expo_y + (xx.i >> expo_x);
return xx.f;
} else if (mode == cudaRoundPosInf) {
if (expo_x > 25) expo_x = 25;
yy.i = (xx.i << (32 - expo_x)) | ((yy.i) ? 1 : 0);
xx.i = expo_y + (xx.i >> expo_x);
xx.i += (yy.i && !expo_y);
return xx.f;
} else { /* (mode == cudaRoundMinInf) */
if (expo_x > 25) expo_x = 25;
yy.i = (xx.i << (32 - expo_x)) | ((yy.i) ? 1 : 0);
xx.i = expo_y + (xx.i >> expo_x);
xx.i += (yy.i && expo_y);
return xx.f;
}
}
} else {
product = xx.i ^ yy.i;
product = product & 0x80000000;
if (!(xx.i & 0x7fffffff)) {
if (expo_y != 254) {
xx.i = (unsigned int)product;
return xx.f;
}
expo_y = yy.i << 1;
if (expo_y == 0xFF000000) {
xx.i = expo_y | 0x00C00000;
} else {
xx.i = yy.i | 0x00400000;
}
return xx.f;
}
if (!(yy.i & 0x7fffffff)) {
if (expo_x != 254) {
xx.i = (unsigned int)product;
return xx.f;
}
expo_x = xx.i << 1;
if (expo_x == 0xFF000000) {
xx.i = expo_x | 0x00C00000;
} else {
xx.i = xx.i | 0x00400000;
}
return xx.f;
}
if ((expo_y != 254) && (expo_x != 254)) {
expo_y++;
expo_x++;
if (expo_x == 0) {
expo_y |= xx.i & 0x80000000;
/*
* If both operands are denormals, we only need to normalize
* one of them as the result will be either a denormal or zero.
*/
xx.i = xx.i << 8;
while (!(xx.i & 0x80000000)) {
xx.i <<= 1;
expo_x--;
}
xx.i = (xx.i >> 8) | (expo_y & 0x80000000);
expo_y &= ~0x80000000;
expo_y--;
goto multiply;
}
if (expo_y == 0) {
expo_x |= yy.i & 0x80000000;
yy.i = yy.i << 8;
while (!(yy.i & 0x80000000)) {
yy.i <<= 1;
expo_y--;
}
yy.i = (yy.i >> 8) | (expo_x & 0x80000000);
expo_x &= ~0x80000000;
expo_x--;
goto multiply;
}
}
expo_x = xx.i << 1;
expo_y = yy.i << 1;
/* if x is NaN, return x */
if (expo_x > 0xFF000000) {
/* cvt any SNaNs to QNaNs */
xx.i = xx.i | 0x00400000;
return xx.f;
}
/* if y is NaN, return y */
if (expo_y > 0xFF000000) {
/* cvt any SNaNs to QNaNs */
xx.i = yy.i | 0x00400000;
return xx.f;
}
xx.i = (unsigned int)product | 0x7f800000;
return xx.f;
}
}
__device_func__(float __internal_fmaf_kernel (float a, float b, float c,
enum cudaRoundMode mode))
{
unsigned long long product;
unsigned int xx, yy, zz, ww;
unsigned int temp, s, u;
unsigned int expo_x, expo_y, expo_z;
volatile union __cudart_FloatUintCvt cvt;
cvt.f = a;
xx = cvt.i;
cvt.f = b;
yy = cvt.i;
cvt.f = c;
zz = cvt.i;
temp = 0xff;
expo_x = temp & (xx >> 23);
expo_x = expo_x - 1;
expo_y = temp & (yy >> 23);
expo_y = expo_y - 1;
expo_z = temp & (zz >> 23);
expo_z = expo_z - 1;
if (!((expo_x <= 0xFD) &&
(expo_y <= 0xFD) &&
(expo_z <= 0xFD))) {
/* fmad (nan, y, z) --> nan
fmad (x, nan, z) --> nan
fmad (x, y, nan) --> nan
*/
if ((yy << 1) > 0xff000000) {
return b + b;
}
if ((zz << 1) > 0xff000000) {
return c + c;
}
if ((xx << 1) > 0xff000000) {
return a + a;
}
/* fmad (0, inf, z) --> NaN
fmad (inf, 0, z) --> NaN
fmad (-inf,+y,+inf) --> NaN
fmad (+x,-inf,+inf) --> NaN
fmad (+inf,-y,+inf) --> NaN
fmad (-x,+inf,+inf) --> NaN
fmad (-inf,-y,-inf) --> NaN
fmad (-x,-inf,-inf) --> NaN
fmad (+inf,+y,-inf) --> NaN
fmad (+x,+inf,-inf) --> NaN
*/
if ((((xx << 1) == 0) && ((yy << 1) == 0xff000000)) ||
(((yy << 1) == 0) && ((xx << 1) == 0xff000000))) {
cvt.i = 0xffc00000;
return cvt.f;
}
if ((zz << 1) == 0xff000000) {
if (((yy << 1) == 0xff000000) || ((xx << 1) == 0xff000000)) {
if ((int)(xx ^ yy ^ zz) < 0) {
cvt.i = 0xffc00000;
return cvt.f;
}
}
}
/* fmad (inf, y, z) --> inf
fmad (x, inf, z) --> inf
fmad (x, y, inf) --> inf
*/
if ((xx << 1) == 0xff000000) {
xx = xx ^ (yy & 0x80000000);
cvt.i = xx;
return cvt.f;
}
if ((yy << 1) == 0xff000000) {
yy = yy ^ (xx & 0x80000000);
cvt.i = yy;
return cvt.f;
}
if ((zz << 1) == 0xff000000) {
cvt.i = zz;
return cvt.f;
}
/* fmad (+0, -y, -0) --> -0
fmad (-0, +y, -0) --> -0
fmad (+x, -0, -0) --> -0
fmad (-x, +0, -0) --> -0
*/
if (zz == 0x80000000) {
if (((xx << 1) == 0) || ((yy << 1) == 0)) {
if ((int)(xx ^ yy) < 0) {
cvt.i = zz;
return cvt.f;
}
}
}
/* fmad (0, y, 0) --> +0
fmad (x, 0, 0) --> +0
*/
if (((zz << 1) == 0) &&
(((xx << 1) == 0) || ((yy << 1) == 0))) {
if (mode == cudaRoundMinInf) {
zz = 0x80000000 & (xx ^ yy ^ zz);
} else {
zz &= 0x7fffffff;
}
cvt.i = zz;
return cvt.f;
}
/* fmad (0, y, z) --> z
fmad (x, 0, z) --> z
*/
if (((xx << 1) == 0) || ((yy << 1) == 0)) {
cvt.i = zz;
return cvt.f;
}
/* normalize x, if denormal */
if (expo_x == (unsigned)-1) {
temp = xx & 0x80000000;
xx = xx << 8;
while (!(xx & 0x80000000)) {
xx <<= 1;
expo_x--;
}
expo_x++;
xx = (xx >> 8) | temp;
}
/* normalize y, if denormal */
if (expo_y == (unsigned)-1) {
temp = yy & 0x80000000;
yy = yy << 8;
while (!(yy & 0x80000000)) {
yy <<= 1;
expo_y--;
}
expo_y++;
yy = (yy >> 8) | temp;
}
/* normalize z, if denormal */
if ((expo_z == (unsigned)-1) && ((zz << 1) != 0)) {
temp = zz & 0x80000000;
zz = zz << 8;
while (!(zz & 0x80000000)) {
zz <<= 1;
expo_z--;
}
expo_z++;
zz = (zz >> 8) | temp;
}
}
expo_x = expo_x + expo_y;
expo_y = xx ^ yy;
xx = xx & 0x00ffffff;
yy = yy << 8;
xx = xx | 0x00800000;
yy = yy | 0x80000000;
product = ((unsigned long long)xx) * yy;
xx = (unsigned)(product >> 32);
yy = (unsigned)(product & 0xffffffff);
expo_x = expo_x - 127 + 2;
expo_y = expo_y & 0x80000000;
/* normalize mantissa */
if (xx < 0x00800000) {
xx = (xx << 1) | (yy >> 31);
yy = (yy << 1);
expo_x--;
}
temp = 0;
if ((zz << 1) != 0) { /* z is not zero */
s = zz & 0x80000000;
zz &= 0x00ffffff;
zz |= 0x00800000;
ww = 0;
/* compare and swap. put augend into xx:yy */
if ((int)expo_z > (int)expo_x) {
temp = expo_z;
expo_z = expo_x;
expo_x = temp;
temp = zz;
zz = xx;
xx = temp;
temp = ww;
ww = yy;
yy = temp;
temp = expo_y;
expo_y = s;
s = temp;
}
/* augend_sign = expo_y, augend_mant = xx:yy, augend_expo = expo_x */
/* addend_sign = s, addend_mant = zz:ww, addend_expo = expo_z */
expo_z = expo_x - expo_z;
u = expo_y ^ s;
if (expo_z <= 49) {
/* denormalize addend */
temp = 0;
while (expo_z >= 32) {
temp = ww | (temp != 0);
ww = zz;
zz = 0;
expo_z -= 32;
}
if (expo_z) {
temp = ((temp >> expo_z) | (ww << (32 - expo_z)) |
((temp << (32 - expo_z)) != 0));
ww = (ww >> expo_z) | (zz << (32 - expo_z));
zz = (zz >> expo_z);
}
} else {
temp = 1;
ww = 0;
zz = 0;
}
if ((int)u < 0) {
/* signs differ, effective subtraction */
temp = (unsigned)(-(int)temp);
s = (temp != 0);
u = yy - s;
s = u > yy;
yy = u - ww;
s += yy > u;
xx = (xx - zz) - s;
if (!(xx | yy | temp)) {
/* complete cancellation, return 0 */
if (mode == cudaRoundMinInf) {
xx = 0x80000000;
}
cvt.i = xx;
return cvt.f;
}
if ((int)xx < 0) {
/* oops, augend had smaller mantissa. Negate mantissa and flip
   sign of result */
temp = ~temp;
yy = ~yy;
xx = ~xx;
if (++temp == 0) {
if (++yy == 0) {
++xx;
}
}
expo_y ^= 0x80000000;
}
/* normalize mantissa, if necessary */
while (!(xx & 0x00800000)) {
xx = (xx << 1) | (yy >> 31);
yy = (yy << 1);
expo_x--;
}
} else {
/* signs are the same, effective addition */
yy = yy + ww;
s = yy < ww;
xx = xx + zz + s;
if (xx & 0x01000000) {
temp = temp | (yy << 31);
yy = (yy >> 1) | (xx << 31);
xx = ((xx & 0x80000000) | (xx >> 1)) & ~0x40000000;
expo_x++;
}
}
}
temp = yy | (temp != 0);
if (expo_x <= 0xFD) {
/* normal */
xx |= expo_y; /* or in sign bit */
if (mode == cudaRoundNearest) {
s = xx & 1; /* mantissa lsb */
xx += (temp == 0x80000000) ? s : (temp >> 31);
} else if (mode == cudaRoundPosInf) {
xx += temp && !expo_y;
} else if (mode == cudaRoundMinInf) {
xx += temp && expo_y;
}
xx = xx + (expo_x << 23); /* add in exponent */
cvt.i = xx;
return cvt.f;
} else if ((int)expo_x >= 126) {
/* overflow */
if (mode == cudaRoundNearest) {
xx = expo_y | 0x7f800000;
} else if (mode == cudaRoundZero) {
xx = expo_y | 0x7F7FFFFF;
} else if (mode == cudaRoundPosInf) {
xx = expo_y ? 0xFF7FFFFF : 0x7f800000;
} else if (mode == cudaRoundMinInf) {
xx = expo_y ? 0xff800000 : 0x7f7fffff;
}
cvt.i = xx;
return cvt.f;
}
/* subnormal */
expo_x = (unsigned int)(-(int)expo_x);
if (expo_x > 25) {
/* massive underflow: return 0, or smallest denormal */
xx = 0;
if (mode == cudaRoundPosInf) {
xx += !expo_y;
} else if (mode == cudaRoundMinInf) {
xx += !!expo_y;
}
cvt.i = expo_y | xx;
return cvt.f;
}
temp = (xx << (32 - expo_x)) | ((temp) ? 1 : 0);
xx = xx >> expo_x;
if (mode == cudaRoundNearest) {
xx = xx + ((temp == 0x80000000) ? (xx & 1) : (temp >> 31));
} else if (mode == cudaRoundPosInf) {
xx = xx + (!expo_y && temp);
} else if (mode == cudaRoundMinInf) {
xx = xx + (expo_y && temp);
}
xx = expo_y + xx; /* add in sign bit */
cvt.i = xx;
return cvt.f;
}
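/*
 * Aside (not part of the original header): the kernel above keeps the full
 * 48-bit product in the pair xx:yy and folds the addend into it before a
 * single final rounding; that lone rounding is what separates fmaf(a,b,c)
 * from a multiply followed by an add. A host-side sketch using C99 fmaf
 * (an assumption: requires a genuinely fused fmaf and float evaluation,
 * i.e. FLT_EVAL_METHOD == 0):
 */
#if 0
#include <math.h>
#include <stdio.h>
int main(void)
{
  float a = 1.0f + 0x1.0p-12f;                    /* 1 + 2^-12            */
  float c = -1.0f;
  /* a*a = 1 + 2^-11 + 2^-24; the 2^-24 bit is lost when a*a is rounded */
  printf("mul+add: %g\n", (double)(a * a + c));   /* 2^-11                */
  printf("fmaf   : %g\n", (double)fmaf(a, a, c)); /* 2^-11 + 2^-24        */
  return 0;
}
#endif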
__device_func__(float __internal_fadd_kernel (float a, float b,
enum cudaRoundMode mode))
{
volatile union __cudart_FloatUintCvt xx, yy;
unsigned int expo_x;
unsigned int expo_y;
unsigned int temp;
xx.f = a;
yy.f = b;
/* make bigger operand the augend */
expo_y = yy.i << 1;
if (expo_y > (xx.i << 1)) {
expo_y = xx.i;
xx.i = yy.i;
yy.i = expo_y;
}
temp = 0xff;
expo_x = temp & (xx.i >> 23);
expo_x = expo_x - 1;
expo_y = temp & (yy.i >> 23);
expo_y = expo_y - 1;
if ((expo_x <= 0xFD) &&
(expo_y <= 0xFD)) {
add:
expo_y = expo_x - expo_y;
if (expo_y > 25) {
expo_y = 31;
}
temp = xx.i ^ yy.i;
xx.i = xx.i & ~0x7f000000;
xx.i = xx.i | 0x00800000;
yy.i = yy.i & ~0xff000000;
yy.i = yy.i | 0x00800000;
if ((int)temp < 0) {
/* signs differ, effective subtraction */
temp = 32 - expo_y;
temp = (expo_y) ? (yy.i << temp) : 0;
temp = (unsigned)(-((int)temp));
xx.i = xx.i - (yy.i >> expo_y) - (temp ? 1 : 0);
if (xx.i & 0x00800000) {
if (expo_x <= 0xFD) {
xx.i = xx.i + (expo_x << 23);
if (mode == cudaRoundNearest) {
if (temp < 0x80000000) return xx.f;
xx.i += ((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31));
} else if (mode == cudaRoundZero) {
} else if (mode == cudaRoundPosInf) {
xx.i += (temp && !(xx.i & 0x80000000));
} else if (mode == cudaRoundMinInf) {
xx.i += (temp && (xx.i & 0x80000000));
}
return xx.f;
}
} else {
if ((temp | (xx.i << 1)) == 0) {
/* operands cancelled, resulting in a clean zero */
if (mode == cudaRoundMinInf) {
xx.i = 0x80000000;
} else {
xx.i = 0;
}
return xx.f;
}
/* normalize result */
yy.i = xx.i & 0x80000000;
do {
xx.i = (xx.i << 1) | (temp >> 31);
temp <<= 1;
expo_x--;
} while (!(xx.i & 0x00800000));
xx.i = xx.i | yy.i;
}
} else {
/* signs are the same, effective addition */
temp = 32 - expo_y;
temp = (expo_y) ? (yy.i << temp) : 0;
xx.i = xx.i + (yy.i >> expo_y);
if (!(xx.i & 0x01000000)) {
if (expo_x <= 0xFD) {
xx.i = xx.i + (expo_x << 23);
if (mode == cudaRoundNearest) {
if (temp < 0x80000000) return xx.f;
xx.i += ((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31));
} else if (mode == cudaRoundZero) {
} else if (mode == cudaRoundPosInf) {
xx.i += (temp && !(xx.i & 0x80000000));
} else if (mode == cudaRoundMinInf) {
xx.i += (temp && (xx.i & 0x80000000));
}
return xx.f;
}
} else {
/* normalize result */
temp = (xx.i << 31) | (temp >> 1);
xx.i = ((xx.i & 0x80000000) | (xx.i >> 1)) & ~0x40000000;
expo_x++;
}
}
if (expo_x <= 0xFD) {
xx.i = xx.i + (expo_x << 23);
if (mode == cudaRoundNearest) {
if (temp < 0x80000000) return xx.f;
xx.i += ((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31));
} else if (mode == cudaRoundZero) {
} else if (mode == cudaRoundPosInf) {
xx.i += (temp && !(xx.i & 0x80000000));
} else if (mode == cudaRoundMinInf) {
xx.i += (temp && (xx.i & 0x80000000));
}
return xx.f;
}
if ((int)expo_x >= 254) {
/* overflow: return infinity or largest normal */
temp = xx.i & 0x80000000;
if (mode == cudaRoundNearest) {
xx.i = (temp) | 0x7f800000;
} else if (mode == cudaRoundZero) {
xx.i = (temp) | 0x7f7fffff;
} else if (mode == cudaRoundMinInf) {
xx.i = (temp ? 0xFF800000 : 0x7f7fffff);
} else if (mode == cudaRoundPosInf) {
xx.i = (temp ? 0xff7fffff : 0x7F800000);
}
return xx.f;
}
/* underflow: denormal, or smallest normal */
expo_y = expo_x + 32;
yy.i = xx.i & 0x80000000;
xx.i = xx.i & ~0xff000000;
expo_x = (unsigned)(-((int)expo_x));
temp = xx.i << expo_y | ((temp) ? 1 : 0);
xx.i = yy.i | (xx.i >> expo_x);
if (mode == cudaRoundNearest) {
xx.i += (temp == 0x80000000) ? (xx.i & 1) : (temp >> 31);
} else if (mode == cudaRoundZero) {
} else if (mode == cudaRoundPosInf) {
xx.i += (temp && !yy.i);
} else if (mode == cudaRoundMinInf) {
xx.i += (temp && yy.i);
}
return xx.f;
} else {
/* handle special cases separately */
if (!(yy.i << 1)) {
if (mode == cudaRoundMinInf) {
if (!(xx.i << 1)) {
xx.i = xx.i | yy.i;
}
} else {
if (xx.i == 0x80000000) {
xx.i = yy.i;
}
}
if ((xx.i << 1) > 0xff000000) {
xx.i |= 0x00400000;
}
return xx.f;
}
if ((expo_y != 254) && (expo_x != 254)) {
/* remove sign bits */
if (expo_x == (unsigned int) -1) {
temp = xx.i & 0x80000000;
xx.i = xx.i << 8;
while (!(xx.i & 0x80000000)) {
xx.i <<= 1;
expo_x--;
}
expo_x++;
xx.i = (xx.i >> 8) | temp;
}
if (expo_y == (unsigned int) -1) {
temp = yy.i & 0x80000000;
yy.i = yy.i << 8;
while (!(yy.i & 0x80000000)) {
yy.i <<= 1;
expo_y--;
}
expo_y++;
yy.i = (yy.i >> 8) | temp;
}
goto add;
}
expo_x = xx.i << 1;
expo_y = yy.i << 1;
/* if x is NaN, return x */
if (expo_x > 0xff000000) {
/* cvt any SNaNs to QNaNs */
xx.i = xx.i | 0x00400000;
return xx.f;
}
/* if y is NaN, return y */
if (expo_y > 0xff000000) {
/* cvt any SNaNs to QNaNs */
xx.i = yy.i | 0x00400000;
return xx.f;
}
if ((expo_x == 0xff000000) && (expo_y == 0xff000000)) {
/*
* subtraction of infinities with the same sign, and addition of
* infinities of unlike sign is undefined: return NaN INDEFINITE
*/
expo_x = xx.i ^ yy.i;
xx.i = xx.i | ((expo_x) ? 0xffc00000 : 0);
return xx.f;
}
/* handle infinities */
if (expo_y == 0xff000000) {
xx.i = yy.i;
}
return xx.f;
}
}
__device_func__(float __frcp_rn (float a))
{
return __internal_frcp_kernel (a, cudaRoundNearest);
}
__device_func__(float __frcp_rz (float a))
{
return __internal_frcp_kernel (a, cudaRoundZero);
}
__device_func__(float __frcp_rd (float a))
{
return __internal_frcp_kernel (a, cudaRoundMinInf);
}
__device_func__(float __frcp_ru (float a))
{
return __internal_frcp_kernel (a, cudaRoundPosInf);
}
__device_func__(float __fsqrt_rn (float a))
{
return __internal_fsqrt_kernel (a, cudaRoundNearest);
}
__device_func__(float __fsqrt_rz (float a))
{
return __internal_fsqrt_kernel (a, cudaRoundZero);
}
__device_func__(float __fsqrt_rd (float a))
{
return __internal_fsqrt_kernel (a, cudaRoundMinInf);
}
__device_func__(float __fsqrt_ru (float a))
{
return __internal_fsqrt_kernel (a, cudaRoundPosInf);
}
__device_func__(float __fdiv_rn (float a, float b))
{
return __internal_fdiv_kernel (a, b, cudaRoundNearest);
}
__device_func__(float __fdiv_rz (float a, float b))
{
return __internal_fdiv_kernel (a, b, cudaRoundZero);
}
__device_func__(float __fdiv_rd (float a, float b))
{
return __internal_fdiv_kernel (a, b, cudaRoundMinInf);
}
__device_func__(float __fdiv_ru (float a, float b))
{
return __internal_fdiv_kernel (a, b, cudaRoundPosInf);
}
__device_func__(float __fadd_rd (float a, float b))
{
return __internal_fadd_kernel (a, b, cudaRoundMinInf);
}
__device_func__(float __fadd_ru (float a, float b))
{
return __internal_fadd_kernel (a, b, cudaRoundPosInf);
}
__device_func__(float __fmul_rd (float a, float b))
{
return __internal_fmul_kernel (a, b, cudaRoundMinInf);
}
__device_func__(float __fmul_ru (float a, float b))
{
return __internal_fmul_kernel (a, b, cudaRoundPosInf);
}
__device_func__(float __fmaf_rn (float a, float b, float c))
{
return __internal_fmaf_kernel (a, b, c, cudaRoundNearest);
}
__device_func__(float __fmaf_rz (float a, float b, float c))
{
return __internal_fmaf_kernel (a, b, c, cudaRoundZero);
}
__device_func__(float __fmaf_ru (float a, float b, float c))
{
return __internal_fmaf_kernel (a, b, c, cudaRoundPosInf);
}
__device_func__(float __fmaf_rd (float a, float b, float c))
{
return __internal_fmaf_kernel (a, b, c, cudaRoundMinInf);
}
__device_func__(int __cuda___isnan(double a));
__device_func__(int __cuda___isnanf(float a));
__device_func__(int __double2int_rz(double));
__device_func__(unsigned int __double2uint_rz(double));
__device_func__(long long int __double2ll_rz(double));
__device_func__(unsigned long long int __double2ull_rz(double));
#define __internal_clamp(val, max, min, nan) \
  if (sizeof(val) == sizeof(double) && __cuda___isnan((double)val)) return nan; \
  if (sizeof(val) == sizeof(float) && __cuda___isnanf((float)val)) return nan; \
  if (val >= max) return max; \
  if (val <= min) return min
/*******************************************************************************
*                                                                              *
* HOST IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS          *
*                                                                              *
*******************************************************************************/
__device_func__(int __mulhi(int a, int b))
{
long long int c = (long long int)a * (long long int)b;
return (int)(c >> 32);
}
__device_func__(unsigned int __umulhi(unsigned int a, unsigned int b))
{
  unsigned long long int c = (unsigned long long int)a * (unsigned long long int)b;
return (unsigned int)(c >> 32);
}
__device_func__(unsigned long long int __umul64hi(unsigned long long int a,
unsigned long long int b))
{
unsigned int a_lo = (unsigned int)a;
unsigned long long int a_hi = a >> 32;
unsigned int b_lo = (unsigned int)b;
unsigned long long int b_hi = b >> 32;
unsigned long long int m1 = a_lo * b_hi;
unsigned long long int m2 = a_hi * b_lo;
unsigned int carry;
  carry = (0ULL + __umulhi(a_lo, b_lo) + (unsigned int)m1 + (unsigned int)m2) >> 32;
return a_hi * b_hi + (m1 >> 32) + (m2 >> 32) + carry;
}
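/*
 * Aside (not part of the original header): __umul64hi forms the high half
 * of the 128-bit product from four 32x32-bit partial products; `carry`
 * collects what overflows out of the low 64 bits. A cross-check against
 * GCC/Clang's unsigned __int128 (an assumption; not available everywhere):
 */
#if 0
#include <assert.h>
static unsigned long long ref_umul64hi(unsigned long long a, unsigned long long b)
{
  return (unsigned long long)(((unsigned __int128)a * b) >> 64);
}
/* usage: assert(__umul64hi(x, y) == ref_umul64hi(x, y)); */
#endif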
__device_func__(long long int __mul64hi(long long int a, long long int b))
{
long long int res;
res = __umul64hi(a, b);
if (a < 0LL) res = res - b;
if (b < 0LL) res = res - a;
return res;
}
__device_func__(float __saturatef(float a))
{
if (__cuda___isnanf(a)) return 0.0f; /* update of PTX spec 10/15/2008 */
return a >= 1.0f ? 1.0f : a <= 0.0f ? 0.0f : a;
}
__device_func__(unsigned int __sad(int a, int b, unsigned int c))
{
long long int diff = (long long int)a - (long long int)b;
return (unsigned int)(__cuda_llabs(diff) + (long long int)c);
}
__device_func__(unsigned int __usad(unsigned int a, unsigned int b, unsigned int c))
{
long long int diff = (long long int)a - (long long int)b;
return (unsigned int)(__cuda_llabs(diff) + (long long int)c);
}
__device_func__(int __mul24(int a, int b))
{
a &= 0xffffff;
a = (a & 0x800000) != 0 ? a | ~0xffffff : a;
b &= 0xffffff;
b = (b & 0x800000) != 0 ? b | ~0xffffff : b;
return a * b;
}
__device_func__(unsigned int __umul24(unsigned int a, unsigned int b))
{
a &= 0xffffff;
b &= 0xffffff;
return a * b;
}
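/*
 * Aside (not part of the original header): __mul24 sign-extends bit 23 of
 * each operand before multiplying, while __umul24 just masks. So, for
 * example:
 */
#if 0
/* __mul24(0x00ffffff, 5)  == -5          (0xffffff sign-extends to -1)   */
/* __umul24(0x00ffffff, 5) == 0x04fffffb  (0xffffff * 5, modulo 2^32)     */
#endif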
__device_func__(float __int_as_float(int a))
{
volatile union __cudart_FloatIntCvt u;
u.i = a;
return u.f;
}
__device_func__(int __float_as_int(float a))
{
volatile union __cudart_FloatIntCvt u;
u.f = a;
return u.i;
}
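/*
 * Aside (not part of the original header): __float_as_int/__int_as_float
 * reinterpret the 32-bit pattern without any numeric conversion; this is
 * how the conversion kernels below pick apart the sign, exponent and
 * mantissa fields of a float. For example:
 */
#if 0
/* __float_as_int(1.0f)       == 0x3f800000 (sign 0, exp 127, mant 0)     */
/* __int_as_float(0x7f800000) is +Inf; 0xffc00000 is the INDEFINITE QNaN  */
#endif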
__device_func__(long long int __internal_float2ll_kernel(float a, long long int max, long long int min, long long int nan, enum cudaRoundMode rndMode))
{
unsigned long long int res, t = 0ULL;
int shift;
unsigned int ia;
__internal_clamp(a, max, min, nan);
ia = __float_as_int(a);
shift = 189 - ((ia >> 23) & 0xff);
res = (unsigned long long int)(((ia << 8) | 0x80000000) >> 1) << 32;
if (shift >= 64) {
t = res;
res = 0;
} else if (shift) {
t = res << (64 - shift);
res = res >> shift;
}
if (rndMode == cudaRoundNearest && (long long int)t < 0LL) {
res += t == 0x8000000000000000ULL ? res & 1ULL : 1ULL;
}
else if (rndMode == cudaRoundMinInf && t != 0ULL && ia > 0x80000000) {
res++;
}
else if (rndMode == cudaRoundPosInf && t != 0ULL && (int)ia > 0) {
res++;
}
if ((int)ia < 0) res = (unsigned long long int)-(long long int)res;
return (long long int)res;
}
__device_func__(int __internal_float2int(float a, enum cudaRoundMode rndMode))
{
  return (int)__internal_float2ll_kernel(a, 2147483647LL, -2147483648LL, 0LL, rndMode);
}
__device_func__(int __float2int_rz(float a))
{
return __internal_float2int(a, cudaRoundZero);
}
__device_func__(int __float2int_ru(float a))
{
return __internal_float2int(a, cudaRoundPosInf);
}
__device_func__(int __float2int_rd(float a))
{
return __internal_float2int(a, cudaRoundMinInf);
}
__device_func__(int __float2int_rn(float a))
{
return __internal_float2int(a, cudaRoundNearest);
}
__device_func__(long long int __internal_float2ll(float a, enum cudaRoundMode rndMode))
{
  return __internal_float2ll_kernel(a, 9223372036854775807LL, -9223372036854775807LL -1LL, -9223372036854775807LL -1LL, rndMode);
}
__device_func__(long long int __float2ll_rz(float a))
{
return __internal_float2ll(a, cudaRoundZero);
}
__device_func__(long long int __float2ll_ru(float a))
{
return __internal_float2ll(a, cudaRoundPosInf);
}
__device_func__(long long int __float2ll_rd(float a))
{
return __internal_float2ll(a, cudaRoundMinInf);
}
__device_func__(long long int __float2ll_rn(float a))
{
return __internal_float2ll(a, cudaRoundNearest);
}
__device_func__(unsigned long long int __internal_float2ull_kernel(float a, unsigned long long int max, unsigned long long int nan, enum cudaRoundMode rndMode))
{
unsigned long long int res, t = 0ULL;
int shift;
unsigned int ia;
__internal_clamp(a, max, 0LL, nan);
ia = __float_as_int(a);
shift = 190 - ((ia >> 23) & 0xff);
res = (unsigned long long int)((ia << 8) | 0x80000000) << 32;
if (shift >= 64) {
t = res >> (int)(shift > 64);
res = 0;
} else if (shift) {
t = res << (64 - shift);
res = res >> shift;
}
if (rndMode == cudaRoundNearest && (long long int)t < 0LL) {
res += t == 0x8000000000000000ULL ? res & 1ULL : 1ULL;
}
else if (rndMode == cudaRoundPosInf && t != 0ULL) {
res++;
}
return res;
}
__device_func__(unsigned int __internal_float2uint(float a, enum cudaRoundMode rndMode))
{
  return (unsigned int)__internal_float2ull_kernel(a, 4294967295U, 0U, rndMode);
}
__device_func__(unsigned int __float2uint_rz(float a))
{
return __internal_float2uint(a, cudaRoundZero);
}
__device_func__(unsigned int __float2uint_ru(float a))
{
return __internal_float2uint(a, cudaRoundPosInf);
}
__device_func__(unsigned int __float2uint_rd(float a))
{
return __internal_float2uint(a, cudaRoundMinInf);
}
__device_func__(unsigned int __float2uint_rn(float a))
{
return __internal_float2uint(a, cudaRoundNearest);
}
__device_func__(unsigned long long int __internal_float2ull(float a, enum cudaRoundMode rndMode))
{
  return __internal_float2ull_kernel(a, 18446744073709551615ULL, 9223372036854775808ULL, rndMode);
}
__device_func__(unsigned long long int __float2ull_rz(float a))
{
return __internal_float2ull(a, cudaRoundZero);
}
__device_func__(unsigned long long int __float2ull_ru(float a))
{
return __internal_float2ull(a, cudaRoundPosInf);
}
__device_func__(unsigned long long int __float2ull_rd(float a))
{
return __internal_float2ull(a, cudaRoundMinInf);
}
__device_func__(unsigned long long int __float2ull_rn(float a))
{
return __internal_float2ull(a, cudaRoundNearest);
}
__device_func__(int __internal_normalize64(unsigned long long int *a))
{
int lz = 0;
if ((*a & 0xffffffff00000000ULL) == 0ULL) {
*a <<= 32;
lz += 32;
}
if ((*a & 0xffff000000000000ULL) == 0ULL) {
*a <<= 16;
lz += 16;
}
if ((*a & 0xff00000000000000ULL) == 0ULL) {
*a <<= 8;
lz += 8;
}
if ((*a & 0xf000000000000000ULL) == 0ULL) {
*a <<= 4;
lz += 4;
}
if ((*a & 0xC000000000000000ULL) == 0ULL) {
*a <<= 2;
lz += 2;
}
if ((*a & 0x8000000000000000ULL) == 0ULL) {
*a <<= 1;
lz += 1;
}
return lz;
}
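/*
 * Aside (not part of the original header): __internal_normalize64 returns
 * the number of leading zero bits it shifted out, so for nonzero input it
 * doubles as a 64-bit count-leading-zeros for the int-to-float kernels
 * below. For example:
 */
#if 0
/* unsigned long long int t = 1ULL;
   int lz = __internal_normalize64(&t);
   now lz == 63 and t == 0x8000000000000000ULL */
#endif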
__device_func__(int __internal_normalize(unsigned int *a))
{
unsigned long long int t = (unsigned long long int)*a;
int lz = __internal_normalize64(&t);
*a = (unsigned int)(t >> 32);
return lz - 32;
}
__device_func__(float __internal_int2float_kernel(int a, enum cudaRoundMode rndMode))
{
volatile union __cudart_FloatUintCvt res;
int shift;
unsigned int t;
res.i = a;
if (a == 0) return res.f;
if (a < 0) res.i = (unsigned int)-a;
shift = __internal_normalize((unsigned int*)&res.i);
t = res.i << 24;
res.i = (res.i >> 8);
res.i += (127 + 30 - shift) << 23;
if (a < 0) res.i |= 0x80000000;
if ((rndMode == cudaRoundNearest) && (t >= 0x80000000)) {
res.i += (t == 0x80000000) ? (res.i & 1) : (t >> 31);
}
else if ((rndMode == cudaRoundMinInf) && t && (a < 0)) {
res.i++;
}
else if ((rndMode == cudaRoundPosInf) && t && (a > 0)) {
res.i++;
}
return res.f;
}
__device_func__(float __int2float_rz(int a))
{
return __internal_int2float_kernel(a, cudaRoundZero);
}
__device_func__(float __int2float_ru(int a))
{
return __internal_int2float_kernel(a, cudaRoundPosInf);
}
__device_func__(float __int2float_rd(int a))
{
return __internal_int2float_kernel(a, cudaRoundMinInf);
}
__device_func__(float __int2float_rn(int a))
{
return __internal_int2float_kernel(a, cudaRoundNearest);
}
__device_func__(float __internal_uint2float_kernel(unsigned int a, enum cudaRoundMode rndMode))
{
volatile union __cudart_FloatUintCvt res;
int shift;
unsigned int t;
res.i = a;
if (a == 0) return res.f;
shift = __internal_normalize((unsigned int*)&res.i);
t = res.i << 24;
res.i = (res.i >> 8);
res.i += (127 + 30 - shift) << 23;
if (rndMode == cudaRoundNearest) {
res.i += (t == 0x80000000) ? (res.i & 1) : (t >> 31);
}
else if ((rndMode == cudaRoundPosInf) && t) {
res.i++;
}
return res.f;
}
__device_func__(float __uint2float_rz(unsigned int a))
{
return __internal_uint2float_kernel(a, cudaRoundZero);
}
__device_func__(float __uint2float_ru(unsigned int a))
{
return __internal_uint2float_kernel(a, cudaRoundPosInf);
}
__device_func__(float __uint2float_rd(unsigned int a))
{
return __internal_uint2float_kernel(a, cudaRoundMinInf);
}
__device_func__(float __uint2float_rn(unsigned int a))
{
return __internal_uint2float_kernel(a, cudaRoundNearest);
}
__device_func__(float __internal_ull2float_kernel(unsigned long long int a, enum cudaRoundMode rndMode))
{
unsigned long long int temp;
unsigned int res, t;
int shift;
if (a == 0ULL) return 0.0f;
temp = a;
shift = __internal_normalize64(&temp);
temp = (temp >> 8) | ((temp & 0xffULL) ? 1ULL : 0ULL);
res = (unsigned int)(temp >> 32);
t = (unsigned int)temp;
res += (127 + 62 - shift) << 23; /* add in exponent */
if (rndMode == cudaRoundNearest) {
res += (t == 0x80000000) ? (res & 1) : (t >> 31);
} else if (rndMode == cudaRoundPosInf) {
res += (t != 0);
}
return __int_as_float(res);
}
__device_func__(float __internal_ll2float_kernel(long long int a, enum cudaRoundMode rndMode))
{
unsigned long long int temp;
volatile float res = 0.0f;
if (a < 0LL) {
temp = (~((unsigned long long int)a)) + 1ULL;
if (rndMode == cudaRoundPosInf) {
rndMode = cudaRoundMinInf;
} else if (rndMode == cudaRoundMinInf) {
rndMode = cudaRoundPosInf;
}
} else {
temp = (unsigned long long int)a;
}
res = __internal_ull2float_kernel (temp, rndMode);
if (a < 0LL) {
res = -res;
}
return res;
}
__device_func__(float __ll2float_rn(long long int a))
{
return __internal_ll2float_kernel(a, cudaRoundNearest);
}
__device_func__(float __ll2float_rz(long long int a))
{
return __internal_ll2float_kernel(a, cudaRoundZero);
}
__device_func__(float __ll2float_ru(long long int a))
{
return __internal_ll2float_kernel(a, cudaRoundPosInf);
}
__device_func__(float __ll2float_rd(long long int a))
{
return __internal_ll2float_kernel(a, cudaRoundMinInf);
}
__device_func__(float __ull2float_rn(unsigned long long int a))
{
return __internal_ull2float_kernel(a, cudaRoundNearest);
}
__device_func__(float __ull2float_rz(unsigned long long int a))
{
return __internal_ull2float_kernel(a, cudaRoundZero);
}
__device_func__(float __ull2float_ru(unsigned long long int a))
{
return __internal_ull2float_kernel(a, cudaRoundPosInf);
}
__device_func__(float __ull2float_rd(unsigned long long int a))
{
return __internal_ull2float_kernel(a, cudaRoundMinInf);
}
__device_func__(unsigned short __float2half_rn(float f))
{
unsigned int x = __float_as_int (f);
unsigned int u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1;
unsigned int sign, exponent, mantissa;
/* Get rid of +NaN/-NaN case first. */
if (u > 0x7f800000) {
return 0x7fff;
}
sign = ((x >> 16) & 0x8000);
/* Get rid of +Inf/-Inf, +0/-0. */
if (u > 0x477fefff) {
return sign | 0x7c00;
}
if (u < 0x33000001) {
return sign | 0x0000;
}
exponent = ((u >> 23) & 0xff);
mantissa = (u & 0x7fffff);
if (exponent > 0x70) {
shift = 13;
exponent -= 0x70;
} else {
shift = 0x7e - exponent;
exponent = 0;
mantissa |= 0x800000;
}
lsb = (1 << shift);
lsb_s1 = (lsb >> 1);
lsb_m1 = (lsb - 1);
/* Round to nearest even. */
remainder = (mantissa & lsb_m1);
mantissa >>= shift;
if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) {
++mantissa;
if (!(mantissa & 0x3ff)) {
++exponent;
mantissa = 0;
}
}
return sign | (exponent << 10) | mantissa;
}
__device_func__(float __half2float(unsigned short h))
{
unsigned int sign = ((h >> 15) & 1);
unsigned int exponent = ((h >> 10) & 0x1f);
unsigned int mantissa = ((h & 0x3ff) << 13);
if (exponent == 0x1f) { /* NaN or Inf */
mantissa = (mantissa
? (sign = 0, 0x7fffff)
: 0);
exponent = 0xff;
} else if (!exponent) { /* Denorm or Zero */
if (mantissa) {
unsigned int msb;
exponent = 0x71;
do {
msb = (mantissa & 0x400000);
mantissa <<= 1; /* normalize */
--exponent;
} while (!msb);
mantissa &= 0x7fffff; /* 1.mantissa is implicit */
}
} else {
exponent += 0x70;
}
return __int_as_float ((sign << 31) | (exponent << 23) | mantissa);
}
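/*
 * Aside (not part of the original header): the half format round-tripped
 * above keeps an 11-bit significand, so nearby floats collapse to the same
 * encoding:
 */
#if 0
/* __float2half_rn(1.0f)    == 0x3c00, and __half2float(0x3c00) == 1.0f   */
/* __float2half_rn(1.0002f) == 0x3c00 too: the next half above 1.0 is     */
/* 1.0 + 2^-10 ~= 1.000977, so 1.0002f rounds back down to 1.0            */
#endif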
__device_func__(float __fadd_rz(float a, float b))
{
return __internal_fadd_kernel(a, b, cudaRoundZero);
}
__device_func__(float __fmul_rz(float a, float b))
{
return __internal_fmul_kernel(a, b, cudaRoundZero);
}
__device_func__(float __fadd_rn(float a, float b))
{
return __internal_fadd_kernel(a, b, cudaRoundNearest);
}
__device_func__(float __fmul_rn(float a, float b))
{
return __internal_fmul_kernel(a, b, cudaRoundNearest);
}
__device_func__(void __brkpt(int c))
{
/* TODO */
}
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
extern int CUDARTAPI __cudaSynchronizeThreads(void**, void*);
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#if defined(__GNUC__)
__device_func__(inline __attribute__((always_inline)) void __syncthreads(void))
{
volatile int _ = 0;
L: if (__cudaSynchronizeThreads((void**)&&L, (void*)&_)) goto L;
}
#elif defined(_WIN32)
#define __syncthreads() \
(void)__cudaSynchronizeThreads((void**)0, (void*)0)
#endif /* __GNUC__ */
__device_func__(void __prof_trigger(int a))
{
}
__device_func__(void __threadfence(void))
{
__syncthreads();
}
__device_func__(void __threadfence_block(void))
{
__syncthreads();
}
#if defined(__GNUC__)
__device_func__(void __trap(void))
{
__builtin_trap();
}
#elif defined(_WIN32)
__device_func__(void __trap(void))
{
__debugbreak();
}
#endif /* __GNUC__ */
#endif /* __CUDABE__ */
/*******************************************************************************
*                                                                              *
* DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS        *
*                                                                              *
*******************************************************************************/
#if !defined(__CUDABE__)
__device_func__(float __fdividef(float a, float b))
{
volatile float aa = a;
volatile float bb = b;
/* match range restrictions of the device function */
if (__cuda_fabsf(bb) > CUDART_TWO_TO_126_F) {
if (__cuda_fabsf(aa) <= CUDART_NORM_HUGE_F) {
return ((aa / bb) / CUDART_NORM_HUGE_F) / CUDART_NORM_HUGE_F;
} else {
bb = 1.0f / bb;
bb = bb / CUDART_NORM_HUGE_F;
return aa * bb;
}
} else {
return aa / bb;
}
}
#endif /* !defined(__CUDABE__) */
__device_func__(float __sinf(float a))
{
#if !defined(__CUDABE__)
if ((__float_as_int(a) << 1) == 0xff000000) {
return __fadd_rn (a, -a); /* return NaN */
}
#endif /* !defined(__CUDABE__) */
return sinf(a);
}
__device_func__(float __cosf(float a))
{
#if !defined(__CUDABE__)
if ((__float_as_int(a) << 1) == 0xff000000) {
return __fadd_rn (a, -a); /* return NaN */
}
#endif /* !defined(__CUDABE__) */
return cosf(a);
}
__device_func__(float __log2f(float a))
{
return log2f(a);
}
/*******************************************************************************
*                                                                              *
* SHARED HOST AND DEVICE IMPLEMENTATIONS                                       *
*                                                                              *
*******************************************************************************/
__device_func__(float __tanf(float a))
{
return __fdividef (__sinf(a), __cosf(a));
}
__device_func__(void __sincosf(float a, float *sptr, float *cptr))
{
*sptr = __sinf(a);
*cptr = __cosf(a);
}
__device_func__(float __expf(float a))
{
return __cuda_exp2f(a * CUDART_L2E_F);
}
__device_func__(float __exp10f(float a))
{
return __cuda_exp2f(a * CUDART_L2T_F);
}
__device_func__(float __log10f(float a))
{
return CUDART_LG2_F * __log2f(a);
}
__device_func__(float __logf(float a))
{
return CUDART_LN2_F * __log2f(a);
}
__device_func__(float __powf(float a, float b))
{
return __cuda_exp2f(b * __log2f(a));
}
__device_func__(float fdividef(float a, float b))
{
#if defined(__USE_FAST_MATH__) && !defined(__CUDA_PREC_DIV)
return __fdividef(a, b);
#else /* __USE_FAST_MATH__ && !__CUDA_PREC_DIV */
return a / b;
#endif /* __USE_FAST_MATH__ && !__CUDA_PREC_DIV */
}
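/*
 * Aside (not part of the original header): the two paths above differ for
 * huge divisors. __fdividef mirrors the device fast path, which flushes
 * the result to zero when |b| > 2^126, while the plain `a / b` path keeps
 * full IEEE semantics:
 */
#if 0
/* __fdividef(1.0f, 1.0e38f) -> 0.0f      (|b| > 2^126 fast-path flush)   */
/* 1.0f / 1.0e38f            -> ~1.0e-38f (a denormal, kept by fdividef   */
/*                              when __USE_FAST_MATH__ is not in effect)  */
#endif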
#if !defined(__CUDABE__) || (__CUDA_ARCH__ < 200)
__device_func__(int __clz(int a))
{
  return (a)?(158-(__float_as_int(__uint2float_rz((unsigned int)a))>>23)):32;
}
-__device_func__(int __clzll(long long int a))
+static __forceinline__ int __clzll(long long int a)
{
  int ahi = ((int)((unsigned long long)a >> 32));
  int alo = ((int)((unsigned long long)a & 0xffffffffULL));
  int res;
  if (ahi) {
    res = 0;
  } else {
    res = 32;
    ahi = alo;
  }
  res = res + __clz(ahi);
  return res;
}
-__device_func__(int __popc(unsigned int a))
+static __forceinline__ int __popc(unsigned int a)
{
  a = a - ((a >> 1) & 0x55555555);
  a = (a & 0x33333333) + ((a >> 2) & 0x33333333);
  a = (a + (a >> 4)) & 0x0f0f0f0f;
  a = ((__umul24(a, 0x808080) << 1) + a) >> 24;
  return a;
}
-__device_func__(int __popcll(unsigned long long int a))
+static __forceinline__ int __popcll(unsigned long long int a)
{
  unsigned int ahi = ((unsigned int)(a >> 32));
  unsigned int alo = ((unsigned int)(a & 0xffffffffULL));
  alo = alo - ((alo >> 1) & 0x55555555);
  alo = (alo & 0x33333333) + ((alo >> 2) & 0x33333333);
  ahi = ahi - ((ahi >> 1) & 0x55555555);
  ahi = (ahi & 0x33333333) + ((ahi >> 2) & 0x33333333);
  alo = alo + ahi;
  alo = (alo & 0x0f0f0f0f) + ((alo >> 4) & 0x0f0f0f0f);
  alo = ((__umul24(alo, 0x808080) << 1) + alo) >> 24;
  return alo;
}
-__device_func__(unsigned int __brev(unsigned int a))
+static __forceinline__ unsigned int __brev(unsigned int a)
{
  /* Use Knuth's algorithm from http://www.hackersdelight.org/revisions.pdf */
  unsigned int t;
  a = (a << 15) | (a >> 17);
  t = (a ^ (a >> 10)) & 0x003f801f;
  a = (t + (t << 10)) ^ a;
  t = (a ^ (a >> 4)) & 0x0e038421;
  a = (t + (t << 4)) ^ a;
  t = (a ^ (a >> 2)) & 0x22488842;
  a = (t + (t << 2)) ^ a;
  return a;
}
-__device_func__(unsigned long long int __brevll(unsigned long long int a))
+static __forceinline__ unsigned long long int __brevll(unsigned long long int a)
{
  unsigned int hi = (unsigned int)(a >> 32);
  unsigned int lo = (unsigned int)(a & 0xffffffffULL);
  unsigned int t;
  t = __brev(lo);
  lo = __brev(hi);
  return ((unsigned long long int)t << 32) + (unsigned long long int)lo;
}
-#endif /* __CUDABE__ || __CUDA_ARCH__ < 200 */
-__device_func__(int __ffs(int a))
-{
-  return 32 - __clz (a & -a);
-}
-__device_func__(int __ffsll(long long int a))
-{
-  return 64 - __clzll (a & -a);
-}
-#if defined(CUDA_DOUBLE_MATH_FUNCTIONS) && defined(CUDA_FLOAT_MATH_FUNCTIONS)
-#error -- conflicting mode for double math routines
-#endif /* CUDA_DOUBLE_MATH_FUNCTIONS && CUDA_FLOAT_MATH_FUNCTIONS */
-#if defined(CUDA_FLOAT_MATH_FUNCTIONS)
-__device_func__(double fdivide(double a, double b))
-{
-  return (double)fdividef((float)a, (float)b);
-}
-#if !defined(__CUDABE__)
-__device_func__(int __double2int_rz(double a))
-{
-  return __float2int_rz((float)a);
-}
-__device_func__(unsigned int __double2uint_rz(double a))
-{
-  return __float2uint_rz((float)a);
-}
-__device_func__(long long int __double2ll_rz(double a))
-{
-  return __float2ll_rz((float)a);
-}
-__device_func__(unsigned long long int __double2ull_rz(double a))
-{
-  return __float2ull_rz((float)a);
-}
-#endif /* !__CUDABE__ */
-#endif /* CUDA_FLOAT_MATH_FUNCTIONS */
-#if defined(CUDA_DOUBLE_MATH_FUNCTIONS)
-__device_func__(double fdivide(double a, double b))
-{
-  return a / b;
-}
-#if !defined(__CUDABE__)
-__device_func__(int __internal_double2int(double a, enum cudaRoundMode rndMode));
-__device_func__(unsigned int __internal_double2uint(double a, enum cudaRoundMode rndMode));
-__device_func__(long long int __internal_double2ll(double a, enum cudaRoundMode rndMode));
-__device_func__(unsigned long long int __internal_double2ull(double a, enum cudaRoundMode rndMode));
-__device_func__(int __double2int_rz(double a))
-{
-  return __internal_double2int(a, cudaRoundZero);
-}
-__device_func__(unsigned int __double2uint_rz(double a))
-{
-  return __internal_double2uint(a, cudaRoundZero);
-}
-__device_func__(long long int __double2ll_rz(double a))
-{
-  return __internal_double2ll(a, cudaRoundZero);
-}
-__device_func__(unsigned long long int __double2ull_rz(double a))
-{
-  return __internal_double2ull(a, cudaRoundZero);
-}
-#endif /* !__CUDABE__ */
-#endif /* CUDA_DOUBLE_MATH_FUNCTIONS */
+static __forceinline__ unsigned int __byte_perm(unsigned int a, unsigned int b, unsigned int slct)
+{
+  unsigned int i0 = (slct >>  0) & 0x7;
+  unsigned int i1 = (slct >>  4) & 0x7;
+  unsigned int i2 = (slct >>  8) & 0x7;
+  unsigned int i3 = (slct >> 12) & 0x7;
+  return (((((i0 < 4) ? (a >> (i0*8)) : (b >> ((i0-4)*8))) & 0xff) <<  0) +
+          ((((i1 < 4) ? (a >> (i1*8)) : (b >> ((i1-4)*8))) & 0xff) <<  8) +
+          ((((i2 < 4) ? (a >> (i2*8)) : (b >> ((i2-4)*8))) & 0xff) << 16) +
+          ((((i3 < 4) ? (a >> (i3*8)) : (b >> ((i3-4)*8))) & 0xff) << 24));
+}
+#endif /* __CUDA_ARCH__ < 200 */
+static __forceinline__ int __ffs(int a)
+{
+  return 32 - __clz(a & -a);
+}
+static __forceinline__ int __ffsll(long long int a)
+{
+  return 64 - __clzll(a & -a);
+}
#endif /* __cplusplus && __CUDACC__ */
/*******************************************************************************
*                                                                              *
*                                                                              *
*                                                                              *
*******************************************************************************/
#include "sm_11_atomic_functions.h"
#include "sm_12_atomic_functions.h"
#include "sm_13_double_functions.h"
#include "sm_20_atomic_functions.h"
#include "sm_20_intrinsics.h"
+#include "surface_functions.h"
#include "texture_fetch_functions.h"
#endif /* !__DEVICE_FUNCTIONS_H__ */
device_runtime.h
/*
- * Copyright 1993-2008 NVIDIA Corporation.  All rights reserved.
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws. Users and possessors of this source code
 * are hereby granted a nonexclusive, royalty-free license to use this code
 * in individual and commercial software.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 36
 * and is provided to the U.S. Government only as a commercial end item.
 * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
 * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
 * source code with only those rights set forth herein.
 *
 * Any use of this source code in individual and commercial software must
 * include, in the user documentation and internal comments to the code,
 * the above Disclaimer and U.S. Government End Users Notice.
 */
-#if !defined(__CUDA_INTERNAL_COMPILATION__)
#define __CUDA_INTERNAL_COMPILATION__
-#endif /* !__CUDA_INTERNAL_COMPILATION__ */
#include "host_defines.h"
#define __no_sc__
-#define __empty_array(s) \
-        s
-#define __unsized_shared_data(name, type_post) \
-        __unsized##name __unsized##type_post
-#define __sized_shared_data(name, type) \
-        __sized##name type
-#define __sized__shared_var(name, s, type) \
-        name
+#define __pad__(f)
/*TEXTURE_TYPE*/
typedef const void *__texture_type__;
/*SURFACE_TYPE*/
typedef const void *__surface_type__;
#if defined(__CUDABE__) /* cudabe compiler */
-#define __pad__(f)
+#if __CUDA_ARCH__ >= 200
+#define ___device__(sc) \
+        sc
+#else /* __CUDA_ARCH__ >= 200 */
+#define ___device__(sc) \
+        static
+#endif /* __CUDA_ARCH__ >= 200 */
#define __text__ \
        __attribute__((__texture__))
#define __surf__ \
        __attribute__((__surface__))
-#define ___device__(sc) \
-        static
-#define __in__(cdecl, decl) \
-        cdecl
-#define __in_type__(cdecl, decl) \
-        cdecl
-#define __texture_var(name) \
-        name
-#define __shared_var(name, s, type) \
-        name
#define __val_param(name) \
        __val_param##name
#define __copy_param(local_decl, param) \
        local_decl = param
-#define __unsized_array_size \
-        []
-#define __unsized__shared_var(name, s, type) \
-        name
-#define __unsized__empty_array(s) \
-        s
#define __var_used__ \
        __attribute__((__used__))
#define __storage_extern_unsized__shared__ \
        extern
-#define __cxa_vec_util(n, num, size, f) \
-        int i; for (i = 0; i < num; i++) f(n + i)
#define __cxa_vec_ctor(n, num, size, c, d) \
-        ({ __cxa_vec_util(n, num, size, c); (void)0; })
+        ({ int i; for (i = 0; i < num; i++) c((void*)n + i*size); (void)0; })
#define __cxa_vec_dtor(n, num, size, d) \
-        { __cxa_vec_util(n, num, size, d); }
+        { int i; for (i = num-1; i >= 0; i--) d((void*)n + i*size); }
#undef __cdecl
#define __cdecl
#undef __w64
#define __w64
#elif defined(__CUDACC__) /* cudafe compiler */
#define __loc_sc__(loc, size, sc) \
        sc loc
-#define __pad__(f)
#define __text__
#define __surf__
#define ___device__(sc) \
        sc __device__
-#define __in__(cdecl, decl) \
-        decl
-#define __in_type__(cdecl, decl) \
-        decl
-#define __texture_var(name) \
-        name
-#define __shared_var(name, s, type) \
-        name
-#define __val_param(name) \
-        name
-#define __copy_param(local_decl, param)
-#define __unsized_array_size \
-        []
-#define __unsized__shared_var(name, s, type) \
-        name
-#define __unsized__empty_array(s) \
-        s
-#else /* host compiler (cl, gcc, open64, ...) */
-#if defined (__MULTI_CORE__) || defined(__multi_core__)
-struct uint3;
-extern struct uint3* CUDARTAPI __cudaGetBlockIdxPtr(void);
-extern void* CUDARTAPI __cudaGetSharedMem(void*);
-extern void* CUDARTAPI __cudaCmcHostMalloc(size_t);
-extern size_t CUDARTAPI __cudaCmcGetStackSize(void);
-#endif /* __MULTI_CORE__ || __multi_core__ */
-#if defined (__multi_core__)
-#if defined(__GNUC__)
-#if defined(__cplusplus)
-extern void *alloca(size_t) throw();
-#else /* __cplusplus */
-extern void *alloca(size_t);
-#endif /* __cplusplus */
-#define __cuda_alloca(s) \
-        alloca(s)
-#else /* __GNUC__ */
-extern void *_alloca(size_t);
-#define __cuda_alloca(s) \
-        _alloca(s)
-#endif /* __GNUC__ */
-/* check if enough stack size remains for alloca to succeed. If so,
-   use faster alloca() to do the allocation. Otherwise, allocate memory
-   using the __cudaCmcHostMalloc() runtime function, which uses the slower
-   malloc path to allocate memory
-*/
-#define __cudaCmcTargAlloc(num_bytes, max_stacksize, ptr_counter_stacksize, ptr_ret_sym) \
-do { \
-  if (*(ptr_counter_stacksize) + (num_bytes) >= (max_stacksize)) { \
-    *(ptr_ret_sym) = __cudaCmcHostMalloc((num_bytes)); \
-  } else { \
-    *(ptr_ret_sym) = __cuda_alloca((num_bytes)); \
-    *(ptr_counter_stacksize) = *(ptr_counter_stacksize) + (num_bytes); \
-  } \
-} while(0)
-#endif /* __multi_core__ */
-#if defined (__MULTI_CORE__)
-#define ___device__(sc) \
-        static
-#define __pad__(f) \
-        f
-#define __text__
-#define __surf__
-#define __cudaGet_blockIdx() \
-        (*__cudaGetBlockIdxPtr())
-#define __shared_var(name, s, type) \
-        (s type __cudaGetSharedMem((void*)(&(name))))
-#define __var_used__ \
-        __attribute__((__used__))
-#define __storage_auto__shared__ \
-        auto
-#undef __cdecl
-#define __cdecl
-#undef __w64
-#define __w64
-#else /* __MULTI_CORE__ */
-#define ___device__(sc) \
-        static __device__
-#define __shared_var(name, s, type) \
-        name
-#if defined(__APPLE__) || defined(__ICC)
-#define __STORAGE__ \
-        __attribute__((__weak__))
-#elif defined(__GNUC__)
-#define __STORAGE__ \
-        __attribute__((__common__))
-#elif defined(__cplusplus)
-#define __STORAGE__ \
-        __declspec(selectany)
-#else /* __APPLE__ || __ICC */
-#define __STORAGE__
-#endif /* __APPLE__ || __ICC */
-#endif /* __MULTI_CORE__ */
-#define __in__(cdecl, decl) \
-        decl
-#define __in_type__(cdecl, decl) \
-        decl
-#define __texture_var(name) \
-        __texture_##name
#define __val_param(name) \
        name
#define __copy_param(local_decl, param)
-#define __unsized_array_size
-#define __unsized__shared_var(name, s, type) \
-        (*name)
-#define __unsized__empty_array(s)
-#define __cxa_vec_ctor(n, num, size, c, d) \
-        __cxa_vec_util((void*)n, num, size, (void (*)(void*))c)
-#define __cxa_vec_dtor(n, num, size, d) \
-        __cxa_vec_util((void*)n, num, size, (void (*)(void*))d)
-static void __cxa_vec_util(void *n, size_t num, size_t size, void (*f)(void*))
-{
-  size_t i;
-  for (i = 0; i < num; i++) {
-    f((void*)((char*)n + i * size));
-  }
-}
-/* this is compiled with a host compiler for device emulation */
-#define __device_emulation
-#if defined(__cplusplus)
-#undef __VECTOR_TYPES_H__
-#if defined(_WIN32)
-#pragma warning(disable: 4190 4522)
-#endif /* _WIN32 */
-#endif /* __cplusplus */
#endif /* __CUDABE__ */
-#if defined(__cplusplus)
-static void *__cuda_memcpy(void*, const void*, size_t);
-/* for C++ compilation of lowered C++ (i.e. C) code */
-#define __cuda_assign_operators(tag) \
-        tag& operator=(         tag& a)          { __cuda_memcpy(this, &a, sizeof(tag)); return *this;} \
-        tag& operator=(volatile tag& a) volatile { return *(tag*)this = (tag&)a; } \
-        tag& operator=(   const tag& a)          { return *(tag*)this = (tag&)a; } \
-#endif /* __cplusplus */
#include "builtin_types.h"
#include "device_launch_parameters.h"
#include "storage_class.h"
driver_functions.h
skipping to change at line 45
#if !defined(__DRIVER_FUNCTIONS_H__)
#define __DRIVER_FUNCTIONS_H__

/*******************************************************************************
*                                                                              *
*                                                                              *
*                                                                              *
*******************************************************************************/

#include "builtin_types.h"
#include "host_defines.h"
#include "driver_types.h"

/*******************************************************************************
*                                                                              *
*                                                                              *
*                                                                              *
*******************************************************************************/

static __inline__ __host__ struct cudaPitchedPtr make_cudaPitchedPtr(void *d, size_t p, size_t xsz, size_t ysz)
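
A short usage sketch (not part of the header): wrapping an existing device allocation in the descriptor, where devPtr and pitch are assumed to come from a prior cudaMallocPitch() call.

static struct cudaPitchedPtr wrap_pitched(void *devPtr, size_t pitch,
                                          size_t widthBytes, size_t height)
{
    /* d = base pointer, p = row pitch in bytes,
       xsz/ysz = logical width and height of the allocation */
    return make_cudaPitchedPtr(devPtr, pitch, widthBytes, height);
}
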
driver_types.h
skipping to change at line 79
#define cudaEventBlockingSync        1    ///< Event uses blocking synchronization
#define cudaDeviceScheduleAuto       0    ///< Device flag - Automatic scheduling
#define cudaDeviceScheduleSpin       1    ///< Device flag - Spin default scheduling
#define cudaDeviceScheduleYield      2    ///< Device flag - Yield default scheduling
#define cudaDeviceBlockingSync       4    ///< Device flag - Use blocking synchronization
#define cudaDeviceMapHost            8    ///< Device flag - Support mapped pinned allocations
#define cudaDeviceLmemResizeToMax    16   ///< Device flag - Keep local memory allocation after launch
#define cudaDeviceMask               0x1f ///< Device flags mask
#define cudaArraySurfaceLoadStore    0x02 ///< Must be set in cudaMallocArray in order to bind surfaces to the CUDA array

#endif /* !__CUDA_INTERNAL_COMPILATION__ */
/*******************************************************************************
*                                                                              *
*                                                                              *
*                                                                              *
*******************************************************************************/

/**
 * CUDA error types

skipping to change at line 118
    cudaErrorInvalidSymbol            =     13,   ///< Invalid symbol
    cudaErrorMapBufferObjectFailed    =     14,   ///< Map buffer object failed
    cudaErrorUnmapBufferObjectFailed  =     15,   ///< Unmap buffer object failed
    cudaErrorInvalidHostPointer       =     16,   ///< Invalid host pointer
    cudaErrorInvalidDevicePointer     =     17,   ///< Invalid device pointer
    cudaErrorInvalidTexture           =     18,   ///< Invalid texture
    cudaErrorInvalidTextureBinding    =     19,   ///< Invalid texture binding
    cudaErrorInvalidChannelDescriptor =     20,   ///< Invalid channel descriptor
    cudaErrorInvalidMemcpyDirection   =     21,   ///< Invalid memcpy direction
    cudaErrorAddressOfConstant        =     22,   ///< Address of constant error
                                                  ///< \deprecated
                                                  ///< This error return is deprecated as of
                                                  ///< Cuda 3.1. Variables in constant memory
                                                  ///< may now have their address taken by the
                                                  ///< runtime via ::cudaGetSymbolAddress().
    cudaErrorTextureFetchFailed       =     23,   ///< Texture fetch failed
    cudaErrorTextureNotBound          =     24,   ///< Texture not bound error
    cudaErrorSynchronizationError     =     25,   ///< Synchronization error
    cudaErrorInvalidFilterSetting     =     26,   ///< Invalid filter setting
    cudaErrorInvalidNormSetting       =     27,   ///< Invalid norm setting
    cudaErrorMixedDeviceExecution     =     28,   ///< Mixed device execution
    cudaErrorCudartUnloading          =     29,   ///< CUDA runtime unloading
    cudaErrorUnknown                  =     30,   ///< Unknown error condition
    cudaErrorNotYetImplemented        =     31,   ///< Function not yet implemented
    cudaErrorMemoryValueTooLarge      =     32,   ///< Memory value too large
    cudaErrorInvalidResourceHandle    =     33,   ///< Invalid resource handle
    cudaErrorNotReady                 =     34,   ///< Not ready error
    cudaErrorInsufficientDriver       =     35,   ///< CUDA runtime is newer than driver
    cudaErrorSetOnActiveProcess       =     36,   ///< Set on active process error
    cudaErrorInvalidSurface           =     37,   ///< Invalid surface
    cudaErrorNoDevice                 =     38,   ///< No Cuda-capable devices detected
    cudaErrorECCUncorrectable         =     39,   ///< Uncorrectable ECC error detected
    cudaErrorSharedObjectSymbolNotFound =   40,   ///< Link to a shared object failed to resolve
    cudaErrorSharedObjectInitFailed   =     41,   ///< Shared object initialization failed
    cudaErrorUnsupportedLimit         =     42,   ///< ::cudaLimit not supported by device
    cudaErrorDuplicateVariableName    =     43,   ///< Duplicate global variable lookup by string name
    cudaErrorDuplicateTextureName     =     44,   ///< Duplicate texture lookup by string name
    cudaErrorDuplicateSurfaceName     =     45,   ///< Duplicate surface lookup by string name
    cudaErrorDevicesUnavailable       =     46,   ///< All Cuda-capable devices are busy (see ::cudaComputeMode) or unavailable
    cudaErrorStartupFailure           =   0x7f,   ///< Startup failure
    cudaErrorApiFailureBase           =  10000    ///< API failure base
};
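
A sketch of the usual checking idiom built on these codes (the macro name is ours, not the header's; cudaGetErrorString() is the runtime's lookup for the message text):

#include <stdio.h>

#define CUDA_CHECK(call)                                          \
    do {                                                          \
        cudaError_t err_ = (call);                                \
        if (err_ != cudaSuccess)                                  \
            fprintf(stderr, "CUDA error %d: %s\n",                \
                    (int)err_, cudaGetErrorString(err_));         \
    } while (0)
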
/**
 * Channel format kind
 */
/*DEVICE_BUILTIN*/
enum cudaChannelFormatKind
{

skipping to change at line 215
};

/**
 * CUDA extent
 * \sa ::make_cudaExtent
 */
/*DEVICE_BUILTIN*/
struct cudaExtent
{
    size_t width;     ///< Width in bytes
    size_t height;    ///< Height in elements
    size_t depth;     ///< Depth in elements
};
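
For example (a sketch, not from the header): a 640x480x16 volume of floats is described with the width converted to bytes while height and depth stay in elements, matching the field comments above.

static struct cudaExtent volume_extent(void)
{
    /* width in bytes, height and depth in elements */
    return make_cudaExtent(640 * sizeof(float), 480, 16);
}
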
/**
 * CUDA 3D position
 * \sa ::make_cudaPos
 */
/*DEVICE_BUILTIN*/
struct cudaPos
{
    size_t x;     ///< x

skipping to change at line 261
 */
/*DEVICE_BUILTIN*/
struct cudaGraphicsResource;

/**
 * CUDA graphics interop register flags
 */
/*DEVICE_BUILTIN*/
enum cudaGraphicsRegisterFlags
{
    cudaGraphicsRegisterFlagsNone    = 0   ///< Default
};

/**
 * CUDA graphics interop map flags
 */
/*DEVICE_BUILTIN*/
enum cudaGraphicsMapFlags
{
    cudaGraphicsMapFlagsNone         = 0,  ///< Default; Assume resource can be read/written
    cudaGraphicsMapFlagsReadOnly     = 1,  ///< CUDA will not write to this resource
    cudaGraphicsMapFlagsWriteDiscard = 2   ///< CUDA will only write to and will not read from this resource
};
/**
 * CUDA graphics interop array indices for cube maps
 */
/*DEVICE_BUILTIN*/
enum cudaGraphicsCubeFace {
    cudaGraphicsCubeFacePositiveX = 0x00,  ///< Positive X face of cubemap
    cudaGraphicsCubeFaceNegativeX = 0x01,  ///< Negative X face of cubemap
    cudaGraphicsCubeFacePositiveY = 0x02,  ///< Positive Y face of cubemap
    cudaGraphicsCubeFaceNegativeY = 0x03,  ///< Negative Y face of cubemap
    cudaGraphicsCubeFacePositiveZ = 0x04,  ///< Positive Z face of cubemap
    cudaGraphicsCubeFaceNegativeZ = 0x05   ///< Negative Z face of cubemap
};
/**
 * CUDA function attributes
 */
/*DEVICE_BUILTIN*/
struct cudaFuncAttributes
{
    size_t sharedSizeBytes;  ///< Size of shared memory in bytes
    size_t constSizeBytes;   ///< Size of constant memory in bytes

skipping to change at line 337

 */
/*DEVICE_BUILTIN*/
enum cudaComputeMode
{
    cudaComputeModeDefault    = 0,  ///< Default compute mode (Multiple threads can use ::cudaSetDevice() with this device)
    cudaComputeModeExclusive  = 1,  ///< Compute-exclusive mode (Only one thread will be able to use ::cudaSetDevice() with this device)
    cudaComputeModeProhibited = 2   ///< Compute-prohibited mode (No threads can use ::cudaSetDevice() with this device)
};

/**
* CUDA Limits
*/
/*DEVICE_BUILTIN*/
enum cudaLimit
{
cudaLimitStackSize = 0x00, ///< GPU thread stack size
cudaLimitPrintfFifoSize = 0x01 ///< GPU printf FIFO size
};
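
A hedged usage sketch: in the 3.1-era runtime these limits are applied per host thread through cudaThreadSetLimit(), e.g. to enlarge the device stack before launching a deeply recursive kernel.

static cudaError_t grow_device_stack(void)
{
    /* 4 KB of stack per GPU thread; the value is illustrative */
    return cudaThreadSetLimit(cudaLimitStackSize, 4096);
}
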
/**
 * CUDA device properties
 */
/*DEVICE_BUILTIN*/
struct cudaDeviceProp
{
    char   name[256];                 ///< ASCII string identifying device
    size_t totalGlobalMem;            ///< Global memory available on device in bytes
    size_t sharedMemPerBlock;         ///< Shared memory available per block in bytes
    int    regsPerBlock;              ///< 32-bit registers available per block
    int    warpSize;                  ///< Warp size in threads

skipping to change at line 376

    int    deviceOverlap;             ///< Device can concurrently copy memory and execute a kernel
    int    multiProcessorCount;       ///< Number of multiprocessors on device
    int    kernelExecTimeoutEnabled;  ///< Specified whether there is a run time limit on kernels
    int    integrated;                ///< Device is integrated as opposed to discrete
    int    canMapHostMemory;          ///< Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer
    int    computeMode;               ///< Compute mode (See ::cudaComputeMode)
    int    maxTexture1D;              ///< Maximum 1D texture size
    int    maxTexture2D[2];           ///< Maximum 2D texture dimensions
    int    maxTexture3D[3];           ///< Maximum 3D texture dimensions
    int    maxTexture2DArray[3];      ///< Maximum 2D texture array dimensions
    size_t surfaceAlignment;          ///< Alignment requirements for surfaces
    int    concurrentKernels;         ///< Device can possibly execute multiple kernels concurrently
    int    ECCEnabled;                ///< Device has ECC support enabled
    int    pciBusID;                  ///< PCI bus ID of the device
    int    pciDeviceID;               ///< PCI device ID of the device
    int    __cudaReserved[22];
};
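
A small sketch showing how the newly added fields are typically read back (device 0 assumed present; error handling reduced to the success check):

#include <stdio.h>

static void show_new_props(void)
{
    struct cudaDeviceProp prop;

    if (cudaGetDeviceProperties(&prop, 0) == cudaSuccess)
        printf("%s: ECC %s, PCI %d:%d, surface alignment %lu\n",
               prop.name, prop.ECCEnabled ? "on" : "off",
               prop.pciBusID, prop.pciDeviceID,
               (unsigned long)prop.surfaceAlignment);
}
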
#define cudaDevicePropDontCare                             \
        {                                                  \
          {'\0'},    /* char   name[256];               */ \
          0,         /* size_t totalGlobalMem;          */ \
          0,         /* size_t sharedMemPerBlock;       */ \
          0,         /* int    regsPerBlock;            */ \
          0,         /* int    warpSize;                */ \
          0,         /* size_t memPitch;                */ \

skipping to change at line 410

          -1,        /* int    deviceOverlap;           */ \
          0,         /* int    multiProcessorCount;     */ \
          0,         /* int    kernelExecTimeoutEnabled */ \
          0,         /* int    integrated               */ \
          0,         /* int    canMapHostMemory         */ \
          0,         /* int    computeMode              */ \
          0,         /* int    maxTexture1D             */ \
          {0, 0},    /* int    maxTexture2D[2]          */ \
          {0, 0, 0}, /* int    maxTexture3D[3]          */ \
          {0, 0, 0}, /* int    maxTexture2DArray[3]     */ \
          0,         /* size_t surfaceAlignment         */ \
          0,         /* int    concurrentKernels        */ \
          0          /* int    ECCEnabled               */ \
        } ///< Empty device properties
/*******************************************************************************
*                                                                              *
*  SHORTHAND TYPE DEFINITION USED BY RUNTIME API                               *
*                                                                              *
*******************************************************************************/

/**
 * CUDA Error types
 */
/*DEVICE_BUILTIN*/
typedef enum cudaError cudaError_t;

/**
 * CUDA stream
 */
/*DEVICE_BUILTIN*/
typedef struct CUstream_st *cudaStream_t;
/**
 * CUDA event types
 */
/*DEVICE_BUILTIN*/
typedef struct CUevent_st *cudaEvent_t;
/**
* CUDA UUID types
*/
/*DEVICE_BUILTIN*/
typedef struct CUuuid_st cudaUUID_t;
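
Since the stream and event handles are now opaque struct pointers rather than plain ints, they only come from and go back to the runtime API; a minimal round trip looks like this (a sketch, error handling elided for brevity):

static void stream_event_roundtrip(void)
{
    cudaStream_t stream;
    cudaEvent_t  event;

    cudaStreamCreate(&stream);
    cudaEventCreate(&event);
    cudaEventRecord(event, stream);   /* enqueue the event on the stream */
    cudaEventSynchronize(event);
    cudaEventDestroy(event);
    cudaStreamDestroy(stream);
}
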
/** @} */
/** @} */ /* END CUDART_TYPES */

#endif /* !__DRIVER_TYPES_H__ */
func_macro.h
/*
 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws.  Users and possessors of this source code
 * are hereby granted a nonexclusive, royalty-free license to use this code
 * in individual and commercial software.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR

skipping to change at line 54
        ___device__(static) decl
#else /* __CUDABE__ */
#if !defined(__CUDA_INTERNAL_COMPILATION__)
#error -- incorrect inclusion of a cudart header file
#endif /* !__CUDA_INTERNAL_COMPILATION__ */
#if defined(__cplusplus) && defined(__device_emulation) && !defined(__multi_core__)
#define __begin_host_func \
}
#define __end_host_func \
namespace __cuda_emu {
#define __host_device_call(f) \
__cuda_emu::f
#else /* __cplusplus && __device_emulation && !__multi_core__ */
#define __begin_host_func
#define __end_host_func
#define __host_device_call(f) \
f
#endif /* __cplusplus && __device_emulation && !__multi_core__ */
#if defined(__APPLE__)
#define __func__(decl) \
        extern __attribute__((__weak_import__, __weak__)) decl; decl
#define __device_func__(decl) \
        static __attribute__((__unused__)) decl
#elif defined(__GNUC__)
#define __func__(decl) \
host_config.h
skipping to change at line 52
*                                                                              *
*******************************************************************************/

#if defined(__CUDACC__)
#if defined(__APPLE__)
#define _CRTIMP
#define __THROW
#if defined(__BLOCKS__) /* nvcc does not support closures */
#undef __BLOCKS__
#endif /* __BLOCKS__ */
#elif defined(__GNUC__)
#define _CRTIMP
#include <features.h> /* for __THROW */
host_defines.h
skipping to change at line 102
        __declspec(dllexport)
#define __annotate__(a) \
        __declspec(a)
#define __location__(a) \
        __annotate__(__##a##__)
#define CUDARTAPI \
        __stdcall
#endif /* !__GNUC__ && !_WIN32 */
#if !defined(__GNUC__) || __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3)
#define __specialization_static \
        static
#else /* !__GNUC__ || __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3) */
#define __specialization_static
#endif /* !__GNUC__ || __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3) */
#if !defined(__CUDACC__) && !defined(__CUDABE__)
#undef __annotate__
#define __annotate__(a)
#else /* !__CUDACC__ && !__CUDABE__ */
#define __launch_bounds__(...) \
        __annotate__(launch_bounds(__VA_ARGS__))
#endif /* !__CUDACC__ && !__CUDABE__ */
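
In device code the annotation goes between the qualifier and the function name; a sketch with illustrative values, where the first argument caps the threads per block and the optional second one requests a minimum number of resident blocks per multiprocessor:

__global__ void __launch_bounds__(256, 4) scale(float *p, float k)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    p[i] *= k;
}
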
#if defined(__CUDACC__) || defined(__CUDABE__) || \
    defined(__GNUC__) || defined(_WIN64)
#define __builtin_align__(a) \
        __align__(a)
#else /* __CUDACC__ || __CUDABE__ || __GNUC__ || _WIN64 */

skipping to change at line 147

#define __device__ \
        __location__(device)
#define __host__ \
        __location__(host)
#define __global__ \
        __location__(global)
#define __shared__ \
        __location__(shared)
#define __constant__ \
        __location__(constant)
#endif /* !__HOST_DEFINES_H__ */
host_runtime.h
/*
 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws.  Users and possessors of this source code
 * are hereby granted a nonexclusive, royalty-free license to use this code
 * in individual and commercial software.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR

skipping to change at line 39
 * source code with only those rights set forth herein.
 *
 * Any use of this source code in individual and commercial software must
 * include, in the user documentation and internal comments to the code,
 * the above Disclaimer and U.S. Government End Users Notice.
 */

#if !defined(__CUDA_INTERNAL_COMPILATION__)

#define __CUDA_INTERNAL_COMPILATION__
#define __glob_pref_var(var) \
__global_##var
#define __global_var(var) \
(*__glob_pref_var(var))
#define __shadow_var(c, cpp) \
__shadow_pref_var(c, cpp)
#define __text__
#define __surf__
#define __dv(v)
#define __name__shadow_var(c, cpp) \
        #cpp
#define __name__text_var(c, cpp) \
        #cpp
#define __host__shadow_var(c, cpp) \
        cpp
#define __text_var(c, cpp) \
        cpp
#define __device_fun(fun) \
        #fun
#define __device_var(var) \
        #var
#define __device__text_var(c, cpp) \
        #c
#define __device__shadow_var(c, cpp) \
        #c
#if defined(_WIN32) && !defined(_WIN64)
#define __pad__(f) \
        f
#else /* _WIN32 && !_WIN64 */
#define __pad__(f)
#endif /* _WIN32 && !_WIN64 */
#if defined(__APPLE__)
#define __extern_weak__ \
        __weak_import__,
#elif defined(__GNUC__)
#define __extern_weak__
#endif /* __APPLE__ */
#include "builtin_types.h"
#if defined(__cplusplus)
#define __pick(c, cpp) \
cpp
#else /* __cplusplus */
#define __pick(c, cpp) \
c
typedef char bool;
#endif /* __cplusplus */
#if !defined(__GNUC__) || __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3)
#define __specialization_static \
static
#else /* !__GNUC__ || __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3) */
#define __specialization_static
#endif /* !__GNUC__ || __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3) */
#include "cuda_runtime_api.h"
#include "storage_class.h" #include "storage_class.h"
#else /* !__CUDA_INTERNAL_COMPILATION__ */ #else /* !__CUDA_INTERNAL_COMPILATION__ */
#include "host_defines.h" #define __cudaRegisterBinary()
\
#define __cudaRegisterBinary() __cudaFatCubinHandle = __cudaRegisterFatBinary((void*)&__fatDeviceT
\ ext); \
__cudaFatCubinHandle = __cudaRegisterFatBinary((void*)__cudaFatCubi
n); \
atexit(__cudaUnregisterBinaryUtil) atexit(__cudaUnregisterBinaryUtil)
#define __cudaRegisterVariable(var, ext, size, constant, global) \
        __cudaRegisterVar(__cudaFatCubinHandle, (char*)&__host##var, (char*)__device##var, __name##var, ext, size, constant, global)
#define __cudaRegisterGlobalTexture(tex, dim, norm, ext) \
        __cudaRegisterTexture(__cudaFatCubinHandle, (const struct textureReference*)&tex, (const void**)__device##tex, __name##tex, dim, norm, ext)
#define __cudaRegisterGlobalSurface(surf, dim, ext) \
        __cudaRegisterSurface(__cudaFatCubinHandle, (const struct surfaceReference*)&surf, (const void**)__device##surf, __name##surf, dim, ext)
#define __cudaRegisterUnsizedShared(var) \
        __cudaRegisterShared(__cudaFatCubinHandle, (void**)__device_var(var))
#define __cudaRegisterSharedVariable(var, size, align, sc) \
        __cudaRegisterSharedVar(__cudaFatCubinHandle, (void**)__device_var(var), size, align, sc)
#define __cudaRegisterEntry(funptr, fun, thread_limit) \
        __cudaRegisterFunction(__cudaFatCubinHandle, (const char*)funptr, (char*)__device_fun(fun), #fun, -1, (uint3*)0, (uint3*)0, (dim3*)0, (dim3*)0, (int*)0)
#define __cudaInitArgBlock(arg) \
        *(void**)(void*)&arg = (void*)0
#define __cudaSetupArg(arg, offset) \
        if (cudaSetupArgument((void*)(char*)&arg, sizeof(arg), (size_t)&offset->arg) != cudaSuccess) \
          return
#define __cudaLaunch(fun) \
        { volatile static char *__f; __f = fun; (void)cudaLaunch(fun); }
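
For orientation, a simplified sketch of the host stub nvcc generates around these macros for a kernel void kern(float *p). This is not literal tool output: the real stub derives each argument's offset from a prototype struct via __cudaSetupArg, while the sketch hard-codes offset 0 for the single pointer argument.

void __device_stub_kern(float *p)
{
    /* push each argument into the runtime's launch staging area ... */
    if (cudaSetupArgument((void *)&p, sizeof(p), 0) != cudaSuccess)
        return;
    /* ... then launch by the registered entry name */
    (void)cudaLaunch("kern");
}
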
extern "C" {
extern void** CUDARTAPI __cudaRegisterFatBinary(
  void *fatCubin
);
extern void CUDARTAPI __cudaUnregisterFatBinary(
  void **fatCubinHandle
);
extern void CUDARTAPI __cudaRegisterVar(

skipping to change at line 133

extern void CUDARTAPI __cudaRegisterSurface(
  void **fatCubinHandle,
  const struct surfaceReference *hostVar,
  const void **deviceAddress,
  const char *deviceName,
  int dim,
  int ext
);
extern void CUDARTAPI __cudaRegisterShared(
void **fatCubinHandle,
void **devicePtr
);
extern void CUDARTAPI __cudaRegisterSharedVar(
void **fatCubinHandle,
void **devicePtr,
size_t size,
size_t alignment,
int storage
);
extern void CUDARTAPI __cudaRegisterFunction(
  void **fatCubinHandle,
  const char *hostFun,
  char *deviceFun,
  const char *deviceName,
  int thread_limit,
  uint3 *tid,
  uint3 *bid,
  dim3 *bDim,
  dim3 *gDim,
  int *wSize
);
#if defined(__GNUC__)
extern int atexit(void(*)(void)) throw();
#else /* __GNUC__ */
extern int __cdecl atexit(void(__cdecl *)(void));
#endif /* __GNUC__ */

}
static void **__cudaFatCubinHandle;

static void __cdecl __cudaUnregisterBinaryUtil(void)
{
  __cudaUnregisterFatBinary(__cudaFatCubinHandle);
}
#if defined(__device_emulation)
#if defined(__cplusplus) && !defined(__multi_core__)
#define __cuda_emu__ \
__cuda_emu::
#else /* __cplusplus */
#define __cuda_emu__
#endif /* __cplusplus */
#define __device_fun(fun) \
__cuda_emu__ __device_wrapper_##fun
#define __device_var(var) \
&__cuda_emu__ var
#define __tex_var(var) \
&__cuda_emu__ __texture_var(var)
#define __cudaFatCubin \
0
#define __cuda_tl__(l) \
l
#if defined(__multi_core__)
#define __ids \
(uint3*)0, (uint3*)0, &blockDim, &gridDim, &warpSize
#else /* __multi_core__ */
#define __ids \
        (uint3*)&__cuda_emu__ threadIdx, (uint3*)&__cuda_emu__ blockIdx, (dim3*)&__cuda_emu__ blockDim, (dim3*)&__cuda_emu__ gridDim, &__cuda_emu__ warpSize
#endif /* __multi_core__ */
#else /* __device_emulation */
#define __device_fun(fun) \
#fun
#define __device_var(var) \
#var
#define __tex_var(var) \
0
#define __cudaFatCubin \
&__fatDeviceText
#define __cuda_tl__(l) \
-1
#define __ids \
(uint3*)0, (uint3*)0, (dim3*)0, (dim3*)0, (int*)0
#include "common_functions.h" #include "common_functions.h"
#endif /* __device_emulation */
/* UTILITY MACROS */
#define __device__global_var(var) \
__device_var(var)
#define __name__global_var(var) \
#var
#define __host__global_var(var) \
__glob_pref_var(var)
#define __device__shadow_var(c, cpp) \
__device_var(c)
#define __host__shadow_var(c, cpp) \
__shadow_pref_var(c, cpp)
#if defined(_WIN32)
#if defined(__cplusplus)
#pragma warning(disable: 4099)
#endif /* __cplusplus */
#if !defined(_WIN64)
#pragma warning(disable: 4408)
#endif /* !_WIN64 */
#endif /* _WIN32 */
#endif /* !__CUDA_INTERNAL_COMPILATION__ */
math_functions_dbl_ptx1.h
skipping to change at line 39
 * source code with only those rights set forth herein.
 *
 * Any use of this source code in individual and commercial software must
 * include, in the user documentation and internal comments to the code,
 * the above Disclaimer and U.S. Government End Users Notice.
 */

#if !defined(__MATH_FUNCTIONS_DBL_PTX1_H__)

#define __MATH_FUNCTIONS_DBL_PTX1_H__
#if defined(__CUDABE__)
static __forceinline__ double fabs(double a)
{
  return (double)fabsf((float)a);
}
static __forceinline__ double fmax(double a, double b)
{
  return (double)fmaxf((float)a, (float)b);
}
static __forceinline__ double fmin(double a, double b)
{
  return (double)fminf((float)a, (float)b);
}
static __forceinline__ int __finite(double a)
{
  return __finitef((float)a);
}
static __forceinline__ int __isinf(double a)
{
  return __isinff((float)a);
}
static __forceinline__ int __isnan(double a)
{
  return __isnanf((float)a);
}
static __forceinline__ int __signbit(double a)
{
  return __signbitf((float)a);
}
static __forceinline__ double sqrt(double a)
{
  return (double)sqrtf((float)a);
}
static __forceinline__ double rsqrt(double a)
{
  return (double)rsqrtf((float)a);
}
static __forceinline__ double ceil(double a)
{
  return (double)ceilf((float)a);
}
static __forceinline__ double trunc(double a)
{
  return (double)truncf((float)a);
}
static __forceinline__ double floor(double a)
{
  return (double)floorf((float)a);
}
static __forceinline__ double copysign(double a, double b)
{
  return (double)copysignf((float)a, (float)b);
}
static __forceinline__ double sin(double a)
{
  return (double)sinf((float)a);
}
static __forceinline__ double sinpi(double a)
{
  return (double)sinpif((float)a);
}
static __forceinline__ double cos(double a)
{
  return (double)cosf((float)a);
}
static __forceinline__ void sincos(double a, double *sptr, double *cptr)
{
  float fs, fc;

  sincosf((float)a, &fs, &fc);
  *sptr = (double)fs;
  *cptr = (double)fc;
}
static __forceinline__ double tan(double a)
{
  return (double)tanf((float)a);
}
static __forceinline__ double exp(double a)
{
  return (double)expf((float)a);
}
static __forceinline__ double exp2(double a)
{
  return (double)exp2f((float)a);
}
static __forceinline__ double exp10(double a)
{
  return (double)exp10f((float)a);
}
static __forceinline__ double expm1(double a)
{
  return (double)expm1f((float)a);
}
static __forceinline__ double cosh(double a)
{
  return (double)coshf((float)a);
}
static __forceinline__ double sinh(double a)
{
  return (double)sinhf((float)a);
}
static __forceinline__ double tanh(double a)
{
  return (double)tanhf((float)a);
}
static __forceinline__ double asin(double a)
{
  return (double)asinf((float)a);
}
static __forceinline__ double acos(double a)
{
  return (double)acosf((float)a);
}
static __forceinline__ double atan(double a)
{
  return (double)atanf((float)a);
}
static __forceinline__ double atan2(double a, double b)
{
  return (double)atan2f((float)a, (float)b);
}
static __forceinline__ double log(double a)
{
  return (double)logf((float)a);
}
static __forceinline__ double log2(double a)
{
  return (double)log2f((float)a);
}
static __forceinline__ double log10(double a)
{
  return (double)log10f((float)a);
}
static __forceinline__ double log1p(double a)
{
  return (double)log1pf((float)a);
}
static __forceinline__ double acosh(double a)
{
  return (double)acoshf((float)a);
}
static __forceinline__ double asinh(double a)
{
  return (double)asinhf((float)a);
}
static __forceinline__ double atanh(double a)
{
  return (double)atanhf((float)a);
}
static __forceinline__ double hypot(double a, double b)
{
  return (double)hypotf((float)a, (float)b);
}
static __forceinline__ double cbrt(double a)
{
  return (double)cbrtf((float)a);
}
static __forceinline__ double rcbrt(double a)
{
  return (double)rcbrtf((float)a);
}
static __forceinline__ double erf(double a)
{
  return (double)erff((float)a);
}
static __forceinline__ double erfinv(double a)
{
  return (double)erfinvf((float)a);
}
static __forceinline__ double erfc(double a)
{
  return (double)erfcf((float)a);
}
static __forceinline__ double erfcinv(double a)
{
  return (double)erfcinvf((float)a);
}
static __forceinline__ double lgamma(double a)
{
  return (double)lgammaf((float)a);
}
static __forceinline__ double tgamma(double a)
{
  return (double)tgammaf((float)a);
}
static __forceinline__ double ldexp(double a, int b)
{
  return (double)ldexpf((float)a, b);
}
static __forceinline__ double scalbn(double a, int b)
{
  return (double)scalbnf((float)a, b);
}
static __forceinline__ double scalbln(double a, long b)
{
  return (double)scalblnf((float)a, b);
}
static __forceinline__ double frexp(double a, int *b)
{
  return (double)frexpf((float)a, b);
}
static __forceinline__ double modf(double a, double *b)
{
  float fb;
  float fa = modff((float)a, &fb);

  *b = (double)fb;
  return (double)fa;
}
static __forceinline__ double fmod(double a, double b)
{
  return (double)fmodf((float)a, (float)b);
}
static __forceinline__ double remainder(double a, double b)
{
  return (double)remainderf((float)a, (float)b);
}
static __forceinline__ double remquo(double a, double b, int *c)
{
  return (double)remquof((float)a, (float)b, c);
}
static __forceinline__ double nextafter(double a, double b)
{
  return (double)nextafterf((float)a, (float)b);
}
static __forceinline__ double nan(const char *tagp)
{
  return (double)nanf(tagp);
}
static __forceinline__ double pow(double a, double b)
{
  return (double)powf((float)a, (float)b);
}
static __forceinline__ double round(double a)
{
  return (double)roundf((float)a);
}
static __forceinline__ long lround(double a)
{
  return lroundf((float)a);
}
static __forceinline__ long long llround(double a)
{
  return llroundf((float)a);
}
static __forceinline__ double rint(double a)
{
  return (double)rintf((float)a);
}
static __forceinline__ long lrint(double a)
{
  return lrintf((float)a);
}
static __forceinline__ long long llrint(double a)
{
  return llrintf((float)a);
}
static __forceinline__ double nearbyint(double a)
{
  return (double)nearbyintf((float)a);
}
static __forceinline__ double fdim(double a, double b)
{
  return (double)fdimf((float)a, (float)b);
}
static __forceinline__ int ilogb(double a)
{
  return ilogbf((float)a);
}
static __forceinline__ double logb(double a)
{
  return (double)logbf((float)a);
}
static __forceinline__ double fma(double a, double b, double c)
{
  return (double)fmaf((float)a, (float)b, (float)c);
}
#if defined(__APPLE__)
static __forceinline__ int __isfinited(double a)
{
  return (double)__finitef((float)a);
}
static __forceinline__ int __signbitd(double a)
{
  return (double)__signbitf((float)a);
}
#endif /* __APPLE__ */
#endif /* __CUDABE__ */

#endif /* __MATH_FUNCTIONS_DBL_PTX1_H__ */
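
The practical consequence of the wrappers above: on PTX 1.x (sm_1x) targets every double computation is demoted to single precision. A sketch:

__global__ void demotion_demo(double *out)
{
    /* compiled for sm_1x this evaluates as sqrtf(2.0f), so only about
       7 significant decimal digits survive in out[0] */
    out[0] = sqrt(2.0);
}
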
math_functions_dbl_ptx3.h
skipping to change at line 41
 * Any use of this source code in individual and commercial software must
 * include, in the user documentation and internal comments to the code,
 * the above Disclaimer and U.S. Government End Users Notice.
 */

#if !defined(__MATH_FUNCTIONS_DBL_PTX3_H__)

#define __MATH_FUNCTIONS_DBL_PTX3_H__

/* True double precision implementations, since native double support */
#if defined(__CUDABE__)
/*******************************************************************************
*                                                                              *
* DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS       *
*                                                                              *
*******************************************************************************/
static __forceinline__ double rint(double a)
{
  return __builtin_round(a);
}
static __forceinline__ long int lrint(double a)
{
#if defined(__LP64__)
  return (long int)__double2ll_rn(a);
#else /* __LP64__ */
  return (long int)__double2int_rn(a);
#endif /* __LP64__ */
}
static __forceinline__ long long int llrint(double a)
{
  return __double2ll_rn(a);
}
static __forceinline__ double nearbyint(double a)
{
  return __builtin_round(a);
}
/*******************************************************************************
*                                                                              *
* DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITHOUT BUILTIN NVOPENCC OPERATIONS    *
*                                                                              *
*******************************************************************************/
static __forceinline__ int __signbit(double a)
{
  return (int)((unsigned int)__double2hiint(a) >> 31);
}
static __forceinline__ int __finite(double a)
{
  return fabs(a) < CUDART_INF;
}
static __forceinline__ int __isinf(double a)
{
  return fabs(a) == CUDART_INF;
}
static __forceinline__ int __isnan(double a)
{
  return !(fabs(a) <= CUDART_INF);
}
static __forceinline__ double copysign(double a, double b)
{
  int alo, ahi, bhi;

  bhi = __double2hiint(b);
  alo = __double2loint(a);
  ahi = __double2hiint(a);
  ahi = (bhi & 0x80000000) | (ahi & ~0x80000000);
  return __hiloint2double(ahi, alo);
}
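
A host-side model of the same sign transfer, for intuition: a sketch that uses memcpy type punning on the full 64-bit pattern instead of the device intrinsics __double2hiint/__double2loint/__hiloint2double.

#include <stdint.h>
#include <string.h>

static double copysign_model(double a, double b)
{
    uint64_t ia, ib;
    memcpy(&ia, &a, sizeof ia);
    memcpy(&ib, &b, sizeof ib);
    /* take b's sign bit, keep a's exponent and mantissa */
    ia = (ib & 0x8000000000000000ULL) | (ia & ~0x8000000000000000ULL);
    memcpy(&a, &ia, sizeof a);
    return a;
}
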
/* like copysign, but requires that argument a is positive */
static __forceinline__ double __internal_copysign_pos(double a, double b)
{
  int alo, ahi, bhi;

  bhi = __double2hiint(b);
  alo = __double2loint(a);
  ahi = __double2hiint(a);
  ahi = (bhi & 0x80000000) | ahi;
  return __hiloint2double(ahi, alo);
}
static __forceinline__ double __internal_fast_rcp(double a)
{
double e, y;
float x;
x = __double2float_rn(a);
y = (double)(1.0f/x);
e = __fma_rn (-a, y, 1.0);
e = __fma_rn ( e, e, e);
y = __fma_rn ( e, y, y);
return y;
}
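
The helper above is one Newton-Raphson step with a third-order correction: starting from a float-precision seed y ~ 1/a with relative error e = 1 - a*y, it returns y*(1 + e + e^2), cutting the error to O(e^3). A host-side model for intuition (C99 fma, names ours):

#include <math.h>

static double fast_rcp_model(double a)
{
    double y = (double)(1.0f / (float)a);  /* ~24-bit accurate seed */
    double e = fma(-a, y, 1.0);            /* e = 1 - a*y           */
    e = fma(e, e, e);                      /* e + e^2               */
    return fma(e, y, y);                   /* y*(1 + e + e^2)       */
}
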
/* 1152 bits of 2/PI for Payne-Hanek style argument reduction. */
static __constant__ unsigned long long int __cudart_i2opi_d [] = {
  0x6bfb5fb11f8d5d08ULL,
  0x3d0739f78a5292eaULL,
  0x7527bac7ebe5f17bULL,
  0x4f463f669e5fea2dULL,
  0x6d367ecf27cb09b7ULL,
  0xef2f118b5a0a6d1fULL,
  0x1ff897ffde05980fULL,
  0x9c845f8bbdf9283bULL,

skipping to change at line 156

  0xe88235f52ebb4484ULL,
  0xfe1deb1cb129a73eULL,
  0x06492eea09d1921cULL,
  0xb7246e3a424dd2e0ULL,
  0xfe5163abdebbc561ULL,
  0xdb6295993c439041ULL,
  0xfc2757d1f534ddc0ULL,
  0xa2f9836e4e441529ULL,
};
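
An orientation note (ours, not the header's): the table holds the binary digits of 2/pi so that the reduction below can form, in extended precision,

\[
a \cdot \frac{2}{\pi} = q + f, \qquad |f| < 1, \qquad
r = f \cdot \frac{\pi}{2}, \qquad a = q \cdot \frac{\pi}{2} + r,
\]

where only the reduced argument r and the quadrant q mod 4 matter. The multi-word multiply in the loop that follows touches just the window of 2/pi bits that can influence the result, which is why the reduction stays fully accurate even for huge arguments.
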
static __forceinline__ double __internal_trig_reduction_kerneld(double a, int *quadrant)
{
  double j;
  int q;
  if (fabs(a) > CUDART_TRIG_PLOSS) {
    /* Payne-Hanek style argument reduction. */
    unsigned long long int ia;
    unsigned long long int s;
    unsigned long long int result[5];
    unsigned long long int phi, plo;
    unsigned long long int hi, lo;
    unsigned int e;
    int idx;
    ia = __double_as_longlong(a);
    s = ia & 0x8000000000000000ULL;
    e = (unsigned int)(((ia >> 52) & 0x7ff) - 1024);
    ia = (ia << 11) | 0x8000000000000000ULL;
    /* compute x * 2/pi */
    idx = 16 - (e >> 6);
    hi = 0;
#pragma unroll 1
    for (q = (idx-1); q < min(18,idx+3); q++) {
      plo = __cudart_i2opi_d[q] * ia;
      phi = __umul64hi (__cudart_i2opi_d[q], ia);
      lo = hi + plo;
      hi = phi + (lo < plo);
      result[q-(idx-1)] = lo;
    }
    result[q-(idx-1)] = hi;
    e = e & 63;
    /* shift result such that hi:lo<127:126> are the least significant
skipping to change at line 324 (old) / line 247 (new)
   * http://arxiv.org/PS_cache/arxiv/pdf/0708/0708.3722v1.pdf
   */
  a = __fma_rn (-j, 1.5707963267948966e+000, a);
  a = __fma_rn (-j, 6.1232339957367574e-017, a);
  a = __fma_rn (-j, 8.4784276603688985e-032, a);
  *quadrant = q;
  return a;
}
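/* For |a| > CUDART_TRIG_PLOSS, a * 2/pi is accumulated in 192-bit fixed
 * point against the table above, with the starting 64-bit word chosen from
 * the exponent so that only bits near the binary point are produced. In the
 * small-argument path, the three __fma_rn steps subtract q * pi/2 with pi/2
 * split into three parts, a Cody-Waite style reduction. */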
/* approximate sine on -pi/4...+pi/4 */
static __forceinline__ double __internal_sin_kerneld(double x)
{
  double x2, z;
  x2 = x * x;
  z = 1.5896230157221844E-010;
  z = __fma_rn (z, x2, -2.5050747762850355E-008);
  z = __fma_rn (z, x2, 2.7557313621385676E-006);
  z = __fma_rn (z, x2, -1.9841269829589539E-004);
  z = __fma_rn (z, x2, 8.3333333333221182E-003);
  z = __fma_rn (z, x2, -1.6666666666666630E-001);
  z = z * x2;
  z = __fma_rn (z, x, x);
  return z;
}
/* approximate cosine on -pi/4...+pi/4 */
static __forceinline__ double __internal_cos_kerneld(double x)
{
  double x2, z;
  x2 = x * x;
  z = -1.136788825395985E-011;
  z = __fma_rn (z, x2, 2.087588480545065E-009);
  z = __fma_rn (z, x2, -2.755731555403950E-007);
  z = __fma_rn (z, x2, 2.480158729365970E-005);
  z = __fma_rn (z, x2, -1.388888888888074E-003);
  z = __fma_rn (z, x2, 4.166666666666664E-002);
  z = __fma_rn (z, x2, -5.000000000000000E-001);
  z = __fma_rn (z, x2, 1.000000000000000E+000);
  return z;
}
/* approximate tangent on -pi/4...+pi/4 */
static __forceinline__ double __internal_tan_kerneld(double x, int i)
{
  double x2, z, q;
  x2 = x * x;
  z = 9.8006287203286300E-006;
  z = __fma_rn (z, x2, -2.4279526494179897E-005);
  z = __fma_rn (z, x2, 4.8644173130937162E-005);
  z = __fma_rn (z, x2, -2.5640012693782273E-005);
  z = __fma_rn (z, x2, 6.7223984330880073E-005);
  z = __fma_rn (z, x2, 8.3559287318211639E-005);
  z = __fma_rn (z, x2, 2.4375039850848564E-004);
skipping to change at line 388 (old) / line 311 (new)
    double s = q - x;
    double w = __fma_rn (z, x, -s); // tail of q
    z = 1.0 / q;
    z = -z;
    s = __fma_rn (q, z, 1.0);
    q = __fma_rn (z, __fma_rn (z, w, s), z);
  }
  return q;
}
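/* In the odd-quadrant case the kernel's tail (partly elided above) returns
 * -1/tan(x): z = -1/q is a first reciprocal estimate, and the final two
 * __fma_rn steps apply one Newton correction that also folds in w, the
 * exactly captured tail of q. */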
/* approximates exp(a)-1 on [-log(1.5),log(1.5)] accurate to 1 ulp */
static __forceinline__ double __internal_expm1_kernel (double a)
{
  double t;
  t = 2.08842685477913050E-009;
  t = __fma_rn (t, a, 2.51366409033551950E-008);
  t = __fma_rn (t, a, 2.75574612072447230E-007);
  t = __fma_rn (t, a, 2.75571539284473460E-006);
  t = __fma_rn (t, a, 2.48015869443077950E-005);
  t = __fma_rn (t, a, 1.98412699878799470E-004);
  t = __fma_rn (t, a, 1.38888888892029890E-003);
  t = __fma_rn (t, a, 8.33333333327662860E-003);
  t = __fma_rn (t, a, 4.16666666666656370E-002);
  t = __fma_rn (t, a, 1.66666666666667380E-001);
  t = __fma_rn (t, a, 5.00000000000000000E-001);
  t = t * a;
  t = __fma_rn (t, a, a);
  return t;
}
/* approximate 2*atanh(0.5*a) on [-0.25,0.25] */
static __forceinline__ double __internal_atanh_kernel (double a_1, double a_2)
{
  double a, a2, t;
  a = a_1 + a_2;
  a2 = a * a;
  t = 7.597322383488143E-002/65536.0;
  t = __fma_rn (t, a2, 6.457518383364042E-002/16384.0);
  t = __fma_rn (t, a2, 7.705685707267146E-002/4096.0);
  t = __fma_rn (t, a2, 9.090417561104036E-002/1024.0);
  t = __fma_rn (t, a2, 1.111112158368149E-001/256.0);
  t = __fma_rn (t, a2, 1.428571416261528E-001/64.0);
  t = __fma_rn (t, a2, 2.000000000069858E-001/16.0);
  t = __fma_rn (t, a2, 3.333333333333198E-001/4.0);
  t = t * a2;
  t = __fma_rn (t, a, a_2);
  t = t + a_1;
  return t;
}
static __forceinline__ double __internal_exp2i_kernel(int b)
{
  return __hiloint2double((b + 1023) << 20, 0);
}
static __forceinline__ double __internal_half(double a)
{
  unsigned int ihi, ilo;
  ilo = __double2loint(a);
  ihi = __double2hiint(a);
  return __hiloint2double(ihi - 0x00100000, ilo);
}
static __forceinline__ double __internal_twice(double a)
{
  unsigned int ihi, ilo;
  ilo = __double2loint(a);
  ihi = __double2hiint(a);
  return __hiloint2double(ihi + 0x00100000, ilo);
}
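/* __internal_half and __internal_twice adjust the IEEE-754 biased exponent
 * field directly (0x00100000 is a 1 in bit 20 of the high word), so they are
 * exact and branch-free; e.g. __internal_twice(3.0) == 6.0. Callers must
 * ensure the argument is a normal whose exponent cannot wrap. */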
static __forceinline__ double sin(double a)
{
  double z;
  int i;
  if (__isinf(a) || (a == CUDART_ZERO)) {
    return __dmul_rn(a, CUDART_ZERO);
  }
  z = __internal_trig_reduction_kerneld(a, &i);
  /* here, abs(z) <= pi/4, and i has the quadrant */
  if (i & 1) {
    z = __internal_cos_kerneld(z);
  } else {
    z = __internal_sin_kerneld(z);
  }
  if (i & 2) {
    z = -z;
  }
  return z;
}
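/* Quadrant folding used by sin, cos, and tan: writing a = r + q*pi/2 with
 * |r| <= pi/4, bit 0 of q selects the sine or cosine kernel and bit 1 the
 * sign, since sin(r + pi/2) = cos(r) and sin(r + pi) = -sin(r); cos below
 * reuses the same table shifted by one quadrant (i++). */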
static __forceinline__ double sinpi(double a)
{
  double z;
  double fi;
  int i;
  if (__isinf(a) || (a == CUDART_ZERO)) {
    return __dmul_rn(a, CUDART_ZERO);
  }
  /* IEEE-754: sinPi(+n) is +0 and sinPi(-n) is -0 for positive integers n. */
  if (a == trunc(a)) {
    return __longlong_as_double(__double_as_longlong(a)&0x8000000000000000ULL);
  }
  fi = rint (a * 2.0);
  z = __fma_rn (fi, -0.5, a);
  z = __fma_rn (z, CUDART_PI_HI, z * CUDART_PI_LO);
  i = (int)(((long long)fi) & 3);
  if (i & 1) {
    z = __internal_cos_kerneld(z);
  } else {
    z = __internal_sin_kerneld(z);
  }
  if (i & 2) {
    z = -z;
  }
  return z;
}
static __forceinline__ double cos(double a)
{
  double z;
  int i;
  if (__isinf(a)) {
    return CUDART_NAN;
  }
  z = __internal_trig_reduction_kerneld(a, &i);
  /* here, abs(z) <= pi/4, and i has the quadrant */
  i++;
  if (i & 1) {
    z = __internal_cos_kerneld(z);
  } else {
    z = __internal_sin_kerneld(z);
  }
  if (i & 2) {
    z = -z;
  }
  return z;
}
static __forceinline__ void sincos(double a, double *sptr, double *cptr)
{
  double t, u, s, c;
  int i;
  t = fabs(a);
  if ((t == CUDART_INF) || (t == CUDART_ZERO)) {
    s = __dmul_rn (a, CUDART_ZERO); /* generate NaN, zero */
    c = 1.0 + s;                    /* generate NaN, one */
    *sptr = s;
    *cptr = c;
    return;
  }
  t = __internal_trig_reduction_kerneld(a, &i);
  u = __internal_cos_kerneld(t);
  t = __internal_sin_kerneld(t);
skipping to change at line 566 (old) / line 475 (new)
    s = -s;
  }
  i++;
  if (i & 2) {
    c = -c;
  }
  *sptr = s;
  *cptr = c;
}
static __forceinline__ double tan(double a)
{
  double z;
  int i;
  if (__isinf(a)) {
    return __dadd_rn (a, -a); /* return NaN */
  }
  z = __internal_trig_reduction_kerneld(a, &i);
  /* here, abs(z) <= pi/4, and i has the quadrant */
  z = __internal_tan_kerneld(z, i & 1);
  return z;
}
static __forceinline__ double log(double a)
{
  double m, f, g, u, v, tmp, q, ulo, log_lo, log_hi;
  int ihi, ilo;
  ihi = __double2hiint(a);
  ilo = __double2loint(a);
  if ((a > CUDART_ZERO) && (a < CUDART_INF)) {
    int e = -1023;
    /* normalize denormals */
skipping to change at line 609 (old) / line 518 (new)
    e += (ihi >> 20);
    ihi = (ihi & 0x800fffff) | 0x3ff00000;
    m = __hiloint2double (ihi, ilo);
    if ((unsigned)ihi > (unsigned)0x3ff6a09e) {
      m = __internal_half(m);
      e = e + 1;
    }
    /* log((1+m)/(1-m)) = 2*atanh(m). log(m) = 2*atanh ((m-1)/(m+1)) */
    f = m - 1.0;
    g = m + 1.0;
    g = __internal_fast_rcp(g);
    u = f * g;
    u = u + u;
    /* u = 2.0 * (m - 1.0) / (m + 1.0) */
    v = u * u;
    q = 6.7261411553826339E-2/65536.0;
    q = __fma_rn (q, v, 6.6133829643643394E-2/16384.0);
    q = __fma_rn (q, v, 7.6940931149150890E-2/4096.0);
    q = __fma_rn (q, v, 9.0908745692137444E-2/1024.0);
    q = __fma_rn (q, v, 1.1111111499059706E-1/256.0);
    q = __fma_rn (q, v, 1.4285714283305975E-1/64.0);
skipping to change at line 640 (old) / line 549 (new)
    log_lo = ulo + q;
    /* log_hi + log_lo = log(m)+e*log(2) = log(a) to more than double precision */
    q = __fma_rn ( e, CUDART_LN2_HI, log_hi);
    tmp = __fma_rn (-e, CUDART_LN2_HI, q);
    tmp = tmp - log_hi;
    log_hi = q;
    log_lo = log_lo - tmp;
    log_lo = __fma_rn (e, CUDART_LN2_LO, log_lo);
    return log_hi + log_lo;
  } else {
    if (__isnan(a)) {
      return a + a;
    }
    /* log(0) = -INF */
    if (a == 0) {
      return -CUDART_INF;
    }
    /* log(INF) = INF */
    if (a == CUDART_INF) {
      return a;
    }
    /* log(x) is undefined for x < 0.0, return INDEFINITE */
    return CUDART_NAN;
  }
}
/* Requires |x.y| > |y.y|. 8 DP operations */
static __forceinline__ double2 __internal_ddadd_xgty (double2 x, double2 y)
{
  double2 z;
  double r, s, e;
  r = x.y + y.y;
  e = x.y - r;
  s = ((e + y.y) + y.x) + x.x;
  z.y = e = r + s;
  z.x = (r - e) + s;
  return z;
}
/* Take full advantage of FMA. Only 8 DP operations */
static __forceinline__ double2 __internal_ddmul (double2 x, double2 y)
{
  double e;
  double2 t, z;
  t.y = x.y * y.y;
  t.x = __fma_rn (x.y, y.y, -t.y);
  t.x = __fma_rn (x.x, y.x, t.x);
  t.x = __fma_rn (x.y, y.x, t.x);
  t.x = __fma_rn (x.x, y.y, t.x);
  z.y = e = t.y + t.x;
  z.x = (t.y - e) + t.x;
  return z;
}
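/* double2 serves here as a double-double value: .y holds the leading (head)
 * term and .x a non-overlapping tail, giving about 106 significand bits.
 * In __internal_ddmul the first __fma_rn recovers the exact rounding error
 * of x.y * y.y, and the remaining FMAs accumulate the cross terms. */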
static __forceinline__ double2 __internal_log_ext_prec(double a)
{
  double2 res;
  double2 qq, cc, uu, tt;
  double f, g, u, v, q, ulo, tmp, m;
  int ilo, ihi, expo;
  ihi = __double2hiint(a);
  ilo = __double2loint(a);
  expo = (ihi >> 20) & 0x7ff;
  /* convert denormals to normals for computation of log(a) */
skipping to change at line 726 (old) / line 629 (new)
    m = __internal_half(m);
    expo = expo + 1;
  }
  /* compute log(m) with extended precision using an algorithm derived from
   * P.T.P. Tang, "Table Driven Implementation of the Logarithm Function",
   * TOMS, Vol. 16., No. 4, December 1990, pp. 378-400. A modified polynomial
   * approximation to atanh(x) on the interval [-0.1716, 0.1716] is utilized.
   */
  f = m - 1.0;
  g = m + 1.0;
  g = __internal_fast_rcp(g);
  u = f * g;
  u = u + u;
  /* u = 2.0 * (m - 1.0) / (m + 1.0) */
  v = u * u;
  q = 6.6253631649203309E-2/65536.0;
  q = __fma_rn (q, v, 6.6250935587260612E-2/16384.0);
  q = __fma_rn (q, v, 7.6935437806732829E-2/4096.0);
  q = __fma_rn (q, v, 9.0908878711093280E-2/1024.0);
  q = __fma_rn (q, v, 1.1111111322892790E-1/256.0);
  q = __fma_rn (q, v, 1.4285714284546502E-1/64.0);
skipping to change at line 766 (old) / line 669 (new)
  u = uu.y;
  ulo = uu.x;
  /* log(2)*expo in double-double format */
  tt.y = expo * 6.9314718055966296e-001; /* multiplication is exact */
  tt.x = expo * 2.8235290563031577e-013;
  /* log(a) = log(m) + log(2)*expo; if expo != 0, |log(2)*expo| > |log(m)| */
  res = __internal_ddadd_xgty (tt, uu);
  return res;
}
static __forceinline__ double log2(double a)
{
  double t;
  t = log(a);
  return __fma_rn (t, CUDART_L2E_HI, t * CUDART_L2E_LO);
}
static __forceinline__ double log10(double a)
{
  double t;
  t = log(a);
  return __fma_rn (t, CUDART_LGE_HI, t * CUDART_LGE_LO);
}
static __forceinline__ double log1p(double a)
{
  double t;
  int i;
  i = __double2hiint(a);
  if (((unsigned)i < (unsigned)0x3fe55555) || ((int)i < (int)0xbfd99999)) {
    /* Compute log(a+1) = 2*atanh(a/(a+2)) */
    t = a + 2.0;
    t = a / t;
    t = -a * t;
    t = __internal_atanh_kernel(a, t);
    return t;
  }
  return log (a + CUDART_ONE);
}
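/* For arguments inside the guarded range, log1p evaluates log(1+a) as
 * 2*atanh(a/(a+2)); the pair (a, -a*a/(a+2)) handed to
 * __internal_atanh_kernel keeps the exact leading term separate from the
 * small correction, avoiding the cancellation that computing log(a + 1.0)
 * directly would incur for small |a|. */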
static __forceinline__ double __internal_exp_kernel(double a, int scale)
{
  double t, fac, z;
  int i, k;
  /* exp(a) = 2^(rint(a/log(2)) + z) = 2^(i + z) */
  t = rint (a * CUDART_L2E);
  i = (int)t;
  z = __fma_rn (t, -CUDART_LN2_HI, a);
  z = __fma_rn (t, -CUDART_LN2_LO, z);
  k = 0x40000000;
  if (i <= -1021) {
    i += 55;
    k -= 55 << 20;
  }
  fac = __hiloint2double(k, 0); /* 2^-54 if a is denormal, 2.0 otherwise */
  /* exp(a) = 2^i * e^z */
  t = __internal_expm1_kernel(z);
  z = __hiloint2double(((i + scale) << 20) + ((-1 + 1023) << 20), 0);
  t = __fma_rn (t, z, z);
  t = t * fac;
  return t;
}
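/* The 2^i factor is applied in two pieces: 2^(i+scale-1) is assembled
 * directly in the exponent field, and fac contributes the remaining factor
 * of 2 (or 2^-54 after the +55 adjustment near the underflow threshold), so
 * neither partial scale factor can overflow or flush to zero prematurely. */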
static __forceinline__ double exp(double a)
{
  double t;
  int i;
  i = __double2hiint(a);
  if (((unsigned)i < (unsigned)0x40862e43) || ((int)i < (int)0xC0874911)) {
    t = __internal_exp_kernel(a, 0);
    return t;
  }
  t = (i < 0) ? CUDART_ZERO : CUDART_INF;
  if (__isnan(a)) {
    t = a + a;
  }
  return t;
}
static __forceinline__ double exp2(double a)
{
  double z;
  double t;
  double fac;
  int i;
  i = __double2hiint(a);
  if (((unsigned)i < (unsigned)0x40900000) || ((int)i < (int)0xc090cc00)) {
    t = rint (a);
    z = a - t;
    i = (int)t;
    fac = 2.0;
    if (i <= -1021) {
      i += 55;
      fac = CUDART_TWO_TO_M54;
    }
    /* 2^z = exp(log(2)*z) */
    z = __fma_rn (z, CUDART_LN2_HI, z * CUDART_LN2_LO);
    t = __internal_expm1_kernel(z);
    z = __internal_exp2i_kernel(i - 1);
    t = __fma_rn (t, z, z);
    t = t * fac;
    return t;
  }
  t = (i < 0) ? CUDART_ZERO : CUDART_INF;
  if (__isnan(a)) {
    t = a + a;
  }
  return t;
}
static __forceinline__ double exp10(double a)
{
  double z;
  double t;
  double fac;
  int i;
  i = __double2hiint(a);
  if (((unsigned)i < (unsigned)0x40734414) || ((int)i < (int)0xc07439b8)) {
    t = rint (a * CUDART_L2T);
    i = (int)t;
    z = __fma_rn (t, -CUDART_LG2_HI, a);
    z = __fma_rn (t, -CUDART_LG2_LO, z);
    fac = 2.0;
    if (i <= -1021) {
      i += 55;
      fac = CUDART_TWO_TO_M54;
    }
    /* 10^z = exp(log(10)*z) */
    z = __fma_rn (z, CUDART_LNT_HI, z * CUDART_LNT_LO);
    t = __internal_expm1_kernel(z);
    z = __internal_exp2i_kernel(i - 1);
    t = __fma_rn (t, z, z);
    t = t * fac;
    return t;
  }
  t = (i < 0) ? CUDART_ZERO : CUDART_INF;
  if (__isnan(a)) {
    t = a + a;
  }
  return t;
}
static __forceinline__ double expm1(double a)
{
  double t, z, u;
  int i, j, k;
  k = __double2hiint(a);
  if (((unsigned)k < (unsigned)0x40862e43) || ((int)k < (int)0xc04a8000)) {
    t = rint (a * CUDART_L2E);
    i = (int)t;
    z = __fma_rn (t, -CUDART_LN2_HI, a);
    z = __fma_rn (t, -CUDART_LN2_LO, z);
    k = k + k;
    if ((unsigned)k < (unsigned)0x7fb3e647) {
      z = a;
      i = 0;
    }
    t = __internal_expm1_kernel(z);
    j = i;
    if (i == 1024) j--;
    u = __internal_exp2i_kernel(j);
    a = u - 1.0;
    t = __fma_rn (t, u, a);
    if (i == 1024) t = t + t;
    if (k == 0) t = z; /* preserve -0 */
    return t;
  }
  t = (k < 0) ? -CUDART_ONE : CUDART_INF;
  if (__isnan(a)) {
    t = a + a;
  }
  return t;
}
static __forceinline__ double cosh(double a)
{
  double z;
  int i;
  z = fabs(a);
  i = __double2hiint(z);
  if ((unsigned)i < (unsigned)0x408633cf) {
    z = __internal_exp_kernel(z, -2);
    z = __fma_rn(2.0, z, 0.125 / z);
    return z;
  } else {
    if (z > 0.0) a = CUDART_INF_F;
    return a + a;
  }
}
static __forceinline__ double sinh(double a)
{
  double s, z;
  s = a;
  a = fabs(a);
  if (a < 1.0) { /* danger of catastrophic cancellation */
    double a2 = a * a;
    /* approximate sinh(x) on [0,1] with a polynomial */
    z = 1.632386098183803E-010;
    z = __fma_rn (z, a2, 2.504854501385687E-008);
    z = __fma_rn (z, a2, 2.755734274788706E-006);
    z = __fma_rn (z, a2, 1.984126976294102E-004);
    z = __fma_rn (z, a2, 8.333333333452911E-003);
    z = __fma_rn (z, a2, 1.666666666666606E-001);
    z = z * a2;
    z = __fma_rn (z, a, a);
  } else if (a < 2.0) { /* work around accuracy issue in vicinity of 1.4 */
    z = expm1(a);
    z = __internal_half (z + z / (z + 1.0));
  } else {
    z = __internal_exp_kernel(a, -1);
    z = z + (1.0 / (-4.0 * z));
    if (a >= CUDART_LN2_X_1025) {
      z = CUDART_INF; /* overflow -> infinity */
    }
  }
  z = __internal_copysign_pos(z, s);
  return z;
}
static __forceinline__ double tanh(double a)
{
  double t;
  t = fabs(a);
  if (t >= 0.55) {
    double s;
    s = 1.0 - 2.0 / (__internal_exp_kernel(2.0 * t, 0) + 1.0);
    if (t > 350.0) {
      s = 1.0; /* overflow -> 1.0 */
    }
    a = __internal_copysign_pos(s, a);
  } else {
    double a2;
    a2 = a * a;
skipping to change at line 1011 (old) / line 915 (new)
    t = __fma_rn (t, a2, -5.396825387607743E-002);
    t = __fma_rn (t, a2, 1.333333333316870E-001);
    t = __fma_rn (t, a2, -3.333333333333232E-001);
    t = t * a2;
    t = __fma_rn (t, a, a);
    a = __internal_copysign_pos(t, a);
  }
  return a;
}
static __forceinline__ double __internal_atan_kernel(double a)
{
  double t, a2;
  a2 = a * a;
  t = -2.0258553044438358E-005;
  t = __fma_rn (t, a2, 2.2302240345758510E-004);
  t = __fma_rn (t, a2, -1.1640717779930576E-003);
  t = __fma_rn (t, a2, 3.8559749383629918E-003);
  t = __fma_rn (t, a2, -9.1845592187165485E-003);
  t = __fma_rn (t, a2, 1.6978035834597331E-002);
  t = __fma_rn (t, a2, -2.5826796814495994E-002);
skipping to change at line 1039 (old) / line 943 (new)
  t = __fma_rn (t, a2, -9.0909012354005225E-002);
  t = __fma_rn (t, a2, 1.1111110678749424E-001);
  t = __fma_rn (t, a2, -1.4285714271334815E-001);
  t = __fma_rn (t, a2, 1.9999999999755019E-001);
  t = __fma_rn (t, a2, -3.3333333333331860E-001);
  t = t * a2;
  t = __fma_rn (t, a, a);
  return t;
}
static __forceinline__ double atan2(double a, double b)
{
  double t0, t1, t3;
  if (__isnan(a) || __isnan(b)) {
    return a + b;
  }
  /* reduce arguments to first octant */
  /* r = (|x| < |y|) ? (|x| / |y|) : (|y| / |x|) */
  t3 = fabs(b);
  t1 = fabs(a);
  if (t3 == 0.0 && t1 == 0.0) {
    t3 = (__double2hiint(b) < 0) ? CUDART_PI : 0;
  } else if (__isinf(t3) && __isinf(t1)) {
    t3 = (__double2hiint(b) < 0) ? CUDART_3PIO4 : CUDART_PIO4;
  } else {
    t0 = fmax (t1, t3);
    t1 = fmin (t1, t3);
    t3 = t1 / t0;
    t3 = __internal_atan_kernel(t3);
    /* Map result according to octant. */
    if (fabs(a) > fabs(b)) t3 = CUDART_PIO2 - t3;
    if (b < 0.0) t3 = CUDART_PI - t3;
  }
  t3 = __internal_copysign_pos(t3, a);
  return t3;
}
static __forceinline__ double atan(double a)
{
  double t0, t1;
  /* reduce argument to first octant */
  t0 = fabs(a);
  t1 = t0;
  if (t0 > 1.0) {
    t1 = 1.0 / t1;
  }
  /* approximate atan(r) in first octant */
  t1 = __internal_atan_kernel(t1);
  /* map result according to octant. */
  if (t0 > 1.0) {
    t1 = CUDART_PIO2 - t1;
  }
  return __internal_copysign_pos(t1, a);
}
/* b should be the square of a */
static __forceinline__ double __internal_asin_kernel(double a, double b)
{
  double r;
  r = 6.259798167646803E-002;
  r = __fma_rn (r, b, -7.620591484676952E-002);
  r = __fma_rn (r, b, 6.686894879337643E-002);
  r = __fma_rn (r, b, -1.787828218369301E-002);
  r = __fma_rn (r, b, 1.745227928732326E-002);
  r = __fma_rn (r, b, 1.000422754245580E-002);
  r = __fma_rn (r, b, 1.418108777515123E-002);
  r = __fma_rn (r, b, 1.733194598980628E-002);
  r = __fma_rn (r, b, 2.237350511593569E-002);
  r = __fma_rn (r, b, 3.038188875134962E-002);
  r = __fma_rn (r, b, 4.464285849810986E-002);
  r = __fma_rn (r, b, 7.499999998342270E-002);
  r = __fma_rn (r, b, 1.666666666667375E-001);
  r = r * b;
  return r;
}
static __forceinline__ double asin(double a)
{
  double fa, t0, t1;
  int ihi, ahi;
  ahi = __double2hiint(a);
  fa = fabs(a);
  ihi = __double2hiint(fa);
  if (ihi < 0x3fe26666) {
    t1 = fa * fa;
    t1 = __internal_asin_kernel (fa, t1);
    t1 = __fma_rn (t1, fa, fa);
    t1 = __internal_copysign_pos(t1, a);
  } else {
    t1 = __fma_rn (-0.5, fa, 0.5);
    t0 = sqrt (t1);
    t1 = __internal_asin_kernel (t0, t1);
    t0 = -2.0 * t0;
    t1 = __fma_rn (t0, t1, CUDART_PIO2_LO);
    t0 = t0 + CUDART_PIO4_HI;
    t1 = t0 + t1;
    t1 = t1 + CUDART_PIO4_HI;
    if (ahi < 0x3ff00000) {
      t1 = __internal_copysign_pos(t1, a);
    }
  }
  return t1;
}
static __forceinline__ double acos(double a)
{
  double t0, t1;
  int ihi, ahi;
  ahi = __double2hiint(a);
  t0 = fabs (a);
  ihi = __double2hiint(t0);
  if (ihi < 0x3fe26666) {
    t1 = t0 * t0;
    t1 = __internal_asin_kernel (t0, t1);
    t0 = __fma_rn (t1, t0, t0);
    if ((unsigned)ahi >= (unsigned)0x80000000) {
      t0 = __fma_rn (1.0, t0, +CUDART_PIO2_LO);
      t0 = CUDART_PIO2_HI + t0;
    } else {
      t0 = __fma_rn (1.0, t0, -CUDART_PIO2_LO);
      t0 = CUDART_PIO2_HI - t0;
    }
  } else {
    t1 = __fma_rn (-0.5, t0, 0.5);
    t0 = sqrt(t1);
    t1 = __internal_asin_kernel (t0, t1);
    t0 = __fma_rn (t1, t0, t0);
    t0 = 2.0 * t0;
    if ((unsigned)ahi >= (unsigned)0x80000000) {
      t0 = __fma_rn (1.0, t0, -CUDART_PI_LO);
      t0 = CUDART_PI_HI - t0;
    }
  }
  return t0;
}
static __forceinline__ double acosh(double a)
{
  double t;
  t = a - 1.0;
  if (fabs(t) > CUDART_TWO_TO_52) {
    /* for large a, acosh = log(2*a) */
    return CUDART_LN2 + log(a);
  } else {
    t = t + sqrt(__fma_rn(a, t, t));
    return log1p(t);
  }
}
static __forceinline__ double asinh(double a)
{
  double fa, t;
  fa = fabs(a);
  if (__double2hiint(fa) >= 0x5ff00000) { /* prevent intermediate overflow in fa*fa */
    t = CUDART_LN2 + log(fa);
  } else {
    t = fa * fa;
    t = log1p (fa + t / (1.0 + sqrt(1.0 + t)));
  }
  return __internal_copysign_pos(t, a);
}
static __forceinline__ double atanh(double a)
{
  double fa, t;
  fa = fabs(a);
  t = (2.0 * fa) / (1.0 - fa);
  t = 0.5 * log1p(t);
  if (__double2hiint(a) < 0) {
    t = -t;
  }
  return t;
}
static __forceinline__ double hypot(double a, double b)
{
  double v, w, t, fa, fb;
  fa = fabs(a);
  fb = fabs(b);
  v = fmax(fa, fb);
  w = fmin(fa, fb);
  t = w / v;
  t = __fma_rn (t, t, 1.0);
  t = v * sqrt(t);
  if (v == 0.0) {
    t = v + w; /* fixup for zero divide */
  }
  if ((!(fa <= CUDART_INF)) || (!(fb <= CUDART_INF))) {
    t = a + b; /* fixup for NaNs */
  }
  if (v == CUDART_INF) {
    t = v + w; /* fixup for infinities */
  }
  return t;
}
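/* hypot evaluates v * sqrt(1 + (w/v)^2) with v = max(|a|,|b|) and
 * w = min(|a|,|b|): the squared term is a ratio <= 1, so the overflow and
 * underflow that plague the naive sqrt(a*a + b*b) cannot occur; the three
 * fixups then restore the IEEE results for zeros, NaNs, and infinities. */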
static __forceinline__ double cbrt(double a)
{
  float s;
  double t, r;
  int ilo, ihi, expo, nexpo, denorm;
  if ((a == 0.0) || !(__finite(a))) {
    return a + a;
  }
  t = fabs(a);
  ilo = __double2loint(t);
  ihi = __double2hiint(t);
  expo = ((int)((unsigned int)ihi >> 20) & 0x7ff);
  denorm = 0;
  if (expo == 0) {
    /* denormal */
    t = t * CUDART_TWO_TO_54;
    denorm = 18;
    ilo = __double2loint(t);
    ihi = __double2hiint(t);
    expo = ((int)((unsigned int)ihi >> 20) & 0x7ff);
  }
  /* scale into float range */
  nexpo = __float2int_rn(CUDART_THIRD_F * (float)(expo - 1022));
  ihi -= (3 * nexpo) << 20;
  r = __hiloint2double(ihi, ilo);
  /* initial approximation */
  s = (float)r;
  t = exp2f(-CUDART_THIRD_F * __log2f(s));                 /* approximate invcbrt */
  t = __fma_rn(__fma_rn(t*t,-r*t,1.0), CUDART_THIRD*t, t); /* refine invcbrt */
  t = r * t * t;                                           /* approximate cbrt */
  t = __fma_rn(t - (r / (t * t)), -CUDART_THIRD, t);       /* refine cbrt */
  /* scale result back into double range */
  ilo = __double2loint(t);
  ihi = __double2hiint(t);
  ihi += (nexpo - denorm) << 20;
  t = __hiloint2double(ihi, ilo);
  if (__double2hiint(a) < 0) {
    t = -t;
  }
  return t;
}
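/* The exponent is processed separately: nexpo approximates a third of the
 * unbiased exponent, and 3*nexpo is removed before the approximation and
 * added back afterwards, so the core iteration only sees a well-scaled r;
 * denormals are pre-scaled by 2^54, with 54/3 = 18 folded back via denorm. */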
static __forceinline__ double rcbrt(double a)
{
  float s;
  double t, r;
  int ilo, ihi, expo, nexpo, denorm;
  if ((a == 0.0) || !(__finite(a))) {
    return 1.0 / a;
  }
  t = fabs(a);
  ilo = __double2loint(t);
  ihi = __double2hiint(t);
  expo = ((int)((unsigned int)ihi >> 20) & 0x7ff);
  denorm = 0;
  if (expo == 0) {
    /* denormal */
    t = t * CUDART_TWO_TO_54;
    denorm = 18;
    ilo = __double2loint(t);
    ihi = __double2hiint(t);
    expo = ((int)((unsigned int)ihi >> 20) & 0x7ff);
  }
  /* scale into float range */
  nexpo = __float2int_rn(CUDART_THIRD_F * (float)(expo - 1022));
  ihi -= (3 * nexpo) << 20;
  r = __hiloint2double(ihi, ilo);
  /* initial approximation */
  s = (float)r;
  t = exp2f(-CUDART_THIRD_F * __log2f(s));                 /* approximate invcbrt */
  t = __fma_rn(__fma_rn(t*t,-r*t,1.0), CUDART_THIRD*t, t); /* refine invcbrt */
  t = __fma_rn(__fma_rn(t*t,-r*t,1.0), CUDART_THIRD*t, t); /* refine invcbrt */
  /* scale result back into double range */
  ilo = __double2loint(t);
  ihi = __double2hiint(t);
  ihi += (-(nexpo - denorm)) << 20;
  t = __hiloint2double(ihi, ilo);
  if (__double2hiint(a) < 0) {
    t = -t;
  }
  return t;
}
static __forceinline__ double __internal_accurate_pow(double a, double b)
{
  double2 loga;
  double2 prod;
  double t_hi, t_lo;
  double tmp;
  double e;
  /* compute log(a) in double-double format */
  loga = __internal_log_ext_prec(a);
  /* prevent overflow during extended precision multiply */
  if (fabs(b) > 1e304) b *= 1.220703125e-4;
  /* compute b * log(a) in double-double format */
  t_hi = loga.y * b;
  t_lo = __fma_rn (loga.y, b, -t_hi);
  t_lo = __fma_rn (loga.x, b, t_lo);
  prod.y = e = t_hi + t_lo;
  prod.x = (t_hi - e) + t_lo;
  /* compute pow(a,b) = exp(b*log(a)) */
  tmp = exp(prod.y);
  /* prevent -INF + INF = NaN */
  if (!__isinf(tmp)) {
    /* if prod.x is much smaller than prod.y, then exp(prod.y + prod.x) ~=
     * exp(prod.y) + prod.x * exp(prod.y)
     */
    tmp = __fma_rn (tmp, prod.x, tmp);
  }
  return tmp;
}
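/* b*log(a) must be accurate well beyond double precision: the product can
 * be as large as ~709, so a plain double product's absolute error (~2^-43)
 * would already perturb exp() by hundreds of ulps. Hence the double-double
 * log and the final __fma_rn folding in the tail prod.x. The factor
 * 1.220703125e-4 is exactly 2^-13; it rescales b only in a regime where the
 * final result saturates to 0 or infinity regardless. */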
static __forceinline__ double pow(double a, double b)
{
  int bIsOddInteger;
  double t;
  if (a == 1.0 || b == 0.0) {
    return 1.0;
  }
  if (__isnan(a) || __isnan(b)) {
    return a + b;
  }
  if (a == CUDART_INF) {
    return (__double2hiint(b) < 0) ? CUDART_ZERO : CUDART_INF;
  }
  if (__isinf(b)) {
    if (a == -1.0) {
      return 1.0;
    }
    t = fabs(a) > 1.0 ? CUDART_INF : CUDART_ZERO;
    if (b < CUDART_ZERO) {
      t = 1.0 / t;
    }
    return t;
  }
  bIsOddInteger = fabs(b - (2.0f * trunc(0.5 * b))) == 1.0;
  if (a == CUDART_ZERO) {
    t = bIsOddInteger ? a : CUDART_ZERO;
    if (b < CUDART_ZERO) {
      t = 1.0 / t;
    }
    return t;
  }
  if (a == -CUDART_INF) {
    t = (b < CUDART_ZERO) ? -1.0/a : -a;
    if (bIsOddInteger) {
      t = __longlong_as_double(__double_as_longlong(t)^0x8000000000000000ULL);
    }
    return t;
  }
  if ((a < CUDART_ZERO) && (b != trunc(b))) {
    return CUDART_NAN;
  }
  t = fabs(a);
  t = __internal_accurate_pow(t, b);
  if ((a < CUDART_ZERO) && bIsOddInteger) {
    t = __longlong_as_double(__double_as_longlong(t) ^ 0x8000000000000000ULL);
  }
  return t;
}
static __forceinline__ double erf(double a)
{
  double t, r, q;
  t = fabs(a);
  if (t >= 1.0) {
    r = -1.28836351230756500E-019;
    r = __fma_rn (r, t, 1.30597472161093370E-017);
    r = __fma_rn (r, t, -6.33924401259620500E-016);
    r = __fma_rn (r, t, 1.96231865908940140E-014);
    r = __fma_rn (r, t, -4.35272243559990750E-013);
    r = __fma_rn (r, t, 7.37083927929352150E-012);
    r = __fma_rn (r, t, -9.91402142550461630E-011);
    r = __fma_rn (r, t, 1.08817017167760820E-009);
    r = __fma_rn (r, t, -9.93918713097634620E-009);
skipping to change at line 1474 (old) / line 1355 (new)
    r = __fma_rn (r, q, 5.22397760611847340E-003);
    r = __fma_rn (r, q, -2.68661706431114690E-002);
    r = __fma_rn (r, q, 1.12837916709441850E-001);
    r = __fma_rn (r, q, -3.76126389031835210E-001);
    r = __fma_rn (r, q, 1.12837916709551260E+000);
    a = r * a;
  }
  return a;
}
static __forceinline__ double erfinv(double a)
{
  double fa, t;
  fa = fabs(a);
  if (fa >= 1.0) {
    t = CUDART_NAN; /* NaN */
    if (fa == 1.0) {
      t = a * CUDART_INF; /* Infinity */
    }
  } else if (fa >= 0.9375) {
    /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev
       Approximations for the Inverse of the Error Function. Mathematics of
       Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 59
     */
    double p, q;
    t = log1p(-fa);
    t = rsqrt(-t);
    p = 2.7834010353747001060e-3;
    p = __fma_rn (p, t, 8.6030097526280260580e-1);
    p = __fma_rn (p, t, 2.1371214997265515515e+0);
    p = __fma_rn (p, t, 3.1598519601132090206e+0);
    p = __fma_rn (p, t, 3.5780402569085996758e+0);
    p = __fma_rn (p, t, 1.5335297523989890804e+0);
    p = __fma_rn (p, t, 3.4839207139657522572e-1);
    p = __fma_rn (p, t, 5.3644861147153648366e-2);
    p = __fma_rn (p, t, 4.3836709877126095665e-3);
    p = __fma_rn (p, t, 1.3858518113496718808e-4);
skipping to change at line 1571 (old) / line 1452 (new)
    q = __fma_rn (q, t, .59039348134843665626e+4);
    q = __fma_rn (q, t, -.48481635430048872102e+4);
    q = __fma_rn (q, t, .18997769186453057810e+4);
    q = __fma_rn (q, t, -.28386514725366621129e+3);
    p = p / q;
    t = a * p;
  }
  return t;
}
static __forceinline__ double erfcinv(double a)
{
  double t;
  if (a <= CUDART_ZERO) {
    t = CUDART_NAN;
    if (a == CUDART_ZERO) {
      t = (1.0 - a) * CUDART_INF;
    }
  }
  else if (a >= 0.0625) {
    t = erfinv (1.0 - a);
  }
  else if (a >= 1e-100) {
    /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev
       Approximations for the Inverse of the Error Function. Mathematics of
       Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 59
     */
    double p, q;
    t = log(a);
    t = rsqrt(-t);
    p = 2.7834010353747001060e-3;
    p = __fma_rn (p, t, 8.6030097526280260580e-1);
    p = __fma_rn (p, t, 2.1371214997265515515e+0);
    p = __fma_rn (p, t, 3.1598519601132090206e+0);
    p = __fma_rn (p, t, 3.5780402569085996758e+0);
    p = __fma_rn (p, t, 1.5335297523989890804e+0);
    p = __fma_rn (p, t, 3.4839207139657522572e-1);
    p = __fma_rn (p, t, 5.3644861147153648366e-2);
    p = __fma_rn (p, t, 4.3836709877126095665e-3);
    p = __fma_rn (p, t, 1.3858518113496718808e-4);
skipping to change at line 1623 (old) / line 1501 (new)
    q = __fma_rn (q, t, 1.3858762165532246059e-4);
    q = __fma_rn (q, t, 1.1738313872397777529e-6);
    t = p / (q * t);
  }
  else {
    /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev
       Approximations for the Inverse of the Error Function. Mathematics of
       Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 82
     */
    double p, q;
    t = log(a);
    t = rsqrt(-t);
    p = 6.9952990607058154858e-1;
    p = __fma_rn (p, t, 1.9507620287580568829e+0);
    p = __fma_rn (p, t, 8.2810030904462690216e-1);
    p = __fma_rn (p, t, 1.1279046353630280005e-1);
    p = __fma_rn (p, t, 6.0537914739162189689e-3);
    p = __fma_rn (p, t, 1.3714329569665128933e-4);
    p = __fma_rn (p, t, 1.2964481560643197452e-6);
    p = __fma_rn (p, t, 4.6156006321345332510e-9);
    p = __fma_rn (p, t, 4.5344689563209398450e-12);
    q = t + 1.5771922386662040546e+0;
skipping to change at line 1648 (old) / line 1526 (new)
    q = __fma_rn (q, t, 6.0574830550097140404e-3);
    q = __fma_rn (q, t, 1.3715891988350205065e-4);
    q = __fma_rn (q, t, 1.2964671850944981713e-6);
    q = __fma_rn (q, t, 4.6156017600933592558e-9);
    q = __fma_rn (q, t, 4.5344687377088206783e-12);
    t = p / (q * t);
  }
  return t;
}
__device_func__(double __cuda_erfc(double a)) static __forceinline__ double erfc(double a)
{ {
double p, q, h, l; double p, q, h, l;
int ahi; int ahi;
ahi = __double2hiint(a); ahi = __double2hiint(a);
if (ahi < (int)0x3fea0400) { /* 1665/2048 */ if (ahi < (int)0x3fea0400) { /* 1665/2048 */
return 1.0 - __cuda_erf(a); return 1.0 - erf(a);
} }
if (ahi < (int)0x40140000) { /* 5.0 */ if (ahi < (int)0x40140000) { /* 5.0 */
/* On the interval [1665/2048, 5.0] the following approximation is used : /* On the interval [1665/2048, 5.0] the following approximation is used :
erfc(a) = (1.0 + 1/a * r(1/a)) * 1/a * 0.5 * exp(-a*a), where the ra nge erfc(a) = (1.0 + 1/a * r(1/a)) * 1/a * 0.5 * exp(-a*a), where the ra nge
of r(1/a) is approximately [-0.17, 0.11]. r(1/a) is computed by rati onal of r(1/a) is approximately [-0.17, 0.11]. r(1/a) is computed by rati onal
approximation. approximation.
*/ */
double t; double t;
t = 1.0 / a; t = __internal_fast_rcp(a);
p = -1.0000000252849461E+000; p = -1.0000000252849461E+000;
p = __fma_rn (p, t, -7.3398971987771156E-001); p = __fma_rn (p, t, -7.3398971987771156E-001);
p = __fma_rn (p, t, -1.4685633784433072E-001); p = __fma_rn (p, t, -1.4685633784433072E-001);
p = __fma_rn (p, t, 1.2963557011001836E-001); p = __fma_rn (p, t, 1.2963557011001836E-001);
p = __fma_rn (p, t, 1.0901177826674287E-001); p = __fma_rn (p, t, 1.0901177826674287E-001);
p = __fma_rn (p, t, 3.9250612663155882E-002); p = __fma_rn (p, t, 3.9250612663155882E-002);
p = __fma_rn (p, t, 7.5883167167654269E-003); p = __fma_rn (p, t, 7.5883167167654269E-003);
p = __fma_rn (p, t, 6.6438196820856965E-004); p = __fma_rn (p, t, 6.6438196820856965E-004);
q = t + 2.7339900293714838E+000; q = t + 2.7339900293714838E+000;
q = __fma_rn (q, t, 3.3580762542361291E+000); q = __fma_rn (q, t, 3.3580762542361291E+000);
q = __fma_rn (q, t, 2.4165688909166021E+000); q = __fma_rn (q, t, 2.4165688909166021E+000);
q = __fma_rn (q, t, 1.1092158770004934E+000); q = __fma_rn (q, t, 1.1092158770004934E+000);
q = __fma_rn (q, t, 3.2845571970789467E-001); q = __fma_rn (q, t, 3.2845571970789467E-001);
q = __fma_rn (q, t, 5.9110343116276186E-002); q = __fma_rn (q, t, 5.9110343116276186E-002);
q = __fma_rn (q, t, 5.1750858802842702E-003); q = __fma_rn (q, t, 5.1750858802842702E-003);
q = __fma_rn (q, t, 1.2937416364002241E-009); q = __fma_rn (q, t, 1.2937416364002241E-009);
q = 1.0 / q; q = __internal_fast_rcp(q);
p = p * q; p = p * q;
p = p * t; p = p * t;
h = a * a; h = a * a;
l = __fma_rn (a, a, -h); l = __fma_rn (a, a, -h);
q = __internal_exp_kernel(-h, -1); q = __internal_exp_kernel(-h, -1);
q = __fma_rn (l, -q, q); q = __fma_rn (l, -q, q);
p = __fma_rn (p, q, q); p = __fma_rn (p, q, q);
p = p * t; p = p * t;
} else { } else {
/* max error 4 ulps on [5, 27.3] */ /* max error 4 ulps on [5, 27.3] */
double ooa, ooasq; double ooa, ooasq;
ooa = 1.0 / a; ooa = __internal_fast_rcp(a);
ooasq = ooa * ooa; ooasq = ooa * ooa;
p = -4.0025406686930527E+005; p = -4.0025406686930527E+005;
p = __fma_rn (p, ooasq, 1.4420582543942123E+005); p = __fma_rn (p, ooasq, 1.4420582543942123E+005);
p = __fma_rn (p, ooasq, -2.7664185780951841E+004); p = __fma_rn (p, ooasq, -2.7664185780951841E+004);
p = __fma_rn (p, ooasq, 4.1144611644767283E+003); p = __fma_rn (p, ooasq, 4.1144611644767283E+003);
p = __fma_rn (p, ooasq, -5.8706000519209351E+002); p = __fma_rn (p, ooasq, -5.8706000519209351E+002);
p = __fma_rn (p, ooasq, 9.1490086446323375E+001); p = __fma_rn (p, ooasq, 9.1490086446323375E+001);
p = __fma_rn (p, ooasq, -1.6659491387740221E+001); p = __fma_rn (p, ooasq, -1.6659491387740221E+001);
p = __fma_rn (p, ooasq, 3.7024804085481784E+000); p = __fma_rn (p, ooasq, 3.7024804085481784E+000);
p = __fma_rn (p, ooasq, -1.0578553994424316E+000); p = __fma_rn (p, ooasq, -1.0578553994424316E+000);
skipping to change at line 1723 skipping to change at line 1601
p = p * ooa; p = p * ooa;
p = p * q; p = p * q;
if (a > 27.3) { if (a > 27.3) {
p = 0.0; p = 0.0;
} }
} }
return p; return p;
} }
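The 27.3 cutoff above is where erfc underflows double precision entirely: the leading asymptotic term is erfc(a) ~ exp(-a*a) / (a*sqrt(pi)), and exp(-a*a) already falls below the smallest subnormal (2^-1074, roughly exp(-745.13)) once a*a > 745.13, i.e. a > 27.29. A minimal host-side sketch of that observation, assuming only C99 <math.h>:

#include <math.h>
#include <stdio.h>
int main(void)
{
  double a = 27.3;
  /* leading term of the asymptotic expansion of erfc */
  double lead = exp(-a * a) / (a * sqrt(3.14159265358979323846));
  printf("%g\n", lead);   /* prints 0: exp(-745.29) underflows double */
  return 0;
}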
/* approximate 1.0/(a*gamma(a)) on [-0.5,0.5] */ /* approximate 1.0/(a*gamma(a)) on [-0.5,0.5] */
__device_func__(double __internal_tgamma_kernel(double a)) static __forceinline__ double __internal_tgamma_kernel(double a)
{ {
double t; double t;
t = -4.42689340712524750E-010; t = -4.42689340712524750E-010;
t = __fma_rn (t, a, -2.02665918466589540E-007); t = __fma_rn (t, a, -2.02665918466589540E-007);
t = __fma_rn (t, a, 1.13812117211195270E-006); t = __fma_rn (t, a, 1.13812117211195270E-006);
t = __fma_rn (t, a, -1.25077348166307480E-006); t = __fma_rn (t, a, -1.25077348166307480E-006);
t = __fma_rn (t, a, -2.01365017404087710E-005); t = __fma_rn (t, a, -2.01365017404087710E-005);
t = __fma_rn (t, a, 1.28050126073544860E-004); t = __fma_rn (t, a, 1.28050126073544860E-004);
t = __fma_rn (t, a, -2.15241408115274180E-004); t = __fma_rn (t, a, -2.15241408115274180E-004);
t = __fma_rn (t, a, -1.16516754597046040E-003); t = __fma_rn (t, a, -1.16516754597046040E-003);
skipping to change at line 1746 skipping to change at line 1624
t = __fma_rn (t, a, -4.21977345547223940E-002); t = __fma_rn (t, a, -4.21977345547223940E-002);
t = __fma_rn (t, a, 1.66538611382503560E-001); t = __fma_rn (t, a, 1.66538611382503560E-001);
t = __fma_rn (t, a, -4.20026350341054440E-002); t = __fma_rn (t, a, -4.20026350341054440E-002);
t = __fma_rn (t, a, -6.55878071520257120E-001); t = __fma_rn (t, a, -6.55878071520257120E-001);
t = __fma_rn (t, a, 5.77215664901532870E-001); t = __fma_rn (t, a, 5.77215664901532870E-001);
t = __fma_rn (t, a, 1.00000000000000000E+000); t = __fma_rn (t, a, 1.00000000000000000E+000);
return t; return t;
} }
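As a sanity check on the polynomial above: it approximates 1/(a*gamma(a)), so at a = 0.5 it should produce roughly 1/(0.5*gamma(0.5)) = 2/sqrt(pi) = 1.1283791670955126. A host-side cross-check against libm, assuming C99 <math.h>:

#include <math.h>
#include <stdio.h>
int main(void)
{
  double a = 0.5;
  /* reference for the kernel's value at a = 0.5: 2/sqrt(pi) */
  printf("%.17g\n", 1.0 / (a * tgamma(a)));   /* ~1.1283791670955126 */
  return 0;
}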
/* Stirling approximation for gamma(a), a > 20 */ /* Stirling approximation for gamma(a), a > 20 */
__device_func__(double __internal_stirling_poly(double a)) static __forceinline__ double __internal_stirling_poly(double a)
{ {
double x = 1.0 / a; double x = __internal_fast_rcp(a);
double z = 0.0; double z = 0.0;
z = __fma_rn (z, x, 8.3949872067208726e-004); z = __fma_rn (z, x, 8.3949872067208726e-004);
z = __fma_rn (z, x, -5.1717909082605919e-005); z = __fma_rn (z, x, -5.1717909082605919e-005);
z = __fma_rn (z, x, -5.9216643735369393e-004); z = __fma_rn (z, x, -5.9216643735369393e-004);
z = __fma_rn (z, x, 6.9728137583658571e-005); z = __fma_rn (z, x, 6.9728137583658571e-005);
z = __fma_rn (z, x, 7.8403922172006662e-004); z = __fma_rn (z, x, 7.8403922172006662e-004);
z = __fma_rn (z, x, -2.2947209362139917e-004); z = __fma_rn (z, x, -2.2947209362139917e-004);
z = __fma_rn (z, x, -2.6813271604938273e-003); z = __fma_rn (z, x, -2.6813271604938273e-003);
z = __fma_rn (z, x, 3.4722222222222220e-003); z = __fma_rn (z, x, 3.4722222222222220e-003);
z = __fma_rn (z, x, 8.3333333333333329e-002); z = __fma_rn (z, x, 8.3333333333333329e-002);
z = __fma_rn (z, x, 1.0000000000000000e+000); z = __fma_rn (z, x, 1.0000000000000000e+000);
return z; return z;
} }
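The coefficients above are the classical Stirling series (1/12 = 8.333...e-2, 1/288 = 3.472...e-3, -139/51840 = -2.681...e-3, ...), so that gamma(a) ~ sqrt(2*pi) * pow(a, a - 0.5) * exp(-a) * z(1/a). A host-side sketch with only the first few terms, assuming C99 <math.h>; the helper name stirling_gamma is for illustration, and the truncated series is only good to about 1e-9 relative error at a = 25:

#include <math.h>
#include <stdio.h>
static double stirling_gamma(double a)   /* sketch, for a > 20 */
{
  double x = 1.0 / a;
  double z = 1.0 + x * (1.0 / 12 + x * (1.0 / 288 - x * (139.0 / 51840)));
  return sqrt(2.0 * 3.14159265358979323846) * pow(a, a - 0.5) * exp(-a) * z;
}
int main(void)
{
  /* both print ~6.20448401733239e+23 (= 24!) to roughly 9 digits */
  printf("%.15g\n%.15g\n", stirling_gamma(25.0), tgamma(25.0));
  return 0;
}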
__device_func__(double __internal_tgamma_stirling(double a)) static __forceinline__ double __internal_tgamma_stirling(double a)
{ {
if (a < 1.7162437695630274e+002) { if (a < 1.7162437695630274e+002) {
#if defined(__GNUC__) && !defined(__CUDABE__)
volatile
#endif
double t_hi, t_lo, e; double t_hi, t_lo, e;
double2 loga, prod; double2 loga, prod;
double z = __internal_stirling_poly (a); double z = __internal_stirling_poly (a);
double b = a - 0.5; double b = a - 0.5;
/* compute log(a) in double-double format */ /* compute log(a) in double-double format */
loga = __internal_log_ext_prec(a); loga = __internal_log_ext_prec(a);
/* compute (a - 0.5) * log(a) in double-double format */ /* compute (a - 0.5) * log(a) in double-double format */
skipping to change at line 1791 skipping to change at line 1666
t_lo = __fma_rn (loga.x, b, t_lo); t_lo = __fma_rn (loga.x, b, t_lo);
prod.y = e = t_hi + t_lo; prod.y = e = t_hi + t_lo;
prod.x = (t_hi - e) + t_lo; prod.x = (t_hi - e) + t_lo;
/* compute (a - 0.5) * log(a) - a in double-double format */ /* compute (a - 0.5) * log(a) - a in double-double format */
loga.y = -a; loga.y = -a;
loga.x = 0.0; loga.x = 0.0;
prod = __internal_ddadd_xgty (prod, loga); prod = __internal_ddadd_xgty (prod, loga);
/* compute pow(a,b) = exp(b*log(a)) */ /* compute pow(a,b) = exp(b*log(a)) */
a = __cuda_exp(prod.y); a = exp(prod.y);
/* prevent -INF + INF = NaN */ /* prevent -INF + INF = NaN */
if (!__cuda___isinf(a)) { if (!__isinf(a)) {
/* if prod.x is much smaller than prod.y, then exp(prod.y + prod.x) ~= /* if prod.x is much smaller than prod.y, then exp(prod.y + prod.x) ~=
* exp(prod.y) + prod.x * exp(prod.y) * exp(prod.y) + prod.x * exp(prod.y)
*/ */
a = __fma_rn (a, prod.x, a); a = __fma_rn (a, prod.x, a);
} }
a = __fma_rn (a, CUDART_SQRT_2PI_HI, a * CUDART_SQRT_2PI_LO); a = __fma_rn (a, CUDART_SQRT_2PI_HI, a * CUDART_SQRT_2PI_LO);
return a * z; return a * z;
} else { } else {
return CUDART_INF; return CUDART_INF;
} }
} }
__device_func__(double __cuda_tgamma(double a)) static __forceinline__ double tgamma(double a)
{ {
double s, xx, x = a; double s, xx, x = a;
if (__cuda___isnan(a)) { if (__isnan(a)) {
return a + a; return a + a;
} }
if (__cuda_fabs(x) < 15.0) { if (fabs(x) < 15.0) {
/* Based on: Kraemer, W.: "Berechnung der Gammafunktion G(x) fuer reelle /* Based on: Kraemer, W.: "Berechnung der Gammafunktion G(x) fuer reelle
* Punkt- und Intervallargumente". Zeitschrift fuer angewandte Mathematik * Punkt- und Intervallargumente". Zeitschrift fuer angewandte Mathematik
* und Mechanik, Vol. 70 (1990), No. 6, pp. 581-584 * und Mechanik, Vol. 70 (1990), No. 6, pp. 581-584
*/ */
if (x >= 0.0) { if (x >= 0.0) {
s = 1.0; s = 1.0;
xx = x; xx = x;
while (xx > 1.5) { while (xx > 1.5) {
s = __fma_rn(s, xx, -s); s = __fma_rn(s, xx, -s);
xx = xx - 1.0; xx = xx - 1.0;
skipping to change at line 1835 skipping to change at line 1710
xx = xx - 1.0; xx = xx - 1.0;
} }
xx = __internal_tgamma_kernel (xx); xx = __internal_tgamma_kernel (xx);
if (x < 0.5) { if (x < 0.5) {
xx = xx * x; xx = xx * x;
} }
s = s / xx; s = s / xx;
} else { } else {
xx = x; xx = x;
s = xx; s = xx;
if (x == __cuda_trunc(x)) { if (x == trunc(x)) {
return CUDART_NAN; return CUDART_NAN;
} }
while (xx < -0.5) { while (xx < -0.5) {
s = __fma_rn (s, xx, s); s = __fma_rn (s, xx, s);
xx = xx + 1.0; xx = xx + 1.0;
} }
xx = __internal_tgamma_kernel (xx); xx = __internal_tgamma_kernel (xx);
s = s * xx; s = s * xx;
s = 1.0 / s; s = 1.0 / s;
} }
return s; return s;
} else { } else {
if (x >= 0.0) { if (x >= 0.0) {
return __internal_tgamma_stirling (x); return __internal_tgamma_stirling (x);
} else { } else {
double t; double t;
int quot; int quot;
if (x == __cuda_trunc(x)) { if (x == trunc(x)) {
return CUDART_NAN; return CUDART_NAN;
} }
if (x < -185.0) { if (x < -185.0) {
int negative; int negative;
x = __cuda_floor(x); x = floor(x);
negative = ((x - (2.0 * __cuda_floor(0.5 * x))) == 1.0); negative = ((x - (2.0 * floor(0.5 * x))) == 1.0);
return negative ? CUDART_NEG_ZERO : CUDART_ZERO; return negative ? CUDART_NEG_ZERO : CUDART_ZERO;
} }
/* compute sin(pi*x) accurately */ /* compute sin(pi*x) accurately */
xx = __cuda_rint (__internal_twice(x)); xx = rint (__internal_twice(x));
quot = (int)xx; quot = (int)xx;
xx = __fma_rn (-0.5, xx, x); xx = __fma_rn (-0.5, xx, x);
xx = xx * CUDART_PI; xx = xx * CUDART_PI;
if (quot & 1) { if (quot & 1) {
xx = __internal_cos_kerneld (xx); xx = __internal_cos_kerneld (xx);
} else { } else {
xx = __internal_sin_kerneld (xx); xx = __internal_sin_kerneld (xx);
} }
if (quot & 2) { if (quot & 2) {
xx = -xx; xx = -xx;
} }
x = __cuda_fabs (x); x = fabs (x);
s = __cuda_exp (-x); s = exp (-x);
t = x - 0.5; t = x - 0.5;
if (x > 140.0) t = __internal_half(t); if (x > 140.0) t = __internal_half(t);
t = __cuda_pow (x, t); t = pow (x, t);
if (x > 140.0) s = s * t; if (x > 140.0) s = s * t;
s = s * __internal_stirling_poly (x); s = s * __internal_stirling_poly (x);
s = s * x; s = s * x;
s = s * xx; s = s * xx;
s = 1.0 / s; s = 1.0 / s;
s = __fma_rn (s, CUDART_SQRT_PIO2_HI, CUDART_SQRT_PIO2_LO * s); s = __fma_rn (s, CUDART_SQRT_PIO2_HI, CUDART_SQRT_PIO2_LO * s);
s = s / t; s = s / t;
return s; return s;
} }
} }
} }
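The negative-argument branch above is the reflection formula gamma(x)*gamma(1-x) = pi/sin(pi*x), with sin(pi*x) evaluated accurately by reducing against rint(2x) first. The identity itself, sketched host-side with C99 <math.h>:

#include <math.h>
#include <stdio.h>
int main(void)
{
  double x  = -2.5;                     /* gamma(-2.5) = -8*sqrt(pi)/15 */
  double pi = 3.14159265358979323846;
  double via_reflection = pi / (sin(pi * x) * tgamma(1.0 - x));
  /* both print ~ -0.945308720482942 */
  printf("%.15g\n%.15g\n", via_reflection, tgamma(x));
  return 0;
}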
__device_func__(double __internal_lgamma_pos(double a)) static __forceinline__ double __internal_lgamma_pos(double a)
{ {
double sum; double sum;
double s, t; double s, t;
if (a == CUDART_INF) { if (a == CUDART_INF) {
return a; return a;
} }
if (a >= 3.0) { if (a >= 3.0) {
if (a >= 8.0) { if (a >= 8.0) {
/* Stirling approximation; coefficients from Hart et al, "Computer /* Stirling approximation; coefficients from Hart et al, "Computer
* Approximations", Wiley 1968. Approximation 5404. * Approximations", Wiley 1968. Approximation 5404.
*/ */
s = 1.0 / a; s = __internal_fast_rcp(a);
t = s * s; t = s * s;
sum = -0.1633436431e-2; sum = -0.1633436431e-2;
sum = __fma_rn (sum, t, 0.83645878922e-3); sum = __fma_rn (sum, t, 0.83645878922e-3);
sum = __fma_rn (sum, t, -0.5951896861197e-3); sum = __fma_rn (sum, t, -0.5951896861197e-3);
sum = __fma_rn (sum, t, 0.793650576493454e-3); sum = __fma_rn (sum, t, 0.793650576493454e-3);
sum = __fma_rn (sum, t, -0.277777777735865004e-2); sum = __fma_rn (sum, t, -0.277777777735865004e-2);
sum = __fma_rn (sum, t, 0.833333333333331018375e-1); sum = __fma_rn (sum, t, 0.833333333333331018375e-1);
sum = __fma_rn (sum, s, 0.918938533204672); sum = __fma_rn (sum, s, 0.918938533204672);
s = __internal_half(__cuda_log (a)); s = __internal_half(log (a));
t = a - 0.5; t = a - 0.5;
s = s * t; s = s * t;
t = s - a; t = s - a;
s = s + sum; s = s + sum;
t = t + s; t = t + s;
return t; return t;
} else { } else {
a = a - 3.0; a = a - 3.0;
s = -4.02412642744125560E+003; s = -4.02412642744125560E+003;
s = __fma_rn (s, a, -2.97693796998962000E+005); s = __fma_rn (s, a, -2.97693796998962000E+005);
skipping to change at line 2009 skipping to change at line 1884
t = __fma_rn (t, a, -1.16484324388538480E-003); t = __fma_rn (t, a, -1.16484324388538480E-003);
t = __fma_rn (t, a, 7.21883433044470670E-003); t = __fma_rn (t, a, 7.21883433044470670E-003);
t = __fma_rn (t, a, -9.62194579514229560E-003); t = __fma_rn (t, a, -9.62194579514229560E-003);
t = __fma_rn (t, a, -4.21977386992884450E-002); t = __fma_rn (t, a, -4.21977386992884450E-002);
t = __fma_rn (t, a, 1.66538611813682460E-001); t = __fma_rn (t, a, 1.66538611813682460E-001);
t = __fma_rn (t, a, -4.20026350606819980E-002); t = __fma_rn (t, a, -4.20026350606819980E-002);
t = __fma_rn (t, a, -6.55878071519427450E-001); t = __fma_rn (t, a, -6.55878071519427450E-001);
t = __fma_rn (t, a, 5.77215664901523870E-001); t = __fma_rn (t, a, 5.77215664901523870E-001);
t = t * a; t = t * a;
t = __fma_rn (t, a, a); t = __fma_rn (t, a, a);
return -__cuda_log (t); return -log (t);
} }
} }
__device_func__(double __cuda_lgamma(double a)) static __forceinline__ double lgamma(double a)
{ {
double t; double t;
double i; double i;
long long int quot; long long int quot;
if (__cuda___isnan(a)) { if (__isnan(a)) {
return a + a; return a + a;
} }
t = __internal_lgamma_pos(__cuda_fabs(a)); t = __internal_lgamma_pos(fabs(a));
if (a >= 0.0) return t; if (a >= 0.0) return t;
a = __cuda_fabs(a); a = fabs(a);
i = __cuda_trunc(a); i = trunc(a);
if (a == i) return CUDART_INF; /* a is an integer: return infinity */ if (a == i) return CUDART_INF; /* a is an integer: return infinity */
if (a < 1e-19) return -__cuda_log(a); if (a < 1e-19) return -log(a);
i = __cuda_rint (2.0 * a); i = rint (2.0 * a);
quot = (long long int)i; quot = (long long int)i;
i = __fma_rn (-0.5, i, a); i = __fma_rn (-0.5, i, a);
i = i * CUDART_PI; i = i * CUDART_PI;
if (quot & 1) { if (quot & 1) {
i = __internal_cos_kerneld(i); i = __internal_cos_kerneld(i);
} else { } else {
i = __internal_sin_kerneld(i); i = __internal_sin_kerneld(i);
} }
i = __cuda_fabs(i); i = fabs(i);
t = __cuda_log(CUDART_PI / (i * a)) - t; t = log(CUDART_PI / (i * a)) - t;
return t; return t;
} }
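For negative non-integers this is the log form of the same reflection: lgamma(x) = log(pi / (|sin(pi*x)| * |x|)) - lgamma(|x|). A host-side check at x = -0.5, where |gamma(-0.5)| = 2*sqrt(pi), assuming C99 <math.h>:

#include <math.h>
#include <stdio.h>
int main(void)
{
  double x = -0.5, pi = 3.14159265358979323846;
  double t = log(pi / (fabs(sin(pi * x)) * fabs(x))) - lgamma(fabs(x));
  /* both print log(2*sqrt(pi)) = 1.26551212348465 */
  printf("%.15g\n%.15g\n", t, lgamma(x));
  return 0;
}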
__device_func__(double __cuda_ldexp(double a, int b)) static __forceinline__ double ldexp(double a, int b)
{ {
double fa = __cuda_fabs (a); double fa = fabs (a);
if ((fa == CUDART_ZERO) || (fa == CUDART_INF) || (!(fa <= CUDART_INF))) { if ((fa == CUDART_ZERO) || (fa == CUDART_INF) || (!(fa <= CUDART_INF))) {
return a + a; return a + a;
} }
if (b == 0) { if (b == 0) {
return a; return a;
} }
if (b > 2200) b = 2200; if (b > 2200) b = 2200;
if (b < -2200) b = -2200; if (b < -2200) b = -2200;
if (__cuda_abs (b) < 1022) { if (abs (b) < 1022) {
return a * __internal_exp2i_kernel(b); return a * __internal_exp2i_kernel(b);
} }
if (__cuda_abs (b) < 2044) { if (abs (b) < 2044) {
int bhalf = b / 2; int bhalf = b / 2;
return a * __internal_exp2i_kernel (bhalf) * return a * __internal_exp2i_kernel (bhalf) *
__internal_exp2i_kernel (b - bhalf); __internal_exp2i_kernel (b - bhalf);
} else { } else {
int bquarter = b / 4; int bquarter = b / 4;
double t = __internal_exp2i_kernel(bquarter); double t = __internal_exp2i_kernel(bquarter);
return a * t * t * t *__internal_exp2i_kernel (b - 3 * bquarter); return a * t * t * t *__internal_exp2i_kernel (b - 3 * bquarter);
} }
} }
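Splitting the exponent keeps every intermediate scale factor representable: 2^b by itself overflows a double once |b| >= 1024, but the halves (or quarters) stay finite, so the scaling can proceed stepwise. A host-side sketch of the two-factor case, assuming C99 <math.h>:

#include <math.h>
#include <stdio.h>
int main(void)
{
  double a = 0x1p-1074;                 /* smallest subnormal */
  int b = 2000, h = b / 2;
  double stepwise = a * pow(2.0, h) * pow(2.0, b - h);  /* 2^926 ~ 5.7e278 */
  double naive    = a * pow(2.0, b);    /* pow overflows: a * inf = inf */
  printf("stepwise=%g naive=%g\n", stepwise, naive);
  return 0;
}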
__device_func__(double __cuda_scalbn(double a, int b)) static __forceinline__ double scalbn(double a, int b)
{ {
/* On binary systems, ldexp(x,exp) is equivalent to scalbn(x,exp) */ /* On binary systems, ldexp(x,exp) is equivalent to scalbn(x,exp) */
return __cuda_ldexp(a, b); return ldexp(a, b);
} }
__device_func__(double __cuda_scalbln(double a, long int b)) static __forceinline__ double scalbln(double a, long int b)
{ {
#if defined(__LP64__) #if defined(__LP64__)
/* clamp to integer range prior to conversion */ /* clamp to integer range prior to conversion */
if (b < -2147483648L) b = -2147483648L; if (b < -2147483648L) b = -2147483648L;
if (b > 2147483647L) b = 2147483647L; if (b > 2147483647L) b = 2147483647L;
#endif #endif /* __LP64__ */
return __cuda_scalbn(a, (int)b); return scalbn(a, (int)b);
} }
__device_func__(double __cuda_frexp(double a, int *b)) static __forceinline__ double frexp(double a, int *b)
{ {
double fa = __cuda_fabs(a); double fa = fabs(a);
unsigned int expo; unsigned int expo;
unsigned int denorm; unsigned int denorm;
if (fa < CUDART_TWO_TO_M1022) { if (fa < CUDART_TWO_TO_M1022) {
a *= CUDART_TWO_TO_54; a *= CUDART_TWO_TO_54;
denorm = 54; denorm = 54;
} else { } else {
denorm = 0; denorm = 0;
} }
expo = (__double2hiint(a) >> 20) & 0x7ff; expo = (__double2hiint(a) >> 20) & 0x7ff;
skipping to change at line 2107 skipping to change at line 1982
a = a + a; a = a + a;
} else { } else {
expo = expo - denorm - 1022; expo = expo - denorm - 1022;
a = __longlong_as_double((__double_as_longlong(a) & 0x800fffffffffffffU LL)| a = __longlong_as_double((__double_as_longlong(a) & 0x800fffffffffffffU LL)|
0x3fe0000000000000ULL); 0x3fe0000000000000ULL);
} }
*b = expo; *b = expo;
return a; return a;
} }
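The 2^54 pre-scaling gives denormal inputs a nonzero biased exponent field before it is extracted, and the 54 is subtracted back out via "denorm". Basic usage of the resulting contract (fraction in [0.5, 1) times a power of two), assuming C99 <math.h>:

#include <math.h>
#include <stdio.h>
int main(void)
{
  int e;
  double f = frexp(48.0, &e);           /* 48 = 0.75 * 2^6 */
  printf("%g * 2^%d\n", f, e);          /* prints 0.75 * 2^6 */
  return 0;
}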
__device_func__(double __cuda_modf(double a, double *b)) static __forceinline__ double modf(double a, double *b)
{ {
double t; double t;
if (__cuda___finite(a)) { if (__finite(a)) {
t = __cuda_trunc(a); t = trunc(a);
*b = t; *b = t;
t = a - t; t = a - t;
return __internal_copysign_pos(t, a); return __internal_copysign_pos(t, a);
} else if (__cuda___isinf(a)) { } else if (__isinf(a)) {
t = 0.0; t = 0.0;
*b = a; *b = a;
return __internal_copysign_pos(t, a); return __internal_copysign_pos(t, a);
} else { } else {
*b = a + a; *b = a + a;
return a + a; return a + a;
} }
} }
__device_func__(double __cuda_fmod(double a, double b)) static __forceinline__ double fmod(double a, double b)
{ {
double orig_a = a; double orig_a = a;
double orig_b = b; double orig_b = b;
a = __cuda_fabs(a); a = fabs(a);
b = __cuda_fabs(b); b = fabs(b);
if (!((a <= CUDART_INF) && (b <= CUDART_INF))) { if (!((a <= CUDART_INF) && (b <= CUDART_INF))) {
return orig_a + orig_b; return orig_a + orig_b;
} }
if (a == CUDART_INF || b == 0.0) { if (a == CUDART_INF || b == 0.0) {
return CUDART_NAN; return CUDART_NAN;
} else if (a >= b) { } else if (a >= b) {
int bhi = __double2hiint(b); int bhi = __double2hiint(b);
int blo = __double2loint(b); int blo = __double2loint(b);
int ahi = __double2hiint(a); int ahi = __double2hiint(a);
double scaled_b = 0.0; double scaled_b = 0.0;
skipping to change at line 2168 skipping to change at line 2043
a -= scaled_b; a -= scaled_b;
} }
scaled_b *= 0.5; scaled_b *= 0.5;
} }
return __internal_copysign_pos(a, orig_a); return __internal_copysign_pos(a, orig_a);
} else { } else {
return orig_a; return orig_a;
} }
} }
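The subtraction loop above is binary long division on doubles: scaled_b is b raised to a's binade, then repeatedly subtracted and halved. Each subtraction is exact by Sterbenz's lemma, since it only fires when scaled_b <= a < 2*scaled_b. The same idea in a portable sketch for positive finite inputs (the helper name fmod_pos is for illustration; the real code above also handles denormals and does the alignment at the bit level):

#include <stdio.h>
static double fmod_pos(double a, double b)    /* requires 0 < b <= a, finite */
{
  double sb = b;
  while (sb <= 0.5 * a) sb += sb;       /* sb = b * 2^k, largest with sb <= a */
  while (sb >= b) {
    if (a >= sb) a -= sb;               /* exact: sb <= a < 2*sb */
    sb *= 0.5;                          /* peel one quotient bit per step */
  }
  return a;
}
int main(void)
{
  printf("%g\n", fmod_pos(10.0, 3.0));  /* prints 1 */
  return 0;
}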
__device_func__(double __cuda_remainder(double a, double b)) static __forceinline__ double remainder(double a, double b)
{ {
double orig_a; double orig_a;
double twoa = 0.0; double twoa = 0.0;
unsigned int quot0 = 0; /* quotient bit 0 */ unsigned int quot0 = 0; /* quotient bit 0 */
int bhi; int bhi;
int blo; int blo;
int ahi; int ahi;
if (__cuda___isnan(a) || __cuda___isnan(b)) { if (__isnan(a) || __isnan(b)) {
return a + b; return a + b;
} }
orig_a = a; orig_a = a;
a = __cuda_fabs(a); a = fabs(a);
b = __cuda_fabs(b); b = fabs(b);
if (a == CUDART_INF || b == 0.0) { if (a == CUDART_INF || b == 0.0) {
return CUDART_NAN; return CUDART_NAN;
} else if (a >= b) { } else if (a >= b) {
double scaled_b = 0.0; double scaled_b = 0.0;
bhi = __double2hiint(b); bhi = __double2hiint(b);
blo = __double2loint(b); blo = __double2loint(b);
ahi = __double2hiint(a); ahi = __double2hiint(a);
if (b < CUDART_TWO_TO_M1022) { if (b < CUDART_TWO_TO_M1022) {
double t = b; double t = b;
while ((t < a) && (t < CUDART_TWO_TO_M1022)) { while ((t < a) && (t < CUDART_TWO_TO_M1022)) {
skipping to change at line 2225 skipping to change at line 2100
if ((twoa > b) || ((twoa == b) && quot0)) { if ((twoa > b) || ((twoa == b) && quot0)) {
a -= b; a -= b;
} }
bhi = __double2hiint(a); bhi = __double2hiint(a);
blo = __double2loint(a); blo = __double2loint(a);
ahi = __double2hiint(orig_a); ahi = __double2hiint(orig_a);
a = __hiloint2double((ahi & 0x80000000) ^ bhi, blo); a = __hiloint2double((ahi & 0x80000000) ^ bhi, blo);
return a; return a;
} }
__device_func__(double __cuda_remquo(double a, double b, int *c)) static __forceinline__ double remquo(double a, double b, int *c)
{ {
double orig_a; double orig_a;
double twoa = 0.0; double twoa = 0.0;
unsigned int quot = 0; /* trailing quotient bits */ unsigned int quot = 0; /* trailing quotient bits */
unsigned int sign; unsigned int sign;
int bhi; int bhi;
int blo; int blo;
int ahi; int ahi;
if (__cuda___isnan(a) || __cuda___isnan(b)) { if (__isnan(a) || __isnan(b)) {
*c = quot; *c = quot;
return a + b; return a + b;
} }
orig_a = a; orig_a = a;
sign = 0 - (__cuda___signbit(a) != __cuda___signbit(b)); sign = 0 - ((__double2hiint(a) ^ __double2hiint(b)) < 0);
a = __cuda_fabs(a); a = fabs(a);
b = __cuda_fabs(b); b = fabs(b);
if (a == CUDART_INF || b == 0.0) { if (a == CUDART_INF || b == 0.0) {
*c = quot; *c = quot;
return CUDART_NAN; return CUDART_NAN;
} else if (a >= b) { } else if (a >= b) {
double scaled_b = 0.0; double scaled_b = 0.0;
bhi = __double2hiint(b); bhi = __double2hiint(b);
blo = __double2loint(b); blo = __double2loint(b);
ahi = __double2hiint(a); ahi = __double2hiint(a);
if (b < CUDART_TWO_TO_M1022) { if (b < CUDART_TWO_TO_M1022) {
double t = b; double t = b;
skipping to change at line 2291 skipping to change at line 2166
blo = __double2loint(a); blo = __double2loint(a);
ahi = __double2hiint(orig_a); ahi = __double2hiint(orig_a);
a = __hiloint2double((ahi & 0x80000000) ^ bhi, blo); a = __hiloint2double((ahi & 0x80000000) ^ bhi, blo);
quot = quot & CUDART_REMQUO_MASK_F; quot = quot & CUDART_REMQUO_MASK_F;
quot = quot ^ sign; quot = quot ^ sign;
quot = quot - sign; quot = quot - sign;
*c = quot; *c = quot;
return a; return a;
} }
__device_func__(double __cuda_nextafter(double a, double b)) static __forceinline__ double nextafter(double a, double b)
{ {
unsigned long long int ia; unsigned long long int ia;
unsigned long long int ib; unsigned long long int ib;
ia = __double_as_longlong(a); ia = __double_as_longlong(a);
ib = __double_as_longlong(b); ib = __double_as_longlong(b);
if (__cuda___isnan(a) || __cuda___isnan(b)) return a + b; /* NaN */ if (__isnan(a) || __isnan(b)) return a + b; /* NaN */
if (((ia | ib) << 1) == 0ULL) return b; if (((ia | ib) << 1) == 0ULL) return b;
if ((ia + ia) == 0ULL) { if ((ia + ia) == 0ULL) {
return __internal_copysign_pos(CUDART_MIN_DENORM, b); /* crossover */ return __internal_copysign_pos(CUDART_MIN_DENORM, b); /* crossover */
} }
if ((a < b) && (a < 0.0)) ia--; if ((a < b) && (a < 0.0)) ia--;
if ((a < b) && (a > 0.0)) ia++; if ((a < b) && (a > 0.0)) ia++;
if ((a > b) && (a < 0.0)) ia++; if ((a > b) && (a < 0.0)) ia++;
if ((a > b) && (a > 0.0)) ia--; if ((a > b) && (a > 0.0)) ia--;
a = __longlong_as_double(ia); a = __longlong_as_double(ia);
return a; return a;
} }
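The ia++/ia-- cases work because, within one sign, the IEEE-754 ordering of finite doubles matches the ordering of their 64-bit integer images, so stepping the image by one moves exactly one ulp. A minimal sketch for positive finite values, assuming C99 (<stdint.h>, memcpy type punning); next_up_pos is an illustrative name:

#include <stdio.h>
#include <string.h>
#include <stdint.h>
static double next_up_pos(double a)     /* a > 0, finite */
{
  uint64_t ia;
  memcpy(&ia, &a, sizeof ia);
  ia++;                                 /* one ulp toward +infinity */
  memcpy(&a, &ia, sizeof a);
  return a;
}
int main(void)
{
  printf("%.17g\n", next_up_pos(1.0));  /* 1.0000000000000002 = 1 + 2^-52 */
  return 0;
}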
__device_func__(double __cuda_nan(const char *tagp)) static __forceinline__ double nan(const char *tagp)
{ {
unsigned long long int i; unsigned long long int i;
i = __internal_nan_kernel (tagp); i = __internal_nan_kernel (tagp);
i = (i & 0x000fffffffffffffULL) | 0x7ff8000000000000ULL; i = (i & 0x000fffffffffffffULL) | 0x7ff8000000000000ULL;
return __longlong_as_double(i); return __longlong_as_double(i);
} }
__device_func__(double __cuda_round(double a)) static __forceinline__ double round(double a)
{ {
double fa = __cuda_fabs(a); double fa = fabs(a);
if (fa >= CUDART_TWO_TO_52) { if (fa >= CUDART_TWO_TO_52) {
return a; return a;
} else { } else {
double u; double u;
u = __cuda_trunc(fa + 0.5); u = trunc(fa + 0.5);
if (fa < 0.5) u = 0; if (fa < 0.5) u = 0;
u = __internal_copysign_pos(u, a); u = __internal_copysign_pos(u, a);
return u; return u;
} }
} }
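The "fa < 0.5" guard is not redundant: fa + 0.5 can round up across the integer boundary. For fa = nextafter(0.5, 0.0), the sum fa + 0.5 = 1 - 2^-54 rounds to exactly 1.0, so trunc(fa + 0.5) alone would give 1 where round must return 0. Demonstration, assuming C99 <math.h>:

#include <math.h>
#include <stdio.h>
int main(void)
{
  double fa = nextafter(0.5, 0.0);      /* 0.49999999999999994 */
  /* prints: trunc=1 round=0 */
  printf("trunc=%g round=%g\n", trunc(fa + 0.5), round(fa));
  return 0;
}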
__device_func__(long long int __cuda_llround(double a)) static __forceinline__ long long int llround(double a)
{ {
#if !defined(__CUDABE__) return (long long int)round(a);
if (a >= 9223372036854775807.0) return 0x7fffffffffffffffLL;
if (a <= -9223372036854775808.0) return 0x8000000000000000LL;
#endif /* !__CUDABE__ */
return (long long int)(__cuda_round(a));
} }
__device_func__(long int __cuda_lround(double a)) static __forceinline__ long int lround(double a)
{ {
#if defined(__LP64__) #if defined(__LP64__)
return (long int)(__cuda_llround(a)); return (long int)llround(a);
#else /* __LP64__ */ #else /* __LP64__ */
#if !defined(__CUDABE__) return (long int)round(a);
if (__cuda___isnan(a)) return 0x80000000L;
if (a >= 2147483647.0) return 0x7fffffffL;
if (a <= -2147483648.0) return 0x80000000L;
#endif /* !__CUDABE__ */
return (long int)(__cuda_round(a));
#endif /* __LP64__ */ #endif /* __LP64__ */
} }
__device_func__(double __cuda_fdim(double a, double b)) static __forceinline__ double fdim(double a, double b)
{ {
double t; double t;
t = a - b; /* default also takes care of NaNs */ t = a - b; /* default also takes care of NaNs */
if (a <= b) { if (a <= b) {
t = 0.0; t = 0.0;
} }
return t; return t;
} }
__device_func__(int __cuda_ilogb(double a)) static __forceinline__ int ilogb(double a)
{ {
unsigned long long int i; unsigned long long int i;
unsigned int ihi; unsigned int ihi;
unsigned int ilo; unsigned int ilo;
if (__cuda___isnan(a)) return -INT_MAX-1; if (__isnan(a)) return -__cuda_INT_MAX-1;
if (__cuda___isinf(a)) return INT_MAX; if (__isinf(a)) return __cuda_INT_MAX;
if (a == 0.0) return -INT_MAX-1; if (a == 0.0) return -__cuda_INT_MAX-1;
a = __cuda_fabs(a); a = fabs(a);
ilo = __double2loint(a); ilo = __double2loint(a);
ihi = __double2hiint(a); ihi = __double2hiint(a);
i = ((unsigned long long int)ihi) << 32 | (unsigned long long int)ilo; i = ((unsigned long long int)ihi) << 32 | (unsigned long long int)ilo;
if (a >= CUDART_TWO_TO_M1022) { if (a >= CUDART_TWO_TO_M1022) {
return ((int)((ihi >> 20) & 0x7ff)) - 1023; return ((int)((ihi >> 20) & 0x7ff)) - 1023;
} else { } else {
return -1011 - __clzll(i); return -1011 - __clzll(i);
} }
} }
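The denormal branch reads the exponent off the position of the leading mantissa bit: a denormal whose 64-bit image is i has the value i * 2^-1074, so ilogb(a) = (63 - clz(i)) - 1074 = -1011 - clz(i), which is exactly the "-1011 - __clzll(i)" above. A host-side check, assuming GCC/Clang's __builtin_clzll:

#include <math.h>
#include <stdio.h>
#include <string.h>
#include <stdint.h>
int main(void)
{
  double a = 0x1p-1073;                 /* denormal, integer image i = 2 */
  uint64_t i;
  memcpy(&i, &a, sizeof i);
  printf("%d %d\n", -1011 - __builtin_clzll(i), ilogb(a));  /* -1073 -1073 */
  return 0;
}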
__device_func__(double __cuda_logb(double a)) static __forceinline__ double logb(double a)
{ {
unsigned long long int i; unsigned long long int i;
unsigned int ihi; unsigned int ihi;
unsigned int ilo; unsigned int ilo;
if (__cuda___isnan(a)) return a + a; if (__isnan(a)) return a + a;
a = __cuda_fabs(a); a = fabs(a);
if (a == CUDART_INF) return a; if (a == CUDART_INF) return a;
if (a == 0.0) return -CUDART_INF; if (a == 0.0) return -CUDART_INF;
ilo = __double2loint(a); ilo = __double2loint(a);
ihi = __double2hiint(a); ihi = __double2hiint(a);
i = ((unsigned long long int)ihi) << 32 | (unsigned long long int)ilo; i = ((unsigned long long int)ihi) << 32 | (unsigned long long int)ilo;
if (a >= CUDART_TWO_TO_M1022) { if (a >= CUDART_TWO_TO_M1022) {
return (double)((int)((ihi >> 20) & 0x7ff)) - 1023; return (double)((int)((ihi >> 20) & 0x7ff)) - 1023;
} else { } else {
int expo = -1011 - __clzll(i); int expo = -1011 - __clzll(i);
return (double)expo; return (double)expo;
} }
} }
__device_func__(double __cuda_fma(double a, double b, double c)) static __forceinline__ double fma(double a, double b, double c)
{ {
return __fma_rn(a, b, c); return __fma_rn(a, b, c);
} }
#if __APPLE__ #if defined(__APPLE__)
__device_func__(int __cuda___isfinited(double a))
static __forceinline__ int __isfinited(double a)
{ {
return __cuda___finite(a); return __finite(a);
} }
__device_func__(int __cuda___signbitd(double a)) static __forceinline__ int __signbitd(double a)
{ {
return __cuda___signbit(a); return __signbit(a);
} }
#endif
#endif /* __cplusplus && __CUDACC__ */ #endif /* __APPLE__ */
#endif /* __CUDABE__ */
#endif /* __MATH_FUNCTIONS_DBL_PTX3_H__ */ #endif /* __MATH_FUNCTIONS_DBL_PTX3_H__ */
 End of changes. 226 change blocks. 
396 lines changed or deleted 264 lines changed or added


 sm_11_atomic_functions.h   sm_11_atomic_functions.h 
skipping to change at line 49 skipping to change at line 49
#if defined(__cplusplus) && defined(__CUDACC__) #if defined(__cplusplus) && defined(__CUDACC__)
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 110 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 110
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#include "builtin_types.h"
#include "host_defines.h" #include "host_defines.h"
extern "C" extern "C"
{ {
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __iAtomicAdd(int *address, int val); extern __device__ int __iAtomicAdd(int *address, int val);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ unsigned int __uAtomicAdd(unsigned int *address, unsigned int val); extern __device__ unsigned int __uAtomicAdd(unsigned int *address, unsigned int val);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
skipping to change at line 208 skipping to change at line 209
return __iAtomicCAS(address, compare, val); return __iAtomicCAS(address, compare, val);
} }
static __inline__ __device__ unsigned int atomicCAS(unsigned int *address, unsigned int compare, unsigned int val) static __inline__ __device__ unsigned int atomicCAS(unsigned int *address, unsigned int compare, unsigned int val)
{ {
return __uAtomicCAS(address, compare, val); return __uAtomicCAS(address, compare, val);
} }
#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 110 */ #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 110 */
#elif !defined(__CUDACC__)
#include "crt/func_macro.h"
#if !defined(__CUDABE__)
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
extern void CUDARTAPI __cudaMutexOperation(int lock);
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#define __cudaAtomicOperation(code) \
__cudaMutexOperation(1); \
code \
__cudaMutexOperation(0);
__device_func__(int __iAtomicAdd(int *address, int val))
{
int old;
__cudaAtomicOperation(
old = *address;
*address = old + val;
)
return old;
}
__device_func__(unsigned int __uAtomicAdd(unsigned int *address, unsigned int val))
{
unsigned int old;
__cudaAtomicOperation(
old = *address;
*address = old + val;
)
return old;
}
__device_func__(int __iAtomicExch(int *address, int val))
{
int old;
__cudaAtomicOperation(
old = *address;
*address = val;
)
return old;
}
__device_func__(unsigned int __uAtomicExch(unsigned int *address, unsigned int val))
{
unsigned int old;
__cudaAtomicOperation(
old = *address;
*address = val;
)
return old;
}
__device_func__(float __fAtomicExch(float *address, float val))
{
float old;
__cudaAtomicOperation(
old = *address;
*address = val;
)
return old;
}
__device_func__(int __iAtomicMin(int *address, int val))
{
int old;
__cudaAtomicOperation(
old = *address;
*address = old < val ? old : val;
)
return old;
}
__device_func__(unsigned int __uAtomicMin(unsigned int *address, unsigned int val))
{
unsigned int old;
__cudaAtomicOperation(
old = *address;
*address = old < val ? old : val;
)
return old;
}
__device_func__(int __iAtomicMax(int *address, int val))
{
int old;
__cudaAtomicOperation(
old = *address;
*address = old > val ? old : val;
)
return old;
}
__device_func__(unsigned int __uAtomicMax(unsigned int *address, unsigned int val))
{
unsigned int old;
__cudaAtomicOperation(
old = *address;
*address = old > val ? old : val;
)
return old;
}
__device_func__(unsigned int __uAtomicInc(unsigned int *address, unsigned int val))
{
unsigned int old;
__cudaAtomicOperation(
old = *address;
*address = (old >= val) ? 0 : old + 1;
)
return old;
}
__device_func__(unsigned int __uAtomicDec(unsigned int *address, unsigned int val))
{
unsigned int old;
__cudaAtomicOperation(
old = *address;
*address = ((old == 0) | (old > val)) ? val : (old - 1);
)
return old;
}
__device_func__(int __iAtomicAnd(int *address, int val))
{
int old;
__cudaAtomicOperation(
old = *address;
*address = old & val;
)
return old;
}
__device_func__(unsigned int __uAtomicAnd(unsigned int *address, unsigned int val))
{
unsigned int old;
__cudaAtomicOperation(
old = *address;
*address = old & val;
)
return old;
}
__device_func__(int __iAtomicOr(int *address, int val))
{
int old;
__cudaAtomicOperation(
old = *address;
*address = old | val;
)
return old;
}
__device_func__(unsigned int __uAtomicOr(unsigned int *address, unsigned int val))
{
unsigned int old;
__cudaAtomicOperation(
old = *address;
*address = old | val;
)
return old;
}
__device_func__(int __iAtomicXor(int *address, int val))
{
int old;
__cudaAtomicOperation(
old = *address;
*address = old ^ val;
)
return old;
}
__device_func__(unsigned int __uAtomicXor(unsigned int *address, unsigned int val))
{
unsigned int old;
__cudaAtomicOperation(
old = *address;
*address = old ^ val;
)
return old;
}
__device_func__(int __iAtomicCAS(int *address, int compare, int val))
{
int old;
__cudaAtomicOperation(
old = *address;
*address = old == compare ? val : old;
)
return old;
}
__device_func__(unsigned int __uAtomicCAS(unsigned int *address, unsigned int compare, unsigned int val))
{
unsigned int old;
__cudaAtomicOperation(
old = *address;
*address = old == compare ? val : old;
)
return old;
}
#undef __cudaAtomicOperation
#endif /* !__CUDABE__ */
#endif /* __cplusplus && __CUDACC__ */ #endif /* __cplusplus && __CUDACC__ */
#endif /* !__SM_11_ATOMIC_FUNCTIONS_H__ */ #endif /* !__SM_11_ATOMIC_FUNCTIONS_H__ */
 End of changes. 2 change blocks. 
262 lines changed or deleted 1 lines changed or added


 sm_12_atomic_functions.h   sm_12_atomic_functions.h 
skipping to change at line 49 skipping to change at line 49
#if defined(__cplusplus) && defined(__CUDACC__) #if defined(__cplusplus) && defined(__CUDACC__)
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 120 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 120
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#include "builtin_types.h"
#include "host_defines.h" #include "host_defines.h"
extern "C" extern "C"
{ {
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ unsigned long long int __ullAtomicAdd(unsigned long long int *address, unsigned long long int val); extern __device__ unsigned long long int __ullAtomicAdd(unsigned long long int *address, unsigned long long int val);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ unsigned long long int __ullAtomicExch(unsigned long long int *address, unsigned long long int val); extern __device__ unsigned long long int __ullAtomicExch(unsigned long long int *address, unsigned long long int val);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
skipping to change at line 101 skipping to change at line 102
return (bool)__any((int)cond); return (bool)__any((int)cond);
} }
static __inline__ __device__ bool all(bool cond) static __inline__ __device__ bool all(bool cond)
{ {
return (bool)__all((int)cond); return (bool)__all((int)cond);
} }
#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 120 */ #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 120 */
#elif !defined(__CUDACC__)
#include "crt/func_macro.h"
#if !defined(__CUDABE__)
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
extern void CUDARTAPI __cudaMutexOperation(int lock);
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#define __cudaAtomicOperation(code) \
__cudaMutexOperation(1); \
code \
__cudaMutexOperation(0);
__device_func__(unsigned long long int __ullAtomicAdd(unsigned long long int *address, unsigned long long int val))
{
unsigned long long int old;
__cudaAtomicOperation(
old = *address;
*address = old + val;
)
return old;
}
__device_func__(unsigned long long int __ullAtomicExch(unsigned long long int *address, unsigned long long int val))
{
unsigned long long int old;
__cudaAtomicOperation(
old = *address;
*address = val;
)
return old;
}
__device_func__(unsigned long long int __ullAtomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val))
{
unsigned long long int old;
__cudaAtomicOperation(
old = *address;
*address = old == compare ? val : old;
)
return old;
}
#undef __cudaAtomicOperation
__device_func__(int __any(int cond))
{
return cond;
}
__device_func__(int __all(int cond))
{
return cond;
}
#endif /* !__CUDABE__ */
#endif /* __cplusplus && __CUDACC__ */ #endif /* __cplusplus && __CUDACC__ */
#endif /* !__SM_12_ATOMIC_FUNCTIONS_H__ */ #endif /* !__SM_12_ATOMIC_FUNCTIONS_H__ */
 End of changes. 2 change blocks. 
71 lines changed or deleted 1 lines changed or added


 sm_13_double_functions.h   sm_13_double_functions.h 
skipping to change at line 55 skipping to change at line 55
#if defined(__cplusplus) && defined(__CUDACC__) #if defined(__cplusplus) && defined(__CUDACC__)
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 130 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 130
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#include "builtin_types.h"
#include "device_types.h" #include "device_types.h"
#include "host_defines.h" #include "host_defines.h"
extern "C" extern "C"
{ {
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ long long int __double_as_longlong(double); extern __device__ long long int __double_as_longlong(double);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ double __longlong_as_double(long long int) ; extern __device__ double __longlong_as_double(long long int) ;
skipping to change at line 258 skipping to change at line 259
return (double)a; return (double)a;
} }
static __inline__ __device__ double float2double(float a, enum cudaRoundMod e mode = cudaRoundNearest) static __inline__ __device__ double float2double(float a, enum cudaRoundMod e mode = cudaRoundNearest)
{ {
return (double)a; return (double)a;
} }
#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 130 */ #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 130 */
#elif !defined(__CUDACC__) #elif defined(__CUDABE__)
#include "crt/func_macro.h"
#if !defined(__CUDABE__)
/**************************************************************************
*****
*
*
* HOST IMPLEMENTATIONS FOR FUNCTIONS
*
*
*
***************************************************************************
****/
#include "common_types.h"
__device_func__(double __longlong_as_double(long long int a))
{
volatile union __cudart_DoubleLonglongCvt u;
u.i = a;
return u.d;
}
__device_func__(long long int __double_as_longlong(double a))
{
volatile union __cudart_DoubleLonglongCvt u;
u.d = a;
return u.i;
}
/* Note: this kernel does not support round-to-nearest-or-even */
__device_func__(float __internal_double2float_kernel(double a, enum cudaRoundMode rndMode))
{
volatile union __cudart_DoubleUlonglongCvt xx;
volatile union __cudart_FloatUintCvt res;
unsigned long long sticky;
int shift;
xx.d = a;
res.i = (((unsigned int) (xx.i >> 32)) & 0x80000000);
if (a == 0.0) {
/* Zero */
return res.f;
}
if ((xx.i & 0x7ff0000000000000ULL) == 0x7ff0000000000000ULL) {
if ((xx.i & 0x7fffffffffffffffULL) > 0x7ff0000000000000ULL) {
/* Nan */
res.i = ((unsigned int)((xx.i >> 32) & 0x80000000) |
(255U << 23) | 0x00400000 |
(unsigned int)((xx.i >> (53 - 24)) & 0x007fffff));
} else {
/* Inf */
res.i |= 0x7f800000;
}
return res.f;
}
shift = ((int) ((xx.i >> 52) & 0x7ff)) - 1023;
/* Overflow */
xx.i = (xx.i & 0x000fffffffffffffULL);
if (shift >= 128) {
if ((rndMode == cudaRoundZero) ||
((rndMode == cudaRoundMinInf) && !res.i) ||
((rndMode == cudaRoundPosInf) && res.i)) {
res.i |= 0x7f7fffff;
} else {
res.i |= 0x7f800000;
}
return res.f;
}
if (shift <= -127) {
/* Underflow */
xx.i |= 0x0010000000000000ULL;
if (shift < -180) {
sticky = xx.i;
xx.i = 0;
} else {
sticky = xx.i << (64 - (-126 - shift));
xx.i >>= (-126 - shift);
}
sticky |= xx.i << (64 - 29);
if ((((rndMode == cudaRoundPosInf) && !res.i) ||
((rndMode == cudaRoundMinInf) && res.i)) &&
sticky) {
res.i += 1;
}
res.i += ((unsigned int) (xx.i >> 29)) & 0x007fffff;
return res.f;
}
sticky = xx.i << (64 - 29);
if ((((rndMode == cudaRoundPosInf) && !res.i) ||
((rndMode == cudaRoundMinInf) && res.i)) &&
sticky) {
res.i += 1;
}
res.i += ((unsigned int) (xx.i >> 29)) & 0x007fffff;
res.i += (unsigned int) (127 + shift) << 23;
return res.f;
}
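The main path of the kernel above builds the float by truncating the 29 extra mantissa bits and re-biasing the exponent from 1023 to 127, then nudging by one ulp in the directed modes when dropped bits were non-zero. A round-to-zero sketch of just that main path, for positive normals within float range (d2f_rz is an illustrative helper, assuming C99):

#include <stdio.h>
#include <string.h>
#include <stdint.h>
static float d2f_rz(double a)           /* positive normal, in float range */
{
  uint64_t i;
  uint32_t r;
  float f;
  memcpy(&i, &a, sizeof i);
  r  = (uint32_t)((i >> 29) & 0x007fffff);                  /* drop 29 bits */
  r |= ((uint32_t)((i >> 52) & 0x7ff) - 1023 + 127) << 23;  /* re-bias */
  memcpy(&f, &r, sizeof f);
  return f;
}
int main(void)
{
  double x = 1.0 + 0x1p-24 + 0x1p-29;   /* just above a float midpoint */
  printf("rz=%.9g rn=%.9g\n", d2f_rz(x), (float)x);  /* 1 vs 1.00000012 */
  return 0;
}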
__device_func__(double __internal_ll2double_kernel(long long int a, enum cudaRoundMode rndMode))
{
volatile union __cudart_DoubleUlonglongCvt res;
int shift;
unsigned int t;
res.i = a;
if (a == 0) return res.d;
if (a < 0) res.i = (unsigned long long int)-a;
shift = __internal_normalize64((unsigned long long int*)&res.i);
t = ((unsigned int) res.i) << 21;
res.i >>= 11;
res.i += ((unsigned long long int)(1023 + 62 - shift)) << 52;
if (a < 0) res.i |= 0x8000000000000000ULL;
if ((rndMode == cudaRoundNearest) && (t >= 0x80000000)) {
res.i += (t == 0x80000000) ? (res.i & 1) : 1;
}
else if ((rndMode == cudaRoundMinInf) && t && (a < 0)) {
res.i++;
}
else if ((rndMode == cudaRoundPosInf) && t && (a > 0)) {
res.i++;
}
return res.d;
}
__device_func__(double __internal_ull2double_kernel(unsigned long long int a, enum cudaRoundMode rndMode))
{
volatile union __cudart_DoubleUlonglongCvt res;
int shift;
unsigned int t;
res.i = a;
if (a == 0) return res.d;
shift = __internal_normalize64((unsigned long long int *)&res.i);
t = ((unsigned int) res.i) << 21;
res.i >>= 11;
res.i += ((unsigned long long int)(1023 + 62 - shift)) << 52;
if ((rndMode == cudaRoundNearest) && (t >= 0x80000000)) {
res.i += (t == 0x80000000) ? (res.i & 1) : 1;
}
else if ((rndMode == cudaRoundPosInf) && t) {
res.i++;
}
return res.d;
}
__device_func__(long long int __internal_double2ll_kernel(double a, long long int max, long long int min, long long int nan, enum cudaRoundMode rndMode))
{
volatile union __cudart_DoubleUlonglongCvt xx, res;
unsigned long long int t = 0;
int shift;
xx.d = a;
__internal_clamp(a, max, min, nan);
shift = (int) (1023 + 62 - ((xx.i >> 52) & 0x7ff));
res.i = ((xx.i << 11) | 0x8000000000000000ULL) >> 1;
if (shift >= 64) {
t = res.i;
res.i = 0;
} else if (shift) {
t = res.i << (64 - shift);
res.i = res.i >> shift;
}
if ((rndMode == cudaRoundNearest) && (t >= 0x8000000000000000ULL)) {
res.i += (t == 0x8000000000000000ULL) ? (res.i & 1) : 1;
}
else if ((rndMode == cudaRoundMinInf) && t &&
(xx.i > 0x8000000000000000ULL)) {
res.i++;
}
else if ((rndMode == cudaRoundPosInf) && t && ((long long int)xx.i > 0))
{
res.i++;
}
if ((long long int)xx.i < 0) {
res.i = (unsigned long long int)(-(long long int)res.i);
}
return res.i;
}
__device_func__(unsigned long long int __internal_double2ull_kernel(double a, unsigned long long int max, unsigned long long int nan, enum cudaRoundMode rndMode))
{
volatile union __cudart_DoubleUlonglongCvt xx, res;
unsigned long long int t = 0;
int shift;
xx.d = a;
__internal_clamp(a, max, 0LL, nan);
if (a == 0.0) return 0LL;
shift = (int) (1023 + 63 - ((xx.i >> 52) & 0x7ff));
res.i = ((xx.i << 11) | 0x8000000000000000ULL);
if (shift >= 64) {
t = res.i >> (int)(shift > 64);
res.i = 0;
} else if (shift) {
t = res.i << (64 - shift);
res.i = res.i >> shift;
}
if ((rndMode == cudaRoundNearest) && (t >= 0x8000000000000000ULL)) {
res.i += (t == 0x8000000000000000ULL) ? (res.i & 1) : 1;
}
else if ((rndMode == cudaRoundPosInf) && t) {
res.i++;
}
return res.i;
}
__device_func__(int __double2hiint(double a))
{
volatile union __cudart_DoubleInthiloCvt cvt;
cvt.d = a;
return cvt.i[1];
}
__device_func__(int __double2loint(double a))
{
volatile union __cudart_DoubleInthiloCvt cvt;
cvt.d = a;
return cvt.i[0];
}
__device_func__(double __hiloint2double(int a, int b))
{
volatile union __cudart_DoubleInthiloCvt cvt;
cvt.i[0] = b;
cvt.i[1] = a;
return cvt.d;
}
__device_func__(float __double2float_rn(double a))
{
return (float)a;
}
__device_func__(float __double2float_rz(double a))
{
return __internal_double2float_kernel(a, cudaRoundZero);
}
__device_func__(float __double2float_ru(double a))
{
return __internal_double2float_kernel(a, cudaRoundPosInf);
}
__device_func__(float __double2float_rd(double a))
{
return __internal_double2float_kernel(a, cudaRoundMinInf);
}
__device_func__(int __internal_double2int(double a, enum cudaRoundMode rndMode))
{
return (int)__internal_double2ll_kernel(a, 2147483647LL, -2147483648LL, -2147483648LL, rndMode);
}
__device_func__(int __double2int_rn(double a))
{
return __internal_double2int(a, cudaRoundNearest);
}
__device_func__(int __double2int_ru(double a))
{
return __internal_double2int(a, cudaRoundPosInf);
}
__device_func__(int __double2int_rd(double a))
{
return __internal_double2int(a, cudaRoundMinInf);
}
__device_func__(unsigned int __internal_double2uint(double a, enum cudaRoundMode rndMode))
{
return (unsigned int)__internal_double2ull_kernel(a, 4294967295ULL, 2147483648ULL, rndMode);
}
__device_func__(unsigned int __double2uint_rn(double a))
{
return __internal_double2uint(a, cudaRoundNearest);
}
__device_func__(unsigned int __double2uint_ru(double a))
{
return __internal_double2uint(a, cudaRoundPosInf);
}
__device_func__(unsigned int __double2uint_rd(double a))
{
return __internal_double2uint(a, cudaRoundMinInf);
}
__device_func__(long long int __internal_double2ll(double a, enum cudaRoundMode rndMode))
{
return __internal_double2ll_kernel(a, 9223372036854775807LL, -9223372036854775807LL -1LL, -9223372036854775807LL -1LL, rndMode);
}
__device_func__(long long int __double2ll_rn(double a))
{
return __internal_double2ll(a, cudaRoundNearest);
}
__device_func__(long long int __double2ll_ru(double a))
{
return __internal_double2ll(a, cudaRoundPosInf);
}
__device_func__(long long int __double2ll_rd(double a))
{
return __internal_double2ll(a, cudaRoundMinInf);
}
__device_func__(unsigned long long int __internal_double2ull(double a, enum cudaRoundMode rndMode))
{
return __internal_double2ull_kernel(a, 18446744073709551615ULL, 9223372036854775808ULL, rndMode);
}
__device_func__(unsigned long long int __double2ull_rn(double a))
{
return __internal_double2ull(a, cudaRoundNearest);
}
__device_func__(unsigned long long int __double2ull_ru(double a))
{
return __internal_double2ull(a, cudaRoundPosInf);
}
__device_func__(unsigned long long int __double2ull_rd(double a))
{
return __internal_double2ull(a, cudaRoundMinInf);
}
__device_func__(double __int2double_rn(int a))
{
return (double)a;
}
__device_func__(double __uint2double_rn(unsigned int a))
{
return (double)a;
}
__device_func__(double __ll2double_rn(long long int a))
{
return (double)a;
}
__device_func__(double __ll2double_rz(long long int a))
{
return __internal_ll2double_kernel(a, cudaRoundZero);
}
__device_func__(double __ll2double_rd(long long int a))
{
return __internal_ll2double_kernel(a, cudaRoundMinInf);
}
__device_func__(double __ll2double_ru(long long int a))
{
return __internal_ll2double_kernel(a, cudaRoundPosInf);
}
__device_func__(double __ull2double_rn(unsigned long long int a))
{
return __internal_ull2double_kernel(a, cudaRoundNearest);
}
__device_func__(double __ull2double_rz(unsigned long long int a))
{
return __internal_ull2double_kernel(a, cudaRoundZero);
}
__device_func__(double __ull2double_rd(unsigned long long int a))
{
return __internal_ull2double_kernel(a, cudaRoundMinInf);
}
__device_func__(double __ull2double_ru(unsigned long long int a))
{
return __internal_ull2double_kernel(a, cudaRoundPosInf);
}
#endif /* !__CUDABE__ */
#if !defined(__CUDABE__) || __CUDA_ARCH__ < 130
#include "common_types.h" #if __CUDA_ARCH__ < 130
__device_func__(double __internal_fma_kernel(double x, double y, double z, enum cudaRoundMode rndMode)) static __forceinline__ double __internal_fma_kernel(double x, double y, double z, enum cudaRoundMode rndMode)
{ {
struct __cudart_UintUint xx, yy, zz, ww; struct __cudart_UintUint {
unsigned int lo;
unsigned int hi;
} xx, yy, zz, ww;
unsigned int s, t, u, prod0, prod1, prod2, prod3, expo_x, expo_y, expo_z; unsigned int s, t, u, prod0, prod1, prod2, prod3, expo_x, expo_y, expo_z;
xx.hi = __double2hiint(x); xx.hi = __double2hiint(x);
xx.lo = __double2loint(x); xx.lo = __double2loint(x);
yy.hi = __double2hiint(y); yy.hi = __double2hiint(y);
yy.lo = __double2loint(y); yy.lo = __double2loint(y);
zz.hi = __double2hiint(z); zz.hi = __double2hiint(z);
zz.lo = __double2loint(z); zz.lo = __double2loint(z);
expo_z = 0x7FF; expo_z = 0x7FF;
skipping to change at line 1094 skipping to change at line 718
xx.lo += (t == 0x80000000) ? expo_x : (t >> 31); xx.lo += (t == 0x80000000) ? expo_x : (t >> 31);
} else if (((rndMode == cudaRoundPosInf) && t && (!expo_y)) || } else if (((rndMode == cudaRoundPosInf) && t && (!expo_y)) ||
((rndMode == cudaRoundMinInf) && t && expo_y)) { ((rndMode == cudaRoundMinInf) && t && expo_y)) {
xx.lo += 1; xx.lo += 1;
} }
xx.hi += (u > xx.lo); xx.hi += (u > xx.lo);
xx.hi |= yy.hi; xx.hi |= yy.hi;
return __hiloint2double(xx.hi, xx.lo); return __hiloint2double(xx.hi, xx.lo);
} }
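The prod0..prod3 limbs above assemble the exact 106-bit product of two 53-bit mantissas from four 32x32->64 partial products. The same decomposition, checked host-side against a 128-bit reference (assuming GCC/Clang's __uint128_t):

#include <stdio.h>
#include <stdint.h>
int main(void)
{
  uint64_t x = 0x001fffffffffffffULL;   /* 53-bit mantissas, implicit bit set */
  uint64_t y = 0x0010000000000001ULL;
  uint32_t xlo = (uint32_t)x, xhi = (uint32_t)(x >> 32);
  uint32_t ylo = (uint32_t)y, yhi = (uint32_t)(y >> 32);
  uint64_t ll  = (uint64_t)xlo * ylo;
  /* the cross terms are < 2^53 each for 53-bit inputs, so no 64-bit overflow */
  uint64_t mid = (uint64_t)xlo * yhi + (uint64_t)xhi * ylo + (ll >> 32);
  uint64_t lo  = (uint32_t)ll | (mid << 32);
  uint64_t hi  = (uint64_t)xhi * yhi + (mid >> 32);
  __uint128_t ref = (__uint128_t)x * y;
  printf("%s\n", (lo == (uint64_t)ref && hi == (uint64_t)(ref >> 64))
                 ? "match" : "mismatch");
  return 0;
}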
__device_func__(double __fma_rn(double x, double y, double z)) static __forceinline__ double __fma_rn(double x, double y, double z)
{ {
return __internal_fma_kernel(x, y, z, cudaRoundNearest); return __internal_fma_kernel(x, y, z, cudaRoundNearest);
} }
__device_func__(double __fma_rd(double x, double y, double z)) static __forceinline__ double __fma_rd(double x, double y, double z)
{ {
return __internal_fma_kernel(x, y, z, cudaRoundMinInf); return __internal_fma_kernel(x, y, z, cudaRoundMinInf);
} }
__device_func__(double __fma_ru(double x, double y, double z)) static __forceinline__ double __fma_ru(double x, double y, double z)
{ {
return __internal_fma_kernel(x, y, z, cudaRoundPosInf); return __internal_fma_kernel(x, y, z, cudaRoundPosInf);
} }
__device_func__(double __fma_rz(double x, double y, double z)) static __forceinline__ double __fma_rz(double x, double y, double z)
{ {
return __internal_fma_kernel(x, y, z, cudaRoundZero); return __internal_fma_kernel(x, y, z, cudaRoundZero);
} }
__device_func__(double __dadd_rz(double a, double b)) static __forceinline__ double __dadd_rz(double a, double b)
{ {
return __fma_rz(a, CUDART_ONE, b); return __fma_rz(a, CUDART_ONE, b);
} }
__device_func__(double __dadd_ru(double a, double b)) static __forceinline__ double __dadd_ru(double a, double b)
{ {
return __fma_ru(a, CUDART_ONE, b); return __fma_ru(a, CUDART_ONE, b);
} }
__device_func__(double __dadd_rd(double a, double b)) static __forceinline__ double __dadd_rd(double a, double b)
{ {
return __fma_rd(a, CUDART_ONE, b); return __fma_rd(a, CUDART_ONE, b);
} }
__device_func__(double __dmul_rz(double a, double b)) static __forceinline__ double __dmul_rz(double a, double b)
{ {
return __fma_rz(a, b, CUDART_NEG_ZERO); return __fma_rz(a, b, CUDART_NEG_ZERO);
} }
__device_func__(double __dmul_ru(double a, double b)) static __forceinline__ double __dmul_ru(double a, double b)
{ {
return __fma_ru(a, b, CUDART_NEG_ZERO); return __fma_ru(a, b, CUDART_NEG_ZERO);
} }
__device_func__(double __dmul_rd(double a, double b)) static __forceinline__ double __dmul_rd(double a, double b)
{ {
return __fma_rd(a, b, CUDART_ZERO); return __fma_rd(a, b, CUDART_ZERO);
} }
__device_func__(double __dadd_rn(double a, double b)) static __forceinline__ double __dadd_rn(double a, double b)
{ {
return __fma_rn(a, CUDART_ONE, b); return __fma_rn(a, CUDART_ONE, b);
} }
__device_func__(double __dmul_rn(double a, double b)) static __forceinline__ double __dmul_rn(double a, double b)
{ {
return __fma_rn(a, b, CUDART_NEG_ZERO); return __fma_rn(a, b, CUDART_NEG_ZERO);
} }
#endif /* !__CUDABE__ || __CUDA_ARCH__ < 130 */ #endif /* __CUDA_ARCH__ < 130 */
/**************************************************************************
*****
*
*
* HOST / DEVICE IMPLEMENTATIONS FOR FUNCTIONS
*
*
*
***************************************************************************
****/
#endif /* __cplusplus && __CUDACC__ */ #endif /* __cplusplus && __CUDACC__ */
#endif /* !__SM_13_DOUBLE_FUNCTIONS_H__ */ #endif /* !__SM_13_DOUBLE_FUNCTIONS_H__ */
 End of changes. 19 change blocks. 
428 lines changed or deleted 21 lines changed or added


 sm_20_atomic_functions.h   sm_20_atomic_functions.h 
skipping to change at line 49 skipping to change at line 49
#if defined(__cplusplus) && defined(__CUDACC__) #if defined(__cplusplus) && defined(__CUDACC__)
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 200 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 200
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#include "builtin_types.h"
#include "host_defines.h" #include "host_defines.h"
extern "C" extern "C"
{ {
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __fAtomicAdd(float *address, float val); extern __device__ float __fAtomicAdd(float *address, float val);
} }
skipping to change at line 72 skipping to change at line 73
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
static __inline__ __device__ float atomicAdd(float *address, float val) static __inline__ __device__ float atomicAdd(float *address, float val)
{ {
return __fAtomicAdd(address, val); return __fAtomicAdd(address, val);
} }
#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 200 */ #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 200 */
#elif !defined(__CUDACC__)
#include "crt/func_macro.h"
#if !defined(__CUDABE__)
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
extern void CUDARTAPI __cudaMutexOperation(int lock);
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#define __cudaAtomicOperation(code) \
__cudaMutexOperation(1); \
code \
__cudaMutexOperation(0);
__device_func__(float __fAtomicAdd(float *address, float val))
{
float old;
__cudaAtomicOperation(
old = *address;
*address = old + val;
)
return old;
}
#undef __cudaAtomicOperation
#endif /* !__CUDABE__ */
#endif /* __cplusplus && __CUDACC__ */ #endif /* __cplusplus && __CUDACC__ */
#endif /* !__SM_20_ATOMIC_FUNCTIONS_H__ */ #endif /* !__SM_20_ATOMIC_FUNCTIONS_H__ */
 End of changes. 2 change blocks. 
36 lines changed or deleted 1 lines changed or added


 sm_20_intrinsics.h   sm_20_intrinsics.h 
skipping to change at line 49 skipping to change at line 49
#if defined(__cplusplus) && defined(__CUDACC__) #if defined(__cplusplus) && defined(__CUDACC__)
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 200 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 200
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#include "builtin_types.h"
#include "device_types.h" #include "device_types.h"
#include "host_defines.h" #include "host_defines.h"
extern "C" extern "C"
{ {
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ void __threadfence_system(void); extern __device__ void __threadfence_system(void);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
skipping to change at line 112 skipping to change at line 113
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __fmaf_ieee_rn(float, float, float) ; extern __device__ float __fmaf_ieee_rn(float, float, float) ;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __fmaf_ieee_rz(float, float, float) ; extern __device__ float __fmaf_ieee_rz(float, float, float) ;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __fmaf_ieee_ru(float, float, float) ; extern __device__ float __fmaf_ieee_ru(float, float, float) ;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __fmaf_ieee_rd(float, float, float) ; extern __device__ float __fmaf_ieee_rd(float, float, float) ;
/*DEVICE_BUILTIN*/
extern __device__ double __rcp64h(double);
} }
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
static __inline__ __device__ unsigned int ballot(bool pred) static __inline__ __device__ unsigned int ballot(bool pred)
{ {
skipping to change at line 142 skipping to change at line 146
return (bool)__syncthreads_and((int)pred); return (bool)__syncthreads_and((int)pred);
} }
static __inline__ __device__ bool syncthreads_or(bool pred) static __inline__ __device__ bool syncthreads_or(bool pred)
{ {
return (bool)__syncthreads_or((int)pred); return (bool)__syncthreads_or((int)pred);
} }
#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 200 */ #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 200 */
#elif !defined(__CUDACC__)
#include "crt/func_macro.h"
#if !defined(__CUDABE__)
/*******************************************************************************
*
*
* HOST IMPLEMENTATIONS FOR FUNCTIONS
*
*
*
*******************************************************************************/
#include "common_types.h"
__device_func__(void __threadfence_system(void))
{
__syncthreads();
}
__device_func__(unsigned int __ballot(int pred))
{
return (unsigned int)1;
}
__device_func__(int __syncthreads_count(int pred))
{
return 1;
}
__device_func__(int __syncthreads_and(int pred))
{
return pred;
}
__device_func__(int __syncthreads_or(int pred))
{
return pred;
}
__device_func__(long long int clock64(void))
{
return (long long int)__cuda_clock();
}
__device_func__(double __internal_ddiv_kernel (double x, double y, enum cudaRoundMode mode))
{
volatile union __cudart_DoubleLonglongCvt cvt;
unsigned long long a, b, q, sign;
int expoa, expob, normalize, i;
cvt.d = x;
a = cvt.i;
cvt.d = y;
b = cvt.i;
sign = (a ^ b) & 0x8000000000000000ULL;
expoa = ((int)(a >> 52) & 0x7ff) - 1;
expob = ((int)(b >> 52) & 0x7ff) - 1;
if (((unsigned)expoa >= 0x7fe) || ((unsigned)expob >= 0x7fe)) {
/* handle NaNs */
if ((a << 1) > 0xffe0000000000000ULL) {
cvt.i = a | 0x0008000000000000ULL;
return cvt.d;
}
if ((b << 1) > 0xffe0000000000000ULL) {
cvt.i = b | 0x0008000000000000ULL;
return cvt.d;
}
/* 0/0 and INF/INF ==> INDEFINITE */
if ((((a << 1) == 0x0000000000000000ULL) &&
((b << 1) == 0x0000000000000000ULL)) ||
(((a << 1) == 0xffe0000000000000ULL) &&
((b << 1) == 0xffe0000000000000ULL))) {
cvt.i = 0xfff8000000000000ULL;
return cvt.d;
}
/* 0/y or x/INF ==> 0 */
if (((a << 1) == 0x0000000000000000ULL) ||
((b << 1) == 0xffe0000000000000ULL)) {
cvt.i = sign;
return cvt.d;
}
/* INF/y or x/0 ==> INF */
if (((b << 1) == 0x0000000000000000ULL) ||
((a << 1) == 0xffe0000000000000ULL)) {
cvt.i = sign | 0x7ff0000000000000ULL;
return cvt.d;
}
if (expoa < 0) {
a = a << 12;
while ((long long)a > 0) {
a = a + a;
expoa--;
}
a = a >> 11;
}
if (expob < 0) {
b = b << 12;
while ((long long)b > 0) {
b = b + b;
expob--;
}
b = b >> 11;
}
}
a = (a & 0x000fffffffffffffULL) | 0x0010000000000000ULL;
b = (b & 0x000fffffffffffffULL) | 0x0010000000000000ULL;
/* 1 <= x < 2 / 1 <= y < 2 => 0.5 < q < 2.0 */
q = 0ULL;
a = a - b;
normalize = (long long)a < 0;
for (i = 0; i < (54 + normalize); i++) {
if ((long long)a < 0) {
q = q + q;
a = a + a;
a = a + b;
} else {
q = q + q + 1;
a = a + a;
a = a - b;
}
}
expoa = (expoa - expob) - normalize + 1022;
if ((unsigned)expoa < 0x7fe) {
/* no tie case for division if not denorm, one round bit sufficient */
if (mode == cudaRoundNearest) {
q++;
} else if ((mode == cudaRoundPosInf) && (!sign)) {
if ((q & 1) || (a != (unsigned long long)-(long long)b)) q += 2;
} else if ((mode == cudaRoundMinInf) && (sign)) {
if ((q & 1) || (a != (unsigned long long)-(long long)b)) q += 2;
}
q = (q >> 1) & 0x000fffffffffffffULL;
q = sign | (((long long)(expoa+1) << 52) + q);
} else if (expoa >= 0x7fe) {
/* overflow, return infinity or largest normal*/
if ((mode == cudaRoundNearest) ||
((mode == cudaRoundPosInf) && !sign) ||
((mode == cudaRoundMinInf) && sign)) {
q = sign | 0x7ff0000000000000ULL;
} else {
q = sign | 0x7fefffffffffffffULL;
}
} else {
/* denormal results can involve tie cases, generate sticky bit */
unsigned long long sticky;
expoa = -expoa;
if (expoa > 63) expoa = 63;
/* 1 <= expoa <= 63 */
sticky = (q << (64 - expoa)) | (a!=(unsigned long long)-(long long)b);
q = q >> expoa;
if (mode == cudaRoundNearest) {
if ((q & 1) && (sticky || (q & 2))) q++;
} else if ((mode == cudaRoundPosInf) && (!sign)) {
if ((q & 1) || (sticky)) q += 2;
} else if ((mode == cudaRoundMinInf) && (sign)) {
if ((q & 1) || (sticky)) q += 2;
}
q = q >> 1;
q = q | sign;
}
cvt.i = q;
return cvt.d;
}
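
The quotient loop above is radix-2 non-restoring division over the
implicit-one mantissas: a negative partial remainder yields a 0 quotient bit,
and the divisor is added back on the next doubled step rather than restored
immediately. The simpler restoring variant, shown below on plain integers,
computes the same quotient and is easier to follow (toy code; assumes d != 0
and d < 2^63 so the shifted remainder cannot overflow):

static unsigned long long toy_restoring_div(unsigned long long n,
                                            unsigned long long d)
{
    unsigned long long q = 0, r = 0;
    int i;
    for (i = 63; i >= 0; i--) {
        r = (r << 1) | ((n >> i) & 1); /* bring down the next dividend bit  */
        if (r >= d) {
            r -= d;                    /* trial subtraction succeeded...    */
            q |= 1ULL << i;            /* ...so this quotient bit is 1      */
        }                              /* else: restore; quotient bit is 0  */
    }
    return q;                          /* q == n / d, and r == n % d        */
}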
__device_func__(double __ddiv_rn(double a, double b))
{
return __internal_ddiv_kernel (a, b, cudaRoundNearest);
}
__device_func__(double __ddiv_rz(double a, double b))
{
return __internal_ddiv_kernel (a, b, cudaRoundZero);
}
__device_func__(double __ddiv_ru(double a, double b))
{
return __internal_ddiv_kernel (a, b, cudaRoundPosInf);
}
__device_func__(double __ddiv_rd(double a, double b))
{
return __internal_ddiv_kernel (a, b, cudaRoundMinInf);
}
__device_func__(double __drcp_rn(double a))
{
return __internal_ddiv_kernel (1.0, a, cudaRoundNearest);
}
__device_func__(double __drcp_rz(double a))
{
return __internal_ddiv_kernel (1.0, a, cudaRoundZero);
}
__device_func__(double __drcp_ru(double a))
{
return __internal_ddiv_kernel (1.0, a, cudaRoundPosInf);
}
__device_func__(double __drcp_rd(double a))
{
return __internal_ddiv_kernel (1.0, a, cudaRoundMinInf);
}
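
The four __ddiv_* entry points differ only in the rounding mode passed to the
kernel, and __drcp_* simply divides 1.0 by the argument. One property worth a
host spot-check, assuming these emulation paths are compiled in, the FPU is in
its default round-to-nearest mode, and the inputs are finite and nonzero:

#include <assert.h>

static void check_ddiv_bracketing(double a, double b)
{
    double lo  = __ddiv_rd(a, b);   /* round toward -infinity              */
    double hi  = __ddiv_ru(a, b);   /* round toward +infinity              */
    double ref = a / b;             /* host FPU, round to nearest even     */
    assert(lo <= ref && ref <= hi); /* directed modes bracket the result   */
    assert(__ddiv_rn(a, b) == ref); /* nearest-even must match the FPU     */
}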
__device_func__(double __internal_dsqrt_kernel (double a, enum cudaRoundMode mode))
{
volatile union __cudart_DoubleLonglongCvt cvt;
unsigned long long ia, manthi, mantlo;
unsigned long long t, q, r, s;
int expoa;
int round, sticky, odd;
int sign;
cvt.d = a;
ia = cvt.i;
expoa = ((int)(ia >> 52) & 0x7ff) - 1;
sign = (int)(ia >> 63);
/* handle special cases */
if (((unsigned)expoa >= 0x7fe) || sign) {
/* handle NaNs */
if ((ia << 1) > 0xffe0000000000000ULL) {
cvt.i |= 0x0008000000000000ULL;
return cvt.d;
}
/* arguments less than -0 */
if (ia > 0x8000000000000000ULL) {
cvt.i = 0xfff8000000000000ULL;
return cvt.d;
}
/* handle infinities */
if ((ia << 1) == 0xffe0000000000000ULL) {
return cvt.d;
}
/* handle zeros */
if ((ia << 1) == 0x0000000000000000ULL) {
return cvt.d;
}
/* handle denormals */
if (expoa < 0) {
ia = ia << 12;
while ((long long)ia > 0) {
ia = ia + ia;
expoa--;
}
ia = ia >> 11;
}
}
/* extract mantissa */
ia = (ia << 11) | 0x8000000000000000ULL;
if (!(expoa & 1)) {
/* exponent even: shift mantissa right by 1 bit */
ia >>= 1;
}
manthi = ia >> 32;
mantlo = ia & 0xffffffffULL;
/* A few Newton-Raphson iterations to get initial 16 result bits */
t = ((manthi >> 24) | 0x100) >> 1;
t = (expoa & 1) ? (t - 10) : t;
q = ((manthi >> 16) / t);
t = (((q + t) >> 1) << 8) | 0xff;
q = manthi / t;
t = (q + t) >> 1;
if (t > 0xffff) t = 0xffff;
/* compute remainder and adjust first result "digit" */
r = manthi - t * t;
while ((long long)r < 0) {
t--;
r += 2 * t + 1;
}
/* compute second result "digit" by longhand computation */
s = ((r << 15) + (mantlo >> 17)) / t;
if (s > 0xffff) s = 0xffff;
/* compute remainder and adjust second result "digit" */
r = (r << 32) + mantlo;
r = r - (t << 17) * s;
r = r - s * s;
t = (t << 16) + s;
while ((long long)r < 0) {
t--;
r += 2 * t + 1;
}
/* compute third result "digit" by longhand computation */
s = (r << 15) / t;
if (s > 0xffff) s = 0xffff;
/* compute remainder and adjust third result "digit" */
r = r << 32;
r = r - (t << 17) * s;
r = r - s * s;
t = (t << 16) + s;
while ((long long)r < 0) {
t--;
r += 2 * t + 1;
}
/* compute fourth result "digit" by longhand computation */
s = (r << 5) / t;
if (s > 0x3f) s = 0x3f;
/* compute remainder and adjust fourth result "digit" */
r = r << 12;
r = r - ((t << 7) + s) * s;
t = (t << 6) + s;
while ((long long)r < 0) {
t--;
r += 2 * t + 1;
}
/* prepare for rounding mantissa */
round = (int)(t & 1);
sticky = (r != 0ULL);
t = t >> 1;
odd = (int)(t & 1);
/* round mantissa */
if (mode == cudaRoundNearest) {
t += round && (sticky || odd);
} else if (mode == cudaRoundPosInf) {
t += round || sticky;
}
/* construct final result */
expoa = (expoa >> 1) + 0x1ff;
q = (((unsigned long long int)expoa) << 52) + t;
cvt.i = q;
return cvt.d;
}
__device_func__(double __dsqrt_rn(double a))
{
return __internal_dsqrt_kernel (a, cudaRoundNearest);
}
__device_func__(double __dsqrt_rz(double a))
{
return __internal_dsqrt_kernel (a, cudaRoundZero);
}
__device_func__(double __dsqrt_ru(double a))
{
return __internal_dsqrt_kernel (a, cudaRoundPosInf);
}
__device_func__(double __dsqrt_rd(double a))
{
return __internal_dsqrt_kernel (a, cudaRoundMinInf);
}
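
The square-root kernel halves the exponent and extracts the mantissa root in
16-bit "digits"; the parity test on the biased exponent in effect pre-shifts
the mantissa so the exponent that remains is exactly divisible by two, using
sqrt(m * 2^(2k)) == sqrt(m) * 2^k. A host spot-check in the same spirit as
the division one, assuming <math.h> sqrt is correctly rounded to nearest
(true on IEEE-754 hosts) and a is finite and non-negative:

#include <math.h>

static int check_dsqrt_bracketing(double a)
{
    double ref = sqrt(a);                   /* round-to-nearest reference   */
    return __dsqrt_rd(a) <= ref && ref <= __dsqrt_ru(a)
        && __dsqrt_rn(a) == ref;            /* nonzero iff all checks pass  */
}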
__device_func__(float __fmaf_ieee_rn(float a, float b, float c))
{
return __fmaf_rn(a, b, c);
}
__device_func__(float __fmaf_ieee_ru(float a, float b, float c))
{
return __fmaf_ru(a, b, c);
}
__device_func__(float __fmaf_ieee_rd(float a, float b, float c))
{
return __fmaf_rd(a, b, c);
}
__device_func__(float __fmaf_ieee_rz(float a, float b, float c))
{
return __fmaf_rz(a, b, c);
}
#endif /* !defined(__CUDABE__) */
#endif /* __cplusplus && __CUDACC__ */ #endif /* __cplusplus && __CUDACC__ */
#endif /* !__SM_20_INTRINSICS_H__ */ #endif /* !__SM_20_INTRINSICS_H__ */
 End of changes. 3 change blocks. 
388 lines changed or deleted 4 lines changed or added
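
For reference, the ballot()/syncthreads_*() wrappers above only cast bool to
the underlying int intrinsics, and the deleted host fallbacks model a single
thread (a warp of one, hence __ballot returning 1). A device-side sketch of
the typical use of ballot, counting predicate hits per warp with __popc; the
kernel name and the assumption of a one-dimensional grid with warp-multiple
blocks and in-bounds tid are illustrative, not part of the header:

__global__ void count_positive(const float *in, int *warp_counts)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int mask = ballot(in[tid] > 0.0f);  /* one vote bit per lane   */
    if ((threadIdx.x & 31) == 0)                 /* lane 0 of each warp     */
        warp_counts[tid >> 5] = __popc(mask);    /* popcount of the votes   */
}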


 storage_class.h   storage_class.h 
/* /*
* Copyright 1993-2008 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 End of changes. 1 change blocks. 
1 lines changed or deleted 1 lines changed or added


 texture_fetch_functions.h   texture_fetch_functions.h 
skipping to change at line 47 skipping to change at line 47
#define __TEXTURE_FETCH_FUNCTIONS_H__ #define __TEXTURE_FETCH_FUNCTIONS_H__
#if defined(__cplusplus) && defined(__CUDACC__) #if defined(__cplusplus) && defined(__CUDACC__)
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#include "builtin_types.h"
#include "cuda_texture_types.h" #include "cuda_texture_types.h"
#include "host_defines.h" #include "host_defines.h"
#include "texture_types.h" #include "texture_types.h"
#include "vector_functions.h" #include "vector_functions.h"
#include "vector_types.h" #include "vector_types.h"
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
skipping to change at line 1875 skipping to change at line 1876
} }
static __inline__ __device__ float4 tex3D(texture<ushort4, 3, cudaReadModeNormalizedFloat> t, float x, float y, float z) static __inline__ __device__ float4 tex3D(texture<ushort4, 3, cudaReadModeNormalizedFloat> t, float x, float y, float z)
{ {
uint4 v = __utexfetch(t, make_float4(x, y, z, 0)); uint4 v = __utexfetch(t, make_float4(x, y, z, 0));
float4 w = make_float4(__int_as_float(v.x), __int_as_float(v.y), __int_as_float(v.z), __int_as_float(v.w)); float4 w = make_float4(__int_as_float(v.x), __int_as_float(v.y), __int_as_float(v.z), __int_as_float(v.w));
return make_float4(w.x, w.y, w.z, w.w); return make_float4(w.x, w.y, w.z, w.w);
} }
#elif !defined(__CUDACC__) #elif defined(__CUDABE__)
#include "host_defines.h"
#include "crt/func_macro.h"
#if defined(__CUDABE__)
extern uint4 __utexfetchi1D(const void*, int4); extern uint4 __utexfetchi1D(const void*, int4);
extern int4 __itexfetchi1D(const void*, int4); extern int4 __itexfetchi1D(const void*, int4);
extern float4 __ftexfetchi1D(const void*, int4); extern float4 __ftexfetchi1D(const void*, int4);
extern uint4 __utexfetch1D(const void*, float4); extern uint4 __utexfetch1D(const void*, float4);
extern int4 __itexfetch1D(const void*, float4); extern int4 __itexfetch1D(const void*, float4);
extern float4 __ftexfetch1D(const void*, float4); extern float4 __ftexfetch1D(const void*, float4);
extern uint4 __utexfetch2D(const void*, float4); extern uint4 __utexfetch2D(const void*, float4);
extern int4 __itexfetch2D(const void*, float4); extern int4 __itexfetch2D(const void*, float4);
extern float4 __ftexfetch2D(const void*, float4); extern float4 __ftexfetch2D(const void*, float4);
skipping to change at line 1908 skipping to change at line 1904
__itexfetchi1D(t, i) __itexfetchi1D(t, i)
#define __ftexfetchi(t, i) \ #define __ftexfetchi(t, i) \
__ftexfetchi1D(t, i) __ftexfetchi1D(t, i)
#define __utexfetch(t, i, d) \ #define __utexfetch(t, i, d) \
__utexfetch##d##D(t, i) __utexfetch##d##D(t, i)
#define __itexfetch(t, i, d) \ #define __itexfetch(t, i, d) \
__itexfetch##d##D(t, i) __itexfetch##d##D(t, i)
#define __ftexfetch(t, i, d) \ #define __ftexfetch(t, i, d) \
__ftexfetch##d##D(t, i) __ftexfetch##d##D(t, i)
#else /* __CUDABE__ */
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
extern void CUDARTAPI __cudaTextureFetch(const void *tex, void *index, int integer, void *val);
#if defined(__cplusplus)
}
#endif /* __cplusplus */
__device_func__(int4 __itexfetchi(const void *tex, int4 index))
{
int4 val;
__cudaTextureFetch(tex, (void*)&index, 1, (void*)&val);
return val;
}
__device_func__(uint4 __utexfetchi(const void *tex, int4 index))
{
uint4 val;
__cudaTextureFetch(tex, (void*)&index, 1, (void*)&val);
return val;
}
__device_func__(float4 __ftexfetchi(const void *tex, int4 index))
{
float4 val;
__cudaTextureFetch(tex, (void*)&index, 1, (void*)&val);
return val;
}
__device_func__(int4 __itexfetch(const void *tex, float4 index, int dim))
{
int4 val;
__cudaTextureFetch(tex, (void*)&index, 0, (void*)&val);
return val;
}
__device_func__(uint4 __utexfetch(const void *tex, float4 index, int dim))
{
uint4 val;
__cudaTextureFetch(tex, (void*)&index, 0, (void*)&val);
return val;
}
__device_func__(float4 __ftexfetch(const void *tex, float4 index, int dim))
{
float4 val;
__cudaTextureFetch(tex, (void*)&index, 0, (void*)&val);
return val;
}
#endif /* __CUDABE__ */
#endif /* __cplusplus && __CUDACC__ */ #endif /* __cplusplus && __CUDACC__ */
#endif /* !__TEXTURE_FETCH_FUNCTIONS_H__ */ #endif /* !__TEXTURE_FETCH_FUNCTIONS_H__ */
 End of changes. 3 change blocks. 
75 lines changed or deleted 2 lines changed or added
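
The ##d## token pasting in the dispatch macros above selects the 1D/2D/3D
entry point at preprocessing time, so the dimensionality argument never
survives to run time. The same pattern in miniature, with hypothetical fetch
functions standing in for the texture intrinsics:

#include <stdio.h>

#define DISPATCH(name, d) name##d##D   /* same trick as __utexfetch##d##D */

static int fetch1D(int x) { return x; }
static int fetch2D(int x) { return x * 2; }

int main(void)
{
    /* DISPATCH(fetch, 1) expands to fetch1D; DISPATCH(fetch, 2) to fetch2D */
    printf("%d %d\n", DISPATCH(fetch, 1)(21), DISPATCH(fetch, 2)(21));
    return 0;
}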


 vector_functions.h   vector_functions.h 
skipping to change at line 45 skipping to change at line 45
#if !defined(__VECTOR_FUNCTIONS_H__) #if !defined(__VECTOR_FUNCTIONS_H__)
#define __VECTOR_FUNCTIONS_H__ #define __VECTOR_FUNCTIONS_H__
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#include "builtin_types.h"
#include "host_defines.h" #include "host_defines.h"
#include "vector_types.h" #include "vector_types.h"
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
static __inline__ __host__ __device__ char1 make_char1(signed char x) static __inline__ __host__ __device__ char1 make_char1(signed char x)
 End of changes. 1 change blocks. 
0 lines changed or deleted 1 lines changed or added


 vector_types.h   vector_types.h 
skipping to change at line 45 skipping to change at line 45
#if !defined(__VECTOR_TYPES_H__) #if !defined(__VECTOR_TYPES_H__)
#define __VECTOR_TYPES_H__ #define __VECTOR_TYPES_H__
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#include "builtin_types.h"
#include "host_defines.h" #include "host_defines.h"
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#if !defined(__cuda_assign_operators)
#define __cuda_assign_operators(tag)
#endif /* !__cuda_assign_operators */
#if !defined(__CUDACC__) && !defined(__CUDABE__) && \ #if !defined(__CUDACC__) && !defined(__CUDABE__) && \
defined(_WIN32) && !defined(_WIN64) defined(_WIN32) && !defined(_WIN64)
#define __cuda_builtin_vector_align8(tag, ...) \ #define __cuda_builtin_vector_align8(tag, members) \
struct tag { \ struct tag { \
union { \ union { \
struct { __VA_ARGS__; }; \ struct { members }; \
struct { long long int :1,:0; }; \ struct { long long int :1,:0; }; \
}; \ }; \
__cuda_assign_operators(tag) \
} }
#else /* !__CUDACC__ && !__CUDABE__ && _WIN32 && !_WIN64 */ #else /* !__CUDACC__ && !__CUDABE__ && _WIN32 && !_WIN64 */
#define __cuda_builtin_vector_align8(tag, ...) \ #define __cuda_builtin_vector_align8(tag, members) \
struct __align__(8) tag { \ struct __align__(8) tag { \
__VA_ARGS__; \ members \
__cuda_assign_operators(tag) \
} }
#endif /* !__CUDACC__ && !__CUDABE__ && _WIN32 && !_WIN64 */ #endif /* !__CUDACC__ && !__CUDABE__ && _WIN32 && !_WIN64 */
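
Written out by hand, the first branch of __cuda_builtin_vector_align8 expands
int2 to the shape below. The zero-width long long bitfield gives the anonymous
union, and hence the struct, 8-byte alignment without __declspec(align(8)),
which 32-bit MSVC historically rejected on by-value parameters; the type name
here is illustrative only:

struct int2_msvc_expansion {           /* first branch: _WIN32 && !_WIN64    */
    union {
        struct { int x; int y; };          /* the payload members            */
        struct { long long int :1, :0; };  /* alignment-only; contributes    */
    };                                     /* long long's 8-byte alignment   */
};                                     /* anonymous structs: MS extension    */
/* On every other platform the second branch applies and the expansion is
 * simply: struct __align__(8) int2 { int x, y; };  (__align__ maps to the
 * compiler's native alignment attribute via host_defines.h) */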
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct char1 struct char1
{ {
signed char x; signed char x;
__cuda_assign_operators(char1)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct uchar1 struct uchar1
{ {
unsigned char x; unsigned char x;
__cuda_assign_operators(uchar1)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __align__(2) char2 struct __align__(2) char2
{ {
signed char x, y; signed char x, y;
__cuda_assign_operators(char2)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __align__(2) uchar2 struct __align__(2) uchar2
{ {
unsigned char x, y; unsigned char x, y;
__cuda_assign_operators(uchar2)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct char3 struct char3
{ {
signed char x, y, z; signed char x, y, z;
__cuda_assign_operators(char3)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct uchar3 struct uchar3
{ {
unsigned char x, y, z; unsigned char x, y, z;
__cuda_assign_operators(uchar3)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __align__(4) char4 struct __align__(4) char4
{ {
signed char x, y, z, w; signed char x, y, z, w;
__cuda_assign_operators(char4)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __align__(4) uchar4 struct __align__(4) uchar4
{ {
unsigned char x, y, z, w; unsigned char x, y, z, w;
__cuda_assign_operators(uchar4)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct short1 struct short1
{ {
short x; short x;
__cuda_assign_operators(short1)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct ushort1 struct ushort1
{ {
unsigned short x; unsigned short x;
__cuda_assign_operators(ushort1)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __align__(4) short2 struct __align__(4) short2
{ {
short x, y; short x, y;
__cuda_assign_operators(short2)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __align__(4) ushort2 struct __align__(4) ushort2
{ {
unsigned short x, y; unsigned short x, y;
__cuda_assign_operators(ushort2)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct short3 struct short3
{ {
short x, y, z; short x, y, z;
__cuda_assign_operators(short3)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct ushort3 struct ushort3
{ {
unsigned short x, y, z; unsigned short x, y, z;
__cuda_assign_operators(ushort3)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
__cuda_builtin_vector_align8(short4, short x, y, z, w); __cuda_builtin_vector_align8(short4, short x; short y; short z; short w;);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
__cuda_builtin_vector_align8(ushort4, unsigned short x, y, z, w); __cuda_builtin_vector_align8(ushort4, unsigned short x; unsigned short y; unsigned short z; unsigned short w;);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct int1 struct int1
{ {
int x; int x;
__cuda_assign_operators(int1)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct uint1 struct uint1
{ {
unsigned int x; unsigned int x;
__cuda_assign_operators(uint1)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
__cuda_builtin_vector_align8(int2, int x, y); __cuda_builtin_vector_align8(int2, int x; int y;);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
__cuda_builtin_vector_align8(uint2, unsigned int x, y); __cuda_builtin_vector_align8(uint2, unsigned int x; unsigned int y;);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct int3 struct int3
{ {
int x, y, z; int x, y, z;
__cuda_assign_operators(int3)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct uint3 struct uint3
{ {
unsigned int x, y, z; unsigned int x, y, z;
__cuda_assign_operators(uint3)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(16) int4 struct __builtin_align__(16) int4
{ {
int x, y, z, w; int x, y, z, w;
__cuda_assign_operators(int4)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(16) uint4 struct __builtin_align__(16) uint4
{ {
unsigned int x, y, z, w; unsigned int x, y, z, w;
__cuda_assign_operators(uint4)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct long1 struct long1
{ {
long int x; long int x;
__cuda_assign_operators(long1)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct ulong1 struct ulong1
{ {
unsigned long x; unsigned long x;
__cuda_assign_operators(ulong1)
}; };
#if defined (_WIN32) #if defined (_WIN32)
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
__cuda_builtin_vector_align8(long2, long int x, y); __cuda_builtin_vector_align8(long2, long int x; long int y;);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
__cuda_builtin_vector_align8(ulong2, unsigned long int x, y); __cuda_builtin_vector_align8(ulong2, unsigned long int x; unsigned long int y;);
#else /* _WIN32 */ #else /* _WIN32 */
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __align__(2*sizeof(long int)) long2 struct __align__(2*sizeof(long int)) long2
{ {
long int x, y; long int x, y;
__cuda_assign_operators(long2)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __align__(2*sizeof(unsigned long int)) ulong2 struct __align__(2*sizeof(unsigned long int)) ulong2
{ {
unsigned long int x, y; unsigned long int x, y;
__cuda_assign_operators(ulong2)
}; };
#endif /* _WIN32 */ #endif /* _WIN32 */
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct long3 struct long3
{ {
long int x, y, z; long int x, y, z;
__cuda_assign_operators(long3)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct ulong3 struct ulong3
{ {
unsigned long int x, y, z; unsigned long int x, y, z;
__cuda_assign_operators(ulong3)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(16) long4 struct __builtin_align__(16) long4
{ {
long int x, y, z, w; long int x, y, z, w;
__cuda_assign_operators(long4)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(16) ulong4 struct __builtin_align__(16) ulong4
{ {
unsigned long int x, y, z, w; unsigned long int x, y, z, w;
__cuda_assign_operators(ulong4)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct float1 struct float1
{ {
float x; float x;
__cuda_assign_operators(float1)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
__cuda_builtin_vector_align8(float2, float x, y); __cuda_builtin_vector_align8(float2, float x; float y;);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct float3 struct float3
{ {
float x, y, z; float x, y, z;
__cuda_assign_operators(float3)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(16) float4 struct __builtin_align__(16) float4
{ {
float x, y, z, w; float x, y, z, w;
__cuda_assign_operators(float4)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct longlong1 struct longlong1
{ {
long long int x; long long int x;
__cuda_assign_operators(longlong1)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct ulonglong1 struct ulonglong1
{ {
unsigned long long int x; unsigned long long int x;
__cuda_assign_operators(ulonglong1)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(16) longlong2 struct __builtin_align__(16) longlong2
{ {
long long int x, y; long long int x, y;
__cuda_assign_operators(longlong2)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(16) ulonglong2 struct __builtin_align__(16) ulonglong2
{ {
unsigned long long int x, y; unsigned long long int x, y;
__cuda_assign_operators(ulonglong2)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct longlong3 struct longlong3
{ {
long long int x, y, z; long long int x, y, z;
__cuda_assign_operators(longlong3)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct ulonglong3 struct ulonglong3
{ {
unsigned long long int x, y, z; unsigned long long int x, y, z;
__cuda_assign_operators(ulonglong3)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(16) longlong4 struct __builtin_align__(16) longlong4
{ {
long long int x, y, z, w; long long int x, y, z, w;
__cuda_assign_operators(longlong4)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(16) ulonglong4 struct __builtin_align__(16) ulonglong4
{ {
unsigned long long int x, y, z, w; unsigned long long int x, y, z, w;
__cuda_assign_operators(ulonglong4)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct double1 struct double1
{ {
double x; double x;
__cuda_assign_operators(double1)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(16) double2 struct __builtin_align__(16) double2
{ {
double x, y; double x, y;
__cuda_assign_operators(double2)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct double3 struct double3
{ {
double x, y, z; double x, y, z;
__cuda_assign_operators(double3)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(16) double4 struct __builtin_align__(16) double4
{ {
double x, y, z, w; double x, y, z, w;
__cuda_assign_operators(double4)
}; };
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
typedef struct char1 char1; typedef struct char1 char1;
skipping to change at line 522 skipping to change at line 472
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct dim3 struct dim3
{ {
unsigned int x, y, z; unsigned int x, y, z;
#if defined(__cplusplus) && !defined(__CUDABE__) #if defined(__cplusplus)
__host__ __device__ dim3(unsigned int x = 1, unsigned int y = 1, unsigned int z = 1) : x(x), y(y), z(z) {} __host__ __device__ dim3(unsigned int vx = 1, unsigned int vy = 1, unsigned int vz = 1) : x(vx), y(vy), z(vz) {}
__host__ __device__ dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {} __host__ __device__ dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {}
__host__ __device__ operator uint3(void) { uint3 t; t.x = x; t.y = y; t.z = z; return t; } __host__ __device__ operator uint3(void) { uint3 t; t.x = x; t.y = y; t.z = z; return t; }
#endif /* __cplusplus && !__CUDABE__ */ #endif /* __cplusplus */
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
typedef struct dim3 dim3; typedef struct dim3 dim3;
#undef __cuda_assign_operators
#undef __cuda_builtin_vector_align8 #undef __cuda_builtin_vector_align8
#endif /* !__VECTOR_TYPES_H__ */ #endif /* !__VECTOR_TYPES_H__ */
 End of changes. 57 change blocks. 
72 lines changed or deleted 21 lines changed or added
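
Two of the dim3 changes above are worth noting: the constructor parameters
were renamed vx/vy/vz so the initializer list no longer spells the shadowing
form x(x), and the constructors are now visible under __CUDABE__ as well. At a
call site the converting constructor and the uint3 conversion operator combine
as in this sketch (the kernel name is illustrative):

__global__ void kern(void);

static void launch_example(void)
{
    dim3 grid(16, 16);        /* z defaults to 1                     */
    dim3 block = 256;         /* implicit dim3(256, 1, 1)            */
    kern<<<grid, block>>>();  /* the <<<>>> launch takes two dim3s   */
    uint3 raw = grid;         /* conversion operator back to uint3   */
    (void)raw;
}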
