__cudaFatFormat.h

/*
- * Copyright 1993-2008 NVIDIA Corporation. All rights reserved.
+ * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws. Users and possessors of this source code
 * are hereby granted a nonexclusive, royalty-free license to use this code
 * in individual and commercial software.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR

skipping to change at line 166 (old) / 167 (new)
    char*                          ident;
    char*                          usageMode;
    __cudaFatPtxEntry             *ptx;
    __cudaFatCubinEntry           *cubin;
    __cudaFatDebugEntry           *debug;
    void*                          debugInfo;
    unsigned int                   flags;
    __cudaFatSymbol               *exported;
    __cudaFatSymbol               *imported;
    struct __cudaFatCudaBinaryRec *dependends;
+   unsigned int                   characteristic;
} __cudaFatCudaBinary;
/*
 * Current version and magic numbers:
 */
#define __cudaFatVERSION   0x00000003
#define __cudaFatMAGIC     0x1ee55a01

/*
 * Version history log:
 *    1  : __cudaFatDebugEntry field added to __cudaFatCudaBinary struct
 *    2  : flags and debugInfo field added.
 *    3  : import/export symbol list
+*    4  : characteristic added
 */
/*--------------------------------- Functions --------------------------------*/

typedef enum {
    __cudaFatAvoidPTX,
    __cudaFatPreferBestCode
} __cudaFatCompilationPolicy;

/*

End of changes. 3 change blocks.
1 line changed or deleted, 3 lines changed or added
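A consumer of this struct would normally gate on the magic number and format version before touching newer fields such as characteristic. A minimal sketch under stated assumptions: the full header (elided by this excerpt) is assumed to begin the struct with magic and version members, and fatbinIsUsable is a hypothetical helper, not part of the header:

    #include "__cudaFatFormat.h"

    /* Hypothetical helper; 'magic' and 'version' are assumed to be the
     * leading members of __cudaFatCudaBinary, which this excerpt elides. */
    static int fatbinIsUsable(const __cudaFatCudaBinary *bin)
    {
        if (bin->magic != __cudaFatMAGIC)
            return 0;   /* not a fat binary image at all */
        if (bin->version > __cudaFatVERSION)
            return 0;   /* produced by a newer toolchain than we understand */
        return 1;       /* fields defined for this version are safe to read */
    }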


builtin_types.h

/*
- * Copyright 1993-2008 NVIDIA Corporation. All rights reserved.
+ * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws. Users and possessors of this source code
 * are hereby granted a nonexclusive, royalty-free license to use this code
 * in individual and commercial software.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR

End of changes. 1 change block.
1 line changed or deleted, 1 line changed or added


channel_descriptor.h

/*
- * Copyright 1993-2008 NVIDIA Corporation. All rights reserved.
+ * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws. Users and possessors of this source code
 * are hereby granted a nonexclusive, royalty-free license to use this code
 * in individual and commercial software.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR

skipping to change at line 58 (old) / 58 (new)
#include "cuda_runtime_api.h" #include "cuda_runtime_api.h"
#include "host_defines.h" #include "host_defines.h"
#include "vector_types.h" #include "vector_types.h"
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
+/**
+ * \addtogroup CUDART_HIGHLEVEL
+ *
+ * @{
+ */
+
+/**
+ * \brief \hl Returns a channel descriptor using the specified format
+ *
+ * Returns a channel descriptor with format \p f and number of bits of each
+ * component \p x, \p y, \p z, and \p w. The ::cudaChannelFormatDesc is
+ * defined as:
+ * \code
+  struct cudaChannelFormatDesc {
+    int x, y, z, w;
+    enum cudaChannelFormatKind f;
+  };
+ * \endcode
+ *
+ * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
+ * ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat.
+ *
+ * \return
+ * Channel descriptor with format \p f
+ *
+ * \sa \ref ::cudaCreateChannelDesc(int,int,int,int,cudaChannelFormatKind) "cudaCreateChannelDesc (Low level)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (High level)",
+ * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, size_t) "cudaBindTexture (High level, inherited channel descriptor)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (High level)",
+ * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, const struct cudaArray*, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (High level)",
+ * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, const struct cudaArray*) "cudaBindTextureToArray (High level, inherited channel descriptor)",
+ * \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cudaUnbindTexture (High level)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, dim, readMode>&) "cudaGetTextureAlignmentOffset (High level)"
+ */
template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
{
  return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
}

template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char>(void)
{
  int e = (int)sizeof(char) * 8;

#if __SIGNED_CHARS__

skipping to change at line 332 (old) / 367 (new)
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float4>(void)
{
  int e = (int)sizeof(float) * 8;

  return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
}

#endif /* __cplusplus */

+/** @} */
+/** @} */ /* END CUDART_TEXTURE_HL */

#endif /* !__CHANNEL_DESCRIPTOR_H__ */

End of changes. 3 change blocks.
1 line changed or deleted, 52 lines changed or added
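As a usage note for the new documentation block above: the templated overloads simply forward to the low-level cudaCreateChannelDesc(int,int,int,int,cudaChannelFormatKind). A hedged sketch of pairing such a descriptor with a CUDA array (make_float4_array is a hypothetical helper; must be compiled as C++ under nvcc because of the template specialization; error checks elided):

    #include <cuda_runtime.h>
    #include <channel_descriptor.h>

    void make_float4_array(void)
    {
        /* Four 32-bit float components, i.e. x = y = z = w = 32, kind = float. */
        cudaChannelFormatDesc desc = cudaCreateChannelDesc<float4>();

        cudaArray *arr = 0;
        cudaMallocArray(&arr, &desc, 256, 256);   /* 256 x 256 texels */

        /* ... cudaMemcpyToArray / texture binding would go here ... */

        cudaFreeArray(arr);
    }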


common_functions.h

/*
- * Copyright 1993-2008 NVIDIA Corporation. All rights reserved.
+ * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws. Users and possessors of this source code
 * are hereby granted a nonexclusive, royalty-free license to use this code
 * in individual and commercial software.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR

skipping to change at line 56 (old) / 56 (new)
#include "host_defines.h" #include "host_defines.h"
#include <time.h> #include <time.h>
#include <string.h> #include <string.h>
extern "C" extern "C"
{ {
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __host__ __device__ clock_t clock(void) __THROW; extern _CRTIMP __host__ __device__ clock_t clock(void) __THROW;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __host__ __device__ void *memset(void *s, int c, size_t n) __THROW; extern __host__ __device__ void *memset(void *s, int c, size_t n) __THROW;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __host__ __device__ void *memcpy(void *d, const void *s, size_t n) _ _THROW; extern __host__ __device__ void *memcpy(void *d, const void *s, size_t n) _ _THROW;
} }
#elif !defined(__CUDACC__) #elif !defined(__CUDACC__)
 End of changes. 2 change blocks. 
2 lines changed or deleted 2 lines changed or added


cuComplex.h

/*
- * Copyright 1993-2008 NVIDIA Corporation. All rights reserved.
+ * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws. Users and possessors of this source code
 * are hereby granted a nonexclusive, royalty-free license to use this code
 * in individual and commercial software.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR

End of changes. 1 change block.
1 line changed or deleted, 1 line changed or added


cublas.h

/*
- * Copyright 1993-2008 NVIDIA Corporation. All rights reserved.
+ * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws. Users and possessors of this source code
 * are hereby granted a nonexclusive, royalty-free license to use this code
 * in individual and commercial software.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR

skipping to change at line 1209 (old) / 1209 (new)
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
float CUBLASAPI cublasScnrm2 (int n, const cuComplex *x, int incx);
+/* ----------------- CUBLAS double-complex BLAS1 functions ------------------ */
+
+/*
+ * cuDoubleComplex
+ * zdotu (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, int incy)
+ *
+ * computes the dot product of two double-complex vectors. It returns the
+ * dot product of the double-complex vectors x and y if successful, and
+ * double-complex zero otherwise. It computes the sum for i = 0 to n - 1 of
+ * x[lx + i * incx] * y[ly + i * incy], where lx = 1 if incx >= 0, else
+ * lx = 1 + (1 - n) * incx; ly is defined in a similar way using incy.
+ *
+ * Input
+ * -----
+ * n      number of elements in input vectors
+ * x      double-complex vector with n elements
+ * incx   storage spacing between elements of x
+ * y      double-complex vector with n elements
+ * incy   storage spacing between elements of y
+ *
+ * Output
+ * ------
+ * returns double-complex dot product (zero if n <= 0)
+ *
+ * Reference: http://www.netlib.org/blas/zdotu.f
+ *
+ * Error status for this function can be retrieved via cublasGetError().
+ *
+ * Error Status
+ * ------------
+ * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
+ * CUBLAS_STATUS_EXECUTION_FAILED if function failed to execute on GPU
+ */
+cuDoubleComplex CUBLASAPI cublasZdotu (int n, const cuDoubleComplex *x, int incx,
+                                       const cuDoubleComplex *y, int incy);
+
+/*
+ * void
+ * cublasZscal (int n, cuDoubleComplex alpha, cuDoubleComplex *x, int incx)
+ *
+ * replaces double-complex vector x with double-complex alpha * x. For i
+ * = 0 to n - 1, it replaces x[ix + i * incx] with alpha * x[ix + i * incx],
+ * where ix = 1 if incx >= 0, else ix = 1 + (1 - n) * incx.
+ *
+ * Input
+ * -----
+ * n      number of elements in input vectors
+ * alpha  double-complex scalar multiplier
+ * x      double-complex vector with n elements
+ * incx   storage spacing between elements of x
+ *
+ * Output
+ * ------
+ * x      double-complex result (unchanged if n <= 0 or incx <= 0)
+ *
+ * Reference: http://www.netlib.org/blas/zscal.f
+ *
+ * Error status for this function can be retrieved via cublasGetError().
+ *
+ * Error Status
+ * ------------
+ * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
+ * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
+ */
+void CUBLASAPI cublasZscal (int n, cuDoubleComplex alpha, cuDoubleComplex *x, int incx);
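To make the calling convention of these two new BLAS1 entry points concrete, here is a hedged host-side sketch (scale_and_dot is a hypothetical helper; the legacy cublasInit/cublasAlloc/cublasSetVector helpers are declared elsewhere in this header, and make_cuDoubleComplex comes from cuComplex.h; status checks elided):

    #include <cublas.h>

    void scale_and_dot(int n, const cuDoubleComplex *hx, cuDoubleComplex *hy)
    {
        cuDoubleComplex *dx, *dy;
        cuDoubleComplex alpha = make_cuDoubleComplex(2.0, 0.0);

        cublasInit();
        cublasAlloc(n, sizeof(cuDoubleComplex), (void **)&dx);
        cublasAlloc(n, sizeof(cuDoubleComplex), (void **)&dy);
        cublasSetVector(n, sizeof(cuDoubleComplex), hx, 1, dx, 1);
        cublasSetVector(n, sizeof(cuDoubleComplex), hy, 1, dy, 1);

        cublasZscal(n, alpha, dx, 1);                       /* x := alpha * x */
        cuDoubleComplex d = cublasZdotu(n, dx, 1, dy, 1);   /* d := x . y     */
        (void)d;

        cublasFree(dx);
        cublasFree(dy);
        cublasShutdown();
    }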
/* --------------- CUBLAS single precision BLAS2 functions  ---------------- */

/*
 * void
 * cublasSgbmv (char trans, int m, int n, int kl, int ku, float alpha,
 *              const float *A, int lda, const float *x, int incx, float beta,
 *              float *y, int incy)
 *
 * performs one of the matrix-vector operations
 *

skipping to change at line 2114 (old) / 2180 (new)
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if incx == 0 or if n < 0 or n > 4070
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasStrsv (char uplo, char trans, char diag, int n,
                            const float *A, int lda, float *x, int incx);
+/* ----------------- CUBLAS double-complex BLAS2 functions ------------------ */
+
+/*
+ * void
+ * cublasZgemv (char trans, int m, int n, cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
+ *              const cuDoubleComplex *x, int incx, cuDoubleComplex beta, cuDoubleComplex *y, int incy)
+ *
+ * performs one of the matrix-vector operations
+ *
+ *    y = alpha * op(A) * x + beta * y,
+ *
+ * where op(A) is one of
+ *
+ *    op(A) = A   or   op(A) = transpose(A)
+ *
+ * where alpha and beta are double-complex scalars, x and y are double-complex
+ * vectors, and A is an m x n matrix consisting of double-complex elements.
+ * Matrix A is stored in column major format, and lda is the leading
+ * dimension of the two-dimensional array in which A is stored.
+ *
+ * Input
+ * -----
+ * trans  specifies op(A). If trans = 'n' or 'N', op(A) = A. If trans =
+ *        't', 'T', 'c', or 'C', op(A) = transpose(A)
+ * m      specifies the number of rows of the matrix A. m must be at least
+ *        zero.
+ * n      specifies the number of columns of the matrix A. n must be at least
+ *        zero.
+ * alpha  double-complex scalar multiplier applied to op(A).
+ * A      double-complex array of dimensions (lda, n) if trans = 'n' or
+ *        'N', and of dimensions (lda, m) otherwise. lda must be at least
+ *        max(1, m) in the first case and at least max(1, n) otherwise.
+ * lda    leading dimension of two-dimensional array used to store matrix A
+ * x      double-complex array of length at least (1 + (n - 1) * abs(incx))
+ *        when trans = 'N' or 'n' and at least (1 + (m - 1) * abs(incx))
+ *        otherwise.
+ * incx   specifies the storage spacing between elements of x. incx must not
+ *        be zero.
+ * beta   double-complex scalar multiplier applied to vector y. If beta
+ *        is zero, y is not read.
+ * y      double-complex array of length at least (1 + (m - 1) * abs(incy))
+ *        when trans = 'N' or 'n' and at least (1 + (n - 1) * abs(incy))
+ *        otherwise.
+ * incy   specifies the storage spacing between elements of y. incy must not
+ *        be zero.
+ *
+ * Output
+ * ------
+ * y      updated according to y = alpha * op(A) * x + beta * y
+ *
+ * Reference: http://www.netlib.org/blas/zgemv.f
+ *
+ * Error status for this function can be retrieved via cublasGetError().
+ *
+ * Error Status
+ * ------------
+ * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
+ * CUBLAS_STATUS_INVALID_VALUE    if m or n are < 0, or if incx or incy == 0
+ * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
+ */
+void CUBLASAPI cublasZgemv (char trans, int m, int n, cuDoubleComplex alpha,
+                            const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
+                            cuDoubleComplex beta, cuDoubleComplex *y, int incy);
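A hedged call-site sketch (apply is a hypothetical helper; the device buffers dA, dx, dy are assumed to be populated already, e.g. via cublasSetMatrix/cublasSetVector):

    #include <cublas.h>

    /* y := A * x for an m x n double-complex matrix A (column major, lda = m). */
    void apply(int m, int n, const cuDoubleComplex *dA,
               const cuDoubleComplex *dx, cuDoubleComplex *dy)
    {
        cuDoubleComplex one  = make_cuDoubleComplex(1.0, 0.0);
        cuDoubleComplex zero = make_cuDoubleComplex(0.0, 0.0);

        cublasZgemv('n', m, n, one, dA, m, dx, 1, zero, dy, 1);
        if (cublasGetError() != CUBLAS_STATUS_SUCCESS) {
            /* handle launch failure */
        }
    }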
/* ----------------- CUBLAS single complex BLAS2 functions ----------------- */

void CUBLASAPI cublasCgemv (char trans, int m, int n, cuComplex alpha,
                            const cuComplex *A, int lda, const cuComplex *x,
                            int incx, cuComplex beta, cuComplex *y, int incy);
void CUBLASAPI cublasCgbmv (char trans, int m, int n, int kl, int ku,
                            cuComplex alpha, const cuComplex *A, int lda,
                            const cuComplex *x, int incx, cuComplex beta,
                            cuComplex *y, int incy);
void CUBLASAPI cublasChemv (char uplo, int n, cuComplex alpha,
                            const cuComplex *A, int lda, const cuComplex *x,

skipping to change at line 3577 (old) / 3706 (new)
 * CUBLAS_STATUS_ARCH_MISMATCH    if invoked on device without DP support
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasDtrsm (char side, char uplo, char transa,
                            char diag, int m, int n, double alpha,
                            const double *A, int lda, double *B,
                            int ldb);
/*
 * void
+ * cublasZtrsm (char side, char uplo, char transa, char diag, int m, int n,
+ *              cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
+ *              cuDoubleComplex *B, int ldb)
+ *
+ * solves one of the matrix equations
+ *
+ *    op(A) * X = alpha * B,   or   X * op(A) = alpha * B,
+ *
+ * where alpha is a double-complex scalar, and X and B are m x n matrices
+ * that are composed of double-complex elements. A is a unit or non-unit,
+ * upper or lower triangular matrix, and op(A) is one of
+ *
+ *    op(A) = A   or   op(A) = transpose(A)   or   op(A) = conj(A').
+ *
+ * The result matrix X overwrites input matrix B; that is, on exit the result
+ * is stored in B. Matrices A and B are stored in column major format, and
+ * lda and ldb are the leading dimensions of the two-dimensional arrays that
+ * contain A and B, respectively.
+ *
+ * Input
+ * -----
+ * side   specifies whether op(A) appears on the left or right of X as
+ *        follows: side = 'L' or 'l' indicates solve op(A) * X = alpha * B.
+ *        side = 'R' or 'r' indicates solve X * op(A) = alpha * B.
+ * uplo   specifies whether the matrix A is an upper or lower triangular
+ *        matrix as follows: uplo = 'U' or 'u' indicates A is an upper
+ *        triangular matrix. uplo = 'L' or 'l' indicates A is a lower
+ *        triangular matrix.
+ * transa specifies the form of op(A) to be used in matrix multiplication
+ *        as follows: If transa = 'N' or 'n', then op(A) = A. If transa =
+ *        'T', 't', 'C', or 'c', then op(A) = transpose(A).
+ * diag   specifies whether or not A is a unit triangular matrix like so:
+ *        if diag = 'U' or 'u', A is assumed to be unit triangular. If
+ *        diag = 'N' or 'n', then A is not assumed to be unit triangular.
+ * m      specifies the number of rows of B. m must be at least zero.
+ * n      specifies the number of columns of B. n must be at least zero.
+ * alpha  is a double-complex scalar to be multiplied with B. When alpha is
+ *        zero, then A is not referenced and B need not be set before entry.
+ * A      is a double-complex array of dimensions (lda, k), where k is
+ *        m when side = 'L' or 'l', and is n when side = 'R' or 'r'. If
+ *        uplo = 'U' or 'u', the leading k x k upper triangular part of
+ *        the array A must contain the upper triangular matrix and the
+ *        strictly lower triangular part of A is not referenced. When
+ *        uplo = 'L' or 'l', the leading k x k lower triangular part of
+ *        the array A must contain the lower triangular matrix and the
+ *        strictly upper triangular part of A is not referenced. Note that
+ *        when diag = 'U' or 'u', the diagonal elements of A are not
+ *        referenced, and are assumed to be unity.
+ * lda    is the leading dimension of the two dimensional array containing A.
+ *        When side = 'L' or 'l' then lda must be at least max(1, m), when
+ *        side = 'R' or 'r' then lda must be at least max(1, n).
+ * B      is a double-complex array of dimensions (ldb, n). ldb must be
+ *        at least max(1, m). The leading m x n part of the array B must
+ *        contain the right-hand side matrix B. On exit B is overwritten
+ *        by the solution matrix X.
+ * ldb    is the leading dimension of the two dimensional array containing B.
+ *        ldb must be at least max(1, m).
+ *
+ * Output
+ * ------
+ * B      contains the solution matrix X satisfying op(A) * X = alpha * B,
+ *        or X * op(A) = alpha * B
+ *
+ * Reference: http://www.netlib.org/blas/ztrsm.f
+ *
+ * Error status for this function can be retrieved via cublasGetError().
+ *
+ * Error Status
+ * ------------
+ * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
+ * CUBLAS_STATUS_INVALID_VALUE    if m or n < 0
+ * CUBLAS_STATUS_ARCH_MISMATCH    if invoked on device without DP support
+ * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
+ */
+void CUBLASAPI cublasZtrsm (char side, char uplo, char transa,
+                            char diag, int m, int n, cuDoubleComplex alpha,
+                            const cuDoubleComplex *A, int lda,
+                            cuDoubleComplex *B, int ldb);
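A hedged call-site sketch (solve_upper is a hypothetical helper; A and B are assumed to be device-resident already):

    #include <cublas.h>

    /* Solve A * X = B in place, where dA is an n x n upper-triangular
     * double-complex matrix (lda = n) and dB an n x nrhs right-hand-side
     * matrix (ldb = n); B is overwritten with X. */
    void solve_upper(int n, int nrhs, const cuDoubleComplex *dA, cuDoubleComplex *dB)
    {
        cuDoubleComplex one = make_cuDoubleComplex(1.0, 0.0);

        cublasZtrsm('L', 'U', 'N', 'N', n, nrhs, one, dA, n, dB, n);
        /* cublasGetError() reports CUBLAS_STATUS_ARCH_MISMATCH on devices
         * without double precision support. */
    }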
+
+/*
+ * void
 * cublasDtrmm (char side, char uplo, char transa, char diag, int m, int n,
 *              double alpha, const double *A, int lda, const double *B, int ldb)
 *
 * performs one of the matrix-matrix operations
 *
 *    B = alpha * op(A) * B,  or  B = alpha * B * op(A)
 *
 * where alpha is a double-precision scalar, B is an m x n matrix composed
 * of double precision elements, and A is a unit or non-unit, upper or lower,
 * triangular matrix composed of double precision elements. op(A) is one of

skipping to change at line 3813 (old) / 4023 (new)
 * CUBLAS_STATUS_INVALID_VALUE    if n < 0 or k < 0
 * CUBLAS_STATUS_ARCH_MISMATCH    if invoked on device without DP support
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasDsyrk (char uplo, char trans, int n, int k,
                            double alpha, const double *A, int lda,
                            double beta, double *C, int ldc);
/*
 * void
+ * cublasZsyrk (char uplo, char trans, int n, int k, cuDoubleComplex alpha,
+ *              const cuDoubleComplex *A, int lda, cuDoubleComplex beta, cuDoubleComplex *C, int ldc)
+ *
+ * performs one of the symmetric rank k operations
+ *
+ *    C = alpha * A * transpose(A) + beta * C, or
+ *    C = alpha * transpose(A) * A + beta * C.
+ *
+ * Alpha and beta are double-complex scalars. C is an n x n symmetric matrix
+ * consisting of double-complex elements and stored in either lower or
+ * upper storage mode. A is a matrix consisting of double-complex elements
+ * with dimension of n x k in the first case, and k x n in the second case.
+ *
+ * Input
+ * -----
+ * uplo   specifies whether the symmetric matrix C is stored in upper or lower
+ *        storage mode as follows. If uplo == 'U' or 'u', only the upper
+ *        triangular part of the symmetric matrix is to be referenced, and the
+ *        elements of the strictly lower triangular part are to be inferred from
+ *        those in the upper triangular part. If uplo == 'L' or 'l', only the
+ *        lower triangular part of the symmetric matrix is to be referenced,
+ *        and the elements of the strictly upper triangular part are to be
+ *        inferred from those in the lower triangular part.
+ * trans  specifies the operation to be performed. If trans == 'N' or 'n', C =
+ *        alpha * A * transpose(A) + beta * C. If trans == 'T', 't', 'C', or 'c',
+ *        C = alpha * transpose(A) * A + beta * C.
+ * n      specifies the number of rows and the number of columns of matrix C. If
+ *        trans == 'N' or 'n', n specifies the number of rows of matrix A. If
+ *        trans == 'T', 't', 'C', or 'c', n specifies the number of columns of
+ *        matrix A. n must be at least zero.
+ * k      If trans == 'N' or 'n', k specifies the number of columns of matrix A.
+ *        If trans == 'T', 't', 'C', or 'c', k specifies the number of rows of
+ *        matrix A. k must be at least zero.
+ * alpha  double-complex scalar multiplier applied to A * transpose(A) or
+ *        transpose(A) * A.
+ * A      double-complex array of dimensions (lda, ka), where ka is k when
+ *        trans == 'N' or 'n', and is n otherwise. When trans == 'N' or 'n',
+ *        the leading n x k part of array A must contain the matrix A,
+ *        otherwise the leading k x n part of the array must contain the
+ *        matrix A.
+ * lda    leading dimension of A. When trans == 'N' or 'n' then lda must be at
+ *        least max(1, n). Otherwise lda must be at least max(1, k).
+ * beta   double-complex scalar multiplier applied to C. If beta is zero, C
+ *        does not have to be a valid input.
+ * C      double-complex array of dimensions (ldc, n). If uplo == 'U' or 'u',
+ *        the leading n x n triangular part of the array C must contain the
+ *        upper triangular part of the symmetric matrix C and the strictly
+ *        lower triangular part of C is not referenced. On exit, the upper
+ *        triangular part of C is overwritten by the upper triangular part of
+ *        the updated matrix. If uplo == 'L' or 'l', the leading n x n
+ *        triangular part of the array C must contain the lower triangular part
+ *        of the symmetric matrix C and the strictly upper triangular part of C
+ *        is not referenced. On exit, the lower triangular part of C is
+ *        overwritten by the lower triangular part of the updated matrix.
+ * ldc    leading dimension of C. It must be at least max(1, n).
+ *
+ * Output
+ * ------
+ * C      updated according to C = alpha * A * transpose(A) + beta * C, or C =
+ *        alpha * transpose(A) * A + beta * C
+ *
+ * Reference: http://www.netlib.org/blas/zsyrk.f
+ *
+ * Error status for this function can be retrieved via cublasGetError().
+ *
+ * Error Status
+ * ------------
+ * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
+ * CUBLAS_STATUS_INVALID_VALUE    if n < 0 or k < 0
+ * CUBLAS_STATUS_ARCH_MISMATCH    if invoked on device without DP support
+ * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
+ */
+void CUBLASAPI cublasZsyrk (char uplo, char trans, int n, int k,
+                            cuDoubleComplex alpha,
+                            const cuDoubleComplex *A, int lda,
+                            cuDoubleComplex beta,
+                            cuDoubleComplex *C, int ldc);
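A hedged call-site sketch (rank_k_update is a hypothetical helper name):

    #include <cublas.h>

    /* Rank-k update C := A * transpose(A) + C for an n x n symmetric
     * double-complex C (upper storage, ldc = n) and an n x k factor A
     * (lda = n), both already on the device. */
    void rank_k_update(int n, int k, const cuDoubleComplex *dA, cuDoubleComplex *dC)
    {
        cuDoubleComplex one = make_cuDoubleComplex(1.0, 0.0);

        cublasZsyrk('U', 'N', n, k, one, dA, n, one, dC, n);
    }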
+
+/*
+ * void
 * cublasDsyr2k (char uplo, char trans, int n, int k, double alpha,
 *               const double *A, int lda, const double *B, int ldb,
 *               double beta, double *C, int ldc)
 *
 * performs one of the symmetric rank 2k operations
 *
 *    C = alpha * A * transpose(B) + alpha * B * transpose(A) + beta * C, or
 *    C = alpha * transpose(A) * B + alpha * transpose(B) * A + beta * C.
 *
 * Alpha and beta are double precision scalars. C is an n x n symmetric matrix

End of changes. 5 change blocks.
1 line changed or deleted, 354 lines changed or added


cuda.h

/*
- * Copyright 1993-2008 NVIDIA Corporation. All rights reserved.
+ * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws. Users and possessors of this source code
 * are hereby granted a nonexclusive, royalty-free license to use this code
 * in individual and commercial software.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR

skipping to change at line 36 (old) / 36 (new)
 * and is provided to the U.S. Government only as a commercial end item.
 * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
 * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
 * source code with only those rights set forth herein.
 *
 * Any use of this source code in individual and commercial software must
 * include, in the user documentation and internal comments to the code,
 * the above Disclaimer and U.S. Government End Users Notice.
 */
-// --------------------------------------------------------------------------
-//
-// Main public header file for the CompUte Device Api
-//
-// --------------------------------------------------------------------------

#ifndef __cuda_cuda_h__
#define __cuda_cuda_h__

-/* CUDA API version number */
-#define CUDA_VERSION 2010 /* 2.1 */
+#include <stdlib.h>
+
+/**
+ * \file
+ * \name Data types used by CUDA driver
+ * \author NVIDIA Corporation
+ * \brief Data types used by CUDA driver
+ */
+
+/**
+ * \defgroup CUDA_TYPES Data types used by CUDA driver
+ * \ingroup CUDA_DRIVER
+ * @{
+ */
+
+/**
+ * CUDA API version number
+ */
+#define CUDA_VERSION 2020 /* 2.2 */

#ifdef __cplusplus
extern "C" {
#endif
typedef unsigned int CUdeviceptr;       ///< CUDA device pointer

typedef int CUdevice;                   ///< CUDA device
typedef struct CUctx_st *CUcontext;     ///< CUDA context
typedef struct CUmod_st *CUmodule;      ///< CUDA module
typedef struct CUfunc_st *CUfunction;   ///< CUDA function
typedef struct CUarray_st *CUarray;     ///< CUDA array
typedef struct CUtexref_st *CUtexref;   ///< CUDA texture reference
typedef struct CUevent_st *CUevent;     ///< CUDA event
typedef struct CUstream_st *CUstream;   ///< CUDA stream

/************************************
 **
 **    Enums
 **
 ***********************************/
/**
 * Context creation flags
 */
typedef enum CUctx_flags_enum {
    CU_CTX_SCHED_AUTO    = 0,   ///< Automatic scheduling
    CU_CTX_SCHED_SPIN    = 1,   ///< Set spin as default scheduling
    CU_CTX_SCHED_YIELD   = 2,   ///< Set yield as default scheduling
    CU_CTX_SCHED_MASK    = 0x3,
+   CU_CTX_BLOCKING_SYNC = 4,   ///< Use blocking synchronization
+   CU_CTX_MAP_HOST      = 8,   ///< Support mapped pinned allocations
-   CU_CTX_FLAGS_MASK    = CU_CTX_SCHED_MASK
+   CU_CTX_FLAGS_MASK    = 0xf,
} CUctx_flags;
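For orientation, these flags are ORed together and passed to cuCtxCreate; in particular the new CU_CTX_MAP_HOST must be given at context creation for the cuMemHostAlloc mapping path further down to work. A minimal sketch (error handling mostly elided):

    #include <cuda.h>

    int main(void)
    {
        CUdevice dev;
        CUcontext ctx;

        cuInit(0);
        cuDeviceGet(&dev, 0);

        /* Yield the CPU while waiting, and enable mapped pinned allocations. */
        if (cuCtxCreate(&ctx, CU_CTX_SCHED_YIELD | CU_CTX_MAP_HOST, dev) != CUDA_SUCCESS)
            return 1;

        /* ... work ... */
        cuCtxDetach(ctx);
        return 0;
    }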
+/**
+ * Event creation flags
+ */
+typedef enum CUevent_flags_enum {
+    CU_EVENT_DEFAULT       = 0, ///< Default event flag
+    CU_EVENT_BLOCKING_SYNC = 1, ///< Event uses blocking synchronization
+} CUevent_flags;

/**
 * Array formats
 */
typedef enum CUarray_format_enum {
    CU_AD_FORMAT_UNSIGNED_INT8  = 0x01, ///< Unsigned 8-bit integers
    CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, ///< Unsigned 16-bit integers
    CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, ///< Unsigned 32-bit integers
    CU_AD_FORMAT_SIGNED_INT8    = 0x08, ///< Signed 8-bit integers
    CU_AD_FORMAT_SIGNED_INT16   = 0x09, ///< Signed 16-bit integers
    CU_AD_FORMAT_SIGNED_INT32   = 0x0a, ///< Signed 32-bit integers
    CU_AD_FORMAT_HALF           = 0x10, ///< 16-bit floating point
    CU_AD_FORMAT_FLOAT          = 0x20  ///< 32-bit floating point
} CUarray_format;
/**
 * Texture reference addressing modes
 */
typedef enum CUaddress_mode_enum {
    CU_TR_ADDRESS_MODE_WRAP   = 0,  ///< Wrapping address mode
    CU_TR_ADDRESS_MODE_CLAMP  = 1,  ///< Clamp to edge address mode
    CU_TR_ADDRESS_MODE_MIRROR = 2,  ///< Mirror address mode
} CUaddress_mode;

/**
 * Texture reference filtering modes
 */
typedef enum CUfilter_mode_enum {
    CU_TR_FILTER_MODE_POINT  = 0,   ///< Point filter mode
    CU_TR_FILTER_MODE_LINEAR = 1    ///< Linear filter mode
} CUfilter_mode;
/**
 * Device properties
 */
typedef enum CUdevice_attribute_enum {
    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1,        ///< Maximum number of threads per block
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,              ///< Maximum block dimension X
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3,              ///< Maximum block dimension Y
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4,              ///< Maximum block dimension Z
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5,               ///< Maximum grid dimension X
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6,               ///< Maximum grid dimension Y
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7,               ///< Maximum grid dimension Z
    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8,  ///< Maximum shared memory available per block in bytes
    CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8,      ///< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK
    CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9,        ///< Memory available on device for __constant__ variables in a CUDA C kernel in bytes
    CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,                   ///< Warp size in threads
    CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11,                   ///< Maximum pitch in bytes allowed by memory copies
    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12,     ///< Maximum number of 32-bit registers available per block
    CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12,         ///< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK
    CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,                  ///< Peak clock frequency in kilohertz
    CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14,           ///< Alignment requirement for textures
    CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,                 ///< Device can possibly copy memory and execute a kernel concurrently
    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16,        ///< Number of multiprocessors on device
    CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17,         ///< Specifies whether there is a run time limit on kernels
+   CU_DEVICE_ATTRIBUTE_INTEGRATED = 18,                  ///< Device is integrated with host memory
+   CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19,         ///< Device can map host memory into CUDA address space
+   CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20                 ///< Compute mode (See ::CUcomputemode for details)
} CUdevice_attribute;
/**
 * Legacy device properties
 */
typedef struct CUdevprop_st {
    int maxThreadsPerBlock;     ///< Maximum number of threads per block
    int maxThreadsDim[3];       ///< Maximum size of each dimension of a block
    int maxGridSize[3];         ///< Maximum size of each dimension of a grid
    int sharedMemPerBlock;      ///< Shared memory available per block in bytes
    int totalConstantMemory;    ///< Constant memory available on device in bytes
    int SIMDWidth;              ///< Warp size in threads
    int memPitch;               ///< Maximum pitch in bytes allowed by memory copies
    int regsPerBlock;           ///< 32-bit registers available per block
    int clockRate;              ///< Clock frequency in kilohertz
    int textureAlign;           ///< Alignment requirement for textures
} CUdevprop;
+/**
+ * Function properties
+ */
+typedef enum CUfunction_attribute_enum {
+    /**
+     * The number of threads beyond which a launch of the function would fail.
+     * This number depends on both the function and the device on which the
+     * function is currently loaded.
+     */
+    CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
+
+    /**
+     * The size in bytes of statically-allocated shared memory required by
+     * this function. This does not include dynamically-allocated shared
+     * memory requested by the user at runtime.
+     */
+    CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1,
+
+    /**
+     * The size in bytes of user-allocated constant memory required by this
+     * function.
+     */
+    CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2,
+
+    /**
+     * The size in bytes of thread local memory used by this function.
+     */
+    CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3,
+
+    /**
+     * The number of registers used by each thread of this function.
+     */
+    CU_FUNC_ATTRIBUTE_NUM_REGS = 4,
+
+    CU_FUNC_ATTRIBUTE_MAX
+} CUfunction_attribute;
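These per-kernel attributes are read back with cuFuncGetAttribute, which 2.2 declares further down in this header (outside this excerpt). A hedged sketch (print_kernel_limits is a hypothetical helper; f would come from cuModuleGetFunction on a loaded module):

    #include <cuda.h>
    #include <stdio.h>

    void print_kernel_limits(CUfunction f)
    {
        int maxThreads, numRegs;

        cuFuncGetAttribute(&maxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, f);
        cuFuncGetAttribute(&numRegs,    CU_FUNC_ATTRIBUTE_NUM_REGS, f);
        printf("max threads/block: %d, regs/thread: %d\n", maxThreads, numRegs);
    }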
/**
 * Memory types
 */
typedef enum CUmemorytype_enum {
    CU_MEMORYTYPE_HOST   = 0x01,    ///< Host memory
    CU_MEMORYTYPE_DEVICE = 0x02,    ///< Device memory
    CU_MEMORYTYPE_ARRAY  = 0x03     ///< Array memory
} CUmemorytype;
+/**
+ * Compute Modes
+ */
+typedef enum CUcomputemode_enum {
+    CU_COMPUTEMODE_DEFAULT    = 0,  ///< Default compute mode (Multiple contexts allowed per device)
+    CU_COMPUTEMODE_EXCLUSIVE  = 1,  ///< Compute-exclusive mode (Only one context can be present on this device at a time)
+    CU_COMPUTEMODE_PROHIBITED = 2   ///< Compute-prohibited mode (No contexts can be created on this device at this time)
+} CUcomputemode;

/**
 * Online compiler options
 */
typedef enum CUjit_option_enum
{
    /**
     * Max number of registers that a thread may use.
     */
    CU_JIT_MAX_REGISTERS = 0,

    /**
     * IN: Specifies minimum number of threads per block to target compilation
     * for\n
     * OUT: Returns the number of threads the compiler actually targeted.
     * This restricts the resource utilization of the compiler (e.g. max
     * registers) such that a block with the given number of threads should be
     * able to launch based on register limitations. Note, this option does not
     * currently take into account any other resource limitations, such as
     * shared memory utilization.
     */
    CU_JIT_THREADS_PER_BLOCK,

    /**
     * Returns a float value in the option of the wall clock time, in
     * milliseconds, spent creating the cubin
     */
    CU_JIT_WALL_TIME,

    /**
     * Pointer to a buffer in which to print any log messages from PTXAS
     * that are informational in nature
     */
    CU_JIT_INFO_LOG_BUFFER,

    /**
     * IN: Log buffer size in bytes.  Log messages will be capped at this size
     * (including null terminator)\n
     * OUT: Amount of log buffer filled with messages
     */
    CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,

    /**
     * Pointer to a buffer in which to print any log messages from PTXAS that
     * reflect errors
     */
    CU_JIT_ERROR_LOG_BUFFER,

    /**
     * IN: Log buffer size in bytes.  Log messages will be capped at this size
     * (including null terminator)\n
     * OUT: Amount of log buffer filled with messages
     */
    CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,

    /**
     * Level of optimizations to apply to generated code (0 - 4), with 4
     * being the default and highest level of optimizations.
     */
    CU_JIT_OPTIMIZATION_LEVEL,

    /**
     * No option value required. Determines the target based on the current
     * attached context (default)
     */
    CU_JIT_TARGET_FROM_CUCONTEXT,

    /**
     * Target is chosen based on supplied CUjit_target_enum.
     */
    CU_JIT_TARGET,

    /**
     * Specifies choice of fallback strategy if matching cubin is not found.
     * Choice is based on supplied CUjit_fallback_enum.
     */
    CU_JIT_FALLBACK_STRATEGY

} CUjit_option;
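A hedged sketch of feeding these options to the JIT through cuModuleLoadDataEx (load_ptx is a hypothetical helper; note that option values travel in the void* array, with plain integers cast into the pointer value):

    #include <cuda.h>

    /* JIT-compile a NUL-terminated PTX image held in memory, capturing the
     * informational PTXAS log. */
    CUresult load_ptx(CUmodule *mod, const void *ptx)
    {
        char infoLog[4096];

        CUjit_option opts[] = { CU_JIT_INFO_LOG_BUFFER,
                                CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES };
        void *vals[]        = { infoLog,
                                (void *)(size_t)sizeof(infoLog) };

        return cuModuleLoadDataEx(mod, ptx, 2, opts, vals);
    }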
/**
 * Online compilation targets
 */
typedef enum CUjit_target_enum
{
    CU_TARGET_COMPUTE_10 = 0,   ///< Compute device class 1.0
    CU_TARGET_COMPUTE_11,       ///< Compute device class 1.1
    CU_TARGET_COMPUTE_12,       ///< Compute device class 1.2
    CU_TARGET_COMPUTE_13        ///< Compute device class 1.3
} CUjit_target;

/**
 * Cubin matching fallback strategies
 */
typedef enum CUjit_fallback_enum
{
    /** Prefer to compile ptx */
    CU_PREFER_PTX = 0,

    /** Prefer to fall back to compatible binary code */
    CU_PREFER_BINARY

} CUjit_fallback;
/************************************
 **
 **    Error codes
 **
 ***********************************/

/**
 * Error codes
 */
typedef enum cudaError_enum {

    CUDA_SUCCESS                    = 0,        ///< No errors
    CUDA_ERROR_INVALID_VALUE        = 1,        ///< Invalid value
    CUDA_ERROR_OUT_OF_MEMORY        = 2,        ///< Out of memory
    CUDA_ERROR_NOT_INITIALIZED      = 3,        ///< Driver not initialized
    CUDA_ERROR_DEINITIALIZED        = 4,        ///< Driver deinitialized

    CUDA_ERROR_NO_DEVICE            = 100,      ///< No CUDA-capable device available
    CUDA_ERROR_INVALID_DEVICE       = 101,      ///< Invalid device

    CUDA_ERROR_INVALID_IMAGE        = 200,      ///< Invalid kernel image
    CUDA_ERROR_INVALID_CONTEXT      = 201,      ///< Invalid context
    CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202,   ///< Context already current
    CUDA_ERROR_MAP_FAILED           = 205,      ///< Map failed
    CUDA_ERROR_UNMAP_FAILED         = 206,      ///< Unmap failed
    CUDA_ERROR_ARRAY_IS_MAPPED      = 207,      ///< Array is mapped
    CUDA_ERROR_ALREADY_MAPPED       = 208,      ///< Already mapped
    CUDA_ERROR_NO_BINARY_FOR_GPU    = 209,      ///< No binary for GPU
    CUDA_ERROR_ALREADY_ACQUIRED     = 210,      ///< Already acquired
    CUDA_ERROR_NOT_MAPPED           = 211,      ///< Not mapped

    CUDA_ERROR_INVALID_SOURCE       = 300,      ///< Invalid source
    CUDA_ERROR_FILE_NOT_FOUND       = 301,      ///< File not found

    CUDA_ERROR_INVALID_HANDLE       = 400,      ///< Invalid handle

    CUDA_ERROR_NOT_FOUND            = 500,      ///< Not found

    CUDA_ERROR_NOT_READY            = 600,      ///< CUDA not ready

    CUDA_ERROR_LAUNCH_FAILED        = 700,      ///< Launch failed
    CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701,   ///< Launch exceeded resources
    CUDA_ERROR_LAUNCH_TIMEOUT       = 702,      ///< Launch exceeded timeout
    CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703, ///< Launch with incompatible texturing

    CUDA_ERROR_UNKNOWN              = 999       ///< Unknown error
} CUresult;
+/**
+ * If set, host memory is portable between CUDA contexts.
+ * Flag for ::cuMemHostAlloc()
+ */
+#define CU_MEMHOSTALLOC_PORTABLE        0x01
+
+/**
+ * If set, host memory is mapped into CUDA address space and
+ * ::cuMemHostGetDevicePointer() may be called on the host pointer.
+ * Flag for ::cuMemHostAlloc()
+ */
+#define CU_MEMHOSTALLOC_DEVICEMAP       0x02
+
+/**
+ * If set, host memory is allocated as write-combined - fast to write,
+ * faster to DMA, slow to read except via SSE4 streaming load instruction
+ * (MOVNTDQA).
+ * Flag for ::cuMemHostAlloc()
+ */
+#define CU_MEMHOSTALLOC_WRITECOMBINED   0x04
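Taken together with CU_CTX_MAP_HOST above, these flags enable zero-copy allocations through the new cuMemHostAlloc declared further down. A hedged sketch (alloc_mapped is a hypothetical helper; assumes the context was created with CU_CTX_MAP_HOST on a device reporting CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY):

    #include <cuda.h>

    /* Allocate pinned host memory that the GPU can address directly. */
    CUresult alloc_mapped(size_t bytes, void **host, CUdeviceptr *dev)
    {
        CUresult rc = cuMemHostAlloc(host, bytes,
                                     CU_MEMHOSTALLOC_DEVICEMAP |
                                     CU_MEMHOSTALLOC_PORTABLE);
        if (rc != CUDA_SUCCESS)
            return rc;

        /* The Flags argument of cuMemHostGetDevicePointer is currently 0. */
        return cuMemHostGetDevicePointer(dev, *host, 0);
    }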
+/**
+ * 2D memory copy parameters
+ */
+typedef struct CUDA_MEMCPY2D_st {
+
+    unsigned int srcXInBytes,   ///< Source X in bytes
+                 srcY;          ///< Source Y
+    CUmemorytype srcMemoryType; ///< Source memory type (host, device, array)
+    const void *srcHost;        ///< Source host pointer
+    CUdeviceptr srcDevice;      ///< Source device pointer
+    CUarray srcArray;           ///< Source array reference
+    unsigned int srcPitch;      ///< Source pitch (ignored when src is array)
+
+    unsigned int dstXInBytes,   ///< Destination X in bytes
+                 dstY;          ///< Destination Y
+    CUmemorytype dstMemoryType; ///< Destination memory type (host, device, array)
+    void *dstHost;              ///< Destination host pointer
+    CUdeviceptr dstDevice;      ///< Destination device pointer
+    CUarray dstArray;           ///< Destination array reference
+    unsigned int dstPitch;      ///< Destination pitch (ignored when dst is array)
+
+    unsigned int WidthInBytes;  ///< Width of 2D memory copy in bytes
+    unsigned int Height;        ///< Height of 2D memory copy
+} CUDA_MEMCPY2D;
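A hedged sketch of filling this struct for a host-to-device pitched copy with cuMemcpy2D (upload_2d is a hypothetical helper; unused members are zeroed first):

    #include <cuda.h>
    #include <string.h>

    /* Copy a w x h block of floats from host memory into a pitched device
     * allocation, e.g. one obtained from cuMemAllocPitch. */
    CUresult upload_2d(CUdeviceptr dst, unsigned int dstPitch,
                       const float *src, unsigned int w, unsigned int h)
    {
        CUDA_MEMCPY2D cp;
        memset(&cp, 0, sizeof(cp));

        cp.srcMemoryType = CU_MEMORYTYPE_HOST;
        cp.srcHost       = src;
        cp.srcPitch      = w * sizeof(float);

        cp.dstMemoryType = CU_MEMORYTYPE_DEVICE;
        cp.dstDevice     = dst;
        cp.dstPitch      = dstPitch;

        cp.WidthInBytes  = w * sizeof(float);
        cp.Height        = h;

        return cuMemcpy2D(&cp);
    }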
+/**
+ * 3D memory copy parameters
+ */
+typedef struct CUDA_MEMCPY3D_st {
+
+    unsigned int srcXInBytes,   ///< Source X in bytes
+                 srcY,          ///< Source Y
+                 srcZ;          ///< Source Z
+    unsigned int srcLOD;        ///< Source LOD
+    CUmemorytype srcMemoryType; ///< Source memory type (host, device, array)
+    const void *srcHost;        ///< Source host pointer
+    CUdeviceptr srcDevice;      ///< Source device pointer
+    CUarray srcArray;           ///< Source array reference
+    void *reserved0;            ///< Must be NULL
+    unsigned int srcPitch;      ///< Source pitch (ignored when src is array)
+    unsigned int srcHeight;     ///< Source height (ignored when src is array; may be 0 if Depth==1)
+
+    unsigned int dstXInBytes,   ///< Destination X in bytes
+                 dstY,          ///< Destination Y
+                 dstZ;          ///< Destination Z
+    unsigned int dstLOD;        ///< Destination LOD
+    CUmemorytype dstMemoryType; ///< Destination memory type (host, device, array)
+    void *dstHost;              ///< Destination host pointer
+    CUdeviceptr dstDevice;      ///< Destination device pointer
+    CUarray dstArray;           ///< Destination array reference
+    void *reserved1;            ///< Must be NULL
+    unsigned int dstPitch;      ///< Destination pitch (ignored when dst is array)
+    unsigned int dstHeight;     ///< Destination height (ignored when dst is array; may be 0 if Depth==1)
+
+    unsigned int WidthInBytes;  ///< Width of 3D memory copy in bytes
+    unsigned int Height;        ///< Height of 3D memory copy
+    unsigned int Depth;         ///< Depth of 3D memory copy
+} CUDA_MEMCPY3D;
+/**
+ * Array descriptor
+ */
+typedef struct
+{
+    unsigned int Width;         ///< Width of array
+    unsigned int Height;        ///< Height of array
+
+    CUarray_format Format;      ///< Array format
+
+    unsigned int NumChannels;   ///< Channels per array element
+} CUDA_ARRAY_DESCRIPTOR;
+
+/**
+ * 3D array descriptor
+ */
+typedef struct
+{
+    unsigned int Width;         ///< Width of 3D array
+    unsigned int Height;        ///< Height of 3D array
+    unsigned int Depth;         ///< Depth of 3D array
+
+    CUarray_format Format;      ///< Array format
+
+    unsigned int NumChannels;   ///< Channels per array element
+
+    unsigned int Flags;         ///< Flags
+} CUDA_ARRAY3D_DESCRIPTOR;
+/**
+ * Override the texref format with a format inferred from the array.
+ * Flag for ::cuTexRefSetArray()
+ */
+#define CU_TRSA_OVERRIDE_FORMAT 0x01
+
+/**
+ * Read the texture as integers rather than promoting the values to floats
+ * in the range [0,1].
+ * Flag for ::cuTexRefSetFlags()
+ */
+#define CU_TRSF_READ_AS_INTEGER         0x01
+
+/**
+ * Use normalized texture coordinates in the range [0,1) instead of [0,dim).
+ * Flag for ::cuTexRefSetFlags()
+ */
+#define CU_TRSF_NORMALIZED_COORDINATES  0x02
+
+/**
+ * For texture references loaded into the module, use default texunit from
+ * texture reference.
+ */
+#define CU_PARAM_TR_DEFAULT -1
+
+/** @} */
+/** @} */ /* END CUDA_TYPES */
#ifdef _WIN32
#define CUDAAPI __stdcall
#else
#define CUDAAPI
#endif

    /*********************************
     ** Initialization
     *********************************/
    CUresult  CUDAAPI cuInit(unsigned int Flags);
+    /*********************************
+     ** Driver Version Query
+     *********************************/
+    CUresult  CUDAAPI cuDriverGetVersion(int *driverVersion);
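This new entry point reports the installed driver's supported API version using the same encoding as CUDA_VERSION above, which lets an application detect, say, a 2.1 driver underneath a 2.2 toolkit. A minimal sketch:

    #include <cuda.h>
    #include <stdio.h>

    int main(void)
    {
        int v = 0;
        cuDriverGetVersion(&v);   /* e.g. 2020 for a CUDA 2.2 driver */
        printf("driver supports CUDA %d.%d\n", v / 1000, (v % 100) / 10);
        return 0;
    }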
    /************************************
     **
     **    Device management
     **
     ***********************************/

    CUresult  CUDAAPI cuDeviceGet(CUdevice *device, int ordinal);
    CUresult  CUDAAPI cuDeviceGetCount(int *count);
    CUresult  CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev);
    CUresult  CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev);

skipping to change at line 367 (old) / 602 (new)
// size of biggest r/w to be performe d by kernels on this memory // size of biggest r/w to be performe d by kernels on this memory
// 4, 8 or 16 bytes // 4, 8 or 16 bytes
unsigned int ElementSizeBytes unsigned int ElementSizeBytes
); );
CUresult CUDAAPI cuMemFree(CUdeviceptr dptr); CUresult CUDAAPI cuMemFree(CUdeviceptr dptr);
CUresult CUDAAPI cuMemGetAddressRange( CUdeviceptr *pbase, unsigned int *psize, CUdeviceptr dptr ); CUresult CUDAAPI cuMemGetAddressRange( CUdeviceptr *pbase, unsigned int *psize, CUdeviceptr dptr );
CUresult CUDAAPI cuMemAllocHost(void **pp, unsigned int bytesize); CUresult CUDAAPI cuMemAllocHost(void **pp, unsigned int bytesize);
CUresult CUDAAPI cuMemFreeHost(void *p); CUresult CUDAAPI cuMemFreeHost(void *p);
CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags );
CUresult CUDAAPI cuMemHostGetDevicePointer( CUdeviceptr *pdptr, void *p, unsigned int Flags );
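A hedged sketch of the new host-allocation path (not part of the header); the CU_MEMHOSTALLOC_DEVICEMAP flag is assumed from elsewhere in cuda.h, and N is illustrative:

void *hostBuf;
CUdeviceptr devAlias;
/* Pinned, device-mapped ("zero-copy") host memory: */
cuMemHostAlloc(&hostBuf, N * sizeof(float), CU_MEMHOSTALLOC_DEVICEMAP);
cuMemHostGetDevicePointer(&devAlias, hostBuf, 0); /* Flags is reserved, pass 0 */
/* Kernels may dereference devAlias; the CPU keeps using hostBuf. */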
/************************************ /************************************
** **
** Synchronous Memcpy ** Synchronous Memcpy
** **
 ** Intra-device memcpy's done with these functions may execute in parallel with the CPU, ** Intra-device memcpy's done with these functions may execute in parallel with the CPU,
 ** but if host memory is involved, they wait until the copy is done before returning. ** but if host memory is involved, they wait until the copy is done before returning.
** **
***********************************/ ***********************************/
// 1D functions // 1D functions
skipping to change at line 397 skipping to change at line 636
// system <-> array memory // system <-> array memory
CUresult CUDAAPI cuMemcpyHtoA( CUarray dstArray, unsigned int dstIndex, const void *pSrc, unsigned int ByteCount ); CUresult CUDAAPI cuMemcpyHtoA( CUarray dstArray, unsigned int dstIndex, const void *pSrc, unsigned int ByteCount );
CUresult CUDAAPI cuMemcpyAtoH( void *dstHost, CUarray srcArray, unsigned int srcIndex, unsigned int ByteCount ); CUresult CUDAAPI cuMemcpyAtoH( void *dstHost, CUarray srcArray, unsigned int srcIndex, unsigned int ByteCount );
// array <-> array memory // array <-> array memory
CUresult CUDAAPI cuMemcpyAtoA( CUarray dstArray, unsigned int dstIndex, CUarray srcArray, unsigned int srcIndex, unsigned int ByteCount ); CUresult CUDAAPI cuMemcpyAtoA( CUarray dstArray, unsigned int dstIndex, CUarray srcArray, unsigned int srcIndex, unsigned int ByteCount );
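For illustration only, a sketch of the host/array path; hArray is a hypothetical CUarray from cuArrayCreate():

float staging[256];
/* ... fill staging on the CPU ... */
cuMemcpyHtoA(hArray, 0, staging, sizeof(staging));   /* blocks until complete */
cuMemcpyAtoH(staging, hArray, 0, sizeof(staging));   /* read it back */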
// 2D memcpy // 2D memcpy
typedef struct CUDA_MEMCPY2D_st {
unsigned int srcXInBytes, srcY;
CUmemorytype srcMemoryType;
const void *srcHost;
CUdeviceptr srcDevice;
CUarray srcArray;
unsigned int srcPitch; // ignored when src is array
unsigned int dstXInBytes, dstY;
CUmemorytype dstMemoryType;
void *dstHost;
CUdeviceptr dstDevice;
CUarray dstArray;
unsigned int dstPitch; // ignored when dst is array
unsigned int WidthInBytes;
unsigned int Height;
} CUDA_MEMCPY2D;
CUresult CUDAAPI cuMemcpy2D( const CUDA_MEMCPY2D *pCopy ); CUresult CUDAAPI cuMemcpy2D( const CUDA_MEMCPY2D *pCopy );
CUresult CUDAAPI cuMemcpy2DUnaligned( const CUDA_MEMCPY2D *pCopy ); CUresult CUDAAPI cuMemcpy2DUnaligned( const CUDA_MEMCPY2D *pCopy );
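A minimal sketch of a 2D host-to-device copy using the descriptor above (not part of the header); hostImg, rowBytes, rows, devImg and devPitch are hypothetical, with devImg/devPitch as returned by cuMemAllocPitch():

CUDA_MEMCPY2D c2;
memset(&c2, 0, sizeof(c2));                 /* zero offsets and unused fields */
c2.srcMemoryType = CU_MEMORYTYPE_HOST;
c2.srcHost       = hostImg;
c2.srcPitch      = rowBytes;
c2.dstMemoryType = CU_MEMORYTYPE_DEVICE;
c2.dstDevice     = devImg;
c2.dstPitch      = devPitch;
c2.WidthInBytes  = rowBytes;
c2.Height        = rows;
cuMemcpy2D(&c2);  /* fall back to cuMemcpy2DUnaligned() if pitch rules are not met */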
// 3D memcpy // 3D memcpy
typedef struct CUDA_MEMCPY3D_st {
unsigned int srcXInBytes, srcY, srcZ;
unsigned int srcLOD;
CUmemorytype srcMemoryType;
const void *srcHost;
CUdeviceptr srcDevice;
CUarray srcArray;
void *reserved0; // must be NULL
unsigned int srcPitch; // ignored when src is array
unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1
unsigned int dstXInBytes, dstY, dstZ;
unsigned int dstLOD;
CUmemorytype dstMemoryType;
void *dstHost;
CUdeviceptr dstDevice;
CUarray dstArray;
void *reserved1; // must be NULL
unsigned int dstPitch; // ignored when dst is array
unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1
unsigned int WidthInBytes;
unsigned int Height;
unsigned int Depth;
} CUDA_MEMCPY3D;
CUresult CUDAAPI cuMemcpy3D( const CUDA_MEMCPY3D *pCopy ); CUresult CUDAAPI cuMemcpy3D( const CUDA_MEMCPY3D *pCopy );
/************************************ /************************************
** **
** Asynchronous Memcpy ** Asynchronous Memcpy
** **
 ** Any host memory involved must be DMA'able (e.g., allocated with cuMemAllocHost). ** Any host memory involved must be DMA'able (e.g., allocated with cuMemAllocHost).
 ** memcpy's done with these functions execute in parallel with the CPU and, if ** memcpy's done with these functions execute in parallel with the CPU and, if
 ** the hardware is available, may execute in parallel with the GPU. ** the hardware is available, may execute in parallel with the GPU.
 ** Asynchronous memcpy must be accompanied by appropriate stream synchronization. ** Asynchronous memcpy must be accompanied by appropriate stream synchronization.
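A hedged sketch of that contract (not part of the header); cuMemcpyHtoDAsync, cuStreamCreate and cuStreamSynchronize are assumed from the sections elided above, and devBuf/pinnedHost/bytes/kernel are hypothetical:

CUstream hStream;
cuStreamCreate(&hStream, 0);
cuMemcpyHtoDAsync(devBuf, pinnedHost, bytes, hStream); /* pinnedHost from cuMemAllocHost() */
cuLaunchGridAsync(kernel, gridW, gridH, hStream);      /* ordered after the copy */
cuStreamSynchronize(hStream);                          /* required before reusing pinnedHost */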
skipping to change at line 500 skipping to change at line 694
CUresult CUDAAPI cuMemsetD2D32( CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height ); CUresult CUDAAPI cuMemsetD2D32( CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height );
/************************************ /************************************
** **
** Function management ** Function management
** **
***********************************/ ***********************************/
CUresult CUDAAPI cuFuncSetBlockShape (CUfunction hfunc, int x, int y, int z); CUresult CUDAAPI cuFuncSetBlockShape (CUfunction hfunc, int x, int y, int z);
CUresult CUDAAPI cuFuncSetSharedSize (CUfunction hfunc, unsigned int bytes); CUresult CUDAAPI cuFuncSetSharedSize (CUfunction hfunc, unsigned int bytes);
CUresult CUDAAPI cuFuncGetAttribute (int *pi, CUfunction_attribute attrib, CUfunction hfunc);
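A hedged sketch for the new query (not part of the header); CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK is assumed from the CUfunction_attribute enum defined earlier in cuda.h:

int maxThreads = 0;
cuFuncGetAttribute(&maxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, hfunc);
cuFuncSetBlockShape(hfunc, maxThreads < 256 ? maxThreads : 256, 1, 1);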
/************************************ /************************************
** **
** Array management ** Array management
** **
***********************************/ ***********************************/
typedef struct
{
//
// dimensions
//
unsigned int Width;
unsigned int Height;
//
// format
//
CUarray_format Format;
// channels per array element
unsigned int NumChannels;
} CUDA_ARRAY_DESCRIPTOR;
CUresult CUDAAPI cuArrayCreate( CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray ); CUresult CUDAAPI cuArrayCreate( CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray );
CUresult CUDAAPI cuArrayGetDescriptor( CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray ); CUresult CUDAAPI cuArrayGetDescriptor( CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray );
CUresult CUDAAPI cuArrayDestroy( CUarray hArray ); CUresult CUDAAPI cuArrayDestroy( CUarray hArray );
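A minimal sketch (not part of the header): create and destroy a single-channel float 2D array; CU_AD_FORMAT_FLOAT is assumed from the CUarray_format enum:

CUDA_ARRAY_DESCRIPTOR ad;
ad.Width       = 1024;
ad.Height      = 768;
ad.Format      = CU_AD_FORMAT_FLOAT;
ad.NumChannels = 1;
CUarray hArray;
cuArrayCreate(&hArray, &ad);
/* ... bind to a texref, copy with cuMemcpyHtoA(), etc. ... */
cuArrayDestroy(hArray);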
typedef struct
{
//
// dimensions
//
unsigned int Width;
unsigned int Height;
unsigned int Depth;
//
// format
//
CUarray_format Format;
// channels per array element
unsigned int NumChannels;
//
// flags
//
unsigned int Flags;
} CUDA_ARRAY3D_DESCRIPTOR;
CUresult CUDAAPI cuArray3DCreate( CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray ); CUresult CUDAAPI cuArray3DCreate( CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray );
CUresult CUDAAPI cuArray3DGetDescriptor( CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray ); CUresult CUDAAPI cuArray3DGetDescriptor( CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray );
/************************************ /************************************
** **
** Texture reference management ** Texture reference management
** **
***********************************/ ***********************************/
CUresult CUDAAPI cuTexRefCreate( CUtexref *pTexRef ); CUresult CUDAAPI cuTexRefCreate( CUtexref *pTexRef );
CUresult CUDAAPI cuTexRefDestroy( CUtexref hTexRef ); CUresult CUDAAPI cuTexRefDestroy( CUtexref hTexRef );
CUresult CUDAAPI cuTexRefSetArray( CUtexref hTexRef, CUarray hArray, unsigned int Flags ); CUresult CUDAAPI cuTexRefSetArray( CUtexref hTexRef, CUarray hArray, unsigned int Flags );
// override the texref format with a format inferred from the array
#define CU_TRSA_OVERRIDE_FORMAT 0x01
CUresult CUDAAPI cuTexRefSetAddress( unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, unsigned int bytes ); CUresult CUDAAPI cuTexRefSetAddress( unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, unsigned int bytes );
CUresult CUDAAPI cuTexRefSetAddress2D( CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, unsigned int Pitch);
CUresult CUDAAPI cuTexRefSetFormat( CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents ); CUresult CUDAAPI cuTexRefSetFormat( CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents );
CUresult CUDAAPI cuTexRefSetAddressMode( CUtexref hTexRef, int dim, CUaddress_mode am ); CUresult CUDAAPI cuTexRefSetAddressMode( CUtexref hTexRef, int dim, CUaddress_mode am );
CUresult CUDAAPI cuTexRefSetFilterMode( CUtexref hTexRef, CUfilter_mode fm ); CUresult CUDAAPI cuTexRefSetFilterMode( CUtexref hTexRef, CUfilter_mode fm );
CUresult CUDAAPI cuTexRefSetFlags( CUtexref hTexRef, unsigned int Flags ); CUresult CUDAAPI cuTexRefSetFlags( CUtexref hTexRef, unsigned int Flags );
// read the texture as integers rather than promoting the values
// to floats in the range [0,1]
#define CU_TRSF_READ_AS_INTEGER 0x01
// use normalized texture coordinates in the range [0,1) instead of [0,dim)
#define CU_TRSF_NORMALIZED_COORDINATES 0x02
CUresult CUDAAPI cuTexRefGetAddress( CUdeviceptr *pdptr, CUtexref hTexRef ); CUresult CUDAAPI cuTexRefGetAddress( CUdeviceptr *pdptr, CUtexref hTexRef );
CUresult CUDAAPI cuTexRefGetArray( CUarray *phArray, CUtexref hTexRef ); CUresult CUDAAPI cuTexRefGetArray( CUarray *phArray, CUtexref hTexRef );
CUresult CUDAAPI cuTexRefGetAddressMode( CUaddress_mode *pam, CUtexref hTexRef, int dim ); CUresult CUDAAPI cuTexRefGetAddressMode( CUaddress_mode *pam, CUtexref hTexRef, int dim );
CUresult CUDAAPI cuTexRefGetFilterMode( CUfilter_mode *pfm, CUtexref hTexRef ); CUresult CUDAAPI cuTexRefGetFilterMode( CUfilter_mode *pfm, CUtexref hTexRef );
CUresult CUDAAPI cuTexRefGetFormat( CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef ); CUresult CUDAAPI cuTexRefGetFormat( CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef );
CUresult CUDAAPI cuTexRefGetFlags( unsigned int *pFlags, CUtexref hTexRef ); CUresult CUDAAPI cuTexRefGetFlags( unsigned int *pFlags, CUtexref hTexRef );
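A hedged sketch tying these together for linear memory (not part of the header); hTexRef would typically come from cuModuleGetTexRef() (assumed), and devBuf/bytes are hypothetical:

unsigned int byteOffset;
cuTexRefSetAddress(&byteOffset, hTexRef, devBuf, bytes);
cuTexRefSetFormat(hTexRef, CU_AD_FORMAT_FLOAT, 1);        /* one float per texel */
cuTexRefSetFilterMode(hTexRef, CU_TR_FILTER_MODE_POINT);  /* assumed enum value */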
/************************************ /************************************
** **
** Parameter management ** Parameter management
** **
***********************************/ ***********************************/
CUresult CUDAAPI cuParamSetSize (CUfunction hfunc, unsigned int numbytes); CUresult CUDAAPI cuParamSetSize (CUfunction hfunc, unsigned int numbytes);
CUresult CUDAAPI cuParamSeti (CUfunction hfunc, int offset, unsigned int value); CUresult CUDAAPI cuParamSeti (CUfunction hfunc, int offset, unsigned int value);
CUresult CUDAAPI cuParamSetf (CUfunction hfunc, int offset, float value); CUresult CUDAAPI cuParamSetf (CUfunction hfunc, int offset, float value);
CUresult CUDAAPI cuParamSetv (CUfunction hfunc, int offset, void * ptr, unsigned int numbytes); CUresult CUDAAPI cuParamSetv (CUfunction hfunc, int offset, void * ptr, unsigned int numbytes);
CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef); CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef);
// for texture references loaded into the module,
// use default texunit from texture reference
#define CU_PARAM_TR_DEFAULT -1
/************************************ /************************************
** **
** Launch functions ** Launch functions
** **
***********************************/ ***********************************/
CUresult CUDAAPI cuLaunch ( CUfunction f ); CUresult CUDAAPI cuLaunch ( CUfunction f );
CUresult CUDAAPI cuLaunchGrid (CUfunction f, int grid_width, int grid_height); CUresult CUDAAPI cuLaunchGrid (CUfunction f, int grid_width, int grid_height);
CUresult CUDAAPI cuLaunchGridAsync( CUfunction f, int grid_width, int grid_height, CUstream hStream ); CUresult CUDAAPI cuLaunchGridAsync( CUfunction f, int grid_width, int grid_height, CUstream hStream );
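A minimal sketch of the explicit driver-API launch sequence (not part of the header); hfunc, devBuf and n are hypothetical, and parameter alignment is the caller's responsibility:

int offset = 0;
cuParamSetv(hfunc, offset, &devBuf, sizeof(devBuf)); offset += sizeof(devBuf);
cuParamSeti(hfunc, offset, n);                       offset += sizeof(int);
cuParamSetSize(hfunc, offset);
cuFuncSetBlockShape(hfunc, 256, 1, 1);
cuLaunchGrid(hfunc, (n + 255) / 256, 1);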
 End of changes. 61 change blocks. 
278 lines changed or deleted 457 lines changed or added


 cudaGL.h   cudaGL.h 
/* /*
* Copyright 1993-2008 NVIDIA Corporation. All rights reserved. * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 50 skipping to change at line 50
extern "C" { extern "C" {
#endif #endif
CUresult CUDAAPI cuGLInit(void); CUresult CUDAAPI cuGLInit(void);
CUresult CUDAAPI cuGLCtxCreate( CUcontext *pCtx, unsigned int Flags, CUdevice device ); CUresult CUDAAPI cuGLCtxCreate( CUcontext *pCtx, unsigned int Flags, CUdevice device );
CUresult CUDAAPI cuGLRegisterBufferObject( GLuint bufferobj ); CUresult CUDAAPI cuGLRegisterBufferObject( GLuint bufferobj );
CUresult CUDAAPI cuGLMapBufferObject( CUdeviceptr *dptr, unsigned int *size, GLuint bufferobj ); CUresult CUDAAPI cuGLMapBufferObject( CUdeviceptr *dptr, unsigned int *size, GLuint bufferobj );
CUresult CUDAAPI cuGLUnmapBufferObject( GLuint bufferobj ); CUresult CUDAAPI cuGLUnmapBufferObject( GLuint bufferobj );
CUresult CUDAAPI cuGLUnregisterBufferObject( GLuint bufferobj ); CUresult CUDAAPI cuGLUnregisterBufferObject( GLuint bufferobj );
#if defined(_WIN32)
#if !defined(WGL_NV_gpu_affinity)
typedef void* HGPUNV;
#endif
CUresult CUDAAPI cuWGLGetDevice( CUdevice *pDevice, HGPUNV hGpu );
#endif
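For orientation, a minimal sketch of the buffer-object round trip (not part of the header), assuming a current GL context and an existing GLuint vbo:

cuGLInit();
cuGLRegisterBufferObject(vbo);
CUdeviceptr dptr;
unsigned int size;
cuGLMapBufferObject(&dptr, &size, vbo);   /* CUDA may now write into the VBO */
/* ... run kernels on dptr ... */
cuGLUnmapBufferObject(vbo);               /* hand the buffer back to GL */
cuGLUnregisterBufferObject(vbo);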
#ifdef __cplusplus #ifdef __cplusplus
}; };
#endif #endif
#endif #endif
 End of changes. 2 change blocks. 
1 lines changed or deleted 8 lines changed or added


 cuda_gl_interop.h   cuda_gl_interop.h 
/* /*
* Copyright 1993-2008 NVIDIA Corporation. All rights reserved. * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 72 skipping to change at line 72
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaGLSetGLDevice(int device); extern __host__ cudaError_t CUDARTAPI cudaGLSetGLDevice(int device);
extern __host__ cudaError_t CUDARTAPI cudaGLRegisterBufferObject(GLuint bufObj); extern __host__ cudaError_t CUDARTAPI cudaGLRegisterBufferObject(GLuint bufObj);
extern __host__ cudaError_t CUDARTAPI cudaGLMapBufferObject(void **devPtr, GLuint bufObj); extern __host__ cudaError_t CUDARTAPI cudaGLMapBufferObject(void **devPtr, GLuint bufObj);
extern __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObject(GLuint bufObj); extern __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObject(GLuint bufObj);
extern __host__ cudaError_t CUDARTAPI cudaGLUnregisterBufferObject(GLuint bufObj); extern __host__ cudaError_t CUDARTAPI cudaGLUnregisterBufferObject(GLuint bufObj);
#ifdef _WIN32
#ifndef WGL_NV_gpu_affinity
typedef void* HGPUNV;
#endif
extern __host__ cudaError_t CUDARTAPI cudaWGLGetDevice(int *device, HGPUNV hGpu);
#endif
#if defined(__cplusplus) #if defined(__cplusplus)
} }
#endif /* __cplusplus */ #endif /* __cplusplus */
#endif /* __CUDA_GL_INTEROP_H__ */ #endif /* __CUDA_GL_INTEROP_H__ */
 End of changes. 2 change blocks. 
1 lines changed or deleted 8 lines changed or added


 cuda_runtime.h   cuda_runtime.h 
/* /*
* Copyright 1993-2008 NVIDIA Corporation. All rights reserved. * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 77 skipping to change at line 77
#endif /* __CUDACC__ */ #endif /* __CUDACC__ */
#if defined(__cplusplus) #if defined(__cplusplus)
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
/**
 * \ingroup CUDART_HIGHLEVEL
 * \brief \hl Configure a device launch
 *
 * Pushes \p size bytes of the argument pointed to by \p arg at \p offset
 * bytes from the start of the parameter passing area, which starts at
 * offset 0. The arguments are stored in the top of the execution stack.
 * \ref ::cudaSetupArgument(T,size_t) "cudaSetupArgument()" must be preceded
 * by a call to ::cudaConfigureCall().
 *
 * \param arg    - Argument to push for a kernel launch
 * \param offset - Offset in argument stack to push new arg
 *
 * \return
 * ::cudaSuccess
 * \notefnerr
 *
 * \sa \ref ::cudaLaunch(T*) "cudaLaunch (C++ API)",
 * \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument (C API)",
 * ::cudaConfigureCall
 */
template<class T> template<class T>
__inline__ __host__ cudaError_t cudaSetupArgument( __inline__ __host__ cudaError_t cudaSetupArgument(
T arg, T arg,
size_t offset size_t offset
) )
{ {
return cudaSetupArgument((const void*)&arg, sizeof(T), offset); return cudaSetupArgument((const void*)&arg, sizeof(T), offset);
} }
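For illustration (not part of the header), a minimal sketch of the manual launch path this overload supports, equivalent to the kernel<<<grid, block>>>(devPtr) syntax; myKernel and devPtr are hypothetical:

dim3 grid(16, 16), block(256);
cudaConfigureCall(grid, block);
cudaSetupArgument(devPtr, 0);  /* overload above: pushes sizeof(devPtr) bytes at offset 0 */
cudaLaunch(myKernel);          /* templated cudaLaunch(T*), see the end of this file */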
#if defined(__CUDACC__) #if defined(__CUDACC__)
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
/**
* \addtogroup CUDART_HIGHLEVEL
* @{
*/
static __inline__ __host__ cudaError_t cudaMemcpyToSymbol( static __inline__ __host__ cudaError_t cudaMemcpyToSymbol(
char *symbol, char *symbol,
const void *src, const void *src,
size_t count, size_t count,
size_t offset = 0, size_t offset = 0,
enum cudaMemcpyKind kind = cudaMemcpyHostToDevice enum cudaMemcpyKind kind = cudaMemcpyHostToDevice
) )
{ {
return cudaMemcpyToSymbol((const char*)symbol, src, count, offset, kind); return cudaMemcpyToSymbol((const char*)symbol, src, count, offset, kind);
} }
skipping to change at line 204 skipping to change at line 230
} }
static __inline__ __host__ cudaError_t cudaGetSymbolAddress( static __inline__ __host__ cudaError_t cudaGetSymbolAddress(
void **devPtr, void **devPtr,
char *symbol char *symbol
) )
{ {
return cudaGetSymbolAddress(devPtr, (const char*)symbol); return cudaGetSymbolAddress(devPtr, (const char*)symbol);
} }
/**
 * \brief \hl Finds the address associated with a CUDA symbol
 *
 * Returns in \p *devPtr the address of symbol \p symbol on the device.
 * \p symbol can either be a variable that resides in global memory space, or
 * it can be a character string, naming a variable that resides in global
 * memory space. If \p symbol cannot be found, or if \p symbol is not declared
 * in the global memory space, \p *devPtr is unchanged and the error
 * ::cudaErrorInvalidSymbol is returned.
 *
 * \param devPtr - Return device pointer associated with symbol
 * \param symbol - Global variable or string symbol to search for
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorInvalidSymbol,
 * ::cudaErrorAddressOfConstant
 * \notefnerr
 *
 * \sa \ref ::cudaGetSymbolAddress(void**, const char*) "cudaGetSymbolAddress (C API)",
 * \ref ::cudaGetSymbolSize(size_t*, const T&) "cudaGetSymbolSize (C++ API)"
 */
template<class T> template<class T>
__inline__ __host__ cudaError_t cudaGetSymbolAddress( __inline__ __host__ cudaError_t cudaGetSymbolAddress(
void **devPtr, void **devPtr,
const T &symbol const T &symbol
) )
{ {
return cudaGetSymbolAddress(devPtr, (const char*)&symbol); return cudaGetSymbolAddress(devPtr, (const char*)&symbol);
} }
/************************************************************************** ***** /************************************************************************** *****
skipping to change at line 227 skipping to change at line 275
*************************************************************************** ****/ *************************************************************************** ****/
static __inline__ __host__ cudaError_t cudaGetSymbolSize( static __inline__ __host__ cudaError_t cudaGetSymbolSize(
size_t *size, size_t *size,
char *symbol char *symbol
) )
{ {
return cudaGetSymbolSize(size, (const char*)symbol); return cudaGetSymbolSize(size, (const char*)symbol);
} }
/**
 * \brief \hl Finds the size of the object associated with a CUDA symbol
 *
 * Returns in \p *size the size of symbol \p symbol. \p symbol can either be a
 * variable that resides in global or constant memory space, or it can be a
 * character string, naming a variable that resides in global or constant
 * memory space. If \p symbol cannot be found, or if \p symbol is not declared
 * in global or constant memory space, \p *size is unchanged and the error
 * ::cudaErrorInvalidSymbol is returned.
 *
 * \param size   - Size of object associated with symbol
 * \param symbol - Global variable or string symbol to find size of
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorInvalidSymbol
 * \notefnerr
 *
 * \sa \ref ::cudaGetSymbolAddress(void**, const T&) "cudaGetSymbolAddress (C++ API)",
 * \ref ::cudaGetSymbolSize(size_t*, const char*) "cudaGetSymbolSize (C API)"
 */
template<class T> template<class T>
__inline__ __host__ cudaError_t cudaGetSymbolSize( __inline__ __host__ cudaError_t cudaGetSymbolSize(
size_t *size, size_t *size,
const T &symbol const T &symbol
) )
{ {
return cudaGetSymbolSize(size, (const char*)&symbol); return cudaGetSymbolSize(size, (const char*)&symbol);
} }
/** @} */ /* END CUDART_MEMORY */
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
/**
* \addtogroup CUDART_HIGHLEVEL
*
* @{
*/
/**
 * \brief \hl Binds a memory area to a texture
 *
 * Binds \p size bytes of the memory area pointed to by \p devPtr to texture
 * reference \p tex. \p desc describes how the memory is interpreted when
 * fetching values from the texture. The \p offset parameter is an optional
 * byte offset as with the low-level
 * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture()"
 * function. Any memory previously bound to \p tex is unbound.
 *
 * \param offset - Offset in bytes
 * \param tex    - Texture to bind
 * \param devPtr - Memory area on device
 * \param desc   - Channel format
 * \param size   - Size of the memory area pointed to by devPtr
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorInvalidValue,
 * ::cudaErrorInvalidDevicePointer,
 * ::cudaErrorInvalidTexture
 * \notefnerr
 *
 * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
 * ::cudaGetChannelDesc, ::cudaGetTextureReference,
 * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)",
 * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
 * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
 * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, const struct cudaArray*, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
 * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, const struct cudaArray*) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
 * \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
 * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, dim, readMode >&) "cudaGetTextureAlignmentOffset (C++ API)"
 */
template<class T, int dim, enum cudaTextureReadMode readMode> template<class T, int dim, enum cudaTextureReadMode readMode>
__inline__ __host__ cudaError_t cudaBindTexture( __inline__ __host__ cudaError_t cudaBindTexture(
size_t *offset, size_t *offset,
const struct texture<T, dim, readMode> &tex, const struct texture<T, dim, readMode> &tex,
const void *devPtr, const void *devPtr,
const struct cudaChannelFormatDesc &desc, const struct cudaChannelFormatDesc &desc,
size_t size = UINT_MAX size_t size = UINT_MAX
) )
{ {
return cudaBindTexture(offset, &tex, devPtr, &desc, size); return cudaBindTexture(offset, &tex, devPtr, &desc, size);
} }
/**
 * \brief \hl Binds a memory area to a texture
 *
 * Binds \p size bytes of the memory area pointed to by \p devPtr to texture
 * reference \p tex. The channel descriptor is inherited from the texture
 * reference type. The \p offset parameter is an optional byte offset as with
 * the low-level
 * ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t)
 * function. Any memory previously bound to \p tex is unbound.
 *
 * \param offset - Offset in bytes
 * \param tex    - Texture to bind
 * \param devPtr - Memory area on device
 * \param size   - Size of the memory area pointed to by devPtr
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorInvalidValue,
 * ::cudaErrorInvalidDevicePointer,
 * ::cudaErrorInvalidTexture
 * \notefnerr
 *
 * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
 * ::cudaGetChannelDesc, ::cudaGetTextureReference,
 * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)",
 * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
 * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
 * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, const struct cudaArray*, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
 * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, const struct cudaArray*) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
 * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
 * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)"
 */
template<class T, int dim, enum cudaTextureReadMode readMode> template<class T, int dim, enum cudaTextureReadMode readMode>
__inline__ __host__ cudaError_t cudaBindTexture( __inline__ __host__ cudaError_t cudaBindTexture(
size_t *offset, size_t *offset,
const struct texture<T, dim, readMode> &tex, const struct texture<T, dim, readMode> &tex,
const void *devPtr, const void *devPtr,
size_t size = UINT_MAX size_t size = UINT_MAX
) )
{ {
return cudaBindTexture(offset, tex, devPtr, tex.channelDesc, size); return cudaBindTexture(offset, tex, devPtr, tex.channelDesc, size);
} }
/**
 * \brief \hl Binds a 2D memory area to a texture
 *
 * Binds the 2D memory area pointed to by \p devPtr to the
 * texture reference \p tex. The size of the area is constrained by
 * \p width in texel units, \p height in texel units, and \p pitch in byte
 * units. \p desc describes how the memory is interpreted when fetching values
 * from the texture. Any memory previously bound to \p tex is unbound.
 *
 * Since the hardware enforces an alignment requirement on texture base
 * addresses,
 * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D()"
 * returns in \p *offset a byte offset that
 * must be applied to texture fetches in order to read from the desired memory.
 * This offset must be divided by the texel size and passed to kernels that
 * read from the texture so they can be applied to the ::tex2D() function.
 * If the device memory pointer was returned from ::cudaMalloc(), the offset is
 * guaranteed to be 0 and NULL may be passed as the \p offset parameter.
 *
 * \param offset - Offset in bytes
 * \param tex    - Texture reference to bind
 * \param devPtr - 2D memory area on device
 * \param desc   - Channel format
 * \param width  - Width in texel units
 * \param height - Height in texel units
 * \param pitch  - Pitch in bytes
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorInvalidValue,
 * ::cudaErrorInvalidDevicePointer,
 * ::cudaErrorInvalidTexture
 * \notefnerr
 *
 * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
 * ::cudaGetChannelDesc, ::cudaGetTextureReference,
 * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
 * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
 * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)",
 * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, const struct cudaArray*, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
 * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, const struct cudaArray*) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
 * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
 * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)"
 */
template<class T, int dim, enum cudaTextureReadMode readMode>
__inline__ __host__ cudaError_t cudaBindTexture2D(
  size_t *offset,
  const struct texture<T, dim, readMode> &tex,
  const void *devPtr,
  const struct cudaChannelFormatDesc &desc,
  size_t width,
  size_t height,
  size_t pitch
)
{
  return cudaBindTexture2D(offset, &tex, devPtr, &desc, width, height, pitch);
}
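A hedged usage sketch for the new overload (not part of the header); tex2f is a hypothetical file-scope texture<float, 2, cudaReadModeElementType>, and width/height are illustrative:

float *devImg; size_t pitch;
cudaMallocPitch((void**)&devImg, &pitch, width * sizeof(float), height);
cudaChannelFormatDesc cf = cudaCreateChannelDesc<float>();
size_t offset;
cudaBindTexture2D(&offset, tex2f, devImg, cf, width, height, pitch);
/* offset is guaranteed 0 for cudaMalloc'd pointers, per the note above */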
/**
 * \brief \hl Binds an array to a texture
 *
 * Binds the CUDA array \p array to the texture reference \p tex.
 * \p desc describes how the memory is interpreted when fetching values from
 * the texture. Any CUDA array previously bound to \p tex is unbound.
 *
 * \param tex   - Texture to bind
 * \param array - Memory array on device
 * \param desc  - Channel format
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorInvalidValue,
 * ::cudaErrorInvalidDevicePointer,
 * ::cudaErrorInvalidTexture
 * \notefnerr
 *
 * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
 * ::cudaGetChannelDesc, ::cudaGetTextureReference,
 * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
 * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
 * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
 * \ref ::cudaBindTextureToArray(const struct textureReference*, const struct cudaArray*, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
 * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, const struct cudaArray*) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
 * \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
 * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, dim, readMode >&) "cudaGetTextureAlignmentOffset (C++ API)"
 */
template<class T, int dim, enum cudaTextureReadMode readMode> template<class T, int dim, enum cudaTextureReadMode readMode>
__inline__ __host__ cudaError_t cudaBindTextureToArray( __inline__ __host__ cudaError_t cudaBindTextureToArray(
const struct texture<T, dim, readMode> &tex, const struct texture<T, dim, readMode> &tex,
const struct cudaArray *array, const struct cudaArray *array,
const struct cudaChannelFormatDesc &desc const struct cudaChannelFormatDesc &desc
) )
{ {
return cudaBindTextureToArray(&tex, array, &desc); return cudaBindTextureToArray(&tex, array, &desc);
} }
/**
 * \brief \hl Binds an array to a texture
 *
 * Binds the CUDA array \p array to the texture reference \p tex.
 * The channel descriptor is inherited from the CUDA array. Any CUDA array
 * previously bound to \p tex is unbound.
 *
 * \param tex   - Texture to bind
 * \param array - Memory array on device
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorInvalidValue,
 * ::cudaErrorInvalidDevicePointer,
 * ::cudaErrorInvalidTexture
 * \notefnerr
 *
 * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
 * ::cudaGetChannelDesc, ::cudaGetTextureReference,
 * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
 * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
 * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
 * \ref ::cudaBindTextureToArray(const struct textureReference*, const struct cudaArray*, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
 * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, const struct cudaArray*, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
 * \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
 * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, dim, readMode >&) "cudaGetTextureAlignmentOffset (C++ API)"
 */
template<class T, int dim, enum cudaTextureReadMode readMode> template<class T, int dim, enum cudaTextureReadMode readMode>
__inline__ __host__ cudaError_t cudaBindTextureToArray( __inline__ __host__ cudaError_t cudaBindTextureToArray(
const struct texture<T, dim, readMode> &tex, const struct texture<T, dim, readMode> &tex,
const struct cudaArray *array const struct cudaArray *array
) )
{ {
struct cudaChannelFormatDesc desc; struct cudaChannelFormatDesc desc;
cudaError_t err = cudaGetChannelDesc(&desc, array); cudaError_t err = cudaGetChannelDesc(&desc, array);
return err == cudaSuccess ? cudaBindTextureToArray(tex, array, desc) : er r; return err == cudaSuccess ? cudaBindTextureToArray(tex, array, desc) : er r;
} }
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
/**
 * \brief \hl Unbinds a texture
 *
 * Unbinds the texture bound to \p tex.
 *
 * \param tex - Texture to unbind
 *
 * \return ::cudaSuccess
 * \notefnerr
 *
 * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
 * ::cudaGetChannelDesc, ::cudaGetTextureReference,
 * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
 * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
 * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
 * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, const struct cudaArray*, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
 * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, const struct cudaArray*) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
 * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)",
 * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, dim, readMode >&) "cudaGetTextureAlignmentOffset (C++ API)"
 */
template<class T, int dim, enum cudaTextureReadMode readMode> template<class T, int dim, enum cudaTextureReadMode readMode>
__inline__ __host__ cudaError_t cudaUnbindTexture( __inline__ __host__ cudaError_t cudaUnbindTexture(
const struct texture<T, dim, readMode> &tex const struct texture<T, dim, readMode> &tex
) )
{ {
return cudaUnbindTexture(&tex); return cudaUnbindTexture(&tex);
} }
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
/**
 * \brief \hl Get the alignment offset of a texture
 *
 * Returns in \p *offset the offset that was returned when texture reference
 * \p tex was bound.
 *
 * \param offset - Offset of texture reference in bytes
 * \param tex    - Texture to get offset of
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorInvalidTexture,
 * ::cudaErrorInvalidTextureBinding
 * \notefnerr
 *
 * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
 * ::cudaGetChannelDesc, ::cudaGetTextureReference,
 * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
 * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
 * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
 * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, const struct cudaArray*, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
 * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, const struct cudaArray*) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
 * \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
 * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)"
 */
template<class T, int dim, enum cudaTextureReadMode readMode> template<class T, int dim, enum cudaTextureReadMode readMode>
__inline__ __host__ cudaError_t cudaGetTextureAlignmentOffset( __inline__ __host__ cudaError_t cudaGetTextureAlignmentOffset(
size_t *offset, size_t *offset,
const struct texture<T, dim, readMode> &tex const struct texture<T, dim, readMode> &tex
) )
{ {
return cudaGetTextureAlignmentOffset(offset, &tex); return cudaGetTextureAlignmentOffset(offset, &tex);
} }
/** @} */ /* END CUDART_HIGHLEVEL */
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
/**
 * \ingroup CUDART_HIGHLEVEL
 * \brief \hl Launches a device function
 *
 * Launches the function \p entry on the device. \p entry can either be a
 * function that executes on the device, or it can be a character string,
 * naming a function that executes on the device. \p entry must be declared as
 * a \p __global__ function.
 * \ref ::cudaLaunch(T*) "cudaLaunch()" must be preceded by a call to
 * ::cudaConfigureCall() since it pops the data that was pushed by
 * ::cudaConfigureCall() from the execution stack.
 *
 * \param entry - Device function pointer or char string naming device function
 *                to execute
 *
 * \return
 * ::cudaSuccess,
 * ::cudaErrorInvalidDeviceFunction,
 * ::cudaErrorInvalidConfiguration
 * \notefnerr
 *
 * \sa ::cudaConfigureCall,
 * \ref ::cudaSetupArgument(T,size_t) "cudaSetupArgument (C++ API)",
 * \ref ::cudaLaunch(const char*) "cudaLaunch (C API)"
 */
template<class T> template<class T>
__inline__ __host__ cudaError_t cudaLaunch( __inline__ __host__ cudaError_t cudaLaunch(
T *symbol T *entry
) )
{ {
return cudaLaunch((const char*)symbol); return cudaLaunch((const char*)entry);
} }
#endif /* __CUDACC__ */ #endif /* __CUDACC__ */
#endif /* __cplusplus */ #endif /* __cplusplus */
#endif /* !__CUDA_RUNTIME_H__ */ #endif /* !__CUDA_RUNTIME_H__ */
 End of changes. 16 change blocks. 
3 lines changed or deleted 444 lines changed or added


 cuda_runtime_api.h   cuda_runtime_api.h 
/* /*
* Copyright 1993-2008 NVIDIA Corporation. All rights reserved. * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 41 skipping to change at line 41
* Any use of this source code in individual and commercial software must * Any use of this source code in individual and commercial software must
* include, in the user documentation and internal comments to the code, * include, in the user documentation and internal comments to the code,
* the above Disclaimer and U.S. Government End Users Notice. * the above Disclaimer and U.S. Government End Users Notice.
*/ */
#if !defined(__CUDA_RUNTIME_API_H__) #if !defined(__CUDA_RUNTIME_API_H__)
#define __CUDA_RUNTIME_API_H__ #define __CUDA_RUNTIME_API_H__
/************************************************************************** ***** /************************************************************************** *****
* * * *
* CUDA runtime API version number 2.1 * * CUDA Runtime API Version 2.2 *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#define CUDART_VERSION \ #define CUDART_VERSION 2020
2010
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#include "host_defines.h" #include "host_defines.h"
#include "builtin_types.h" #include "builtin_types.h"
/** \cond impl_private */
#if !defined(__dv) #if !defined(__dv)
#if defined(__cplusplus) #if defined(__cplusplus)
#define __dv(v) \ #define __dv(v) \
= v = v
#else /* __cplusplus */ #else /* __cplusplus */
#define __dv(v) #define __dv(v)
#endif /* __cplusplus */ #endif /* __cplusplus */
#endif /* !__dv */ #endif /* !__dv */
/** \endcond impl_private */
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#if defined(__cplusplus) #if defined(__cplusplus)
extern "C" { extern "C" {
#endif /* __cplusplus */ #endif /* __cplusplus */
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaMalloc3D(struct cudaPitchedPtr* pitchDevPtr, struct cudaExtent extent); extern __host__ cudaError_t CUDARTAPI cudaMalloc3D(struct cudaPitchedPtr* pitchedDevPtr, struct cudaExtent extent);
extern __host__ cudaError_t CUDARTAPI cudaMalloc3DArray(struct cudaArray** arrayPtr, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent); extern __host__ cudaError_t CUDARTAPI cudaMalloc3DArray(struct cudaArray** arrayPtr, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent);
extern __host__ cudaError_t CUDARTAPI cudaMemset3D(struct cudaPitchedPtr pitchDevPtr, int value, struct cudaExtent extent); extern __host__ cudaError_t CUDARTAPI cudaMemset3D(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent);
extern __host__ cudaError_t CUDARTAPI cudaMemcpy3D(const struct cudaMemcpy3DParms *p); extern __host__ cudaError_t CUDARTAPI cudaMemcpy3D(const struct cudaMemcpy3DParms *p);
extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream); extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size); extern __host__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size);
extern __host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t size); extern __host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t size);
extern __host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t height); extern __host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t height);
extern __host__ cudaError_t CUDARTAPI cudaMallocArray(struct cudaArray **array, const struct cudaChannelFormatDesc *desc, size_t width, size_t height __dv(1)); extern __host__ cudaError_t CUDARTAPI cudaMallocArray(struct cudaArray **array, const struct cudaChannelFormatDesc *desc, size_t width, size_t height __dv(1));
extern __host__ cudaError_t CUDARTAPI cudaFree(void *devPtr); extern __host__ cudaError_t CUDARTAPI cudaFree(void *devPtr);
extern __host__ cudaError_t CUDARTAPI cudaFreeHost(void *ptr); extern __host__ cudaError_t CUDARTAPI cudaFreeHost(void *ptr);
extern __host__ cudaError_t CUDARTAPI cudaFreeArray(struct cudaArray *array); extern __host__ cudaError_t CUDARTAPI cudaFreeArray(struct cudaArray *array);
extern __host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t bytes, unsigned int flags);
extern __host__ cudaError_t CUDARTAPI cudaHostGetDevicePointer(void **pDevice, void *pHost, unsigned int flags);
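A hedged runtime-API counterpart of cuMemHostAlloc above (not part of the header); the cudaHostAllocMapped and cudaDeviceMapHost flags are assumed CUDA 2.2 additions defined elsewhere:

float *h, *d;
cudaSetDeviceFlags(cudaDeviceMapHost);      /* assumed flag; must precede first runtime use */
cudaHostAlloc((void**)&h, bytes, cudaHostAllocMapped);
cudaHostGetDevicePointer((void**)&d, h, 0); /* flags reserved, pass 0 */
/* kernels dereference d; the CPU uses h */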
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind); extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind);
extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArray(struct cudaArray *dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind); extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArray(struct cudaArray *dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind);
extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArray(void *dst, const struct cudaArray *src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind); extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArray(void *dst, const struct cudaArray *src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind);
extern __host__ cudaError_t CUDARTAPI cudaMemcpyArrayToArray(struct cudaArray *dst, size_t wOffsetDst, size_t hOffsetDst, const struct cudaArray *src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)); extern __host__ cudaError_t CUDARTAPI cudaMemcpyArrayToArray(struct cudaArray *dst, size_t wOffsetDst, size_t hOffsetDst, const struct cudaArray *src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice));
skipping to change at line 146 skipping to change at line 150
extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync(void *dst, size_t dpitch, const struct cudaArray *src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream); extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync(void *dst, size_t dpitch, const struct cudaArray *src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync(const char *symbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream); extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync(const char *symbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream);
extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync(void *dst, const char *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream); extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync(void *dst, const char *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream);
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaMemset(void *mem, int c, size_t count); extern __host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value, size_t count);
extern __host__ cudaError_t CUDARTAPI cudaMemset2D(void *mem, size_t pitch, int c, size_t width, size_t height); extern __host__ cudaError_t CUDARTAPI cudaMemset2D(void *devPtr, size_t pitch, int value, size_t width, size_t height);
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaGetSymbolAddress(void **devPtr, const char *symbol); extern __host__ cudaError_t CUDARTAPI cudaGetSymbolAddress(void **devPtr, const char *symbol);
extern __host__ cudaError_t CUDARTAPI cudaGetSymbolSize(size_t *size, const char *symbol); extern __host__ cudaError_t CUDARTAPI cudaGetSymbolSize(size_t *size, const char *symbol);
skipping to change at line 169 skipping to change at line 173
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count); extern __host__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count);
extern __host__ cudaError_t CUDARTAPI cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device); extern __host__ cudaError_t CUDARTAPI cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device);
extern __host__ cudaError_t CUDARTAPI cudaChooseDevice(int *device, const struct cudaDeviceProp *prop); extern __host__ cudaError_t CUDARTAPI cudaChooseDevice(int *device, const struct cudaDeviceProp *prop);
extern __host__ cudaError_t CUDARTAPI cudaSetDevice(int device); extern __host__ cudaError_t CUDARTAPI cudaSetDevice(int device);
extern __host__ cudaError_t CUDARTAPI cudaGetDevice(int *device); extern __host__ cudaError_t CUDARTAPI cudaGetDevice(int *device);
extern __host__ cudaError_t CUDARTAPI cudaSetValidDevices(int *device_arr, int len);
extern __host__ cudaError_t CUDARTAPI cudaSetDeviceFlags( int flags );
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaBindTexture(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t size __dv(UINT_MAX)); extern __host__ cudaError_t CUDARTAPI cudaBindTexture(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t size __dv(UINT_MAX));
extern __host__ cudaError_t CUDARTAPI cudaBindTexture2D(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t width, size_t height, size_t pitch);
extern __host__ cudaError_t CUDARTAPI cudaBindTextureToArray(const struct textureReference *texref, const struct cudaArray *array, const struct cudaChannelFormatDesc *desc); extern __host__ cudaError_t CUDARTAPI cudaBindTextureToArray(const struct textureReference *texref, const struct cudaArray *array, const struct cudaChannelFormatDesc *desc);
extern __host__ cudaError_t CUDARTAPI cudaUnbindTexture(const struct textureReference *texref); extern __host__ cudaError_t CUDARTAPI cudaUnbindTexture(const struct textureReference *texref);
extern __host__ cudaError_t CUDARTAPI cudaGetTextureAlignmentOffset(size_t *offset, const struct textureReference *texref); extern __host__ cudaError_t CUDARTAPI cudaGetTextureAlignmentOffset(size_t *offset, const struct textureReference *texref);
extern __host__ cudaError_t CUDARTAPI cudaGetTextureReference(const struct textureReference **texref, const char *symbol); extern __host__ cudaError_t CUDARTAPI cudaGetTextureReference(const struct textureReference **texref, const char *symbol);
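Illustrative usage of the new cudaBindTexture2D entry point, which binds linear pitched memory as a 2D texture (the texture reference name is hypothetical):

#include <cuda_runtime.h>

texture<float, 2, cudaReadModeElementType> tex2;   /* illustrative reference */

int bind_2d(float *devPtr, size_t width, size_t height, size_t pitch)
{
    struct cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
    size_t offset = 0;
    if (cudaBindTexture2D(&offset, &tex2, devPtr, &desc,
                          width, height, pitch) != cudaSuccess)
        return -1;
    cudaUnbindTexture(&tex2);
    return 0;
}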
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
skipping to change at line 208 skipping to change at line 215
extern __host__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error); extern __host__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error);
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem __dv(0), cudaStream_t stream __dv(0)); extern __host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem __dv(0), cudaStream_t stream __dv(0));
extern __host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg, size_t size, size_t offset); extern __host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg, size_t size, size_t offset);
extern __host__ cudaError_t CUDARTAPI cudaLaunch(const char *symbol); extern __host__ cudaError_t CUDARTAPI cudaLaunch(const char *entry);
extern __host__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const char *func);
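A hedged sketch of the new attribute query (the kernel and its C-linkage string name are hypothetical; the header identifies the function by a const char* entry name):

#include <stdio.h>
#include <cuda_runtime.h>

extern "C" __global__ void myKernel(float *p) { p[threadIdx.x] += 1.0f; }

void print_kernel_limits(void)
{
    struct cudaFuncAttributes attr;
    if (cudaFuncGetAttributes(&attr, "myKernel") == cudaSuccess)
        printf("regs=%d smem=%lu maxThreads=%d\n",
               attr.numRegs, (unsigned long)attr.sharedSizeBytes,
               attr.maxThreadsPerBlock);
}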
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *stream); extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream);
extern __host__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream); extern __host__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream);
extern __host__ cudaError_t CUDARTAPI cudaStreamSynchronize(cudaStream_t stream); extern __host__ cudaError_t CUDARTAPI cudaStreamSynchronize(cudaStream_t stream);
extern __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream); extern __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream);
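A minimal stream sketch (illustrative only; page-locked host memory, not shown, is required for a truly asynchronous copy):

#include <cuda_runtime.h>

int async_copy(void *dst, const void *src, size_t bytes)
{
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, stream);
    while (cudaStreamQuery(stream) == cudaErrorNotReady)
        ;  /* poll; cudaStreamSynchronize(stream) blocks instead */
    cudaStreamDestroy(stream);
    return 0;
}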
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaEventCreate(cudaEvent_t *event); extern __host__ cudaError_t CUDARTAPI cudaEventCreate(cudaEvent_t *event);
extern __host__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, int flags);
extern __host__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cu daStream_t stream); extern __host__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cu daStream_t stream);
extern __host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event); extern __host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event);
extern __host__ cudaError_t CUDARTAPI cudaEventSynchronize(cudaEvent_t event); extern __host__ cudaError_t CUDARTAPI cudaEventSynchronize(cudaEvent_t event);
extern __host__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event); extern __host__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event);
extern __host__ cudaError_t CUDARTAPI cudaEventElapsedTime(float *ms, cudaEvent_t start, cudaEvent_t end); extern __host__ cudaError_t CUDARTAPI cudaEventElapsedTime(float *ms, cudaEvent_t start, cudaEvent_t end);
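A hedged timing sketch using the new cudaEventCreateWithFlags (cudaEventBlockingSync is assumed to be a flag defined in this release):

#include <stdio.h>
#include <cuda_runtime.h>

void time_it(void)
{
    cudaEvent_t start, stop;
    float ms = 0.0f;

    cudaEventCreate(&start);
    cudaEventCreateWithFlags(&stop, cudaEventBlockingSync);

    cudaEventRecord(start, 0);
    /* ... kernel launches on stream 0 ... */
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    cudaEventElapsedTime(&ms, start, stop);
    printf("elapsed: %.3f ms\n", ms);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}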
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
skipping to change at line 252 skipping to change at line 261
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaThreadExit(void); extern __host__ cudaError_t CUDARTAPI cudaThreadExit(void);
extern __host__ cudaError_t CUDARTAPI cudaThreadSynchronize(void); extern __host__ cudaError_t CUDARTAPI cudaThreadSynchronize(void);
/*******************************************************************************
*
*
*
*
*
*
*******************************************************************************/
extern __host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersion);
extern __host__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion);
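Illustrative usage of the two new version queries (cudaDriverGetVersion is expected to report 0 when no driver is installed):

#include <stdio.h>
#include <cuda_runtime.h>

int main(void)
{
    int driverVersion = 0, runtimeVersion = 0;
    cudaDriverGetVersion(&driverVersion);
    cudaRuntimeGetVersion(&runtimeVersion);
    printf("driver %d, runtime %d\n", driverVersion, runtimeVersion);
    return 0;
}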
#if defined(__cplusplus) #if defined(__cplusplus)
} }
#endif /* __cplusplus */ #endif /* __cplusplus */
#undef __dv #undef __dv
#endif /* !__CUDA_RUNTIME_API_H__ */ #endif /* !__CUDA_RUNTIME_API_H__ */
 End of changes. 15 change blocks. 
12 lines changed or deleted 41 lines changed or added


 cuda_texture_types.h   cuda_texture_types.h 
/* /*
* Copyright 1993-2008 NVIDIA Corporation. All rights reserved. * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 59 skipping to change at line 59
#include "host_defines.h" #include "host_defines.h"
#include "texture_types.h" #include "texture_types.h"
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
/*TEXTURE_TYPE*/ /*TEXTURE_TYPE*/
template<class T, int dim = 1, enum cudaTextureReadMode = cudaReadModeElementType> template<class T, int dim = 1, enum cudaTextureReadMode mode = cudaReadModeElementType>
struct texture : public textureReference struct texture : public textureReference
{ {
__host__ texture(int norm = 0, __host__ texture(int norm = 0,
enum cudaTextureFilterMode fMode = cudaFilterModePoint, enum cudaTextureFilterMode fMode = cudaFilterModePoint,
enum cudaTextureAddressMode aMode = cudaAddressModeClamp ) enum cudaTextureAddressMode aMode = cudaAddressModeClamp )
{ {
normalized = norm; normalized = norm;
filterMode = fMode; filterMode = fMode;
addressMode[0] = aMode; addressMode[0] = aMode;
addressMode[1] = aMode; addressMode[1] = aMode;
 End of changes. 2 change blocks. 
2 lines changed or deleted 2 lines changed or added
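The change above only gives the third template parameter a name (mode); instantiations are unaffected. For illustration (variable names are hypothetical):

texture<float, 2, cudaReadModeElementType> tex2d;  /* explicit read mode */
texture<float> tex1d;                              /* defaults: dim = 1,
                                                      cudaReadModeElementType */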


 cufft.h   cufft.h 
/* /*
* Copyright 1993-2008 NVIDIA Corporation. All rights reserved. * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 40 skipping to change at line 40
* *
* Any use of this source code in individual and commercial software must * Any use of this source code in individual and commercial software must
* include, in the user documentation and internal comments to the code, * include, in the user documentation and internal comments to the code,
* the above Disclaimer and U.S. Government End Users Notice. * the above Disclaimer and U.S. Government End Users Notice.
*/ */
/* /*
* cufft.h * cufft.h
* Public header file for the NVIDIA Cuda FFT library (CUFFT) * Public header file for the NVIDIA Cuda FFT library (CUFFT)
*/ */
#ifndef _CUFFT_H_ #ifndef _CUFFT_H_
#define _CUFFT_H_ #define _CUFFT_H_
#include <stdio.h> #include <stdio.h>
#include "cuComplex.h" #include "cuComplex.h"
#ifdef __MULTI_CORE__
#error CUFFT not supported on multicore
#endif
#ifndef CUFFTAPI #ifndef CUFFTAPI
#ifdef _WIN32 #ifdef _WIN32
#define CUFFTAPI __stdcall #define CUFFTAPI __stdcall
#else #else
#define CUFFTAPI #define CUFFTAPI
#endif #endif
#endif #endif
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
 End of changes. 3 change blocks. 
6 lines changed or deleted 1 lines changed or added


 device_functions.h   device_functions.h 
/* /*
* Copyright 1993-2008 NVIDIA Corporation. All rights reserved. * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 77 skipping to change at line 77
extern __device__ unsigned long long int __umul64hi(unsigned long long int, unsigned long long int); extern __device__ unsigned long long int __umul64hi(unsigned long long int, unsigned long long int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __int_as_float(int); extern __device__ float __int_as_float(int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __float_as_int(float); extern __device__ int __float_as_int(float);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ void __syncthreads(void); extern __device__ void __syncthreads(void);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ void __prof_trigger(int);
/*DEVICE_BUILTIN*/
extern __device__ void __threadfence(void);
/*DEVICE_BUILTIN*/
extern __device__ void __threadfence_block(void);
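A hedged device-side sketch of the new fence intrinsics (kernel name and the publish pattern are illustrative, not from the header):

__global__ void publish(volatile float *box, volatile int *flag)
{
    if (threadIdx.x == 0) {
        box[0] = 42.0f;
        __threadfence();   /* order the data store before the flag store,
                              as observed by threads in other blocks;
                              __threadfence_block() would only order it
                              within this block */
        flag[0] = 1;
    }
}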
/*DEVICE_BUILTIN*/
extern __device__ void __trap(void); extern __device__ void __trap(void);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ void __brkpt(int); extern __device__ void __brkpt(int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __saturatef(float); extern __device__ float __saturatef(float);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ unsigned int __sad(int, int, unsigned int); extern __device__ unsigned int __sad(int, int, unsigned int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
skipping to change at line 183 skipping to change at line 189
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ unsigned long long int __float2ull_rz(float); extern __device__ unsigned long long int __float2ull_rz(float);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __ll2float_rn(long long int); extern __device__ float __ll2float_rn(long long int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __ull2float_rn(unsigned long long int); extern __device__ float __ull2float_rn(unsigned long long int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __fadd_rn(float, float);
/*DEVICE_BUILTIN*/
extern __device__ float __fadd_rz(float, float); extern __device__ float __fadd_rz(float, float);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __fmul_rz(float, float); extern __device__ float __fadd_ru(float, float);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __fadd_rn(float, float); extern __device__ float __fadd_rd(float, float);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __fmul_rn(float, float); extern __device__ float __fmul_rn(float, float);
/*DEVICE_BUILTIN*/
extern __device__ float __fmul_rz(float, float);
/*DEVICE_BUILTIN*/
extern __device__ float __fmul_ru(float, float);
/*DEVICE_BUILTIN*/
extern __device__ float __fmul_rd(float, float);
/*DEVICE_BUILTIN*/
extern __device__ float __fmaf_rn(float, float, float);
/*DEVICE_BUILTIN*/
extern __device__ float __fmaf_rz(float, float, float);
/*DEVICE_BUILTIN*/
extern __device__ float __fmaf_ru(float, float, float);
/*DEVICE_BUILTIN*/
extern __device__ float __fmaf_rd(float, float, float);
/*DEVICE_BUILTIN*/
extern __device__ float __frcp_rn(float);
/*DEVICE_BUILTIN*/
extern __device__ float __frcp_rz(float);
/*DEVICE_BUILTIN*/
extern __device__ float __frcp_ru(float);
/*DEVICE_BUILTIN*/
extern __device__ float __frcp_rd(float);
/*DEVICE_BUILTIN*/
extern __device__ float __fsqrt_rn(float);
/*DEVICE_BUILTIN*/
extern __device__ float __fsqrt_rz(float);
/*DEVICE_BUILTIN*/
extern __device__ float __fsqrt_ru(float);
/*DEVICE_BUILTIN*/
extern __device__ float __fsqrt_rd(float);
/*DEVICE_BUILTIN*/
extern __device__ float __fdiv_rn(float, float);
/*DEVICE_BUILTIN*/
extern __device__ float __fdiv_rz(float, float);
/*DEVICE_BUILTIN*/
extern __device__ float __fdiv_ru(float, float);
/*DEVICE_BUILTIN*/
extern __device__ float __fdiv_rd(float, float);
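One classical use of the directed-rounding intrinsics above is interval arithmetic; a minimal sketch (helper name is illustrative):

__device__ void interval_add(float a, float b, float *lo, float *hi)
{
    *lo = __fadd_rd(a, b);   /* round toward -infinity: lower bound */
    *hi = __fadd_ru(a, b);   /* round toward +infinity: upper bound */
}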
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __clz(int); extern __device__ int __clz(int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __ffs(int); extern __device__ int __ffs(int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __popc(unsigned int); extern __device__ int __popc(unsigned int);
/*DEVICE_BUILTIN*/
extern __device__ unsigned int __brev(unsigned int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __clzll(long long int); extern __device__ int __clzll(long long int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __ffsll(long long int); extern __device__ int __ffsll(long long int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __popcll(unsigned long long int); extern __device__ int __popcll(unsigned long long int);
/*DEVICE_BUILTIN*/
extern __device__ unsigned long long int __brevll(unsigned long long int);
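A hedged sketch of __brev for FFT-style index reversal (kernel name is illustrative; assumes 1 <= log2n <= 32):

__global__ void bit_reverse_indices(unsigned int *out, int log2n)
{
    unsigned int i = threadIdx.x;
    /* reverse all 32 bits, then keep only the low log2n bits */
    out[i] = __brev(i) >> (32 - log2n);
}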
#if !defined(CUDA_NO_SM_13_DOUBLE_INTRINSICS) #if !defined(CUDA_NO_SM_13_DOUBLE_INTRINSICS)
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __double2int_rz(double); extern __device__ int __double2int_rz(double);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ unsigned int __double2uint_rz(double); extern __device__ unsigned int __double2uint_rz(double);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
skipping to change at line 309 skipping to change at line 364
static __inline__ __device__ void brkpt(int c) static __inline__ __device__ void brkpt(int c)
{ {
__brkpt(c); __brkpt(c);
} }
static __inline__ __device__ void syncthreads(void) static __inline__ __device__ void syncthreads(void)
{ {
__syncthreads(); __syncthreads();
} }
static __inline__ __device__ void prof_trigger(int e)
{
if (e == 0) __prof_trigger( 0);
else if (e == 1) __prof_trigger( 1);
else if (e == 2) __prof_trigger( 2);
else if (e == 3) __prof_trigger( 3);
else if (e == 4) __prof_trigger( 4);
else if (e == 5) __prof_trigger( 5);
else if (e == 6) __prof_trigger( 6);
else if (e == 7) __prof_trigger( 7);
else if (e == 8) __prof_trigger( 8);
else if (e == 9) __prof_trigger( 9);
else if (e == 10) __prof_trigger(10);
else if (e == 11) __prof_trigger(11);
else if (e == 12) __prof_trigger(12);
else if (e == 13) __prof_trigger(13);
else if (e == 14) __prof_trigger(14);
else if (e == 15) __prof_trigger(15);
}
static __inline__ __device__ void threadfence(bool global = true)
{
global ? __threadfence() : __threadfence_block();
}
static __inline__ __device__ int float2int(float a, enum cudaRoundMode mode = cudaRoundZero) static __inline__ __device__ int float2int(float a, enum cudaRoundMode mode = cudaRoundZero)
{ {
return mode == cudaRoundNearest ? __float2int_rn(a) : return mode == cudaRoundNearest ? __float2int_rn(a) :
mode == cudaRoundPosInf ? __float2int_ru(a) : mode == cudaRoundPosInf ? __float2int_ru(a) :
mode == cudaRoundMinInf ? __float2int_rd(a) : mode == cudaRoundMinInf ? __float2int_rd(a) :
__float2int_rz(a); __float2int_rz(a);
} }
static __inline__ __device__ unsigned int float2uint(float a, enum cudaRoun dMode mode = cudaRoundZero) static __inline__ __device__ unsigned int float2uint(float a, enum cudaRoun dMode mode = cudaRoundZero)
{ {
skipping to change at line 348 skipping to change at line 428
__uint2float_rn(a); __uint2float_rn(a);
} }
#elif !defined(__CUDACC__) #elif !defined(__CUDACC__)
#include "crt/func_macro.h" #include "crt/func_macro.h"
#include "host_defines.h" #include "host_defines.h"
#include "math_constants.h" #include "math_constants.h"
#if !defined(__CUDABE__) #if defined(__CUDABE__)
__device_func__(float __frcp_rn (float x))
{
unsigned int expo;
unsigned f, y;
unsigned int argi;
float t;
argi = __float_as_int(x);
expo = (argi >> 23);
expo = expo & 0xff;
f = expo - 1;
if (f <= 0xFD) {
y = (argi & 0x00ffffff) | 0x00800000;
expo = (2 * 127) - expo - 2;
t = 1.0f / x;
argi = __float_as_int(t);
argi = (argi & 0x00ffffff) | 0x00800000;
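      /* Correct round-to-nearest is recovered from the host result below:
         the two remainder tests nudge the mantissa by one ulp so that the
         candidate whose remainder is smaller in magnitude wins. */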
if ((int)expo >= 0) {
/* compute remainder1 */
f = __umul24(y, argi);
/* remainder1 must be negative. Fix if necessary */
if ((int)f > 0) {
t = __int_as_float(__float_as_int(t)-1);
f -= y;
}
/* compute remainder2 */
expo = f + y;
/* round result based on which remainder is smaller in magnitude */
f = (unsigned)(-(int)f);
if (expo < f) {
t = __int_as_float(__float_as_int(t)+1);
}
return t;
}
}
return 1.0f / x;
}
__device_func__(float __frcp_rz (float x))
{
unsigned int expo;
unsigned f, y;
unsigned int argi;
float t;
argi = __float_as_int(x);
expo = (argi >> 23);
expo = expo & 0xff;
f = expo - 1;
if (f <= 0xFD) {
y = (argi & 0x00ffffff) | 0x00800000;
expo = (2 * 127) - expo - 2;
t = 1.0f / x;
argi = __float_as_int(t);
argi = (argi & 0x00ffffff) | 0x00800000;
if ((int)expo >= 0) {
f = __umul24(y, argi);
if ((int)f > 0) {
t = __int_as_float(__float_as_int(t)-1);
}
return t;
}
}
return 1.0f / x;
}
__device_func__(float __frcp_rd (float x))
{
unsigned int expo;
unsigned f, y;
unsigned int argi;
float t;
argi = __float_as_int(x);
expo = (argi >> 23);
expo = expo & 0xff;
f = expo - 1;
if (f <= 0xFD) {
y = (argi & 0x00ffffff) | 0x00800000;
expo = (2 * 127) - expo - 2;
t = 1.0f / x;
argi = __float_as_int(t);
argi = (argi & 0x00ffffff) | 0x00800000;
if ((int)expo >= 0) {
f = __umul24(y, argi);
if (((int)f > 0) && (x > 0.0f)) {
t = __int_as_float(__float_as_int(t)-1);
}
if (((int)f < 0) && (x < 0.0f)) {
t = __int_as_float(__float_as_int(t)+1);
}
return t;
}
}
return 1.0f / x;
}
__device_func__(float __frcp_ru (float x))
{
unsigned int expo;
unsigned f, y;
unsigned int argi;
float t;
argi = __float_as_int(x);
expo = (argi >> 23);
expo = expo & 0xff;
f = expo - 1;
if (f <= 0xFD) {
y = (argi & 0x00ffffff) | 0x00800000;
expo = (2 * 127) - expo - 2;
t = 1.0f / x;
argi = __float_as_int(t);
argi = (argi & 0x00ffffff) | 0x00800000;
if ((int)expo >= 0) {
f = __umul24(y, argi);
if (((int)f > 0) && (x < 0.0f)) {
t = __int_as_float(__float_as_int(t)-1);
}
if (((int)f < 0) && (x > 0.0f)) {
t = __int_as_float(__float_as_int(t)+1);
}
return t;
}
}
return 1.0f / x;
}
__device_func__(float __fsqrt_rn (float radicand))
{
unsigned int expo, argi;
unsigned int s, f, x;
argi = __float_as_int(radicand);
expo = argi >> 23;
expo = expo & 0xff;
f = expo - 1;
if ((argi <= 0x80000000) && (f <= 0xFD)) {
x = (argi << 8) | 0x80000000;
x = x >> (expo & 1);
argi = (((__float_as_int(rsqrtf(__int_as_float(
__float_as_int(radicand)|1)))&0x00ffffff)|0x00800000)<<7);
/* second NR iteration */
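      /* i.e. r' = r*(3 - x*r*r)/2; the constant 3 appears as 0x30000000
         in the fixed-point scaling used for these intermediates */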
s = __umulhi(argi,argi);
f = 0x30000000 - __umulhi(x,s);
argi = __umulhi(f,argi);
/* compute sqrt_rn(x) as x * 1/sqrt_rn(x) */
argi = __umulhi(x,argi);
argi = argi >> 3;
x = (x << 16) - (argi * argi);
/* round to nearest based on remainder; tie case impossible */
f = x - (2 * argi + 1);
if ((int)f < 0) f = (unsigned)(-(int)f);
if ((int)x < 0) x = (unsigned)(-(int)x);
if (f < x) argi ++;
argi = argi + (((expo + 125) & ~0x1) << 22);
return __int_as_float(argi);
}
return sqrtf(radicand);
}
__device_func__(float __fsqrt_rz (float radicand))
{
unsigned int expo, argi;
unsigned int s, f, x;
argi = __float_as_int(radicand);
expo = argi >> 23;
expo = expo & 0xff;
f = expo - 1;
if ((argi <= 0x80000000) && (f <= 0xFD)) {
x = (argi << 8) | 0x80000000;
x = x >> (expo & 1);
argi = (((__float_as_int(rsqrtf(__int_as_float(
__float_as_int(radicand)|1)))&0x00ffffff)|0x00800000)<<7);
/* NR iteration */
s = __umulhi(argi,argi);
f = 0x30000000 - __umulhi(x,s);
argi = __umulhi(f,argi);
/* compute sqrt_rz(x) as x * 1/sqrt_rz(x) */
argi = __umulhi(x,argi);
/* compute truncated result */
argi = (argi + 4) >> 3;
x = (x << 16) - (argi * argi);
if ((int)x < 0) argi--;
argi = argi + (((expo + 125) & ~0x1) << 22);
return __int_as_float(argi);
}
return sqrtf(radicand);
}
__device_func__(float __fsqrt_ru (float radicand))
{
unsigned int expo, argi;
unsigned int s, f, x;
argi = __float_as_int(radicand);
expo = argi >> 23;
expo = expo & 0xff;
f = expo - 1;
if ((argi <= 0x80000000) && (f <= 0xFD)) {
x = (argi << 8) | 0x80000000;
x = x >> (expo & 1);
argi = (((__float_as_int(rsqrtf(__int_as_float(
__float_as_int(radicand)|1)))&0x00ffffff)|0x00800000)<<7);
/* NR iteration */
s = __umulhi(argi,argi);
f = 0x30000000 - __umulhi(x,s);
argi = __umulhi(f,argi);
/* compute sqrt_ru(x) as x * 1/sqrt_ru(x) */
argi = __umulhi(x,argi);
argi = (argi + 4) >> 3;
x = (x << 16) - (argi * argi);
if ((int)x > 0) argi++;
argi = argi + (((expo + 125) & ~0x1) << 22);
return __int_as_float(argi);
}
return sqrtf(radicand);
}
__device_func__(float __fsqrt_rd (float radicand))
{
unsigned int expo, argi;
unsigned int s, f, x;
argi = __float_as_int(radicand);
expo = argi >> 23;
expo = expo & 0xff;
f = expo - 1;
if ((argi <= 0x80000000) && (f <= 0xFD)) {
x = (argi << 8) | 0x80000000;
x = x >> (expo & 1);
argi = (((__float_as_int(rsqrtf(__int_as_float(
__float_as_int(radicand)|1)))&0x00ffffff)|0x00800000)<<7);
/* NR iteration */
s = __umulhi(argi,argi);
f = 0x30000000 - __umulhi(x,s);
argi = __umulhi(f,argi);
/* compute sqrt_rd(x) as x * 1/sqrt_rd(x) */
argi = __umulhi(x,argi);
/* compute truncated result */
argi = (argi + 4) >> 3;
x = (x << 16) - (argi * argi);
if ((int)x < 0) argi--;
argi = argi + (((expo + 125) & ~0x1) << 22);
return __int_as_float(argi);
}
return sqrtf(radicand);
}
__device_func__(float __fdiv_rn (float dividend, float divisor))
{
unsigned long long prod;
unsigned r, f, x, y, expox, expoy, sign;
unsigned expo_res;
unsigned resi, cvtxi, cvtyi;
float t;
cvtxi = __float_as_int(dividend);
cvtyi = __float_as_int(divisor);
expox = (cvtxi >> 23) & 0xff;
expoy = (cvtyi >> 23) & 0xff;
sign = ((cvtxi ^ cvtyi) & 0x80000000);
if (((expox - 1) <= 0xFD) && ((expoy - 1) <= 0xFD)) {
expo_res = expox - expoy + 127 - 1;
/* extract mantissas */
y = (cvtyi << 8) | 0x80000000;
x = (cvtxi & 0x00ffffff) | 0x00800000;
t =__int_as_float((cvtyi & 0x00ffffff) | 0x3f800001);
r = ((__float_as_int(1.0f / t) & 0x00ffffff) | 0x00800000) << 7;
/* NR iteration */
f = (unsigned)-(int)__umulhi (y, r << 1);
r = __umulhi (f, r << 1);
/* produce quotient */
prod = ((unsigned long long)x) * (r << 1);
/* normalize mantissa */
if (((int)((prod >> 32) << 8)) > 0) {
expo_res--;
prod = prod + prod;
}
/* preliminary mantissa */
r = (unsigned)(prod >> 32);
y = y >> 8;
/* result is a normal */
if (expo_res <= 0xFD) {
int rem0, rem1, inc;
/* round mantissa to nearest even */
prod = ((unsigned long long)y) * r;
x = x << (23 + ((prod >> 32) >> 15));
rem1 = x - (unsigned)(prod & 0xffffffff);
rem0 = rem1 - y;
inc = abs(rem0) < abs(rem1);
/* merge sign, mantissa, exponent for final result */
resi = sign | ((expo_res << 23) + r + inc);
return __int_as_float(resi);
} else if ((int)expo_res >= 254) {
/* overflow: return infinity */
resi = sign | 0x7f800000;
return __int_as_float(resi);
} else {
/* underflow, may still round to normal */
int rem0, rem1, inc;
prod = ((unsigned long long)y) * r;
x = x << (23 + ((prod >> 32) >> 15));
rem1 = x - (unsigned)(prod & 0xffffffff);
rem0 = rem1 - y;
inc = abs(rem0) < abs(rem1);
resi = ((expo_res << 23) + r + inc);
if (resi != 0x00800000) resi = 0;
return __int_as_float(sign | resi);
}
}
if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) {
divisor *= 0.25f;
dividend *= 0.25f;
}
return dividend / divisor;
}
__device_func__(float __fdiv_rz (float dividend, float divisor))
{
unsigned long long prod;
unsigned r, f, x, y, expox, expoy, sign;
unsigned expo_res;
unsigned resi, cvtxi, cvtyi;
float t;
cvtxi = __float_as_int(dividend);
cvtyi = __float_as_int(divisor);
expox = (cvtxi >> 23) & 0xff;
expoy = (cvtyi >> 23) & 0xff;
sign = ((cvtxi ^ cvtyi) & 0x80000000);
if (((expox - 1) <= 0xFD) && ((expoy - 1) <= 0xFD)) {
expo_res = expox - expoy + 127 - 1;
/* extract mantissas */
y = (cvtyi << 8) | 0x80000000;
x = (cvtxi & 0x00ffffff) | 0x00800000;
t =__int_as_float((cvtyi & 0x00ffffff) | 0x3f800001);
r = ((__float_as_int(1.0f / t) & 0x00ffffff) | 0x00800000) << 7;
/* NR iteration */
f = (unsigned)-(int)__umulhi (y, r << 1);
r = __umulhi (f, r << 1);
/* produce quotient */
prod = ((unsigned long long)x) * (r << 1);
/* normalize mantissa */
if (((int)((prod >> 32) << 8)) > 0) {
expo_res--;
prod = prod + prod;
}
/* preliminary mantissa */
prod += 0x0000000080000000ULL;
r = (unsigned)(prod >> 32);
y = y >> 8;
if (expo_res <= 0xFD) {
/* result is a normal */
int rem1;
prod = ((unsigned long long)y) * r;
x = x << (23 + ((prod >> 32) >> 15));
rem1 = x - (unsigned)(prod & 0xffffffff);
if (rem1 < 0) r--;
resi = (expo_res << 23) + r;
if (resi == 0x7f800000) resi = 0x7f7fffff;
return __int_as_float(sign | resi);
} else if ((int)expo_res >= 254) {
/* overflow: return largest normal */
resi = 0x7f7fffff;
return __int_as_float(sign |resi);
} else {
/* underflow: result is smallest normal or zero */
int rem1;
prod = ((unsigned long long)y) * r;
x = x << (23 + ((prod >> 32) >> 15));
rem1 = x - (unsigned)(prod & 0xffffffff);
if (rem1 < 0) r--;
resi = ((expo_res << 23) + r);
if (resi != 0x00800000) resi = 0;
return __int_as_float(sign | resi);
}
}
if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) {
divisor *= 0.25f;
dividend *= 0.25f;
}
return dividend / divisor;
}
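A small host-side sanity check of the emulation path above (a sketch assuming these __device_func__ definitions are in scope; operands chosen so the quotients are exactly representable):

#include <assert.h>

static void check_fdiv_emulation(void)
{
    assert(__fdiv_rn(1.0f, 4.0f) == 0.25f);  /* exact, so RN returns it */
    assert(__fdiv_rz(7.0f, 2.0f) == 3.5f);   /* exact, so RZ returns it */
}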
__device_func__(float __fdiv_ru (float dividend, float divisor))
{
unsigned long long prod;
unsigned r, f, x, y, expox, expoy, sign;
unsigned expo_res;
unsigned resi, cvtxi, cvtyi;
float t;
cvtxi = __float_as_int(dividend);
cvtyi = __float_as_int(divisor);
expox = (cvtxi >> 23) & 0xff;
expoy = (cvtyi >> 23) & 0xff;
sign = ((cvtxi ^ cvtyi) & 0x80000000);
if (((expox - 1) <= 0xFD) && ((expoy - 1) <= 0xFD)) {
expo_res = expox - expoy + 127 - 1;
/* extract mantissas */
y = (cvtyi << 8) | 0x80000000;
x = (cvtxi & 0x00ffffff) | 0x00800000;
t =__int_as_float((cvtyi & 0x00ffffff) | 0x3f800001);
r = ((__float_as_int(1.0f / t) & 0x00ffffff) | 0x00800000) << 7;
/* NR iteration */
f = (unsigned)-(int)__umulhi (y, r << 1);
r = __umulhi (f, r << 1);
/* produce quotient */
prod = ((unsigned long long)x) * (r << 1);
/* normalize mantissa */
if (((int)((prod >> 32) << 8)) > 0) {
expo_res--;
prod = prod + prod;
}
/* preliminary mantissa */
prod += 0x0000000080000000ULL;
r = (unsigned)(prod >> 32);
y = y >> 8;
if (expo_res <= 0xFD) {
/* result is a normal */
int rem1;
prod = ((unsigned long long)y) * r;
x = x << (23 + ((prod >> 32) >> 15));
rem1 = x - (unsigned)(prod & 0xffffffff);
if ((rem1 < 0) && (sign)) r--;
if ((rem1 > 0) && (!sign)) r++;
resi = (expo_res << 23) + r;
if ((resi == 0x7f800000) && (sign)) resi = 0x7f7fffff;
return __int_as_float(sign | resi);
} else if ((int)expo_res >= 254) {
/* overflow: return largest normal */
resi = sign ? 0x7f7fffff : 0x7f800000;
return __int_as_float(sign | resi);
} else {
/* underflow: result is smallest normal or zero */
int rem1;
prod = ((unsigned long long)y) * r;
x = x << (23 + ((prod >> 32) >> 15));
rem1 = x - (unsigned)(prod & 0xffffffff);
if ((rem1 < 0) && (sign)) r--;
if ((rem1 > 0) && (!sign)) r++;
resi = ((expo_res << 23) + r);
if (resi != 0x00800000) resi = 0;
return __int_as_float(sign | resi);
}
}
if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) {
divisor *= 0.25f;
dividend *= 0.25f;
}
return dividend / divisor;
}
__device_func__(float __fdiv_rd (float dividend, float divisor))
{
unsigned long long prod;
unsigned r, f, x, y, expox, expoy, sign;
unsigned expo_res;
unsigned resi, cvtxi, cvtyi;
float t;
cvtxi = __float_as_int(dividend);
cvtyi = __float_as_int(divisor);
expox = (cvtxi >> 23) & 0xff;
expoy = (cvtyi >> 23) & 0xff;
sign = ((cvtxi ^ cvtyi) & 0x80000000);
if (((expox - 1) <= 0xFD) && ((expoy - 1) <= 0xFD)) {
expo_res = expox - expoy + 127 - 1;
/* extract mantissas */
y = (cvtyi << 8) | 0x80000000;
x = (cvtxi & 0x00ffffff) | 0x00800000;
t =__int_as_float((cvtyi & 0x00ffffff) | 0x3f800001);
r = ((__float_as_int(1.0f / t) & 0x00ffffff) | 0x00800000) << 7;
/* NR iteration */
f = (unsigned)-(int)__umulhi (y, r << 1);
r = __umulhi (f, r << 1);
/* produce quotient */
prod = ((unsigned long long)x) * (r << 1);
/* normalize mantissa */
if (((int)((prod >> 32) << 8)) > 0) {
expo_res--;
prod = prod + prod;
}
/* preliminary mantissa */
prod += 0x0000000080000000ULL;
r = (unsigned)(prod >> 32);
y = y >> 8;
if (expo_res <= 0xFD) {
/* result is a normal */
int rem1;
prod = ((unsigned long long)y) * r;
x = x << (23 + ((prod >> 32) >> 15));
rem1 = x - (unsigned)(prod & 0xffffffff);
if ((rem1 < 0) && (!sign)) r--;
if ((rem1 > 0) && (sign)) r++;
resi = (expo_res << 23) + r;
if ((resi == 0x7f800000) && (!sign)) resi = 0x7f7fffff;
return __int_as_float(sign | resi);
} else if ((int)expo_res >= 254) {
/* overflow: return largest normal */
resi = sign ? 0x7f800000 : 0x7f7fffff;
return __int_as_float(sign |resi);
} else {
/* underflow: result is smallest normal or zero */
int rem1;
prod = ((unsigned long long)y) * r;
x = x << (23 + ((prod >> 32) >> 15));
rem1 = x - (unsigned)(prod & 0xffffffff);
if ((rem1 < 0) && (!sign)) r--;
if ((rem1 > 0) && (sign)) r++;
resi = ((expo_res << 23) + r);
if (resi != 0x00800000) resi = 0;
return __int_as_float(sign | resi);
}
}
if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) {
divisor *= 0.25f;
dividend *= 0.25f;
}
return dividend / divisor;
}
__device_func__(float __fadd_ru (float a, float b))
{
unsigned int expo_x, expo_y;
unsigned int xxi, yyi, temp;
xxi = __float_as_int(a);
yyi = __float_as_int(b);
/* make bigger operand the augend */
expo_y = yyi << 1;
if (expo_y > (xxi << 1)) {
expo_y = xxi;
xxi = yyi;
yyi = expo_y;
}
temp = 0xff;
expo_x = temp & (xxi >> 23);
expo_x = expo_x - 1;
expo_y = temp & (yyi >> 23);
expo_y = expo_y - 1;
if ((expo_x <= 0xFD) &&
(expo_y <= 0xFD)) {
expo_y = expo_x - expo_y;
if (expo_y > 25) {
expo_y = 31;
}
temp = xxi ^ yyi;
xxi = xxi & ~0x7f000000;
xxi = xxi | 0x00800000;
yyi = yyi & ~0xff000000;
yyi = yyi | 0x00800000;
if ((int)temp < 0) {
/* signs differ, effective subtraction */
temp = 32 - expo_y;
temp = (expo_y) ? (yyi << temp) : 0;
temp = (unsigned int)(-((int)temp));
xxi = xxi - (yyi >> expo_y) - (temp ? 1 : 0);
if (xxi & 0x00800000) {
if (expo_x <= 0xFD) {
xxi = (xxi + (expo_x << 23));
xxi += (temp && !(xxi & 0x80000000));
return __int_as_float(xxi);
}
} else {
if ((temp | (xxi << 1)) == 0) {
/* operands cancelled, resulting in a clean zero */
xxi = 0;
return __int_as_float(xxi);
}
/* normalize result */
yyi = xxi & 0x80000000;
do {
xxi = (xxi << 1) | (temp >> 31);
temp <<= 1;
expo_x--;
} while (!(xxi & 0x00800000));
xxi = xxi | yyi;
}
} else {
/* signs are the same, effective addition */
temp = 32 - expo_y;
temp = (expo_y) ? (yyi << temp) : 0;
xxi = xxi + (yyi >> expo_y);
if (!(xxi & 0x01000000)) {
if (expo_x <= 0xFD) {
xxi = xxi + (expo_x << 23);
xxi += (temp && !(xxi & 0x80000000));
return __int_as_float(xxi);
}
} else {
/* normalize result */
temp = (xxi << 31) | (temp >> 1);
xxi = ((xxi & 0x80000000) | (xxi >> 1)) & ~0x40000000;
expo_x++;
}
}
if (expo_x <= 0xFD) {
xxi += (temp && !(xxi & 0x80000000));
xxi = xxi + (expo_x << 23);
return __int_as_float(xxi);
}
if ((int)expo_x >= 254) {
/* overflow: return infinity or largest normal */
temp = xxi & 0x80000000;
xxi = (temp ? 0xff7fffff : 0x7F800000);
return __int_as_float(xxi);
}
/* underflow: zero or smallest normal */
yyi = xxi & 0x80000000;
xxi = xxi & ~0xff000000;
expo_x = (unsigned int)(-((int)expo_x));
xxi = (xxi >> expo_x);
if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0;
return __int_as_float(yyi | xxi);
} else {
return a + b;
}
}
__device_func__(float __fadd_rd (float a, float b))
{
unsigned int expo_x, expo_y;
unsigned int xxi, yyi, temp;
xxi = __float_as_int(a);
yyi = __float_as_int(b);
/* make bigger operand the augend */
expo_y = yyi << 1;
if (expo_y > (xxi << 1)) {
expo_y = xxi;
xxi = yyi;
yyi = expo_y;
}
temp = 0xff;
expo_x = temp & (xxi >> 23);
expo_x = expo_x - 1;
expo_y = temp & (yyi >> 23);
expo_y = expo_y - 1;
if ((expo_x <= 0xFD) &&
(expo_y <= 0xFD)) {
expo_y = expo_x - expo_y;
if (expo_y > 25) {
expo_y = 31;
}
temp = xxi ^ yyi;
xxi = xxi & ~0x7f000000;
xxi = xxi | 0x00800000;
yyi = yyi & ~0xff000000;
yyi = yyi | 0x00800000;
if ((int)temp < 0) {
/* signs differ, effective subtraction */
temp = 32 - expo_y;
temp = (expo_y) ? (yyi << temp) : 0;
temp = (unsigned int)(-((int)temp));
xxi = xxi - (yyi >> expo_y) - (temp ? 1 : 0);
if (xxi & 0x00800000) {
if (expo_x <= 0xFD) {
xxi = xxi & ~0x00800000; /* lop off integer bit */
xxi = (xxi + (expo_x << 23)) + 0x00800000;
xxi += (temp && (xxi & 0x80000000));
return __int_as_float(xxi);
}
} else {
if ((temp | (xxi << 1)) == 0) {
/* operands cancelled, resulting in a clean zero */
xxi = 0x80000000;
return __int_as_float(xxi);
}
/* normalize result */
yyi = xxi & 0x80000000;
do {
xxi = (xxi << 1) | (temp >> 31);
temp <<= 1;
expo_x--;
} while (!(xxi & 0x00800000));
xxi = xxi | yyi;
}
} else {
/* signs are the same, effective addition */
temp = 32 - expo_y;
temp = (expo_y) ? (yyi << temp) : 0;
xxi = xxi + (yyi >> expo_y);
if (!(xxi & 0x01000000)) {
if (expo_x <= 0xFD) {
expo_y = xxi & 1;
xxi = xxi + (expo_x << 23);
xxi += (temp && (xxi & 0x80000000));
return __int_as_float(xxi);
}
} else {
/* normalize result */
temp = (xxi << 31) | (temp >> 1);
xxi = ((xxi & 0x80000000) | (xxi >> 1)) & ~0x40000000;
expo_x++;
}
}
if (expo_x <= 0xFD) {
xxi += (temp && (xxi & 0x80000000));
xxi = xxi + (expo_x << 23);
return __int_as_float(xxi);
}
if ((int)expo_x >= 254) {
/* overflow: return infinity or largest normal */
temp = xxi & 0x80000000;
xxi = (temp ? 0xFF800000 : 0x7f7fffff);
return __int_as_float(xxi);
}
/* underflow: zero or smallest normal */
yyi = xxi & 0x80000000;
xxi = xxi & ~0xff000000;
expo_x = (unsigned int)(-((int)expo_x));
xxi = (xxi >> expo_x);
if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0;
return __int_as_float(yyi | xxi);
} else {
a = a + b;
xxi = xxi ^ yyi;
if ((a == 0.0f) && ((int)xxi < 0)) a = __int_as_float(0x80000000);
return a;
}
}
__device_func__(float __fmul_ru (float a, float b))
{
unsigned long long product;
unsigned int expo_x, expo_y;
unsigned int xxi, yyi;
xxi = __float_as_int(a);
yyi = __float_as_int(b);
expo_y = 0xFF;
expo_x = expo_y & (xxi >> 23);
expo_x = expo_x - 1;
expo_y = expo_y & (yyi >> 23);
expo_y = expo_y - 1;
if ((expo_x <= 0xFD) &&
(expo_y <= 0xFD)) {
expo_x = expo_x + expo_y;
expo_y = xxi ^ yyi;
xxi = xxi & 0x00ffffff;
yyi = yyi << 8;
xxi = xxi | 0x00800000;
yyi = yyi | 0x80000000;
/* compute product */
product = ((unsigned long long)xxi) * yyi;
expo_x = expo_x - 127 + 2;
expo_y = expo_y & 0x80000000;
xxi = (unsigned int)(product >> 32);
yyi = (unsigned int)(product & 0xffffffff);
/* normalize mantissa */
if (xxi < 0x00800000) {
xxi = (xxi << 1) | (yyi >> 31);
yyi = (yyi << 1);
expo_x--;
}
if (expo_x <= 0xFD) {
xxi = xxi | expo_y; /* OR in sign bit */
xxi = xxi + (expo_x << 23); /* add in exponent */
/* round result */
xxi += (yyi && !expo_y);
return __int_as_float(xxi);
} else if ((int)expo_x >= 254) {
/* overflow: return infinity or largest normal */
xxi = (expo_y ? 0xff7fffff : 0x7F800000);
return __int_as_float(xxi);
} else {
/* underflow: zero, or smallest normal */
expo_x = ((unsigned int)-((int)expo_x));
xxi += (yyi && !expo_y);
xxi = (xxi >> expo_x);
if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0;
return __int_as_float(expo_y | xxi);
}
} else {
return a * b;
}
}
__device_func__(float __fmul_rd (float a, float b))
{
unsigned long long product;
unsigned int expo_x, expo_y;
unsigned int xxi, yyi;
xxi = __float_as_int(a);
yyi = __float_as_int(b);
expo_y = 0xFF;
expo_x = expo_y & (xxi >> 23);
expo_x = expo_x - 1;
expo_y = expo_y & (yyi >> 23);
expo_y = expo_y - 1;
if ((expo_x <= 0xFD) &&
(expo_y <= 0xFD)) {
expo_x = expo_x + expo_y;
expo_y = xxi ^ yyi;
xxi = xxi & 0x00ffffff;
yyi = yyi << 8;
xxi = xxi | 0x00800000;
yyi = yyi | 0x80000000;
/* compute product */
product = ((unsigned long long)xxi) * yyi;
expo_x = expo_x - 127 + 2;
expo_y = expo_y & 0x80000000;
xxi = (unsigned int)(product >> 32);
yyi = (unsigned int)(product & 0xffffffff);
/* normalize mantissa */
if (xxi < 0x00800000) {
xxi = (xxi << 1) | (yyi >> 31);
yyi = (yyi << 1);
expo_x--;
}
if (expo_x <= 0xFD) {
xxi = xxi | expo_y; /* OR in sign bit */
xxi = xxi + (expo_x << 23); /* add in exponent */
/* round result */
xxi += (yyi && expo_y);
return __int_as_float(xxi);
} else if ((int)expo_x >= 254) {
/* overflow: return infinity or largest normal */
xxi = expo_y | (expo_y ?0x7F800000 : 0x7f7fffff);
return __int_as_float(xxi);
} else {
/* underflow: zero, or smallest normal */
expo_x = ((unsigned int)-((int)expo_x));
xxi += (yyi && expo_y);
xxi = (xxi >> expo_x);
if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0;
return __int_as_float(expo_y | xxi);
}
} else {
return a * b;
}
}
__device_func__(float __fmaf_rn (float a, float b, float c))
{
unsigned long long product;
unsigned int xx, yy, zz, ww;
unsigned int temp, s, u;
unsigned int expo_x, expo_y, expo_z;
xx = __float_as_int(a);
yy = __float_as_int(b);
zz = __float_as_int(c);
/* Match 'denormals are zero' behavior of the GPU */
if ((xx << 1) < 0x01000000) xx &= 0x80000000;
if ((yy << 1) < 0x01000000) yy &= 0x80000000;
if ((zz << 1) < 0x01000000) zz &= 0x80000000;
temp = 0xff;
expo_x = temp & (xx >> 23);
expo_x = expo_x - 1;
expo_y = temp & (yy >> 23);
expo_y = expo_y - 1;
expo_z = temp & (zz >> 23);
expo_z = expo_z - 1;
if (!((expo_x <= 0xFD) &&
(expo_y <= 0xFD) &&
(expo_z <= 0xFD))) {
/* fmad (nan, y, z) --> nan
fmad (x, nan, z) --> nan
fmad (x, y, nan) --> nan
*/
if ((yy << 1) > 0xff000000) {
return rsqrtf(b);
}
if ((zz << 1) > 0xff000000) {
return rsqrtf(c);
}
if ((xx << 1) > 0xff000000) {
return rsqrtf(a);
}
/* fmad (0, inf, z) --> NaN
fmad (inf, 0, z) --> NaN
fmad (-inf,+y,+inf) --> NaN
fmad (+x,-inf,+inf) --> NaN
fmad (+inf,-y,+inf) --> NaN
fmad (-x,+inf,+inf) --> NaN
fmad (-inf,-y,-inf) --> NaN
fmad (-x,-inf,-inf) --> NaN
fmad (+inf,+y,-inf) --> NaN
fmad (+x,+inf,-inf) --> NaN
*/
if ((((xx << 1) == 0) && ((yy << 1) == 0xff000000)) ||
(((yy << 1) == 0) && ((xx << 1) == 0xff000000))) {
return rsqrtf(__int_as_float(0xffc00000));
}
if ((zz << 1) == 0xff000000) {
if (((yy << 1) == 0xff000000) || ((xx << 1) == 0xff000000)) {
if ((int)(xx ^ yy ^ zz) < 0) {
return rsqrtf(__int_as_float(0xffc00000));
}
}
}
/* fmad (inf, y, z) --> inf
fmad (x, inf, z) --> inf
fmad (x, y, inf) --> inf
*/
if ((xx << 1) == 0xff000000) {
xx = xx ^ (yy & 0x80000000);
return __int_as_float(xx);
}
if ((yy << 1) == 0xff000000) {
yy = yy ^ (xx & 0x80000000);
return __int_as_float(yy);
}
if ((zz << 1) == 0xff000000) {
return __int_as_float(zz);
}
/* fmad (+0, -y, -0) --> -0
fmad (-0, +y, -0) --> -0
fmad (+x, -0, -0) --> -0
fmad (-x, +0, -0) --> -0
*/
if (zz == 0x80000000) {
if (((xx << 1) == 0) || ((yy << 1) == 0)) {
if ((int)(xx ^ yy) < 0) {
return __int_as_float(zz);
}
}
}
/* fmad (0, y, 0) --> +0
fmad (x, 0, 0) --> +0
*/
if (((zz << 1) == 0) &&
(((xx << 1) == 0) || ((yy << 1) == 0))) {
zz &= 0x7fffffff;
return __int_as_float(zz);
}
/* fmad (0, y, z) --> z
fmad (x, 0, z) --> z
*/
if (((xx << 1) == 0) || ((yy << 1) == 0)) {
return __int_as_float(zz);
}
/* normalize x, if denormal */
if (expo_x == (unsigned)-1) {
temp = xx & 0x80000000;
xx = xx << 8;
while (!(xx & 0x80000000)) {
xx <<= 1;
expo_x--;
}
expo_x++;
xx = (xx >> 8) | temp;
}
/* normalize y, if denormal */
if (expo_y == (unsigned)-1) {
temp = yy & 0x80000000;
yy = yy << 8;
while (!(yy & 0x80000000)) {
yy <<= 1;
expo_y--;
}
expo_y++;
yy = (yy >> 8) | temp;
}
/* normalize z, if denormal */
if ((expo_z == (unsigned)-1) && ((zz << 1) != 0)) {
temp = zz & 0x80000000;
zz = zz << 8;
while (!(zz & 0x80000000)) {
zz <<= 1;
expo_z--;
}
expo_z++;
zz = (zz >> 8) | temp;
}
}
expo_x = expo_x + expo_y;
expo_y = xx ^ yy;
xx = xx & 0x00ffffff;
yy = yy << 8;
xx = xx | 0x00800000;
yy = yy | 0x80000000;
product = ((unsigned long long)xx) * yy;
xx = (unsigned)(product >> 32);
yy = (unsigned)(product & 0xffffffff);
expo_x = expo_x - 127 + 2;
expo_y = expo_y & 0x80000000;
/* normalize mantissa */
if (xx < 0x00800000) {
xx = (xx << 1) | (yy >> 31);
yy = (yy << 1);
expo_x--;
}
temp = 0;
if ((zz << 1) != 0) { /* z is not zero */
s = zz & 0x80000000;
zz &= 0x00ffffff;
zz |= 0x00800000;
ww = 0;
/* compare and swap. put augend into xx:yy */
if ((int)expo_z > (int)expo_x) {
temp = expo_z;
expo_z = expo_x;
expo_x = temp;
temp = zz;
zz = xx;
xx = temp;
temp = ww;
ww = yy;
yy = temp;
temp = expo_y;
expo_y = s;
s = temp;
}
/* augend_sign = expo_y, augend_mant = xx:yy, augend_expo = expo_x */
/* addend_sign = s, addend_mant = zz:ww, addend_expo = expo_z */
expo_z = expo_x - expo_z;
u = expo_y ^ s;
if (expo_z <= 49) {
/* denormalize addend */
temp = 0;
while (expo_z >= 32) {
temp = ww | (temp != 0);
ww = zz;
zz = 0;
expo_z -= 32;
}
if (expo_z) {
temp = ((temp >> expo_z) | (ww << (32 - expo_z)) |
((temp << (32 - expo_z)) != 0));
ww = (ww >> expo_z) | (zz << (32 - expo_z));
zz = (zz >> expo_z);
}
} else {
temp = 1;
ww = 0;
zz = 0;
}
if ((int)u < 0) {
/* signs differ, effective subtraction */
temp = (unsigned)(-(int)temp);
s = (temp != 0);
u = yy - s;
s = u > yy;
yy = u - ww;
s += yy > u;
xx = (xx - zz) - s;
if (!(xx | yy | temp)) {
/* complete cancellation, return 0 */
return __int_as_float(xx);
}
if ((int)xx < 0) {
/* oops, the augend had the smaller mantissa; negate the mantissa and
flip the sign of the result */
temp = ~temp;
yy = ~yy;
xx = ~xx;
if (++temp == 0) {
if (++yy == 0) {
++xx;
}
}
expo_y ^= 0x80000000;
}
/* normalize mantissa, if necessary */
while (!(xx & 0x00800000)) {
xx = (xx << 1) | (yy >> 31);
yy = (yy << 1);
expo_x--;
}
} else {
/* signs are the same, effective addition */
yy = yy + ww;
s = yy < ww;
xx = xx + zz + s;
if (xx & 0x01000000) {
temp = temp | (yy << 31);
yy = (yy >> 1) | (xx << 31);
xx = ((xx & 0x80000000) | (xx >> 1)) & ~0x40000000;
expo_x++;
}
}
}
temp = yy | (temp != 0);
if (expo_x <= 0xFD) {
/* normal */
xx |= expo_y; /* or in sign bit */
s = xx & 1; /* mantissa lsb */
xx += (temp == 0x80000000) ? s : (temp >> 31);
xx = xx + (expo_x << 23); /* add in exponent */
return __int_as_float(xx);
} else if ((int)expo_x >= 126) {
/* overflow */
xx = expo_y | 0x7f800000;
return __int_as_float(xx);
}
/* subnormal */
expo_x = (unsigned int)(-(int)expo_x);
/* Match 'flush to zero' response of the GPU */
xx += (temp >= 0x80000000);
if (xx >= 0x01000000) {
xx = xx >> 1;
expo_x--;
}
if (expo_x > 0) xx = 0;
xx = expo_y | xx;
return __int_as_float(xx);
}
__device_func__(float __fmaf_rz (float a, float b, float c))
{
unsigned long long product;
unsigned int xx, yy, zz, ww;
unsigned int temp, s, u;
unsigned int expo_x, expo_y, expo_z;
xx = __float_as_int(a);
yy = __float_as_int(b);
zz = __float_as_int(c);
/* Match 'denormals are zero' behavior of the GPU */
if ((xx << 1) < 0x01000000) xx &= 0x80000000;
if ((yy << 1) < 0x01000000) yy &= 0x80000000;
if ((zz << 1) < 0x01000000) zz &= 0x80000000;
temp = 0xff;
expo_x = temp & (xx >> 23);
expo_x = expo_x - 1;
expo_y = temp & (yy >> 23);
expo_y = expo_y - 1;
expo_z = temp & (zz >> 23);
expo_z = expo_z - 1;
if (!((expo_x <= 0xFD) &&
(expo_y <= 0xFD) &&
(expo_z <= 0xFD))) {
/* fmad (nan, y, z) --> nan
fmad (x, nan, z) --> nan
fmad (x, y, nan) --> nan
*/
if ((yy << 1) > 0xff000000) {
return rsqrtf(b);
}
if ((zz << 1) > 0xff000000) {
return rsqrtf(c);
}
if ((xx << 1) > 0xff000000) {
return rsqrtf(a);
}
/* fmad (0, inf, z) --> NaN
fmad (inf, 0, z) --> NaN
fmad (-inf,+y,+inf) --> NaN
fmad (+x,-inf,+inf) --> NaN
fmad (+inf,-y,+inf) --> NaN
fmad (-x,+inf,+inf) --> NaN
fmad (-inf,-y,-inf) --> NaN
fmad (-x,-inf,-inf) --> NaN
fmad (+inf,+y,-inf) --> NaN
fmad (+x,+inf,-inf) --> NaN
*/
if ((((xx << 1) == 0) && ((yy << 1) == 0xff000000)) ||
(((yy << 1) == 0) && ((xx << 1) == 0xff000000))) {
return rsqrtf(__int_as_float(0xffc00000));
}
if ((zz << 1) == 0xff000000) {
if (((yy << 1) == 0xff000000) || ((xx << 1) == 0xff000000)) {
if ((int)(xx ^ yy ^ zz) < 0) {
return rsqrtf(__int_as_float(0xffc00000));
}
}
}
/* fmad (inf, y, z) --> inf
fmad (x, inf, z) --> inf
fmad (x, y, inf) --> inf
*/
if ((xx << 1) == 0xff000000) {
xx = xx ^ (yy & 0x80000000);
return __int_as_float(xx);
}
if ((yy << 1) == 0xff000000) {
yy = yy ^ (xx & 0x80000000);
return __int_as_float(yy);
}
if ((zz << 1) == 0xff000000) {
return __int_as_float(zz);
}
/* fmad (+0, -y, -0) --> -0
fmad (-0, +y, -0) --> -0
fmad (+x, -0, -0) --> -0
fmad (-x, +0, -0) --> -0
*/
if (zz == 0x80000000) {
if (((xx << 1) == 0) || ((yy << 1) == 0)) {
if ((int)(xx ^ yy) < 0) {
return __int_as_float(zz);
}
}
}
/* fmad (0, y, 0) --> +0
fmad (x, 0, 0) --> +0
*/
if (((zz << 1) == 0) &&
(((xx << 1) == 0) || ((yy << 1) == 0))) {
zz &= 0x7fffffff;
return __int_as_float(zz);
}
/* fmad (0, y, z) --> z
fmad (x, 0, z) --> z
*/
if (((xx << 1) == 0) || ((yy << 1) == 0)) {
return __int_as_float(zz);
}
/* normalize x, if denormal */
if (expo_x == (unsigned)-1) {
temp = xx & 0x80000000;
xx = xx << 8;
while (!(xx & 0x80000000)) {
xx <<= 1;
expo_x--;
}
expo_x++;
xx = (xx >> 8) | temp;
}
/* normalize y, if denormal */
if (expo_y == (unsigned)-1) {
temp = yy & 0x80000000;
yy = yy << 8;
while (!(yy & 0x80000000)) {
yy <<= 1;
expo_y--;
}
expo_y++;
yy = (yy >> 8) | temp;
}
/* normalize z, if denormal */
if ((expo_z == (unsigned)-1) && ((zz << 1) != 0)) {
temp = zz & 0x80000000;
zz = zz << 8;
while (!(zz & 0x80000000)) {
zz <<= 1;
expo_z--;
}
expo_z++;
zz = (zz >> 8) | temp;
}
}
expo_x = expo_x + expo_y;
expo_y = xx ^ yy;
xx = xx & 0x00ffffff;
yy = yy << 8;
xx = xx | 0x00800000;
yy = yy | 0x80000000;
product = ((unsigned long long)xx) * yy;
xx = (unsigned)(product >> 32);
yy = (unsigned)(product & 0xffffffff);
expo_x = expo_x - 127 + 2;
expo_y = expo_y & 0x80000000;
/* normalize mantissa */
if (xx < 0x00800000) {
xx = (xx << 1) | (yy >> 31);
yy = (yy << 1);
expo_x--;
}
temp = 0;
if ((zz << 1) != 0) { /* z is not zero */
s = zz & 0x80000000;
zz &= 0x00ffffff;
zz |= 0x00800000;
ww = 0;
/* compare and swap. put augend into xx:yy */
if ((int)expo_z > (int)expo_x) {
temp = expo_z;
expo_z = expo_x;
expo_x = temp;
temp = zz;
zz = xx;
xx = temp;
temp = ww;
ww = yy;
yy = temp;
temp = expo_y;
expo_y = s;
s = temp;
}
/* augend_sign = expo_y, augend_mant = xx:yy, augend_expo = expo_x */
/* addend_sign = s, addend_mant = zz:ww, addend_expo = expo_z */
expo_z = expo_x - expo_z;
u = expo_y ^ s;
if (expo_z <= 49) {
/* denormalize addend */
temp = 0;
while (expo_z >= 32) {
temp = ww | (temp != 0);
ww = zz;
zz = 0;
expo_z -= 32;
}
if (expo_z) {
temp = ((temp >> expo_z) | (ww << (32 - expo_z)) |
((temp << (32 - expo_z)) != 0));
ww = (ww >> expo_z) | (zz << (32 - expo_z));
zz = (zz >> expo_z);
}
} else {
temp = 1;
ww = 0;
zz = 0;
}
if ((int)u < 0) {
/* signs differ, effective subtraction */
temp = (unsigned)(-(int)temp);
s = (temp != 0);
u = yy - s;
s = u > yy;
yy = u - ww;
s += yy > u;
xx = (xx - zz) - s;
if (!(xx | yy | temp)) {
/* complete cancellation, return 0 */
return __int_as_float(xx);
}
if ((int)xx < 0) {
/* oops, the augend had the smaller mantissa; negate the mantissa and
flip the sign of the result */
temp = ~temp;
yy = ~yy;
xx = ~xx;
if (++temp == 0) {
if (++yy == 0) {
++xx;
}
}
expo_y ^= 0x80000000;
}
/* normalize mantissa, if necessary */
while (!(xx & 0x00800000)) {
xx = (xx << 1) | (yy >> 31);
yy = (yy << 1);
expo_x--;
}
} else {
/* signs are the same, effective addition */
yy = yy + ww;
s = yy < ww;
xx = xx + zz + s;
if (xx & 0x01000000) {
temp = temp | (yy << 31);
yy = (yy >> 1) | (xx << 31);
xx = ((xx & 0x80000000) | (xx >> 1)) & ~0x40000000;
expo_x++;
}
}
}
temp = yy | (temp != 0);
if (expo_x <= 0xFD) {
/* normal */
xx |= expo_y; /* or in sign bit */
xx = xx + (expo_x << 23); /* add in exponent */
return __int_as_float(xx);
} else if ((int)expo_x >= 126) {
/* overflow */
xx = expo_y | 0x7f7fffff;
return __int_as_float(xx);
}
/* subnormal */
return __int_as_float(expo_y);
}
__device_func__(float __fmaf_ru (float a, float b, float c))
{
unsigned long long product;
unsigned int xx, yy, zz, ww;
unsigned int temp, s, u;
unsigned int expo_x, expo_y, expo_z;
xx = __float_as_int(a);
yy = __float_as_int(b);
zz = __float_as_int(c);
/* Match 'denormals are zero' behavior of the GPU */
if ((xx << 1) < 0x01000000) xx &= 0x80000000;
if ((yy << 1) < 0x01000000) yy &= 0x80000000;
if ((zz << 1) < 0x01000000) zz &= 0x80000000;
temp = 0xff;
expo_x = temp & (xx >> 23);
expo_x = expo_x - 1;
expo_y = temp & (yy >> 23);
expo_y = expo_y - 1;
expo_z = temp & (zz >> 23);
expo_z = expo_z - 1;
if (!((expo_x <= 0xFD) &&
(expo_y <= 0xFD) &&
(expo_z <= 0xFD))) {
/* fmad (nan, y, z) --> nan
fmad (x, nan, z) --> nan
fmad (x, y, nan) --> nan
*/
if ((yy << 1) > 0xff000000) {
return rsqrtf(b);
}
if ((zz << 1) > 0xff000000) {
return rsqrtf(c);
}
if ((xx << 1) > 0xff000000) {
return rsqrtf(a);
}
/* fmad (0, inf, z) --> NaN
fmad (inf, 0, z) --> NaN
fmad (-inf,+y,+inf) --> NaN
fmad (+x,-inf,+inf) --> NaN
fmad (+inf,-y,+inf) --> NaN
fmad (-x,+inf,+inf) --> NaN
fmad (-inf,-y,-inf) --> NaN
fmad (-x,-inf,-inf) --> NaN
fmad (+inf,+y,-inf) --> NaN
fmad (+x,+inf,-inf) --> NaN
*/
if ((((xx << 1) == 0) && ((yy << 1) == 0xff000000)) ||
(((yy << 1) == 0) && ((xx << 1) == 0xff000000))) {
return rsqrtf(__int_as_float(0xffc00000));
}
if ((zz << 1) == 0xff000000) {
if (((yy << 1) == 0xff000000) || ((xx << 1) == 0xff000000)) {
if ((int)(xx ^ yy ^ zz) < 0) {
return rsqrtf(__int_as_float(0xffc00000));
}
}
}
/* fmad (inf, y, z) --> inf
fmad (x, inf, z) --> inf
fmad (x, y, inf) --> inf
*/
if ((xx << 1) == 0xff000000) {
xx = xx ^ (yy & 0x80000000);
return __int_as_float(xx);
}
if ((yy << 1) == 0xff000000) {
yy = yy ^ (xx & 0x80000000);
return __int_as_float(yy);
}
if ((zz << 1) == 0xff000000) {
return __int_as_float(zz);
}
/* fmad (+0, -y, -0) --> -0
fmad (-0, +y, -0) --> -0
fmad (+x, -0, -0) --> -0
fmad (-x, +0, -0) --> -0
*/
if (zz == 0x80000000) {
if (((xx << 1) == 0) || ((yy << 1) == 0)) {
if ((int)(xx ^ yy) < 0) {
return __int_as_float(zz);
}
}
}
/* fmad (0, y, 0) --> +0
fmad (x, 0, 0) --> +0
*/
if (((zz << 1) == 0) &&
(((xx << 1) == 0) || ((yy << 1) == 0))) {
zz &= 0x7fffffff;
return __int_as_float(zz);
}
/* fmad (0, y, z) --> z
fmad (x, 0, z) --> z
*/
if (((xx << 1) == 0) || ((yy << 1) == 0)) {
return __int_as_float(zz);
}
/* normalize x, if denormal */
if (expo_x == (unsigned)-1) {
temp = xx & 0x80000000;
xx = xx << 8;
while (!(xx & 0x80000000)) {
xx <<= 1;
expo_x--;
}
expo_x++;
xx = (xx >> 8) | temp;
}
/* normalize y, if denormal */
if (expo_y == (unsigned)-1) {
temp = yy & 0x80000000;
yy = yy << 8;
while (!(yy & 0x80000000)) {
yy <<= 1;
expo_y--;
}
expo_y++;
yy = (yy >> 8) | temp;
}
/* normalize z, if denormal */
if ((expo_z == (unsigned)-1) && ((zz << 1) != 0)) {
temp = zz & 0x80000000;
zz = zz << 8;
while (!(zz & 0x80000000)) {
zz <<= 1;
expo_z--;
}
expo_z++;
zz = (zz >> 8) | temp;
}
}
expo_x = expo_x + expo_y;
expo_y = xx ^ yy;
xx = xx & 0x00ffffff;
yy = yy << 8;
xx = xx | 0x00800000;
yy = yy | 0x80000000;
product = ((unsigned long long)xx) * yy;
xx = (unsigned)(product >> 32);
yy = (unsigned)(product & 0xffffffff);
expo_x = expo_x - 127 + 2;
expo_y = expo_y & 0x80000000;
/* normalize mantissa */
if (xx < 0x00800000) {
xx = (xx << 1) | (yy >> 31);
yy = (yy << 1);
expo_x--;
}
temp = 0;
if ((zz << 1) != 0) { /* z is not zero */
s = zz & 0x80000000;
zz &= 0x00ffffff;
zz |= 0x00800000;
ww = 0;
/* compare and swap. put augend into xx:yy */
if ((int)expo_z > (int)expo_x) {
temp = expo_z;
expo_z = expo_x;
expo_x = temp;
temp = zz;
zz = xx;
xx = temp;
temp = ww;
ww = yy;
yy = temp;
temp = expo_y;
expo_y = s;
s = temp;
}
/* augend_sign = expo_y, augend_mant = xx:yy, augend_expo = expo_x */
/* addend_sign = s, addend_mant = zz:ww, addend_expo = expo_z */
expo_z = expo_x - expo_z;
u = expo_y ^ s;
if (expo_z <= 49) {
/* denormalize addend */
temp = 0;
while (expo_z >= 32) {
temp = ww | (temp != 0);
ww = zz;
zz = 0;
expo_z -= 32;
}
if (expo_z) {
temp = ((temp >> expo_z) | (ww << (32 - expo_z)) |
((temp << (32 - expo_z)) != 0));
ww = (ww >> expo_z) | (zz << (32 - expo_z));
zz = (zz >> expo_z);
}
} else {
temp = 1;
ww = 0;
zz = 0;
}
if ((int)u < 0) {
/* signs differ, effective subtraction */
temp = (unsigned)(-(int)temp);
s = (temp != 0);
u = yy - s;
s = u > yy;
yy = u - ww;
s += yy > u;
xx = (xx - zz) - s;
if (!(xx | yy | temp)) {
/* complete cancellation, return 0 */
return __int_as_float(xx);
}
if ((int)xx < 0) {
/* oops, the augend had the smaller mantissa; negate the mantissa and
flip the sign of the result */
temp = ~temp;
yy = ~yy;
xx = ~xx;
if (++temp == 0) {
if (++yy == 0) {
++xx;
}
}
expo_y ^= 0x80000000;
}
/* normalize mantissa, if necessary */
while (!(xx & 0x00800000)) {
xx = (xx << 1) | (yy >> 31);
yy = (yy << 1);
expo_x--;
}
} else {
/* signs are the same, effective addition */
yy = yy + ww;
s = yy < ww;
xx = xx + zz + s;
if (xx & 0x01000000) {
temp = temp | (yy << 31);
yy = (yy >> 1) | (xx << 31);
xx = ((xx & 0x80000000) | (xx >> 1)) & ~0x40000000;
expo_x++;
}
}
}
temp = yy | (temp != 0);
if (expo_x <= 0xFD) {
/* normal */
xx |= expo_y; /* or in sign bit */
xx += (temp && !expo_y); /* round result */
xx = xx + (expo_x << 23); /* add in exponent */
return __int_as_float(xx);
} else if ((int)expo_x >= 126) {
/* overflow */
xx = expo_y | (expo_y ? 0x7f7fffff : 0x7F800000);
return __int_as_float(xx);
}
/* subnormal */
expo_x = ((unsigned int)-((int)expo_x));
xx += (temp && !expo_y);
xx = (xx >> expo_x);
if ((expo_x > 25) || (xx != 0x00800000)) xx = 0;
return __int_as_float(expo_y | xx);
}
__device_func__(float __fmaf_rd (float a, float b, float c))
{
unsigned long long product;
unsigned int xx, yy, zz, ww;
unsigned int temp, s, u;
unsigned int expo_x, expo_y, expo_z;
xx = __float_as_int(a);
yy = __float_as_int(b);
zz = __float_as_int(c);
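/* Software FMA in round-toward-minus-infinity mode: the 24x24-bit mantissa
   product is computed exactly in a 64-bit integer, the addend is aligned
   with a sticky bit, and directed rounding is applied at the end. */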
/* Match 'denormals are zero' behavior of the GPU */
if ((xx << 1) < 0x01000000) xx &= 0x80000000;
if ((yy << 1) < 0x01000000) yy &= 0x80000000;
if ((zz << 1) < 0x01000000) zz &= 0x80000000;
temp = 0xff;
expo_x = temp & (xx >> 23);
expo_x = expo_x - 1;
expo_y = temp & (yy >> 23);
expo_y = expo_y - 1;
expo_z = temp & (zz >> 23);
expo_z = expo_z - 1;
if (!((expo_x <= 0xFD) &&
(expo_y <= 0xFD) &&
(expo_z <= 0xFD))) {
/* fmad (nan, y, z) --> nan
fmad (x, nan, z) --> nan
fmad (x, y, nan) --> nan
*/
if ((yy << 1) > 0xff000000) {
return rsqrtf(b);
}
if ((zz << 1) > 0xff000000) {
return rsqrtf(c);
}
if ((xx << 1) > 0xff000000) {
return rsqrtf(a);
}
/* fmad (0, inf, z) --> NaN
fmad (inf, 0, z) --> NaN
fmad (-inf,+y,+inf) --> NaN
fmad (+x,-inf,+inf) --> NaN
fmad (+inf,-y,+inf) --> NaN
fmad (-x,+inf,+inf) --> NaN
fmad (-inf,-y,-inf) --> NaN
fmad (-x,-inf,-inf) --> NaN
fmad (+inf,+y,-inf) --> NaN
fmad (+x,+inf,-inf) --> NaN
*/
if ((((xx << 1) == 0) && ((yy << 1) == 0xff000000)) ||
(((yy << 1) == 0) && ((xx << 1) == 0xff000000))) {
return rsqrtf(__int_as_float(0xffc00000));
}
if ((zz << 1) == 0xff000000) {
if (((yy << 1) == 0xff000000) || ((xx << 1) == 0xff000000)) {
if ((int)(xx ^ yy ^ zz) < 0) {
return rsqrtf(__int_as_float(0xffc00000));
}
}
}
/* fmad (inf, y, z) --> inf
fmad (x, inf, z) --> inf
fmad (x, y, inf) --> inf
*/
if ((xx << 1) == 0xff000000) {
xx = xx ^ (yy & 0x80000000);
return __int_as_float(xx);
}
if ((yy << 1) == 0xff000000) {
yy = yy ^ (xx & 0x80000000);
return __int_as_float(yy);
}
if ((zz << 1) == 0xff000000) {
return __int_as_float(zz);
}
/* fmad (+0, -y, -0) --> -0
fmad (-0, +y, -0) --> -0
fmad (+x, -0, -0) --> -0
fmad (-x, +0, -0) --> -0
*/
if (zz == 0x80000000) {
if (((xx << 1) == 0) || ((yy << 1) == 0)) {
if ((int)(xx ^ yy) < 0) {
return __int_as_float(zz);
}
}
}
/* fmad (0, y, 0) --> +0
fmad (x, 0, 0) --> +0
*/
if (((zz << 1) == 0) &&
(((xx << 1) == 0) || ((yy << 1) == 0))) {
zz = (xx ^ yy ^ zz) & 0x80000000;
return __int_as_float(zz);
}
/* fmad (0, y, z) --> z
fmad (x, 0, z) --> z
*/
if (((xx << 1) == 0) || ((yy << 1) == 0)) {
return __int_as_float(zz);
}
/* normalize x, if denormal */
if (expo_x == (unsigned)-1) {
temp = xx & 0x80000000;
xx = xx << 8;
while (!(xx & 0x80000000)) {
xx <<= 1;
expo_x--;
}
expo_x++;
xx = (xx >> 8) | temp;
}
/* normalize y, if denormal */
if (expo_y == (unsigned)-1) {
temp = yy & 0x80000000;
yy = yy << 8;
while (!(yy & 0x80000000)) {
yy <<= 1;
expo_y--;
}
expo_y++;
yy = (yy >> 8) | temp;
}
/* normalize z, if denormal */
if ((expo_z == (unsigned)-1) && ((zz << 1) != 0)) {
temp = zz & 0x80000000;
zz = zz << 8;
while (!(zz & 0x80000000)) {
zz <<= 1;
expo_z--;
}
expo_z++;
zz = (zz >> 8) | temp;
}
}
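/* x and y are nonzero finite numbers here (z may still be +/-0): form the
   exact mantissa product in xx:yy; expo_x accumulates the result exponent,
   expo_y receives the sign of the product */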
expo_x = expo_x + expo_y;
expo_y = xx ^ yy;
xx = xx & 0x00ffffff;
yy = yy << 8;
xx = xx | 0x00800000;
yy = yy | 0x80000000;
product = ((unsigned long long)xx) * yy;
xx = (unsigned)(product >> 32);
yy = (unsigned)(product & 0xffffffff);
expo_x = expo_x - 127 + 2;
expo_y = expo_y & 0x80000000;
/* normalize mantissa */
if (xx < 0x00800000) {
xx = (xx << 1) | (yy >> 31);
yy = (yy << 1);
expo_x--;
}
temp = 0;
if ((zz << 1) != 0) { /* z is not zero */
s = zz & 0x80000000;
zz &= 0x00ffffff;
zz |= 0x00800000;
ww = 0;
/* compare and swap. put augend into xx:yy */
if ((int)expo_z > (int)expo_x) {
temp = expo_z;
expo_z = expo_x;
expo_x = temp;
temp = zz;
zz = xx;
xx = temp;
temp = ww;
ww = yy;
yy = temp;
temp = expo_y;
expo_y = s;
s = temp;
}
/* augend_sign = expo_y, augend_mant = xx:yy, augend_expo = expo_x */
/* addend_sign = s, addend_mant = zz:ww, addend_expo = expo_z */
expo_z = expo_x - expo_z;
u = expo_y ^ s;
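/* expo_z now holds the alignment shift between product and addend; bits
   shifted out below accumulate in the sticky word 'temp', and the sign bit
   of u indicates an effective subtraction */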
if (expo_z <= 49) {
/* denormalize addend */
temp = 0;
while (expo_z >= 32) {
temp = ww | (temp != 0);
ww = zz;
zz = 0;
expo_z -= 32;
}
if (expo_z) {
temp = ((temp >> expo_z) | (ww << (32 - expo_z)) |
((temp << (32 - expo_z)) != 0));
ww = (ww >> expo_z) | (zz << (32 - expo_z));
zz = (zz >> expo_z);
}
} else {
temp = 1;
ww = 0;
zz = 0;
}
if ((int)u < 0) {
/* signs differ, effective subtraction */
temp = (unsigned)(-(int)temp);
s = (temp != 0);
u = yy - s;
s = u > yy;
yy = u - ww;
s += yy > u;
xx = (xx - zz) - s;
if (!(xx | yy | temp)) {
/* complete cancellation, return -0 */
return __int_as_float(0x80000000);
}
if ((int)xx < 0) {
/* oops, augend had smaller mantissa. Negate mantissa and flip
   sign of result */
temp = ~temp;
yy = ~yy;
xx = ~xx;
if (++temp == 0) {
if (++yy == 0) {
++xx;
}
}
expo_y ^= 0x80000000;
}
/* normalize mantissa, if necessary */
while (!(xx & 0x00800000)) {
xx = (xx << 1) | (yy >> 31);
yy = (yy << 1);
expo_x--;
}
} else {
/* signs are the same, effective addition */
yy = yy + ww;
s = yy < ww;
xx = xx + zz + s;
if (xx & 0x01000000) {
temp = temp | (yy << 31);
yy = (yy >> 1) | (xx << 31);
xx = ((xx & 0x80000000) | (xx >> 1)) & ~0x40000000;
expo_x++;
}
}
}
temp = yy | (temp != 0);
if (expo_x <= 0xFD) {
/* normal */
xx |= expo_y; /* or in sign bit */
xx += (temp && expo_y); /* round result */
xx = xx + (expo_x << 23); /* add in exponent */
return __int_as_float(xx);
} else if ((int)expo_x >= 126) {
/* overflow */
xx = expo_y | (expo_y ? 0x7f800000 : 0x7F7FFFFF);
return __int_as_float(xx);
}
/* subnormal */
expo_x = ((unsigned int)-((int)expo_x));
xx += (temp && expo_y);
xx = (xx >> expo_x);
if ((expo_x > 25) || (xx != 0x00800000)) xx = 0;
return __int_as_float(expo_y | xx);
}
#else /* defined(__CUDABE__) */
#include "common_types.h"
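/* 128 8-bit reciprocal estimates indexed by the leading mantissa bits of a
   normalized divisor in [1, 2); seeds the Newton-Raphson refinements in the
   reciprocal and division kernels below */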
static __device__ const unsigned char __internal_rcpTab[128] =
{
0xff, 0xfd, 0xfb, 0xf9, 0xf7, 0xf5, 0xf4, 0xf2,
0xf0, 0xee, 0xed, 0xeb, 0xe9, 0xe8, 0xe6, 0xe4,
0xe3, 0xe1, 0xe0, 0xde, 0xdd, 0xdb, 0xda, 0xd8,
0xd7, 0xd5, 0xd4, 0xd3, 0xd1, 0xd0, 0xcf, 0xcd,
0xcc, 0xcb, 0xca, 0xc8, 0xc7, 0xc6, 0xc5, 0xc4,
0xc2, 0xc1, 0xc0, 0xbf, 0xbe, 0xbd, 0xbc, 0xbb,
0xba, 0xb9, 0xb8, 0xb7, 0xb6, 0xb5, 0xb4, 0xb3,
0xb2, 0xb1, 0xb0, 0xaf, 0xae, 0xad, 0xac, 0xab,
0xaa, 0xa9, 0xa8, 0xa8, 0xa7, 0xa6, 0xa5, 0xa4,
0xa3, 0xa3, 0xa2, 0xa1, 0xa0, 0x9f, 0x9f, 0x9e,
0x9d, 0x9c, 0x9c, 0x9b, 0x9a, 0x99, 0x99, 0x98,
0x97, 0x97, 0x96, 0x95, 0x95, 0x94, 0x93, 0x93,
0x92, 0x91, 0x91, 0x90, 0x8f, 0x8f, 0x8e, 0x8e,
0x8d, 0x8c, 0x8c, 0x8b, 0x8b, 0x8a, 0x89, 0x89,
0x88, 0x88, 0x87, 0x87, 0x86, 0x85, 0x85, 0x84,
0x84, 0x83, 0x83, 0x82, 0x82, 0x81, 0x81, 0x80
};
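/* 96 packed start values for the reciprocal square root iteration, indexed
   by the leading bits of the mantissa after the exponent-parity adjustment */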
static __device__ const unsigned int __internal_invSqrtCubeTab[96] =
{
0xfa0bf8fe, 0xee6b28fa, 0xe5f024f7, 0xdaf268f3,
0xd2f000f0, 0xc890c0ec, 0xc10378e9, 0xb9a758e6,
0xb4da40e4, 0xadcea0e1, 0xa6f278de, 0xa279c0dc,
0x9beb48d9, 0x97a5c4d7, 0x916340d4, 0x8d4fc8d2,
0x895000d0, 0x8563b8ce, 0x818ac0cc, 0x7dc4e8ca,
0x7a1200c8, 0x7671d8c6, 0x72e440c4, 0x6f6908c2,
0x6db240c1, 0x6a523cbf, 0x670424bd, 0x6563c0bc,
0x623028ba, 0x609ce8b9, 0x5d8364b7, 0x5bfd18b6,
0x58fd40b4, 0x5783a8b3, 0x560e48b2, 0x533000b0,
0x51c70caf, 0x506238ae, 0x4da4c0ac, 0x4c4c10ab,
0x4af768aa, 0x49a6b8a9, 0x485a00a8, 0x471134a7,
0x45cc58a6, 0x434e40a4, 0x4214f8a3, 0x40df88a2,
0x3fade0a1, 0x3e8000a0, 0x3d55dc9f, 0x3c2f789e,
0x3c2f789e, 0x3b0cc49d, 0x39edc09c, 0x38d2609b,
0x37baa89a, 0x36a68899, 0x35960098, 0x34890497,
0x34890497, 0x337f9896, 0x3279ac95, 0x31774094,
0x30784893, 0x30784893, 0x2f7cc892, 0x2e84b091,
0x2d900090, 0x2d900090, 0x2c9eac8f, 0x2bb0b88e,
0x2bb0b88e, 0x2ac6148d, 0x29dec08c, 0x29dec08c,
0x28fab08b, 0x2819e88a, 0x2819e88a, 0x273c5889,
0x273c5889, 0x26620088, 0x258ad487, 0x258ad487,
0x24b6d886, 0x24b6d886, 0x23e5fc85, 0x23184084,
0x23184084, 0x224d9883, 0x224d9883, 0x21860882,
0x21860882, 0x20c18081, 0x20c18081, 0x20000080
};
__device_func__(float __internal_frcp_kernel (float x, enum cudaRoundMode mode))
{
unsigned long long prod;
volatile union __cudart_FloatUintCvt arg;
unsigned int expo;
unsigned int sign;
unsigned f, y;
arg.f = x;
sign = arg.i & 0x80000000;
expo = (arg.i >> 23);
expo = expo & 0xff;
f = expo - 1;
if (f <= 0xFD) {
y = (arg.i << 8);
y = y | 0x80000000;
/* initial approximation */
arg.i = __internal_rcpTab[(y >> 24) - 128];
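/* the table seed is accurate to about 8 bits; each Newton-Raphson step
   below roughly doubles the number of correct bits: r' = r * (2 - y * r) */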
/* first NR iteration */
f = arg.i * arg.i;
f = f << 16;
prod = ((unsigned long long)y) * f;
arg.i = (arg.i << 24) - (unsigned)(prod >> 32);
/* second NR iteration */
f = arg.i + arg.i;
prod = ((unsigned long long)y) * f;
f = (unsigned)(-(int)(prod >> 32));
prod = ((unsigned long long)arg.i) * f;
y = y >> 8;
/* compute exponent */
expo = (2 * 127) - expo - 2;
arg.i = (unsigned)(prod >> 32);
if (mode == cudaRoundNearest) {
arg.i = arg.i >> 6;
} else {
arg.i = (arg.i + 32) >> 6;
}
if ((int)expo >= 0) {
f = y * arg.i;
arg.i = ((expo << 23) + arg.i) | sign;
} else {
/* result is a denormal */
expo = -(int)expo;
arg.i = arg.i >> expo;
f = y * arg.i;
arg.i = arg.i | sign;
}
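/* the low 32 bits of y * r wrap around the exact power of two, so the sign
   of f (viewed as an int) indicates whether the candidate reciprocal is too
   large (positive) or too small (negative); the mode-specific fixups below
   use it as the rounding residual */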
if (mode == cudaRoundNearest) {
expo = f + y;
if ((int)f < 0) f = (unsigned)(-(int)f);
if ((int)expo < 0) expo = (unsigned)(-(int)expo);
if (expo < f) arg.i++;
} else if (mode == cudaRoundZero) {
if ((int)f > 0) arg.i = arg.i - 1;
} else if (mode == cudaRoundPosInf) {
if (((int)f > 0) && sign) arg.i = arg.i - 1;
if (((int)f < 0) && !sign) arg.i = arg.i + 1;
} else { /* mode == cudaRoundMinInf */
if (((int)f > 0) && !sign) arg.i = arg.i - 1;
if (((int)f < 0) && sign) arg.i = arg.i + 1;
}
return arg.f;
} else {
/* zero returns infinity. Must handle negative zero as well */
if (!(arg.i << 1)) {
arg.i = 0x7F800000 | arg.i;
return arg.f;
}
/* infinity returns zero of like sign */
if ((arg.i << 1) == 0xff000000) {
arg.i &= 0x80000000;
return arg.f;
}
/* convert SNaNs to QNaNs */
if ((arg.i << 1) > 0xff000000) {
arg.i |= 0x00400000;
return arg.f;
}
/* denormals */
f = 0;
arg.i <<= 8;
do {
f++;
arg.i <<= 1;
} while ((int)arg.i > 0);
arg.i >>= 8;
arg.i |= sign;
arg.f = __internal_frcp_kernel (arg.f, mode);
expo = ((arg.i << 1) >> 24);
if ((expo + f) < 255) {
arg.i = (arg.i + (f << 23));
return arg.f;
}
if (mode == cudaRoundNearest) {
arg.i = (arg.i & 0x80000000) | 0x7f800000;
} else if (mode == cudaRoundZero) {
arg.i = (arg.i & 0x80000000) | 0x7f7fffff;
} else if (mode == cudaRoundPosInf) {
arg.i = (arg.i & 0x80000000) | ((sign) ? 0x7f7fffff : 0x7f800000);
} else { /* mode == cudaRoundMinInf */
arg.i = (arg.i & 0x80000000) | ((sign) ? 0x7f800000 : 0x7f7fffff);
}
return arg.f;
}
}
__device_func__(float __internal_fsqrt_kernel (float radicand,
enum cudaRoundMode mode))
{
unsigned long long prod;
volatile union __cudart_FloatUintCvt arg;
unsigned int expo;
unsigned int s, f, x;
arg.f = radicand;
expo = arg.i >> 23;
expo = expo & 0xff;
f = expo - 1;
if ((arg.i <= 0x80000000) && (f <= 0xFD)) {
/* normalize input argument */
x = (arg.i << 8) | 0x80000000;
x = x >> (expo & 1);
/* initial approximation */
arg.i = f = __internal_invSqrtCubeTab[((unsigned)x >> 25) - 32];
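/* refine via Newton-Raphson for the reciprocal square root; the second
   step below computes r' = r * (3 - x * r * r) / 2 explicitly, while the
   first folds part of that update into the packed table entry */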
/* first NR iteration */
prod = ((unsigned long long)x) * f;
arg.i = ((arg.i * 3) << 22) - (unsigned)(prod >> 32);
/* second NR iteration */
prod = ((unsigned long long)arg.i) * arg.i;
s = (unsigned)(prod >> 32);
prod = ((unsigned long long)x) * s;
f = 0x30000000 - (unsigned)(prod >> 32);
prod = ((unsigned long long)f) * arg.i;
arg.i = (unsigned)(prod >> 32);
/* compute sqrt(x) as x * 1/sqrt(x) */
prod = ((unsigned long long)x) * arg.i;
arg.i = (unsigned)(prod >> 32);
if (mode == cudaRoundNearest) {
arg.i = arg.i >> 3;
} else {
arg.i = (arg.i + 4) >> 3;
}
x = (x << 16) - (arg.i * arg.i);
/* round to nearest based on remainder; tie case impossible */
if (mode == cudaRoundNearest) {
f = x - (2 * arg.i + 1);
if ((int)f < 0) f = (unsigned)(-(int)f);
if ((int)x < 0) x = (unsigned)(-(int)x);
if (f < x) arg.i ++;
} else if ((mode == cudaRoundZero) || (mode == cudaRoundMinInf)) {
if ((int)x < 0) arg.i--;
} else if (mode == cudaRoundPosInf) {
if ((int)x > 0) arg.i++;
}
arg.i = arg.i + (((expo + 125) & ~0x1) << 22);
return arg.f;
} else {
/* if zero, or positive infinity, return argument */
if (!(arg.i << 1) || (arg.i == 0x7F800000)) {
return arg.f;
}
/* if NaN, return argument, possibly converted to QNaN */
if ((arg.i << 1) > 0xFF000000) {
arg.i |= 0x00400000;
return arg.f;
}
/* if negative, return NaN: INDEFINITE */
if (arg.i & 0x80000000) {
arg.i = 0xFFC00000;
return arg.f;
}
/* denormal, normalize it before computing square root */
x = 0;
arg.i <<= 8;
do {
x++;
arg.i <<= 1;
} while ((int)arg.i > 0);
arg.i >>= 8;
arg.i += (x & 1) << 23;
x += (x & 1);
arg.f = __internal_fsqrt_kernel (arg.f, mode);
arg.i -= ((x >> 1) << 23);
return arg.f;
}
}
__device_func__(float __internal_fdiv_kernel (float dividend, float divisor,
enum cudaRoundMode mode))
{
unsigned long long prod;
unsigned r, f, x, y, expox, expoy, sign;
volatile union __cudart_FloatUintCvt cvtx, cvty, res;
cvtx.f = dividend;
cvty.f = divisor;
expox = ((cvtx.i >> 23) & 0xff) - 1;
expoy = ((cvty.i >> 23) & 0xff) - 1;
sign = ((cvtx.i ^ cvty.i) & 0x80000000);
if ((expox <= 0xFD) && (expoy <= 0xFD)) {
divide:
expox = expox - expoy + 127 - 1;
expoy = expox;
/* extract mantissas */
y = (cvty.i << 8) | 0x80000000;
x = (cvtx.i & 0x00ffffff) | 0x00800000;
/* initial approximation */
r = __internal_rcpTab[(y >> 24) - 128];
/* first NR iteration */
f = r * r;
prod = ((unsigned long long)y) * (f << 16);
r = (r << 24) - (unsigned)(prod >> 32);
/* second NR iteration */
prod = ((unsigned long long)y) * (r << 1);
f = (unsigned)-(int)(prod >> 32);
prod = ((unsigned long long)f) * (r << 1);
r = (unsigned)(prod >> 32);
/* produce quotient */
prod = ((unsigned long long)x) * (r << 1);
/* normalize mantissa */
if (((int)((prod >> 32) << 8)) > 0) {
expox--;
prod = prod + prod;
}
if (mode == cudaRoundNearest) {
/* preliminary mantissa */
r = (unsigned)(prod >> 32);
y = y >> 8;
/* result is a normal */
if (expox <= 0xFD) {
int rem0, rem1, inc;
/* round mantissa to nearest even */
prod = ((unsigned long long)y) * r;
x = x << (23 + ((prod >> 32) >> 15));
rem1 = x - (unsigned)(prod & 0xffffffff);
rem0 = rem1 - y;
inc = abs(rem0) < abs(rem1);
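/* rem1 is the remainder for quotient r, rem0 the remainder for r + 1;
   incrementing when |rem0| < |rem1| rounds the quotient to nearest */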
/* merge sign, mantissa, exponent for final result */
res.i = sign | ((expox << 23) + r + inc);
return res.f;
} else if ((int)expox >= 254) {
/* overflow: return infinity */
res.i = sign | 0x7f800000;
return res.f;
} else {
/* underflow: result is zero, denormal, or smallest normal */
int shift = -(int)expox;
if (shift > 23) {
/* result is zero or smallest denormal */
r = (shift < 25) && ((x != y) || (r > 0x00ff0000));
res.i = sign | r;
return res.f;
}
if (x == y) {
/* result is denormal */
shift = -(int)expoy;
r = 0x00800000 >> shift;
res.i = sign | r;
return res.f;
}
{
unsigned long long tempx;
long long remlo, remhi;
/* result is denormal or smallest normal */
r = r >> shift;
prod = ((unsigned long long)y) * r;
tempx = ((unsigned long long)x) << (23 - shift);
remlo = 2 * tempx - 2 * prod - y;
remhi = remlo + 2 * tempx;
if (remlo < 0) remlo = -remlo;
if (remhi < 0) remhi = -remhi;
if (remhi < remlo) tempx = 2 * tempx;
remlo = tempx - prod;
remhi = remlo - y;
if (remlo < 0) remlo = -remlo;
if (remhi < 0) remhi = -remhi;
if ((remhi < remlo) || ((remhi == remlo) && (r & 1))) r++;
res.i = sign | r;
return res.f;
}
}
} else if (mode == cudaRoundZero) {
/* preliminary mantissa */
prod += 0x0000000080000000ULL;
r = (unsigned)(prod >> 32);
y = y >> 8;
/* result is a normal */
if (expox <= 0xFD) {
int rem1;
prod = ((unsigned long long)y) * r;
x = x << (23 + ((prod >> 32) >> 15));
rem1 = x - (unsigned)(prod & 0xffffffff);
if (rem1 < 0) r--;
r = (expox << 23) + r;
if (r == 0x7f800000) r = 0x7f7fffff;
res.i = sign | r;
return res.f;
} else if ((int)expox >= 254) {
/* overflow: return largest normal */
res.i = sign | 0x7f7fffff;
return res.f;
} else {
/* underflow: result is zero, denormal, or smallest normal */
int shift = -(int)expox;
if ((x == y) && (shift < 31)) {
shift = -(int)expoy;
r = 0x00800000 >> shift;
res.i = sign | r;
return res.f;
}
if (shift > 23) {
r = 0;
res.i = sign | r;
return res.f;
}
{
unsigned long long tempx;
long long remlo, remhi;
/* result is denormal or smallest normal */
r = r >> shift;
prod = ((unsigned long long)y) * r;
tempx = ((unsigned long long)x) << (23 - shift);
remlo = 2 * tempx - 2 * prod - y;
remhi = remlo + 2 * tempx;
if (remlo < 0) remlo = -remlo;
if (remhi < 0) remhi = -remhi;
if (remhi < remlo) tempx = 2 * tempx;
remlo = tempx - prod;
if ((remlo < 0) && (r != 0)) r--;
res.i = sign | r;
return res.f;
}
}
} else if (mode == cudaRoundPosInf) {
/* preliminary mantissa */
prod += 0x0000000080000000ULL;
r = (unsigned)(prod >> 32);
y = y >> 8;
/* result is a normal */
if (expox <= 0xFD) {
int rem1;
prod = ((unsigned long long)y) * r;
x = x << (23 + ((prod >> 32) >> 15));
rem1 = x - (unsigned)(prod & 0xffffffff);
if ((rem1 < 0) && (sign)) r--;
if ((rem1 > 0) && (!sign)) r++;
r = (expox << 23) + r;
if ((r == 0x7f800000) && (sign)) r = 0x7f7fffff;
res.i = sign | r;
return res.f;
} else if ((int)expox >= 254) {
/* overflow: return largest normal, or infinity */
r = sign ? 0x7f7fffff : 0x7f800000;
res.i = sign | r;
return res.f;
} else {
/* underflow: result is zero, denormal, or smallest normal */
int shift = -(int)expox;
if ((x == y) && (shift <= 24)) {
shift = -(int)expoy;
r = 0x00800000 >> shift;
if (r == 0) r = !sign;
res.i = sign | r;
return res.f;
}
if (shift > 23) {
r = !sign;
res.i = sign | r;
return res.f;
}
{
unsigned long long tempx;
long long remlo, remhi;
/* result is denormal or smallest normal */
r = r >> shift;
prod = ((unsigned long long)y) * r;
tempx = ((unsigned long long)x) << (23 - shift);
remlo = 2 * tempx - 2 * prod - y;
remhi = remlo + 2 * tempx;
if (remlo < 0) remlo = -remlo;
if (remhi < 0) remhi = -remhi;
if (remhi < remlo) tempx = 2 * tempx;
remlo = tempx - prod;
if ((remlo < 0) && (r != 0) && (sign)) r--;
if ((remlo > 0) && (!sign)) r++;
res.i = sign | r;
return res.f;
}
}
} else if (mode == cudaRoundMinInf) {
/* preliminary mantissa */
prod += 0x0000000080000000ULL;
r = (unsigned)(prod >> 32);
y = y >> 8;
/* result is a normal */
if (expox <= 0xFD) {
int rem1;
prod = ((unsigned long long)y) * r;
x = x << (23 + ((prod >> 32) >> 15));
rem1 = x - (unsigned)(prod & 0xffffffff);
if ((rem1 < 0) && (!sign)) r--;
if ((rem1 > 0) && (sign)) r++;
r = (expox << 23) + r;
if ((r == 0x7f800000) && (!sign)) r = 0x7f7fffff;
res.i = sign | r;
return res.f;
} else if ((int)expox >= 254) {
/* overflow: return largest normal, or infinity */
r = sign ? 0x7f800000 : 0x7f7fffff;
res.i = sign | r;
return res.f;
} else {
/* underflow: result is zero, denormal, or smallest normal */
int shift = -(int)expox;
if ((x == y) && (shift <= 24)) {
shift = -(int)expoy;
r = 0x00800000 >> shift;
if (r == 0) r = !!sign;
res.i = sign | r;
return res.f;
}
if (shift > 23) {
r = !!sign;
res.i = sign | r;
return res.f;
}
{
unsigned long long tempx;
long long remlo, remhi;
/* result is denormal or smallest normal */
r = r >> shift;
prod = ((unsigned long long)y) * r;
tempx = ((unsigned long long)x) << (23 - shift);
remlo = 2 * tempx - 2 * prod - y;
remhi = remlo + 2 * tempx;
if (remlo < 0) remlo = -remlo;
if (remhi < 0) remhi = -remhi;
if (remhi < remlo) tempx = 2 * tempx;
remlo = tempx - prod;
if ((remlo < 0) && (r != 0) && (!sign)) r--;
if ((remlo > 0) && (sign)) r++;
res.i = sign | r;
return res.f;
}
}
}
}
{
int xzero, yzero, xinf, yinf, xnan, ynan;
xnan = (cvtx.i << 1) > 0xff000000;
ynan = (cvty.i << 1) > 0xff000000;
/* handle NaNs. Convert SNaNs to QNaNs */
if (xnan) {
res.i = cvtx.i | 0x00400000;
return res.f;
}
if (ynan) {
res.i = cvty.i | 0x00400000;
return res.f;
}
xzero = (cvtx.i << 1) == 0x00000000;
yzero = (cvty.i << 1) == 0x00000000;
xinf = (cvtx.i << 1) == 0xff000000;
yinf = (cvty.i << 1) == 0xff000000;
/* 0/0 and INF/INF are invalid operations. Return INDEFINITE */
if ((xzero & yzero) | (xinf & yinf)) {
res.i = 0xffc00000;
return res.f;
}
/* x/INF and 0/y -> 0 */
if (xzero | yinf) {
res.i = sign;
return res.f;
}
/* x/0 and INF/y -> INF */
if (yzero | xinf) {
res.i = sign | 0x7f800000;
return res.f;
}
/* normalize denormals */
if ((int)expox < 0) {
cvtx.i = cvtx.i << 9;
while ((int)cvtx.i >= 0) {
expox--;
cvtx.i = cvtx.i + cvtx.i;
}
cvtx.i = cvtx.i >> 8;
}
if ((int)expoy < 0) {
cvty.i = cvty.i << 9;
while ((int)cvty.i >= 0) {
expoy--;
cvty.i = cvty.i + cvty.i;
}
cvty.i = cvty.i >> 8;
}
goto divide;
}
}
__device_func__(float __internal_fmul_kernel2 (float a, float b,
enum cudaRoundMode mode))
{
unsigned long long product;
volatile union __cudart_FloatUintCvt xx, yy;
unsigned expo_x, expo_y;
xx.f = a;
yy.f = b;
expo_y = 0xFF;
expo_x = expo_y & (xx.i >> 23);
expo_x = expo_x - 1;
expo_y = expo_y & (yy.i >> 23);
expo_y = expo_y - 1;
if ((expo_x <= 0xFD) &&
(expo_y <= 0xFD)) {
multiply:
expo_x = expo_x + expo_y;
expo_y = xx.i ^ yy.i;
xx.i = xx.i & 0x00ffffff;
yy.i = yy.i << 8;
xx.i = xx.i | 0x00800000;
yy.i = yy.i | 0x80000000;
/* compute product */
product = ((unsigned long long)xx.i) * yy.i;
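/* the 48-bit mantissa product is exact: its high part lands in xx.i and
   the low 32 bits in yy.i serve as round/sticky information */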
expo_x = expo_x - 127 + 2;
expo_y = expo_y & 0x80000000;
xx.i = (unsigned int)(product >> 32);
yy.i = (unsigned int)(product & 0xffffffff);
/* normalize mantissa */
if (xx.i < 0x00800000) {
xx.i = (xx.i << 1) | (yy.i >> 31);
yy.i = (yy.i << 1);
expo_x--;
}
if (expo_x <= 0xFD) {
xx.i = xx.i | expo_y; /* OR in sign bit */
xx.i = xx.i + (expo_x << 23); /* add in exponent */
/* round result to nearest or even */
if (mode == cudaRoundNearest) {
if (yy.i < 0x80000000) return xx.f;
xx.i += ((yy.i == 0x80000000) ? (xx.i & 1) : (yy.i >> 31));
} else if (mode == cudaRoundZero) {
} else if (mode == cudaRoundPosInf) {
xx.i += (yy.i && !expo_y);
} else if (mode == cudaRoundMinInf) {
xx.i += (yy.i && expo_y);
}
return xx.f;
} else if ((int)expo_x >= 254) {
/* overflow: return infinity or largest normal */
if (mode == cudaRoundNearest) {
xx.i = expo_y | 0x7F800000;
} else if (mode == cudaRoundZero) {
xx.i = expo_y | 0x7F7FFFFF;
} else if (mode == cudaRoundPosInf) {
xx.i = (expo_y ? 0xff7fffff : 0x7F800000);
} else { /* (mode == cudaRoundMinInf) */
xx.i = (expo_y ? 0xFF800000 : 0x7f7fffff);
}
return xx.f;
} else {
/* zero, denormal, or smallest normal */
expo_x = ((unsigned int)-((int)expo_x));
if (mode == cudaRoundNearest) {
if (expo_x > 25) {
/* massive underflow: return 0 */
xx.i = expo_y;
return xx.f;
} else {
yy.i = (xx.i << (32 - expo_x)) | ((yy.i) ? 1 : 0);
xx.i = expo_y + (xx.i >> expo_x);
xx.i += ((yy.i == 0x80000000) ? (xx.i & 1) : (yy.i >> 31));
return xx.f;
}
} else if (mode == cudaRoundZero) {
if (expo_x > 25) expo_x = 25;
xx.i = expo_y + (xx.i >> expo_x);
return xx.f;
} else if (mode == cudaRoundPosInf) {
if (expo_x > 25) expo_x = 25;
yy.i = (xx.i << (32 - expo_x)) | ((yy.i) ? 1 : 0);
xx.i = expo_y + (xx.i >> expo_x);
xx.i += (yy.i && !expo_y);
return xx.f;
} else { /* (mode == cudaRoundMinInf) */
if (expo_x > 25) expo_x = 25;
yy.i = (xx.i << (32 - expo_x)) | ((yy.i) ? 1 : 0);
xx.i = expo_y + (xx.i >> expo_x);
xx.i += (yy.i && expo_y);
return xx.f;
}
}
} else {
product = xx.i ^ yy.i;
product = product & 0x80000000;
if (!(xx.i & 0x7fffffff)) {
if (expo_y != 254) {
xx.i = (unsigned int)product;
return xx.f;
}
expo_y = yy.i << 1;
if (expo_y == 0xFF000000) {
xx.i = expo_y | 0x00C00000;
} else {
xx.i = yy.i | 0x00400000;
}
return xx.f;
}
if (!(yy.i & 0x7fffffff)) {
if (expo_x != 254) {
xx.i = (unsigned int)product;
return xx.f;
}
expo_x = xx.i << 1;
if (expo_x == 0xFF000000) {
xx.i = expo_x | 0x00C00000;
} else {
xx.i = xx.i | 0x00400000;
}
return xx.f;
}
if ((expo_y != 254) && (expo_x != 254)) {
expo_y++;
expo_x++;
if (expo_x == 0) {
expo_y |= xx.i & 0x80000000;
/*
* If both operands are denormals, we only need to normalize
* one of them as the result will be either a denormal or zero.
*/
xx.i = xx.i << 8;
while (!(xx.i & 0x80000000)) {
xx.i <<= 1;
expo_x--;
}
xx.i = (xx.i >> 8) | (expo_y & 0x80000000);
expo_y &= ~0x80000000;
expo_y--;
goto multiply;
}
if (expo_y == 0) {
expo_x |= yy.i & 0x80000000;
yy.i = yy.i << 8;
while (!(yy.i & 0x80000000)) {
yy.i <<= 1;
expo_y--;
}
yy.i = (yy.i >> 8) | (expo_x & 0x80000000);
expo_x &= ~0x80000000;
expo_x--;
goto multiply;
}
}
expo_x = xx.i << 1;
expo_y = yy.i << 1;
/* if x is NaN, return x */
if (expo_x > 0xFF000000) {
/* cvt any SNaNs to QNaNs */
xx.i = xx.i | 0x00400000;
return xx.f;
}
/* if y is NaN, return y */
if (expo_y > 0xFF000000) {
/* cvt any SNaNs to QNaNs */
xx.i = yy.i | 0x00400000;
return xx.f;
}
xx.i = (unsigned int)product | 0x7f800000;
return xx.f;
}
}
__device_func__(float __internal_fmaf_kernel (float a, float b, float c,
enum cudaRoundMode mode))
{
unsigned long long product;
unsigned int xx, yy, zz, ww;
unsigned int temp, s, u;
unsigned int expo_x, expo_y, expo_z;
volatile union __cudart_FloatUintCvt cvt;
cvt.f = a;
xx = cvt.i;
cvt.f = b;
yy = cvt.i;
cvt.f = c;
zz = cvt.i;
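/* generic FMA kernel parameterized by the rounding mode; the __fmaf_rn,
   __fmaf_rz, __fmaf_ru and __fmaf_rd wrappers further below simply
   dispatch into this routine */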
temp = 0xff;
expo_x = temp & (xx >> 23);
expo_x = expo_x - 1;
expo_y = temp & (yy >> 23);
expo_y = expo_y - 1;
expo_z = temp & (zz >> 23);
expo_z = expo_z - 1;
if (!((expo_x <= 0xFD) &&
(expo_y <= 0xFD) &&
(expo_z <= 0xFD))) {
/* fmad (nan, y, z) --> nan
fmad (x, nan, z) --> nan
fmad (x, y, nan) --> nan
*/
if ((yy << 1) > 0xff000000) {
return b + b;
}
if ((zz << 1) > 0xff000000) {
return c + c;
}
if ((xx << 1) > 0xff000000) {
return a + a;
}
/* fmad (0, inf, z) --> NaN
fmad (inf, 0, z) --> NaN
fmad (-inf,+y,+inf) --> NaN
fmad (+x,-inf,+inf) --> NaN
fmad (+inf,-y,+inf) --> NaN
fmad (-x,+inf,+inf) --> NaN
fmad (-inf,-y,-inf) --> NaN
fmad (-x,-inf,-inf) --> NaN
fmad (+inf,+y,-inf) --> NaN
fmad (+x,+inf,-inf) --> NaN
*/
if ((((xx << 1) == 0) && ((yy << 1) == 0xff000000)) ||
(((yy << 1) == 0) && ((xx << 1) == 0xff000000))) {
cvt.i = 0xffc00000;
return cvt.f;
}
if ((zz << 1) == 0xff000000) {
if (((yy << 1) == 0xff000000) || ((xx << 1) == 0xff000000)) {
if ((int)(xx ^ yy ^ zz) < 0) {
cvt.i = 0xffc00000;
return cvt.f;
}
}
}
/* fmad (inf, y, z) --> inf
fmad (x, inf, z) --> inf
fmad (x, y, inf) --> inf
*/
if ((xx << 1) == 0xff000000) {
xx = xx ^ (yy & 0x80000000);
cvt.i = xx;
return cvt.f;
}
if ((yy << 1) == 0xff000000) {
yy = yy ^ (xx & 0x80000000);
cvt.i = yy;
return cvt.f;
}
if ((zz << 1) == 0xff000000) {
cvt.i = zz;
return cvt.f;
}
/* fmad (+0, -y, -0) --> -0
fmad (-0, +y, -0) --> -0
fmad (+x, -0, -0) --> -0
fmad (-x, +0, -0) --> -0
*/
if (zz == 0x80000000) {
if (((xx << 1) == 0) || ((yy << 1) == 0)) {
if ((int)(xx ^ yy) < 0) {
cvt.i = zz;
return cvt.f;
}
}
}
/* fmad (0, y, 0) --> +0
fmad (x, 0, 0) --> +0
*/
if (((zz << 1) == 0) &&
(((xx << 1) == 0) || ((yy << 1) == 0))) {
if (mode == cudaRoundMinInf) {
zz = 0x80000000 & (xx ^ yy ^ zz);
} else {
zz &= 0x7fffffff;
}
cvt.i = zz;
return cvt.f;
}
/* fmad (0, y, z) --> z
fmad (x, 0, z) --> z
*/
if (((xx << 1) == 0) || ((yy << 1) == 0)) {
cvt.i = zz;
return cvt.f;
}
/* normalize x, if denormal */
if (expo_x == (unsigned)-1) {
temp = xx & 0x80000000;
xx = xx << 8;
while (!(xx & 0x80000000)) {
xx <<= 1;
expo_x--;
}
expo_x++;
xx = (xx >> 8) | temp;
}
/* normalize y, if denormal */
if (expo_y == (unsigned)-1) {
temp = yy & 0x80000000;
yy = yy << 8;
while (!(yy & 0x80000000)) {
yy <<= 1;
expo_y--;
}
expo_y++;
yy = (yy >> 8) | temp;
}
/* normalize z, if denormal */
if ((expo_z == (unsigned)-1) && ((zz << 1) != 0)) {
temp = zz & 0x80000000;
zz = zz << 8;
while (!(zz & 0x80000000)) {
zz <<= 1;
expo_z--;
}
expo_z++;
zz = (zz >> 8) | temp;
}
}
expo_x = expo_x + expo_y;
expo_y = xx ^ yy;
xx = xx & 0x00ffffff;
yy = yy << 8;
xx = xx | 0x00800000;
yy = yy | 0x80000000;
product = ((unsigned long long)xx) * yy;
xx = (unsigned)(product >> 32);
yy = (unsigned)(product & 0xffffffff);
expo_x = expo_x - 127 + 2;
expo_y = expo_y & 0x80000000;
/* normalize mantissa */
if (xx < 0x00800000) {
xx = (xx << 1) | (yy >> 31);
yy = (yy << 1);
expo_x--;
}
temp = 0;
if ((zz << 1) != 0) { /* z is not zero */
s = zz & 0x80000000;
zz &= 0x00ffffff;
zz |= 0x00800000;
ww = 0;
/* compare and swap. put augend into xx:yy */
if ((int)expo_z > (int)expo_x) {
temp = expo_z;
expo_z = expo_x;
expo_x = temp;
temp = zz;
zz = xx;
xx = temp;
temp = ww;
ww = yy;
yy = temp;
temp = expo_y;
expo_y = s;
s = temp;
}
/* augend_sign = expo_y, augend_mant = xx:yy, augend_expo = expo_x */
/* addend_sign = s, addend_mant = zz:ww, addend_expo = expo_z */
expo_z = expo_x - expo_z;
u = expo_y ^ s;
if (expo_z <= 49) {
/* denormalize addend */
temp = 0;
while (expo_z >= 32) {
temp = ww | (temp != 0);
ww = zz;
zz = 0;
expo_z -= 32;
}
if (expo_z) {
temp = ((temp >> expo_z) | (ww << (32 - expo_z)) |
((temp << (32 - expo_z)) != 0));
ww = (ww >> expo_z) | (zz << (32 - expo_z));
zz = (zz >> expo_z);
}
} else {
temp = 1;
ww = 0;
zz = 0;
}
if ((int)u < 0) {
/* signs differ, effective subtraction */
temp = (unsigned)(-(int)temp);
s = (temp != 0);
u = yy - s;
s = u > yy;
yy = u - ww;
s += yy > u;
xx = (xx - zz) - s;
if (!(xx | yy | temp)) {
/* complete cancellation: return +0 (-0 in round-toward-minus-infinity) */
if (mode == cudaRoundMinInf) {
xx = 0x80000000;
}
cvt.i = xx;
return cvt.f;
}
if ((int)xx < 0) {
/* oops, augend had smaller mantissa. Negate mantissa and flip
   sign of result */
temp = ~temp;
yy = ~yy;
xx = ~xx;
if (++temp == 0) {
if (++yy == 0) {
++xx;
}
}
expo_y ^= 0x80000000;
}
/* normalize mantissa, if necessary */
while (!(xx & 0x00800000)) {
xx = (xx << 1) | (yy >> 31);
yy = (yy << 1);
expo_x--;
}
} else {
/* signs are the same, effective addition */
yy = yy + ww;
s = yy < ww;
xx = xx + zz + s;
if (xx & 0x01000000) {
temp = temp | (yy << 31);
yy = (yy >> 1) | (xx << 31);
xx = ((xx & 0x80000000) | (xx >> 1)) & ~0x40000000;
expo_x++;
}
}
}
temp = yy | (temp != 0);
if (expo_x <= 0xFD) {
/* normal */
xx |= expo_y; /* or in sign bit */
if (mode == cudaRoundNearest) {
s = xx & 1; /* mantissa lsb */
xx += (temp == 0x80000000) ? s : (temp >> 31);
} else if (mode == cudaRoundPosInf) {
xx += temp && !expo_y;
} else if (mode == cudaRoundMinInf) {
xx += temp && expo_y;
}
xx = xx + (expo_x << 23); /* add in exponent */
cvt.i = xx;
return cvt.f;
} else if ((int)expo_x >= 126) {
/* overflow */
if (mode == cudaRoundNearest) {
xx = expo_y | 0x7f800000;
} else if (mode == cudaRoundZero) {
xx = expo_y | 0x7F7FFFFF;
} else if (mode == cudaRoundPosInf) {
xx = expo_y ? 0xFF7FFFFF : 0x7f800000;
} else if (mode == cudaRoundMinInf) {
xx = expo_y ? 0xff800000 : 0x7f7fffff;
}
cvt.i = xx;
return cvt.f;
}
/* subnormal */
expo_x = (unsigned int)(-(int)expo_x);
if (expo_x > 25) {
/* massive underflow: return 0, or smallest denormal */
xx = 0;
if (mode == cudaRoundPosInf) {
xx += !expo_y;
} else if (mode == cudaRoundMinInf) {
xx += !!expo_y;
}
cvt.i = expo_y | xx;
return cvt.f;
}
temp = (xx << (32 - expo_x)) | ((temp) ? 1 : 0);
xx = xx >> expo_x;
if (mode == cudaRoundNearest) {
xx = xx + ((temp == 0x80000000) ? (xx & 1) : (temp >> 31));
} else if (mode == cudaRoundPosInf) {
xx = xx + (!expo_y && temp);
} else if (mode == cudaRoundMinInf) {
xx = xx + (expo_y && temp);
}
xx = expo_y + xx; /* add in sign bit */
cvt.i = xx;
return cvt.f;
}
/* NOTE: Does not currently support round-to-nearest, round-to-zero */
__device_func__(float __internal_fadd_kernel2 (float a, float b,
enum cudaRoundMode mode))
{
volatile union __cudart_FloatUintCvt xx, yy;
unsigned int expo_x;
unsigned int expo_y;
unsigned int temp;
xx.f = a;
yy.f = b;
/* make bigger operand the augend */
expo_y = yy.i << 1;
if (expo_y > (xx.i << 1)) {
expo_y = xx.i;
xx.i = yy.i;
yy.i = expo_y;
}
temp = 0xff;
expo_x = temp & (xx.i >> 23);
expo_x = expo_x - 1;
expo_y = temp & (yy.i >> 23);
expo_y = expo_y - 1;
if ((expo_x <= 0xFD) &&
(expo_y <= 0xFD)) {
add:
expo_y = expo_x - expo_y;
if (expo_y > 25) {
expo_y = 31;
}
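/* expo_y is now the alignment shift; shifts larger than 25 are clamped to
   31 so that the smaller operand contributes only a sticky bit */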
temp = xx.i ^ yy.i;
xx.i = xx.i & ~0x7f000000;
xx.i = xx.i | 0x00800000;
yy.i = yy.i & ~0xff000000;
yy.i = yy.i | 0x00800000;
if ((int)temp < 0) {
/* signs differ, effective subtraction */
temp = 32 - expo_y;
temp = (expo_y) ? (yy.i << temp) : 0;
temp = (unsigned)(-((int)temp));
xx.i = xx.i - (yy.i >> expo_y) - (temp ? 1 : 0);
if (xx.i & 0x00800000) {
if (expo_x <= 0xFD) {
xx.i = xx.i + (expo_x << 23);
if (mode == cudaRoundMinInf) {
xx.i += (temp && (xx.i & 0x80000000));
} else if (mode == cudaRoundPosInf) {
xx.i += (temp && !(xx.i & 0x80000000));
}
return xx.f;
}
} else {
if ((temp | (xx.i << 1)) == 0) {
/* operands cancelled, resulting in a clean zero */
if (mode == cudaRoundMinInf) {
xx.i = 0x80000000;
} else if (mode == cudaRoundPosInf) {
xx.i = 0;
}
return xx.f;
}
/* normalize result */
yy.i = xx.i & 0x80000000;
do {
xx.i = (xx.i << 1) | (temp >> 31);
temp <<= 1;
expo_x--;
} while (!(xx.i & 0x00800000));
xx.i = xx.i | yy.i;
}
} else {
/* signs are the same, effective addition */
temp = 32 - expo_y;
temp = (expo_y) ? (yy.i << temp) : 0;
xx.i = xx.i + (yy.i >> expo_y);
if (!(xx.i & 0x01000000)) {
if (expo_x <= 0xFD) {
xx.i = xx.i + (expo_x << 23);
if (mode == cudaRoundMinInf) {
xx.i += (temp && (xx.i & 0x80000000));
} else if (mode == cudaRoundPosInf) {
xx.i += (temp && !(xx.i & 0x80000000));
}
return xx.f;
}
} else {
/* normalize result */
temp = (xx.i << 31) | (temp >> 1);
xx.i = ((xx.i & 0x80000000) | (xx.i >> 1)) & ~0x40000000;
expo_x++;
}
}
if (expo_x <= 0xFD) {
if (mode == cudaRoundMinInf) {
xx.i += (temp && (xx.i & 0x80000000));
} else if (mode == cudaRoundPosInf) {
xx.i += (temp && !(xx.i & 0x80000000));
}
xx.i = xx.i + (expo_x << 23);
return xx.f;
}
if ((int)expo_x >= 254) {
/* overflow: return infinity or largest normal */
temp = xx.i & 0x80000000;
if (mode == cudaRoundMinInf) {
xx.i = (temp ? 0xFF800000 : 0x7f7fffff);
} else if (mode == cudaRoundPosInf) {
xx.i = (temp ? 0xff7fffff : 0x7F800000);
}
return xx.f;
}
/* underflow: denormal, or smallest normal */
expo_y = expo_x + 32;
yy.i = xx.i & 0x80000000;
xx.i = xx.i & ~0xff000000;
expo_x = (unsigned)(-((int)expo_x));
temp = xx.i << expo_y | ((temp) ? 1 : 0);
xx.i = yy.i | (xx.i >> expo_x);
if (mode == cudaRoundMinInf) {
xx.i += (temp && yy.i);
} else if (mode == cudaRoundPosInf) {
xx.i += (temp && !yy.i);
}
return xx.f;
} else {
/* handle special cases separately */
if (!(yy.i << 1)) {
if (mode == cudaRoundMinInf) {
if (!(xx.i << 1)) {
xx.i = xx.i | yy.i;
}
} else if (mode == cudaRoundPosInf) {
if (xx.i == 0x80000000) {
xx.i = yy.i;
}
}
if ((xx.i << 1) > 0xff000000) {
xx.i |= 0x00400000;
}
return xx.f;
}
if ((expo_y != 254) && (expo_x != 254)) {
/* remove sign bits */
if (expo_x == (unsigned int) -1) {
temp = xx.i & 0x80000000;
xx.i = xx.i << 8;
while (!(xx.i & 0x80000000)) {
xx.i <<= 1;
expo_x--;
}
expo_x++;
xx.i = (xx.i >> 8) | temp;
}
if (expo_y == (unsigned int) -1) {
temp = yy.i & 0x80000000;
yy.i = yy.i << 8;
while (!(yy.i & 0x80000000)) {
yy.i <<= 1;
expo_y--;
}
expo_y++;
yy.i = (yy.i >> 8) | temp;
}
goto add;
}
expo_x = xx.i << 1;
expo_y = yy.i << 1;
/* if x is NaN, return x */
if (expo_x > 0xff000000) {
/* cvt any SNaNs to QNaNs */
xx.i = xx.i | 0x00400000;
return xx.f;
}
/* if y is NaN, return y */
if (expo_y > 0xff000000) {
/* cvt any SNaNs to QNaNs */
xx.i = yy.i | 0x00400000;
return xx.f;
}
if ((expo_x == 0xff000000) && (expo_y == 0xff000000)) {
/*
* subtraction of infinities with the same sign, and addition of
* infinities of unlike sign is undefined: return NaN INDEFINITE
*/
expo_x = xx.i ^ yy.i;
xx.i = xx.i | ((expo_x) ? 0xffc00000 : 0);
return xx.f;
}
/* handle infinities */
if (expo_y == 0xff000000) {
xx.i = yy.i;
}
return xx.f;
}
}
__device_func__(float __frcp_rn (float a))
{
return __internal_frcp_kernel (a, cudaRoundNearest);
}
__device_func__(float __frcp_rz (float a))
{
return __internal_frcp_kernel (a, cudaRoundZero);
}
__device_func__(float __frcp_rd (float a))
{
return __internal_frcp_kernel (a, cudaRoundMinInf);
}
__device_func__(float __frcp_ru (float a))
{
return __internal_frcp_kernel (a, cudaRoundPosInf);
}
__device_func__(float __fsqrt_rn (float a))
{
return __internal_fsqrt_kernel (a, cudaRoundNearest);
}
__device_func__(float __fsqrt_rz (float a))
{
return __internal_fsqrt_kernel (a, cudaRoundZero);
}
__device_func__(float __fsqrt_rd (float a))
{
return __internal_fsqrt_kernel (a, cudaRoundMinInf);
}
__device_func__(float __fsqrt_ru (float a))
{
return __internal_fsqrt_kernel (a, cudaRoundPosInf);
}
__device_func__(float __fdiv_rn (float a, float b))
{
return __internal_fdiv_kernel (a, b, cudaRoundNearest);
}
__device_func__(float __fdiv_rz (float a, float b))
{
return __internal_fdiv_kernel (a, b, cudaRoundZero);
}
__device_func__(float __fdiv_rd (float a, float b))
{
return __internal_fdiv_kernel (a, b, cudaRoundMinInf);
}
__device_func__(float __fdiv_ru (float a, float b))
{
return __internal_fdiv_kernel (a, b, cudaRoundPosInf);
}
__device_func__(float __fadd_rd (float a, float b))
{
return __internal_fadd_kernel2 (a, b, cudaRoundMinInf);
}
__device_func__(float __fadd_ru (float a, float b))
{
return __internal_fadd_kernel2 (a, b, cudaRoundPosInf);
}
__device_func__(float __fmul_rd (float a, float b))
{
return __internal_fmul_kernel2 (a, b, cudaRoundMinInf);
}
__device_func__(float __fmul_ru (float a, float b))
{
return __internal_fmul_kernel2 (a, b, cudaRoundPosInf);
}
__device_func__(float __fmaf_rn (float a, float b, float c))
{
return __internal_fmaf_kernel (a, b, c, cudaRoundNearest);
}
__device_func__(float __fmaf_rz (float a, float b, float c))
{
return __internal_fmaf_kernel (a, b, c, cudaRoundZero);
}
__device_func__(float __fmaf_ru (float a, float b, float c))
{
return __internal_fmaf_kernel (a, b, c, cudaRoundPosInf);
}
__device_func__(float __fmaf_rd (float a, float b, float c))
{
return __internal_fmaf_kernel (a, b, c, cudaRoundMinInf);
}
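/* Illustrative usage sketch (not part of the original header): the directed
   rounding intrinsics can bracket an exact result, e.g.
       float lo = __fmul_rd(a, b);
       float hi = __fmul_ru(a, b);
   yields lo <= a*b <= hi for finite a and b, which is the basis of simple
   interval-arithmetic schemes. */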
__device_func__(int __cuda___isnan(double a));
__device_func__(int __cuda___isnanf(float a));
__device_func__(int __double2int_rz(double));
__device_func__(unsigned int __double2uint_rz(double));
__device_func__(long long int __double2ll_rz(double));
__device_func__(unsigned long long int __double2ull_rz(double));
#define __internal_clamp(val, max, min, nan) \
if (sizeof(val) == sizeof(double) && __cuda___isnan((double)val)) return nan; \
skipping to change at line 409 skipping to change at line 3777
{
long long int res;
res = __umul64hi(a, b);
if (a < 0LL) res = res - b;
if (b < 0LL) res = res - a;
return res;
}
__device_func__(float __saturatef(float a))
{
if (__cuda___isnanf(a)) return 0.0f; // update of PTX spec 10/15/2008
return a >= 1.0f ? 1.0f : a <= 0.0f ? 0.0f : a;
}
__device_func__(unsigned int __sad(int a, int b, unsigned int c))
{
long long int diff = (long long int)a - (long long int)b;
return (unsigned int)(__cuda_llabs(diff) + (long long int)c);
}
skipping to change at line 450 skipping to change at line 3819
#if !defined(__MULTI_CORE__)
a &= 0xffffff;
b &= 0xffffff;
#endif /* !__MULTI_CORE__ */
return a * b;
}
__device_func__(float __int_as_float(int a))
{
volatile union __cudart_FloatIntCvt u;  /* was: volatile union {int a; float b;} u; */
u.i = a;
return u.f;
}
__device_func__(int __float_as_int(float a))
{
volatile union __cudart_FloatIntCvt u;  /* was: volatile union {float a; int b;} u; */
u.f = a;
return u.i;
}
__device_func__(long long int __internal_float2ll_kernel(float a, long long int max, long long int min, long long int nan, enum cudaRoundMode rndMode))
{
unsigned long long int res, t = 0ULL;
int shift;
unsigned int ia;
__internal_clamp(a, max, min, nan);
ia = __float_as_int(a);
skipping to change at line 681 skipping to change at line 4048
unsigned long long int t = (unsigned long long int)*a;
int lz = __internal_normalize64(&t);
*a = (unsigned int)(t >> 32);
return lz - 32;
}
__device_func__(float __internal_int2float_kernel(int a, enum cudaRoundMode rndMode))
{
volatile union __cudart_FloatUintCvt res;  /* was: an equivalent anonymous union */
int shift;
unsigned int t;
res.i = a;
if (a == 0) return res.f;
if (a < 0) res.i = (unsigned int)-a;
shift = __internal_normalize((unsigned int*)&res.i);
t = res.i << 24;
res.i = (res.i >> 8);
res.i += (127 + 30 - shift) << 23;
if (a < 0) res.i |= 0x80000000;
skipping to change at line 733 skipping to change at line 4097
{
#if defined(__MULTI_CORE__)
return (float)a;
#else /* __MULTI_CORE__ */
return __internal_int2float_kernel(a, cudaRoundNearest);
#endif /* __MULTI_CORE__ */
}
__device_func__(float __internal_uint2float_kernel(unsigned int a, enum cudaRoundMode rndMode))
{
volatile union __cudart_FloatUintCvt res;  /* was: an equivalent anonymous union */
int shift;
unsigned int t;
res.i = a;
if (a == 0) return res.f;
shift = __internal_normalize((unsigned int*)&res.i);
t = res.i << 24;
res.i = (res.i >> 8);
res.i += (127 + 30 - shift) << 23;
if ((rndMode == cudaRoundNearest) && (t >= 0x80000000)) {
res.i += (t == 0x80000000) ? (res.i & 1) : (t >> 31);
skipping to change at line 806 skipping to change at line 4167
t = (unsigned int)temp;
res += (127 + 62 - shift) << 23; /* add in exponent */
res += t == 0x80000000 ? res & 1 : t >> 31;
return __int_as_float(res);
#endif /* __MULTI_CORE__ */
}
__device_func__(float __internal_fmul_kernel(float a, float b, int rndNearest))
{
unsigned long long product;
volatile union __cudart_FloatUintCvt xx, yy;  /* was: an equivalent anonymous union */
unsigned expo_x, expo_y;
xx.f = a;
yy.f = b;
expo_y = 0xFF;
expo_x = expo_y & (xx.i >> 23);
expo_x = expo_x - 1;
expo_y = expo_y & (yy.i >> 23);
expo_y = expo_y - 1;
skipping to change at line 951 skipping to change at line 4309
xx.i = yy.i | 0x00400000;
return xx.f;
}
xx.i = (unsigned int)product | 0x7f800000;
return xx.f;
}
}
__device_func__(float __internal_fadd_kernel(float a, float b, int rndNearest))
{
volatile union __cudart_FloatUintCvt xx, yy;  /* was: an equivalent anonymous union */
unsigned int expo_x;
unsigned int expo_y;
unsigned int temp;
xx.f = a;
yy.f = b;
/* make bigger operand the augend */
expo_y = yy.i << 1;
if (expo_y > (xx.i << 1)) {
skipping to change at line 1069 skipping to change at line 4424
expo_x = (unsigned int)(-((int)expo_x));
temp = xx.i << expo_y | ((temp) ? 1 : 0);
xx.i = yy.i | (xx.i >> expo_x);
xx.i += (((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31))
&& rndNearest);
return xx.f;
} else {
/* handle special cases separately */
if (!(yy.i << 1)) {
if (xx.i == 0x80000000) {
xx.i = yy.i;
}
if ((xx.i << 1) > 0xff000000) {
xx.i |= 0x00400000;
}
return xx.f;
}
if ((expo_y != 254) && (expo_x != 254)) {
/* remove sign bits */
if (expo_x == (unsigned int) -1) {
temp = xx.i & 0x80000000;
xx.i = xx.i << 8;
while (!(xx.i & 0x80000000)) {
xx.i <<= 1;
skipping to change at line 1182 skipping to change at line 4537
#elif defined(_WIN32)
#define __syncthreads() \
(void)__cudaSynchronizeThreads((void**)0, (void*)0)
#endif /* __GNUC__ */
#endif /* __MULTI_CORE__ */
__device_func__(void __prof_trigger(int a))
{
}
__device_func__(void __threadfence(void))
{
}
__device_func__(void __threadfence_block(void))
{
}
#if defined(__GNUC__)
__device_func__(void __trap(void))
{
__builtin_trap();
}
#elif defined(_WIN32)
__device_func__(void __trap(void))
{
__debugbreak();
}
#endif /* __GNUC__ */
#endif /* __CUDABE__ */
/*******************************************************************************
*                                                                              *
* DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS        *
*                                                                              *
*******************************************************************************/
__device_func__(float __fdividef(float a, float b))
{
#if defined(__MULTI_CORE__)
return a / b;
#elif defined(__CUDABE__)
return a / b;
#else /* __MULTI_CORE__ */
/* match range restrictions of the device function */
if (__cuda_fabsf(b) > CUDART_TWO_TO_126_F) {
if (__cuda_fabsf(a) <= CUDART_NORM_HUGE_F) {
return ((a / b) / CUDART_NORM_HUGE_F) / CUDART_NORM_HUGE_F;
} else {
return __int_as_float(0xffc00000);  /* was: return CUDART_NAN_F; */
}
} else {
return a / b;
}
#endif /* __MULTI_CORE__ */
}
__device_func__(float __sinf(float a))
{
return sinf(a);
skipping to change at line 1261 skipping to change at line 4628
b *= .25f;
}
return __fdividef(a, b);
}
__device_func__(float __tanf(float a))
{
#if defined(__MULTI_CORE__)
return tanf(a);
#else /* __MULTI_CORE__ */
return __fdividef (__sinf(a), __cosf(a));  /* was: return __sinf(a) / __cosf(a); */
#endif /* __MULTI_CORE__ */
}
__device_func__(void __sincosf(float a, float *sptr, float *cptr))
{
#if defined(__MULTI_CORE__)
sincosf(a, sptr, cptr);
#else /* __MULTI_CORE__ */
*sptr = __sinf(a);
*cptr = __cosf(a);
skipping to change at line 1336 skipping to change at line 4703
#else /* __MULTI_CORE__ */
return __internal_accurate_fdividef(a, b);
#endif /* __MULTI_CORE__ */
}
__device_func__(int __clz(int a))
{
return (a)?(158-(__float_as_int(__uint2float_rz((unsigned int)a))>>23)):32;
}
/* (the old copies of __ffs, __popc and __ffsll stood in this area; they were
   moved below, next to the other bit-manipulation helpers, unchanged) */
__device_func__(int __clzll(long long int a))
{
int ahi = ((int)((unsigned long long)a >> 32));
int alo = ((int)((unsigned long long)a & 0xffffffffULL));
int res;
if (ahi) {
res = 0;
} else {
res = 32;
ahi = alo;
}
res = res + __clz(ahi);
return res;
}
__device_func__(int __popc(unsigned int a))
{
a = a - ((a >> 1) & 0x55555555);
a = (a & 0x33333333) + ((a >> 2) & 0x33333333);
a = (a + (a >> 4)) & 0x0f0f0f0f;
a = ((__umul24(a, 0x808080) << 1) + a) >> 24;
return a;
}
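/* __popc above and __popcll below use the classic SWAR reduction: pairwise
   bit sums, then 2-bit and 4-bit field sums, with the final byte sum
   accumulated via __umul24 */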
__device_func__(int __popcll(unsigned long long int a))
{
unsigned int ahi = ((unsigned int)(a >> 32));
unsigned int alo = ((unsigned int)(a & 0xffffffffULL));
alo = alo - ((alo >> 1) & 0x55555555);
alo = (alo & 0x33333333) + ((alo >> 2) & 0x33333333);
ahi = ahi - ((ahi >> 1) & 0x55555555);
ahi = (ahi & 0x33333333) + ((ahi >> 2) & 0x33333333);
alo = alo + ahi;
alo = (alo & 0x0f0f0f0f) + ((alo >> 4) & 0x0f0f0f0f);
alo = ((__umul24(alo, 0x808080) << 1) + alo) >> 24;
return alo;
}
__device_func__(unsigned int __brev(unsigned int a))
{
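/* classic butterfly bit reversal: swap adjacent bits, then 2-bit pairs,
   nibbles, bytes, and finally the two half-words */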
a = ((a >> 1) & 0x55555555) + ((a & 0x55555555) << 1);
a = ((a >> 2) & 0x33333333) + ((a & 0x33333333) << 2);
a = ((a >> 4) & 0x0F0F0F0F) + ((a & 0x0F0F0F0F) << 4);
a = ((a >> 8) & 0x00FF00FF) + ((a & 0x00FF00FF) << 8);
a = ( a >> 16 ) + ( a << 16);
return a;
}
__device_func__(unsigned long long int __brevll(unsigned long long int a))
{
unsigned int hi = (unsigned int)(a >> 32);
unsigned int lo = (unsigned int)(a & 0xffffffffULL);
unsigned int t;
t = __brev(lo);
lo = __brev(hi);
return ((unsigned long long int)t << 32) + (unsigned long long int)lo;
}
__device_func__(int __ffs(int a))
{
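/* a & -a isolates the lowest set bit; __ffs(0) correctly yields 0 because
   __clz(0) returns 32 */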
return 32 - __clz (a & -a);
}
__device_func__(int __ffsll(long long int a))
{
return 64 - __clzll (a & -a);
}
#if defined(CUDA_DOUBLE_MATH_FUNCTIONS) && defined(CUDA_FLOAT_MATH_FUNCTIONS)
#error -- conflicting mode for double math routines
#endif /* CUDA_DOUBLE_MATH_FUNCTIONS && CUDA_FLOAT_MATH_FUNCTIONS */
#if defined(CUDA_FLOAT_MATH_FUNCTIONS)
__device_func__(double fdivide(double a, double b))
{
 End of changes. 31 change blocks; 52 lines changed or deleted, 3441 lines changed or added. 

 device_launch_parameters.h 
/* (NVIDIA copyright banner as above, unchanged except the year: 1993-2008 -> 1993-2009) */
 End of changes. 1 change block; 1 line changed or deleted, 1 line changed or added. 

 device_runtime.h 
/* (NVIDIA copyright banner as above, unchanged except the year: 1993-2008 -> 1993-2009) */
skipping to change at line 56 skipping to change at line 56
s
#define __unsized_shared_data(name, type_post) \
__unsized##name __unsized##type_post
#define __sized_shared_data(name, type) \
__sized##name type
#define __sized__shared_var(name, s, type) \
name
/*TEXTURE_TYPE*/
typedef const void *__texture_type__;
/* removed in the newer header: */
/*SURFACE_TYPE*/
typedef const void *__surface_type__;
#if defined(__CUDABE__) /* cudabe compiler */
#define __pad__(f)
#define __text__ \
__attribute__((__texture__))
/* removed in the newer header: */
#define __surf__ \
__attribute__((__surface__))
#define ___device__(sc) \
static
#define __in__(cdecl, decl) \
__shared__ cdecl
#define __in_type__(cdecl, decl) \
cdecl
#define __texture_var(name) \
name
#define __shared_var(name, s, type) \
name
skipping to change at line 102 skipping to change at line 98
#define __cdecl
#undef __w64
#define __w64
#elif defined(__CUDACC__) /* cudafe compiler */
#define __loc_sc__(loc, sc) \
sc loc
#define __pad__(f)
#define __text__
/* removed in the newer header: */
#define __surf__
#define ___device__(sc) \
sc __device__
#define __in__(cdecl, decl) \
decl
#define __in_type__(cdecl, decl) \
decl
#define __texture_var(name) \
name
#define __shared_var(name, s, type) \
name
skipping to change at line 170 skipping to change at line 165
#endif /* __multi_core__ */ #endif /* __multi_core__ */
#if defined (__MULTI_CORE__) #if defined (__MULTI_CORE__)
#define ___device__(sc) \ #define ___device__(sc) \
static static
#define __pad__(f) \ #define __pad__(f) \
f f
#define __text__ #define __text__
#define __surf__
#define __cudaGet_blockIdx() \ #define __cudaGet_blockIdx() \
(*__cudaGetBlockIdxPtr()) (*__cudaGetBlockIdxPtr())
#define __shared_var(name, s, type) \ #define __shared_var(name, s, type) \
(s type __cudaGetSharedMem((void*)(&(name)))) (s type __cudaGetSharedMem((void*)(&(name))))
#define __var_used__ \ #define __var_used__ \
__attribute__((__used__)) __attribute__((__used__))
#define __storage_auto__shared__ \ #define __storage_auto__shared__ \
auto auto
#undef __cdecl #undef __cdecl
 End of changes. 5 change blocks. 
7 lines changed or deleted 1 lines changed or added


 device_types.h   device_types.h 
/* /*
* Copyright 1993-2008 NVIDIA Corporation. All rights reserved. * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 End of changes. 1 change blocks. 
1 lines changed or deleted 1 lines changed or added


 driver_functions.h   driver_functions.h 
/* /*
* Copyright 1993-2008 NVIDIA Corporation. All rights reserved. * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 End of changes. 1 change blocks. 
1 lines changed or deleted 1 lines changed or added


 driver_types.h   driver_types.h 
/* /*
* Copyright 1993-2008 NVIDIA Corporation. All rights reserved. * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 39 skipping to change at line 39
* source code with only those rights set forth herein. * source code with only those rights set forth herein.
* *
* Any use of this source code in individual and commercial software must * Any use of this source code in individual and commercial software must
* include, in the user documentation and internal comments to the code, * include, in the user documentation and internal comments to the code,
* the above Disclaimer and U.S. Government End Users Notice. * the above Disclaimer and U.S. Government End Users Notice.
*/ */
#if !defined(__DRIVER_TYPES_H__) #if !defined(__DRIVER_TYPES_H__)
#define __DRIVER_TYPES_H__ #define __DRIVER_TYPES_H__
/**
* \file
* \name Data types used by CUDA Runtime
* \author NVIDIA Corporation
* \brief Data types used by CUDA Runtime
*/
/**
* \defgroup CUDART_TYPES Data types used by CUDA Runtime
* \ingroup CUDART
*
* @{
*/
/************************************************************************** ***** /************************************************************************** *****
* * * *
* TYPE DEFINITIONS USED BY RUNTIME API * * TYPE DEFINITIONS USED BY RUNTIME API *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#if !defined(__CUDA_INTERNAL_COMPILATION__) #if !defined(__CUDA_INTERNAL_COMPILATION__)
#include <limits.h> #include <limits.h>
#include <stddef.h> #include <stddef.h>
#define cudaHostAllocDefault 0 ///< Default page-locked allocation flag
#define cudaHostAllocPortable 1 ///< Pinned memory accessible by all CUDA contexts
#define cudaHostAllocMapped 2 ///< Map allocation into device space
#define cudaHostAllocWriteCombined 4 ///< Write-combined memory
#define cudaEventDefault 0 ///< Default event flag
#define cudaEventBlockingSync 1 ///< Event uses blocking synchronization
#define cudaDeviceScheduleAuto 0 ///< Device flag - Automatic scheduling
#define cudaDeviceScheduleSpin 1 ///< Device flag - Spin default scheduling
#define cudaDeviceScheduleYield 2 ///< Device flag - Yield default scheduling
#define cudaDeviceBlockingSync 4 ///< Device flag - Use blocking synchronization
#define cudaDeviceMapHost 8 ///< Device flag - Support mapped pinned allocations
#define cudaDeviceMask 0xf ///< Device flags mask
#endif /* !__CUDA_INTERNAL_COMPILATION__ */ #endif /* !__CUDA_INTERNAL_COMPILATION__ */
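
The new allocation and device flags above combine to give zero-copy access to pinned host memory. A minimal sketch, assuming a CUDA 2.2-era runtime and a device whose canMapHostMemory property is nonzero (the helper name is made up):

#include <cuda_runtime.h>

int use_mapped_buffer(size_t n)
{
    float *h_buf, *d_alias;
    /* must be called before any context is created on the device */
    if (cudaSetDeviceFlags(cudaDeviceMapHost) != cudaSuccess) return -1;
    if (cudaHostAlloc((void**)&h_buf, n * sizeof(float),
                      cudaHostAllocMapped | cudaHostAllocPortable) != cudaSuccess) return -1;
    /* device-side alias of the same pinned allocation */
    if (cudaHostGetDevicePointer((void**)&d_alias, h_buf, 0) != cudaSuccess) return -1;
    /* ... launch kernels that read and write d_alias directly ... */
    cudaFreeHost(h_buf);
    return 0;
}
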
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
/**
* CUDA error types
*/
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
enum cudaError enum cudaError
{ {
cudaSuccess = 0, cudaSuccess = 0, ///< No errors
cudaErrorMissingConfiguration, cudaErrorMissingConfiguration = 1, ///< Missing configuration error
cudaErrorMemoryAllocation, cudaErrorMemoryAllocation = 2, ///< Memory allocation error
cudaErrorInitializationError, cudaErrorInitializationError = 3, ///< Initialization error
cudaErrorLaunchFailure, cudaErrorLaunchFailure = 4, ///< Launch failure
cudaErrorPriorLaunchFailure, cudaErrorPriorLaunchFailure = 5, ///< Prior launch failure
cudaErrorLaunchTimeout, cudaErrorLaunchTimeout = 6, ///< Launch timeout error
cudaErrorLaunchOutOfResources, cudaErrorLaunchOutOfResources = 7, ///< Launch out of resources error
cudaErrorInvalidDeviceFunction, cudaErrorInvalidDeviceFunction = 8, ///< Invalid device function
cudaErrorInvalidConfiguration, cudaErrorInvalidConfiguration = 9, ///< Invalid configuration
cudaErrorInvalidDevice, cudaErrorInvalidDevice = 10, ///< Invalid device
cudaErrorInvalidValue, cudaErrorInvalidValue = 11, ///< Invalid value
cudaErrorInvalidPitchValue, cudaErrorInvalidPitchValue = 12, ///< Invalid pitch value
cudaErrorInvalidSymbol, cudaErrorInvalidSymbol = 13, ///< Invalid symbol
cudaErrorMapBufferObjectFailed, cudaErrorMapBufferObjectFailed = 14, ///< Map buffer object failed
cudaErrorUnmapBufferObjectFailed, cudaErrorUnmapBufferObjectFailed = 15, ///< Unmap buffer object failed
cudaErrorInvalidHostPointer, cudaErrorInvalidHostPointer = 16, ///< Invalid host pointer
cudaErrorInvalidDevicePointer, cudaErrorInvalidDevicePointer = 17, ///< Invalid device pointer
cudaErrorInvalidTexture, cudaErrorInvalidTexture = 18, ///< Invalid texture
cudaErrorInvalidTextureBinding, cudaErrorInvalidTextureBinding = 19, ///< Invalid texture binding
cudaErrorInvalidChannelDescriptor, cudaErrorInvalidChannelDescriptor = 20, ///< Invalid channel descriptor
cudaErrorInvalidMemcpyDirection, cudaErrorInvalidMemcpyDirection = 21, ///< Invalid memcpy direction
cudaErrorAddressOfConstant, cudaErrorAddressOfConstant = 22, ///< Address of constant error
cudaErrorTextureFetchFailed, cudaErrorTextureFetchFailed = 23, ///< Texture fetch failed
cudaErrorTextureNotBound, cudaErrorTextureNotBound = 24, ///< Texture not bound error
cudaErrorSynchronizationError, cudaErrorSynchronizationError = 25, ///< Synchronization error
cudaErrorInvalidFilterSetting, cudaErrorInvalidFilterSetting = 26, ///< Invalid filter setting
cudaErrorInvalidNormSetting, cudaErrorInvalidNormSetting = 27, ///< Invalid norm setting
cudaErrorMixedDeviceExecution, cudaErrorMixedDeviceExecution = 28, ///< Mixed device execution
cudaErrorCudartUnloading, cudaErrorCudartUnloading = 29, ///< CUDA runtime unloading
cudaErrorUnknown, cudaErrorUnknown = 30, ///< Unknown error condition
cudaErrorNotYetImplemented, cudaErrorNotYetImplemented = 31, ///< Function not yet implemented
cudaErrorMemoryValueTooLarge, cudaErrorMemoryValueTooLarge = 32, ///< Memory value too large
cudaErrorInvalidResourceHandle, cudaErrorInvalidResourceHandle = 33, ///< Invalid resource handle
cudaErrorNotReady, cudaErrorNotReady = 34, ///< Not ready error
cudaErrorInsufficientDriver, cudaErrorInsufficientDriver = 35, ///< CUDA runtime is newer than driver
cudaErrorSetOnActiveProcess, cudaErrorSetOnActiveProcess = 36, ///< Set on active process error
cudaErrorNoDevice = 38, ///< No available CUDA device
cudaErrorStartupFailure = 0x7f, cudaErrorStartupFailure = 0x7f, ///< Startup failure
cudaErrorApiFailureBase = 10000 cudaErrorApiFailureBase = 10000 ///< API failure base
}; };
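
With the enumerators now carrying fixed numeric values, the usual pattern is to funnel every cudaError_t through a single checkpoint. A minimal sketch (CUDA_CHECK is a hypothetical helper; it assumes <stdio.h> is included and the enclosing function returns cudaError_t):

#define CUDA_CHECK(call)                                           \
    do {                                                           \
        cudaError_t err_ = (call);                                 \
        if (err_ != cudaSuccess) {                                 \
            fprintf(stderr, "%s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                     \
            return err_;                                           \
        }                                                          \
    } while (0)
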
/**
* Channel format kind
*/
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
enum cudaChannelFormatKind enum cudaChannelFormatKind
{ {
cudaChannelFormatKindSigned, cudaChannelFormatKindSigned = 0, ///< Signed channel format
cudaChannelFormatKindUnsigned, cudaChannelFormatKindUnsigned = 1, ///< Unsigned channel format
cudaChannelFormatKindFloat, cudaChannelFormatKindFloat = 2, ///< Float channel format
cudaChannelFormatKindNone cudaChannelFormatKindNone = 3, ///< No channel format
}; };
/**
* CUDA Channel format descriptor
*/
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct cudaChannelFormatDesc struct cudaChannelFormatDesc
{ {
int x; int x; ///< x
int y; int y; ///< y
int z; int z; ///< z
int w; int w; ///< w
enum cudaChannelFormatKind f; enum cudaChannelFormatKind f; ///< Channel format kind
}; };
/**
* CUDA array
*/
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct cudaArray; struct cudaArray;
/**
* CUDA memory copy types
*/
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
enum cudaMemcpyKind enum cudaMemcpyKind
{ {
cudaMemcpyHostToHost = 0, cudaMemcpyHostToHost = 0, ///< Host -> Host
cudaMemcpyHostToDevice, cudaMemcpyHostToDevice = 1, ///< Host -> Device
cudaMemcpyDeviceToHost, cudaMemcpyDeviceToHost = 2, ///< Device -> Host
cudaMemcpyDeviceToDevice cudaMemcpyDeviceToDevice = 3 ///< Device -> Device
}; };
/**
* CUDA Pitched memory pointer
*/
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct cudaPitchedPtr struct cudaPitchedPtr
{ {
void *ptr; void *ptr; ///< Pointer to allocated memory
size_t pitch; size_t pitch; ///< Pitch of allocated memory in bytes
size_t xsize; size_t xsize; ///< Logical width of allocation in bytes
size_t ysize; size_t ysize; ///< Logical height of allocation in bytes
}; };
/**
* CUDA extent
*/
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct cudaExtent struct cudaExtent
{ {
size_t width; size_t width; ///< Width in bytes
size_t height; size_t height; ///< Height in bytes
size_t depth; size_t depth; ///< Depth in bytes
}; };
/**
* CUDA 3D position
*/
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct cudaPos struct cudaPos
{ {
size_t x; size_t x; ///< x
size_t y; size_t y; ///< y
size_t z; size_t z; ///< z
}; };
/**
* CUDA 3D memory copying parameters
*/
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct cudaMemcpy3DParms struct cudaMemcpy3DParms
{ {
struct cudaArray *srcArray; struct cudaArray *srcArray; ///< Source memory address
struct cudaPos srcPos; struct cudaPos srcPos; ///< Source position offset
struct cudaPitchedPtr srcPtr; struct cudaPitchedPtr srcPtr; ///< Pitched source memory address
struct cudaArray *dstArray; struct cudaArray *dstArray; ///< Destination memory address
struct cudaPos dstPos; struct cudaPos dstPos; ///< Destination position offset
struct cudaPitchedPtr dstPtr; struct cudaPitchedPtr dstPtr; ///< Pitched destination memory address
struct cudaExtent extent; struct cudaExtent extent; ///< Requested memory copy size
enum cudaMemcpyKind kind; enum cudaMemcpyKind kind; ///< Type of transfer
};
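
cudaMemcpy3DParms ties the pitched-pointer, position, and extent types above together. A hedged sketch of the intended usage with cudaMalloc3D and the make_* helpers from driver_functions.h (copy_volume is a made-up name; widths are in bytes, matching the field comments):

#include <cuda_runtime.h>

cudaError_t copy_volume(void *h_src, size_t w_bytes, size_t h, size_t d)
{
    struct cudaPitchedPtr d_vol;
    struct cudaExtent ext = make_cudaExtent(w_bytes, h, d);
    struct cudaMemcpy3DParms p = {0};       /* unused src/dst fields must stay zero */
    cudaError_t err = cudaMalloc3D(&d_vol, ext);
    if (err != cudaSuccess) return err;
    p.srcPtr = make_cudaPitchedPtr(h_src, w_bytes, w_bytes, h);
    p.dstPtr = d_vol;
    p.extent = ext;
    p.kind   = cudaMemcpyHostToDevice;
    err = cudaMemcpy3D(&p);                 /* honors both pitches row by row */
    cudaFree(d_vol.ptr);                    /* sketch only: free immediately */
    return err;
}
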
/**
* CUDA function attributes
*/
/*DEVICE_BUILTIN*/
struct cudaFuncAttributes
{
size_t sharedSizeBytes; ///< Size of shared memory in bytes
size_t constSizeBytes; ///< Size of constant memory in bytes
size_t localSizeBytes; ///< Size of local memory in bytes
int maxThreadsPerBlock; ///< Maximum number of threads per block
int numRegs; ///< Number of registers used
int __cudaReserved[8];
};
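
A short sketch of how this struct is meant to be filled in, through the cudaFuncGetAttributes entry point added alongside it in this release (names are illustrative; the string lookup assumes an extern "C" kernel so the entry name is unmangled):

#include <stdio.h>
#include <cuda_runtime.h>

extern "C" __global__ void my_kernel(float *p);

cudaError_t print_kernel_limits(void)
{
    struct cudaFuncAttributes attr;
    cudaError_t err = cudaFuncGetAttributes(&attr, "my_kernel");
    if (err != cudaSuccess) return err;
    printf("regs=%d smem=%luB max threads/block=%d\n",
           attr.numRegs, (unsigned long)attr.sharedSizeBytes,
           attr.maxThreadsPerBlock);
    return cudaSuccess;
}
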
/**
* CUDA device compute modes
*/
/*DEVICE_BUILTIN*/
enum cudaComputeMode
{
cudaComputeModeDefault = 0, ///< Default compute mode (Multiple threads can use ::cudaSetDevice() with this device)
cudaComputeModeExclusive = 1, ///< Compute-exclusive mode (Only one thread will be able to use ::cudaSetDevice() with this device)
cudaComputeModeProhibited = 2 ///< Compute-prohibited mode (No threads can use ::cudaSetDevice() with this device)
}; };
/**
* CUDA device properties
*/
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct cudaDeviceProp struct cudaDeviceProp
{ {
char name[256]; char name[256]; ///< ASCII string identifying device
size_t totalGlobalMem; size_t totalGlobalMem; ///< Global memory available on device in bytes
size_t sharedMemPerBlock; size_t sharedMemPerBlock; ///< Shared memory available per block in bytes
int regsPerBlock; int regsPerBlock; ///< 32-bit registers available per block
int warpSize; int warpSize; ///< Warp size in threads
size_t memPitch; size_t memPitch; ///< Maximum pitch in bytes allowed by memory copies
int maxThreadsPerBlock; int maxThreadsPerBlock; ///< Maximum number of threads per block
int maxThreadsDim[3]; int maxThreadsDim[3]; ///< Maximum size of each dimension of a block
int maxGridSize[3]; int maxGridSize[3]; ///< Maximum size of each dimension of a grid
int clockRate; int clockRate; ///< Clock frequency in kilohertz
size_t totalConstMem; size_t totalConstMem; ///< Constant memory available on device in bytes
int major; int major; ///< Major compute capability
int minor; int minor; ///< Minor compute capability
size_t textureAlignment; size_t textureAlignment; ///< Alignment requirement for textures
int deviceOverlap; int deviceOverlap; ///< Device can concurrently copy memory and execute a kernel
int multiProcessorCount; int multiProcessorCount; ///< Number of multiprocessors on device
int kernelExecTimeoutEnabled; int kernelExecTimeoutEnabled; ///< Specified whether there is a run time limit on kernels
int __cudaReserved[39]; int integrated; ///< Device is integrated as opposed to discrete
int canMapHostMemory; ///< Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer
int computeMode; ///< Compute mode (See ::cudaComputeMode)
int __cudaReserved[36];
}; };
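
The three new fields pair naturally with the compute-mode enum above. A minimal sketch of a device-selection loop using them (pick_mappable_device is a made-up helper):

#include <cuda_runtime.h>

int pick_mappable_device(void)
{
    int n = 0, i;
    if (cudaGetDeviceCount(&n) != cudaSuccess) return -1;
    for (i = 0; i < n; ++i) {
        struct cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) != cudaSuccess) continue;
        /* skip devices another process has made unusable */
        if (prop.computeMode == cudaComputeModeProhibited) continue;
        if (prop.canMapHostMemory) return i;  /* first device usable for zero-copy */
    }
    return -1;
}
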
#define cudaDevicePropDontCare \ #define cudaDevicePropDontCare \
{ \ { \
{'\0'}, /* char name[256]; */ \ {'\0'}, /* char name[256]; */ \
0, /* size_t totalGlobalMem; */ \ 0, /* size_t totalGlobalMem; */ \
0, /* size_t sharedMemPerBlock; */ \ 0, /* size_t sharedMemPerBlock; */ \
0, /* int regsPerBlock; */ \ 0, /* int regsPerBlock; */ \
0, /* int warpSize; */ \ 0, /* int warpSize; */ \
0, /* size_t memPitch; */ \ 0, /* size_t memPitch; */ \
0, /* int maxThreadsPerBlock; */ \ 0, /* int maxThreadsPerBlock; */ \
{0, 0, 0}, /* int maxThreadsDim[3]; */ \ {0, 0, 0}, /* int maxThreadsDim[3]; */ \
{0, 0, 0}, /* int maxGridSize[3]; */ \ {0, 0, 0}, /* int maxGridSize[3]; */ \
0, /* int clockRate; */ \ 0, /* int clockRate; */ \
0, /* size_t totalConstMem; */ \ 0, /* size_t totalConstMem; */ \
-1, /* int major; */ \ -1, /* int major; */ \
-1, /* int minor; */ \ -1, /* int minor; */ \
0, /* size_t textureAlignment; */ \ 0, /* size_t textureAlignment; */ \
-1, /* int deviceOverlap; */ \ -1, /* int deviceOverlap; */ \
0, /* int multiProcessorCount; */ \ 0, /* int multiProcessorCount; */ \
0 /* int kernelExecTimeoutEnabled */ \ 0, /* int kernelExecTimeoutEnabled */ \
} 0, /* int integrated */ \
0, /* int canMapHostMemory */ \
0, /* int computeMode */ \
} ///< Empty device properties
/************************************************************************** ***** /************************************************************************** *****
* * * *
* SHORTHAND TYPE DEFINITION USED BY RUNTIME API * * SHORTHAND TYPE DEFINITION USED BY RUNTIME API *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
/**
* CUDA Error types
*/
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
typedef enum cudaError cudaError_t; typedef enum cudaError cudaError_t;
/**
* CUDA stream
*/
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
typedef int cudaStream_t; typedef int cudaStream_t;
/**
* CUDA event types
*/
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
typedef int cudaEvent_t; typedef int cudaEvent_t;
/** @} */ /* END CUDART_TYPES */
#endif /* !__DRIVER_TYPES_H__ */ #endif /* !__DRIVER_TYPES_H__ */
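
cudaEvent_t is still a plain int handle here, but the new cudaEventBlockingSync flag changes how the host waits on it: the waiting thread yields instead of spinning. A hedged sketch, assuming the 2.2-era cudaEventCreateWithFlags entry point (time_async_work is a made-up name):

cudaError_t time_async_work(float *ms)
{
    cudaEvent_t start, stop;
    cudaError_t err;
    if ((err = cudaEventCreateWithFlags(&start, cudaEventDefault)) != cudaSuccess) return err;
    if ((err = cudaEventCreateWithFlags(&stop, cudaEventBlockingSync)) != cudaSuccess) return err;
    cudaEventRecord(start, 0);
    /* ... kernels and async copies on stream 0 ... */
    cudaEventRecord(stop, 0);
    if ((err = cudaEventSynchronize(stop)) != cudaSuccess) return err;
    err = cudaEventElapsedTime(ms, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return err;
}
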
 End of changes. 29 change blocks. 
91 lines changed or deleted 253 lines changed or added


 func_macro.h   func_macro.h 
/* /*
* Copyright 1993-2008 NVIDIA Corporation. All rights reserved. * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 End of changes. 1 change blocks. 
1 lines changed or deleted 1 lines changed or added


 host_config.h   host_config.h 
/* /*
* Copyright 1993-2008 NVIDIA Corporation. All rights reserved. * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 49 skipping to change at line 49
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#if defined(__CUDACC__) #if defined(__CUDACC__)
#if defined(__APPLE__) #if defined(__APPLE__)
#define _CRTIMP
#define __THROW #define __THROW
#if defined(__MULTI_CORE__)
#error multicore not supported for MacOs
#endif /* __MULTI_CORE__ */
#elif defined(__GNUC__) #elif defined(__GNUC__)
#include <features.h> /* for __THROW */ #define _CRTIMP
#include <bits/c++config.h> /* get _GLIBCXX_ATOMIC_BUILTINS */ #if defined(__MULTI_CORE__) && __GNUC__ > 3
#error multicore not supported for gcc 4.x
#endif /* __MULTI_CORE__ & __GNUC__ > 3 */
#include <features.h> /* for __THROW */
#include <bits/c++config.h> /* for _GLIBCXX_ATOMIC_BUILTINS */
#if _GLIBCXX_ATOMIC_BUILTINS == 1 #if _GLIBCXX_ATOMIC_BUILTINS == 1
#undef _GLIBCXX_ATOMIC_BUILTINS /* for missing __sync_fetch_and_add */ #undef _GLIBCXX_ATOMIC_BUILTINS /* for missing __sync_fetch_and_add */
#endif /* _GLIBCXX_ATOMIC_BUILTINS == 1 */ #endif /* _GLIBCXX_ATOMIC_BUILTINS == 1 */
#elif defined(_WIN32) #elif defined(_WIN32)
#if _MSC_VER >= 1400 #if defined(__MULTI_CORE__) && _MSC_VER != 1400
#error multicore support available only for VC8
#endif /* __MULTI_CORE__ & _MSC_VER != 1400 */
#if _MSC_VER >= 1500
#undef _USE_DECLSPECS_FOR_SAL #undef _USE_DECLSPECS_FOR_SAL
#define _USE_DECLSPECS_FOR_SAL \ #define _USE_DECLSPECS_FOR_SAL \
1 1
#endif /* _MSC_VER >= 1500 */
#if _MSC_VER >= 1400
#if !defined(_CRT_NONSTDC_NO_WARNINGS) #if !defined(_CRT_NONSTDC_NO_WARNINGS)
#define _CRT_NONSTDC_NO_WARNINGS /* to suppress warnings */ #define _CRT_NONSTDC_NO_WARNINGS /* to suppress warnings */
#endif /* _CRT_NONSTDC_NO_WARNINGS */ #endif /* _CRT_NONSTDC_NO_WARNINGS */
#if !defined(_CRT_SECURE_NO_WARNINGS) #if !defined(_CRT_SECURE_NO_WARNINGS)
#define _CRT_SECURE_NO_WARNINGS /* to suppress warnings */ #define _CRT_SECURE_NO_WARNINGS /* to suppress warnings */
#endif /* _CRT_SECURE_NO_WARNINGS */ #endif /* _CRT_SECURE_NO_WARNINGS */
#endif /* _MSC_VER >= 1400 */ #endif /* _MSC_VER >= 1400 */
#if !defined(NOMINMAX) #if !defined(NOMINMAX)
#define NOMINMAX /* min and max are part of cuda runtime */ #define NOMINMAX /* min and max are part of cuda runtime */
#endif /* !NOMINMAX */ #endif /* !NOMINMAX */
#define __THROW #include <crtdefs.h> /* for _CRTIMP */
/* forward declarations for windows C++ header files */
#include <stddef.h>
class type_info;
#if !defined(_CRTIMP)
#if defined(_DLL)
#define _CRTIMP \
__declspec(dllimport)
#else /* _DLL */
#define _CRTIMP
#endif /* _DLL */
#endif /* !_CRTIMP */
#if defined(_DEBUG)
#if !defined(_NATIVE_WCHAR_T_DEFINED) && defined(_M_CEE_PURE)
extern "C++"
#else /* !_NATIVE_WCHAR_T_DEFINED && _M_CEE_PURE */
extern "C"
#endif /* !_NATIVE_WCHAR_T_DEFINED && _M_CEE_PURE */
_CRTIMP void __cdecl _invalid_parameter(const wchar_t*, const wchar_t*, const wchar_t*, unsigned int, uintptr_t);
#else /* _DEBUG */
extern "C" _CRTIMP void __cdecl _invalid_parameter_noinfo(void);
#endif /* DEBUG */
namespace std
{
struct _Secure_char_traits_tag;
namespace _Traits_helper
{
template<class T> inline typename T::char_type *copy_s(typename T::char_type*, size_t, const typename T::char_type*, size_t, _Secure_char_traits_tag);
template<class T> inline typename T::char_type *move_s(typename T::char_type*, size_t, const typename T::char_type*, size_t, _Secure_char_traits_tag);
}
}
namespace stdext #define __THROW
{
template<class U, class V, class W> inline V unchecked_uninitialized_copy(U, U, V, W&);
}
#endif /* __GNUC__ */ #endif /* __GNUC__ */
#endif /* __CUDACC__ */ #endif /* __CUDACC__ */
#endif /* !__HOST_CONFIG_H__ */ #endif /* !__HOST_CONFIG_H__ */
 End of changes. 9 change blocks. 
50 lines changed or deleted 30 lines changed or added


 host_defines.h   host_defines.h 
/* /*
* Copyright 1993-2008 NVIDIA Corporation. All rights reserved. * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 121 skipping to change at line 121
#define __device__ \ #define __device__ \
__location__(__device__) __location__(__device__)
#define __host__ \ #define __host__ \
__location__(__host__) __location__(__host__)
#define __global__ \ #define __global__ \
__location__(__global__) __location__(__global__)
#define __shared__ \ #define __shared__ \
__location__(__shared__) __location__(__shared__)
#define __constant__ \ #define __constant__ \
__location__(__constant__) __location__(__constant__)
#define __launch_bounds__(t, b) \ #define __launch_bounds__(t) \
__location__(__launch_bounds__(t, b)) __location__(__launch_bounds__(t))
#endif /* !__HOST_DEFINES_H__ */ #endif /* !__HOST_DEFINES_H__ */
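
The macro now forwards a single argument instead of two. A sketch of the one-argument form in user code, capping the block size the compiler may assume when allocating registers (the kernel is illustrative):

__global__ void __launch_bounds__(256) scale(float *p, float s, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) p[i] *= s;
}
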
 End of changes. 2 change blocks. 
3 lines changed or deleted 3 lines changed or added


 host_runtime.h   host_runtime.h 
/* /*
* Copyright 1993-2008 NVIDIA Corporation. All rights reserved. * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 63 skipping to change at line 63
#define __extern_weak__ \ #define __extern_weak__ \
__weak_import__, __weak_import__,
#elif defined(__GNUC__) #elif defined(__GNUC__)
#define __extern_weak__ #define __extern_weak__
#endif /* __APPLE__ */ #endif /* __APPLE__ */
#if !defined(__cplusplus) #if defined(__cplusplus)
#define __device_stub_name(c, cpp) \
cpp
#define __cppref__ \
&
#else /* __cplusplus */
#define __device_stub_name(c, cpp) \
c
#define __cppref__
typedef char bool; typedef char bool;
#endif /* !__cplusplus */ #endif /* __cplusplus */
#include "cuda_runtime_api.h" #include "cuda_runtime_api.h"
#include "storage_class.h" #include "storage_class.h"
#else /* !__CUDA_INTERNAL_COMPILATION__ */ #else /* !__CUDA_INTERNAL_COMPILATION__ */
#include "host_defines.h" #include "host_defines.h"
#define __cudaRegisterBinary() \ #define __cudaRegisterBinary() \
__cudaFatCubinHandle = __cudaRegisterFatBinary((void*)__cudaFatCubin); \ __cudaFatCubinHandle = __cudaRegisterFatBinary((void*)__cudaFatCubin); \
atexit(__cudaUnregisterBinaryUtil) atexit(__cudaUnregisterBinaryUtil)
#define __cudaRegisterVariable(var, ext, size, constant, global) \ #define __cudaRegisterVariable(var, ext, size, constant, global) \
__cudaRegisterVar(__cudaFatCubinHandle, (char*)&__host##var, (char*)__device##var, __name##var, ext, size, constant, global) __cudaRegisterVar(__cudaFatCubinHandle, (char*)&__host##var, (char*)__device##var, __name##var, ext, size, constant, global)
#define __cudaRegisterGlobalTexture(tex, dim, norm, ext) \ #define __cudaRegisterGlobalTexture(tex, dim, norm, ext) \
__cudaRegisterTexture(__cudaFatCubinHandle, (const struct textureReference*)&tex, __tex_var(tex), #tex, dim, norm, ext) __cudaRegisterTexture(__cudaFatCubinHandle, (const struct textureReference*)&tex, __tex_var(tex), #tex, dim, norm, ext)
#define __cudaRegisterUnsizedShared(var) \ #define __cudaRegisterUnsizedShared(var) \
__cudaRegisterShared(__cudaFatCubinHandle, (void**)__device_var(var)) __cudaRegisterShared(__cudaFatCubinHandle, (void**)__device_var(var))
#define __cudaRegisterSharedVariable(var, size, align, sc) \ #define __cudaRegisterSharedVariable(var, size, align, sc) \
__cudaRegisterSharedVar(__cudaFatCubinHandle, (void**)__device_var(var), size, align, sc) __cudaRegisterSharedVar(__cudaFatCubinHandle, (void**)__device_var(var), size, align, sc)
#define __cudaRegisterEntry(fun, thread_limit) \ #define __cudaRegisterEntry(funptr, fun, thread_limit) \
__cudaRegisterFunction(__cudaFatCubinHandle, (const char*)__device_stub_##fun, (char*)__device_fun(fun), #fun, thread_limit, __ids) __cudaRegisterFunction(__cudaFatCubinHandle, (const char*)funptr, (char*)__device_fun(fun), #fun, thread_limit, __ids)
#define __cudaInitArgBlock(arg) \ #define __cudaInitArgBlock(arg) \
char __[256]; \ char __[256]; \
*(char**)&arg = __ *(char**)&arg = __
#define __cudaSetupArg(arg, offset) \ #define __cudaSetupArg(arg, offset) \
if (cudaSetupArgument((void*)(char*)&arg, sizeof(arg), (size_t)&offset->arg - (size_t)offset) != cudaSuccess) \ if (cudaSetupArgument((void*)(char*)&arg, sizeof(arg), (size_t)&offset->arg - (size_t)offset) != cudaSuccess) \
return return
#define __cudaLaunch(fun) \ #define __cudaLaunch(fun) \
{ volatile static char *__f; __f = fun; (void)cudaLaunch(fun); } { volatile static char *__f; __f = fun; (void)cudaLaunch(fun); }
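
For orientation, a hedged sketch of the call sequence these macros generate, written out by hand against the public runtime entry points. vec_scale and launch_scale are made-up names; the extern "C" keeps the "vec_scale" entry name unmangled, and the running offsets must match the device-side argument layout, which the __cudaSetupArg struct trick above guarantees in generated code:

extern "C" __global__ void vec_scale(float *p, float s, int n);

cudaError_t launch_scale(float *d_p, float s, int n)
{
    size_t off = 0;
    cudaError_t err = cudaConfigureCall(dim3((n + 255) / 256), dim3(256), 0, 0);
    if (err != cudaSuccess) return err;
    cudaSetupArgument(&d_p, sizeof(d_p), off); off += sizeof(d_p);
    cudaSetupArgument(&s,   sizeof(s),   off); off += sizeof(s);
    cudaSetupArgument(&n,   sizeof(n),   off); off += sizeof(n);
    return cudaLaunch("vec_scale");            /* entry looked up by name */
}
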
 End of changes. 4 change blocks. 
6 lines changed or deleted 17 lines changed or added


 math_constants.h   math_constants.h 
/* /*
* Copyright 1993-2008 NVIDIA Corporation. All rights reserved. * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 End of changes. 1 change blocks. 
1 lines changed or deleted 1 lines changed or added


 math_functions.h   math_functions.h 
/* /*
* Copyright 1993-2008 NVIDIA Corporation. All rights reserved. * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 339 skipping to change at line 339
extern __host__ __device__ double remquo(double, double, int*) __THROW; extern __host__ __device__ double remquo(double, double, int*) __THROW;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __host__ __device__ float remquof(float, float, int*) __THROW; extern __host__ __device__ float remquof(float, float, int*) __THROW;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __host__ __device__ double erf(double) __THROW; extern __host__ __device__ double erf(double) __THROW;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __host__ __device__ float erff(float) __THROW; extern __host__ __device__ float erff(float) __THROW;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __host__ __device__ double erfinv(double) __THROW;
/*DEVICE_BUILTIN*/
extern __host__ __device__ float erfinvf(float) __THROW;
/*DEVICE_BUILTIN*/
extern __host__ __device__ double erfc(double) __THROW; extern __host__ __device__ double erfc(double) __THROW;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __host__ __device__ float erfcf(float) __THROW; extern __host__ __device__ float erfcf(float) __THROW;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __host__ __device__ double erfcinv(double) __THROW;
/*DEVICE_BUILTIN*/
extern __host__ __device__ float erfcinvf(float) __THROW;
/*DEVICE_BUILTIN*/
extern __host__ __device__ double lgamma(double) __THROW; extern __host__ __device__ double lgamma(double) __THROW;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __host__ __device__ float lgammaf(float) __THROW; extern __host__ __device__ float lgammaf(float) __THROW;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __host__ __device__ double tgamma(double) __THROW; extern __host__ __device__ double tgamma(double) __THROW;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __host__ __device__ float tgammaf(float) __THROW; extern __host__ __device__ float tgammaf(float) __THROW;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
skipping to change at line 670 skipping to change at line 680
static __inline__ __host__ __device__ void sincos(float a, float *sptr, float *cptr) static __inline__ __host__ __device__ void sincos(float a, float *sptr, float *cptr)
{ {
sincosf(a, sptr, cptr); sincosf(a, sptr, cptr);
} }
static __inline__ __host__ __device__ float erf(float a) static __inline__ __host__ __device__ float erf(float a)
{ {
return erff(a); return erff(a);
} }
static __inline__ __host__ __device__ float erfinv(float a)
{
return erfinvf(a);
}
static __inline__ __host__ __device__ float erfc(float a) static __inline__ __host__ __device__ float erfc(float a)
{ {
return erfcf(a); return erfcf(a);
} }
static __inline__ __host__ __device__ float erfcinv(float a)
{
return erfcinvf(a);
}
static __inline__ __host__ __device__ float lgamma(float a) static __inline__ __host__ __device__ float lgamma(float a)
{ {
return lgammaf(a); return lgammaf(a);
} }
static __inline__ __host__ __device__ float tgamma(float a) static __inline__ __host__ __device__ float tgamma(float a)
{ {
return tgammaf(a); return tgammaf(a);
} }
skipping to change at line 1055 skipping to change at line 1075
return copysignf(u, a); return copysignf(u, a);
} }
} }
__device_func__(float __internal_fminf(float a, float b)) __device_func__(float __internal_fminf(float a, float b))
{ {
volatile union { volatile union {
float f; float f;
unsigned int i; unsigned int i;
} cvta, cvtb; } cvta, cvtb;
int nana, nanb;
cvta.f = a; cvta.f = a;
cvtb.f = b; cvtb.f = b;
if ((cvta.i << 1) > 0xff000000) return b; nana = ((cvta.i << 1) > 0xff000000);
if ((cvtb.i << 1) > 0xff000000) return a; nanb = ((cvtb.i << 1) > 0xff000000);
if (nana && nanb) return a + b;
if (nana) return b;
if (nanb) return a;
if ((cvta.i | cvtb.i) == 0x80000000) { if ((cvta.i | cvtb.i) == 0x80000000) {
return CUDART_NEG_ZERO_F; return CUDART_NEG_ZERO_F;
} }
return a < b ? a : b; return a < b ? a : b;
} }
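
The rewritten min/max helpers now follow the IEEE 754 rules for NaN and signed zero instead of letting a NaN fall through the comparison. A host-side C99 check of the same rules (illustrative only):

#include <math.h>
#include <stdio.h>

int main(void)
{
    float qnan = nanf("");
    printf("%g\n", fminf(qnan, 3.0f));   /* 3: a quiet NaN loses to a number */
    printf("%g\n", fminf(2.0f, qnan));   /* 2 */
    printf("%g\n", fminf(qnan, qnan));   /* nan: both NaN propagates */
    printf("%g\n", fminf(-0.0f, 0.0f));  /* -0: signed-zero rule */
    return 0;
}
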
__device_func__(float __internal_fmaxf(float a, float b)) __device_func__(float __internal_fmaxf(float a, float b))
{ {
volatile union { volatile union {
float f; float f;
unsigned int i; unsigned int i;
} cvta, cvtb; } cvta, cvtb;
int nana, nanb;
cvta.f = a; cvta.f = a;
cvtb.f = b; cvtb.f = b;
if ((cvta.i << 1) > 0xff000000) return b; nana = ((cvta.i << 1) > 0xff000000);
if ((cvtb.i << 1) > 0xff000000) return a; nanb = ((cvtb.i << 1) > 0xff000000);
if (nana && nanb) return a + b;
if (nana) return b;
if (nanb) return a;
if ((cvta.f == 0.0f) && (cvtb.f == 0.0f)) { if ((cvta.f == 0.0f) && (cvtb.f == 0.0f)) {
cvta.i &= cvtb.i; cvta.i &= cvtb.i;
return cvta.f; return cvta.f;
} }
return a > b ? a : b; return a > b ? a : b;
} }
#if defined(_WIN32) #if defined(_WIN32)
__func__(double trunc(double a)) __func__(double trunc(double a))
skipping to change at line 1140 skipping to change at line 1168
return ceilf(a); return ceilf(a);
} }
__device_func__(float __cuda_floorf(float a)) __device_func__(float __cuda_floorf(float a))
{ {
return floorf(a); return floorf(a);
} }
__device_func__(float __cuda_sqrtf(float a)) __device_func__(float __cuda_sqrtf(float a))
{ {
return sqrtf(a); return sqrtf(a);
} }
__device_func__(float __cuda_rsqrtf(float a)) __device_func__(float __cuda_rsqrtf(float a))
{ {
return 1.0f / sqrtf(a); return 1.0f / sqrtf(a);
} }
__device_func__(float __cuda_truncf(float a)) __device_func__(float __cuda_truncf(float a))
{ {
return truncf(a); return truncf(a);
} }
__device_func__(int __cuda_max(int a, int b)) __device_func__(int __cuda_max(int a, int b))
{ {
return max(a, b); return max(a, b);
skipping to change at line 1308 skipping to change at line 1336
#endif /* __CUDABE__ */ #endif /* __CUDABE__ */
return a; return a;
} }
__device_func__(float __cuda_nanf(const char *tagp)) __device_func__(float __cuda_nanf(const char *tagp))
{ {
/* the GPU only has one canonical QNaN, so return that */ /* the GPU only has one canonical QNaN, so return that */
return CUDART_NAN_F; return CUDART_NAN_F;
} }
__device_func__(float __internal_fmad(float a, float b, float c))
{
return a * b + c;
}
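
Wrapping the multiply-add in a named helper makes the FMAD contraction explicit at each call site instead of leaving it to expression reordering. A sketch of the Horner-style polynomial pattern the rest of the file now uses (poly3 and its coefficients are made up):

__device_func__(float poly3(float x))
{
    float t = 1.0f;                        /* c3 */
    t = __internal_fmad (t, x, -2.0f);     /* c3*x + c2 */
    t = __internal_fmad (t, x,  3.0f);     /* (...)*x + c1 */
    t = __internal_fmad (t, x, -4.0f);     /* (...)*x + c0 */
    return t;
}
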
/* approximate 2*atanh(a/2) for |a| < 0.245 */ /* approximate 2*atanh(a/2) for |a| < 0.245 */
__device_func__(float __internal_atanhf_kernel(float a_1, float a_2)) __device_func__(float __internal_atanhf_kernel(float a_1, float a_2))
{ {
float a, a2, t; float a, a2, t;
a = a_1 + a_2; a = a_1 + a_2;
a2 = a * a; a2 = a * a;
t = 1.566305595598990E-001f/64.0f; t = 1.566305595598990E-001f/64.0f;
t = t * a2 + 1.995081856004762E-001f/16.0f; t = __internal_fmad (t, a2, 1.995081856004762E-001f/16.0f);
t = t * a2 + 3.333382699617026E-001f/4.0f; t = __internal_fmad (t, a2, 3.333382699617026E-001f/4.0f);
t = t * a2; t = t * a2;
t = t * a + a_2; t = __internal_fmad (t, a, a_2);
t = t + a_1; t = t + a_1;
return t; return t;
} }
/* compute atan(r) in first octant, i.e. 0 <= r <= 1 /* compute atan(r) in first octant, i.e. 0 <= r <= 1
* eps ~= 2.16e-7 * eps ~= 2.16e-7
*/ */
__device_func__(float __internal_atanf_kernel(float a)) __device_func__(float __internal_atanf_kernel(float a))
{ {
float t4, t0, t1; float t4, t0, t1;
t4 = a * a; t4 = a * a;
t0 = - 5.674867153f; t0 = -5.674867153f;
t0 = t4 * - 0.823362947f + t0; t0 = __internal_fmad (t4, -0.823362947f, t0);
t0 = t0 * t4 - 6.565555096f; t0 = __internal_fmad (t0, t4, -6.565555096f);
t0 = t0 * t4; t0 = t0 * t4;
t0 = t0 * a; t0 = t0 * a;
t1 = t4 + 11.33538818f; t1 = t4 + 11.33538818f;
t1 = t1 * t4 + 28.84246826f; t1 = __internal_fmad (t1, t4, 28.84246826f);
t1 = t1 * t4 + 19.69667053f; t1 = __internal_fmad (t1, t4, 19.69667053f);
t1 = 1.0f / t1; t1 = 1.0f / t1;
a = t0 * t1 + a; a = __internal_fmad (t0, t1, a);
return a; return a;
} }
/* approximate tangent on -pi/4...+pi/4 */ /* approximate tangent on -pi/4...+pi/4 */
__device_func__(float __internal_tan_kernel(float a)) __device_func__(float __internal_tan_kernel(float a))
{ {
float a2, s, t; float a2, s, t;
a2 = a * a; a2 = a * a;
t = 4.114678393115178E-003f * a2 - 8.231194034909670E-001f; t = __internal_fmad (4.114678393115178E-003f, a2, -8.231194034909670E-001f);
s = a2 - 2.469348886157666E+000f; s = a2 - 2.469348886157666E+000f;
s = 1.0f / s; s = 1.0f / s;
t = t * s; t = t * s;
t = t * a2; t = t * a2;
t = t * a + a; t = __internal_fmad (t, a, a);
return t; return t;
} }
__device_func__(float __internal_accurate_logf(float a)) __device_func__(float __internal_accurate_logf(float a))
{ {
float t; float t;
float z; float z;
float m; float m;
int ia, e; int ia, e;
ia = __float_as_int(a); ia = __float_as_int(a);
skipping to change at line 1380 skipping to change at line 1413
} }
/* log(a) = 2 * atanh((a-1)/(a+1)) */ /* log(a) = 2 * atanh((a-1)/(a+1)) */
m = __int_as_float((ia & 0x807fffff) | 0x3f800000); m = __int_as_float((ia & 0x807fffff) | 0x3f800000);
e = ((unsigned)ia >> 23) - 127; e = ((unsigned)ia >> 23) - 127;
if (m > CUDART_SQRT_TWO_F) { if (m > CUDART_SQRT_TWO_F) {
m = m * 0.5f; m = m * 0.5f;
e = e + 1; e = e + 1;
} }
t = m - 1.0f; t = m - 1.0f;
z = m + 1.0f; z = m + 1.0f;
z = t / z; z = __fdividef (t, z);
z = -t * z; z = -t * z;
z = __internal_atanhf_kernel(t, z); z = __internal_atanhf_kernel(t, z);
z = (float)e * CUDART_LN2_F + z; z = __internal_fmad ((float)e, CUDART_LN2_F, z);
return z; return z;
} }
__device_func__(float2 __internal_log_ep(float a)) __device_func__(float2 __internal_log_ep(float a))
{ {
float2 res; float2 res;
int expo; int expo;
float m; float m;
float log_hi, log_lo; float log_hi, log_lo;
float t_hi, t_lo; float t_hi, t_lo;
skipping to change at line 1426 skipping to change at line 1459
/* compute log(m) with extended precision using an algorithm from P.T.P. /* compute log(m) with extended precision using an algorithm from P.T.P.
* Tang, "Table Driven Implementation of the Logarithm Function", TOMS, * Tang, "Table Driven Implementation of the Logarithm Function", TOMS,
* Vol. 16., No. 4, December 1990, pp. 378-400. A modified polynomial * Vol. 16., No. 4, December 1990, pp. 378-400. A modified polynomial
* approximation to atanh(x) on the interval [-0.1716, 0.1716] is utilized. * approximation to atanh(x) on the interval [-0.1716, 0.1716] is utilized.
*/ */
f = m - 1.0f; f = m - 1.0f;
g = m + 1.0f; g = m + 1.0f;
g = 1.0f / g; g = 1.0f / g;
u = 2.0f * f * g; u = 2.0f * f * g;
v = u * u; v = u * u;
q = 1.49356810919559350E-001f/64.0f; q = 1.49356810919559350E-001f/64.0f;
q = q * v + 1.99887797540072460E-001f/16.0f; q = __internal_fmad (q, v, 1.99887797540072460E-001f/16.0f);
q = q * v + 3.33333880955515580E-001f/4.0f; q = __internal_fmad (q, v, 3.33333880955515580E-001f/4.0f);
q = q * v; q = q * v;
q = q * u; q = q * u;
log_hi = __int_as_float(__float_as_int(u) & 0xfffff000); log_hi = __int_as_float(__float_as_int(u) & 0xfffff000);
v = __int_as_float(__float_as_int(f) & 0xfffff000); v = __int_as_float(__float_as_int(f) & 0xfffff000);
u = 2.0f * (f - log_hi); u = 2.0f * (f - log_hi);
f = f - v; f = f - v;
u = u - log_hi * v; u = __internal_fmad (-log_hi, v, u);
u = u - log_hi * f; u = __internal_fmad (-log_hi, f, u);
u = g * u; u = g * u;
/* compute log(m) = log_hi + u + q in double-single format*/ /* compute log(m) = log_hi + u + q in double-single format*/
/* log += u; |log| > |u| */ /* log += u; |log| > |u| */
r = log_hi + u; r = log_hi + u;
s = u - (r - log_hi); s = u - (r - log_hi);
log_hi = r; log_hi = r;
log_lo = s; log_lo = s;
/* log += q; |log| > |q| */ /* log += q; |log| > |q| */
r = log_hi + q; r = log_hi + q;
skipping to change at line 1541 skipping to change at line 1574
} }
result[q] = hi; result[q] = hi;
e = e & 31; e = e & 31;
/* shift result such that hi:lo<63:62> are the least significant /* shift result such that hi:lo<63:62> are the least significant
integer bits, and hi:lo<61:0> are the fractional bits of the result integer bits, and hi:lo<61:0> are the fractional bits of the result
*/ */
hi = result[idx+2]; hi = result[idx+2];
lo = result[idx+1]; lo = result[idx+1];
if (e) { if (e) {
q = 32 - e; q = 32 - e;
hi = (hi << e) | (lo >> q); hi = (hi << e) + (lo >> q);
lo = (lo << e) | (result[idx] >> q); lo = (lo << e) + (result[idx] >> q);
} }
q = hi >> 30; q = hi >> 30;
/* fraction */ /* fraction */
hi = (hi << 2) | (lo >> 30); hi = (hi << 2) + (lo >> 30);
lo = (lo << 2); lo = (lo << 2);
e = (hi + (lo > 0)) > 0x80000000; /* fraction >= 0.5 */ e = (hi + (lo > 0)) > 0x80000000; /* fraction >= 0.5 */
q += e; q += e;
if (s) q = -q; if (s) q = -q;
if (e) { if (e) {
unsigned int t; unsigned int t;
hi = ~hi; hi = ~hi;
lo = -(int)lo; lo = -(int)lo;
t = (lo == 0); t = (lo == 0);
hi += t; hi += t;
s = s ^ 0x80000000; s = s ^ 0x80000000;
} }
*quadrant = q; *quadrant = q;
/* normalize fraction */ /* normalize fraction */
e = 0; e = 0;
while ((int)hi > 0) { while ((int)hi > 0) {
hi = (hi << 1) | (lo >> 31); hi = (hi << 1) + (lo >> 31);
lo = (lo << 1); lo = (lo << 1);
e--; e--;
} }
lo = hi * 0xc90fdaa2; lo = hi * 0xc90fdaa2;
hi = __umulhi(hi, 0xc90fdaa2); hi = __umulhi(hi, 0xc90fdaa2);
if ((int)hi > 0) { if ((int)hi > 0) {
hi = (hi << 1) | (lo >> 31); hi = (hi << 1) + (lo >> 31);
lo = (lo << 1); lo = (lo << 1);
e--; e--;
} }
hi = hi + (lo > 0); hi = hi + (lo > 0);
ia = s | (((e + 126) << 23) + (hi >> 8) + ((hi << 24) >= 0x80000000)); ia = s | (((e + 126) << 23) + (hi >> 8) + ((hi << 24) >= 0x80000000));
return __int_as_float(ia); return __int_as_float(ia);
} }
q = __float2int_rn(a * CUDART_2_OVER_PI_F); q = __float2int_rn (a * CUDART_2_OVER_PI_F);
j = (float)q; j = (float)q;
a = a - j * 1.5703125000000000e+000f; a = __internal_fmad (-j, 1.5703125000000000e+000f, a);
a = a - j * 4.8351287841796875e-004f; a = __internal_fmad (-j, 4.8351287841796875e-004f, a);
a = a - j * 3.1385570764541626e-007f; a = __internal_fmad (-j, 3.1385570764541626e-007f, a);
a = a - j * 6.0771005065061922e-011f; a = __internal_fmad (-j, 6.0771005065061922e-011f, a);
*quadrant = q; *quadrant = q;
return a; return a;
} }
/* High quality implementation of expf(). A naive implementation, expf(x) = /* High quality implementation of expf(). A naive implementation, expf(x) =
* exp2f (x * log2(e)), loses significant accuracy for large arguments, and * exp2f (x * log2(e)), loses significant accuracy for large arguments, and
* may return results with only 15 to 16 good bits (out of 24). The present * may return results with only 15 to 16 good bits (out of 24). The present
* implementation limits the error to about 2 ulps across the entire argument * implementation limits the error to about 2 ulps across the entire argument
* range. It does so by employing an extended precision representation for * range. It does so by employing an extended precision representation for
* ln(2) which is composited from ln2_hi = 0.6931457519f which provides the * ln(2) which is composited from ln2_hi = 0.6931457519f which provides the
* most significant 16-bit of ln(2), and ln2_lo = 1.4286067653e-6f, which * most significant 16-bit of ln(2), and ln2_lo = 1.4286067653e-6f, which
* provides the least significant 24 bits. * provides the least significant 24 bits.
*/ */
__device_func__(float __internal_expf_kernel(float a, float scale)) __device_func__(float __internal_expf_kernel(float a, float scale))
{ {
float j, z; float j, z;
j = __cuda_truncf(a * CUDART_L2E_F); j = __cuda_truncf(a * CUDART_L2E_F);
z = a - j * 0.6931457519f; z = __internal_fmad (j, -0.6931457519f, a);
z = z - j * 1.4286067653e-6f; z = __internal_fmad (j, -1.4286067653e-6f, z);
z = z * CUDART_L2E_F; z = z * CUDART_L2E_F;
z = __cuda_exp2f(z) * __cuda_exp2f(j + scale); z = __cuda_exp2f(z) * __cuda_exp2f(j + scale);
return z; return z;
} }
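
The same hi/lo-split reduction can be checked on the host: with j = trunc(x*log2(e)) and ln(2) = ln2_hi + ln2_lo, expf(x) = 2^j * 2^((x - j*ln2_hi - j*ln2_lo)*log2(e)). A sketch mirroring the kernel above, minus the overflow clamps applied by __internal_accurate_expf (ref_expf is a made-up name):

#include <math.h>

float ref_expf(float x)
{
    float j = truncf(x * 1.442695041f);             /* log2(e) */
    float z = (x - j * 0.6931457519f) - j * 1.4286067653e-6f;
    return exp2f(z * 1.442695041f) * exp2f(j);
}
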
__device_func__(float __internal_accurate_expf(float a)) __device_func__(float __internal_accurate_expf(float a))
{ {
float z; float z;
z = __internal_expf_kernel(a, 0.0f); z = __internal_expf_kernel(a, 0.0f);
if (a < -105.0f) z = 0.0f; if (a < -105.0f) z = 0.0f;
if (a > 105.0f) z = CUDART_INF_F; if (a > 105.0f) z = CUDART_INF_F;
return z; return z;
} }
__device_func__(float __internal_accurate_exp10f(float a)) __device_func__(float __internal_accurate_exp10f(float a))
{ {
float j, z; float j, z;
j = __cuda_truncf(a * CUDART_L2T_F); j = __cuda_truncf(a * CUDART_L2T_F);
z = a - j * 3.0102920532226563e-001f; z = __internal_fmad (j, -3.0102920532226563e-001f, a);
z = z - j * 7.9034171557301747e-007f; z = __internal_fmad (j, -7.9034171557301747e-007f, z);
z = z * CUDART_L2T_F; z = z * CUDART_L2T_F;
z = __cuda_exp2f(z) * __cuda_exp2f(j); z = __cuda_exp2f(z) * __cuda_exp2f(j);
if (a < -46.0f) z = 0.0f; if (a < -46.0f) z = 0.0f;
if (a > 46.0f) z = CUDART_INF_F; if (a > 46.0f) z = CUDART_INF_F;
return z; return z;
} }
__device_func__(float __internal_lgammaf_pos(float a)) __device_func__(float __internal_lgammaf_pos(float a))
{ {
float sum; float sum;
skipping to change at line 1646 skipping to change at line 1681
if (a == CUDART_INF_F) { if (a == CUDART_INF_F) {
return a; return a;
} }
if (a >= 3.0f) { if (a >= 3.0f) {
if (a >= 7.8f) { if (a >= 7.8f) {
/* Stirling approximation for a >= 8; coefficients from Hart et al, /* Stirling approximation for a >= 8; coefficients from Hart et al,
* "Computer Approximations", Wiley 1968. Approximation 5401 * "Computer Approximations", Wiley 1968. Approximation 5401
*/ */
s = 1.0f / a; s = 1.0f / a;
t = s * s; t = s * s;
sum = 0.77783067e-3f; sum = 0.77783067e-3f;
sum = sum * t - 0.2777655457e-2f; sum = __internal_fmad (sum, t, -0.2777655457e-2f);
sum = sum * t + 0.83333273853e-1f; sum = __internal_fmad (sum, t, 0.83333273853e-1f);
sum = sum * s + 0.918938533204672f; sum = __internal_fmad (sum, s, 0.918938533204672f);
s = 0.5f * __internal_accurate_logf(a); s = 0.5f * __internal_accurate_logf(a);
t = a - 0.5f; t = a - 0.5f;
s = s * t; s = s * t;
t = s - a; t = s - a;
s = __fadd_rn(s, sum); /* prevent FMAD merging */ s = __fadd_rn(s, sum); /* prevent FMAD merging */
t = t + s; t = t + s;
return t; return t;
} else { } else {
a = a - 3.0f; a = a - 3.0f;
s = - 7.488903254816711E+002f; s = -7.488903254816711E+002f;
s = s * a - 1.234974215949363E+004f; s = __internal_fmad (s, a, -1.234974215949363E+004f);
s = s * a - 4.106137688064877E+004f; s = __internal_fmad (s, a, -4.106137688064877E+004f);
s = s * a - 4.831066242492429E+004f; s = __internal_fmad (s, a, -4.831066242492429E+004f);
s = s * a - 1.430333998207429E+005f; s = __internal_fmad (s, a, -1.430333998207429E+005f);
t = a - 2.592509840117874E+002f; t = a - 2.592509840117874E+002f;
t = t * a - 1.077717972228532E+004f; t = __internal_fmad (t, a, -1.077717972228532E+004f);
t = t * a - 9.268505031444956E+004f; t = __internal_fmad (t, a, -9.268505031444956E+004f);
t = t * a - 2.063535768623558E+005f; t = __internal_fmad (t, a, -2.063535768623558E+005f);
t = s / t; t = __fdividef (s, t);
t = t + a; t = t + a;
return t; return t;
} }
} else if (a >= 1.5f) { } else if (a >= 1.5f) {
a = a - 2.0f; a = a - 2.0f;
t = + 4.959849168282574E-005f; t = 4.959849168282574E-005f;
t = t * a - 2.208948403848352E-004f; t = __internal_fmad (t, a, -2.208948403848352E-004f);
t = t * a + 5.413142447864599E-004f; t = __internal_fmad (t, a, 5.413142447864599E-004f);
t = t * a - 1.204516976842832E-003f; t = __internal_fmad (t, a, -1.204516976842832E-003f);
t = t * a + 2.884251838546602E-003f; t = __internal_fmad (t, a, 2.884251838546602E-003f);
t = t * a - 7.382757963931180E-003f; t = __internal_fmad (t, a, -7.382757963931180E-003f);
t = t * a + 2.058131963026755E-002f; t = __internal_fmad (t, a, 2.058131963026755E-002f);
t = t * a - 6.735248600734503E-002f; t = __internal_fmad (t, a, -6.735248600734503E-002f);
t = t * a + 3.224670187176319E-001f; t = __internal_fmad (t, a, 3.224670187176319E-001f);
t = t * a + 4.227843368636472E-001f; t = __internal_fmad (t, a, 4.227843368636472E-001f);
t = t * a; t = t * a;
return t; return t;
} else if (a >= 0.7f) { } else if (a >= 0.7f) {
a = 1.0f - a; a = 1.0f - a;
t = + 4.588266515364258E-002f; t = 4.588266515364258E-002f;
t = t * a + 1.037396712740616E-001f; t = __internal_fmad (t, a, 1.037396712740616E-001f);
t = t * a + 1.228036339653591E-001f; t = __internal_fmad (t, a, 1.228036339653591E-001f);
t = t * a + 1.275242157462838E-001f; t = __internal_fmad (t, a, 1.275242157462838E-001f);
t = t * a + 1.432166835245778E-001f; t = __internal_fmad (t, a, 1.432166835245778E-001f);
t = t * a + 1.693435824224152E-001f; t = __internal_fmad (t, a, 1.693435824224152E-001f);
t = t * a + 2.074079329483975E-001f; t = __internal_fmad (t, a, 2.074079329483975E-001f);
t = t * a + 2.705875136435339E-001f; t = __internal_fmad (t, a, 2.705875136435339E-001f);
t = t * a + 4.006854436743395E-001f; t = __internal_fmad (t, a, 4.006854436743395E-001f);
t = t * a + 8.224669796332661E-001f; t = __internal_fmad (t, a, 8.224669796332661E-001f);
t = t * a + 5.772156651487230E-001f; t = __internal_fmad (t, a, 5.772156651487230E-001f);
t = t * a; t = t * a;
return t; return t;
} else { } else {
t = + 3.587515669447039E-003f; t = 3.587515669447039E-003f;
t = t * a - 5.471285428060787E-003f; t = __internal_fmad (t, a, -5.471285428060787E-003f);
t = t * a - 4.462712795343244E-002f; t = __internal_fmad (t, a, -4.462712795343244E-002f);
t = t * a + 1.673177015593242E-001f; t = __internal_fmad (t, a, 1.673177015593242E-001f);
t = t * a - 4.213597883575600E-002f; t = __internal_fmad (t, a, -4.213597883575600E-002f);
t = t * a - 6.558672843439567E-001f; t = __internal_fmad (t, a, -6.558672843439567E-001f);
t = t * a + 5.772153712885004E-001f; t = __internal_fmad (t, a, 5.772153712885004E-001f);
t = t * a; t = t * a;
t = t * a + a; t = __internal_fmad (t, a, a);
return -__internal_accurate_logf(t); return -__internal_accurate_logf(t);
} }
} }
/* approximate sine on -pi/4...+pi/4 */ /* approximate sine on -pi/4...+pi/4 */
__device_func__(float __internal_sin_kernel(float x)) __device_func__(float __internal_sin_kernel(float x))
{ {
float x2, z; float x2, z;
x2 = x * x; x2 = x * x;
z = - 1.95152959e-4f; z = -1.95152959e-4f;
z = z * x2 + 8.33216087e-3f; z = __internal_fmad (z, x2, 8.33216087e-3f);
z = z * x2 - 1.66666546e-1f; z = __internal_fmad (z, x2, -1.66666546e-1f);
z = z * x2; z = z * x2;
z = z * x + x; z = __internal_fmad (z, x, x);
return z; return z;
} }
/* approximate cosine on -pi/4...+pi/4 */ /* approximate cosine on -pi/4...+pi/4 */
__device_func__(float __internal_cos_kernel(float x)) __device_func__(float __internal_cos_kernel(float x))
{ {
float x2, z; float x2, z;
x2 = x * x; x2 = x * x;
z = 2.44331571e-5f; z = 2.44331571e-5f;
z = z * x2 - 1.38873163e-3f; z = __internal_fmad (z, x2, -1.38873163e-3f);
z = z * x2 + 4.16666457e-2f; z = __internal_fmad (z, x2, 4.16666457e-2f);
z = z * x2 - 5.00000000e-1f; z = __internal_fmad (z, x2, -5.00000000e-1f);
z = z * x2 + 1.00000000e+0f; z = __internal_fmad (z, x2, 1.00000000e+0f);
return z; return z;
} }
__device_func__(float __internal_accurate_sinf(float a)) __device_func__(float __internal_accurate_sinf(float a))
{ {
float z; float z;
int i; int i;
if ((__cuda___isinff(a)) || (a == CUDART_ZERO_F)) { if ((__cuda___isinff(a)) || (a == CUDART_ZERO_F)) {
return __fmul_rn (a, CUDART_ZERO_F); return __fmul_rn (a, CUDART_ZERO_F);
skipping to change at line 1802 skipping to change at line 1836
{ {
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
return cosf(a); return cosf(a);
#elif defined(__USE_FAST_MATH__) #elif defined(__USE_FAST_MATH__)
return __cosf(a); return __cosf(a);
#else /* __MULTI_CORE__ */ #else /* __MULTI_CORE__ */
float z; float z;
int i; int i;
if (__cuda___isinff(a)) { if (__cuda___isinff(a)) {
return CUDART_NAN_F; return __fadd_rn (a, -a); /* return NaN */
} }
z = __internal_trig_reduction_kernel(a, &i); z = __internal_trig_reduction_kernel(a, &i);
/* here, abs(z) <= pi/4, and i has the quadrant */ /* here, abs(z) <= pi/4, and i has the quadrant */
i++; i++;
if (i & 1) { if (i & 1) {
z = __internal_cos_kernel(z); z = __internal_cos_kernel(z);
} else { } else {
z = __internal_sin_kernel(z); z = __internal_sin_kernel(z);
} }
if (i & 2) { if (i & 2) {
skipping to change at line 1836 skipping to change at line 1870
float z; float z;
int i; int i;
if (__cuda___isinff(a)) { if (__cuda___isinff(a)) {
return CUDART_NAN_F; return CUDART_NAN_F;
} }
z = __internal_trig_reduction_kernel(a, &i); z = __internal_trig_reduction_kernel(a, &i);
/* here, abs(z) <= pi/4, and i has the quadrant */ /* here, abs(z) <= pi/4, and i has the quadrant */
z = __internal_tan_kernel(z); z = __internal_tan_kernel(z);
if (i & 1) { if (i & 1) {
z = -1.0f / z; z = - (1.0f / z);
} }
return z; return z;
#endif /* __MULTI_CORE__ */ #endif /* __MULTI_CORE__ */
} }
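
For reference, the quadrant logic in __cuda_cosf above (advance i by one so cosine becomes a shifted sine, pick the kernel by bit 0 of the quadrant, negate by bit 1) can be reproduced with an ordinary nearest-multiple-of-pi/2 reduction. A host-side sketch, using libm sinf/cosf as stand-ins for the device kernels (illustrative only):

#include <math.h>
#include <stdio.h>

static const double PIO2 = 1.5707963267948966; /* pi/2 */

/* Reduce a = z + i*(pi/2) with |z| <= pi/4, then select as __cuda_cosf
   does: after i++, odd quadrants use the cosine kernel, even ones the
   sine kernel, and bit 1 of i flips the sign. */
static float cosf_by_quadrant(float a)
{
    double t = rint((double)a / PIO2);
    int i = (int)t;
    float z = (float)((double)a - t * PIO2);
    float r;
    i++;                             /* cos(x) = sin(x + pi/2) */
    r = (i & 1) ? cosf(z) : sinf(z); /* stand-ins for the kernels */
    if (i & 2) r = -r;
    return r;
}

int main(void)
{
    for (float a = -3.0f; a <= 3.0f; a += 1.0f)
        printf("a=% .1f  quadrant: % .7f  libm: % .7f\n",
               a, cosf_by_quadrant(a), cosf(a));
    return 0;
}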
__device_func__(float __cuda_log2f(float a)) __device_func__(float __cuda_log2f(float a))
{ {
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
return log2f(a); return log2f(a);
#elif defined(__USE_FAST_MATH__) #elif defined(__USE_FAST_MATH__)
skipping to change at line 1881 skipping to change at line 1915
return __internal_accurate_exp10f(a); return __internal_accurate_exp10f(a);
#endif /* __MULTI_CORE__ */ #endif /* __MULTI_CORE__ */
} }
__device_func__(float __cuda_coshf(float a)) __device_func__(float __cuda_coshf(float a))
{ {
float z; float z;
a = __cuda_fabsf(a); a = __cuda_fabsf(a);
z = __internal_expf_kernel(a, -2.0f); z = __internal_expf_kernel(a, -2.0f);
z = 2.0f * z + 0.125f / z; z = __internal_fmad (2.0f, z, __fdividef (0.125f, z));
if (a >= 90.0f) { if (a >= 90.0f) {
z = CUDART_INF_F; /* overflow -> infinity */ z = CUDART_INF_F; /* overflow -> infinity */
} }
return z; return z;
} }
__device_func__(float __cuda_sinhf(float a)) __device_func__(float __cuda_sinhf(float a))
{ {
float s, z; float s, z;
s = a; s = a;
a = __cuda_fabsf(a); a = __cuda_fabsf(a);
if (a < 1.0f) { /* danger of catastrophic cancellation */ if (a < 1.0f) { /* danger of catastrophic cancellation */
float a2 = a * a; float a2 = a * a;
/* approximate sinh(x) on [0,1] with a polynomial */ /* approximate sinh(x) on [0,1] with a polynomial */
z = 2.816951222e-6f; z = 2.816951222e-6f;
z = z * a2 + 1.983615978e-4f; z = __internal_fmad (z, a2, 1.983615978e-4f);
z = z * a2 + 8.333350058e-3f; z = __internal_fmad (z, a2, 8.333350058e-3f);
z = z * a2 + 1.666666650e-1f; z = __internal_fmad (z, a2, 1.666666650e-1f);
z = z * a2; z = z * a2;
z = z * a + a; z = __internal_fmad (z, a, a);
} else { } else {
z = __internal_expf_kernel(a, -2.0f); z = __internal_expf_kernel(a, -2.0f);
z = 2.0f * z - 0.125f / z; z = __internal_fmad (2.0f, z, -__fdividef (0.125f, z));
if (a >= 90.0f) { if (a >= 90.0f) {
z = CUDART_INF_F; /* overflow -> infinity */ z = CUDART_INF_F; /* overflow -> infinity */
} }
} }
return __cuda_copysignf(z, s); return __cuda_copysignf(z, s);
} }
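
__internal_expf_kernel(a, -2.0f) evaluates exp(a)*2^-2, so with z = exp(a)/4 the expressions above are exact identities: 2*z + 0.125/z = (exp(a) + exp(-a))/2 = cosh(a), and 2*z - 0.125/z = sinh(a), since 0.125/z = 0.5*exp(-a). A quick host-side check (plain C99, illustrative):

#include <math.h>
#include <stdio.h>

int main(void)
{
    float a = 3.0f;
    float z = expf(a) * 0.25f; /* what __internal_expf_kernel(a, -2.0f) gives */
    printf("cosh: %.7f vs %.7f\n", 2.0f * z + 0.125f / z, coshf(a));
    printf("sinh: %.7f vs %.7f\n", 2.0f * z - 0.125f / z, sinhf(a));
    return 0;
}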
__device_func__(float __cuda_tanhf(float a)) __device_func__(float __cuda_tanhf(float a))
{ {
float s, t; float s, t;
t = __cuda_fabsf(a); t = __cuda_fabsf(a);
if (t < 0.55f) { if (t < 0.55f) {
float z, z2; float z, z2;
z = t; z = t;
z2 = z * z; z2 = z * z;
t = 1.643758066599993e-2f; t = 1.643758066599993e-2f;
t = t * z2 - 5.267181327760551e-2f; t = __internal_fmad (t, z2, -5.267181327760551e-2f);
t = t * z2 + 1.332072505223051e-1f; t = __internal_fmad (t, z2, 1.332072505223051e-1f);
t = t * z2 - 3.333294663641083e-1f; t = __internal_fmad (t, z2, -3.333294663641083e-1f);
t = t * z2; t = t * z2;
s = t * z + z; s = __internal_fmad (t, z, z);
} else { } else {
s = 1.0f - 2.0f / (__internal_expf_kernel(2.0f * t, 0.0f) + 1.0f); s = 1.0f - __fdividef(2.0f, (__internal_expf_kernel(2.0f * t, 0.0f) + 1.0f));
if (t >= 88.0f) { if (t >= 88.0f) {
s = 1.0f; s = 1.0f;
} }
} }
return __cuda_copysignf(s, a); return __cuda_copysignf(s, a);
} }
__device_func__(float __cuda_atan2f(float a, float b)) __device_func__(float __cuda_atan2f(float a, float b))
{ {
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
skipping to change at line 1999 skipping to change at line 2033
} }
return __cuda_copysignf(t1, a); return __cuda_copysignf(t1, a);
} }
/* approximate asin(a) on [0, 0.575] */ /* approximate asin(a) on [0, 0.575] */
__device_func__(float __internal_asinf_kernel(float a)) __device_func__(float __internal_asinf_kernel(float a))
{ {
float t2, t3, t4; float t2, t3, t4;
t2 = a * a; t2 = a * a;
t3 = - 0.501162291f; t3 = -0.501162291f;
t3 = t3 * t2 + 0.915201485f; t3 = __internal_fmad (t3, t2, 0.915201485f);
t3 = t3 * t2; t3 = t3 * t2;
t3 = t3 * a; t3 = t3 * a;
t4 = t2 - 5.478654385f; t4 = t2 - 5.478654385f;
t4 = t4 * t2 + 5.491230488f; t4 = __internal_fmad (t4, t2, 5.491230488f);
t4 = 1.0f / t4; t4 = 1.0f / t4;
a = t3 * t4 + a; a = __internal_fmad (t3, t4, a);
return a; return a;
} }
__device_func__(float __cuda_asinf(float a)) __device_func__(float __cuda_asinf(float a))
{ {
float t0, t1, t2; float t0, t1, t2;
t0 = __cuda_fabsf(a); t0 = __cuda_fabsf(a);
t2 = 1.0f - t0; t2 = 1.0f - t0;
t2 = 0.5f * t2; t2 = 0.5f * t2;
t2 = __cuda_sqrtf(t2); t2 = __cuda_sqrtf(t2);
t1 = t0 > 0.575f ? t2 : t0; t1 = t0 > 0.575f ? t2 : t0;
t1 = __internal_asinf_kernel(t1); t1 = __internal_asinf_kernel(t1);
t2 = -2.0f * t1 + CUDART_PIO2_F; t2 = __internal_fmad (-2.0f, t1, CUDART_PIO2_F);
if (t0 > 0.575f) { if (t0 > 0.575f) {
t1 = t2; t1 = t2;
} }
return __cuda_copysignf(t1, a); return __cuda_copysignf(t1, a);
} }
__device_func__(float __cuda_acosf(float a)) __device_func__(float __cuda_acosf(float a))
{ {
float t0, t1, t2; float t0, t1, t2;
skipping to change at line 2079 skipping to change at line 2113
return log1pf(a); return log1pf(a);
#else /* __MULTI_CORE__ */ #else /* __MULTI_CORE__ */
float t; float t;
#if !defined(__CUDABE__) && defined(_WIN32) #if !defined(__CUDABE__) && defined(_WIN32)
/* MSVC doesn't handle negative zero correctly, so handle it separately */ /* MSVC doesn't handle negative zero correctly, so handle it separately */
if (a == 0.0f) return a; if (a == 0.0f) return a;
#endif /* !__CUDABE__ && _WIN32 */ #endif /* !__CUDABE__ && _WIN32 */
if (a >= -0.394f && a <= 0.65f) { if (a >= -0.394f && a <= 0.65f) {
/* log(a+1) = 2*atanh(a/(a+2)) */ /* log(a+1) = 2*atanh(a/(a+2)) */
t = a + 2.0f; t = a + 2.0f;
t = a / t; t = __fdividef (a, t);
t = -a * t; t = -a * t;
t = __internal_atanhf_kernel (a, t); t = __internal_atanhf_kernel (a, t);
} else { } else {
t = __internal_accurate_logf (CUDART_ONE_F + a); t = __internal_accurate_logf (CUDART_ONE_F + a);
} }
return t; return t;
#endif /* __MULTI_CORE__ */ #endif /* __MULTI_CORE__ */
} }
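
The near-zero branch rests on the identity log(1+a) = 2*atanh(a/(a+2)): with t = a/(a+2), (1+t)/(1-t) simplifies to 1+a, and atanh(t) = 0.5*log((1+t)/(1-t)). A one-line host-side check (illustrative):

#include <math.h>
#include <stdio.h>

int main(void)
{
    float a = 0.3f;
    float t = a / (a + 2.0f);
    printf("%.7f vs %.7f\n", 2.0f * atanhf(t), log1pf(a));
    return 0;
}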
__device_func__(float __cuda_acoshf(float a)) __device_func__(float __cuda_acoshf(float a))
skipping to change at line 2101 skipping to change at line 2135
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
return acoshf(a); return acoshf(a);
#else /* __MULTI_CORE__ */ #else /* __MULTI_CORE__ */
float t; float t;
t = a - 1.0f; t = a - 1.0f;
if (__cuda_fabsf(t) > CUDART_TWO_TO_23_F) { if (__cuda_fabsf(t) > CUDART_TWO_TO_23_F) {
/* for large a, acosh = log(2*a) */ /* for large a, acosh = log(2*a) */
return CUDART_LN2_F + __internal_accurate_logf(a); return CUDART_LN2_F + __internal_accurate_logf(a);
} else { } else {
t = t + __cuda_sqrtf(a * t + t); t = t + __cuda_sqrtf(__internal_fmad (a, t, t));
return __cuda_log1pf(t); return __cuda_log1pf(t);
} }
#endif /* __MULTI_CORE__ */ #endif /* __MULTI_CORE__ */
} }
__device_func__(float __cuda_asinhf(float a)) __device_func__(float __cuda_asinhf(float a))
{ {
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
return asinhf(a); return asinhf(a);
#else /* __MULTI_CORE__ */ #else /* __MULTI_CORE__ */
float fa, oofa, t; float fa, oofa, t;
fa = __cuda_fabsf(a); fa = __cuda_fabsf(a);
if (fa > CUDART_TWO_TO_126_F) { /* prevent intermediate underflow */ if (fa > CUDART_TWO_TO_126_F) { /* prevent intermediate underflow */
t = CUDART_LN2_F + __logf(fa); /* fast version is safe here */ t = CUDART_LN2_F + __logf(fa); /* fast version is safe here */
} else { } else {
oofa = 1.0f / fa; oofa = 1.0f / fa;
t = fa + fa / (oofa + __cuda_sqrtf(1.0f + oofa * oofa)); t = fa + __fdividef (fa, (oofa + __cuda_sqrtf(__internal_fmad (oofa, oofa, 1.0f))));
t = __cuda_log1pf(t); t = __cuda_log1pf(t);
} }
return __cuda_copysignf(t, a); return __cuda_copysignf(t, a);
#endif /* __MULTI_CORE__ */ #endif /* __MULTI_CORE__ */
} }
__device_func__(float __cuda_atanhf(float a)) __device_func__(float __cuda_atanhf(float a))
{ {
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
return atanhf(a); return atanhf(a);
#else /* __MULTI_CORE__ */ #else /* __MULTI_CORE__ */
float fa, t; float fa, t;
fa = __cuda_fabsf(a); fa = __cuda_fabsf(a);
t = (2.0f * fa) / (1.0f - fa); t = __fdividef ((2.0f * fa), (1.0f - fa));
t = 0.5f * __cuda_log1pf(t); t = 0.5f * __cuda_log1pf(t);
return __cuda_copysignf(t, a); return __cuda_copysignf(t, a);
#endif /* __MULTI_CORE__ */ #endif /* __MULTI_CORE__ */
} }
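
Both inverse hyperbolics funnel into __cuda_log1pf. The asinh path rewrites a + sqrt(a*a + 1) - 1 as a + a/(1/a + sqrt(1 + 1/a^2)) so that only 1/a is squared, which cannot overflow for large |a|; atanh uses atanh(a) = 0.5*log1p(2a/(1-a)). A host-side check of both identities (illustrative):

#include <math.h>
#include <stdio.h>

int main(void)
{
    float a = 0.7f, oofa = 1.0f / a;
    float s = a + a / (oofa + sqrtf(1.0f + oofa * oofa));
    printf("asinh: %.7f vs %.7f\n", log1pf(s), asinhf(a));
    printf("atanh: %.7f vs %.7f\n",
           0.5f * log1pf(2.0f * a / (1.0f - a)), atanhf(a));
    return 0;
}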
__device_func__(float __cuda_expm1f(float a)) __device_func__(float __cuda_expm1f(float a))
{ {
float t, z, j, u; float t, z, j, u;
/* expm1(a) = 2^t*(expm1(z)+1)-1 */ /* expm1(a) = 2^t*(expm1(z)+1)-1 */
t = __cuda_rintf (a * CUDART_L2E_F); t = __cuda_rintf (a * CUDART_L2E_F);
z = a - t * 0.6931457519f; z = __internal_fmad (-t, 0.6931457519f, a);
z = z - t * 1.4286067653e-6f; z = __internal_fmad (-t, 1.4286067653e-6f, z);
/* prevent loss of accuracy for args a tad outside [-0.5*log(2),0.5*log(2)] */ /* prevent loss of accuracy for args a tad outside [-0.5*log(2),0.5*log(2)] */
if (__cuda_fabsf(a) < 0.41f) { if (__cuda_fabsf(a) < 0.41f) {
z = a; z = a;
t = 0.0f; t = 0.0f;
} }
/* prevent intermediate overflow */ /* prevent intermediate overflow */
j = t; j = t;
if (t == 128.0f) j = j - 1.0f; if (t == 128.0f) j = j - 1.0f;
/* expm1(z) on [log(2/3), log(3/2)] */ /* expm1(z) on [log(2/3), log(3/2)] */
u = 1.38795078474044430E-003f; u = 1.38795078474044430E-003f;
u = u * z + 8.38241261853264930E-003f; u = __internal_fmad (u, z, 8.38241261853264930E-003f);
u = u * z + 4.16678317762833940E-002f; u = __internal_fmad (u, z, 4.16678317762833940E-002f);
u = u * z + 1.66663978874356580E-001f; u = __internal_fmad (u, z, 1.66663978874356580E-001f);
u = u * z + 4.99999940395997040E-001f; u = __internal_fmad (u, z, 4.99999940395997040E-001f);
u = u * z; u = u * z;
u = u * z + z; u = __internal_fmad (u, z, z);
if (a == 0.0f) u = a; // preserve input of -0 if (a == 0.0f) u = a; // preserve input of -0
/* 2^j*[expm1(z)+1]-1 = 2^j*expm1(z)+2^j-1 */ /* 2^j*[expm1(z)+1]-1 = 2^j*expm1(z)+2^j-1 */
z = __cuda_exp2f (j); z = __cuda_exp2f (j);
a = z - 1.0f; a = z - 1.0f;
if (a != 0.0f) u = u * z + a; // preserve -0 generated by FTZ if (a != 0.0f) u = __internal_fmad (u, z, a); // preserve -0 generated by FTZ
if (t == 128.0f) u = u + u; // work around intermediate overflow if (t == 128.0f) u = u + u; // work around intermediate overflow
/* handle massive overflow and underflow */ /* handle massive overflow and underflow */
if (j > 128.0f) u = CUDART_INF_F; if (j > 128.0f) u = CUDART_INF_F;
if (j < -25.0f) u = -1.0f; if (j < -25.0f) u = -1.0f;
return u; return u;
} }
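
The decomposition used here is expm1(a) = 2^j*(expm1(z)+1) - 1 = 2^j*expm1(z) + (2^j - 1), with a = j*log(2) + z split against the same hi/lo constants as above so z stays accurate. A host-side sketch, with libm expm1f standing in for the polynomial (illustrative):

#include <math.h>
#include <stdio.h>

int main(void)
{
    float a = 2.3f;
    float j = rintf(a * 1.4426950409f);     /* log2(e), cf. CUDART_L2E_F */
    float z = a - j * 0.6931457519f - j * 1.4286067653e-6f; /* hi+lo ln(2) */
    float p = expm1f(z);                    /* stand-in for the polynomial */
    float s = exp2f(j);
    printf("%.7f vs %.7f\n", p * s + (s - 1.0f), expm1f(a));
    return 0;
}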
__device_func__(float __cuda_hypotf(float a, float b)) __device_func__(float __cuda_hypotf(float a, float b))
{ {
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
return hypotf(a, b); return hypotf(a, b);
skipping to change at line 2193 skipping to change at line 2227
b = __cuda_fabsf(b); b = __cuda_fabsf(b);
/* can't use min, max because they do not propagate NaNs */ /* can't use min, max because they do not propagate NaNs */
if (a > b) { if (a > b) {
v = a; v = a;
w = b; w = b;
} else { } else {
v = b; v = b;
w = a; w = a;
} }
t = __internal_accurate_fdividef(w, v); t = __internal_accurate_fdividef(w, v);
t = 1.0f + t * t; t = __internal_fmad (t, t, 1.0f);
t = v * __cuda_sqrtf(t); t = v * __cuda_sqrtf(t);
if (v == 0.0f) { if (v == 0.0f) {
t = v + w; t = v + w;
} }
if ((v == CUDART_INF_F) || (w == CUDART_INF_F)) { if ((v == CUDART_INF_F) || (w == CUDART_INF_F)) {
t = CUDART_INF_F; t = CUDART_INF_F;
} }
return t; return t;
#endif /* __MULTI_CORE__ */ #endif /* __MULTI_CORE__ */
} }
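
The scaling in __cuda_hypotf squares only the ratio of the smaller to the larger magnitude, which is at most 1, so the intermediate never overflows even when a*a would. Host-side sketch (illustrative):

#include <math.h>
#include <stdio.h>

int main(void)
{
    float a = 3.0e20f, b = 4.0e20f;   /* a*a alone overflows float */
    float v = fmaxf(fabsf(a), fabsf(b));
    float w = fminf(fabsf(a), fabsf(b));
    float t = w / v;
    printf("%.7g vs %.7g\n", v * sqrtf(1.0f + t * t), hypotf(a, b));
    return 0;
}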
skipping to change at line 2216 skipping to change at line 2250
{ {
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
return cbrtf(a); return cbrtf(a);
#else /* __MULTI_CORE__ */ #else /* __MULTI_CORE__ */
float s, t; float s, t;
s = __cuda_fabsf(a); s = __cuda_fabsf(a);
if ((a == 0.0f) || (s == CUDART_INF_F)) { if ((a == 0.0f) || (s == CUDART_INF_F)) {
return a; return a;
} }
t = __cuda_exp2f(CUDART_THIRD_F * __log2f(s)); /* initial approximation */ t = __cuda_exp2f(CUDART_THIRD_F * __log2f(s)); /* initial approximation */
t = t - (t - (s / (t * t))) * CUDART_THIRD_F; /* refine approximation */ t = t - (t - (__fdividef(s, (t * t)))) * CUDART_THIRD_F; /* refine approximation */
if (__cuda___signbitf(a)) { if (__cuda___signbitf(a)) {
t = -t; t = -t;
} }
return t; return t;
#endif /* __MULTI_CORE__ */ #endif /* __MULTI_CORE__ */
} }
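
The cube root seeds with exp2(log2(s)/3) and applies one step of Newton's method on f(t) = t^3 - s, which is exactly the update t <- t - (t - s/t^2)/3 used above. Host-side sketch (illustrative):

#include <math.h>
#include <stdio.h>

int main(void)
{
    float s = 10.0f;
    float t = exp2f(log2f(s) / 3.0f);            /* initial approximation */
    t = t - (t - s / (t * t)) * (1.0f / 3.0f);   /* one Newton refinement */
    printf("%.7f vs %.7f\n", t, cbrtf(s));
    return 0;
}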
__device_func__(float __cuda_erff(float a)) __device_func__(float __cuda_erff(float a))
{ {
float t, r, q; float t, r, q;
t = __cuda_fabsf(a); t = __cuda_fabsf(a);
if (t < 1.0f) { if (t < 1.0f) {
t = t * t; t = t * t;
r = -5.58510127926029810E-004f; r = -5.58510127926029810E-004f;
r = r * t + 4.90688891415893070E-003f; r = __internal_fmad (r, t, 4.90688891415893070E-003f);
r = r * t - 2.67027980930150640E-002f; r = __internal_fmad (r, t, -2.67027980930150640E-002f);
r = r * t + 1.12799056505903940E-001f; r = __internal_fmad (r, t, 1.12799056505903940E-001f);
r = r * t - 3.76122956138427440E-001f; r = __internal_fmad (r, t, -3.76122956138427440E-001f);
r = r * t + 1.12837911712623450E+000f; r = __internal_fmad (r, t, 1.12837911712623450E+000f);
a = a * r; a = a * r;
} else if (t <= CUDART_INF_F) { } else if (t <= CUDART_INF_F) {
/* coefficients from Hastings, "Approximations for Digital Computers", /* coefficients from Hastings, "Approximations for Digital Computers",
* Princeton University Press 1955. Sheet 45. * Princeton University Press 1955. Sheet 45.
*/ */
q = 0.3275911f * t + 1.0f; q = __internal_fmad (t, 0.3275911f, 1.0f);
q = 1.0f / q; q = 1.0f / q;
r = 1.061405429f; r = 1.061405429f;
r = r * q - 1.453152027f; r = __internal_fmad (r, q, -1.453152027f);
r = r * q + 1.421413741f; r = __internal_fmad (r, q, 1.421413741f);
r = r * q - 0.284496736f; r = __internal_fmad (r, q, -0.284496736f);
r = r * q + 0.254829592f; r = __internal_fmad (r, q, 0.254829592f);
r = r * q; r = r * q;
q = __internal_expf_kernel(-a * a, 0.0f); q = __internal_expf_kernel(-a * a, 0.0f);
r = 1.0f - q * r; r = __internal_fmad (-q, r, 1.0f);
if (t >= 5.5f) { if (t >= 5.5f) {
r = 1.0f; r = 1.0f;
} }
a = __int_as_float (__float_as_int(r) | (__float_as_int(a) & 0x80000000)); a = __int_as_float (__float_as_int(r) | (__float_as_int(a) & 0x80000000));
} }
return a; return a;
} }
__device_func__(float __cuda_erfinvf (float a))
{
float fa, t;
fa = fabsf(a);
if (fa >= 1.0f) {
t = __cuda_rsqrtf (__int_as_float (0xffc00000)); /* NaN */
if (fa == 1.0f) {
t = a * CUDART_INF_F; /* Infinity */
}
} else if (fa > 0.9375f) {
/* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev
Approximations for the Inverse of the Error Function. Mathematics of
Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 50
*/
float p, q;
t = __cuda_log1pf(-fa);
t = __cuda_rsqrtf(-t);
p = -1.64441567910e-1f;
p = __internal_fmad (p, t, 6.80544246825e-1f);
p = __internal_fmad (p, t, -1.12808139162e+0f);
p = __internal_fmad (p, t, 6.90969348887e-1f);
p = __internal_fmad (p, t, 1.38271964963e+0f);
p = __internal_fmad (p, t, 1.55047000312e-1f);
q = t + 1.38522814199e+0f;
q = __internal_fmad (q, t, 1.55024849822e-1f);
q = q * t;
t = __fdividef (p, q);
if (a < 0.0f) t = -t;
} else if (fa > 0.75f) {
/* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev
Approximations for the Inverse of the Error Function. Mathematics of
Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 30
*/
float p, q;
t = __internal_fmad (a, a, -0.87890625f);
p = -7.1986748896e+0f;
p = __internal_fmad (p, t, +1.3411974175e+1f);
p = __internal_fmad (p, t, -5.1381573203e+0f);
p = __internal_fmad (p, t, 4.9633374831e-1f);
q = t -1.1436535838e+1f;
q = __internal_fmad (q, t, 1.3568885572e+1f);
q = __internal_fmad (q, t, -4.1747509256e+0f);
q = __internal_fmad (q, t, 3.5327242323e-1f);
p = __fdividef (p, q);
t = a * p;
} else { /* polynomial approximation on [0, 0.75], max error 2 ulps */
float a2;
a2 = a * a;
t = 6.1046168794766742E-001f;
t = __internal_fmad (t, a2, -8.9504882462753121E-001f);
t = __internal_fmad (t, a2, 7.0224162369928511E-001f);
t = __internal_fmad (t, a2, -1.9993784895823222E-001f);
t = __internal_fmad (t, a2, 1.1920613463949599E-001f);
t = __internal_fmad (t, a2, 8.0131492246997685E-002f);
t = __internal_fmad (t, a2, 1.2793154958377403E-001f);
t = __internal_fmad (t, a2, 2.3200529172828793E-001f);
t = __internal_fmad (t, a2, 8.8622695604694379E-001f);
t = t * a;
}
return t;
}
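
__cuda_erfinvf is new in this revision and has no C99 libm counterpart, so a convenient cross-check is Newton's method on f(x) = erf(x) - y, with f'(x) = (2/sqrt(pi))*exp(-x*x). This is a reference oracle only, not the header's Blair-style rational scheme:

#include <math.h>
#include <stdio.h>

static double erfinv_newton(double y)
{
    const double c = 1.1283791670955126;   /* 2/sqrt(pi) */
    double x = 0.0;
    for (int i = 0; i < 60; i++)           /* plenty for |y| <= 0.9 */
        x -= (erf(x) - y) / (c * exp(-x * x));
    return x;
}

int main(void)
{
    for (double y = 0.1; y < 1.0; y += 0.2)
        printf("y=%.1f  x=%.9f  erf(x)=%.9f\n",
               y, erfinv_newton(y), erf(erfinv_newton(y)));
    return 0;
}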
__device_func__(float __cuda_erfcf(float a)) __device_func__(float __cuda_erfcf(float a))
{ {
if (a <= 0.55f) { if (a <= 0.55f) {
return 1.0f - __cuda_erff(a); return 1.0f - __cuda_erff(a);
} else if (a > 10.0f) { } else if (a > 10.0f) {
return 0.0f; return 0.0f;
} else { } else {
float p; float p;
float q; float q;
float h; float h;
float l; float l;
/* This rational approximation has a slight accuracy issue since all the /* This rational approximation has a slight accuracy issue since all the
* coefficients have the same sign, so error accumulates when this is computed * coefficients have the same sign, so error accumulates when this is computed
* in single precision. Also the division at the end isn't IEEE compliant. * in single precision. Also the division at the end isn't IEEE compliant.
*/ */
p = + 4.014893410762552E-006f; p = 4.014893410762552E-006f;
p = p * a + 5.640401259462436E-001f; p = __internal_fmad (p, a, 5.640401259462436E-001f);
p = p * a + 2.626649872281140E+000f; p = __internal_fmad (p, a, 2.626649872281140E+000f);
p = p * a + 5.486372652389673E+000f; p = __internal_fmad (p, a, 5.486372652389673E+000f);
p = p * a + 5.250714831459401E+000f; p = __internal_fmad (p, a, 5.250714831459401E+000f);
q = a + 4.651376250488319E+000f; q = a + 4.651376250488319E+000f;
q = q * a + 1.026302828878470E+001f; q = __internal_fmad (q, a, 1.026302828878470E+001f);
q = q * a + 1.140762166021288E+001f; q = __internal_fmad (q, a, 1.140762166021288E+001f);
q = q * a + 5.251211619089947E+000f; q = __internal_fmad (q, a, 5.251211619089947E+000f);
/* Use reciprocal plus NR refinement for division */ /* Use reciprocal plus NR refinement for division */
h = 1.0f / q; h = 1.0f / q;
q = 2.0f * h - q * h * h; q = __internal_fmad (-q * h, h, 2.0f * h);
p = p * q; p = p * q;
/* compute exp(-a*a) with extended precision to avoid error magnificati on*/ /* compute exp(-a*a) with extended precision to avoid error magnificati on*/
h = __int_as_float(__float_as_int(a) & 0xfffff000); /* upper 12 bits */ h = __int_as_float(__float_as_int(a) & 0xfffff000); /* upper 12 bits */
l = __fadd_rn (a, -h); /* lower 12 bits */ l = __fadd_rn (a, -h); /* lower 12 bits */
q = __fmul_rn (-h, h); /* this product is error free */ q = __fmul_rn (-h, h); /* this product is error free */
q = __internal_expf_kernel(q, 0.0f); q = __internal_expf_kernel(q, 0.0f);
a = a + h; a = a + h;
l = l * a; l = l * a;
h = __internal_expf_kernel(-l, 0.0f); h = __internal_expf_kernel(-l, 0.0f);
q = q * h; q = q * h;
p = p * q; p = p * q;
return p; return p;
} }
} }
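
The tail of __cuda_erfcf computes exp(-a*a) in extended precision: writing a = h + l with h holding the upper 12 significand bits makes h*h exact in float (at most 24 product bits), and algebraically a*a = h*h + l*(a+h), so exp(-a*a) = exp(-h*h) * exp(-l*(a+h)) avoids the error magnification of squaring first. Host-side sketch, using memcpy for the bit mask (illustrative):

#include <math.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
    float a = 3.14159f, h, l;
    uint32_t bits;
    memcpy(&bits, &a, sizeof bits);
    bits &= 0xfffff000u;              /* keep upper 12 significand bits */
    memcpy(&h, &bits, sizeof h);
    l = a - h;                        /* exact: h's low bits are zero */
    float split = expf(-h * h) * expf(-l * (a + h));
    printf("split: %.9g  naive: %.9g  double: %.9g\n",
           split, expf(-a * a), (float)exp(-(double)a * (double)a));
    return 0;
}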
__device_func__(float __cuda_erfcinvf (float a))
{
float t;
if (a <= 0.0f) {
t = CUDART_NAN_F;
if (a == 0.0f) {
t = (1.0f - a) * CUDART_INF_F;
}
}
else if (a >= 0.0625f) {
t = __cuda_erfinvf (1.0f - a);
}
else {
float p, q;
t = __cuda_logf(a);
t = __cuda_rsqrtf(-t);
p = -1.64441567910e-1f;
p = __internal_fmad (p, t, 6.80544246825e-1f);
p = __internal_fmad (p, t, -1.12808139162e+0f);
p = __internal_fmad (p, t, 6.90969348887e-1f);
p = __internal_fmad (p, t, 1.38271964963e+0f);
p = __internal_fmad (p, t, 1.55047000312e-1f);
q = t + 1.38522814199e+0f;
q = __internal_fmad (q, t, 1.55024849822e-1f);
q = q * t;
t = __fdividef (p, q);
}
return t;
}
__device_func__(float __cuda_lgammaf(float a)) __device_func__(float __cuda_lgammaf(float a))
{ {
float t; float t;
float i; float i;
int quot; int quot;
t = __internal_lgammaf_pos(__cuda_fabsf(a)); t = __internal_lgammaf_pos(__cuda_fabsf(a));
if (a >= 0.0f) return t; if (a >= 0.0f) return t;
a = __cuda_fabsf(a); a = __cuda_fabsf(a);
i = __cuda_floorf(a); i = __cuda_floorf(a);
if (a == i) return CUDART_INF_F; /* a is an integer: return infinity */ if (a == i) return CUDART_INF_F; /* a is an integer: return infinity */
if (a < 1e-19f) return -__internal_accurate_logf(a); if (a < 1e-19f) return -__internal_accurate_logf(a);
i = __cuda_rintf (2.0f * a); i = __cuda_rintf (2.0f * a);
quot = (int)i; quot = (int)i;
i = a - 0.5f * i; i = __internal_fmad (-i, 0.5f, a);
i = i * CUDART_PI_F; i = i * CUDART_PI_F;
if (quot & 1) { if (quot & 1) {
i = __internal_cos_kernel(i); i = __internal_cos_kernel(i);
} else { } else {
i = __internal_sin_kernel(i); i = __internal_sin_kernel(i);
} }
i = __cuda_fabsf(i); i = __cuda_fabsf(i);
t = CUDART_LNPI_F - __internal_accurate_logf(i * a) - t; t = CUDART_LNPI_F - __internal_accurate_logf(i * a) - t;
return t; return t;
} }
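
For a < 0 this is the reflection formula Gamma(a)*Gamma(-a) = -pi/(a*sin(pi*a)), taken in log form as lgamma(a) = log(pi) - log|a*sin(pi*a)| - lgamma(-a); the kernel calls above only serve to evaluate |sin(pi*a)| accurately from the reduced argument. A double-precision check (illustrative):

#include <math.h>
#include <stdio.h>

int main(void)
{
    const double pi = 3.141592653589793;
    double a = -2.7;
    double r = log(pi) - log(fabs(a * sin(pi * a))) - lgamma(-a);
    printf("reflection: %.9f  libm: %.9f\n", r, lgamma(a));
    return 0;
}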
__device_func__(float __cuda_ldexpf(float a, int b)) __device_func__(float __cuda_ldexpf(float a, int b))
{ {
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
return ldexpf(a, b); return ldexpf(a, b);
#else /* __MULTI_CORE__ */ #else /* __MULTI_CORE__ */
float fa = __cuda_fabsf(a); float fa = __cuda_fabsf(a);
if ((fa == CUDART_ZERO_F) || (fa == CUDART_INF_F) || (b == 0)) { if ((fa == CUDART_ZERO_F) || (fa == CUDART_INF_F) || (b == 0)) {
if (!(fa > CUDART_ZERO_F)) a = a + a;
return a; return a;
} } else if (__cuda_abs(b) < 126) {
else if (__cuda_abs(b) < 126) {
return a * __cuda_exp2f((float)b); return a * __cuda_exp2f((float)b);
} } else if (__cuda_abs(b) < 252) {
else if (__cuda_abs(b) < 252) {
int bhalf = b / 2; int bhalf = b / 2;
return a * __cuda_exp2f((float)bhalf) * __cuda_exp2f((float)(b - bhalf) ); return a * __cuda_exp2f((float)bhalf) * __cuda_exp2f((float)(b - bhalf) );
} } else {
else {
int bquarter = b / 4; int bquarter = b / 4;
float t = __cuda_exp2f((float)bquarter); float t = __cuda_exp2f((float)bquarter);
return a * t * t * t * __cuda_exp2f((float)(b - 3 * bquarter)); return a * t * t * t * __cuda_exp2f((float)(b - 3 * bquarter));
} }
#endif /* __MULTI_CORE__ */ #endif /* __MULTI_CORE__ */
} }
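
__cuda_ldexpf applies the 2^b scale in pieces because exp2f((float)b) overflows for b >= 128 (and flushes to zero for b <= -150) even when the product a*2^b is perfectly representable. Sketch of the two-piece split (illustrative):

#include <math.h>
#include <stdio.h>

int main(void)
{
    float a = 1.0e-30f;
    int b = 200;                     /* 2^200 alone is not a float */
    int bh = b / 2;
    float r = a * exp2f((float)bh) * exp2f((float)(b - bh));
    printf("%.7g vs %.7g\n", r, ldexpf(a, b));
    return 0;
}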
__device_func__(float __cuda_scalbnf(float a, int b)) __device_func__(float __cuda_scalbnf(float a, int b))
{ {
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
skipping to change at line 2439 skipping to change at line 2567
return fmodf(a, b); return fmodf(a, b);
#else /* __MULTI_CORE__ */ #else /* __MULTI_CORE__ */
float orig_a = a; float orig_a = a;
float orig_b = b; float orig_b = b;
a = __cuda_fabsf(a); a = __cuda_fabsf(a);
b = __cuda_fabsf(b); b = __cuda_fabsf(b);
if (!((a <= CUDART_INF_F) && (b <= CUDART_INF_F))) { if (!((a <= CUDART_INF_F) && (b <= CUDART_INF_F))) {
return orig_a + orig_b; return orig_a + orig_b;
} }
if ((a == CUDART_INF_F) || (b == CUDART_ZERO_F)) { if ((a == CUDART_INF_F) || (b == CUDART_ZERO_F)) {
return CUDART_NAN_F; return __cuda_rsqrtf (__int_as_float (0xffc00000));
} else if (a >= b) { } else if (a >= b) {
#if !defined(__CUDABE__) #if !defined(__CUDABE__)
/* Need to be able to handle denormals correctly */ /* Need to be able to handle denormals correctly */
int expoa = (a < CUDART_TWO_TO_M126_F) ? int expoa = (a < CUDART_TWO_TO_M126_F) ?
((int)__log2f(a)) : (((__float_as_int(a) >> 23) & 0xff) - 127); ((int)__log2f(a)) : (((__float_as_int(a) >> 23) & 0xff) - 127);
int expob = (b < CUDART_TWO_TO_M126_F) ? int expob = (b < CUDART_TWO_TO_M126_F) ?
((int)__log2f(b)) : (((__float_as_int(b) >> 23) & 0xff) - 127); ((int)__log2f(b)) : (((__float_as_int(b) >> 23) & 0xff) - 127);
int scale = expoa - expob; int scale = expoa - expob;
float scaled_b = __cuda_ldexpf(b, scale); float scaled_b = __cuda_ldexpf(b, scale);
if (scaled_b <= 0.5f * a) { if (scaled_b <= 0.5f * a) {
skipping to change at line 2467 skipping to change at line 2595
} }
#endif /* !__CUDABE__ */ #endif /* !__CUDABE__ */
while (scaled_b >= b) { while (scaled_b >= b) {
if (a >= scaled_b) { if (a >= scaled_b) {
a -= scaled_b; a -= scaled_b;
} }
scaled_b *= 0.5f; scaled_b *= 0.5f;
} }
return __cuda_copysignf(a, orig_a); return __cuda_copysignf(a, orig_a);
} else { } else {
if (!(a > CUDART_ZERO_F)) orig_a = orig_a + orig_a;
return orig_a; return orig_a;
} }
#endif /* __MULTI_CORE__ */ #endif /* __MULTI_CORE__ */
} }
__device_func__(float __cuda_remainderf(float a, float b)) __device_func__(float __cuda_remainderf(float a, float b))
{ {
float twoa = 0.0f; float twoa = 0.0f;
unsigned int quot0 = 0; /* quotient bit 0 */ unsigned int quot0 = 0; /* quotient bit 0 */
float orig_a = a; float orig_a = a;
float orig_b = b; float orig_b = b;
a = __cuda_fabsf(a); a = __cuda_fabsf(a);
b = __cuda_fabsf(b); b = __cuda_fabsf(b);
if (!((a <= CUDART_INF_F) && (b <= CUDART_INF_F))) { if (!((a <= CUDART_INF_F) && (b <= CUDART_INF_F))) {
return orig_a + orig_b; return orig_a + orig_b;
} }
if ((a == CUDART_INF_F) || (b == CUDART_ZERO_F)) { if ((a == CUDART_INF_F) || (b == CUDART_ZERO_F)) {
return CUDART_NAN_F; return __cuda_rsqrtf (__int_as_float (0xffc00000));
} else if (a >= b) { } else if (a >= b) {
#if !defined(__CUDABE__) #if !defined(__CUDABE__)
int expoa = (a < CUDART_TWO_TO_M126_F) ? int expoa = (a < CUDART_TWO_TO_M126_F) ?
((int)__log2f(a)) : (((__float_as_int(a) >> 23) & 0xff) - 127); ((int)__log2f(a)) : (((__float_as_int(a) >> 23) & 0xff) - 127);
int expob = (b < CUDART_TWO_TO_M126_F) ? int expob = (b < CUDART_TWO_TO_M126_F) ?
((int)__log2f(b)) : (((__float_as_int(b) >> 23) & 0xff) - 127); ((int)__log2f(b)) : (((__float_as_int(b) >> 23) & 0xff) - 127);
int scale = expoa - expob; int scale = expoa - expob;
float scaled_b = __cuda_ldexpf(b, scale); float scaled_b = __cuda_ldexpf(b, scale);
if (scaled_b <= 0.5f * a) { if (scaled_b <= 0.5f * a) {
scaled_b *= 2.0f; scaled_b *= 2.0f;
skipping to change at line 2564 skipping to change at line 2693
/* quo has a value whose sign is the sign of x/y */ /* quo has a value whose sign is the sign of x/y */
sign = 0 - (__cuda___signbitf(a) != __cuda___signbitf(b)); sign = 0 - (__cuda___signbitf(a) != __cuda___signbitf(b));
a = __cuda_fabsf(a); a = __cuda_fabsf(a);
b = __cuda_fabsf(b); b = __cuda_fabsf(b);
if (!((a <= CUDART_INF_F) && (b <= CUDART_INF_F))) { if (!((a <= CUDART_INF_F) && (b <= CUDART_INF_F))) {
*quo = quot; *quo = quot;
return orig_a + orig_b; return orig_a + orig_b;
} }
if ((a == CUDART_INF_F) || (b == CUDART_ZERO_F)) { if ((a == CUDART_INF_F) || (b == CUDART_ZERO_F)) {
*quo = quot; *quo = quot;
return CUDART_NAN_F; return __cuda_rsqrtf (__int_as_float (0xffc00000));
} else if (a >= b) { } else if (a >= b) {
#if !defined(__CUDABE__) #if !defined(__CUDABE__)
/* Need to be able to handle denormals correctly */ /* Need to be able to handle denormals correctly */
int expoa = (a < CUDART_TWO_TO_M126_F) ? int expoa = (a < CUDART_TWO_TO_M126_F) ?
((int)__log2f(a)) : (((__float_as_int(a) >> 23) & 0xff) - 127); ((int)__log2f(a)) : (((__float_as_int(a) >> 23) & 0xff) - 127);
int expob = (b < CUDART_TWO_TO_M126_F) ? int expob = (b < CUDART_TWO_TO_M126_F) ?
((int)__log2f(b)) : (((__float_as_int(b) >> 23) & 0xff) - 127); ((int)__log2f(b)) : (((__float_as_int(b) >> 23) & 0xff) - 127);
int scale = expoa - expob; int scale = expoa - expob;
float scaled_b = __cuda_ldexpf(b, scale); float scaled_b = __cuda_ldexpf(b, scale);
if (scaled_b <= 0.5f * a) { if (scaled_b <= 0.5f * a) {
skipping to change at line 2640 skipping to change at line 2769
__float_as_int(a)); __float_as_int(a));
quot = quot & CUDART_REMQUO_MASK_F; quot = quot & CUDART_REMQUO_MASK_F;
quot = quot ^ sign; quot = quot ^ sign;
quot = quot - sign; quot = quot - sign;
*quo = quot; *quo = quot;
return a; return a;
} }
__device_func__(float __cuda_fmaf(float a, float b, float c)) __device_func__(float __cuda_fmaf(float a, float b, float c))
{ {
unsigned int xx, yy, zz, ww; return __fmaf_rn(a, b, c);
unsigned int temp, s, u;
unsigned int expo_x, expo_y, expo_z;
xx = __float_as_int(a);
yy = __float_as_int(b);
zz = __float_as_int(c);
#if defined(__CUDABE__)
/* Match 'denormals are zero' behavior of the GPU */
if ((xx << 1) < 0x01000000) xx &= 0x80000000;
if ((yy << 1) < 0x01000000) yy &= 0x80000000;
if ((zz << 1) < 0x01000000) zz &= 0x80000000;
#endif /* __CUDABE__ */
temp = 0xff;
expo_x = temp & (xx >> 23);
expo_x = expo_x - 1;
expo_y = temp & (yy >> 23);
expo_y = expo_y - 1;
expo_z = temp & (zz >> 23);
expo_z = expo_z - 1;
if (!((expo_x <= 0xFD) &&
(expo_y <= 0xFD) &&
(expo_z <= 0xFD))) {
/* fma (nan, y, z) --> nan
fma (x, nan, z) --> nan
fma (x, y, nan) --> nan
*/
if ((yy << 1) > 0xff000000) {
return CUDART_NAN_F;
}
if ((zz << 1) > 0xff000000) {
return CUDART_NAN_F;
}
if ((xx << 1) > 0xff000000) {
return CUDART_NAN_F;
}
/* fma (0, inf, z) --> NaN
fma (inf, 0, z) --> NaN
fma (-inf,+y,+inf) --> NaN
fma (+x,-inf,+inf) --> NaN
fma (+inf,-y,+inf) --> NaN
fma (-x,+inf,+inf) --> NaN
fma (-inf,-y,-inf) --> NaN
fma (-x,-inf,-inf) --> NaN
fma (+inf,+y,-inf) --> NaN
fma (+x,+inf,-inf) --> NaN
*/
if ((((xx << 1) == 0) && ((yy << 1) == 0xff000000)) ||
(((yy << 1) == 0) && ((xx << 1) == 0xff000000))) {
return CUDART_NAN_F;
}
if ((zz << 1) == 0xff000000) {
if (((yy << 1) == 0xff000000) || ((xx << 1) == 0xff000000)) {
if ((int)(xx ^ yy ^ zz) < 0) {
return CUDART_NAN_F;
}
}
}
/* fma (inf, y, z) --> inf
fma (x, inf, z) --> inf
fma (x, y, inf) --> inf
*/
if ((xx << 1) == 0xff000000) {
xx = xx ^ (yy & 0x80000000);
return __int_as_float(xx);
}
if ((yy << 1) == 0xff000000) {
yy = yy ^ (xx & 0x80000000);
return __int_as_float(yy);
}
if ((zz << 1) == 0xff000000) {
return __int_as_float(zz);
}
/* fma (+0, -y, -0) --> -0
fma (-0, +y, -0) --> -0
fma (+x, -0, -0) --> -0
fma (-x, +0, -0) --> -0
*/
if (zz == 0x80000000) {
if (((xx << 1) == 0) || ((yy << 1) == 0)) {
if ((int)(xx ^ yy) < 0) {
return __int_as_float(zz);
}
}
}
/* fma (0, y, 0) --> +0
fma (x, 0, 0) --> +0
*/
if (((zz << 1) == 0) &&
(((xx << 1) == 0) || ((yy << 1) == 0))) {
zz &= 0x7fffffff;
return __int_as_float(zz);
}
/* fma (0, y, z) --> z
fma (x, 0, z) --> z
*/
if (((xx << 1) == 0) || ((yy << 1) == 0)) {
return __int_as_float(zz);
}
/* normalize x, if denormal */
if (expo_x == (unsigned int)-1) {
temp = xx & 0x80000000;
xx = xx << 8;
while (!(xx & 0x80000000)) {
xx <<= 1;
expo_x--;
}
expo_x++;
xx = (xx >> 8) | temp;
}
/* normalize y, if denormal */
if (expo_y == (unsigned int)-1) {
temp = yy & 0x80000000;
yy = yy << 8;
while (!(yy & 0x80000000)) {
yy <<= 1;
expo_y--;
}
expo_y++;
yy = (yy >> 8) | temp;
}
/* normalize z, if denormal */
if ((expo_z == (unsigned int)-1) && ((zz << 1) != 0)) {
temp = zz & 0x80000000;
zz = zz << 8;
while (!(zz & 0x80000000)) {
zz <<= 1;
expo_z--;
}
expo_z++;
zz = (zz >> 8) | temp;
}
}
expo_x = expo_x + expo_y;
expo_y = xx ^ yy;
xx = xx & 0x00ffffff;
yy = yy << 8;
xx = xx | 0x00800000;
yy = yy | 0x80000000;
s = __umulhi(xx, yy);
yy = xx * yy;
xx = s;
expo_x = expo_x - 127 + 2;
expo_y = expo_y & 0x80000000;
/* normalize mantissa */
if (xx < 0x00800000) {
xx = (xx << 1) | (yy >> 31);
yy = (yy << 1);
expo_x--;
}
temp = 0;
if ((zz << 1) != 0) { /* z is not zero */
s = zz & 0x80000000;
zz &= 0x00ffffff;
zz |= 0x00800000;
ww = 0;
/* compare and swap. put augend into xx:yy */
if ((int)expo_z > (int)expo_x) {
temp = expo_z;
expo_z = expo_x;
expo_x = temp;
temp = zz;
zz = xx;
xx = temp;
temp = ww;
ww = yy;
yy = temp;
temp = expo_y;
expo_y = s;
s = temp;
}
/* augend_sign = expo_y, augend_mant = xx:yy, augend_expo = expo_x */
/* addend_sign = s, addend_mant = zz:ww, addend_expo = expo_z */
expo_z = expo_x - expo_z;
u = expo_y ^ s;
if (expo_z <= 49) {
/* denormalize addend */
temp = 0;
while (expo_z >= 32) {
temp = ww | (temp != 0);
ww = zz;
zz = 0;
expo_z -= 32;
}
if (expo_z) {
temp = ((temp >> expo_z) | (ww << (32 - expo_z)) |
((temp << (32 - expo_z)) != 0));
ww = (ww >> expo_z) | (zz << (32 - expo_z));
zz = (zz >> expo_z);
}
} else {
temp = 1;
ww = 0;
zz = 0;
}
if ((int)u < 0) {
/* signs differ, effective subtraction */
temp = (unsigned)(-(int)temp);
s = (temp != 0);
u = yy - s;
s = u > yy;
yy = u - ww;
s += yy > u;
xx = (xx - zz) - s;
if (!(xx | yy | temp)) {
/* complete cancelation, return 0 */
return __int_as_float(xx);
}
if ((int)xx < 0) {
/* Oops, augend had smaller mantissa. Negate mantissa and flip
sign of result
*/
temp = ~temp;
yy = ~yy;
xx = ~xx;
if (++temp == 0) {
if (++yy == 0) {
++xx;
}
}
expo_y ^= 0x80000000;
}
/* normalize mantissa, if necessary */
while (!(xx & 0x00800000)) {
xx = (xx << 1) | (yy >> 31);
yy = (yy << 1);
expo_x--;
}
} else {
/* signs are the same, effective addition */
yy = yy + ww;
s = yy < ww;
xx = xx + zz + s;
if (xx & 0x01000000) {
temp = temp | (yy << 31);
yy = (yy >> 1) | (xx << 31);
xx = ((xx & 0x80000000) | (xx >> 1)) & ~0x40000000;
expo_x++;
}
}
}
temp = yy | (temp != 0);
if (expo_x <= 0xFD) {
/* normal */
xx |= expo_y; /* or in sign bit */
s = xx & 1; /* mantissa lsb */
xx += (temp == 0x80000000) ? s : (temp >> 31);
xx = xx + (expo_x << 23); /* add in exponent */
return __int_as_float(xx);
} else if ((int)expo_x >= 126) {
/* overflow */
xx = expo_y | 0x7f800000;
return __int_as_float(xx);
}
/* subnormal */
expo_x = (unsigned int)(-(int)expo_x);
if (expo_x > 25) {
/* massive underflow: return 0 */
return __int_as_float(expo_y);
}
yy = (xx << (32 - expo_x)) | ((yy) ? 1 : 0);
xx = expo_y + (xx >> expo_x);
xx = xx + ((yy==0x80000000) ? (xx & 1) : (yy >> 31));
xx |= expo_y; /* or in sign bit */
#if defined(__CUDABE__)
/* Match 'flush to zero' response of the GPU */
if ((xx << 1) < 0x01000000) xx = expo_y;
#endif /* __CUDABE__ */
return __int_as_float(xx);
} }
__device_func__(float __internal_accurate_powf(float a, float b)) __device_func__(float __internal_accurate_powf(float a, float b))
{ {
float2 loga, prod; float2 loga, prod;
#if !defined(__CUDABE__) && defined(_MSC_VER) && !defined(_WIN64) #if !defined(__CUDABE__) && defined(_MSC_VER) && !defined(_WIN64)
volatile float t; volatile float t;
#else #else
float t; float t;
#endif #endif
skipping to change at line 2948 skipping to change at line 2803
prod.x = prod.x + __int_as_float(0x37000000); prod.x = prod.x + __int_as_float(0x37000000);
} }
/* compute pow(a,b) = exp(b*log(a)) */ /* compute pow(a,b) = exp(b*log(a)) */
t = __cuda_expf(prod.y); t = __cuda_expf(prod.y);
/* prevent -INF + INF = NaN */ /* prevent -INF + INF = NaN */
if (t != CUDART_INF_F) { if (t != CUDART_INF_F) {
/* if prod.x is much smaller than prod.y, then exp(prod.y+prod.x) ~= /* if prod.x is much smaller than prod.y, then exp(prod.y+prod.x) ~=
* exp(prod.y) + prod.x * exp(prod.y) * exp(prod.y) + prod.x * exp(prod.y)
*/ */
t = t * prod.x + t; t = __internal_fmad (t, prod.x, t);
} }
return t; return t;
} }
__device_func__(float __cuda_powif(float a, int b)) __device_func__(float __cuda_powif(float a, int b))
{ {
unsigned int e = __cuda_abs(b); unsigned int e = __cuda_abs(b);
float r = 1.0f; float r = 1.0f;
while (1) { while (1) {
if ((e & 1) != 0) { if ((e & 1) != 0) {
r = r * a; r = r * a;
} }
e = e >> 1; e = e >> 1;
if (e == 0) { if (e == 0) {
return b < 0 ? 1.0f/r : r; return b < 0 ? 1.0f / r : r;
} }
a = a * a; a = a * a;
} }
} }
__device_func__(double __cuda_powi(double a, int b)) __device_func__(double __cuda_powi(double a, int b))
{ {
unsigned int e = __cuda_abs(b); unsigned int e = __cuda_abs(b);
double r = 1.0; double r = 1.0;
while (1) { while (1) {
if ((e & 1) != 0) { if ((e & 1) != 0) {
r = r * a; r = r * a;
} }
e = e >> 1; e = e >> 1;
if (e == 0) { if (e == 0) {
return b < 0 ? 1.0/r : r; return b < 0 ? 1.0 / r : r;
} }
a = a * a; a = a * a;
} }
} }
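
Both powi variants are binary exponentiation (square-and-multiply): one bit of |b| is consumed per iteration while the base is squared, so the cost is O(log|b|) multiplies, with a single reciprocal at the end for negative exponents. A self-contained double version (the overflow-safe negation via long long is an addition here, not in the header):

#include <stdio.h>

static double powi(double a, int b)
{
    unsigned int e = (b < 0) ? (unsigned int)(-(long long)b)
                             : (unsigned int)b;
    double r = 1.0;
    for (;;) {
        if (e & 1) r = r * a;    /* fold in the current bit of |b| */
        e >>= 1;
        if (e == 0) return b < 0 ? 1.0 / r : r;
        a = a * a;               /* a^(2^k) for the next bit */
    }
}

int main(void)
{
    printf("%g %g %g\n", powi(2.0, 10), powi(3.0, -3), powi(1.5, 0));
    return 0;
}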
__device_func__(float __cuda_powf(float a, float b)) __device_func__(float __cuda_powf(float a, float b))
{ {
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
return powf(a, b); return powf(a, b);
#elif defined(__USE_FAST_MATH__) #elif defined(__USE_FAST_MATH__)
skipping to change at line 3024 skipping to change at line 2879
} }
bIsOddInteger = (b - (2.0f * floorf(0.5f * b))) == 1.0f; bIsOddInteger = (b - (2.0f * floorf(0.5f * b))) == 1.0f;
if (a == CUDART_ZERO_F) { if (a == CUDART_ZERO_F) {
t = bIsOddInteger ? a : CUDART_ZERO_F; t = bIsOddInteger ? a : CUDART_ZERO_F;
if (b < CUDART_ZERO_F) { if (b < CUDART_ZERO_F) {
t = 1.0f / t; t = 1.0f / t;
} }
return t; return t;
} }
if (a == -CUDART_INF_F) { if (a == -CUDART_INF_F) {
t = (b < CUDART_ZERO_F) ? -1.0f/a : -a; t = - ((b < CUDART_ZERO_F) ? (1.0f / a) : a);
if (bIsOddInteger) { if (bIsOddInteger) {
t = __int_as_float(__float_as_int(t) ^ 0x80000000); t = __int_as_float(__float_as_int(t) ^ 0x80000000);
} }
return t; return t;
} }
if ((a < CUDART_ZERO_F) && (b != __cuda_truncf(b))) { if ((a < CUDART_ZERO_F) && (b != __cuda_truncf(b))) {
return CUDART_NAN_F; return __cuda_rsqrtf(__int_as_float(0xffc00000));
} }
t = __cuda_fabsf(a); t = __cuda_fabsf(a);
t = __internal_accurate_powf(t, b); t = __internal_accurate_powf(t, b);
if ((a < CUDART_ZERO_F) && bIsOddInteger) { if ((a < CUDART_ZERO_F) && bIsOddInteger) {
t = __int_as_float(__float_as_int(t) ^ 0x80000000); t = __int_as_float(__float_as_int(t) ^ 0x80000000);
} }
return t; return t;
#endif /* __MULTI_CORE__ */ #endif /* __MULTI_CORE__ */
} }
/* approximate 1.0/(x*gamma(x)) on [-0.5,0.5] */ /* approximate 1.0/(x*gamma(x)) on [-0.5,0.5] */
__device_func__(float __internal_tgammaf_kernel(float a)) __device_func__(float __internal_tgammaf_kernel(float a))
{ {
float t; float t;
t = - 1.05767296987211380E-003f; t = -1.05767296987211380E-003f;
t = t * a + 7.09279059435508670E-003f; t = __internal_fmad (t, a, 7.09279059435508670E-003f);
t = t * a - 9.65347121958557050E-003f; t = __internal_fmad (t, a, -9.65347121958557050E-003f);
t = t * a - 4.21736613253687960E-002f; t = __internal_fmad (t, a, -4.21736613253687960E-002f);
t = t * a + 1.66542401247154280E-001f; t = __internal_fmad (t, a, 1.66542401247154280E-001f);
t = t * a - 4.20043267827838460E-002f; t = __internal_fmad (t, a, -4.20043267827838460E-002f);
t = t * a - 6.55878234051332940E-001f; t = __internal_fmad (t, a, -6.55878234051332940E-001f);
t = t * a + 5.77215696929794240E-001f; t = __internal_fmad (t, a, 5.77215696929794240E-001f);
t = t * a + 1.00000000000000000E+000f; t = __internal_fmad (t, a, 1.00000000000000000E+000f);
return t; return t;
} }
/* Based on: Kraemer, W.: "Berechnung der Gammafunktion G(x) fuer reelle Punkt- /* Based on: Kraemer, W.: "Berechnung der Gammafunktion G(x) fuer reelle Punkt-
und Intervallargumente". Zeitschrift fuer angewandte Mathematik und und Intervallargumente". Zeitschrift fuer angewandte Mathematik und
Mechanik, Vol. 70 (1990), No. 6, pp. 581-584 Mechanik, Vol. 70 (1990), No. 6, pp. 581-584
*/ */
__device_func__(float __cuda_tgammaf(float a)) __device_func__(float __cuda_tgammaf(float a))
{ {
float s, xx, x=a; float s, xx, x=a;
skipping to change at line 3083 skipping to change at line 2938
xx = xx - 1.0f; xx = xx - 1.0f;
s = s * xx; s = s * xx;
} }
if (x >= 0.5f) { if (x >= 0.5f) {
xx = xx - 1.0f; xx = xx - 1.0f;
} }
xx = __internal_tgammaf_kernel(xx); xx = __internal_tgammaf_kernel(xx);
if (x < 0.5f) { if (x < 0.5f) {
xx = xx * x; xx = xx * x;
} }
s = s / xx; s = __fdividef(s, xx);
if (x > 34.03f) { if (x > 34.03f) {
/* Cannot use s = s * x - s due to intermediate overflow! */ /* Cannot use s = s * x - s due to intermediate overflow! */
xx = x - 1.0f; xx = x - 1.0f;
s = s * xx; s = s * xx;
} }
return s; return s;
} else { } else {
if (x == __cuda_floorf(x)) { /* x is negative integer */ if (x == __cuda_floorf(x)) { /* x is negative integer */
x = CUDART_NAN_F; /* NaN, propagates through on device */ x = CUDART_NAN_F; /* NaN, propagates through on device */
#if !defined(__CUDABE__) #if !defined(__CUDABE__)
skipping to change at line 3133 skipping to change at line 2988
return s; return s;
} }
} }
__device_func__(float __cuda_roundf(float a)) __device_func__(float __cuda_roundf(float a))
{ {
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
return roundf(a); return roundf(a);
#else /* __MULTI_CORE__ */ #else /* __MULTI_CORE__ */
float fa = __cuda_fabsf(a); float fa = __cuda_fabsf(a);
if (fa > CUDART_TWO_TO_23_F) { float u = __cuda_copysignf (0.5f, a);
return a; u = __cuda_truncf (a + u);
} else { if (fa > CUDART_TWO_TO_23_F) u = a;
float u = __cuda_floorf(fa + 0.5f); if (fa < 0.5f) u = __cuda_truncf (a);
if (fa < 0.5f) u = 0.0f; return u;
return __cuda_copysignf(u, a);
}
#endif /* __MULTI_CORE__ */ #endif /* __MULTI_CORE__ */
} }
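
The rewritten __cuda_roundf computes trunc(a + copysign(0.5, a)) and then patches two corners: |a| > 2^23, where the value is already integral and the add could perturb it, and |a| < 0.5, where a + 0.5 can round up to 1.0f (for the float just below 0.5, the sum is a round-to-even tie at 1.0). Host-side sketch (illustrative):

#include <math.h>
#include <stdio.h>

static float my_roundf(float a)
{
    float fa = fabsf(a);
    float u = truncf(a + copysignf(0.5f, a));
    if (fa > 8388608.0f) u = a;        /* 2^23: already an integer */
    if (fa < 0.5f) u = truncf(a);      /* avoid the 0.49999997f tie */
    return u;
}

int main(void)
{
    float tricky = nextafterf(0.5f, 0.0f);   /* 0.49999997f */
    printf("%g %g %g %g\n", my_roundf(2.5f), my_roundf(-2.5f),
           my_roundf(tricky), my_roundf(8388609.0f));
    return 0;
}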
__device_func__(long long int __internal_llroundf_kernel(float a)) __device_func__(long long int __internal_llroundf_kernel(float a))
{ {
unsigned long long int res, t = 0LL; unsigned long long int res, t = 0LL;
int shift; int shift;
unsigned int ia = __float_as_int(a); unsigned int ia = __float_as_int(a);
if ((ia << 1) > 0xff000000) return 0LL; if ((ia << 1) > 0xff000000) return 0LL;
if ((int)ia >= 0x5f000000) return 0x7fffffffffffffffLL; if ((int)ia >= 0x5f000000) return 0x7fffffffffffffffLL;
skipping to change at line 3161 skipping to change at line 3014
shift = 189 - ((ia >> 23) & 0xff); shift = 189 - ((ia >> 23) & 0xff);
res = ((long long int)(((ia << 8) | 0x80000000) >> 1)) << 32; res = ((long long int)(((ia << 8) | 0x80000000) >> 1)) << 32;
if (shift >= 64) { if (shift >= 64) {
t = res; t = res;
res = 0; res = 0;
} else if (shift) { } else if (shift) {
t = res << (64 - shift); t = res << (64 - shift);
res = res >> shift; res = res >> shift;
} }
if (t >= 0x8000000000000000LL) { if (t >= 0x8000000000000000LL) {
res++; res++;
} }
if ((int)ia < 0) res = (unsigned long long int)(-(long long int)res); if ((int)ia < 0) res = (unsigned long long int)(-(long long int)res);
return (long long int)res; return (long long int)res;
} }
__device_func__(long long int __cuda_llroundf(float a)) __device_func__(long long int __cuda_llroundf(float a))
{ {
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
return llroundf(a); return llroundf(a);
#else /* __MULTI_CORE__ */ #else /* __MULTI_CORE__ */
skipping to change at line 3749 skipping to change at line 3602
__func__(int ilogbf(float a)) __func__(int ilogbf(float a))
{ {
return ilogb((double)a); return ilogb((double)a);
} }
__func__(float erff(float a)) __func__(float erff(float a))
{ {
return (float)erf((double)a); return (float)erf((double)a);
} }
__func__(float erfinvf(float a))
{
return (float)erfinv((double)a);
}
__func__(float erfcf(float a)) __func__(float erfcf(float a))
{ {
return (float)erfc((double)a); return (float)erfc((double)a);
} }
__func__(float erfcinvf(float a))
{
return (float)erfcinv((double)a);
}
__func__(float lgammaf(float a)) __func__(float lgammaf(float a))
{ {
return (float)lgamma((double)a); return (float)lgamma((double)a);
} }
__func__(float tgammaf(float a)) __func__(float tgammaf(float a))
{ {
return (float)tgamma((double)a); return (float)tgamma((double)a);
} }
skipping to change at line 3821 skipping to change at line 3684
__func__(double tgamma(double a)) __func__(double tgamma(double a))
{ {
return (double)__cuda_tgammaf((float)a); return (double)__cuda_tgammaf((float)a);
} }
__func__(double erf(double a)) __func__(double erf(double a))
{ {
return (double)__cuda_erff((float)a); return (double)__cuda_erff((float)a);
} }
__func__(double erfinv(double a))
{
return (double)__cuda_erfinvf((float)a);
}
__func__(double erfc(double a)) __func__(double erfc(double a))
{ {
return (double)__cuda_erfcf((float)a); return (double)__cuda_erfcf((float)a);
} }
__func__(double erfcinv(double a))
{
return (double)__cuda_erfcinvf((float)a);
}
__func__(double remquo(double a, double b, int *quo)) __func__(double remquo(double a, double b, int *quo))
{ {
return (double)__cuda_remquof((float)a, (float)b, quo); return (double)__cuda_remquof((float)a, (float)b, quo);
} }
__func__(double remainder(double a, double b)) __func__(double remainder(double a, double b))
{ {
return (double)__cuda_remainderf((float)a, (float)b); return (double)__cuda_remainderf((float)a, (float)b);
} }
 End of changes. 97 change blocks. 
466 lines changed or deleted 341 lines changed or added


 math_functions_dbl_ptx1.h   math_functions_dbl_ptx1.h 
/* /*
* Copyright 1993-2008 NVIDIA Corporation. All rights reserved. * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 240 skipping to change at line 240
__device_func__(double __cuda_cbrt(double a)) __device_func__(double __cuda_cbrt(double a))
{ {
return (double)__cuda_cbrtf((float)a); return (double)__cuda_cbrtf((float)a);
} }
__device_func__(double __cuda_erf(double a)) __device_func__(double __cuda_erf(double a))
{ {
return (double)__cuda_erff((float)a); return (double)__cuda_erff((float)a);
} }
__device_func__(double __cuda_erfinv(double a))
{
return (double)__cuda_erfinvf((float)a);
}
__device_func__(double __cuda_erfc(double a)) __device_func__(double __cuda_erfc(double a))
{ {
return (double)__cuda_erfcf((float)a); return (double)__cuda_erfcf((float)a);
} }
__device_func__(double __cuda_erfcinv(double a))
{
return (double)__cuda_erfcinvf((float)a);
}
__device_func__(double __cuda_lgamma(double a)) __device_func__(double __cuda_lgamma(double a))
{ {
return (double)__cuda_lgammaf((float)a); return (double)__cuda_lgammaf((float)a);
} }
__device_func__(double __cuda_tgamma(double a)) __device_func__(double __cuda_tgamma(double a))
{ {
return (double)__cuda_tgammaf((float)a); return (double)__cuda_tgammaf((float)a);
} }
 End of changes. 3 change blocks. 
1 lines changed or deleted 11 lines changed or added


 math_functions_dbl_ptx3.h   math_functions_dbl_ptx3.h 
/* /*
* Copyright 1993-2008 NVIDIA Corporation. All rights reserved. * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 52 skipping to change at line 52
#elif !defined(__CUDACC__) #elif !defined(__CUDACC__)
#include "crt/func_macro.h" #include "crt/func_macro.h"
#define INT_MAX \ #define INT_MAX \
((int)((unsigned int)-1 >> 1)) ((int)((unsigned int)-1 >> 1))
#include "device_functions.h" #include "device_functions.h"
#include "math_constants.h" #include "math_constants.h"
#if !defined(__CUDABE__)
#include "common_types.h"
#endif
/************************************************************************** ***** /************************************************************************** *****
* * * *
* DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS * * DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
__device_func__(double __cuda_fabs(double a)) __device_func__(double __cuda_fabs(double a))
{ {
return fabs(a); return fabs(a);
} }
__device_func__(double __cuda_fmax(double a, double b)) __device_func__(double __cuda_fmax(double a, double b))
{ {
#if !defined(__CUDABE__) #if !defined(__CUDABE__)
volatile union { volatile union __cudart_DoubleUlonglongCvt cvta, cvtb;
double d; int nana, nanb;
unsigned long long int l;
} cvta, cvtb;
cvta.d = a; cvta.d = a;
cvtb.d = b; cvtb.d = b;
if ((cvtb.l << 1) > 0xffe0000000000000ULL) return a; nana = ((cvta.i << 1) > 0xffe0000000000000ULL);
if ((cvta.l << 1) > 0xffe0000000000000ULL) return b; nanb = ((cvtb.i << 1) > 0xffe0000000000000ULL);
if (nana && nanb) return a + b;
if (nana) return b;
if (nanb) return a;
if ((cvta.d == 0.0) && (cvtb.d == 0.0)) { if ((cvta.d == 0.0) && (cvtb.d == 0.0)) {
cvta.l &= cvtb.l; cvta.i &= cvtb.i;
return cvta.d; return cvta.d;
} }
return a > b ? a : b; return a > b ? a : b;
#else #else
return fmax(a, b); return fmax(a, b);
#endif /* !defined(__CUDABE__) */ #endif /* !defined(__CUDABE__) */
} }
__device_func__(double __cuda_fmin(double a, double b)) __device_func__(double __cuda_fmin(double a, double b))
{ {
#if !defined(__CUDABE__) #if !defined(__CUDABE__)
volatile union { volatile union __cudart_DoubleUlonglongCvt cvta, cvtb;
double d; int nana, nanb;
unsigned long long int l;
} cvta, cvtb;
cvta.d = a; cvta.d = a;
cvtb.d = b; cvtb.d = b;
if ((cvtb.l << 1) > 0xffe0000000000000ULL) return a; nana = ((cvta.i << 1) > 0xffe0000000000000ULL);
if ((cvta.l << 1) > 0xffe0000000000000ULL) return b; nanb = ((cvtb.i << 1) > 0xffe0000000000000ULL);
if ((cvta.l | cvtb.l) == 0x8000000000000000ULL) { if (nana && nanb) return a + b;
if (nana) return b;
if (nanb) return a;
if ((cvta.i | cvtb.i) == 0x8000000000000000ULL) {
return CUDART_NEG_ZERO ; return CUDART_NEG_ZERO ;
} }
return a < b ? a : b; return a < b ? a : b;
#else #else
return fmin(a, b); return fmin(a, b);
#endif /* !defined(__CUDABE__) */ #endif /* !defined(__CUDABE__) */
} }
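
In the reworked __cuda_fmax/__cuda_fmin, (bits << 1) > 0xffe0000000000000 is a sign-independent NaN test (the shift discards the sign; only an all-ones exponent with a nonzero mantissa exceeds the infinity pattern), and OR/AND of the raw payloads resolves the signed-zero cases: OR yields -0.0 for fmin when either input is -0.0, AND yields +0.0 for fmax. Host-side sketch with memcpy in place of the volatile union (illustrative):

#include <math.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>

static int is_nan_bits(double x)
{
    uint64_t i;
    memcpy(&i, &x, sizeof i);
    return (i << 1) > 0xffe0000000000000ULL;
}

int main(void)
{
    uint64_t ia, ib, ir;
    double a = -0.0, b = 0.0, r;
    printf("nan? %d %d %d\n",
           is_nan_bits(nan("")), is_nan_bits(INFINITY), is_nan_bits(1.0));
    memcpy(&ia, &a, sizeof ia);
    memcpy(&ib, &b, sizeof ib);
    ir = ia | ib;                  /* 0x8000000000000000, i.e. -0.0 */
    memcpy(&r, &ir, sizeof r);
    printf("min(-0,+0) -> %g (signbit %d)\n", r, signbit(r) != 0);
    return 0;
}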
__device_func__(double __cuda_ceil(double a)) __device_func__(double __cuda_ceil(double a))
{ {
skipping to change at line 539 skipping to change at line 545
z = __internal_trig_reduction_kerneld(a, &i); z = __internal_trig_reduction_kerneld(a, &i);
/* here, abs(z) <= pi/4, and i has the quadrant */ /* here, abs(z) <= pi/4, and i has the quadrant */
z = __internal_tan_kerneld(z, i & 1); z = __internal_tan_kerneld(z, i & 1);
return z; return z;
} }
__device_func__(double __cuda_log(double a)) __device_func__(double __cuda_log(double a))
{ {
double m, f, g, u, v, tmp, q, ulo, log_lo, log_hi; double m, f, g, u, v, tmp, q, ulo, log_lo, log_hi;
int ihi, ilo; int ihi, ilo;
int e = 0;
ihi = __double2hiint(a); ihi = __double2hiint(a);
ilo = __double2loint(a); ilo = __double2loint(a);
if (__cuda___isnan(a)) {
return a + a; if ((a > CUDART_ZERO) && (a < CUDART_INF)) {
} int e = -1023;
/* log(x) is undefined for x < 0.0, return INDEFINITE */ /* normalize denormals */
if (a < 0.0) { if ((unsigned)ihi < (unsigned)0x00100000) {
a = a * CUDART_TWO_TO_54;
e -= 54;
ihi = __double2hiint(a);
ilo = __double2loint(a);
}
/* a = m * 2^e. m <= sqrt(2): log2(a) = log2(m) + e.
* m > sqrt(2): log2(a) = log2(m/2) + (e+1)
*/
e += ((ihi >> 20) & 0x7ff);
ihi = (ihi & 0x800fffff) | 0x3ff00000;
m = __hiloint2double (ihi, ilo);
if ((unsigned)ihi > (unsigned)0x3ff6a09e) {
m = __internal_half(m);
e = e + 1;
}
/* log((1+m)/(1-m)) = 2*atanh(m). log(m) = 2*atanh ((m-1)/(m+1)) */
f = m - 1.0;
g = m + 1.0;
g = 1.0 / g;
u = f * g;
u = u + u;
/* u = 2.0 * (m - 1.0) / (m + 1.0) */
v = u * u;
q = 6.7261411553826339E-2/65536.0;
q = __fma_rn (q, v, 6.6133829643643394E-2/16384.0);
q = __fma_rn (q, v, 7.6940931149150890E-2/4096.0);
q = __fma_rn (q, v, 9.0908745692137444E-2/1024.0);
q = __fma_rn (q, v, 1.1111111499059706E-1/256.0);
q = __fma_rn (q, v, 1.4285714283305975E-1/64.0);
q = __fma_rn (q, v, 2.0000000000007223E-1/16.0);
q = __fma_rn (q, v, 3.3333333333333326E-1/4.0);
tmp = __internal_twice (f - u);
tmp = __fma_rn (-u, f, tmp); // tmp = remainder of division
ulo = g * tmp; // less significant quotient bits
/* u + ulo = 2.0 * (m - 1.0) / (m + 1.0) to more than double precision */
q = q * v;
q = q * u;
/* log_hi + log_lo = log(m) to more than double precision */
log_hi = u;
log_lo = ulo + q;
/* log_hi + log_lo = log(m)+e*log(2)=log(a) to more than double precision */
q = __fma_rn ( e, CUDART_LN2_HI, log_hi);
tmp = __fma_rn (-e, CUDART_LN2_HI, q);
tmp = tmp - log_hi;
log_hi = q;
log_lo = log_lo - tmp;
log_lo = __fma_rn (e, CUDART_LN2_LO, log_lo);
return log_hi + log_lo;
} else {
if (__cuda___isnan(a)) {
return a + a;
}
/* log(0) = -INF */
if (a == 0) {
return -CUDART_INF;
}
/* log(INF) = INF */
if (a == CUDART_INF) {
return a;
}
/* log(x) is undefined for x < 0.0, return INDEFINITE */
return CUDART_NAN; return CUDART_NAN;
} }
/* log(0) = -INF */
if (a == 0) {
return -CUDART_INF;
}
/* log(INF) = INF */
if (__cuda___isinf(a)) {
return a;
}
/* normalize denormals */
if (a < CUDART_TWO_TO_M1022) {
a = a * CUDART_TWO_TO_54;
e = -54;
ihi = __double2hiint(a);
ilo = __double2loint(a);
}
/* a = m * 2^e. m <= sqrt(2): log2(a) = log2(m) + e.
* m > sqrt(2): log2(a) = log2(m/2) + (e+1)
*/
e += ((ihi >> 20) & 0x7ff) - 1023;
m = __hiloint2double ((ihi & 0x800fffff) | 0x3ff00000, ilo);
if (m > CUDART_SQRT_TWO) {
m = __internal_half(m);
e = e + 1;
}
/* log((1+m)/(1-m)) = 2*atanh(m). log(m) = 2*atanh ((m-1)/(m+1)) */
f = m - 1.0;
g = m + 1.0;
g = 1.0 / g;
u = f * g;
u = u + u;
/* u = 2.0 * (m - 1.0) / (m + 1.0) */
v = u * u;
q = 6.7261411553826339E-2/65536.0;
q = __fma_rn (q, v, 6.6133829643643394E-2/16384.0);
q = __fma_rn (q, v, 7.6940931149150890E-2/4096.0);
q = __fma_rn (q, v, 9.0908745692137444E-2/1024.0);
q = __fma_rn (q, v, 1.1111111499059706E-1/256.0);
q = __fma_rn (q, v, 1.4285714283305975E-1/64.0);
q = __fma_rn (q, v, 2.0000000000007223E-1/16.0);
q = __fma_rn (q, v, 3.3333333333333326E-1/4.0);
tmp = __internal_twice (f - u);
tmp = __fma_rn (-u, f, tmp); // tmp = remainder of division
ulo = g * tmp; // less significant quotient bits
/* u + ulo = 2.0 * (m - 1.0) / (m + 1.0) to more than double precision */
q = q * v;
q = q * u;
/* log_hi + log_lo = log(m) to more than double precision */
log_hi = u;
log_lo = ulo + q;
/* log_hi + log_lo = log(m)+e*log(2)=log(a) to more than double precision */
q = __fma_rn ( e, CUDART_LN2_HI, log_hi);
tmp = __fma_rn (-e, CUDART_LN2_HI, q);
tmp = tmp - log_hi;
log_hi = q;
log_lo = log_lo - tmp;
log_lo = __fma_rn (e, CUDART_LN2_LO, log_lo);
return log_hi + log_lo;
} }
/* Requires |x.y| > |y.y|. 8 DP operations */ /* Requires |x.y| > |y.y|. 8 DP operations */
__device_func__(double2 __internal_ddadd_xgty (double2 x, double2 y)) __device_func__(double2 __internal_ddadd_xgty (double2 x, double2 y))
{ {
double2 z; double2 z;
#if defined(__GNUC__) && !defined(__CUDABE__) #if defined(__GNUC__) && !defined(__CUDABE__)
volatile double r, s, e; volatile
#else
double r, s, e;
#endif #endif
r = x.y + y.y; double r, s, e;
e = x.y - r; r = x.y + y.y;
s = ((e + y.y) + y.x) + x.x; e = x.y - r;
z.y = e = r + s; s = ((e + y.y) + y.x) + x.x;
z.x = (r - e) + s; z.y = e = r + s;
return z; z.x = (r - e) + s;
return z;
} }
/* Take full advantage of FMA. Only 8 DP operations */ /* Take full advantage of FMA. Only 8 DP operations */
__device_func__(double2 __internal_ddmul (double2 x, double2 y)) __device_func__(double2 __internal_ddmul (double2 x, double2 y))
{ {
#if defined(__GNUC__) && !defined(__CUDABE__) #if defined(__GNUC__) && !defined(__CUDABE__)
volatile double e; volatile
#else
double e;
#endif #endif
double2 t, z; double e;
t.y = x.y * y.y; double2 t, z;
t.x = __fma_rn (x.y, y.y, -t.y); t.y = x.y * y.y;
t.x = __fma_rn (x.x, y.x, t.x); t.x = __fma_rn (x.y, y.y, -t.y);
t.x = __fma_rn (x.y, y.x, t.x); t.x = __fma_rn (x.x, y.x, t.x);
t.x = __fma_rn (x.x, y.y, t.x); t.x = __fma_rn (x.y, y.x, t.x);
z.y = e = t.y + t.x; t.x = __fma_rn (x.x, y.y, t.x);
z.x = (t.y - e) + t.x; z.y = e = t.y + t.x;
return z; z.x = (t.y - e) + t.x;
return z;
} }
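
__internal_ddmul is textbook double-double multiplication: fma(hi, hi, -hi*hi) recovers the rounding error of the head product exactly, the cross and tail terms fold in through further FMAs, and a final fast two-sum renormalizes the pair, eight double operations in all. A host-side mirror, with hi/lo corresponding to .y/.x in the header (illustrative):

#include <math.h>
#include <stdio.h>

typedef struct { double hi, lo; } dd;

static dd dd_mul(dd x, dd y)
{
    dd z;
    double t = x.hi * y.hi;
    double e = fma(x.hi, y.hi, -t);  /* exact error of the head product */
    e = fma(x.lo, y.lo, e);
    e = fma(x.hi, y.lo, e);
    e = fma(x.lo, y.hi, e);
    z.hi = t + e;                    /* fast two-sum renormalization */
    z.lo = (t - z.hi) + e;
    return z;
}

int main(void)
{
    dd x = { 1.0 / 3.0, 0.0 }, y = { 3.0, 0.0 };
    dd p = dd_mul(x, y);
    printf("hi=%.17g lo=%.17g\n", p.hi, p.lo); /* ~1 with a tiny tail */
    return 0;
}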
__device_func__(double2 __internal_log_ext_prec(double a))
{
  double2 res;
  double2 qq, cc, uu, tt;
  double f, g, u, v, q, ulo, tmp, m;
  int ilo, ihi, expo;
  ihi = __double2hiint(a);

skipping to change at line 668 (old) / line 675 (new)

    ihi = __double2hiint(a);
    ilo = __double2loint(a);
    expo = (ihi >> 20) & 0x7ff;
    expo -= 54;
  }
  expo -= 1023;
  /* log(a) = log(m*2^expo) =
     log(m) + log(2)*expo,         if m <  sqrt(2),
     log(m*0.5) + log(2)*(expo+1), if m >= sqrt(2)
  */
  ihi = (ihi & 0x800fffff) | 0x3ff00000;
  m = __hiloint2double (ihi, ilo);
  if ((unsigned)ihi > (unsigned)0x3ff6a09e) {
    m = __internal_half(m);
    expo = expo + 1;
  }
  /* compute log(m) with extended precision using an algorithm derived from
   * P.T.P. Tang, "Table Driven Implementation of the Logarithm Function",
   * TOMS, Vol. 16., No. 4, December 1990, pp. 378-400. A modified polynomial
   * approximation to atanh(x) on the interval [-0.1716, 0.1716] is utilized.
   */
  f = m - 1.0;
  g = m + 1.0;

skipping to change at line 737 (old) / line 745 (new)
__device_func__(double __cuda_log10(double a))
{
  double t;
  t = __cuda_log(a);
  return __fma_rn (t, CUDART_LGE_HI, t * CUDART_LGE_LO);
}
__device_func__(double __cuda_log1p(double a))
{
  double t;
  int i;
  i = __double2hiint(a);
  if (((unsigned)i < (unsigned)0x3fe55555) || ((int)i < (int)0xbfd99999)) {
    /* Compute log(a+1) = 2*atanh(a/(a+2)) */
    t = a + 2.0;
    t = a / t;
    t = -a * t;
    t = __internal_atanh_kernel(a, t);
    return t;
  }
  return __cuda_log (a + CUDART_ONE);
}
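For |a| near zero, forming 1.0 + a discards most of a's low-order bits before the logarithm is even taken, which is why the fast path above operates on a directly through the 2*atanh(a/(a+2)) identity. A small host demonstration of the difference, using the C99 libm:

#include <math.h>
#include <stdio.h>

int main(void)
{
    double a = 1e-10;
    /* Naive form: 1.0 + a rounds away most of a's information. */
    printf("log(1+a) = %.17g\n", log(1.0 + a));
    /* log1p keeps full precision near zero. */
    printf("log1p(a) = %.17g\n", log1p(a));
    return 0;
}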
__device_func__(double __internal_exp_kernel(double a, int scale))
{
  double t, fac, z;
  int i;
  /* exp(a) = 2^(rint(a/log(2)) + z) = 2^(i + z) */
  t = __cuda_rint (a * CUDART_L2E);
  i = (int)t;
  z = __fma_rn (t, -CUDART_LN2_HI, a);
  z = __fma_rn (t, -CUDART_LN2_LO, z);
  fac = 2.0;
  if (i <= -1021) {
    i += 55;
    fac = CUDART_TWO_TO_M54;
  }
  /* exp(a) = 2^i * e^z */
  t = __internal_expm1_kernel(z);
  z = __internal_exp2i_kernel(i + scale - 1);
  t = __fma_rn (t, z, z);
  t = t * fac;
  return t;
}
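The kernel reduces a = i*log(2) + z with |z| <= log(2)/2 and reconstructs exp(a) = 2^(i+scale) * e^z; the scale parameter lets callers fold in an extra power of two for free. A host sketch of the same reduction, assuming the usual hi/lo split of log(2) (constant names are illustrative):

#include <math.h>
#include <stdio.h>

static const double LN2_HI = 6.9314718055994529e-01;  /* head of log(2) */
static const double LN2_LO = 2.3190468138462996e-17;  /* tail of log(2) */

int main(void)
{
    double a = 10.5;
    double t = rint(a * 1.4426950408889634);  /* i = rint(a/log(2))   */
    int    i = (int)t;
    double z = fma(t, -LN2_HI, a);            /* z = a - i*log(2) ... */
    z = fma(t, -LN2_LO, z);                   /* ... to ~2x precision */
    printf("%.17g vs %.17g\n", ldexp(exp(z), i), exp(a));
    return 0;
}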
__device_func__(double __cuda_exp(double a))
{
  double t;
  int i;
  i = __double2hiint(a);
  if (((unsigned)i < (unsigned)0x40862e43) || ((int)i < (int)0xC0874911)) {
    t = __internal_exp_kernel(a, 0);
    return t;
  }
  t = ((unsigned int)i >> 31) ? CUDART_ZERO : CUDART_INF;
  if (__cuda___isnan(a)) {
    t = a + a;
  }
  return t;
}
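The new range check compares only the high 32 bits of a: 0x40862e43 sits just above the high word of 709.782... (1024*log(2), where exp overflows) and 0xC0874911 just below that of -745.133... (-1075*log(2), where it underflows to zero), so one pair of integer compares brackets the fast path. A host snippet to decode such bounds; it assumes a little-endian double layout, as the device code does:

#include <stdio.h>
#include <string.h>

/* Extract the high 32 bits of a double, as __double2hiint does. */
static unsigned int hi32(double x)
{
    unsigned long long u;
    memcpy(&u, &x, sizeof u);   /* assumes little-endian layout */
    return (unsigned int)(u >> 32);
}

int main(void)
{
    printf("hi(709.78...)  = 0x%08x\n", hi32(709.78271289338397));  /* 0x40862e42 */
    printf("hi(-745.13...) = 0x%08x\n", hi32(-745.13321910194111)); /* 0xc0874910 */
    return 0;
}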
__device_func__(double __cuda_exp2(double a))
{
  double z;
  double t;
  double fac;
  int i;
  i = __double2hiint(a);
  if (((unsigned)i < (unsigned)0x40900000) || ((int)i < (int)0xc090cc00)) {
    t = __cuda_rint (a);
    z = a - t;
    i = (int)t;
    fac = 2.0;
    if (i <= -1021) {
      i += 55;
      fac = CUDART_TWO_TO_M54;
    }
    /* 2^z = exp(log(2)*z) */
    z = __fma_rn (z, CUDART_LN2_HI, z * CUDART_LN2_LO);
    t = __internal_expm1_kernel(z);
    z = __internal_exp2i_kernel(i - 1);
    t = __fma_rn (t, z, z);
    t = t * fac;
    return t;
  }
  t = ((unsigned int)i >> 31) ? CUDART_ZERO : CUDART_INF;
  if (__cuda___isnan(a)) {
    t = a + a;
  }
  return t;
}
__device_func__(double __cuda_exp10(double a))
{
  double z;
  double t;
  double fac;
  int i;
  i = __double2hiint(a);
  if (((unsigned)i < (unsigned)0x40734414) || ((int)i < (int)0xc07439b8)) {
    t = __cuda_rint (a * CUDART_L2T);
    i = (int)t;
    z = __fma_rn (t, -CUDART_LG2_HI, a);
    z = __fma_rn (t, -CUDART_LG2_LO, z);
    fac = 2.0;
    if (i <= -1021) {
      i += 55;
      fac = CUDART_TWO_TO_M54;
    }
    /* 10^z = exp(log(10)*z) */
    z = __fma_rn (z, CUDART_LNT_HI, z * CUDART_LNT_LO);
    t = __internal_expm1_kernel(z);
    z = __internal_exp2i_kernel(i - 1);
    t = __fma_rn (t, z, z);
    t = t * fac;
    return t;
  }
  t = ((unsigned int)i >> 31) ? CUDART_ZERO : CUDART_INF;
  if (__cuda___isnan(a)) {
    t = a + a;
  }
  return t;
}
__device_func__(double __cuda_expm1(double a))
{
  double t, z, u;
  int i, j, k;
  k = __double2hiint(a);
  if (((unsigned)k < (unsigned)0x40862e43) || ((int)k < (int)0xc04a8000)) {
    t = __cuda_rint (a * CUDART_L2E);
    i = (int)t;
    z = __fma_rn (t, -CUDART_LN2_HI, a);
    z = __fma_rn (t, -CUDART_LN2_LO, z);
    k = k + k;
    if ((unsigned)k < (unsigned)0x7fb3e647) {
      z = a;
      i = 0;
    }
    t = __internal_expm1_kernel(z);
    j = i;
    if (i == 1024) j--;
    u = __internal_exp2i_kernel(j);
    a = u - 1.0;
    t = __fma_rn (t, u, a);
    if (i == 1024) t = t + t;
    if (k == 0) t = z;              /* preserve -0 */
    return t;
  }
  t = ((unsigned int)k >> 31) ? -CUDART_ONE : CUDART_INF;
  if (__cuda___isnan(a)) {
    t = a + a;
  }
  return t;
}
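As with log1p, the point of expm1 is the tiny-argument regime, where exp(a) - 1.0 cancels catastrophically; the kernel keeps z = a and i = 0 there. A quick host illustration:

#include <math.h>
#include <stdio.h>

int main(void)
{
    double a = 1e-12;
    /* exp(a) rounds to 1 + a with almost no fractional information,
     * so subtracting 1.0 leaves a poor result. */
    printf("exp(a)-1 = %.17g\n", exp(a) - 1.0);
    printf("expm1(a) = %.17g\n", expm1(a));
    return 0;
}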
__device_func__(double __cuda_cosh(double a))
{
  double z;
  int i;
  z = __cuda_fabs(a);
  i = __double2hiint(z);
  if ((unsigned)i < (unsigned)0x408633cf) {
    z = __internal_exp_kernel(z, -2);
    z = __fma_rn(2.0, z, 0.125 / z);
    return z;
  } else {
    if (z > 0.0) a = CUDART_INF_F;  /* overflow -> infinity */
    return a + a;
  }
}
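Calling __internal_exp_kernel with scale = -2 yields exp(|a|)/4, so the identity cosh(a) = 2*(e^|a|/4) + 0.125/(e^|a|/4) = e^|a|/2 + e^-|a|/2 evaluates both exponentials from one call while deferring overflow. A host check of the algebra:

#include <math.h>
#include <stdio.h>

int main(void)
{
    double z = 3.0;
    double q = exp(z) * 0.25;           /* exp_kernel(z, -2) = e^z/4 */
    double c = fma(2.0, q, 0.125 / q);  /* e^z/2 + e^-z/2            */
    printf("%.17g vs %.17g\n", c, cosh(z));
    return 0;
}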
__device_func__(double __cuda_sinh(double a))
{
  double s, z;
  s = a;
  a = __cuda_fabs(a);
  if (a < 1.0) { /* danger of catastrophic cancellation */
    double a2 = a * a;
    /* approximate sinh(x) on [0,1] with a polynomial */

skipping to change at line 957 (old) / line 977 (new)
    t = __fma_rn (t, a, a);
    a = __cuda_copysign(t, a);
  }
  return a;
}
__device_func__(double __internal_atan_kernel(double a))
{
  double t, a2;
  a2 = a * a;
  t = -2.0258553044438358E-005;
  t = __fma_rn (t, a2, 2.2302240345758510E-004);
  t = __fma_rn (t, a2, -1.1640717779930576E-003);
  t = __fma_rn (t, a2, 3.8559749383629918E-003);
  t = __fma_rn (t, a2, -9.1845592187165485E-003);
  t = __fma_rn (t, a2, 1.6978035834597331E-002);
  t = __fma_rn (t, a2, -2.5826796814495994E-002);
  t = __fma_rn (t, a2, 3.4067811082715123E-002);
  t = __fma_rn (t, a2, -4.0926382420509971E-002);
  t = __fma_rn (t, a2, 4.6739496199157994E-002);
  t = __fma_rn (t, a2, -5.2392330054601317E-002);

skipping to change at line 1049 (old) / line 1069 (new)

  r = __fma_rn (r, b, 3.038188875134962E-002);
  r = __fma_rn (r, b, 4.464285849810986E-002);
  r = __fma_rn (r, b, 7.499999998342270E-002);
  r = __fma_rn (r, b, 1.666666666667375E-001);
  r = r * b;
  return r;
}
__device_func__(double __cuda_asin(double a))
{
  double fa, t0, t1;
  int ihi, ahi;
  ahi = __double2hiint(a);
  fa = __cuda_fabs(a);
  ihi = __double2hiint(fa);
  if (ihi < 0x3fe26666) {
    t1 = fa * fa;
    t1 = __internal_asin_kernel (fa, t1);
    t1 = __fma_rn (t1, fa, fa);
    t1 = __cuda_copysign(t1, a);
  } else {
    t1 = __fma_rn (-0.5, fa, 0.5);
    t0 = __cuda_sqrt (t1);
    t1 = __internal_asin_kernel (t0, t1);
    t0 = -2.0 * t0;
    t1 = __fma_rn (t0, t1, CUDART_PIO2_LO);
    t0 = t0 + CUDART_PIO4_HI;
    t1 = t0 + t1;
    t1 = t1 + CUDART_PIO4_HI;
    if (ahi < 0x3ff00000) {
      t1 = __cuda_copysign(t1, a);
    }
  }
  return t1;
}
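Below the 0x3fe26666 cut (|a| < ~0.575) a direct polynomial suffices; above it the code relies on asin(x) = pi/2 - 2*asin(sqrt((1-x)/2)), assembled from hi/lo pi constants. A host check of that identity:

#include <math.h>
#include <stdio.h>

int main(void)
{
    /* asin(x) = pi/2 - 2*asin(sqrt((1-x)/2)) for x near 1 */
    double x  = 0.9;
    double t  = sqrt(fma(-0.5, x, 0.5));   /* sqrt((1-x)/2) */
    double pi = acos(-1.0);
    printf("%.17g vs %.17g\n", pi / 2.0 - 2.0 * asin(t), asin(x));
    return 0;
}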
__device_func__(double __cuda_acos(double a))
{
  double t0, t1;
  int ihi, ahi;
#if !defined(__CUDABE__)
  if (__cuda___isnan(a)) {
    return a + a;
  }
#endif
  ahi = __double2hiint(a);
  t0 = __cuda_fabs (a);
  ihi = __double2hiint(t0);
  if (ihi < 0x3fe26666) {
    t1 = t0 * t0;
    t1 = __internal_asin_kernel (t0, t1);
    t0 = __fma_rn (t1, t0, t0);
    if ((unsigned)ahi >= (unsigned)0x80000000) {
      t0 = __fma_rn (1.0, t0, +CUDART_PIO2_LO);
      t0 = CUDART_PIO2_HI + t0;
    } else {
      t0 = __fma_rn (1.0, t0, -CUDART_PIO2_LO);
      t0 = CUDART_PIO2_HI - t0;
    }
  } else {
    t1 = __fma_rn (-0.5, t0, 0.5);
    t0 = __cuda_sqrt(t1);
    t1 = __internal_asin_kernel (t0, t1);
    t0 = __fma_rn (t1, t0, t0);
    t0 = 2.0 * t0;
    if ((unsigned)ahi >= (unsigned)0x80000000) {
      t0 = __fma_rn (1.0, t0, -CUDART_PI_LO);
      t0 = CUDART_PI_HI - t0;
    }
  }
  return t0;
}
__device_func__(double __cuda_acosh(double a))
{
  double t;
#if !defined(__CUDABE__)
  if (__cuda___isnan(a)) {
    return a + a;

skipping to change at line 1382 (old) / line 1410 (new)

    r = __fma_rn (r, q, 5.22397760611847340E-003);
    r = __fma_rn (r, q, -2.68661706431114690E-002);
    r = __fma_rn (r, q, 1.12837916709441850E-001);
    r = __fma_rn (r, q, -3.76126389031835210E-001);
    r = __fma_rn (r, q, 1.12837916709551260E+000);
    a = r * a;
  }
  return a;
}
__device_func__(double __cuda_erfinv(double a))
{
double fa, t;
fa = fabs(a);
if (fa >= 1.0) {
t = CUDART_NAN; /* NaN */
if (fa == 1.0) {
t = a * CUDART_INF; /* Infinity */
}
} else if (fa >= 0.9375) {
/* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev
Approximations for the Inverse of the Error Function. Mathematics of
Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 59
*/
double p, q;
t = __cuda_log1p(-fa);
t = __cuda_rsqrt(-t);
p = 2.7834010353747001060e-3;
p = __fma_rn (p, t, 8.6030097526280260580e-1);
p = __fma_rn (p, t, 2.1371214997265515515e+0);
p = __fma_rn (p, t, 3.1598519601132090206e+0);
p = __fma_rn (p, t, 3.5780402569085996758e+0);
p = __fma_rn (p, t, 1.5335297523989890804e+0);
p = __fma_rn (p, t, 3.4839207139657522572e-1);
p = __fma_rn (p, t, 5.3644861147153648366e-2);
p = __fma_rn (p, t, 4.3836709877126095665e-3);
p = __fma_rn (p, t, 1.3858518113496718808e-4);
p = __fma_rn (p, t, 1.1738352509991666680e-6);
q = t+ 2.2859981272422905412e+0;
q = __fma_rn (q, t, 4.3859045256449554654e+0);
q = __fma_rn (q, t, 4.6632960348736635331e+0);
q = __fma_rn (q, t, 3.9846608184671757296e+0);
q = __fma_rn (q, t, 1.6068377709719017609e+0);
q = __fma_rn (q, t, 3.5609087305900265560e-1);
q = __fma_rn (q, t, 5.3963550303200816744e-2);
q = __fma_rn (q, t, 4.3873424022706935023e-3);
q = __fma_rn (q, t, 1.3858762165532246059e-4);
q = __fma_rn (q, t, 1.1738313872397777529e-6);
t = p / (q * t);
if (a < 0.0) t = -t;
} else if (fa >= 0.75) {
/* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev
Approximations for the Inverse of the Error Function. Mathematics of
Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 39
*/
double p, q;
t = __fma_rn (a, a, -.87890625);
p = .21489185007307062000e+0;
p = __fma_rn (p, t, -.64200071507209448655e+1);
p = __fma_rn (p, t, .29631331505876308123e+2);
p = __fma_rn (p, t, -.47644367129787181803e+2);
p = __fma_rn (p, t, .34810057749357500873e+2);
p = __fma_rn (p, t, -.12954198980646771502e+2);
p = __fma_rn (p, t, .25349389220714893917e+1);
p = __fma_rn (p, t, -.24758242362823355486e+0);
p = __fma_rn (p, t, .94897362808681080020e-2);
q = t -.12831383833953226499e+2;
q = __fma_rn (q, t, .41409991778428888716e+2);
q = __fma_rn (q, t, -.53715373448862143349e+2);
q = __fma_rn (q, t, .33880176779595142685e+2);
q = __fma_rn (q, t, -.11315360624238054876e+2);
q = __fma_rn (q, t, .20369295047216351160e+1);
q = __fma_rn (q, t, -.18611650627372178511e+0);
q = __fma_rn (q, t, .67544512778850945940e-2);
p = p / q;
t = a * p;
} else {
/* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev
Approximations for the Inverse of the Error Function. Mathematics of
Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 18
*/
double p, q;
t = __fma_rn (a, a, -.5625);
p = -.23886240104308755900e+2;
p = __fma_rn (p, t, .45560204272689128170e+3);
p = __fma_rn (p, t, -.22977467176607144887e+4);
p = __fma_rn (p, t, .46631433533434331287e+4);
p = __fma_rn (p, t, -.43799652308386926161e+4);
p = __fma_rn (p, t, .19007153590528134753e+4);
p = __fma_rn (p, t, -.30786872642313695280e+3);
q = t -.83288327901936570000e+2;
q = __fma_rn (q, t, .92741319160935318800e+3);
q = __fma_rn (q, t, -.35088976383877264098e+4);
q = __fma_rn (q, t, .59039348134843665626e+4);
q = __fma_rn (q, t, -.48481635430048872102e+4);
q = __fma_rn (q, t, .18997769186453057810e+4);
q = __fma_rn (q, t, -.28386514725366621129e+3);
p = p / q;
t = a * p;
}
return t;
}
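The three branches of __cuda_erfinv are rational fits from the cited Blair/Edwards/Johnson paper, selected by |a|. For a host-side sanity check one can instead invert erf() by Newton's method; a sketch under the assumption of a C99 libm (erfinv_newton is a hypothetical helper, not part of this header, and the crude starting point is adequate only for moderate |y|):

#include <math.h>
#include <stdio.h>

/* Solve erf(x) = y by Newton iteration; d/dx erf(x) = 2/sqrt(pi)*exp(-x*x). */
static double erfinv_newton(double y)
{
    double x = 0.0;                 /* crude initial guess */
    int k;
    for (k = 0; k < 20; k++) {
        double f  = erf(x) - y;
        double df = 1.1283791670955126 * exp(-x * x);
        x -= f / df;
    }
    return x;
}

int main(void)
{
    printf("erfinv(0.5) ~ %.17g\n", erfinv_newton(0.5)); /* ~0.47693627620446987 */
    return 0;
}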
__device_func__(double __cuda_erfcinv(double a))
{
double t;
#if !defined(__CUDABE__)
if (__cuda___isnan(a)) return a + a;
#endif
if (a <= CUDART_ZERO) {
t = CUDART_NAN;
if (a == CUDART_ZERO) {
t = (1.0 - a) * CUDART_INF;
}
}
else if (a >= 0.0625) {
t = __cuda_erfinv (1.0 - a);
}
else if (a >= 1e-100) {
/* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev
Approximations for the Inverse of the Error Function. Mathematics of
Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 59
*/
double p, q;
t = __cuda_log(a);
t = __cuda_rsqrt(-t);
p = 2.7834010353747001060e-3;
p = __fma_rn (p, t, 8.6030097526280260580e-1);
p = __fma_rn (p, t, 2.1371214997265515515e+0);
p = __fma_rn (p, t, 3.1598519601132090206e+0);
p = __fma_rn (p, t, 3.5780402569085996758e+0);
p = __fma_rn (p, t, 1.5335297523989890804e+0);
p = __fma_rn (p, t, 3.4839207139657522572e-1);
p = __fma_rn (p, t, 5.3644861147153648366e-2);
p = __fma_rn (p, t, 4.3836709877126095665e-3);
p = __fma_rn (p, t, 1.3858518113496718808e-4);
p = __fma_rn (p, t, 1.1738352509991666680e-6);
q = t+ 2.2859981272422905412e+0;
q = __fma_rn (q, t, 4.3859045256449554654e+0);
q = __fma_rn (q, t, 4.6632960348736635331e+0);
q = __fma_rn (q, t, 3.9846608184671757296e+0);
q = __fma_rn (q, t, 1.6068377709719017609e+0);
q = __fma_rn (q, t, 3.5609087305900265560e-1);
q = __fma_rn (q, t, 5.3963550303200816744e-2);
q = __fma_rn (q, t, 4.3873424022706935023e-3);
q = __fma_rn (q, t, 1.3858762165532246059e-4);
q = __fma_rn (q, t, 1.1738313872397777529e-6);
t = p / (q * t);
}
else {
/* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev
Approximations for the Inverse of the Error Function. Mathematics of
Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 82
*/
double p, q;
t = __cuda_log(a);
t = __cuda_rsqrt(-t);
p = 6.9952990607058154858e-1;
p = __fma_rn (p, t, 1.9507620287580568829e+0);
p = __fma_rn (p, t, 8.2810030904462690216e-1);
p = __fma_rn (p, t, 1.1279046353630280005e-1);
p = __fma_rn (p, t, 6.0537914739162189689e-3);
p = __fma_rn (p, t, 1.3714329569665128933e-4);
p = __fma_rn (p, t, 1.2964481560643197452e-6);
p = __fma_rn (p, t, 4.6156006321345332510e-9);
p = __fma_rn (p, t, 4.5344689563209398450e-12);
q = t+ 1.5771922386662040546e+0;
q = __fma_rn (q, t, 2.1238242087454993542e+0);
q = __fma_rn (q, t, 8.4001814918178042919e-1);
q = __fma_rn (q, t, 1.1311889334355782065e-1);
q = __fma_rn (q, t, 6.0574830550097140404e-3);
q = __fma_rn (q, t, 1.3715891988350205065e-4);
q = __fma_rn (q, t, 1.2964671850944981713e-6);
q = __fma_rn (q, t, 4.6156017600933592558e-9);
q = __fma_rn (q, t, 4.5344687377088206783e-12);
t = p / (q * t);
}
return t;
}
__device_func__(double __cuda_erfc(double a))
{
  double p, q, h, l;
  int ahi;
  ahi = __double2hiint(a);
  if (ahi < (int)0x3fe80000) {
    return 1.0 - __cuda_erf(a);
  }
  if (a > 27.3) {
    return 0.0;
  }
  if (ahi < (int)0x40140000) {
    p = 5.64189549785304440E-001;
    p = __fma_rn (p, a, 8.17405083437083490E+000);
    p = __fma_rn (p, a, 5.68958722557864720E+001);
    p = __fma_rn (p, a, 2.42568747802647010E+002);
    p = __fma_rn (p, a, 6.80381374390412930E+002);
    p = __fma_rn (p, a, 1.25873132236024590E+003);
    p = __fma_rn (p, a, 1.43925353963809330E+003);
    p = __fma_rn (p, a, 8.15949420587659230E+002);
    q = a + 1.44881247113239940E+001;
    q = __fma_rn (q, a, 1.01345387970210510E+002);

skipping to change at line 1426 (old) / line 1628 (new)

    p = __fma_rn (p, a, 1.22570382896313600E+001);
    p = __fma_rn (p, a, 6.01884641114116460E+000);
    q = a + 3.62871917534986780E+000;
    q = __fma_rn (q, a, 1.24663395327043550E+001);
    q = __fma_rn (q, a, 2.13927672803974790E+001);
    q = __fma_rn (q, a, 2.72082423532866070E+001);
    q = __fma_rn (q, a, 1.86422906830006700E+001);
    q = __fma_rn (q, a, 6.13809834548870550E+000);
  }
  p = p / q;
  h = a * a;
  l = __fma_rn (a, a, -h);
  q = __internal_exp_kernel(-h, 0);
  q = __fma_rn (l, -q, q);
  p = p * q;
  return p;
}
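The tail of __cuda_erfc now splits the square as h + l, with h = a*a rounded and l its exact fma residual; folding l back in as exp(-h)*(1 - l) removes an error of roughly a*a ulps from exp(-a*a). A host sketch of the same compensation:

#include <math.h>
#include <stdio.h>

int main(void)
{
    double a = 6.1;
    double h = a * a;
    double l = fma(a, a, -h);   /* exact residual of the square */
    double q = exp(-h);
    printf("naive       = %.17g\n", q);
    printf("compensated = %.17g\n", fma(l, -q, q));  /* exp(-h)*(1 - l) */
    return 0;
}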
/* approximate 1.0/(a*gamma(a)) on [-0.5,0.5] */
__device_func__(double __internal_tgamma_kernel(double a))
{
  double t;
  t = -4.42689340712524750E-010;
  t = __fma_rn (t, a, -2.02665918466589540E-007);
 End of changes. 47 change blocks. 225 lines changed or deleted, 428 lines changed or added.

 sm_11_atomic_functions.h

/*
 * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws. Users and possessors of this source code
 * are hereby granted a nonexclusive, royalty-free license to use this code
 * in individual and commercial software.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 257 (old) / line 257 (new)
    __builtin___iAtomicXor(address, val)
#define __uAtomicXor(address, val) \
    __builtin___uAtomicXor(address, val)
#define __iAtomicCAS(address, compare, val) \
    __builtin___iAtomicCAS(address, compare, val)
#define __uAtomicCAS(address, compare, val) \
    __builtin___uAtomicCAS(address, compare, val)
#else /* __MULTI_CORE__ */
extern void CUDARTAPI __cudaMutexOperation(int lock);
#define __cudaAtomicOperation(code) \
__cudaMutexOperation(1); \
code \
__cudaMutexOperation(0);
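With __MULTI_CORE__ off, every emulated atomic below wraps its read-modify-write in __cudaAtomicOperation, which brackets the statements with a runtime-provided global lock (__cudaMutexOperation(1) ... __cudaMutexOperation(0)). A minimal host-side sketch of the same serialization idea, using pthreads and illustrative names rather than the CUDA runtime's internals:

#include <pthread.h>
#include <stdio.h>

/* One global lock serializes all emulated atomics, mirroring what the
 * __cudaAtomicOperation macro achieves via __cudaMutexOperation. */
static pthread_mutex_t atomic_lock = PTHREAD_MUTEX_INITIALIZER;

static int emulated_atomic_add(int *address, int val)
{
    int old;
    pthread_mutex_lock(&atomic_lock);    /* __cudaMutexOperation(1) */
    old = *address;
    *address = old + val;
    pthread_mutex_unlock(&atomic_lock);  /* __cudaMutexOperation(0) */
    return old;
}

int main(void)
{
    int x = 40;
    printf("old=%d new=%d\n", emulated_atomic_add(&x, 2), x);
    return 0;
}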
__device_func__(int __iAtomicAdd(int *address, int val))
{
  int old;
  __cudaAtomicOperation(
    old = *address;
    *address = old + val;
  )
  return old;
}
__device_func__(unsigned int __uAtomicAdd(unsigned int *address, unsigned int val))
{
  unsigned int old;
  __cudaAtomicOperation(
    old = *address;
    *address = old + val;
  )
  return old;
}
__device_func__(int __iAtomicExch(int *address, int val))
{
  int old;
  __cudaAtomicOperation(
    old = *address;
    *address = val;
  )
  return old;
}
__device_func__(unsigned int __uAtomicExch(unsigned int *address, unsigned int val))
{
  unsigned int old;
  __cudaAtomicOperation(
    old = *address;
    *address = val;
  )
  return old;
}
__device_func__(float __fAtomicExch(float *address, float val))
{
  float old;
  __cudaAtomicOperation(
    old = *address;
    *address = val;
  )
  return old;
}
__device_func__(int __iAtomicMin(int *address, int val))
{
  int old;
  __cudaAtomicOperation(
    old = *address;
    *address = old < val ? old : val;
  )
  return old;
}
__device_func__(unsigned int __uAtomicMin(unsigned int *address, unsigned int val))
{
  unsigned int old;
  __cudaAtomicOperation(
    old = *address;
    *address = old < val ? old : val;
  )
  return old;
}
__device_func__(int __iAtomicMax(int *address, int val))
{
  int old;
  __cudaAtomicOperation(
    old = *address;
    *address = old > val ? old : val;
  )
  return old;
}
__device_func__(unsigned int __uAtomicMax(unsigned int *address, unsigned int val))
{
  unsigned int old;
  __cudaAtomicOperation(
    old = *address;
    *address = old > val ? old : val;
  )
  return old;
}
__device_func__(unsigned int __uAtomicInc(unsigned int *address, unsigned int val))
{
  unsigned int old;
  __cudaAtomicOperation(
    old = *address;
    *address = (old >= val) ? 0 : old + 1;
  )
  return old;
}
__device_func__(unsigned int __uAtomicDec(unsigned int *address, unsigned int val))
{
  unsigned int old;
  __cudaAtomicOperation(
    old = *address;
    *address = ((old == 0) | (old > val)) ? val : (old - 1);
  )
  return old;
}
__device_func__(int __iAtomicAnd(int *address, int val))
{
  int old;
  __cudaAtomicOperation(
    old = *address;
    *address = old & val;
  )
  return old;
}
__device_func__(unsigned int __uAtomicAnd(unsigned int *address, unsigned int val))
{
  unsigned int old;
  __cudaAtomicOperation(
    old = *address;
    *address = old & val;
  )
  return old;
}
__device_func__(int __iAtomicOr(int *address, int val))
{
  int old;
  __cudaAtomicOperation(
    old = *address;
    *address = old | val;
  )
  return old;
}
__device_func__(unsigned int __uAtomicOr(unsigned int *address, unsigned int val))
{
  unsigned int old;
  __cudaAtomicOperation(
    old = *address;
    *address = old | val;
  )
  return old;
}
__device_func__(int __iAtomicXor(int *address, int val))
{
  int old;
  __cudaAtomicOperation(
    old = *address;
    *address = old ^ val;
  )
  return old;
}
__device_func__(unsigned int __uAtomicXor(unsigned int *address, unsigned int val))
{
  unsigned int old;
  __cudaAtomicOperation(
    old = *address;
    *address = old ^ val;
  )
  return old;
}
__device_func__(int __iAtomicCAS(int *address, int compare, int val))
{
  int old;
  __cudaAtomicOperation(
    old = *address;
    *address = old == compare ? val : old;
  )
  return old;
}
__device_func__(unsigned int __uAtomicCAS(unsigned int *address, unsigned int compare, unsigned int val))
{
  unsigned int old;
  __cudaAtomicOperation(
    old = *address;
    *address = old == compare ? val : old;
  )
  return old;
}
#undef __cudaAtomicOperation
#endif /* __MULTI_CORE__ */
#endif /* !__CUDABE__ */
#endif /* __cplusplus && __CUDACC__ */
#endif /* !__SM_11_ATOMIC_FUNCTIONS_H__ */

 End of changes. 41 change blocks. 40 lines changed or deleted, 105 lines changed or added.

 sm_12_atomic_functions.h

/*
 * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws. Users and possessors of this source code
 * are hereby granted a nonexclusive, royalty-free license to use this code
 * in individual and commercial software.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 118 (old) / line 118 (new)
#define __ullAtomicAdd(address, val) \
    __builtin___ullAtomicAdd(address, val)
#define __ullAtomicExch(address, val) \
    __builtin___ullAtomicExch(address, val)
#define __ullAtomicCAS(address, compare, val) \
    __builtin___ullAtomicCAS(address, compare, val)
#else /* __MULTI_CORE__ */
extern void CUDARTAPI __cudaMutexOperation(int lock);
#define __cudaAtomicOperation(code) \
__cudaMutexOperation(1); \
code \
__cudaMutexOperation(0);
__device_func__(unsigned long long int __ullAtomicAdd(unsigned long long int *address, unsigned long long int val))
{
  unsigned long long int old;
  __cudaAtomicOperation(
    old = *address;
    *address = old + val;
  )
  return old;
}
__device_func__(unsigned long long int __ullAtomicExch(unsigned long long int *address, unsigned long long int val))
{
  unsigned long long int old;
  __cudaAtomicOperation(
    old = *address;
    *address = val;
  )
  return old;
}
__device_func__(unsigned long long int __ullAtomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val))
{
  unsigned long long int old;
  __cudaAtomicOperation(
    old = *address;
    *address = old == compare ? val : old;
  )
  return old;
}
#undef __cudaAtomicOperation
#endif /* __MULTI_CORE__ */
__device_func__(int __any(int cond))
{
  return cond;
}
__device_func__(int __all(int cond))
{
  return cond;
 End of changes. 9 change blocks. 10 lines changed or deleted, 25 lines changed or added.

 sm_13_double_functions.h

/*
 * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws. Users and possessors of this source code
 * are hereby granted a nonexclusive, royalty-free license to use this code
 * in individual and commercial software.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 265 (old) / line 265 (new)
#include "crt/func_macro.h" #include "crt/func_macro.h"
#if !defined(__CUDABE__) #if !defined(__CUDABE__)
/************************************************************************** ***** /************************************************************************** *****
* * * *
* HOST IMPLEMENTATIONS FOR FUNCTIONS * * HOST IMPLEMENTATIONS FOR FUNCTIONS *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#include "common_types.h"
__device_func__(double __longlong_as_double(long long int a))
{
  volatile union __cudart_DoubleLonglongCvt u;
  u.i = a;
  return u.d;
}
__device_func__(long long int __double_as_longlong(double a))
{
  volatile union __cudart_DoubleLonglongCvt u;
  u.d = a;
  return u.i;
}
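The __cudart_* converter unions declared in common_types.h replace the ad-hoc anonymous unions of the previous release; the reinterpretation trick itself is unchanged, and volatile keeps the compiler from caching the fields across the type pun. A standalone sketch of the same union-based bit copy (the union name here is illustrative, not the header's):

#include <stdio.h>

union dbl_ll { double d; long long int i; };

int main(void)
{
    volatile union dbl_ll u;
    u.d = 1.0;
    /* 1.0 has the bit pattern 0x3ff0000000000000 */
    printf("0x%016llx\n", (unsigned long long)u.i);
    return 0;
}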
__device_func__(float __internal_double2float_kernel(double a))
{
  volatile union __cudart_DoubleUlonglongCvt xx;
  volatile union __cudart_FloatUintCvt res;
  int shift;
  xx.d = a;
  if (xx.i == 0) return 0.0f;
  res.i = (((unsigned int) (xx.i >> 32)) & 0x80000000);
  if ((xx.i & 0x7ff0000000000000ULL) == 0x7ff0000000000000ULL) {
    if ((xx.i & 0x7fffffffffffffffULL) > 0x7ff0000000000000ULL) {
      // NaN
      res.i = 0x7f8fffff;
    } else {
      // Inf
skipping to change at line 333 (old) / line 325 (new)
  } else {
    res.i |= (unsigned int) (127 + shift) << 23;
  }
  res.i |= ((unsigned int) (xx.i >> 29)) & 0x007fffff;
  xx.i &= 0x1fffffff;
  return res.f;
}
__device_func__(double __internal_ll2double_kernel(long long int a, enum cudaRoundMode rndMode))
{
  volatile union __cudart_DoubleUlonglongCvt res;
  int shift;
  unsigned int t;
  res.i = a;
  if (a == 0) return res.d;
  if (a < 0) res.i = (unsigned long long int)-a;
  shift = __internal_normalize64((unsigned long long int*)&res.i);
  t = ((unsigned int) res.i) << 21;
  res.i >>= 11;
  res.i += ((unsigned long long int)(1023 + 62 - shift)) << 52;
  if (a < 0) res.i |= 0x8000000000000000ULL;

skipping to change at line 361 (old) / line 350 (new)

    res.i++;
  }
  else if ((rndMode == cudaRoundPosInf) && t && (a > 0)) {
    res.i++;
  }
  return res.d;
}
__device_func__(double __internal_ull2double_kernel(unsigned long long int a, enum cudaRoundMode rndMode))
{
  volatile union __cudart_DoubleUlonglongCvt res;
  int shift;
  unsigned int t;
  res.i = a;
  if (a == 0) return res.d;
  shift = __internal_normalize64((unsigned long long int *)&res.i);
  t = ((unsigned int) res.i) << 21;
  res.i >>= 11;
  res.i += ((unsigned long long int)(1023 + 62 - shift)) << 52;
  if ((rndMode == cudaRoundNearest) && (t >= 0x80000000)) {
    res.i += (t == 0x80000000) ? (res.i & 1) : 1;
  }
  else if ((rndMode == cudaRoundPosInf) && t) {
    res.i++;
  }
  return res.d;
}
__device_func__(long long int __internal_double2ll_kernel(double a, long long int max, long long int min, long long int nan, enum cudaRoundMode rndMode))
{
  volatile union __cudart_DoubleUlonglongCvt xx, res;
  unsigned long long int t = 0;
  int shift;
  xx.d = a;
  __internal_clamp(a, max, min, nan);
  shift = (int) (1023 + 62 - ((xx.i >> 52) & 0x7ff));
  res.i = ((xx.i << 11) | 0x8000000000000000ULL) >> 1;
  if (shift >= 64) {
    t = res.i;
    res.i = 0;

skipping to change at line 420 (old) / line 403 (new)

    res.i++;
  }
  if ((long long int)xx.i < 0) {
    res.i = (unsigned long long int)(-(long long int)res.i);
  }
  return res.i;
}
__device_func__(unsigned long long int __internal_double2ull_kernel(double a, unsigned long long int max, unsigned long long int nan, enum cudaRoundMode rndMode))
{
  volatile union __cudart_DoubleUlonglongCvt xx, res;
  unsigned long long int t = 0;
  int shift;
  xx.d = a;
  __internal_clamp(a, max, 0LL, nan);
  if (a == 0.0) return 0LL;
  shift = (int) (1023 + 63 - ((xx.i >> 52) & 0x7ff));
  res.i = ((xx.i << 11) | 0x8000000000000000ULL);
  if (shift >= 64) {
    t = res.i >> (int)(shift > 64);
    res.i = 0;
  } else if (shift) {

skipping to change at line 450 (old) / line 431 (new)

    res.i += (t == 0x8000000000000000ULL) ? (res.i & 1) : 1;
  }
  else if ((rndMode == cudaRoundPosInf) && t) {
    res.i++;
  }
  return res.i;
}
__device_func__(int __double2hiint(double a))
{
  volatile union __cudart_DoubleInthiloCvt cvt;
  cvt.d = a;
  return cvt.i[1];
}
__device_func__(int __double2loint(double a))
{
  volatile union __cudart_DoubleInthiloCvt cvt;
  cvt.d = a;
  return cvt.i[0];
}
__device_func__(double __hiloint2double(int a, int b))
{
  volatile union __cudart_DoubleInthiloCvt cvt;
  cvt.i[0] = b;
  cvt.i[1] = a;
  return cvt.d;
}
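__double2hiint/__double2loint assume the converter union's i[2] layout puts the high word at index 1, i.e. a little-endian host. A quick standalone check of that convention (a sketch, not the header's code):

#include <stdio.h>
#include <string.h>

int main(void)
{
    double d = -0.0;            /* only the sign bit is set */
    int w[2];
    memcpy(w, &d, sizeof w);
    /* on little-endian targets: hi=0x80000000 lo=0x00000000 */
    printf("hi=0x%08x lo=0x%08x\n", (unsigned)w[1], (unsigned)w[0]);
    return 0;
}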
__device_func__(float __double2float_rn(double a))
{
  return (float)a;
}
__device_func__(float __double2float_rz(double a))
{

skipping to change at line 628 (old) / line 597 (new)

}
__device_func__(double __ull2double_ru(unsigned long long int a))
{
  return __internal_ull2double_kernel(a, cudaRoundPosInf);
}
#endif /* !__CUDABE__ */
#if !defined(__CUDABE__) || defined(CUDA_NO_SM_13_DOUBLE_INTRINSICS)
#include "common_types.h"
__device_func__(double __internal_fma_kernel(double x, double y, double z, enum cudaRoundMode rndMode))
{
#ifdef __MULTI_CORE__
  volatile
#endif /* __MULTI_CORE__ */
  struct __cudart_UintUint xx, yy, zz, ww;
  unsigned int s, t, u, prod0, prod1, prod2, prod3, expo_x, expo_y, expo_z;
  xx.hi = __double2hiint(x);
  xx.lo = __double2loint(x);
  yy.hi = __double2hiint(y);
  yy.lo = __double2loint(y);
  zz.hi = __double2hiint(z);
  zz.lo = __double2loint(z);
  expo_z = 0x7FF;
 End of changes. 18 change blocks. 55 lines changed or deleted, 21 lines changed or added.

 storage_class.h

/*
 * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws. Users and possessors of this source code
 * are hereby granted a nonexclusive, royalty-free license to use this code
 * in individual and commercial software.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR

 End of changes. 1 change block. 1 line changed or deleted, 1 line changed or added.

 texture_fetch_functions.h

/*
 * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws. Users and possessors of this source code
 * are hereby granted a nonexclusive, royalty-free license to use this code
 * in individual and commercial software.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR

 End of changes. 1 change block. 1 line changed or deleted, 1 line changed or added.

 texture_types.h

/*
 * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws. Users and possessors of this source code
 * are hereby granted a nonexclusive, royalty-free license to use this code
 * in individual and commercial software.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR

 End of changes. 1 change block. 1 line changed or deleted, 1 line changed or added.

 vector_functions.h

/*
 * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws. Users and possessors of this source code
 * are hereby granted a nonexclusive, royalty-free license to use this code
 * in individual and commercial software.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR

 End of changes. 1 change block. 1 line changed or deleted, 1 line changed or added.

 vector_types.h

/*
 * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws. Users and possessors of this source code
 * are hereby granted a nonexclusive, royalty-free license to use this code
 * in individual and commercial software.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR

 End of changes. 1 change block. 1 line changed or deleted, 1 line changed or added.
This html diff was produced by rfcdiff 1.41. The latest version is available from http://tools.ietf.org/tools/rfcdiff/