| cublas.h | | cublas.h | |
| | | | |
| skipping to change at line 94 | | skipping to change at line 94 | |
| #define CUBLASAPI __stdcall | | #define CUBLASAPI __stdcall | |
| #else | | #else | |
| #define CUBLASAPI | | #define CUBLASAPI | |
| #endif | | #endif | |
| #endif | | #endif | |
| | | | |
| #if defined(__cplusplus) | | #if defined(__cplusplus) | |
| extern "C" { | | extern "C" { | |
| #endif /* __cplusplus */ | | #endif /* __cplusplus */ | |
| | | | |
|
| | | #include "driver_types.h" | |
| #include "cuComplex.h" /* import complex data type */ | | #include "cuComplex.h" /* import complex data type */ | |
| | | | |
| /* CUBLAS status returns */ | | /* CUBLAS status returns */ | |
| #define CUBLAS_STATUS_SUCCESS 0x00000000 | | #define CUBLAS_STATUS_SUCCESS 0x00000000 | |
| #define CUBLAS_STATUS_NOT_INITIALIZED 0x00000001 | | #define CUBLAS_STATUS_NOT_INITIALIZED 0x00000001 | |
| #define CUBLAS_STATUS_ALLOC_FAILED 0x00000003 | | #define CUBLAS_STATUS_ALLOC_FAILED 0x00000003 | |
| #define CUBLAS_STATUS_INVALID_VALUE 0x00000007 | | #define CUBLAS_STATUS_INVALID_VALUE 0x00000007 | |
| #define CUBLAS_STATUS_ARCH_MISMATCH 0x00000008 | | #define CUBLAS_STATUS_ARCH_MISMATCH 0x00000008 | |
| #define CUBLAS_STATUS_MAPPING_ERROR 0x0000000B | | #define CUBLAS_STATUS_MAPPING_ERROR 0x0000000B | |
| #define CUBLAS_STATUS_EXECUTION_FAILED 0x0000000D | | #define CUBLAS_STATUS_EXECUTION_FAILED 0x0000000D | |
| | | | |
| skipping to change at line 289 | | skipping to change at line 290 | |
| * ------------- | | * ------------- | |
 | * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized | | * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized | |
 | * CUBLAS_STATUS_INVALID_VALUE if rows, cols, elemSize, lda, or ldb <= 0 | | * CUBLAS_STATUS_INVALID_VALUE if rows, cols, elemSize, lda, or ldb <= 0 | |
| * CUBLAS_STATUS_MAPPING_ERROR if error occurred accessing GPU memory | | * CUBLAS_STATUS_MAPPING_ERROR if error occurred accessing GPU memory | |
| * CUBLAS_STATUS_SUCCESS if the operation completed successfully | | * CUBLAS_STATUS_SUCCESS if the operation completed successfully | |
| */ | | */ | |
| cublasStatus CUBLASAPI cublasGetMatrix (int rows, int cols, int elemSize, | | cublasStatus CUBLASAPI cublasGetMatrix (int rows, int cols, int elemSize, | |
| const void *A, int lda, void *B, | | const void *A, int lda, void *B, | |
| int ldb); | | int ldb); | |
| | | | |
|
| | | /* | |
| | | * cublasStatus | |
| | | * cublasSetKernelStream ( cudaStream_t stream ) | |
| | | * | |
 | | | * set the CUBLAS stream in which all subsequent CUBLAS kernel launches will run. | |
 | | | * By default, if the CUBLAS stream is not set, all kernels will use the NULL | |
 | | | * stream. This routine can be used to change the stream between kernel launches | |
 | | | * and can also be used to set the CUBLAS stream back to NULL. | |
 | | | * | |
 | | | * Return Values | |
 | | | * ------------- | |
 | | | * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized | |
| | | * CUBLAS_STATUS_SUCCESS if stream set successfully | |
| | | */ | |
| | | cublasStatus CUBLASAPI cublasSetKernelStream (cudaStream_t stream); | |
| | | | |
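A minimal usage sketch for the routine above (assumed setup: cublasInit() has
already succeeded, and d_x, d_y are device vectors of length n; the Saxpy call
is only an illustrative kernel launch):

    cudaStream_t stream;
    cudaStreamCreate(&stream);
    cublasSetKernelStream(stream);         /* BLAS kernels now launch in 'stream' */
    cublasSaxpy(n, 2.0f, d_x, 1, d_y, 1);  /* y = 2*x + y, queued asynchronously  */
    cudaStreamSynchronize(stream);         /* wait for the queued BLAS work       */
    cublasSetKernelStream(NULL);           /* restore the default NULL stream     */
    cudaStreamDestroy(stream);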
| | | /* | |
| | | * cublasStatus | |
| | | * cublasSetVectorAsync ( int n, int elemSize, const void *x, int incx, | |
| | | * void *y, int incy, cudaStream_t stream ); | |
| | | * | |
 | | | * cublasSetVectorAsync has the same functionality as cublasSetVector, | |
 | | | * but the transfer is done asynchronously within the CUDA stream passed | |
 | | | * as a parameter. | |
| | | * | |
| | | * Return Values | |
| | | * ------------- | |
 | | | * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized | |
| | | * CUBLAS_STATUS_INVALID_VALUE if incx, incy, or elemSize <= 0 | |
| | | * CUBLAS_STATUS_MAPPING_ERROR if an error occurred accessing GPU memory | |
| | | * CUBLAS_STATUS_SUCCESS if the operation completed successfully | |
| | | */ | |
| | | cublasStatus CUBLASAPI cublasSetVectorAsync (int n, int elemSize, | |
| | | const void *hostPtr, int incx, | |
| | | void *devicePtr, int incy, | |
| | | cudaStream_t stream); | |
| | | /* | |
| | | * cublasStatus | |
| | | * cublasGetVectorAsync( int n, int elemSize, const void *x, int incx, | |
| | | * void *y, int incy, cudaStream_t stream) | |
| | | * | |
 | | | * cublasGetVectorAsync has the same functionality as cublasGetVector, | |
 | | | * but the transfer is done asynchronously within the CUDA stream passed | |
 | | | * as a parameter. | |
| | | * | |
| | | * Return Values | |
| | | * ------------- | |
 | | | * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized | |
| | | * CUBLAS_STATUS_INVALID_VALUE if incx, incy, or elemSize <= 0 | |
| | | * CUBLAS_STATUS_MAPPING_ERROR if an error occurred accessing GPU memory | |
| | | * CUBLAS_STATUS_SUCCESS if the operation completed successfully | |
| | | */ | |
| | | cublasStatus CUBLASAPI cublasGetVectorAsync (int n, int elemSize, | |
 | | | const void *devicePtr, int incx, | |
| | | void *hostPtr, int incy, | |
| | | cudaStream_t stream); | |
| | | | |
| | | /* | |
| | | * cublasStatus | |
| | | * cublasSetMatrixAsync (int rows, int cols, int elemSize, const void *A, | |
| | | * int lda, void *B, int ldb, cudaStream_t stream) | |
| | | * | |
 | | | * cublasSetMatrixAsync has the same functionality as cublasSetMatrix, | |
 | | | * but the transfer is done asynchronously within the CUDA stream passed | |
 | | | * as a parameter. | |
| | | * | |
| | | * Return Values | |
| | | * ------------- | |
 | | | * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized | |
| | | * CUBLAS_STATUS_INVALID_VALUE if rows or cols < 0, or elemSize, lda, or | |
| | | * ldb <= 0 | |
| | | * CUBLAS_STATUS_MAPPING_ERROR if error occurred accessing GPU memory | |
| | | * CUBLAS_STATUS_SUCCESS if the operation completed successfully | |
| | | */ | |
 | | | cublasStatus CUBLASAPI cublasSetMatrixAsync (int rows, int cols, int elemSize, | |
 | | | const void *A, int lda, void *B, | |
| | | int ldb, cudaStream_t stream); | |
| | | | |
| | | /* | |
| | | * cublasStatus | |
| | | * cublasGetMatrixAsync (int rows, int cols, int elemSize, const void *A, | |
| | | * int lda, void *B, int ldb, cudaStream_t stream) | |
| | | * | |
 | | | * cublasGetMatrixAsync has the same functionality as cublasGetMatrix, | |
 | | | * but the transfer is done asynchronously within the CUDA stream passed | |
 | | | * as a parameter. | |
| | | * | |
| | | * Return Values | |
| | | * ------------- | |
 | | | * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized | |
 | | | * CUBLAS_STATUS_INVALID_VALUE if rows, cols, elemSize, lda, or ldb <= 0 | |
| | | * CUBLAS_STATUS_MAPPING_ERROR if error occurred accessing GPU memory | |
| | | * CUBLAS_STATUS_SUCCESS if the operation completed successfully | |
| | | */ | |
 | | | cublasStatus CUBLASAPI cublasGetMatrixAsync (int rows, int cols, int elemSize, | |
 | | | const void *A, int lda, void *B, | |
| | | int ldb, cudaStream_t stream); | |
| | | | |
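A transfer sketch for the async routines above (assumed setup: hostA was
allocated with cudaMallocHost so the copy can truly overlap, devA is a device
matrix with leading dimension lda, and stream is an already-created stream):

    cublasStatus s;
    s = cublasSetMatrixAsync(rows, cols, sizeof(float),
                             hostA, lda, devA, lda, stream);
    if (s == CUBLAS_STATUS_SUCCESS) {
        /* ... enqueue kernels in 'stream' or do host work here ... */
        cudaStreamSynchronize(stream);  /* hostA may be reused after this */
    }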
 | /* ---------------- CUBLAS single-precision BLAS1 functions ---------------- */ | | /* ---------------- CUBLAS single-precision BLAS1 functions ---------------- */ | |
| | | | |
| /* | | /* | |
| * int | | * int | |
| * cublasIsamax (int n, const float *x, int incx) | | * cublasIsamax (int n, const float *x, int incx) | |
| * | | * | |
| * finds the smallest index of the maximum magnitude element of single | | * finds the smallest index of the maximum magnitude element of single | |
| * precision vector x; that is, the result is the first i, i = 0 to n - 1, | | * precision vector x; that is, the result is the first i, i = 0 to n - 1, | |
 | * that maximizes abs(x[1 + i * incx]). | | * that maximizes abs(x[1 + i * incx]). | |
| * | | * | |
| | | | |
| skipping to change at line 574 | | skipping to change at line 673 | |
| * The quantity r = (+/-) sqrt (sa^2 + sb^2) overwrites sa in storage. The | | * The quantity r = (+/-) sqrt (sa^2 + sb^2) overwrites sa in storage. The | |
| * value of sb is overwritten by a value z which allows sc and ss to be | | * value of sb is overwritten by a value z which allows sc and ss to be | |
| * recovered by the following algorithm: | | * recovered by the following algorithm: | |
| * | | * | |
| * if z=1 set sc = 0.0 and ss = 1.0 | | * if z=1 set sc = 0.0 and ss = 1.0 | |
| * if abs(z) < 1 set sc = sqrt(1-z^2) and ss = z | | * if abs(z) < 1 set sc = sqrt(1-z^2) and ss = z | |
| * if abs(z) > 1 set sc = 1/z and ss = sqrt(1-sc^2) | | * if abs(z) > 1 set sc = 1/z and ss = sqrt(1-sc^2) | |
| * | | * | |
| * The function srot (n, x, incx, y, incy, sc, ss) normally is called next | | * The function srot (n, x, incx, y, incy, sc, ss) normally is called next | |
| * to apply the transformation to a 2 x n matrix. | | * to apply the transformation to a 2 x n matrix. | |
|
| * Note that is function is provided for completeness and run exclusively | | * Note that this function is provided for completeness and run exclusively | |
| * on the Host. | | * on the Host. | |
| * | | * | |
| * Input | | * Input | |
| * ----- | | * ----- | |
| * sa single precision scalar | | * sa single precision scalar | |
| * sb single precision scalar | | * sb single precision scalar | |
| * | | * | |
| * Output | | * Output | |
| * ------ | | * ------ | |
| * sa single precision r | | * sa single precision r | |
| | | | |
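The z-to-(sc, ss) reconstruction above transcribes directly into C; a small
host-side sketch:

    #include <math.h>

    static void srotg_recover(float z, float *sc, float *ss)
    {
        if (z == 1.0f) {               /* z = 1      */
            *sc = 0.0f;  *ss = 1.0f;
        } else if (fabsf(z) < 1.0f) {  /* abs(z) < 1 */
            *sc = sqrtf(1.0f - z * z);  *ss = z;
        } else {                       /* abs(z) > 1 */
            *sc = 1.0f / z;  *ss = sqrtf(1.0f - *sc * *sc);
        }
    }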
| skipping to change at line 662 | | skipping to change at line 761 | |
| * | | * | |
| * sflag = -1.0f sflag = 0.0f sflag = 1.0f sflag = -2.0f | | * sflag = -1.0f sflag = 0.0f sflag = 1.0f sflag = -2.0f | |
| * | | * | |
| * (sh00 sh01) (1.0f sh01) (sh00 1.0f) (1.0f 0.0f) | | * (sh00 sh01) (1.0f sh01) (sh00 1.0f) (1.0f 0.0f) | |
| * h = ( ) ( ) ( ) ( ) | | * h = ( ) ( ) ( ) ( ) | |
| * (sh10 sh11) (sh10 1.0f) (-1.0f sh11) (0.0f 1.0f) | | * (sh10 sh11) (sh10 1.0f) (-1.0f sh11) (0.0f 1.0f) | |
| * | | * | |
| * sparam[1] through sparam[4] contain sh00, sh10, sh01, sh11, | | * sparam[1] through sparam[4] contain sh00, sh10, sh01, sh11, | |
| * respectively. Values of 1.0f, -1.0f, or 0.0f implied by the value | | * respectively. Values of 1.0f, -1.0f, or 0.0f implied by the value | |
| * of sflag are not stored in sparam. | | * of sflag are not stored in sparam. | |
|
| * Note that is function is provided for completeness and run exclusively | | * Note that this function is provided for completeness and run exclusively | |
| * on the Host. | | * on the Host. | |
| * | | * | |
| * Input | | * Input | |
| * ----- | | * ----- | |
| * sd1 single precision scalar | | * sd1 single precision scalar | |
| * sd2 single precision scalar | | * sd2 single precision scalar | |
| * sx1 single precision scalar | | * sx1 single precision scalar | |
| * sy1 single precision scalar | | * sy1 single precision scalar | |
| * | | * | |
| * Output | | * Output | |
| | | | |
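Unpacking sparam into the full 2 x 2 matrix h is mechanical; a sketch that
fills in the 1.0f, -1.0f, and 0.0f entries implied by sflag:

    static void srotm_unpack(const float sparam[5], float h[2][2])
    {
        float sflag = sparam[0];
        h[0][0] = sparam[1];  h[1][0] = sparam[2];  /* sh00, sh10 */
        h[0][1] = sparam[3];  h[1][1] = sparam[4];  /* sh01, sh11 */
        if (sflag ==  0.0f) { h[0][0] = 1.0f;  h[1][1] = 1.0f; }
        if (sflag ==  1.0f) { h[0][1] = 1.0f;  h[1][0] = -1.0f; }
        if (sflag == -2.0f) { h[0][0] = 1.0f;  h[0][1] = 0.0f;
                              h[1][0] = 0.0f;  h[1][1] = 1.0f; }
    }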
| skipping to change at line 899 | | skipping to change at line 998 | |
| * | | * | |
| * ( sc cs ) | | * ( sc cs ) | |
| * G = ( ) , sc^2 + cabs(cs)^2 = 1, | | * G = ( ) , sc^2 + cabs(cs)^2 = 1, | |
| * (-cs sc ) | | * (-cs sc ) | |
| * | | * | |
| * which zeros the second entry of the complex 2-vector transpose(ca, cb). | | * which zeros the second entry of the complex 2-vector transpose(ca, cb). | |
| * | | * | |
| * The quantity ca/cabs(ca)*norm(ca,cb) overwrites ca in storage. The | | * The quantity ca/cabs(ca)*norm(ca,cb) overwrites ca in storage. The | |
| * function crot (n, x, incx, y, incy, sc, cs) is normally called next | | * function crot (n, x, incx, y, incy, sc, cs) is normally called next | |
| * to apply the transformation to a 2 x n matrix. | | * to apply the transformation to a 2 x n matrix. | |
|
| * Note that is function is provided for completeness and run exclusively | | * Note that this function is provided for completeness and run exclusively | |
| * on the Host. | | * on the Host. | |
| * | | * | |
| * Input | | * Input | |
| * ----- | | * ----- | |
 | * ca single-precision complex scalar | | * ca single-precision complex scalar | |
| * cb single-precision complex scalar | | * cb single-precision complex scalar | |
| * | | * | |
| * Output | | * Output | |
| * ------ | | * ------ | |
| * ca single-precision complex ca/cabs(ca)*norm(ca,cb) | | * ca single-precision complex ca/cabs(ca)*norm(ca,cb) | |
| | | | |
| skipping to change at line 1498 | | skipping to change at line 1597 | |
| * | | * | |
| * ( sc cs ) | | * ( sc cs ) | |
| * G = ( ) , sc^2 + cabs(cs)^2 = 1, | | * G = ( ) , sc^2 + cabs(cs)^2 = 1, | |
| * (-cs sc ) | | * (-cs sc ) | |
| * | | * | |
| * which zeros the second entry of the complex 2-vector transpose(ca, cb). | | * which zeros the second entry of the complex 2-vector transpose(ca, cb). | |
| * | | * | |
| * The quantity ca/cabs(ca)*norm(ca,cb) overwrites ca in storage. The | | * The quantity ca/cabs(ca)*norm(ca,cb) overwrites ca in storage. The | |
| * function crot (n, x, incx, y, incy, sc, cs) is normally called next | | * function crot (n, x, incx, y, incy, sc, cs) is normally called next | |
| * to apply the transformation to a 2 x n matrix. | | * to apply the transformation to a 2 x n matrix. | |
|
| * Note that is function is provided for completeness and run exclusively | | * Note that this function is provided for completeness and run exclusively | |
| * on the Host. | | * on the Host. | |
| * | | * | |
| * Input | | * Input | |
| * ----- | | * ----- | |
 | * ca double-precision complex scalar | | * ca double-precision complex scalar | |
| * cb double-precision complex scalar | | * cb double-precision complex scalar | |
| * | | * | |
| * Output | | * Output | |
| * ------ | | * ------ | |
| * ca double-precision complex ca/cabs(ca)*norm(ca,cb) | | * ca double-precision complex ca/cabs(ca)*norm(ca,cb) | |
| | | | |
| skipping to change at line 3398 | | skipping to change at line 3497 | |
| * | | * | |
| * Error Status | | * Error Status | |
| * ------------ | | * ------------ | |
 | * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized | | * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized | |
| * CUBLAS_STATUS_INVALID_VALUE if k or n < 0, or if incx or incy == 0 | | * CUBLAS_STATUS_INVALID_VALUE if k or n < 0, or if incx or incy == 0 | |
| * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU | | * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU | |
| */ | | */ | |
| void CUBLASAPI cublasChbmv (char uplo, int n, int k, cuComplex alpha, | | void CUBLASAPI cublasChbmv (char uplo, int n, int k, cuComplex alpha, | |
 | const cuComplex *A, int lda, const cuComplex *x, | | const cuComplex *A, int lda, const cuComplex *x, | |
 | int incx, cuComplex beta, cuComplex *y, int incy); | | int incx, cuComplex beta, cuComplex *y, int incy); | |
|
| | | | |
 | | | /* | |
 | | | * cublasStatus | |
 | | | * cublasChpmv (char uplo, int n, cuComplex alpha, const cuComplex *AP, const cuComplex *x, | |
 | | | * int incx, cuComplex beta, cuComplex *y, int incy) | |
 | | | * | |
 | | | * performs the matrix-vector operation | |
 | | | * | |
 | | | * y = alpha * A * x + beta * y | |
 | | | * | |
 | | | * Alpha and beta are single precision complex scalars, and x and y are single | |
 | | | * precision complex vectors with n elements. A is an hermitian n x n matrix | |
 | | | * consisting of single precision complex elements that is supplied in packed form. | |
 | | | * | |
 | | | * Input | |
 | | | * ----- | |
 | | | * uplo specifies whether the matrix data is stored in the upper or the lower | |
 | | | * triangular part of array AP. If uplo == 'U' or 'u', then the upper | |
 | | | * triangular part of A is supplied in AP. If uplo == 'L' or 'l', then | |
 | | | * the lower triangular part of A is supplied in AP. | |
 | | | * n specifies the number of rows and columns of the matrix A. It must be | |
 | | | * at least zero. | |
 | | | * alpha single precision complex scalar multiplier applied to A*x. | |
 | | | * AP single precision complex array with at least ((n * (n + 1)) / 2) elements. If | |
 | | | * uplo == 'U' or 'u', the array AP contains the upper triangular part | |
 | | | * of the hermitian matrix A, packed sequentially, column by column; | |
 | | | * that is, if i <= j, then A[i,j] is stored in AP[i+(j*(j+1)/2)]. If | |
 | | | * uplo == 'L' or 'l', the array AP contains the lower triangular part | |
 | | | * of the hermitian matrix A, packed sequentially, column by column; | |
 | | | * that is, if i >= j, then A[i,j] is stored in AP[i+((2*n-j+1)*j)/2]. | |
 | | | * The imaginary parts of the diagonal elements need not be set; they | |
 | | | * are assumed to be zero. | |
 | | | * x single precision complex array of length at least (1 + (n - 1) * abs(incx)). | |
 | | | * incx storage spacing between elements of x. incx must not be zero. | |
 | | | * beta single precision complex scalar multiplier applied to vector y. | |
 | | | * y single precision array of length at least (1 + (n - 1) * abs(incy)). | |
 | | | * If beta is zero, y is not read. | |
 | | | * incy storage spacing between elements of y. incy must not be zero. | |
 | | | * | |
 | | | * Output | |
 | | | * ------ | |
 | | | * y updated according to y = alpha*A*x + beta*y | |
 | | | * | |
 | | | * Reference: http://www.netlib.org/blas/chpmv.f | |
 | | | * | |
 | | | * Error status for this function can be retrieved via cublasGetError(). | |
 | | | * | |
 | | | * Error Status | |
 | | | * ------------ | |
 | | | * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized | |
 | | | * CUBLAS_STATUS_INVALID_VALUE if n < 0, or if incx or incy == 0 | |
 | | | * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU | |
 | | | */ | |
| void CUBLASAPI cublasChpmv (char uplo, int n, cuComplex alpha, | | void CUBLASAPI cublasChpmv (char uplo, int n, cuComplex alpha, | |
 | const cuComplex *AP, const cuComplex *x, int incx, | | const cuComplex *AP, const cuComplex *x, int incx, | |
| cuComplex beta, cuComplex *y, int incy); | | cuComplex beta, cuComplex *y, int incy); | |
| | | | |
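The two packed-storage formulas quoted above translate into index helpers
like the following sketch (i, j, and n follow the comment's conventions):

    static int ap_index_upper(int i, int j)        /* uplo = 'U', i <= j */
    {
        return i + (j * (j + 1)) / 2;
    }

    static int ap_index_lower(int i, int j, int n) /* uplo = 'L', i >= j */
    {
        return i + ((2 * n - j + 1) * j) / 2;
    }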
| /* | | /* | |
| * | | * | |
 | * cublasCtrmv (char uplo, char trans, char diag, int n, const cuComplex *A, | | * cublasCtrmv (char uplo, char trans, char diag, int n, const cuComplex *A, | |
| * int lda, cuComplex *x, int incx); | | * int lda, cuComplex *x, int incx); | |
| * | | * | |
| * performs one of the matrix-vector operations x = op(A) * x, | | * performs one of the matrix-vector operations x = op(A) * x, | |
| | | | |
| skipping to change at line 5494 | | skipping to change at line 5646 | |
| * The quantity r = (+/-) sqrt (sa^2 + sb^2) overwrites sa in storage. The | | * The quantity r = (+/-) sqrt (sa^2 + sb^2) overwrites sa in storage. The | |
| * value of sb is overwritten by a value z which allows sc and ss to be | | * value of sb is overwritten by a value z which allows sc and ss to be | |
| * recovered by the following algorithm: | | * recovered by the following algorithm: | |
| * | | * | |
| * if z=1 set sc = 0.0 and ss = 1.0 | | * if z=1 set sc = 0.0 and ss = 1.0 | |
| * if abs(z) < 1 set sc = sqrt(1-z^2) and ss = z | | * if abs(z) < 1 set sc = sqrt(1-z^2) and ss = z | |
| * if abs(z) > 1 set sc = 1/z and ss = sqrt(1-sc^2) | | * if abs(z) > 1 set sc = 1/z and ss = sqrt(1-sc^2) | |
| * | | * | |
| * The function drot (n, x, incx, y, incy, sc, ss) normally is called next | | * The function drot (n, x, incx, y, incy, sc, ss) normally is called next | |
| * to apply the transformation to a 2 x n matrix. | | * to apply the transformation to a 2 x n matrix. | |
|
| * Note that is function is provided for completeness and run exclusively | | * Note that this function is provided for completeness and run exclusively | |
| * on the Host. | | * on the Host. | |
| * | | * | |
| * Input | | * Input | |
| * ----- | | * ----- | |
| * sa double-precision scalar | | * sa double-precision scalar | |
| * sb double-precision scalar | | * sb double-precision scalar | |
| * | | * | |
| * Output | | * Output | |
| * ------ | | * ------ | |
| * sa double-precision r | | * sa double-precision r | |
| | | | |
| skipping to change at line 5535 | | skipping to change at line 5687 | |
| * The elements of x are in x[lx + i * incx], i = 0 to n-1, where lx = 1 if | | * The elements of x are in x[lx + i * incx], i = 0 to n-1, where lx = 1 if | |
 | * incx >= 0, else lx = 1 + (1 - n) * incx, and similarly for y using ly and | | * incx >= 0, else lx = 1 + (1 - n) * incx, and similarly for y using ly and | |
| * incy. With sparam[0] = sflag, h has one of the following forms: | | * incy. With sparam[0] = sflag, h has one of the following forms: | |
| * | | * | |
| * sflag = -1.0 sflag = 0.0 sflag = 1.0 sflag = -2.0 | | * sflag = -1.0 sflag = 0.0 sflag = 1.0 sflag = -2.0 | |
| * | | * | |
| * (sh00 sh01) (1.0 sh01) (sh00 1.0) (1.0 0.0) | | * (sh00 sh01) (1.0 sh01) (sh00 1.0) (1.0 0.0) | |
| * h = ( ) ( ) ( ) ( ) | | * h = ( ) ( ) ( ) ( ) | |
| * (sh10 sh11) (sh10 1.0) (-1.0 sh11) (0.0 1.0) | | * (sh10 sh11) (sh10 1.0) (-1.0 sh11) (0.0 1.0) | |
| * | | * | |
|
| * Note that is function is provided for completeness and run exclusively | | * Note that this function is provided for completeness and run exclusively | |
| * on the Host. | | * on the Host. | |
| * | | * | |
| * Input | | * Input | |
| * ----- | | * ----- | |
| * n number of elements in input vectors | | * n number of elements in input vectors | |
| * x double-precision vector with n elements | | * x double-precision vector with n elements | |
| * incx storage spacing between elements of x | | * incx storage spacing between elements of x | |
| * y double-precision vector with n elements | | * y double-precision vector with n elements | |
| * incy storage spacing between elements of y | | * incy storage spacing between elements of y | |
| * sparam 5-element vector. sparam[0] is sflag described above. sparam[1] | | * sparam 5-element vector. sparam[0] is sflag described above. sparam[1] | |
| | | | |
| skipping to change at line 5586 | | skipping to change at line 5738 | |
| * | | * | |
| * sflag = -1.0 sflag = 0.0 sflag = 1.0 sflag = -2.0 | | * sflag = -1.0 sflag = 0.0 sflag = 1.0 sflag = -2.0 | |
| * | | * | |
| * (sh00 sh01) (1.0 sh01) (sh00 1.0) (1.0 0.0) | | * (sh00 sh01) (1.0 sh01) (sh00 1.0) (1.0 0.0) | |
| * h = ( ) ( ) ( ) ( ) | | * h = ( ) ( ) ( ) ( ) | |
| * (sh10 sh11) (sh10 1.0) (-1.0 sh11) (0.0 1.0) | | * (sh10 sh11) (sh10 1.0) (-1.0 sh11) (0.0 1.0) | |
| * | | * | |
| * sparam[1] through sparam[4] contain sh00, sh10, sh01, sh11, | | * sparam[1] through sparam[4] contain sh00, sh10, sh01, sh11, | |
| * respectively. Values of 1.0, -1.0, or 0.0 implied by the value | | * respectively. Values of 1.0, -1.0, or 0.0 implied by the value | |
| * of sflag are not stored in sparam. | | * of sflag are not stored in sparam. | |
|
| * Note that is function is provided for completeness and run exclusively | | * Note that this function is provided for completeness and run exclusively | |
| * on the Host. | | * on the Host. | |
| * | | * | |
| * Input | | * Input | |
| * ----- | | * ----- | |
 | * sd1 double-precision scalar | | * sd1 double-precision scalar | |
 | * sd2 double-precision scalar | | * sd2 double-precision scalar | |
 | * sx1 double-precision scalar | | * sx1 double-precision scalar | |
 | * sy1 double-precision scalar | | * sy1 double-precision scalar | |
| * | | * | |
| * Output | | * Output | |
| | | | |
End of changes. 10 change blocks. |
| 7 lines changed or deleted | | 187 lines changed or added | |
|
| cuda.h | | cuda.h | |
| | | | |
| skipping to change at line 57 | | skipping to change at line 57 | |
| | | | |
| /** | | /** | |
| * \defgroup CUDA_TYPES Data types used by CUDA driver | | * \defgroup CUDA_TYPES Data types used by CUDA driver | |
| * \ingroup CUDA_DRIVER | | * \ingroup CUDA_DRIVER | |
| * @{ | | * @{ | |
| */ | | */ | |
| | | | |
| /** | | /** | |
| * CUDA API version number | | * CUDA API version number | |
| */ | | */ | |
|
| #define CUDA_VERSION 3000 /* 3.0 */ | | #define CUDA_VERSION 3010 /* 3.1 */ | |
| | | | |
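The bumped version macro is the usual compile-time switch for 3.1-only
features; for example:

    #include <cuda.h>

    #if CUDA_VERSION >= 3010
    /* CUlimit, cuCtxSetLimit, cuSurfRefSetArray, ... may be referenced */
    #endif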
| #ifdef __cplusplus | | #ifdef __cplusplus | |
| extern "C" { | | extern "C" { | |
| #endif | | #endif | |
| typedef unsigned int CUdeviceptr; ///< CUDA device pointer | | typedef unsigned int CUdeviceptr; ///< CUDA device pointer | |
| | | | |
| typedef int CUdevice; ///< CUDA device | | typedef int CUdevice; ///< CUDA device | |
| typedef struct CUctx_st *CUcontext; ///< CUDA context | | typedef struct CUctx_st *CUcontext; ///< CUDA context | |
| typedef struct CUmod_st *CUmodule; ///< CUDA module | | typedef struct CUmod_st *CUmodule; ///< CUDA module | |
| typedef struct CUfunc_st *CUfunction; ///< CUDA function | | typedef struct CUfunc_st *CUfunction; ///< CUDA function | |
| typedef struct CUarray_st *CUarray; ///< CUDA array | | typedef struct CUarray_st *CUarray; ///< CUDA array | |
| typedef struct CUtexref_st *CUtexref; ///< CUDA texture reference | | typedef struct CUtexref_st *CUtexref; ///< CUDA texture reference | |
|
| | | typedef struct CUsurfref_st *CUsurfref; ///< CUDA surface reference | |
| typedef struct CUevent_st *CUevent; ///< CUDA event | | typedef struct CUevent_st *CUevent; ///< CUDA event | |
| typedef struct CUstream_st *CUstream; ///< CUDA stream | | typedef struct CUstream_st *CUstream; ///< CUDA stream | |
 | typedef struct CUgraphicsResource_st *CUgraphicsResource; ///< CUDA graphics interop resource | | typedef struct CUgraphicsResource_st *CUgraphicsResource; ///< CUDA graphics interop resource | |
| | | | |
| typedef struct CUuuid_st { ///< CUDA definition of UUID | | typedef struct CUuuid_st { ///< CUDA definition of UUID | |
| char bytes[16]; | | char bytes[16]; | |
| } CUuuid; | | } CUuuid; | |
| | | | |
| /************************************ | | /************************************ | |
| ** | | ** | |
| | | | |
| skipping to change at line 175 | | skipping to change at line 176 | |
 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, ///< Maximum 2D texture width | | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, ///< Maximum 2D texture width | |
 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, ///< Maximum 2D texture height | | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, ///< Maximum 2D texture height | |
 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, ///< Maximum 3D texture width | | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, ///< Maximum 3D texture width | |
 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, ///< Maximum 3D texture height | | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, ///< Maximum 3D texture height | |
 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, ///< Maximum 3D texture depth | | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, ///< Maximum 3D texture depth | |
 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, ///< Maximum texture array width | | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, ///< Maximum texture array width | |
 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28, ///< Maximum texture array height | | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28, ///< Maximum texture array height | |
 | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, ///< Maximum slices in a texture array | | CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, ///< Maximum slices in a texture array | |
 | CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, ///< Alignment requirement for surfaces | | CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, ///< Alignment requirement for surfaces | |
 | CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, ///< Device can possibly execute multiple kernels concurrently | | CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, ///< Device can possibly execute multiple kernels concurrently | |
|
 | CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32 ///< Device has ECC support enabled | | CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, ///< Device has ECC support enabled | |
 | | | CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, ///< PCI bus ID of the device | |
 | | | CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34 ///< PCI device ID of the device | |
| } CUdevice_attribute; | | } CUdevice_attribute; | |
| | | | |
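A query sketch for the two new PCI attributes (assumes cuInit(0) has run and
device ordinal 0 exists):

    int bus = 0, slot = 0;
    CUdevice dev;
    cuDeviceGet(&dev, 0);
    cuDeviceGetAttribute(&bus,  CU_DEVICE_ATTRIBUTE_PCI_BUS_ID,    dev);
    cuDeviceGetAttribute(&slot, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev);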
| /** | | /** | |
| * Legacy device properties | | * Legacy device properties | |
| */ | | */ | |
| typedef struct CUdevprop_st { | | typedef struct CUdevprop_st { | |
| int maxThreadsPerBlock; ///< Maximum number of threads per block | | int maxThreadsPerBlock; ///< Maximum number of threads per block | |
 | int maxThreadsDim[3]; ///< Maximum size of each dimension of a block | | int maxThreadsDim[3]; ///< Maximum size of each dimension of a block | |
 | int maxGridSize[3]; ///< Maximum size of each dimension of a grid | | int maxGridSize[3]; ///< Maximum size of each dimension of a grid | |
 | int sharedMemPerBlock; ///< Shared memory available per block in bytes | | int sharedMemPerBlock; ///< Shared memory available per block in bytes | |
| | | | |
| skipping to change at line 413 | | skipping to change at line 416 | |
| */ | | */ | |
| typedef enum CUarray_cubemap_face_enum { | | typedef enum CUarray_cubemap_face_enum { | |
| CU_CUBEMAP_FACE_POSITIVE_X = 0x00, ///< Positive X face of cubemap | | CU_CUBEMAP_FACE_POSITIVE_X = 0x00, ///< Positive X face of cubemap | |
| CU_CUBEMAP_FACE_NEGATIVE_X = 0x01, ///< Negative X face of cubemap | | CU_CUBEMAP_FACE_NEGATIVE_X = 0x01, ///< Negative X face of cubemap | |
| CU_CUBEMAP_FACE_POSITIVE_Y = 0x02, ///< Positive Y face of cubemap | | CU_CUBEMAP_FACE_POSITIVE_Y = 0x02, ///< Positive Y face of cubemap | |
| CU_CUBEMAP_FACE_NEGATIVE_Y = 0x03, ///< Negative Y face of cubemap | | CU_CUBEMAP_FACE_NEGATIVE_Y = 0x03, ///< Negative Y face of cubemap | |
| CU_CUBEMAP_FACE_POSITIVE_Z = 0x04, ///< Positive Z face of cubemap | | CU_CUBEMAP_FACE_POSITIVE_Z = 0x04, ///< Positive Z face of cubemap | |
| CU_CUBEMAP_FACE_NEGATIVE_Z = 0x05 ///< Negative Z face of cubemap | | CU_CUBEMAP_FACE_NEGATIVE_Z = 0x05 ///< Negative Z face of cubemap | |
| } CUarray_cubemap_face; | | } CUarray_cubemap_face; | |
| | | | |
|
| | | /** | |
| | | * Limits | |
| | | */ | |
| | | typedef enum CUlimit_enum { | |
| | | CU_LIMIT_STACK_SIZE = 0x00, ///< GPU thread stack size | |
| | | CU_LIMIT_PRINTF_FIFO_SIZE = 0x01 ///< GPU printf FIFO size | |
| | | } CUlimit; | |
| | | | |
| /************************************ | | /************************************ | |
| ** | | ** | |
| ** Error codes | | ** Error codes | |
| ** | | ** | |
| ***********************************/ | | ***********************************/ | |
| | | | |
| /** | | /** | |
| * Error codes | | * Error codes | |
| */ | | */ | |
| typedef enum cudaError_enum { | | typedef enum cudaError_enum { | |
| | | | |
|
| CUDA_SUCCESS = 0, ///< No errors | | CUDA_SUCCESS = 0, ///< No errors | |
| CUDA_ERROR_INVALID_VALUE = 1, ///< Invalid value | | CUDA_ERROR_INVALID_VALUE = 1, ///< Invalid value | |
| CUDA_ERROR_OUT_OF_MEMORY = 2, ///< Out of memory | | CUDA_ERROR_OUT_OF_MEMORY = 2, ///< Out of memory | |
 | CUDA_ERROR_NOT_INITIALIZED = 3, ///< Driver not initialized | | CUDA_ERROR_NOT_INITIALIZED = 3, ///< Driver not initialized | |
 | CUDA_ERROR_DEINITIALIZED = 4, ///< Driver deinitialized | | CUDA_ERROR_DEINITIALIZED = 4, ///< Driver deinitialized | |
| | | | |
|
 | CUDA_ERROR_NO_DEVICE = 100, ///< No CUDA-capable device available | | CUDA_ERROR_NO_DEVICE = 100, ///< No CUDA-capable device available | |
| CUDA_ERROR_INVALID_DEVICE = 101, ///< Invalid device | | CUDA_ERROR_INVALID_DEVICE = 101, ///< Invalid device | |
| | | | |
|
 | CUDA_ERROR_INVALID_IMAGE = 200, ///< Invalid kernel image | | CUDA_ERROR_INVALID_IMAGE = 200, ///< Invalid kernel image | |
 | CUDA_ERROR_INVALID_CONTEXT = 201, ///< Invalid context | | CUDA_ERROR_INVALID_CONTEXT = 201, ///< Invalid context | |
 | CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202, ///< Context already current | | CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202, ///< Context already current | |
 | CUDA_ERROR_MAP_FAILED = 205, ///< Map failed | | CUDA_ERROR_MAP_FAILED = 205, ///< Map failed | |
 | CUDA_ERROR_UNMAP_FAILED = 206, ///< Unmap failed | | CUDA_ERROR_UNMAP_FAILED = 206, ///< Unmap failed | |
 | CUDA_ERROR_ARRAY_IS_MAPPED = 207, ///< Array is mapped | | CUDA_ERROR_ARRAY_IS_MAPPED = 207, ///< Array is mapped | |
 | CUDA_ERROR_ALREADY_MAPPED = 208, ///< Already mapped | | CUDA_ERROR_ALREADY_MAPPED = 208, ///< Already mapped | |
 | CUDA_ERROR_NO_BINARY_FOR_GPU = 209, ///< No binary for GPU | | CUDA_ERROR_NO_BINARY_FOR_GPU = 209, ///< No binary for GPU | |
 | CUDA_ERROR_ALREADY_ACQUIRED = 210, ///< Already acquired | | CUDA_ERROR_ALREADY_ACQUIRED = 210, ///< Already acquired | |
 | CUDA_ERROR_NOT_MAPPED = 211, ///< Not mapped | | CUDA_ERROR_NOT_MAPPED = 211, ///< Not mapped | |
 | CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212, ///< Mapped resource not available for access as an array | | CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212, ///< Mapped resource not available for access as an array | |
 | CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213, ///< Mapped resource not available for access as a pointer | | CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213, ///< Mapped resource not available for access as a pointer | |
 | CUDA_ERROR_ECC_UNCORRECTABLE = 214, ///< Uncorrectable ECC error detected | | CUDA_ERROR_ECC_UNCORRECTABLE = 214, ///< Uncorrectable ECC error detected | |
 | | | CUDA_ERROR_UNSUPPORTED_LIMIT = 215, ///< CUlimit not supported by device | |
| | | | |
|
| CUDA_ERROR_INVALID_SOURCE = 300, ///< Invalid source | | CUDA_ERROR_INVALID_SOURCE = 300, ///< Invalid source | |
| CUDA_ERROR_FILE_NOT_FOUND = 301, ///< File not found | | CUDA_ERROR_FILE_NOT_FOUND = 301, ///< File not found | |
 | | | CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302, ///< Link to a shared object failed to resolve | |
 | | | CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303, ///< Shared object initialization failed | |
| | | | |
|
| CUDA_ERROR_INVALID_HANDLE = 400, ///< Invalid handle | | CUDA_ERROR_INVALID_HANDLE = 400, ///< Invalid handle | |
| | | | |
|
| CUDA_ERROR_NOT_FOUND = 500, ///< Not found | | CUDA_ERROR_NOT_FOUND = 500, ///< Not found | |
| | | | |
|
| CUDA_ERROR_NOT_READY = 600, ///< CUDA not ready | | CUDA_ERROR_NOT_READY = 600, ///< CUDA not ready | |
| | | | |
|
| CUDA_ERROR_LAUNCH_FAILED = 700, ///< Launch failed | | CUDA_ERROR_LAUNCH_FAILED = 700, ///< Launch failed | |
 | CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701, ///< Launch exceeded resources | | CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701, ///< Launch exceeded resources | |
 | CUDA_ERROR_LAUNCH_TIMEOUT = 702, ///< Launch exceeded timeout | | CUDA_ERROR_LAUNCH_TIMEOUT = 702, ///< Launch exceeded timeout | |
 | CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703, ///< Launch with incompatible texturing | | CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703, ///< Launch with incompatible texturing | |
| | | | |
|
 | CUDA_ERROR_POINTER_IS_64BIT = 800, ///< Attempted to retrieve 64-bit pointer via 32-bit API function | | CUDA_ERROR_POINTER_IS_64BIT = 800, ///< Attempted to retrieve 64-bit pointer via 32-bit API function | |
 | CUDA_ERROR_SIZE_IS_64BIT = 801, ///< Attempted to retrieve 64-bit size via 32-bit API function | | CUDA_ERROR_SIZE_IS_64BIT = 801, ///< Attempted to retrieve 64-bit size via 32-bit API function | |
| | | | |
|
| CUDA_ERROR_UNKNOWN = 999 ///< Unknown error | | CUDA_ERROR_UNKNOWN = 999 ///< Unknown error | |
| } CUresult; | | } CUresult; | |
| | | | |
| /** | | /** | |
| * If set, host memory is portable between CUDA contexts. | | * If set, host memory is portable between CUDA contexts. | |
| * Flag for ::cuMemHostAlloc() | | * Flag for ::cuMemHostAlloc() | |
| */ | | */ | |
| #define CU_MEMHOSTALLOC_PORTABLE 0x01 | | #define CU_MEMHOSTALLOC_PORTABLE 0x01 | |
| | | | |
| /** | | /** | |
| * If set, host memory is mapped into CUDA address space and | | * If set, host memory is mapped into CUDA address space and | |
| | | | |
| skipping to change at line 579 | | skipping to change at line 593 | |
| CUarray_format Format; ///< Array format | | CUarray_format Format; ///< Array format | |
| | | | |
| unsigned int NumChannels; ///< Channels per array element | | unsigned int NumChannels; ///< Channels per array element | |
| | | | |
| unsigned int Flags; ///< Flags | | unsigned int Flags; ///< Flags | |
| } CUDA_ARRAY3D_DESCRIPTOR; | | } CUDA_ARRAY3D_DESCRIPTOR; | |
| | | | |
| // if set, the CUDA array contains an array of 2D slices | | // if set, the CUDA array contains an array of 2D slices | |
| // and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies | | // and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies | |
| // the number of slices, not the depth of a 3D array. | | // the number of slices, not the depth of a 3D array. | |
|
| #define CUDA_ARRAY3D_2DARRAY 0x01 | | #define CUDA_ARRAY3D_2DARRAY 0x01 | |
| | | | |
| | | // this flag must be set in order to bind a surface reference | |
| | | // to the CUDA array | |
| | | #define CUDA_ARRAY3D_SURFACE_LDST 0x02 | |
| | | | |
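A creation sketch for a surface-capable array (the sizes and format are
arbitrary placeholders):

    CUDA_ARRAY3D_DESCRIPTOR desc = {0};
    CUarray arr;
    desc.Width       = 512;
    desc.Height      = 512;
    desc.Depth       = 0;
    desc.Format      = CU_AD_FORMAT_UNSIGNED_INT8;
    desc.NumChannels = 4;
    desc.Flags       = CUDA_ARRAY3D_SURFACE_LDST;  /* required for surface binding */
    cuArray3DCreate(&arr, &desc);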
| /** | | /** | |
| * Override the texref format with a format inferred from the array. | | * Override the texref format with a format inferred from the array. | |
| * Flag for ::cuTexRefSetArray() | | * Flag for ::cuTexRefSetArray() | |
| */ | | */ | |
| #define CU_TRSA_OVERRIDE_FORMAT 0x01 | | #define CU_TRSA_OVERRIDE_FORMAT 0x01 | |
| | | | |
| /** | | /** | |
| * Read the texture as integers rather than promoting the values to floats | | * Read the texture as integers rather than promoting the values to floats | |
| * in the range [0,1]. | | * in the range [0,1]. | |
| | | | |
| skipping to change at line 668 | | skipping to change at line 686 | |
| ***********************************/ | | ***********************************/ | |
| | | | |
| CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname); | | CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname); | |
 | CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image); | | CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image); | |
 | CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues); | | CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues); | |
 | CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin); | | CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin); | |
 | CUresult CUDAAPI cuModuleUnload(CUmodule hmod); | | CUresult CUDAAPI cuModuleUnload(CUmodule hmod); | |
 | CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name); | | CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name); | |
 | CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, unsigned int *bytes, CUmodule hmod, const char *name); | | CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, unsigned int *bytes, CUmodule hmod, const char *name); | |
 | CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name); | | CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name); | |
 | | | CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name); | |
| | | | |
| /************************************ | | /************************************ | |
| ** | | ** | |
| ** Memory management | | ** Memory management | |
| ** | | ** | |
| ***********************************/ | | ***********************************/ | |
| | | | |
| CUresult CUDAAPI cuMemGetInfo(unsigned int *free, unsigned int *total); | | CUresult CUDAAPI cuMemGetInfo(unsigned int *free, unsigned int *total); | |
| | | | |
| CUresult CUDAAPI cuMemAlloc( CUdeviceptr *dptr, unsigned int bytesize); | | CUresult CUDAAPI cuMemAlloc( CUdeviceptr *dptr, unsigned int bytesize); | |
| | | | |
| skipping to change at line 715 | | skipping to change at line 734 | |
| | | | |
| // 1D functions | | // 1D functions | |
| // system <-> device memory | | // system <-> device memory | |
 | CUresult CUDAAPI cuMemcpyHtoD (CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount ); | | CUresult CUDAAPI cuMemcpyHtoD (CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount ); | |
 | CUresult CUDAAPI cuMemcpyDtoH (void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount ); | | CUresult CUDAAPI cuMemcpyDtoH (void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount ); | |
| | | | |
| // device <-> device memory | | // device <-> device memory | |
 | CUresult CUDAAPI cuMemcpyDtoD (CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount ); | | CUresult CUDAAPI cuMemcpyDtoD (CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount ); | |
| | | | |
| // device <-> array memory | | // device <-> array memory | |
|
 | CUresult CUDAAPI cuMemcpyDtoA ( CUarray dstArray, unsigned int dstIndex, CUdeviceptr srcDevice, unsigned int ByteCount ); | | CUresult CUDAAPI cuMemcpyDtoA ( CUarray dstArray, unsigned int dstOffset, CUdeviceptr srcDevice, unsigned int ByteCount ); | |
 | CUresult CUDAAPI cuMemcpyAtoD ( CUdeviceptr dstDevice, CUarray hSrc, unsigned int SrcIndex, unsigned int ByteCount ); | | CUresult CUDAAPI cuMemcpyAtoD ( CUdeviceptr dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount ); | |
| | | | |
| // system <-> array memory | | // system <-> array memory | |
|
 | CUresult CUDAAPI cuMemcpyHtoA( CUarray dstArray, unsigned int dstIndex, const void *pSrc, unsigned int ByteCount ); | | CUresult CUDAAPI cuMemcpyHtoA( CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount ); | |
 | CUresult CUDAAPI cuMemcpyAtoH( void *dstHost, CUarray srcArray, unsigned int srcIndex, unsigned int ByteCount ); | | CUresult CUDAAPI cuMemcpyAtoH( void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount ); | |
| | | | |
| // array <-> array memory | | // array <-> array memory | |
|
 | CUresult CUDAAPI cuMemcpyAtoA( CUarray dstArray, unsigned int dstIndex, CUarray srcArray, unsigned int srcIndex, unsigned int ByteCount ); | | CUresult CUDAAPI cuMemcpyAtoA( CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount ); | |
| | | | |
| // 2D memcpy | | // 2D memcpy | |
| | | | |
| CUresult CUDAAPI cuMemcpy2D( const CUDA_MEMCPY2D *pCopy ); | | CUresult CUDAAPI cuMemcpy2D( const CUDA_MEMCPY2D *pCopy ); | |
 | CUresult CUDAAPI cuMemcpy2DUnaligned( const CUDA_MEMCPY2D *pCopy ); | | CUresult CUDAAPI cuMemcpy2DUnaligned( const CUDA_MEMCPY2D *pCopy ); | |
| | | | |
| // 3D memcpy | | // 3D memcpy | |
| | | | |
| CUresult CUDAAPI cuMemcpy3D( const CUDA_MEMCPY3D *pCopy ); | | CUresult CUDAAPI cuMemcpy3D( const CUDA_MEMCPY3D *pCopy ); | |
| | | | |
| | | | |
| skipping to change at line 757 | | skipping to change at line 776 | |
| CUresult CUDAAPI cuMemcpyHtoDAsync (CUdeviceptr dstDevice, | | CUresult CUDAAPI cuMemcpyHtoDAsync (CUdeviceptr dstDevice, | |
 | const void *srcHost, unsigned int ByteCount, CUstream hStream ); | | const void *srcHost, unsigned int ByteCount, CUstream hStream ); | |
| CUresult CUDAAPI cuMemcpyDtoHAsync (void *dstHost, | | CUresult CUDAAPI cuMemcpyDtoHAsync (void *dstHost, | |
 | CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream ); | | CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream ); | |
| | | | |
| // device <-> device memory | | // device <-> device memory | |
| CUresult CUDAAPI cuMemcpyDtoDAsync (CUdeviceptr dstDevice, | | CUresult CUDAAPI cuMemcpyDtoDAsync (CUdeviceptr dstDevice, | |
 | CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream ); | | CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream ); | |
| | | | |
| // system <-> array memory | | // system <-> array memory | |
|
 | CUresult CUDAAPI cuMemcpyHtoAAsync( CUarray dstArray, unsigned int dstIndex, | | CUresult CUDAAPI cuMemcpyHtoAAsync( CUarray dstArray, unsigned int dstOffset, | |
 | const void *pSrc, unsigned int ByteCount, CUstream hStream ); | | const void *srcHost, unsigned int ByteCount, CUstream hStream ); | |
 | CUresult CUDAAPI cuMemcpyAtoHAsync( void *dstHost, CUarray srcArray, unsigned int srcIndex, | | CUresult CUDAAPI cuMemcpyAtoHAsync( void *dstHost, CUarray srcArray, unsigned int srcOffset, | |
 | unsigned int ByteCount, CUstream hStream ); | | unsigned int ByteCount, CUstream hStream ); | |
| | | | |
| // 2D memcpy | | // 2D memcpy | |
 | CUresult CUDAAPI cuMemcpy2DAsync( const CUDA_MEMCPY2D *pCopy, CUstream hStream ); | | CUresult CUDAAPI cuMemcpy2DAsync( const CUDA_MEMCPY2D *pCopy, CUstream hStream ); | |
| | | | |
| // 3D memcpy | | // 3D memcpy | |
 | CUresult CUDAAPI cuMemcpy3DAsync( const CUDA_MEMCPY3D *pCopy, CUstream hStream ); | | CUresult CUDAAPI cuMemcpy3DAsync( const CUDA_MEMCPY3D *pCopy, CUstream hStream ); | |
| | | | |
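The async copies return before the transfer completes; a typical overlap
sketch (srcHost is assumed page-locked, e.g. from cuMemHostAlloc):

    cuMemcpyHtoDAsync(dstDevice, srcHost, ByteCount, hStream);
    /* ... independent host work runs while the DMA is in flight ... */
    cuStreamSynchronize(hStream);   /* srcHost may be reused after this */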
| /************************************ | | /************************************ | |
| ** | | ** | |
| | | | |
| skipping to change at line 830 | | skipping to change at line 849 | |
| | | | |
 | CUresult CUDAAPI cuTexRefGetAddress( CUdeviceptr *pdptr, CUtexref hTexRef ); | | CUresult CUDAAPI cuTexRefGetAddress( CUdeviceptr *pdptr, CUtexref hTexRef ); | |
 | CUresult CUDAAPI cuTexRefGetArray( CUarray *phArray, CUtexref hTexRef ); | | CUresult CUDAAPI cuTexRefGetArray( CUarray *phArray, CUtexref hTexRef ); | |
 | CUresult CUDAAPI cuTexRefGetAddressMode( CUaddress_mode *pam, CUtexref hTexRef, int dim ); | | CUresult CUDAAPI cuTexRefGetAddressMode( CUaddress_mode *pam, CUtexref hTexRef, int dim ); | |
 | CUresult CUDAAPI cuTexRefGetFilterMode( CUfilter_mode *pfm, CUtexref hTexRef ); | | CUresult CUDAAPI cuTexRefGetFilterMode( CUfilter_mode *pfm, CUtexref hTexRef ); | |
 | CUresult CUDAAPI cuTexRefGetFormat( CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef ); | | CUresult CUDAAPI cuTexRefGetFormat( CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef ); | |
 | CUresult CUDAAPI cuTexRefGetFlags( unsigned int *pFlags, CUtexref hTexRef ); | | CUresult CUDAAPI cuTexRefGetFlags( unsigned int *pFlags, CUtexref hTexRef ); | |
| | | | |
| /************************************ | | /************************************ | |
| ** | | ** | |
|
| | | ** Surface reference management | |
| | | ** | |
| | | ***********************************/ | |
| | | | |
 | | | CUresult CUDAAPI cuSurfRefSetArray( CUsurfref hSurfRef, CUarray hArray, unsigned int Flags ); | |
 | | | CUresult CUDAAPI cuSurfRefGetArray( CUarray *phArray, CUsurfref hSurfRef ); | |
| | | | |
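A binding sketch for the new surface API (assumed setup: hmod is a loaded
module declaring a surface named "surf", arr was created with
CUDA_ARRAY3D_SURFACE_LDST, and the Flags argument is assumed to be 0):

    CUsurfref surfRef;
    cuModuleGetSurfRef(&surfRef, hmod, "surf");
    cuSurfRefSetArray(surfRef, arr, 0);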
| | | /************************************ | |
| | | ** | |
| ** Parameter management | | ** Parameter management | |
| ** | | ** | |
| ***********************************/ | | ***********************************/ | |
| | | | |
 | CUresult CUDAAPI cuParamSetSize (CUfunction hfunc, unsigned int numbytes); | | CUresult CUDAAPI cuParamSetSize (CUfunction hfunc, unsigned int numbytes); | |
 | CUresult CUDAAPI cuParamSeti (CUfunction hfunc, int offset, unsigned int value); | | CUresult CUDAAPI cuParamSeti (CUfunction hfunc, int offset, unsigned int value); | |
 | CUresult CUDAAPI cuParamSetf (CUfunction hfunc, int offset, float value); | | CUresult CUDAAPI cuParamSetf (CUfunction hfunc, int offset, float value); | |
 | CUresult CUDAAPI cuParamSetv (CUfunction hfunc, int offset, void *ptr, unsigned int numbytes); | | CUresult CUDAAPI cuParamSetv (CUfunction hfunc, int offset, void *ptr, unsigned int numbytes); | |
 | CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef); | | CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef); | |
| | | | |
| | | | |
| skipping to change at line 891 | | skipping to change at line 919 | |
 | CUresult CUDAAPI cuGraphicsMapResources( unsigned int count, CUgraphicsResource *resources, CUstream hStream ); | | CUresult CUDAAPI cuGraphicsMapResources( unsigned int count, CUgraphicsResource *resources, CUstream hStream ); | |
 | CUresult CUDAAPI cuGraphicsUnmapResources( unsigned int count, CUgraphicsResource *resources, CUstream hStream ); | | CUresult CUDAAPI cuGraphicsUnmapResources( unsigned int count, CUgraphicsResource *resources, CUstream hStream ); | |
| | | | |
| /************************************ | | /************************************ | |
| ** | | ** | |
| ** Export tables | | ** Export tables | |
| ** | | ** | |
| ***********************************/ | | ***********************************/ | |
 | CUresult CUDAAPI cuGetExportTable( const void **ppExportTable, const CUuuid *pExportTableId ); | | CUresult CUDAAPI cuGetExportTable( const void **ppExportTable, const CUuuid *pExportTableId ); | |
| | | | |
|
| | | /************************************ | |
| | | ** | |
| | | ** Limits | |
| | | ** | |
| | | ***********************************/ | |
| | | | |
| | | CUresult CUDAAPI cuCtxSetLimit(CUlimit limit, size_t value); | |
| | | CUresult CUDAAPI cuCtxGetLimit(size_t *pvalue, CUlimit limit); | |
| | | | |
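A limits sketch (assumes a current context on a device that supports these
limits; unsupported combinations return CUDA_ERROR_UNSUPPORTED_LIMIT):

    size_t stack = 0;
    cuCtxSetLimit(CU_LIMIT_PRINTF_FIFO_SIZE, 4 * 1024 * 1024);
    cuCtxGetLimit(&stack, CU_LIMIT_STACK_SIZE);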
| #ifdef __cplusplus | | #ifdef __cplusplus | |
| } | | } | |
| #endif | | #endif | |
| | | | |
| #endif /* __cuda_cuda_h__ */ | | #endif /* __cuda_cuda_h__ */ | |
| | | | |
End of changes. 22 change blocks. |
| 60 lines changed or deleted | | 107 lines changed or added | |
|
| device_functions.h | | device_functions.h | |
| | | | |
| skipping to change at line 47 | | skipping to change at line 47 | |
| #define __DEVICE_FUNCTIONS_H__ | | #define __DEVICE_FUNCTIONS_H__ | |
| | | | |
 | /******************************************************************************* | | /******************************************************************************* | |
 | * * | | * * | |
 | * * | | * * | |
 | * * | | * * | |
 | *******************************************************************************/ | | *******************************************************************************/ | |
| | | | |
| #if defined(__cplusplus) && defined(__CUDACC__) | | #if defined(__cplusplus) && defined(__CUDACC__) | |
| | | | |
|
| | | #include "builtin_types.h" | |
| #include "device_types.h" | | #include "device_types.h" | |
| #include "host_defines.h" | | #include "host_defines.h" | |
| | | | |
 | /******************************************************************************* | | /******************************************************************************* | |
 | * * | | * * | |
 | * * | | * * | |
 | * * | | * * | |
 | *******************************************************************************/ | | *******************************************************************************/ | |
| | | | |
| extern "C" | | extern "C" | |
| | | | |
| skipping to change at line 75 | | skipping to change at line 76 | |
 | extern __device__ long long int __mul64hi(long long int, long long int); | | extern __device__ long long int __mul64hi(long long int, long long int); | |
 | /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
 | extern __device__ unsigned long long int __umul64hi(unsigned long long int, unsigned long long int); | | extern __device__ unsigned long long int __umul64hi(unsigned long long int, unsigned long long int); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __int_as_float(int); | | extern __device__ float __int_as_float(int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __float_as_int(float); | | extern __device__ int __float_as_int(float); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| | | extern __device__ void __synchronous_start(int); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ void __synchronous_end(void); | |
| | | /*DEVICE_BUILTIN*/ | |
| extern __device__ void __syncthreads(void); | | extern __device__ void __syncthreads(void); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ void __prof_trigger(int); | | extern __device__ void __prof_trigger(int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ void __threadfence(void); | | extern __device__ void __threadfence(void); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ void __threadfence_block(void); | | extern __device__ void __threadfence_block(void); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ void __trap(void); | | extern __device__ void __trap(void); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| | | | |
| skipping to change at line 287 | | skipping to change at line 292 | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __clzll(long long int); | | extern __device__ int __clzll(long long int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __ffsll(long long int); | | extern __device__ int __ffsll(long long int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __popcll(unsigned long long int); | | extern __device__ int __popcll(unsigned long long int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ unsigned long long int __brevll(unsigned long long int); | | extern __device__ unsigned long long int __brevll(unsigned long long int); | |
| | | | |
|
| | | /*DEVICE_BUILTIN*/ | |
 | | | extern __device__ unsigned int __byte_perm(unsigned int, unsigned int, unsigned int); | |
| | | | |
| #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 130 | | #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 130 | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __double2int_rz(double); | | extern __device__ int __double2int_rz(double); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ unsigned int __double2uint_rz(double); | | extern __device__ unsigned int __double2uint_rz(double); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ long long int __double2ll_rz(double); | | extern __device__ long long int __double2ll_rz(double); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ unsigned long long int __double2ull_rz(double); | | extern __device__ unsigned long long int __double2ull_rz(double); | |
| | | | |
|
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ unsigned int __pm0(void); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ unsigned int __pm1(void); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ unsigned int __pm2(void); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ unsigned int __pm3(void); | |
| | | | |
| #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 130 */ | | #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 130 */ | |
| | | | |
| } | | } | |
| | | | |
 | /******************************************************************************* | | /******************************************************************************* | |
 | * * | | * * | |
 | * * | | * * | |
 | * * | | * * | |
 | *******************************************************************************/ | | *******************************************************************************/ | |
| | | | |
| | | | |
| skipping to change at line 448 | | skipping to change at line 465 | |
| } | | } | |
| | | | |
 | static __inline__ __device__ float uint2float(unsigned int a, enum cudaRoundMode mode = cudaRoundNearest) | | static __inline__ __device__ float uint2float(unsigned int a, enum cudaRoundMode mode = cudaRoundNearest) | |
| { | | { | |
| return mode == cudaRoundZero ? __uint2float_rz(a) : | | return mode == cudaRoundZero ? __uint2float_rz(a) : | |
| mode == cudaRoundPosInf ? __uint2float_ru(a) : | | mode == cudaRoundPosInf ? __uint2float_ru(a) : | |
| mode == cudaRoundMinInf ? __uint2float_rd(a) : | | mode == cudaRoundMinInf ? __uint2float_rd(a) : | |
| __uint2float_rn(a); | | __uint2float_rn(a); | |
| } | | } | |
| | | | |
|
| #elif !defined(__CUDACC__) | | #elif defined(__CUDABE__) | |
| | | | |
|
| #include "crt/func_macro.h" | | /************************************************************************** | |
| | | ***** | |
| | | * | |
| | | * | |
| | | * DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS | |
| | | * | |
| | | * | |
| | | * | |
| | | *************************************************************************** | |
| | | ****/ | |
| | | | |
|
| #include "host_defines.h" | | static __forceinline__ float __sinf(float a) | |
| #include "math_constants.h" | | { | |
| | | return __builtin_sinf(a); | |
| | | } | |
| | | | |
|
| #if defined(__CUDABE__) | | static __forceinline__ float __cosf(float a) | |
| | | { | |
| | | return __builtin_cosf(a); | |
| | | } | |
| | | | |
|
| #if (__CUDA_ARCH__ < 200) | | static __forceinline__ float __log2f(float a) | |
| | | { | |
| | | return __builtin_log2f(a); | |
| | | } | |
| | | | |
|
| __device_func__(float __frcp_rn (float x)) | | /*************************************************************************** | |
| | | *                                                                           * | |
| | | * DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITHOUT BUILTIN NVOPENCC OPERATIONS * | |
| | | *                                                                           * | |
| | | ***************************************************************************/ | |
| | | | |
| | | static __forceinline__ float __tanf(float a) | |
| | | { | |
| | | return __fdividef (__sinf(a), __cosf(a)); | |
| | | } | |
| | | | |
| | | static __forceinline__ void __sincosf(float a, float *sptr, float *cptr) | |
| | | { | |
| | | *sptr = __sinf(a); | |
| | | *cptr = __cosf(a); | |
| | | } | |
| | | | |
| | | static __forceinline__ float __expf(float a) | |
| | | { | |
| | | return exp2f(a * CUDART_L2E_F); | |
| | | } | |
| | | | |
| | | static __forceinline__ float __exp10f(float a) | |
| | | { | |
| | | return exp2f(a * CUDART_L2T_F); | |
| | | } | |
| | | | |
| | | static __forceinline__ float __log10f(float a) | |
| | | { | |
| | | return CUDART_LG2_F * __log2f(a); | |
| | | } | |
| | | | |
| | | static __forceinline__ float __logf(float a) | |
| | | { | |
| | | return CUDART_LN2_F * __log2f(a); | |
| | | } | |
| | | | |
| | | static __forceinline__ float __powf(float a, float b) | |
| | | { | |
| | | return exp2f(b * __log2f(a)); | |
| | | } | |
| | | | |
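All of these reductions are change-of-base identities built on the hardware's exp2f/log2f: e^a = 2^(a*log2 e), 10^a = 2^(a*log2 10), log10 x = log10(2)*log2 x, ln x = ln(2)*log2 x, and a^b = 2^(b*log2 a) (the last only meaningful for a > 0, the usual __powf caveat). The CUDART_L2E_F, CUDART_L2T_F, CUDART_LG2_F and CUDART_LN2_F factors are just those constants in float. A host-side C mirror of the same reductions, with libm's exp2f/log2f standing in for the special-function unit:

    #include <math.h>
    #include <stdio.h>

    #define L2E_F 1.442695040888963f    /* log2(e),   cf. CUDART_L2E_F */
    #define L2T_F 3.321928094887362f    /* log2(10),  cf. CUDART_L2T_F */
    #define LG2_F 0.3010299956639812f   /* log10(2),  cf. CUDART_LG2_F */
    #define LN2_F 0.6931471805599453f   /* ln(2),     cf. CUDART_LN2_F */

    static float my_expf(float a)          { return exp2f(a * L2E_F); }
    static float my_exp10f(float a)        { return exp2f(a * L2T_F); }
    static float my_log10f(float a)        { return LG2_F * log2f(a); }
    static float my_logf(float a)          { return LN2_F * log2f(a); }
    static float my_powf(float a, float b) { return exp2f(b * log2f(a)); }

    int main(void)
    {
        printf("%g vs %g\n", my_expf(1.0f),         expf(1.0f));         /* ~2.71828 */
        printf("%g vs %g\n", my_exp10f(2.0f),       100.0f);             /* ~100     */
        printf("%g vs %g\n", my_logf(10.0f),        logf(10.0f));        /* ~2.30259 */
        printf("%g vs %g\n", my_log10f(1000.0f),    3.0f);               /* ~3       */
        printf("%g vs %g\n", my_powf(2.0f, 10.0f),  powf(2.0f, 10.0f));  /* ~1024    */
        return 0;
    }

The fast device versions trade a few ulps of accuracy for the single-instruction exp2/log2.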
| | | static __forceinline__ float fdividef(float a, float b) | |
| | | { | |
| | | #if defined(__USE_FAST_MATH__) && !defined(__CUDA_PREC_DIV) | |
| | | return __fdividef(a, b); | |
| | | #else /* __USE_FAST_MATH__ && !__CUDA_PREC_DIV */ | |
| | | return a / b; | |
| | | #endif /* __USE_FAST_MATH__ && !__CUDA_PREC_DIV */ | |
| | | } | |
| | | | |
| | | #if defined(CUDA_FLOAT_MATH_FUNCTIONS) | |
| | | | |
| | | static __forceinline__ double fdivide(double a, double b) | |
| | | { | |
| | | return (double)fdividef((float)a, (float)b); | |
| | | } | |
| | | | |
| | | #endif /* CUDA_FLOAT_MATH_FUNCTIONS */ | |
| | | | |
| | | #if defined(CUDA_DOUBLE_MATH_FUNCTIONS) | |
| | | | |
| | | static __forceinline__ double fdivide(double a, double b) | |
| | | { | |
| | | return a / b; | |
| | | } | |
| | | | |
| | | #endif /* CUDA_DOUBLE_MATH_FUNCTIONS */ | |
| | | | |
| | | #if __CUDA_ARCH__ < 200 | |
| | | | |
| | | static __forceinline__ float __frcp_rn (float x) | |
| { | | { | |
| unsigned int expo; | | unsigned int expo; | |
| unsigned f, y; | | unsigned f, y; | |
| unsigned int argi; | | unsigned int argi; | |
| float t; | | float t; | |
| | | | |
| argi = __float_as_int(x); | | argi = __float_as_int(x); | |
| expo = (argi >> 23); | | expo = (argi >> 23); | |
| expo = expo & 0xff; | | expo = expo & 0xff; | |
| f = expo - 1; | | f = expo - 1; | |
| | | | |
| skipping to change at line 497 | | skipping to change at line 597 | |
| f = (unsigned)(-(int)f); | | f = (unsigned)(-(int)f); | |
| if (expo < f) { | | if (expo < f) { | |
| t = __int_as_float(__float_as_int(t)+1); | | t = __int_as_float(__float_as_int(t)+1); | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| } | | } | |
| return 1.0f / x; | | return 1.0f / x; | |
| } | | } | |
| | | | |
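The tail of __frcp_rn re-multiplies the candidate against the input mantissa and inspects the residual to decide the final ulp, which is the round-to-nearest test; a tie is impossible because 1/x is exactly representable only when x is a power of two. For the same reason, dividing in double and rounding once is already correctly rounded (53 bits of quotient accuracy dwarf the 24-bit float ulp), which makes a convenient host-side oracle for these kernels; a sketch for the nearest case:

    #include <stdio.h>

    /* oracle: one double divide plus one double->float rounding
       reproduces the correctly rounded float reciprocal */
    static float frcp_rn_ref(float x)
    {
        return (float)(1.0 / (double)x);
    }

    int main(void)
    {
        printf("%.9g\n", frcp_rn_ref(3.0f));   /* 0.333333343, nearest float to 1/3 */
        return 0;
    }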
|
| __device_func__(float __frcp_rz (float x)) | | static __forceinline__ float __frcp_rz (float x) | |
| { | | { | |
| unsigned int expo; | | unsigned int expo; | |
| unsigned f, y; | | unsigned f, y; | |
| unsigned int argi; | | unsigned int argi; | |
| float t; | | float t; | |
| | | | |
| argi = __float_as_int(x); | | argi = __float_as_int(x); | |
| expo = (argi >> 23); | | expo = (argi >> 23); | |
| expo = expo & 0xff; | | expo = expo & 0xff; | |
| f = expo - 1; | | f = expo - 1; | |
| | | | |
| skipping to change at line 525 | | skipping to change at line 625 | |
| f = __umul24(y, argi); | | f = __umul24(y, argi); | |
| if ((int)f > 0) { | | if ((int)f > 0) { | |
| t = __int_as_float(__float_as_int(t)-1); | | t = __int_as_float(__float_as_int(t)-1); | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| } | | } | |
| return 1.0f / x; | | return 1.0f / x; | |
| } | | } | |
| | | | |
|
| __device_func__(float __frcp_rd (float x)) | | static __forceinline__ float __frcp_rd (float x) | |
| { | | { | |
| unsigned int expo; | | unsigned int expo; | |
| unsigned f, y; | | unsigned f, y; | |
| unsigned int argi; | | unsigned int argi; | |
| float t; | | float t; | |
| | | | |
| argi = __float_as_int(x); | | argi = __float_as_int(x); | |
| expo = (argi >> 23); | | expo = (argi >> 23); | |
| expo = expo & 0xff; | | expo = expo & 0xff; | |
| f = expo - 1; | | f = expo - 1; | |
| | | | |
| skipping to change at line 556 | | skipping to change at line 656 | |
| } | | } | |
| if (((int)f < 0) && (x < 0.0f)) { | | if (((int)f < 0) && (x < 0.0f)) { | |
| t = __int_as_float(__float_as_int(t)+1); | | t = __int_as_float(__float_as_int(t)+1); | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| } | | } | |
| return 1.0f / x; | | return 1.0f / x; | |
| } | | } | |
| | | | |
|
| __device_func__(float __frcp_ru (float x)) | | static __forceinline__ float __frcp_ru (float x) | |
| { | | { | |
| unsigned int expo; | | unsigned int expo; | |
| unsigned f, y; | | unsigned f, y; | |
| unsigned int argi; | | unsigned int argi; | |
| float t; | | float t; | |
| | | | |
| argi = __float_as_int(x); | | argi = __float_as_int(x); | |
| expo = (argi >> 23); | | expo = (argi >> 23); | |
| expo = expo & 0xff; | | expo = expo & 0xff; | |
| f = expo - 1; | | f = expo - 1; | |
| | | | |
| skipping to change at line 587 | | skipping to change at line 687 | |
| } | | } | |
| if (((int)f < 0) && (x > 0.0f)) { | | if (((int)f < 0) && (x > 0.0f)) { | |
| t = __int_as_float(__float_as_int(t)+1); | | t = __int_as_float(__float_as_int(t)+1); | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| } | | } | |
| return 1.0f / x; | | return 1.0f / x; | |
| } | | } | |
| | | | |
|
| __device_func__(float __fsqrt_rn (float radicand)) | | static __forceinline__ float __fsqrt_rn (float radicand) | |
| { | | { | |
| unsigned int expo, argi; | | unsigned int expo, argi; | |
| unsigned int s, f, x; | | unsigned int s, f, x; | |
| | | | |
| argi = __float_as_int(radicand); | | argi = __float_as_int(radicand); | |
| expo = argi >> 23; | | expo = argi >> 23; | |
| expo = expo & 0xff; | | expo = expo & 0xff; | |
| f = expo - 1; | | f = expo - 1; | |
| | | | |
| if ((argi <= 0x80000000) && (f <= 0xFD)) { | | if ((argi <= 0x80000000) && (f <= 0xFD)) { | |
| | | | |
| skipping to change at line 621 | | skipping to change at line 721 | |
| f = x - (2 * argi + 1); | | f = x - (2 * argi + 1); | |
| if ((int)f < 0) f = (unsigned)(-(int)f); | | if ((int)f < 0) f = (unsigned)(-(int)f); | |
| if ((int)x < 0) x = (unsigned)(-(int)x); | | if ((int)x < 0) x = (unsigned)(-(int)x); | |
| if (f < x) argi ++; | | if (f < x) argi ++; | |
| argi = argi + (((expo + 125) & ~0x1) << 22); | | argi = argi + (((expo + 125) & ~0x1) << 22); | |
| return __int_as_float(argi); | | return __int_as_float(argi); | |
| } | | } | |
| return sqrtf(radicand); | | return sqrtf(radicand); | |
| } | | } | |
| | | | |
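The increment test in __fsqrt_rn (f = x - (2 * argi + 1), taken after x has been reduced to the remainder x*2^16 - argi*argi) is the standard integer midpoint test for square roots. Writing M for the scaled radicand, q for the truncated candidate, and r = M - q^2 for the remainder, a math sketch of why it rounds to nearest:

    \sqrt{M} > q + \tfrac{1}{2}
      \iff M > \left(q + \tfrac{1}{2}\right)^{2} = q^{2} + q + \tfrac{1}{4}
      \iff r > q + \tfrac{1}{2}

Comparing |r - (2q+1)| against |r| is exactly the test r > q + 1/2, and since r and q are integers, r can never equal q + 1/2, so no tie-breaking is needed.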
|
| __device_func__(float __fsqrt_rz (float radicand)) | | static __forceinline__ float __fsqrt_rz (float radicand) | |
| { | | { | |
| unsigned int expo, argi; | | unsigned int expo, argi; | |
| unsigned int s, f, x; | | unsigned int s, f, x; | |
| | | | |
| argi = __float_as_int(radicand); | | argi = __float_as_int(radicand); | |
| expo = argi >> 23; | | expo = argi >> 23; | |
| expo = expo & 0xff; | | expo = expo & 0xff; | |
| f = expo - 1; | | f = expo - 1; | |
| | | | |
| if ((argi <= 0x80000000) && (f <= 0xFD)) { | | if ((argi <= 0x80000000) && (f <= 0xFD)) { | |
| | | | |
| skipping to change at line 652 | | skipping to change at line 752 | |
| /* compute truncated result */ | | /* compute truncated result */ | |
| argi = (argi + 4) >> 3; | | argi = (argi + 4) >> 3; | |
| x = (x << 16) - (argi * argi); | | x = (x << 16) - (argi * argi); | |
| if ((int)x < 0) argi--; | | if ((int)x < 0) argi--; | |
| argi = argi + (((expo + 125) & ~0x1) << 22); | | argi = argi + (((expo + 125) & ~0x1) << 22); | |
| return __int_as_float(argi); | | return __int_as_float(argi); | |
| } | | } | |
| return sqrtf(radicand); | | return sqrtf(radicand); | |
| } | | } | |
| | | | |
|
| __device_func__(float __fsqrt_ru (float radicand)) | | static __forceinline__ float __fsqrt_ru (float radicand) | |
| { | | { | |
| unsigned int expo, argi; | | unsigned int expo, argi; | |
| unsigned int s, f, x; | | unsigned int s, f, x; | |
| | | | |
| argi = __float_as_int(radicand); | | argi = __float_as_int(radicand); | |
| expo = argi >> 23; | | expo = argi >> 23; | |
| expo = expo & 0xff; | | expo = expo & 0xff; | |
| f = expo - 1; | | f = expo - 1; | |
| | | | |
| if ((argi <= 0x80000000) && (f <= 0xFD)) { | | if ((argi <= 0x80000000) && (f <= 0xFD)) { | |
| | | | |
| skipping to change at line 682 | | skipping to change at line 782 | |
| argi = __umulhi(x,argi); | | argi = __umulhi(x,argi); | |
| argi = (argi + 4) >> 3; | | argi = (argi + 4) >> 3; | |
| x = (x << 16) - (argi * argi); | | x = (x << 16) - (argi * argi); | |
| if ((int)x > 0) argi++; | | if ((int)x > 0) argi++; | |
| argi = argi + (((expo + 125) & ~0x1) << 22); | | argi = argi + (((expo + 125) & ~0x1) << 22); | |
| return __int_as_float(argi); | | return __int_as_float(argi); | |
| } | | } | |
| return sqrtf(radicand); | | return sqrtf(radicand); | |
| } | | } | |
| | | | |
|
| __device_func__(float __fsqrt_rd (float radicand)) | | static __forceinline__ float __fsqrt_rd (float radicand) | |
| { | | { | |
| unsigned int expo, argi; | | unsigned int expo, argi; | |
| unsigned int s, f, x; | | unsigned int s, f, x; | |
| | | | |
| argi = __float_as_int(radicand); | | argi = __float_as_int(radicand); | |
| expo = argi >> 23; | | expo = argi >> 23; | |
| expo = expo & 0xff; | | expo = expo & 0xff; | |
| f = expo - 1; | | f = expo - 1; | |
| | | | |
| if ((argi <= 0x80000000) && (f <= 0xFD)) { | | if ((argi <= 0x80000000) && (f <= 0xFD)) { | |
| | | | |
| skipping to change at line 713 | | skipping to change at line 813 | |
| /* compute truncated result */ | | /* compute truncated result */ | |
| argi = (argi + 4) >> 3; | | argi = (argi + 4) >> 3; | |
| x = (x << 16) - (argi * argi); | | x = (x << 16) - (argi * argi); | |
| if ((int)x < 0) argi--; | | if ((int)x < 0) argi--; | |
| argi = argi + (((expo + 125) & ~0x1) << 22); | | argi = argi + (((expo + 125) & ~0x1) << 22); | |
| return __int_as_float(argi); | | return __int_as_float(argi); | |
| } | | } | |
| return sqrtf(radicand); | | return sqrtf(radicand); | |
| } | | } | |
| | | | |
|
| __device_func__(float __fdiv_rn (float dividend, float divisor)) | | static __forceinline__ float __fdiv_rn (float dividend, float divisor) | |
| { | | { | |
| unsigned long long prod; | | unsigned long long prod; | |
| unsigned r, f, x, y, expox, expoy, sign; | | unsigned r, f, x, y, expox, expoy, sign; | |
| unsigned expo_res; | | unsigned expo_res; | |
| unsigned resi, cvtxi, cvtyi; | | unsigned resi, cvtxi, cvtyi; | |
| float t; | | float t; | |
| | | | |
| cvtxi = __float_as_int(dividend); | | cvtxi = __float_as_int(dividend); | |
| cvtyi = __float_as_int(divisor); | | cvtyi = __float_as_int(divisor); | |
| expox = (cvtxi >> 23) & 0xff; | | expox = (cvtxi >> 23) & 0xff; | |
| | | | |
| skipping to change at line 776 | | skipping to change at line 876 | |
| prod = ((unsigned long long)y) * r; | | prod = ((unsigned long long)y) * r; | |
| x = x << (23 + ((prod >> 32) >> 15)); | | x = x << (23 + ((prod >> 32) >> 15)); | |
| rem1 = x - (unsigned)(prod & 0xffffffff); | | rem1 = x - (unsigned)(prod & 0xffffffff); | |
| rem0 = rem1 - y; | | rem0 = rem1 - y; | |
| inc = abs(rem0) < abs(rem1); | | inc = abs(rem0) < abs(rem1); | |
| resi = ((expo_res << 23) + r + inc); | | resi = ((expo_res << 23) + r + inc); | |
| if (resi != 0x00800000) resi = 0; | | if (resi != 0x00800000) resi = 0; | |
| return __int_as_float(sign | resi); | | return __int_as_float(sign | resi); | |
| } | | } | |
| } | | } | |
|
| if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | | if (fabsf(divisor) > CUDART_TWO_TO_126_F) { | |
| divisor *= 0.25f; | | divisor *= 0.25f; | |
| dividend *= 0.25f; | | dividend *= 0.25f; | |
| } | | } | |
| return __fdividef (dividend, divisor); | | return __fdividef (dividend, divisor); | |
| } | | } | |
| | | | |
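The fabsf(divisor) > CUDART_TWO_TO_126_F guard before the __fdividef fallback appears to be a scaling trick: __fdividef works through a reciprocal, and for |y| > 2^126 the intermediate 1/y drops below 2^-126 and flushes to zero on this hardware. Multiplying both operands by 0.25 only adjusts exponents (exact as long as the scaled dividend stays normal) and leaves the quotient unchanged:

    \frac{x/4}{y/4} = \frac{x}{y}, \qquad
    |y| > 2^{126} \;\Rightarrow\; \left|\tfrac{1}{y}\right| < 2^{-126},

while 1/(y/4) is back in normal range.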
|
| __device_func__(float __fdiv_rz (float dividend, float divisor)) | | static __forceinline__ float __fdiv_rz (float dividend, float divisor) | |
| { | | { | |
| unsigned long long prod; | | unsigned long long prod; | |
| unsigned r, f, x, y, expox, expoy, sign; | | unsigned r, f, x, y, expox, expoy, sign; | |
| unsigned expo_res; | | unsigned expo_res; | |
| unsigned resi, cvtxi, cvtyi; | | unsigned resi, cvtxi, cvtyi; | |
| float t; | | float t; | |
| | | | |
| cvtxi = __float_as_int(dividend); | | cvtxi = __float_as_int(dividend); | |
| cvtyi = __float_as_int(divisor); | | cvtyi = __float_as_int(divisor); | |
| expox = (cvtxi >> 23) & 0xff; | | expox = (cvtxi >> 23) & 0xff; | |
| | | | |
| skipping to change at line 844 | | skipping to change at line 944 | |
| int rem1; | | int rem1; | |
| prod = ((unsigned long long)y) * r; | | prod = ((unsigned long long)y) * r; | |
| x = x << (23 + ((prod >> 32) >> 15)); | | x = x << (23 + ((prod >> 32) >> 15)); | |
| rem1 = x - (unsigned)(prod & 0xffffffff); | | rem1 = x - (unsigned)(prod & 0xffffffff); | |
| if (rem1 < 0) r--; | | if (rem1 < 0) r--; | |
| resi = ((expo_res << 23) + r); | | resi = ((expo_res << 23) + r); | |
| if (resi != 0x00800000) resi = 0; | | if (resi != 0x00800000) resi = 0; | |
| return __int_as_float(sign | resi); | | return __int_as_float(sign | resi); | |
| } | | } | |
| } | | } | |
|
| if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | | if (fabsf(divisor) > CUDART_TWO_TO_126_F) { | |
| divisor *= 0.25f; | | divisor *= 0.25f; | |
| dividend *= 0.25f; | | dividend *= 0.25f; | |
| } | | } | |
| return __fdividef (dividend, divisor); | | return __fdividef (dividend, divisor); | |
| } | | } | |
| | | | |
|
| __device_func__(float __fdiv_ru (float dividend, float divisor)) | | static __forceinline__ float __fdiv_ru (float dividend, float divisor) | |
| { | | { | |
| unsigned long long prod; | | unsigned long long prod; | |
| unsigned r, f, x, y, expox, expoy, sign; | | unsigned r, f, x, y, expox, expoy, sign; | |
| unsigned expo_res; | | unsigned expo_res; | |
| unsigned resi, cvtxi, cvtyi; | | unsigned resi, cvtxi, cvtyi; | |
| float t; | | float t; | |
| | | | |
| cvtxi = __float_as_int(dividend); | | cvtxi = __float_as_int(dividend); | |
| cvtyi = __float_as_int(divisor); | | cvtyi = __float_as_int(divisor); | |
| expox = (cvtxi >> 23) & 0xff; | | expox = (cvtxi >> 23) & 0xff; | |
| | | | |
| skipping to change at line 914 | | skipping to change at line 1014 | |
| prod = ((unsigned long long)y) * r; | | prod = ((unsigned long long)y) * r; | |
| x = x << (23 + ((prod >> 32) >> 15)); | | x = x << (23 + ((prod >> 32) >> 15)); | |
| rem1 = x - (unsigned)(prod & 0xffffffff); | | rem1 = x - (unsigned)(prod & 0xffffffff); | |
| if ((rem1 < 0) && (sign)) r--; | | if ((rem1 < 0) && (sign)) r--; | |
| if ((rem1 > 0) && (!sign)) r++; | | if ((rem1 > 0) && (!sign)) r++; | |
| resi = ((expo_res << 23) + r); | | resi = ((expo_res << 23) + r); | |
| if (resi != 0x00800000) resi = 0; | | if (resi != 0x00800000) resi = 0; | |
| return __int_as_float(sign | resi); | | return __int_as_float(sign | resi); | |
| } | | } | |
| } | | } | |
|
| if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | | if (fabsf(divisor) > CUDART_TWO_TO_126_F) { | |
| divisor *= 0.25f; | | divisor *= 0.25f; | |
| dividend *= 0.25f; | | dividend *= 0.25f; | |
| } | | } | |
| return __fdividef (dividend, divisor); | | return __fdividef (dividend, divisor); | |
| } | | } | |
| | | | |
|
| __device_func__(float __fdiv_rd (float dividend, float divisor)) | | static __forceinline__ float __fdiv_rd (float dividend, float divisor) | |
| { | | { | |
| unsigned long long prod; | | unsigned long long prod; | |
| unsigned r, f, x, y, expox, expoy, sign; | | unsigned r, f, x, y, expox, expoy, sign; | |
| unsigned expo_res; | | unsigned expo_res; | |
| unsigned resi, cvtxi, cvtyi; | | unsigned resi, cvtxi, cvtyi; | |
| float t; | | float t; | |
| | | | |
| cvtxi = __float_as_int(dividend); | | cvtxi = __float_as_int(dividend); | |
| cvtyi = __float_as_int(divisor); | | cvtyi = __float_as_int(divisor); | |
| expox = (cvtxi >> 23) & 0xff; | | expox = (cvtxi >> 23) & 0xff; | |
| | | | |
| skipping to change at line 984 | | skipping to change at line 1084 | |
| prod = ((unsigned long long)y) * r; | | prod = ((unsigned long long)y) * r; | |
| x = x << (23 + ((prod >> 32) >> 15)); | | x = x << (23 + ((prod >> 32) >> 15)); | |
| rem1 = x - (unsigned)(prod & 0xffffffff); | | rem1 = x - (unsigned)(prod & 0xffffffff); | |
| if ((rem1 < 0) && (!sign)) r--; | | if ((rem1 < 0) && (!sign)) r--; | |
| if ((rem1 > 0) && (sign)) r++; | | if ((rem1 > 0) && (sign)) r++; | |
| resi = ((expo_res << 23) + r); | | resi = ((expo_res << 23) + r); | |
| if (resi != 0x00800000) resi = 0; | | if (resi != 0x00800000) resi = 0; | |
| return __int_as_float(sign | resi); | | return __int_as_float(sign | resi); | |
| } | | } | |
| } | | } | |
|
| if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | | if (fabsf(divisor) > CUDART_TWO_TO_126_F) { | |
| divisor *= 0.25f; | | divisor *= 0.25f; | |
| dividend *= 0.25f; | | dividend *= 0.25f; | |
| } | | } | |
| return __fdividef (dividend, divisor); | | return __fdividef (dividend, divisor); | |
| } | | } | |
| | | | |
|
| __device_func__(float __fadd_ru (float a, float b)) | | static __forceinline__ float __fadd_ru (float a, float b) | |
| { | | { | |
| unsigned int expo_x, expo_y; | | unsigned int expo_x, expo_y; | |
| unsigned int xxi, yyi, temp; | | unsigned int xxi, yyi, temp; | |
| | | | |
| xxi = __float_as_int(a); | | xxi = __float_as_int(a); | |
| yyi = __float_as_int(b); | | yyi = __float_as_int(b); | |
| | | | |
| /* make bigger operand the augend */ | | /* make bigger operand the augend */ | |
| expo_y = yyi << 1; | | expo_y = yyi << 1; | |
| if (expo_y > (xxi << 1)) { | | if (expo_y > (xxi << 1)) { | |
| | | | |
| skipping to change at line 1094 | | skipping to change at line 1194 | |
| xxi = xxi & ~0xff000000; | | xxi = xxi & ~0xff000000; | |
| expo_x = (unsigned int)(-((int)expo_x)); | | expo_x = (unsigned int)(-((int)expo_x)); | |
| xxi = (xxi >> expo_x); | | xxi = (xxi >> expo_x); | |
| if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0; | | if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0; | |
| return __int_as_float(yyi | xxi); | | return __int_as_float(yyi | xxi); | |
| } else { | | } else { | |
| return a + b; | | return a + b; | |
| } | | } | |
| } | | } | |
| | | | |
|
| __device_func__(float __fadd_rd (float a, float b)) | | static __forceinline__ float __fadd_rd (float a, float b) | |
| { | | { | |
| unsigned int expo_x, expo_y; | | unsigned int expo_x, expo_y; | |
| unsigned int xxi, yyi, temp; | | unsigned int xxi, yyi, temp; | |
| | | | |
| xxi = __float_as_int(a); | | xxi = __float_as_int(a); | |
| yyi = __float_as_int(b); | | yyi = __float_as_int(b); | |
| | | | |
| /* make bigger operand the augend */ | | /* make bigger operand the augend */ | |
| expo_y = yyi << 1; | | expo_y = yyi << 1; | |
| if (expo_y > (xxi << 1)) { | | if (expo_y > (xxi << 1)) { | |
| | | | |
| skipping to change at line 1202 | | skipping to change at line 1302 | |
| if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0; | | if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0; | |
| return __int_as_float(yyi | xxi); | | return __int_as_float(yyi | xxi); | |
| } else { | | } else { | |
| a = a + b; | | a = a + b; | |
| xxi = xxi ^ yyi; | | xxi = xxi ^ yyi; | |
| if ((a == 0.0f) && ((int)xxi < 0)) a = __int_as_float(0x80000000); | | if ((a == 0.0f) && ((int)xxi < 0)) a = __int_as_float(0x80000000); | |
| return a; | | return a; | |
| } | | } | |
| } | | } | |
| | | | |
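A concrete pair where the directed-rounding adds differ from ordinary float addition: 1.0f + 2^-25 lies below the rounding midpoint, so round-to-nearest and round-down return exactly 1.0f, while round-up must step to the next representable float, 1 + 2^-23. On the host, C99's fesetround can stand in for the intrinsics (a sketch only; constant folding can defeat it unless the compiler honors FENV_ACCESS or an option such as -frounding-math):

    #include <fenv.h>
    #include <stdio.h>

    int main(void)
    {
        volatile float a = 1.0f, b = 0x1.0p-25f;

        fesetround(FE_UPWARD);          /* mimics __fadd_ru */
        printf("ru: %.8e\n", a + b);    /* 1.00000012e+00 = 1 + 2^-23 */

        fesetround(FE_DOWNWARD);        /* mimics __fadd_rd */
        printf("rd: %.8e\n", a + b);    /* 1.00000000e+00 */

        fesetround(FE_TONEAREST);
        return 0;
    }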
|
| __device_func__(float __fmul_ru (float a, float b)) | | static __forceinline__ float __fmul_ru (float a, float b) | |
| { | | { | |
| unsigned long long product; | | unsigned long long product; | |
| unsigned int expo_x, expo_y; | | unsigned int expo_x, expo_y; | |
| unsigned int xxi, yyi; | | unsigned int xxi, yyi; | |
| | | | |
| xxi = __float_as_int(a); | | xxi = __float_as_int(a); | |
| yyi = __float_as_int(b); | | yyi = __float_as_int(b); | |
| | | | |
| expo_y = 0xFF; | | expo_y = 0xFF; | |
| expo_x = expo_y & (xxi >> 23); | | expo_x = expo_y & (xxi >> 23); | |
| | | | |
| skipping to change at line 1260 | | skipping to change at line 1360 | |
| xxi += (yyi && !expo_y); | | xxi += (yyi && !expo_y); | |
| xxi = (xxi >> expo_x); | | xxi = (xxi >> expo_x); | |
| if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0; | | if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0; | |
| return __int_as_float(expo_y | xxi); | | return __int_as_float(expo_y | xxi); | |
| } | | } | |
| } else { | | } else { | |
| return a * b; | | return a * b; | |
| } | | } | |
| } | | } | |
| | | | |
|
| __device_func__(float __fmul_rd (float a, float b)) | | static __forceinline__ float __fmul_rd (float a, float b) | |
| { | | { | |
| unsigned long long product; | | unsigned long long product; | |
| unsigned int expo_x, expo_y; | | unsigned int expo_x, expo_y; | |
| unsigned int xxi, yyi; | | unsigned int xxi, yyi; | |
| | | | |
| xxi = __float_as_int(a); | | xxi = __float_as_int(a); | |
| yyi = __float_as_int(b); | | yyi = __float_as_int(b); | |
| | | | |
| expo_y = 0xFF; | | expo_y = 0xFF; | |
| expo_x = expo_y & (xxi >> 23); | | expo_x = expo_y & (xxi >> 23); | |
| | | | |
| skipping to change at line 1318 | | skipping to change at line 1418 | |
| xxi += (yyi && expo_y); | | xxi += (yyi && expo_y); | |
| xxi = (xxi >> expo_x); | | xxi = (xxi >> expo_x); | |
| if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0; | | if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0; | |
| return __int_as_float(expo_y | xxi); | | return __int_as_float(expo_y | xxi); | |
| } | | } | |
| } else { | | } else { | |
| return a * b; | | return a * b; | |
| } | | } | |
| } | | } | |
| | | | |
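Directed-rounded multiplication is what makes float interval arithmetic sound: a product enclosure keeps its guarantee only if the lower bound is computed rounding toward -inf and the upper bound toward +inf. A device-side sketch for intervals known to be positive (the general case takes min/max over all four corner products; the interval layout here is ours, for illustration):

    /* enclosure of [ax,bx] * [ay,by], all endpoints > 0: the true
       product interval is contained in [*lo, *hi] by construction */
    __device__ void imul_pos(float ax, float bx, float ay, float by,
                             float *lo, float *hi)
    {
        *lo = __fmul_rd(ax, ay);   /* round toward -inf */
        *hi = __fmul_ru(bx, by);   /* round toward +inf */
    }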
|
| __device_func__(float __fmaf_rn (float a, float b, float c)) | | static __forceinline__ float __fmaf_rn (float a, float b, float c) | |
| { | | { | |
| unsigned long long product; | | unsigned long long product; | |
| unsigned int xx, yy, zz, ww; | | unsigned int xx, yy, zz, ww; | |
| unsigned int temp, s, u; | | unsigned int temp, s, u; | |
| unsigned int expo_x, expo_y, expo_z; | | unsigned int expo_x, expo_y, expo_z; | |
| | | | |
| xx = __float_as_int(a); | | xx = __float_as_int(a); | |
| yy = __float_as_int(b); | | yy = __float_as_int(b); | |
| zz = __float_as_int(c); | | zz = __float_as_int(c); | |
| | | | |
| | | | |
| skipping to change at line 1593 | | skipping to change at line 1693 | |
| xx += (temp >= 0x80000000); | | xx += (temp >= 0x80000000); | |
| if (xx >= 0x01000000) { | | if (xx >= 0x01000000) { | |
| xx = xx >> 1; | | xx = xx >> 1; | |
| expo_x--; | | expo_x--; | |
| } | | } | |
| if (expo_x > 0) xx = 0; | | if (expo_x > 0) xx = 0; | |
| xx = expo_y | xx; | | xx = expo_y | xx; | |
| return __int_as_float(xx); | | return __int_as_float(xx); | |
| } | | } | |
| | | | |
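Because __fmaf_rn rounds only once, it can recover the exact rounding error of a float product: with p the rounded product, e = __fmaf_rn(a, b, -p) satisfies a*b = p + e exactly as long as no overflow or underflow intervenes (the classic twoProd transformation). A device-side sketch:

    /* error-free transformation: a*b == *p + *e exactly, barring
       overflow/underflow, thanks to the single rounding in FMA */
    __device__ void two_prod(float a, float b, float *p, float *e)
    {
        *p = a * b;                   /* rounded product         */
        *e = __fmaf_rn(a, b, -*p);    /* exact residual a*b - *p */
    }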
|
| __device_func__(float __fmaf_rz (float a, float b, float c)) | | static __forceinline__ float __fmaf_rz (float a, float b, float c) | |
| { | | { | |
| unsigned long long product; | | unsigned long long product; | |
| unsigned int xx, yy, zz, ww; | | unsigned int xx, yy, zz, ww; | |
| unsigned int temp, s, u; | | unsigned int temp, s, u; | |
| unsigned int expo_x, expo_y, expo_z; | | unsigned int expo_x, expo_y, expo_z; | |
| | | | |
| xx = __float_as_int(a); | | xx = __float_as_int(a); | |
| yy = __float_as_int(b); | | yy = __float_as_int(b); | |
| zz = __float_as_int(c); | | zz = __float_as_int(c); | |
| | | | |
| | | | |
| skipping to change at line 1857 | | skipping to change at line 1957 | |
| return __int_as_float(xx); | | return __int_as_float(xx); | |
| } else if ((int)expo_x >= 126) { | | } else if ((int)expo_x >= 126) { | |
| /* overflow */ | | /* overflow */ | |
| xx = expo_y | 0x7f7fffff; | | xx = expo_y | 0x7f7fffff; | |
| return __int_as_float(xx); | | return __int_as_float(xx); | |
| } | | } | |
| /* subnormal */ | | /* subnormal */ | |
| return __int_as_float(expo_y); | | return __int_as_float(expo_y); | |
| } | | } | |
| | | | |
|
| __device_func__(float __fmaf_ru (float a, float b, float c)) | | static __forceinline__ float __fmaf_ru (float a, float b, float c) | |
| { | | { | |
| unsigned long long product; | | unsigned long long product; | |
| unsigned int xx, yy, zz, ww; | | unsigned int xx, yy, zz, ww; | |
| unsigned int temp, s, u; | | unsigned int temp, s, u; | |
| unsigned int expo_x, expo_y, expo_z; | | unsigned int expo_x, expo_y, expo_z; | |
| | | | |
| xx = __float_as_int(a); | | xx = __float_as_int(a); | |
| yy = __float_as_int(b); | | yy = __float_as_int(b); | |
| zz = __float_as_int(c); | | zz = __float_as_int(c); | |
| | | | |
| | | | |
| skipping to change at line 2126 | | skipping to change at line 2226 | |
| return __int_as_float(xx); | | return __int_as_float(xx); | |
| } | | } | |
| /* subnormal */ | | /* subnormal */ | |
| expo_x = ((unsigned int)-((int)expo_x)); | | expo_x = ((unsigned int)-((int)expo_x)); | |
| xx += (temp && !expo_y); | | xx += (temp && !expo_y); | |
| xx = (xx >> expo_x); | | xx = (xx >> expo_x); | |
| if ((expo_x > 25) || (xx != 0x00800000)) xx = 0; | | if ((expo_x > 25) || (xx != 0x00800000)) xx = 0; | |
| return __int_as_float(expo_y | xx); | | return __int_as_float(expo_y | xx); | |
| } | | } | |
| | | | |
|
| __device_func__(float __fmaf_rd (float a, float b, float c)) | | static __forceinline__ float __fmaf_rd (float a, float b, float c) | |
| { | | { | |
| unsigned long long product; | | unsigned long long product; | |
| unsigned int xx, yy, zz, ww; | | unsigned int xx, yy, zz, ww; | |
| unsigned int temp, s, u; | | unsigned int temp, s, u; | |
| unsigned int expo_x, expo_y, expo_z; | | unsigned int expo_x, expo_y, expo_z; | |
| | | | |
| xx = __float_as_int(a); | | xx = __float_as_int(a); | |
| yy = __float_as_int(b); | | yy = __float_as_int(b); | |
| zz = __float_as_int(c); | | zz = __float_as_int(c); | |
| | | | |
| | | | |
| skipping to change at line 2395 | | skipping to change at line 2495 | |
| return __int_as_float(xx); | | return __int_as_float(xx); | |
| } | | } | |
| /* subnormal */ | | /* subnormal */ | |
| expo_x = ((unsigned int)-((int)expo_x)); | | expo_x = ((unsigned int)-((int)expo_x)); | |
| xx += (temp && expo_y); | | xx += (temp && expo_y); | |
| xx = (xx >> expo_x); | | xx = (xx >> expo_x); | |
| if ((expo_x > 25) || (xx != 0x00800000)) xx = 0; | | if ((expo_x > 25) || (xx != 0x00800000)) xx = 0; | |
| return __int_as_float(expo_y | xx); | | return __int_as_float(expo_y | xx); | |
| } | | } | |
| | | | |
|
| #endif /* __CUDA_ARCH__ < 200 */ | | static __forceinline__ int __clz(int a) | |
| | | | |
| #else /* defined(__CUDABE__) */ | | | |
| | | | |
| #include "common_types.h" | | | |
| | | | |
| static __device__ const unsigned char __internal_rcpTab[128] = | | | |
| { | | | |
| 0xff, 0xfd, 0xfb, 0xf9, 0xf7, 0xf5, 0xf4, 0xf2, | | | |
| 0xf0, 0xee, 0xed, 0xeb, 0xe9, 0xe8, 0xe6, 0xe4, | | | |
| 0xe3, 0xe1, 0xe0, 0xde, 0xdd, 0xdb, 0xda, 0xd8, | | | |
| 0xd7, 0xd5, 0xd4, 0xd3, 0xd1, 0xd0, 0xcf, 0xcd, | | | |
| 0xcc, 0xcb, 0xca, 0xc8, 0xc7, 0xc6, 0xc5, 0xc4, | | | |
| 0xc2, 0xc1, 0xc0, 0xbf, 0xbe, 0xbd, 0xbc, 0xbb, | | | |
| 0xba, 0xb9, 0xb8, 0xb7, 0xb6, 0xb5, 0xb4, 0xb3, | | | |
| 0xb2, 0xb1, 0xb0, 0xaf, 0xae, 0xad, 0xac, 0xab, | | | |
| 0xaa, 0xa9, 0xa8, 0xa8, 0xa7, 0xa6, 0xa5, 0xa4, | | | |
| 0xa3, 0xa3, 0xa2, 0xa1, 0xa0, 0x9f, 0x9f, 0x9e, | | | |
| 0x9d, 0x9c, 0x9c, 0x9b, 0x9a, 0x99, 0x99, 0x98, | | | |
| 0x97, 0x97, 0x96, 0x95, 0x95, 0x94, 0x93, 0x93, | | | |
| 0x92, 0x91, 0x91, 0x90, 0x8f, 0x8f, 0x8e, 0x8e, | | | |
| 0x8d, 0x8c, 0x8c, 0x8b, 0x8b, 0x8a, 0x89, 0x89, | | | |
| 0x88, 0x88, 0x87, 0x87, 0x86, 0x85, 0x85, 0x84, | | | |
| 0x84, 0x83, 0x83, 0x82, 0x82, 0x81, 0x81, 0x80 | | | |
| }; | | | |
| | | | |
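The 128 entries of __internal_rcpTab look machine-generated by a midpoint rule: entry i is the integer nearest to 2^15/(128 + i + 1/2), i.e. a scaled reciprocal of the midpoint of the i-th mantissa bucket, landing in [0x80, 0xff]. That construction is inferred from the values, not documented in the header, but this host-side sketch reproduces the table above, doubled entries (such as 0xa8 and 0xa3) included:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        /* conjectured generator: round(2^15 / (128 + i + 0.5)) */
        for (int i = 0; i < 128; i++) {
            unsigned v = (unsigned)floor(32768.0 / (128.5 + i) + 0.5);
            printf("0x%02x%s", v, (i % 8 == 7) ? ",\n" : ", ");
        }
        return 0;
    }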
| static __device__ const unsigned int __internal_invSqrtCubeTab[96] = | | | |
| { | | | |
| 0xfa0bf8fe, 0xee6b28fa, 0xe5f024f7, 0xdaf268f3, | | | |
| 0xd2f000f0, 0xc890c0ec, 0xc10378e9, 0xb9a758e6, | | | |
| 0xb4da40e4, 0xadcea0e1, 0xa6f278de, 0xa279c0dc, | | | |
| 0x9beb48d9, 0x97a5c4d7, 0x916340d4, 0x8d4fc8d2, | | | |
| 0x895000d0, 0x8563b8ce, 0x818ac0cc, 0x7dc4e8ca, | | | |
| 0x7a1200c8, 0x7671d8c6, 0x72e440c4, 0x6f6908c2, | | | |
| 0x6db240c1, 0x6a523cbf, 0x670424bd, 0x6563c0bc, | | | |
| 0x623028ba, 0x609ce8b9, 0x5d8364b7, 0x5bfd18b6, | | | |
| 0x58fd40b4, 0x5783a8b3, 0x560e48b2, 0x533000b0, | | | |
| 0x51c70caf, 0x506238ae, 0x4da4c0ac, 0x4c4c10ab, | | | |
| 0x4af768aa, 0x49a6b8a9, 0x485a00a8, 0x471134a7, | | | |
| 0x45cc58a6, 0x434e40a4, 0x4214f8a3, 0x40df88a2, | | | |
| 0x3fade0a1, 0x3e8000a0, 0x3d55dc9f, 0x3c2f789e, | | | |
| 0x3c2f789e, 0x3b0cc49d, 0x39edc09c, 0x38d2609b, | | | |
| 0x37baa89a, 0x36a68899, 0x35960098, 0x34890497, | | | |
| 0x34890497, 0x337f9896, 0x3279ac95, 0x31774094, | | | |
| 0x30784893, 0x30784893, 0x2f7cc892, 0x2e84b091, | | | |
| 0x2d900090, 0x2d900090, 0x2c9eac8f, 0x2bb0b88e, | | | |
| 0x2bb0b88e, 0x2ac6148d, 0x29dec08c, 0x29dec08c, | | | |
| 0x28fab08b, 0x2819e88a, 0x2819e88a, 0x273c5889, | | | |
| 0x273c5889, 0x26620088, 0x258ad487, 0x258ad487, | | | |
| 0x24b6d886, 0x24b6d886, 0x23e5fc85, 0x23184084, | | | |
| 0x23184084, 0x224d9883, 0x224d9883, 0x21860882, | | | |
| 0x21860882, 0x20c18081, 0x20c18081, 0x20000080 | | | |
| }; | | | |
| | | | |
| __device_func__(float __internal_frcp_kernel (float x,enum cudaRoundMode mode)) | | | |
| { | | | |
| unsigned long long prod; | | | |
| volatile union __cudart_FloatUintCvt arg; | | | |
| unsigned int expo; | | | |
| unsigned int sign; | | | |
| unsigned f, y; | | | |
| | | | |
| arg.f = x; | | | |
| sign = arg.i & 0x80000000; | | | |
| expo = (arg.i >> 23); | | | |
| expo = expo & 0xff; | | | |
| f = expo - 1; | | | |
| | | | |
| if (f <= 0xFD) { | | | |
| y = (arg.i << 8); | | | |
| y = y | 0x80000000; | | | |
| /* initial approximation */ | | | |
| arg.i = __internal_rcpTab[(y >> 24) - 128]; | | | |
| /* first NR iteration */ | | | |
| f = arg.i * arg.i; | | | |
| f = f << 16; | | | |
| prod = ((unsigned long long)y) * f; | | | |
| arg.i = (arg.i << 24) - (unsigned)(prod >> 32); | | | |
| /* second NR iteration */ | | | |
| f = arg.i + arg.i; | | | |
| prod = ((unsigned long long)y) * f; | | | |
| f = (unsigned)(-(int)(prod >> 32)); | | | |
| prod = ((unsigned long long)arg.i) * f; | | | |
| y = y >> 8; | | | |
| /* compute exponent */ | | | |
| expo = (2 * 127) - expo - 2; | | | |
| arg.i = (unsigned)(prod >> 32); | | | |
| if (mode == cudaRoundNearest) { | | | |
| arg.i = arg.i >> 6; | | | |
| } else { | | | |
| arg.i = (arg.i + 32) >> 6; | | | |
| } | | | |
| if ((int)expo >= 0) { | | | |
| f = y * arg.i; | | | |
| arg.i = ((expo << 23) + arg.i) | sign; | | | |
| } else { | | | |
| /* result is a denormal */ | | | |
| expo = -(int)expo; | | | |
| arg.i = arg.i >> expo; | | | |
| f = y * arg.i; | | | |
| arg.i = arg.i | sign; | | | |
| } | | | |
| if (mode == cudaRoundNearest) { | | | |
| expo = f + y; | | | |
| if ((int)f < 0) f = (unsigned)(-(int)f); | | | |
| if ((int)expo < 0) expo = (unsigned)(-(int)expo); | | | |
| if (expo < f) arg.i++; | | | |
| } else if (mode == cudaRoundZero) { | | | |
| if ((int)f > 0) arg.i = arg.i - 1; | | | |
| } else if (mode == cudaRoundPosInf) { | | | |
| if (((int)f > 0) && sign) arg.i = arg.i - 1; | | | |
| if (((int)f < 0) && !sign) arg.i = arg.i + 1; | | | |
| } else { /* mode == cudaRoundMinInf */ | | | |
| if (((int)f > 0) && !sign) arg.i = arg.i - 1; | | | |
| if (((int)f < 0) && sign) arg.i = arg.i + 1; | | | |
| } | | | |
| return arg.f; | | | |
| } else { | | | |
| /* zero returns infinity. Must handle negative zero as well */ | | | |
| if (!(arg.i << 1)) { | | | |
| arg.i = 0x7F800000 | arg.i; | | | |
| return arg.f; | | | |
| } | | | |
| /* infinity returns zero of like sign */ | | | |
| if ((arg.i << 1) == 0xff000000) { | | | |
| arg.i &= 0x80000000; | | | |
| return arg.f; | | | |
| } | | | |
| /* convert SNaNs to QNaNs */ | | | |
| if ((arg.i << 1) > 0xff000000) { | | | |
| arg.i |= 0x00400000; | | | |
| return arg.f; | | | |
| } | | | |
| /* denormals */ | | | |
| f = 0; | | | |
| arg.i <<= 8; | | | |
| do { | | | |
| f++; | | | |
| arg.i <<= 1; | | | |
| } while ((int)arg.i > 0); | | | |
| arg.i >>= 8; | | | |
| arg.i |= sign; | | | |
| arg.f = __internal_frcp_kernel (arg.f, mode); | | | |
| expo = ((arg.i << 1) >> 24); | | | |
| if ((expo + f) < 255) { | | | |
| arg.i = (arg.i + (f << 23)); | | | |
| return arg.f; | | | |
| } | | | |
| if (mode == cudaRoundNearest) { | | | |
| arg.i = (arg.i & 0x80000000) | 0x7f800000; | | | |
| } else if (mode == cudaRoundZero) { | | | |
| arg.i = (arg.i & 0x80000000) | 0x7f7fffff; | | | |
| } else if (mode == cudaRoundPosInf) { | | | |
| arg.i = (arg.i & 0x80000000) | ((sign) ? 0x7f7fffff : 0x7f800000); | | | |
| } else { /* mode == cudaRoundMinInf */ | | | |
| arg.i = (arg.i & 0x80000000) | ((sign) ? 0x7f800000 : 0x7f7fffff); | | | |
| } | | | |
| return arg.f; | | | |
| } | | | |
| } | | | |
| | | | |
| __device_func__(float __internal_fsqrt_kernel (float radicand, | | | |
| enum cudaRoundMode mode)) | | | |
| { | | | |
| unsigned long long prod; | | | |
| volatile union __cudart_FloatUintCvt arg; | | | |
| unsigned int expo; | | | |
| unsigned int s, f, x; | | | |
| | | | |
| arg.f = radicand; | | | |
| expo = arg.i >> 23; | | | |
| expo = expo & 0xff; | | | |
| f = expo - 1; | | | |
| | | | |
| if ((arg.i <= 0x80000000) && (f <= 0xFD)) { | | | |
| /* normalize input argument */ | | | |
| x = (arg.i << 8) | 0x80000000; | | | |
| x = x >> (expo & 1); | | | |
| /* initial approximation */ | | | |
| arg.i = f = __internal_invSqrtCubeTab[((unsigned)x >> 25) - 32]; | | | |
| /* first NR iteration */ | | | |
| prod = ((unsigned long long)x) * f; | | | |
| arg.i = ((arg.i * 3) << 22) - (unsigned)(prod >> 32); | | | |
| /* second NR iteration */ | | | |
| prod = ((unsigned long long)arg.i) * arg.i; | | | |
| s = (unsigned)(prod >> 32); | | | |
| prod = ((unsigned long long)x) * s; | | | |
| f = 0x30000000 - (unsigned)(prod >> 32); | | | |
| prod = ((unsigned long long)f) * arg.i; | | | |
| arg.i = (unsigned)(prod >> 32); | | | |
| /* compute sqrt(x) as x * 1/sqrt(x) */ | | | |
| prod = ((unsigned long long)x) * arg.i; | | | |
| arg.i = (unsigned)(prod >> 32); | | | |
| if (mode == cudaRoundNearest) { | | | |
| arg.i = arg.i >> 3; | | | |
| } else { | | | |
| arg.i = (arg.i + 4) >> 3; | | | |
| } | | | |
| x = (x << 16) - (arg.i * arg.i); | | | |
| /* round to nearest based on remainder; tie case impossible */ | | | |
| if (mode == cudaRoundNearest) { | | | |
| f = x - (2 * arg.i + 1); | | | |
| if ((int)f < 0) f = (unsigned)(-(int)f); | | | |
| if ((int)x < 0) x = (unsigned)(-(int)x); | | | |
| if (f < x) arg.i ++; | | | |
| } else if ((mode == cudaRoundZero) || (mode == cudaRoundMinInf)) { | | | |
| if ((int)x < 0) arg.i--; | | | |
| } else if (mode == cudaRoundPosInf) { | | | |
| if ((int)x > 0) arg.i++; | | | |
| } | | | |
| arg.i = arg.i + (((expo + 125) & ~0x1) << 22); | | | |
| return arg.f; | | | |
| } else { | | | |
| /* if zero, or positive infinity, return argument */ | | | |
| if (!(arg.i << 1) || (arg.i == 0x7F800000)) { | | | |
| return arg.f; | | | |
| } | | | |
| /* if NaN, return argument, possibly converted to QNaN */ | | | |
| if ((arg.i << 1) > 0xFF000000) { | | | |
| arg.i |= 0x00400000; | | | |
| return arg.f; | | | |
| } | | | |
| /* if negative, return NaN: INDEFINITE */ | | | |
| if (arg.i & 0x80000000) { | | | |
| arg.i = 0xFFC00000; | | | |
| return arg.f; | | | |
| } | | | |
| /* denormal, normalize it before computing square root */ | | | |
| x = 0; | | | |
| arg.i <<= 8; | | | |
| do { | | | |
| x++; | | | |
| arg.i <<= 1; | | | |
| } while ((int)arg.i > 0); | | | |
| arg.i >>= 8; | | | |
| arg.i += (x & 1) << 23; | | | |
| x += (x & 1); | | | |
| arg.f = __internal_fsqrt_kernel (arg.f, mode); | | | |
| arg.i -= ((x >> 1) << 23); | | | |
| return arg.f; | | | |
| } | | | |
| } | | | |
| | | | |
| __device_func__(float __internal_fdiv_kernel (float dividend, float divisor, | | | |
| enum cudaRoundMode mode)) | | | |
| { | | | |
| unsigned long long prod; | | | |
| unsigned r, f, x, y, expox, expoy, sign; | | | |
| volatile union __cudart_FloatUintCvt cvtx, cvty, res; | | | |
| | | | |
| cvtx.f = dividend; | | | |
| cvty.f = divisor; | | | |
| expox = ((cvtx.i >> 23) & 0xff) - 1; | | | |
| expoy = ((cvty.i >> 23) & 0xff) - 1; | | | |
| sign = ((cvtx.i ^ cvty.i) & 0x80000000); | | | |
| | | | |
| if ((expox <= 0xFD) && (expoy <= 0xFD)) { | | | |
| divide: | | | |
| expox = expox - expoy + 127 - 1; | | | |
| expoy = expox; | | | |
| /* extract mantissas */ | | | |
| y = (cvty.i << 8) | 0x80000000; | | | |
| x = (cvtx.i & 0x00ffffff) | 0x00800000; | | | |
| /* initial approximation */ | | | |
| r = __internal_rcpTab[(y >> 24) - 128]; | | | |
| /* first NR iteration */ | | | |
| f = r * r; | | | |
| prod = ((unsigned long long)y) * (f << 16); | | | |
| r = (r << 24) - (unsigned)(prod >> 32); | | | |
| /* second NR iteration */ | | | |
| prod = ((unsigned long long)y) * (r << 1); | | | |
| f = (unsigned)-(int)(prod >> 32); | | | |
| prod = ((unsigned long long)f) * (r << 1); | | | |
| r = (unsigned)(prod >> 32); | | | |
| /* produce quotient */ | | | |
| prod = ((unsigned long long)x) * (r << 1); | | | |
| /* normalize mantissa */ | | | |
| if (((int)((prod >> 32) << 8)) > 0) { | | | |
| expox--; | | | |
| prod = prod + prod; | | | |
| } | | | |
| if (mode == cudaRoundNearest) { | | | |
| /* preliminary mantissa */ | | | |
| r = (unsigned)(prod >> 32); | | | |
| y = y >> 8; | | | |
| /* result is a normal */ | | | |
| if (expox <= 0xFD) { | | | |
| int rem0, rem1, inc; | | | |
| /* round mantissa to nearest even */ | | | |
| prod = ((unsigned long long)y) * r; | | | |
| x = x << (23 + ((prod >> 32) >> 15)); | | | |
| rem1 = x - (unsigned)(prod & 0xffffffff); | | | |
| rem0 = rem1 - y; | | | |
| inc = abs(rem0) < abs(rem1); | | | |
| /* merge sign, mantissa, exponent for final result */ | | | |
| res.i = sign | ((expox << 23) + r + inc); | | | |
| return res.f; | | | |
| } else if ((int)expox >= 254) { | | | |
| /* overflow: return infinity */ | | | |
| res.i = sign | 0x7f800000; | | | |
| return res.f; | | | |
| } else { | | | |
| /* underflow: result is zero, denormal, or smallest normal */ | | | |
| int shift = -(int)expox; | | | |
| if (shift > 23) { | | | |
| /* result is zero or smallest denormal */ | | | |
| r = (shift < 25) && ((x != y) || (r > 0x00ff0000)); | | | |
| res.i = sign | r; | | | |
| return res.f; | | | |
| } | | | |
| if (x == y) { | | | |
| /* result is denormal */ | | | |
| shift = -(int)expoy; | | | |
| r = 0x00800000 >> shift; | | | |
| res.i = sign | r; | | | |
| return res.f; | | | |
| } | | | |
| { | | | |
| unsigned long long tempx; | | | |
| long long remlo, remhi; | | | |
| /* result is denormal or smallest normal */ | | | |
| r = r >> shift; | | | |
| prod = ((unsigned long long)y) * r; | | | |
| tempx = ((unsigned long long)x) << (23 - shift); | | | |
| remlo = 2 * tempx - 2 * prod - y; | | | |
| remhi = remlo + 2 * tempx; | | | |
| if (remlo < 0) remlo = -remlo; | | | |
| if (remhi < 0) remhi = -remhi; | | | |
| if (remhi < remlo) tempx = 2 * tempx; | | | |
| remlo = tempx - prod; | | | |
| remhi = remlo - y; | | | |
| if (remlo < 0) remlo = -remlo; | | | |
| if (remhi < 0) remhi = -remhi; | | | |
| if ((remhi < remlo) || ((remhi == remlo) && (r & 1))) r++; | | | |
| res.i = sign | r; | | | |
| return res.f; | | | |
| } | | | |
| } | | | |
| } else if (mode == cudaRoundZero) { | | | |
| /* preliminary mantissa */ | | | |
| prod += 0x0000000080000000ULL; | | | |
| r = (unsigned)(prod >> 32); | | | |
| y = y >> 8; | | | |
| /* result is a normal */ | | | |
| if (expox <= 0xFD) { | | | |
| int rem1; | | | |
| prod = ((unsigned long long)y) * r; | | | |
| x = x << (23 + ((prod >> 32) >> 15)); | | | |
| rem1 = x - (unsigned)(prod & 0xffffffff); | | | |
| if (rem1 < 0) r--; | | | |
| r = (expox << 23) + r; | | | |
| if (r == 0x7f800000) r = 0x7f7fffff; | | | |
| res.i = sign | r; | | | |
| return res.f; | | | |
| } else if ((int)expox >= 254) { | | | |
| /* overflow: return largest normal */ | | | |
| res.i = sign | 0x7f7fffff; | | | |
| return res.f; | | | |
| } else { | | | |
| /* underflow: result is zero, denormal, or smallest normal */ | | | |
| int shift = -(int)expox; | | | |
| if ((x == y) && (shift < 31)) { | | | |
| shift = -(int)expoy; | | | |
| r = 0x00800000 >> shift; | | | |
| res.i = sign | r; | | | |
| return res.f; | | | |
| } | | | |
| if (shift > 23) { | | | |
| r = 0; | | | |
| res.i = sign | r; | | | |
| return res.f; | | | |
| } | | | |
| { | | | |
| unsigned long long tempx; | | | |
| long long remlo, remhi; | | | |
| /* result is denormal or smallest normal */ | | | |
| r = r >> shift; | | | |
| prod = ((unsigned long long)y) * r; | | | |
| tempx = ((unsigned long long)x) << (23 - shift); | | | |
| remlo = 2 * tempx - 2 * prod - y; | | | |
| remhi = remlo + 2 * tempx; | | | |
| if (remlo < 0) remlo = -remlo; | | | |
| if (remhi < 0) remhi = -remhi; | | | |
| if (remhi < remlo) tempx = 2 * tempx; | | | |
| remlo = tempx - prod; | | | |
| if ((remlo < 0) & (r != 0)) r--; | | | |
| res.i = sign | r; | | | |
| return res.f; | | | |
| } | | | |
| } | | | |
| } else if (mode == cudaRoundPosInf) { | | | |
| /* preliminary mantissa */ | | | |
| prod += 0x0000000080000000ULL; | | | |
| r = (unsigned)(prod >> 32); | | | |
| y = y >> 8; | | | |
| /* result is a normal */ | | | |
| if (expox <= 0xFD) { | | | |
| int rem1; | | | |
| prod = ((unsigned long long)y) * r; | | | |
| x = x << (23 + ((prod >> 32) >> 15)); | | | |
| rem1 = x - (unsigned)(prod & 0xffffffff); | | | |
| if ((rem1 < 0) && (sign)) r--; | | | |
| if ((rem1 > 0) && (!sign)) r++; | | | |
| r = (expox << 23) + r; | | | |
| if ((r == 0x7f800000) && (sign)) r = 0x7f7fffff; | | | |
| res.i = sign | r; | | | |
| return res.f; | | | |
| } else if ((int)expox >= 254) { | | | |
| /* overflow: return largest normal, or infinity */ | | | |
| r = sign ? 0x7f7fffff : 0x7f800000; | | | |
| res.i = sign | r; | | | |
| return res.f; | | | |
| } else { | | | |
| /* underflow: result is zero, denormal, or smallest normal */ | | | |
| int shift = -(int)expox; | | | |
| if ((x == y) && (shift <= 24)) { | | | |
| shift = -(int)expoy; | | | |
| r = 0x00800000 >> shift; | | | |
| if (r == 0) r = !sign; | | | |
| res.i = sign | r; | | | |
| return res.f; | | | |
| } | | | |
| if (shift > 23) { | | | |
| r = !sign; | | | |
| res.i = sign | r; | | | |
| return res.f; | | | |
| } | | | |
| { | | | |
| unsigned long long tempx; | | | |
| long long remlo, remhi; | | | |
| /* result is denormal or smallest normal */ | | | |
| r = r >> shift; | | | |
| prod = ((unsigned long long)y) * r; | | | |
| tempx = ((unsigned long long)x) << (23 - shift); | | | |
| remlo = 2 * tempx - 2 * prod - y; | | | |
| remhi = remlo + 2 * tempx; | | | |
| if (remlo < 0) remlo = -remlo; | | | |
| if (remhi < 0) remhi = -remhi; | | | |
| if (remhi < remlo) tempx = 2 * tempx; | | | |
| remlo = tempx - prod; | | | |
| if ((remlo < 0) && (r != 0) && (sign)) r--; | | | |
| if ((remlo > 0) && (!sign)) r++; | | | |
| res.i = sign | r; | | | |
| return res.f; | | | |
| } | | | |
| } | | | |
| } else if (mode == cudaRoundMinInf) { | | | |
| /* preliminary mantissa */ | | | |
| prod += 0x0000000080000000ULL; | | | |
| r = (unsigned)(prod >> 32); | | | |
| y = y >> 8; | | | |
| /* result is a normal */ | | | |
| if (expox <= 0xFD) { | | | |
| int rem1; | | | |
| prod = ((unsigned long long)y) * r; | | | |
| x = x << (23 + ((prod >> 32) >> 15)); | | | |
| rem1 = x - (unsigned)(prod & 0xffffffff); | | | |
| if ((rem1 < 0) && (!sign)) r--; | | | |
| if ((rem1 > 0) && (sign)) r++; | | | |
| r = (expox << 23) + r; | | | |
| if ((r == 0x7f800000) && (!sign)) r = 0x7f7fffff; | | | |
| res.i = sign | r; | | | |
| return res.f; | | | |
| } else if ((int)expox >= 254) { | | | |
| /* overflow: return largest normal, or infinity */ | | | |
| r = sign ? 0x7f800000 : 0x7f7fffff; | | | |
| res.i = sign | r; | | | |
| return res.f; | | | |
| } else { | | | |
| /* underflow: result is zero, denormal, or smallest normal */ | | | |
| int shift = -(int)expox; | | | |
| if ((x == y) && (shift <= 24)) { | | | |
| shift = -(int)expoy; | | | |
| r = 0x00800000 >> shift; | | | |
| if (r == 0) r = !!sign; | | | |
| res.i = sign | r; | | | |
| return res.f; | | | |
| } | | | |
| if (shift > 23) { | | | |
| r = !!sign; | | | |
| res.i = sign | r; | | | |
| return res.f; | | | |
| } | | | |
| { | | | |
| unsigned long long tempx; | | | |
| long long remlo, remhi; | | | |
| /* result is denormal or smallest normal */ | | | |
| r = r >> shift; | | | |
| prod = ((unsigned long long)y) * r; | | | |
| tempx = ((unsigned long long)x) << (23 - shift); | | | |
| remlo = 2 * tempx - 2 * prod - y; | | | |
| remhi = remlo + 2 * tempx; | | | |
| if (remlo < 0) remlo = -remlo; | | | |
| if (remhi < 0) remhi = -remhi; | | | |
| if (remhi < remlo) tempx = 2 * tempx; | | | |
| remlo = tempx - prod; | | | |
| if ((remlo < 0) && (r != 0) && (!sign)) r--; | | | |
| if ((remlo > 0) && (sign)) r++; | | | |
| res.i = sign | r; | | | |
| return res.f; | | | |
| } | | | |
| } | | | |
| } | | | |
| } | | | |
| { | | | |
| int xzero, yzero, xinf, yinf, xnan, ynan; | | | |
| | | | |
| xnan = (cvtx.i << 1) > 0xff000000; | | | |
| ynan = (cvty.i << 1) > 0xff000000; | | | |
| /* handle NaNs. Convert SNaNs to QNaNs */ | | | |
| if (xnan) { | | | |
| res.i = cvtx.i | 0x00400000; | | | |
| return res.f; | | | |
| } | | | |
| if (ynan) { | | | |
| res.i = cvty.i | 0x00400000; | | | |
| return res.f; | | | |
| } | | | |
| xzero = (cvtx.i << 1) == 0x00000000; | | | |
| yzero = (cvty.i << 1) == 0x00000000; | | | |
| xinf = (cvtx.i << 1) == 0xff000000; | | | |
| yinf = (cvty.i << 1) == 0xff000000; | | | |
| /* 0/0 and INF/INF are invalid operations. Return INDEFINITE */ | | | |
| if ((xzero & yzero) | (xinf & yinf)) { | | | |
| res.i = 0xffc00000; | | | |
| return res.f; | | | |
| } | | | |
| /* x/INF and 0/y -> 0 */ | | | |
| if (xzero | yinf) { | | | |
| res.i = sign; | | | |
| return res.f; | | | |
| } | | | |
| /* x/0 and INF/y -> INF */ | | | |
| if (yzero | xinf) { | | | |
| res.i = sign | 0x7f800000; | | | |
| return res.f; | | | |
| } | | | |
| /* normalize denormals */ | | | |
| if ((int)expox < 0) { | | | |
| cvtx.i = cvtx.i << 9; | | | |
| while ((int)cvtx.i >= 0) { | | | |
| expox--; | | | |
| cvtx.i = cvtx.i + cvtx.i; | | | |
| } | | | |
| cvtx.i = cvtx.i >> 8; | | | |
| } | | | |
| if ((int)expoy < 0) { | | | |
| cvty.i = cvty.i << 9; | | | |
| while ((int)cvty.i >= 0) { | | | |
| expoy--; | | | |
| cvty.i = cvty.i + cvty.i; | | | |
| } | | | |
| cvty.i = cvty.i >> 8; | | | |
| } | | | |
| goto divide; | | | |
| } | | | |
| } | | | |
| | | | |
| __device_func__(float __internal_fmul_kernel (float a, float b, | | | |
| enum cudaRoundMode mode)) | | | |
| { | | | |
| unsigned long long product; | | | |
| volatile union __cudart_FloatUintCvt xx, yy; | | | |
| unsigned expo_x, expo_y; | | | |
| | | | |
| xx.f = a; | | | |
| yy.f = b; | | | |
| | | | |
| expo_y = 0xFF; | | | |
| expo_x = expo_y & (xx.i >> 23); | | | |
| expo_x = expo_x - 1; | | | |
| expo_y = expo_y & (yy.i >> 23); | | | |
| expo_y = expo_y - 1; | | | |
| | | | |
| if ((expo_x <= 0xFD) && | | | |
| (expo_y <= 0xFD)) { | | | |
| multiply: | | | |
| expo_x = expo_x + expo_y; | | | |
| expo_y = xx.i ^ yy.i; | | | |
| xx.i = xx.i & 0x00ffffff; | | | |
| yy.i = yy.i << 8; | | | |
| xx.i = xx.i | 0x00800000; | | | |
| yy.i = yy.i | 0x80000000; | | | |
| /* compute product */ | | | |
| product = ((unsigned long long)xx.i) * yy.i; | | | |
| expo_x = expo_x - 127 + 2; | | | |
| expo_y = expo_y & 0x80000000; | | | |
| xx.i = (unsigned int)(product >> 32); | | | |
| yy.i = (unsigned int)(product & 0xffffffff); | | | |
| /* normalize mantissa */ | | | |
| if (xx.i < 0x00800000) { | | | |
| xx.i = (xx.i << 1) | (yy.i >> 31); | | | |
| yy.i = (yy.i << 1); | | | |
| expo_x--; | | | |
| } | | | |
| if (expo_x <= 0xFD) { | | | |
| xx.i = xx.i | expo_y; /* OR in sign bit */ | | | |
| xx.i = xx.i + (expo_x << 23); /* add in exponent */ | | | |
| /* round result to nearest or even */ | | | |
| if (mode == cudaRoundNearest) { | | | |
| if (yy.i < 0x80000000) return xx.f; | | | |
| xx.i += ((yy.i == 0x80000000) ? (xx.i & 1) : (yy.i >> 31)); | | | |
| } else if (mode == cudaRoundZero) { | | | |
| } else if (mode == cudaRoundPosInf) { | | | |
| xx.i += (yy.i && !expo_y); | | | |
| } else if (mode == cudaRoundMinInf) { | | | |
| xx.i += (yy.i && expo_y); | | | |
| } | | | |
| return xx.f; | | | |
| } else if ((int)expo_x >= 254) { | | | |
| /* overflow: return infinity or largest normal */ | | | |
| if (mode == cudaRoundNearest) { | | | |
| xx.i = expo_y | 0x7F800000; | | | |
| } else if (mode == cudaRoundZero) { | | | |
| xx.i = expo_y | 0x7F7FFFFF; | | | |
| } else if (mode == cudaRoundPosInf) { | | | |
| xx.i = (expo_y ? 0xff7fffff : 0x7F800000); | | | |
| } else { /* (mode == cudaRoundMinInf) */ | | | |
| xx.i = (expo_y ? 0xFF800000 : 0x7f7fffff); | | | |
| } | | | |
| return xx.f; | | | |
| } else { | | | |
| /* zero, denormal, or smallest normal */ | | | |
| expo_x = ((unsigned int)-((int)expo_x)); | | | |
| if (mode == cudaRoundNearest) { | | | |
| if (expo_x > 25) { | | | |
| /* massive underflow: return 0 */ | | | |
| xx.i = expo_y; | | | |
| return xx.f; | | | |
| } else { | | | |
| yy.i = (xx.i << (32 - expo_x)) | ((yy.i) ? 1 : 0); | | | |
| xx.i = expo_y + (xx.i >> expo_x); | | | |
| xx.i += ((yy.i == 0x80000000) ? (xx.i & 1) : (yy.i >> 31)); | | | |
| return xx.f; | | | |
| } | | | |
| } else if (mode == cudaRoundZero) { | | | |
| if (expo_x > 25) expo_x = 25; | | | |
| xx.i = expo_y + (xx.i >> expo_x); | | | |
| return xx.f; | | | |
| } else if (mode == cudaRoundPosInf) { | | | |
| if (expo_x > 25) expo_x = 25; | | | |
| yy.i = (xx.i << (32 - expo_x)) | ((yy.i) ? 1 : 0); | | | |
| xx.i = expo_y + (xx.i >> expo_x); | | | |
| xx.i += (yy.i && !expo_y); | | | |
| return xx.f; | | | |
| } else { /* (mode == cudaRoundMinInf) */ | | | |
| if (expo_x > 25) expo_x = 25; | | | |
| yy.i = (xx.i << (32 - expo_x)) | ((yy.i) ? 1 : 0); | | | |
| xx.i = expo_y + (xx.i >> expo_x); | | | |
| xx.i += (yy.i && expo_y); | | | |
| return xx.f; | | | |
| } | | | |
| } | | | |
| } else { | | | |
| product = xx.i ^ yy.i; | | | |
| product = product & 0x80000000; | | | |
| if (!(xx.i & 0x7fffffff)) { | | | |
| if (expo_y != 254) { | | | |
| xx.i = (unsigned int)product; | | | |
| return xx.f; | | | |
| } | | | |
| expo_y = yy.i << 1; | | | |
| if (expo_y == 0xFF000000) { | | | |
| xx.i = expo_y | 0x00C00000; | | | |
| } else { | | | |
| xx.i = yy.i | 0x00400000; | | | |
| } | | | |
| return xx.f; | | | |
| } | | | |
| if (!(yy.i & 0x7fffffff)) { | | | |
| if (expo_x != 254) { | | | |
| xx.i = (unsigned int)product; | | | |
| return xx.f; | | | |
| } | | | |
| expo_x = xx.i << 1; | | | |
| if (expo_x == 0xFF000000) { | | | |
| xx.i = expo_x | 0x00C00000; | | | |
| } else { | | | |
| xx.i = xx.i | 0x00400000; | | | |
| } | | | |
| return xx.f; | | | |
| } | | | |
| if ((expo_y != 254) && (expo_x != 254)) { | | | |
| expo_y++; | | | |
| expo_x++; | | | |
| if (expo_x == 0) { | | | |
| expo_y |= xx.i & 0x80000000; | | | |
| /* | | | |
| * If both operands are denormals, we only need to normalize | | | |
| * one of them as the result will be either a denormal or zero. | | | |
| */ | | | |
| xx.i = xx.i << 8; | | | |
| while (!(xx.i & 0x80000000)) { | | | |
| xx.i <<= 1; | | | |
| expo_x--; | | | |
| } | | | |
| xx.i = (xx.i >> 8) | (expo_y & 0x80000000); | | | |
| expo_y &= ~0x80000000; | | | |
| expo_y--; | | | |
| goto multiply; | | | |
| } | | | |
| if (expo_y == 0) { | | | |
| expo_x |= yy.i & 0x80000000; | | | |
| yy.i = yy.i << 8; | | | |
| while (!(yy.i & 0x80000000)) { | | | |
| yy.i <<= 1; | | | |
| expo_y--; | | | |
| } | | | |
| yy.i = (yy.i >> 8) | (expo_x & 0x80000000); | | | |
| expo_x &= ~0x80000000; | | | |
| expo_x--; | | | |
| goto multiply; | | | |
| } | | | |
| } | | | |
| expo_x = xx.i << 1; | | | |
| expo_y = yy.i << 1; | | | |
| /* if x is NaN, return x */ | | | |
| if (expo_x > 0xFF000000) { | | | |
| /* cvt any SNaNs to QNaNs */ | | | |
| xx.i = xx.i | 0x00400000; | | | |
| return xx.f; | | | |
| } | | | |
| /* if y is NaN, return y */ | | | |
| if (expo_y > 0xFF000000) { | | | |
| /* cvt any SNaNs to QNaNs */ | | | |
| xx.i = yy.i | 0x00400000; | | | |
| return xx.f; | | | |
| } | | | |
| xx.i = (unsigned int)product | 0x7f800000; | | | |
| return xx.f; | | | |
| } | | | |
| } | | | |
| | | | |
| __device_func__(float __internal_fmaf_kernel (float a, float b, float c, | | | |
| enum cudaRoundMode mode)) | | | |
| { | | | |
| unsigned long long product; | | | |
| unsigned int xx, yy, zz, ww; | | | |
| unsigned int temp, s, u; | | | |
| unsigned int expo_x, expo_y, expo_z; | | | |
| volatile union __cudart_FloatUintCvt cvt; | | | |
| | | | |
| cvt.f = a; | | | |
| xx = cvt.i; | | | |
| cvt.f = b; | | | |
| yy = cvt.i; | | | |
| cvt.f = c; | | | |
| zz = cvt.i; | | | |
| | | | |
| temp = 0xff; | | | |
| expo_x = temp & (xx >> 23); | | | |
| expo_x = expo_x - 1; | | | |
| expo_y = temp & (yy >> 23); | | | |
| expo_y = expo_y - 1; | | | |
| expo_z = temp & (zz >> 23); | | | |
| expo_z = expo_z - 1; | | | |
| | | | |
| if (!((expo_x <= 0xFD) && | | | |
| (expo_y <= 0xFD) && | | | |
| (expo_z <= 0xFD))) { | | | |
| /* fmad (nan, y, z) --> nan | | | |
| fmad (x, nan, z) --> nan | | | |
| fmad (x, y, nan) --> nan | | | |
| */ | | | |
| if ((yy << 1) > 0xff000000) { | | | |
| return b + b; | | | |
| } | | | |
| if ((zz << 1) > 0xff000000) { | | | |
| return c + c; | | | |
| } | | | |
| if ((xx << 1) > 0xff000000) { | | | |
| return a + a; | | | |
| } | | | |
| /* fmad (0, inf, z) --> NaN | | | |
| fmad (inf, 0, z) --> NaN | | | |
| fmad (-inf,+y,+inf) --> NaN | | | |
| fmad (+x,-inf,+inf) --> NaN | | | |
| fmad (+inf,-y,+inf) --> NaN | | | |
| fmad (-x,+inf,+inf) --> NaN | | | |
| fmad (-inf,-y,-inf) --> NaN | | | |
| fmad (-x,-inf,-inf) --> NaN | | | |
| fmad (+inf,+y,-inf) --> NaN | | | |
| fmad (+x,+inf,-inf) --> NaN | | | |
| */ | | | |
| if ((((xx << 1) == 0) && ((yy << 1) == 0xff000000)) || | | | |
| (((yy << 1) == 0) && ((xx << 1) == 0xff000000))) { | | | |
| cvt.i = 0xffc00000; | | | |
| return cvt.f; | | | |
| } | | | |
| if ((zz << 1) == 0xff000000) { | | | |
| if (((yy << 1) == 0xff000000) || ((xx << 1) == 0xff000000)) { | | | |
| if ((int)(xx ^ yy ^ zz) < 0) { | | | |
| cvt.i = 0xffc00000; | | | |
| return cvt.f; | | | |
| } | | | |
| } | | | |
| } | | | |
| /* fmad (inf, y, z) --> inf | | | |
| fmad (x, inf, z) --> inf | | | |
| fmad (x, y, inf) --> inf | | | |
| */ | | | |
| if ((xx << 1) == 0xff000000) { | | | |
| xx = xx ^ (yy & 0x80000000); | | | |
| cvt.i = xx; | | | |
| return cvt.f; | | | |
| } | | | |
| if ((yy << 1) == 0xff000000) { | | | |
| yy = yy ^ (xx & 0x80000000); | | | |
| cvt.i = yy; | | | |
| return cvt.f; | | | |
| } | | | |
| if ((zz << 1) == 0xff000000) { | | | |
| cvt.i = zz; | | | |
| return cvt.f; | | | |
| } | | | |
| /* fmad (+0, -y, -0) --> -0 | | | |
| fmad (-0, +y, -0) --> -0 | | | |
| fmad (+x, -0, -0) --> -0 | | | |
| fmad (-x, +0, -0) --> -0 | | | |
| */ | | | |
| if (zz == 0x80000000) { | | | |
| if (((xx << 1) == 0) || ((yy << 1) == 0)) { | | | |
| if ((int)(xx ^ yy) < 0) { | | | |
| cvt.i = zz; | | | |
| return cvt.f; | | | |
| } | | | |
| } | | | |
| } | | | |
| /* fmad (0, y, 0) --> +0 | | | |
| fmad (x, 0, 0) --> +0 | | | |
| */ | | | |
| if (((zz << 1) == 0) && | | | |
| (((xx << 1) == 0) || ((yy << 1) == 0))) { | | | |
| if (mode == cudaRoundMinInf) { | | | |
| zz = 0x80000000 & (xx ^ yy ^ zz); | | | |
| } else { | | | |
| zz &= 0x7fffffff; | | | |
| } | | | |
| cvt.i = zz; | | | |
| return cvt.f; | | | |
| } | | | |
| /* fmad (0, y, z) --> z | | | |
| fmad (x, 0, z) --> z | | | |
| */ | | | |
| if (((xx << 1) == 0) || ((yy << 1) == 0)) { | | | |
| cvt.i = zz; | | | |
| return cvt.f; | | | |
| } | | | |
| /* normalize x, if denormal */ | | | |
| if (expo_x == (unsigned)-1) { | | | |
| temp = xx & 0x80000000; | | | |
| xx = xx << 8; | | | |
| while (!(xx & 0x80000000)) { | | | |
| xx <<= 1; | | | |
| expo_x--; | | | |
| } | | | |
| expo_x++; | | | |
| xx = (xx >> 8) | temp; | | | |
| } | | | |
| /* normalize y, if denormal */ | | | |
| if (expo_y == (unsigned)-1) { | | | |
| temp = yy & 0x80000000; | | | |
| yy = yy << 8; | | | |
| while (!(yy & 0x80000000)) { | | | |
| yy <<= 1; | | | |
| expo_y--; | | | |
| } | | | |
| expo_y++; | | | |
| yy = (yy >> 8) | temp; | | | |
| } | | | |
| /* normalize z, if denormal */ | | | |
| if ((expo_z == (unsigned)-1) && ((zz << 1) != 0)) { | | | |
| temp = zz & 0x80000000; | | | |
| zz = zz << 8; | | | |
| while (!(zz & 0x80000000)) { | | | |
| zz <<= 1; | | | |
| expo_z--; | | | |
| } | | | |
| expo_z++; | | | |
| zz = (zz >> 8) | temp; | | | |
| } | | | |
| } | | | |
| | | | |
| expo_x = expo_x + expo_y; | | | |
| expo_y = xx ^ yy; | | | |
| xx = xx & 0x00ffffff; | | | |
| yy = yy << 8; | | | |
| xx = xx | 0x00800000; | | | |
| yy = yy | 0x80000000; | | | |
| | | | |
| product = ((unsigned long long)xx) * yy; | | | |
| xx = (unsigned)(product >> 32); | | | |
| yy = (unsigned)(product & 0xffffffff); | | | |
| | | | |
| expo_x = expo_x - 127 + 2; | | | |
| expo_y = expo_y & 0x80000000; | | | |
| /* normalize mantissa */ | | | |
| if (xx < 0x00800000) { | | | |
| xx = (xx << 1) | (yy >> 31); | | | |
| yy = (yy << 1); | | | |
| expo_x--; | | | |
| } | | | |
| temp = 0; | | | |
| | | | |
| if ((zz << 1) != 0) { /* z is not zero */ | | | |
| s = zz & 0x80000000; | | | |
| zz &= 0x00ffffff; | | | |
| zz |= 0x00800000; | | | |
| ww = 0; | | | |
| /* compare and swap. put augend into xx:yy */ | | | |
| if ((int)expo_z > (int)expo_x) { | | | |
| temp = expo_z; | | | |
| expo_z = expo_x; | | | |
| expo_x = temp; | | | |
| temp = zz; | | | |
| zz = xx; | | | |
| xx = temp; | | | |
| temp = ww; | | | |
| ww = yy; | | | |
| yy = temp; | | | |
| temp = expo_y; | | | |
| expo_y = s; | | | |
| s = temp; | | | |
| } | | | |
| /* augend_sign = expo_y, augend_mant = xx:yy, augend_expo = expo_x */ | | | |
| /* addend_sign = s, addend_mant = zz:ww, addend_expo = expo_z */ | | | |
| expo_z = expo_x - expo_z; | | | |
| u = expo_y ^ s; | | | |
| if (expo_z <= 49) { | | | |
| /* denormalize addend */ | | | |
| temp = 0; | | | |
| while (expo_z >= 32) { | | | |
| temp = ww | (temp != 0); | | | |
| ww = zz; | | | |
| zz = 0; | | | |
| expo_z -= 32; | | | |
| } | | | |
| if (expo_z) { | | | |
| temp = ((temp >> expo_z) | (ww << (32 - expo_z)) | | | | |
| ((temp << (32 - expo_z)) != 0)); | | | |
| ww = (ww >> expo_z) | (zz << (32 - expo_z)); | | | |
| zz = (zz >> expo_z); | | | |
| } | | | |
| | | | |
| } else { | | | |
| temp = 1; | | | |
| ww = 0; | | | |
| zz = 0; | | | |
| } | | | |
| if ((int)u < 0) { | | | |
| /* signs differ, effective subtraction */ | | | |
| temp = (unsigned)(-(int)temp); | | | |
| s = (temp != 0); | | | |
| u = yy - s; | | | |
| s = u > yy; | | | |
| yy = u - ww; | | | |
| s += yy > u; | | | |
| xx = (xx - zz) - s; | | | |
| if (!(xx | yy | temp)) { | | | |
| /* complete cancellation, return 0 */ | | | |
| if (mode == cudaRoundMinInf) { | | | |
| xx = 0x80000000; | | | |
| } | | | |
| cvt.i = xx; | | | |
| return cvt.f; | | | |
| } | | | |
| if ((int)xx < 0) { | | | |
| /* oops, augend had smaller mantissa. Negate mantissa and flip | | | |
| sign of result */ | | | |
| temp = ~temp; | | | |
| yy = ~yy; | | | |
| xx = ~xx; | | | |
| if (++temp == 0) { | | | |
| if (++yy == 0) { | | | |
| ++xx; | | | |
| } | | | |
| } | | | |
| expo_y ^= 0x80000000; | | | |
| } | | | |
| /* normalize mantissa, if necessary */ | | | |
| while (!(xx & 0x00800000)) { | | | |
| xx = (xx << 1) | (yy >> 31); | | | |
| yy = (yy << 1); | | | |
| expo_x--; | | | |
| } | | | |
| } else { | | | |
| /* signs are the same, effective addition */ | | | |
| yy = yy + ww; | | | |
| s = yy < ww; | | | |
| xx = xx + zz + s; | | | |
| if (xx & 0x01000000) { | | | |
| temp = temp | (yy << 31); | | | |
| yy = (yy >> 1) | (xx << 31); | | | |
| xx = ((xx & 0x80000000) | (xx >> 1)) & ~0x40000000; | | | |
| expo_x++; | | | |
| } | | | |
| } | | | |
| } | | | |
| temp = yy | (temp != 0); | | | |
| if (expo_x <= 0xFD) { | | | |
| /* normal */ | | | |
| xx |= expo_y; /* or in sign bit */ | | | |
| if (mode == cudaRoundNearest) { | | | |
| s = xx & 1; /* mantissa lsb */ | | | |
| xx += (temp == 0x80000000) ? s : (temp >> 31); | | | |
| } else if (mode == cudaRoundPosInf) { | | | |
| xx += temp && !expo_y; | | | |
| } else if (mode == cudaRoundMinInf) { | | | |
| xx += temp && expo_y; | | | |
| } | | | |
| xx = xx + (expo_x << 23); /* add in exponent */ | | | |
| cvt.i = xx; | | | |
| return cvt.f; | | | |
| } else if ((int)expo_x >= 126) { | | | |
| /* overflow */ | | | |
| if (mode == cudaRoundNearest) { | | | |
| xx = expo_y | 0x7f800000; | | | |
| } else if (mode == cudaRoundZero) { | | | |
| xx = expo_y | 0x7F7FFFFF; | | | |
| } else if (mode == cudaRoundPosInf) { | | | |
| xx = expo_y ? 0xFF7FFFFF : 0x7f800000; | | | |
| } else if (mode == cudaRoundMinInf) { | | | |
| xx = expo_y ? 0xff800000 : 0x7f7fffff; | | | |
| } | | | |
| cvt.i = xx; | | | |
| return cvt.f; | | | |
| } | | | |
| /* subnormal */ | | | |
| expo_x = (unsigned int)(-(int)expo_x); | | | |
| if (expo_x > 25) { | | | |
| /* massive underflow: return 0, or smallest denormal */ | | | |
| xx = 0; | | | |
| if (mode == cudaRoundPosInf) { | | | |
| xx += !expo_y; | | | |
| } else if (mode == cudaRoundMinInf) { | | | |
| xx += !!expo_y; | | | |
| } | | | |
| cvt.i = expo_y | xx; | | | |
| return cvt.f; | | | |
| } | | | |
| temp = (xx << (32 - expo_x)) | ((temp) ? 1 : 0); | | | |
| xx = xx >> expo_x; | | | |
| if (mode == cudaRoundNearest) { | | | |
| xx = xx + ((temp == 0x80000000) ? (xx & 1) : (temp >> 31)); | | | |
| } else if (mode == cudaRoundPosInf) { | | | |
| xx = xx + (!expo_y && temp); | | | |
| } else if (mode == cudaRoundMinInf) { | | | |
| xx = xx + (expo_y && temp); | | | |
| } | | | |
| xx = expo_y + xx; /* add in sign bit */ | | | |
| cvt.i = xx; | | | |
| return cvt.f; | | | |
| } | | | |
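
The kernel above carries the full 48-bit product in xx:yy and rounds only once at the end, which is precisely what distinguishes a fused multiply-add from a multiply followed by an add. A minimal host-side sketch in plain C99 (using the standard fmaf from math.h; compile with contraction disabled, e.g. -ffp-contract=off, so the unfused line is not itself turned into an FMA) makes the difference observable:

    #include <stdio.h>
    #include <math.h>
    #include <float.h>

    int main(void)
    {
        float a = 1.0f + FLT_EPSILON;   /* 1 + 2^-23 */
        float b = 1.0f - FLT_EPSILON;   /* 1 - 2^-23 */
        float c = -1.0f;
        float unfused = a * b + c;      /* product rounds to 1.0f first, so this is 0 */
        float fused   = fmaf(a, b, c);  /* the exact product survives: -2^-46 */
        printf("unfused = %g, fused = %g\n", unfused, fused);
        return 0;
    }
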
| | | | |
| __device_func__(float __internal_fadd_kernel (float a, float b, | | | |
| enum cudaRoundMode mode)) | | | |
| { | | | |
| volatile union __cudart_FloatUintCvt xx, yy; | | | |
| unsigned int expo_x; | | | |
| unsigned int expo_y; | | | |
| unsigned int temp; | | | |
| | | | |
| xx.f = a; | | | |
| yy.f = b; | | | |
| | | | |
| /* make bigger operand the augend */ | | | |
| expo_y = yy.i << 1; | | | |
| if (expo_y > (xx.i << 1)) { | | | |
| expo_y = xx.i; | | | |
| xx.i = yy.i; | | | |
| yy.i = expo_y; | | | |
| } | | | |
| | | | |
| temp = 0xff; | | | |
| expo_x = temp & (xx.i >> 23); | | | |
| expo_x = expo_x - 1; | | | |
| expo_y = temp & (yy.i >> 23); | | | |
| expo_y = expo_y - 1; | | | |
| | | | |
| if ((expo_x <= 0xFD) && | | | |
| (expo_y <= 0xFD)) { | | | |
| add: | | | |
| expo_y = expo_x - expo_y; | | | |
| if (expo_y > 25) { | | | |
| expo_y = 31; | | | |
| } | | | |
| temp = xx.i ^ yy.i; | | | |
| xx.i = xx.i & ~0x7f000000; | | | |
| xx.i = xx.i | 0x00800000; | | | |
| yy.i = yy.i & ~0xff000000; | | | |
| yy.i = yy.i | 0x00800000; | | | |
| | | | |
| if ((int)temp < 0) { | | | |
| /* signs differ, effective subtraction */ | | | |
| temp = 32 - expo_y; | | | |
| temp = (expo_y) ? (yy.i << temp) : 0; | | | |
| temp = (unsigned)(-((int)temp)); | | | |
| xx.i = xx.i - (yy.i >> expo_y) - (temp ? 1 : 0); | | | |
| if (xx.i & 0x00800000) { | | | |
| if (expo_x <= 0xFD) { | | | |
| xx.i = xx.i + (expo_x << 23); | | | |
| if (mode == cudaRoundNearest) { | | | |
| if (temp < 0x80000000) return xx.f; | | | |
| xx.i += ((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31)); | | | |
| } else if (mode == cudaRoundZero) { | | | |
| } else if (mode == cudaRoundPosInf) { | | | |
| xx.i += (temp && !(xx.i & 0x80000000)); | | | |
| } else if (mode == cudaRoundMinInf) { | | | |
| xx.i += (temp && (xx.i & 0x80000000)); | | | |
| } | | | |
| return xx.f; | | | |
| } | | | |
| } else { | | | |
| if ((temp | (xx.i << 1)) == 0) { | | | |
| /* operands cancelled, resulting in a clean zero */ | | | |
| if (mode == cudaRoundMinInf) { | | | |
| xx.i = 0x80000000; | | | |
| } else { | | | |
| xx.i = 0; | | | |
| } | | | |
| return xx.f; | | | |
| } | | | |
| /* normalize result */ | | | |
| yy.i = xx.i & 0x80000000; | | | |
| do { | | | |
| xx.i = (xx.i << 1) | (temp >> 31); | | | |
| temp <<= 1; | | | |
| expo_x--; | | | |
| } while (!(xx.i & 0x00800000)); | | | |
| xx.i = xx.i | yy.i; | | | |
| } | | | |
| } else { | | | |
| /* signs are the same, effective addition */ | | | |
| temp = 32 - expo_y; | | | |
| temp = (expo_y) ? (yy.i << temp) : 0; | | | |
| xx.i = xx.i + (yy.i >> expo_y); | | | |
| if (!(xx.i & 0x01000000)) { | | | |
| if (expo_x <= 0xFD) { | | | |
| xx.i = xx.i + (expo_x << 23); | | | |
| if (mode == cudaRoundNearest) { | | | |
| if (temp < 0x80000000) return xx.f; | | | |
| xx.i += ((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31)); | | | |
| } else if (mode == cudaRoundZero) { | | | |
| } else if (mode == cudaRoundPosInf) { | | | |
| xx.i += (temp && !(xx.i & 0x80000000)); | | | |
| } else if (mode == cudaRoundMinInf) { | | | |
| xx.i += (temp && (xx.i & 0x80000000)); | | | |
| } | | | |
| return xx.f; | | | |
| } | | | |
| } else { | | | |
| /* normalize result */ | | | |
| temp = (xx.i << 31) | (temp >> 1); | | | |
| xx.i = ((xx.i & 0x80000000) | (xx.i >> 1)) & ~0x40000000; | | | |
| expo_x++; | | | |
| } | | | |
| } | | | |
| if (expo_x <= 0xFD) { | | | |
| xx.i = xx.i + (expo_x << 23); | | | |
| if (mode == cudaRoundNearest) { | | | |
| if (temp < 0x80000000) return xx.f; | | | |
| xx.i += ((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31)); | | | |
| } else if (mode == cudaRoundZero) { | | | |
| } else if (mode == cudaRoundPosInf) { | | | |
| xx.i += (temp && !(xx.i & 0x80000000)); | | | |
| } else if (mode == cudaRoundMinInf) { | | | |
| xx.i += (temp && (xx.i & 0x80000000)); | | | |
| } | | | |
| return xx.f; | | | |
| } | | | |
| if ((int)expo_x >= 254) { | | | |
| /* overflow: return infinity or largest normal */ | | | |
| temp = xx.i & 0x80000000; | | | |
| if (mode == cudaRoundNearest) { | | | |
| xx.i = (temp) | 0x7f800000; | | | |
| } else if (mode == cudaRoundZero) { | | | |
| xx.i = (temp) | 0x7f7fffff; | | | |
| } else if (mode == cudaRoundMinInf) { | | | |
| xx.i = (temp ? 0xFF800000 : 0x7f7fffff); | | | |
| } else if (mode == cudaRoundPosInf) { | | | |
| xx.i = (temp ? 0xff7fffff : 0x7F800000); | | | |
| } | | | |
| return xx.f; | | | |
| } | | | |
| /* underflow: denormal, or smallest normal */ | | | |
| expo_y = expo_x + 32; | | | |
| yy.i = xx.i & 0x80000000; | | | |
| xx.i = xx.i & ~0xff000000; | | | |
| expo_x = (unsigned)(-((int)expo_x)); | | | |
| temp = xx.i << expo_y | ((temp) ? 1 : 0); | | | |
| xx.i = yy.i | (xx.i >> expo_x); | | | |
| if (mode == cudaRoundNearest) { | | | |
| xx.i += (temp == 0x80000000) ? (xx.i & 1) : (temp >> 31); | | | |
| } else if (mode == cudaRoundZero) { | | | |
| } else if (mode == cudaRoundPosInf) { | | | |
| xx.i += (temp && !yy.i); | | | |
| } else if (mode == cudaRoundMinInf) { | | | |
| xx.i += (temp && yy.i); | | | |
| } | | | |
| return xx.f; | | | |
| } else { | | | |
| /* handle special cases separately */ | | | |
| if (!(yy.i << 1)) { | | | |
| if (mode == cudaRoundMinInf) { | | | |
| if (!(xx.i << 1)) { | | | |
| xx.i = xx.i | yy.i; | | | |
| } | | | |
| } else { | | | |
| if (xx.i == 0x80000000) { | | | |
| xx.i = yy.i; | | | |
| } | | | |
| } | | | |
| if ((xx.i << 1) > 0xff000000) { | | | |
| xx.i |= 0x00400000; | | | |
| } | | | |
| return xx.f; | | | |
| } | | | |
| if ((expo_y != 254) && (expo_x != 254)) { | | | |
| /* remove sign bits */ | | | |
| if (expo_x == (unsigned int) -1) { | | | |
| temp = xx.i & 0x80000000; | | | |
| xx.i = xx.i << 8; | | | |
| while (!(xx.i & 0x80000000)) { | | | |
| xx.i <<= 1; | | | |
| expo_x--; | | | |
| } | | | |
| expo_x++; | | | |
| xx.i = (xx.i >> 8) | temp; | | | |
| } | | | |
| if (expo_y == (unsigned int) -1) { | | | |
| temp = yy.i & 0x80000000; | | | |
| yy.i = yy.i << 8; | | | |
| while (!(yy.i & 0x80000000)) { | | | |
| yy.i <<= 1; | | | |
| expo_y--; | | | |
| } | | | |
| expo_y++; | | | |
| yy.i = (yy.i >> 8) | temp; | | | |
| } | | | |
| goto add; | | | |
| } | | | |
| expo_x = xx.i << 1; | | | |
| expo_y = yy.i << 1; | | | |
| /* if x is NaN, return x */ | | | |
| if (expo_x > 0xff000000) { | | | |
| /* cvt any SNaNs to QNaNs */ | | | |
| xx.i = xx.i | 0x00400000; | | | |
| return xx.f; | | | |
| } | | | |
| /* if y is NaN, return y */ | | | |
| if (expo_y > 0xff000000) { | | | |
| /* cvt any SNaNs to QNaNs */ | | | |
| xx.i = yy.i | 0x00400000; | | | |
| return xx.f; | | | |
| } | | | |
| if ((expo_x == 0xff000000) && (expo_y == 0xff000000)) { | | | |
| /* | | | |
| * subtraction of infinities with the same sign, and addition of | | | |
| * infinities of unlike sign is undefined: return NaN INDEFINITE | | | |
| */ | | | |
| expo_x = xx.i ^ yy.i; | | | |
| xx.i = xx.i | ((expo_x) ? 0xffc00000 : 0); | | | |
| return xx.f; | | | |
| } | | | |
| /* handle infinities */ | | | |
| if (expo_y == 0xff000000) { | | | |
| xx.i = yy.i; | | | |
| } | | | |
| return xx.f; | | | |
| } | | | |
| } | | | |
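
For intuition about the cudaRoundPosInf and cudaRoundMinInf branches above, the same directed roundings can be reproduced on the host with C99 fenv.h (a sketch; honouring fesetround at run time typically needs a flag such as GCC's -frounding-math):

    #include <stdio.h>
    #include <fenv.h>

    int main(void)
    {
        volatile float a = 1.0f, b = 1e-10f;  /* b is far below one ulp of a */
        float up, down;
        fesetround(FE_UPWARD);
        up = a + b;                           /* bumps to the next float above 1.0f */
        fesetround(FE_DOWNWARD);
        down = a + b;                         /* stays at exactly 1.0f */
        fesetround(FE_TONEAREST);
        printf("up = %.9g, down = %.9g\n", up, down);
        return 0;
    }
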
| | | | |
| __device_func__(float __frcp_rn (float a)) | | | |
| { | | | |
| return __internal_frcp_kernel (a, cudaRoundNearest); | | | |
| } | | | |
| | | | |
| __device_func__(float __frcp_rz (float a)) | | | |
| { | | | |
| return __internal_frcp_kernel (a, cudaRoundZero); | | | |
| } | | | |
| | | | |
| __device_func__(float __frcp_rd (float a)) | | | |
| { | | | |
| return __internal_frcp_kernel (a, cudaRoundMinInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __frcp_ru (float a)) | | | |
| { | | | |
| return __internal_frcp_kernel (a, cudaRoundPosInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __fsqrt_rn (float a)) | | | |
| { | | | |
| return __internal_fsqrt_kernel (a, cudaRoundNearest); | | | |
| } | | | |
| | | | |
| __device_func__(float __fsqrt_rz (float a)) | | | |
| { | | | |
| return __internal_fsqrt_kernel (a, cudaRoundZero); | | | |
| } | | | |
| | | | |
| __device_func__(float __fsqrt_rd (float a)) | | | |
| { | | | |
| return __internal_fsqrt_kernel (a, cudaRoundMinInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __fsqrt_ru (float a)) | | | |
| { | | | |
| return __internal_fsqrt_kernel (a, cudaRoundPosInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __fdiv_rn (float a, float b)) | | | |
| { | | | |
| return __internal_fdiv_kernel (a, b, cudaRoundNearest); | | | |
| } | | | |
| | | | |
| __device_func__(float __fdiv_rz (float a, float b)) | | | |
| { | | | |
| return __internal_fdiv_kernel (a, b, cudaRoundZero); | | | |
| } | | | |
| | | | |
| __device_func__(float __fdiv_rd (float a, float b)) | | | |
| { | | | |
| return __internal_fdiv_kernel (a, b, cudaRoundMinInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __fdiv_ru (float a, float b)) | | | |
| { | | | |
| return __internal_fdiv_kernel (a, b, cudaRoundPosInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __fadd_rd (float a, float b)) | | | |
| { | | | |
| return __internal_fadd_kernel (a, b, cudaRoundMinInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __fadd_ru (float a, float b)) | | | |
| { | | | |
| return __internal_fadd_kernel (a, b, cudaRoundPosInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __fmul_rd (float a, float b)) | | | |
| { | | | |
| return __internal_fmul_kernel (a, b, cudaRoundMinInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __fmul_ru (float a, float b)) | | | |
| { | | | |
| return __internal_fmul_kernel (a, b, cudaRoundPosInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __fmaf_rn (float a, float b, float c)) | | | |
| { | | | |
| return __internal_fmaf_kernel (a, b, c, cudaRoundNearest); | | | |
| } | | | |
| | | | |
| __device_func__(float __fmaf_rz (float a, float b, float c)) | | | |
| { | | | |
| return __internal_fmaf_kernel (a, b, c, cudaRoundZero); | | | |
| } | | | |
| | | | |
| __device_func__(float __fmaf_ru (float a, float b, float c)) | | | |
| { | | | |
| return __internal_fmaf_kernel (a, b, c, cudaRoundPosInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __fmaf_rd (float a, float b, float c)) | | | |
| { | | | |
| return __internal_fmaf_kernel (a, b, c, cudaRoundMinInf); | | | |
| } | | | |
| | | | |
| __device_func__(int __cuda___isnan(double a)); | | | |
| __device_func__(int __cuda___isnanf(float a)); | | | |
| __device_func__(int __double2int_rz(double)); | | | |
| __device_func__(unsigned int __double2uint_rz(double)); | | | |
| __device_func__(long long int __double2ll_rz(double)); | | | |
| __device_func__(unsigned long long int __double2ull_rz(double)); | | | |
| | | | |
| #define __internal_clamp(val, max, min, nan)                                    \ | | | |
|     if (sizeof(val) == sizeof(double) && __cuda___isnan((double)val)) return nan; \ | | | |
|     if (sizeof(val) == sizeof(float) && __cuda___isnanf((float)val)) return nan;  \ | | | |
|     if (val >= max) return max;                                                  \ | | | |
|     if (val <= min) return min | | | |
| | | | |
| /***************************************************************************** | | | |
| *                                                                           * | | | |
| * HOST IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS      * | | | |
| *                                                                           * | | | |
| *****************************************************************************/ | | | |
| | | | |
| __device_func__(int __mulhi(int a, int b)) | | | |
| { | | | |
| long long int c = (long long int)a * (long long int)b; | | | |
| | | | |
| return (int)(c >> 32); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned int __umulhi(unsigned int a, unsigned int b)) | | | |
| { | | | |
| unsigned long long int c = (unsigned long long int)a * (unsigned long long int)b; | | | |
| | | | |
| return (unsigned int)(c >> 32); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned long long int __umul64hi(unsigned long long int a, | | | |
| unsigned long long int b)) | | | |
| { | | | |
| unsigned int a_lo = (unsigned int)a; | | | |
| unsigned long long int a_hi = a >> 32; | | | |
| unsigned int b_lo = (unsigned int)b; | | | |
| unsigned long long int b_hi = b >> 32; | | | |
| unsigned long long int m1 = a_lo * b_hi; | | | |
| unsigned long long int m2 = a_hi * b_lo; | | | |
| unsigned int carry; | | | |
| | | | |
| carry = (0ULL + __umulhi(a_lo, b_lo) + (unsigned int)m1 + (unsigned int)m2) >> 32; | | | |
| | | | |
| return a_hi * b_hi + (m1 >> 32) + (m2 >> 32) + carry; | | | |
| } | | | |
| | | | |
| __device_func__(long long int __mul64hi(long long int a, long long int b)) | | | |
| { | | | |
| long long int res; | | | |
| res = __umul64hi(a, b); | | | |
| if (a < 0LL) res = res - b; | | | |
| if (b < 0LL) res = res - a; | | | |
| return res; | | | |
| } | | | |
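
The subtle part of __mul64hi/__umul64hi above is the carry: the high half of the low product and the low halves of the two cross products are summed in 64 bits so that their overflow lands in the top word. A host-side cross-check of the same splitting (a sketch; assumes GCC/Clang's unsigned __int128 extension for the reference value):

    #include <stdio.h>
    #include <stdint.h>

    static uint64_t umul64hi_split(uint64_t a, uint64_t b)
    {
        uint32_t a_lo = (uint32_t)a, b_lo = (uint32_t)b;
        uint64_t a_hi = a >> 32, b_hi = b >> 32;
        uint64_t m1 = (uint64_t)a_lo * b_hi;
        uint64_t m2 = a_hi * b_lo;
        uint64_t lo_hi = ((uint64_t)a_lo * b_lo) >> 32;  /* plays the role of __umulhi(a_lo, b_lo) */
        uint64_t carry = (lo_hi + (uint32_t)m1 + (uint32_t)m2) >> 32;
        return a_hi * b_hi + (m1 >> 32) + (m2 >> 32) + carry;
    }

    int main(void)
    {
        uint64_t a = 0xfedcba9876543210ULL, b = 0x0123456789abcdefULL;
        uint64_t ref = (uint64_t)(((unsigned __int128)a * b) >> 64);
        printf("split = %016llx, ref = %016llx\n",
               (unsigned long long)umul64hi_split(a, b), (unsigned long long)ref);
        return 0;
    }
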
| | | | |
| __device_func__(float __saturatef(float a)) | | | |
| { | | | |
| if (__cuda___isnanf(a)) return 0.0f; /* update of PTX spec 10/15/2008 */ | | | |
| return a >= 1.0f ? 1.0f : a <= 0.0f ? 0.0f : a; | | | |
| } | | | |
| | | | |
| __device_func__(unsigned int __sad(int a, int b, unsigned int c)) | | | |
| { | | | |
| long long int diff = (long long int)a - (long long int)b; | | | |
| | | | |
| return (unsigned int)(__cuda_llabs(diff) + (long long int)c); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned int __usad(unsigned int a, unsigned int b, unsigned int c)) | | | |
| { | | | |
| long long int diff = (long long int)a - (long long int)b; | | | |
| | | | |
| return (unsigned int)(__cuda_llabs(diff) + (long long int)c); | | | |
| } | | | |
| | | | |
| __device_func__(int __mul24(int a, int b)) | | | |
| { | | | |
| a &= 0xffffff; | | | |
| a = (a & 0x800000) != 0 ? a | ~0xffffff : a; | | | |
| b &= 0xffffff; | | | |
| b = (b & 0x800000) != 0 ? b | ~0xffffff : b; | | | |
| | | | |
| return a * b; | | | |
| } | | | |
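
The masking above gives __mul24 its 24-bit semantics: the top eight bits of each operand are ignored and bit 23 is treated as a sign bit. A small host illustration (a sketch that simply restates the function so it runs standalone):

    #include <stdio.h>

    static int mul24_emul(int a, int b)   /* same logic as __mul24 above */
    {
        a &= 0xffffff; a = (a & 0x800000) != 0 ? a | ~0xffffff : a;
        b &= 0xffffff; b = (b & 0x800000) != 0 ? b | ~0xffffff : b;
        return a * b;
    }

    int main(void)
    {
        int a = 0x01000003, b = 2;        /* the 0x01 in the top byte of a is dropped */
        printf("mul24 = %d, full 32-bit = %d\n", mul24_emul(a, b), a * b);
        return 0;
    }
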
| | | | |
| __device_func__(unsigned int __umul24(unsigned int a, unsigned int b)) | | | |
| { | | | |
| a &= 0xffffff; | | | |
| b &= 0xffffff; | | | |
| | | | |
| return a * b; | | | |
| } | | | |
| | | | |
| __device_func__(float __int_as_float(int a)) | | | |
| { | | | |
| volatile union __cudart_FloatIntCvt u; | | | |
| | | | |
| u.i = a; | | | |
| return u.f; | | | |
| } | | | |
| | | | |
| __device_func__(int __float_as_int(float a)) | | | |
| { | | | |
| volatile union __cudart_FloatIntCvt u; | | | |
| | | | |
| u.f = a; | | | |
| return u.i; | | | |
| } | | | |
| | | | |
| __device_func__(long long int __internal_float2ll_kernel(float a, long long int max, long long int min, long long int nan, enum cudaRoundMode rndMode)) | | | |
| { | | | |
| unsigned long long int res, t = 0ULL; | | | |
| int shift; | | | |
| unsigned int ia; | | | |
| | | | |
| __internal_clamp(a, max, min, nan); | | | |
| ia = __float_as_int(a); | | | |
| shift = 189 - ((ia >> 23) & 0xff); | | | |
| res = (unsigned long long int)(((ia << 8) | 0x80000000) >> 1) << 32; | | | |
| if (shift >= 64) { | | | |
| t = res; | | | |
| res = 0; | | | |
| } else if (shift) { | | | |
| t = res << (64 - shift); | | | |
| res = res >> shift; | | | |
| } | | | |
| if (rndMode == cudaRoundNearest && (long long int)t < 0LL) { | | | |
| res += t == 0x8000000000000000ULL ? res & 1ULL : 1ULL; | | | |
| } | | | |
| else if (rndMode == cudaRoundMinInf && t != 0ULL && ia > 0x80000000) { | | | |
| res++; | | | |
| } | | | |
| else if (rndMode == cudaRoundPosInf && t != 0ULL && (int)ia > 0) { | | | |
| res++; | | | |
| } | | | |
| if ((int)ia < 0) res = (unsigned long long int)-(long long int)res; | | | |
| return (long long int)res; | | | |
| } | | | |
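
The four rounding modes only disagree when t, the shifted-out fraction, is nonzero; 2.5 is the classic probe value, since round-to-nearest then picks the even neighbour. A host analogue using C99 fenv.h and llrintf, which honours the current rounding mode (a sketch):

    #include <stdio.h>
    #include <fenv.h>
    #include <math.h>

    int main(void)
    {
        const int   modes[] = { FE_TOWARDZERO, FE_TONEAREST, FE_UPWARD, FE_DOWNWARD };
        const char *names[] = { "rz", "rn", "ru", "rd" };
        for (int i = 0; i < 4; i++) {
            fesetround(modes[i]);
            printf("%s:  2.5f -> %lld   -2.5f -> %lld\n",
                   names[i], llrintf(2.5f), llrintf(-2.5f));
        }
        fesetround(FE_TONEAREST);
        return 0;
    }
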
| | | | |
| __device_func__(int __internal_float2int(float a, enum cudaRoundMode rndMode)) | | | |
| { | | | |
| return (int)__internal_float2ll_kernel(a, 2147483647LL, -2147483648LL, 0LL, rndMode); | | | |
| } | | | |
| | | | |
| __device_func__(int __float2int_rz(float a)) | | | |
| { | | | |
| return __internal_float2int(a, cudaRoundZero); | | | |
| } | | | |
| | | | |
| __device_func__(int __float2int_ru(float a)) | | | |
| { | | | |
| return __internal_float2int(a, cudaRoundPosInf); | | | |
| } | | | |
| | | | |
| __device_func__(int __float2int_rd(float a)) | | | |
| { | | | |
| return __internal_float2int(a, cudaRoundMinInf); | | | |
| } | | | |
| | | | |
| __device_func__(int __float2int_rn(float a)) | | | |
| { | | | |
| return __internal_float2int(a, cudaRoundNearest); | | | |
| } | | | |
| | | | |
| __device_func__(long long int __internal_float2ll(float a, enum cudaRoundMode rndMode)) | | | |
| { | | | |
| return __internal_float2ll_kernel(a, 9223372036854775807LL, -9223372036854775807LL -1LL, -9223372036854775807LL -1LL, rndMode); | | | |
| } | | | |
| | | | |
| __device_func__(long long int __float2ll_rz(float a)) | | | |
| { | | | |
| return __internal_float2ll(a, cudaRoundZero); | | | |
| } | | | |
| | | | |
| __device_func__(long long int __float2ll_ru(float a)) | | | |
| { | | | |
| return __internal_float2ll(a, cudaRoundPosInf); | | | |
| } | | | |
| | | | |
| __device_func__(long long int __float2ll_rd(float a)) | | | |
| { | | | |
| return __internal_float2ll(a, cudaRoundMinInf); | | | |
| } | | | |
| | | | |
| __device_func__(long long int __float2ll_rn(float a)) | | | |
| { | | | |
| return __internal_float2ll(a, cudaRoundNearest); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned long long int __internal_float2ull_kernel(float a, unsigned long long int max, unsigned long long int nan, enum cudaRoundMode rndMode)) | | | |
| { | | | |
| unsigned long long int res, t = 0ULL; | | | |
| int shift; | | | |
| unsigned int ia; | | | |
| | | | |
| __internal_clamp(a, max, 0LL, nan); | | | |
| ia = __float_as_int(a); | | | |
| shift = 190 - ((ia >> 23) & 0xff); | | | |
| res = (unsigned long long int)((ia << 8) | 0x80000000) << 32; | | | |
| if (shift >= 64) { | | | |
| t = res >> (int)(shift > 64); | | | |
| res = 0; | | | |
| } else if (shift) { | | | |
| t = res << (64 - shift); | | | |
| res = res >> shift; | | | |
| } | | | |
| if (rndMode == cudaRoundNearest && (long long int)t < 0LL) { | | | |
| res += t == 0x8000000000000000ULL ? res & 1ULL : 1ULL; | | | |
| } | | | |
| else if (rndMode == cudaRoundPosInf && t != 0ULL) { | | | |
| res++; | | | |
| } | | | |
| return res; | | | |
| } | | | |
| | | | |
| __device_func__(unsigned int __internal_float2uint(float a, enum cudaRoundMode rndMode)) | | | |
| { | | | |
| return (unsigned int)__internal_float2ull_kernel(a, 4294967295U, 0U, rndMode); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned int __float2uint_rz(float a)) | | | |
| { | | | |
| return __internal_float2uint(a, cudaRoundZero); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned int __float2uint_ru(float a)) | | | |
| { | | | |
| return __internal_float2uint(a, cudaRoundPosInf); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned int __float2uint_rd(float a)) | | | |
| { | | | |
| return __internal_float2uint(a, cudaRoundMinInf); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned int __float2uint_rn(float a)) | | | |
| { | | | |
| return __internal_float2uint(a, cudaRoundNearest); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned long long int __internal_float2ull(float a, enum cudaRoundMode rndMode)) | | | |
| { | | | |
| return __internal_float2ull_kernel(a, 18446744073709551615ULL, 9223372036854775808ULL, rndMode); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned long long int __float2ull_rz(float a)) | | | |
| { | | | |
| return __internal_float2ull(a, cudaRoundZero); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned long long int __float2ull_ru(float a)) | | | |
| { | | | |
| return __internal_float2ull(a, cudaRoundPosInf); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned long long int __float2ull_rd(float a)) | | | |
| { | | | |
| return __internal_float2ull(a, cudaRoundMinInf); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned long long int __float2ull_rn(float a)) | | | |
| { | | | |
| return __internal_float2ull(a, cudaRoundNearest); | | | |
| } | | | |
| | | | |
| __device_func__(int __internal_normalize64(unsigned long long int *a)) | | | |
| { | | | |
| int lz = 0; | | | |
| | | | |
| if ((*a & 0xffffffff00000000ULL) == 0ULL) { | | | |
| *a <<= 32; | | | |
| lz += 32; | | | |
| } | | | |
| if ((*a & 0xffff000000000000ULL) == 0ULL) { | | | |
| *a <<= 16; | | | |
| lz += 16; | | | |
| } | | | |
| if ((*a & 0xff00000000000000ULL) == 0ULL) { | | | |
| *a <<= 8; | | | |
| lz += 8; | | | |
| } | | | |
| if ((*a & 0xf000000000000000ULL) == 0ULL) { | | | |
| *a <<= 4; | | | |
| lz += 4; | | | |
| } | | | |
| if ((*a & 0xC000000000000000ULL) == 0ULL) { | | | |
| *a <<= 2; | | | |
| lz += 2; | | | |
| } | | | |
| if ((*a & 0x8000000000000000ULL) == 0ULL) { | | | |
| *a <<= 1; | | | |
| lz += 1; | | | |
| } | | | |
| return lz; | | | |
| } | | | |
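
__internal_normalize64 is a binary search: each test halves the window that can still contain the leading one bit and accumulates the shift in lz. The same idea in miniature, for a single byte (a self-contained sketch):

    #include <stdio.h>

    static int clz8(unsigned char x)   /* leading zeros in 8 bits, same halving scheme */
    {
        unsigned int v = x;
        int lz = 0;
        if (!(v & 0xf0)) { v <<= 4; lz += 4; }
        if (!(v & 0xc0)) { v <<= 2; lz += 2; }
        if (!(v & 0x80)) {          lz += 1; }
        return x ? lz : 8;
    }

    int main(void)
    {
        printf("clz8(0x01) = %d, clz8(0x80) = %d, clz8(0x00) = %d\n",
               clz8(0x01), clz8(0x80), clz8(0x00));
        return 0;
    }
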
| | | | |
| __device_func__(int __internal_normalize(unsigned int *a)) | | | |
| { | | | |
| unsigned long long int t = (unsigned long long int)*a; | | | |
| int lz = __internal_normalize64(&t); | | | |
| | | | |
| *a = (unsigned int)(t >> 32); | | | |
| | | | |
| return lz - 32; | | | |
| } | | | |
| | | | |
| __device_func__(float __internal_int2float_kernel(int a, enum cudaRoundMode rndMode)) | | | |
| { | | | |
| volatile union __cudart_FloatUintCvt res; | | | |
| int shift; | | | |
| unsigned int t; | | | |
| res.i = a; | | | |
| if (a == 0) return res.f; | | | |
| if (a < 0) res.i = (unsigned int)-a; | | | |
| shift = __internal_normalize((unsigned int*)&res.i); | | | |
| t = res.i << 24; | | | |
| res.i = (res.i >> 8); | | | |
| res.i += (127 + 30 - shift) << 23; | | | |
| if (a < 0) res.i |= 0x80000000; | | | |
| if ((rndMode == cudaRoundNearest) && (t >= 0x80000000)) { | | | |
| res.i += (t == 0x80000000) ? (res.i & 1) : (t >> 31); | | | |
| } | | | |
| else if ((rndMode == cudaRoundMinInf) && t && (a < 0)) { | | | |
| res.i++; | | | |
| } | | | |
| else if ((rndMode == cudaRoundPosInf) && t && (a > 0)) { | | | |
| res.i++; | | | |
| } | | | |
| return res.f; | | | |
| } | | | |
| | | | |
| __device_func__(float __int2float_rz(int a)) | | | |
| { | | | |
| return __internal_int2float_kernel(a, cudaRoundZero); | | | |
| } | | | |
| | | | |
| __device_func__(float __int2float_ru(int a)) | | | |
| { | | | |
| return __internal_int2float_kernel(a, cudaRoundPosInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __int2float_rd(int a)) | | | |
| { | | | |
| return __internal_int2float_kernel(a, cudaRoundMinInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __int2float_rn(int a)) | | | |
| { | | | |
| return __internal_int2float_kernel(a, cudaRoundNearest); | | | |
| } | | | |
| | | | |
| __device_func__(float __internal_uint2float_kernel(unsigned int a, enum cudaRoundMode rndMode)) | | | |
| { | | | |
| volatile union __cudart_FloatUintCvt res; | | | |
| int shift; | | | |
| unsigned int t; | | | |
| res.i = a; | | | |
| if (a == 0) return res.f; | | | |
| shift = __internal_normalize((unsigned int*)&res.i); | | | |
| t = res.i << 24; | | | |
| res.i = (res.i >> 8); | | | |
| res.i += (127 + 30 - shift) << 23; | | | |
| if (rndMode == cudaRoundNearest) { | | | |
| res.i += (t == 0x80000000) ? (res.i & 1) : (t >> 31); | | | |
| } | | | |
| else if ((rndMode == cudaRoundPosInf) && t) { | | | |
| res.i++; | | | |
| } | | | |
| return res.f; | | | |
| } | | | |
| | | | |
| __device_func__(float __uint2float_rz(unsigned int a)) | | | |
| { | | | |
| return __internal_uint2float_kernel(a, cudaRoundZero); | | | |
| } | | | |
| | | | |
| __device_func__(float __uint2float_ru(unsigned int a)) | | | |
| { | | | |
| return __internal_uint2float_kernel(a, cudaRoundPosInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __uint2float_rd(unsigned int a)) | | | |
| { | | | |
| return __internal_uint2float_kernel(a, cudaRoundMinInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __uint2float_rn(unsigned int a)) | | | |
| { | | | |
| return __internal_uint2float_kernel(a, cudaRoundNearest); | | | |
| } | | | |
| | | | |
| __device_func__(float __internal_ull2float_kernel(unsigned long long int a, enum cudaRoundMode rndMode)) | | | |
| { | | | |
| unsigned long long int temp; | | | |
| unsigned int res, t; | | | |
| int shift; | | | |
| if (a == 0ULL) return 0.0f; | | | |
| temp = a; | | | |
| shift = __internal_normalize64(&temp); | | | |
| temp = (temp >> 8) | ((temp & 0xffULL) ? 1ULL : 0ULL); | | | |
| res = (unsigned int)(temp >> 32); | | | |
| t = (unsigned int)temp; | | | |
| res += (127 + 62 - shift) << 23; /* add in exponent */ | | | |
| if (rndMode == cudaRoundNearest) { | | | |
| res += (t == 0x80000000) ? (res & 1) : (t >> 31); | | | |
| } else if (rndMode == cudaRoundPosInf) { | | | |
| res += (t != 0); | | | |
| } | | | |
| return __int_as_float(res); | | | |
| } | | | |
| | | | |
| __device_func__(float __internal_ll2float_kernel(long long int a, enum cudaRoundMode rndMode)) | | | |
| { | | | |
| unsigned long long int temp; | | | |
| volatile float res = 0.0f; | | | |
| | | | |
| if (a < 0LL) { | | | |
| temp = (~((unsigned long long int)a)) + 1ULL; | | | |
| if (rndMode == cudaRoundPosInf) { | | | |
| rndMode = cudaRoundMinInf; | | | |
| } else if (rndMode == cudaRoundMinInf) { | | | |
| rndMode = cudaRoundPosInf; | | | |
| } | | | |
| } else { | | | |
| temp = (unsigned long long int)a; | | | |
| } | | | |
| res = __internal_ull2float_kernel (temp, rndMode); | | | |
| if (a < 0LL) { | | | |
| res = -res; | | | |
| } | | | |
| return res; | | | |
| } | | | |
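
The rounding-mode swap is the whole trick here: the conversion runs on |a|, and rounding |a| toward minus infinity and then negating yields the same float as rounding a toward plus infinity (and vice versa). Worked by hand: a = -(2^60 + 1) is not exactly representable, so round-up must choose the neighbour closer to zero, -2^60; converting 2^60 + 1 with cudaRoundMinInf gives 2^60, and the final negation produces exactly that -2^60.
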
| | | | |
| __device_func__(float __ll2float_rn(long long int a)) | | | |
| { | | | |
| return __internal_ll2float_kernel(a, cudaRoundNearest); | | | |
| } | | | |
| | | | |
| __device_func__(float __ll2float_rz(long long int a)) | | | |
| { | | | |
| return __internal_ll2float_kernel(a, cudaRoundZero); | | | |
| } | | | |
| | | | |
| __device_func__(float __ll2float_ru(long long int a)) | | | |
| { | | | |
| return __internal_ll2float_kernel(a, cudaRoundPosInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __ll2float_rd(long long int a)) | | | |
| { | | | |
| return __internal_ll2float_kernel(a, cudaRoundMinInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __ull2float_rn(unsigned long long int a)) | | | |
| { | | | |
| return __internal_ull2float_kernel(a, cudaRoundNearest); | | | |
| } | | | |
| | | | |
| __device_func__(float __ull2float_rz(unsigned long long int a)) | | | |
| { | | | |
| return __internal_ull2float_kernel(a, cudaRoundZero); | | | |
| } | | | |
| | | | |
| __device_func__(float __ull2float_ru(unsigned long long int a)) | | | |
| { | | | |
| return __internal_ull2float_kernel(a, cudaRoundPosInf); | | | |
| } | | | |
| | | | |
| __device_func__(float __ull2float_rd(unsigned long long int a)) | | | |
| { | | | |
| return __internal_ull2float_kernel(a, cudaRoundMinInf); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned short __float2half_rn(float f)) | | | |
| { | | | |
| unsigned int x = __float_as_int (f); | | | |
| unsigned int u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1; | | | |
| unsigned int sign, exponent, mantissa; | | | |
| | | | |
| /* Get rid of +NaN/-NaN case first. */ | | | |
| if (u > 0x7f800000) { | | | |
| return 0x7fff; | | | |
| } | | | |
| | | | |
| sign = ((x >> 16) & 0x8000); | | | |
| | | | |
| /* Get rid of +Inf/-Inf, +0/-0. */ | | | |
| if (u > 0x477fefff) { | | | |
| return sign | 0x7c00; | | | |
| } | | | |
| if (u < 0x33000001) { | | | |
| return sign | 0x0000; | | | |
| } | | | |
| | | | |
| exponent = ((u >> 23) & 0xff); | | | |
| mantissa = (u & 0x7fffff); | | | |
| | | | |
| if (exponent > 0x70) { | | | |
| shift = 13; | | | |
| exponent -= 0x70; | | | |
| } else { | | | |
| shift = 0x7e - exponent; | | | |
| exponent = 0; | | | |
| mantissa |= 0x800000; | | | |
| } | | | |
| lsb = (1 << shift); | | | |
| lsb_s1 = (lsb >> 1); | | | |
| lsb_m1 = (lsb - 1); | | | |
| | | | |
| /* Round to nearest even. */ | | | |
| remainder = (mantissa & lsb_m1); | | | |
| mantissa >>= shift; | | | |
| if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) { | | | |
| ++mantissa; | | | |
| if (!(mantissa & 0x3ff)) { | | | |
| ++exponent; | | | |
| mantissa = 0; | | | |
| } | | | |
| } | | | |
| | | | |
| return sign | (exponent << 10) | mantissa; | | | |
| } | | | |
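
Tracing the packing for a simple input shows the half-precision field layout (worked by hand, not produced by running the code):

    f = 1.5f gives x = 0x3FC00000: sign 0, biased exponent 0x7F, mantissa 0x400000.
    Since the exponent 0x7F exceeds 0x70, shift = 13 and the exponent becomes 0x0F.
    remainder = mantissa & 0x1fff = 0, and mantissa >>= 13 leaves 0x200.
    Result: (0x0F << 10) | 0x200 = 0x3E00, the IEEE half-precision encoding of 1.5.
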
| | | | |
| __device_func__(float __half2float(unsigned short h)) | | | |
| { | | | |
| unsigned int sign = ((h >> 15) & 1); | | | |
| unsigned int exponent = ((h >> 10) & 0x1f); | | | |
| unsigned int mantissa = ((h & 0x3ff) << 13); | | | |
| | | | |
| if (exponent == 0x1f) { /* NaN or Inf */ | | | |
| mantissa = (mantissa | | | |
| ? (sign = 0, 0x7fffff) | | | |
| : 0); | | | |
| exponent = 0xff; | | | |
| } else if (!exponent) { /* Denorm or Zero */ | | | |
| if (mantissa) { | | | |
| unsigned int msb; | | | |
| exponent = 0x71; | | | |
| do { | | | |
| msb = (mantissa & 0x400000); | | | |
| mantissa <<= 1; /* normalize */ | | | |
| --exponent; | | | |
| } while (!msb); | | | |
| mantissa &= 0x7fffff; /* 1.mantissa is implicit */ | | | |
| } | | | |
| } else { | | | |
| exponent += 0x70; | | | |
| } | | | |
| | | | |
| return __int_as_float ((sign << 31) | (exponent << 23) | mantissa); | | | |
| } | | | |
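
Decoding the same value recovers the original float bit pattern (again worked by hand): h = 0x3E00 splits into sign 0, exponent 0x0F, mantissa 0x200 << 13 = 0x400000; this is the normal case, so the exponent is rebiased to 0x0F + 0x70 = 0x7F, and (0x7F << 23) | 0x400000 = 0x3FC00000, which is 1.5f.
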
| | | | |
| __device_func__(float __fadd_rz(float a, float b)) | | | |
| { | | | |
| return __internal_fadd_kernel(a, b, cudaRoundZero); | | | |
| } | | | |
| | | | |
| __device_func__(float __fmul_rz(float a, float b)) | | | |
| { | | | |
| return __internal_fmul_kernel(a, b, cudaRoundZero); | | | |
| } | | | |
| | | | |
| __device_func__(float __fadd_rn(float a, float b)) | | | |
| { | | | |
| return __internal_fadd_kernel(a, b, cudaRoundNearest); | | | |
| } | | | |
| | | | |
| __device_func__(float __fmul_rn(float a, float b)) | | | |
| { | | | |
| return __internal_fmul_kernel(a, b, cudaRoundNearest); | | | |
| } | | | |
| | | | |
| __device_func__(void __brkpt(int c)) | | | |
| { | | | |
| /* TODO */ | | | |
| } | | | |
| | | | |
| #if defined(__cplusplus) | | | |
| extern "C" { | | | |
| #endif /* __cplusplus */ | | | |
| | | | |
| extern int CUDARTAPI __cudaSynchronizeThreads(void**, void*); | | | |
| | | | |
| #if defined(__cplusplus) | | | |
| } | | | |
| #endif /* __cplusplus */ | | | |
| | | | |
| #if defined(__GNUC__) | | | |
| | | | |
| __device_func__(inline __attribute__((always_inline)) void __syncthreads(void)) | | | |
| { | | | |
| volatile int _ = 0; | | | |
| L: if (__cudaSynchronizeThreads((void**)&&L, (void*)&_)) goto L; | | | |
| } | | | |
| | | | |
| #elif defined(_WIN32) | | | |
| | | | |
| #define __syncthreads() \ | | | |
| (void)__cudaSynchronizeThreads((void**)0, (void*)0) | | | |
| | | | |
| #endif /* __GNUC__ */ | | | |
| | | | |
| __device_func__(void __prof_trigger(int a)) | | | |
| { | | | |
| } | | | |
| | | | |
| __device_func__(void __threadfence(void)) | | | |
| { | | | |
| __syncthreads(); | | | |
| } | | | |
| | | | |
| __device_func__(void __threadfence_block(void)) | | | |
| { | | | |
| __syncthreads(); | | | |
| } | | | |
| | | | |
| #if defined(__GNUC__) | | | |
| | | | |
| __device_func__(void __trap(void)) | | | |
| { | | | |
| __builtin_trap(); | | | |
| } | | | |
| | | | |
| #elif defined(_WIN32) | | | |
| | | | |
| __device_func__(void __trap(void)) | | | |
| { | | | |
| __debugbreak(); | | | |
| } | | | |
| | | | |
| #endif /* __GNUC__ */ | | | |
| | | | |
| #endif /* __CUDABE__ */ | | | |
| | | | |
| /***************************************************************************** | | | |
| *                                                                           * | | | |
| * DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS    * | | | |
| *                                                                           * | | | |
| *****************************************************************************/ | | | |
| #if !defined(__CUDABE__) | | | |
| __device_func__(float __fdividef(float a, float b)) | | | |
| { | | | |
| volatile float aa = a; | | | |
| volatile float bb = b; | | | |
| /* match range restrictions of the device function */ | | | |
| if (__cuda_fabsf(bb) > CUDART_TWO_TO_126_F) { | | | |
| if (__cuda_fabsf(aa) <= CUDART_NORM_HUGE_F) { | | | |
| return ((aa / bb) / CUDART_NORM_HUGE_F) / CUDART_NORM_HUGE_F; | | | |
| } else { | | | |
| bb = 1.0f / bb; | | | |
| bb = bb / CUDART_NORM_HUGE_F; | | | |
| return aa * bb; | | | |
| } | | | |
| } else { | | | |
| return aa / bb; | | | |
| } | | | |
| } | | | |
| #endif /* !defined(__CUDABE__) */ | | | |
| | | | |
| __device_func__(float __sinf(float a)) | | | |
| { | | | |
| #if !defined(__CUDABE__) | | | |
| if ((__float_as_int(a) << 1) == 0xff000000) { | | | |
| return __fadd_rn (a, -a); /* return NaN */ | | | |
| } | | | |
| #endif /* !defined(__CUDABE__) */ | | | |
| return sinf(a); | | | |
| } | | | |
| | | | |
| __device_func__(float __cosf(float a)) | | | |
| { | | | |
| #if !defined(__CUDABE__) | | | |
| if ((__float_as_int(a) << 1) == 0xff000000) { | | | |
| return __fadd_rn (a, -a); /* return NaN */ | | | |
| } | | | |
| #endif /* !defined(__CUDABE__) */ | | | |
| return cosf(a); | | | |
| } | | | |
| | | | |
| __device_func__(float __log2f(float a)) | | | |
| { | | | |
| return log2f(a); | | | |
| } | | | |
| | | | |
| /***************************************************************************** | | | |
| *                                                                           * | | | |
| * SHARED HOST AND DEVICE IMPLEMENTATIONS                                    * | | | |
| *                                                                           * | | | |
| *****************************************************************************/ | | | |
| __device_func__(float __tanf(float a)) | | | |
| { | | | |
| return __fdividef (__sinf(a), __cosf(a)); | | | |
| } | | | |
| | | | |
| __device_func__(void __sincosf(float a, float *sptr, float *cptr)) | | | |
| { | | | |
| *sptr = __sinf(a); | | | |
| *cptr = __cosf(a); | | | |
| } | | | |
| | | | |
| __device_func__(float __expf(float a)) | | | |
| { | | | |
| return __cuda_exp2f(a * CUDART_L2E_F); | | | |
| } | | | |
| | | | |
| __device_func__(float __exp10f(float a)) | | | |
| { | | | |
| return __cuda_exp2f(a * CUDART_L2T_F); | | | |
| } | | | |
| | | | |
| __device_func__(float __log10f(float a)) | | | |
| { | | | |
| return CUDART_LG2_F * __log2f(a); | | | |
| } | | | |
| | | | |
| __device_func__(float __logf(float a)) | | | |
| { | | | |
| return CUDART_LN2_F * __log2f(a); | | | |
| } | | | |
| | | | |
| __device_func__(float __powf(float a, float b)) | | | |
| { | | | |
| return __cuda_exp2f(b * __log2f(a)); | | | |
| } | | | |
| | | | |
| __device_func__(float fdividef(float a, float b)) | | | |
| { | | | |
| #if defined(__USE_FAST_MATH__) && !defined(__CUDA_PREC_DIV) | | | |
| return __fdividef(a, b); | | | |
| #else /* __USE_FAST_MATH__ && !__CUDA_PREC_DIV */ | | | |
| return a / b; | | | |
| #endif /* __USE_FAST_MATH__ && !__CUDA_PREC_DIV */ | | | |
| } | | | |
| | | | |
| #if !defined(__CUDABE__) || (__CUDA_ARCH__ < 200) | | | |
| | | | |
| __device_func__(int __clz(int a)) | | static __forceinline__ int __clz(int a) | |
| { | | { | |
| return (a)?(158-(__float_as_int(__uint2float_rz((unsigned int)a))>>23)):32; | | return (a)?(158-(__float_as_int(__uint2float_rz((unsigned int)a))>>23)):32; | |
| } | | } | |
| | | | |
| __device_func__(int __clzll(long long int a)) | | static __forceinline__ int __clzll(long long int a) | |
| { | | { | |
| int ahi = ((int)((unsigned long long)a >> 32)); | | int ahi = ((int)((unsigned long long)a >> 32)); | |
| int alo = ((int)((unsigned long long)a & 0xffffffffULL)); | | int alo = ((int)((unsigned long long)a & 0xffffffffULL)); | |
| int res; | | int res; | |
| if (ahi) { | | if (ahi) { | |
| res = 0; | | res = 0; | |
| } else { | | } else { | |
| res = 32; | | res = 32; | |
| ahi = alo; | | ahi = alo; | |
| } | | } | |
| res = res + __clz(ahi); | | res = res + __clz(ahi); | |
| return res; | | return res; | |
| } | | } | |
| | | | |
| __device_func__(int __popc(unsigned int a)) | | static __forceinline__ int __popc(unsigned int a) | |
| { | | { | |
| a = a - ((a >> 1) & 0x55555555); | | a = a - ((a >> 1) & 0x55555555); | |
| a = (a & 0x33333333) + ((a >> 2) & 0x33333333); | | a = (a & 0x33333333) + ((a >> 2) & 0x33333333); | |
| a = (a + (a >> 4)) & 0x0f0f0f0f; | | a = (a + (a >> 4)) & 0x0f0f0f0f; | |
| a = ((__umul24(a, 0x808080) << 1) + a) >> 24; | | a = ((__umul24(a, 0x808080) << 1) + a) >> 24; | |
| return a; | | return a; | |
| } | | } | |
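
The first three lines are the classic SWAR reduction: 2-bit, then 4-bit, then 8-bit field sums; the __umul24 at the end merely adds the four byte counts into the top byte. A host-side check against a naive loop (a sketch; an ordinary 32-bit multiply by 0x01010101 stands in for __umul24):

    #include <stdio.h>

    static int popc_swar(unsigned int a)
    {
        a = a - ((a >> 1) & 0x55555555);
        a = (a & 0x33333333) + ((a >> 2) & 0x33333333);
        a = (a + (a >> 4)) & 0x0f0f0f0f;
        return (int)((a * 0x01010101u) >> 24);   /* sums the four byte counts */
    }

    int main(void)
    {
        unsigned int x = 0xdeadbeefu;
        int naive = 0;
        for (unsigned int t = x; t; t >>= 1) naive += (int)(t & 1u);
        printf("swar = %d, naive = %d\n", popc_swar(x), naive);
        return 0;
    }
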
| | | | |
| __device_func__(int __popcll(unsigned long long int a)) | | static __forceinline__ int __popcll(unsigned long long int a) | |
| { | | { | |
| unsigned int ahi = ((unsigned int)(a >> 32)); | | unsigned int ahi = ((unsigned int)(a >> 32)); | |
| unsigned int alo = ((unsigned int)(a & 0xffffffffULL)); | | unsigned int alo = ((unsigned int)(a & 0xffffffffULL)); | |
| alo = alo - ((alo >> 1) & 0x55555555); | | alo = alo - ((alo >> 1) & 0x55555555); | |
| alo = (alo & 0x33333333) + ((alo >> 2) & 0x33333333); | | alo = (alo & 0x33333333) + ((alo >> 2) & 0x33333333); | |
| ahi = ahi - ((ahi >> 1) & 0x55555555); | | ahi = ahi - ((ahi >> 1) & 0x55555555); | |
| ahi = (ahi & 0x33333333) + ((ahi >> 2) & 0x33333333); | | ahi = (ahi & 0x33333333) + ((ahi >> 2) & 0x33333333); | |
| alo = alo + ahi; | | alo = alo + ahi; | |
| alo = (alo & 0x0f0f0f0f) + ((alo >> 4) & 0x0f0f0f0f); | | alo = (alo & 0x0f0f0f0f) + ((alo >> 4) & 0x0f0f0f0f); | |
| alo = ((__umul24(alo, 0x808080) << 1) + alo) >> 24; | | alo = ((__umul24(alo, 0x808080) << 1) + alo) >> 24; | |
| return alo; | | return alo; | |
| } | | } | |
| | | | |
| __device_func__(unsigned int __brev(unsigned int a)) | | static __forceinline__ unsigned int __brev(unsigned int a) | |
| { | | { | |
| /* Use Knuth's algorithm from http://www.hackersdelight.org/revisions.pdf */ | | /* Use Knuth's algorithm from http://www.hackersdelight.org/revisions.pdf */ | |
| unsigned int t; | | unsigned int t; | |
| a = (a << 15) | (a >> 17); | | a = (a << 15) | (a >> 17); | |
| t = (a ^ (a >> 10)) & 0x003f801f; | | t = (a ^ (a >> 10)) & 0x003f801f; | |
| a = (t + (t << 10)) ^ a; | | a = (t + (t << 10)) ^ a; | |
| t = (a ^ (a >> 4)) & 0x0e038421; | | t = (a ^ (a >> 4)) & 0x0e038421; | |
| a = (t + (t << 4)) ^ a; | | a = (t + (t << 4)) ^ a; | |
| t = (a ^ (a >> 2)) & 0x22488842; | | t = (a ^ (a >> 2)) & 0x22488842; | |
| a = (t + (t << 2)) ^ a; | | a = (t + (t << 2)) ^ a; | |
| return a; | | return a; | |
| } | | } | |
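
A quick host-side check of the Knuth reversal against a naive bit loop (a sketch reusing the masks above verbatim):

    #include <stdio.h>

    static unsigned int brev_knuth(unsigned int a)
    {
        unsigned int t;
        a = (a << 15) | (a >> 17);
        t = (a ^ (a >> 10)) & 0x003f801f;  a = (t + (t << 10)) ^ a;
        t = (a ^ (a >>  4)) & 0x0e038421;  a = (t + (t <<  4)) ^ a;
        t = (a ^ (a >>  2)) & 0x22488842;  a = (t + (t <<  2)) ^ a;
        return a;
    }

    int main(void)
    {
        unsigned int x = 0x12345678u, naive = 0;
        for (int i = 0; i < 32; i++) naive |= ((x >> i) & 1u) << (31 - i);
        printf("knuth = %08x, naive = %08x\n", brev_knuth(x), naive);
        return 0;
    }
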
| | | | |
| __device_func__(unsigned long long int __brevll(unsigned long long int a)) | | static __forceinline__ unsigned long long int __brevll(unsigned long long int a) | |
| { | | { | |
| unsigned int hi = (unsigned int)(a >> 32); | | unsigned int hi = (unsigned int)(a >> 32); | |
| unsigned int lo = (unsigned int)(a & 0xffffffffULL); | | unsigned int lo = (unsigned int)(a & 0xffffffffULL); | |
| unsigned int t; | | unsigned int t; | |
| t = __brev(lo); | | t = __brev(lo); | |
| lo = __brev(hi); | | lo = __brev(hi); | |
| return ((unsigned long long int)t << 32) + (unsigned long long int)lo; | | return ((unsigned long long int)t << 32) + (unsigned long long int)lo; | |
| } | | } | |
| | | | |
| #endif /* __CUDABE__ || __CUDA_ARCH__ < 200 */ | | | |
| | | | |
| __device_func__(int __ffs(int a)) | | | |
| { | | | |
| return 32 - __clz (a & -a); | | | |
| } | | | |
| | | | |
| __device_func__(int __ffsll(long long int a)) | | | |
| { | | | |
| return 64 - __clzll (a & -a); | | | |
| } | | | |
| | | | |
| #if defined(CUDA_DOUBLE_MATH_FUNCTIONS) && defined(CUDA_FLOAT_MATH_FUNCTIONS) | | | |
| | | | |
| #error -- conflicting mode for double math routines | | | |
| | | | |
| #endif /* CUDA_DOUBLE_MATH_FUNCTIONS && CUDA_FLOAT_MATH_FUNCTIONS */ | | | |
| | | | |
| #if defined(CUDA_FLOAT_MATH_FUNCTIONS) | | | |
| | | | |
| __device_func__(double fdivide(double a, double b)) | | | |
| { | | | |
| return (double)fdividef((float)a, (float)b); | | | |
| } | | | |
| | | | |
| #if !defined(__CUDABE__) | | | |
| | | | |
| __device_func__(int __double2int_rz(double a)) | | | |
| { | | | |
| return __float2int_rz((float)a); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned int __double2uint_rz(double a)) | | | |
| { | | | |
| return __float2uint_rz((float)a); | | | |
| } | | | |
| | | | |
| __device_func__(long long int __double2ll_rz(double a)) | | | |
| { | | | |
| return __float2ll_rz((float)a); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned long long int __double2ull_rz(double a)) | | | |
| { | | | |
| return __float2ull_rz((float)a); | | | |
| } | | | |
| | | | |
| #endif /* !__CUDABE__ */ | | | |
| | | | |
| #endif /* CUDA_FLOAT_MATH_FUNCTIONS */ | | | |
| | | | |
| #if defined(CUDA_DOUBLE_MATH_FUNCTIONS) | | | |
| | | | |
| __device_func__(double fdivide(double a, double b)) | | | |
| { | | | |
| return a / b; | | | |
| } | | | |
| | | | |
| #if !defined(__CUDABE__) | | | |
| | | | |
| __device_func__(int __internal_double2int(double a, enum cudaRoundMode rndMode)); | | | |
| __device_func__(unsigned int __internal_double2uint(double a, enum cudaRoundMode rndMode)); | | | |
| __device_func__(long long int __internal_double2ll(double a, enum cudaRoundMode rndMode)); | | | |
| __device_func__(unsigned long long int __internal_double2ull(double a, enum cudaRoundMode rndMode)); | | | |
| | | | |
| __device_func__(int __double2int_rz(double a)) | | | |
| { | | | |
| return __internal_double2int(a, cudaRoundZero); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned int __double2uint_rz(double a)) | | | |
| { | | | |
| return __internal_double2uint(a, cudaRoundZero); | | | |
| } | | | |
| | | | |
| __device_func__(long long int __double2ll_rz(double a)) | | | |
| { | | | |
| return __internal_double2ll(a, cudaRoundZero); | | | |
| } | | | |
| | | | |
| __device_func__(unsigned long long int __double2ull_rz(double a)) | | | |
| { | | | |
| return __internal_double2ull(a, cudaRoundZero); | | | |
| } | | | |
| | | | |
| #endif /* !__CUDABE__ */ | | | |
| | | | |
| #endif /* CUDA_DOUBLE_MATH_FUNCTIONS */ | | | |
| | | | |
| | | static __forceinline__ unsigned int __byte_perm(unsigned int a, unsigned int b, unsigned int slct) | |
| | | { | |
| | | unsigned int i0 = (slct >> 0) & 0x7; | |
| | | unsigned int i1 = (slct >> 4) & 0x7; | |
| | | unsigned int i2 = (slct >> 8) & 0x7; | |
| | | unsigned int i3 = (slct >> 12) & 0x7; | |
| | | return (((((i0 < 4) ? (a >> (i0*8)) : (b >> ((i0-4)*8))) & 0xff) << 0) + | |
| | | ((((i1 < 4) ? (a >> (i1*8)) : (b >> ((i1-4)*8))) & 0xff) << 8) + | |
| | | ((((i2 < 4) ? (a >> (i2*8)) : (b >> ((i2-4)*8))) & 0xff) << 16) + | |
| | | ((((i3 < 4) ? (a >> (i3*8)) : (b >> ((i3-4)*8))) & 0xff) << 24)); | |
| | | } | |
| | | | |
| | | #endif /* __CUDA_ARCH__ < 200 */ | |
| | | | |
| | | static __forceinline__ int __ffs(int a) | |
| | | { | |
| | | return 32 - __clz(a & -a); | |
| | | } | |
| | | | |
| | | static __forceinline__ int __ffsll(long long int a) | |
| | | { | |
| | | return 64 - __clzll(a & -a); | |
| | | } | |
| | | | |
| #endif /* __cplusplus && __CUDACC__ */ | | #endif /* __cplusplus && __CUDACC__ */ | |
| | | | |
| /***************************************************************************** | | /***************************************************************************** | |
| *                                                                           * | | *                                                                           * | |
| *                                                                           * | | *                                                                           * | |
| *                                                                           * | | *                                                                           * | |
| *****************************************************************************/ | | *****************************************************************************/ | |
| | | | |
| #include "sm_11_atomic_functions.h" | | #include "sm_11_atomic_functions.h" | |
| #include "sm_12_atomic_functions.h" | | #include "sm_12_atomic_functions.h" | |
| #include "sm_13_double_functions.h" | | #include "sm_13_double_functions.h" | |
| #include "sm_20_atomic_functions.h" | | #include "sm_20_atomic_functions.h" | |
| #include "sm_20_intrinsics.h" | | #include "sm_20_intrinsics.h" | |
| | | #include "surface_functions.h" | |
| #include "texture_fetch_functions.h" | | #include "texture_fetch_functions.h" | |
| | | | |
| #endif /* !__DEVICE_FUNCTIONS_H__ */ | | #endif /* !__DEVICE_FUNCTIONS_H__ */ | |
| | | | |
End of changes. 49 change blocks. 2271 lines changed or deleted, 163 lines changed or added.

| math_functions_dbl_ptx3.h | | math_functions_dbl_ptx3.h | |
| | | | |
| skipping to change at line 41 | | skipping to change at line 41 | |
| * Any use of this source code in individual and commercial software must | | * Any use of this source code in individual and commercial software must | |
| * include, in the user documentation and internal comments to the code, | | * include, in the user documentation and internal comments to the code, | |
| * the above Disclaimer and U.S. Government End Users Notice. | | * the above Disclaimer and U.S. Government End Users Notice. | |
| */ | | */ | |
| | | | |
| #if !defined(__MATH_FUNCTIONS_DBL_PTX3_H__) | | #if !defined(__MATH_FUNCTIONS_DBL_PTX3_H__) | |
| #define __MATH_FUNCTIONS_DBL_PTX3_H__ | | #define __MATH_FUNCTIONS_DBL_PTX3_H__ | |
| | | | |
| /* True double precision implementations, since native double support */ | | /* True double precision implementations, since native double support */ | |
| | | | |
| #if defined(__cplusplus) && defined(__CUDACC__) | | #if defined(__CUDABE__) | |
| | | | |
| #elif !defined(__CUDACC__) | | | |
| | | | |
| #include "crt/func_macro.h" | | | |
| | | | |
| #define INT_MAX \ | | | |
| ((int)((unsigned int)-1 >> 1)) | | | |
| | | | |
| #include "device_functions.h" | | | |
| #include "math_constants.h" | | | |
| #if !defined(__CUDABE__) | | | |
| #include "common_types.h" | | | |
| #endif | | | |
| /***************************************************************************** | | /***************************************************************************** | |
| *                                                                           * | | *                                                                           * | |
| * DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPREATIONS    * | | * DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS    * | |
| *                                                                           * | | *                                                                           * | |
| *****************************************************************************/ | | *****************************************************************************/ | |
****/ | |
| __device_func__(double __cuda_fabs(double a)) | | | |
| { | | | |
| return fabs(a); | | | |
| } | | | |
| | | | |
| __device_func__(double __cuda_fmax(double a, double b)) | | | |
| { | | | |
| #if !defined(__CUDABE__) | | | |
| volatile union __cudart_DoubleUlonglongCvt cvta, cvtb; | | | |
| int nana, nanb; | | | |
| | | | |
| cvta.d = a; | | | |
| cvtb.d = b; | | | |
| nana = ((cvta.i << 1) > 0xffe0000000000000ULL); | | | |
| nanb = ((cvtb.i << 1) > 0xffe0000000000000ULL); | | | |
| if (nana && nanb) return a + b; | | | |
| if (nana) return b; | | | |
| if (nanb) return a; | | | |
| if ((cvta.d == 0.0) && (cvtb.d == 0.0)) { | | | |
| cvta.i &= cvtb.i; | | | |
| return cvta.d; | | | |
| } | | | |
| return a > b ? a : b; | | | |
| #else | | | |
| return fmax(a, b); | | | |
| #endif /* !defined(__CUDABE__) */ | | | |
| } | | | |
| | | | |
| __device_func__(double __cuda_fmin(double a, double b)) | | | |
| { | | | |
| #if !defined(__CUDABE__) | | | |
| volatile union __cudart_DoubleUlonglongCvt cvta, cvtb; | | | |
| int nana, nanb; | | | |
| | | | |
| cvta.d = a; | | | |
| cvtb.d = b; | | | |
| nana = ((cvta.i << 1) > 0xffe0000000000000ULL); | | | |
| nanb = ((cvtb.i << 1) > 0xffe0000000000000ULL); | | | |
| if (nana && nanb) return a + b; | | | |
| if (nana) return b; | | | |
| if (nanb) return a; | | | |
| if ((cvta.i | cvtb.i) == 0x8000000000000000ULL) { | | | |
| return CUDART_NEG_ZERO; | | | |
| } | | | |
| return a < b ? a : b; | | | |
| #else | | | |
| return fmin(a, b); | | | |
| #endif /* !defined(__CUDABE__) */ | | | |
| } | | | |
| | | | |
| __device_func__(double __cuda_ceil(double a)) | | | |
| { | | | |
| return ceil(a); | | | |
| } | | | |
| | | | |
| __device_func__(double __cuda_floor(double a)) | | | |
| { | | | |
| return floor(a); | | | |
| } | | | |
| | | | |
| __device_func__(double __cuda_trunc(double a)) | | | |
| { | | | |
| return trunc(a); | | | |
| } | | | |
| | | | |
| __device_func__(double __cuda_nearbyint(double a)) | | | |
| { | | | |
| #if defined(__CUDABE__) | | | |
| return round(a); | | | |
| #else /* __CUDABE__ */ | | | |
| double res = nearbyint(a); | | | |
| #if defined(__APPLE__) | | | |
| if ((a != 0.0) && (__cuda_fabs(a) <= 0.5)) { | | | |
| res = fabs(res) * ((a < 0.0) ? -3e-324 : 3e-324); | | | |
| } | | | |
| #endif /* __APPLE__ */ | | | |
| return res; | | | |
| #endif /* __CUDABE__ */ | | | |
| } | | | |
| | | | |
| | | static __forceinline__ double rint(double a) | |
| | | { | |
| | | return __builtin_round(a); | |
| | | } | |
| | | | |
| | | static __forceinline__ long int lrint(double a) | |
| | | { | |
| | | #if defined(__LP64__) | |
| | | return (long int)__double2ll_rn(a); | |
| | | #else /* __LP64__ */ | |
| | | return (long int)__double2int_rn(a); | |
| | | #endif /* __LP64__ */ | |
| | | } | |
| | | | |
| | | static __forceinline__ long long int llrint(double a) | |
| | | { | |
| | | return __double2ll_rn(a); | |
| | | } | |
| | | | |
| | | static __forceinline__ double nearbyint(double a) | |
| | | { | |
| | | return __builtin_round(a); | |
| | | } | |
| | | | |
| /***************************************************************************** | | /***************************************************************************** | |
| *                                                                           * | | *                                                                           * | |
| * DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITHOUT BUILTIN NVOPENCC OPERATIONS * | | * DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITHOUT BUILTIN NVOPENCC OPERATIONS * | |
| *                                                                           * | | *                                                                           * | |
| *****************************************************************************/ | | *****************************************************************************/ | |
| | | | |
| __device_func__(double __cuda_rint(double a)) | | | |
| { | | | |
| return __cuda_nearbyint(a); | | | |
| } | | | |
| | | | |
| __device_func__(long int __cuda_lrint(double a)) | | | |
| { | | | |
| #if defined(__LP64__) | | | |
| return (long int)__double2ll_rn(a); | | | |
| #else /* __LP64__ */ | | | |
| return (long int)__double2int_rn(a); | | | |
| #endif /* __LP64__ */ | | | |
| } | | | |
| | | | |
| __device_func__(long long int __cuda_llrint(double a)) | | | |
| { | | | |
| return __double2ll_rn(a); | | | |
| } | | | |
| | | | |
| __device_func__(int __cuda___signbit(double a)) | | static __forceinline__ int __signbit(double a) | |
| { | | { | |
| return (int)((unsigned int)__double2hiint(a) >> 31); | | return (int)((unsigned int)__double2hiint(a) >> 31); | |
| } | | } | |
| | | | |
| __device_func__(int __cuda___finite(double a)) | | static __forceinline__ int __finite(double a) | |
| { | | { | |
| return __cuda_fabs(a) < CUDART_INF; | | return fabs(a) < CUDART_INF; | |
| } | | } | |
| | | | |
| __device_func__(int __cuda___isinf(double a)) | | static __forceinline__ int __isinf(double a) | |
| { | | { | |
| return __cuda_fabs(a) == CUDART_INF; | | return fabs(a) == CUDART_INF; | |
| } | | } | |
| | | | |
|
| __device_func__(int __cuda___isnan(double a)) | | static __forceinline__ int __isnan(double a) | |
| { | | { | |
|
| return !(__cuda_fabs(a) <= CUDART_INF); | | return !(fabs(a) <= CUDART_INF); | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_copysign(double a, double b)) | | static __forceinline__ double copysign(double a, double b) | |
| { | | { | |
| int alo, ahi, bhi; | | int alo, ahi, bhi; | |
| | | | |
| bhi = __double2hiint(b); | | bhi = __double2hiint(b); | |
| alo = __double2loint(a); | | alo = __double2loint(a); | |
| ahi = __double2hiint(a); | | ahi = __double2hiint(a); | |
| ahi = (bhi & 0x80000000) | (ahi & ~0x80000000); | | ahi = (bhi & 0x80000000) | (ahi & ~0x80000000); | |
| return __hiloint2double(ahi, alo); | | return __hiloint2double(ahi, alo); | |
| } | | } | |
| | | | |
| /* like copysign, but requires that argument a is positive */ | | /* like copysign, but requires that argument a is positive */ | |
|
| __device_func__(double __internal_copysign_pos(double a, double b)) | | static __forceinline__ double __internal_copysign_pos(double a, double b) | |
| { | | { | |
| int alo, ahi, bhi; | | int alo, ahi, bhi; | |
| | | | |
| bhi = __double2hiint(b); | | bhi = __double2hiint(b); | |
| alo = __double2loint(a); | | alo = __double2loint(a); | |
| ahi = __double2hiint(a); | | ahi = __double2hiint(a); | |
| ahi = (bhi & 0x80000000) | ahi; | | ahi = (bhi & 0x80000000) | ahi; | |
| return __hiloint2double(ahi, alo); | | return __hiloint2double(ahi, alo); | |
| } | | } | |
| | | | |
|
| | | static __forceinline__ double __internal_fast_rcp(double a) | |
| | | { | |
| | | double e, y; | |
| | | float x; | |
| | | x = __double2float_rn(a); | |
| | | y = (double)(1.0f/x); | |
| | | e = __fma_rn (-a, y, 1.0); | |
| | | e = __fma_rn ( e, e, e); | |
| | | y = __fma_rn ( e, y, y); | |
| | | return y; | |
| | | } | |
| | | | |
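
The newly added __internal_fast_rcp seeds a reciprocal with a single-precision divide and sharpens it with three fused operations. Writing e = 1 - a*y for the seed's residual, the FMA chain computes, under exact arithmetic (a restatement of the code following the standard Newton-Raphson analysis, not additional header content):

\[ y' \;=\; y\,(1 + e + e^2) \;=\; \frac{1}{a}\,\bigl(1 - e^3\bigr), \]

so one pass cubes the relative error: roughly 2^-24 from the float divide drops to about 2^-72, comfortably below the 2^-53 needed at double precision.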
| /* 1152 bits of 2/PI for Payne-Hanek style argument reduction. */ | | /* 1152 bits of 2/PI for Payne-Hanek style argument reduction. */ | |
| static __constant__ unsigned long long int __cudart_i2opi_d [] = { | | static __constant__ unsigned long long int __cudart_i2opi_d [] = { | |
| 0x6bfb5fb11f8d5d08ULL, | | 0x6bfb5fb11f8d5d08ULL, | |
| 0x3d0739f78a5292eaULL, | | 0x3d0739f78a5292eaULL, | |
| 0x7527bac7ebe5f17bULL, | | 0x7527bac7ebe5f17bULL, | |
| 0x4f463f669e5fea2dULL, | | 0x4f463f669e5fea2dULL, | |
| 0x6d367ecf27cb09b7ULL, | | 0x6d367ecf27cb09b7ULL, | |
| 0xef2f118b5a0a6d1fULL, | | 0xef2f118b5a0a6d1fULL, | |
| 0x1ff897ffde05980fULL, | | 0x1ff897ffde05980fULL, | |
| 0x9c845f8bbdf9283bULL, | | 0x9c845f8bbdf9283bULL, | |
| | | | |
| skipping to change at line 231 | | skipping to change at line 156 | |
| 0xe88235f52ebb4484ULL, | | 0xe88235f52ebb4484ULL, | |
| 0xfe1deb1cb129a73eULL, | | 0xfe1deb1cb129a73eULL, | |
| 0x06492eea09d1921cULL, | | 0x06492eea09d1921cULL, | |
| 0xb7246e3a424dd2e0ULL, | | 0xb7246e3a424dd2e0ULL, | |
| 0xfe5163abdebbc561ULL, | | 0xfe5163abdebbc561ULL, | |
| 0xdb6295993c439041ULL, | | 0xdb6295993c439041ULL, | |
| 0xfc2757d1f534ddc0ULL, | | 0xfc2757d1f534ddc0ULL, | |
| 0xa2f9836e4e441529ULL, | | 0xa2f9836e4e441529ULL, | |
| }; | | }; | |
| | | | |
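
The table stores 2/pi in 64-bit chunks so that the reduction below can form the product a * (2/pi) to far more bits than a double carries. Only the bits adjacent to the binary point of that product matter; in outline (a summary of the standard Payne-Hanek scheme the code implements):

\[ a \cdot \tfrac{2}{\pi} \;=\; N + f, \qquad q = N \bmod 4, \qquad a \bmod \tfrac{\pi}{2} \;=\; f \cdot \tfrac{\pi}{2}, \]

where q is the quadrant returned through *quadrant. The index arithmetic below picks out at most four table words, the only ones whose partial products can contribute bits near the binary point for the given exponent.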
|
| __device_func__(double __internal_trig_reduction_kerneld(double a, int *quadrant)) | | static __forceinline__ double __internal_trig_reduction_kerneld(double a, int *quadrant) | |
| { | | { | |
| double j; | | double j; | |
| int q; | | int q; | |
|
| if (__cuda_fabs(a) > CUDART_TRIG_PLOSS) { | | if (fabs(a) > CUDART_TRIG_PLOSS) { | |
| /* Payne-Hanek style argument reduction. */ | | /* Payne-Hanek style argument reduction. */ | |
| unsigned long long int ia; | | unsigned long long int ia; | |
| unsigned long long int s; | | unsigned long long int s; | |
| unsigned long long int result[5]; | | unsigned long long int result[5]; | |
| unsigned long long int phi, plo; | | unsigned long long int phi, plo; | |
| unsigned long long int hi, lo; | | unsigned long long int hi, lo; | |
| unsigned int e; | | unsigned int e; | |
| int idx; | | int idx; | |
| | | | |
| ia = __double_as_longlong(a); | | ia = __double_as_longlong(a); | |
| s = ia & 0x8000000000000000ULL; | | s = ia & 0x8000000000000000ULL; | |
| e = (unsigned int)(((ia >> 52) & 0x7ff) - 1024); | | e = (unsigned int)(((ia >> 52) & 0x7ff) - 1024); | |
| ia = (ia << 11) | 0x8000000000000000ULL; | | ia = (ia << 11) | 0x8000000000000000ULL; | |
| /* compute x * 2/pi */ | | /* compute x * 2/pi */ | |
| idx = 16 - (e >> 6); | | idx = 16 - (e >> 6); | |
| hi = 0; | | hi = 0; | |
|
| #if defined(__CUDABE__) | | | |
| #pragma unroll 1 | | #pragma unroll 1 | |
|
| #endif /* __CUDABE__ */ | | | |
| for (q = (idx-1); q < min(18,idx+3); q++) { | | for (q = (idx-1); q < min(18,idx+3); q++) { | |
| plo = __cudart_i2opi_d[q] * ia; | | plo = __cudart_i2opi_d[q] * ia; | |
| phi = __umul64hi (__cudart_i2opi_d[q], ia); | | phi = __umul64hi (__cudart_i2opi_d[q], ia); | |
| lo = hi + plo; | | lo = hi + plo; | |
| hi = phi + (lo < plo); | | hi = phi + (lo < plo); | |
| result[q-(idx-1)] = lo; | | result[q-(idx-1)] = lo; | |
| } | | } | |
| result[q-(idx-1)] = hi; | | result[q-(idx-1)] = hi; | |
| e = e & 63; | | e = e & 63; | |
| /* shift result such that hi:lo<127:126> are the least significant | | /* shift result such that hi:lo<127:126> are the least significant | |
| | | | |
| skipping to change at line 324 | | skipping to change at line 247 | |
| * http://arxiv.org/PS_cache/arxiv/pdf/0708/0708.3722v1.pdf | | * http://arxiv.org/PS_cache/arxiv/pdf/0708/0708.3722v1.pdf | |
| */ | | */ | |
| a = __fma_rn (-j, 1.5707963267948966e+000, a); | | a = __fma_rn (-j, 1.5707963267948966e+000, a); | |
| a = __fma_rn (-j, 6.1232339957367574e-017, a); | | a = __fma_rn (-j, 6.1232339957367574e-017, a); | |
| a = __fma_rn (-j, 8.4784276603688985e-032, a); | | a = __fma_rn (-j, 8.4784276603688985e-032, a); | |
| *quadrant = q; | | *quadrant = q; | |
| return a; | | return a; | |
| } | | } | |
| | | | |
| /* approximate sine on -pi/4...+pi/4 */ | | /* approximate sine on -pi/4...+pi/4 */ | |
|
| __device_func__(double __internal_sin_kerneld(double x)) | | static __forceinline__ double __internal_sin_kerneld(double x) | |
| { | | { | |
| double x2, z; | | double x2, z; | |
| x2 = x * x; | | x2 = x * x; | |
| z = 1.5896230157221844E-010; | | z = 1.5896230157221844E-010; | |
| z = __fma_rn (z, x2, -2.5050747762850355E-008); | | z = __fma_rn (z, x2, -2.5050747762850355E-008); | |
| z = __fma_rn (z, x2, 2.7557313621385676E-006); | | z = __fma_rn (z, x2, 2.7557313621385676E-006); | |
| z = __fma_rn (z, x2, -1.9841269829589539E-004); | | z = __fma_rn (z, x2, -1.9841269829589539E-004); | |
| z = __fma_rn (z, x2, 8.3333333333221182E-003); | | z = __fma_rn (z, x2, 8.3333333333221182E-003); | |
| z = __fma_rn (z, x2, -1.6666666666666630E-001); | | z = __fma_rn (z, x2, -1.6666666666666630E-001); | |
| z = z * x2; | | z = z * x2; | |
| z = __fma_rn (z, x, x); | | z = __fma_rn (z, x, x); | |
| return z; | | return z; | |
| } | | } | |
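
The kernel is a minimax polynomial in x^2 evaluated by Horner's rule, one __fma_rn per coefficient. Schematically (restating the code above):

\[ \sin x \;\approx\; x + x^3\,\bigl(c_0 + c_1 x^2 + \cdots + c_5 x^{10}\bigr), \qquad c_0 \approx -\tfrac{1}{6}, \]

with the final __fma_rn(z, x, x) folding in the dominant term x last, so rounding of the small correction cannot disturb it.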
| | | | |
| /* approximate cosine on -pi/4...+pi/4 */ | | /* approximate cosine on -pi/4...+pi/4 */ | |
|
| __device_func__(double __internal_cos_kerneld(double x)) | | static __forceinline__ double __internal_cos_kerneld(double x) | |
| { | | { | |
| double x2, z; | | double x2, z; | |
| x2 = x * x; | | x2 = x * x; | |
| z = -1.136788825395985E-011; | | z = -1.136788825395985E-011; | |
| z = __fma_rn (z, x2, 2.087588480545065E-009); | | z = __fma_rn (z, x2, 2.087588480545065E-009); | |
| z = __fma_rn (z, x2, -2.755731555403950E-007); | | z = __fma_rn (z, x2, -2.755731555403950E-007); | |
| z = __fma_rn (z, x2, 2.480158729365970E-005); | | z = __fma_rn (z, x2, 2.480158729365970E-005); | |
| z = __fma_rn (z, x2, -1.388888888888074E-003); | | z = __fma_rn (z, x2, -1.388888888888074E-003); | |
| z = __fma_rn (z, x2, 4.166666666666664E-002); | | z = __fma_rn (z, x2, 4.166666666666664E-002); | |
| z = __fma_rn (z, x2, -5.000000000000000E-001); | | z = __fma_rn (z, x2, -5.000000000000000E-001); | |
| z = __fma_rn (z, x2, 1.000000000000000E+000); | | z = __fma_rn (z, x2, 1.000000000000000E+000); | |
| return z; | | return z; | |
| } | | } | |
| | | | |
| /* approximate tangent on -pi/4...+pi/4 */ | | /* approximate tangent on -pi/4...+pi/4 */ | |
|
| __device_func__(double __internal_tan_kerneld(double x, int i)) | | static __forceinline__ double __internal_tan_kerneld(double x, int i) | |
| { | | { | |
| double x2, z, q; | | double x2, z, q; | |
| x2 = x * x; | | x2 = x * x; | |
| z = 9.8006287203286300E-006; | | z = 9.8006287203286300E-006; | |
| z = __fma_rn (z, x2, -2.4279526494179897E-005); | | z = __fma_rn (z, x2, -2.4279526494179897E-005); | |
| z = __fma_rn (z, x2, 4.8644173130937162E-005); | | z = __fma_rn (z, x2, 4.8644173130937162E-005); | |
| z = __fma_rn (z, x2, -2.5640012693782273E-005); | | z = __fma_rn (z, x2, -2.5640012693782273E-005); | |
| z = __fma_rn (z, x2, 6.7223984330880073E-005); | | z = __fma_rn (z, x2, 6.7223984330880073E-005); | |
| z = __fma_rn (z, x2, 8.3559287318211639E-005); | | z = __fma_rn (z, x2, 8.3559287318211639E-005); | |
| z = __fma_rn (z, x2, 2.4375039850848564E-004); | | z = __fma_rn (z, x2, 2.4375039850848564E-004); | |
| | | | |
| skipping to change at line 388 | | skipping to change at line 311 | |
| double s = q - x; | | double s = q - x; | |
| double w = __fma_rn (z, x, -s); // tail of q | | double w = __fma_rn (z, x, -s); // tail of q | |
| z = 1.0 / q; | | z = 1.0 / q; | |
| z = -z; | | z = -z; | |
| s = __fma_rn (q, z, 1.0); | | s = __fma_rn (q, z, 1.0); | |
| q = __fma_rn (z, __fma_rn (z, w, s), z); | | q = __fma_rn (z, __fma_rn (z, w, s), z); | |
| } | | } | |
| return q; | | return q; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_sqrt(double a)) | | | |
| { | | | |
| return sqrt(a); | | | |
| } | | | |
| | | | |
| __device_func__(double __cuda_rsqrt(double a)) | | | |
| { | | | |
| #if !defined(__CUDABE__) | | | |
| return 1.0 / sqrt(a); | | | |
| #else | | | |
| return rsqrt(a); | | | |
| #endif | | | |
| } | | | |
| | | | |
| /* approximates exp(a)-1 on [-log(1.5),log(1.5)] accurate to 1 ulp */ | | /* approximates exp(a)-1 on [-log(1.5),log(1.5)] accurate to 1 ulp */ | |
|
| __device_func__(double __internal_expm1_kernel (double a)) | | static __forceinline__ double __internal_expm1_kernel (double a) | |
| { | | { | |
| double t; | | double t; | |
| t = 2.08842685477913050E-009; | | t = 2.08842685477913050E-009; | |
| t = __fma_rn (t, a, 2.51366409033551950E-008); | | t = __fma_rn (t, a, 2.51366409033551950E-008); | |
| t = __fma_rn (t, a, 2.75574612072447230E-007); | | t = __fma_rn (t, a, 2.75574612072447230E-007); | |
| t = __fma_rn (t, a, 2.75571539284473460E-006); | | t = __fma_rn (t, a, 2.75571539284473460E-006); | |
| t = __fma_rn (t, a, 2.48015869443077950E-005); | | t = __fma_rn (t, a, 2.48015869443077950E-005); | |
| t = __fma_rn (t, a, 1.98412699878799470E-004); | | t = __fma_rn (t, a, 1.98412699878799470E-004); | |
| t = __fma_rn (t, a, 1.38888888892029890E-003); | | t = __fma_rn (t, a, 1.38888888892029890E-003); | |
| t = __fma_rn (t, a, 8.33333333327662860E-003); | | t = __fma_rn (t, a, 8.33333333327662860E-003); | |
| t = __fma_rn (t, a, 4.16666666666656370E-002); | | t = __fma_rn (t, a, 4.16666666666656370E-002); | |
| t = __fma_rn (t, a, 1.66666666666667380E-001); | | t = __fma_rn (t, a, 1.66666666666667380E-001); | |
| t = __fma_rn (t, a, 5.00000000000000000E-001); | | t = __fma_rn (t, a, 5.00000000000000000E-001); | |
| t = t * a; | | t = t * a; | |
| t = __fma_rn (t, a, a); | | t = __fma_rn (t, a, a); | |
| return t; | | return t; | |
| } | | } | |
| | | | |
| /* approximate 2*atanh(0.5*a) on [-0.25,0.25] */ | | /* approximate 2*atanh(0.5*a) on [-0.25,0.25] */ | |
|
| __device_func__(double __internal_atanh_kernel (double a_1, double a_2)) | | static __forceinline__ double __internal_atanh_kernel (double a_1, double a_2) | |
| { | | { | |
| double a, a2, t; | | double a, a2, t; | |
| | | | |
| a = a_1 + a_2; | | a = a_1 + a_2; | |
| a2 = a * a; | | a2 = a * a; | |
| t = 7.597322383488143E-002/65536.0; | | t = 7.597322383488143E-002/65536.0; | |
| t = __fma_rn (t, a2, 6.457518383364042E-002/16384.0); | | t = __fma_rn (t, a2, 6.457518383364042E-002/16384.0); | |
| t = __fma_rn (t, a2, 7.705685707267146E-002/4096.0); | | t = __fma_rn (t, a2, 7.705685707267146E-002/4096.0); | |
| t = __fma_rn (t, a2, 9.090417561104036E-002/1024.0); | | t = __fma_rn (t, a2, 9.090417561104036E-002/1024.0); | |
| t = __fma_rn (t, a2, 1.111112158368149E-001/256.0); | | t = __fma_rn (t, a2, 1.111112158368149E-001/256.0); | |
| t = __fma_rn (t, a2, 1.428571416261528E-001/64.0); | | t = __fma_rn (t, a2, 1.428571416261528E-001/64.0); | |
| t = __fma_rn (t, a2, 2.000000000069858E-001/16.0); | | t = __fma_rn (t, a2, 2.000000000069858E-001/16.0); | |
| t = __fma_rn (t, a2, 3.333333333333198E-001/4.0); | | t = __fma_rn (t, a2, 3.333333333333198E-001/4.0); | |
| t = t * a2; | | t = t * a2; | |
| t = __fma_rn (t, a, a_2); | | t = __fma_rn (t, a, a_2); | |
| t = t + a_1; | | t = t + a_1; | |
| return t; | | return t; | |
| } | | } | |
| | | | |
|
| __device_func__(double __internal_exp2i_kernel(int b)) | | static __forceinline__ double __internal_exp2i_kernel(int b) | |
| { | | { | |
| return __hiloint2double((b + 1023) << 20, 0); | | return __hiloint2double((b + 1023) << 20, 0); | |
| } | | } | |
| | | | |
|
| __device_func__(double __internal_half(double a)) | | static __forceinline__ double __internal_half(double a) | |
| { | | { | |
| unsigned int ihi, ilo; | | unsigned int ihi, ilo; | |
| ilo = __double2loint(a); | | ilo = __double2loint(a); | |
| ihi = __double2hiint(a); | | ihi = __double2hiint(a); | |
| return __hiloint2double(ihi - 0x00100000, ilo); | | return __hiloint2double(ihi - 0x00100000, ilo); | |
| } | | } | |
| | | | |
|
| __device_func__(double __internal_twice(double a)) | | static __forceinline__ double __internal_twice(double a) | |
| { | | { | |
| unsigned int ihi, ilo; | | unsigned int ihi, ilo; | |
| ilo = __double2loint(a); | | ilo = __double2loint(a); | |
| ihi = __double2hiint(a); | | ihi = __double2hiint(a); | |
| return __hiloint2double(ihi + 0x00100000, ilo); | | return __hiloint2double(ihi + 0x00100000, ilo); | |
| } | | } | |
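
__internal_half and __internal_twice scale by a power of two simply by adding to the biased exponent field: 0x00100000 is a one in bit 20 of the high word, the lowest exponent bit. The shortcut is exact for normal numbers but wrong for zeros, denormals, infinities and NaNs, so callers apply it only to values already known to be in range. A host-side sketch of the same idea, assuming IEEE-754 doubles; scale_by_two is a name invented here:

#include <stdio.h>
#include <string.h>

/* Double a normal value by incrementing the exponent field directly
   (bit 52 of the 64-bit encoding is the low exponent bit). */
static double scale_by_two(double a)
{
    unsigned long long i;
    memcpy(&i, &a, sizeof i);
    i += 1ULL << 52;
    memcpy(&a, &i, sizeof a);
    return a;
}

int main(void)
{
    printf("%g\n", scale_by_two(3.0));   /* prints 6 */
    return 0;
}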
| | | | |
|
| __device_func__(double __cuda_sin(double a)) | | static __forceinline__ double sin(double a) | |
| { | | { | |
| double z; | | double z; | |
| int i; | | int i; | |
|
| if (__cuda___isinf(a) || (a == CUDART_ZERO)) { | | if (__isinf(a) || (a == CUDART_ZERO)) { | |
| return __dmul_rn(a, CUDART_ZERO); | | return __dmul_rn(a, CUDART_ZERO); | |
| } | | } | |
| z = __internal_trig_reduction_kerneld(a, &i); | | z = __internal_trig_reduction_kerneld(a, &i); | |
| /* here, abs(z) <= pi/4, and i has the quadrant */ | | /* here, abs(z) <= pi/4, and i has the quadrant */ | |
| if (i & 1) { | | if (i & 1) { | |
| z = __internal_cos_kerneld(z); | | z = __internal_cos_kerneld(z); | |
| } else { | | } else { | |
| z = __internal_sin_kerneld(z); | | z = __internal_sin_kerneld(z); | |
| } | | } | |
| if (i & 2) { | | if (i & 2) { | |
| z = -z; | | z = -z; | |
| } | | } | |
| return z; | | return z; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_sinpi(double a)) | | static __forceinline__ double sinpi(double a) | |
| { | | { | |
| double z; | | double z; | |
| double fi; | | double fi; | |
| int i; | | int i; | |
| | | | |
|
| if (__cuda___isinf(a) || (a == CUDART_ZERO)) { | | if (__isinf(a) || (a == CUDART_ZERO)) { | |
| return __dmul_rn(a, CUDART_ZERO); | | return __dmul_rn(a, CUDART_ZERO); | |
| } | | } | |
|   /* IEEE-754: sinPi(+n) is +0 and sinPi(-n) is -0 for positive integers n. */ | |   /* IEEE-754: sinPi(+n) is +0 and sinPi(-n) is -0 for positive integers n. */ | |
|
| if (a == __cuda_trunc(a)) { | | if (a == trunc(a)) { | |
|     return __longlong_as_double(__double_as_longlong(a)&0x8000000000000000ULL); | |     return __longlong_as_double(__double_as_longlong(a)&0x8000000000000000ULL); | |
| } | | } | |
|
| fi = __cuda_rint (a * 2.0); | | fi = rint (a * 2.0); | |
| z = __fma_rn (fi, -0.5, a); | | z = __fma_rn (fi, -0.5, a); | |
| z = __fma_rn (z, CUDART_PI_HI, z * CUDART_PI_LO); | | z = __fma_rn (z, CUDART_PI_HI, z * CUDART_PI_LO); | |
| i = (int)(((long long)fi) & 3); | | i = (int)(((long long)fi) & 3); | |
| if (i & 1) { | | if (i & 1) { | |
| z = __internal_cos_kerneld(z); | | z = __internal_cos_kerneld(z); | |
| } else { | | } else { | |
| z = __internal_sin_kerneld(z); | | z = __internal_sin_kerneld(z); | |
| } | | } | |
| if (i & 2) { | | if (i & 2) { | |
| z = -z; | | z = -z; | |
| } | | } | |
| return z; | | return z; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_cos(double a)) | | static __forceinline__ double cos(double a) | |
| { | | { | |
| double z; | | double z; | |
| int i; | | int i; | |
|
| if (__cuda___isinf(a)) { | | if (__isinf(a)) { | |
| return CUDART_NAN; | | return CUDART_NAN; | |
| } | | } | |
| z = __internal_trig_reduction_kerneld(a, &i); | | z = __internal_trig_reduction_kerneld(a, &i); | |
| /* here, abs(z) <= pi/4, and i has the quadrant */ | | /* here, abs(z) <= pi/4, and i has the quadrant */ | |
| i++; | | i++; | |
| if (i & 1) { | | if (i & 1) { | |
| z = __internal_cos_kerneld(z); | | z = __internal_cos_kerneld(z); | |
| } else { | | } else { | |
| z = __internal_sin_kerneld(z); | | z = __internal_sin_kerneld(z); | |
| } | | } | |
| if (i & 2) { | | if (i & 2) { | |
| z = -z; | | z = -z; | |
| } | | } | |
| return z; | | return z; | |
| } | | } | |
| | | | |
|
| __device_func__(void __cuda_sincos(double a, double *sptr, double *cptr)) | | static __forceinline__ void sincos(double a, double *sptr, double *cptr) | |
| { | | { | |
| double t, u, s, c; | | double t, u, s, c; | |
| int i; | | int i; | |
|
| t = __cuda_fabs(a); | | t = fabs(a); | |
| if ((t == CUDART_INF) || (t == CUDART_ZERO)) { | | if ((t == CUDART_INF) || (t == CUDART_ZERO)) { | |
| s = __dmul_rn (a, CUDART_ZERO); /* generate NaN, zero */ | | s = __dmul_rn (a, CUDART_ZERO); /* generate NaN, zero */ | |
| c = 1.0 + s; /* generate NaN, one */ | | c = 1.0 + s; /* generate NaN, one */ | |
| *sptr = s; | | *sptr = s; | |
| *cptr = c; | | *cptr = c; | |
| return; | | return; | |
| } | | } | |
| t = __internal_trig_reduction_kerneld(a, &i); | | t = __internal_trig_reduction_kerneld(a, &i); | |
| u = __internal_cos_kerneld(t); | | u = __internal_cos_kerneld(t); | |
| t = __internal_sin_kerneld(t); | | t = __internal_sin_kerneld(t); | |
| | | | |
| skipping to change at line 566 | | skipping to change at line 475 | |
| s = -s; | | s = -s; | |
| } | | } | |
| i++; | | i++; | |
| if (i & 2) { | | if (i & 2) { | |
| c = -c; | | c = -c; | |
| } | | } | |
| *sptr = s; | | *sptr = s; | |
| *cptr = c; | | *cptr = c; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_tan(double a)) | | static __forceinline__ double tan(double a) | |
| { | | { | |
| double z; | | double z; | |
| int i; | | int i; | |
|
| if (__cuda___isinf(a)) { | | if (__isinf(a)) { | |
| return __dadd_rn (a, -a); /* return NaN */ | | return __dadd_rn (a, -a); /* return NaN */ | |
| } | | } | |
| z = __internal_trig_reduction_kerneld(a, &i); | | z = __internal_trig_reduction_kerneld(a, &i); | |
| /* here, abs(z) <= pi/4, and i has the quadrant */ | | /* here, abs(z) <= pi/4, and i has the quadrant */ | |
| z = __internal_tan_kerneld(z, i & 1); | | z = __internal_tan_kerneld(z, i & 1); | |
| return z; | | return z; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_log(double a)) | | static __forceinline__ double log(double a) | |
| { | | { | |
| double m, f, g, u, v, tmp, q, ulo, log_lo, log_hi; | | double m, f, g, u, v, tmp, q, ulo, log_lo, log_hi; | |
| int ihi, ilo; | | int ihi, ilo; | |
| | | | |
| ihi = __double2hiint(a); | | ihi = __double2hiint(a); | |
| ilo = __double2loint(a); | | ilo = __double2loint(a); | |
| | | | |
| if ((a > CUDART_ZERO) && (a < CUDART_INF)) { | | if ((a > CUDART_ZERO) && (a < CUDART_INF)) { | |
| int e = -1023; | | int e = -1023; | |
| /* normalize denormals */ | | /* normalize denormals */ | |
| | | | |
| skipping to change at line 609 | | skipping to change at line 518 | |
| e += (ihi >> 20); | | e += (ihi >> 20); | |
| ihi = (ihi & 0x800fffff) | 0x3ff00000; | | ihi = (ihi & 0x800fffff) | 0x3ff00000; | |
| m = __hiloint2double (ihi, ilo); | | m = __hiloint2double (ihi, ilo); | |
| if ((unsigned)ihi > (unsigned)0x3ff6a09e) { | | if ((unsigned)ihi > (unsigned)0x3ff6a09e) { | |
| m = __internal_half(m); | | m = __internal_half(m); | |
| e = e + 1; | | e = e + 1; | |
| } | | } | |
| /* log((1+m)/(1-m)) = 2*atanh(m). log(m) = 2*atanh ((m-1)/(m+1)) */ | | /* log((1+m)/(1-m)) = 2*atanh(m). log(m) = 2*atanh ((m-1)/(m+1)) */ | |
| f = m - 1.0; | | f = m - 1.0; | |
| g = m + 1.0; | | g = m + 1.0; | |
|
| g = 1.0 / g; | | g = __internal_fast_rcp(g); | |
| u = f * g; | | u = f * g; | |
| u = u + u; | | u = u + u; | |
| /* u = 2.0 * (m - 1.0) / (m + 1.0) */ | | /* u = 2.0 * (m - 1.0) / (m + 1.0) */ | |
| v = u * u; | | v = u * u; | |
| q = 6.7261411553826339E-2/65536.0; | | q = 6.7261411553826339E-2/65536.0; | |
| q = __fma_rn (q, v, 6.6133829643643394E-2/16384.0); | | q = __fma_rn (q, v, 6.6133829643643394E-2/16384.0); | |
| q = __fma_rn (q, v, 7.6940931149150890E-2/4096.0); | | q = __fma_rn (q, v, 7.6940931149150890E-2/4096.0); | |
| q = __fma_rn (q, v, 9.0908745692137444E-2/1024.0); | | q = __fma_rn (q, v, 9.0908745692137444E-2/1024.0); | |
| q = __fma_rn (q, v, 1.1111111499059706E-1/256.0); | | q = __fma_rn (q, v, 1.1111111499059706E-1/256.0); | |
| q = __fma_rn (q, v, 1.4285714283305975E-1/64.0); | | q = __fma_rn (q, v, 1.4285714283305975E-1/64.0); | |
| | | | |
| skipping to change at line 640 | | skipping to change at line 549 | |
| log_lo = ulo + q; | | log_lo = ulo + q; | |
|     /* log_hi + log_lo = log(m)+e*log(2)=log(a) to more than double precision*/ | |     /* log_hi + log_lo = log(m)+e*log(2)=log(a) to more than double precision*/ | |
| q = __fma_rn ( e, CUDART_LN2_HI, log_hi); | | q = __fma_rn ( e, CUDART_LN2_HI, log_hi); | |
| tmp = __fma_rn (-e, CUDART_LN2_HI, q); | | tmp = __fma_rn (-e, CUDART_LN2_HI, q); | |
| tmp = tmp - log_hi; | | tmp = tmp - log_hi; | |
| log_hi = q; | | log_hi = q; | |
| log_lo = log_lo - tmp; | | log_lo = log_lo - tmp; | |
| log_lo = __fma_rn (e, CUDART_LN2_LO, log_lo); | | log_lo = __fma_rn (e, CUDART_LN2_LO, log_lo); | |
| return log_hi + log_lo; | | return log_hi + log_lo; | |
| } else { | | } else { | |
|
| if (__cuda___isnan(a)) { | | if (__isnan(a)) { | |
| return a + a; | | return a + a; | |
| } | | } | |
| /* log(0) = -INF */ | | /* log(0) = -INF */ | |
| if (a == 0) { | | if (a == 0) { | |
| return -CUDART_INF; | | return -CUDART_INF; | |
| } | | } | |
| /* log(INF) = INF */ | | /* log(INF) = INF */ | |
| if (a == CUDART_INF) { | | if (a == CUDART_INF) { | |
| return a; | | return a; | |
| } | | } | |
| /* log(x) is undefined for x < 0.0, return INDEFINITE */ | | /* log(x) is undefined for x < 0.0, return INDEFINITE */ | |
| return CUDART_NAN; | | return CUDART_NAN; | |
| } | | } | |
| } | | } | |
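
The normal path above rests on the atanh identity noted in the code. With a = m * 2^e and u = 2*(m - 1)/(m + 1), it evaluates (restated here for orientation, not additional header content):

\[ \ln a \;=\; \ln m + e \ln 2, \qquad \ln m \;=\; 2\,\operatorname{atanh}\!\Bigl(\frac{m-1}{m+1}\Bigr) \;=\; u + \frac{u^3}{12} + \frac{u^5}{80} + \cdots, \]

with the polynomial q supplying the series tail and e*ln(2) added in hi/lo pieces (CUDART_LN2_HI/CUDART_LN2_LO), so cancellation near powers of two does not cost accuracy.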
| | | | |
| /* Requires |x.y| > |y.y|. 8 DP operations */ | | /* Requires |x.y| > |y.y|. 8 DP operations */ | |
|
| __device_func__(double2 __internal_ddadd_xgty (double2 x, double2 y)) | | static __forceinline__ double2 __internal_ddadd_xgty (double2 x, double2 y) | |
| { | | { | |
| double2 z; | | double2 z; | |
|
| #if defined(__GNUC__) && !defined(__CUDABE__) | | | |
| volatile | | | |
| #endif | | | |
| double r, s, e; | | double r, s, e; | |
| r = x.y + y.y; | | r = x.y + y.y; | |
| e = x.y - r; | | e = x.y - r; | |
| s = ((e + y.y) + y.x) + x.x; | | s = ((e + y.y) + y.x) + x.x; | |
| z.y = e = r + s; | | z.y = e = r + s; | |
| z.x = (r - e) + s; | | z.x = (r - e) + s; | |
| return z; | | return z; | |
| } | | } | |
| | | | |
| /* Take full advantage of FMA. Only 8 DP operations */ | | /* Take full advantage of FMA. Only 8 DP operations */ | |
|
| __device_func__(double2 __internal_ddmul (double2 x, double2 y)) | | static __forceinline__ double2 __internal_ddmul (double2 x, double2 y) | |
| { | | { | |
|
| #if defined(__GNUC__) && !defined(__CUDABE__) | | | |
| volatile | | | |
| #endif | | | |
| double e; | | double e; | |
| double2 t, z; | | double2 t, z; | |
| t.y = x.y * y.y; | | t.y = x.y * y.y; | |
| t.x = __fma_rn (x.y, y.y, -t.y); | | t.x = __fma_rn (x.y, y.y, -t.y); | |
| t.x = __fma_rn (x.x, y.x, t.x); | | t.x = __fma_rn (x.x, y.x, t.x); | |
| t.x = __fma_rn (x.y, y.x, t.x); | | t.x = __fma_rn (x.y, y.x, t.x); | |
| t.x = __fma_rn (x.x, y.y, t.x); | | t.x = __fma_rn (x.x, y.y, t.x); | |
| z.y = e = t.y + t.x; | | z.y = e = t.y + t.x; | |
| z.x = (t.y - e) + t.x; | | z.x = (t.y - e) + t.x; | |
| return z; | | return z; | |
| } | | } | |
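
These two helpers implement double-double arithmetic: a value is carried as an unevaluated sum head + tail (fields .y and .x), worth roughly 106 significand bits, and as the comments note each operation costs only 8 double-precision instructions. The multiply relies on the FMA to recover the head product's rounding error exactly: __fma_rn(x.y, y.y, -t.y) is x.y*y.y - t.y with no intermediate rounding. The same extraction works on any host with a correctly rounded fma; a sketch, not header code:

#include <math.h>
#include <stdio.h>

int main(void)
{
    /* Split a product into head + exact tail: x*y == head + tail. */
    double x = 1.0 + 1e-8, y = 1.0 - 1e-8;
    double head = x * y;
    double tail = fma(x, y, -head);
    printf("head = %.17g, tail = %.17g\n", head, tail);
    return 0;
}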
| | | | |
|
| __device_func__(double2 __internal_log_ext_prec(double a)) | | static __forceinline__ double2 __internal_log_ext_prec(double a) | |
| { | | { | |
| double2 res; | | double2 res; | |
| double2 qq, cc, uu, tt; | | double2 qq, cc, uu, tt; | |
| double f, g, u, v, q, ulo, tmp, m; | | double f, g, u, v, q, ulo, tmp, m; | |
| int ilo, ihi, expo; | | int ilo, ihi, expo; | |
| | | | |
| ihi = __double2hiint(a); | | ihi = __double2hiint(a); | |
| ilo = __double2loint(a); | | ilo = __double2loint(a); | |
| expo = (ihi >> 20) & 0x7ff; | | expo = (ihi >> 20) & 0x7ff; | |
| /* convert denormals to normals for computation of log(a) */ | | /* convert denormals to normals for computation of log(a) */ | |
| | | | |
| skipping to change at line 726 | | skipping to change at line 629 | |
| m = __internal_half(m); | | m = __internal_half(m); | |
| expo = expo + 1; | | expo = expo + 1; | |
| } | | } | |
| /* compute log(m) with extended precision using an algorithm derived from | | /* compute log(m) with extended precision using an algorithm derived from | |
| * P.T.P. Tang, "Table Driven Implementation of the Logarithm Function", | | * P.T.P. Tang, "Table Driven Implementation of the Logarithm Function", | |
|    * TOMS, Vol. 16., No. 4, December 1990, pp. 378-400. A modified polynomial | |    * TOMS, Vol. 16., No. 4, December 1990, pp. 378-400. A modified polynomial | |
|    * approximation to atanh(x) on the interval [-0.1716, 0.1716] is utilized. | |    * approximation to atanh(x) on the interval [-0.1716, 0.1716] is utilized. | |
| */ | | */ | |
| f = m - 1.0; | | f = m - 1.0; | |
| g = m + 1.0; | | g = m + 1.0; | |
|
| g = 1.0 / g; | | g = __internal_fast_rcp(g); | |
| u = f * g; | | u = f * g; | |
| u = u + u; | | u = u + u; | |
| /* u = 2.0 * (m - 1.0) / (m + 1.0) */ | | /* u = 2.0 * (m - 1.0) / (m + 1.0) */ | |
| v = u * u; | | v = u * u; | |
| q = 6.6253631649203309E-2/65536.0; | | q = 6.6253631649203309E-2/65536.0; | |
| q = __fma_rn (q, v, 6.6250935587260612E-2/16384.0); | | q = __fma_rn (q, v, 6.6250935587260612E-2/16384.0); | |
| q = __fma_rn (q, v, 7.6935437806732829E-2/4096.0); | | q = __fma_rn (q, v, 7.6935437806732829E-2/4096.0); | |
| q = __fma_rn (q, v, 9.0908878711093280E-2/1024.0); | | q = __fma_rn (q, v, 9.0908878711093280E-2/1024.0); | |
| q = __fma_rn (q, v, 1.1111111322892790E-1/256.0); | | q = __fma_rn (q, v, 1.1111111322892790E-1/256.0); | |
| q = __fma_rn (q, v, 1.4285714284546502E-1/64.0); | | q = __fma_rn (q, v, 1.4285714284546502E-1/64.0); | |
| | | | |
| skipping to change at line 766 | | skipping to change at line 669 | |
| u = uu.y; | | u = uu.y; | |
| ulo = uu.x; | | ulo = uu.x; | |
| /* log(2)*expo in double-double format */ | | /* log(2)*expo in double-double format */ | |
| tt.y = expo * 6.9314718055966296e-001; /* multiplication is exact */ | | tt.y = expo * 6.9314718055966296e-001; /* multiplication is exact */ | |
| tt.x = expo * 2.8235290563031577e-013; | | tt.x = expo * 2.8235290563031577e-013; | |
|   /* log(a) = log(m) + log(2)*expo; if expo != 0, |log(2)*expo| > |log(m)| */ | |   /* log(a) = log(m) + log(2)*expo; if expo != 0, |log(2)*expo| > |log(m)| */ | |
| res = __internal_ddadd_xgty (tt, uu); | | res = __internal_ddadd_xgty (tt, uu); | |
| return res; | | return res; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_log2(double a)) | | static __forceinline__ double log2(double a) | |
| { | | { | |
| double t; | | double t; | |
|
| t = __cuda_log(a); | | t = log(a); | |
| return __fma_rn (t, CUDART_L2E_HI, t * CUDART_L2E_LO); | | return __fma_rn (t, CUDART_L2E_HI, t * CUDART_L2E_LO); | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_log10(double a)) | | static __forceinline__ double log10(double a) | |
| { | | { | |
| double t; | | double t; | |
|
| t = __cuda_log(a); | | t = log(a); | |
| return __fma_rn (t, CUDART_LGE_HI, t * CUDART_LGE_LO); | | return __fma_rn (t, CUDART_LGE_HI, t * CUDART_LGE_LO); | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_log1p(double a)) | | static __forceinline__ double log1p(double a) | |
| { | | { | |
| double t; | | double t; | |
| int i; | | int i; | |
| | | | |
| i = __double2hiint(a); | | i = __double2hiint(a); | |
| if (((unsigned)i < (unsigned)0x3fe55555) || ((int)i < (int)0xbfd99999)) { | | if (((unsigned)i < (unsigned)0x3fe55555) || ((int)i < (int)0xbfd99999)) { | |
| /* Compute log2(a+1) = 2*atanh(a/(a+2)) */ | | /* Compute log2(a+1) = 2*atanh(a/(a+2)) */ | |
| t = a + 2.0; | | t = a + 2.0; | |
| t = a / t; | | t = a / t; | |
| t = -a * t; | | t = -a * t; | |
| t = __internal_atanh_kernel(a, t); | | t = __internal_atanh_kernel(a, t); | |
| return t; | | return t; | |
| } | | } | |
|
| return __cuda_log (a + CUDART_ONE); | | return log (a + CUDART_ONE); | |
| } | | } | |
| | | | |
|
| __device_func__(double __internal_exp_kernel(double a, int scale)) | | static __forceinline__ double __internal_exp_kernel(double a, int scale) | |
| { | | { | |
| double t, fac, z; | | double t, fac, z; | |
|
| int i; | | int i, k; | |
| /* exp(a) = 2^(rint(a/log(2)) + z) = 2^(i + z) */ | | /* exp(a) = 2^(rint(a/log(2)) + z) = 2^(i + z) */ | |
|
| t = __cuda_rint (a * CUDART_L2E); | | t = rint (a * CUDART_L2E); | |
| i = (int)t; | | i = (int)t; | |
| z = __fma_rn (t, -CUDART_LN2_HI, a); | | z = __fma_rn (t, -CUDART_LN2_HI, a); | |
| z = __fma_rn (t, -CUDART_LN2_LO, z); | | z = __fma_rn (t, -CUDART_LN2_LO, z); | |
|
| fac = 2.0; | | k = 0x40000000; | |
| if (i <= -1021) { | | if (i <= -1021) { | |
| i += 55; | | i += 55; | |
|
| fac = CUDART_TWO_TO_M54; | | k -= 55 << 20; | |
| } | | } | |
|
| | | fac = __hiloint2double(k, 0); /* 2^-54 if a is denormal, 2.0 otherwise */ | |
| /* exp(a) = 2^i * e^z */ | | /* exp(a) = 2^i * e^z */ | |
| t = __internal_expm1_kernel(z); | | t = __internal_expm1_kernel(z); | |
|
| z = __internal_exp2i_kernel(i + scale - 1); | | z = __hiloint2double(((i + scale) << 20) + ((-1 + 1023) << 20), 0); | |
| t = __fma_rn (t, z, z); | | t = __fma_rn (t, z, z); | |
| t = t * fac; | | t = t * fac; | |
| return t; | | return t; | |
| } | | } | |
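
The kernel follows the two-step scheme restated from its comments:

\[ e^{a} \;=\; 2^{\,i}\, e^{z}, \qquad i = \operatorname{rint}(a \log_2 e), \qquad z = a - i \ln 2, \]

where ln(2) is subtracted in hi/lo parts so z is accurate well beyond working precision and |z| <= (ln 2)/2 keeps the expm1 kernel converging fast. The i <= -1021 branch shifts 55 units of scaling into the final multiply (2^-54 in place of 2.0) so the intermediate power of two stays normal instead of flushing to zero.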
| | | | |
|
| __device_func__(double __cuda_exp(double a)) | | static __forceinline__ double exp(double a) | |
| { | | { | |
| double t; | | double t; | |
| int i; | | int i; | |
| i = __double2hiint(a); | | i = __double2hiint(a); | |
| if (((unsigned)i < (unsigned)0x40862e43) || ((int)i < (int)0xC0874911)) { | | if (((unsigned)i < (unsigned)0x40862e43) || ((int)i < (int)0xC0874911)) { | |
| t = __internal_exp_kernel(a, 0); | | t = __internal_exp_kernel(a, 0); | |
| return t; | | return t; | |
| } | | } | |
|
| t = ((unsigned int)i >> 31) ? CUDART_ZERO : CUDART_INF; | | t = (i < 0) ? CUDART_ZERO : CUDART_INF; | |
| if (__cuda___isnan(a)) { | | if (__isnan(a)) { | |
| t = a + a; | | t = a + a; | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_exp2(double a)) | | static __forceinline__ double exp2(double a) | |
| { | | { | |
| double z; | | double z; | |
| double t; | | double t; | |
| double fac; | | double fac; | |
| int i; | | int i; | |
| | | | |
| i = __double2hiint(a); | | i = __double2hiint(a); | |
| if (((unsigned)i < (unsigned)0x40900000) || ((int)i < (int)0xc090cc00)) { | | if (((unsigned)i < (unsigned)0x40900000) || ((int)i < (int)0xc090cc00)) { | |
|
| t = __cuda_rint (a); | | t = rint (a); | |
| z = a - t; | | z = a - t; | |
| i = (int)t; | | i = (int)t; | |
| fac = 2.0; | | fac = 2.0; | |
| if (i <= -1021) { | | if (i <= -1021) { | |
| i += 55; | | i += 55; | |
| fac = CUDART_TWO_TO_M54; | | fac = CUDART_TWO_TO_M54; | |
| } | | } | |
| /* 2^z = exp(log(2)*z) */ | | /* 2^z = exp(log(2)*z) */ | |
| z = __fma_rn (z, CUDART_LN2_HI, z * CUDART_LN2_LO); | | z = __fma_rn (z, CUDART_LN2_HI, z * CUDART_LN2_LO); | |
| t = __internal_expm1_kernel(z); | | t = __internal_expm1_kernel(z); | |
| z = __internal_exp2i_kernel(i - 1); | | z = __internal_exp2i_kernel(i - 1); | |
| t = __fma_rn (t, z, z); | | t = __fma_rn (t, z, z); | |
| t = t * fac; | | t = t * fac; | |
| return t; | | return t; | |
| } | | } | |
|
| t = ((unsigned int)i >> 31) ? CUDART_ZERO : CUDART_INF; | | t = (i < 0) ? CUDART_ZERO : CUDART_INF; | |
| if (__cuda___isnan(a)) { | | if (__isnan(a)) { | |
| t = a + a; | | t = a + a; | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_exp10(double a)) | | static __forceinline__ double exp10(double a) | |
| { | | { | |
| double z; | | double z; | |
| double t; | | double t; | |
| double fac; | | double fac; | |
| int i; | | int i; | |
| | | | |
| i = __double2hiint(a); | | i = __double2hiint(a); | |
| if (((unsigned)i < (unsigned)0x40734414) || ((int)i < (int)0xc07439b8)) { | | if (((unsigned)i < (unsigned)0x40734414) || ((int)i < (int)0xc07439b8)) { | |
|
| t = __cuda_rint (a * CUDART_L2T); | | t = rint (a * CUDART_L2T); | |
| i = (int)t; | | i = (int)t; | |
| z = __fma_rn (t, -CUDART_LG2_HI, a); | | z = __fma_rn (t, -CUDART_LG2_HI, a); | |
| z = __fma_rn (t, -CUDART_LG2_LO, z); | | z = __fma_rn (t, -CUDART_LG2_LO, z); | |
| fac = 2.0; | | fac = 2.0; | |
| if (i <= -1021) { | | if (i <= -1021) { | |
| i += 55; | | i += 55; | |
| fac = CUDART_TWO_TO_M54; | | fac = CUDART_TWO_TO_M54; | |
| } | | } | |
| /* 2^z = exp(log(10)*z) */ | | /* 2^z = exp(log(10)*z) */ | |
| z = __fma_rn (z, CUDART_LNT_HI, z * CUDART_LNT_LO); | | z = __fma_rn (z, CUDART_LNT_HI, z * CUDART_LNT_LO); | |
| t = __internal_expm1_kernel(z); | | t = __internal_expm1_kernel(z); | |
| z = __internal_exp2i_kernel(i - 1); | | z = __internal_exp2i_kernel(i - 1); | |
| t = __fma_rn (t, z, z); | | t = __fma_rn (t, z, z); | |
| t = t * fac; | | t = t * fac; | |
| return t; | | return t; | |
| } | | } | |
|
| t = ((unsigned int)i >> 31) ? CUDART_ZERO : CUDART_INF; | | t = (i < 0) ? CUDART_ZERO : CUDART_INF; | |
| if (__cuda___isnan(a)) { | | if (__isnan(a)) { | |
| t = a + a; | | t = a + a; | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_expm1(double a)) | | static __forceinline__ double expm1(double a) | |
| { | | { | |
| double t, z, u; | | double t, z, u; | |
| int i, j, k; | | int i, j, k; | |
| | | | |
| k = __double2hiint(a); | | k = __double2hiint(a); | |
| if (((unsigned)k < (unsigned)0x40862e43) || ((int)k < (int)0xc04a8000)) { | | if (((unsigned)k < (unsigned)0x40862e43) || ((int)k < (int)0xc04a8000)) { | |
|
| t = __cuda_rint (a * CUDART_L2E); | | t = rint (a * CUDART_L2E); | |
| i = (int)t; | | i = (int)t; | |
| z = __fma_rn (t, -CUDART_LN2_HI, a); | | z = __fma_rn (t, -CUDART_LN2_HI, a); | |
| z = __fma_rn (t, -CUDART_LN2_LO, z); | | z = __fma_rn (t, -CUDART_LN2_LO, z); | |
| k = k + k; | | k = k + k; | |
| if ((unsigned)k < (unsigned)0x7fb3e647) { | | if ((unsigned)k < (unsigned)0x7fb3e647) { | |
| z = a; | | z = a; | |
| i = 0; | | i = 0; | |
| } | | } | |
| t = __internal_expm1_kernel(z); | | t = __internal_expm1_kernel(z); | |
| j = i; | | j = i; | |
| if (i == 1024) j--; | | if (i == 1024) j--; | |
| u = __internal_exp2i_kernel(j); | | u = __internal_exp2i_kernel(j); | |
| a = u - 1.0; | | a = u - 1.0; | |
| t = __fma_rn (t, u, a); | | t = __fma_rn (t, u, a); | |
| if (i == 1024) t = t + t; | | if (i == 1024) t = t + t; | |
| if (k == 0) t = z; /* preserve -0 */ | | if (k == 0) t = z; /* preserve -0 */ | |
| return t; | | return t; | |
| } | | } | |
|
| t = ((unsigned int)k >> 31) ? -CUDART_ONE : CUDART_INF; | | t = (k < 0) ? -CUDART_ONE : CUDART_INF; | |
| if (__cuda___isnan(a)) { | | if (__isnan(a)) { | |
| t = a + a; | | t = a + a; | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_cosh(double a)) | | static __forceinline__ double cosh(double a) | |
| { | | { | |
| double z; | | double z; | |
| int i; | | int i; | |
| | | | |
|
| z = __cuda_fabs(a); | | z = fabs(a); | |
| i = __double2hiint(z); | | i = __double2hiint(z); | |
| if ((unsigned)i < (unsigned)0x408633cf) { | | if ((unsigned)i < (unsigned)0x408633cf) { | |
| z = __internal_exp_kernel(z, -2); | | z = __internal_exp_kernel(z, -2); | |
| z = __fma_rn(2.0, z, 0.125 / z); | | z = __fma_rn(2.0, z, 0.125 / z); | |
| return z; | | return z; | |
| } else { | | } else { | |
| if (z > 0.0) a = CUDART_INF_F; | | if (z > 0.0) a = CUDART_INF_F; | |
| return a + a; | | return a + a; | |
| } | | } | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_sinh(double a)) | | static __forceinline__ double sinh(double a) | |
| { | | { | |
| double s, z; | | double s, z; | |
| s = a; | | s = a; | |
|
| a = __cuda_fabs(a); | | a = fabs(a); | |
| if (a < 1.0) { /* danger of catastrophic cancellation */ | | if (a < 1.0) { /* danger of catastrophic cancellation */ | |
| double a2 = a * a; | | double a2 = a * a; | |
| /* approximate sinh(x) on [0,1] with a polynomial */ | | /* approximate sinh(x) on [0,1] with a polynomial */ | |
| z = 1.632386098183803E-010; | | z = 1.632386098183803E-010; | |
| z = __fma_rn (z, a2, 2.504854501385687E-008); | | z = __fma_rn (z, a2, 2.504854501385687E-008); | |
| z = __fma_rn (z, a2, 2.755734274788706E-006); | | z = __fma_rn (z, a2, 2.755734274788706E-006); | |
| z = __fma_rn (z, a2, 1.984126976294102E-004); | | z = __fma_rn (z, a2, 1.984126976294102E-004); | |
| z = __fma_rn (z, a2, 8.333333333452911E-003); | | z = __fma_rn (z, a2, 8.333333333452911E-003); | |
| z = __fma_rn (z, a2, 1.666666666666606E-001); | | z = __fma_rn (z, a2, 1.666666666666606E-001); | |
| z = z * a2; | | z = z * a2; | |
| z = __fma_rn (z, a, a); | | z = __fma_rn (z, a, a); | |
|   } else if (a < 2.0) {   /* work around accuracy issue in vicinity of 1.4 */ | |   } else if (a < 2.0) {   /* work around accuracy issue in vicinity of 1.4 */ | |
|
| z = __cuda_expm1(a); | | z = expm1(a); | |
| z = __internal_half (z + z / (z + 1.0)); | | z = __internal_half (z + z / (z + 1.0)); | |
| } else { | | } else { | |
| z = __internal_exp_kernel(a, -1); | | z = __internal_exp_kernel(a, -1); | |
| z = z + (1.0 / (-4.0 * z)); | | z = z + (1.0 / (-4.0 * z)); | |
| if (a >= CUDART_LN2_X_1025) { | | if (a >= CUDART_LN2_X_1025) { | |
| z = CUDART_INF; /* overflow -> infinity */ | | z = CUDART_INF; /* overflow -> infinity */ | |
| } | | } | |
| } | | } | |
| z = __internal_copysign_pos(z, s); | | z = __internal_copysign_pos(z, s); | |
| return z; | | return z; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_tanh(double a)) | | static __forceinline__ double tanh(double a) | |
| { | | { | |
| double t; | | double t; | |
|
| t = __cuda_fabs(a); | | t = fabs(a); | |
| if (t >= 0.55) { | | if (t >= 0.55) { | |
| double s; | | double s; | |
| s = 1.0 - 2.0 / (__internal_exp_kernel(2.0 * t, 0) + 1.0); | | s = 1.0 - 2.0 / (__internal_exp_kernel(2.0 * t, 0) + 1.0); | |
| if (t > 350.0) { | | if (t > 350.0) { | |
| s = 1.0; /* overflow -> 1.0 */ | | s = 1.0; /* overflow -> 1.0 */ | |
| } | | } | |
| a = __internal_copysign_pos(s, a); | | a = __internal_copysign_pos(s, a); | |
| } else { | | } else { | |
| double a2; | | double a2; | |
| a2 = a * a; | | a2 = a * a; | |
| | | | |
| skipping to change at line 1011 | | skipping to change at line 915 | |
| t = __fma_rn (t, a2, -5.396825387607743E-002); | | t = __fma_rn (t, a2, -5.396825387607743E-002); | |
| t = __fma_rn (t, a2, 1.333333333316870E-001); | | t = __fma_rn (t, a2, 1.333333333316870E-001); | |
| t = __fma_rn (t, a2, -3.333333333333232E-001); | | t = __fma_rn (t, a2, -3.333333333333232E-001); | |
| t = t * a2; | | t = t * a2; | |
| t = __fma_rn (t, a, a); | | t = __fma_rn (t, a, a); | |
| a = __internal_copysign_pos(t, a); | | a = __internal_copysign_pos(t, a); | |
| } | | } | |
| return a; | | return a; | |
| } | | } | |
| | | | |
|
| __device_func__(double __internal_atan_kernel(double a)) | | static __forceinline__ double __internal_atan_kernel(double a) | |
| { | | { | |
| double t, a2; | | double t, a2; | |
| a2 = a * a; | | a2 = a * a; | |
| t = -2.0258553044438358E-005 ; | | t = -2.0258553044438358E-005 ; | |
| t = __fma_rn (t, a2, 2.2302240345758510E-004); | | t = __fma_rn (t, a2, 2.2302240345758510E-004); | |
| t = __fma_rn (t, a2, -1.1640717779930576E-003); | | t = __fma_rn (t, a2, -1.1640717779930576E-003); | |
| t = __fma_rn (t, a2, 3.8559749383629918E-003); | | t = __fma_rn (t, a2, 3.8559749383629918E-003); | |
| t = __fma_rn (t, a2, -9.1845592187165485E-003); | | t = __fma_rn (t, a2, -9.1845592187165485E-003); | |
| t = __fma_rn (t, a2, 1.6978035834597331E-002); | | t = __fma_rn (t, a2, 1.6978035834597331E-002); | |
| t = __fma_rn (t, a2, -2.5826796814495994E-002); | | t = __fma_rn (t, a2, -2.5826796814495994E-002); | |
| | | | |
| skipping to change at line 1039 | | skipping to change at line 943 | |
| t = __fma_rn (t, a2, -9.0909012354005225E-002); | | t = __fma_rn (t, a2, -9.0909012354005225E-002); | |
| t = __fma_rn (t, a2, 1.1111110678749424E-001); | | t = __fma_rn (t, a2, 1.1111110678749424E-001); | |
| t = __fma_rn (t, a2, -1.4285714271334815E-001); | | t = __fma_rn (t, a2, -1.4285714271334815E-001); | |
| t = __fma_rn (t, a2, 1.9999999999755019E-001); | | t = __fma_rn (t, a2, 1.9999999999755019E-001); | |
| t = __fma_rn (t, a2, -3.3333333333331860E-001); | | t = __fma_rn (t, a2, -3.3333333333331860E-001); | |
| t = t * a2; | | t = t * a2; | |
| t = __fma_rn (t, a, a); | | t = __fma_rn (t, a, a); | |
| return t; | | return t; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_atan2(double a, double b)) | | static __forceinline__ double atan2(double a, double b) | |
| { | | { | |
| double t0, t1, t3; | | double t0, t1, t3; | |
|
| if (__cuda___isnan(a) || __cuda___isnan(b)) { | | if (__isnan(a) || __isnan(b)) { | |
| return a + b; | | return a + b; | |
| } | | } | |
| /* reduce arguments to first octant */ | | /* reduce arguments to first octant */ | |
| /* r = (|x| < |y|) ? (|x| / |y|) : (|y| / |x|) */ | | /* r = (|x| < |y|) ? (|x| / |y|) : (|y| / |x|) */ | |
|
| t3 = __cuda_fabs(b); | | t3 = fabs(b); | |
| t1 = __cuda_fabs(a); | | t1 = fabs(a); | |
| if (t3 == 0.0 && t1 == 0.0) { | | if (t3 == 0.0 && t1 == 0.0) { | |
|
| t3 = __cuda___signbit(b) ? CUDART_PI : 0; | | t3 = (__double2hiint(b) < 0) ? CUDART_PI : 0; | |
| } else if (__cuda___isinf(t3) && __cuda___isinf(t1)) { | | } else if (__isinf(t3) && __isinf(t1)) { | |
| t3 = __cuda___signbit(b) ? CUDART_3PIO4 : CUDART_PIO4; | | t3 = (__double2hiint(b) < 0) ? CUDART_3PIO4 : CUDART_PIO4; | |
| } else { | | } else { | |
|
| t0 = __cuda_fmax (t1, t3); | | t0 = fmax (t1, t3); | |
| t1 = __cuda_fmin (t1, t3); | | t1 = fmin (t1, t3); | |
| t3 = t1 / t0; | | t3 = t1 / t0; | |
| t3 = __internal_atan_kernel(t3); | | t3 = __internal_atan_kernel(t3); | |
| /* Map result according to octant. */ | | /* Map result according to octant. */ | |
|
| if (__cuda_fabs(a) > __cuda_fabs(b)) t3 = CUDART_PIO2 - t3; | | if (fabs(a) > fabs(b)) t3 = CUDART_PIO2 - t3; | |
| if (b < 0.0) t3 = CUDART_PI - t3; | | if (b < 0.0) t3 = CUDART_PI - t3; | |
| } | | } | |
| t3 = __internal_copysign_pos(t3, a); | | t3 = __internal_copysign_pos(t3, a); | |
| return t3; | | return t3; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_atan(double a)) | | static __forceinline__ double atan(double a) | |
| { | | { | |
| double t0, t1; | | double t0, t1; | |
| /* reduce argument to first octant */ | | /* reduce argument to first octant */ | |
|
| t0 = __cuda_fabs(a); | | t0 = fabs(a); | |
| t1 = t0; | | t1 = t0; | |
| if (t0 > 1.0) { | | if (t0 > 1.0) { | |
| t1 = 1.0 / t1; | | t1 = 1.0 / t1; | |
| } | | } | |
| /* approximate atan(r) in first octant */ | | /* approximate atan(r) in first octant */ | |
| t1 = __internal_atan_kernel(t1); | | t1 = __internal_atan_kernel(t1); | |
| /* map result according to octant. */ | | /* map result according to octant. */ | |
| if (t0 > 1.0) { | | if (t0 > 1.0) { | |
| t1 = CUDART_PIO2 - t1; | | t1 = CUDART_PIO2 - t1; | |
| } | | } | |
| return __internal_copysign_pos(t1, a); | | return __internal_copysign_pos(t1, a); | |
| } | | } | |
| | | | |
| /* b should be the square of a */ | | /* b should be the square of a */ | |
|
| __device_func__(double __internal_asin_kernel(double a, double b)) | | static __forceinline__ double __internal_asin_kernel(double a, double b) | |
| { | | { | |
| double r; | | double r; | |
| r = 6.259798167646803E-002; | | r = 6.259798167646803E-002; | |
| r = __fma_rn (r, b, -7.620591484676952E-002); | | r = __fma_rn (r, b, -7.620591484676952E-002); | |
| r = __fma_rn (r, b, 6.686894879337643E-002); | | r = __fma_rn (r, b, 6.686894879337643E-002); | |
| r = __fma_rn (r, b, -1.787828218369301E-002); | | r = __fma_rn (r, b, -1.787828218369301E-002); | |
| r = __fma_rn (r, b, 1.745227928732326E-002); | | r = __fma_rn (r, b, 1.745227928732326E-002); | |
| r = __fma_rn (r, b, 1.000422754245580E-002); | | r = __fma_rn (r, b, 1.000422754245580E-002); | |
| r = __fma_rn (r, b, 1.418108777515123E-002); | | r = __fma_rn (r, b, 1.418108777515123E-002); | |
| r = __fma_rn (r, b, 1.733194598980628E-002); | | r = __fma_rn (r, b, 1.733194598980628E-002); | |
| r = __fma_rn (r, b, 2.237350511593569E-002); | | r = __fma_rn (r, b, 2.237350511593569E-002); | |
| r = __fma_rn (r, b, 3.038188875134962E-002); | | r = __fma_rn (r, b, 3.038188875134962E-002); | |
| r = __fma_rn (r, b, 4.464285849810986E-002); | | r = __fma_rn (r, b, 4.464285849810986E-002); | |
| r = __fma_rn (r, b, 7.499999998342270E-002); | | r = __fma_rn (r, b, 7.499999998342270E-002); | |
| r = __fma_rn (r, b, 1.666666666667375E-001); | | r = __fma_rn (r, b, 1.666666666667375E-001); | |
| r = r * b; | | r = r * b; | |
| return r; | | return r; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_asin(double a)) | | static __forceinline__ double asin(double a) | |
| { | | { | |
| double fa, t0, t1; | | double fa, t0, t1; | |
| int ihi, ahi; | | int ihi, ahi; | |
| ahi = __double2hiint(a); | | ahi = __double2hiint(a); | |
|
| fa = __cuda_fabs(a); | | fa = fabs(a); | |
| ihi = __double2hiint(fa); | | ihi = __double2hiint(fa); | |
| if (ihi < 0x3fe26666) { | | if (ihi < 0x3fe26666) { | |
| t1 = fa * fa; | | t1 = fa * fa; | |
| t1 = __internal_asin_kernel (fa, t1); | | t1 = __internal_asin_kernel (fa, t1); | |
| t1 = __fma_rn (t1, fa, fa); | | t1 = __fma_rn (t1, fa, fa); | |
| t1 = __internal_copysign_pos(t1, a); | | t1 = __internal_copysign_pos(t1, a); | |
| } else { | | } else { | |
| t1 = __fma_rn (-0.5, fa, 0.5); | | t1 = __fma_rn (-0.5, fa, 0.5); | |
|
| t0 = __cuda_sqrt (t1); | | t0 = sqrt (t1); | |
| t1 = __internal_asin_kernel (t0, t1); | | t1 = __internal_asin_kernel (t0, t1); | |
| t0 = -2.0 * t0; | | t0 = -2.0 * t0; | |
| t1 = __fma_rn (t0, t1, CUDART_PIO2_LO); | | t1 = __fma_rn (t0, t1, CUDART_PIO2_LO); | |
| t0 = t0 + CUDART_PIO4_HI; | | t0 = t0 + CUDART_PIO4_HI; | |
| t1 = t0 + t1; | | t1 = t0 + t1; | |
| t1 = t1 + CUDART_PIO4_HI; | | t1 = t1 + CUDART_PIO4_HI; | |
| if (ahi < 0x3ff00000) { | | if (ahi < 0x3ff00000) { | |
| t1 = __internal_copysign_pos(t1, a); | | t1 = __internal_copysign_pos(t1, a); | |
| } | | } | |
| } | | } | |
| return t1; | | return t1; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_acos(double a)) | | static __forceinline__ double acos(double a) | |
| { | | { | |
| double t0, t1; | | double t0, t1; | |
| int ihi, ahi; | | int ihi, ahi; | |
| | | | |
|
| #if !defined(__CUDABE__) | | | |
| if (__cuda___isnan(a)) { | | | |
| return a + a; | | | |
| } | | | |
| #endif | | | |
| ahi = __double2hiint(a); | | ahi = __double2hiint(a); | |
|
| t0 = __cuda_fabs (a); | | t0 = fabs (a); | |
| ihi = __double2hiint(t0); | | ihi = __double2hiint(t0); | |
| if (ihi < 0x3fe26666) { | | if (ihi < 0x3fe26666) { | |
| t1 = t0 * t0; | | t1 = t0 * t0; | |
| t1 = __internal_asin_kernel (t0, t1); | | t1 = __internal_asin_kernel (t0, t1); | |
| t0 = __fma_rn (t1, t0, t0); | | t0 = __fma_rn (t1, t0, t0); | |
| if ((unsigned)ahi >= (unsigned)0x80000000) { | | if ((unsigned)ahi >= (unsigned)0x80000000) { | |
| t0 = __fma_rn (1.0, t0, +CUDART_PIO2_LO); | | t0 = __fma_rn (1.0, t0, +CUDART_PIO2_LO); | |
| t0 = CUDART_PIO2_HI + t0; | | t0 = CUDART_PIO2_HI + t0; | |
| } else { | | } else { | |
| t0 = __fma_rn (1.0, t0, -CUDART_PIO2_LO); | | t0 = __fma_rn (1.0, t0, -CUDART_PIO2_LO); | |
| t0 = CUDART_PIO2_HI - t0; | | t0 = CUDART_PIO2_HI - t0; | |
| } | | } | |
| } else { | | } else { | |
| t1 = __fma_rn (-0.5, t0, 0.5); | | t1 = __fma_rn (-0.5, t0, 0.5); | |
|
| t0 = __cuda_sqrt(t1); | | t0 = sqrt(t1); | |
| t1 = __internal_asin_kernel (t0, t1); | | t1 = __internal_asin_kernel (t0, t1); | |
| t0 = __fma_rn (t1, t0, t0); | | t0 = __fma_rn (t1, t0, t0); | |
| t0 = 2.0 * t0; | | t0 = 2.0 * t0; | |
| if ((unsigned)ahi >= (unsigned)0x80000000) { | | if ((unsigned)ahi >= (unsigned)0x80000000) { | |
| t0 = __fma_rn (1.0, t0, -CUDART_PI_LO); | | t0 = __fma_rn (1.0, t0, -CUDART_PI_LO); | |
| t0 = CUDART_PI_HI - t0; | | t0 = CUDART_PI_HI - t0; | |
| } | | } | |
| } | | } | |
| return t0; | | return t0; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_acosh(double a)) | | static __forceinline__ double acosh(double a) | |
| { | | { | |
| double t; | | double t; | |
|
| #if !defined(__CUDABE__) | | | |
| if (__cuda___isnan(a)) { | | | |
| return a + a; | | | |
| } | | | |
| #endif | | | |
| t = a - 1.0; | | t = a - 1.0; | |
|
| if (__cuda_fabs(t) > CUDART_TWO_TO_52) { | | if (fabs(t) > CUDART_TWO_TO_52) { | |
| /* for large a, acosh = log(2*a) */ | | /* for large a, acosh = log(2*a) */ | |
|
| return CUDART_LN2 + __cuda_log(a); | | return CUDART_LN2 + log(a); | |
| } else { | | } else { | |
|
| t = t + __cuda_sqrt(__fma_rn(a, t, t)); | | t = t + sqrt(__fma_rn(a, t, t)); | |
| return __cuda_log1p(t); | | return log1p(t); | |
| } | | } | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_asinh(double a)) | | static __forceinline__ double asinh(double a) | |
| { | | { | |
| double fa, t; | | double fa, t; | |
|
| fa = __cuda_fabs(a); | | fa = fabs(a); | |
|   if (__double2hiint(fa) >= 0x5ff00000) { /* prevent intermediate overflow */ | |   if (__double2hiint(fa) >= 0x5ff00000) { /* prevent intermediate overflow */ | |
|
| t = CUDART_LN2 + __cuda_log(fa); | | t = CUDART_LN2 + log(fa); | |
| } else { | | } else { | |
| t = fa * fa; | | t = fa * fa; | |
|
| t = __cuda_log1p (fa + t / (1.0 + __cuda_sqrt(1.0 + t))); | | t = log1p (fa + t / (1.0 + sqrt(1.0 + t))); | |
| } | | } | |
| return __internal_copysign_pos(t, a); | | return __internal_copysign_pos(t, a); | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_atanh(double a)) | | static __forceinline__ double atanh(double a) | |
| { | | { | |
| double fa, t; | | double fa, t; | |
|
| #if !defined(__CUDABE__) | | fa = fabs(a); | |
| if (__cuda___isnan(a)) { | | | |
| return a + a; | | | |
| } | | | |
| #endif | | | |
| fa = __cuda_fabs(a); | | | |
| t = (2.0 * fa) / (1.0 - fa); | | t = (2.0 * fa) / (1.0 - fa); | |
|
| t = 0.5 * __cuda_log1p(t); | | t = 0.5 * log1p(t); | |
| #if !defined(__CUDABE__) | | if (__double2hiint(a) < 0) { | |
| if (__cuda___isnan(t)) { | | | |
| return t; | | | |
| } | | | |
| #endif | | | |
| if (__cuda___signbit(a)) { | | | |
| t = -t; | | t = -t; | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_hypot(double a, double b)) | | static __forceinline__ double hypot(double a, double b) | |
| { | | { | |
| double v, w, t, fa, fb; | | double v, w, t, fa, fb; | |
| | | | |
|
| fa = __cuda_fabs(a); | | fa = fabs(a); | |
| fb = __cuda_fabs(b); | | fb = fabs(b); | |
| v = __cuda_fmax(fa, fb); | | v = fmax(fa, fb); | |
| w = __cuda_fmin(fa, fb); | | w = fmin(fa, fb); | |
| t = w / v; | | t = w / v; | |
| t = __fma_rn (t, t, 1.0); | | t = __fma_rn (t, t, 1.0); | |
|
| t = v * __cuda_sqrt(t); | | t = v * sqrt(t); | |
| if (v == 0.0) { | | if (v == 0.0) { | |
| t = v + w; /* fixup for zero divide */ | | t = v + w; /* fixup for zero divide */ | |
| } | | } | |
| if ((!(fa <= CUDART_INF)) || (!(fb <= CUDART_INF))) { | | if ((!(fa <= CUDART_INF)) || (!(fb <= CUDART_INF))) { | |
| t = a + b; /* fixup for NaNs */ | | t = a + b; /* fixup for NaNs */ | |
| } | | } | |
| if (v == CUDART_INF) { | | if (v == CUDART_INF) { | |
| t = v + w; /* fixup for infinities */ | | t = v + w; /* fixup for infinities */ | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
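
hypot avoids spurious overflow and underflow in a*a + b*b by factoring out the larger magnitude, as the code's algebra shows:

\[ \sqrt{a^2 + b^2} \;=\; v\,\sqrt{1 + (w/v)^2}, \qquad v = \max(|a|,|b|), \quad w = \min(|a|,|b|), \]

so the quantity under the square root lies in [1, 2] and the only overflow left is the unavoidable one in the final product; the three trailing fixups then restore the IEEE special cases for zero, NaN and infinite inputs.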
| | | | |
|
| __device_func__(double __cuda_cbrt(double a)) | | static __forceinline__ double cbrt(double a) | |
| { | | { | |
| float s; | | float s; | |
| double t, r; | | double t, r; | |
| int ilo, ihi, expo, nexpo, denorm; | | int ilo, ihi, expo, nexpo, denorm; | |
|
| if ((a == 0.0) || !(__cuda___finite(a))) { | | if ((a == 0.0) || !(__finite(a))) { | |
| return a + a; | | return a + a; | |
| } | | } | |
|
| t = __cuda_fabs(a); | | t = fabs(a); | |
| ilo = __double2loint(t); | | ilo = __double2loint(t); | |
| ihi = __double2hiint(t); | | ihi = __double2hiint(t); | |
| expo = ((int)((unsigned int)ihi >> 20) & 0x7ff); | | expo = ((int)((unsigned int)ihi >> 20) & 0x7ff); | |
| denorm = 0; | | denorm = 0; | |
| if (expo == 0) { | | if (expo == 0) { | |
| /* denormal */ | | /* denormal */ | |
| t = t * CUDART_TWO_TO_54; | | t = t * CUDART_TWO_TO_54; | |
| denorm = 18; | | denorm = 18; | |
| ilo = __double2loint(t); | | ilo = __double2loint(t); | |
| ihi = __double2hiint(t); | | ihi = __double2hiint(t); | |
| expo = ((int)((unsigned int)ihi >> 20) & 0x7ff); | | expo = ((int)((unsigned int)ihi >> 20) & 0x7ff); | |
| } | | } | |
| /* scale into float range */ | | /* scale into float range */ | |
| nexpo = __float2int_rn(CUDART_THIRD_F * (float)(expo - 1022)); | | nexpo = __float2int_rn(CUDART_THIRD_F * (float)(expo - 1022)); | |
| ihi -= (3 * nexpo) << 20; | | ihi -= (3 * nexpo) << 20; | |
| r = __hiloint2double(ihi, ilo); | | r = __hiloint2double(ihi, ilo); | |
| /* initial approximation */ | | /* initial approximation */ | |
| s = (float)r; | | s = (float)r; | |
|
|   t = __cuda_exp2f(-CUDART_THIRD_F * __log2f(s));          /* approximate invcbrt */ | |   t = exp2f(-CUDART_THIRD_F * __log2f(s));                 /* approximate invcbrt */ | |
|   t = __fma_rn(__fma_rn(t*t,-r*t,1.0), CUDART_THIRD*t, t); /* refine invcbrt */ | |   t = __fma_rn(__fma_rn(t*t,-r*t,1.0), CUDART_THIRD*t, t); /* refine invcbrt */ | |
|   t = r * t * t;                                           /* approximate cbrt */ | |   t = r * t * t;                                           /* approximate cbrt */ | |
|   t = __fma_rn(t - (r / (t * t)), -CUDART_THIRD, t);       /* refine cbrt */ | |   t = __fma_rn(t - (r / (t * t)), -CUDART_THIRD, t);       /* refine cbrt */ | |
| /* scale result back into double range */ | | /* scale result back into double range */ | |
| ilo = __double2loint(t); | | ilo = __double2loint(t); | |
| ihi = __double2hiint(t); | | ihi = __double2hiint(t); | |
| ihi += (nexpo - denorm) << 20; | | ihi += (nexpo - denorm) << 20; | |
| t = __hiloint2double(ihi, ilo); | | t = __hiloint2double(ihi, ilo); | |
|
| if (__cuda___signbit(a)) { | | if (__double2hiint(a) < 0) { | |
| t = -t; | | t = -t; | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
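
Both refinement lines are Newton steps, as their comments say. Restated under exact arithmetic:

\[ t \;\leftarrow\; t + \tfrac{t}{3}\,(1 - r t^3) \quad \text{(inverse cube root, } f(t) = t^{-3} - r\text{)}, \qquad t \;\leftarrow\; t - \tfrac{1}{3}\Bigl(t - \frac{r}{t^2}\Bigr) \quad \text{(cube root, } f(t) = t^3 - r\text{)}, \]

each step roughly doubling the number of correct bits, so the float-precision exp2f/__log2f seed reaches full double accuracy after the two passes.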
| | | | |
|
| __device_func__(double __cuda_rcbrt(double a)) | | static __forceinline__ double rcbrt(double a) | |
| { | | { | |
| float s; | | float s; | |
| double t, r; | | double t, r; | |
| int ilo, ihi, expo, nexpo, denorm; | | int ilo, ihi, expo, nexpo, denorm; | |
|
| if ((a == 0.0) || !(__cuda___finite(a))) { | | if ((a == 0.0) || !(__finite(a))) { | |
| return 1.0 / a; | | return 1.0 / a; | |
| } | | } | |
|
| t = __cuda_fabs(a); | | t = fabs(a); | |
| ilo = __double2loint(t); | | ilo = __double2loint(t); | |
| ihi = __double2hiint(t); | | ihi = __double2hiint(t); | |
| expo = ((int)((unsigned int)ihi >> 20) & 0x7ff); | | expo = ((int)((unsigned int)ihi >> 20) & 0x7ff); | |
| denorm = 0; | | denorm = 0; | |
| if (expo == 0) { | | if (expo == 0) { | |
| /* denormal */ | | /* denormal */ | |
| t = t * CUDART_TWO_TO_54; | | t = t * CUDART_TWO_TO_54; | |
| denorm = 18; | | denorm = 18; | |
| ilo = __double2loint(t); | | ilo = __double2loint(t); | |
| ihi = __double2hiint(t); | | ihi = __double2hiint(t); | |
| expo = ((int)((unsigned int)ihi >> 20) & 0x7ff); | | expo = ((int)((unsigned int)ihi >> 20) & 0x7ff); | |
| } | | } | |
| /* scale into float range */ | | /* scale into float range */ | |
| nexpo = __float2int_rn(CUDART_THIRD_F * (float)(expo - 1022)); | | nexpo = __float2int_rn(CUDART_THIRD_F * (float)(expo - 1022)); | |
| ihi -= (3 * nexpo) << 20; | | ihi -= (3 * nexpo) << 20; | |
| r = __hiloint2double(ihi, ilo); | | r = __hiloint2double(ihi, ilo); | |
| /* initial approximation */ | | /* initial approximation */ | |
| s = (float)r; | | s = (float)r; | |
|
| t = __cuda_exp2f(-CUDART_THIRD_F * __log2f(s)); /* approximate invcbrt */ | | t = exp2f(-CUDART_THIRD_F * __log2f(s)); /* approximate invcbrt */ | |
| t = __fma_rn(__fma_rn(t*t,-r*t,1.0), CUDART_THIRD*t, t);/* refine invcbrt */ | | t = __fma_rn(__fma_rn(t*t,-r*t,1.0), CUDART_THIRD*t, t);/* refine invcbrt */ | |
| t = __fma_rn(__fma_rn(t*t,-r*t,1.0), CUDART_THIRD*t, t);/* refine invcbrt */ | | t = __fma_rn(__fma_rn(t*t,-r*t,1.0), CUDART_THIRD*t, t);/* refine invcbrt */ | |
| /* scale result back into double range */ | | /* scale result back into double range */ | |
| ilo = __double2loint(t); | | ilo = __double2loint(t); | |
| ihi = __double2hiint(t); | | ihi = __double2hiint(t); | |
| ihi += (-(nexpo - denorm)) << 20; | | ihi += (-(nexpo - denorm)) << 20; | |
| t = __hiloint2double(ihi, ilo); | | t = __hiloint2double(ihi, ilo); | |
|
| if (__cuda___signbit(a)) { | | if (__double2hiint(a) < 0) { | |
| t = -t; | | t = -t; | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| | | | |
|
| __device_func__(double __internal_accurate_pow(double a, double b)) | | static __forceinline__ double __internal_accurate_pow(double a, double b) | |
| { | | { | |
| double2 loga; | | double2 loga; | |
| double2 prod; | | double2 prod; | |
| double t_hi, t_lo; | | double t_hi, t_lo; | |
| double tmp; | | double tmp; | |
|
| #if !defined(__CUDABE__) && defined(__linux__) && !defined(__LP64__) | | | |
| volatile | | | |
| #endif | | | |
| double e; | | double e; | |
| | | | |
| /* compute log(a) in double-double format*/ | | /* compute log(a) in double-double format*/ | |
| loga = __internal_log_ext_prec(a); | | loga = __internal_log_ext_prec(a); | |
| | | | |
| /* prevent overflow during extended precision multiply */ | | /* prevent overflow during extended precision multiply */ | |
|
| if (__cuda_fabs(b) > 1e304) b *= 1.220703125e-4; | | if (fabs(b) > 1e304) b *= 1.220703125e-4; | |
| /* compute b * log(a) in double-double format */ | | /* compute b * log(a) in double-double format */ | |
| t_hi = loga.y * b; | | t_hi = loga.y * b; | |
| t_lo = __fma_rn (loga.y, b, -t_hi); | | t_lo = __fma_rn (loga.y, b, -t_hi); | |
| t_lo = __fma_rn (loga.x, b, t_lo); | | t_lo = __fma_rn (loga.x, b, t_lo); | |
| prod.y = e = t_hi + t_lo; | | prod.y = e = t_hi + t_lo; | |
| prod.x = (t_hi - e) + t_lo; | | prod.x = (t_hi - e) + t_lo; | |
| | | | |
| /* compute pow(a,b) = exp(b*log(a)) */ | | /* compute pow(a,b) = exp(b*log(a)) */ | |
|
| tmp = __cuda_exp(prod.y); | | tmp = exp(prod.y); | |
| /* prevent -INF + INF = NaN */ | | /* prevent -INF + INF = NaN */ | |
|
| if (!__cuda___isinf(tmp)) { | | if (!__isinf(tmp)) { | |
| /* if prod.x is much smaller than prod.y, then exp(prod.y + prod.x) ~= | | /* if prod.x is much smaller than prod.y, then exp(prod.y + prod.x) ~= | |
| * exp(prod.y) + prod.x * exp(prod.y) | | * exp(prod.y) + prod.x * exp(prod.y) | |
| */ | | */ | |
| tmp = __fma_rn (tmp, prod.x, tmp); | | tmp = __fma_rn (tmp, prod.x, tmp); | |
| } | | } | |
| return tmp; | | return tmp; | |
| } | | } | |
| | | | |
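
The two __fma_rn lines above are the standard trick for splitting a product into a rounded head and an exact error term: t_lo = __fma_rn(loga.y, b, -t_hi) recovers exactly the rounding error of t_hi = loga.y * b, so the pair (t_hi, t_lo) carries b*log(a) to roughly double-double precision. A small self-contained C illustration of the splitting (the input values are arbitrary examples):

    #include <math.h>
    #include <stdio.h>

    /* Sketch: exact product splitting with fma, the pattern used above
       to form b*log(a) in head/tail (double-double) arithmetic. */
    int main(void)
    {
        double y = 0.6931471805599453;  /* example: head of log(a) */
        double b = 3.141592653589793;   /* example exponent */
        double hi = y * b;              /* rounded product */
        double lo = fma(y, b, -hi);     /* exact rounding error of hi */
        /* hi + lo equals y*b exactly; lo carries the bits hi lost */
        printf("hi=%.17g lo=%.17g\n", hi, lo);
        return 0;
    }
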
|
| __device_func__(double __cuda_pow(double a, double b)) | | static __forceinline__ double pow(double a, double b) | |
| { | | { | |
| int bIsOddInteger; | | int bIsOddInteger; | |
| double t; | | double t; | |
| | | | |
| if (a == 1.0 || b == 0.0) { | | if (a == 1.0 || b == 0.0) { | |
| return 1.0; | | return 1.0; | |
| } | | } | |
|
| if (__cuda___isnan(a) || __cuda___isnan(b)) { | | if (__isnan(a) || __isnan(b)) { | |
| return a + b; | | return a + b; | |
| } | | } | |
| if (a == CUDART_INF) { | | if (a == CUDART_INF) { | |
|
| return __cuda___signbit(b) ? CUDART_ZERO : CUDART_INF; | | return (__double2hiint(b) < 0) ? CUDART_ZERO : CUDART_INF; | |
| } | | } | |
|
| if (__cuda___isinf(b)) { | | if (__isinf(b)) { | |
| if (a == -1.0) { | | if (a == -1.0) { | |
| return 1.0; | | return 1.0; | |
| } | | } | |
|
| t = __cuda_fabs(a) > 1.0 ? CUDART_INF : CUDART_ZERO; | | t = fabs(a) > 1.0 ? CUDART_INF : CUDART_ZERO; | |
| if (b < CUDART_ZERO) { | | if (b < CUDART_ZERO) { | |
| t = 1.0 / t; | | t = 1.0 / t; | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
|
| bIsOddInteger = __cuda_fabs(b - (2.0f * __cuda_trunc(0.5 * b))) == 1.0; | | bIsOddInteger = fabs(b - (2.0f * trunc(0.5 * b))) == 1.0; | |
| if (a == CUDART_ZERO) { | | if (a == CUDART_ZERO) { | |
| t = bIsOddInteger ? a : CUDART_ZERO; | | t = bIsOddInteger ? a : CUDART_ZERO; | |
| if (b < CUDART_ZERO) { | | if (b < CUDART_ZERO) { | |
| t = 1.0 / t; | | t = 1.0 / t; | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| if (a == -CUDART_INF) { | | if (a == -CUDART_INF) { | |
| t = (b < CUDART_ZERO) ? -1.0/a : -a; | | t = (b < CUDART_ZERO) ? -1.0/a : -a; | |
| if (bIsOddInteger) { | | if (bIsOddInteger) { | |
| t = __longlong_as_double(__double_as_longlong(t)^0x8000000000000000ULL); | | t = __longlong_as_double(__double_as_longlong(t)^0x8000000000000000ULL); | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
|
| if ((a < CUDART_ZERO) && (b != __cuda_trunc(b))) { | | if ((a < CUDART_ZERO) && (b != trunc(b))) { | |
| return CUDART_NAN; | | return CUDART_NAN; | |
| } | | } | |
|
| t = __cuda_fabs(a); | | t = fabs(a); | |
| t = __internal_accurate_pow(t, b); | | t = __internal_accurate_pow(t, b); | |
| if ((a < CUDART_ZERO) && bIsOddInteger) { | | if ((a < CUDART_ZERO) && bIsOddInteger) { | |
| t = __longlong_as_double(__double_as_longlong(t) ^ 0x8000000000000000ULL); | | t = __longlong_as_double(__double_as_longlong(t) ^ 0x8000000000000000ULL); | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| | | | |
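
The ladder of early returns above encodes the IEEE-754/C99 special cases of pow before the accurate path runs. A hypothetical host-side check of those same cases against libm (my test, not part of the header; it assumes an IEEE-754 platform implementing C99 Annex F):

    #include <assert.h>
    #include <math.h>

    /* Sketch: the special cases handled by the ladder above. */
    int main(void)
    {
        assert(pow(1.0, NAN) == 1.0);        /* a == 1.0 always yields 1.0 */
        assert(pow(NAN, 0.0) == 1.0);        /* b == 0.0 always yields 1.0 */
        assert(pow(-1.0, INFINITY) == 1.0);  /* |a| == 1 with infinite b */
        assert(pow(0.0, -3.0) == INFINITY);  /* 1/t with t a signed zero */
        assert(isnan(pow(-2.0, 0.5)));       /* negative base, non-integer b */
        return 0;
    }
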
|
| __device_func__(double __cuda_erf(double a)) | | static __forceinline__ double erf(double a) | |
| { | | { | |
| double t, r, q; | | double t, r, q; | |
| | | | |
|
| t = __cuda_fabs(a); | | t = fabs(a); | |
| if (t >= 1.0) { | | if (t >= 1.0) { | |
| r = -1.28836351230756500E-019; | | r = -1.28836351230756500E-019; | |
| r = __fma_rn (r, t, 1.30597472161093370E-017); | | r = __fma_rn (r, t, 1.30597472161093370E-017); | |
| r = __fma_rn (r, t, -6.33924401259620500E-016); | | r = __fma_rn (r, t, -6.33924401259620500E-016); | |
| r = __fma_rn (r, t, 1.96231865908940140E-014); | | r = __fma_rn (r, t, 1.96231865908940140E-014); | |
| r = __fma_rn (r, t, -4.35272243559990750E-013); | | r = __fma_rn (r, t, -4.35272243559990750E-013); | |
| r = __fma_rn (r, t, 7.37083927929352150E-012); | | r = __fma_rn (r, t, 7.37083927929352150E-012); | |
| r = __fma_rn (r, t, -9.91402142550461630E-011); | | r = __fma_rn (r, t, -9.91402142550461630E-011); | |
| r = __fma_rn (r, t, 1.08817017167760820E-009); | | r = __fma_rn (r, t, 1.08817017167760820E-009); | |
| r = __fma_rn (r, t, -9.93918713097634620E-009); | | r = __fma_rn (r, t, -9.93918713097634620E-009); | |
| | | | |
| skipping to change at line 1474 | | skipping to change at line 1355 | |
| r = __fma_rn (r, q, 5.22397760611847340E-003); | | r = __fma_rn (r, q, 5.22397760611847340E-003); | |
| r = __fma_rn (r, q, -2.68661706431114690E-002); | | r = __fma_rn (r, q, -2.68661706431114690E-002); | |
| r = __fma_rn (r, q, 1.12837916709441850E-001); | | r = __fma_rn (r, q, 1.12837916709441850E-001); | |
| r = __fma_rn (r, q, -3.76126389031835210E-001); | | r = __fma_rn (r, q, -3.76126389031835210E-001); | |
| r = __fma_rn (r, q, 1.12837916709551260E+000); | | r = __fma_rn (r, q, 1.12837916709551260E+000); | |
| a = r * a; | | a = r * a; | |
| } | | } | |
| return a; | | return a; | |
| } | | } | |
| | | | |
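
The long __fma_rn chains in erf above (and in the functions that follow) are Horner evaluations of minimax polynomials: one fused multiply-add per coefficient, so each step incurs a single rounding. A generic sketch of the pattern, with a hypothetical helper name:

    #include <math.h>

    /* Sketch: Horner polynomial evaluation with fused multiply-add,
       the pattern behind the __fma_rn chains above. coeffs[0] is the
       coefficient of the highest power. */
    static double horner_fma(const double *coeffs, int n, double x)
    {
        double r = coeffs[0];
        for (int i = 1; i < n; i++) {
            r = fma(r, x, coeffs[i]);  /* r = r*x + coeffs[i], one rounding */
        }
        return r;
    }
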
|
| __device_func__(double __cuda_erfinv(double a)) | | static __forceinline__ double erfinv(double a) | |
| { | | { | |
| double fa, t; | | double fa, t; | |
| | | | |
|
| fa = __cuda_fabs(a); | | fa = fabs(a); | |
| if (fa >= 1.0) { | | if (fa >= 1.0) { | |
| t = CUDART_NAN; /* NaN */ | | t = CUDART_NAN; /* NaN */ | |
| if (fa == 1.0) { | | if (fa == 1.0) { | |
| t = a * CUDART_INF; /* Infinity */ | | t = a * CUDART_INF; /* Infinity */ | |
| } | | } | |
| } else if (fa >= 0.9375) { | | } else if (fa >= 0.9375) { | |
| /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev | | /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev | |
| Approximations for the Inverse of the Error Function. Mathematics of | | Approximations for the Inverse of the Error Function. Mathematics of | |
| Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 59 | | Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 59 | |
| */ | | */ | |
| double p, q; | | double p, q; | |
| | | | |
|
| t = __cuda_log1p(-fa); | | t = log1p(-fa); | |
| t = __cuda_rsqrt(-t); | | t = rsqrt(-t); | |
| p = 2.7834010353747001060e-3; | | p = 2.7834010353747001060e-3; | |
| p = __fma_rn (p, t, 8.6030097526280260580e-1); | | p = __fma_rn (p, t, 8.6030097526280260580e-1); | |
| p = __fma_rn (p, t, 2.1371214997265515515e+0); | | p = __fma_rn (p, t, 2.1371214997265515515e+0); | |
| p = __fma_rn (p, t, 3.1598519601132090206e+0); | | p = __fma_rn (p, t, 3.1598519601132090206e+0); | |
| p = __fma_rn (p, t, 3.5780402569085996758e+0); | | p = __fma_rn (p, t, 3.5780402569085996758e+0); | |
| p = __fma_rn (p, t, 1.5335297523989890804e+0); | | p = __fma_rn (p, t, 1.5335297523989890804e+0); | |
| p = __fma_rn (p, t, 3.4839207139657522572e-1); | | p = __fma_rn (p, t, 3.4839207139657522572e-1); | |
| p = __fma_rn (p, t, 5.3644861147153648366e-2); | | p = __fma_rn (p, t, 5.3644861147153648366e-2); | |
| p = __fma_rn (p, t, 4.3836709877126095665e-3); | | p = __fma_rn (p, t, 4.3836709877126095665e-3); | |
| p = __fma_rn (p, t, 1.3858518113496718808e-4); | | p = __fma_rn (p, t, 1.3858518113496718808e-4); | |
| | | | |
| skipping to change at line 1571 | | skipping to change at line 1452 | |
| q = __fma_rn (q, t, .59039348134843665626e+4); | | q = __fma_rn (q, t, .59039348134843665626e+4); | |
| q = __fma_rn (q, t, -.48481635430048872102e+4); | | q = __fma_rn (q, t, -.48481635430048872102e+4); | |
| q = __fma_rn (q, t, .18997769186453057810e+4); | | q = __fma_rn (q, t, .18997769186453057810e+4); | |
| q = __fma_rn (q, t, -.28386514725366621129e+3); | | q = __fma_rn (q, t, -.28386514725366621129e+3); | |
| p = p / q; | | p = p / q; | |
| t = a * p; | | t = a * p; | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| | | | |
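
In the tail branch above, the approximation variable is derived from the logarithmic distance to the singularity at |a| = 1; per the cited Blair, Edwards and Johnson scheme, the computation has (sketching in LaTeX) the shape

    t = \frac{1}{\sqrt{-\ln(1-\lvert a\rvert)}}, \qquad
    \operatorname{erfinv}(a) \approx a\,\frac{p(t)}{q(t)},

where p and q are the fixed-degree polynomials evaluated by the fused multiply-add chains.
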
|
| __device_func__(double __cuda_erfcinv(double a)) | | static __forceinline__ double erfcinv(double a) | |
| { | | { | |
| double t; | | double t; | |
|
| #if !defined(__CUDABE__) | | | |
| if (__cuda___isnan(a)) return a + a; | | | |
| #endif | | | |
| if (a <= CUDART_ZERO) { | | if (a <= CUDART_ZERO) { | |
| t = CUDART_NAN; | | t = CUDART_NAN; | |
| if (a == CUDART_ZERO) { | | if (a == CUDART_ZERO) { | |
| t = (1.0 - a) * CUDART_INF; | | t = (1.0 - a) * CUDART_INF; | |
| } | | } | |
| } | | } | |
| else if (a >= 0.0625) { | | else if (a >= 0.0625) { | |
|
| t = __cuda_erfinv (1.0 - a); | | t = erfinv (1.0 - a); | |
| } | | } | |
| else if (a >= 1e-100) { | | else if (a >= 1e-100) { | |
| /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev | | /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev | |
| Approximations for the Inverse of the Error Function. Mathematics of | | Approximations for the Inverse of the Error Function. Mathematics of | |
| Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 59 | | Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 59 | |
| */ | | */ | |
| double p, q; | | double p, q; | |
|
| t = __cuda_log(a); | | t = log(a); | |
| t = __cuda_rsqrt(-t); | | t = rsqrt(-t); | |
| p = 2.7834010353747001060e-3; | | p = 2.7834010353747001060e-3; | |
| p = __fma_rn (p, t, 8.6030097526280260580e-1); | | p = __fma_rn (p, t, 8.6030097526280260580e-1); | |
| p = __fma_rn (p, t, 2.1371214997265515515e+0); | | p = __fma_rn (p, t, 2.1371214997265515515e+0); | |
| p = __fma_rn (p, t, 3.1598519601132090206e+0); | | p = __fma_rn (p, t, 3.1598519601132090206e+0); | |
| p = __fma_rn (p, t, 3.5780402569085996758e+0); | | p = __fma_rn (p, t, 3.5780402569085996758e+0); | |
| p = __fma_rn (p, t, 1.5335297523989890804e+0); | | p = __fma_rn (p, t, 1.5335297523989890804e+0); | |
| p = __fma_rn (p, t, 3.4839207139657522572e-1); | | p = __fma_rn (p, t, 3.4839207139657522572e-1); | |
| p = __fma_rn (p, t, 5.3644861147153648366e-2); | | p = __fma_rn (p, t, 5.3644861147153648366e-2); | |
| p = __fma_rn (p, t, 4.3836709877126095665e-3); | | p = __fma_rn (p, t, 4.3836709877126095665e-3); | |
| p = __fma_rn (p, t, 1.3858518113496718808e-4); | | p = __fma_rn (p, t, 1.3858518113496718808e-4); | |
| | | | |
| skipping to change at line 1623 | | skipping to change at line 1501 | |
| q = __fma_rn (q, t, 1.3858762165532246059e-4); | | q = __fma_rn (q, t, 1.3858762165532246059e-4); | |
| q = __fma_rn (q, t, 1.1738313872397777529e-6); | | q = __fma_rn (q, t, 1.1738313872397777529e-6); | |
| t = p / (q * t); | | t = p / (q * t); | |
| } | | } | |
| else { | | else { | |
| /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev | | /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev | |
| Approximations for the Inverse of the Error Function. Mathematics of | | Approximations for the Inverse of the Error Function. Mathematics of | |
| Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 82 | | Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 82 | |
| */ | | */ | |
| double p, q; | | double p, q; | |
|
| t = __cuda_log(a); | | t = log(a); | |
| t = __cuda_rsqrt(-t); | | t = rsqrt(-t); | |
| p = 6.9952990607058154858e-1; | | p = 6.9952990607058154858e-1; | |
| p = __fma_rn (p, t, 1.9507620287580568829e+0); | | p = __fma_rn (p, t, 1.9507620287580568829e+0); | |
| p = __fma_rn (p, t, 8.2810030904462690216e-1); | | p = __fma_rn (p, t, 8.2810030904462690216e-1); | |
| p = __fma_rn (p, t, 1.1279046353630280005e-1); | | p = __fma_rn (p, t, 1.1279046353630280005e-1); | |
| p = __fma_rn (p, t, 6.0537914739162189689e-3); | | p = __fma_rn (p, t, 6.0537914739162189689e-3); | |
| p = __fma_rn (p, t, 1.3714329569665128933e-4); | | p = __fma_rn (p, t, 1.3714329569665128933e-4); | |
| p = __fma_rn (p, t, 1.2964481560643197452e-6); | | p = __fma_rn (p, t, 1.2964481560643197452e-6); | |
| p = __fma_rn (p, t, 4.6156006321345332510e-9); | | p = __fma_rn (p, t, 4.6156006321345332510e-9); | |
| p = __fma_rn (p, t, 4.5344689563209398450e-12); | | p = __fma_rn (p, t, 4.5344689563209398450e-12); | |
| q = t+ 1.5771922386662040546e+0; | | q = t+ 1.5771922386662040546e+0; | |
| | | | |
| skipping to change at line 1648 | | skipping to change at line 1526 | |
| q = __fma_rn (q, t, 6.0574830550097140404e-3); | | q = __fma_rn (q, t, 6.0574830550097140404e-3); | |
| q = __fma_rn (q, t, 1.3715891988350205065e-4); | | q = __fma_rn (q, t, 1.3715891988350205065e-4); | |
| q = __fma_rn (q, t, 1.2964671850944981713e-6); | | q = __fma_rn (q, t, 1.2964671850944981713e-6); | |
| q = __fma_rn (q, t, 4.6156017600933592558e-9); | | q = __fma_rn (q, t, 4.6156017600933592558e-9); | |
| q = __fma_rn (q, t, 4.5344687377088206783e-12); | | q = __fma_rn (q, t, 4.5344687377088206783e-12); | |
| t = p / (q * t); | | t = p / (q * t); | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_erfc(double a)) | | static __forceinline__ double erfc(double a) | |
| { | | { | |
| double p, q, h, l; | | double p, q, h, l; | |
| int ahi; | | int ahi; | |
| | | | |
| ahi = __double2hiint(a); | | ahi = __double2hiint(a); | |
| if (ahi < (int)0x3fea0400) { /* 1665/2048 */ | | if (ahi < (int)0x3fea0400) { /* 1665/2048 */ | |
|
| return 1.0 - __cuda_erf(a); | | return 1.0 - erf(a); | |
| } | | } | |
| if (ahi < (int)0x40140000) { /* 5.0 */ | | if (ahi < (int)0x40140000) { /* 5.0 */ | |
| /* On the interval [1665/2048, 5.0] the following approximation is used
: | | /* On the interval [1665/2048, 5.0] the following approximation is used
: | |
| erfc(a) = (1.0 + 1/a * r(1/a)) * 1/a * 0.5 * exp(-a*a), where the ra
nge | | erfc(a) = (1.0 + 1/a * r(1/a)) * 1/a * 0.5 * exp(-a*a), where the ra
nge | |
| of r(1/a) is approximately [-0.17, 0.11]. r(1/a) is computed by rati
onal | | of r(1/a) is approximately [-0.17, 0.11]. r(1/a) is computed by rati
onal | |
| approximation. | | approximation. | |
| */ | | */ | |
| double t; | | double t; | |
| | | | |
|
| t = 1.0 / a; | | t = __internal_fast_rcp(a); | |
| p = -1.0000000252849461E+000; | | p = -1.0000000252849461E+000; | |
| p = __fma_rn (p, t, -7.3398971987771156E-001); | | p = __fma_rn (p, t, -7.3398971987771156E-001); | |
| p = __fma_rn (p, t, -1.4685633784433072E-001); | | p = __fma_rn (p, t, -1.4685633784433072E-001); | |
| p = __fma_rn (p, t, 1.2963557011001836E-001); | | p = __fma_rn (p, t, 1.2963557011001836E-001); | |
| p = __fma_rn (p, t, 1.0901177826674287E-001); | | p = __fma_rn (p, t, 1.0901177826674287E-001); | |
| p = __fma_rn (p, t, 3.9250612663155882E-002); | | p = __fma_rn (p, t, 3.9250612663155882E-002); | |
| p = __fma_rn (p, t, 7.5883167167654269E-003); | | p = __fma_rn (p, t, 7.5883167167654269E-003); | |
| p = __fma_rn (p, t, 6.6438196820856965E-004); | | p = __fma_rn (p, t, 6.6438196820856965E-004); | |
| q = t + 2.7339900293714838E+000; | | q = t + 2.7339900293714838E+000; | |
| q = __fma_rn (q, t, 3.3580762542361291E+000); | | q = __fma_rn (q, t, 3.3580762542361291E+000); | |
| q = __fma_rn (q, t, 2.4165688909166021E+000); | | q = __fma_rn (q, t, 2.4165688909166021E+000); | |
| q = __fma_rn (q, t, 1.1092158770004934E+000); | | q = __fma_rn (q, t, 1.1092158770004934E+000); | |
| q = __fma_rn (q, t, 3.2845571970789467E-001); | | q = __fma_rn (q, t, 3.2845571970789467E-001); | |
| q = __fma_rn (q, t, 5.9110343116276186E-002); | | q = __fma_rn (q, t, 5.9110343116276186E-002); | |
| q = __fma_rn (q, t, 5.1750858802842702E-003); | | q = __fma_rn (q, t, 5.1750858802842702E-003); | |
| q = __fma_rn (q, t, 1.2937416364002241E-009); | | q = __fma_rn (q, t, 1.2937416364002241E-009); | |
|
| q = 1.0 / q; | | q = __internal_fast_rcp(q); | |
| p = p * q; | | p = p * q; | |
| p = p * t; | | p = p * t; | |
| h = a * a; | | h = a * a; | |
| l = __fma_rn (a, a, -h); | | l = __fma_rn (a, a, -h); | |
| q = __internal_exp_kernel(-h, -1); | | q = __internal_exp_kernel(-h, -1); | |
| q = __fma_rn (l, -q, q); | | q = __fma_rn (l, -q, q); | |
| p = __fma_rn (p, q, q); | | p = __fma_rn (p, q, q); | |
| p = p * t; | | p = p * t; | |
| } else { | | } else { | |
| /* max error 4 ulps on [5, 27.3] */ | | /* max error 4 ulps on [5, 27.3] */ | |
| double ooa, ooasq; | | double ooa, ooasq; | |
| | | | |
|
| ooa = 1.0 / a; | | ooa = __internal_fast_rcp(a); | |
| ooasq = ooa * ooa; | | ooasq = ooa * ooa; | |
| p = -4.0025406686930527E+005; | | p = -4.0025406686930527E+005; | |
| p = __fma_rn (p, ooasq, 1.4420582543942123E+005); | | p = __fma_rn (p, ooasq, 1.4420582543942123E+005); | |
| p = __fma_rn (p, ooasq, -2.7664185780951841E+004); | | p = __fma_rn (p, ooasq, -2.7664185780951841E+004); | |
| p = __fma_rn (p, ooasq, 4.1144611644767283E+003); | | p = __fma_rn (p, ooasq, 4.1144611644767283E+003); | |
| p = __fma_rn (p, ooasq, -5.8706000519209351E+002); | | p = __fma_rn (p, ooasq, -5.8706000519209351E+002); | |
| p = __fma_rn (p, ooasq, 9.1490086446323375E+001); | | p = __fma_rn (p, ooasq, 9.1490086446323375E+001); | |
| p = __fma_rn (p, ooasq, -1.6659491387740221E+001); | | p = __fma_rn (p, ooasq, -1.6659491387740221E+001); | |
| p = __fma_rn (p, ooasq, 3.7024804085481784E+000); | | p = __fma_rn (p, ooasq, 3.7024804085481784E+000); | |
| p = __fma_rn (p, ooasq, -1.0578553994424316E+000); | | p = __fma_rn (p, ooasq, -1.0578553994424316E+000); | |
| | | | |
| skipping to change at line 1723 | | skipping to change at line 1601 | |
| p = p * ooa; | | p = p * ooa; | |
| p = p * q; | | p = p * q; | |
| if (a > 27.3) { | | if (a > 27.3) { | |
| p = 0.0; | | p = 0.0; | |
| } | | } | |
| } | | } | |
| return p; | | return p; | |
| } | | } | |
| | | | |
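
One detail worth noting in the mid-range branch above: exp(-a*a) is computed with a compensation term, because the rounding error of squaring a would otherwise be amplified by the exponential. A minimal C sketch of that sub-step (the helper name is mine):

    #include <math.h>

    /* Sketch: exp(-a*a) with compensation for the squaring error,
       as in the h/l/q sequence above. */
    static double exp_neg_square(double a)
    {
        double h = a * a;          /* rounded square */
        double l = fma(a, a, -h);  /* exact error of the square */
        double q = exp(-h);
        /* exp(-(h+l)) ~= exp(-h)*(1 - l) = q - l*q for tiny l */
        return fma(l, -q, q);
    }
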
| /* approximate 1.0/(a*gamma(a)) on [-0.5,0.5] */ | | /* approximate 1.0/(a*gamma(a)) on [-0.5,0.5] */ | |
|
| __device_func__(double __internal_tgamma_kernel(double a)) | | static __forceinline__ double __internal_tgamma_kernel(double a) | |
| { | | { | |
| double t; | | double t; | |
| t = -4.42689340712524750E-010; | | t = -4.42689340712524750E-010; | |
| t = __fma_rn (t, a, -2.02665918466589540E-007); | | t = __fma_rn (t, a, -2.02665918466589540E-007); | |
| t = __fma_rn (t, a, 1.13812117211195270E-006); | | t = __fma_rn (t, a, 1.13812117211195270E-006); | |
| t = __fma_rn (t, a, -1.25077348166307480E-006); | | t = __fma_rn (t, a, -1.25077348166307480E-006); | |
| t = __fma_rn (t, a, -2.01365017404087710E-005); | | t = __fma_rn (t, a, -2.01365017404087710E-005); | |
| t = __fma_rn (t, a, 1.28050126073544860E-004); | | t = __fma_rn (t, a, 1.28050126073544860E-004); | |
| t = __fma_rn (t, a, -2.15241408115274180E-004); | | t = __fma_rn (t, a, -2.15241408115274180E-004); | |
| t = __fma_rn (t, a, -1.16516754597046040E-003); | | t = __fma_rn (t, a, -1.16516754597046040E-003); | |
| | | | |
| skipping to change at line 1746 | | skipping to change at line 1624 | |
| t = __fma_rn (t, a, -4.21977345547223940E-002); | | t = __fma_rn (t, a, -4.21977345547223940E-002); | |
| t = __fma_rn (t, a, 1.66538611382503560E-001); | | t = __fma_rn (t, a, 1.66538611382503560E-001); | |
| t = __fma_rn (t, a, -4.20026350341054440E-002); | | t = __fma_rn (t, a, -4.20026350341054440E-002); | |
| t = __fma_rn (t, a, -6.55878071520257120E-001); | | t = __fma_rn (t, a, -6.55878071520257120E-001); | |
| t = __fma_rn (t, a, 5.77215664901532870E-001); | | t = __fma_rn (t, a, 5.77215664901532870E-001); | |
| t = __fma_rn (t, a, 1.00000000000000000E+000); | | t = __fma_rn (t, a, 1.00000000000000000E+000); | |
| return t; | | return t; | |
| } | | } | |
| | | | |
| /* Stirling approximation for gamma(a), a > 20 */ | | /* Stirling approximation for gamma(a), a > 20 */ | |
|
| __device_func__(double __internal_stirling_poly(double a)) | | static __forceinline__ double __internal_stirling_poly(double a) | |
| { | | { | |
|
| double x = 1.0 / a; | | double x = __internal_fast_rcp(a); | |
| double z = 0.0; | | double z = 0.0; | |
| z = __fma_rn (z, x, 8.3949872067208726e-004); | | z = __fma_rn (z, x, 8.3949872067208726e-004); | |
| z = __fma_rn (z, x, -5.1717909082605919e-005); | | z = __fma_rn (z, x, -5.1717909082605919e-005); | |
| z = __fma_rn (z, x, -5.9216643735369393e-004); | | z = __fma_rn (z, x, -5.9216643735369393e-004); | |
| z = __fma_rn (z, x, 6.9728137583658571e-005); | | z = __fma_rn (z, x, 6.9728137583658571e-005); | |
| z = __fma_rn (z, x, 7.8403922172006662e-004); | | z = __fma_rn (z, x, 7.8403922172006662e-004); | |
| z = __fma_rn (z, x, -2.2947209362139917e-004); | | z = __fma_rn (z, x, -2.2947209362139917e-004); | |
| z = __fma_rn (z, x, -2.6813271604938273e-003); | | z = __fma_rn (z, x, -2.6813271604938273e-003); | |
| z = __fma_rn (z, x, 3.4722222222222220e-003); | | z = __fma_rn (z, x, 3.4722222222222220e-003); | |
| z = __fma_rn (z, x, 8.3333333333333329e-002); | | z = __fma_rn (z, x, 8.3333333333333329e-002); | |
| z = __fma_rn (z, x, 1.0000000000000000e+000); | | z = __fma_rn (z, x, 1.0000000000000000e+000); | |
| return z; | | return z; | |
| } | | } | |
| | | | |
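
The polynomial in 1/a above is the correction factor of Stirling's formula; its trailing coefficients 1/12 = 8.3333...e-2 and 1/288 = 3.4722...e-3 are visible among the constants. In LaTeX:

    \Gamma(a) \approx \sqrt{2\pi}\; a^{\,a-\frac{1}{2}}\, e^{-a}
    \left(1 + \frac{1}{12a} + \frac{1}{288a^{2}} - \frac{139}{51840a^{3}} - \cdots\right),

with __internal_stirling_poly returning the parenthesized series.
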
|
| __device_func__(double __internal_tgamma_stirling(double a)) | | static __forceinline__ double __internal_tgamma_stirling(double a) | |
| { | | { | |
| if (a < 1.7162437695630274e+002) { | | if (a < 1.7162437695630274e+002) { | |
|
| #if defined(__GNUC__) && !defined(__CUDABE__) | | | |
| volatile | | | |
| #endif | | | |
| double t_hi, t_lo, e; | | double t_hi, t_lo, e; | |
| | | | |
| double2 loga, prod; | | double2 loga, prod; | |
| double z = __internal_stirling_poly (a); | | double z = __internal_stirling_poly (a); | |
| double b = a - 0.5; | | double b = a - 0.5; | |
| | | | |
| /* compute log(a) in double-double format*/ | | /* compute log(a) in double-double format*/ | |
| loga = __internal_log_ext_prec(a); | | loga = __internal_log_ext_prec(a); | |
| | | | |
| /* compute (a - 0.5) * log(a) in double-double format */ | | /* compute (a - 0.5) * log(a) in double-double format */ | |
| | | | |
| skipping to change at line 1791 | | skipping to change at line 1666 | |
| t_lo = __fma_rn (loga.x, b, t_lo); | | t_lo = __fma_rn (loga.x, b, t_lo); | |
| prod.y = e = t_hi + t_lo; | | prod.y = e = t_hi + t_lo; | |
| prod.x = (t_hi - e) + t_lo; | | prod.x = (t_hi - e) + t_lo; | |
| | | | |
| /* compute (a - 0.5) * log(a) - a in double-double format */ | | /* compute (a - 0.5) * log(a) - a in double-double format */ | |
| loga.y = -a; | | loga.y = -a; | |
| loga.x = 0.0; | | loga.x = 0.0; | |
| prod = __internal_ddadd_xgty (prod, loga); | | prod = __internal_ddadd_xgty (prod, loga); | |
| | | | |
| /* compute pow(a,b) = exp(b*log(a)) */ | | /* compute pow(a,b) = exp(b*log(a)) */ | |
|
| a = __cuda_exp(prod.y); | | a = exp(prod.y); | |
| /* prevent -INF + INF = NaN */ | | /* prevent -INF + INF = NaN */ | |
|
| if (!__cuda___isinf(a)) { | | if (!__isinf(a)) { | |
| /* if prod.x is much smaller than prod.y, then exp(prod.y + prod.x) ~
= | | /* if prod.x is much smaller than prod.y, then exp(prod.y + prod.x) ~
= | |
| * exp(prod.y) + prod.x * exp(prod.y) | | * exp(prod.y) + prod.x * exp(prod.y) | |
| */ | | */ | |
| a = __fma_rn (a, prod.x, a); | | a = __fma_rn (a, prod.x, a); | |
| } | | } | |
| a = __fma_rn (a, CUDART_SQRT_2PI_HI, a * CUDART_SQRT_2PI_LO); | | a = __fma_rn (a, CUDART_SQRT_2PI_HI, a * CUDART_SQRT_2PI_LO); | |
| return a * z; | | return a * z; | |
| } else { | | } else { | |
| return CUDART_INF; | | return CUDART_INF; | |
| } | | } | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_tgamma(double a)) | | static __forceinline__ double tgamma(double a) | |
| { | | { | |
| double s, xx, x = a; | | double s, xx, x = a; | |
|
| if (__cuda___isnan(a)) { | | if (__isnan(a)) { | |
| return a + a; | | return a + a; | |
| } | | } | |
|
| if (__cuda_fabs(x) < 15.0) { | | if (fabs(x) < 15.0) { | |
| /* Based on: Kraemer, W.: "Berechnung der Gammafunktion G(x) fuer reelle | | /* Based on: Kraemer, W.: "Berechnung der Gammafunktion G(x) fuer reelle | |
| * Punkt- und Intervallargumente". Zeitschrift fuer angewandte Mathematik | | * Punkt- und Intervallargumente". Zeitschrift fuer angewandte Mathematik | |
| * und Mechanik, Vol. 70 (1990), No. 6, pp. 581-584 | | * und Mechanik, Vol. 70 (1990), No. 6, pp. 581-584 | |
| */ | | */ | |
| if (x >= 0.0) { | | if (x >= 0.0) { | |
| s = 1.0; | | s = 1.0; | |
| xx = x; | | xx = x; | |
| while (xx > 1.5) { | | while (xx > 1.5) { | |
| s = __fma_rn(s, xx, -s); | | s = __fma_rn(s, xx, -s); | |
| xx = xx - 1.0; | | xx = xx - 1.0; | |
| | | | |
| skipping to change at line 1835 | | skipping to change at line 1710 | |
| xx = xx - 1.0; | | xx = xx - 1.0; | |
| } | | } | |
| xx = __internal_tgamma_kernel (xx); | | xx = __internal_tgamma_kernel (xx); | |
| if (x < 0.5) { | | if (x < 0.5) { | |
| xx = xx * x; | | xx = xx * x; | |
| } | | } | |
| s = s / xx; | | s = s / xx; | |
| } else { | | } else { | |
| xx = x; | | xx = x; | |
| s = xx; | | s = xx; | |
|
| if (x == __cuda_trunc(x)) { | | if (x == trunc(x)) { | |
| return CUDART_NAN; | | return CUDART_NAN; | |
| } | | } | |
| while (xx < -0.5) { | | while (xx < -0.5) { | |
| s = __fma_rn (s, xx, s); | | s = __fma_rn (s, xx, s); | |
| xx = xx + 1.0; | | xx = xx + 1.0; | |
| } | | } | |
| xx = __internal_tgamma_kernel (xx); | | xx = __internal_tgamma_kernel (xx); | |
| s = s * xx; | | s = s * xx; | |
| s = 1.0 / s; | | s = 1.0 / s; | |
| } | | } | |
| return s; | | return s; | |
| } else { | | } else { | |
| if (x >= 0.0) { | | if (x >= 0.0) { | |
| return __internal_tgamma_stirling (x); | | return __internal_tgamma_stirling (x); | |
| } else { | | } else { | |
| double t; | | double t; | |
| int quot; | | int quot; | |
|
| if (x == __cuda_trunc(x)) { | | if (x == trunc(x)) { | |
| return CUDART_NAN; | | return CUDART_NAN; | |
| } | | } | |
| if (x < -185.0) { | | if (x < -185.0) { | |
| int negative; | | int negative; | |
|
| x = __cuda_floor(x); | | x = floor(x); | |
| negative = ((x - (2.0 * __cuda_floor(0.5 * x))) == 1.0); | | negative = ((x - (2.0 * floor(0.5 * x))) == 1.0); | |
| return negative ? CUDART_NEG_ZERO : CUDART_ZERO; | | return negative ? CUDART_NEG_ZERO : CUDART_ZERO; | |
| } | | } | |
| /* compute sin(pi*x) accurately */ | | /* compute sin(pi*x) accurately */ | |
|
| xx = __cuda_rint (__internal_twice(x)); | | xx = rint (__internal_twice(x)); | |
| quot = (int)xx; | | quot = (int)xx; | |
| xx = __fma_rn (-0.5, xx, x); | | xx = __fma_rn (-0.5, xx, x); | |
| xx = xx * CUDART_PI; | | xx = xx * CUDART_PI; | |
| if (quot & 1) { | | if (quot & 1) { | |
| xx = __internal_cos_kerneld (xx); | | xx = __internal_cos_kerneld (xx); | |
| } else { | | } else { | |
| xx = __internal_sin_kerneld (xx); | | xx = __internal_sin_kerneld (xx); | |
| } | | } | |
| if (quot & 2) { | | if (quot & 2) { | |
| xx = -xx; | | xx = -xx; | |
| } | | } | |
|
| x = __cuda_fabs (x); | | x = fabs (x); | |
| s = __cuda_exp (-x); | | s = exp (-x); | |
| t = x - 0.5; | | t = x - 0.5; | |
| if (x > 140.0) t = __internal_half(t); | | if (x > 140.0) t = __internal_half(t); | |
|
| t = __cuda_pow (x, t); | | t = pow (x, t); | |
| if (x > 140.0) s = s * t; | | if (x > 140.0) s = s * t; | |
| s = s * __internal_stirling_poly (x); | | s = s * __internal_stirling_poly (x); | |
| s = s * x; | | s = s * x; | |
| s = s * xx; | | s = s * xx; | |
| s = 1.0 / s; | | s = 1.0 / s; | |
| s = __fma_rn (s, CUDART_SQRT_PIO2_HI, CUDART_SQRT_PIO2_LO * s); | | s = __fma_rn (s, CUDART_SQRT_PIO2_HI, CUDART_SQRT_PIO2_LO * s); | |
| s = s / t; | | s = s / t; | |
| return s; | | return s; | |
| } | | } | |
| } | | } | |
| } | | } | |
| | | | |
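
The negative branch of tgamma above rests on the reflection formula

    \Gamma(x)\,\Gamma(1-x) = \frac{\pi}{\sin(\pi x)},

with sin(pi*x) computed from a reduced argument (the quot/xx logic) so that accuracy is preserved near the poles at the negative integers.
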
|
| __device_func__(double __internal_lgamma_pos(double a)) | | static __forceinline__ double __internal_lgamma_pos(double a) | |
| { | | { | |
| double sum; | | double sum; | |
| double s, t; | | double s, t; | |
| | | | |
| if (a == CUDART_INF) { | | if (a == CUDART_INF) { | |
| return a; | | return a; | |
| } | | } | |
| if (a >= 3.0) { | | if (a >= 3.0) { | |
| if (a >= 8.0) { | | if (a >= 8.0) { | |
| /* Stirling approximation; coefficients from Hart et al, "Computer | | /* Stirling approximation; coefficients from Hart et al, "Computer | |
| * Approximations", Wiley 1968. Approximation 5404. | | * Approximations", Wiley 1968. Approximation 5404. | |
| */ | | */ | |
|
| s = 1.0 / a; | | s = __internal_fast_rcp(a); | |
| t = s * s; | | t = s * s; | |
| sum = -0.1633436431e-2; | | sum = -0.1633436431e-2; | |
| sum = __fma_rn (sum, t, 0.83645878922e-3); | | sum = __fma_rn (sum, t, 0.83645878922e-3); | |
| sum = __fma_rn (sum, t, -0.5951896861197e-3); | | sum = __fma_rn (sum, t, -0.5951896861197e-3); | |
| sum = __fma_rn (sum, t, 0.793650576493454e-3); | | sum = __fma_rn (sum, t, 0.793650576493454e-3); | |
| sum = __fma_rn (sum, t, -0.277777777735865004e-2); | | sum = __fma_rn (sum, t, -0.277777777735865004e-2); | |
| sum = __fma_rn (sum, t, 0.833333333333331018375e-1); | | sum = __fma_rn (sum, t, 0.833333333333331018375e-1); | |
| sum = __fma_rn (sum, s, 0.918938533204672); | | sum = __fma_rn (sum, s, 0.918938533204672); | |
|
| s = __internal_half(__cuda_log (a)); | | s = __internal_half(log (a)); | |
| t = a - 0.5; | | t = a - 0.5; | |
| s = s * t; | | s = s * t; | |
| t = s - a; | | t = s - a; | |
| s = s + sum; | | s = s + sum; | |
| t = t + s; | | t = t + s; | |
| return t; | | return t; | |
| } else { | | } else { | |
| a = a - 3.0; | | a = a - 3.0; | |
| s = -4.02412642744125560E+003; | | s = -4.02412642744125560E+003; | |
| s = __fma_rn (s, a, -2.97693796998962000E+005); | | s = __fma_rn (s, a, -2.97693796998962000E+005); | |
| | | | |
| skipping to change at line 2009 | | skipping to change at line 1884 | |
| t = __fma_rn (t, a, -1.16484324388538480E-003); | | t = __fma_rn (t, a, -1.16484324388538480E-003); | |
| t = __fma_rn (t, a, 7.21883433044470670E-003); | | t = __fma_rn (t, a, 7.21883433044470670E-003); | |
| t = __fma_rn (t, a, -9.62194579514229560E-003); | | t = __fma_rn (t, a, -9.62194579514229560E-003); | |
| t = __fma_rn (t, a, -4.21977386992884450E-002); | | t = __fma_rn (t, a, -4.21977386992884450E-002); | |
| t = __fma_rn (t, a, 1.66538611813682460E-001); | | t = __fma_rn (t, a, 1.66538611813682460E-001); | |
| t = __fma_rn (t, a, -4.20026350606819980E-002); | | t = __fma_rn (t, a, -4.20026350606819980E-002); | |
| t = __fma_rn (t, a, -6.55878071519427450E-001); | | t = __fma_rn (t, a, -6.55878071519427450E-001); | |
| t = __fma_rn (t, a, 5.77215664901523870E-001); | | t = __fma_rn (t, a, 5.77215664901523870E-001); | |
| t = t * a; | | t = t * a; | |
| t = __fma_rn (t, a, a); | | t = __fma_rn (t, a, a); | |
|
| return -__cuda_log (t); | | return -log (t); | |
| } | | } | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_lgamma(double a)) | | static __forceinline__ double lgamma(double a) | |
| { | | { | |
| double t; | | double t; | |
| double i; | | double i; | |
| long long int quot; | | long long int quot; | |
|
| if (__cuda___isnan(a)) { | | if (__isnan(a)) { | |
| return a + a; | | return a + a; | |
| } | | } | |
|
| t = __internal_lgamma_pos(__cuda_fabs(a)); | | t = __internal_lgamma_pos(fabs(a)); | |
| if (a >= 0.0) return t; | | if (a >= 0.0) return t; | |
|
| a = __cuda_fabs(a); | | a = fabs(a); | |
| i = __cuda_trunc(a); | | i = trunc(a); | |
| if (a == i) return CUDART_INF; /* a is an integer: return infinity */ | | if (a == i) return CUDART_INF; /* a is an integer: return infinity */ | |
|
| if (a < 1e-19) return -__cuda_log(a); | | if (a < 1e-19) return -log(a); | |
| i = __cuda_rint (2.0 * a); | | i = rint (2.0 * a); | |
| quot = (long long int)i; | | quot = (long long int)i; | |
| i = __fma_rn (-0.5, i, a); | | i = __fma_rn (-0.5, i, a); | |
| i = i * CUDART_PI; | | i = i * CUDART_PI; | |
| if (quot & 1) { | | if (quot & 1) { | |
| i = __internal_cos_kerneld(i); | | i = __internal_cos_kerneld(i); | |
| } else { | | } else { | |
| i = __internal_sin_kerneld(i); | | i = __internal_sin_kerneld(i); | |
| } | | } | |
|
| i = __cuda_fabs(i); | | i = fabs(i); | |
| t = __cuda_log(CUDART_PI / (i * a)) - t; | | t = log(CUDART_PI / (i * a)) - t; | |
| return t; | | return t; | |
| } | | } | |
| | | | |
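
For negative arguments, lgamma above uses the logarithmic form of the same reflection identity: writing a > 0 for the magnitude, the final lines compute

    \ln\bigl|\Gamma(-a)\bigr| = \ln\frac{\pi}{a\,\lvert\sin(\pi a)\rvert} - \ln\Gamma(a),

which is exactly t = log(CUDART_PI / (i * a)) - t with i = |sin(pi*a)| and t the positive-argument result.
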
|
| __device_func__(double __cuda_ldexp(double a, int b)) | | static __forceinline__ double ldexp(double a, int b) | |
| { | | { | |
|
| double fa = __cuda_fabs (a); | | double fa = fabs (a); | |
| if ((fa == CUDART_ZERO) || (fa == CUDART_INF) || (!(fa <= CUDART_INF))) { | | if ((fa == CUDART_ZERO) || (fa == CUDART_INF) || (!(fa <= CUDART_INF))) { | |
| return a + a; | | return a + a; | |
| } | | } | |
| if (b == 0) { | | if (b == 0) { | |
| return a; | | return a; | |
| } | | } | |
| if (b > 2200) b = 2200; | | if (b > 2200) b = 2200; | |
| if (b < -2200) b = -2200; | | if (b < -2200) b = -2200; | |
|
| if (__cuda_abs (b) < 1022) { | | if (abs (b) < 1022) { | |
| return a * __internal_exp2i_kernel(b); | | return a * __internal_exp2i_kernel(b); | |
| } | | } | |
|
| if (__cuda_abs (b) < 2044) { | | if (abs (b) < 2044) { | |
| int bhalf = b / 2; | | int bhalf = b / 2; | |
| return a * __internal_exp2i_kernel (bhalf) * | | return a * __internal_exp2i_kernel (bhalf) * | |
| __internal_exp2i_kernel (b - bhalf); | | __internal_exp2i_kernel (b - bhalf); | |
| } else { | | } else { | |
| int bquarter = b / 4; | | int bquarter = b / 4; | |
| double t = __internal_exp2i_kernel(bquarter); | | double t = __internal_exp2i_kernel(bquarter); | |
| return a * t * t * t *__internal_exp2i_kernel (b - 3 * bquarter); | | return a * t * t * t *__internal_exp2i_kernel (b - 3 * bquarter); | |
| } | | } | |
| } | | } | |
| | | | |
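
Because a single power of two near the extremes of the exponent range is not safely representable, ldexp above splits large b into two halves (for |b| < 2044) or three quarters plus a remainder, multiplying stepwise so no intermediate factor overflows or underflows. A host-side C sketch of the two-factor case; the helper names are mine, and libm's ldexp stands in for the header's __internal_exp2i_kernel:

    #include <math.h>

    /* Sketch: hypothetical stand-in for __internal_exp2i_kernel. */
    static double my_exp2i(int e) { return ldexp(1.0, e); }

    /* Sketch: scale a by 2^b in pieces, assuming |b| <= 2044 so that
       each half is a representable power of two. */
    static double scale_by_pow2(double a, int b)
    {
        if (b > -1022 && b < 1022)       /* one factor is safe */
            return a * my_exp2i(b);
        int h = b / 2;                   /* split to keep factors finite */
        return a * my_exp2i(h) * my_exp2i(b - h);
    }
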
|
| __device_func__(double __cuda_scalbn(double a, int b)) | | static __forceinline__ double scalbn(double a, int b) | |
| { | | { | |
| /* On binary systems, ldexp(x,exp) is equivalent to scalbn(x,exp) */ | | /* On binary systems, ldexp(x,exp) is equivalent to scalbn(x,exp) */ | |
|
| return __cuda_ldexp(a, b); | | return ldexp(a, b); | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_scalbln(double a, long int b)) | | static __forceinline__ double scalbln(double a, long int b) | |
| { | | { | |
| #if defined(__LP64__) | | #if defined(__LP64__) | |
| /* clamp to integer range prior to conversion */ | | /* clamp to integer range prior to conversion */ | |
| if (b < -2147483648L) b = -2147483648L; | | if (b < -2147483648L) b = -2147483648L; | |
| if (b > 2147483647L) b = 2147483647L; | | if (b > 2147483647L) b = 2147483647L; | |
|
| #endif | | #endif /* __LP64__ */ | |
| return __cuda_scalbn(a, (int)b); | | return scalbn(a, (int)b); | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_frexp(double a, int *b)) | | static __forceinline__ double frexp(double a, int *b) | |
| { | | { | |
|
| double fa = __cuda_fabs(a); | | double fa = fabs(a); | |
| unsigned int expo; | | unsigned int expo; | |
| unsigned int denorm; | | unsigned int denorm; | |
| | | | |
| if (fa < CUDART_TWO_TO_M1022) { | | if (fa < CUDART_TWO_TO_M1022) { | |
| a *= CUDART_TWO_TO_54; | | a *= CUDART_TWO_TO_54; | |
| denorm = 54; | | denorm = 54; | |
| } else { | | } else { | |
| denorm = 0; | | denorm = 0; | |
| } | | } | |
| expo = (__double2hiint(a) >> 20) & 0x7ff; | | expo = (__double2hiint(a) >> 20) & 0x7ff; | |
| | | | |
| skipping to change at line 2107 | | skipping to change at line 1982 | |
| a = a + a; | | a = a + a; | |
| } else { | | } else { | |
| expo = expo - denorm - 1022; | | expo = expo - denorm - 1022; | |
| a = __longlong_as_double((__double_as_longlong(a) & 0x800fffffffffffffULL)| | | a = __longlong_as_double((__double_as_longlong(a) & 0x800fffffffffffffULL)| | |
| 0x3fe0000000000000ULL); | | 0x3fe0000000000000ULL); | |
| } | | } | |
| *b = expo; | | *b = expo; | |
| return a; | | return a; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_modf(double a, double *b)) | | static __forceinline__ double modf(double a, double *b) | |
| { | | { | |
| double t; | | double t; | |
|
| if (__cuda___finite(a)) { | | if (__finite(a)) { | |
| t = __cuda_trunc(a); | | t = trunc(a); | |
| *b = t; | | *b = t; | |
| t = a - t; | | t = a - t; | |
| return __internal_copysign_pos(t, a); | | return __internal_copysign_pos(t, a); | |
|
| } else if (__cuda___isinf(a)) { | | } else if (__isinf(a)) { | |
| t = 0.0; | | t = 0.0; | |
| *b = a; | | *b = a; | |
| return __internal_copysign_pos(t, a); | | return __internal_copysign_pos(t, a); | |
| } else { | | } else { | |
| *b = a + a; | | *b = a + a; | |
| return a + a; | | return a + a; | |
| } | | } | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_fmod(double a, double b)) | | static __forceinline__ double fmod(double a, double b) | |
| { | | { | |
| double orig_a = a; | | double orig_a = a; | |
| double orig_b = b; | | double orig_b = b; | |
|
| a = __cuda_fabs(a); | | a = fabs(a); | |
| b = __cuda_fabs(b); | | b = fabs(b); | |
| if (!((a <= CUDART_INF) && (b <= CUDART_INF))) { | | if (!((a <= CUDART_INF) && (b <= CUDART_INF))) { | |
| return orig_a + orig_b; | | return orig_a + orig_b; | |
| } | | } | |
| if (a == CUDART_INF || b == 0.0) { | | if (a == CUDART_INF || b == 0.0) { | |
| return CUDART_NAN; | | return CUDART_NAN; | |
| } else if (a >= b) { | | } else if (a >= b) { | |
| int bhi = __double2hiint(b); | | int bhi = __double2hiint(b); | |
| int blo = __double2loint(b); | | int blo = __double2loint(b); | |
| int ahi = __double2hiint(a); | | int ahi = __double2hiint(a); | |
| double scaled_b = 0.0; | | double scaled_b = 0.0; | |
| | | | |
| skipping to change at line 2168 | | skipping to change at line 2043 | |
| a -= scaled_b; | | a -= scaled_b; | |
| } | | } | |
| scaled_b *= 0.5; | | scaled_b *= 0.5; | |
| } | | } | |
| return __internal_copysign_pos(a, orig_a); | | return __internal_copysign_pos(a, orig_a); | |
| } else { | | } else { | |
| return orig_a; | | return orig_a; | |
| } | | } | |
| } | | } | |
| | | | |
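
The loop structure of fmod above is binary long division on magnitudes: scale |b| up by powers of two until just below |a|, then halve it back down, subtracting whenever it still fits. The header performs the scaling on the exponent field directly; the plain-C sketch below (my simplification, assuming finite inputs with a >= b > 0) uses ordinary multiplies:

    /* Sketch: fmod core as restoring binary long division.
       Assumes a and b are finite with a >= b > 0. */
    static double fmod_core(double a, double b)
    {
        double scaled_b = b;
        while (scaled_b * 2.0 <= a)   /* scale b up by powers of two */
            scaled_b *= 2.0;
        while (scaled_b >= b) {       /* walk back down, subtracting */
            if (a >= scaled_b)
                a -= scaled_b;
            scaled_b *= 0.5;
        }
        return a;                     /* remainder in [0, b) */
    }
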
|
| __device_func__(double __cuda_remainder(double a, double b)) | | static __forceinline__ double remainder(double a, double b) | |
| { | | { | |
| double orig_a; | | double orig_a; | |
| double twoa = 0.0; | | double twoa = 0.0; | |
| unsigned int quot0 = 0; /* quotient bit 0 */ | | unsigned int quot0 = 0; /* quotient bit 0 */ | |
| int bhi; | | int bhi; | |
| int blo; | | int blo; | |
| int ahi; | | int ahi; | |
|
| if (__cuda___isnan(a) || __cuda___isnan(b)) { | | if (__isnan(a) || __isnan(b)) { | |
| return a + b; | | return a + b; | |
| } | | } | |
| orig_a = a; | | orig_a = a; | |
|
| a = __cuda_fabs(a); | | a = fabs(a); | |
| b = __cuda_fabs(b); | | b = fabs(b); | |
| if (a == CUDART_INF || b == 0.0) { | | if (a == CUDART_INF || b == 0.0) { | |
| return CUDART_NAN; | | return CUDART_NAN; | |
| } else if (a >= b) { | | } else if (a >= b) { | |
| double scaled_b = 0.0; | | double scaled_b = 0.0; | |
| bhi = __double2hiint(b); | | bhi = __double2hiint(b); | |
| blo = __double2loint(b); | | blo = __double2loint(b); | |
| ahi = __double2hiint(a); | | ahi = __double2hiint(a); | |
| if (b < CUDART_TWO_TO_M1022) { | | if (b < CUDART_TWO_TO_M1022) { | |
| double t = b; | | double t = b; | |
| while ((t < a) && (t < CUDART_TWO_TO_M1022)) { | | while ((t < a) && (t < CUDART_TWO_TO_M1022)) { | |
| | | | |
| skipping to change at line 2225 | | skipping to change at line 2100 | |
| if ((twoa > b) || ((twoa == b) && quot0)) { | | if ((twoa > b) || ((twoa == b) && quot0)) { | |
| a -= b; | | a -= b; | |
| } | | } | |
| bhi = __double2hiint(a); | | bhi = __double2hiint(a); | |
| blo = __double2loint(a); | | blo = __double2loint(a); | |
| ahi = __double2hiint(orig_a); | | ahi = __double2hiint(orig_a); | |
| a = __hiloint2double((ahi & 0x80000000) ^ bhi, blo); | | a = __hiloint2double((ahi & 0x80000000) ^ bhi, blo); | |
| return a; | | return a; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_remquo(double a, double b, int *c)) | | static __forceinline__ double remquo(double a, double b, int *c) | |
| { | | { | |
| double orig_a; | | double orig_a; | |
| double twoa = 0.0; | | double twoa = 0.0; | |
| unsigned int quot = 0; /* trailing quotient bits */ | | unsigned int quot = 0; /* trailing quotient bits */ | |
| unsigned int sign; | | unsigned int sign; | |
| int bhi; | | int bhi; | |
| int blo; | | int blo; | |
| int ahi; | | int ahi; | |
|
| if (__cuda___isnan(a) || __cuda___isnan(b)) { | | if (__isnan(a) || __isnan(b)) { | |
| *c = quot; | | *c = quot; | |
| return a + b; | | return a + b; | |
| } | | } | |
| orig_a = a; | | orig_a = a; | |
|
| sign = 0 - (__cuda___signbit(a) != __cuda___signbit(b)); | | sign = 0 - ((__double2hiint(a) ^ __double2hiint(b)) < 0); | |
| a = __cuda_fabs(a); | | a = fabs(a); | |
| b = __cuda_fabs(b); | | b = fabs(b); | |
| if (a == CUDART_INF || b == 0.0) { | | if (a == CUDART_INF || b == 0.0) { | |
| *c = quot; | | *c = quot; | |
| return CUDART_NAN; | | return CUDART_NAN; | |
| } else if (a >= b) { | | } else if (a >= b) { | |
| double scaled_b = 0.0; | | double scaled_b = 0.0; | |
| bhi = __double2hiint(b); | | bhi = __double2hiint(b); | |
| blo = __double2loint(b); | | blo = __double2loint(b); | |
| ahi = __double2hiint(a); | | ahi = __double2hiint(a); | |
| if (b < CUDART_TWO_TO_M1022) { | | if (b < CUDART_TWO_TO_M1022) { | |
| double t = b; | | double t = b; | |
| | | | |
| skipping to change at line 2291 | | skipping to change at line 2166 | |
| blo = __double2loint(a); | | blo = __double2loint(a); | |
| ahi = __double2hiint(orig_a); | | ahi = __double2hiint(orig_a); | |
| a = __hiloint2double((ahi & 0x80000000) ^ bhi, blo); | | a = __hiloint2double((ahi & 0x80000000) ^ bhi, blo); | |
| quot = quot & CUDART_REMQUO_MASK_F; | | quot = quot & CUDART_REMQUO_MASK_F; | |
| quot = quot ^ sign; | | quot = quot ^ sign; | |
| quot = quot - sign; | | quot = quot - sign; | |
| *c = quot; | | *c = quot; | |
| return a; | | return a; | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_nextafter(double a, double b)) | | static __forceinline__ double nextafter(double a, double b) | |
| { | | { | |
| unsigned long long int ia; | | unsigned long long int ia; | |
| unsigned long long int ib; | | unsigned long long int ib; | |
| ia = __double_as_longlong(a); | | ia = __double_as_longlong(a); | |
| ib = __double_as_longlong(b); | | ib = __double_as_longlong(b); | |
|
| if (__cuda___isnan(a) || __cuda___isnan(b)) return a + b; /* NaN */ | | if (__isnan(a) || __isnan(b)) return a + b; /* NaN */ | |
| if (((ia | ib) << 1) == 0ULL) return b; | | if (((ia | ib) << 1) == 0ULL) return b; | |
| if ((ia + ia) == 0ULL) { | | if ((ia + ia) == 0ULL) { | |
| return __internal_copysign_pos(CUDART_MIN_DENORM, b); /* crossover */ | | return __internal_copysign_pos(CUDART_MIN_DENORM, b); /* crossover */ | |
| } | | } | |
| if ((a < b) && (a < 0.0)) ia--; | | if ((a < b) && (a < 0.0)) ia--; | |
| if ((a < b) && (a > 0.0)) ia++; | | if ((a < b) && (a > 0.0)) ia++; | |
| if ((a > b) && (a < 0.0)) ia++; | | if ((a > b) && (a < 0.0)) ia++; | |
| if ((a > b) && (a > 0.0)) ia--; | | if ((a > b) && (a > 0.0)) ia--; | |
| a = __longlong_as_double(ia); | | a = __longlong_as_double(ia); | |
| return a; | | return a; | |
| } | | } | |
| | | | |
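
nextafter above exploits the fact that finite IEEE-754 doubles of one sign, viewed as integers, are ordered the same way as the values they encode, so the adjacent representable number is one integer step away in the bit pattern. A minimal C sketch of the upward step for positive finite a (the function name is mine):

    #include <stdint.h>
    #include <string.h>

    /* Sketch: next representable double above a positive finite a,
       the idea behind the ia++/ia-- cases above. */
    static double next_up_positive(double a)
    {
        uint64_t ia;
        memcpy(&ia, &a, sizeof ia);   /* portable bit cast */
        ia += 1;                      /* adjacent pattern = adjacent value */
        memcpy(&a, &ia, sizeof a);
        return a;
    }
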
|
| __device_func__(double __cuda_nan(const char *tagp)) | | static __forceinline__ double nan(const char *tagp) | |
| { | | { | |
| unsigned long long int i; | | unsigned long long int i; | |
| | | | |
| i = __internal_nan_kernel (tagp); | | i = __internal_nan_kernel (tagp); | |
| i = (i & 0x000fffffffffffffULL) | 0x7ff8000000000000ULL; | | i = (i & 0x000fffffffffffffULL) | 0x7ff8000000000000ULL; | |
| return __longlong_as_double(i); | | return __longlong_as_double(i); | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_round(double a)) | | static __forceinline__ double round(double a) | |
| { | | { | |
|
| double fa = __cuda_fabs(a); | | double fa = fabs(a); | |
| if (fa >= CUDART_TWO_TO_52) { | | if (fa >= CUDART_TWO_TO_52) { | |
| return a; | | return a; | |
| } else { | | } else { | |
| double u; | | double u; | |
|
| u = __cuda_trunc(fa + 0.5); | | u = trunc(fa + 0.5); | |
| if (fa < 0.5) u = 0; | | if (fa < 0.5) u = 0; | |
| u = __internal_copysign_pos(u, a); | | u = __internal_copysign_pos(u, a); | |
| return u; | | return u; | |
| } | | } | |
| } | | } | |
| | | | |
|
| __device_func__(long long int __cuda_llround(double a)) | | static __forceinline__ long long int llround(double a) | |
| { | | { | |
|
| #if !defined(__CUDABE__) | | return (long long int)round(a); | |
| if (a >= 9223372036854775807.0) return 0x7fffffffffffffffLL; | | | |
| if (a <= -9223372036854775808.0) return 0x8000000000000000LL; | | | |
| #endif /* !__CUDABE__ */ | | | |
| return (long long int)(__cuda_round(a)); | | | |
| } | | } | |
| | | | |
|
| __device_func__(long int __cuda_lround(double a)) | | static __forceinline__ long int lround(double a) | |
| { | | { | |
| #if defined(__LP64__) | | #if defined(__LP64__) | |
|
| return (long int)(__cuda_llround(a)); | | return (long int)llround(a); | |
| #else /* __LP64__ */ | | #else /* __LP64__ */ | |
|
| #if !defined(__CUDABE__) | | return (long int)round(a); | |
| if (__cuda___isnan(a)) return 0x80000000L; | | | |
| if (a >= 2147483647.0) return 0x7fffffffL; | | | |
| if (a <= -2147483648.0) return 0x80000000L; | | | |
| #endif /* !__CUDABE__ */ | | | |
| return (long int)(__cuda_round(a)); | | | |
| #endif /* __LP64__ */ | | #endif /* __LP64__ */ | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_fdim(double a, double b)) | | static __forceinline__ double fdim(double a, double b) | |
| { | | { | |
| double t; | | double t; | |
| t = a - b; /* default also takes care of NaNs */ | | t = a - b; /* default also takes care of NaNs */ | |
| if (a <= b) { | | if (a <= b) { | |
| t = 0.0; | | t = 0.0; | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| | | | |
|
| __device_func__(int __cuda_ilogb(double a)) | | static __forceinline__ int ilogb(double a) | |
| { | | { | |
| unsigned long long int i; | | unsigned long long int i; | |
| unsigned int ihi; | | unsigned int ihi; | |
| unsigned int ilo; | | unsigned int ilo; | |
|
| if (__cuda___isnan(a)) return -INT_MAX-1; | | if (__isnan(a)) return -__cuda_INT_MAX-1; | |
| if (__cuda___isinf(a)) return INT_MAX; | | if (__isinf(a)) return __cuda_INT_MAX; | |
| if (a == 0.0) return -INT_MAX-1; | | if (a == 0.0) return -__cuda_INT_MAX-1; | |
| a = __cuda_fabs(a); | | a = fabs(a); | |
| ilo = __double2loint(a); | | ilo = __double2loint(a); | |
| ihi = __double2hiint(a); | | ihi = __double2hiint(a); | |
| i = ((unsigned long long int)ihi) << 32 | (unsigned long long int)ilo; | | i = ((unsigned long long int)ihi) << 32 | (unsigned long long int)ilo; | |
| if (a >= CUDART_TWO_TO_M1022) { | | if (a >= CUDART_TWO_TO_M1022) { | |
| return ((int)((ihi >> 20) & 0x7ff)) - 1023; | | return ((int)((ihi >> 20) & 0x7ff)) - 1023; | |
| } else { | | } else { | |
| return -1011 - __clzll(i); | | return -1011 - __clzll(i); | |
| } | | } | |
| } | | } | |
| | | | |
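
For normal numbers, ilogb above just reads the 11-bit exponent field and removes the bias of 1023; only denormals need the count-leading-zeros path. A small C sketch of the normal-number case (the helper name is mine):

    #include <stdint.h>
    #include <string.h>

    /* Sketch: unbiased exponent of a normal double, read straight
       from the bit pattern as in the fast path above. */
    static int exponent_of(double a)
    {
        uint64_t i;
        memcpy(&i, &a, sizeof i);
        return (int)((i >> 52) & 0x7ff) - 1023;  /* biased field - 1023 */
    }
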
|
| __device_func__(double __cuda_logb(double a)) | | static __forceinline__ double logb(double a) | |
| { | | { | |
| unsigned long long int i; | | unsigned long long int i; | |
| unsigned int ihi; | | unsigned int ihi; | |
| unsigned int ilo; | | unsigned int ilo; | |
|
| if (__cuda___isnan(a)) return a + a; | | if (__isnan(a)) return a + a; | |
| a = __cuda_fabs(a); | | a = fabs(a); | |
| if (a == CUDART_INF) return a; | | if (a == CUDART_INF) return a; | |
| if (a == 0.0) return -CUDART_INF; | | if (a == 0.0) return -CUDART_INF; | |
| ilo = __double2loint(a); | | ilo = __double2loint(a); | |
| ihi = __double2hiint(a); | | ihi = __double2hiint(a); | |
| i = ((unsigned long long int)ihi) << 32 | (unsigned long long int)ilo; | | i = ((unsigned long long int)ihi) << 32 | (unsigned long long int)ilo; | |
| if (a >= CUDART_TWO_TO_M1022) { | | if (a >= CUDART_TWO_TO_M1022) { | |
| return (double)((int)((ihi >> 20) & 0x7ff)) - 1023; | | return (double)((int)((ihi >> 20) & 0x7ff)) - 1023; | |
| } else { | | } else { | |
| int expo = -1011 - __clzll(i); | | int expo = -1011 - __clzll(i); | |
| return (double)expo; | | return (double)expo; | |
| } | | } | |
| } | | } | |
| | | | |
|
| __device_func__(double __cuda_fma(double a, double b, double c)) | | static __forceinline__ double fma(double a, double b, double c) | |
| { | | { | |
| return __fma_rn(a, b, c); | | return __fma_rn(a, b, c); | |
| } | | } | |
| | | | |
|
| #if __APPLE__ | | #if defined(__APPLE__) | |
| __device_func__(int __cuda___isfinited(double a)) | | | |
| | | static __forceinline__ int __isfinited(double a) | |
| { | | { | |
|
| return __cuda___finite(a); | | return __finite(a); | |
| } | | } | |
| | | | |
|
| __device_func__(int __cuda___signbitd(double a)) | | static __forceinline__ int __signbitd(double a) | |
| { | | { | |
|
| return __cuda___signbit(a); | | return __signbit(a); | |
| } | | } | |
|
| #endif | | | |
| | | | |
|
| #endif /* __cplusplus && __CUDACC__ */ | | #endif /* __APPLE__ */ | |
| | | | |
| | | #endif /* __CUDABE__ */ | |
| | | | |
| #endif /* __MATH_FUNCTIONS_DBL_PTX3_H__ */ | | #endif /* __MATH_FUNCTIONS_DBL_PTX3_H__ */ | |
| | | | |
End of changes. 226 change blocks. 396 lines changed or deleted, 264 lines changed or added.