cublas.h

/*
 * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws.  Users and possessors of this source code
 * are hereby granted a nonexclusive, royalty-free license to use this code
 * in individual and commercial software.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR

skipping to change at line 1209

 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
float CUBLASAPI cublasScnrm2 (int n, const cuComplex *x, int incx);
/* ----------------- CUBLAS double-complex BLAS1 functions ------------------ */

/*
 * cuDoubleComplex
 * zdotu (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, int incy)
 *
 * computes the dot product of two double-complex vectors. It returns the
 * dot product of the double-complex vectors x and y if successful, and
 * double-complex zero otherwise. It computes the sum for i = 0 to n - 1 of
 * x[lx + i * incx] * y[ly + i * incy], where lx = 1 if incx >= 0, else
 * lx = 1 + (1 - n) * incx; ly is defined in a similar way using incy.
 *
 * Input
 * -----
 * n      number of elements in input vectors
 * x      double-complex vector with n elements
 * incx   storage spacing between elements of x
 * y      double-complex vector with n elements
 * incy   storage spacing between elements of y
 *
 * Output
 * ------
 * returns double-complex dot product (zero if n <= 0)
 *
 * Reference: http://www.netlib.org/blas/zdotu.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to execute on GPU
 */
cuDoubleComplex CUBLASAPI cublasZdotu (int n, const cuDoubleComplex *x, int incx,
                                       const cuDoubleComplex *y, int incy);
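As a usage illustration (not part of the header), a minimal host program
calling cublasZdotu might look like the sketch below. It assumes the legacy
CUBLAS helper routines (cublasInit, cublasAlloc, cublasSetVector, cublasFree,
cublasShutdown) declared elsewhere in this header, and the cuDoubleComplex
helpers (make_cuDoubleComplex, cuCreal, cuCimag) from cuComplex.h.

    #include <stdio.h>
    #include "cublas.h"

    int main (void)
    {
        const int n = 4;
        cuDoubleComplex hx[4], hy[4], dot;
        cuDoubleComplex *dx = 0, *dy = 0;
        int i;

        for (i = 0; i < n; i++) {
            hx[i] = make_cuDoubleComplex ((double)(i + 1), 0.0); /* 1,2,3,4 */
            hy[i] = make_cuDoubleComplex (1.0, 1.0);             /* 1 + 1i  */
        }

        cublasInit ();
        cublasAlloc (n, sizeof(cuDoubleComplex), (void**)&dx);
        cublasAlloc (n, sizeof(cuDoubleComplex), (void**)&dy);
        cublasSetVector (n, sizeof(cuDoubleComplex), hx, 1, dx, 1);
        cublasSetVector (n, sizeof(cuDoubleComplex), hy, 1, dy, 1);

        dot = cublasZdotu (n, dx, 1, dy, 1);   /* unconjugated dot product */
        if (cublasGetError () != CUBLAS_STATUS_SUCCESS)
            fprintf (stderr, "cublasZdotu failed\n");
        else                                   /* (1+2+3+4)*(1+1i) = 10+10i */
            printf ("dot = %g + %gi\n", cuCreal (dot), cuCimag (dot));

        cublasFree (dx);
        cublasFree (dy);
        cublasShutdown ();
        return 0;
    }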
/*
 * void
 * cublasZscal (int n, cuDoubleComplex alpha, cuDoubleComplex *x, int incx)
 *
 * replaces double-complex vector x with double-complex alpha * x. For i
 * = 0 to n - 1, it replaces x[ix + i * incx] with alpha * x[ix + i * incx],
 * where ix = 1 if incx >= 0, else ix = 1 + (1 - n) * incx.
 *
 * Input
 * -----
 * n      number of elements in input vectors
 * alpha  double-complex scalar multiplier
 * x      double-complex vector with n elements
 * incx   storage spacing between elements of x
 *
 * Output
 * ------
 * x      double-complex result (unchanged if n <= 0 or incx <= 0)
 *
 * Reference: http://www.netlib.org/blas/zscal.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasZscal (int n, cuDoubleComplex alpha, cuDoubleComplex *x, int incx);
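Continuing the sketch above, scaling the device vector dx in place by a
double-complex scalar is a one-liner (again an illustration, not header
content):

    cuDoubleComplex alpha = make_cuDoubleComplex (2.0, -1.0);
    cublasZscal (n, alpha, dx, 1);         /* dx = (2 - 1i) * dx, in place */
    if (cublasGetError () != CUBLAS_STATUS_SUCCESS)
        fprintf (stderr, "cublasZscal failed\n");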
/* --------------- CUBLAS single precision BLAS2 functions ----------------- */

/*
 * void
 * cublasSgbmv (char trans, int m, int n, int kl, int ku, float alpha,
 *              const float *A, int lda, const float *x, int incx, float beta,
 *              float *y, int incy)
 *
 * performs one of the matrix-vector operations
 *

skipping to change at line 2114 (old) / 2180 (new)

 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if incx == 0 or if n < 0 or n > 4070
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasStrsv (char uplo, char trans, char diag, int n,
                            const float *A, int lda, float *x, int incx);
/* ----------------- CUBLAS double-complex BLAS2 functions ------------------ */

/*
 * cublasZgemv (char trans, int m, int n, cuDoubleComplex alpha, const cuDoubleComplex *A,
 *              int lda, const cuDoubleComplex *x, int incx, cuDoubleComplex beta,
 *              cuDoubleComplex *y, int incy)
 *
 * performs one of the matrix-vector operations
 *
 *    y = alpha * op(A) * x + beta * y,
 *
 * where op(A) is one of
 *
 *    op(A) = A   or   op(A) = transpose(A)
 *
 * where alpha and beta are double-complex scalars, x and y are double-complex
 * vectors, and A is an m x n matrix consisting of double-complex elements.
 * Matrix A is stored in column major format, and lda is the leading
 * dimension of the two-dimensional array in which A is stored.
 *
 * Input
 * -----
 * trans  specifies op(A). If trans = 'n' or 'N', op(A) = A. If trans =
 *        't', 'T', 'c', or 'C', op(A) = transpose(A)
 * m      specifies the number of rows of the matrix A. m must be at least
 *        zero.
 * n      specifies the number of columns of the matrix A. n must be at least
 *        zero.
 * alpha  double-complex scalar multiplier applied to op(A).
 * A      double-complex array of dimensions (lda, n) if trans = 'n' or
 *        'N', and of dimensions (lda, m) otherwise. lda must be at least
 *        max(1, m) in the first case and at least max(1, n) otherwise.
 * lda    leading dimension of two-dimensional array used to store matrix A
 * x      double-complex array of length at least (1 + (n - 1) * abs(incx))
 *        when trans = 'N' or 'n' and at least (1 + (m - 1) * abs(incx))
 *        otherwise.
 * incx   specifies the storage spacing between elements of x. incx must not
 *        be zero.
 * beta   double-complex scalar multiplier applied to vector y. If beta
 *        is zero, y is not read.
 * y      double-complex array of length at least (1 + (m - 1) * abs(incy))
 *        when trans = 'N' or 'n' and at least (1 + (n - 1) * abs(incy))
 *        otherwise.
 * incy   specifies the storage spacing between elements of y. incy must not
 *        be zero.
 *
 * Output
 * ------
 * y      updated according to alpha * op(A) * x + beta * y
 *
 * Reference: http://www.netlib.org/blas/zgemv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if m or n are < 0, or if incx or incy == 0
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasZgemv (char trans, int m, int n, cuDoubleComplex alpha,
                            const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx,
                            cuDoubleComplex beta, cuDoubleComplex *y, int incy);
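A hedged usage sketch for cublasZgemv (a standalone fragment, not header
content): hA and hx below are assumed host arrays already filled by the
caller, and cublasSetMatrix is the legacy CUBLAS helper for uploading
column-major matrices. With beta = 0, dY need not be initialized.

    /* y = A * x for an m x n double-complex matrix A (column major, lda = m) */
    int m = 3, n = 2;
    cuDoubleComplex *dA, *dX, *dY;
    cuDoubleComplex one  = make_cuDoubleComplex (1.0, 0.0);
    cuDoubleComplex zero = make_cuDoubleComplex (0.0, 0.0);

    cublasAlloc (m * n, sizeof(cuDoubleComplex), (void**)&dA);
    cublasAlloc (n,     sizeof(cuDoubleComplex), (void**)&dX);
    cublasAlloc (m,     sizeof(cuDoubleComplex), (void**)&dY);
    cublasSetMatrix (m, n, sizeof(cuDoubleComplex), hA, m, dA, m);
    cublasSetVector (n, sizeof(cuDoubleComplex), hx, 1, dX, 1);

    cublasZgemv ('n', m, n, one, dA, m, dX, 1, zero, dY, 1);  /* y = A * x */
    if (cublasGetError () != CUBLAS_STATUS_SUCCESS)
        fprintf (stderr, "cublasZgemv failed\n");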
/* ----------------- CUBLAS single complex BLAS2 functions ------------------ */
void CUBLASAPI cublasCgemv (char trans, int m, int n, cuComplex alpha,
                            const cuComplex *A, int lda, const cuComplex *x,
                            int incx, cuComplex beta, cuComplex *y, int incy);
void CUBLASAPI cublasCgbmv (char trans, int m, int n, int kl, int ku,
                            cuComplex alpha, const cuComplex *A, int lda,
                            const cuComplex *x, int incx, cuComplex beta,
                            cuComplex *y, int incy);
void CUBLASAPI cublasChemv (char uplo, int n, cuComplex alpha,
                            const cuComplex *A, int lda, const cuComplex *x,

skipping to change at line 3577 (old) / 3706 (new)

 * CUBLAS_STATUS_ARCH_MISMATCH    if invoked on device without DP support
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasDtrsm (char side, char uplo, char transa,
                            char diag, int m, int n, double alpha,
                            const double *A, int lda, double *B,
                            int ldb);
/*
 * void
 * cublasZtrsm (char side, char uplo, char transa, char diag, int m, int n,
 *              cuDoubleComplex alpha, const cuDoubleComplex *A, int lda,
 *              cuDoubleComplex *B, int ldb)
 *
 * solves one of the matrix equations
 *
 *    op(A) * X = alpha * B,   or   X * op(A) = alpha * B,
 *
 * where alpha is a double precision complex scalar, and X and B are m x n
 * matrices that are composed of double precision complex elements. A is a
 * unit or non-unit, upper or lower triangular matrix, and op(A) is one of
 *
 *    op(A) = A  or  op(A) = transpose(A)  or  op(A) = conj(transpose(A)).
 *
 * The result matrix X overwrites input matrix B; that is, on exit the result
 * is stored in B. Matrices A and B are stored in column major format, and
 * lda and ldb are the leading dimensions of the two-dimensional arrays that
 * contain A and B, respectively.
 *
 * Input
 * -----
 * side   specifies whether op(A) appears on the left or right of X as
 *        follows: side = 'L' or 'l' indicates solve op(A) * X = alpha * B.
 *        side = 'R' or 'r' indicates solve X * op(A) = alpha * B.
 * uplo   specifies whether the matrix A is an upper or lower triangular
 *        matrix as follows: uplo = 'U' or 'u' indicates A is an upper
 *        triangular matrix. uplo = 'L' or 'l' indicates A is a lower
 *        triangular matrix.
 * transa specifies the form of op(A) to be used in matrix multiplication
 *        as follows: If transa = 'N' or 'n', then op(A) = A. If transa =
 *        'T', 't', 'C', or 'c', then op(A) = transpose(A).
 * diag   specifies whether or not A is a unit triangular matrix like so:
 *        if diag = 'U' or 'u', A is assumed to be unit triangular. If
 *        diag = 'N' or 'n', then A is not assumed to be unit triangular.
 * m      specifies the number of rows of B. m must be at least zero.
 * n      specifies the number of columns of B. n must be at least zero.
 * alpha  is a double precision complex scalar to be multiplied with B. When
 *        alpha is zero, then A is not referenced and B need not be set
 *        before entry.
 * A      is a double precision complex array of dimensions (lda, k), where
 *        k is m when side = 'L' or 'l', and is n when side = 'R' or 'r'. If
 *        uplo = 'U' or 'u', the leading k x k upper triangular part of
 *        the array A must contain the upper triangular matrix and the
 *        strictly lower triangular part of A is not referenced. When
 *        uplo = 'L' or 'l', the leading k x k lower triangular part of
 *        the array A must contain the lower triangular matrix and the
 *        strictly upper triangular part of A is not referenced. Note that
 *        when diag = 'U' or 'u', the diagonal elements of A are not
 *        referenced, and are assumed to be unity.
 * lda    is the leading dimension of the two dimensional array containing A.
 *        When side = 'L' or 'l' then lda must be at least max(1, m), when
 *        side = 'R' or 'r' then lda must be at least max(1, n).
 * B      is a double precision complex array of dimensions (ldb, n). ldb must
 *        be at least max(1, m). The leading m x n part of the array B must
 *        contain the right-hand side matrix B. On exit B is overwritten
 *        by the solution matrix X.
 * ldb    is the leading dimension of the two dimensional array containing B.
 *        ldb must be at least max(1, m).
 *
 * Output
 * ------
 * B      contains the solution matrix X satisfying op(A) * X = alpha * B,
 *        or X * op(A) = alpha * B
 *
 * Reference: http://www.netlib.org/blas/ztrsm.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if m or n < 0
 * CUBLAS_STATUS_ARCH_MISMATCH    if invoked on device without DP support
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasZtrsm (char side, char uplo, char transa,
                            char diag, int m, int n, cuDoubleComplex alpha,
                            const cuDoubleComplex *A, int lda,
                            cuDoubleComplex *B, int ldb);
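For illustration (not header content): solving A * X = B for X with an m x m
upper-triangular, non-unit-diagonal A, where dA and dB are assumed device
arrays previously uploaded with cublasSetMatrix.

    cuDoubleComplex one = make_cuDoubleComplex (1.0, 0.0);
    cublasZtrsm ('L', 'U', 'N', 'N', m, n, one, dA, m, dB, m); /* B <- X */
    if (cublasGetError () != CUBLAS_STATUS_SUCCESS)
        fprintf (stderr, "cublasZtrsm failed\n");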
/*
 * void
 * cublasDtrmm (char side, char uplo, char transa, char diag, int m, int n,
 *              double alpha, const double *A, int lda, const double *B, int ldb)
 *
 * performs one of the matrix-matrix operations
 *
 *    B = alpha * op(A) * B,  or  B = alpha * B * op(A)
 *
 * where alpha is a double-precision scalar, B is an m x n matrix composed
 * of double precision elements, and A is a unit or non-unit, upper or lower,
 * triangular matrix composed of double precision elements. op(A) is one of

skipping to change at line 3813 (old) / 4023 (new)

 * CUBLAS_STATUS_INVALID_VALUE    if n < 0 or k < 0
 * CUBLAS_STATUS_ARCH_MISMATCH    if invoked on device without DP support
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasDsyrk (char uplo, char trans, int n, int k,
                            double alpha, const double *A, int lda,
                            double beta, double *C, int ldc);
/*
 * void
 * cublasZsyrk (char uplo, char trans, int n, int k, cuDoubleComplex alpha,
 *              const cuDoubleComplex *A, int lda, cuDoubleComplex beta,
 *              cuDoubleComplex *C, int ldc)
 *
 * performs one of the symmetric rank k operations
 *
 *    C = alpha * A * transpose(A) + beta * C, or
 *    C = alpha * transpose(A) * A + beta * C.
 *
 * Alpha and beta are double precision complex scalars. C is an n x n
 * symmetric matrix consisting of double precision complex elements and
 * stored in either lower or upper storage mode. A is a matrix consisting of
 * double precision complex elements with dimension of n x k in the first
 * case, and k x n in the second case.
 *
 * Input
 * -----
 * uplo   specifies whether the symmetric matrix C is stored in upper or lower
 *        storage mode as follows. If uplo == 'U' or 'u', only the upper
 *        triangular part of the symmetric matrix is to be referenced, and the
 *        elements of the strictly lower triangular part are to be inferred
 *        from those in the upper triangular part. If uplo == 'L' or 'l', only
 *        the lower triangular part of the symmetric matrix is to be
 *        referenced, and the elements of the strictly upper triangular part
 *        are to be inferred from those in the lower triangular part.
 * trans  specifies the operation to be performed. If trans == 'N' or 'n',
 *        C = alpha * A * transpose(A) + beta * C. If trans == 'T', 't', 'C',
 *        or 'c', C = alpha * transpose(A) * A + beta * C.
 * n      specifies the number of rows and the number of columns of matrix C.
 *        If trans == 'N' or 'n', n specifies the number of rows of matrix A.
 *        If trans == 'T', 't', 'C', or 'c', n specifies the number of columns
 *        of matrix A. n must be at least zero.
 * k      If trans == 'N' or 'n', k specifies the number of columns of matrix
 *        A. If trans == 'T', 't', 'C', or 'c', k specifies the number of rows
 *        of matrix A. k must be at least zero.
 * alpha  double precision complex scalar multiplier applied to
 *        A * transpose(A) or transpose(A) * A.
 * A      double precision complex array of dimensions (lda, ka), where ka is
 *        k when trans == 'N' or 'n', and is n otherwise. When trans == 'N'
 *        or 'n', the leading n x k part of array A must contain the matrix A,
 *        otherwise the leading k x n part of the array must contain the
 *        matrix A.
 * lda    leading dimension of A. When trans == 'N' or 'n' then lda must be at
 *        least max(1, n). Otherwise lda must be at least max(1, k).
 * beta   double precision complex scalar multiplier applied to C. If beta is
 *        zero, C does not have to be a valid input.
 * C      double precision complex array of dimensions (ldc, n). If uplo =
 *        'U' or 'u', the leading n x n triangular part of the array C must
 *        contain the upper triangular part of the symmetric matrix C and the
 *        strictly lower triangular part of C is not referenced. On exit, the
 *        upper triangular part of C is overwritten by the upper triangular
 *        part of the updated matrix. If uplo = 'L' or 'l', the leading n x n
 *        triangular part of the array C must contain the lower triangular
 *        part of the symmetric matrix C and the strictly upper triangular
 *        part of C is not referenced. On exit, the lower triangular part of C
 *        is overwritten by the lower triangular part of the updated matrix.
 * ldc    leading dimension of C. It must be at least max(1, n).
 *
 * Output
 * ------
 * C      updated according to C = alpha * A * transpose(A) + beta * C, or
 *        C = alpha * transpose(A) * A + beta * C
 *
 * Reference: http://www.netlib.org/blas/zsyrk.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if n < 0 or k < 0
 * CUBLAS_STATUS_ARCH_MISMATCH    if invoked on device without DP support
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasZsyrk (char uplo, char trans, int n, int k,
                            cuDoubleComplex alpha,
                            const cuDoubleComplex *A, int lda,
                            cuDoubleComplex beta,
                            cuDoubleComplex *C, int ldc);
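An illustrative call (again a fragment; dA is an assumed n x k device array
and dC an assumed n x n device array, both uploaded beforehand): a rank-k
update accumulating into the upper triangle of C.

    cuDoubleComplex one  = make_cuDoubleComplex (1.0, 0.0);
    cuDoubleComplex zero = make_cuDoubleComplex (0.0, 0.0);
    cublasZsyrk ('U', 'n', n, k, one, dA, n, zero, dC, n);
    if (cublasGetError () != CUBLAS_STATUS_SUCCESS)      /* C = A * A^T */
        fprintf (stderr, "cublasZsyrk failed\n");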
/*
 * void
 * cublasDsyr2k (char uplo, char trans, int n, int k, double alpha,
 *               const double *A, int lda, const double *B, int ldb,
 *               double beta, double *C, int ldc)
 *
 * performs one of the symmetric rank 2k operations
 *
 *    C = alpha * A * transpose(B) + alpha * B * transpose(A) + beta * C, or
 *    C = alpha * transpose(A) * B + alpha * transpose(B) * A + beta * C.
 *
 * Alpha and beta are double precision scalars. C is an n x n symmetric matrix

End of changes. 5 change blocks. 1 line changed or deleted, 354 lines changed or added.
cuda.h

/*
 * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws.  Users and possessors of this source code
 * are hereby granted a nonexclusive, royalty-free license to use this code
 * in individual and commercial software.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR

skipping to change at line 36

 * and is provided to the U.S. Government only as a commercial end item.
 * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
 * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
 * source code with only those rights set forth herein.
 *
 * Any use of this source code in individual and commercial software must
 * include, in the user documentation and internal comments to the code,
 * the above Disclaimer and U.S. Government End Users Notice.
 */
#ifndef __cuda_cuda_h__
#define __cuda_cuda_h__

#include <stdlib.h>

/**
 * \file
 * \name Data types used by CUDA driver
 * \author NVIDIA Corporation
 * \brief Data types used by CUDA driver
 */

/**
 * \defgroup CUDA_TYPES Data types used by CUDA driver
 * \ingroup CUDA_DRIVER
 * @{
 */

/**
 * CUDA API version number
 */
#define CUDA_VERSION 2020 /* 2.2 */

#ifdef __cplusplus
extern "C" {
#endif

typedef unsigned int CUdeviceptr;       ///< CUDA device pointer

typedef int CUdevice;                   ///< CUDA device
typedef struct CUctx_st *CUcontext;     ///< CUDA context
typedef struct CUmod_st *CUmodule;      ///< CUDA module
typedef struct CUfunc_st *CUfunction;   ///< CUDA function
typedef struct CUarray_st *CUarray;     ///< CUDA array
typedef struct CUtexref_st *CUtexref;   ///< CUDA texture reference
typedef struct CUevent_st *CUevent;     ///< CUDA event
typedef struct CUstream_st *CUstream;   ///< CUDA stream

/************************************
 **
 **    Enums
 **
 ***********************************/

/**
 * Context creation flags
 */
typedef enum CUctx_flags_enum {
    CU_CTX_SCHED_AUTO    = 0,   ///< Automatic scheduling
    CU_CTX_SCHED_SPIN    = 1,   ///< Set spin as default scheduling
    CU_CTX_SCHED_YIELD   = 2,   ///< Set yield as default scheduling
    CU_CTX_SCHED_MASK    = 0x3,
    CU_CTX_BLOCKING_SYNC = 4,   ///< Use blocking synchronization
    CU_CTX_MAP_HOST      = 8,   ///< Support mapped pinned allocations
    CU_CTX_FLAGS_MASK    = 0xf,
} CUctx_flags;
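As an illustration of these flags (a sketch only; cuCtxCreate, cuCtxDetach
and the device-management entry points are declared further down in this
header): creating a context that yields the CPU while waiting and supports
mapped pinned allocations.

    CUdevice  dev;
    CUcontext ctx;
    cuInit (0);                                   /* must be called first */
    cuDeviceGet (&dev, 0);                        /* first CUDA device    */
    cuCtxCreate (&ctx, CU_CTX_SCHED_YIELD | CU_CTX_MAP_HOST, dev);
    /* ... work ... */
    cuCtxDetach (ctx);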
/**
 * Event creation flags
 */
typedef enum CUevent_flags_enum {
    CU_EVENT_DEFAULT       = 0, ///< Default event flag
    CU_EVENT_BLOCKING_SYNC = 1, ///< Event uses blocking synchronization
} CUevent_flags;

/**
 * Array formats
 */
typedef enum CUarray_format_enum {
    CU_AD_FORMAT_UNSIGNED_INT8  = 0x01, ///< Unsigned 8-bit integers
    CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, ///< Unsigned 16-bit integers
    CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, ///< Unsigned 32-bit integers
    CU_AD_FORMAT_SIGNED_INT8    = 0x08, ///< Signed 8-bit integers
    CU_AD_FORMAT_SIGNED_INT16   = 0x09, ///< Signed 16-bit integers
    CU_AD_FORMAT_SIGNED_INT32   = 0x0a, ///< Signed 32-bit integers
    CU_AD_FORMAT_HALF           = 0x10, ///< 16-bit floating point
    CU_AD_FORMAT_FLOAT          = 0x20  ///< 32-bit floating point
} CUarray_format;

/**
 * Texture reference addressing modes
 */
typedef enum CUaddress_mode_enum {
    CU_TR_ADDRESS_MODE_WRAP   = 0,  ///< Wrapping address mode
    CU_TR_ADDRESS_MODE_CLAMP  = 1,  ///< Clamp to edge address mode
    CU_TR_ADDRESS_MODE_MIRROR = 2,  ///< Mirror address mode
} CUaddress_mode;

/**
 * Texture reference filtering modes
 */
typedef enum CUfilter_mode_enum {
    CU_TR_FILTER_MODE_POINT  = 0,   ///< Point filter mode
    CU_TR_FILTER_MODE_LINEAR = 1    ///< Linear filter mode
} CUfilter_mode;

/**
 * Device properties
 */
typedef enum CUdevice_attribute_enum {
    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1,        ///< Maximum number of threads per block
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,              ///< Maximum block dimension X
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3,              ///< Maximum block dimension Y
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4,              ///< Maximum block dimension Z
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5,               ///< Maximum grid dimension X
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6,               ///< Maximum grid dimension Y
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7,               ///< Maximum grid dimension Z
    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8,  ///< Maximum shared memory available per block in bytes
    CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8,      ///< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK
    CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9,        ///< Memory available on device for __constant__ variables in a CUDA C kernel in bytes
    CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,                   ///< Warp size in threads
    CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11,                   ///< Maximum pitch in bytes allowed by memory copies
    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12,     ///< Maximum number of 32-bit registers available per block
    CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12,         ///< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK
    CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,                  ///< Peak clock frequency in kilohertz
    CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14,           ///< Alignment requirement for textures

    CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,                 ///< Device can possibly copy memory and execute a kernel concurrently
    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16,        ///< Number of multiprocessors on device
    CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17,         ///< Specifies whether there is a run time limit on kernels
    CU_DEVICE_ATTRIBUTE_INTEGRATED = 18,                  ///< Device is integrated with host memory
    CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19,         ///< Device can map host memory into CUDA address space
    CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20                 ///< Compute mode (See ::CUcomputemode for details)
} CUdevice_attribute;
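For example (an illustrative fragment; cuDeviceGetAttribute is declared with
the other device-management functions further down in this header):

    int canMapHost = 0;
    CUdevice dev;
    cuDeviceGet (&dev, 0);                          /* first CUDA device */
    cuDeviceGetAttribute (&canMapHost,
                          CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
    if (canMapHost) {
        /* device supports CU_CTX_MAP_HOST / cuMemHostGetDevicePointer */
    }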
/**
 * Legacy device properties
 */
typedef struct CUdevprop_st {
    int maxThreadsPerBlock;     ///< Maximum number of threads per block
    int maxThreadsDim[3];       ///< Maximum size of each dimension of a block
    int maxGridSize[3];         ///< Maximum size of each dimension of a grid
    int sharedMemPerBlock;      ///< Shared memory available per block in bytes
    int totalConstantMemory;    ///< Constant memory available on device in bytes
    int SIMDWidth;              ///< Warp size in threads
    int memPitch;               ///< Maximum pitch in bytes allowed by memory copies
    int regsPerBlock;           ///< 32-bit registers available per block
    int clockRate;              ///< Clock frequency in kilohertz
    int textureAlign;           ///< Alignment requirement for textures
} CUdevprop;

/**
 * Function properties
 */
typedef enum CUfunction_attribute_enum {
    /**
     * The number of threads beyond which a launch of the function would fail.
     * This number depends on both the function and the device on which the
     * function is currently loaded.
     */
    CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,

    /**
     * The size in bytes of statically-allocated shared memory required by
     * this function. This does not include dynamically-allocated shared
     * memory requested by the user at runtime.
     */
    CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1,

    /**
     * The size in bytes of user-allocated constant memory required by this
     * function.
     */
    CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2,

    /**
     * The size in bytes of thread local memory used by this function.
     */
    CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3,

    /**
     * The number of registers used by each thread of this function.
     */
    CU_FUNC_ATTRIBUTE_NUM_REGS = 4,

    CU_FUNC_ATTRIBUTE_MAX
} CUfunction_attribute;
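An illustrative query (hfunc is an assumed CUfunction handle obtained via
cuModuleGetFunction, declared elsewhere in this header):

    int maxThreads = 0, numRegs = 0;
    cuFuncGetAttribute (&maxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, hfunc);
    cuFuncGetAttribute (&numRegs,    CU_FUNC_ATTRIBUTE_NUM_REGS,              hfunc);
    printf ("up to %d threads/block, %d regs/thread\n", maxThreads, numRegs);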
/**
 * Memory types
 */
typedef enum CUmemorytype_enum {
    CU_MEMORYTYPE_HOST   = 0x01,    ///< Host memory
    CU_MEMORYTYPE_DEVICE = 0x02,    ///< Device memory
    CU_MEMORYTYPE_ARRAY  = 0x03     ///< Array memory
} CUmemorytype;

/**
 * Compute Modes
 */
typedef enum CUcomputemode_enum {
    CU_COMPUTEMODE_DEFAULT    = 0,  ///< Default compute mode (Multiple contexts allowed per device)
    CU_COMPUTEMODE_EXCLUSIVE  = 1,  ///< Compute-exclusive mode (Only one context can be present on this device at a time)
    CU_COMPUTEMODE_PROHIBITED = 2   ///< Compute-prohibited mode (No contexts can be created on this device at this time)
} CUcomputemode;

/**
 * Online compiler options
 */
typedef enum CUjit_option_enum
{
    /**
     * Max number of registers that a thread may use.
     */
    CU_JIT_MAX_REGISTERS = 0,

    /**
     * IN: Specifies minimum number of threads per block to target compilation
     * for\n
     * OUT: Returns the number of threads the compiler actually targeted.
     * This restricts the resource utilization of the compiler (e.g. max
     * registers) such that a block with the given number of threads should be
     * able to launch based on register limitations. Note, this option does not
     * currently take into account any other resource limitations, such as
     * shared memory utilization.
     */
    CU_JIT_THREADS_PER_BLOCK,

    /**
     * Returns a float value in the option of the wall clock time, in
     * milliseconds, spent creating the cubin
     */
    CU_JIT_WALL_TIME,

    /**
     * Pointer to a buffer in which to print any log messages from PTXAS
     * that are informational in nature
     */
    CU_JIT_INFO_LOG_BUFFER,

    /**
     * IN: Log buffer size in bytes.  Log messages will be capped at this size
     * (including null terminator)\n
     * OUT: Amount of log buffer filled with messages
     */
    CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,

    /**
     * Pointer to a buffer in which to print any log messages from PTXAS that
     * reflect errors
     */
    CU_JIT_ERROR_LOG_BUFFER,

    /**
     * IN: Log buffer size in bytes.  Log messages will be capped at this size
     * (including null terminator)\n
     * OUT: Amount of log buffer filled with messages
     */
    CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,

    /**
     * Level of optimizations to apply to generated code (0 - 4), with 4
     * being the default and highest level of optimizations.
     */
    CU_JIT_OPTIMIZATION_LEVEL,

    /**
     * No option value required. Determines the target based on the current
     * attached context (default)
     */
    CU_JIT_TARGET_FROM_CUCONTEXT,

    /**
     * Target is chosen based on supplied CUjit_target_enum.
     */
    CU_JIT_TARGET,

    /**
     * Specifies choice of fallback strategy if matching cubin is not found.
     * Choice is based on supplied CUjit_fallback_enum.
     */
    CU_JIT_FALLBACK_STRATEGY

} CUjit_option;
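A sketch of passing JIT options when loading PTX. cuModuleLoadDataEx is
declared later in this header (not shown in this excerpt), and ptxImage is
an assumed null-terminated PTX string; numeric option values are passed by
value through the void* array.

    char         logBuf[4096];
    CUjit_option opts[] = { CU_JIT_ERROR_LOG_BUFFER,
                            CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES };
    void        *vals[] = { logBuf, (void*)(size_t)sizeof(logBuf) };
    CUmodule     mod;
    if (cuModuleLoadDataEx (&mod, ptxImage, 2, opts, vals) != CUDA_SUCCESS)
        fprintf (stderr, "JIT compilation failed:\n%s\n", logBuf);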
/**
 * Online compilation targets
 */
typedef enum CUjit_target_enum
{
    CU_TARGET_COMPUTE_10 = 0,   ///< Compute device class 1.0
    CU_TARGET_COMPUTE_11,       ///< Compute device class 1.1
    CU_TARGET_COMPUTE_12,       ///< Compute device class 1.2
    CU_TARGET_COMPUTE_13        ///< Compute device class 1.3
} CUjit_target;

/**
 * Cubin matching fallback strategies
 */
typedef enum CUjit_fallback_enum
{
    /** Prefer to compile ptx */
    CU_PREFER_PTX = 0,

    /** Prefer to fall back to compatible binary code */
    CU_PREFER_BINARY

} CUjit_fallback;

/************************************
 **
 **    Error codes
 **
 ***********************************/

/**
 * Error codes
 */
typedef enum cudaError_enum {

    CUDA_SUCCESS                    = 0,        ///< No errors
    CUDA_ERROR_INVALID_VALUE        = 1,        ///< Invalid value
    CUDA_ERROR_OUT_OF_MEMORY        = 2,        ///< Out of memory
    CUDA_ERROR_NOT_INITIALIZED      = 3,        ///< Driver not initialized
    CUDA_ERROR_DEINITIALIZED        = 4,        ///< Driver deinitialized

    CUDA_ERROR_NO_DEVICE            = 100,      ///< No CUDA-capable device available
    CUDA_ERROR_INVALID_DEVICE       = 101,      ///< Invalid device

    CUDA_ERROR_INVALID_IMAGE        = 200,      ///< Invalid kernel image
    CUDA_ERROR_INVALID_CONTEXT      = 201,      ///< Invalid context
    CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202,   ///< Context already current
    CUDA_ERROR_MAP_FAILED           = 205,      ///< Map failed
    CUDA_ERROR_UNMAP_FAILED         = 206,      ///< Unmap failed
    CUDA_ERROR_ARRAY_IS_MAPPED      = 207,      ///< Array is mapped
    CUDA_ERROR_ALREADY_MAPPED       = 208,      ///< Already mapped
    CUDA_ERROR_NO_BINARY_FOR_GPU    = 209,      ///< No binary for GPU
    CUDA_ERROR_ALREADY_ACQUIRED     = 210,      ///< Already acquired
    CUDA_ERROR_NOT_MAPPED           = 211,      ///< Not mapped

    CUDA_ERROR_INVALID_SOURCE       = 300,      ///< Invalid source
    CUDA_ERROR_FILE_NOT_FOUND       = 301,      ///< File not found

    CUDA_ERROR_INVALID_HANDLE       = 400,      ///< Invalid handle

    CUDA_ERROR_NOT_FOUND            = 500,      ///< Not found

    CUDA_ERROR_NOT_READY            = 600,      ///< CUDA not ready

    CUDA_ERROR_LAUNCH_FAILED        = 700,      ///< Launch failed
    CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701,   ///< Launch exceeded resources
    CUDA_ERROR_LAUNCH_TIMEOUT       = 702,      ///< Launch exceeded timeout
    CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703, ///< Launch with incompatible texturing

    CUDA_ERROR_UNKNOWN              = 999       ///< Unknown error
} CUresult;
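Every driver API entry point returns a CUresult, so host code typically wraps
calls in a checking helper. The macro below is an editorial sketch (not part
of the header) and is reused by the fragments that follow.

    #include <stdio.h>
    #include <stdlib.h>
    #define CHECK_CU(call)                                            \
        do {                                                          \
            CUresult err_ = (call);                                   \
            if (err_ != CUDA_SUCCESS) {                               \
                fprintf (stderr, "%s failed (CUresult %d)\n",         \
                         #call, (int)err_);                           \
                exit (EXIT_FAILURE);                                  \
            }                                                         \
        } while (0)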
/**
 * If set, host memory is portable between CUDA contexts.
 * Flag for ::cuMemHostAlloc()
 */
#define CU_MEMHOSTALLOC_PORTABLE        0x01

/**
 * If set, host memory is mapped into CUDA address space and
 * ::cuMemHostGetDevicePointer() may be called on the host pointer.
 * Flag for ::cuMemHostAlloc()
 */
#define CU_MEMHOSTALLOC_DEVICEMAP       0x02

/**
 * If set, host memory is allocated as write-combined - fast to write,
 * faster to DMA, slow to read except via SSE4 streaming load instruction
 * (MOVNTDQA).
 * Flag for ::cuMemHostAlloc()
 */
#define CU_MEMHOSTALLOC_WRITECOMBINED   0x04
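A sketch of the mapped-pinned-memory path: it requires a context created with
CU_CTX_MAP_HOST on a device reporting CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY.
cuMemHostAlloc, cuMemHostGetDevicePointer and cuMemFreeHost are declared later
in this header; CHECK_CU is the helper macro sketched above.

    float      *hbuf = 0;
    CUdeviceptr dbuf = 0;
    CHECK_CU (cuMemHostAlloc ((void**)&hbuf, 1024 * sizeof(float),
                              CU_MEMHOSTALLOC_DEVICEMAP));
    CHECK_CU (cuMemHostGetDevicePointer (&dbuf, hbuf, 0));
    /* Kernels can now access dbuf; the host sees the same bytes via hbuf. */
    CHECK_CU (cuMemFreeHost (hbuf));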
/**
 * 2D memory copy parameters
 */
typedef struct CUDA_MEMCPY2D_st {

    unsigned int srcXInBytes,   ///< Source X in bytes
                 srcY;          ///< Source Y
    CUmemorytype srcMemoryType; ///< Source memory type (host, device, array)
    const void *srcHost;        ///< Source host pointer
    CUdeviceptr srcDevice;      ///< Source device pointer
    CUarray srcArray;           ///< Source array reference
    unsigned int srcPitch;      ///< Source pitch (ignored when src is array)

    unsigned int dstXInBytes,   ///< Destination X in bytes
                 dstY;          ///< Destination Y
    CUmemorytype dstMemoryType; ///< Destination memory type (host, device, array)
    void *dstHost;              ///< Destination host pointer
    CUdeviceptr dstDevice;      ///< Destination device pointer
    CUarray dstArray;           ///< Destination array reference
    unsigned int dstPitch;      ///< Destination pitch (ignored when dst is array)

    unsigned int WidthInBytes;  ///< Width of 2D memory copy in bytes
    unsigned int Height;        ///< Height of 2D memory copy
} CUDA_MEMCPY2D;
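Filling out this struct for a host-to-device copy of a W x H float image into
a pitched allocation (an illustrative fragment: dptr and pitch are assumed to
come from cuMemAllocPitch, hostImage is an assumed tightly packed host buffer,
and cuMemcpy2D is declared later in this header; memset needs <string.h>).

    CUDA_MEMCPY2D cp;
    memset (&cp, 0, sizeof(cp));                /* zero all unused fields */
    cp.srcMemoryType = CU_MEMORYTYPE_HOST;
    cp.srcHost       = hostImage;
    cp.srcPitch      = W * sizeof(float);       /* tightly packed rows */
    cp.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    cp.dstDevice     = dptr;
    cp.dstPitch      = pitch;                   /* from cuMemAllocPitch */
    cp.WidthInBytes  = W * sizeof(float);
    cp.Height        = H;
    CHECK_CU (cuMemcpy2D (&cp));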
/**
 * 3D memory copy parameters
 */
typedef struct CUDA_MEMCPY3D_st {

    unsigned int srcXInBytes,   ///< Source X in bytes
                 srcY,          ///< Source Y
                 srcZ;          ///< Source Z
    unsigned int srcLOD;        ///< Source LOD
    CUmemorytype srcMemoryType; ///< Source memory type (host, device, array)
    const void *srcHost;        ///< Source host pointer
    CUdeviceptr srcDevice;      ///< Source device pointer
    CUarray srcArray;           ///< Source array reference
    void *reserved0;            ///< Must be NULL
    unsigned int srcPitch;      ///< Source pitch (ignored when src is array)
    unsigned int srcHeight;     ///< Source height (ignored when src is array; may be 0 if Depth==1)

    unsigned int dstXInBytes,   ///< Destination X in bytes
                 dstY,          ///< Destination Y
                 dstZ;          ///< Destination Z
    unsigned int dstLOD;        ///< Destination LOD
    CUmemorytype dstMemoryType; ///< Destination memory type (host, device, array)
    void *dstHost;              ///< Destination host pointer
    CUdeviceptr dstDevice;      ///< Destination device pointer
    CUarray dstArray;           ///< Destination array reference
    void *reserved1;            ///< Must be NULL
    unsigned int dstPitch;      ///< Destination pitch (ignored when dst is array)
    unsigned int dstHeight;     ///< Destination height (ignored when dst is array; may be 0 if Depth==1)

    unsigned int WidthInBytes;  ///< Width of 3D memory copy in bytes
    unsigned int Height;        ///< Height of 3D memory copy
    unsigned int Depth;         ///< Depth of 3D memory copy
} CUDA_MEMCPY3D;

/**
 * Array descriptor
 */
typedef struct
{
    unsigned int Width;         ///< Width of array
    unsigned int Height;        ///< Height of array

    CUarray_format Format;      ///< Array format

    unsigned int NumChannels;   ///< Channels per array element
} CUDA_ARRAY_DESCRIPTOR;

/**
 * 3D array descriptor
 */
typedef struct
{
    unsigned int Width;         ///< Width of 3D array
    unsigned int Height;        ///< Height of 3D array
    unsigned int Depth;         ///< Depth of 3D array

    CUarray_format Format;      ///< Array format

    unsigned int NumChannels;   ///< Channels per array element

    unsigned int Flags;         ///< Flags
} CUDA_ARRAY3D_DESCRIPTOR;

/**
 * Override the texref format with a format inferred from the array.
 * Flag for ::cuTexRefSetArray()
 */
#define CU_TRSA_OVERRIDE_FORMAT 0x01

/**
 * Read the texture as integers rather than promoting the values to floats
 * in the range [0,1].
 * Flag for ::cuTexRefSetFlags()
 */
#define CU_TRSF_READ_AS_INTEGER         0x01

/**
 * Use normalized texture coordinates in the range [0,1) instead of [0,dim).
 * Flag for ::cuTexRefSetFlags()
 */
#define CU_TRSF_NORMALIZED_COORDINATES  0x02

/**
 * For texture references loaded into the module, use default texunit from
 * texture reference.
 */
#define CU_PARAM_TR_DEFAULT -1

/** @} */
/** @} */ /* END CUDA_TYPES */

#ifdef _WIN32
#define CUDAAPI __stdcall
#else
#define CUDAAPI
#endif

/*********************************
 ** Initialization
 *********************************/
CUresult CUDAAPI cuInit(unsigned int Flags);
/*********************************
 ** Driver Version Query
 *********************************/
CUresult CUDAAPI cuDriverGetVersion(int *driverVersion);

/************************************
 **
 **    Device management
 **
 ***********************************/

CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal);
CUresult CUDAAPI cuDeviceGetCount(int *count);
CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev);
CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev);
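Putting the device-management entry points together (an illustrative fragment
using the CHECK_CU macro sketched earlier; printf needs <stdio.h>):

    int count = 0, i, major, minor;
    char name[256];
    CHECK_CU (cuInit (0));
    CHECK_CU (cuDeviceGetCount (&count));
    for (i = 0; i < count; i++) {
        CUdevice d;
        CHECK_CU (cuDeviceGet (&d, i));
        CHECK_CU (cuDeviceGetName (name, sizeof(name), d));
        CHECK_CU (cuDeviceComputeCapability (&major, &minor, d));
        printf ("device %d: %s (compute %d.%d)\n", i, name, major, minor);
    }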
| skipping to change at line 367 | | skipping to change at line 602 | |
| // size of biggest r/w to be performe
d by kernels on this memory | | // size of biggest r/w to be performe
d by kernels on this memory | |
| // 4, 8 or 16 bytes | | // 4, 8 or 16 bytes | |
| unsigned int ElementSizeBytes | | unsigned int ElementSizeBytes | |
| ); | | ); | |
| CUresult CUDAAPI cuMemFree(CUdeviceptr dptr); | | CUresult CUDAAPI cuMemFree(CUdeviceptr dptr); | |
| CUresult CUDAAPI cuMemGetAddressRange( CUdeviceptr *pbase, unsigned int
*psize, CUdeviceptr dptr ); | | CUresult CUDAAPI cuMemGetAddressRange( CUdeviceptr *pbase, unsigned int
*psize, CUdeviceptr dptr ); | |
| | | | |
| CUresult CUDAAPI cuMemAllocHost(void **pp, unsigned int bytesize); | | CUresult CUDAAPI cuMemAllocHost(void **pp, unsigned int bytesize); | |
| CUresult CUDAAPI cuMemFreeHost(void *p); | | CUresult CUDAAPI cuMemFreeHost(void *p); | |
| | | | |
|
| | | CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize, unsigned in | |
| | | t Flags ); | |
| | | | |
| | | CUresult CUDAAPI cuMemHostGetDevicePointer( CUdeviceptr *pdptr, void *p | |
| | | , unsigned int Flags ); | |
| | | | |
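cuMemHostAlloc and cuMemHostGetDevicePointer are the new zero-copy entry points: page-locked host memory can be aliased into the device address space. A sketch, assuming the CU_MEMHOSTALLOC_DEVICEMAP flag introduced alongside these calls and a context created with host mapping enabled:

    #include <cuda.h>

    /* Allocate n floats of mapped, page-locked host memory; also returns the
     * device-side alias in *d_buf. Free with cuMemFreeHost(). */
    static float *alloc_mapped(size_t n, CUdeviceptr *d_buf)
    {
        void *h_buf = NULL;
        if (cuMemHostAlloc(&h_buf, n * sizeof(float),
                           CU_MEMHOSTALLOC_DEVICEMAP) != CUDA_SUCCESS)
            return NULL;
        if (cuMemHostGetDevicePointer(d_buf, h_buf, 0) != CUDA_SUCCESS) {
            cuMemFreeHost(h_buf);               /* Flags above must be 0 */
            return NULL;
        }
        return (float *)h_buf;   /* kernel writes via *d_buf land here */
    }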
| /************************************ | | /************************************ | |
| ** | | ** | |
| ** Synchronous Memcpy | | ** Synchronous Memcpy | |
| ** | | ** | |
| ** Intra-device memcpy's done with these functions may execute in para
llel with the CPU, | | ** Intra-device memcpy's done with these functions may execute in para
llel with the CPU, | |
| ** but if host memory is involved, they wait until the copy is done be
fore returning. | | ** but if host memory is involved, they wait until the copy is done be
fore returning. | |
| ** | | ** | |
| ***********************************/ | | ***********************************/ | |
| | | | |
| // 1D functions | | // 1D functions | |
| | | | |
| skipping to change at line 397 | | skipping to change at line 636 | |
| | | | |
| // system <-> array memory | | // system <-> array memory | |
| CUresult CUDAAPI cuMemcpyHtoA( CUarray dstArray, unsigned int dstI
ndex, const void *pSrc, unsigned int ByteCount ); | | CUresult CUDAAPI cuMemcpyHtoA( CUarray dstArray, unsigned int dstI
ndex, const void *pSrc, unsigned int ByteCount ); | |
| CUresult CUDAAPI cuMemcpyAtoH( void *dstHost, CUarray srcArray, un
signed int srcIndex, unsigned int ByteCount ); | | CUresult CUDAAPI cuMemcpyAtoH( void *dstHost, CUarray srcArray, un
signed int srcIndex, unsigned int ByteCount ); | |
| | | | |
| // array <-> array memory | | // array <-> array memory | |
| CUresult CUDAAPI cuMemcpyAtoA( CUarray dstArray, unsigned int dstI
ndex, CUarray srcArray, unsigned int srcIndex, unsigned int ByteCount ); | | CUresult CUDAAPI cuMemcpyAtoA( CUarray dstArray, unsigned int dstI
ndex, CUarray srcArray, unsigned int srcIndex, unsigned int ByteCount ); | |
| | | | |
| // 2D memcpy | | // 2D memcpy | |
| | | | |
|
| typedef struct CUDA_MEMCPY2D_st { | | | |
| | | | |
| unsigned int srcXInBytes, srcY; | | | |
| CUmemorytype srcMemoryType; | | | |
| const void *srcHost; | | | |
| CUdeviceptr srcDevice; | | | |
| CUarray srcArray; | | | |
| unsigned int srcPitch; // ignored when src is array | | | |
| | | | |
| unsigned int dstXInBytes, dstY; | | | |
| CUmemorytype dstMemoryType; | | | |
| void *dstHost; | | | |
| CUdeviceptr dstDevice; | | | |
| CUarray dstArray; | | | |
| unsigned int dstPitch; // ignored when dst is array | | | |
| | | | |
| unsigned int WidthInBytes; | | | |
| unsigned int Height; | | | |
| } CUDA_MEMCPY2D; | | | |
| CUresult CUDAAPI cuMemcpy2D( const CUDA_MEMCPY2D *pCopy ); | | CUresult CUDAAPI cuMemcpy2D( const CUDA_MEMCPY2D *pCopy ); | |
| CUresult CUDAAPI cuMemcpy2DUnaligned( const CUDA_MEMCPY2D *pCopy )
; | | CUresult CUDAAPI cuMemcpy2DUnaligned( const CUDA_MEMCPY2D *pCopy )
; | |
| | | | |
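The descriptor struct moved out of this spot (see the removed block in the left column), but cuMemcpy2D is still driven entirely by it. A sketch of a host-to-array copy using the fields shown there:

    #include <string.h>
    #include <cuda.h>

    /* Copy a width x height float image from pitched host memory into a CUDA
     * array assumed to have a matching format. */
    static CUresult copy_host_to_array(CUarray hArray, const float *src,
                                       unsigned int width, unsigned int height,
                                       unsigned int srcPitchBytes)
    {
        CUDA_MEMCPY2D cp;
        memset(&cp, 0, sizeof(cp));         /* zeroes srcXInBytes, srcY, ... */

        cp.srcMemoryType = CU_MEMORYTYPE_HOST;
        cp.srcHost       = src;
        cp.srcPitch      = srcPitchBytes;

        cp.dstMemoryType = CU_MEMORYTYPE_ARRAY;
        cp.dstArray      = hArray;          /* dstPitch ignored for arrays */

        cp.WidthInBytes  = width * (unsigned int)sizeof(float);
        cp.Height        = height;
        return cuMemcpy2D(&cp);
    }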
| // 3D memcpy | | // 3D memcpy | |
| | | | |
|
| typedef struct CUDA_MEMCPY3D_st { | | | |
| | | | |
| unsigned int srcXInBytes, srcY, srcZ; | | | |
| unsigned int srcLOD; | | | |
| CUmemorytype srcMemoryType; | | | |
| const void *srcHost; | | | |
| CUdeviceptr srcDevice; | | | |
| CUarray srcArray; | | | |
| void *reserved0; // must be NULL | | | |
| unsigned int srcPitch; // ignored when src is array | | | |
| unsigned int srcHeight; // ignored when src is array; may b | | | |
| e 0 if Depth==1 | | | |
| | | | |
| unsigned int dstXInBytes, dstY, dstZ; | | | |
| unsigned int dstLOD; | | | |
| CUmemorytype dstMemoryType; | | | |
| void *dstHost; | | | |
| CUdeviceptr dstDevice; | | | |
| CUarray dstArray; | | | |
| void *reserved1; // must be NULL | | | |
| unsigned int dstPitch; // ignored when dst is array | | | |
| unsigned int dstHeight; // ignored when dst is array; may b | | | |
| e 0 if Depth==1 | | | |
| | | | |
| unsigned int WidthInBytes; | | | |
| unsigned int Height; | | | |
| unsigned int Depth; | | | |
| } CUDA_MEMCPY3D; | | | |
| CUresult CUDAAPI cuMemcpy3D( const CUDA_MEMCPY3D *pCopy ); | | CUresult CUDAAPI cuMemcpy3D( const CUDA_MEMCPY3D *pCopy ); | |
| | | | |
| /************************************ | | /************************************ | |
| ** | | ** | |
| ** Asynchronous Memcpy | | ** Asynchronous Memcpy | |
| ** | | ** | |
| ** Any host memory involved must be DMA'able (e.g., allocated with cuM
emAllocHost). | | ** Any host memory involved must be DMA'able (e.g., allocated with cuM
emAllocHost). | |
| ** memcpy's done with these functions execute in parallel with the CPU
and, if | | ** memcpy's done with these functions execute in parallel with the CPU
and, if | |
| ** the hardware is available, may execute in parallel with the GPU. | | ** the hardware is available, may execute in parallel with the GPU. | |
| ** Asynchronous memcpy must be accompanied by appropriate stream synch
ronization. | | ** Asynchronous memcpy must be accompanied by appropriate stream synch
ronization. | |
| | | | |
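A sketch of the discipline the comment above asks for: the host buffer comes from one of the page-locked allocators, and the stream is synchronized before the buffer is reused. cuMemcpyHtoDAsync and cuStreamSynchronize are assumed from the elided 1D and stream sections of this header:

    #include <cuda.h>

    /* Queue an async host-to-device copy, then wait on the stream. The host
     * pointer must come from cuMemAllocHost()/cuMemHostAlloc(). */
    static CUresult stage_async(CUdeviceptr dst, void *pinned,
                                unsigned int bytes, CUstream hStream)
    {
        CUresult r = cuMemcpyHtoDAsync(dst, pinned, bytes, hStream);
        if (r != CUDA_SUCCESS)
            return r;
        /* ... launch kernels on hStream that consume dst ... */
        return cuStreamSynchronize(hStream);
    }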
| skipping to change at line 500 | | skipping to change at line 694 | |
| CUresult CUDAAPI cuMemsetD2D32( CUdeviceptr dstDevice, unsigned in
t dstPitch, unsigned int ui, unsigned int Width, unsigned int Height ); | | CUresult CUDAAPI cuMemsetD2D32( CUdeviceptr dstDevice, unsigned in
t dstPitch, unsigned int ui, unsigned int Width, unsigned int Height ); | |
| | | | |
| /************************************ | | /************************************ | |
| ** | | ** | |
| ** Function management | | ** Function management | |
| ** | | ** | |
| ***********************************/ | | ***********************************/ | |
| | | | |
| CUresult CUDAAPI cuFuncSetBlockShape (CUfunction hfunc, int x, int y, i
nt z); | | CUresult CUDAAPI cuFuncSetBlockShape (CUfunction hfunc, int x, int y, i
nt z); | |
| CUresult CUDAAPI cuFuncSetSharedSize (CUfunction hfunc, unsigned int by
tes); | | CUresult CUDAAPI cuFuncSetSharedSize (CUfunction hfunc, unsigned int by
tes); | |
|
| | | CUresult CUDAAPI cuFuncGetAttribute (int *pi, CUfunction_attribute attr
ib, CUfunction hfunc); | |
| | | | |
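The new cuFuncGetAttribute lets a loader size its launches against a kernel's actual limits. A sketch, assuming the CU_FUNC_ATTRIBUTE_* enumerants that accompany this function in the type section:

    #include <cuda.h>

    /* Clamp the block width to what the kernel can actually run with. */
    static CUresult shape_for(CUfunction hfunc)
    {
        int maxThreads = 0;
        CUresult r = cuFuncGetAttribute(&maxThreads,
                                        CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                                        hfunc);
        if (r != CUDA_SUCCESS)
            return r;
        return cuFuncSetBlockShape(hfunc, maxThreads < 256 ? maxThreads : 256, 1, 1);
    }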
| /************************************ | | /************************************ | |
| ** | | ** | |
| ** Array management | | ** Array management | |
| ** | | ** | |
| ***********************************/ | | ***********************************/ | |
| | | | |
|
| typedef struct | | | |
| { | | | |
| // | | | |
| // dimensions | | | |
| // | | | |
| unsigned int Width; | | | |
| unsigned int Height; | | | |
| | | | |
| // | | | |
| // format | | | |
| // | | | |
| CUarray_format Format; | | | |
| | | | |
| // channels per array element | | | |
| unsigned int NumChannels; | | | |
| } CUDA_ARRAY_DESCRIPTOR; | | | |
| | | | |
| CUresult CUDAAPI cuArrayCreate( CUarray *pHandle, const CUDA_ARRAY_DES
CRIPTOR *pAllocateArray ); | | CUresult CUDAAPI cuArrayCreate( CUarray *pHandle, const CUDA_ARRAY_DES
CRIPTOR *pAllocateArray ); | |
| CUresult CUDAAPI cuArrayGetDescriptor( CUDA_ARRAY_DESCRIPTOR *pArrayDe
scriptor, CUarray hArray ); | | CUresult CUDAAPI cuArrayGetDescriptor( CUDA_ARRAY_DESCRIPTOR *pArrayDe
scriptor, CUarray hArray ); | |
| CUresult CUDAAPI cuArrayDestroy( CUarray hArray ); | | CUresult CUDAAPI cuArrayDestroy( CUarray hArray ); | |
| | | | |
|
| typedef struct | | | |
| { | | | |
| // | | | |
| // dimensions | | | |
| // | | | |
| unsigned int Width; | | | |
| unsigned int Height; | | | |
| unsigned int Depth; | | | |
| // | | | |
| // format | | | |
| // | | | |
| CUarray_format Format; | | | |
| | | | |
| // channels per array element | | | |
| unsigned int NumChannels; | | | |
| // | | | |
| // flags | | | |
| // | | | |
| unsigned int Flags; | | | |
| | | | |
| } CUDA_ARRAY3D_DESCRIPTOR; | | | |
| CUresult CUDAAPI cuArray3DCreate( CUarray *pHandle, const CUDA_ARRAY3D
_DESCRIPTOR *pAllocateArray ); | | CUresult CUDAAPI cuArray3DCreate( CUarray *pHandle, const CUDA_ARRAY3D
_DESCRIPTOR *pAllocateArray ); | |
| CUresult CUDAAPI cuArray3DGetDescriptor( CUDA_ARRAY3D_DESCRIPTOR *pArr
ayDescriptor, CUarray hArray ); | | CUresult CUDAAPI cuArray3DGetDescriptor( CUDA_ARRAY3D_DESCRIPTOR *pArr
ayDescriptor, CUarray hArray ); | |
| | | | |
| /************************************ | | /************************************ | |
| ** | | ** | |
| ** Texture reference management | | ** Texture reference management | |
| ** | | ** | |
| ***********************************/ | | ***********************************/ | |
| CUresult CUDAAPI cuTexRefCreate( CUtexref *pTexRef ); | | CUresult CUDAAPI cuTexRefCreate( CUtexref *pTexRef ); | |
| CUresult CUDAAPI cuTexRefDestroy( CUtexref hTexRef ); | | CUresult CUDAAPI cuTexRefDestroy( CUtexref hTexRef ); | |
| | | | |
| CUresult CUDAAPI cuTexRefSetArray( CUtexref hTexRef, CUarray hArray, u
nsigned int Flags ); | | CUresult CUDAAPI cuTexRefSetArray( CUtexref hTexRef, CUarray hArray, u
nsigned int Flags ); | |
|
| // override the texref format with a format inferred from the array | | | |
| #define CU_TRSA_OVERRIDE_FORMAT 0x01 | | | |
| CUresult CUDAAPI cuTexRefSetAddress( unsigned int *ByteOffset, CUtexre
f hTexRef, CUdeviceptr dptr, unsigned int bytes ); | | CUresult CUDAAPI cuTexRefSetAddress( unsigned int *ByteOffset, CUtexre
f hTexRef, CUdeviceptr dptr, unsigned int bytes ); | |
|
| | | CUresult CUDAAPI cuTexRefSetAddress2D( CUtexref hTexRef, const CUDA_AR
RAY_DESCRIPTOR *desc, CUdeviceptr dptr, unsigned int Pitch); | |
| CUresult CUDAAPI cuTexRefSetFormat( CUtexref hTexRef, CUarray_format f
mt, int NumPackedComponents ); | | CUresult CUDAAPI cuTexRefSetFormat( CUtexref hTexRef, CUarray_format f
mt, int NumPackedComponents ); | |
|
| | | | |
| CUresult CUDAAPI cuTexRefSetAddressMode( CUtexref hTexRef, int dim, CU
address_mode am ); | | CUresult CUDAAPI cuTexRefSetAddressMode( CUtexref hTexRef, int dim, CU
address_mode am ); | |
| CUresult CUDAAPI cuTexRefSetFilterMode( CUtexref hTexRef, CUfilter_mod
e fm ); | | CUresult CUDAAPI cuTexRefSetFilterMode( CUtexref hTexRef, CUfilter_mod
e fm ); | |
| CUresult CUDAAPI cuTexRefSetFlags( CUtexref hTexRef, unsigned int Flag
s ); | | CUresult CUDAAPI cuTexRefSetFlags( CUtexref hTexRef, unsigned int Flag
s ); | |
|
| // read the texture as integers rather than promoting the values | | | |
| // to floats in the range [0,1] | | | |
| #define CU_TRSF_READ_AS_INTEGER 0x01 | | | |
| | | | |
| // use normalized texture coordinates in the range [0,1) instead of | | | |
| [0,dim) | | | |
| #define CU_TRSF_NORMALIZED_COORDINATES 0x02 | | | |
| | | | |
| CUresult CUDAAPI cuTexRefGetAddress( CUdeviceptr *pdptr, CUtexref hTex
Ref ); | | CUresult CUDAAPI cuTexRefGetAddress( CUdeviceptr *pdptr, CUtexref hTex
Ref ); | |
| CUresult CUDAAPI cuTexRefGetArray( CUarray *phArray, CUtexref hTexRef
); | | CUresult CUDAAPI cuTexRefGetArray( CUarray *phArray, CUtexref hTexRef
); | |
| CUresult CUDAAPI cuTexRefGetAddressMode( CUaddress_mode *pam, CUtexref
hTexRef, int dim ); | | CUresult CUDAAPI cuTexRefGetAddressMode( CUaddress_mode *pam, CUtexref
hTexRef, int dim ); | |
| CUresult CUDAAPI cuTexRefGetFilterMode( CUfilter_mode *pfm, CUtexref h
TexRef ); | | CUresult CUDAAPI cuTexRefGetFilterMode( CUfilter_mode *pfm, CUtexref h
TexRef ); | |
| CUresult CUDAAPI cuTexRefGetFormat( CUarray_format *pFormat, int *pNum
Channels, CUtexref hTexRef ); | | CUresult CUDAAPI cuTexRefGetFormat( CUarray_format *pFormat, int *pNum
Channels, CUtexref hTexRef ); | |
| CUresult CUDAAPI cuTexRefGetFlags( unsigned int *pFlags, CUtexref hTex
Ref ); | | CUresult CUDAAPI cuTexRefGetFlags( unsigned int *pFlags, CUtexref hTex
Ref ); | |
| | | | |
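Taken together with the CU_TRSA_*/CU_TRSF_* flags that moved to the type section, a texture reference is configured in one short run of calls. A sketch, assuming cuModuleGetTexRef from the module-management section and a texref named "tex" (hypothetical) inside the loaded cubin:

    #include <cuda.h>

    static CUresult bind_array_texture(CUmodule hMod, CUarray hArray)
    {
        CUtexref hTexRef;
        CUresult r = cuModuleGetTexRef(&hTexRef, hMod, "tex");
        if (r != CUDA_SUCCESS)
            return r;
        /* Let the array dictate the texel format, then set sampling state. */
        cuTexRefSetArray(hTexRef, hArray, CU_TRSA_OVERRIDE_FORMAT);
        cuTexRefSetAddressMode(hTexRef, 0, CU_TR_ADDRESS_MODE_CLAMP);
        cuTexRefSetFilterMode(hTexRef, CU_TR_FILTER_MODE_LINEAR);
        return cuTexRefSetFlags(hTexRef, CU_TRSF_NORMALIZED_COORDINATES);
    }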
| /************************************ | | /************************************ | |
| ** | | ** | |
| ** Parameter management | | ** Parameter management | |
| ** | | ** | |
| ***********************************/ | | ***********************************/ | |
| | | | |
| CUresult CUDAAPI cuParamSetSize (CUfunction hfunc, unsigned int numbyt
es); | | CUresult CUDAAPI cuParamSetSize (CUfunction hfunc, unsigned int numbyt
es); | |
| CUresult CUDAAPI cuParamSeti (CUfunction hfunc, int offset, unsigne
d int value); | | CUresult CUDAAPI cuParamSeti (CUfunction hfunc, int offset, unsigne
d int value); | |
| CUresult CUDAAPI cuParamSetf (CUfunction hfunc, int offset, float v
alue); | | CUresult CUDAAPI cuParamSetf (CUfunction hfunc, int offset, float v
alue); | |
| CUresult CUDAAPI cuParamSetv (CUfunction hfunc, int offset, void *
ptr, unsigned int numbytes); | | CUresult CUDAAPI cuParamSetv (CUfunction hfunc, int offset, void *
ptr, unsigned int numbytes); | |
| CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtex
ref hTexRef); | | CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtex
ref hTexRef); | |
|
| // for texture references loaded into the module, | | | |
| // use default texunit from texture reference | | | |
| #define CU_PARAM_TR_DEFAULT -1 | | | |
| | | | |
| /************************************ | | /************************************ | |
| ** | | ** | |
| ** Launch functions | | ** Launch functions | |
| ** | | ** | |
| ***********************************/ | | ***********************************/ | |
| | | | |
| CUresult CUDAAPI cuLaunch ( CUfunction f ); | | CUresult CUDAAPI cuLaunch ( CUfunction f ); | |
| CUresult CUDAAPI cuLaunchGrid (CUfunction f, int grid_width, int grid_h
eight); | | CUresult CUDAAPI cuLaunchGrid (CUfunction f, int grid_width, int grid_h
eight); | |
| CUresult CUDAAPI cuLaunchGridAsync( CUfunction f, int grid_width, int g
rid_height, CUstream hStream ); | | CUresult CUDAAPI cuLaunchGridAsync( CUfunction f, int grid_width, int g
rid_height, CUstream hStream ); | |
| | | | |
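The parameter and launch calls compose as a manual calling convention: push each argument at an explicit offset, declare the total size, then launch. A sketch for a kernel taking (CUdeviceptr, int):

    #include <cuda.h>

    static CUresult launch_fill(CUfunction hfunc, CUdeviceptr dst, int n)
    {
        int offset = 0;
        /* Offsets are advanced by hand; keep each argument aligned. */
        cuParamSetv(hfunc, offset, &dst, sizeof(dst)); offset += sizeof(dst);
        cuParamSeti(hfunc, offset, n);                 offset += sizeof(int);
        cuParamSetSize(hfunc, offset);

        cuFuncSetBlockShape(hfunc, 256, 1, 1);
        return cuLaunchGrid(hfunc, (n + 255) / 256, 1);
    }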
End of changes. 61 change blocks. |
| 278 lines changed or deleted | | 457 lines changed or added | |
|
| cuda_runtime.h | | cuda_runtime.h | |
| /* | | /* | |
|
| * Copyright 1993-2008 NVIDIA Corporation. All rights reserved. | | * Copyright 1993-2009 NVIDIA Corporation. All rights reserved. | |
| * | | * | |
| * NOTICE TO USER: | | * NOTICE TO USER: | |
| * | | * | |
| * This source code is subject to NVIDIA ownership rights under U.S. and | | * This source code is subject to NVIDIA ownership rights under U.S. and | |
| * international Copyright laws. Users and possessors of this source code | | * international Copyright laws. Users and possessors of this source code | |
| * are hereby granted a nonexclusive, royalty-free license to use this code | | * are hereby granted a nonexclusive, royalty-free license to use this code | |
| * in individual and commercial software. | | * in individual and commercial software. | |
| * | | * | |
| * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | |
| * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | |
| | | | |
| skipping to change at line 77 | | skipping to change at line 77 | |
| #endif /* __CUDACC__ */ | | #endif /* __CUDACC__ */ | |
| | | | |
| #if defined(__cplusplus) | | #if defined(__cplusplus) | |
| | | | |
| /**************************************************************************
***** | | /**************************************************************************
***** | |
| *
* | | *
* | |
| *
* | | *
* | |
| *
* | | *
* | |
| ***************************************************************************
****/ | | ***************************************************************************
****/ | |
| | | | |
|
| | | /** | |
| | | * \ingroup CUDART_HIGHLEVEL | |
| | | * \brief \hl Push an argument onto the execution stack | |
| | | * | |
| | | * Pushes sizeof(T) bytes of the argument \p arg at \p offset | |
| | | * bytes from the start of the parameter passing area, which starts at | |
| | | * offset 0. The arguments are stored in the top of the execution stack. | |
| | | * \ref ::cudaSetupArgument(T,size_t) "cudaSetupArgument()" must be precede | |
| | | d | |
| | | * by a call to ::cudaConfigureCall(). | |
| | | * | |
| | | * \param arg - Argument to push for a kernel launch | |
| | | * \param offset - Offset in argument stack to push new arg | |
| | | * | |
| | | * \return | |
| | | * ::cudaSuccess | |
| | | * \notefnerr | |
| | | * | |
| | | * \sa \ref ::cudaLaunch(T*) "cudaLaunch (C++ API)", | |
| | | * \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument | |
| | | (C API)" | |
| | | * ::cudaConfigureCall | |
| | | */ | |
| template<class T> | | template<class T> | |
| __inline__ __host__ cudaError_t cudaSetupArgument( | | __inline__ __host__ cudaError_t cudaSetupArgument( | |
| T arg, | | T arg, | |
| size_t offset | | size_t offset | |
| ) | | ) | |
| { | | { | |
| return cudaSetupArgument((const void*)&arg, sizeof(T), offset); | | return cudaSetupArgument((const void*)&arg, sizeof(T), offset); | |
| } | | } | |
| | | | |
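The template above is one third of the hand-rolled form of a <<<...>>> launch. An illustrative sketch (the kernel name and signature are hypothetical):

    __global__ void kernel(float *p, int n);

    cudaError_t launch_by_hand(float *devPtr, int n, dim3 grid, dim3 block)
    {
        cudaError_t err = cudaConfigureCall(grid, block);
        if (err != cudaSuccess) return err;
        cudaSetupArgument(devPtr, 0);           // pointer at offset 0
        cudaSetupArgument(n, sizeof(devPtr));   // int packed after it
        return cudaLaunch(kernel);              // C++ overload, see below
    }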
| #if defined(__CUDACC__) | | #if defined(__CUDACC__) | |
| | | | |
| /**************************************************************************
***** | | /**************************************************************************
***** | |
| *
* | | *
* | |
| *
* | | *
* | |
| *
* | | *
* | |
| ***************************************************************************
****/ | | ***************************************************************************
****/ | |
| | | | |
|
| | | /** | |
| | | * \addtogroup CUDART_HIGHLEVEL | |
| | | * @{ | |
| | | */ | |
| | | | |
| static __inline__ __host__ cudaError_t cudaMemcpyToSymbol( | | static __inline__ __host__ cudaError_t cudaMemcpyToSymbol( | |
| char *symbol, | | char *symbol, | |
| const void *src, | | const void *src, | |
| size_t count, | | size_t count, | |
| size_t offset = 0, | | size_t offset = 0, | |
| enum cudaMemcpyKind kind = cudaMemcpyHostToDevice | | enum cudaMemcpyKind kind = cudaMemcpyHostToDevice | |
| ) | | ) | |
| { | | { | |
| return cudaMemcpyToSymbol((const char*)symbol, src, count, offset, kind); | | return cudaMemcpyToSymbol((const char*)symbol, src, count, offset, kind); | |
| } | | } | |
| | | | |
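A sketch of the overload in use: the address of a __device__ variable is passed through as the symbol, with offset and kind left at their defaults (the variable name is hypothetical):

    __device__ float scale;

    cudaError_t set_scale(float s)
    {
        // Defaults: offset = 0, kind = cudaMemcpyHostToDevice.
        return cudaMemcpyToSymbol((char *)&scale, &s, sizeof(s));
    }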
| skipping to change at line 204 | | skipping to change at line 230 | |
| } | | } | |
| | | | |
| static __inline__ __host__ cudaError_t cudaGetSymbolAddress( | | static __inline__ __host__ cudaError_t cudaGetSymbolAddress( | |
| void **devPtr, | | void **devPtr, | |
| char *symbol | | char *symbol | |
| ) | | ) | |
| { | | { | |
| return cudaGetSymbolAddress(devPtr, (const char*)symbol); | | return cudaGetSymbolAddress(devPtr, (const char*)symbol); | |
| } | | } | |
| | | | |
|
| | | /** | |
| | | * \brief \hl Finds the address associated with a CUDA symbol | |
| | | * | |
| | | * Returns in \p *devPtr the address of symbol \p symbol on the device. | |
| | | * \p symbol can either be a variable that resides in global memory space, | |
| | | or | |
| | | * it can be a character string, naming a variable that resides in global | |
| | | * memory space. If \p symbol cannot be found, or if \p symbol is not decla | |
| | | red | |
| | | * in the global memory space, \p *devPtr is unchanged and the error | |
| | | * ::cudaErrorInvalidSymbol is returned. | |
| | | * | |
| | | * \param devPtr - Return device pointer associated with symbol | |
| | | * \param symbol - Global variable or string symbol to search for | |
| | | * | |
| | | * \return | |
| | | * ::cudaSuccess, | |
| | | * ::cudaErrorInvalidSymbol, | |
| | | * ::cudaErrorAddressOfConstant | |
| | | * \notefnerr | |
| | | * | |
| | | * \sa \ref ::cudaGetSymbolAddress(void**, const char*) "cudaGetSymbolAddre | |
| | | ss (C API)" | |
| | | * \ref ::cudaGetSymbolSize(size_t*, const T&) "cudaGetSymbolSize (C++ API) | |
| | | " | |
| | | */ | |
| template<class T> | | template<class T> | |
| __inline__ __host__ cudaError_t cudaGetSymbolAddress( | | __inline__ __host__ cudaError_t cudaGetSymbolAddress( | |
| void **devPtr, | | void **devPtr, | |
| const T &symbol | | const T &symbol | |
| ) | | ) | |
| { | | { | |
| return cudaGetSymbolAddress(devPtr, (const char*)&symbol); | | return cudaGetSymbolAddress(devPtr, (const char*)&symbol); | |
| } | | } | |
| | | | |
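A sketch of the templated lookup: once the device address is out, the symbol behaves like any other device pointer (variable name hypothetical; cudaMemset assumed from the memory section):

    __device__ float table[256];

    cudaError_t zero_table(void)
    {
        void *d_table = 0;
        cudaError_t err = cudaGetSymbolAddress(&d_table, table);
        if (err != cudaSuccess) return err;
        return cudaMemset(d_table, 0, sizeof(table));   // all 256 floats
    }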
| /**************************************************************************
***** | | /**************************************************************************
***** | |
| | | | |
| skipping to change at line 227 | | skipping to change at line 275 | |
| ***************************************************************************
****/ | | ***************************************************************************
****/ | |
| | | | |
| static __inline__ __host__ cudaError_t cudaGetSymbolSize( | | static __inline__ __host__ cudaError_t cudaGetSymbolSize( | |
| size_t *size, | | size_t *size, | |
| char *symbol | | char *symbol | |
| ) | | ) | |
| { | | { | |
| return cudaGetSymbolSize(size, (const char*)symbol); | | return cudaGetSymbolSize(size, (const char*)symbol); | |
| } | | } | |
| | | | |
|
| | | /** | |
| | | * \brief \hl Finds the size of the object associated with a CUDA symbol | |
| | | * | |
| | | * Returns in \p *size the size of symbol \p symbol. \p symbol can either b | |
| | | e a | |
| | | * variable that resides in global or constant memory space, or it can be a | |
| | | * character string, naming a variable that resides in global or constant | |
| | | * memory space. If \p symbol cannot be found, or if \p symbol is not decla | |
| | | red | |
| | | * in global or constant memory space, \p *size is unchanged and the error | |
| | | * ::cudaErrorInvalidSymbol is returned. | |
| | | * | |
| | | * \param size - Size of object associated with symbol | |
| | | * \param symbol - Global variable or string symbol to find size of | |
| | | * | |
| | | * \return | |
| | | * ::cudaSuccess, | |
| | | * ::cudaErrorInvalidSymbol | |
| | | * \notefnerr | |
| | | * | |
| | | * \sa \ref ::cudaGetSymbolAddress(void**, const T&) "cudaGetSymbolAddress | |
| | | (C++ API)" | |
| | | * \ref ::cudaGetSymbolSize(size_t*, const char*) "cudaGetSymbolSize (C API | |
| | | )" | |
| | | */ | |
| template<class T> | | template<class T> | |
| __inline__ __host__ cudaError_t cudaGetSymbolSize( | | __inline__ __host__ cudaError_t cudaGetSymbolSize( | |
| size_t *size, | | size_t *size, | |
| const T &symbol | | const T &symbol | |
| ) | | ) | |
| { | | { | |
| return cudaGetSymbolSize(size, (const char*)&symbol); | | return cudaGetSymbolSize(size, (const char*)&symbol); | |
| } | | } | |
| | | | |
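And the matching size query, which pairs with the address lookup so host code can treat a symbol as an opaque (pointer, size) buffer (variable name hypothetical):

    __constant__ float coeffs[32];

    size_t coeffs_bytes(void)
    {
        size_t n = 0;
        return cudaGetSymbolSize(&n, coeffs) == cudaSuccess ? n : 0;
    }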
|
| | | /** @} */ /* END CUDART_HIGHLEVEL */ | |
| | | | |
| /**************************************************************************
***** | | /**************************************************************************
***** | |
| *
* | | *
* | |
| *
* | | *
* | |
| *
* | | *
* | |
| ***************************************************************************
****/ | | ***************************************************************************
****/ | |
| | | | |
|
| | | /** | |
| | | * \addtogroup CUDART_HIGHLEVEL | |
| | | * | |
| | | * @{ | |
| | | */ | |
| | | | |
| | | /** | |
| | | * \brief \hl Binds a memory area to a texture | |
| | | * | |
| | | * Binds \p size bytes of the memory area pointed to by \p devPtr to textur | |
| | | e | |
| | | * reference \p tex. \p desc describes how the memory is interpreted when | |
| | | * fetching values from the texture. The \p offset parameter is an optional | |
| | | * byte offset as with the low-level | |
| | | * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const vo | |
| | | id*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture()" | |
| | | * function. Any memory previously bound to \p tex is unbound. | |
| | | * | |
| | | * \param offset - Offset in bytes | |
| | | * \param tex - Texture to bind | |
| | | * \param devPtr - Memory area on device | |
| | | * \param desc - Channel format | |
| | | * \param size - Size of the memory area pointed to by devPtr | |
| | | * | |
| | | * \return | |
| | | * ::cudaSuccess, | |
| | | * ::cudaErrorInvalidValue, | |
| | | * ::cudaErrorInvalidDevicePointer, | |
| | | * ::cudaErrorInvalidTexture | |
| | | * \notefnerr | |
| | | * | |
| | | * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)" | |
| | | , | |
| | | * ::cudaGetChannelDesc, ::cudaGetTextureReference, | |
| | | * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const vo | |
| | | id*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)" | |
| | | , | |
| | | * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>& | |
| | | , const void*, size_t) "cudaBindTexture (C++ API, inherited channel descrip | |
| | | tor)", | |
| | | * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode | |
| | | >&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_ | |
| | | t) "cudaBindTexture2D (C++ API)", | |
| | | * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, | |
| | | const struct cudaArray*, const struct cudaChannelFormatDesc&) "cudaBindText | |
| | | ureToArray (C++ API)", | |
| | | * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, | |
| | | const struct cudaArray*) "cudaBindTextureToArray (C++ API, inherited channe | |
| | | l descriptor)", | |
| | | * \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cuda | |
| | | UnbindTexture (C++ API)", | |
| | | * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, d | |
| | | im, readMode >&) "cudaGetTextureAlignmentOffset (C++ API)" | |
| | | */ | |
| template<class T, int dim, enum cudaTextureReadMode readMode> | | template<class T, int dim, enum cudaTextureReadMode readMode> | |
| __inline__ __host__ cudaError_t cudaBindTexture( | | __inline__ __host__ cudaError_t cudaBindTexture( | |
| size_t *offset, | | size_t *offset, | |
| const struct texture<T, dim, readMode> &tex, | | const struct texture<T, dim, readMode> &tex, | |
| const void *devPtr, | | const void *devPtr, | |
| const struct cudaChannelFormatDesc &desc, | | const struct cudaChannelFormatDesc &desc, | |
| size_t size = UINT_MAX | | size_t size = UINT_MAX | |
| ) | | ) | |
| { | | { | |
| return cudaBindTexture(offset, &tex, devPtr, &desc, size); | | return cudaBindTexture(offset, &tex, devPtr, &desc, size); | |
| } | | } | |
| | | | |
|
| | | /** | |
| | | * \brief \hl Binds a memory area to a texture | |
| | | * | |
| | | * Binds \p size bytes of the memory area pointed to by \p devPtr to textur | |
| | | e | |
| | | * reference \p tex. The channel descriptor is inherited from the texture | |
| | | * reference type. The \p offset parameter is an optional byte offset as wi | |
| | | th | |
| | | * the low-level | |
| | | * ::cudaBindTexture(size_t*, const struct textureReference*, const void*, | |
| | | const struct cudaChannelFormatDesc*, size_t) | |
| | | * function. Any memory previously bound to \p tex is unbound. | |
| | | * | |
| | | * \param offset - Offset in bytes | |
| | | * \param tex - Texture to bind | |
| | | * \param devPtr - Memory area on device | |
| | | * \param size - Size of the memory area pointed to by devPtr | |
| | | * | |
| | | * \return | |
| | | * ::cudaSuccess, | |
| | | * ::cudaErrorInvalidValue, | |
| | | * ::cudaErrorInvalidDevicePointer, | |
| | | * ::cudaErrorInvalidTexture | |
| | | * \notefnerr | |
| | | * | |
| | | * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API), | |
| | | * ::cudaGetChannelDesc, ::cudaGetTextureReference, | |
| | | * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const vo | |
| | | id*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)" | |
| | | , | |
| | | * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>& | |
| | | , const void*, size_t) "cudaBindTexture (C++ API, inherited channel descrip | |
| | | tor)", | |
| | | * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode | |
| | | >&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_ | |
| | | t) "cudaBindTexture2D (C++ API)", | |
| | | * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, | |
| | | const struct cudaArray*, const struct cudaChannelFormatDesc&) "cudaBindText | |
| | | ureToArray (C++ API)", | |
| | | * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, | |
| | | const struct cudaArray*) "cudaBindTextureToArray (C++ API, inherited channe | |
| | | l descriptor)", | |
| | | * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaU | |
| | | nbindTexture (C++ API)", | |
| | | * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, d | |
| | | im, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)" | |
| | | */ | |
| template<class T, int dim, enum cudaTextureReadMode readMode> | | template<class T, int dim, enum cudaTextureReadMode readMode> | |
| __inline__ __host__ cudaError_t cudaBindTexture( | | __inline__ __host__ cudaError_t cudaBindTexture( | |
| size_t *offset, | | size_t *offset, | |
| const struct texture<T, dim, readMode> &tex, | | const struct texture<T, dim, readMode> &tex, | |
| const void *devPtr, | | const void *devPtr, | |
| size_t size = UINT_MAX | | size_t size = UINT_MAX | |
| ) | | ) | |
| { | | { | |
| return cudaBindTexture(offset, tex, devPtr, tex.channelDesc, size); | | return cudaBindTexture(offset, tex, devPtr, tex.channelDesc, size); | |
| } | | } | |
| | | | |
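A sketch of the inherited-descriptor overload just above, binding a cudaMalloc'd buffer to a file-scope texture (names hypothetical; per the C API doc, the returned offset is 0 for cudaMalloc pointers):

    texture<float, 1, cudaReadModeElementType> texRef;

    cudaError_t bind_linear(const float *devPtr, size_t n)
    {
        size_t offset = 0;
        return cudaBindTexture(&offset, texRef, devPtr, n * sizeof(float));
    }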
|
| | | /** | |
| | | * \brief \hl Binds a 2D memory area to a texture | |
| | | * | |
| | | * Binds the 2D memory area pointed to by \p devPtr to the | |
| | | * texture reference \p tex. The size of the area is constrained by | |
| | | * \p width in texel units, \p height in texel units, and \p pitch in byte | |
| | | * units. \p desc describes how the memory is interpreted when fetching val | |
| | | ues | |
| | | * from the texture. Any memory previously bound to \p tex is unbound. | |
| | | * | |
| | | * Since the hardware enforces an alignment requirement on texture base | |
| | | * addresses, | |
| | | * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode | |
| | | >&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_ | |
| | | t) "cudaBindTexture2D()" | |
| | | * returns in \p *offset a byte offset that | |
| | | * must be applied to texture fetches in order to read from the desired mem | |
| | | ory. | |
| | | * This offset must be divided by the texel size and passed to kernels that | |
| | | * read from the texture so it can be applied to the ::tex2D() function. | |
| | | * If the device memory pointer was returned from ::cudaMalloc(), the offse | |
| | | t is | |
| | | * guaranteed to be 0 and NULL may be passed as the \p offset parameter. | |
| | | * | |
| | | * \param offset - Offset in bytes | |
| | | * \param tex - Texture reference to bind | |
| | | * \param devPtr - 2D memory area on device | |
| | | * \param desc - Channel format | |
| | | * \param width - Width in texel units | |
| | | * \param height - Height in texel units | |
| | | * \param pitch - Pitch in bytes | |
| | | * | |
| | | * \return | |
| | | * ::cudaSuccess, | |
| | | * ::cudaErrorInvalidValue, | |
| | | * ::cudaErrorInvalidDevicePointer, | |
| | | * ::cudaErrorInvalidTexture | |
| | | * \notefnerr | |
| | | * | |
| | | * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API), | |
| | | * ::cudaGetChannelDesc, ::cudaGetTextureReference, | |
| | | * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>& | |
| | | , const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTextur | |
| | | e (C++ API)", | |
| | | * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>& | |
| | | , const void*, size_t) "cudaBindTexture (C++ API, inherited channel descrip | |
| | | tor)", | |
| | | * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const | |
| | | void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBi | |
| | | ndTexture2D (C API)", | |
| | | * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, | |
| | | const struct cudaArray*, const struct cudaChannelFormatDesc&) "cudaBindText | |
| | | ureToArray (C++ API)", | |
| | | * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, | |
| | | const struct cudaArray*) "cudaBindTextureToArray (C++ API, inherited channe | |
| | | l descriptor)", | |
| | | * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaU | |
| | | nbindTexture (C++ API)", | |
| | | * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, d | |
| | | im, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)" | |
| | | */ | |
| | | template<class T, int dim, enum cudaTextureReadMode readMode> | |
| | | __inline__ __host__ cudaError_t cudaBindTexture2D( | |
| | | size_t *offset, | |
| | | const struct texture<T, dim, readMode> &tex, | |
| | | const void *devPtr, | |
| | | const struct cudaChannelFormatDesc &desc, | |
| | | size_t width, | |
| | | size_t height, | |
| | | size_t pitch | |
| | | ) | |
| | | { | |
| | | return cudaBindTexture2D( offset, &tex, devPtr, &desc, width, height, pit | |
| | | ch); | |
| | | } | |
| | | | |
| | | /** | |
| | | * \brief \hl Binds an array to a texture | |
| | | * | |
| | | * Binds the CUDA array \p array to the texture reference \p tex. | |
| | | * \p desc describes how the memory is interpreted when fetching values fro | |
| | | m | |
| | | * the texture. Any CUDA array previously bound to \p tex is unbound. | |
| | | * | |
| | | * \param tex - Texture to bind | |
| | | * \param array - Memory array on device | |
| | | * \param desc - Channel format | |
| | | * | |
| | | * \return | |
| | | * ::cudaSuccess, | |
| | | * ::cudaErrorInvalidValue, | |
| | | * ::cudaErrorInvalidDevicePointer, | |
| | | * ::cudaErrorInvalidTexture | |
| | | * \notefnerr | |
| | | * | |
| | | * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)" | |
| | | , | |
| | | * ::cudaGetChannelDesc, ::cudaGetTextureReference, | |
| | | * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>& | |
| | | , const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTextur | |
| | | e (C++ API)", | |
| | | * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>& | |
| | | , const void*, size_t) "cudaBindTexture (C++ API, inherited channel descrip | |
| | | tor)", | |
| | | * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode | |
| | | >&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_ | |
| | | t) "cudaBindTexture2D (C++ API)", | |
| | | * \ref ::cudaBindTextureToArray(const struct textureReference*, const stru | |
| | | ct cudaArray*, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray | |
| | | (C API)", | |
| | | * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, | |
| | | const struct cudaArray*) "cudaBindTextureToArray (C++ API, inherited channe | |
| | | l descriptor)", | |
| | | * \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cuda | |
| | | UnbindTexture (C++ API)", | |
| | | * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, d | |
| | | im, readMode >&) "cudaGetTextureAlignmentOffset (C++ API)" | |
| | | */ | |
| template<class T, int dim, enum cudaTextureReadMode readMode> | | template<class T, int dim, enum cudaTextureReadMode readMode> | |
| __inline__ __host__ cudaError_t cudaBindTextureToArray( | | __inline__ __host__ cudaError_t cudaBindTextureToArray( | |
| const struct texture<T, dim, readMode> &tex, | | const struct texture<T, dim, readMode> &tex, | |
| const struct cudaArray *array, | | const struct cudaArray *array, | |
| const struct cudaChannelFormatDesc &desc | | const struct cudaChannelFormatDesc &desc | |
| ) | | ) | |
| { | | { | |
| return cudaBindTextureToArray(&tex, array, &desc); | | return cudaBindTextureToArray(&tex, array, &desc); | |
| } | | } | |
| | | | |
|
| | | /** | |
| | | * \brief \hl Binds an array to a texture | |
| | | * | |
| | | * Binds the CUDA array \p array to the texture reference \p tex. | |
| | | * The channel descriptor is inherited from the CUDA array. Any CUDA array | |
| | | * previously bound to \p tex is unbound. | |
| | | * | |
| | | * \param tex - Texture to bind | |
| | | * \param array - Memory array on device | |
| | | * | |
| | | * \return | |
| | | * ::cudaSuccess, | |
| | | * ::cudaErrorInvalidValue, | |
| | | * ::cudaErrorInvalidDevicePointer, | |
| | | * ::cudaErrorInvalidTexture | |
| | | * \notefnerr | |
| | | * | |
| | | * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)" | |
| | | , | |
| | | * ::cudaGetChannelDesc, ::cudaGetTextureReference, | |
| | | * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>& | |
| | | , const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTextur | |
| | | e (C++ API)", | |
| | | * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>& | |
| | | , const void*, size_t) "cudaBindTexture (C++ API, inherited channel descrip | |
| | | tor)", | |
| | | * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode | |
| | | >&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_ | |
| | | t) "cudaBindTexture2D (C++ API)", | |
| | | * \ref ::cudaBindTextureToArray(const struct textureReference*, const stru | |
| | | ct cudaArray*, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray | |
| | | (C API)", | |
| | | * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, | |
| | | const struct cudaArray*, const struct cudaChannelFormatDesc&) "cudaBindText | |
| | | ureToArray (C++ API)", | |
| | | * \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cuda | |
| | | UnbindTexture (C++ API)", | |
| | | * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, d | |
| | | im, readMode >&) "cudaGetTextureAlignmentOffset (C++ API)" | |
| | | */ | |
| template<class T, int dim, enum cudaTextureReadMode readMode> | | template<class T, int dim, enum cudaTextureReadMode readMode> | |
| __inline__ __host__ cudaError_t cudaBindTextureToArray( | | __inline__ __host__ cudaError_t cudaBindTextureToArray( | |
| const struct texture<T, dim, readMode> &tex, | | const struct texture<T, dim, readMode> &tex, | |
| const struct cudaArray *array | | const struct cudaArray *array | |
| ) | | ) | |
| { | | { | |
| struct cudaChannelFormatDesc desc; | | struct cudaChannelFormatDesc desc; | |
| cudaError_t err = cudaGetChannelDesc(&desc, array); | | cudaError_t err = cudaGetChannelDesc(&desc, array); | |
| | | | |
| return err == cudaSuccess ? cudaBindTextureToArray(tex, array, desc) : er
r; | | return err == cudaSuccess ? cudaBindTextureToArray(tex, array, desc) : er
r; | |
| } | | } | |
| | | | |
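A sketch of the inherited-descriptor array bind: allocate the array with an explicit descriptor, then let the two-argument overload above read it back from the array (names hypothetical; cudaMallocArray assumed from the memory section):

    texture<float, 2, cudaReadModeElementType> texArr;

    cudaError_t make_and_bind(cudaArray **arr, size_t w, size_t h)
    {
        cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
        cudaError_t err = cudaMallocArray(arr, &desc, w, h);
        if (err != cudaSuccess) return err;
        return cudaBindTextureToArray(texArr, *arr);  // desc read from *arr
    }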
| /**************************************************************************
***** | | /**************************************************************************
***** | |
| *
* | | *
* | |
| *
* | | *
* | |
| *
* | | *
* | |
| ***************************************************************************
****/ | | ***************************************************************************
****/ | |
| | | | |
|
| | | /** | |
| | | * \brief \hl Unbinds a texture | |
| | | * | |
| | | * Unbinds the texture bound to \p tex. | |
| | | * | |
| | | * \param tex - Texture to unbind | |
| | | * | |
| | | * \return ::cudaSuccess | |
| | | * \notefnerr | |
| | | * | |
| | | * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)" | |
| | | , | |
| | | * ::cudaGetChannelDesc, ::cudaGetTextureReference, | |
| | | * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>& | |
| | | , const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTextur | |
| | | e (C++ API)", | |
| | | * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>& | |
| | | , const void*, size_t) "cudaBindTexture (C++ API, inherited channel descrip | |
| | | tor)", | |
| | | * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode | |
| | | >&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_ | |
| | | t) "cudaBindTexture2D (C++ API)", | |
| | | * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, | |
| | | const struct cudaArray*, const struct cudaChannelFormatDesc&) "cudaBindText | |
| | | ureToArray (C++ API)", | |
| | | * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, | |
| | | const struct cudaArray*) "cudaBindTextureToArray (C++ API, inherited channe | |
| | | l descriptor)", | |
| | | * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindText | |
| | | ure (C API)", | |
| | | * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, d | |
| | | im, readMode >&) "cudaGetTextureAlignmentOffset (C++ API)" | |
| | | */ | |
| template<class T, int dim, enum cudaTextureReadMode readMode> | | template<class T, int dim, enum cudaTextureReadMode readMode> | |
| __inline__ __host__ cudaError_t cudaUnbindTexture( | | __inline__ __host__ cudaError_t cudaUnbindTexture( | |
| const struct texture<T, dim, readMode> &tex | | const struct texture<T, dim, readMode> &tex | |
| ) | | ) | |
| { | | { | |
| return cudaUnbindTexture(&tex); | | return cudaUnbindTexture(&tex); | |
| } | | } | |
| | | | |
| /**************************************************************************
***** | | /**************************************************************************
***** | |
| *
* | | *
* | |
| *
* | | *
* | |
| *
* | | *
* | |
| ***************************************************************************
****/ | | ***************************************************************************
****/ | |
| | | | |
|
| | | /** | |
| | | * \brief \hl Get the alignment offset of a texture | |
| | | * | |
| | | * Returns in \p *offset the offset that was returned when texture referenc | |
| | | e | |
| | | * \p tex was bound. | |
| | | * | |
| | | * \param offset - Offset of texture reference in bytes | |
| | | * \param tex - Texture to get offset of | |
| | | * | |
| | | * \return | |
| | | * ::cudaSuccess, | |
| | | * ::cudaErrorInvalidTexture, | |
| | | * ::cudaErrorInvalidTextureBinding | |
| | | * \notefnerr | |
| | | * | |
| | | * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)" | |
| | | , | |
| | | * ::cudaGetChannelDesc, ::cudaGetTextureReference, | |
| | | * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>& | |
| | | , const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTextur | |
| | | e (C++ API)", | |
| | | * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>& | |
| | | , const void*, size_t) "cudaBindTexture (C++ API, inherited channel descrip | |
| | | tor)", | |
| | | * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode | |
| | | >&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_ | |
| | | t) "cudaBindTexture2D (C++ API)", | |
| | | * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, | |
| | | const struct cudaArray*, const struct cudaChannelFormatDesc&) "cudaBindText | |
| | | ureToArray (C++ API)", | |
| | | * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, | |
| | | const struct cudaArray*) "cudaBindTextureToArray (C++ API, inherited channe | |
| | | l descriptor)", | |
| | | * \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cuda | |
| | | UnbindTexture (C++ API)", | |
| | | * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureRefere | |
| | | nce*) "cudaGetTextureAlignmentOffset (C API)" | |
| | | */ | |
| template<class T, int dim, enum cudaTextureReadMode readMode> | | template<class T, int dim, enum cudaTextureReadMode readMode> | |
| __inline__ __host__ cudaError_t cudaGetTextureAlignmentOffset( | | __inline__ __host__ cudaError_t cudaGetTextureAlignmentOffset( | |
| size_t *offset, | | size_t *offset, | |
| const struct texture<T, dim, readMode> &tex | | const struct texture<T, dim, readMode> &tex | |
| ) | | ) | |
| { | | { | |
| return cudaGetTextureAlignmentOffset(offset, &tex); | | return cudaGetTextureAlignmentOffset(offset, &tex); | |
| } | | } | |
| | | | |
|
| | | /** @} */ /* END CUDART_HIGHLEVEL */ | |
| | | | |
| /**************************************************************************
***** | | /**************************************************************************
***** | |
| *
* | | *
* | |
| *
* | | *
* | |
| *
* | | *
* | |
| ***************************************************************************
****/ | | ***************************************************************************
****/ | |
| | | | |
|
| | | /** | |
| | | * \ingroup CUDART_HIGHLEVEL | |
| | | * \brief \hl Launches a device function | |
| | | * | |
| | | * Launches the function \p entry on the device. \p entry can either be a | |
| | | * function that executes on the device, or it can be a character string, | |
| | | * naming a function that executes on the device. \p entry must be declared | |
| | | as | |
| | | * a \p __global__ function. | |
| | | * \ref ::cudaLaunch(T*) "cudaLaunch()" must be preceded by a call to | |
| | | * ::cudaConfigureCall() since it pops the data that was pushed by | |
| | | * ::cudaConfigureCall() from the execution stack. | |
| | | * | |
| | | * \param entry - Device function pointer or char string naming device func | |
| | | tion | |
| | | * to execute | |
| | | * | |
| | | * \return | |
| | | * ::cudaSuccess, | |
| | | * ::cudaErrorInvalidDeviceFunction, | |
| | | * ::cudaErrorInvalidConfiguration | |
| | | * \notefnerr | |
| | | * | |
| | | * \sa ::cudaConfigureCall, | |
| | | * \ref ::cudaSetupArgument(T,size_t) "cudaSetupArgument (C++ API)", | |
| | | * \ref ::cudaLaunch(const char*) "cudaLaunch (C API)" | |
| | | */ | |
| template<class T> | | template<class T> | |
| __inline__ __host__ cudaError_t cudaLaunch( | | __inline__ __host__ cudaError_t cudaLaunch( | |
|
| T *symbol | | T *entry | |
| ) | | ) | |
| { | | { | |
|
| return cudaLaunch((const char*)symbol); | | return cudaLaunch((const char*)entry); | |
| } | | } | |
| | | | |
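The doc block above also permits launching by name; a sketch of that form for an extern "C" kernel, whose unmangled name is what the underlying C entry point expects (kernel name hypothetical):

    extern "C" __global__ void step(float *state);

    cudaError_t run_step_by_name(float *d_state, dim3 grid, dim3 block)
    {
        cudaError_t err = cudaConfigureCall(grid, block);
        if (err != cudaSuccess) return err;
        cudaSetupArgument(d_state, 0);
        return cudaLaunch("step");   // string literal naming the kernel
    }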
| #endif /* __CUDACC__ */ | | #endif /* __CUDACC__ */ | |
| | | | |
| #endif /* __cplusplus */ | | #endif /* __cplusplus */ | |
| | | | |
| #endif /* !__CUDA_RUNTIME_H__ */ | | #endif /* !__CUDA_RUNTIME_H__ */ | |
| | | | |
End of changes. 16 change blocks. |
| 3 lines changed or deleted | | 444 lines changed or added | |
|
| device_functions.h | | device_functions.h | |
| /* | | /* | |
|
| * Copyright 1993-2008 NVIDIA Corporation. All rights reserved. | | * Copyright 1993-2009 NVIDIA Corporation. All rights reserved. | |
| * | | * | |
| * NOTICE TO USER: | | * NOTICE TO USER: | |
| * | | * | |
| * This source code is subject to NVIDIA ownership rights under U.S. and | | * This source code is subject to NVIDIA ownership rights under U.S. and | |
| * international Copyright laws. Users and possessors of this source code | | * international Copyright laws. Users and possessors of this source code | |
| * are hereby granted a nonexclusive, royalty-free license to use this code | | * are hereby granted a nonexclusive, royalty-free license to use this code | |
| * in individual and commercial software. | | * in individual and commercial software. | |
| * | | * | |
| * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | |
| * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | |
| | | | |
| skipping to change at line 77 | | skipping to change at line 77 | |
| extern __device__ unsigned long long int __umul64hi(unsigned long long int,
unsigned long long int); | | extern __device__ unsigned long long int __umul64hi(unsigned long long int,
unsigned long long int); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __int_as_float(int); | | extern __device__ float __int_as_float(int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __float_as_int(float); | | extern __device__ int __float_as_int(float); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ void __syncthreads(void); | | extern __device__ void __syncthreads(void); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| | | extern __device__ void __prof_trigger(int); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ void __threadfence(void); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ void __threadfence_block(void); | |
| | | /*DEVICE_BUILTIN*/ | |
| extern __device__ void __trap(void); | | extern __device__ void __trap(void); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ void __brkpt(int); | | extern __device__ void __brkpt(int); | |
| | | | |
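A sketch of what the new fences are for: ordering a data write before a flag write, so another block can poll the flag and then safely read the data (variable and kernel names hypothetical):

    __device__ volatile int ready;
    __device__ float result;

    __global__ void producer(float v)
    {
        if (threadIdx.x == 0) {
            result = v;
            __threadfence();   // data visible device-wide before the flag
            ready = 1;
        }
    }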
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __saturatef(float); | | extern __device__ float __saturatef(float); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ unsigned int __sad(int, int, unsigned int); | | extern __device__ unsigned int __sad(int, int, unsigned int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| | | | |
| skipping to change at line 183 | | skipping to change at line 189 | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ unsigned long long int __float2ull_rz(float); | | extern __device__ unsigned long long int __float2ull_rz(float); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __ll2float_rn(long long int); | | extern __device__ float __ll2float_rn(long long int); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __ull2float_rn(unsigned long long
int); | | extern __device__ float __ull2float_rn(unsigned long long
int); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| | | extern __device__ float __fadd_rn(float, float); | |
| | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __fadd_rz(float, float); | | extern __device__ float __fadd_rz(float, float); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| extern __device__ float __fmul_rz(float, float); | | extern __device__ float __fadd_ru(float, float); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| extern __device__ float __fadd_rn(float, float); | | extern __device__ float __fadd_rd(float, float); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __fmul_rn(float, float); | | extern __device__ float __fmul_rn(float, float); | |
|
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __fmul_rz(float, float); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __fmul_ru(float, float); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __fmul_rd(float, float); | |
| | | | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __fmaf_rn(float, float, float); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __fmaf_rz(float, float, float); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __fmaf_ru(float, float, float); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __fmaf_rd(float, float, float); | |
| | | | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __frcp_rn(float); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __frcp_rz(float); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __frcp_ru(float); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __frcp_rd(float); | |
| | | | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __fsqrt_rn(float); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __fsqrt_rz(float); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __fsqrt_ru(float); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __fsqrt_rd(float); | |
| | | | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __fdiv_rn(float, float); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __fdiv_rz(float, float); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __fdiv_ru(float, float); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __fdiv_rd(float, float); | |
| | | | |
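One use the directed-rounding additions enable is interval arithmetic in device code: accumulate a lower bound rounding down and an upper bound rounding up, so the true sum stays bracketed. A sketch:

    __device__ void interval_sum(const float *x, int n, float *lo, float *hi)
    {
        float l = 0.0f, h = 0.0f;
        for (int i = 0; i < n; ++i) {
            l = __fadd_rd(l, x[i]);   // round toward -infinity
            h = __fadd_ru(h, x[i]);   // round toward +infinity
        }
        *lo = l;
        *hi = h;
    }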
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __clz(int); | | extern __device__ int __clz(int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __ffs(int); | | extern __device__ int __ffs(int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __popc(unsigned int); | | extern __device__ int __popc(unsigned int); | |
|
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ unsigned int __brev(unsigned int); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __clzll(long long int); | | extern __device__ int __clzll(long long int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __ffsll(long long int); | | extern __device__ int __ffsll(long long int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __popcll(unsigned long long int); | | extern __device__ int __popcll(unsigned long long int); | |
|
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ unsigned long long int __brevll(unsigned long long int); | |
| | | | |
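__brev() reverses all 32 bits of its argument; to bit-reverse only a log2n-bit index (the classic FFT reordering case), shift the reversal back down. A sketch, assuming 1 <= log2n <= 32:

    __device__ unsigned int rev_index(unsigned int i, unsigned int log2n)
    {
        return __brev(i) >> (32u - log2n);   // keep only the low log2n bits
    }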
| #if !defined(CUDA_NO_SM_13_DOUBLE_INTRINSICS) | | #if !defined(CUDA_NO_SM_13_DOUBLE_INTRINSICS) | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __double2int_rz(double); | | extern __device__ int __double2int_rz(double); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ unsigned int __double2uint_rz(double); | | extern __device__ unsigned int __double2uint_rz(double); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| | | | |
| skipping to change at line 309 | | skipping to change at line 364 | |
| static __inline__ __device__ void brkpt(int c) | | static __inline__ __device__ void brkpt(int c) | |
| { | | { | |
| __brkpt(c); | | __brkpt(c); | |
| } | | } | |
| | | | |
| static __inline__ __device__ void syncthreads(void) | | static __inline__ __device__ void syncthreads(void) | |
| { | | { | |
| __syncthreads(); | | __syncthreads(); | |
| } | | } | |
| | | | |
|
| | | static __inline__ __device__ void prof_trigger(int e) | |
| | | { | |
| | | if (e == 0) __prof_trigger( 0); | |
| | | else if (e == 1) __prof_trigger( 1); | |
| | | else if (e == 2) __prof_trigger( 2); | |
| | | else if (e == 3) __prof_trigger( 3); | |
| | | else if (e == 4) __prof_trigger( 4); | |
| | | else if (e == 5) __prof_trigger( 5); | |
| | | else if (e == 6) __prof_trigger( 6); | |
| | | else if (e == 7) __prof_trigger( 7); | |
| | | else if (e == 8) __prof_trigger( 8); | |
| | | else if (e == 9) __prof_trigger( 9); | |
| | | else if (e == 10) __prof_trigger(10); | |
| | | else if (e == 11) __prof_trigger(11); | |
| | | else if (e == 12) __prof_trigger(12); | |
| | | else if (e == 13) __prof_trigger(13); | |
| | | else if (e == 14) __prof_trigger(14); | |
| | | else if (e == 15) __prof_trigger(15); | |
| | | } | |
| | | | |
| | | static __inline__ __device__ void threadfence(bool global = true) | |
| | | { | |
| | | global ? __threadfence() : __threadfence_block(); | |
| | | } | |
| | | | |
| static __inline__ __device__ int float2int(float a, enum cudaRoundMode mode
= cudaRoundZero) | | static __inline__ __device__ int float2int(float a, enum cudaRoundMode mode
= cudaRoundZero) | |
| { | | { | |
| return mode == cudaRoundNearest ? __float2int_rn(a) : | | return mode == cudaRoundNearest ? __float2int_rn(a) : | |
| mode == cudaRoundPosInf ? __float2int_ru(a) : | | mode == cudaRoundPosInf ? __float2int_ru(a) : | |
| mode == cudaRoundMinInf ? __float2int_rd(a) : | | mode == cudaRoundMinInf ? __float2int_rd(a) : | |
| __float2int_rz(a); | | __float2int_rz(a); | |
| } | | } | |
| | | | |
| static __inline__ __device__ unsigned int float2uint(float a, enum cudaRoun
dMode mode = cudaRoundZero) | | static __inline__ __device__ unsigned int float2uint(float a, enum cudaRoun
dMode mode = cudaRoundZero) | |
| { | | { | |
| | | | |
| skipping to change at line 348 | | skipping to change at line 428 | |
| __uint2float_rn(a); | | __uint2float_rn(a); | |
| } | | } | |
| | | | |
| #elif !defined(__CUDACC__) | | #elif !defined(__CUDACC__) | |
| | | | |
| #include "crt/func_macro.h" | | #include "crt/func_macro.h" | |
| | | | |
| #include "host_defines.h" | | #include "host_defines.h" | |
| #include "math_constants.h" | | #include "math_constants.h" | |
| | | | |
|
| #if !defined(__CUDABE__) | | #if defined(__CUDABE__) | |
| | | | |
| | | __device_func__(float __frcp_rn (float x)) | |
| | | { | |
| | | unsigned int expo; | |
| | | unsigned f, y; | |
| | | unsigned int argi; | |
| | | float t; | |
| | | | |
| | | argi = __float_as_int(x); | |
| | | expo = (argi >> 23); | |
| | | expo = expo & 0xff; | |
| | | f = expo - 1; | |
| | | if (f <= 0xFD) { | |
| | | y = (argi & 0x00ffffff) | 0x00800000; | |
| | | expo = (2 * 127) - expo - 2; | |
| | | t = 1.0f / x; | |
| | | argi = __float_as_int(t); | |
| | | argi = (argi & 0x00ffffff) | 0x00800000; | |
| | | if ((int)expo >= 0) { | |
| | | /* compute remainder1 */ | |
| | | f = __umul24(y, argi); | |
| | | /* remainder1 must be negative. Fix if necessary */ | |
| | | if ((int)f > 0) { | |
| | | t = __int_as_float(__float_as_int(t)-1); | |
| | | f -= y; | |
| | | } | |
| | | /* compute remainder2 */ | |
| | | expo = f + y; | |
| | | /* round result based on which remainder is smaller in magnitude */ | |
| | | f = (unsigned)(-(int)f); | |
| | | if (expo < f) { | |
| | | t = __int_as_float(__float_as_int(t)+1); | |
| | | } | |
| | | return t; | |
| | | } | |
| | | } | |
| | | return 1.0f / x; | |
| | | } | |
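| | | /* All four __frcp_* variants share one scheme: start from the | |
| | | hardware 1.0f/x estimate, multiply the two 24-bit mantissas, and | |
| | | use the sign (and, for _rn, the magnitude) of the resulting | |
| | | integer remainder to nudge the estimate by one ulp in the chosen | |
| | | rounding direction. Non-normal inputs fall through to 1.0f / x. */ | |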
| | | | |
| | | __device_func__(float __frcp_rz (float x)) | |
| | | { | |
| | | unsigned int expo; | |
| | | unsigned f, y; | |
| | | unsigned int argi; | |
| | | float t; | |
| | | | |
| | | argi = __float_as_int(x); | |
| | | expo = (argi >> 23); | |
| | | expo = expo & 0xff; | |
| | | f = expo - 1; | |
| | | if (f <= 0xFD) { | |
| | | y = (argi & 0x00ffffff) | 0x00800000; | |
| | | expo = (2 * 127) - expo - 2; | |
| | | t = 1.0f / x; | |
| | | argi = __float_as_int(t); | |
| | | argi = (argi & 0x00ffffff) | 0x00800000; | |
| | | if ((int)expo >= 0) { | |
| | | f = __umul24(y, argi); | |
| | | if ((int)f > 0) { | |
| | | t = __int_as_float(__float_as_int(t)-1); | |
| | | } | |
| | | return t; | |
| | | } | |
| | | } | |
| | | return 1.0f / x; | |
| | | } | |
| | | | |
| | | __device_func__(float __frcp_rd (float x)) | |
| | | { | |
| | | unsigned int expo; | |
| | | unsigned f, y; | |
| | | unsigned int argi; | |
| | | float t; | |
| | | | |
| | | argi = __float_as_int(x); | |
| | | expo = (argi >> 23); | |
| | | expo = expo & 0xff; | |
| | | f = expo - 1; | |
| | | if (f <= 0xFD) { | |
| | | y = (argi & 0x00ffffff) | 0x00800000; | |
| | | expo = (2 * 127) - expo - 2; | |
| | | t = 1.0f / x; | |
| | | argi = __float_as_int(t); | |
| | | argi = (argi & 0x00ffffff) | 0x00800000; | |
| | | if ((int)expo >= 0) { | |
| | | f = __umul24(y, argi); | |
| | | if (((int)f > 0) && (x > 0.0f)) { | |
| | | t = __int_as_float(__float_as_int(t)-1); | |
| | | } | |
| | | if (((int)f < 0) && (x < 0.0f)) { | |
| | | t = __int_as_float(__float_as_int(t)+1); | |
| | | } | |
| | | return t; | |
| | | } | |
| | | } | |
| | | return 1.0f / x; | |
| | | } | |
| | | | |
| | | __device_func__(float __frcp_ru (float x)) | |
| | | { | |
| | | unsigned int expo; | |
| | | unsigned f, y; | |
| | | unsigned int argi; | |
| | | float t; | |
| | | | |
| | | argi = __float_as_int(x); | |
| | | expo = (argi >> 23); | |
| | | expo = expo & 0xff; | |
| | | f = expo - 1; | |
| | | if (f <= 0xFD) { | |
| | | y = (argi & 0x00ffffff) | 0x00800000; | |
| | | expo = (2 * 127) - expo - 2; | |
| | | t = 1.0f / x; | |
| | | argi = __float_as_int(t); | |
| | | argi = (argi & 0x00ffffff) | 0x00800000; | |
| | | if ((int)expo >= 0) { | |
| | | f = __umul24(y, argi); | |
| | | if (((int)f > 0) && (x < 0.0f)) { | |
| | | t = __int_as_float(__float_as_int(t)-1); | |
| | | } | |
| | | if (((int)f < 0) && (x > 0.0f)) { | |
| | | t = __int_as_float(__float_as_int(t)+1); | |
| | | } | |
| | | return t; | |
| | | } | |
| | | } | |
| | | return 1.0f / x; | |
| | | } | |
| | | | |
| | | __device_func__(float __fsqrt_rn (float radicand)) | |
| | | { | |
| | | unsigned int expo, argi; | |
| | | unsigned int s, f, x; | |
| | | | |
| | | argi = __float_as_int(radicand); | |
| | | expo = argi >> 23; | |
| | | expo = expo & 0xff; | |
| | | f = expo - 1; | |
| | | | |
| | | if ((argi <= 0x80000000) && (f <= 0xFD)) { | |
| | | x = (argi << 8) | 0x80000000; | |
| | | x = x >> (expo & 1); | |
| | | argi = (((__float_as_int(rsqrtf(__int_as_float( | |
| | | __float_as_int(radicand)|1)))&0x00ffffff)|0x00800000)<<7); | |
| | | /* second NR iteration */ | |
| | | s = __umulhi(argi,argi); | |
| | | f = 0x30000000 - __umulhi(x,s); | |
| | | argi = __umulhi(f,argi); | |
| | | /* compute sqrt_rn(x) as x * 1/sqrt_rn(x) */ | |
| | | argi = __umulhi(x,argi); | |
| | | argi = argi >> 3; | |
| | | x = (x << 16) - (argi * argi); | |
| | | /* round to nearest based on remainder; tie case impossible */ | |
| | | f = x - (2 * argi + 1); | |
| | | if ((int)f < 0) f = (unsigned)(-(int)f); | |
| | | if ((int)x < 0) x = (unsigned)(-(int)x); | |
| | | if (f < x) argi ++; | |
| | | argi = argi + (((expo + 125) & ~0x1) << 22); | |
| | | return __int_as_float(argi); | |
| | | } | |
| | | return sqrtf(radicand); | |
| | | } | |
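| | | /* The __fsqrt_* variants all refine the hardware rsqrtf() estimate | |
| | | with one Newton-Raphson step in fixed point, form sqrt(x) as | |
| | | x * rsqrt(x), and fix the last bit from the sign of the remainder | |
| | | (x << 16) - q*q. Zero, denormals, negatives, infinities and NaNs | |
| | | take the sqrtf() fallback. */ | |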
| | | | |
| | | __device_func__(float __fsqrt_rz (float radicand)) | |
| | | { | |
| | | unsigned int expo, argi; | |
| | | unsigned int s, f, x; | |
| | | | |
| | | argi = __float_as_int(radicand); | |
| | | expo = argi >> 23; | |
| | | expo = expo & 0xff; | |
| | | f = expo - 1; | |
| | | | |
| | | if ((argi <= 0x80000000) && (f <= 0xFD)) { | |
| | | x = (argi << 8) | 0x80000000; | |
| | | x = x >> (expo & 1); | |
| | | argi = (((__float_as_int(rsqrtf(__int_as_float( | |
| | | __float_as_int(radicand)|1)))&0x00ffffff)|0x00800000)<<7); | |
| | | /* NR iteration */ | |
| | | s = __umulhi(argi,argi); | |
| | | f = 0x30000000 - __umulhi(x,s); | |
| | | argi = __umulhi(f,argi); | |
| | | /* compute sqrt_rz(x) as x * 1/sqrt_rz(x) */ | |
| | | argi = __umulhi(x,argi); | |
| | | /* compute truncated result */ | |
| | | argi = (argi + 4) >> 3; | |
| | | x = (x << 16) - (argi * argi); | |
| | | if ((int)x < 0) argi--; | |
| | | argi = argi + (((expo + 125) & ~0x1) << 22); | |
| | | return __int_as_float(argi); | |
| | | } | |
| | | return sqrtf(radicand); | |
| | | } | |
| | | | |
| | | __device_func__(float __fsqrt_ru (float radicand)) | |
| | | { | |
| | | unsigned int expo, argi; | |
| | | unsigned int s, f, x; | |
| | | | |
| | | argi = __float_as_int(radicand); | |
| | | expo = argi >> 23; | |
| | | expo = expo & 0xff; | |
| | | f = expo - 1; | |
| | | | |
| | | if ((argi <= 0x80000000) && (f <= 0xFD)) { | |
| | | x = (argi << 8) | 0x80000000; | |
| | | x = x >> (expo & 1); | |
| | | argi = (((__float_as_int(rsqrtf(__int_as_float( | |
| | | __float_as_int(radicand)|1)))&0x00ffffff)|0x00800000)<<7); | |
| | | /* NR iteration */ | |
| | | s = __umulhi(argi,argi); | |
| | | f = 0x30000000 - __umulhi(x,s); | |
| | | argi = __umulhi(f,argi); | |
| | | /* compute sqrt_ru(x) as x * 1/sqrt_ru(x) */ | |
| | | argi = __umulhi(x,argi); | |
| | | argi = (argi + 4) >> 3; | |
| | | x = (x << 16) - (argi * argi); | |
| | | if ((int)x > 0) argi++; | |
| | | argi = argi + (((expo + 125) & ~0x1) << 22); | |
| | | return __int_as_float(argi); | |
| | | } | |
| | | return sqrtf(radicand); | |
| | | } | |
| | | | |
| | | __device_func__(float __fsqrt_rd (float radicand)) | |
| | | { | |
| | | unsigned int expo, argi; | |
| | | unsigned int s, f, x; | |
| | | | |
| | | argi = __float_as_int(radicand); | |
| | | expo = argi >> 23; | |
| | | expo = expo & 0xff; | |
| | | f = expo - 1; | |
| | | | |
| | | if ((argi <= 0x80000000) && (f <= 0xFD)) { | |
| | | x = (argi << 8) | 0x80000000; | |
| | | x = x >> (expo & 1); | |
| | | argi = (((__float_as_int(rsqrtf(__int_as_float( | |
| | | __float_as_int(radicand)|1)))&0x00ffffff)|0x00800000)<<7); | |
| | | /* NR iteration */ | |
| | | s = __umulhi(argi,argi); | |
| | | f = 0x30000000 - __umulhi(x,s); | |
| | | argi = __umulhi(f,argi); | |
| | | /* compute sqrt_rd(x) as x * 1/sqrt_rd(x) */ | |
| | | argi = __umulhi(x,argi); | |
| | | /* compute truncated result */ | |
| | | argi = (argi + 4) >> 3; | |
| | | x = (x << 16) - (argi * argi); | |
| | | if ((int)x < 0) argi--; | |
| | | argi = argi + (((expo + 125) & ~0x1) << 22); | |
| | | return __int_as_float(argi); | |
| | | } | |
| | | return sqrtf(radicand); | |
| | | } | |
| | | | |
| | | __device_func__(float __fdiv_rn (float dividend, float divisor)) | |
| | | { | |
| | | unsigned long long prod; | |
| | | unsigned r, f, x, y, expox, expoy, sign; | |
| | | unsigned expo_res; | |
| | | unsigned resi, cvtxi, cvtyi; | |
| | | float t; | |
| | | | |
| | | cvtxi = __float_as_int(dividend); | |
| | | cvtyi = __float_as_int(divisor); | |
| | | expox = (cvtxi >> 23) & 0xff; | |
| | | expoy = (cvtyi >> 23) & 0xff; | |
| | | sign = ((cvtxi ^ cvtyi) & 0x80000000); | |
| | | | |
| | | if (((expox - 1) <= 0xFD) && ((expoy - 1) <= 0xFD)) { | |
| | | expo_res = expox - expoy + 127 - 1; | |
| | | /* extract mantissas */ | |
| | | y = (cvtyi << 8) | 0x80000000; | |
| | | x = (cvtxi & 0x00ffffff) | 0x00800000; | |
| | | t = __int_as_float((cvtyi & 0x00ffffff) | 0x3f800001); | |
| | | r = ((__float_as_int(1.0f / t) & 0x00ffffff) | 0x00800000) << 7; | |
| | | /* NR iteration */ | |
| | | f = (unsigned)-(int)__umulhi (y, r << 1); | |
| | | r = __umulhi (f, r << 1); | |
| | | /* produce quotient */ | |
| | | prod = ((unsigned long long)x) * (r << 1); | |
| | | /* normalize mantissa */ | |
| | | if (((int)((prod >> 32) << 8)) > 0) { | |
| | | expo_res--; | |
| | | prod = prod + prod; | |
| | | } | |
| | | /* preliminary mantissa */ | |
| | | r = (unsigned)(prod >> 32); | |
| | | y = y >> 8; | |
| | | /* result is a normal */ | |
| | | if (expo_res <= 0xFD) { | |
| | | int rem0, rem1, inc; | |
| | | /* round mantissa to nearest even */ | |
| | | prod = ((unsigned long long)y) * r; | |
| | | x = x << (23 + ((prod >> 32) >> 15)); | |
| | | rem1 = x - (unsigned)(prod & 0xffffffff); | |
| | | rem0 = rem1 - y; | |
| | | inc = abs(rem0) < abs(rem1); | |
| | | /* merge sign, mantissa, exponent for final result */ | |
| | | resi = sign | ((expo_res << 23) + r + inc); | |
| | | return __int_as_float(resi); | |
| | | } else if ((int)expo_res >= 254) { | |
| | | /* overflow: return infinity */ | |
| | | resi = sign | 0x7f800000; | |
| | | return __int_as_float(resi); | |
| | | } else { | |
| | | /* underflow, may still round to normal */ | |
| | | int rem0, rem1, inc; | |
| | | prod = ((unsigned long long)y) * r; | |
| | | x = x << (23 + ((prod >> 32) >> 15)); | |
| | | rem1 = x - (unsigned)(prod & 0xffffffff); | |
| | | rem0 = rem1 - y; | |
| | | inc = abs(rem0) < abs(rem1); | |
| | | resi = ((expo_res << 23) + r + inc); | |
| | | if (resi != 0x00800000) resi = 0; | |
| | | return __int_as_float(sign | resi); | |
| | | } | |
| | | } | |
| | | if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | |
| | | divisor *= 0.25f; | |
| | | dividend *= 0.25f; | |
| | | } | |
| | | return dividend / divisor; | |
| | | } | |
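| | | /* Division follows the same reciprocal scheme: a seeded 1/divisor | |
| | | estimate is refined by one Newton-Raphson iteration, multiplied | |
| | | by the dividend, and rounded from exact integer remainders (two | |
| | | of them, rem0/rem1, in the nearest case). The closing 0.25f | |
| | | rescale leaves the quotient unchanged while pulling a huge | |
| | | divisor back into the range where its reciprocal is a normal. */ | |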
| | | | |
| | | __device_func__(float __fdiv_rz (float dividend, float divisor)) | |
| | | { | |
| | | unsigned long long prod; | |
| | | unsigned r, f, x, y, expox, expoy, sign; | |
| | | unsigned expo_res; | |
| | | unsigned resi, cvtxi, cvtyi; | |
| | | float t; | |
| | | | |
| | | cvtxi = __float_as_int(dividend); | |
| | | cvtyi = __float_as_int(divisor); | |
| | | expox = (cvtxi >> 23) & 0xff; | |
| | | expoy = (cvtyi >> 23) & 0xff; | |
| | | sign = ((cvtxi ^ cvtyi) & 0x80000000); | |
| | | | |
| | | if (((expox - 1) <= 0xFD) && ((expoy - 1) <= 0xFD)) { | |
| | | expo_res = expox - expoy + 127 - 1; | |
| | | /* extract mantissas */ | |
| | | y = (cvtyi << 8) | 0x80000000; | |
| | | x = (cvtxi & 0x00ffffff) | 0x00800000; | |
| | | t = __int_as_float((cvtyi & 0x00ffffff) | 0x3f800001); | |
| | | r = ((__float_as_int(1.0f / t) & 0x00ffffff) | 0x00800000) << 7; | |
| | | /* NR iteration */ | |
| | | f = (unsigned)-(int)__umulhi (y, r << 1); | |
| | | r = __umulhi (f, r << 1); | |
| | | /* produce quotient */ | |
| | | prod = ((unsigned long long)x) * (r << 1); | |
| | | /* normalize mantissa */ | |
| | | if (((int)((prod >> 32) << 8)) > 0) { | |
| | | expo_res--; | |
| | | prod = prod + prod; | |
| | | } | |
| | | /* preliminary mantissa */ | |
| | | prod += 0x0000000080000000ULL; | |
| | | r = (unsigned)(prod >> 32); | |
| | | y = y >> 8; | |
| | | if (expo_res <= 0xFD) { | |
| | | /* result is a normal */ | |
| | | int rem1; | |
| | | prod = ((unsigned long long)y) * r; | |
| | | x = x << (23 + ((prod >> 32) >> 15)); | |
| | | rem1 = x - (unsigned)(prod & 0xffffffff); | |
| | | if (rem1 < 0) r--; | |
| | | resi = (expo_res << 23) + r; | |
| | | if (resi == 0x7f800000) resi = 0x7f7fffff; | |
| | | return __int_as_float(sign | resi); | |
| | | } else if ((int)expo_res >= 254) { | |
| | | /* overflow: return largest normal */ | |
| | | resi = 0x7f7fffff; | |
| | | return __int_as_float(sign | resi); | |
| | | } else { | |
| | | /* underflow: result is smallest normal or zero */ | |
| | | int rem1; | |
| | | prod = ((unsigned long long)y) * r; | |
| | | x = x << (23 + ((prod >> 32) >> 15)); | |
| | | rem1 = x - (unsigned)(prod & 0xffffffff); | |
| | | if (rem1 < 0) r--; | |
| | | resi = ((expo_res << 23) + r); | |
| | | if (resi != 0x00800000) resi = 0; | |
| | | return __int_as_float(sign | resi); | |
| | | } | |
| | | } | |
| | | if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | |
| | | divisor *= 0.25f; | |
| | | dividend *= 0.25f; | |
| | | } | |
| | | return dividend / divisor; | |
| | | } | |
| | | | |
| | | __device_func__(float __fdiv_ru (float dividend, float divisor)) | |
| | | { | |
| | | unsigned long long prod; | |
| | | unsigned r, f, x, y, expox, expoy, sign; | |
| | | unsigned expo_res; | |
| | | unsigned resi, cvtxi, cvtyi; | |
| | | float t; | |
| | | | |
| | | cvtxi = __float_as_int(dividend); | |
| | | cvtyi = __float_as_int(divisor); | |
| | | expox = (cvtxi >> 23) & 0xff; | |
| | | expoy = (cvtyi >> 23) & 0xff; | |
| | | sign = ((cvtxi ^ cvtyi) & 0x80000000); | |
| | | | |
| | | if (((expox - 1) <= 0xFD) && ((expoy - 1) <= 0xFD)) { | |
| | | expo_res = expox - expoy + 127 - 1; | |
| | | /* extract mantissas */ | |
| | | y = (cvtyi << 8) | 0x80000000; | |
| | | x = (cvtxi & 0x00ffffff) | 0x00800000; | |
| | | t = __int_as_float((cvtyi & 0x00ffffff) | 0x3f800001); | |
| | | r = ((__float_as_int(1.0f / t) & 0x00ffffff) | 0x00800000) << 7; | |
| | | /* NR iteration */ | |
| | | f = (unsigned)-(int)__umulhi (y, r << 1); | |
| | | r = __umulhi (f, r << 1); | |
| | | /* produce quotient */ | |
| | | prod = ((unsigned long long)x) * (r << 1); | |
| | | /* normalize mantissa */ | |
| | | if (((int)((prod >> 32) << 8)) > 0) { | |
| | | expo_res--; | |
| | | prod = prod + prod; | |
| | | } | |
| | | /* preliminary mantissa */ | |
| | | prod += 0x0000000080000000ULL; | |
| | | r = (unsigned)(prod >> 32); | |
| | | y = y >> 8; | |
| | | if (expo_res <= 0xFD) { | |
| | | /* result is a normal */ | |
| | | int rem1; | |
| | | prod = ((unsigned long long)y) * r; | |
| | | x = x << (23 + ((prod >> 32) >> 15)); | |
| | | rem1 = x - (unsigned)(prod & 0xffffffff); | |
| | | if ((rem1 < 0) && (sign)) r--; | |
| | | if ((rem1 > 0) && (!sign)) r++; | |
| | | resi = (expo_res << 23) + r; | |
| | | if ((resi == 0x7f800000) && (sign)) resi = 0x7f7fffff; | |
| | | return __int_as_float(sign | resi); | |
| | | } else if ((int)expo_res >= 254) { | |
| | | /* overflow: return largest normal */ | |
| | | resi = sign ? 0x7f7fffff : 0x7f800000; | |
| | | return __int_as_float(sign | resi); | |
| | | } else { | |
| | | /* underflow: result is smallest normal or zero */ | |
| | | int rem1; | |
| | | prod = ((unsigned long long)y) * r; | |
| | | x = x << (23 + ((prod >> 32) >> 15)); | |
| | | rem1 = x - (unsigned)(prod & 0xffffffff); | |
| | | if ((rem1 < 0) && (sign)) r--; | |
| | | if ((rem1 > 0) && (!sign)) r++; | |
| | | resi = ((expo_res << 23) + r); | |
| | | if (resi != 0x00800000) resi = 0; | |
| | | return __int_as_float(sign | resi); | |
| | | } | |
| | | } | |
| | | if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | |
| | | divisor *= 0.25f; | |
| | | dividend *= 0.25f; | |
| | | } | |
| | | return dividend / divisor; | |
| | | } | |
| | | | |
| | | __device_func__(float __fdiv_rd (float dividend, float divisor)) | |
| | | { | |
| | | unsigned long long prod; | |
| | | unsigned r, f, x, y, expox, expoy, sign; | |
| | | unsigned expo_res; | |
| | | unsigned resi, cvtxi, cvtyi; | |
| | | float t; | |
| | | | |
| | | cvtxi = __float_as_int(dividend); | |
| | | cvtyi = __float_as_int(divisor); | |
| | | expox = (cvtxi >> 23) & 0xff; | |
| | | expoy = (cvtyi >> 23) & 0xff; | |
| | | sign = ((cvtxi ^ cvtyi) & 0x80000000); | |
| | | | |
| | | if (((expox - 1) <= 0xFD) && ((expoy - 1) <= 0xFD)) { | |
| | | expo_res = expox - expoy + 127 - 1; | |
| | | /* extract mantissas */ | |
| | | y = (cvtyi << 8) | 0x80000000; | |
| | | x = (cvtxi & 0x00ffffff) | 0x00800000; | |
| | | t = __int_as_float((cvtyi & 0x00ffffff) | 0x3f800001); | |
| | | r = ((__float_as_int(1.0f / t) & 0x00ffffff) | 0x00800000) << 7; | |
| | | /* NR iteration */ | |
| | | f = (unsigned)-(int)__umulhi (y, r << 1); | |
| | | r = __umulhi (f, r << 1); | |
| | | /* produce quotient */ | |
| | | prod = ((unsigned long long)x) * (r << 1); | |
| | | /* normalize mantissa */ | |
| | | if (((int)((prod >> 32) << 8)) > 0) { | |
| | | expo_res--; | |
| | | prod = prod + prod; | |
| | | } | |
| | | /* preliminary mantissa */ | |
| | | prod += 0x0000000080000000ULL; | |
| | | r = (unsigned)(prod >> 32); | |
| | | y = y >> 8; | |
| | | if (expo_res <= 0xFD) { | |
| | | /* result is a normal */ | |
| | | int rem1; | |
| | | prod = ((unsigned long long)y) * r; | |
| | | x = x << (23 + ((prod >> 32) >> 15)); | |
| | | rem1 = x - (unsigned)(prod & 0xffffffff); | |
| | | if ((rem1 < 0) && (!sign)) r--; | |
| | | if ((rem1 > 0) && (sign)) r++; | |
| | | resi = (expo_res << 23) + r; | |
| | | if ((resi == 0x7f800000) && (!sign)) resi = 0x7f7fffff; | |
| | | return __int_as_float(sign | resi); | |
| | | } else if ((int)expo_res >= 254) { | |
| | | /* overflow: return largest normal */ | |
| | | resi = sign ? 0x7f800000 : 0x7f7fffff; | |
| | | return __int_as_float(sign | resi); | |
| | | } else { | |
| | | /* underflow: result is smallest normal or zero */ | |
| | | int rem1; | |
| | | prod = ((unsigned long long)y) * r; | |
| | | x = x << (23 + ((prod >> 32) >> 15)); | |
| | | rem1 = x - (unsigned)(prod & 0xffffffff); | |
| | | if ((rem1 < 0) && (!sign)) r--; | |
| | | if ((rem1 > 0) && (sign)) r++; | |
| | | resi = ((expo_res << 23) + r); | |
| | | if (resi != 0x00800000) resi = 0; | |
| | | return __int_as_float(sign | resi); | |
| | | } | |
| | | } | |
| | | if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | |
| | | divisor *= 0.25f; | |
| | | dividend *= 0.25f; | |
| | | } | |
| | | return dividend / divisor; | |
| | | } | |
| | | | |
| | | __device_func__(float __fadd_ru (float a, float b)) | |
| | | { | |
| | | unsigned int expo_x, expo_y; | |
| | | unsigned int xxi, yyi, temp; | |
| | | | |
| | | xxi = __float_as_int(a); | |
| | | yyi = __float_as_int(b); | |
| | | | |
| | | /* make bigger operand the augend */ | |
| | | expo_y = yyi << 1; | |
| | | if (expo_y > (xxi << 1)) { | |
| | | expo_y = xxi; | |
| | | xxi = yyi; | |
| | | yyi = expo_y; | |
| | | } | |
| | | | |
| | | temp = 0xff; | |
| | | expo_x = temp & (xxi >> 23); | |
| | | expo_x = expo_x - 1; | |
| | | expo_y = temp & (yyi >> 23); | |
| | | expo_y = expo_y - 1; | |
| | | | |
| | | if ((expo_x <= 0xFD) && | |
| | | (expo_y <= 0xFD)) { | |
| | | | |
| | | expo_y = expo_x - expo_y; | |
| | | if (expo_y > 25) { | |
| | | expo_y = 31; | |
| | | } | |
| | | temp = xxi ^ yyi; | |
| | | xxi = xxi & ~0x7f000000; | |
| | | xxi = xxi | 0x00800000; | |
| | | yyi = yyi & ~0xff000000; | |
| | | yyi = yyi | 0x00800000; | |
| | | | |
| | | if ((int)temp < 0) { | |
| | | /* signs differ, effective subtraction */ | |
| | | temp = 32 - expo_y; | |
| | | temp = (expo_y) ? (yyi << temp) : 0; | |
| | | temp = (unsigned int)(-((int)temp)); | |
| | | xxi = xxi - (yyi >> expo_y) - (temp ? 1 : 0); | |
| | | if (xxi & 0x00800000) { | |
| | | if (expo_x <= 0xFD) { | |
| | | xxi = (xxi + (expo_x << 23)); | |
| | | xxi += (temp && !(xxi & 0x80000000)); | |
| | | return __int_as_float(xxi); | |
| | | } | |
| | | } else { | |
| | | if ((temp | (xxi << 1)) == 0) { | |
| | | /* operands cancelled, resulting in a clean zero */ | |
| | | xxi = 0; | |
| | | return __int_as_float(xxi); | |
| | | } | |
| | | /* normalize result */ | |
| | | yyi = xxi & 0x80000000; | |
| | | do { | |
| | | xxi = (xxi << 1) | (temp >> 31); | |
| | | temp <<= 1; | |
| | | expo_x--; | |
| | | } while (!(xxi & 0x00800000)); | |
| | | xxi = xxi | yyi; | |
| | | } | |
| | | } else { | |
| | | /* signs are the same, effective addition */ | |
| | | temp = 32 - expo_y; | |
| | | temp = (expo_y) ? (yyi << temp) : 0; | |
| | | xxi = xxi + (yyi >> expo_y); | |
| | | if (!(xxi & 0x01000000)) { | |
| | | if (expo_x <= 0xFD) { | |
| | | xxi = xxi + (expo_x << 23); | |
| | | xxi += (temp && !(xxi & 0x80000000)); | |
| | | return __int_as_float(xxi); | |
| | | } | |
| | | } else { | |
| | | /* normalize result */ | |
| | | temp = (xxi << 31) | (temp >> 1); | |
| | | xxi = ((xxi & 0x80000000) | (xxi >> 1)) & ~0x40000000; | |
| | | expo_x++; | |
| | | } | |
| | | } | |
| | | if (expo_x <= 0xFD) { | |
| | | xxi += (temp && !(xxi & 0x80000000)); | |
| | | xxi = xxi + (expo_x << 23); | |
| | | return __int_as_float(xxi); | |
| | | } | |
| | | if ((int)expo_x >= 254) { | |
| | | /* overflow: return infinity or largest normal */ | |
| | | temp = xxi & 0x80000000; | |
| | | xxi = (temp ? 0xff7fffff : 0x7F800000); | |
| | | return __int_as_float(xxi); | |
| | | } | |
| | | /* underflow: zero or smallest normal */ | |
| | | yyi = xxi & 0x80000000; | |
| | | xxi = xxi & ~0xff000000; | |
| | | expo_x = (unsigned int)(-((int)expo_x)); | |
| | | xxi = (xxi >> expo_x); | |
| | | if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0; | |
| | | return __int_as_float(yyi | xxi); | |
| | | } else { | |
| | | return a + b; | |
| | | } | |
| | | } | |
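| | | /* Directed-rounding add: the smaller operand is aligned and the | |
| | | bits shifted out are kept in 'temp' as a sticky value. Round-up | |
| | | adds one ulp exactly when sticky bits remain and the result is | |
| | | positive, i.e. (temp && !(xxi & 0x80000000)). */ | |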
| | | | |
| | | __device_func__(float __fadd_rd (float a, float b)) | |
| | | { | |
| | | unsigned int expo_x, expo_y; | |
| | | unsigned int xxi, yyi, temp; | |
| | | | |
| | | xxi = __float_as_int(a); | |
| | | yyi = __float_as_int(b); | |
| | | | |
| | | /* make bigger operand the augend */ | |
| | | expo_y = yyi << 1; | |
| | | if (expo_y > (xxi << 1)) { | |
| | | expo_y = xxi; | |
| | | xxi = yyi; | |
| | | yyi = expo_y; | |
| | | } | |
| | | | |
| | | temp = 0xff; | |
| | | expo_x = temp & (xxi >> 23); | |
| | | expo_x = expo_x - 1; | |
| | | expo_y = temp & (yyi >> 23); | |
| | | expo_y = expo_y - 1; | |
| | | | |
| | | if ((expo_x <= 0xFD) && | |
| | | (expo_y <= 0xFD)) { | |
| | | | |
| | | expo_y = expo_x - expo_y; | |
| | | if (expo_y > 25) { | |
| | | expo_y = 31; | |
| | | } | |
| | | temp = xxi ^ yyi; | |
| | | xxi = xxi & ~0x7f000000; | |
| | | xxi = xxi | 0x00800000; | |
| | | yyi = yyi & ~0xff000000; | |
| | | yyi = yyi | 0x00800000; | |
| | | | |
| | | if ((int)temp < 0) { | |
| | | /* signs differ, effective subtraction */ | |
| | | temp = 32 - expo_y; | |
| | | temp = (expo_y) ? (yyi << temp) : 0; | |
| | | temp = (unsigned int)(-((int)temp)); | |
| | | xxi = xxi - (yyi >> expo_y) - (temp ? 1 : 0); | |
| | | if (xxi & 0x00800000) { | |
| | | if (expo_x <= 0xFD) { | |
| | | xxi = xxi & ~0x00800000; /* lop off integer bit */ | |
| | | xxi = (xxi + (expo_x << 23)) + 0x00800000; | |
| | | xxi += (temp && (xxi & 0x80000000)); | |
| | | return __int_as_float(xxi); | |
| | | } | |
| | | } else { | |
| | | if ((temp | (xxi << 1)) == 0) { | |
| | | /* operands cancelled, resulting in a clean zero */ | |
| | | xxi = 0x80000000; | |
| | | return __int_as_float(xxi); | |
| | | } | |
| | | /* normalize result */ | |
| | | yyi = xxi & 0x80000000; | |
| | | do { | |
| | | xxi = (xxi << 1) | (temp >> 31); | |
| | | temp <<= 1; | |
| | | expo_x--; | |
| | | } while (!(xxi & 0x00800000)); | |
| | | xxi = xxi | yyi; | |
| | | } | |
| | | } else { | |
| | | /* signs are the same, effective addition */ | |
| | | temp = 32 - expo_y; | |
| | | temp = (expo_y) ? (yyi << temp) : 0; | |
| | | xxi = xxi + (yyi >> expo_y); | |
| | | if (!(xxi & 0x01000000)) { | |
| | | if (expo_x <= 0xFD) { | |
| | | expo_y = xxi & 1; | |
| | | xxi = xxi + (expo_x << 23); | |
| | | xxi += (temp && (xxi & 0x80000000)); | |
| | | return __int_as_float(xxi); | |
| | | } | |
| | | } else { | |
| | | /* normalize result */ | |
| | | temp = (xxi << 31) | (temp >> 1); | |
| | | xxi = ((xxi & 0x80000000) | (xxi >> 1)) & ~0x40000000; | |
| | | expo_x++; | |
| | | } | |
| | | } | |
| | | if (expo_x <= 0xFD) { | |
| | | xxi += (temp && (xxi & 0x80000000)); | |
| | | xxi = xxi + (expo_x << 23); | |
| | | return __int_as_float(xxi); | |
| | | } | |
| | | if ((int)expo_x >= 254) { | |
| | | /* overflow: return infinity or largest normal */ | |
| | | temp = xxi & 0x80000000; | |
| | | xxi = (temp ? 0xFF800000 : 0x7f7fffff); | |
| | | return __int_as_float(xxi); | |
| | | } | |
| | | /* underflow: zero or smallest normal */ | |
| | | yyi = xxi & 0x80000000; | |
| | | xxi = xxi & ~0xff000000; | |
| | | expo_x = (unsigned int)(-((int)expo_x)); | |
| | | xxi = (xxi >> expo_x); | |
| | | if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0; | |
| | | return __int_as_float(yyi | xxi); | |
| | | } else { | |
| | | a = a + b; | |
| | | xxi = xxi ^ yyi; | |
| | | if ((a == 0.0f) && ((int)xxi < 0)) a = __int_as_float(0x80000000); | |
| | | return a; | |
| | | } | |
| | | } | |
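| | | /* The round-down variant mirrors this with the opposite sign test | |
| | | and returns -0.0f (0x80000000) on exact cancellation, as IEEE 754 | |
| | | requires when rounding toward minus infinity. */ | |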
| | | | |
| | | __device_func__(float __fmul_ru (float a, float b)) | |
| | | { | |
| | | unsigned long long product; | |
| | | unsigned int expo_x, expo_y; | |
| | | unsigned int xxi, yyi; | |
| | | | |
| | | xxi = __float_as_int(a); | |
| | | yyi = __float_as_int(b); | |
| | | | |
| | | expo_y = 0xFF; | |
| | | expo_x = expo_y & (xxi >> 23); | |
| | | expo_x = expo_x - 1; | |
| | | expo_y = expo_y & (yyi >> 23); | |
| | | expo_y = expo_y - 1; | |
| | | | |
| | | if ((expo_x <= 0xFD) && | |
| | | (expo_y <= 0xFD)) { | |
| | | expo_x = expo_x + expo_y; | |
| | | expo_y = xxi ^ yyi; | |
| | | xxi = xxi & 0x00ffffff; | |
| | | yyi = yyi << 8; | |
| | | xxi = xxi | 0x00800000; | |
| | | yyi = yyi | 0x80000000; | |
| | | /* compute product */ | |
| | | product = ((unsigned long long)xxi) * yyi; | |
| | | expo_x = expo_x - 127 + 2; | |
| | | expo_y = expo_y & 0x80000000; | |
| | | xxi = (unsigned int)(product >> 32); | |
| | | yyi = (unsigned int)(product & 0xffffffff); | |
| | | /* normalize mantissa */ | |
| | | if (xxi < 0x00800000) { | |
| | | xxi = (xxi << 1) | (yyi >> 31); | |
| | | yyi = (yyi << 1); | |
| | | expo_x--; | |
| | | } | |
| | | if (expo_x <= 0xFD) { | |
| | | xxi = xxi | expo_y; /* OR in sign bit */ | |
| | | xxi = xxi + (expo_x << 23); /* add in exponent */ | |
| | | /* round result */ | |
| | | xxi += (yyi && !expo_y); | |
| | | return __int_as_float(xxi); | |
| | | } else if ((int)expo_x >= 254) { | |
| | | /* overflow: return infinity or largest normal */ | |
| | | xxi = (expo_y ? 0xff7fffff : 0x7F800000); | |
| | | return __int_as_float(xxi); | |
| | | } else { | |
| | | /* underflow: zero, or smallest normal */ | |
| | | expo_x = ((unsigned int)-((int)expo_x)); | |
| | | xxi += (yyi && !expo_y); | |
| | | xxi = (xxi >> expo_x); | |
| | | if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0; | |
| | | return __int_as_float(expo_y | xxi); | |
| | | } | |
| | | } else { | |
| | | return a * b; | |
| | | } | |
| | | } | |
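| | | /* Multiply with directed rounding: the exact 48-bit significand | |
| | | product lands in xxi:yyi; a nonzero low word yyi is the sticky | |
| | | indicator, and one ulp is added only when the discarded bits and | |
| | | the result's sign point in the rounding direction. */ | |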
| | | | |
| | | __device_func__(float __fmul_rd (float a, float b)) | |
| | | { | |
| | | unsigned long long product; | |
| | | unsigned int expo_x, expo_y; | |
| | | unsigned int xxi, yyi; | |
| | | | |
| | | xxi = __float_as_int(a); | |
| | | yyi = __float_as_int(b); | |
| | | | |
| | | expo_y = 0xFF; | |
| | | expo_x = expo_y & (xxi >> 23); | |
| | | expo_x = expo_x - 1; | |
| | | expo_y = expo_y & (yyi >> 23); | |
| | | expo_y = expo_y - 1; | |
| | | | |
| | | if ((expo_x <= 0xFD) && | |
| | | (expo_y <= 0xFD)) { | |
| | | expo_x = expo_x + expo_y; | |
| | | expo_y = xxi ^ yyi; | |
| | | xxi = xxi & 0x00ffffff; | |
| | | yyi = yyi << 8; | |
| | | xxi = xxi | 0x00800000; | |
| | | yyi = yyi | 0x80000000; | |
| | | /* compute product */ | |
| | | product = ((unsigned long long)xxi) * yyi; | |
| | | expo_x = expo_x - 127 + 2; | |
| | | expo_y = expo_y & 0x80000000; | |
| | | xxi = (unsigned int)(product >> 32); | |
| | | yyi = (unsigned int)(product & 0xffffffff); | |
| | | /* normalize mantissa */ | |
| | | if (xxi < 0x00800000) { | |
| | | xxi = (xxi << 1) | (yyi >> 31); | |
| | | yyi = (yyi << 1); | |
| | | expo_x--; | |
| | | } | |
| | | if (expo_x <= 0xFD) { | |
| | | xxi = xxi | expo_y; /* OR in sign bit */ | |
| | | xxi = xxi + (expo_x << 23); /* add in exponent */ | |
| | | /* round result */ | |
| | | xxi += (yyi && expo_y); | |
| | | return __int_as_float(xxi); | |
| | | } else if ((int)expo_x >= 254) { | |
| | | /* overflow: return infinity or largest normal */ | |
| | | xxi = expo_y | (expo_y ?0x7F800000 : 0x7f7fffff); | |
| | | return __int_as_float(xxi); | |
| | | } else { | |
| | | /* underflow: zero, or smallest normal */ | |
| | | expo_x = ((unsigned int)-((int)expo_x)); | |
| | | xxi += (yyi && expo_y); | |
| | | xxi = (xxi >> expo_x); | |
| | | if ((expo_x > 25) || (xxi != 0x00800000)) xxi = 0; | |
| | | return __int_as_float(expo_y | xxi); | |
| | | } | |
| | | } else { | |
| | | return a * b; | |
| | | } | |
| | | } | |
| | | | |
| | | __device_func__(float __fmaf_rn (float a, float b, float c)) | |
| | | { | |
| | | unsigned long long product; | |
| | | unsigned int xx, yy, zz, ww; | |
| | | unsigned int temp, s, u; | |
| | | unsigned int expo_x, expo_y, expo_z; | |
| | | | |
| | | xx = __float_as_int(a); | |
| | | yy = __float_as_int(b); | |
| | | zz = __float_as_int(c); | |
| | | | |
| | | /* Match 'denormals are zero' behavior of the GPU */ | |
| | | if ((xx << 1) < 0x01000000) xx &= 0x80000000; | |
| | | if ((yy << 1) < 0x01000000) yy &= 0x80000000; | |
| | | if ((zz << 1) < 0x01000000) zz &= 0x80000000; | |
| | | | |
| | | temp = 0xff; | |
| | | expo_x = temp & (xx >> 23); | |
| | | expo_x = expo_x - 1; | |
| | | expo_y = temp & (yy >> 23); | |
| | | expo_y = expo_y - 1; | |
| | | expo_z = temp & (zz >> 23); | |
| | | expo_z = expo_z - 1; | |
| | | | |
| | | if (!((expo_x <= 0xFD) && | |
| | | (expo_y <= 0xFD) && | |
| | | (expo_z <= 0xFD))) { | |
| | | /* fmad (nan, y, z) --> nan | |
| | | fmad (x, nan, z) --> nan | |
| | | fmad (x, y, nan) --> nan | |
| | | */ | |
| | | if ((yy << 1) > 0xff000000) { | |
| | | return rsqrtf(b); | |
| | | } | |
| | | if ((zz << 1) > 0xff000000) { | |
| | | return rsqrtf(c); | |
| | | } | |
| | | if ((xx << 1) > 0xff000000) { | |
| | | return rsqrtf(a); | |
| | | } | |
| | | /* fmad (0, inf, z) --> NaN | |
| | | fmad (inf, 0, z) --> NaN | |
| | | fmad (-inf,+y,+inf) --> NaN | |
| | | fmad (+x,-inf,+inf) --> NaN | |
| | | fmad (+inf,-y,+inf) --> NaN | |
| | | fmad (-x,+inf,+inf) --> NaN | |
| | | fmad (-inf,-y,-inf) --> NaN | |
| | | fmad (-x,-inf,-inf) --> NaN | |
| | | fmad (+inf,+y,-inf) --> NaN | |
| | | fmad (+x,+inf,-inf) --> NaN | |
| | | */ | |
| | | if ((((xx << 1) == 0) && ((yy << 1) == 0xff000000)) || | |
| | | (((yy << 1) == 0) && ((xx << 1) == 0xff000000))) { | |
| | | return rsqrtf(__int_as_float(0xffc00000)); | |
| | | } | |
| | | if ((zz << 1) == 0xff000000) { | |
| | | if (((yy << 1) == 0xff000000) || ((xx << 1) == 0xff000000)) { | |
| | | if ((int)(xx ^ yy ^ zz) < 0) { | |
| | | return rsqrtf(__int_as_float(0xffc00000)); | |
| | | } | |
| | | } | |
| | | } | |
| | | /* fmad (inf, y, z) --> inf | |
| | | fmad (x, inf, z) --> inf | |
| | | fmad (x, y, inf) --> inf | |
| | | */ | |
| | | if ((xx << 1) == 0xff000000) { | |
| | | xx = xx ^ (yy & 0x80000000); | |
| | | return __int_as_float(xx); | |
| | | } | |
| | | if ((yy << 1) == 0xff000000) { | |
| | | yy = yy ^ (xx & 0x80000000); | |
| | | return __int_as_float(yy); | |
| | | } | |
| | | if ((zz << 1) == 0xff000000) { | |
| | | return __int_as_float(zz); | |
| | | } | |
| | | /* fmad (+0, -y, -0) --> -0 | |
| | | fmad (-0, +y, -0) --> -0 | |
| | | fmad (+x, -0, -0) --> -0 | |
| | | fmad (-x, +0, -0) --> -0 | |
| | | */ | |
| | | if (zz == 0x80000000) { | |
| | | if (((xx << 1) == 0) || ((yy << 1) == 0)) { | |
| | | if ((int)(xx ^ yy) < 0) { | |
| | | return __int_as_float(zz); | |
| | | } | |
| | | } | |
| | | } | |
| | | /* fmad (0, y, 0) --> +0 | |
| | | fmad (x, 0, 0) --> +0 | |
| | | */ | |
| | | if (((zz << 1) == 0) && | |
| | | (((xx << 1) == 0) || ((yy << 1) == 0))) { | |
| | | zz &= 0x7fffffff; | |
| | | return __int_as_float(zz); | |
| | | } | |
| | | /* fmad (0, y, z) --> z | |
| | | fmad (x, 0, z) --> z | |
| | | */ | |
| | | if (((xx << 1) == 0) || ((yy << 1) == 0)) { | |
| | | return __int_as_float(zz); | |
| | | } | |
| | | /* normalize x, if denormal */ | |
| | | if (expo_x == (unsigned)-1) { | |
| | | temp = xx & 0x80000000; | |
| | | xx = xx << 8; | |
| | | while (!(xx & 0x80000000)) { | |
| | | xx <<= 1; | |
| | | expo_x--; | |
| | | } | |
| | | expo_x++; | |
| | | xx = (xx >> 8) | temp; | |
| | | } | |
| | | /* normalize y, if denormal */ | |
| | | if (expo_y == (unsigned)-1) { | |
| | | temp = yy & 0x80000000; | |
| | | yy = yy << 8; | |
| | | while (!(yy & 0x80000000)) { | |
| | | yy <<= 1; | |
| | | expo_y--; | |
| | | } | |
| | | expo_y++; | |
| | | yy = (yy >> 8) | temp; | |
| | | } | |
| | | /* normalize z, if denormal */ | |
| | | if ((expo_z == (unsigned)-1) && ((zz << 1) != 0)) { | |
| | | temp = zz & 0x80000000; | |
| | | zz = zz << 8; | |
| | | while (!(zz & 0x80000000)) { | |
| | | zz <<= 1; | |
| | | expo_z--; | |
| | | } | |
| | | expo_z++; | |
| | | zz = (zz >> 8) | temp; | |
| | | } | |
| | | } | |
| | | | |
| | | expo_x = expo_x + expo_y; | |
| | | expo_y = xx ^ yy; | |
| | | xx = xx & 0x00ffffff; | |
| | | yy = yy << 8; | |
| | | xx = xx | 0x00800000; | |
| | | yy = yy | 0x80000000; | |
| | | | |
| | | product = ((unsigned long long)xx) * yy; | |
| | | xx = (unsigned)(product >> 32); | |
| | | yy = (unsigned)(product & 0xffffffff); | |
| | | | |
| | | expo_x = expo_x - 127 + 2; | |
| | | expo_y = expo_y & 0x80000000; | |
| | | /* normalize mantissa */ | |
| | | if (xx < 0x00800000) { | |
| | | xx = (xx << 1) | (yy >> 31); | |
| | | yy = (yy << 1); | |
| | | expo_x--; | |
| | | } | |
| | | temp = 0; | |
| | | | |
| | | if ((zz << 1) != 0) { /* z is not zero */ | |
| | | s = zz & 0x80000000; | |
| | | zz &= 0x00ffffff; | |
| | | zz |= 0x00800000; | |
| | | ww = 0; | |
| | | /* compare and swap. put augend into xx:yy */ | |
| | | if ((int)expo_z > (int)expo_x) { | |
| | | temp = expo_z; | |
| | | expo_z = expo_x; | |
| | | expo_x = temp; | |
| | | temp = zz; | |
| | | zz = xx; | |
| | | xx = temp; | |
| | | temp = ww; | |
| | | ww = yy; | |
| | | yy = temp; | |
| | | temp = expo_y; | |
| | | expo_y = s; | |
| | | s = temp; | |
| | | } | |
| | | /* augend_sign = expo_y, augend_mant = xx:yy, augend_expo = expo_x */ | |
| | | /* addend_sign = s, addend_mant = zz:ww, addend_expo = expo_z */ | |
| | | expo_z = expo_x - expo_z; | |
| | | u = expo_y ^ s; | |
| | | if (expo_z <= 49) { | |
| | | /* denormalize addend */ | |
| | | temp = 0; | |
| | | while (expo_z >= 32) { | |
| | | temp = ww | (temp != 0); | |
| | | ww = zz; | |
| | | zz = 0; | |
| | | expo_z -= 32; | |
| | | } | |
| | | if (expo_z) { | |
| | | temp = ((temp >> expo_z) | (ww << (32 - expo_z)) | | |
| | | ((temp << (32 - expo_z)) != 0)); | |
| | | ww = (ww >> expo_z) | (zz << (32 - expo_z)); | |
| | | zz = (zz >> expo_z); | |
| | | } | |
| | | | |
| | | } else { | |
| | | temp = 1; | |
| | | ww = 0; | |
| | | zz = 0; | |
| | | } | |
| | | if ((int)u < 0) { | |
| | | /* signs differ, effective subtraction */ | |
| | | temp = (unsigned)(-(int)temp); | |
| | | s = (temp != 0); | |
| | | u = yy - s; | |
| | | s = u > yy; | |
| | | yy = u - ww; | |
| | | s += yy > u; | |
| | | xx = (xx - zz) - s; | |
| | | if (!(xx | yy | temp)) { | |
| | | /* complete cancellation, return 0 */ | |
| | | return __int_as_float(xx); | |
| | | } | |
| | | if ((int)xx < 0) { | |
| | | /* oops, augend had smaller mantissa. Negate mantissa and flip | |
| | | sign of result. */ | |
| | | temp = ~temp; | |
| | | yy = ~yy; | |
| | | xx = ~xx; | |
| | | if (++temp == 0) { | |
| | | if (++yy == 0) { | |
| | | ++xx; | |
| | | } | |
| | | } | |
| | | expo_y ^= 0x80000000; | |
| | | } | |
| | | /* normalize mantissa, if necessary */ | |
| | | while (!(xx & 0x00800000)) { | |
| | | xx = (xx << 1) | (yy >> 31); | |
| | | yy = (yy << 1); | |
| | | expo_x--; | |
| | | } | |
| | | } else { | |
| | | /* signs are the same, effective addition */ | |
| | | yy = yy + ww; | |
| | | s = yy < ww; | |
| | | xx = xx + zz + s; | |
| | | if (xx & 0x01000000) { | |
| | | temp = temp | (yy << 31); | |
| | | yy = (yy >> 1) | (xx << 31); | |
| | | xx = ((xx & 0x80000000) | (xx >> 1)) & ~0x40000000; | |
| | | expo_x++; | |
| | | } | |
| | | } | |
| | | } | |
| | | temp = yy | (temp != 0); | |
| | | if (expo_x <= 0xFD) { | |
| | | /* normal */ | |
| | | xx |= expo_y; /* or in sign bit */ | |
| | | s = xx & 1; /* mantissa lsb */ | |
| | | xx += (temp == 0x80000000) ? s : (temp >> 31); | |
| | | xx = xx + (expo_x << 23); /* add in exponent */ | |
| | | return __int_as_float(xx); | |
| | | } else if ((int)expo_x >= 126) { | |
| | | /* overflow */ | |
| | | xx = expo_y | 0x7f800000; | |
| | | return __int_as_float(xx); | |
| | | } | |
| | | /* subnormal */ | |
| | | expo_x = (unsigned int)(-(int)expo_x); | |
| | | /* Match 'flush to zero' response of the GPU */ | |
| | | xx += (temp >= 0x80000000); | |
| | | if (xx >= 0x01000000) { | |
| | | xx = xx >> 1; | |
| | | expo_x--; | |
| | | } | |
| | | if (expo_x > 0) xx = 0; | |
| | | xx = expo_y | xx; | |
| | | return __int_as_float(xx); | |
| | | } | |
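| | | /* Fused multiply-add, round to nearest: the full 48-bit product is | |
| | | kept in xx:yy, the addend is aligned against it with shifted-out | |
| | | bits collapsed into the sticky word 'temp', and the final | |
| | | increment (temp == 0x80000000) ? s : (temp >> 31) implements | |
| | | round-to-nearest-even, s being the mantissa lsb used on ties. */ | |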
| | | | |
| | | __device_func__(float __fmaf_rz (float a, float b, float c)) | |
| | | { | |
| | | unsigned long long product; | |
| | | unsigned int xx, yy, zz, ww; | |
| | | unsigned int temp, s, u; | |
| | | unsigned int expo_x, expo_y, expo_z; | |
| | | | |
| | | xx = __float_as_int(a); | |
| | | yy = __float_as_int(b); | |
| | | zz = __float_as_int(c); | |
| | | | |
| | | /* Match 'denormals are zero' behavior of the GPU */ | |
| | | if ((xx << 1) < 0x01000000) xx &= 0x80000000; | |
| | | if ((yy << 1) < 0x01000000) yy &= 0x80000000; | |
| | | if ((zz << 1) < 0x01000000) zz &= 0x80000000; | |
| | | | |
| | | temp = 0xff; | |
| | | expo_x = temp & (xx >> 23); | |
| | | expo_x = expo_x - 1; | |
| | | expo_y = temp & (yy >> 23); | |
| | | expo_y = expo_y - 1; | |
| | | expo_z = temp & (zz >> 23); | |
| | | expo_z = expo_z - 1; | |
| | | | |
| | | if (!((expo_x <= 0xFD) && | |
| | | (expo_y <= 0xFD) && | |
| | | (expo_z <= 0xFD))) { | |
| | | /* fmad (nan, y, z) --> nan | |
| | | fmad (x, nan, z) --> nan | |
| | | fmad (x, y, nan) --> nan | |
| | | */ | |
| | | if ((yy << 1) > 0xff000000) { | |
| | | return rsqrtf(b); | |
| | | } | |
| | | if ((zz << 1) > 0xff000000) { | |
| | | return rsqrtf(c); | |
| | | } | |
| | | if ((xx << 1) > 0xff000000) { | |
| | | return rsqrtf(a); | |
| | | } | |
| | | /* fmad (0, inf, z) --> NaN | |
| | | fmad (inf, 0, z) --> NaN | |
| | | fmad (-inf,+y,+inf) --> NaN | |
| | | fmad (+x,-inf,+inf) --> NaN | |
| | | fmad (+inf,-y,+inf) --> NaN | |
| | | fmad (-x,+inf,+inf) --> NaN | |
| | | fmad (-inf,-y,-inf) --> NaN | |
| | | fmad (-x,-inf,-inf) --> NaN | |
| | | fmad (+inf,+y,-inf) --> NaN | |
| | | fmad (+x,+inf,-inf) --> NaN | |
| | | */ | |
| | | if ((((xx << 1) == 0) && ((yy << 1) == 0xff000000)) || | |
| | | (((yy << 1) == 0) && ((xx << 1) == 0xff000000))) { | |
| | | return rsqrtf(__int_as_float(0xffc00000)); | |
| | | } | |
| | | if ((zz << 1) == 0xff000000) { | |
| | | if (((yy << 1) == 0xff000000) || ((xx << 1) == 0xff000000)) { | |
| | | if ((int)(xx ^ yy ^ zz) < 0) { | |
| | | return rsqrtf(__int_as_float(0xffc00000)); | |
| | | } | |
| | | } | |
| | | } | |
| | | /* fmad (inf, y, z) --> inf | |
| | | fmad (x, inf, z) --> inf | |
| | | fmad (x, y, inf) --> inf | |
| | | */ | |
| | | if ((xx << 1) == 0xff000000) { | |
| | | xx = xx ^ (yy & 0x80000000); | |
| | | return __int_as_float(xx); | |
| | | } | |
| | | if ((yy << 1) == 0xff000000) { | |
| | | yy = yy ^ (xx & 0x80000000); | |
| | | return __int_as_float(yy); | |
| | | } | |
| | | if ((zz << 1) == 0xff000000) { | |
| | | return __int_as_float(zz); | |
| | | } | |
| | | /* fmad (+0, -y, -0) --> -0 | |
| | | fmad (-0, +y, -0) --> -0 | |
| | | fmad (+x, -0, -0) --> -0 | |
| | | fmad (-x, +0, -0) --> -0 | |
| | | */ | |
| | | if (zz == 0x80000000) { | |
| | | if (((xx << 1) == 0) || ((yy << 1) == 0)) { | |
| | | if ((int)(xx ^ yy) < 0) { | |
| | | return __int_as_float(zz); | |
| | | } | |
| | | } | |
| | | } | |
| | | /* fmad (0, y, 0) --> +0 | |
| | | fmad (x, 0, 0) --> +0 | |
| | | */ | |
| | | if (((zz << 1) == 0) && | |
| | | (((xx << 1) == 0) || ((yy << 1) == 0))) { | |
| | | zz &= 0x7fffffff; | |
| | | return __int_as_float(zz); | |
| | | } | |
| | | /* fmad (0, y, z) --> z | |
| | | fmad (x, 0, z) --> z | |
| | | */ | |
| | | if (((xx << 1) == 0) || ((yy << 1) == 0)) { | |
| | | return __int_as_float(zz); | |
| | | } | |
| | | /* normalize x, if denormal */ | |
| | | if (expo_x == (unsigned)-1) { | |
| | | temp = xx & 0x80000000; | |
| | | xx = xx << 8; | |
| | | while (!(xx & 0x80000000)) { | |
| | | xx <<= 1; | |
| | | expo_x--; | |
| | | } | |
| | | expo_x++; | |
| | | xx = (xx >> 8) | temp; | |
| | | } | |
| | | /* normalize y, if denormal */ | |
| | | if (expo_y == (unsigned)-1) { | |
| | | temp = yy & 0x80000000; | |
| | | yy = yy << 8; | |
| | | while (!(yy & 0x80000000)) { | |
| | | yy <<= 1; | |
| | | expo_y--; | |
| | | } | |
| | | expo_y++; | |
| | | yy = (yy >> 8) | temp; | |
| | | } | |
| | | /* normalize z, if denormal */ | |
| | | if ((expo_z == (unsigned)-1) && ((zz << 1) != 0)) { | |
| | | temp = zz & 0x80000000; | |
| | | zz = zz << 8; | |
| | | while (!(zz & 0x80000000)) { | |
| | | zz <<= 1; | |
| | | expo_z--; | |
| | | } | |
| | | expo_z++; | |
| | | zz = (zz >> 8) | temp; | |
| | | } | |
| | | } | |
| | | | |
| | | expo_x = expo_x + expo_y; | |
| | | expo_y = xx ^ yy; | |
| | | xx = xx & 0x00ffffff; | |
| | | yy = yy << 8; | |
| | | xx = xx | 0x00800000; | |
| | | yy = yy | 0x80000000; | |
| | | | |
| | | product = ((unsigned long long)xx) * yy; | |
| | | xx = (unsigned)(product >> 32); | |
| | | yy = (unsigned)(product & 0xffffffff); | |
| | | | |
| | | expo_x = expo_x - 127 + 2; | |
| | | expo_y = expo_y & 0x80000000; | |
| | | /* normalize mantissa */ | |
| | | if (xx < 0x00800000) { | |
| | | xx = (xx << 1) | (yy >> 31); | |
| | | yy = (yy << 1); | |
| | | expo_x--; | |
| | | } | |
| | | temp = 0; | |
| | | | |
| | | if ((zz << 1) != 0) { /* z is not zero */ | |
| | | s = zz & 0x80000000; | |
| | | zz &= 0x00ffffff; | |
| | | zz |= 0x00800000; | |
| | | ww = 0; | |
| | | /* compare and swap. put augend into xx:yy */ | |
| | | if ((int)expo_z > (int)expo_x) { | |
| | | temp = expo_z; | |
| | | expo_z = expo_x; | |
| | | expo_x = temp; | |
| | | temp = zz; | |
| | | zz = xx; | |
| | | xx = temp; | |
| | | temp = ww; | |
| | | ww = yy; | |
| | | yy = temp; | |
| | | temp = expo_y; | |
| | | expo_y = s; | |
| | | s = temp; | |
| | | } | |
| | | /* augend_sign = expo_y, augend_mant = xx:yy, augend_expo = expo_x */ | |
| | | /* addend_sign = s, addend_mant = zz:ww, addend_expo = expo_z */ | |
| | | expo_z = expo_x - expo_z; | |
| | | u = expo_y ^ s; | |
| | | if (expo_z <= 49) { | |
| | | /* denormalize addend */ | |
| | | temp = 0; | |
| | | while (expo_z >= 32) { | |
| | | temp = ww | (temp != 0); | |
| | | ww = zz; | |
| | | zz = 0; | |
| | | expo_z -= 32; | |
| | | } | |
| | | if (expo_z) { | |
| | | temp = ((temp >> expo_z) | (ww << (32 - expo_z)) | | |
| | | ((temp << (32 - expo_z)) != 0)); | |
| | | ww = (ww >> expo_z) | (zz << (32 - expo_z)); | |
| | | zz = (zz >> expo_z); | |
| | | } | |
| | | | |
| | | } else { | |
| | | temp = 1; | |
| | | ww = 0; | |
| | | zz = 0; | |
| | | } | |
| | | if ((int)u < 0) { | |
| | | /* signs differ, effective subtraction */ | |
| | | temp = (unsigned)(-(int)temp); | |
| | | s = (temp != 0); | |
| | | u = yy - s; | |
| | | s = u > yy; | |
| | | yy = u - ww; | |
| | | s += yy > u; | |
| | | xx = (xx - zz) - s; | |
| | | if (!(xx | yy | temp)) { | |
| | | /* complete cancellation, return 0 */ | |
| | | return __int_as_float(xx); | |
| | | } | |
| | | if ((int)xx < 0) { | |
| | | /* oops, augend had smaller mantissa. Negate mantissa and flip | |
| | | sign of result. */ | |
| | | temp = ~temp; | |
| | | yy = ~yy; | |
| | | xx = ~xx; | |
| | | if (++temp == 0) { | |
| | | if (++yy == 0) { | |
| | | ++xx; | |
| | | } | |
| | | } | |
| | | expo_y ^= 0x80000000; | |
| | | } | |
| | | /* normalize mantissa, if necessary */ | |
| | | while (!(xx & 0x00800000)) { | |
| | | xx = (xx << 1) | (yy >> 31); | |
| | | yy = (yy << 1); | |
| | | expo_x--; | |
| | | } | |
| | | } else { | |
| | | /* signs are the same, effective addition */ | |
| | | yy = yy + ww; | |
| | | s = yy < ww; | |
| | | xx = xx + zz + s; | |
| | | if (xx & 0x01000000) { | |
| | | temp = temp | (yy << 31); | |
| | | yy = (yy >> 1) | (xx << 31); | |
| | | xx = ((xx & 0x80000000) | (xx >> 1)) & ~0x40000000; | |
| | | expo_x++; | |
| | | } | |
| | | } | |
| | | } | |
| | | temp = yy | (temp != 0); | |
| | | if (expo_x <= 0xFD) { | |
| | | /* normal */ | |
| | | xx |= expo_y; /* or in sign bit */ | |
| | | xx = xx + (expo_x << 23); /* add in exponent */ | |
| | | return __int_as_float(xx); | |
| | | } else if ((int)expo_x >= 126) { | |
| | | /* overflow */ | |
| | | xx = expo_y | 0x7f7fffff; | |
| | | return __int_as_float(xx); | |
| | | } | |
| | | /* subnormal */ | |
| | | return __int_as_float(expo_y); | |
| | | } | |
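| | | /* The round-toward-zero variant simply drops the sticky bits (no | |
| | | increment), clamps overflow to the largest normal, and flushes a | |
| | | subnormal result to a signed zero. */ | |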
| | | | |
| | | __device_func__(float __fmaf_ru (float a, float b, float c)) | |
| | | { | |
| | | unsigned long long product; | |
| | | unsigned int xx, yy, zz, ww; | |
| | | unsigned int temp, s, u; | |
| | | unsigned int expo_x, expo_y, expo_z; | |
| | | | |
| | | xx = __float_as_int(a); | |
| | | yy = __float_as_int(b); | |
| | | zz = __float_as_int(c); | |
| | | | |
| | | /* Match 'denormals are zero' behavior of the GPU */ | |
| | | if ((xx << 1) < 0x01000000) xx &= 0x80000000; | |
| | | if ((yy << 1) < 0x01000000) yy &= 0x80000000; | |
| | | if ((zz << 1) < 0x01000000) zz &= 0x80000000; | |
| | | | |
| | | temp = 0xff; | |
| | | expo_x = temp & (xx >> 23); | |
| | | expo_x = expo_x - 1; | |
| | | expo_y = temp & (yy >> 23); | |
| | | expo_y = expo_y - 1; | |
| | | expo_z = temp & (zz >> 23); | |
| | | expo_z = expo_z - 1; | |
| | | | |
| | | if (!((expo_x <= 0xFD) && | |
| | | (expo_y <= 0xFD) && | |
| | | (expo_z <= 0xFD))) { | |
| | | /* fmad (nan, y, z) --> nan | |
| | | fmad (x, nan, z) --> nan | |
| | | fmad (x, y, nan) --> nan | |
| | | */ | |
| | | if ((yy << 1) > 0xff000000) { | |
| | | return rsqrtf(b); | |
| | | } | |
| | | if ((zz << 1) > 0xff000000) { | |
| | | return rsqrtf(c); | |
| | | } | |
| | | if ((xx << 1) > 0xff000000) { | |
| | | return rsqrtf(a); | |
| | | } | |
| | | /* fmad (0, inf, z) --> NaN | |
| | | fmad (inf, 0, z) --> NaN | |
| | | fmad (-inf,+y,+inf) --> NaN | |
| | | fmad (+x,-inf,+inf) --> NaN | |
| | | fmad (+inf,-y,+inf) --> NaN | |
| | | fmad (-x,+inf,+inf) --> NaN | |
| | | fmad (-inf,-y,-inf) --> NaN | |
| | | fmad (-x,-inf,-inf) --> NaN | |
| | | fmad (+inf,+y,-inf) --> NaN | |
| | | fmad (+x,+inf,-inf) --> NaN | |
| | | */ | |
| | | if ((((xx << 1) == 0) && ((yy << 1) == 0xff000000)) || | |
| | | (((yy << 1) == 0) && ((xx << 1) == 0xff000000))) { | |
| | | return rsqrtf(__int_as_float(0xffc00000)); | |
| | | } | |
| | | if ((zz << 1) == 0xff000000) { | |
| | | if (((yy << 1) == 0xff000000) || ((xx << 1) == 0xff000000)) { | |
| | | if ((int)(xx ^ yy ^ zz) < 0) { | |
| | | return rsqrtf(__int_as_float(0xffc00000)); | |
| | | } | |
| | | } | |
| | | } | |
| | | /* fmad (inf, y, z) --> inf | |
| | | fmad (x, inf, z) --> inf | |
| | | fmad (x, y, inf) --> inf | |
| | | */ | |
| | | if ((xx << 1) == 0xff000000) { | |
| | | xx = xx ^ (yy & 0x80000000); | |
| | | return __int_as_float(xx); | |
| | | } | |
| | | if ((yy << 1) == 0xff000000) { | |
| | | yy = yy ^ (xx & 0x80000000); | |
| | | return __int_as_float(yy); | |
| | | } | |
| | | if ((zz << 1) == 0xff000000) { | |
| | | return __int_as_float(zz); | |
| | | } | |
| | | /* fmad (+0, -y, -0) --> -0 | |
| | | fmad (-0, +y, -0) --> -0 | |
| | | fmad (+x, -0, -0) --> -0 | |
| | | fmad (-x, +0, -0) --> -0 | |
| | | */ | |
| | | if (zz == 0x80000000) { | |
| | | if (((xx << 1) == 0) || ((yy << 1) == 0)) { | |
| | | if ((int)(xx ^ yy) < 0) { | |
| | | return __int_as_float(zz); | |
| | | } | |
| | | } | |
| | | } | |
| | | /* fmad (0, y, 0) --> +0 | |
| | | fmad (x, 0, 0) --> +0 | |
| | | */ | |
| | | if (((zz << 1) == 0) && | |
| | | (((xx << 1) == 0) || ((yy << 1) == 0))) { | |
| | | zz &= 0x7fffffff; | |
| | | return __int_as_float(zz); | |
| | | } | |
| | | /* fmad (0, y, z) --> z | |
| | | fmad (x, 0, z) --> z | |
| | | */ | |
| | | if (((xx << 1) == 0) || ((yy << 1) == 0)) { | |
| | | return __int_as_float(zz); | |
| | | } | |
| | | /* normalize x, if denormal */ | |
| | | if (expo_x == (unsigned)-1) { | |
| | | temp = xx & 0x80000000; | |
| | | xx = xx << 8; | |
| | | while (!(xx & 0x80000000)) { | |
| | | xx <<= 1; | |
| | | expo_x--; | |
| | | } | |
| | | expo_x++; | |
| | | xx = (xx >> 8) | temp; | |
| | | } | |
| | | /* normalize y, if denormal */ | |
| | | if (expo_y == (unsigned)-1) { | |
| | | temp = yy & 0x80000000; | |
| | | yy = yy << 8; | |
| | | while (!(yy & 0x80000000)) { | |
| | | yy <<= 1; | |
| | | expo_y--; | |
| | | } | |
| | | expo_y++; | |
| | | yy = (yy >> 8) | temp; | |
| | | } | |
| | | /* normalize z, if denormal */ | |
| | | if ((expo_z == (unsigned)-1) && ((zz << 1) != 0)) { | |
| | | temp = zz & 0x80000000; | |
| | | zz = zz << 8; | |
| | | while (!(zz & 0x80000000)) { | |
| | | zz <<= 1; | |
| | | expo_z--; | |
| | | } | |
| | | expo_z++; | |
| | | zz = (zz >> 8) | temp; | |
| | | } | |
| | | } | |
| | | | |
| | | expo_x = expo_x + expo_y; | |
| | | expo_y = xx ^ yy; | |
| | | xx = xx & 0x00ffffff; | |
| | | yy = yy << 8; | |
| | | xx = xx | 0x00800000; | |
| | | yy = yy | 0x80000000; | |
| | | | |
| | | product = ((unsigned long long)xx) * yy; | |
| | | xx = (unsigned)(product >> 32); | |
| | | yy = (unsigned)(product & 0xffffffff); | |
| | | | |
| | | expo_x = expo_x - 127 + 2; | |
| | | expo_y = expo_y & 0x80000000; | |
| | | /* normalize mantissa */ | |
| | | if (xx < 0x00800000) { | |
| | | xx = (xx << 1) | (yy >> 31); | |
| | | yy = (yy << 1); | |
| | | expo_x--; | |
| | | } | |
| | | temp = 0; | |
| | | | |
| | | if ((zz << 1) != 0) { /* z is not zero */ | |
| | | s = zz & 0x80000000; | |
| | | zz &= 0x00ffffff; | |
| | | zz |= 0x00800000; | |
| | | ww = 0; | |
| | | /* compare and swap. put augend into xx:yy */ | |
| | | if ((int)expo_z > (int)expo_x) { | |
| | | temp = expo_z; | |
| | | expo_z = expo_x; | |
| | | expo_x = temp; | |
| | | temp = zz; | |
| | | zz = xx; | |
| | | xx = temp; | |
| | | temp = ww; | |
| | | ww = yy; | |
| | | yy = temp; | |
| | | temp = expo_y; | |
| | | expo_y = s; | |
| | | s = temp; | |
| | | } | |
| | | /* augend_sign = expo_y, augend_mant = xx:yy, augend_expo = expo_x */ | |
| | | /* addend_sign = s, addend_mant = zz:ww, addend_expo = expo_z */ | |
| | | expo_z = expo_x - expo_z; | |
| | | u = expo_y ^ s; | |
| | | if (expo_z <= 49) { | |
| | | /* denormalize addend */ | |
| | | temp = 0; | |
| | | while (expo_z >= 32) { | |
| | | temp = ww | (temp != 0); | |
| | | ww = zz; | |
| | | zz = 0; | |
| | | expo_z -= 32; | |
| | | } | |
| | | if (expo_z) { | |
| | | temp = ((temp >> expo_z) | (ww << (32 - expo_z)) | | |
| | | ((temp << (32 - expo_z)) != 0)); | |
| | | ww = (ww >> expo_z) | (zz << (32 - expo_z)); | |
| | | zz = (zz >> expo_z); | |
| | | } | |
| | | | |
| | | } else { | |
| | | temp = 1; | |
| | | ww = 0; | |
| | | zz = 0; | |
| | | } | |
| | | if ((int)u < 0) { | |
| | | /* signs differ, effective subtraction */ | |
| | | temp = (unsigned)(-(int)temp); | |
| | | s = (temp != 0); | |
| | | u = yy - s; | |
| | | s = u > yy; | |
| | | yy = u - ww; | |
| | | s += yy > u; | |
| | | xx = (xx - zz) - s; | |
| | | if (!(xx | yy | temp)) { | |
| | | /* complete cancellation, return 0 */ | |
| | | return __int_as_float(xx); | |
| | | } | |
| | | if ((int)xx < 0) { | |
| | | /* oops, augend had smaller mantissa. Negate mantissa and flip | |
| | | sign of result. */ | |
| | | temp = ~temp; | |
| | | yy = ~yy; | |
| | | xx = ~xx; | |
| | | if (++temp == 0) { | |
| | | if (++yy == 0) { | |
| | | ++xx; | |
| | | } | |
| | | } | |
| | | expo_y ^= 0x80000000; | |
| | | } | |
| | | /* normalize mantissa, if necessary */ | |
| | | while (!(xx & 0x00800000)) { | |
| | | xx = (xx << 1) | (yy >> 31); | |
| | | yy = (yy << 1); | |
| | | expo_x--; | |
| | | } | |
| | | } else { | |
| | | /* signs are the same, effective addition */ | |
| | | yy = yy + ww; | |
| | | s = yy < ww; | |
| | | xx = xx + zz + s; | |
| | | if (xx & 0x01000000) { | |
| | | temp = temp | (yy << 31); | |
| | | yy = (yy >> 1) | (xx << 31); | |
| | | xx = ((xx & 0x80000000) | (xx >> 1)) & ~0x40000000; | |
| | | expo_x++; | |
| | | } | |
| | | } | |
| | | } | |
| | | temp = yy | (temp != 0); | |
| | | if (expo_x <= 0xFD) { | |
| | | /* normal */ | |
| | | xx |= expo_y; /* or in sign bit */ | |
| | | xx += (temp && !expo_y); /* round result */ | |
| | | xx = xx + (expo_x << 23); /* add in exponent */ | |
| | | return __int_as_float(xx); | |
| | | } else if ((int)expo_x >= 126) { | |
| | | /* overflow */ | |
| | | xx = expo_y | (expo_y ? 0x7f7fffff : 0x7F800000); | |
| | | return __int_as_float(xx); | |
| | | } | |
| | | /* subnormal */ | |
| | | expo_x = ((unsigned int)-((int)expo_x)); | |
| | | xx += (temp && !expo_y); | |
| | | xx = (xx >> expo_x); | |
| | | if ((expo_x > 25) || (xx != 0x00800000)) xx = 0; | |
| | | return __int_as_float(expo_y | xx); | |
| | | } | |
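| | | /* Round-up again keys the increment off the sticky bits and the | |
| | | sign, (temp && !expo_y): only positive results are bumped, | |
| | | positive overflow goes to +inf, and negative overflow saturates | |
| | | at -FLT_MAX (0xff7fffff). */ | |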
| | | | |
| | | __device_func__(float __fmaf_rd (float a, float b, float c)) | |
| | | { | |
| | | unsigned long long product; | |
| | | unsigned int xx, yy, zz, ww; | |
| | | unsigned int temp, s, u; | |
| | | unsigned int expo_x, expo_y, expo_z; | |
| | | | |
| | | xx = __float_as_int(a); | |
| | | yy = __float_as_int(b); | |
| | | zz = __float_as_int(c); | |
| | | | |
| | | /* Match 'denormals are zero' behavior of the GPU */ | |
| | | if ((xx << 1) < 0x01000000) xx &= 0x80000000; | |
| | | if ((yy << 1) < 0x01000000) yy &= 0x80000000; | |
| | | if ((zz << 1) < 0x01000000) zz &= 0x80000000; | |
| | | | |
| | | temp = 0xff; | |
| | | expo_x = temp & (xx >> 23); | |
| | | expo_x = expo_x - 1; | |
| | | expo_y = temp & (yy >> 23); | |
| | | expo_y = expo_y - 1; | |
| | | expo_z = temp & (zz >> 23); | |
| | | expo_z = expo_z - 1; | |
| | | | |
| | | if (!((expo_x <= 0xFD) && | |
| | | (expo_y <= 0xFD) && | |
| | | (expo_z <= 0xFD))) { | |
| | | /* fmad (nan, y, z) --> nan | |
| | | fmad (x, nan, z) --> nan | |
| | | fmad (x, y, nan) --> nan | |
| | | */ | |
| | | if ((yy << 1) > 0xff000000) { | |
| | | return rsqrtf(b); | |
| | | } | |
| | | if ((zz << 1) > 0xff000000) { | |
| | | return rsqrtf(c); | |
| | | } | |
| | | if ((xx << 1) > 0xff000000) { | |
| | | return rsqrtf(a); | |
| | | } | |
| | | /* fmad (0, inf, z) --> NaN | |
| | | fmad (inf, 0, z) --> NaN | |
| | | fmad (-inf,+y,+inf) --> NaN | |
| | | fmad (+x,-inf,+inf) --> NaN | |
| | | fmad (+inf,-y,+inf) --> NaN | |
| | | fmad (-x,+inf,+inf) --> NaN | |
| | | fmad (-inf,-y,-inf) --> NaN | |
| | | fmad (-x,-inf,-inf) --> NaN | |
| | | fmad (+inf,+y,-inf) --> NaN | |
| | | fmad (+x,+inf,-inf) --> NaN | |
| | | */ | |
| | | if ((((xx << 1) == 0) && ((yy << 1) == 0xff000000)) || | |
| | | (((yy << 1) == 0) && ((xx << 1) == 0xff000000))) { | |
| | | return rsqrtf(__int_as_float(0xffc00000)); | |
| | | } | |
| | | if ((zz << 1) == 0xff000000) { | |
| | | if (((yy << 1) == 0xff000000) || ((xx << 1) == 0xff000000)) { | |
| | | if ((int)(xx ^ yy ^ zz) < 0) { | |
| | | return rsqrtf(__int_as_float(0xffc00000)); | |
| | | } | |
| | | } | |
| | | } | |
| | | /* fmad (inf, y, z) --> inf | |
| | | fmad (x, inf, z) --> inf | |
| | | fmad (x, y, inf) --> inf | |
| | | */ | |
| | | if ((xx << 1) == 0xff000000) { | |
| | | xx = xx ^ (yy & 0x80000000); | |
| | | return __int_as_float(xx); | |
| | | } | |
| | | if ((yy << 1) == 0xff000000) { | |
| | | yy = yy ^ (xx & 0x80000000); | |
| | | return __int_as_float(yy); | |
| | | } | |
| | | if ((zz << 1) == 0xff000000) { | |
| | | return __int_as_float(zz); | |
| | | } | |
| | | /* fmad (+0, -y, -0) --> -0 | |
| | | fmad (-0, +y, -0) --> -0 | |
| | | fmad (+x, -0, -0) --> -0 | |
| | | fmad (-x, +0, -0) --> -0 | |
| | | */ | |
| | | if (zz == 0x80000000) { | |
| | | if (((xx << 1) == 0) || ((yy << 1) == 0)) { | |
| | | if ((int)(xx ^ yy) < 0) { | |
| | | return __int_as_float(zz); | |
| | | } | |
| | | } | |
| | | } | |
| | | /* fmad (0, y, 0) --> +0 | |
| | | fmad (x, 0, 0) --> +0 | |
| | | */ | |
| | | if (((zz << 1) == 0) && | |
| | | (((xx << 1) == 0) || ((yy << 1) == 0))) { | |
| | | zz = (xx ^ yy ^ zz) & 0x80000000; | |
| | | return __int_as_float(zz); | |
| | | } | |
| | | /* fmad (0, y, z) --> z | |
| | | fmad (x, 0, z) --> z | |
| | | */ | |
| | | if (((xx << 1) == 0) || ((yy << 1) == 0)) { | |
| | | return __int_as_float(zz); | |
| | | } | |
| | | /* normalize x, if denormal */ | |
| | | if (expo_x == (unsigned)-1) { | |
| | | temp = xx & 0x80000000; | |
| | | xx = xx << 8; | |
| | | while (!(xx & 0x80000000)) { | |
| | | xx <<= 1; | |
| | | expo_x--; | |
| | | } | |
| | | expo_x++; | |
| | | xx = (xx >> 8) | temp; | |
| | | } | |
| | | /* normalize y, if denormal */ | |
| | | if (expo_y == (unsigned)-1) { | |
| | | temp = yy & 0x80000000; | |
| | | yy = yy << 8; | |
| | | while (!(yy & 0x80000000)) { | |
| | | yy <<= 1; | |
| | | expo_y--; | |
| | | } | |
| | | expo_y++; | |
| | | yy = (yy >> 8) | temp; | |
| | | } | |
| | | /* normalize z, if denormal */ | |
| | | if ((expo_z == (unsigned)-1) && ((zz << 1) != 0)) { | |
| | | temp = zz & 0x80000000; | |
| | | zz = zz << 8; | |
| | | while (!(zz & 0x80000000)) { | |
| | | zz <<= 1; | |
| | | expo_z--; | |
| | | } | |
| | | expo_z++; | |
| | | zz = (zz >> 8) | temp; | |
| | | } | |
| | | } | |
| | | | |
| | | expo_x = expo_x + expo_y; | |
| | | expo_y = xx ^ yy; | |
| | | xx = xx & 0x00ffffff; | |
| | | yy = yy << 8; | |
| | | xx = xx | 0x00800000; | |
| | | yy = yy | 0x80000000; | |
| | | | |
| | | product = ((unsigned long long)xx) * yy; | |
| | | xx = (unsigned)(product >> 32); | |
| | | yy = (unsigned)(product & 0xffffffff); | |
| | | | |
| | | expo_x = expo_x - 127 + 2; | |
| | | expo_y = expo_y & 0x80000000; | |
| | | /* normalize mantissa */ | |
| | | if (xx < 0x00800000) { | |
| | | xx = (xx << 1) | (yy >> 31); | |
| | | yy = (yy << 1); | |
| | | expo_x--; | |
| | | } | |
| | | temp = 0; | |
| | | | |
| | | if ((zz << 1) != 0) { /* z is not zero */ | |
| | | s = zz & 0x80000000; | |
| | | zz &= 0x00ffffff; | |
| | | zz |= 0x00800000; | |
| | | ww = 0; | |
| | | /* compare and swap. put augend into xx:yy */ | |
| | | if ((int)expo_z > (int)expo_x) { | |
| | | temp = expo_z; | |
| | | expo_z = expo_x; | |
| | | expo_x = temp; | |
| | | temp = zz; | |
| | | zz = xx; | |
| | | xx = temp; | |
| | | temp = ww; | |
| | | ww = yy; | |
| | | yy = temp; | |
| | | temp = expo_y; | |
| | | expo_y = s; | |
| | | s = temp; | |
| | | } | |
| | | /* augend_sign = expo_y, augend_mant = xx:yy, augend_expo = expo_x */ | |
| | | /* addend_sign = s, addend_mant = zz:ww, addend_expo = expo_z */ | |
| | | expo_z = expo_x - expo_z; | |
| | | u = expo_y ^ s; | |
| | | if (expo_z <= 49) { | |
| | | /* denormalize addend */ | |
| | | temp = 0; | |
| | | while (expo_z >= 32) { | |
| | | temp = ww | (temp != 0); | |
| | | ww = zz; | |
| | | zz = 0; | |
| | | expo_z -= 32; | |
| | | } | |
| | | if (expo_z) { | |
| | | temp = ((temp >> expo_z) | (ww << (32 - expo_z)) | | |
| | | ((temp << (32 - expo_z)) != 0)); | |
| | | ww = (ww >> expo_z) | (zz << (32 - expo_z)); | |
| | | zz = (zz >> expo_z); | |
| | | } | |
| | | | |
| | | } else { | |
| | | temp = 1; | |
| | | ww = 0; | |
| | | zz = 0; | |
| | | } | |
| | | if ((int)u < 0) { | |
| | | /* signs differ, effective subtraction */ | |
| | | temp = (unsigned)(-(int)temp); | |
| | | s = (temp != 0); | |
| | | u = yy - s; | |
| | | s = u > yy; | |
| | | yy = u - ww; | |
| | | s += yy > u; | |
| | | xx = (xx - zz) - s; | |
| | | if (!(xx | yy | temp)) { | |
| | | /* complete cancellation, return -0 */ | |
| | | return __int_as_float(0x80000000); | |
| | | } | |
| | | if ((int)xx < 0) { | |
| | | /* oops, augend had smaller mantissa. Negate mantissa and flip | |
| | | sign of result */ | |
| | | temp = ~temp; | |
| | | yy = ~yy; | |
| | | xx = ~xx; | |
| | | if (++temp == 0) { | |
| | | if (++yy == 0) { | |
| | | ++xx; | |
| | | } | |
| | | } | |
| | | expo_y ^= 0x80000000; | |
| | | } | |
| | | /* normalize mantissa, if necessary */ | |
| | | while (!(xx & 0x00800000)) { | |
| | | xx = (xx << 1) | (yy >> 31); | |
| | | yy = (yy << 1); | |
| | | expo_x--; | |
| | | } | |
| | | } else { | |
| | | /* signs are the same, effective addition */ | |
| | | yy = yy + ww; | |
| | | s = yy < ww; | |
| | | xx = xx + zz + s; | |
| | | if (xx & 0x01000000) { | |
| | | temp = temp | (yy << 31); | |
| | | yy = (yy >> 1) | (xx << 31); | |
| | | xx = ((xx & 0x80000000) | (xx >> 1)) & ~0x40000000; | |
| | | expo_x++; | |
| | | } | |
| | | } | |
| | | } | |
| | | temp = yy | (temp != 0); | |
| | | if (expo_x <= 0xFD) { | |
| | | /* normal */ | |
| | | xx |= expo_y; /* or in sign bit */ | |
| | | xx += (temp && expo_y); /* round result */ | |
| | | xx = xx + (expo_x << 23); /* add in exponent */ | |
| | | return __int_as_float(xx); | |
| | | } else if ((int)expo_x >= 126) { | |
| | | /* overflow */ | |
| | | xx = expo_y | (expo_y ? 0x7f800000 : 0x7F7FFFFF); | |
| | | return __int_as_float(xx); | |
| | | } | |
| | | /* subnormal */ | |
| | | expo_x = ((unsigned int)-((int)expo_x)); | |
| | | xx += (temp && expo_y); | |
| | | xx = (xx >> expo_x); | |
| | | if ((expo_x > 25) || (xx != 0x00800000)) xx = 0; | |
| | | return __int_as_float(expo_y | xx); | |
| | | } | |
| | | | |
| | | #else /* defined(__CUDABE__) */ | |
| | | #include "common_types.h" | |
| | | | |
| | | static __device__ const unsigned char __internal_rcpTab[128] = | |
| | | { | |
| | | 0xff, 0xfd, 0xfb, 0xf9, 0xf7, 0xf5, 0xf4, 0xf2, | |
| | | 0xf0, 0xee, 0xed, 0xeb, 0xe9, 0xe8, 0xe6, 0xe4, | |
| | | 0xe3, 0xe1, 0xe0, 0xde, 0xdd, 0xdb, 0xda, 0xd8, | |
| | | 0xd7, 0xd5, 0xd4, 0xd3, 0xd1, 0xd0, 0xcf, 0xcd, | |
| | | 0xcc, 0xcb, 0xca, 0xc8, 0xc7, 0xc6, 0xc5, 0xc4, | |
| | | 0xc2, 0xc1, 0xc0, 0xbf, 0xbe, 0xbd, 0xbc, 0xbb, | |
| | | 0xba, 0xb9, 0xb8, 0xb7, 0xb6, 0xb5, 0xb4, 0xb3, | |
| | | 0xb2, 0xb1, 0xb0, 0xaf, 0xae, 0xad, 0xac, 0xab, | |
| | | 0xaa, 0xa9, 0xa8, 0xa8, 0xa7, 0xa6, 0xa5, 0xa4, | |
| | | 0xa3, 0xa3, 0xa2, 0xa1, 0xa0, 0x9f, 0x9f, 0x9e, | |
| | | 0x9d, 0x9c, 0x9c, 0x9b, 0x9a, 0x99, 0x99, 0x98, | |
| | | 0x97, 0x97, 0x96, 0x95, 0x95, 0x94, 0x93, 0x93, | |
| | | 0x92, 0x91, 0x91, 0x90, 0x8f, 0x8f, 0x8e, 0x8e, | |
| | | 0x8d, 0x8c, 0x8c, 0x8b, 0x8b, 0x8a, 0x89, 0x89, | |
| | | 0x88, 0x88, 0x87, 0x87, 0x86, 0x85, 0x85, 0x84, | |
| | | 0x84, 0x83, 0x83, 0x82, 0x82, 0x81, 0x81, 0x80 | |
| | | }; | |
| | | | |
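| | | /* An illustrative host-side sketch (not part of the original header): | |
| | |    each entry above appears to be the 8-bit reciprocal of the midpoint | |
| | |    of the i-th mantissa interval [1 + i/128, 1 + (i+1)/128), i.e. | |
| | |    round(2^16 / (257 + 2*i)); the program below regenerates the table | |
| | |    under that assumption. */ | |
| | | #include <stdio.h> | |
| | | int main(void) | |
| | | { | |
| | |   int i; | |
| | |   for (i = 0; i < 128; i++) { | |
| | |     unsigned int entry = (131072u / (257u + 2u * (unsigned int)i) + 1u) / 2u; | |
| | |     printf("0x%02x%s", entry, (i % 8 == 7) ? ",\n" : ", "); | |
| | |   } | |
| | |   return 0; | |
| | | } | |
| | | | |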
| | | static __device__ const unsigned int __internal_invSqrtCubeTab[96] = | |
| | | { | |
| | | 0xfa0bf8fe, 0xee6b28fa, 0xe5f024f7, 0xdaf268f3, | |
| | | 0xd2f000f0, 0xc890c0ec, 0xc10378e9, 0xb9a758e6, | |
| | | 0xb4da40e4, 0xadcea0e1, 0xa6f278de, 0xa279c0dc, | |
| | | 0x9beb48d9, 0x97a5c4d7, 0x916340d4, 0x8d4fc8d2, | |
| | | 0x895000d0, 0x8563b8ce, 0x818ac0cc, 0x7dc4e8ca, | |
| | | 0x7a1200c8, 0x7671d8c6, 0x72e440c4, 0x6f6908c2, | |
| | | 0x6db240c1, 0x6a523cbf, 0x670424bd, 0x6563c0bc, | |
| | | 0x623028ba, 0x609ce8b9, 0x5d8364b7, 0x5bfd18b6, | |
| | | 0x58fd40b4, 0x5783a8b3, 0x560e48b2, 0x533000b0, | |
| | | 0x51c70caf, 0x506238ae, 0x4da4c0ac, 0x4c4c10ab, | |
| | | 0x4af768aa, 0x49a6b8a9, 0x485a00a8, 0x471134a7, | |
| | | 0x45cc58a6, 0x434e40a4, 0x4214f8a3, 0x40df88a2, | |
| | | 0x3fade0a1, 0x3e8000a0, 0x3d55dc9f, 0x3c2f789e, | |
| | | 0x3c2f789e, 0x3b0cc49d, 0x39edc09c, 0x38d2609b, | |
| | | 0x37baa89a, 0x36a68899, 0x35960098, 0x34890497, | |
| | | 0x34890497, 0x337f9896, 0x3279ac95, 0x31774094, | |
| | | 0x30784893, 0x30784893, 0x2f7cc892, 0x2e84b091, | |
| | | 0x2d900090, 0x2d900090, 0x2c9eac8f, 0x2bb0b88e, | |
| | | 0x2bb0b88e, 0x2ac6148d, 0x29dec08c, 0x29dec08c, | |
| | | 0x28fab08b, 0x2819e88a, 0x2819e88a, 0x273c5889, | |
| | | 0x273c5889, 0x26620088, 0x258ad487, 0x258ad487, | |
| | | 0x24b6d886, 0x24b6d886, 0x23e5fc85, 0x23184084, | |
| | | 0x23184084, 0x224d9883, 0x224d9883, 0x21860882, | |
| | | 0x21860882, 0x20c18081, 0x20c18081, 0x20000080 | |
| | | }; | |
| | | | |
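| | | /* The packing above appears to serve the first Newton-Raphson step in | |
| | |    __internal_fsqrt_kernel below: the low byte holds an 8-bit seed | |
| | |    y0 ~ 1/sqrt(m) (with bits 8-9 kept zero), and the high bits hold a | |
| | |    scaled y0^3, so ((entry*3) << 22) reduces to (3*y0) << 22 while the | |
| | |    single multiply x*entry supplies the x*y0^3 term of the update | |
| | |    y1 = (3*y0 - x*y0^3) / 2. */ | |
| | | | |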
| | | __device_func__(float __internal_frcp_kernel (float x, enum cudaRoundMode mode)) | |
| | | { | |
| | | unsigned long long prod; | |
| | | volatile union __cudart_FloatUintCvt arg; | |
| | | unsigned int expo; | |
| | | unsigned int sign; | |
| | | unsigned f, y; | |
| | | | |
| | | arg.f = x; | |
| | | sign = arg.i & 0x80000000; | |
| | | expo = (arg.i >> 23); | |
| | | expo = expo & 0xff; | |
| | | f = expo - 1; | |
| | | | |
| | | if (f <= 0xFD) { | |
| | | y = (arg.i << 8); | |
| | | y = y | 0x80000000; | |
| | | /* initial approximation */ | |
| | | arg.i = __internal_rcpTab[(y >> 24) - 128]; | |
| | | /* first NR iteration */ | |
| | | f = arg.i * arg.i; | |
| | | f = f << 16; | |
| | | prod = ((unsigned long long)y) * f; | |
| | | arg.i = (arg.i << 24) - (unsigned)(prod >> 32); | |
| | | /* second NR iteration */ | |
| | | f = arg.i + arg.i; | |
| | | prod = ((unsigned long long)y) * f; | |
| | | f = (unsigned)(-(int)(prod >> 32)); | |
| | | prod = ((unsigned long long)arg.i) * f; | |
| | | y = y >> 8; | |
| | | /* compute exponent */ | |
| | | expo = (2 * 127) - expo - 2; | |
| | | arg.i = (unsigned)(prod >> 32); | |
| | | if (mode == cudaRoundNearest) { | |
| | | arg.i = arg.i >> 6; | |
| | | } else { | |
| | | arg.i = (arg.i + 32) >> 6; | |
| | | } | |
| | | if ((int)expo >= 0) { | |
| | | f = y * arg.i; | |
| | | arg.i = ((expo << 23) + arg.i) | sign; | |
| | | } else { | |
| | | /* result is a denormal */ | |
| | | expo = -(int)expo; | |
| | | arg.i = arg.i >> expo; | |
| | | f = y * arg.i; | |
| | | arg.i = arg.i | sign; | |
| | | } | |
| | | if (mode == cudaRoundNearest) { | |
| | | expo = f + y; | |
| | | if ((int)f < 0) f = (unsigned)(-(int)f); | |
| | | if ((int)expo < 0) expo = (unsigned)(-(int)expo); | |
| | | if (expo < f) arg.i++; | |
| | | } else if (mode == cudaRoundZero) { | |
| | | if ((int)f > 0) arg.i = arg.i - 1; | |
| | | } else if (mode == cudaRoundPosInf) { | |
| | | if (((int)f > 0) && sign) arg.i = arg.i - 1; | |
| | | if (((int)f < 0) && !sign) arg.i = arg.i + 1; | |
| | | } else { /* mode == cudaRoundMinInf */ | |
| | | if (((int)f > 0) && !sign) arg.i = arg.i - 1; | |
| | | if (((int)f < 0) && sign) arg.i = arg.i + 1; | |
| | | } | |
| | | return arg.f; | |
| | | } else { | |
| | | /* zero returns infinity. Must handle negative zero as well */ | |
| | | if (!(arg.i << 1)) { | |
| | | arg.i = 0x7F800000 | arg.i; | |
| | | return arg.f; | |
| | | } | |
| | | /* infinity returns zero of like sign */ | |
| | | if ((arg.i << 1) == 0xff000000) { | |
| | | arg.i &= 0x80000000; | |
| | | return arg.f; | |
| | | } | |
| | | /* convert SNaNs to QNaNs */ | |
| | | if ((arg.i << 1) > 0xff000000) { | |
| | | arg.i |= 0x00400000; | |
| | | return arg.f; | |
| | | } | |
| | | /* denormals */ | |
| | | f = 0; | |
| | | arg.i <<= 8; | |
| | | do { | |
| | | f++; | |
| | | arg.i <<= 1; | |
| | | } while ((int)arg.i > 0); | |
| | | arg.i >>= 8; | |
| | | arg.i |= sign; | |
| | | arg.f = __internal_frcp_kernel (arg.f, mode); | |
| | | expo = ((arg.i << 1) >> 24); | |
| | | if ((expo + f) < 255) { | |
| | | arg.i = (arg.i + (f << 23)); | |
| | | return arg.f; | |
| | | } | |
| | | if (mode == cudaRoundNearest) { | |
| | | arg.i = (arg.i & 0x80000000) | 0x7f800000; | |
| | | } else if (mode == cudaRoundZero) { | |
| | | arg.i = (arg.i & 0x80000000) | 0x7f7fffff; | |
| | | } else if (mode == cudaRoundPosInf) { | |
| | | arg.i = (arg.i & 0x80000000) | ((sign) ? 0x7f7fffff : 0x7f800000); | |
| | | } else { /* mode == cudaRoundMinInf */ | |
| | | arg.i = (arg.i & 0x80000000) | ((sign) ? 0x7f800000 : 0x7f7fffff); | |
| | | } | |
| | | return arg.f; | |
| | | } | |
| | | } | |
| | | | |
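| | | /* A minimal usage sketch (hypothetical helper; assumes this emulation | |
| | |    path is compiled for the host): 1/3 is inexact in binary, so the | |
| | |    four rounding modes split into two results one ulp apart, with | |
| | |    rd <= 1/3 <= ru. */ | |
| | | static void __example_frcp_modes(void) | |
| | | { | |
| | |   float rn = __internal_frcp_kernel(3.0f, cudaRoundNearest); /* 0x3eaaaaab */ | |
| | |   float rz = __internal_frcp_kernel(3.0f, cudaRoundZero);    /* 0x3eaaaaaa */ | |
| | |   float rd = __internal_frcp_kernel(3.0f, cudaRoundMinInf);  /* 0x3eaaaaaa */ | |
| | |   float ru = __internal_frcp_kernel(3.0f, cudaRoundPosInf);  /* 0x3eaaaaab */ | |
| | |   (void)rn; (void)rz; (void)rd; (void)ru; | |
| | | } | |
| | | | |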
| | | __device_func__(float __internal_fsqrt_kernel (float radicand, | |
| | | enum cudaRoundMode mode)) | |
| | | { | |
| | | unsigned long long prod; | |
| | | volatile union __cudart_FloatUintCvt arg; | |
| | | unsigned int expo; | |
| | | unsigned int s, f, x; | |
| | | | |
| | | arg.f = radicand; | |
| | | expo = arg.i >> 23; | |
| | | expo = expo & 0xff; | |
| | | f = expo - 1; | |
| | | | |
| | | if ((arg.i <= 0x80000000) && (f <= 0xFD)) { | |
| | | /* normalize input argument */ | |
| | | x = (arg.i << 8) | 0x80000000; | |
| | | x = x >> (expo & 1); | |
| | | /* initial approximation */ | |
| | | arg.i = f = __internal_invSqrtCubeTab[((unsigned)x >> 25) - 32]; | |
| | | /* first NR iteration */ | |
| | | prod = ((unsigned long long)x) * f; | |
| | | arg.i = ((arg.i * 3) << 22) - (unsigned)(prod >> 32); | |
| | | /* second NR iteration */ | |
| | | prod = ((unsigned long long)arg.i) * arg.i; | |
| | | s = (unsigned)(prod >> 32); | |
| | | prod = ((unsigned long long)x) * s; | |
| | | f = 0x30000000 - (unsigned)(prod >> 32); | |
| | | prod = ((unsigned long long)f) * arg.i; | |
| | | arg.i = (unsigned)(prod >> 32); | |
| | | /* compute sqrt(x) as x * 1/sqrt(x) */ | |
| | | prod = ((unsigned long long)x) * arg.i; | |
| | | arg.i = (unsigned)(prod >> 32); | |
| | | if (mode == cudaRoundNearest) { | |
| | | arg.i = arg.i >> 3; | |
| | | } else { | |
| | | arg.i = (arg.i + 4) >> 3; | |
| | | } | |
| | | x = (x << 16) - (arg.i * arg.i); | |
| | | /* round to nearest based on remainder; tie case impossible */ | |
| | | if (mode == cudaRoundNearest) { | |
| | | f = x - (2 * arg.i + 1); | |
| | | if ((int)f < 0) f = (unsigned)(-(int)f); | |
| | | if ((int)x < 0) x = (unsigned)(-(int)x); | |
| | | if (f < x) arg.i ++; | |
| | | } else if ((mode == cudaRoundZero) || (mode == cudaRoundMinInf)) { | |
| | | if ((int)x < 0) arg.i--; | |
| | | } else if (mode == cudaRoundPosInf) { | |
| | | if ((int)x > 0) arg.i++; | |
| | | } | |
| | | arg.i = arg.i + (((expo + 125) & ~0x1) << 22); | |
| | | return arg.f; | |
| | | } else { | |
| | | /* if zero, or positive infinity, return argument */ | |
| | | if (!(arg.i << 1) || (arg.i == 0x7F800000)) { | |
| | | return arg.f; | |
| | | } | |
| | | /* if NaN, return argument, possibly converted to QNaN */ | |
| | | if ((arg.i << 1) > 0xFF000000) { | |
| | | arg.i |= 0x00400000; | |
| | | return arg.f; | |
| | | } | |
| | | /* if negative, return NaN: INDEFINITE */ | |
| | | if (arg.i & 0x80000000) { | |
| | | arg.i = 0xFFC00000; | |
| | | return arg.f; | |
| | | } | |
| | | /* denormal, normalize it before computing square root */ | |
| | | x = 0; | |
| | | arg.i <<= 8; | |
| | | do { | |
| | | x++; | |
| | | arg.i <<= 1; | |
| | | } while ((int)arg.i > 0); | |
| | | arg.i >>= 8; | |
| | | arg.i += (x & 1) << 23; | |
| | | x += (x & 1); | |
| | | arg.f = __internal_fsqrt_kernel (arg.f, mode); | |
| | | arg.i -= ((x >> 1) << 23); | |
| | | return arg.f; | |
| | | } | |
| | | } | |
| | | | |
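| | | /* A minimal usage sketch (hypothetical helper): sqrt(2) falls strictly | |
| | |    between two floats, closer to the lower one, so only round-up lands | |
| | |    one ulp higher. */ | |
| | | static void __example_fsqrt_modes(void) | |
| | | { | |
| | |   float rn = __internal_fsqrt_kernel(2.0f, cudaRoundNearest); /* 0x3fb504f3 */ | |
| | |   float ru = __internal_fsqrt_kernel(2.0f, cudaRoundPosInf);  /* 0x3fb504f4 */ | |
| | |   (void)rn; (void)ru; | |
| | | } | |
| | | | |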
| | | __device_func__(float __internal_fdiv_kernel (float dividend, float divisor, | |
| | | enum cudaRoundMode mode)) | |
| | | { | |
| | | unsigned long long prod; | |
| | | unsigned r, f, x, y, expox, expoy, sign; | |
| | | volatile union __cudart_FloatUintCvt cvtx, cvty, res; | |
| | | | |
| | | cvtx.f = dividend; | |
| | | cvty.f = divisor; | |
| | | expox = ((cvtx.i >> 23) & 0xff) - 1; | |
| | | expoy = ((cvty.i >> 23) & 0xff) - 1; | |
| | | sign = ((cvtx.i ^ cvty.i) & 0x80000000); | |
| | | | |
| | | if ((expox <= 0xFD) && (expoy <= 0xFD)) { | |
| | | divide: | |
| | | expox = expox - expoy + 127 - 1; | |
| | | expoy = expox; | |
| | | /* extract mantissas */ | |
| | | y = (cvty.i << 8) | 0x80000000; | |
| | | x = (cvtx.i & 0x00ffffff) | 0x00800000; | |
| | | /* initial approximation */ | |
| | | r = __internal_rcpTab[(y >> 24) - 128]; | |
| | | /* first NR iteration */ | |
| | | f = r * r; | |
| | | prod = ((unsigned long long)y) * (f << 16); | |
| | | r = (r << 24) - (unsigned)(prod >> 32); | |
| | | /* second NR iteration */ | |
| | | prod = ((unsigned long long)y) * (r << 1); | |
| | | f = (unsigned)-(int)(prod >> 32); | |
| | | prod = ((unsigned long long)f) * (r << 1); | |
| | | r = (unsigned)(prod >> 32); | |
| | | /* produce quotient */ | |
| | | prod = ((unsigned long long)x) * (r << 1); | |
| | | /* normalize mantissa */ | |
| | | if (((int)((prod >> 32) << 8)) > 0) { | |
| | | expox--; | |
| | | prod = prod + prod; | |
| | | } | |
| | | if (mode == cudaRoundNearest) { | |
| | | /* preliminary mantissa */ | |
| | | r = (unsigned)(prod >> 32); | |
| | | y = y >> 8; | |
| | | /* result is a normal */ | |
| | | if (expox <= 0xFD) { | |
| | | int rem0, rem1, inc; | |
| | | /* round mantissa to nearest even */ | |
| | | prod = ((unsigned long long)y) * r; | |
| | | x = x << (23 + ((prod >> 32) >> 15)); | |
| | | rem1 = x - (unsigned)(prod & 0xffffffff); | |
| | | rem0 = rem1 - y; | |
| | | inc = abs(rem0) < abs(rem1); | |
| | | /* merge sign, mantissa, exponent for final result */ | |
| | | res.i = sign | ((expox << 23) + r + inc); | |
| | | return res.f; | |
| | | } else if ((int)expox >= 254) { | |
| | | /* overflow: return infinity */ | |
| | | res.i = sign | 0x7f800000; | |
| | | return res.f; | |
| | | } else { | |
| | | /* underflow: result is zero, denormal, or smallest normal */ | |
| | | int shift = -(int)expox; | |
| | | if (shift > 23) { | |
| | | /* result is zero or smallest denormal */ | |
| | | r = (shift < 25) && ((x != y) || (r > 0x00ff0000)); | |
| | | res.i = sign | r; | |
| | | return res.f; | |
| | | } | |
| | | if (x == y) { | |
| | | /* result is denormal */ | |
| | | shift = -(int)expoy; | |
| | | r = 0x00800000 >> shift; | |
| | | res.i = sign | r; | |
| | | return res.f; | |
| | | } | |
| | | { | |
| | | unsigned long long tempx; | |
| | | long long remlo, remhi; | |
| | | /* result is denormal or smallest normal */ | |
| | | r = r >> shift; | |
| | | prod = ((unsigned long long)y) * r; | |
| | | tempx = ((unsigned long long)x) << (23 - shift); | |
| | | remlo = 2 * tempx - 2 * prod - y; | |
| | | remhi = remlo + 2 * tempx; | |
| | | if (remlo < 0) remlo = -remlo; | |
| | | if (remhi < 0) remhi = -remhi; | |
| | | if (remhi < remlo) tempx = 2 * tempx; | |
| | | remlo = tempx - prod; | |
| | | remhi = remlo - y; | |
| | | if (remlo < 0) remlo = -remlo; | |
| | | if (remhi < 0) remhi = -remhi; | |
| | | if ((remhi < remlo) || ((remhi == remlo) && (r & 1))) r++; | |
| | | res.i = sign | r; | |
| | | return res.f; | |
| | | } | |
| | | } | |
| | | } else if (mode == cudaRoundZero) { | |
| | | /* preliminary mantissa */ | |
| | | prod += 0x0000000080000000ULL; | |
| | | r = (unsigned)(prod >> 32); | |
| | | y = y >> 8; | |
| | | /* result is a normal */ | |
| | | if (expox <= 0xFD) { | |
| | | int rem1; | |
| | | prod = ((unsigned long long)y) * r; | |
| | | x = x << (23 + ((prod >> 32) >> 15)); | |
| | | rem1 = x - (unsigned)(prod & 0xffffffff); | |
| | | if (rem1 < 0) r--; | |
| | | r = (expox << 23) + r; | |
| | | if (r == 0x7f800000) r = 0x7f7fffff; | |
| | | res.i = sign | r; | |
| | | return res.f; | |
| | | } else if ((int)expox >= 254) { | |
| | | /* overflow: return largest normal */ | |
| | | res.i = sign | 0x7f7fffff; | |
| | | return res.f; | |
| | | } else { | |
| | | /* underflow: result is zero, denormal, or smallest normal */ | |
| | | int shift = -(int)expox; | |
| | | if ((x == y) && (shift < 31)) { | |
| | | shift = -(int)expoy; | |
| | | r = 0x00800000 >> shift; | |
| | | res.i = sign | r; | |
| | | return res.f; | |
| | | } | |
| | | if (shift > 23) { | |
| | | r = 0; | |
| | | res.i = sign | r; | |
| | | return res.f; | |
| | | } | |
| | | { | |
| | | unsigned long long tempx; | |
| | | long long remlo, remhi; | |
| | | /* result is denormal or smallest normal */ | |
| | | r = r >> shift; | |
| | | prod = ((unsigned long long)y) * r; | |
| | | tempx = ((unsigned long long)x) << (23 - shift); | |
| | | remlo = 2 * tempx - 2 * prod - y; | |
| | | remhi = remlo + 2 * tempx; | |
| | | if (remlo < 0) remlo = -remlo; | |
| | | if (remhi < 0) remhi = -remhi; | |
| | | if (remhi < remlo) tempx = 2 * tempx; | |
| | | remlo = tempx - prod; | |
| | | if ((remlo < 0) && (r != 0)) r--; | |
| | | res.i = sign | r; | |
| | | return res.f; | |
| | | } | |
| | | } | |
| | | } else if (mode == cudaRoundPosInf) { | |
| | | /* preliminary mantissa */ | |
| | | prod += 0x0000000080000000ULL; | |
| | | r = (unsigned)(prod >> 32); | |
| | | y = y >> 8; | |
| | | /* result is a normal */ | |
| | | if (expox <= 0xFD) { | |
| | | int rem1; | |
| | | prod = ((unsigned long long)y) * r; | |
| | | x = x << (23 + ((prod >> 32) >> 15)); | |
| | | rem1 = x - (unsigned)(prod & 0xffffffff); | |
| | | if ((rem1 < 0) && (sign)) r--; | |
| | | if ((rem1 > 0) && (!sign)) r++; | |
| | | r = (expox << 23) + r; | |
| | | if ((r == 0x7f800000) && (sign)) r = 0x7f7fffff; | |
| | | res.i = sign | r; | |
| | | return res.f; | |
| | | } else if ((int)expox >= 254) { | |
| | | /* overflow: return largest normal, or infinity */ | |
| | | r = sign ? 0x7f7fffff : 0x7f800000; | |
| | | res.i = sign | r; | |
| | | return res.f; | |
| | | } else { | |
| | | /* underflow: result is zero, denormal, or smallest normal */ | |
| | | int shift = -(int)expox; | |
| | | if ((x == y) && (shift <= 24)) { | |
| | | shift = -(int)expoy; | |
| | | r = 0x00800000 >> shift; | |
| | | if (r == 0) r = !sign; | |
| | | res.i = sign | r; | |
| | | return res.f; | |
| | | } | |
| | | if (shift > 23) { | |
| | | r = !sign; | |
| | | res.i = sign | r; | |
| | | return res.f; | |
| | | } | |
| | | { | |
| | | unsigned long long tempx; | |
| | | long long remlo, remhi; | |
| | | /* result is denormal or smallest normal */ | |
| | | r = r >> shift; | |
| | | prod = ((unsigned long long)y) * r; | |
| | | tempx = ((unsigned long long)x) << (23 - shift); | |
| | | remlo = 2 * tempx - 2 * prod - y; | |
| | | remhi = remlo + 2 * tempx; | |
| | | if (remlo < 0) remlo = -remlo; | |
| | | if (remhi < 0) remhi = -remhi; | |
| | | if (remhi < remlo) tempx = 2 * tempx; | |
| | | remlo = tempx - prod; | |
| | | if ((remlo < 0) && (r != 0) && (sign)) r--; | |
| | | if ((remlo > 0) && (!sign)) r++; | |
| | | res.i = sign | r; | |
| | | return res.f; | |
| | | } | |
| | | } | |
| | | } else if (mode == cudaRoundMinInf) { | |
| | | /* preliminary mantissa */ | |
| | | prod += 0x0000000080000000ULL; | |
| | | r = (unsigned)(prod >> 32); | |
| | | y = y >> 8; | |
| | | /* result is a normal */ | |
| | | if (expox <= 0xFD) { | |
| | | int rem1; | |
| | | prod = ((unsigned long long)y) * r; | |
| | | x = x << (23 + ((prod >> 32) >> 15)); | |
| | | rem1 = x - (unsigned)(prod & 0xffffffff); | |
| | | if ((rem1 < 0) && (!sign)) r--; | |
| | | if ((rem1 > 0) && (sign)) r++; | |
| | | r = (expox << 23) + r; | |
| | | if ((r == 0x7f800000) && (!sign)) r = 0x7f7fffff; | |
| | | res.i = sign | r; | |
| | | return res.f; | |
| | | } else if ((int)expox >= 254) { | |
| | | /* overflow: return largest normal, or infinity */ | |
| | | r = sign ? 0x7f800000 : 0x7f7fffff; | |
| | | res.i = sign | r; | |
| | | return res.f; | |
| | | } else { | |
| | | /* underflow: result is zero, denormal, or smallest normal */ | |
| | | int shift = -(int)expox; | |
| | | if ((x == y) && (shift <= 24)) { | |
| | | shift = -(int)expoy; | |
| | | r = 0x00800000 >> shift; | |
| | | if (r == 0) r = !!sign; | |
| | | res.i = sign | r; | |
| | | return res.f; | |
| | | } | |
| | | if (shift > 23) { | |
| | | r = !!sign; | |
| | | res.i = sign | r; | |
| | | return res.f; | |
| | | } | |
| | | { | |
| | | unsigned long long tempx; | |
| | | long long remlo, remhi; | |
| | | /* result is denormal or smallest normal */ | |
| | | r = r >> shift; | |
| | | prod = ((unsigned long long)y) * r; | |
| | | tempx = ((unsigned long long)x) << (23 - shift); | |
| | | remlo = 2 * tempx - 2 * prod - y; | |
| | | remhi = remlo + 2 * tempx; | |
| | | if (remlo < 0) remlo = -remlo; | |
| | | if (remhi < 0) remhi = -remhi; | |
| | | if (remhi < remlo) tempx = 2 * tempx; | |
| | | remlo = tempx - prod; | |
| | | if ((remlo < 0) && (r != 0) && (!sign)) r--; | |
| | | if ((remlo > 0) && (sign)) r++; | |
| | | res.i = sign | r; | |
| | | return res.f; | |
| | | } | |
| | | } | |
| | | } | |
| | | } | |
| | | { | |
| | | int xzero, yzero, xinf, yinf, xnan, ynan; | |
| | | | |
| | | xnan = (cvtx.i << 1) > 0xff000000; | |
| | | ynan = (cvty.i << 1) > 0xff000000; | |
| | | /* handle NaNs. Convert SNaNs to QNaNs */ | |
| | | if (xnan) { | |
| | | res.i = cvtx.i | 0x00400000; | |
| | | return res.f; | |
| | | } | |
| | | if (ynan) { | |
| | | res.i = cvty.i | 0x00400000; | |
| | | return res.f; | |
| | | } | |
| | | xzero = (cvtx.i << 1) == 0x00000000; | |
| | | yzero = (cvty.i << 1) == 0x00000000; | |
| | | xinf = (cvtx.i << 1) == 0xff000000; | |
| | | yinf = (cvty.i << 1) == 0xff000000; | |
| | | /* 0/0 and INF/INF are invalid operations. Return INDEFINITE */ | |
| | | if ((xzero & yzero) | (xinf & yinf)) { | |
| | | res.i = 0xffc00000; | |
| | | return res.f; | |
| | | } | |
| | | /* x/INF and 0/y -> 0 */ | |
| | | if (xzero | yinf) { | |
| | | res.i = sign; | |
| | | return res.f; | |
| | | } | |
| | | /* x/0 and INF/y -> INF */ | |
| | | if (yzero | xinf) { | |
| | | res.i = sign | 0x7f800000; | |
| | | return res.f; | |
| | | } | |
| | | /* normalize denormals */ | |
| | | if ((int)expox < 0) { | |
| | | cvtx.i = cvtx.i << 9; | |
| | | while ((int)cvtx.i >= 0) { | |
| | | expox--; | |
| | | cvtx.i = cvtx.i + cvtx.i; | |
| | | } | |
| | | cvtx.i = cvtx.i >> 8; | |
| | | } | |
| | | if ((int)expoy < 0) { | |
| | | cvty.i = cvty.i << 9; | |
| | | while ((int)cvty.i >= 0) { | |
| | | expoy--; | |
| | | cvty.i = cvty.i + cvty.i; | |
| | | } | |
| | | cvty.i = cvty.i >> 8; | |
| | | } | |
| | | goto divide; | |
| | | } | |
| | | } | |
| | | | |
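| | | /* A minimal usage sketch (hypothetical helper): 1/10 has no exact | |
| | |    binary representation; nearest rounds up, truncation rounds down. */ | |
| | | static void __example_fdiv_modes(void) | |
| | | { | |
| | |   float rn = __internal_fdiv_kernel(1.0f, 10.0f, cudaRoundNearest); /* 0x3dcccccd */ | |
| | |   float rz = __internal_fdiv_kernel(1.0f, 10.0f, cudaRoundZero);    /* 0x3dcccccc */ | |
| | |   (void)rn; (void)rz; | |
| | | } | |
| | | | |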
| | | __device_func__(float __internal_fmul_kernel2 (float a, float b, | |
| | | enum cudaRoundMode mode)) | |
| | | { | |
| | | unsigned long long product; | |
| | | volatile union __cudart_FloatUintCvt xx, yy; | |
| | | | |
| | | unsigned expo_x, expo_y; | |
| | | | |
| | | xx.f = a; | |
| | | yy.f = b; | |
| | | | |
| | | expo_y = 0xFF; | |
| | | expo_x = expo_y & (xx.i >> 23); | |
| | | expo_x = expo_x - 1; | |
| | | expo_y = expo_y & (yy.i >> 23); | |
| | | expo_y = expo_y - 1; | |
| | | | |
| | | if ((expo_x <= 0xFD) && | |
| | | (expo_y <= 0xFD)) { | |
| | | multiply: | |
| | | expo_x = expo_x + expo_y; | |
| | | expo_y = xx.i ^ yy.i; | |
| | | xx.i = xx.i & 0x00ffffff; | |
| | | yy.i = yy.i << 8; | |
| | | xx.i = xx.i | 0x00800000; | |
| | | yy.i = yy.i | 0x80000000; | |
| | | /* compute product */ | |
| | | product = ((unsigned long long)xx.i) * yy.i; | |
| | | expo_x = expo_x - 127 + 2; | |
| | | expo_y = expo_y & 0x80000000; | |
| | | xx.i = (unsigned int)(product >> 32); | |
| | | yy.i = (unsigned int)(product & 0xffffffff); | |
| | | /* normalize mantissa */ | |
| | | if (xx.i < 0x00800000) { | |
| | | xx.i = (xx.i << 1) | (yy.i >> 31); | |
| | | yy.i = (yy.i << 1); | |
| | | expo_x--; | |
| | | } | |
| | | if (expo_x <= 0xFD) { | |
| | | xx.i = xx.i | expo_y; /* OR in sign bit */ | |
| | | xx.i = xx.i + (expo_x << 23); /* add in exponent */ | |
| | | /* round result to nearest or even */ | |
| | | if (mode == cudaRoundNearest) { | |
| | | if (yy.i < 0x80000000) return xx.f; | |
| | | xx.i += ((yy.i == 0x80000000) ? (xx.i & 1) : (yy.i >> 31)); | |
| | | } else if (mode == cudaRoundZero) { | |
| | | } else if (mode == cudaRoundPosInf) { | |
| | | xx.i += (yy.i && !expo_y); | |
| | | } else if (mode == cudaRoundMinInf) { | |
| | | xx.i += (yy.i && expo_y); | |
| | | } | |
| | | return xx.f; | |
| | | } else if ((int)expo_x >= 254) { | |
| | | /* overflow: return infinity or largest normal */ | |
| | | if (mode == cudaRoundNearest) { | |
| | | xx.i = expo_y | 0x7F800000; | |
| | | } else if (mode == cudaRoundZero) { | |
| | | xx.i = expo_y | 0x7F7FFFFF; | |
| | | } else if (mode == cudaRoundPosInf) { | |
| | | xx.i = (expo_y ? 0xff7fffff : 0x7F800000); | |
| | | } else { /* (mode == cudaRoundMinInf) */ | |
| | | xx.i = (expo_y ? 0xFF800000 : 0x7f7fffff); | |
| | | } | |
| | | return xx.f; | |
| | | } else { | |
| | | /* zero, denormal, or smallest normal */ | |
| | | expo_x = ((unsigned int)-((int)expo_x)); | |
| | | if (mode == cudaRoundNearest) { | |
| | | if (expo_x > 25) { | |
| | | /* massive underflow: return 0 */ | |
| | | xx.i = expo_y; | |
| | | return xx.f; | |
| | | } else { | |
| | | yy.i = (xx.i << (32 - expo_x)) | ((yy.i) ? 1 : 0); | |
| | | xx.i = expo_y + (xx.i >> expo_x); | |
| | | xx.i += ((yy.i == 0x80000000) ? (xx.i & 1) : (yy.i >> 31)); | |
| | | return xx.f; | |
| | | } | |
| | | } else if (mode == cudaRoundZero) { | |
| | | if (expo_x > 25) expo_x = 25; | |
| | | xx.i = expo_y + (xx.i >> expo_x); | |
| | | return xx.f; | |
| | | } else if (mode == cudaRoundPosInf) { | |
| | | if (expo_x > 25) expo_x = 25; | |
| | | yy.i = (xx.i << (32 - expo_x)) | ((yy.i) ? 1 : 0); | |
| | | xx.i = expo_y + (xx.i >> expo_x); | |
| | | xx.i += (yy.i && !expo_y); | |
| | | return xx.f; | |
| | | } else { /* (mode == cudaRoundMinInf) */ | |
| | | if (expo_x > 25) expo_x = 25; | |
| | | yy.i = (xx.i << (32 - expo_x)) | ((yy.i) ? 1 : 0); | |
| | | xx.i = expo_y + (xx.i >> expo_x); | |
| | | xx.i += (yy.i && expo_y); | |
| | | return xx.f; | |
| | | } | |
| | | } | |
| | | } else { | |
| | | product = xx.i ^ yy.i; | |
| | | product = product & 0x80000000; | |
| | | if (!(xx.i & 0x7fffffff)) { | |
| | | if (expo_y != 254) { | |
| | | xx.i = (unsigned int)product; | |
| | | return xx.f; | |
| | | } | |
| | | expo_y = yy.i << 1; | |
| | | if (expo_y == 0xFF000000) { | |
| | | xx.i = expo_y | 0x00C00000; | |
| | | } else { | |
| | | xx.i = yy.i | 0x00400000; | |
| | | } | |
| | | return xx.f; | |
| | | } | |
| | | if (!(yy.i & 0x7fffffff)) { | |
| | | if (expo_x != 254) { | |
| | | xx.i = (unsigned int)product; | |
| | | return xx.f; | |
| | | } | |
| | | expo_x = xx.i << 1; | |
| | | if (expo_x == 0xFF000000) { | |
| | | xx.i = expo_x | 0x00C00000; | |
| | | } else { | |
| | | xx.i = xx.i | 0x00400000; | |
| | | } | |
| | | return xx.f; | |
| | | } | |
| | | if ((expo_y != 254) && (expo_x != 254)) { | |
| | | expo_y++; | |
| | | expo_x++; | |
| | | if (expo_x == 0) { | |
| | | expo_y |= xx.i & 0x80000000; | |
| | | /* | |
| | | * If both operands are denormals, we only need to normalize | |
| | | * one of them as the result will be either a denormal or zero. | |
| | | */ | |
| | | xx.i = xx.i << 8; | |
| | | while (!(xx.i & 0x80000000)) { | |
| | | xx.i <<= 1; | |
| | | expo_x--; | |
| | | } | |
| | | xx.i = (xx.i >> 8) | (expo_y & 0x80000000); | |
| | | expo_y &= ~0x80000000; | |
| | | expo_y--; | |
| | | goto multiply; | |
| | | } | |
| | | if (expo_y == 0) { | |
| | | expo_x |= yy.i & 0x80000000; | |
| | | yy.i = yy.i << 8; | |
| | | while (!(yy.i & 0x80000000)) { | |
| | | yy.i <<= 1; | |
| | | expo_y--; | |
| | | } | |
| | | yy.i = (yy.i >> 8) | (expo_x & 0x80000000); | |
| | | expo_x &= ~0x80000000; | |
| | | expo_x--; | |
| | | goto multiply; | |
| | | } | |
| | | } | |
| | | expo_x = xx.i << 1; | |
| | | expo_y = yy.i << 1; | |
| | | /* if x is NaN, return x */ | |
| | | if (expo_x > 0xFF000000) { | |
| | | /* cvt any SNaNs to QNaNs */ | |
| | | xx.i = xx.i | 0x00400000; | |
| | | return xx.f; | |
| | | } | |
| | | /* if y is NaN, return y */ | |
| | | if (expo_y > 0xFF000000) { | |
| | | /* cvt any SNaNs to QNaNs */ | |
| | | xx.i = yy.i | 0x00400000; | |
| | | return xx.f; | |
| | | } | |
| | | xx.i = (unsigned int)product | 0x7f800000; | |
| | | return xx.f; | |
| | | } | |
| | | } | |
| | | | |
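| | | /* A minimal usage sketch (hypothetical helper): the two directed modes | |
| | |    bracket the exact product, which is the basic building block of | |
| | |    interval arithmetic. */ | |
| | | static void __example_fmul_interval(float a, float b, float *lo, float *hi) | |
| | | { | |
| | |   *lo = __internal_fmul_kernel2(a, b, cudaRoundMinInf); /* *lo <= a*b */ | |
| | |   *hi = __internal_fmul_kernel2(a, b, cudaRoundPosInf); /* a*b <= *hi */ | |
| | | } | |
| | | | |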
| | | __device_func__(float __internal_fmaf_kernel (float a, float b, float c, | |
| | | enum cudaRoundMode mode)) | |
| | | { | |
| | | unsigned long long product; | |
| | | unsigned int xx, yy, zz, ww; | |
| | | unsigned int temp, s, u; | |
| | | unsigned int expo_x, expo_y, expo_z; | |
| | | volatile union __cudart_FloatUintCvt cvt; | |
| | | | |
| | | cvt.f = a; | |
| | | xx = cvt.i; | |
| | | cvt.f = b; | |
| | | yy = cvt.i; | |
| | | cvt.f = c; | |
| | | zz = cvt.i; | |
| | | | |
| | | temp = 0xff; | |
| | | expo_x = temp & (xx >> 23); | |
| | | expo_x = expo_x - 1; | |
| | | expo_y = temp & (yy >> 23); | |
| | | expo_y = expo_y - 1; | |
| | | expo_z = temp & (zz >> 23); | |
| | | expo_z = expo_z - 1; | |
| | | | |
| | | if (!((expo_x <= 0xFD) && | |
| | | (expo_y <= 0xFD) && | |
| | | (expo_z <= 0xFD))) { | |
| | | /* fmad (nan, y, z) --> nan | |
| | | fmad (x, nan, z) --> nan | |
| | | fmad (x, y, nan) --> nan | |
| | | */ | |
| | | if ((yy << 1) > 0xff000000) { | |
| | | return b + b; | |
| | | } | |
| | | if ((zz << 1) > 0xff000000) { | |
| | | return c + c; | |
| | | } | |
| | | if ((xx << 1) > 0xff000000) { | |
| | | return a + a; | |
| | | } | |
| | | /* fmad (0, inf, z) --> NaN | |
| | | fmad (inf, 0, z) --> NaN | |
| | | fmad (-inf,+y,+inf) --> NaN | |
| | | fmad (+x,-inf,+inf) --> NaN | |
| | | fmad (+inf,-y,+inf) --> NaN | |
| | | fmad (-x,+inf,+inf) --> NaN | |
| | | fmad (-inf,-y,-inf) --> NaN | |
| | | fmad (-x,-inf,-inf) --> NaN | |
| | | fmad (+inf,+y,-inf) --> NaN | |
| | | fmad (+x,+inf,-inf) --> NaN | |
| | | */ | |
| | | if ((((xx << 1) == 0) && ((yy << 1) == 0xff000000)) || | |
| | | (((yy << 1) == 0) && ((xx << 1) == 0xff000000))) { | |
| | | cvt.i = 0xffc00000; | |
| | | return cvt.f; | |
| | | } | |
| | | if ((zz << 1) == 0xff000000) { | |
| | | if (((yy << 1) == 0xff000000) || ((xx << 1) == 0xff000000)) { | |
| | | if ((int)(xx ^ yy ^ zz) < 0) { | |
| | | cvt.i = 0xffc00000; | |
| | | return cvt.f; | |
| | | } | |
| | | } | |
| | | } | |
| | | /* fmad (inf, y, z) --> inf | |
| | | fmad (x, inf, z) --> inf | |
| | | fmad (x, y, inf) --> inf | |
| | | */ | |
| | | if ((xx << 1) == 0xff000000) { | |
| | | xx = xx ^ (yy & 0x80000000); | |
| | | cvt.i = xx; | |
| | | return cvt.f; | |
| | | } | |
| | | if ((yy << 1) == 0xff000000) { | |
| | | yy = yy ^ (xx & 0x80000000); | |
| | | cvt.i = yy; | |
| | | return cvt.f; | |
| | | } | |
| | | if ((zz << 1) == 0xff000000) { | |
| | | cvt.i = zz; | |
| | | return cvt.f; | |
| | | } | |
| | | /* fmad (+0, -y, -0) --> -0 | |
| | | fmad (-0, +y, -0) --> -0 | |
| | | fmad (+x, -0, -0) --> -0 | |
| | | fmad (-x, +0, -0) --> -0 | |
| | | */ | |
| | | if (zz == 0x80000000) { | |
| | | if (((xx << 1) == 0) || ((yy << 1) == 0)) { | |
| | | if ((int)(xx ^ yy) < 0) { | |
| | | cvt.i = zz; | |
| | | return cvt.f; | |
| | | } | |
| | | } | |
| | | } | |
| | | /* fmad (0, y, 0) --> +0 | |
| | | fmad (x, 0, 0) --> +0 | |
| | | */ | |
| | | if (((zz << 1) == 0) && | |
| | | (((xx << 1) == 0) || ((yy << 1) == 0))) { | |
| | | if (mode == cudaRoundMinInf) { | |
| | | zz = 0x80000000 & (xx ^ yy ^ zz); | |
| | | } else { | |
| | | zz &= 0x7fffffff; | |
| | | } | |
| | | cvt.i = zz; | |
| | | return cvt.f; | |
| | | } | |
| | | /* fmad (0, y, z) --> z | |
| | | fmad (x, 0, z) --> z | |
| | | */ | |
| | | if (((xx << 1) == 0) || ((yy << 1) == 0)) { | |
| | | cvt.i = zz; | |
| | | return cvt.f; | |
| | | } | |
| | | /* normalize x, if denormal */ | |
| | | if (expo_x == (unsigned)-1) { | |
| | | temp = xx & 0x80000000; | |
| | | xx = xx << 8; | |
| | | while (!(xx & 0x80000000)) { | |
| | | xx <<= 1; | |
| | | expo_x--; | |
| | | } | |
| | | expo_x++; | |
| | | xx = (xx >> 8) | temp; | |
| | | } | |
| | | /* normalize y, if denormal */ | |
| | | if (expo_y == (unsigned)-1) { | |
| | | temp = yy & 0x80000000; | |
| | | yy = yy << 8; | |
| | | while (!(yy & 0x80000000)) { | |
| | | yy <<= 1; | |
| | | expo_y--; | |
| | | } | |
| | | expo_y++; | |
| | | yy = (yy >> 8) | temp; | |
| | | } | |
| | | /* normalize z, if denormal */ | |
| | | if ((expo_z == (unsigned)-1) && ((zz << 1) != 0)) { | |
| | | temp = zz & 0x80000000; | |
| | | zz = zz << 8; | |
| | | while (!(zz & 0x80000000)) { | |
| | | zz <<= 1; | |
| | | expo_z--; | |
| | | } | |
| | | expo_z++; | |
| | | zz = (zz >> 8) | temp; | |
| | | } | |
| | | } | |
| | | | |
| | | expo_x = expo_x + expo_y; | |
| | | expo_y = xx ^ yy; | |
| | | xx = xx & 0x00ffffff; | |
| | | yy = yy << 8; | |
| | | xx = xx | 0x00800000; | |
| | | yy = yy | 0x80000000; | |
| | | | |
| | | product = ((unsigned long long)xx) * yy; | |
| | | xx = (unsigned)(product >> 32); | |
| | | yy = (unsigned)(product & 0xffffffff); | |
| | | | |
| | | expo_x = expo_x - 127 + 2; | |
| | | expo_y = expo_y & 0x80000000; | |
| | | /* normalize mantissa */ | |
| | | if (xx < 0x00800000) { | |
| | | xx = (xx << 1) | (yy >> 31); | |
| | | yy = (yy << 1); | |
| | | expo_x--; | |
| | | } | |
| | | temp = 0; | |
| | | | |
| | | if ((zz << 1) != 0) { /* z is not zero */ | |
| | | s = zz & 0x80000000; | |
| | | zz &= 0x00ffffff; | |
| | | zz |= 0x00800000; | |
| | | ww = 0; | |
| | | /* compare and swap. put augend into xx:yy */ | |
| | | if ((int)expo_z > (int)expo_x) { | |
| | | temp = expo_z; | |
| | | expo_z = expo_x; | |
| | | expo_x = temp; | |
| | | temp = zz; | |
| | | zz = xx; | |
| | | xx = temp; | |
| | | temp = ww; | |
| | | ww = yy; | |
| | | yy = temp; | |
| | | temp = expo_y; | |
| | | expo_y = s; | |
| | | s = temp; | |
| | | } | |
| | | /* augend_sign = expo_y, augend_mant = xx:yy, augend_expo = expo_x */ | |
| | | /* addend_sign = s, addend_mant = zz:ww, addend_expo = expo_z */ | |
| | | expo_z = expo_x - expo_z; | |
| | | u = expo_y ^ s; | |
| | | if (expo_z <= 49) { | |
| | | /* denormalize addend */ | |
| | | temp = 0; | |
| | | while (expo_z >= 32) { | |
| | | temp = ww | (temp != 0); | |
| | | ww = zz; | |
| | | zz = 0; | |
| | | expo_z -= 32; | |
| | | } | |
| | | if (expo_z) { | |
| | | temp = ((temp >> expo_z) | (ww << (32 - expo_z)) | | |
| | | ((temp << (32 - expo_z)) != 0)); | |
| | | ww = (ww >> expo_z) | (zz << (32 - expo_z)); | |
| | | zz = (zz >> expo_z); | |
| | | } | |
| | | | |
| | | } else { | |
| | | temp = 1; | |
| | | ww = 0; | |
| | | zz = 0; | |
| | | } | |
| | | if ((int)u < 0) { | |
| | | /* signs differ, effective subtraction */ | |
| | | temp = (unsigned)(-(int)temp); | |
| | | s = (temp != 0); | |
| | | u = yy - s; | |
| | | s = u > yy; | |
| | | yy = u - ww; | |
| | | s += yy > u; | |
| | | xx = (xx - zz) - s; | |
| | | if (!(xx | yy | temp)) { | |
| | | /* complete cancellation, return 0 */ | |
| | | if (mode == cudaRoundMinInf) { | |
| | | xx = 0x80000000; | |
| | | } | |
| | | cvt.i = xx; | |
| | | return cvt.f; | |
| | | } | |
| | | if ((int)xx < 0) { | |
| | | /* oops, augend had smaller mantissa. Negate mantissa and flip | |
| | | sign of result */ | |
| | | temp = ~temp; | |
| | | yy = ~yy; | |
| | | xx = ~xx; | |
| | | if (++temp == 0) { | |
| | | if (++yy == 0) { | |
| | | ++xx; | |
| | | } | |
| | | } | |
| | | expo_y ^= 0x80000000; | |
| | | } | |
| | | /* normalize mantissa, if necessary */ | |
| | | while (!(xx & 0x00800000)) { | |
| | | xx = (xx << 1) | (yy >> 31); | |
| | | yy = (yy << 1); | |
| | | expo_x--; | |
| | | } | |
| | | } else { | |
| | | /* signs are the same, effective addition */ | |
| | | yy = yy + ww; | |
| | | s = yy < ww; | |
| | | xx = xx + zz + s; | |
| | | if (xx & 0x01000000) { | |
| | | temp = temp | (yy << 31); | |
| | | yy = (yy >> 1) | (xx << 31); | |
| | | xx = ((xx & 0x80000000) | (xx >> 1)) & ~0x40000000; | |
| | | expo_x++; | |
| | | } | |
| | | } | |
| | | } | |
| | | temp = yy | (temp != 0); | |
| | | if (expo_x <= 0xFD) { | |
| | | /* normal */ | |
| | | xx |= expo_y; /* or in sign bit */ | |
| | | if (mode == cudaRoundNearest) { | |
| | | s = xx & 1; /* mantissa lsb */ | |
| | | xx += (temp == 0x80000000) ? s : (temp >> 31); | |
| | | } else if (mode == cudaRoundPosInf) { | |
| | | xx += temp && !expo_y; | |
| | | } else if (mode == cudaRoundMinInf) { | |
| | | xx += temp && expo_y; | |
| | | } | |
| | | xx = xx + (expo_x << 23); /* add in exponent */ | |
| | | cvt.i = xx; | |
| | | return cvt.f; | |
| | | } else if ((int)expo_x >= 126) { | |
| | | /* overflow */ | |
| | | if (mode == cudaRoundNearest) { | |
| | | xx = expo_y | 0x7f800000; | |
| | | } else if (mode == cudaRoundZero) { | |
| | | xx = expo_y | 0x7F7FFFFF; | |
| | | } else if (mode == cudaRoundPosInf) { | |
| | | xx = expo_y ? 0xFF7FFFFF : 0x7f800000; | |
| | | } else if (mode == cudaRoundMinInf) { | |
| | | xx = expo_y ? 0xff800000 : 0x7f7fffff; | |
| | | } | |
| | | cvt.i = xx; | |
| | | return cvt.f; | |
| | | } | |
| | | /* subnormal */ | |
| | | expo_x = (unsigned int)(-(int)expo_x); | |
| | | if (expo_x > 25) { | |
| | | /* massive underflow: return 0, or smallest denormal */ | |
| | | xx = 0; | |
| | | if (mode == cudaRoundPosInf) { | |
| | | xx += !expo_y; | |
| | | } else if (mode == cudaRoundMinInf) { | |
| | | xx += !!expo_y; | |
| | | } | |
| | | cvt.i = expo_y | xx; | |
| | | return cvt.f; | |
| | | } | |
| | | temp = (xx << (32 - expo_x)) | ((temp) ? 1 : 0); | |
| | | xx = xx >> expo_x; | |
| | | if (mode == cudaRoundNearest) { | |
| | | xx = xx + ((temp == 0x80000000) ? (xx & 1) : (temp >> 31)); | |
| | | } else if (mode == cudaRoundPosInf) { | |
| | | xx = xx + (!expo_y && temp); | |
| | | } else if (mode == cudaRoundMinInf) { | |
| | | xx = xx + (expo_y && temp); | |
| | | } | |
| | | xx = expo_y + xx; /* add in sign bit */ | |
| | | cvt.i = xx; | |
| | | return cvt.f; | |
| | | } | |
| | | | |
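| | | /* A minimal usage sketch (hypothetical helper): since the product feeds | |
| | |    the add unrounded, a fused multiply-add can recover the exact | |
| | |    rounding error of a multiplication (an error-free transformation), | |
| | |    barring overflow and underflow. */ | |
| | | static float __example_fmul_error(float a, float b) | |
| | | { | |
| | |   float p = __internal_fmul_kernel2(a, b, cudaRoundNearest); | |
| | |   return __internal_fmaf_kernel(a, b, -p, cudaRoundNearest); /* a*b - p */ | |
| | | } | |
| | | | |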
| | | /* NOTE: Does not currently support round-to-nearest, round-to-zero */ | |
| | | __device_func__(float __internal_fadd_kernel2 (float a, float b, | |
| | | enum cudaRoundMode mode)) | |
| | | { | |
| | | volatile union __cudart_FloatUintCvt xx, yy; | |
| | | unsigned int expo_x; | |
| | | unsigned int expo_y; | |
| | | unsigned int temp; | |
| | | | |
| | | xx.f = a; | |
| | | yy.f = b; | |
| | | | |
| | | /* make bigger operand the augend */ | |
| | | expo_y = yy.i << 1; | |
| | | if (expo_y > (xx.i << 1)) { | |
| | | expo_y = xx.i; | |
| | | xx.i = yy.i; | |
| | | yy.i = expo_y; | |
| | | } | |
| | | | |
| | | temp = 0xff; | |
| | | expo_x = temp & (xx.i >> 23); | |
| | | expo_x = expo_x - 1; | |
| | | expo_y = temp & (yy.i >> 23); | |
| | | expo_y = expo_y - 1; | |
| | | | |
| | | if ((expo_x <= 0xFD) && | |
| | | (expo_y <= 0xFD)) { | |
| | | add: | |
| | | expo_y = expo_x - expo_y; | |
| | | if (expo_y > 25) { | |
| | | expo_y = 31; | |
| | | } | |
| | | temp = xx.i ^ yy.i; | |
| | | xx.i = xx.i & ~0x7f000000; | |
| | | xx.i = xx.i | 0x00800000; | |
| | | yy.i = yy.i & ~0xff000000; | |
| | | yy.i = yy.i | 0x00800000; | |
| | | | |
| | | if ((int)temp < 0) { | |
| | | /* signs differ, effective subtraction */ | |
| | | temp = 32 - expo_y; | |
| | | temp = (expo_y) ? (yy.i << temp) : 0; | |
| | | temp = (unsigned)(-((int)temp)); | |
| | | xx.i = xx.i - (yy.i >> expo_y) - (temp ? 1 : 0); | |
| | | if (xx.i & 0x00800000) { | |
| | | if (expo_x <= 0xFD) { | |
| | | xx.i = xx.i + (expo_x << 23); | |
| | | if (mode == cudaRoundMinInf) { | |
| | | xx.i += (temp && (xx.i & 0x80000000)); | |
| | | } else if (mode == cudaRoundPosInf) { | |
| | | xx.i += (temp && !(xx.i & 0x80000000)); | |
| | | } | |
| | | return xx.f; | |
| | | } | |
| | | } else { | |
| | | if ((temp | (xx.i << 1)) == 0) { | |
| | | /* operands cancelled, resulting in a clean zero */ | |
| | | if (mode == cudaRoundMinInf) { | |
| | | xx.i = 0x80000000; | |
| | | } else if (mode == cudaRoundPosInf) { | |
| | | xx.i = 0; | |
| | | } | |
| | | return xx.f; | |
| | | } | |
| | | /* normalize result */ | |
| | | yy.i = xx.i & 0x80000000; | |
| | | do { | |
| | | xx.i = (xx.i << 1) | (temp >> 31); | |
| | | temp <<= 1; | |
| | | expo_x--; | |
| | | } while (!(xx.i & 0x00800000)); | |
| | | xx.i = xx.i | yy.i; | |
| | | } | |
| | | } else { | |
| | | /* signs are the same, effective addition */ | |
| | | temp = 32 - expo_y; | |
| | | temp = (expo_y) ? (yy.i << temp) : 0; | |
| | | xx.i = xx.i + (yy.i >> expo_y); | |
| | | if (!(xx.i & 0x01000000)) { | |
| | | if (expo_x <= 0xFD) { | |
| | | xx.i = xx.i + (expo_x << 23); | |
| | | if (mode == cudaRoundMinInf) { | |
| | | xx.i += (temp && (xx.i & 0x80000000)); | |
| | | } else if (mode == cudaRoundPosInf) { | |
| | | xx.i += (temp && !(xx.i & 0x80000000)); | |
| | | } | |
| | | return xx.f; | |
| | | } | |
| | | } else { | |
| | | /* normalize result */ | |
| | | temp = (xx.i << 31) | (temp >> 1); | |
| | | xx.i = ((xx.i & 0x80000000) | (xx.i >> 1)) & ~0x40000000; | |
| | | expo_x++; | |
| | | } | |
| | | } | |
| | | if (expo_x <= 0xFD) { | |
| | | if (mode == cudaRoundMinInf) { | |
| | | xx.i += (temp && (xx.i & 0x80000000)); | |
| | | } else if (mode == cudaRoundPosInf) { | |
| | | xx.i += (temp && !(xx.i & 0x80000000)); | |
| | | } | |
| | | xx.i = xx.i + (expo_x << 23); | |
| | | return xx.f; | |
| | | } | |
| | | if ((int)expo_x >= 254) { | |
| | | /* overflow: return infinity or largest normal */ | |
| | | temp = xx.i & 0x80000000; | |
| | | if (mode == cudaRoundMinInf) { | |
| | | xx.i = (temp ? 0xFF800000 : 0x7f7fffff); | |
| | | } else if (mode == cudaRoundPosInf) { | |
| | | xx.i = (temp ? 0xff7fffff : 0x7F800000); | |
| | | } | |
| | | return xx.f; | |
| | | } | |
| | | /* underflow: denormal, or smallest normal */ | |
| | | expo_y = expo_x + 32; | |
| | | yy.i = xx.i & 0x80000000; | |
| | | xx.i = xx.i & ~0xff000000; | |
| | | expo_x = (unsigned)(-((int)expo_x)); | |
| | | temp = xx.i << expo_y | ((temp) ? 1 : 0); | |
| | | xx.i = yy.i | (xx.i >> expo_x); | |
| | | if (mode == cudaRoundMinInf) { | |
| | | xx.i += (temp && yy.i); | |
| | | } else if (mode == cudaRoundPosInf) { | |
| | | xx.i += (temp && !yy.i); | |
| | | } | |
| | | return xx.f; | |
| | | } else { | |
| | | /* handle special cases separately */ | |
| | | if (!(yy.i << 1)) { | |
| | | if (mode == cudaRoundMinInf) { | |
| | | if (!(xx.i << 1)) { | |
| | | xx.i = xx.i | yy.i; | |
| | | } | |
| | | } else if (mode == cudaRoundPosInf) { | |
| | | if (xx.i == 0x80000000) { | |
| | | xx.i = yy.i; | |
| | | } | |
| | | } | |
| | | if ((xx.i << 1) > 0xff000000) { | |
| | | xx.i |= 0x00400000; | |
| | | } | |
| | | return xx.f; | |
| | | } | |
| | | if ((expo_y != 254) && (expo_x != 254)) { | |
| | | /* remove sign bits */ | |
| | | if (expo_x == (unsigned int) -1) { | |
| | | temp = xx.i & 0x80000000; | |
| | | xx.i = xx.i << 8; | |
| | | while (!(xx.i & 0x80000000)) { | |
| | | xx.i <<= 1; | |
| | | expo_x--; | |
| | | } | |
| | | expo_x++; | |
| | | xx.i = (xx.i >> 8) | temp; | |
| | | } | |
| | | if (expo_y == (unsigned int) -1) { | |
| | | temp = yy.i & 0x80000000; | |
| | | yy.i = yy.i << 8; | |
| | | while (!(yy.i & 0x80000000)) { | |
| | | yy.i <<= 1; | |
| | | expo_y--; | |
| | | } | |
| | | expo_y++; | |
| | | yy.i = (yy.i >> 8) | temp; | |
| | | } | |
| | | goto add; | |
| | | } | |
| | | expo_x = xx.i << 1; | |
| | | expo_y = yy.i << 1; | |
| | | /* if x is NaN, return x */ | |
| | | if (expo_x > 0xff000000) { | |
| | | /* cvt any SNaNs to QNaNs */ | |
| | | xx.i = xx.i | 0x00400000; | |
| | | return xx.f; | |
| | | } | |
| | | /* if y is NaN, return y */ | |
| | | if (expo_y > 0xff000000) { | |
| | | /* cvt any SNaNs to QNaNs */ | |
| | | xx.i = yy.i | 0x00400000; | |
| | | return xx.f; | |
| | | } | |
| | | if ((expo_x == 0xff000000) && (expo_y == 0xff000000)) { | |
| | | /* | |
| | | * subtraction of infinities with the same sign, and addition of | |
| | | * infinities of unlike sign is undefined: return NaN INDEFINITE | |
| | | */ | |
| | | expo_x = xx.i ^ yy.i; | |
| | | xx.i = xx.i | ((expo_x) ? 0xffc00000 : 0); | |
| | | return xx.f; | |
| | | } | |
| | | /* handle infinities */ | |
| | | if (expo_y == 0xff000000) { | |
| | | xx.i = yy.i; | |
| | | } | |
| | | return xx.f; | |
| | | } | |
| | | } | |
| | | | |
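| | | /* A minimal usage sketch (hypothetical helper): a term far below one | |
| | |    ulp of the larger operand is absorbed when rounding down, but still | |
| | |    bumps the result by one ulp when rounding up. */ | |
| | | static void __example_fadd_modes(void) | |
| | | { | |
| | |   float rd = __internal_fadd_kernel2(1.0f, 1.0e-10f, cudaRoundMinInf); /* 1.0f */ | |
| | |   float ru = __internal_fadd_kernel2(1.0f, 1.0e-10f, cudaRoundPosInf); /* 0x3f800001 */ | |
| | |   (void)rd; (void)ru; | |
| | | } | |
| | | | |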
| | | __device_func__(float __frcp_rn (float a)) | |
| | | { | |
| | | return __internal_frcp_kernel (a, cudaRoundNearest); | |
| | | } | |
| | | | |
| | | __device_func__(float __frcp_rz (float a)) | |
| | | { | |
| | | return __internal_frcp_kernel (a, cudaRoundZero); | |
| | | } | |
| | | | |
| | | __device_func__(float __frcp_rd (float a)) | |
| | | { | |
| | | return __internal_frcp_kernel (a, cudaRoundMinInf); | |
| | | } | |
| | | | |
| | | __device_func__(float __frcp_ru (float a)) | |
| | | { | |
| | | return __internal_frcp_kernel (a, cudaRoundPosInf); | |
| | | } | |
| | | | |
| | | __device_func__(float __fsqrt_rn (float a)) | |
| | | { | |
| | | return __internal_fsqrt_kernel (a, cudaRoundNearest); | |
| | | } | |
| | | | |
| | | __device_func__(float __fsqrt_rz (float a)) | |
| | | { | |
| | | return __internal_fsqrt_kernel (a, cudaRoundZero); | |
| | | } | |
| | | | |
| | | __device_func__(float __fsqrt_rd (float a)) | |
| | | { | |
| | | return __internal_fsqrt_kernel (a, cudaRoundMinInf); | |
| | | } | |
| | | | |
| | | __device_func__(float __fsqrt_ru (float a)) | |
| | | { | |
| | | return __internal_fsqrt_kernel (a, cudaRoundPosInf); | |
| | | } | |
| | | | |
| | | __device_func__(float __fdiv_rn (float a, float b)) | |
| | | { | |
| | | return __internal_fdiv_kernel (a, b, cudaRoundNearest); | |
| | | } | |
| | | | |
| | | __device_func__(float __fdiv_rz (float a, float b)) | |
| | | { | |
| | | return __internal_fdiv_kernel (a, b, cudaRoundZero); | |
| | | } | |
| | | | |
| | | __device_func__(float __fdiv_rd (float a, float b)) | |
| | | { | |
| | | return __internal_fdiv_kernel (a, b, cudaRoundMinInf); | |
| | | } | |
| | | | |
| | | __device_func__(float __fdiv_ru (float a, float b)) | |
| | | { | |
| | | return __internal_fdiv_kernel (a, b, cudaRoundPosInf); | |
| | | } | |
| | | | |
| | | __device_func__(float __fadd_rd (float a, float b)) | |
| | | { | |
| | | return __internal_fadd_kernel2 (a, b, cudaRoundMinInf); | |
| | | } | |
| | | | |
| | | __device_func__(float __fadd_ru (float a, float b)) | |
| | | { | |
| | | return __internal_fadd_kernel2 (a, b, cudaRoundPosInf); | |
| | | } | |
| | | | |
| | | __device_func__(float __fmul_rd (float a, float b)) | |
| | | { | |
| | | return __internal_fmul_kernel2 (a, b, cudaRoundMinInf); | |
| | | } | |
| | | | |
| | | __device_func__(float __fmul_ru (float a, float b)) | |
| | | { | |
| | | return __internal_fmul_kernel2 (a, b, cudaRoundPosInf); | |
| | | } | |
| | | | |
| | | __device_func__(float __fmaf_rn (float a, float b, float c)) | |
| | | { | |
| | | return __internal_fmaf_kernel (a, b, c, cudaRoundNearest); | |
| | | } | |
| | | | |
| | | __device_func__(float __fmaf_rz (float a, float b, float c)) | |
| | | { | |
| | | return __internal_fmaf_kernel (a, b, c, cudaRoundZero); | |
| | | } | |
| | | | |
| | | __device_func__(float __fmaf_ru (float a, float b, float c)) | |
| | | { | |
| | | return __internal_fmaf_kernel (a, b, c, cudaRoundPosInf); | |
| | | } | |
| | | | |
| | | __device_func__(float __fmaf_rd (float a, float b, float c)) | |
| | | { | |
| | | return __internal_fmaf_kernel (a, b, c, cudaRoundMinInf); | |
| | | } | |
| | | | |
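| | | /* A minimal usage sketch (hypothetical helper): the _rd/_ru intrinsics | |
| | |    above are the natural primitives for interval arithmetic; summing | |
| | |    with outward rounding keeps the true sum enclosed in [*lo, *hi]. */ | |
| | | static void __example_interval_sum(const float *v, int n, float *lo, float *hi) | |
| | | { | |
| | |   int i; | |
| | |   *lo = 0.0f; | |
| | |   *hi = 0.0f; | |
| | |   for (i = 0; i < n; i++) { | |
| | |     *lo = __fadd_rd(*lo, v[i]); | |
| | |     *hi = __fadd_ru(*hi, v[i]); | |
| | |   } | |
| | | } | |
| | | | |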
| __device_func__(int __cuda___isnan(double a)); | | __device_func__(int __cuda___isnan(double a)); | |
| __device_func__(int __cuda___isnanf(float a)); | | __device_func__(int __cuda___isnanf(float a)); | |
| __device_func__(int __double2int_rz(double)); | | __device_func__(int __double2int_rz(double)); | |
| __device_func__(unsigned int __double2uint_rz(double)); | | __device_func__(unsigned int __double2uint_rz(double)); | |
| __device_func__(long long int __double2ll_rz(double)); | | __device_func__(long long int __double2ll_rz(double)); | |
| __device_func__(unsigned long long int __double2ull_rz(double)); | | __device_func__(unsigned long long int __double2ull_rz(double)); | |
| | | | |
| #define __internal_clamp(val, max, min, nan) \ | | #define __internal_clamp(val, max, min, nan) \ | |
| if (sizeof(val) == sizeof(double) && __cuda___isnan((double)val)) return nan; \ | | if (sizeof(val) == sizeof(double) && __cuda___isnan((double)val)) return nan; \ | |
| | | | |
| skipping to change at line 409 | | skipping to change at line 3777 | |
| { | | { | |
| long long int res; | | long long int res; | |
| res = __umul64hi(a, b); | | res = __umul64hi(a, b); | |
| if (a < 0LL) res = res - b; | | if (a < 0LL) res = res - b; | |
| if (b < 0LL) res = res - a; | | if (b < 0LL) res = res - a; | |
| return res; | | return res; | |
| } | | } | |
| | | | |
| __device_func__(float __saturatef(float a)) | | __device_func__(float __saturatef(float a)) | |
| { | | { | |
| | | if (__cuda___isnanf(a)) return 0.0f; // update of PTX spec 10/15/2008 | |
| return a >= 1.0f ? 1.0f : a <= 0.0f ? 0.0f : a; | | return a >= 1.0f ? 1.0f : a <= 0.0f ? 0.0f : a; | |
| } | | } | |
| | | | |
| __device_func__(unsigned int __sad(int a, int b, unsigned int c)) | | __device_func__(unsigned int __sad(int a, int b, unsigned int c)) | |
| { | | { | |
| long long int diff = (long long int)a - (long long int)b; | | long long int diff = (long long int)a - (long long int)b; | |
| | | | |
| return (unsigned int)(__cuda_llabs(diff) + (long long int)c); | | return (unsigned int)(__cuda_llabs(diff) + (long long int)c); | |
| } | | } | |
| | | | |
| | | | |
| skipping to change at line 450 | | skipping to change at line 3819 | |
| #if !defined(__MULTI_CORE__) | | #if !defined(__MULTI_CORE__) | |
| a &= 0xffffff; | | a &= 0xffffff; | |
| b &= 0xffffff; | | b &= 0xffffff; | |
| #endif /* !__MULTI_CORE__ */ | | #endif /* !__MULTI_CORE__ */ | |
| | | | |
| return a * b; | | return a * b; | |
| } | | } | |
| | | | |
| __device_func__(float __int_as_float(int a)) | | __device_func__(float __int_as_float(int a)) | |
| { | | { | |
| volatile union {int a; float b;} u; | | volatile union __cudart_FloatIntCvt u; | |
| | | | |
| u.a = a; | | | |
| | | | |
| return u.b; | | u.i = a; | |
| | | return u.f; | |
| } | | } | |
| | | | |
| __device_func__(int __float_as_int(float a)) | | __device_func__(int __float_as_int(float a)) | |
| { | | { | |
| volatile union {float a; int b;} u; | | volatile union __cudart_FloatIntCvt u; | |
| | | | |
| u.a = a; | | | |
| | | | |
| return u.b; | | u.f = a; | |
| | | return u.i; | |
| } | | } | |
| | | | |
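| | | /* Note: type punning in the two helpers above goes through a volatile | |
| | |    union rather than a pointer cast; on an optimizing host compiler | |
| | |    this sidesteps strict-aliasing assumptions so the emulation path | |
| | |    reads back exactly the bits just stored. */ | |
| | | | |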
| __device_func__(long long int __internal_float2ll_kernel(float a, long long int max, long long int min, long long int nan, enum cudaRoundMode rndMode)) | | __device_func__(long long int __internal_float2ll_kernel(float a, long long int max, long long int min, long long int nan, enum cudaRoundMode rndMode)) | |
| { | | { | |
| unsigned long long int res, t = 0ULL; | | unsigned long long int res, t = 0ULL; | |
| int shift; | | int shift; | |
| unsigned int ia; | | unsigned int ia; | |
| | | | |
| __internal_clamp(a, max, min, nan); | | __internal_clamp(a, max, min, nan); | |
| ia = __float_as_int(a); | | ia = __float_as_int(a); | |
| | | | |
| skipping to change at line 681 | | skipping to change at line 4048 | |
| unsigned long long int t = (unsigned long long int)*a; | | unsigned long long int t = (unsigned long long int)*a; | |
| int lz = __internal_normalize64(&t); | | int lz = __internal_normalize64(&t); | |
| | | | |
| *a = (unsigned int)(t >> 32); | | *a = (unsigned int)(t >> 32); | |
| | | | |
| return lz - 32; | | return lz - 32; | |
| } | | } | |
| | | | |
| __device_func__(float __internal_int2float_kernel(int a, enum cudaRoundMode rndMode)) | | __device_func__(float __internal_int2float_kernel(int a, enum cudaRoundMode rndMode)) | |
| { | | { | |
| volatile union { | | volatile union __cudart_FloatUintCvt res; | |
| float f; | | | |
| unsigned int i; | | | |
| } res; | | | |
| int shift; | | int shift; | |
| unsigned int t; | | unsigned int t; | |
| res.i = a; | | res.i = a; | |
| if (a == 0) return res.f; | | if (a == 0) return res.f; | |
| if (a < 0) res.i = (unsigned int)-a; | | if (a < 0) res.i = (unsigned int)-a; | |
| shift = __internal_normalize((unsigned int*)&res.i); | | shift = __internal_normalize((unsigned int*)&res.i); | |
| t = res.i << 24; | | t = res.i << 24; | |
| res.i = (res.i >> 8); | | res.i = (res.i >> 8); | |
| res.i += (127 + 30 - shift) << 23; | | res.i += (127 + 30 - shift) << 23; | |
| if (a < 0) res.i |= 0x80000000; | | if (a < 0) res.i |= 0x80000000; | |
| | | | |
| skipping to change at line 733 | | skipping to change at line 4097 | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| return (float)a; | | return (float)a; | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
| return __internal_int2float_kernel(a, cudaRoundNearest); | | return __internal_int2float_kernel(a, cudaRoundNearest); | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
| __device_func__(float __internal_uint2float_kernel(unsigned int a, enum cudaRoundMode rndMode)) | | __device_func__(float __internal_uint2float_kernel(unsigned int a, enum cudaRoundMode rndMode)) | |
| { | | { | |
| volatile union { | | volatile union __cudart_FloatUintCvt res; | |
| float f; | | | |
| unsigned int i; | | | |
| } res; | | | |
| int shift; | | int shift; | |
| unsigned int t; | | unsigned int t; | |
| res.i = a; | | res.i = a; | |
| if (a == 0) return res.f; | | if (a == 0) return res.f; | |
| shift = __internal_normalize((unsigned int*)&res.i); | | shift = __internal_normalize((unsigned int*)&res.i); | |
| t = res.i << 24; | | t = res.i << 24; | |
| res.i = (res.i >> 8); | | res.i = (res.i >> 8); | |
| res.i += (127 + 30 - shift) << 23; | | res.i += (127 + 30 - shift) << 23; | |
| if ((rndMode == cudaRoundNearest) && (t >= 0x80000000)) { | | if ((rndMode == cudaRoundNearest) && (t >= 0x80000000)) { | |
| res.i += (t == 0x80000000) ? (res.i & 1) : (t >> 31); | | res.i += (t == 0x80000000) ? (res.i & 1) : (t >> 31); | |
| | | | |
| skipping to change at line 806 | | skipping to change at line 4167 | |
| t = (unsigned int)temp; | | t = (unsigned int)temp; | |
| res += (127 + 62 - shift) << 23; /* add in exponent */ | | res += (127 + 62 - shift) << 23; /* add in exponent */ | |
| res += t == 0x80000000 ? res & 1 : t >> 31; | | res += t == 0x80000000 ? res & 1 : t >> 31; | |
| return __int_as_float(res); | | return __int_as_float(res); | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
| __device_func__(float __internal_fmul_kernel(float a, float b, int rndNearest)) | | __device_func__(float __internal_fmul_kernel(float a, float b, int rndNearest)) | |
| { | | { | |
| unsigned long long product; | | unsigned long long product; | |
|
| volatile union { | | volatile union __cudart_FloatUintCvt xx, yy; | |
| float f; | | | |
| unsigned int i; | | | |
| } xx, yy; | | | |
| unsigned expo_x, expo_y; | | unsigned expo_x, expo_y; | |
| | | | |
| xx.f = a; | | xx.f = a; | |
| yy.f = b; | | yy.f = b; | |
| | | | |
| expo_y = 0xFF; | | expo_y = 0xFF; | |
| expo_x = expo_y & (xx.i >> 23); | | expo_x = expo_y & (xx.i >> 23); | |
| expo_x = expo_x - 1; | | expo_x = expo_x - 1; | |
| expo_y = expo_y & (yy.i >> 23); | | expo_y = expo_y & (yy.i >> 23); | |
| expo_y = expo_y - 1; | | expo_y = expo_y - 1; | |
| | | | |
| skipping to change at line 951 | | skipping to change at line 4309 | |
| xx.i = yy.i | 0x00400000; | | xx.i = yy.i | 0x00400000; | |
| return xx.f; | | return xx.f; | |
| } | | } | |
| xx.i = (unsigned int)product | 0x7f800000; | | xx.i = (unsigned int)product | 0x7f800000; | |
| return xx.f; | | return xx.f; | |
| } | | } | |
| } | | } | |
| | | | |
| __device_func__(float __internal_fadd_kernel(float a, float b, int rndNearest)) | | __device_func__(float __internal_fadd_kernel(float a, float b, int rndNearest)) | |
| { | | { | |
|
| volatile union { | | volatile union __cudart_FloatUintCvt xx, yy; | |
| float f; | | | |
| unsigned int i; | | | |
| } xx, yy; | | | |
| unsigned int expo_x; | | unsigned int expo_x; | |
| unsigned int expo_y; | | unsigned int expo_y; | |
| unsigned int temp; | | unsigned int temp; | |
| | | | |
| xx.f = a; | | xx.f = a; | |
| yy.f = b; | | yy.f = b; | |
| | | | |
| /* make bigger operand the augend */ | | /* make bigger operand the augend */ | |
| expo_y = yy.i << 1; | | expo_y = yy.i << 1; | |
| if (expo_y > (xx.i << 1)) { | | if (expo_y > (xx.i << 1)) { | |
| | | | |
| skipping to change at line 1069 | | skipping to change at line 4424 | |
| expo_x = (unsigned int)(-((int)expo_x)); | | expo_x = (unsigned int)(-((int)expo_x)); | |
| temp = xx.i << expo_y | ((temp) ? 1 : 0); | | temp = xx.i << expo_y | ((temp) ? 1 : 0); | |
| xx.i = yy.i | (xx.i >> expo_x); | | xx.i = yy.i | (xx.i >> expo_x); | |
| xx.i += (((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31)) | | xx.i += (((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31)) | |
| && rndNearest); | | && rndNearest); | |
| return xx.f; | | return xx.f; | |
| } else { | | } else { | |
| /* handle special cases separately */ | | /* handle special cases separately */ | |
| if (!(yy.i << 1)) { | | if (!(yy.i << 1)) { | |
| if (xx.i == 0x80000000) { | | if (xx.i == 0x80000000) { | |
|
| xx.i = yy.i; | | xx.i = yy.i; | |
| } | | } | |
| if ((xx.i << 1) > 0xff000000) { | | if ((xx.i << 1) > 0xff000000) { | |
|
| xx.i |= 0x00400000; | | xx.i |= 0x00400000; | |
| } | | } | |
| return xx.f; | | return xx.f; | |
| } | | } | |
| if ((expo_y != 254) && (expo_x != 254)) { | | if ((expo_y != 254) && (expo_x != 254)) { | |
| /* remove sign bits */ | | /* remove sign bits */ | |
| if (expo_x == (unsigned int) -1) { | | if (expo_x == (unsigned int) -1) { | |
| temp = xx.i & 0x80000000; | | temp = xx.i & 0x80000000; | |
| xx.i = xx.i << 8; | | xx.i = xx.i << 8; | |
| while (!(xx.i & 0x80000000)) { | | while (!(xx.i & 0x80000000)) { | |
| xx.i <<= 1; | | xx.i <<= 1; | |
| | | | |
| skipping to change at line 1182 | | skipping to change at line 4537 | |
| | | | |
| #elif defined(_WIN32) | | #elif defined(_WIN32) | |
| | | | |
| #define __syncthreads() \ | | #define __syncthreads() \ | |
| (void)__cudaSynchronizeThreads((void**)0, (void*)0) | | (void)__cudaSynchronizeThreads((void**)0, (void*)0) | |
| | | | |
| #endif /* __GNUC__ */ | | #endif /* __GNUC__ */ | |
| | | | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| | | | |
|
| | | __device_func__(void __prof_trigger(int a)) | |
| | | { | |
| | | } | |
| | | | |
| | | __device_func__(void __threadfence(void)) | |
| | | { | |
| | | } | |
| | | | |
| | | __device_func__(void __threadfence_block(void)) | |
| | | { | |
| | | } | |
| | | | |
| #if defined(__GNUC__) | | #if defined(__GNUC__) | |
| | | | |
| __device_func__(void __trap(void)) | | __device_func__(void __trap(void)) | |
| { | | { | |
| __builtin_trap(); | | __builtin_trap(); | |
| } | | } | |
| | | | |
| #elif defined(_WIN32) | | #elif defined(_WIN32) | |
| | | | |
| __device_func__(void __trap(void)) | | __device_func__(void __trap(void)) | |
| { | | { | |
| __debugbreak(); | | __debugbreak(); | |
| } | | } | |
| | | | |
| #endif /* __GNUC__ */ | | #endif /* __GNUC__ */ | |
| | | | |
|
| #endif /* !__CUDABE__ */ | | #endif /* __CUDABE__ */ | |
| | | | |
| /******************************************************************************* | | /******************************************************************************* | |
| *                                                                             * | | *                                                                             * | |
| * DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS      * | | * DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS      * | |
| *                                                                             * | | *                                                                             * | |
| *******************************************************************************/ | | *******************************************************************************/ | |
| | | | |
| __device_func__(float __fdividef(float a, float b)) | | __device_func__(float __fdividef(float a, float b)) | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| return a / b; | | return a / b; | |
| #elif defined(__CUDABE__) | | #elif defined(__CUDABE__) | |
| return a / b; | | return a / b; | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
| /* match range restrictions of the device function */ | | /* match range restrictions of the device function */ | |
| if (__cuda_fabsf(b) > CUDART_TWO_TO_126_F) { | | if (__cuda_fabsf(b) > CUDART_TWO_TO_126_F) { | |
| if (__cuda_fabsf(a) <= CUDART_NORM_HUGE_F) { | | if (__cuda_fabsf(a) <= CUDART_NORM_HUGE_F) { | |
| return ((a / b) / CUDART_NORM_HUGE_F) / CUDART_NORM_HUGE_F; | | return ((a / b) / CUDART_NORM_HUGE_F) / CUDART_NORM_HUGE_F; | |
| } else { | | } else { | |
|
| return CUDART_NAN_F; | | return __int_as_float(0xffc00000); | |
| } | | } | |
| } else { | | } else { | |
| return a / b; | | return a / b; | |
| } | | } | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
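For context, a hedged host-side illustration of the range restriction emulated above: on the device, __fdividef(a, b) is computed as a * (1.0f / b), so for |b| > 2^126 the reciprocal underflows and finite quotients flush to zero (the NaN branch covers infinite a). A sketch, assuming FLT_MAX stands in for CUDART_NORM_HUGE_F:

    #include <float.h>
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        float a = 1.0f;
        float b = ldexpf(1.0f, 127);             /* |b| > 2^126 */
        float q = ((a / b) / FLT_MAX) / FLT_MAX; /* underflows to 0.0f */
        printf("%g\n", q);                       /* matches fast device divide */
        return 0;
    }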
| __device_func__(float __sinf(float a)) | | __device_func__(float __sinf(float a)) | |
| { | | { | |
| return sinf(a); | | return sinf(a); | |
| | | | |
| skipping to change at line 1261 | | skipping to change at line 4628 | |
| b *= .25f; | | b *= .25f; | |
| } | | } | |
| return __fdividef(a, b); | | return __fdividef(a, b); | |
| } | | } | |
| | | | |
| __device_func__(float __tanf(float a)) | | __device_func__(float __tanf(float a)) | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| return tanf(a); | | return tanf(a); | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
|
| return __sinf(a) / __cosf(a); | | return __fdividef (__sinf(a), __cosf(a)); | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
| __device_func__(void __sincosf(float a, float *sptr, float *cptr)) | | __device_func__(void __sincosf(float a, float *sptr, float *cptr)) | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| sincosf(a, sptr, cptr); | | sincosf(a, sptr, cptr); | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
| *sptr = __sinf(a); | | *sptr = __sinf(a); | |
| *cptr = __cosf(a); | | *cptr = __cosf(a); | |
| | | | |
| skipping to change at line 1336 | | skipping to change at line 4703 | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
| return __internal_accurate_fdividef(a, b); | | return __internal_accurate_fdividef(a, b); | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
| __device_func__(int __clz(int a)) | | __device_func__(int __clz(int a)) | |
| { | | { | |
| return (a)?(158-(__float_as_int(__uint2float_rz((unsigned int)a))>>23)):32; | | return (a)?(158-(__float_as_int(__uint2float_rz((unsigned int)a))>>23)):32; | |
| } | | } | |
| | | | |
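How the __clz one-liner above works: converting a to float with round-toward-zero places 127 + floor(log2(a)) in the exponent field, so 158 minus the biased exponent is 31 - msb(a), i.e. the leading-zero count. A host sketch (exact-conversion inputs only, since a plain C cast rounds to nearest):

    #include <stdio.h>
    #include <string.h>

    static int clz32(unsigned int a)
    {
        float f = (float)a;
        unsigned int bits;
        if (a == 0) return 32;
        memcpy(&bits, &f, sizeof(bits));
        return 158 - (int)(bits >> 23);  /* 158 = 127 bias + 31 */
    }

    int main(void)
    {
        /* prints 31 8 0 */
        printf("%d %d %d\n", clz32(1u), clz32(0x00800000u), clz32(0x80000000u));
        return 0;
    }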
|
| __device_func__(int __ffs(int a)) | | | |
| { | | | |
| return 32 - __clz (a & -a); | | | |
| } | | | |
| | | | |
| __device_func__(int __popc(unsigned int a)) | | | |
| { | | | |
| a = a - ((a >> 1) & 0x55555555); | | | |
| a = (a & 0x33333333) + ((a >> 2) & 0x33333333); | | | |
| a = (a + (a >> 4)) & 0x0f0f0f0f; | | | |
| a = ((__umul24(a, 0x808080) << 1) + a) >> 24; | | | |
| return a; | | | |
| } | | | |
| | | | |
| __device_func__(int __clzll(long long int a)) | | __device_func__(int __clzll(long long int a)) | |
| { | | { | |
| int ahi = ((int)((unsigned long long)a >> 32)); | | int ahi = ((int)((unsigned long long)a >> 32)); | |
| int alo = ((int)((unsigned long long)a & 0xffffffffULL)); | | int alo = ((int)((unsigned long long)a & 0xffffffffULL)); | |
| int res; | | int res; | |
| if (ahi) { | | if (ahi) { | |
|
| res = 0; | | res = 0; | |
| } else { | | } else { | |
|
| res = 32; | | res = 32; | |
| ahi = alo; | | ahi = alo; | |
| } | | } | |
| res = res + __clz(ahi); | | res = res + __clz(ahi); | |
| return res; | | return res; | |
| } | | } | |
| | | | |
|
| __device_func__(int __ffsll(long long int a)) | | __device_func__(int __popc(unsigned int a)) | |
| { | | { | |
|
| return 64 - __clzll (a & -a); | | a = a - ((a >> 1) & 0x55555555); | |
| | | a = (a & 0x33333333) + ((a >> 2) & 0x33333333); | |
| | | a = (a + (a >> 4)) & 0x0f0f0f0f; | |
| | | a = ((__umul24(a, 0x808080) << 1) + a) >> 24; | |
| | | return a; | |
| } | | } | |
| | | | |
| __device_func__(int __popcll(unsigned long long int a)) | | __device_func__(int __popcll(unsigned long long int a)) | |
| { | | { | |
| unsigned int ahi = ((unsigned int)(a >> 32)); | | unsigned int ahi = ((unsigned int)(a >> 32)); | |
| unsigned int alo = ((unsigned int)(a & 0xffffffffULL)); | | unsigned int alo = ((unsigned int)(a & 0xffffffffULL)); | |
| alo = alo - ((alo >> 1) & 0x55555555); | | alo = alo - ((alo >> 1) & 0x55555555); | |
| alo = (alo & 0x33333333) + ((alo >> 2) & 0x33333333); | | alo = (alo & 0x33333333) + ((alo >> 2) & 0x33333333); | |
| ahi = ahi - ((ahi >> 1) & 0x55555555); | | ahi = ahi - ((ahi >> 1) & 0x55555555); | |
| ahi = (ahi & 0x33333333) + ((ahi >> 2) & 0x33333333); | | ahi = (ahi & 0x33333333) + ((ahi >> 2) & 0x33333333); | |
| alo = alo + ahi; | | alo = alo + ahi; | |
| alo = (alo & 0x0f0f0f0f) + ((alo >> 4) & 0x0f0f0f0f); | | alo = (alo & 0x0f0f0f0f) + ((alo >> 4) & 0x0f0f0f0f); | |
| alo = ((__umul24(alo, 0x808080) << 1) + alo) >> 24; | | alo = ((__umul24(alo, 0x808080) << 1) + alo) >> 24; | |
| return alo; | | return alo; | |
| } | | } | |
| | | | |
|
| | | __device_func__(unsigned int __brev(unsigned int a)) | |
| | | { | |
| | | a = ((a >> 1) & 0x55555555) + ((a & 0x55555555) << 1); | |
| | | a = ((a >> 2) & 0x33333333) + ((a & 0x33333333) << 2); | |
| | | a = ((a >> 4) & 0x0F0F0F0F) + ((a & 0x0F0F0F0F) << 4); | |
| | | a = ((a >> 8) & 0x00FF00FF) + ((a & 0x00FF00FF) << 8); | |
| | | a = ( a >> 16 ) + ( a << 16); | |
| | | return a; | |
| | | } | |
| | | | |
| | | __device_func__(unsigned long long int __brevll(unsigned long long int a)) | |
| | | { | |
| | | unsigned int hi = (unsigned int)(a >> 32); | |
| | | unsigned int lo = (unsigned int)(a & 0xffffffffULL); | |
| | | unsigned int t; | |
| | | t = __brev(lo); | |
| | | lo = __brev(hi); | |
| | | return ((unsigned long long int)t << 32) + (unsigned long long int)lo; | |
| | | } | |
| | | | |
| | | __device_func__(int __ffs(int a)) | |
| | | { | |
| | | return 32 - __clz (a & -a); | |
| | | } | |
| | | | |
| | | __device_func__(int __ffsll(long long int a)) | |
| | | { | |
| | | return 64 - __clzll (a & -a); | |
| | | } | |
| | | | |
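The routines above lean on two classic identities: a & -a isolates the lowest set bit (so __ffs reduces to __clz), and the __popc ladder is a SWAR byte-count whose final __umul24 step just gathers the four byte counts into the top byte ((a * 0x808080) << 1, plus a, equals a * 0x01010101). A host sketch, using the GCC/Clang __builtin_clz for brevity:

    #include <stdio.h>

    static int popc(unsigned int a)
    {
        a = a - ((a >> 1) & 0x55555555u);
        a = (a & 0x33333333u) + ((a >> 2) & 0x33333333u);
        a = (a + (a >> 4)) & 0x0f0f0f0fu;
        return (int)((a * 0x01010101u) >> 24); /* byte sums land on top */
    }

    int main(void)
    {
        unsigned int a = 0x00f00000u;               /* bits 20..23 set */
        int ffs = 32 - __builtin_clz(a & (0u - a)); /* a != 0 here */
        printf("popc=%d ffs=%d\n", popc(a), ffs);   /* popc=4 ffs=21 */
        return 0;
    }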
| #if defined(CUDA_DOUBLE_MATH_FUNCTIONS) && defined(CUDA_FLOAT_MATH_FUNCTIONS) | | #if defined(CUDA_DOUBLE_MATH_FUNCTIONS) && defined(CUDA_FLOAT_MATH_FUNCTIONS) | |
| | | | |
| #error -- conflicting mode for double math routines | | #error -- conflicting mode for double math routines | |
| | | | |
| #endif /* CUDA_DOUBLE_MATH_FUNCTIONS && CUDA_FLOAT_MATH_FUNCTIONS */ | | #endif /* CUDA_DOUBLE_MATH_FUNCTIONS && CUDA_FLOAT_MATH_FUNCTIONS */ | |
| | | | |
| #if defined(CUDA_FLOAT_MATH_FUNCTIONS) | | #if defined(CUDA_FLOAT_MATH_FUNCTIONS) | |
| | | | |
| __device_func__(double fdivide(double a, double b)) | | __device_func__(double fdivide(double a, double b)) | |
| { | | { | |
| | | | |
End of changes. 31 change blocks. |
| 52 lines changed or deleted | | 3441 lines changed or added | |
|
| math_functions.h | | math_functions.h | |
| /* | | /* | |
|
| * Copyright 1993-2008 NVIDIA Corporation. All rights reserved. | | * Copyright 1993-2009 NVIDIA Corporation. All rights reserved. | |
| * | | * | |
| * NOTICE TO USER: | | * NOTICE TO USER: | |
| * | | * | |
| * This source code is subject to NVIDIA ownership rights under U.S. and | | * This source code is subject to NVIDIA ownership rights under U.S. and | |
| * international Copyright laws. Users and possessors of this source code | | * international Copyright laws. Users and possessors of this source code | |
| * are hereby granted a nonexclusive, royalty-free license to use this code | | * are hereby granted a nonexclusive, royalty-free license to use this code | |
| * in individual and commercial software. | | * in individual and commercial software. | |
| * | | * | |
| * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | |
| * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | |
| | | | |
| skipping to change at line 339 | | skipping to change at line 339 | |
| extern __host__ __device__ double remquo(double, double, int*) __THROW; | | extern __host__ __device__ double remquo(double, double, int*) __THROW; | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __host__ __device__ float remquof(float, float, int*) __THROW; | | extern __host__ __device__ float remquof(float, float, int*) __THROW; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __host__ __device__ double erf(double) __THROW; | | extern __host__ __device__ double erf(double) __THROW; | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __host__ __device__ float erff(float) __THROW; | | extern __host__ __device__ float erff(float) __THROW; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| | | extern __host__ __device__ double erfinv(double) __THROW; | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __host__ __device__ float erfinvf(float) __THROW; | |
| | | | |
| | | /*DEVICE_BUILTIN*/ | |
| extern __host__ __device__ double erfc(double) __THROW; | | extern __host__ __device__ double erfc(double) __THROW; | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __host__ __device__ float erfcf(float) __THROW; | | extern __host__ __device__ float erfcf(float) __THROW; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| | | extern __host__ __device__ double erfcinv(double) __THROW; | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __host__ __device__ float erfcinvf(float) __THROW; | |
| | | | |
| | | /*DEVICE_BUILTIN*/ | |
| extern __host__ __device__ double lgamma(double) __THROW; | | extern __host__ __device__ double lgamma(double) __THROW; | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __host__ __device__ float lgammaf(float) __THROW; | | extern __host__ __device__ float lgammaf(float) __THROW; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __host__ __device__ double tgamma(double) __THROW; | | extern __host__ __device__ double tgamma(double) __THROW; | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __host__ __device__ float tgammaf(float) __THROW; | | extern __host__ __device__ float tgammaf(float) __THROW; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| | | | |
| skipping to change at line 670 | | skipping to change at line 680 | |
| static __inline__ __host__ __device__ void sincos(float a, float *sptr, float *cptr) | | static __inline__ __host__ __device__ void sincos(float a, float *sptr, float *cptr) | |
| { | | { | |
| sincosf(a, sptr, cptr); | | sincosf(a, sptr, cptr); | |
| } | | } | |
| | | | |
| static __inline__ __host__ __device__ float erf(float a) | | static __inline__ __host__ __device__ float erf(float a) | |
| { | | { | |
| return erff(a); | | return erff(a); | |
| } | | } | |
| | | | |
|
| | | static __inline__ __host__ __device__ float erfinv(float a) | |
| | | { | |
| | | return erfinvf(a); | |
| | | } | |
| | | | |
| static __inline__ __host__ __device__ float erfc(float a) | | static __inline__ __host__ __device__ float erfc(float a) | |
| { | | { | |
| return erfcf(a); | | return erfcf(a); | |
| } | | } | |
| | | | |
|
| | | static __inline__ __host__ __device__ float erfcinv(float a) | |
| | | { | |
| | | return erfcinvf(a); | |
| | | } | |
| | | | |
| static __inline__ __host__ __device__ float lgamma(float a) | | static __inline__ __host__ __device__ float lgamma(float a) | |
| { | | { | |
| return lgammaf(a); | | return lgammaf(a); | |
| } | | } | |
| | | | |
| static __inline__ __host__ __device__ float tgamma(float a) | | static __inline__ __host__ __device__ float tgamma(float a) | |
| { | | { | |
| return tgammaf(a); | | return tgammaf(a); | |
| } | | } | |
| | | | |
| | | | |
| skipping to change at line 1055 | | skipping to change at line 1075 | |
| return copysignf(u, a); | | return copysignf(u, a); | |
| } | | } | |
| } | | } | |
| | | | |
| __device_func__(float __internal_fminf(float a, float b)) | | __device_func__(float __internal_fminf(float a, float b)) | |
| { | | { | |
| volatile union { | | volatile union { | |
| float f; | | float f; | |
| unsigned int i; | | unsigned int i; | |
| } cvta, cvtb; | | } cvta, cvtb; | |
|
| | | int nana, nanb; | |
| | | | |
| cvta.f = a; | | cvta.f = a; | |
| cvtb.f = b; | | cvtb.f = b; | |
|
| if ((cvta.i << 1) > 0xff000000) return b; | | nana = ((cvta.i << 1) > 0xff000000); | |
| if ((cvtb.i << 1) > 0xff000000) return a; | | nanb = ((cvtb.i << 1) > 0xff000000); | |
| | | if (nana && nanb) return a + b; | |
| | | if (nana) return b; | |
| | | if (nanb) return a; | |
| if ((cvta.i | cvtb.i) == 0x80000000) { | | if ((cvta.i | cvtb.i) == 0x80000000) { | |
| return CUDART_NEG_ZERO_F; | | return CUDART_NEG_ZERO_F; | |
| } | | } | |
| return a < b ? a : b; | | return a < b ? a : b; | |
| } | | } | |
| | | | |
| __device_func__(float __internal_fmaxf(float a, float b)) | | __device_func__(float __internal_fmaxf(float a, float b)) | |
| { | | { | |
| volatile union { | | volatile union { | |
| float f; | | float f; | |
| unsigned int i; | | unsigned int i; | |
| } cvta, cvtb; | | } cvta, cvtb; | |
|
| | | int nana, nanb; | |
| | | | |
| cvta.f = a; | | cvta.f = a; | |
| cvtb.f = b; | | cvtb.f = b; | |
|
| if ((cvta.i << 1) > 0xff000000) return b; | | nana = ((cvta.i << 1) > 0xff000000); | |
| if ((cvtb.i << 1) > 0xff000000) return a; | | nanb = ((cvtb.i << 1) > 0xff000000); | |
| | | if (nana && nanb) return a + b; | |
| | | if (nana) return b; | |
| | | if (nanb) return a; | |
| if ((cvta.f == 0.0f) && (cvtb.f == 0.0f)) { | | if ((cvta.f == 0.0f) && (cvtb.f == 0.0f)) { | |
| cvta.i &= cvtb.i; | | cvta.i &= cvtb.i; | |
| return cvta.f; | | return cvta.f; | |
| } | | } | |
| return a > b ? a : b; | | return a > b ? a : b; | |
| } | | } | |
| | | | |
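The rewritten min/max kernels above match the C99 fminf/fmaxf rules: one NaN operand yields the other operand, two NaN operands yield NaN (via a + b), and signed zeros are ordered. A host check of the same contract:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        float qnan = nanf("");
        printf("%g %g %d\n",
               fminf(qnan, 3.0f),               /* 3 */
               fmaxf(-2.0f, qnan),              /* -2 */
               isnan(fminf(qnan, qnan)) != 0);  /* 1 */
        return 0;
    }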
| #if defined(_WIN32) | | #if defined(_WIN32) | |
| | | | |
| __func__(double trunc(double a)) | | __func__(double trunc(double a)) | |
| | | | |
| skipping to change at line 1140 | | skipping to change at line 1168 | |
| return ceilf(a); | | return ceilf(a); | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_floorf(float a)) | | __device_func__(float __cuda_floorf(float a)) | |
| { | | { | |
| return floorf(a); | | return floorf(a); | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_sqrtf(float a)) | | __device_func__(float __cuda_sqrtf(float a)) | |
| { | | { | |
|
| return sqrtf(a); | | return sqrtf(a); | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_rsqrtf(float a)) | | __device_func__(float __cuda_rsqrtf(float a)) | |
| { | | { | |
|
| return 1.0f / sqrtf(a); | | return 1.0f / sqrtf(a); | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_truncf(float a)) | | __device_func__(float __cuda_truncf(float a)) | |
| { | | { | |
| return truncf(a); | | return truncf(a); | |
| } | | } | |
| | | | |
| __device_func__(int __cuda_max(int a, int b)) | | __device_func__(int __cuda_max(int a, int b)) | |
| { | | { | |
| return max(a, b); | | return max(a, b); | |
| | | | |
| skipping to change at line 1308 | | skipping to change at line 1336 | |
| #endif /* __CUDABE__ */ | | #endif /* __CUDABE__ */ | |
| return a; | | return a; | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_nanf(const char *tagp)) | | __device_func__(float __cuda_nanf(const char *tagp)) | |
| { | | { | |
| /* the GPU only has one canonical QNaN, so return that */ | | /* the GPU only has one canonical QNaN, so return that */ | |
| return CUDART_NAN_F; | | return CUDART_NAN_F; | |
| } | | } | |
| | | | |
|
| | | __device_func__(float __internal_fmad(float a, float b, float c)) | |
| | | { | |
| | | return a * b + c; | |
| | | } | |
| | | | |
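__internal_fmad makes every polynomial step below an explicit multiply-add, which the compiler can contract into a single FMAD; the chains that follow are plain Horner evaluations. An illustrative host analogue using C99 fmaf, evaluating 2x^3 + 3x^2 + 4x + 5 at x = 2:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        float x = 2.0f;
        float t = 2.0f;              /* leading coefficient */
        t = fmaf(t, x, 3.0f);        /* one Horner step per fused op */
        t = fmaf(t, x, 4.0f);
        t = fmaf(t, x, 5.0f);
        printf("%g\n", t);           /* 41 */
        return 0;
    }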
| /* approximate 2*atanh(a/2) for |a| < 0.245 */ | | /* approximate 2*atanh(a/2) for |a| < 0.245 */ | |
| __device_func__(float __internal_atanhf_kernel(float a_1, float a_2)) | | __device_func__(float __internal_atanhf_kernel(float a_1, float a_2)) | |
| { | | { | |
| float a, a2, t; | | float a, a2, t; | |
| | | | |
| a = a_1 + a_2; | | a = a_1 + a_2; | |
| a2 = a * a; | | a2 = a * a; | |
|
| t = 1.566305595598990E-001f/64.0f; | | t = 1.566305595598990E-001f/64.0f; | |
| t = t * a2 + 1.995081856004762E-001f/16.0f; | | t = __internal_fmad (t, a2, 1.995081856004762E-001f/16.0f); | |
| t = t * a2 + 3.333382699617026E-001f/4.0f; | | t = __internal_fmad (t, a2, 3.333382699617026E-001f/4.0f); | |
| t = t * a2; | | t = t * a2; | |
|
| t = t * a + a_2; | | t = __internal_fmad (t, a, a_2); | |
| t = t + a_1; | | t = t + a_1; | |
| return t; | | return t; | |
| } | | } | |
| | | | |
| /* compute atan(r) in first octant, i.e. 0 <= r <= 1 | | /* compute atan(r) in first octant, i.e. 0 <= r <= 1 | |
| * eps ~= 2.16e-7 | | * eps ~= 2.16e-7 | |
| */ | | */ | |
| __device_func__(float __internal_atanf_kernel(float a)) | | __device_func__(float __internal_atanf_kernel(float a)) | |
| { | | { | |
| float t4, t0, t1; | | float t4, t0, t1; | |
| | | | |
| t4 = a * a; | | t4 = a * a; | |
|
| t0 = - 5.674867153f; | | t0 = -5.674867153f; | |
| t0 = t4 * - 0.823362947f + t0; | | t0 = __internal_fmad (t4, -0.823362947f, t0); | |
| t0 = t0 * t4 - 6.565555096f; | | t0 = __internal_fmad (t0, t4, -6.565555096f); | |
| t0 = t0 * t4; | | t0 = t0 * t4; | |
| t0 = t0 * a; | | t0 = t0 * a; | |
|
| t1 = t4 + 11.33538818f; | | t1 = t4 + 11.33538818f; | |
| t1 = t1 * t4 + 28.84246826f; | | t1 = __internal_fmad (t1, t4, 28.84246826f); | |
| t1 = t1 * t4 + 19.69667053f; | | t1 = __internal_fmad (t1, t4, 19.69667053f); | |
| t1 = 1.0f / t1; | | t1 = 1.0f / t1; | |
|
| a = t0 * t1 + a; | | a = __internal_fmad (t0, t1, a); | |
| return a; | | return a; | |
| } | | } | |
| | | | |
| /* approximate tangent on -pi/4...+pi/4 */ | | /* approximate tangent on -pi/4...+pi/4 */ | |
| __device_func__(float __internal_tan_kernel(float a)) | | __device_func__(float __internal_tan_kernel(float a)) | |
| { | | { | |
| float a2, s, t; | | float a2, s, t; | |
| | | | |
| a2 = a * a; | | a2 = a * a; | |
|
| t = 4.114678393115178E-003f * a2 - 8.231194034909670E-001f; | | t = __internal_fmad (4.114678393115178E-003f, a2, -8.231194034909670E-001f); | |
| s = a2 - 2.469348886157666E+000f; | | s = a2 - 2.469348886157666E+000f; | |
| s = 1.0f / s; | | s = 1.0f / s; | |
| t = t * s; | | t = t * s; | |
| t = t * a2; | | t = t * a2; | |
|
| t = t * a + a; | | t = __internal_fmad (t, a, a); | |
| return t; | | return t; | |
| } | | } | |
| | | | |
| __device_func__(float __internal_accurate_logf(float a)) | | __device_func__(float __internal_accurate_logf(float a)) | |
| { | | { | |
| float t; | | float t; | |
| float z; | | float z; | |
| float m; | | float m; | |
| int ia, e; | | int ia, e; | |
| ia = __float_as_int(a); | | ia = __float_as_int(a); | |
| | | | |
| skipping to change at line 1380 | | skipping to change at line 1413 | |
| } | | } | |
| /* log(a) = 2 * atanh((a-1)/(a+1)) */ | | /* log(a) = 2 * atanh((a-1)/(a+1)) */ | |
| m = __int_as_float((ia & 0x807fffff) | 0x3f800000); | | m = __int_as_float((ia & 0x807fffff) | 0x3f800000); | |
| e = ((unsigned)ia >> 23) - 127; | | e = ((unsigned)ia >> 23) - 127; | |
| if (m > CUDART_SQRT_TWO_F) { | | if (m > CUDART_SQRT_TWO_F) { | |
| m = m * 0.5f; | | m = m * 0.5f; | |
| e = e + 1; | | e = e + 1; | |
| } | | } | |
| t = m - 1.0f; | | t = m - 1.0f; | |
| z = m + 1.0f; | | z = m + 1.0f; | |
|
| z = t / z; | | z = __fdividef (t, z); | |
| z = -t * z; | | z = -t * z; | |
| z = __internal_atanhf_kernel(t, z); | | z = __internal_atanhf_kernel(t, z); | |
|
| z = (float)e * CUDART_LN2_F + z; | | z = __internal_fmad ((float)e, CUDART_LN2_F, z); | |
| return z; | | return z; | |
| } | | } | |
| | | | |
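The identity behind __internal_accurate_logf, sketched on the host: write a = m * 2^e and use log(a) = e*ln(2) + 2*atanh((m-1)/(m+1)). (frexpf normalizes m into [0.5, 1) rather than the kernel's [sqrt(2)/2, sqrt(2)], but the identity is unchanged.)

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        float a = 10.0f;
        int e;
        float m = frexpf(a, &e);               /* a = m * 2^e */
        float t = (m - 1.0f) / (m + 1.0f);
        float r = (float)e * 0.69314718f + 2.0f * atanhf(t);
        printf("%f %f\n", r, logf(a));         /* both ~2.302585 */
        return 0;
    }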
| __device_func__(float2 __internal_log_ep(float a)) | | __device_func__(float2 __internal_log_ep(float a)) | |
| { | | { | |
| float2 res; | | float2 res; | |
| int expo; | | int expo; | |
| float m; | | float m; | |
| float log_hi, log_lo; | | float log_hi, log_lo; | |
| float t_hi, t_lo; | | float t_hi, t_lo; | |
| | | | |
| skipping to change at line 1426 | | skipping to change at line 1459 | |
| /* compute log(m) with extended precision using an algorithm from P.T.P. | | /* compute log(m) with extended precision using an algorithm from P.T.P. | |
| * Tang, "Table Driven Implementation of the Logarithm Function", TOMS, | | * Tang, "Table Driven Implementation of the Logarithm Function", TOMS, | |
| * Vol. 16., No. 4, December 1990, pp. 378-400. A modified polynomial | | * Vol. 16., No. 4, December 1990, pp. 378-400. A modified polynomial | |
| * approximation to atanh(x) on the interval [-0.1716, 0.1716] is utilized. | | * approximation to atanh(x) on the interval [-0.1716, 0.1716] is utilized. | |
| */ | | */ | |
| f = m - 1.0f; | | f = m - 1.0f; | |
| g = m + 1.0f; | | g = m + 1.0f; | |
| g = 1.0f / g; | | g = 1.0f / g; | |
| u = 2.0f * f * g; | | u = 2.0f * f * g; | |
| v = u * u; | | v = u * u; | |
|
| q = 1.49356810919559350E-001f/64.0f; | | q = 1.49356810919559350E-001f/64.0f; | |
| q = q * v + 1.99887797540072460E-001f/16.0f; | | q = __internal_fmad (q, v, 1.99887797540072460E-001f/16.0f); | |
| q = q * v + 3.33333880955515580E-001f/4.0f; | | q = __internal_fmad (q, v, 3.33333880955515580E-001f/4.0f); | |
| q = q * v; | | q = q * v; | |
| q = q * u; | | q = q * u; | |
| log_hi = __int_as_float(__float_as_int(u) & 0xfffff000); | | log_hi = __int_as_float(__float_as_int(u) & 0xfffff000); | |
| v = __int_as_float(__float_as_int(f) & 0xfffff000); | | v = __int_as_float(__float_as_int(f) & 0xfffff000); | |
| u = 2.0f * (f - log_hi); | | u = 2.0f * (f - log_hi); | |
| f = f - v; | | f = f - v; | |
|
| u = u - log_hi * v; | | u = __internal_fmad (-log_hi, v, u); | |
| u = u - log_hi * f; | | u = __internal_fmad (-log_hi, f, u); | |
| u = g * u; | | u = g * u; | |
| /* compute log(m) = log_hi + u + q in double-single format*/ | | /* compute log(m) = log_hi + u + q in double-single format*/ | |
| | | | |
| /* log += u; |log| > |u| */ | | /* log += u; |log| > |u| */ | |
| r = log_hi + u; | | r = log_hi + u; | |
| s = u - (r - log_hi); | | s = u - (r - log_hi); | |
| log_hi = r; | | log_hi = r; | |
| log_lo = s; | | log_lo = s; | |
| /* log += q; |log| > |q| */ | | /* log += q; |log| > |q| */ | |
| r = log_hi + q; | | r = log_hi + q; | |
| | | | |
| skipping to change at line 1541 | | skipping to change at line 1574 | |
| } | | } | |
| result[q] = hi; | | result[q] = hi; | |
| e = e & 31; | | e = e & 31; | |
| /* shift result such that hi:lo<63:62> are the least significant | | /* shift result such that hi:lo<63:62> are the least significant | |
| integer bits, and hi:lo<61:0> are the fractional bits of the result | | integer bits, and hi:lo<61:0> are the fractional bits of the result | |
| */ | | */ | |
| hi = result[idx+2]; | | hi = result[idx+2]; | |
| lo = result[idx+1]; | | lo = result[idx+1]; | |
| if (e) { | | if (e) { | |
| q = 32 - e; | | q = 32 - e; | |
|
| hi = (hi << e) | (lo >> q); | | hi = (hi << e) + (lo >> q); | |
| lo = (lo << e) | (result[idx] >> q); | | lo = (lo << e) + (result[idx] >> q); | |
| } | | } | |
| q = hi >> 30; | | q = hi >> 30; | |
| /* fraction */ | | /* fraction */ | |
|
| hi = (hi << 2) | (lo >> 30); | | hi = (hi << 2) + (lo >> 30); | |
| lo = (lo << 2); | | lo = (lo << 2); | |
| e = (hi + (lo > 0)) > 0x80000000; /* fraction >= 0.5 */ | | e = (hi + (lo > 0)) > 0x80000000; /* fraction >= 0.5 */ | |
| q += e; | | q += e; | |
| if (s) q = -q; | | if (s) q = -q; | |
| if (e) { | | if (e) { | |
| unsigned int t; | | unsigned int t; | |
| hi = ~hi; | | hi = ~hi; | |
| lo = -(int)lo; | | lo = -(int)lo; | |
| t = (lo == 0); | | t = (lo == 0); | |
| hi += t; | | hi += t; | |
| s = s ^ 0x80000000; | | s = s ^ 0x80000000; | |
| } | | } | |
| *quadrant = q; | | *quadrant = q; | |
| /* normalize fraction */ | | /* normalize fraction */ | |
| e = 0; | | e = 0; | |
| while ((int)hi > 0) { | | while ((int)hi > 0) { | |
|
| hi = (hi << 1) | (lo >> 31); | | hi = (hi << 1) + (lo >> 31); | |
| lo = (lo << 1); | | lo = (lo << 1); | |
| e--; | | e--; | |
| } | | } | |
| lo = hi * 0xc90fdaa2; | | lo = hi * 0xc90fdaa2; | |
| hi = __umulhi(hi, 0xc90fdaa2); | | hi = __umulhi(hi, 0xc90fdaa2); | |
| if ((int)hi > 0) { | | if ((int)hi > 0) { | |
|
| hi = (hi << 1) | (lo >> 31); | | hi = (hi << 1) + (lo >> 31); | |
| lo = (lo << 1); | | lo = (lo << 1); | |
| e--; | | e--; | |
| } | | } | |
| hi = hi + (lo > 0); | | hi = hi + (lo > 0); | |
| ia = s | (((e + 126) << 23) + (hi >> 8) + ((hi << 24) >= 0x80000000)); | | ia = s | (((e + 126) << 23) + (hi >> 8) + ((hi << 24) >= 0x80000000)); | |
| return __int_as_float(ia); | | return __int_as_float(ia); | |
| } | | } | |
|
| q = __float2int_rn(a * CUDART_2_OVER_PI_F); | | q = __float2int_rn (a * CUDART_2_OVER_PI_F); | |
| j = (float)q; | | j = (float)q; | |
|
| a = a - j * 1.5703125000000000e+000f; | | a = __internal_fmad (-j, 1.5703125000000000e+000f, a); | |
| a = a - j * 4.8351287841796875e-004f; | | a = __internal_fmad (-j, 4.8351287841796875e-004f, a); | |
| a = a - j * 3.1385570764541626e-007f; | | a = __internal_fmad (-j, 3.1385570764541626e-007f, a); | |
| a = a - j * 6.0771005065061922e-011f; | | a = __internal_fmad (-j, 6.0771005065061922e-011f, a); | |
| *quadrant = q; | | *quadrant = q; | |
| return a; | | return a; | |
| } | | } | |
| | | | |
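The tail of the reduction above is a Cody-Waite step: pi/2 is split into four constants with trailing zero bits, so each product with the small integer quotient q is exact and the remainder keeps full precision. A host sketch with the kernel's constants:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        float a = 100.0f;
        float q = rintf(a * 0.636619772f);       /* nearest multiple of pi/2 */
        float r = a;
        r -= q * 1.5703125000000000e+000f;       /* leading bits of pi/2 */
        r -= q * 4.8351287841796875e-004f;
        r -= q * 3.1385570764541626e-007f;
        r -= q * 6.0771005065061922e-011f;
        printf("q=%g r=%f\n", q, r);             /* q=64, r ~ -0.530965 */
        return 0;
    }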
| /* High quality implementation of expf(). A naive implementation, expf(x) = | | /* High quality implementation of expf(). A naive implementation, expf(x) = | |
| * exp2f (x * log2(e)), loses significant accuracy for large arguments, and | | * exp2f (x * log2(e)), loses significant accuracy for large arguments, and | |
| * may return results with only 15 to 16 good bits (out of 24). The present | | * may return results with only 15 to 16 good bits (out of 24). The present | |
| * implementation limits the error to about 2 ulps across the entire argument | | * implementation limits the error to about 2 ulps across the entire argument | |
| * range. It does so by employing an extended precision representation for | | * range. It does so by employing an extended precision representation for | |
| * ln(2) which is composited from ln2_hi = 0.6931457519f which provides the | | * ln(2) which is composited from ln2_hi = 0.6931457519f which provides the | |
| * most significant 16-bit of ln(2), and ln2_lo = 1.4286067653e-6f, which | | * most significant 16-bit of ln(2), and ln2_lo = 1.4286067653e-6f, which | |
| * provides the least significant 24 bits. | | * provides the least significant 24 bits. | |
| */ | | */ | |
| __device_func__(float __internal_expf_kernel(float a, float scale)) | | __device_func__(float __internal_expf_kernel(float a, float scale)) | |
| { | | { | |
| float j, z; | | float j, z; | |
| | | | |
| j = __cuda_truncf(a * CUDART_L2E_F); | | j = __cuda_truncf(a * CUDART_L2E_F); | |
|
| z = a - j * 0.6931457519f; | | z = __internal_fmad (j, -0.6931457519f, a); | |
| z = z - j * 1.4286067653e-6f; | | z = __internal_fmad (j, -1.4286067653e-6f, z); | |
| z = z * CUDART_L2E_F; | | z = z * CUDART_L2E_F; | |
| z = __cuda_exp2f(z) * __cuda_exp2f(j + scale); | | z = __cuda_exp2f(z) * __cuda_exp2f(j + scale); | |
| return z; | | return z; | |
| } | | } | |
| | | | |
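The comment above in code form, as a host sketch: after subtracting j*ln2_hi exactly and then j*ln2_lo, the residual z is small enough that the exp2f step loses no significant bits, and exp(a) = 2^j * exp(z).

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        float a = 20.0f;
        float j = truncf(a * 1.442695041f);  /* log2(e) */
        float z = a - j * 0.6931457519f;     /* ln2_hi: top bits of ln(2) */
        z = z - j * 1.4286067653e-6f;        /* ln2_lo: the low-order rest */
        printf("%g %g\n", exp2f(j) * expf(z), expf(a)); /* both ~4.85165e+08 */
        return 0;
    }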
| __device_func__(float __internal_accurate_expf(float a)) | | __device_func__(float __internal_accurate_expf(float a)) | |
| { | | { | |
| float z; | | float z; | |
|
| | | | |
| z = __internal_expf_kernel(a, 0.0f); | | z = __internal_expf_kernel(a, 0.0f); | |
| if (a < -105.0f) z = 0.0f; | | if (a < -105.0f) z = 0.0f; | |
| if (a > 105.0f) z = CUDART_INF_F; | | if (a > 105.0f) z = CUDART_INF_F; | |
| return z; | | return z; | |
| } | | } | |
| | | | |
| __device_func__(float __internal_accurate_exp10f(float a)) | | __device_func__(float __internal_accurate_exp10f(float a)) | |
| { | | { | |
| float j, z; | | float j, z; | |
|
| | | | |
| j = __cuda_truncf(a * CUDART_L2T_F); | | j = __cuda_truncf(a * CUDART_L2T_F); | |
|
| z = a - j * 3.0102920532226563e-001f; | | z = __internal_fmad (j, -3.0102920532226563e-001f, a); | |
| z = z - j * 7.9034171557301747e-007f; | | z = __internal_fmad (j, -7.9034171557301747e-007f, z); | |
| z = z * CUDART_L2T_F; | | z = z * CUDART_L2T_F; | |
| z = __cuda_exp2f(z) * __cuda_exp2f(j); | | z = __cuda_exp2f(z) * __cuda_exp2f(j); | |
| if (a < -46.0f) z = 0.0f; | | if (a < -46.0f) z = 0.0f; | |
| if (a > 46.0f) z = CUDART_INF_F; | | if (a > 46.0f) z = CUDART_INF_F; | |
| return z; | | return z; | |
| } | | } | |
| | | | |
| __device_func__(float __internal_lgammaf_pos(float a)) | | __device_func__(float __internal_lgammaf_pos(float a)) | |
| { | | { | |
| float sum; | | float sum; | |
| | | | |
| skipping to change at line 1646 | | skipping to change at line 1681 | |
| if (a == CUDART_INF_F) { | | if (a == CUDART_INF_F) { | |
| return a; | | return a; | |
| } | | } | |
| if (a >= 3.0f) { | | if (a >= 3.0f) { | |
| if (a >= 7.8f) { | | if (a >= 7.8f) { | |
| /* Stirling approximation for a >= 8; coefficients from Hart et al, | | /* Stirling approximation for a >= 8; coefficients from Hart et al, | |
| * "Computer Approximations", Wiley 1968. Approximation 5401 | | * "Computer Approximations", Wiley 1968. Approximation 5401 | |
| */ | | */ | |
| s = 1.0f / a; | | s = 1.0f / a; | |
| t = s * s; | | t = s * s; | |
|
| sum = 0.77783067e-3f; | | sum = 0.77783067e-3f; | |
| sum = sum * t - 0.2777655457e-2f; | | sum = __internal_fmad (sum, t, -0.2777655457e-2f); | |
| sum = sum * t + 0.83333273853e-1f; | | sum = __internal_fmad (sum, t, 0.83333273853e-1f); | |
| sum = sum * s + 0.918938533204672f; | | sum = __internal_fmad (sum, s, 0.918938533204672f); | |
| s = 0.5f * __internal_accurate_logf(a); | | s = 0.5f * __internal_accurate_logf(a); | |
| t = a - 0.5f; | | t = a - 0.5f; | |
| s = s * t; | | s = s * t; | |
| t = s - a; | | t = s - a; | |
| s = __fadd_rn(s, sum); /* prevent FMAD merging */ | | s = __fadd_rn(s, sum); /* prevent FMAD merging */ | |
| t = t + s; | | t = t + s; | |
| return t; | | return t; | |
| } else { | | } else { | |
| a = a - 3.0f; | | a = a - 3.0f; | |
|
| s = - 7.488903254816711E+002f; | | s = -7.488903254816711E+002f; | |
| s = s * a - 1.234974215949363E+004f; | | s = __internal_fmad (s, a, -1.234974215949363E+004f); | |
| s = s * a - 4.106137688064877E+004f; | | s = __internal_fmad (s, a, -4.106137688064877E+004f); | |
| s = s * a - 4.831066242492429E+004f; | | s = __internal_fmad (s, a, -4.831066242492429E+004f); | |
| s = s * a - 1.430333998207429E+005f; | | s = __internal_fmad (s, a, -1.430333998207429E+005f); | |
| t = a - 2.592509840117874E+002f; | | t = a - 2.592509840117874E+002f; | |
| t = t * a - 1.077717972228532E+004f; | | t = __internal_fmad (t, a, -1.077717972228532E+004f); | |
| t = t * a - 9.268505031444956E+004f; | | t = __internal_fmad (t, a, -9.268505031444956E+004f); | |
| t = t * a - 2.063535768623558E+005f; | | t = __internal_fmad (t, a, -2.063535768623558E+005f); | |
| t = s / t; | | t = __fdividef (s, t); | |
| t = t + a; | | t = t + a; | |
| return t; | | return t; | |
| } | | } | |
| } else if (a >= 1.5f) { | | } else if (a >= 1.5f) { | |
| a = a - 2.0f; | | a = a - 2.0f; | |
|
| t = + 4.959849168282574E-005f; | | t = 4.959849168282574E-005f; | |
| t = t * a - 2.208948403848352E-004f; | | t = __internal_fmad (t, a, -2.208948403848352E-004f); | |
| t = t * a + 5.413142447864599E-004f; | | t = __internal_fmad (t, a, 5.413142447864599E-004f); | |
| t = t * a - 1.204516976842832E-003f; | | t = __internal_fmad (t, a, -1.204516976842832E-003f); | |
| t = t * a + 2.884251838546602E-003f; | | t = __internal_fmad (t, a, 2.884251838546602E-003f); | |
| t = t * a - 7.382757963931180E-003f; | | t = __internal_fmad (t, a, -7.382757963931180E-003f); | |
| t = t * a + 2.058131963026755E-002f; | | t = __internal_fmad (t, a, 2.058131963026755E-002f); | |
| t = t * a - 6.735248600734503E-002f; | | t = __internal_fmad (t, a, -6.735248600734503E-002f); | |
| t = t * a + 3.224670187176319E-001f; | | t = __internal_fmad (t, a, 3.224670187176319E-001f); | |
| t = t * a + 4.227843368636472E-001f; | | t = __internal_fmad (t, a, 4.227843368636472E-001f); | |
| t = t * a; | | t = t * a; | |
| return t; | | return t; | |
| } else if (a >= 0.7f) { | | } else if (a >= 0.7f) { | |
| a = 1.0f - a; | | a = 1.0f - a; | |
|
| t = + 4.588266515364258E-002f; | | t = 4.588266515364258E-002f; | |
| t = t * a + 1.037396712740616E-001f; | | t = __internal_fmad (t, a, 1.037396712740616E-001f); | |
| t = t * a + 1.228036339653591E-001f; | | t = __internal_fmad (t, a, 1.228036339653591E-001f); | |
| t = t * a + 1.275242157462838E-001f; | | t = __internal_fmad (t, a, 1.275242157462838E-001f); | |
| t = t * a + 1.432166835245778E-001f; | | t = __internal_fmad (t, a, 1.432166835245778E-001f); | |
| t = t * a + 1.693435824224152E-001f; | | t = __internal_fmad (t, a, 1.693435824224152E-001f); | |
| t = t * a + 2.074079329483975E-001f; | | t = __internal_fmad (t, a, 2.074079329483975E-001f); | |
| t = t * a + 2.705875136435339E-001f; | | t = __internal_fmad (t, a, 2.705875136435339E-001f); | |
| t = t * a + 4.006854436743395E-001f; | | t = __internal_fmad (t, a, 4.006854436743395E-001f); | |
| t = t * a + 8.224669796332661E-001f; | | t = __internal_fmad (t, a, 8.224669796332661E-001f); | |
| t = t * a + 5.772156651487230E-001f; | | t = __internal_fmad (t, a, 5.772156651487230E-001f); | |
| t = t * a; | | t = t * a; | |
| return t; | | return t; | |
| } else { | | } else { | |
|
| t = + 3.587515669447039E-003f; | | t = 3.587515669447039E-003f; | |
| t = t * a - 5.471285428060787E-003f; | | t = __internal_fmad (t, a, -5.471285428060787E-003f); | |
| t = t * a - 4.462712795343244E-002f; | | t = __internal_fmad (t, a, -4.462712795343244E-002f); | |
| t = t * a + 1.673177015593242E-001f; | | t = __internal_fmad (t, a, 1.673177015593242E-001f); | |
| t = t * a - 4.213597883575600E-002f; | | t = __internal_fmad (t, a, -4.213597883575600E-002f); | |
| t = t * a - 6.558672843439567E-001f; | | t = __internal_fmad (t, a, -6.558672843439567E-001f); | |
| t = t * a + 5.772153712885004E-001f; | | t = __internal_fmad (t, a, 5.772153712885004E-001f); | |
| t = t * a; | | t = t * a; | |
|
| t = t * a + a; | | t = __internal_fmad (t, a, a); | |
| return -__internal_accurate_logf(t); | | return -__internal_accurate_logf(t); | |
| } | | } | |
| } | | } | |
| | | | |
| /* approximate sine on -pi/4...+pi/4 */ | | /* approximate sine on -pi/4...+pi/4 */ | |
| __device_func__(float __internal_sin_kernel(float x)) | | __device_func__(float __internal_sin_kernel(float x)) | |
| { | | { | |
| float x2, z; | | float x2, z; | |
| | | | |
| x2 = x * x; | | x2 = x * x; | |
|
| z = - 1.95152959e-4f; | | z = -1.95152959e-4f; | |
| z = z * x2 + 8.33216087e-3f; | | z = __internal_fmad (z, x2, 8.33216087e-3f); | |
| z = z * x2 - 1.66666546e-1f; | | z = __internal_fmad (z, x2, -1.66666546e-1f); | |
| z = z * x2; | | z = z * x2; | |
|
| z = z * x + x; | | z = __internal_fmad (z, x, x); | |
| | | | |
| return z; | | return z; | |
| } | | } | |
| | | | |
| /* approximate cosine on -pi/4...+pi/4 */ | | /* approximate cosine on -pi/4...+pi/4 */ | |
| __device_func__(float __internal_cos_kernel(float x)) | | __device_func__(float __internal_cos_kernel(float x)) | |
| { | | { | |
| float x2, z; | | float x2, z; | |
| | | | |
| x2 = x * x; | | x2 = x * x; | |
|
| z = 2.44331571e-5f; | | z = 2.44331571e-5f; | |
| z = z * x2 - 1.38873163e-3f; | | z = __internal_fmad (z, x2, -1.38873163e-3f); | |
| z = z * x2 + 4.16666457e-2f; | | z = __internal_fmad (z, x2, 4.16666457e-2f); | |
| z = z * x2 - 5.00000000e-1f; | | z = __internal_fmad (z, x2, -5.00000000e-1f); | |
| z = z * x2 + 1.00000000e+0f; | | z = __internal_fmad (z, x2, 1.00000000e+0f); | |
| return z; | | return z; | |
| } | | } | |
| | | | |
| __device_func__(float __internal_accurate_sinf(float a)) | | __device_func__(float __internal_accurate_sinf(float a)) | |
| { | | { | |
| float z; | | float z; | |
| int i; | | int i; | |
| | | | |
| if ((__cuda___isinff(a)) || (a == CUDART_ZERO_F)) { | | if ((__cuda___isinff(a)) || (a == CUDART_ZERO_F)) { | |
| return __fmul_rn (a, CUDART_ZERO_F); | | return __fmul_rn (a, CUDART_ZERO_F); | |
| | | | |
| skipping to change at line 1802 | | skipping to change at line 1836 | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| return cosf(a); | | return cosf(a); | |
| #elif defined(__USE_FAST_MATH__) | | #elif defined(__USE_FAST_MATH__) | |
| return __cosf(a); | | return __cosf(a); | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
| float z; | | float z; | |
| int i; | | int i; | |
| | | | |
| if (__cuda___isinff(a)) { | | if (__cuda___isinff(a)) { | |
|
| return CUDART_NAN_F; | | return __fadd_rn (a, -a); /* return NaN */ | |
| } | | } | |
| z = __internal_trig_reduction_kernel(a, &i); | | z = __internal_trig_reduction_kernel(a, &i); | |
| /* here, abs(z) <= pi/4, and i has the quadrant */ | | /* here, abs(z) <= pi/4, and i has the quadrant */ | |
| i++; | | i++; | |
| if (i & 1) { | | if (i & 1) { | |
| z = __internal_cos_kernel(z); | | z = __internal_cos_kernel(z); | |
| } else { | | } else { | |
| z = __internal_sin_kernel(z); | | z = __internal_sin_kernel(z); | |
| } | | } | |
| if (i & 2) { | | if (i & 2) { | |
| | | | |
| skipping to change at line 1836 | | skipping to change at line 1870 | |
| float z; | | float z; | |
| int i; | | int i; | |
| | | | |
| if (__cuda___isinff(a)) { | | if (__cuda___isinff(a)) { | |
| return CUDART_NAN_F; | | return CUDART_NAN_F; | |
| } | | } | |
| z = __internal_trig_reduction_kernel(a, &i); | | z = __internal_trig_reduction_kernel(a, &i); | |
| /* here, abs(z) <= pi/4, and i has the quadrant */ | | /* here, abs(z) <= pi/4, and i has the quadrant */ | |
| z = __internal_tan_kernel(z); | | z = __internal_tan_kernel(z); | |
| if (i & 1) { | | if (i & 1) { | |
|
| z = -1.0f / z; | | z = - (1.0f / z); | |
| } | | } | |
| return z; | | return z; | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_log2f(float a)) | | __device_func__(float __cuda_log2f(float a)) | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| return log2f(a); | | return log2f(a); | |
| #elif defined(__USE_FAST_MATH__) | | #elif defined(__USE_FAST_MATH__) | |
| | | | |
| skipping to change at line 1881 | | skipping to change at line 1915 | |
| return __internal_accurate_exp10f(a); | | return __internal_accurate_exp10f(a); | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_coshf(float a)) | | __device_func__(float __cuda_coshf(float a)) | |
| { | | { | |
| float z; | | float z; | |
| | | | |
| a = __cuda_fabsf(a); | | a = __cuda_fabsf(a); | |
| z = __internal_expf_kernel(a, -2.0f); | | z = __internal_expf_kernel(a, -2.0f); | |
|
| z = 2.0f * z + 0.125f / z; | | z = __internal_fmad (2.0f, z, __fdividef (0.125f, z)); | |
| if (a >= 90.0f) { | | if (a >= 90.0f) { | |
| z = CUDART_INF_F; /* overflow -> infinity */ | | z = CUDART_INF_F; /* overflow -> infinity */ | |
| } | | } | |
| return z; | | return z; | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_sinhf(float a)) | | __device_func__(float __cuda_sinhf(float a)) | |
| { | | { | |
| float s, z; | | float s, z; | |
| | | | |
| s = a; | | s = a; | |
| a = __cuda_fabsf(a); | | a = __cuda_fabsf(a); | |
| if (a < 1.0f) { /* danger of catastrophic cancellation */ | | if (a < 1.0f) { /* danger of catastrophic cancellation */ | |
| float a2 = a * a; | | float a2 = a * a; | |
| /* approximate sinh(x) on [0,1] with a polynomial */ | | /* approximate sinh(x) on [0,1] with a polynomial */ | |
|
| z = 2.816951222e-6f; | | z = 2.816951222e-6f; | |
| z = z * a2 + 1.983615978e-4f; | | z = __internal_fmad (z, a2, 1.983615978e-4f); | |
| z = z * a2 + 8.333350058e-3f; | | z = __internal_fmad (z, a2, 8.333350058e-3f); | |
| z = z * a2 + 1.666666650e-1f; | | z = __internal_fmad (z, a2, 1.666666650e-1f); | |
| z = z * a2; | | z = z * a2; | |
|
| z = z * a + a; | | z = __internal_fmad (z, a, a); | |
| } else { | | } else { | |
| z = __internal_expf_kernel(a, -2.0f); | | z = __internal_expf_kernel(a, -2.0f); | |
|
| z = 2.0f * z - 0.125f / z; | | z = __internal_fmad (2.0f, z, -__fdividef (0.125f, z)); | |
| if (a >= 90.0f) { | | if (a >= 90.0f) { | |
| z = CUDART_INF_F; /* overflow -> infinity */ | | z = CUDART_INF_F; /* overflow -> infinity */ | |
| } | | } | |
| } | | } | |
| return __cuda_copysignf(z, s); | | return __cuda_copysignf(z, s); | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_tanhf(float a)) | | __device_func__(float __cuda_tanhf(float a)) | |
| { | | { | |
| float s, t; | | float s, t; | |
| | | | |
| t = __cuda_fabsf(a); | | t = __cuda_fabsf(a); | |
| if (t < 0.55f) { | | if (t < 0.55f) { | |
| float z, z2; | | float z, z2; | |
| z = t; | | z = t; | |
| z2 = z * z; | | z2 = z * z; | |
|
| t = 1.643758066599993e-2f; | | t = 1.643758066599993e-2f; | |
| t = t * z2 - 5.267181327760551e-2f; | | t = __internal_fmad (t, z2, -5.267181327760551e-2f); | |
| t = t * z2 + 1.332072505223051e-1f; | | t = __internal_fmad (t, z2, 1.332072505223051e-1f); | |
| t = t * z2 - 3.333294663641083e-1f; | | t = __internal_fmad (t, z2, -3.333294663641083e-1f); | |
| t = t * z2; | | t = t * z2; | |
|
| s = t * z + z; | | s = __internal_fmad (t, z, z); | |
| } else { | | } else { | |
|
| s = 1.0f - 2.0f / (__internal_expf_kernel(2.0f * t, 0.0f) + 1.0f); | | s = 1.0f - __fdividef(2.0f, (__internal_expf_kernel(2.0f * t, 0.0f) + 1.0f)); | |
| if (t >= 88.0f) { | | if (t >= 88.0f) { | |
| s = 1.0f; | | s = 1.0f; | |
| } | | } | |
| } | | } | |
| return __cuda_copysignf(s, a); | | return __cuda_copysignf(s, a); | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_atan2f(float a, float b)) | | __device_func__(float __cuda_atan2f(float a, float b)) | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| | | | |
| skipping to change at line 1999 | | skipping to change at line 2033 | |
| } | | } | |
| return __cuda_copysignf(t1, a); | | return __cuda_copysignf(t1, a); | |
| } | | } | |
| | | | |
| /* approximate asin(a) on [0, 0.575] */ | | /* approximate asin(a) on [0, 0.575] */ | |
| __device_func__(float __internal_asinf_kernel(float a)) | | __device_func__(float __internal_asinf_kernel(float a)) | |
| { | | { | |
| float t2, t3, t4; | | float t2, t3, t4; | |
| | | | |
| t2 = a * a; | | t2 = a * a; | |
|
| t3 = - 0.501162291f; | | t3 = -0.501162291f; | |
| t3 = t3 * t2 + 0.915201485f; | | t3 = __internal_fmad (t3, t2, 0.915201485f); | |
| t3 = t3 * t2; | | t3 = t3 * t2; | |
| t3 = t3 * a; | | t3 = t3 * a; | |
|
| t4 = t2 - 5.478654385f; | | t4 = t2 - 5.478654385f; | |
| t4 = t4 * t2 + 5.491230488f; | | t4 = __internal_fmad (t4, t2, 5.491230488f); | |
| t4 = 1.0f / t4; | | t4 = 1.0f / t4; | |
|
| a = t3 * t4 + a; | | a = __internal_fmad (t3, t4, a); | |
| return a; | | return a; | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_asinf(float a)) | | __device_func__(float __cuda_asinf(float a)) | |
| { | | { | |
| float t0, t1, t2; | | float t0, t1, t2; | |
| | | | |
| t0 = __cuda_fabsf(a); | | t0 = __cuda_fabsf(a); | |
| t2 = 1.0f - t0; | | t2 = 1.0f - t0; | |
| t2 = 0.5f * t2; | | t2 = 0.5f * t2; | |
| t2 = __cuda_sqrtf(t2); | | t2 = __cuda_sqrtf(t2); | |
| t1 = t0 > 0.575f ? t2 : t0; | | t1 = t0 > 0.575f ? t2 : t0; | |
| t1 = __internal_asinf_kernel(t1); | | t1 = __internal_asinf_kernel(t1); | |
|
| t2 = -2.0f * t1 + CUDART_PIO2_F; | | t2 = __internal_fmad (-2.0f, t1, CUDART_PIO2_F); | |
| if (t0 > 0.575f) { | | if (t0 > 0.575f) { | |
| t1 = t2; | | t1 = t2; | |
| } | | } | |
| return __cuda_copysignf(t1, a); | | return __cuda_copysignf(t1, a); | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_acosf(float a)) | | __device_func__(float __cuda_acosf(float a)) | |
| { | | { | |
| float t0, t1, t2; | | float t0, t1, t2; | |
| | | | |
| | | | |
| skipping to change at line 2079 | | skipping to change at line 2113 | |
| return log1pf(a); | | return log1pf(a); | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
| float t; | | float t; | |
| #if !defined(__CUDABE__) && defined(_WIN32) | | #if !defined(__CUDABE__) && defined(_WIN32) | |
| /* MSVC doesn't handle negative zero correctly, so handle it separately */ | | /* MSVC doesn't handle negative zero correctly, so handle it separately */ | |
| if (a == 0.0f) return a; | | if (a == 0.0f) return a; | |
| #endif /* !__CUDABE__ && _WIN32 */ | | #endif /* !__CUDABE__ && _WIN32 */ | |
| if (a >= -0.394f && a <= 0.65f) { | | if (a >= -0.394f && a <= 0.65f) { | |
| /* log(a+1) = 2*atanh(a/(a+2)) */ | | /* log(a+1) = 2*atanh(a/(a+2)) */ | |
| t = a + 2.0f; | | t = a + 2.0f; | |
|
| t = a / t; | | t = __fdividef (a, t); | |
| t = -a * t; | | t = -a * t; | |
| t = __internal_atanhf_kernel (a, t); | | t = __internal_atanhf_kernel (a, t); | |
| } else { | | } else { | |
| t = __internal_accurate_logf (CUDART_ONE_F + a); | | t = __internal_accurate_logf (CUDART_ONE_F + a); | |
| } | | } | |
| return t; | | return t; | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_acoshf(float a)) | | __device_func__(float __cuda_acoshf(float a)) | |
| | | | |
| skipping to change at line 2101 | | skipping to change at line 2135 | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| return acoshf(a); | | return acoshf(a); | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
| float t; | | float t; | |
| | | | |
| t = a - 1.0f; | | t = a - 1.0f; | |
| if (__cuda_fabsf(t) > CUDART_TWO_TO_23_F) { | | if (__cuda_fabsf(t) > CUDART_TWO_TO_23_F) { | |
| /* for large a, acosh = log(2*a) */ | | /* for large a, acosh = log(2*a) */ | |
| return CUDART_LN2_F + __internal_accurate_logf(a); | | return CUDART_LN2_F + __internal_accurate_logf(a); | |
| } else { | | } else { | |
|
| t = t + __cuda_sqrtf(a * t + t); | | t = t + __cuda_sqrtf(__internal_fmad (a, t, t)); | |
| return __cuda_log1pf(t); | | return __cuda_log1pf(t); | |
| } | | } | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_asinhf(float a)) | | __device_func__(float __cuda_asinhf(float a)) | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| return asinhf(a); | | return asinhf(a); | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
| float fa, oofa, t; | | float fa, oofa, t; | |
| | | | |
| fa = __cuda_fabsf(a); | | fa = __cuda_fabsf(a); | |
| if (fa > CUDART_TWO_TO_126_F) { /* prevent intermediate underflow */ | | if (fa > CUDART_TWO_TO_126_F) { /* prevent intermediate underflow */ | |
| t = CUDART_LN2_F + __logf(fa); /* fast version is safe here */ | | t = CUDART_LN2_F + __logf(fa); /* fast version is safe here */ | |
| } else { | | } else { | |
| oofa = 1.0f / fa; | | oofa = 1.0f / fa; | |
|
| t = fa + fa / (oofa + __cuda_sqrtf(1.0f + oofa * oofa)); | | t = fa + __fdividef (fa, (oofa + __cuda_sqrtf(__internal_fmad(oofa, oofa, 1.0f)))); | |
| t = __cuda_log1pf(t); | | t = __cuda_log1pf(t); | |
| } | | } | |
| return __cuda_copysignf(t, a); | | return __cuda_copysignf(t, a); | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_atanhf(float a)) | | __device_func__(float __cuda_atanhf(float a)) | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| return atanhf(a); | | return atanhf(a); | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
| float fa, t; | | float fa, t; | |
| | | | |
| fa = __cuda_fabsf(a); | | fa = __cuda_fabsf(a); | |
|
| t = (2.0f * fa) / (1.0f - fa); | | t = __fdividef ((2.0f * fa), (1.0f - fa)); | |
| t = 0.5f * __cuda_log1pf(t); | | t = 0.5f * __cuda_log1pf(t); | |
| return __cuda_copysignf(t, a); | | return __cuda_copysignf(t, a); | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_expm1f(float a)) | | __device_func__(float __cuda_expm1f(float a)) | |
| { | | { | |
| float t, z, j, u; | | float t, z, j, u; | |
| /* expm1(a) = 2^t*(expm1(z)+1)-1 */ | | /* expm1(a) = 2^t*(expm1(z)+1)-1 */ | |
| t = __cuda_rintf (a * CUDART_L2E_F); | | t = __cuda_rintf (a * CUDART_L2E_F); | |
|
| z = a - t * 0.6931457519f; | | z = __internal_fmad (-t, 0.6931457519f, a); | |
| z = z - t * 1.4286067653e-6f; | | z = __internal_fmad (-t, 1.4286067653e-6f, z); | |
| /* prevent loss of accuracy for args a tad outside [-0.5*log(2),0.5*log(2)]*/ | | /* prevent loss of accuracy for args a tad outside [-0.5*log(2),0.5*log(2)]*/ | |
| if (__cuda_fabsf(a) < 0.41f) { | | if (__cuda_fabsf(a) < 0.41f) { | |
| z = a; | | z = a; | |
| t = 0.0f; | | t = 0.0f; | |
| } | | } | |
| /* prevent intermediate overflow */ | | /* prevent intermediate overflow */ | |
| j = t; | | j = t; | |
| if (t == 128.0f) j = j - 1.0f; | | if (t == 128.0f) j = j - 1.0f; | |
| /* expm1(z) on [log(2/3), log(3/2)] */ | | /* expm1(z) on [log(2/3), log(3/2)] */ | |
|
| u = 1.38795078474044430E-003f; | | u = 1.38795078474044430E-003f; | |
| u = u * z + 8.38241261853264930E-003f; | | u = __internal_fmad (u, z, 8.38241261853264930E-003f); | |
| u = u * z + 4.16678317762833940E-002f; | | u = __internal_fmad (u, z, 4.16678317762833940E-002f); | |
| u = u * z + 1.66663978874356580E-001f; | | u = __internal_fmad (u, z, 1.66663978874356580E-001f); | |
| u = u * z + 4.99999940395997040E-001f; | | u = __internal_fmad (u, z, 4.99999940395997040E-001f); | |
| u = u * z; | | u = u * z; | |
|
| u = u * z + z; | | u = __internal_fmad (u, z, z); | |
| if (a == 0.0f) u = a; // preserve input of -0 | | if (a == 0.0f) u = a; // preserve input of -0 | |
| /* 2^j*[expm1(z)+1]-1 = 2^j*expm1(z)+2^j-1 */ | | /* 2^j*[expm1(z)+1]-1 = 2^j*expm1(z)+2^j-1 */ | |
| z = __cuda_exp2f (j); | | z = __cuda_exp2f (j); | |
| a = z - 1.0f; | | a = z - 1.0f; | |
|
| if (a != 0.0f) u = u * z + a; // preserve -0 generated by FTZ | | if (a != 0.0f) u = __internal_fmad (u, z, a); // preserve -0 generated by FTZ | |
| if (t == 128.0f) u = u + u; // work around intermediate overflow | | if (t == 128.0f) u = u + u; // work around intermediate overflow | |
| /* handle massive overflow and underflow */ | | /* handle massive overflow and underflow */ | |
| if (j > 128.0f) u = CUDART_INF_F; | | if (j > 128.0f) u = CUDART_INF_F; | |
| if (j < -25.0f) u = -1.0f; | | if (j < -25.0f) u = -1.0f; | |
| return u; | | return u; | |
| } | | } | |
| | | | |
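The routine above follows the reduction expm1(a) = 2^j*expm1(z) + (2^j - 1): t = rint(a*log2(e)) picks the power of two, ln(2) is subtracted as a high/low pair so z stays accurate, and a minimax polynomial covers z on roughly [log(2/3), log(3/2)]. A minimal host-side C sketch of the same reduction, with the standard expm1f standing in for the device polynomial (the function name is illustrative, not part of the header):

    #include <math.h>

    /* Sketch: expm1(a) = 2^t * expm1(z) + (2^t - 1), z = a - t*ln(2). */
    static float expm1_reduction_sketch(float a)
    {
      float t = rintf(a * 1.442695041f);   /* t = rint(a / ln 2)              */
      float z = a - t * 0.6931457519f;     /* subtract the high part of ln 2  */
      z = z - t * 1.4286067653e-6f;        /* then the low part, for accuracy */
      float u = expm1f(z);                 /* stands in for the polynomial    */
      float s = exp2f(t);                  /* 2^t                             */
      return u * s + (s - 1.0f);           /* reassemble 2^t*expm1(z)+2^t-1   */
    }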
| __device_func__(float __cuda_hypotf(float a, float b)) | | __device_func__(float __cuda_hypotf(float a, float b)) | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| return hypotf(a, b); | | return hypotf(a, b); | |
| | | | |
| skipping to change at line 2193 | | skipping to change at line 2227 | |
| b = __cuda_fabsf(b); | | b = __cuda_fabsf(b); | |
| /* can't use min, max because they do not propagate NaNs */ | | /* can't use min, max because they do not propagate NaNs */ | |
| if (a > b) { | | if (a > b) { | |
| v = a; | | v = a; | |
| w = b; | | w = b; | |
| } else { | | } else { | |
| v = b; | | v = b; | |
| w = a; | | w = a; | |
| } | | } | |
| t = __internal_accurate_fdividef(w, v); | | t = __internal_accurate_fdividef(w, v); | |
|
| t = 1.0f + t * t; | | t = __internal_fmad (t, t, 1.0f); | |
| t = v * __cuda_sqrtf(t); | | t = v * __cuda_sqrtf(t); | |
| if (v == 0.0f) { | | if (v == 0.0f) { | |
| t = v + w; | | t = v + w; | |
| } | | } | |
| if ((v == CUDART_INF_F) || (w == CUDART_INF_F)) { | | if ((v == CUDART_INF_F) || (w == CUDART_INF_F)) { | |
| t = CUDART_INF_F; | | t = CUDART_INF_F; | |
| } | | } | |
| return t; | | return t; | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
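Both versions of hypotf rest on the same scaling identity: with v = max(|a|,|b|) and w = min(|a|,|b|), hypot(a,b) = v*sqrt(1 + (w/v)^2), so the squared term never exceeds 2 and cannot overflow. A host-side sketch of just that identity (the NaN and infinity handling the listing keeps is omitted here):

    #include <math.h>

    /* Sketch: hypot(a,b) = v * sqrt(1 + (w/v)^2) with v >= w >= 0. */
    static float hypot_sketch(float a, float b)
    {
      float v = fabsf(a), w = fabsf(b), t;
      if (w > v) { t = v; v = w; w = t; }  /* ensure v is the larger one    */
      if (v == 0.0f) return 0.0f;          /* avoid 0/0 when both are zero  */
      t = w / v;                           /* 0 <= t <= 1                   */
      return v * sqrtf(1.0f + t * t);      /* t*t <= 1, so no overflow here */
    }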
| skipping to change at line 2216 | | skipping to change at line 2250 | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| return cbrtf(a); | | return cbrtf(a); | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
| float s, t; | | float s, t; | |
| | | | |
| s = __cuda_fabsf(a); | | s = __cuda_fabsf(a); | |
| if ((a == 0.0f) || (s == CUDART_INF_F)) { | | if ((a == 0.0f) || (s == CUDART_INF_F)) { | |
| return a; | | return a; | |
| } | | } | |
|
| t = __cuda_exp2f(CUDART_THIRD_F * __log2f(s)); /* initial approximation */ | | t = __cuda_exp2f(CUDART_THIRD_F * __log2f(s)); /* initial approximation */ | |
| t = t - (t - (s / (t * t))) * CUDART_THIRD_F; /* refine approximation */ | | t = t - (t - (__fdividef(s, (t * t)))) * CUDART_THIRD_F; /* refine approximation */ | |
| if (__cuda___signbitf(a)) { | | if (__cuda___signbitf(a)) { | |
|
| t = -t; | | t = -t; | |
| } | | } | |
| return t; | | return t; | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
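The refinement step in cbrtf is one Newton iteration on f(t) = t^3 - s, rewritten as t - (t - s/t^2)/3 so it needs only a single division. A sketch of the two-step scheme, with exp2f/log2f replacing the fast device intrinsics:

    #include <math.h>

    /* Sketch: cbrt(s) for s > 0 via exp2(log2(s)/3) plus one Newton step. */
    static float cbrt_sketch(float s)
    {
      float t = exp2f(log2f(s) / 3.0f);           /* initial approximation       */
      t = t - (t - s / (t * t)) * (1.0f / 3.0f);  /* Newton: t -= (t^3-s)/(3t^2) */
      return t;
    }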
| __device_func__(float __cuda_erff(float a)) | | __device_func__(float __cuda_erff(float a)) | |
| { | | { | |
| float t, r, q; | | float t, r, q; | |
| | | | |
| t = __cuda_fabsf(a); | | t = __cuda_fabsf(a); | |
| if (t < 1.0f) { | | if (t < 1.0f) { | |
| t = t * t; | | t = t * t; | |
|
| r = -5.58510127926029810E-004f; | | r = -5.58510127926029810E-004f; | |
| r = r * t + 4.90688891415893070E-003f; | | r = __internal_fmad (r, t, 4.90688891415893070E-003f); | |
| r = r * t - 2.67027980930150640E-002f; | | r = __internal_fmad (r, t, -2.67027980930150640E-002f); | |
| r = r * t + 1.12799056505903940E-001f; | | r = __internal_fmad (r, t, 1.12799056505903940E-001f); | |
| r = r * t - 3.76122956138427440E-001f; | | r = __internal_fmad (r, t, -3.76122956138427440E-001f); | |
| r = r * t + 1.12837911712623450E+000f; | | r = __internal_fmad (r, t, 1.12837911712623450E+000f); | |
| a = a * r; | | a = a * r; | |
| } else if (t <= CUDART_INF_F) { | | } else if (t <= CUDART_INF_F) { | |
| /* coefficients from Hastings, "Approximations for Digital Computers", | | /* coefficients from Hastings, "Approximations for Digital Computers", | |
| * Princeton University Press 1955. Sheet 45. | | * Princeton University Press 1955. Sheet 45. | |
| */ | | */ | |
|
| q = 0.3275911f * t + 1.0f; | | q = __internal_fmad (t, 0.3275911f, 1.0f); | |
| q = 1.0f / q; | | q = 1.0f / q; | |
|
| r = 1.061405429f; | | r = 1.061405429f; | |
| r = r * q - 1.453152027f; | | r = __internal_fmad (r, q, -1.453152027f); | |
| r = r * q + 1.421413741f; | | r = __internal_fmad (r, q, 1.421413741f); | |
| r = r * q - 0.284496736f; | | r = __internal_fmad (r, q, -0.284496736f); | |
| r = r * q + 0.254829592f; | | r = __internal_fmad (r, q, 0.254829592f); | |
| r = r * q; | | r = r * q; | |
| q = __internal_expf_kernel(-a * a, 0.0f); | | q = __internal_expf_kernel(-a * a, 0.0f); | |
|
| r = 1.0f - q * r; | | r = __internal_fmad (-q, r, 1.0f); | |
| if (t >= 5.5f) { | | if (t >= 5.5f) { | |
| r = 1.0f; | | r = 1.0f; | |
| } | | } | |
| a = __int_as_float (__float_as_int(r) | (__float_as_int(a) & 0x80000000)); | | a = __int_as_float (__float_as_int(r) | (__float_as_int(a) & 0x80000000)); | |
| } | | } | |
| return a; | | return a; | |
| } | | } | |
| | | | |
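For t >= 1 the code evaluates the classic Hastings formula erf(t) ~ 1 - (a1*q + ... + a5*q^5)*exp(-t*t) with q = 1/(1 + 0.3275911*t); the coefficients below are copied from the listing. A host-side sketch of that branch:

    #include <math.h>

    /* Sketch of the Hastings tail branch above (valid for t >= 1). */
    static float erf_tail_sketch(float t)
    {
      float q = 1.0f / (1.0f + 0.3275911f * t);
      float r = 1.061405429f;              /* Horner evaluation of the poly */
      r = r * q - 1.453152027f;
      r = r * q + 1.421413741f;
      r = r * q - 0.284496736f;
      r = r * q + 0.254829592f;
      r = r * q;
      return 1.0f - r * expf(-t * t);
    }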
|
| | | __device_func__(float __cuda_erfinvf (float a)) | |
| | | { | |
| | | float fa, t; | |
| | | | |
| | | fa = fabsf(a); | |
| | | if (fa >= 1.0f) { | |
| | | t = __cuda_rsqrtf (__int_as_float (0xffc00000)); /* NaN */ | |
| | | if (fa == 1.0f) { | |
| | | t = a * CUDART_INF_F; /* Infinity */ | |
| | | } | |
| | | } else if (fa > 0.9375f) { | |
| | | /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev | |
| | | Approximations for the Inverse of the Error Function. Mathematics of | |
| | | Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 50 | |
| | | */ | |
| | | float p, q; | |
| | | | |
| | | t = __cuda_log1pf(-fa); | |
| | | t = __cuda_rsqrtf(-t); | |
| | | p = -1.64441567910e-1f; | |
| | | p = __internal_fmad (p, t, 6.80544246825e-1f); | |
| | | p = __internal_fmad (p, t, -1.12808139162e+0f); | |
| | | p = __internal_fmad (p, t, 6.90969348887e-1f); | |
| | | p = __internal_fmad (p, t, 1.38271964963e+0f); | |
| | | p = __internal_fmad (p, t, 1.55047000312e-1f); | |
| | | q = t + 1.38522814199e+0f; | |
| | | q = __internal_fmad (q, t, 1.55024849822e-1f); | |
| | | q = q * t; | |
| | | t = __fdividef (p, q); | |
| | | if (a < 0.0f) t = -t; | |
| | | } else if (fa > 0.75f) { | |
| | | /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev | |
| | | Approximations for the Inverse of the Error Function. Mathematics of | |
| | | Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 30 | |
| | | */ | |
| | | float p, q; | |
| | | | |
| | | t = __internal_fmad (a, a, -0.87890625f); | |
| | | p = -7.1986748896e+0f; | |
| | | p = __internal_fmad (p, t, +1.3411974175e+1f); | |
| | | p = __internal_fmad (p, t, -5.1381573203e+0f); | |
| | | p = __internal_fmad (p, t, 4.9633374831e-1f); | |
| | | q = t -1.1436535838e+1f; | |
| | | q = __internal_fmad (q, t, 1.3568885572e+1f); | |
| | | q = __internal_fmad (q, t, -4.1747509256e+0f); | |
| | | q = __internal_fmad (q, t, 3.5327242323e-1f); | |
| | | p = __fdividef (p, q); | |
| | | t = a * p; | |
| | | } else { /* polynomial approximation on [0, 0.75], max error 2 ulps */ | |
| | | float a2; | |
| | | | |
| | | a2 = a * a; | |
| | | t = 6.1046168794766742E-001f; | |
| | | t = __internal_fmad (t, a2, -8.9504882462753121E-001f); | |
| | | t = __internal_fmad (t, a2, 7.0224162369928511E-001f); | |
| | | t = __internal_fmad (t, a2, -1.9993784895823222E-001f); | |
| | | t = __internal_fmad (t, a2, 1.1920613463949599E-001f); | |
| | | t = __internal_fmad (t, a2, 8.0131492246997685E-002f); | |
| | | t = __internal_fmad (t, a2, 1.2793154958377403E-001f); | |
| | | t = __internal_fmad (t, a2, 2.3200529172828793E-001f); | |
| | | t = __internal_fmad (t, a2, 8.8622695604694379E-001f); | |
| | | t = t * a; | |
| | | } | |
| | | return t; | |
| | | } | |
| | | | |
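Since erfinv is an odd function, the central branch of the new erfinvf evaluates a polynomial in a^2 and scales by a; the two tail branches substitute t = 1/sqrt(-log1p(-|a|)) into a rational approximation from Blair et al. A sketch of the central-branch pattern, with C99 fmaf standing in for __internal_fmad and the coefficients copied from the listing:

    #include <math.h>

    /* Sketch: odd-polynomial branch of erfinv, t = a * P(a*a), |a| <= 0.75. */
    static float erfinv_center_sketch(float a)
    {
      float a2 = a * a;
      float t = 6.1046168794766742e-1f;
      t = fmaf(t, a2, -8.9504882462753121e-1f);
      t = fmaf(t, a2,  7.0224162369928511e-1f);
      t = fmaf(t, a2, -1.9993784895823222e-1f);
      t = fmaf(t, a2,  1.1920613463949599e-1f);
      t = fmaf(t, a2,  8.0131492246997685e-2f);
      t = fmaf(t, a2,  1.2793154958377403e-1f);
      t = fmaf(t, a2,  2.3200529172828793e-1f);
      t = fmaf(t, a2,  8.8622695604694379e-1f);
      return t * a;
    }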
| __device_func__(float __cuda_erfcf(float a)) | | __device_func__(float __cuda_erfcf(float a)) | |
| { | | { | |
| if (a <= 0.55f) { | | if (a <= 0.55f) { | |
| return 1.0f - __cuda_erff(a); | | return 1.0f - __cuda_erff(a); | |
| } else if (a > 10.0f) { | | } else if (a > 10.0f) { | |
| return 0.0f; | | return 0.0f; | |
| } else { | | } else { | |
| float p; | | float p; | |
| float q; | | float q; | |
| float h; | | float h; | |
| float l; | | float l; | |
| /* This rational approximation has a slight accuracy issue since all the | | /* This rational approximation has a slight accuracy issue since all the | |
| * coefficients have the same sign, so error accumulates when this is computed | | * coefficients have the same sign, so error accumulates when this is computed | |
| * in single precision. Also the division at the end isn't IEEE compliant. | | * in single precision. Also the division at the end isn't IEEE compliant. | |
| */ | | */ | |
|
| p = + 4.014893410762552E-006f; | | p = 4.014893410762552E-006f; | |
| p = p * a + 5.640401259462436E-001f; | | p = __internal_fmad (p, a, 5.640401259462436E-001f); | |
| p = p * a + 2.626649872281140E+000f; | | p = __internal_fmad (p, a, 2.626649872281140E+000f); | |
| p = p * a + 5.486372652389673E+000f; | | p = __internal_fmad (p, a, 5.486372652389673E+000f); | |
| p = p * a + 5.250714831459401E+000f; | | p = __internal_fmad (p, a, 5.250714831459401E+000f); | |
| q = a + 4.651376250488319E+000f; | | q = a + 4.651376250488319E+000f; | |
| q = q * a + 1.026302828878470E+001f; | | q = __internal_fmad (q, a, 1.026302828878470E+001f); | |
| q = q * a + 1.140762166021288E+001f; | | q = __internal_fmad (q, a, 1.140762166021288E+001f); | |
| q = q * a + 5.251211619089947E+000f; | | q = __internal_fmad (q, a, 5.251211619089947E+000f); | |
| /* Use reciprocal plus NR refinement for division */ | | /* Use reciprocal plus NR refinement for division */ | |
| h = 1.0f / q; | | h = 1.0f / q; | |
|
| q = 2.0f * h - q * h * h; | | q = __internal_fmad (-q * h, h, 2.0f * h); | |
| p = p * q; | | p = p * q; | |
| /* compute exp(-a*a) with extended precision to avoid error magnification */ | | /* compute exp(-a*a) with extended precision to avoid error magnification */ | |
| h = __int_as_float(__float_as_int(a) & 0xfffff000); /* upper 12 bits */ | | h = __int_as_float(__float_as_int(a) & 0xfffff000); /* upper 12 bits */ | |
| l = __fadd_rn (a, -h); /* lower 12 bits */ | | l = __fadd_rn (a, -h); /* lower 12 bits */ | |
| q = __fmul_rn (-h, h); /* this product is error free */ | | q = __fmul_rn (-h, h); /* this product is error free */ | |
| q = __internal_expf_kernel(q, 0.0f); | | q = __internal_expf_kernel(q, 0.0f); | |
| a = a + h; | | a = a + h; | |
| l = l * a; | | l = l * a; | |
| h = __internal_expf_kernel(-l, 0.0f); | | h = __internal_expf_kernel(-l, 0.0f); | |
| q = q * h; | | q = q * h; | |
| p = p * q; | | p = p * q; | |
| return p; | | return p; | |
| } | | } | |
| } | | } | |
| | | | |
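The heart of erfcf is computing exp(-a*a) without magnifying the rounding error of a*a: a is split into a head h holding the upper 12 mantissa bits and a tail l = a - h, so -a*a decomposes exactly into -h*h - (a+h)*l, and the two exponentials are multiplied. A host-side sketch of that splitting, with memcpy replacing the device bit-cast intrinsics:

    #include <math.h>
    #include <string.h>

    /* Sketch: exp(-a*a) in extended precision. With a = h + l and h*h
       exact, a*a = h*h + (a + h)*l, so the error term factors out. */
    static float exp_neg_square_sketch(float a)
    {
      unsigned int ia;
      float h, l;
      memcpy(&ia, &a, sizeof ia);
      ia &= 0xfffff000u;                   /* keep the upper 12 mantissa bits  */
      memcpy(&h, &ia, sizeof h);
      l = a - h;                           /* exact: h and a share an exponent */
      return expf(-h * h) * expf(-(a + h) * l);
    }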
|
| | | __device_func__(float __cuda_erfcinvf (float a)) | |
| | | { | |
| | | float t; | |
| | | if (a <= 0.0f) { | |
| | | t = CUDART_NAN_F; | |
| | | if (a == 0.0f) { | |
| | | t = (1.0f - a) * CUDART_INF_F; | |
| | | } | |
| | | } | |
| | | else if (a >= 0.0625f) { | |
| | | t = __cuda_erfinvf (1.0f - a); | |
| | | } | |
| | | else { | |
| | | float p, q; | |
| | | t = __cuda_logf(a); | |
| | | t = __cuda_rsqrtf(-t); | |
| | | p = -1.64441567910e-1f; | |
| | | p = __internal_fmad (p, t, 6.80544246825e-1f); | |
| | | p = __internal_fmad (p, t, -1.12808139162e+0f); | |
| | | p = __internal_fmad (p, t, 6.90969348887e-1f); | |
| | | p = __internal_fmad (p, t, 1.38271964963e+0f); | |
| | | p = __internal_fmad (p, t, 1.55047000312e-1f); | |
| | | q = t + 1.38522814199e+0f; | |
| | | q = __internal_fmad (q, t, 1.55024849822e-1f); | |
| | | q = q * t; | |
| | | t = __fdividef (p, q); | |
| | | } | |
| | | return t; | |
| | | } | |
| | | | |
| __device_func__(float __cuda_lgammaf(float a)) | | __device_func__(float __cuda_lgammaf(float a)) | |
| { | | { | |
| float t; | | float t; | |
| float i; | | float i; | |
| int quot; | | int quot; | |
| t = __internal_lgammaf_pos(__cuda_fabsf(a)); | | t = __internal_lgammaf_pos(__cuda_fabsf(a)); | |
| if (a >= 0.0f) return t; | | if (a >= 0.0f) return t; | |
| a = __cuda_fabsf(a); | | a = __cuda_fabsf(a); | |
| i = __cuda_floorf(a); | | i = __cuda_floorf(a); | |
| if (a == i) return CUDART_INF_F; /* a is an integer: return infinity */ | | if (a == i) return CUDART_INF_F; /* a is an integer: return infinity */ | |
| if (a < 1e-19f) return -__internal_accurate_logf(a); | | if (a < 1e-19f) return -__internal_accurate_logf(a); | |
| i = __cuda_rintf (2.0f * a); | | i = __cuda_rintf (2.0f * a); | |
| quot = (int)i; | | quot = (int)i; | |
|
| i = a - 0.5f * i; | | i = __internal_fmad (-i, 0.5f, a); | |
| i = i * CUDART_PI_F; | | i = i * CUDART_PI_F; | |
| if (quot & 1) { | | if (quot & 1) { | |
| i = __internal_cos_kernel(i); | | i = __internal_cos_kernel(i); | |
| } else { | | } else { | |
| i = __internal_sin_kernel(i); | | i = __internal_sin_kernel(i); | |
| } | | } | |
| i = __cuda_fabsf(i); | | i = __cuda_fabsf(i); | |
| t = CUDART_LNPI_F - __internal_accurate_logf(i * a) - t; | | t = CUDART_LNPI_F - __internal_accurate_logf(i * a) - t; | |
| return t; | | return t; | |
| } | | } | |
| | | | |
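For negative non-integer arguments the function applies the reflection formula: from gamma(x)*gamma(1-x) = pi/sin(pi*x) one gets lgamma(-a) = log(pi) - log(a*|sin(pi*a)|) - lgamma(a) for a > 0, which is exactly the final line above (the listing first reduces a modulo 1 so the sine stays accurate). A direct host-side check of the identity:

    #include <math.h>

    /* Sketch: reflection formula for lgamma at negative non-integers.
       Accurate only for modest a, since sinf(pi*a) is used unreduced. */
    static float lgamma_negative_sketch(float a)   /* a > 0, a not integral */
    {
      const float pi = 3.14159265f;
      float s = fabsf(sinf(pi * a));
      return logf(pi) - logf(a * s) - lgammaf(a);  /* = lgamma(-a) */
    }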
| __device_func__(float __cuda_ldexpf(float a, int b)) | | __device_func__(float __cuda_ldexpf(float a, int b)) | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| return ldexpf(a, b); | | return ldexpf(a, b); | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
| float fa = __cuda_fabsf(a); | | float fa = __cuda_fabsf(a); | |
| | | | |
| if ((fa == CUDART_ZERO_F) || (fa == CUDART_INF_F) || (b == 0)) { | | if ((fa == CUDART_ZERO_F) || (fa == CUDART_INF_F) || (b == 0)) { | |
|
| | | if (!(fa > CUDART_ZERO_F)) a = a + a; | |
| return a; | | return a; | |
|
| } | | } else if (__cuda_abs(b) < 126) { | |
| else if (__cuda_abs(b) < 126) { | | | |
| return a * __cuda_exp2f((float)b); | | return a * __cuda_exp2f((float)b); | |
|
| } | | } else if (__cuda_abs(b) < 252) { | |
| else if (__cuda_abs(b) < 252) { | | | |
| int bhalf = b / 2; | | int bhalf = b / 2; | |
| return a * __cuda_exp2f((float)bhalf) * __cuda_exp2f((float)(b - bhalf)); | | return a * __cuda_exp2f((float)bhalf) * __cuda_exp2f((float)(b - bhalf)); | |
|
| } | | } else { | |
| else { | | | |
| int bquarter = b / 4; | | int bquarter = b / 4; | |
| float t = __cuda_exp2f((float)bquarter); | | float t = __cuda_exp2f((float)bquarter); | |
| return a * t * t * t * __cuda_exp2f((float)(b - 3 * bquarter)); | | return a * t * t * t * __cuda_exp2f((float)(b - 3 * bquarter)); | |
| } | | } | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
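The cascaded branches exist because __cuda_exp2f over- or underflows for exponents outside roughly [-126, 128]; splitting b into halves (and quarters) keeps each factor in range while the product still equals a*2^b. A sketch of the two-way split (the three-way case for |b| >= 252 follows the same pattern):

    #include <math.h>
    #include <stdlib.h>

    /* Sketch: a * 2^b via split powers of two so no single exp2f call
       over- or underflows prematurely. Covers |b| < 252. */
    static float ldexp_sketch(float a, int b)
    {
      if (abs(b) < 126) return a * exp2f((float)b);
      int h = b / 2;                       /* both halves stay within range */
      return a * exp2f((float)h) * exp2f((float)(b - h));
    }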
| __device_func__(float __cuda_scalbnf(float a, int b)) | | __device_func__(float __cuda_scalbnf(float a, int b)) | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| | | | |
| skipping to change at line 2439 | | skipping to change at line 2567 | |
| return fmodf(a, b); | | return fmodf(a, b); | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
| float orig_a = a; | | float orig_a = a; | |
| float orig_b = b; | | float orig_b = b; | |
| a = __cuda_fabsf(a); | | a = __cuda_fabsf(a); | |
| b = __cuda_fabsf(b); | | b = __cuda_fabsf(b); | |
| if (!((a <= CUDART_INF_F) && (b <= CUDART_INF_F))) { | | if (!((a <= CUDART_INF_F) && (b <= CUDART_INF_F))) { | |
| return orig_a + orig_b; | | return orig_a + orig_b; | |
| } | | } | |
| if ((a == CUDART_INF_F) || (b == CUDART_ZERO_F)) { | | if ((a == CUDART_INF_F) || (b == CUDART_ZERO_F)) { | |
|
| return CUDART_NAN_F; | | return __cuda_rsqrtf (__int_as_float (0xffc00000)); | |
| } else if (a >= b) { | | } else if (a >= b) { | |
| #if !defined(__CUDABE__) | | #if !defined(__CUDABE__) | |
| /* Need to be able to handle denormals correctly */ | | /* Need to be able to handle denormals correctly */ | |
| int expoa = (a < CUDART_TWO_TO_M126_F) ? | | int expoa = (a < CUDART_TWO_TO_M126_F) ? | |
| ((int)__log2f(a)) : (((__float_as_int(a) >> 23) & 0xff) - 127); | | ((int)__log2f(a)) : (((__float_as_int(a) >> 23) & 0xff) - 127); | |
| int expob = (b < CUDART_TWO_TO_M126_F) ? | | int expob = (b < CUDART_TWO_TO_M126_F) ? | |
| ((int)__log2f(b)) : (((__float_as_int(b) >> 23) & 0xff) - 127); | | ((int)__log2f(b)) : (((__float_as_int(b) >> 23) & 0xff) - 127); | |
| int scale = expoa - expob; | | int scale = expoa - expob; | |
| float scaled_b = __cuda_ldexpf(b, scale); | | float scaled_b = __cuda_ldexpf(b, scale); | |
| if (scaled_b <= 0.5f * a) { | | if (scaled_b <= 0.5f * a) { | |
| | | | |
| skipping to change at line 2467 | | skipping to change at line 2595 | |
| } | | } | |
| #endif /* !__CUDABE__ */ | | #endif /* !__CUDABE__ */ | |
| while (scaled_b >= b) { | | while (scaled_b >= b) { | |
| if (a >= scaled_b) { | | if (a >= scaled_b) { | |
| a -= scaled_b; | | a -= scaled_b; | |
| } | | } | |
| scaled_b *= 0.5f; | | scaled_b *= 0.5f; | |
| } | | } | |
| return __cuda_copysignf(a, orig_a); | | return __cuda_copysignf(a, orig_a); | |
| } else { | | } else { | |
|
| | | if (!(a > CUDART_ZERO_F)) orig_a = orig_a + orig_a; | |
| return orig_a; | | return orig_a; | |
| } | | } | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
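The loop in fmodf is binary long division in disguise: the divisor is scaled up by powers of two until it passes |a|, then halved step by step, subtracting wherever it still fits; what remains is the remainder, given the sign of a. A compact host-side sketch (finite, nonzero, normal operands assumed; the denormal-aware exponent handling of the listing is omitted):

    #include <math.h>

    /* Sketch: fmod(a, b) by shift-and-subtract, one quotient bit per step. */
    static float fmod_sketch(float a, float b)
    {
      float r = fabsf(a), d = fabsf(b), s = d;
      while (s <= 0.5f * r) s *= 2.0f;     /* scale the divisor up toward r */
      while (s >= d) {                     /* then walk it back down        */
        if (r >= s) r -= s;                /* subtract where it fits        */
        s *= 0.5f;
      }
      return copysignf(r, a);              /* remainder carries a's sign    */
    }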
| __device_func__(float __cuda_remainderf(float a, float b)) | | __device_func__(float __cuda_remainderf(float a, float b)) | |
| { | | { | |
| | | | |
| float twoa = 0.0f; | | float twoa = 0.0f; | |
| unsigned int quot0 = 0; /* quotient bit 0 */ | | unsigned int quot0 = 0; /* quotient bit 0 */ | |
| float orig_a = a; | | float orig_a = a; | |
| float orig_b = b; | | float orig_b = b; | |
| | | | |
| a = __cuda_fabsf(a); | | a = __cuda_fabsf(a); | |
| b = __cuda_fabsf(b); | | b = __cuda_fabsf(b); | |
| if (!((a <= CUDART_INF_F) && (b <= CUDART_INF_F))) { | | if (!((a <= CUDART_INF_F) && (b <= CUDART_INF_F))) { | |
| return orig_a + orig_b; | | return orig_a + orig_b; | |
| } | | } | |
| if ((a == CUDART_INF_F) || (b == CUDART_ZERO_F)) { | | if ((a == CUDART_INF_F) || (b == CUDART_ZERO_F)) { | |
|
| return CUDART_NAN_F; | | return __cuda_rsqrtf (__int_as_float (0xffc00000)); | |
| } else if (a >= b) { | | } else if (a >= b) { | |
| #if !defined(__CUDABE__) | | #if !defined(__CUDABE__) | |
| int expoa = (a < CUDART_TWO_TO_M126_F) ? | | int expoa = (a < CUDART_TWO_TO_M126_F) ? | |
| ((int)__log2f(a)) : (((__float_as_int(a) >> 23) & 0xff) - 127); | | ((int)__log2f(a)) : (((__float_as_int(a) >> 23) & 0xff) - 127); | |
| int expob = (b < CUDART_TWO_TO_M126_F) ? | | int expob = (b < CUDART_TWO_TO_M126_F) ? | |
| ((int)__log2f(b)) : (((__float_as_int(b) >> 23) & 0xff) - 127); | | ((int)__log2f(b)) : (((__float_as_int(b) >> 23) & 0xff) - 127); | |
| int scale = expoa - expob; | | int scale = expoa - expob; | |
| float scaled_b = __cuda_ldexpf(b, scale); | | float scaled_b = __cuda_ldexpf(b, scale); | |
| if (scaled_b <= 0.5f * a) { | | if (scaled_b <= 0.5f * a) { | |
| scaled_b *= 2.0f; | | scaled_b *= 2.0f; | |
| | | | |
| skipping to change at line 2564 | | skipping to change at line 2693 | |
| /* quo has a value whose sign is the sign of x/y */ | | /* quo has a value whose sign is the sign of x/y */ | |
| sign = 0 - (__cuda___signbitf(a) != __cuda___signbitf(b)); | | sign = 0 - (__cuda___signbitf(a) != __cuda___signbitf(b)); | |
| a = __cuda_fabsf(a); | | a = __cuda_fabsf(a); | |
| b = __cuda_fabsf(b); | | b = __cuda_fabsf(b); | |
| if (!((a <= CUDART_INF_F) && (b <= CUDART_INF_F))) { | | if (!((a <= CUDART_INF_F) && (b <= CUDART_INF_F))) { | |
| *quo = quot; | | *quo = quot; | |
| return orig_a + orig_b; | | return orig_a + orig_b; | |
| } | | } | |
| if ((a == CUDART_INF_F) || (b == CUDART_ZERO_F)) { | | if ((a == CUDART_INF_F) || (b == CUDART_ZERO_F)) { | |
| *quo = quot; | | *quo = quot; | |
|
| return CUDART_NAN_F; | | return __cuda_rsqrtf (__int_as_float (0xffc00000)); | |
| } else if (a >= b) { | | } else if (a >= b) { | |
| #if !defined(__CUDABE__) | | #if !defined(__CUDABE__) | |
| /* Need to be able to handle denormals correctly */ | | /* Need to be able to handle denormals correctly */ | |
| int expoa = (a < CUDART_TWO_TO_M126_F) ? | | int expoa = (a < CUDART_TWO_TO_M126_F) ? | |
| ((int)__log2f(a)) : (((__float_as_int(a) >> 23) & 0xff) - 127); | | ((int)__log2f(a)) : (((__float_as_int(a) >> 23) & 0xff) - 127); | |
| int expob = (b < CUDART_TWO_TO_M126_F) ? | | int expob = (b < CUDART_TWO_TO_M126_F) ? | |
| ((int)__log2f(b)) : (((__float_as_int(b) >> 23) & 0xff) - 127); | | ((int)__log2f(b)) : (((__float_as_int(b) >> 23) & 0xff) - 127); | |
| int scale = expoa - expob; | | int scale = expoa - expob; | |
| float scaled_b = __cuda_ldexpf(b, scale); | | float scaled_b = __cuda_ldexpf(b, scale); | |
| if (scaled_b <= 0.5f * a) { | | if (scaled_b <= 0.5f * a) { | |
| | | | |
| skipping to change at line 2640 | | skipping to change at line 2769 | |
| __float_as_int(a)); | | __float_as_int(a)); | |
| quot = quot & CUDART_REMQUO_MASK_F; | | quot = quot & CUDART_REMQUO_MASK_F; | |
| quot = quot ^ sign; | | quot = quot ^ sign; | |
| quot = quot - sign; | | quot = quot - sign; | |
| *quo = quot; | | *quo = quot; | |
| return a; | | return a; | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_fmaf(float a, float b, float c)) | | __device_func__(float __cuda_fmaf(float a, float b, float c)) | |
| { | | { | |
|
| unsigned int xx, yy, zz, ww; | | return __fmaf_rn(a, b, c); | |
| unsigned int temp, s, u; | | | |
| unsigned int expo_x, expo_y, expo_z; | | | |
| | | | |
| xx = __float_as_int(a); | | | |
| yy = __float_as_int(b); | | | |
| zz = __float_as_int(c); | | | |
| | | | |
| #if defined(__CUDABE__) | | | |
| /* Match 'denormals are zero' behavior of the GPU */ | | | |
| if ((xx << 1) < 0x01000000) xx &= 0x80000000; | | | |
| if ((yy << 1) < 0x01000000) yy &= 0x80000000; | | | |
| if ((zz << 1) < 0x01000000) zz &= 0x80000000; | | | |
| #endif /* __CUDABE__ */ | | | |
| | | | |
| temp = 0xff; | | | |
| expo_x = temp & (xx >> 23); | | | |
| expo_x = expo_x - 1; | | | |
| expo_y = temp & (yy >> 23); | | | |
| expo_y = expo_y - 1; | | | |
| expo_z = temp & (zz >> 23); | | | |
| expo_z = expo_z - 1; | | | |
| | | | |
| if (!((expo_x <= 0xFD) && | | | |
| (expo_y <= 0xFD) && | | | |
| (expo_z <= 0xFD))) { | | | |
| /* fma (nan, y, z) --> nan | | | |
| fma (x, nan, z) --> nan | | | |
| fma (x, y, nan) --> nan | | | |
| */ | | | |
| if ((yy << 1) > 0xff000000) { | | | |
| return CUDART_NAN_F; | | | |
| } | | | |
| if ((zz << 1) > 0xff000000) { | | | |
| return CUDART_NAN_F; | | | |
| } | | | |
| if ((xx << 1) > 0xff000000) { | | | |
| return CUDART_NAN_F; | | | |
| } | | | |
| /* fma (0, inf, z) --> NaN | | | |
| fma (inf, 0, z) --> NaN | | | |
| fma (-inf,+y,+inf) --> NaN | | | |
| fma (+x,-inf,+inf) --> NaN | | | |
| fma (+inf,-y,+inf) --> NaN | | | |
| fma (-x,+inf,+inf) --> NaN | | | |
| fma (-inf,-y,-inf) --> NaN | | | |
| fma (-x,-inf,-inf) --> NaN | | | |
| fma (+inf,+y,-inf) --> NaN | | | |
| fma (+x,+inf,-inf) --> NaN | | | |
| */ | | | |
| if ((((xx << 1) == 0) && ((yy << 1) == 0xff000000)) || | | | |
| (((yy << 1) == 0) && ((xx << 1) == 0xff000000))) { | | | |
| return CUDART_NAN_F; | | | |
| } | | | |
| if ((zz << 1) == 0xff000000) { | | | |
| if (((yy << 1) == 0xff000000) || ((xx << 1) == 0xff000000)) { | | | |
| if ((int)(xx ^ yy ^ zz) < 0) { | | | |
| return CUDART_NAN_F; | | | |
| } | | | |
| } | | | |
| } | | | |
| /* fma (inf, y, z) --> inf | | | |
| fma (x, inf, z) --> inf | | | |
| fma (x, y, inf) --> inf | | | |
| */ | | | |
| if ((xx << 1) == 0xff000000) { | | | |
| xx = xx ^ (yy & 0x80000000); | | | |
| return __int_as_float(xx); | | | |
| } | | | |
| if ((yy << 1) == 0xff000000) { | | | |
| yy = yy ^ (xx & 0x80000000); | | | |
| return __int_as_float(yy); | | | |
| } | | | |
| if ((zz << 1) == 0xff000000) { | | | |
| return __int_as_float(zz); | | | |
| } | | | |
| /* fma (+0, -y, -0) --> -0 | | | |
| fma (-0, +y, -0) --> -0 | | | |
| fma (+x, -0, -0) --> -0 | | | |
| fma (-x, +0, -0) --> -0 | | | |
| */ | | | |
| if (zz == 0x80000000) { | | | |
| if (((xx << 1) == 0) || ((yy << 1) == 0)) { | | | |
| if ((int)(xx ^ yy) < 0) { | | | |
| return __int_as_float(zz); | | | |
| } | | | |
| } | | | |
| } | | | |
| /* fma (0, y, 0) --> +0 | | | |
| fma (x, 0, 0) --> +0 | | | |
| */ | | | |
| if (((zz << 1) == 0) && | | | |
| (((xx << 1) == 0) || ((yy << 1) == 0))) { | | | |
| zz &= 0x7fffffff; | | | |
| return __int_as_float(zz); | | | |
| } | | | |
| /* fma (0, y, z) --> z | | | |
| fma (x, 0, z) --> z | | | |
| */ | | | |
| if (((xx << 1) == 0) || ((yy << 1) == 0)) { | | | |
| return __int_as_float(zz); | | | |
| } | | | |
| /* normalize x, if denormal */ | | | |
| if (expo_x == (unsigned int)-1) { | | | |
| temp = xx & 0x80000000; | | | |
| xx = xx << 8; | | | |
| while (!(xx & 0x80000000)) { | | | |
| xx <<= 1; | | | |
| expo_x--; | | | |
| } | | | |
| expo_x++; | | | |
| xx = (xx >> 8) | temp; | | | |
| } | | | |
| /* normalize y, if denormal */ | | | |
| if (expo_y == (unsigned int)-1) { | | | |
| temp = yy & 0x80000000; | | | |
| yy = yy << 8; | | | |
| while (!(yy & 0x80000000)) { | | | |
| yy <<= 1; | | | |
| expo_y--; | | | |
| } | | | |
| expo_y++; | | | |
| yy = (yy >> 8) | temp; | | | |
| } | | | |
| /* normalize z, if denormal */ | | | |
| if ((expo_z == (unsigned int)-1) && ((zz << 1) != 0)) { | | | |
| temp = zz & 0x80000000; | | | |
| zz = zz << 8; | | | |
| while (!(zz & 0x80000000)) { | | | |
| zz <<= 1; | | | |
| expo_z--; | | | |
| } | | | |
| expo_z++; | | | |
| zz = (zz >> 8) | temp; | | | |
| } | | | |
| } | | | |
| | | | |
| expo_x = expo_x + expo_y; | | | |
| expo_y = xx ^ yy; | | | |
| xx = xx & 0x00ffffff; | | | |
| yy = yy << 8; | | | |
| xx = xx | 0x00800000; | | | |
| yy = yy | 0x80000000; | | | |
| | | | |
| s = __umulhi(xx, yy); | | | |
| yy = xx * yy; | | | |
| xx = s; | | | |
| expo_x = expo_x - 127 + 2; | | | |
| expo_y = expo_y & 0x80000000; | | | |
| | | | |
| /* normalize mantissa */ | | | |
| if (xx < 0x00800000) { | | | |
| xx = (xx << 1) | (yy >> 31); | | | |
| yy = (yy << 1); | | | |
| expo_x--; | | | |
| } | | | |
| temp = 0; | | | |
| if ((zz << 1) != 0) { /* z is not zero */ | | | |
| s = zz & 0x80000000; | | | |
| zz &= 0x00ffffff; | | | |
| zz |= 0x00800000; | | | |
| ww = 0; | | | |
| /* compare and swap. put augend into xx:yy */ | | | |
| if ((int)expo_z > (int)expo_x) { | | | |
| temp = expo_z; | | | |
| expo_z = expo_x; | | | |
| expo_x = temp; | | | |
| temp = zz; | | | |
| zz = xx; | | | |
| xx = temp; | | | |
| temp = ww; | | | |
| ww = yy; | | | |
| yy = temp; | | | |
| temp = expo_y; | | | |
| expo_y = s; | | | |
| s = temp; | | | |
| } | | | |
| /* augend_sign = expo_y, augend_mant = xx:yy, augend_expo = expo_x */ | | | |
| /* addend_sign = s, addend_mant = zz:ww, addend_expo = expo_z */ | | | |
| expo_z = expo_x - expo_z; | | | |
| u = expo_y ^ s; | | | |
| if (expo_z <= 49) { | | | |
| /* denormalize addend */ | | | |
| temp = 0; | | | |
| while (expo_z >= 32) { | | | |
| temp = ww | (temp != 0); | | | |
| ww = zz; | | | |
| zz = 0; | | | |
| expo_z -= 32; | | | |
| } | | | |
| if (expo_z) { | | | |
| temp = ((temp >> expo_z) | (ww << (32 - expo_z)) | | | | |
| ((temp << (32 - expo_z)) != 0)); | | | |
| ww = (ww >> expo_z) | (zz << (32 - expo_z)); | | | |
| zz = (zz >> expo_z); | | | |
| } | | | |
| } else { | | | |
| temp = 1; | | | |
| ww = 0; | | | |
| zz = 0; | | | |
| } | | | |
| if ((int)u < 0) { | | | |
| /* signs differ, effective subtraction */ | | | |
| temp = (unsigned)(-(int)temp); | | | |
| s = (temp != 0); | | | |
| u = yy - s; | | | |
| s = u > yy; | | | |
| yy = u - ww; | | | |
| s += yy > u; | | | |
| xx = (xx - zz) - s; | | | |
| if (!(xx | yy | temp)) { | | | |
| /* complete cancelation, return 0 */ | | | |
| return __int_as_float(xx); | | | |
| } | | | |
| if ((int)xx < 0) { | | | |
| /* Oops, augend had smaller mantissa. Negate mantissa and flip | | | |
| sign of result | | | |
| */ | | | |
| temp = ~temp; | | | |
| yy = ~yy; | | | |
| xx = ~xx; | | | |
| if (++temp == 0) { | | | |
| if (++yy == 0) { | | | |
| ++xx; | | | |
| } | | | |
| } | | | |
| expo_y ^= 0x80000000; | | | |
| } | | | |
| /* normalize mantissa, if necessary */ | | | |
| while (!(xx & 0x00800000)) { | | | |
| xx = (xx << 1) | (yy >> 31); | | | |
| yy = (yy << 1); | | | |
| expo_x--; | | | |
| } | | | |
| } else { | | | |
| /* signs are the same, effective addition */ | | | |
| yy = yy + ww; | | | |
| s = yy < ww; | | | |
| xx = xx + zz + s; | | | |
| if (xx & 0x01000000) { | | | |
| temp = temp | (yy << 31); | | | |
| yy = (yy >> 1) | (xx << 31); | | | |
| xx = ((xx & 0x80000000) | (xx >> 1)) & ~0x40000000; | | | |
| expo_x++; | | | |
| } | | | |
| } | | | |
| } | | | |
| temp = yy | (temp != 0); | | | |
| if (expo_x <= 0xFD) { | | | |
| /* normal */ | | | |
| xx |= expo_y; /* or in sign bit */ | | | |
| s = xx & 1; /* mantissa lsb */ | | | |
| xx += (temp == 0x80000000) ? s : (temp >> 31); | | | |
| xx = xx + (expo_x << 23); /* add in exponent */ | | | |
| return __int_as_float(xx); | | | |
| } else if ((int)expo_x >= 126) { | | | |
| /* overflow */ | | | |
| xx = expo_y | 0x7f800000; | | | |
| return __int_as_float(xx); | | | |
| } | | | |
| /* subnormal */ | | | |
| expo_x = (unsigned int)(-(int)expo_x); | | | |
| if (expo_x > 25) { | | | |
| /* massive underflow: return 0 */ | | | |
| return __int_as_float(expo_y); | | | |
| } | | | |
| yy = (xx << (32 - expo_x)) | ((yy) ? 1 : 0); | | | |
| xx = expo_y + (xx >> expo_x); | | | |
| xx = xx + ((yy==0x80000000) ? (xx & 1) : (yy >> 31)); | | | |
| xx |= expo_y; /* or in sign bit */ | | | |
| #if defined(__CUDABE__) | | | |
| /* Match 'flush to zero' response of the GPU */ | | | |
| if ((xx << 1) < 0x01000000) xx = expo_y; | | | |
| #endif /* __CUDABE__ */ | | | |
| return __int_as_float(xx); | | | |
| } | | } | |
| | | | |
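The several hundred lines of bit-level emulation on the left collapse into a single __fmaf_rn call on the right, presumably because the targets of this version expose a correctly rounded single-precision FMA. What the fused operation buys is one rounding instead of two; the snippet below makes that visible on a typical binary32 host with a hardware fmaf (the printed values are what IEEE-754 arithmetic predicts; extended-precision evaluation modes would differ):

    #include <math.h>
    #include <stdio.h>

    /* Sketch: fmaf rounds a*b+c once, so it can recover the rounding
       error of a product that plain multiply-add has already discarded. */
    int main(void)
    {
      float a = 1.0f + 0x1.0p-12f;         /* a*a is not exactly representable */
      float c = -(a * a);                  /* negated rounded product          */
      printf("%a\n", fmaf(a, a, c));       /* 0x1p-24: the rounding error      */
      printf("%a\n", a * a + c);           /* 0x0p+0: error already lost       */
      return 0;
    }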
| __device_func__(float __internal_accurate_powf(float a, float b)) | | __device_func__(float __internal_accurate_powf(float a, float b)) | |
| { | | { | |
| float2 loga, prod; | | float2 loga, prod; | |
| #if !defined(__CUDABE__) && defined(_MSC_VER) && !defined(_WIN64) | | #if !defined(__CUDABE__) && defined(_MSC_VER) && !defined(_WIN64) | |
| volatile float t; | | volatile float t; | |
| #else | | #else | |
| float t; | | float t; | |
| #endif | | #endif | |
| | | | |
| skipping to change at line 2948 | | skipping to change at line 2803 | |
| prod.x = prod.x + __int_as_float(0x37000000); | | prod.x = prod.x + __int_as_float(0x37000000); | |
| } | | } | |
| | | | |
| /* compute pow(a,b) = exp(b*log(a)) */ | | /* compute pow(a,b) = exp(b*log(a)) */ | |
| t = __cuda_expf(prod.y); | | t = __cuda_expf(prod.y); | |
| /* prevent -INF + INF = NaN */ | | /* prevent -INF + INF = NaN */ | |
| if (t != CUDART_INF_F) { | | if (t != CUDART_INF_F) { | |
| /* if prod.x is much smaller than prod.y, then exp(prod.y+prod.x) ~= | | /* if prod.x is much smaller than prod.y, then exp(prod.y+prod.x) ~= | |
| * exp(prod.y) + prod.x * exp(prod.y) | | * exp(prod.y) + prod.x * exp(prod.y) | |
| */ | | */ | |
|
| t = t * prod.x + t; | | t = __internal_fmad (t, prod.x, t); | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| | | | |
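__internal_accurate_powf computes pow(a,b) as exp(b*log(a)), carrying log(a) as a head/tail pair (the float2 loga and prod) so that the product b*log(a) holds more than single precision; the final fmad folds the tail back into the result. A collapsed host-side sketch in which one double stands in for the head/tail pair:

    #include <math.h>

    /* Sketch: pow(a, b) = exp(b * log(a)) with extra precision in the
       middle; a single double replaces the float2 head/tail used above. */
    static float pow_core_sketch(float a, float b)   /* assumes a > 0 */
    {
      double l = log((double)a);           /* log(a) beyond float precision */
      return (float)exp((double)b * l);    /* one rounding back to float    */
    }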
| __device_func__(float __cuda_powif(float a, int b)) | | __device_func__(float __cuda_powif(float a, int b)) | |
| { | | { | |
| unsigned int e = __cuda_abs(b); | | unsigned int e = __cuda_abs(b); | |
| float r = 1.0f; | | float r = 1.0f; | |
| | | | |
| while (1) { | | while (1) { | |
| if ((e & 1) != 0) { | | if ((e & 1) != 0) { | |
| r = r * a; | | r = r * a; | |
| } | | } | |
| e = e >> 1; | | e = e >> 1; | |
| if (e == 0) { | | if (e == 0) { | |
|
| return b < 0 ? 1.0f/r : r; | | return b < 0 ? 1.0f / r : r; | |
| } | | } | |
| a = a * a; | | a = a * a; | |
| } | | } | |
| } | | } | |
| | | | |
| __device_func__(double __cuda_powi(double a, int b)) | | __device_func__(double __cuda_powi(double a, int b)) | |
| { | | { | |
| unsigned int e = __cuda_abs(b); | | unsigned int e = __cuda_abs(b); | |
| double r = 1.0; | | double r = 1.0; | |
| | | | |
| while (1) { | | while (1) { | |
| if ((e & 1) != 0) { | | if ((e & 1) != 0) { | |
| r = r * a; | | r = r * a; | |
| } | | } | |
| e = e >> 1; | | e = e >> 1; | |
| if (e == 0) { | | if (e == 0) { | |
|
| return b < 0 ? 1.0/r : r; | | return b < 0 ? 1.0 / r : r; | |
| } | | } | |
| a = a * a; | | a = a * a; | |
| } | | } | |
| } | | } | |
| | | | |
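Both powif and powi are square-and-multiply: each iteration consumes one bit of the exponent while squaring the base, so the loop runs O(log |b|) times, and a negative exponent costs one reciprocal at the end. The same loop as a standalone host function, with a usage example:

    #include <stdio.h>

    /* Sketch: binary exponentiation, mirroring __cuda_powi above. */
    static double powi_sketch(double a, int b)
    {
      unsigned int e = (b < 0) ? (unsigned int)(-(long long)b) : (unsigned int)b;
      double r = 1.0;
      while (e) {
        if (e & 1) r *= a;                 /* multiply in the current bit */
        e >>= 1;
        a *= a;                            /* square the base each round  */
      }
      return (b < 0) ? 1.0 / r : r;
    }

    int main(void)
    {
      printf("%g\n", powi_sketch(2.0, -10));   /* prints 0.000976562 (1/1024) */
      return 0;
    }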
| __device_func__(float __cuda_powf(float a, float b)) | | __device_func__(float __cuda_powf(float a, float b)) | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| return powf(a, b); | | return powf(a, b); | |
| #elif defined(__USE_FAST_MATH__) | | #elif defined(__USE_FAST_MATH__) | |
| | | | |
| skipping to change at line 3024 | | skipping to change at line 2879 | |
| } | | } | |
| bIsOddInteger = (b - (2.0f * floorf(0.5f * b))) == 1.0f; | | bIsOddInteger = (b - (2.0f * floorf(0.5f * b))) == 1.0f; | |
| if (a == CUDART_ZERO_F) { | | if (a == CUDART_ZERO_F) { | |
| t = bIsOddInteger ? a : CUDART_ZERO_F; | | t = bIsOddInteger ? a : CUDART_ZERO_F; | |
| if (b < CUDART_ZERO_F) { | | if (b < CUDART_ZERO_F) { | |
| t = 1.0f / t; | | t = 1.0f / t; | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| if (a == -CUDART_INF_F) { | | if (a == -CUDART_INF_F) { | |
|
| t = (b < CUDART_ZERO_F) ? -1.0f/a : -a; | | t = - ((b < CUDART_ZERO_F) ? (1.0f / a) : a); | |
| if (bIsOddInteger) { | | if (bIsOddInteger) { | |
| t = __int_as_float(__float_as_int(t) ^ 0x80000000); | | t = __int_as_float(__float_as_int(t) ^ 0x80000000); | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| if ((a < CUDART_ZERO_F) && (b != __cuda_truncf(b))) { | | if ((a < CUDART_ZERO_F) && (b != __cuda_truncf(b))) { | |
|
| return CUDART_NAN_F; | | return __cuda_rsqrtf(__int_as_float(0xffc00000)); | |
| } | | } | |
| t = __cuda_fabsf(a); | | t = __cuda_fabsf(a); | |
| t = __internal_accurate_powf(t, b); | | t = __internal_accurate_powf(t, b); | |
| if ((a < CUDART_ZERO_F) && bIsOddInteger) { | | if ((a < CUDART_ZERO_F) && bIsOddInteger) { | |
| t = __int_as_float(__float_as_int(t) ^ 0x80000000); | | t = __int_as_float(__float_as_int(t) ^ 0x80000000); | |
| } | | } | |
| return t; | | return t; | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
| /* approximate 1.0/(x*gamma(x)) on [-0.5,0.5] */ | | /* approximate 1.0/(x*gamma(x)) on [-0.5,0.5] */ | |
| __device_func__(float __internal_tgammaf_kernel(float a)) | | __device_func__(float __internal_tgammaf_kernel(float a)) | |
| { | | { | |
| float t; | | float t; | |
|
| t = - 1.05767296987211380E-003f; | | t = -1.05767296987211380E-003f; | |
| t = t * a + 7.09279059435508670E-003f; | | t = __internal_fmad (t, a, 7.09279059435508670E-003f); | |
| t = t * a - 9.65347121958557050E-003f; | | t = __internal_fmad (t, a, -9.65347121958557050E-003f); | |
| t = t * a - 4.21736613253687960E-002f; | | t = __internal_fmad (t, a, -4.21736613253687960E-002f); | |
| t = t * a + 1.66542401247154280E-001f; | | t = __internal_fmad (t, a, 1.66542401247154280E-001f); | |
| t = t * a - 4.20043267827838460E-002f; | | t = __internal_fmad (t, a, -4.20043267827838460E-002f); | |
| t = t * a - 6.55878234051332940E-001f; | | t = __internal_fmad (t, a, -6.55878234051332940E-001f); | |
| t = t * a + 5.77215696929794240E-001f; | | t = __internal_fmad (t, a, 5.77215696929794240E-001f); | |
| t = t * a + 1.00000000000000000E+000f; | | t = __internal_fmad (t, a, 1.00000000000000000E+000f); | |
| return t; | | return t; | |
| } | | } | |
| | | | |
| /* Based on: Kraemer, W.: "Berechnung der Gammafunktion G(x) fuer reelle Punkt- | | /* Based on: Kraemer, W.: "Berechnung der Gammafunktion G(x) fuer reelle Punkt- | |
| und Intervallargumente". Zeitschrift fuer angewandte Mathematik und | | und Intervallargumente". Zeitschrift fuer angewandte Mathematik und | |
| Mechanik, Vol. 70 (1990), No. 6, pp. 581-584 | | Mechanik, Vol. 70 (1990), No. 6, pp. 581-584 | |
| */ | | */ | |
| __device_func__(float __cuda_tgammaf(float a)) | | __device_func__(float __cuda_tgammaf(float a)) | |
| { | | { | |
| float s, xx, x=a; | | float s, xx, x=a; | |
| | | | |
| skipping to change at line 3083 | | skipping to change at line 2938 | |
| xx = xx - 1.0f; | | xx = xx - 1.0f; | |
| s = s * xx; | | s = s * xx; | |
| } | | } | |
| if (x >= 0.5f) { | | if (x >= 0.5f) { | |
| xx = xx - 1.0f; | | xx = xx - 1.0f; | |
| } | | } | |
| xx = __internal_tgammaf_kernel(xx); | | xx = __internal_tgammaf_kernel(xx); | |
| if (x < 0.5f) { | | if (x < 0.5f) { | |
| xx = xx * x; | | xx = xx * x; | |
| } | | } | |
|
| s = s / xx; | | s = __fdividef(s, xx); | |
| if (x > 34.03f) { | | if (x > 34.03f) { | |
| /* Cannot use s = s * x - s due to intermediate overflow! */ | | /* Cannot use s = s * x - s due to intermediate overflow! */ | |
| xx = x - 1.0f; | | xx = x - 1.0f; | |
| s = s * xx; | | s = s * xx; | |
| } | | } | |
| return s; | | return s; | |
| } else { | | } else { | |
| if (x == __cuda_floorf(x)) { /* x is negative integer */ | | if (x == __cuda_floorf(x)) { /* x is negative integer */ | |
| x = CUDART_NAN_F; /* NaN, propagates through on device */ | | x = CUDART_NAN_F; /* NaN, propagates through on device */ | |
| #if !defined(__CUDABE__) | | #if !defined(__CUDABE__) | |
| | | | |
| skipping to change at line 3133 | | skipping to change at line 2988 | |
| return s; | | return s; | |
| } | | } | |
| } | | } | |
| | | | |
| __device_func__(float __cuda_roundf(float a)) | | __device_func__(float __cuda_roundf(float a)) | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| return roundf(a); | | return roundf(a); | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
| float fa = __cuda_fabsf(a); | | float fa = __cuda_fabsf(a); | |
|
| if (fa > CUDART_TWO_TO_23_F) { | | float u = __cuda_copysignf (0.5f, a); | |
| return a; | | u = __cuda_truncf (a + u); | |
| } else { | | if (fa > CUDART_TWO_TO_23_F) u = a; | |
| float u = __cuda_floorf(fa + 0.5f); | | if (fa < 0.5f) u = __cuda_truncf (a); | |
| if (fa < 0.5f) u = 0.0f; | | return u; | |
| return __cuda_copysignf(u, a); | | | |
| } | | | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
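The rewritten roundf trades the old branch on the magnitude for trunc(a + copysign(0.5, a)) plus two patch-ups: at |a| > 2^23 the value is already integral (and the +0.5 addition could round up), and at |a| < 0.5 the result must be a correctly signed zero. A host-side rendering of the same sequence:

    #include <math.h>

    /* Sketch: round-half-away-from-zero as trunc(a + copysign(0.5, a)),
       with the two end cases patched afterwards. */
    static float round_sketch(float a)
    {
      float fa = fabsf(a);
      float u = truncf(a + copysignf(0.5f, a));
      if (fa > 8388608.0f) u = a;          /* 2^23: already an integer    */
      if (fa < 0.5f) u = truncf(a);        /* keeps the sign of -0 intact */
      return u;
    }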
| __device_func__(long long int __internal_llroundf_kernel(float a)) | | __device_func__(long long int __internal_llroundf_kernel(float a)) | |
| { | | { | |
| unsigned long long int res, t = 0LL; | | unsigned long long int res, t = 0LL; | |
| int shift; | | int shift; | |
| unsigned int ia = __float_as_int(a); | | unsigned int ia = __float_as_int(a); | |
| if ((ia << 1) > 0xff000000) return 0LL; | | if ((ia << 1) > 0xff000000) return 0LL; | |
| if ((int)ia >= 0x5f000000) return 0x7fffffffffffffffLL; | | if ((int)ia >= 0x5f000000) return 0x7fffffffffffffffLL; | |
| | | | |
| skipping to change at line 3161 | | skipping to change at line 3014 | |
| shift = 189 - ((ia >> 23) & 0xff); | | shift = 189 - ((ia >> 23) & 0xff); | |
| res = ((long long int)(((ia << 8) | 0x80000000) >> 1)) << 32; | | res = ((long long int)(((ia << 8) | 0x80000000) >> 1)) << 32; | |
| if (shift >= 64) { | | if (shift >= 64) { | |
| t = res; | | t = res; | |
| res = 0; | | res = 0; | |
| } else if (shift) { | | } else if (shift) { | |
| t = res << (64 - shift); | | t = res << (64 - shift); | |
| res = res >> shift; | | res = res >> shift; | |
| } | | } | |
| if (t >= 0x8000000000000000LL) { | | if (t >= 0x8000000000000000LL) { | |
|
| res++; | | res++; | |
| } | | } | |
| if ((int)ia < 0) res = (unsigned long long int)(-(long long int)res); | | if ((int)ia < 0) res = (unsigned long long int)(-(long long int)res); | |
| return (long long int)res; | | return (long long int)res; | |
| } | | } | |
| | | | |
| __device_func__(long long int __cuda_llroundf(float a)) | | __device_func__(long long int __cuda_llroundf(float a)) | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| return llroundf(a); | | return llroundf(a); | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
| | | | |
| skipping to change at line 3749 | | skipping to change at line 3602 | |
| __func__(int ilogbf(float a)) | | __func__(int ilogbf(float a)) | |
| { | | { | |
| return ilogb((double)a); | | return ilogb((double)a); | |
| } | | } | |
| | | | |
| __func__(float erff(float a)) | | __func__(float erff(float a)) | |
| { | | { | |
| return (float)erf((double)a); | | return (float)erf((double)a); | |
| } | | } | |
| | | | |
|
| | | __func__(float erfinvf(float a)) | |
| | | { | |
| | | return (float)erfinv((double)a); | |
| | | } | |
| | | | |
| __func__(float erfcf(float a)) | | __func__(float erfcf(float a)) | |
| { | | { | |
| return (float)erfc((double)a); | | return (float)erfc((double)a); | |
| } | | } | |
| | | | |
|
| | | __func__(float erfcinvf(float a)) | |
| | | { | |
| | | return (float)erfcinv((double)a); | |
| | | } | |
| | | | |
| __func__(float lgammaf(float a)) | | __func__(float lgammaf(float a)) | |
| { | | { | |
| return (float)lgamma((double)a); | | return (float)lgamma((double)a); | |
| } | | } | |
| | | | |
| __func__(float tgammaf(float a)) | | __func__(float tgammaf(float a)) | |
| { | | { | |
| return (float)tgamma((double)a); | | return (float)tgamma((double)a); | |
| } | | } | |
| | | | |
| | | | |
| skipping to change at line 3821 | | skipping to change at line 3684 | |
| __func__(double tgamma(double a)) | | __func__(double tgamma(double a)) | |
| { | | { | |
| return (double)__cuda_tgammaf((float)a); | | return (double)__cuda_tgammaf((float)a); | |
| } | | } | |
| | | | |
| __func__(double erf(double a)) | | __func__(double erf(double a)) | |
| { | | { | |
| return (double)__cuda_erff((float)a); | | return (double)__cuda_erff((float)a); | |
| } | | } | |
| | | | |
|
| | | __func__(double erfinv(double a)) | |
| | | { | |
| | | return (double)__cuda_erfinvf((float)a); | |
| | | } | |
| | | | |
| __func__(double erfc(double a)) | | __func__(double erfc(double a)) | |
| { | | { | |
| return (double)__cuda_erfcf((float)a); | | return (double)__cuda_erfcf((float)a); | |
| } | | } | |
| | | | |
|
| | | __func__(double erfcinv(double a)) | |
| | | { | |
| | | return (double)__cuda_erfcinvf((float)a); | |
| | | } | |
| | | | |
| __func__(double remquo(double a, double b, int *quo)) | | __func__(double remquo(double a, double b, int *quo)) | |
| { | | { | |
| return (double)__cuda_remquof((float)a, (float)b, quo); | | return (double)__cuda_remquof((float)a, (float)b, quo); | |
| } | | } | |
| | | | |
| __func__(double remainder(double a, double b)) | | __func__(double remainder(double a, double b)) | |
| { | | { | |
| return (double)__cuda_remainderf((float)a, (float)b); | | return (double)__cuda_remainderf((float)a, (float)b); | |
| } | | } | |
| | | | |
| | | | |
End of changes. 97 change blocks. |
| 466 lines changed or deleted | | 341 lines changed or added | |
|
| math_functions_dbl_ptx3.h | | math_functions_dbl_ptx3.h | |
| /* | | /* | |
|
| * Copyright 1993-2008 NVIDIA Corporation. All rights reserved. | | * Copyright 1993-2009 NVIDIA Corporation. All rights reserved. | |
| * | | * | |
| * NOTICE TO USER: | | * NOTICE TO USER: | |
| * | | * | |
| * This source code is subject to NVIDIA ownership rights under U.S. and | | * This source code is subject to NVIDIA ownership rights under U.S. and | |
| * international Copyright laws. Users and possessors of this source code | | * international Copyright laws. Users and possessors of this source code | |
| * are hereby granted a nonexclusive, royalty-free license to use this code | | * are hereby granted a nonexclusive, royalty-free license to use this code | |
| * in individual and commercial software. | | * in individual and commercial software. | |
| * | | * | |
| * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | |
| * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | |
| | | | |
| skipping to change at line 52 | | skipping to change at line 52 | |
| | | | |
| #elif !defined(__CUDACC__) | | #elif !defined(__CUDACC__) | |
| | | | |
| #include "crt/func_macro.h" | | #include "crt/func_macro.h" | |
| | | | |
| #define INT_MAX \ | | #define INT_MAX \ | |
| ((int)((unsigned int)-1 >> 1)) | | ((int)((unsigned int)-1 >> 1)) | |
| | | | |
| #include "device_functions.h" | | #include "device_functions.h" | |
| #include "math_constants.h" | | #include "math_constants.h" | |
|
| | | #if !defined(__CUDABE__) | |
| | | #include "common_types.h" | |
| | | #endif | |
| /***************************************************************************** | | /***************************************************************************** | |
| *                                                                           * | | *                                                                           * | |
| * DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS    * | | * DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS    * | |
| *                                                                           * | | *                                                                           * | |
| *****************************************************************************/ | | *****************************************************************************/ | |
| | | | |
| __device_func__(double __cuda_fabs(double a)) | | __device_func__(double __cuda_fabs(double a)) | |
| { | | { | |
| return fabs(a); | | return fabs(a); | |
| } | | } | |
| | | | |
| __device_func__(double __cuda_fmax(double a, double b)) | | __device_func__(double __cuda_fmax(double a, double b)) | |
| { | | { | |
| #if !defined(__CUDABE__) | | #if !defined(__CUDABE__) | |
|
| volatile union { | | volatile union __cudart_DoubleUlonglongCvt cvta, cvtb; | |
| double d; | | int nana, nanb; | |
| unsigned long long int l; | | | |
| } cvta, cvtb; | | | |
| cvta.d = a; | | cvta.d = a; | |
| cvtb.d = b; | | cvtb.d = b; | |
|
| if ((cvtb.l << 1) > 0xffe0000000000000ULL) return a; | | nana = ((cvta.i << 1) > 0xffe0000000000000ULL); | |
| if ((cvta.l << 1) > 0xffe0000000000000ULL) return b; | | nanb = ((cvtb.i << 1) > 0xffe0000000000000ULL); | |
| | | if (nana && nanb) return a + b; | |
| | | if (nana) return b; | |
| | | if (nanb) return a; | |
| if ((cvta.d == 0.0) && (cvtb.d == 0.0)) { | | if ((cvta.d == 0.0) && (cvtb.d == 0.0)) { | |
|
| cvta.l &= cvtb.l; | | cvta.i &= cvtb.i; | |
| return cvta.d; | | return cvta.d; | |
| } | | } | |
| return a > b ? a : b; | | return a > b ? a : b; | |
| #else | | #else | |
| return fmax(a, b); | | return fmax(a, b); | |
| #endif /* !defined(__CUDABE__) */ | | #endif /* !defined(__CUDABE__) */ | |
| } | | } | |
| | | | |
| __device_func__(double __cuda_fmin(double a, double b)) | | __device_func__(double __cuda_fmin(double a, double b)) | |
| { | | { | |
| #if !defined(__CUDABE__) | | #if !defined(__CUDABE__) | |
|
| volatile union { | | volatile union __cudart_DoubleUlonglongCvt cvta, cvtb; | |
| double d; | | int nana, nanb; | |
| unsigned long long int l; | | | |
| } cvta, cvtb; | | | |
| cvta.d = a; | | cvta.d = a; | |
| cvtb.d = b; | | cvtb.d = b; | |
|
| if ((cvtb.l << 1) > 0xffe0000000000000ULL) return a; | | nana = ((cvta.i << 1) > 0xffe0000000000000ULL); | |
| if ((cvta.l << 1) > 0xffe0000000000000ULL) return b; | | nanb = ((cvtb.i << 1) > 0xffe0000000000000ULL); | |
| if ((cvta.l | cvtb.l) == 0x8000000000000000ULL) { | | if (nana && nanb) return a + b; | |
| | | if (nana) return b; | |
| | | if (nanb) return a; | |
| | | if ((cvta.i | cvtb.i) == 0x8000000000000000ULL) { | |
| return CUDART_NEG_ZERO ; | | return CUDART_NEG_ZERO ; | |
| } | | } | |
| return a < b ? a : b; | | return a < b ? a : b; | |
| #else | | #else | |
| return fmin(a, b); | | return fmin(a, b); | |
| #endif /* !defined(__CUDABE__) */ | | #endif /* !defined(__CUDABE__) */ | |
| } | | } | |
| | | | |
| __device_func__(double __cuda_ceil(double a)) | | __device_func__(double __cuda_ceil(double a)) | |
| { | | { | |
| | | | |
| skipping to change at line 539 | | skipping to change at line 545 | |
| z = __internal_trig_reduction_kerneld(a, &i); | | z = __internal_trig_reduction_kerneld(a, &i); | |
| /* here, abs(z) <= pi/4, and i has the quadrant */ | | /* here, abs(z) <= pi/4, and i has the quadrant */ | |
| z = __internal_tan_kerneld(z, i & 1); | | z = __internal_tan_kerneld(z, i & 1); | |
| return z; | | return z; | |
| } | | } | |
| | | | |
| __device_func__(double __cuda_log(double a)) | | __device_func__(double __cuda_log(double a)) | |
| { | | { | |
| double m, f, g, u, v, tmp, q, ulo, log_lo, log_hi; | | double m, f, g, u, v, tmp, q, ulo, log_lo, log_hi; | |
| int ihi, ilo; | | int ihi, ilo; | |
|
| int e = 0; | | | |
| | | | |
| ihi = __double2hiint(a); | | ihi = __double2hiint(a); | |
| ilo = __double2loint(a); | | ilo = __double2loint(a); | |
|
| if (__cuda___isnan(a)) { | | | |
| return a + a; | | if ((a > CUDART_ZERO) && (a < CUDART_INF)) { | |
| } | | int e = -1023; | |
| /* log(x) is undefined for x < 0.0, return INDEFINITE */ | | /* normalize denormals */ | |
| if (a < 0.0) { | | if ((unsigned)ihi < (unsigned)0x00100000) { | |
| | | a = a * CUDART_TWO_TO_54; | |
| | | e -= 54; | |
| | | ihi = __double2hiint(a); | |
| | | ilo = __double2loint(a); | |
| | | } | |
| | | /* a = m * 2^e. m <= sqrt(2): log2(a) = log2(m) + e. | |
| | | * m > sqrt(2): log2(a) = log2(m/2) + (e+1) | |
| | | */ | |
| | | e += ((ihi >> 20) & 0x7ff); | |
| | | ihi = (ihi & 0x800fffff) | 0x3ff00000; | |
| | | m = __hiloint2double (ihi, ilo); | |
| | | if ((unsigned)ihi > (unsigned)0x3ff6a09e) { | |
| | | m = __internal_half(m); | |
| | | e = e + 1; | |
| | | } | |
| | | /* log((1+m)/(1-m)) = 2*atanh(m). log(m) = 2*atanh ((m-1)/(m+1)) */ | |
| | | f = m - 1.0; | |
| | | g = m + 1.0; | |
| | | g = 1.0 / g; | |
| | | u = f * g; | |
| | | u = u + u; | |
| | | /* u = 2.0 * (m - 1.0) / (m + 1.0) */ | |
| | | v = u * u; | |
| | | q = 6.7261411553826339E-2/65536.0; | |
| | | q = __fma_rn (q, v, 6.6133829643643394E-2/16384.0); | |
| | | q = __fma_rn (q, v, 7.6940931149150890E-2/4096.0); | |
| | | q = __fma_rn (q, v, 9.0908745692137444E-2/1024.0); | |
| | | q = __fma_rn (q, v, 1.1111111499059706E-1/256.0); | |
| | | q = __fma_rn (q, v, 1.4285714283305975E-1/64.0); | |
| | | q = __fma_rn (q, v, 2.0000000000007223E-1/16.0); | |
| | | q = __fma_rn (q, v, 3.3333333333333326E-1/4.0); | |
| | | tmp = __internal_twice (f - u); | |
| | | tmp = __fma_rn (-u, f, tmp); // tmp = remainder of division | |
| | | ulo = g * tmp; // less significant quotient bits | |
| | | /* u + ulo = 2.0 * (m - 1.0) / (m + 1.0) to more than double precision */ | |
| | | q = q * v; | |
| | | q = q * u; | |
| | | /* log_hi + log_lo = log(m) to more than double precision */ | |
| | | log_hi = u; | |
| | | log_lo = ulo + q; | |
| | | /* log_hi + log_lo = log(m)+e*log(2)=log(a) to more than double precision */ | |
| | | q = __fma_rn ( e, CUDART_LN2_HI, log_hi); | |
| | | tmp = __fma_rn (-e, CUDART_LN2_HI, q); | |
| | | tmp = tmp - log_hi; | |
| | | log_hi = q; | |
| | | log_lo = log_lo - tmp; | |
| | | log_lo = __fma_rn (e, CUDART_LN2_LO, log_lo); | |
| | | return log_hi + log_lo; | |
| | | } else { | |
| | | if (__cuda___isnan(a)) { | |
| | | return a + a; | |
| | | } | |
| | | /* log(0) = -INF */ | |
| | | if (a == 0) { | |
| | | return -CUDART_INF; | |
| | | } | |
| | | /* log(INF) = INF */ | |
| | | if (a == CUDART_INF) { | |
| | | return a; | |
| | | } | |
| | | /* log(x) is undefined for x < 0.0, return INDEFINITE */ | |
| return CUDART_NAN; | | return CUDART_NAN; | |
| } | | } | |
|
| /* log(0) = -INF */ | | | |
| if (a == 0) { | | | |
| return -CUDART_INF; | | | |
| } | | | |
| /* log(INF) = INF */ | | | |
| if (__cuda___isinf(a)) { | | | |
| return a; | | | |
| } | | | |
| /* normalize denormals */ | | | |
| if (a < CUDART_TWO_TO_M1022) { | | | |
| a = a * CUDART_TWO_TO_54; | | | |
| e = -54; | | | |
| ihi = __double2hiint(a); | | | |
| ilo = __double2loint(a); | | | |
| } | | | |
| /* a = m * 2^e. m <= sqrt(2): log2(a) = log2(m) + e. | | | |
| * m > sqrt(2): log2(a) = log2(m/2) + (e+1) | | | |
| */ | | | |
| e += ((ihi >> 20) & 0x7ff) - 1023; | | | |
| m = __hiloint2double ((ihi & 0x800fffff) | 0x3ff00000, ilo); | | | |
| if (m > CUDART_SQRT_TWO) { | | | |
| m = __internal_half(m); | | | |
| e = e + 1; | | | |
| } | | | |
| /* log((1+m)/(1-m)) = 2*atanh(m). log(m) = 2*atanh ((m-1)/(m+1)) */ | | | |
| f = m - 1.0; | | | |
| g = m + 1.0; | | | |
| g = 1.0 / g; | | | |
| u = f * g; | | | |
| u = u + u; | | | |
| /* u = 2.0 * (m - 1.0) / (m + 1.0) */ | | | |
| v = u * u; | | | |
| q = 6.7261411553826339E-2/65536.0; | | | |
| q = __fma_rn (q, v, 6.6133829643643394E-2/16384.0); | | | |
| q = __fma_rn (q, v, 7.6940931149150890E-2/4096.0); | | | |
| q = __fma_rn (q, v, 9.0908745692137444E-2/1024.0); | | | |
| q = __fma_rn (q, v, 1.1111111499059706E-1/256.0); | | | |
| q = __fma_rn (q, v, 1.4285714283305975E-1/64.0); | | | |
| q = __fma_rn (q, v, 2.0000000000007223E-1/16.0); | | | |
| q = __fma_rn (q, v, 3.3333333333333326E-1/4.0); | | | |
| tmp = __internal_twice (f - u); | | | |
| tmp = __fma_rn (-u, f, tmp); // tmp = remainder of division | | | |
| ulo = g * tmp; // less significant quotient bits | | | |
| /* u + ulo = 2.0 * (m - 1.0) / (m + 1.0) to more than double precision */ | | | |
| q = q * v; | | | |
| q = q * u; | | | |
| /* log_hi + log_lo = log(m) to more than double precision */ | | | |
| log_hi = u; | | | |
| log_lo = ulo + q; | | | |
| /* log_hi + log_lo = log(m)+e*log(2)=log(a) to more than double precision */ | | | |
| q = __fma_rn ( e, CUDART_LN2_HI, log_hi); | | | |
| tmp = __fma_rn (-e, CUDART_LN2_HI, q); | | | |
| tmp = tmp - log_hi; | | | |
| log_hi = q; | | | |
| log_lo = log_lo - tmp; | | | |
| log_lo = __fma_rn (e, CUDART_LN2_LO, log_lo); | | | |
| return log_hi + log_lo; | | | |
| } | | } | |
| | | | |
| /* Requires |x.y| > |y.y|. 8 DP operations */ | | /* Requires |x.y| > |y.y|. 8 DP operations */ | |
| __device_func__(double2 __internal_ddadd_xgty (double2 x, double2 y)) | | __device_func__(double2 __internal_ddadd_xgty (double2 x, double2 y)) | |
| { | | { | |
|
| double2 z; | | double2 z; | |
| #if defined(__GNUC__) && !defined(__CUDABE__) | | #if defined(__GNUC__) && !defined(__CUDABE__) | |
|
| volatile double r, s, e; | | volatile | |
| #else | | | |
| double r, s, e; | | | |
| #endif | | #endif | |
|
| r = x.y + y.y; | | double r, s, e; | |
| e = x.y - r; | | r = x.y + y.y; | |
| s = ((e + y.y) + y.x) + x.x; | | e = x.y - r; | |
| z.y = e = r + s; | | s = ((e + y.y) + y.x) + x.x; | |
| z.x = (r - e) + s; | | z.y = e = r + s; | |
| return z; | | z.x = (r - e) + s; | |
| | | return z; | |
| } | | } | |
| | | | |
| /* Take full advantage of FMA. Only 8 DP operations */ | | /* Take full advantage of FMA. Only 8 DP operations */ | |
| __device_func__(double2 __internal_ddmul (double2 x, double2 y)) | | __device_func__(double2 __internal_ddmul (double2 x, double2 y)) | |
| { | | { | |
| #if defined(__GNUC__) && !defined(__CUDABE__) | | #if defined(__GNUC__) && !defined(__CUDABE__) | |
|
| volatile double e; | | volatile | |
| #else | | | |
| double e; | | | |
| #endif | | #endif | |
|
| double2 t, z; | | double e; | |
| t.y = x.y * y.y; | | double2 t, z; | |
| t.x = __fma_rn (x.y, y.y, -t.y); | | t.y = x.y * y.y; | |
| t.x = __fma_rn (x.x, y.x, t.x); | | t.x = __fma_rn (x.y, y.y, -t.y); | |
| t.x = __fma_rn (x.y, y.x, t.x); | | t.x = __fma_rn (x.x, y.x, t.x); | |
| t.x = __fma_rn (x.x, y.y, t.x); | | t.x = __fma_rn (x.y, y.x, t.x); | |
| z.y = e = t.y + t.x; | | t.x = __fma_rn (x.x, y.y, t.x); | |
| z.x = (t.y - e) + t.x; | | z.y = e = t.y + t.x; | |
| return z; | | z.x = (t.y - e) + t.x; | |
| | | return z; | |
| } | | } | |
| | | | |
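These two helpers implement double-double arithmetic: a value is carried as an unevaluated sum with head .y and tail .x, and the multiply leans on FMA, since __fma_rn(x.y, y.y, -t.y) yields exactly the rounding error of the head product. The same product routine as host C, with C99 fma replacing __fma_rn and the fields renamed hi/lo for readability:

    #include <math.h>

    typedef struct { double hi, lo; } dd;  /* value = hi + lo */

    /* Sketch: double-double multiply, mirroring __internal_ddmul. */
    static dd dd_mul_sketch(dd x, dd y)
    {
      dd z;
      double p = x.hi * y.hi;
      double e = fma(x.hi, y.hi, -p);      /* exact error of the head product  */
      e = fma(x.lo, y.lo, e);              /* fold in the cross and tail terms */
      e = fma(x.hi, y.lo, e);
      e = fma(x.lo, y.hi, e);
      z.hi = p + e;
      z.lo = (p - z.hi) + e;               /* renormalize into head + tail     */
      return z;
    }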
| __device_func__(double2 __internal_log_ext_prec(double a)) | | __device_func__(double2 __internal_log_ext_prec(double a)) | |
| { | | { | |
| double2 res; | | double2 res; | |
| double2 qq, cc, uu, tt; | | double2 qq, cc, uu, tt; | |
| double f, g, u, v, q, ulo, tmp, m; | | double f, g, u, v, q, ulo, tmp, m; | |
| int ilo, ihi, expo; | | int ilo, ihi, expo; | |
| | | | |
| ihi = __double2hiint(a); | | ihi = __double2hiint(a); | |
| | | | |
| skipping to change at line 668 | | skipping to change at line 675 | |
| ihi = __double2hiint(a); | | ihi = __double2hiint(a); | |
| ilo = __double2loint(a); | | ilo = __double2loint(a); | |
| expo = (ihi >> 20) & 0x7ff; | | expo = (ihi >> 20) & 0x7ff; | |
| expo -= 54; | | expo -= 54; | |
| } | | } | |
| expo -= 1023; | | expo -= 1023; | |
| /* log(a) = log(m*2^expo) = | | /* log(a) = log(m*2^expo) = | |
| log(m) + log(2)*expo, if m < sqrt(2), | | log(m) + log(2)*expo, if m < sqrt(2), | |
| log(m*0.5) + log(2)*(expo+1), if m >= sqrt(2) | | log(m*0.5) + log(2)*(expo+1), if m >= sqrt(2) | |
| */ | | */ | |
|
| m = __hiloint2double((ihi & 0x800fffff) | 0x3ff00000, ilo); | | ihi = (ihi & 0x800fffff) | 0x3ff00000; | |
| if (m > CUDART_SQRT_TWO) { | | m = __hiloint2double (ihi, ilo); | |
| | | if ((unsigned)ihi > (unsigned)0x3ff6a09e) { | |
| m = __internal_half(m); | | m = __internal_half(m); | |
| expo = expo + 1; | | expo = expo + 1; | |
| } | | } | |
| /* compute log(m) with extended precision using an algorithm derived from | | /* compute log(m) with extended precision using an algorithm derived from | |
| * P.T.P. Tang, "Table Driven Implementation of the Logarithm Function", | | * P.T.P. Tang, "Table Driven Implementation of the Logarithm Function", | |
| * TOMS, Vol. 16, No. 4, December 1990, pp. 378-400. A modified polynomial | | * TOMS, Vol. 16, No. 4, December 1990, pp. 378-400. A modified polynomial | |
| * approximation to atanh(x) on the interval [-0.1716, 0.1716] is utilized. | | * approximation to atanh(x) on the interval [-0.1716, 0.1716] is utilized. | |
| */ | | */ | |
| f = m - 1.0; | | f = m - 1.0; | |
| g = m + 1.0; | | g = m + 1.0; | |
| | | | |
| skipping to change at line 737 | | skipping to change at line 745 | |
| __device_func__(double __cuda_log10(double a)) | | __device_func__(double __cuda_log10(double a)) | |
| { | | { | |
| double t; | | double t; | |
| t = __cuda_log(a); | | t = __cuda_log(a); | |
| return __fma_rn (t, CUDART_LGE_HI, t * CUDART_LGE_LO); | | return __fma_rn (t, CUDART_LGE_HI, t * CUDART_LGE_LO); | |
| } | | } | |
| | | | |
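__cuda_log10 multiplies ln(a) by log10(e) carried as a head/tail pair, so the constant effectively contributes well beyond 53 bits: fma(t, LGE_HI, t * LGE_LO). A sketch of the idea, deriving the split from long double at runtime purely for illustration (the device code uses the fixed CUDART_LGE_HI/CUDART_LGE_LO constants; on targets where long double is no wider than double the tail degenerates to zero):

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        long double lge = log10l(expl(1.0L));   /* log10(e) in extended precision */
        double lge_hi = (double)lge;            /* head: nearest double */
        double lge_lo = (double)(lge - lge_hi); /* tail: what the head dropped */
        double a = 987.654321;
        double t = log(a);
        double r = fma(t, lge_hi, t * lge_lo);  /* t*log10(e), tail folded in */
        printf("%.17g vs %.17g\n", r, log10(a));
        return 0;
    }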
| __device_func__(double __cuda_log1p(double a)) | | __device_func__(double __cuda_log1p(double a)) | |
| { | | { | |
| double t; | | double t; | |
| #if !defined(__CUDABE__) | | int i; | |
| if (__cuda___isnan(a)) { | | | |
| return a + a; | | i = __double2hiint(a); | |
| } | | if (((unsigned)i < (unsigned)0x3fe55555) || ((int)i < (int)0xbfd99999)) { | |
| #endif | | /* Compute log(a+1) = 2*atanh(a/(a+2)) */ | |
| if ((a < -0.4) || (a > CUDART_TWOTHIRD)) { | | t = a + 2.0; | |
| return __cuda_log (a + 1.0); | | t = a / t; | |
| | | t = -a * t; | |
| | | t = __internal_atanh_kernel(a, t); | |
| | | return t; | |
| } | | } | |
| /* Compute log(a+1) = 2*atanh(a/(a+2)) */ | | return __cuda_log (a + CUDART_ONE); | |
| t = a + 2.0; | | | |
| t = a / t; | | | |
| t = -a * t; | | | |
| t = __internal_atanh_kernel(a, t); | | | |
| return t; | | | |
| } | | } | |
| | | | |
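The fast path rests on the identity log(1+a) = 2*atanh(a/(a+2)), which never forms 1+a and so avoids cancellation for small |a|. A quick C illustration of why the identity helps (libm atanh stands in for the device kernel):

    #include <math.h>
    #include <stdio.h>

    static double log1p_via_atanh(double a)
    {
        return 2.0 * atanh(a / (a + 2.0));
    }

    int main(void)
    {
        double a = 1e-9;
        printf("identity: %.17g\n", log1p_via_atanh(a)); /* keeps the -a*a/2 term */
        printf("naive:    %.17g\n", log(1.0 + a));       /* 1+a already rounded */
        return 0;
    }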
| __device_func__(double __internal_exp_kernel(double a, int scale)) | | __device_func__(double __internal_exp_kernel(double a, int scale)) | |
| { | | { | |
| double t, fac, z; | | double t, fac, z; | |
| int i; | | int i; | |
| /* exp(a) = 2^(rint(a/log(2)) + z) = 2^(i + z) */ | | /* exp(a) = 2^(rint(a/log(2)) + z) = 2^(i + z) */ | |
| t = __cuda_rint (a * CUDART_L2E); | | t = __cuda_rint (a * CUDART_L2E); | |
| i = (int)t; | | i = (int)t; | |
| z = __fma_rn (t, -CUDART_LN2_HI, a); | | z = __fma_rn (t, -CUDART_LN2_HI, a); | |
| z = __fma_rn (t, -CUDART_LN2_LO, z); | | z = __fma_rn (t, -CUDART_LN2_LO, z); | |
| fac = 2.0; | | fac = 2.0; | |
| if (i <= -1021) { | | if (i <= -1021) { | |
| i += 55; | | i += 55; | |
| fac = CUDART_TWO_TO_M54; | | fac = CUDART_TWO_TO_M54; | |
| } | | } | |
| | | /* exp(a) = 2^i * e^z */ | |
| t = __internal_expm1_kernel(z); | | t = __internal_expm1_kernel(z); | |
| /* exp(a) = 2^i * 2^z */ | | z = __internal_exp2i_kernel(i + scale - 1); | |
| z = __hiloint2double((1022 + i + scale) << 20, 0); | | | |
| t = __fma_rn (t, z, z); | | t = __fma_rn (t, z, z); | |
| t = t * fac; | | t = t * fac; | |
| return t; | | return t; | |
| } | | } | |
| | | | |
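__internal_exp_kernel reduces via exp(a) = 2^i * exp(z), with i = rint(a*log2(e)) and z = a - i*log(2) computed against a head/tail split of log(2) in two FMAs; the fac/i += 55 business then avoids building a 2^(i-1) scale factor below the normal exponent range. A host sketch of the core reduction, with ldexp() and expm1() as stand-ins for the device helpers and the split derived from long double for illustration:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        long double ln2 = logl(2.0L);
        double ln2_hi = (double)ln2;
        double ln2_lo = (double)(ln2 - ln2_hi);
        double a = 10.0;
        double t = rint(a * 1.4426950408889634);  /* a * log2(e) */
        int    i = (int)t;
        double z = fma(t, -ln2_hi, a);            /* a - i*ln2, head */
        z = fma(t, -ln2_lo, z);                   /* fold in the tail */
        double r = ldexp(expm1(z) + 1.0, i);      /* 2^i * exp(z) */
        printf("%.17g vs %.17g\n", r, exp(a));
        return 0;
    }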
| __device_func__(double __cuda_exp(double a)) | | __device_func__(double __cuda_exp(double a)) | |
| { | | { | |
| if (a > CUDART_LN2_X_1024) { | | double t; | |
| return CUDART_INF; | | int i; | |
| | | i = __double2hiint(a); | |
| | | if (((unsigned)i < (unsigned)0x40862e43) || ((int)i < (int)0xC0874911)) { | |
| | | t = __internal_exp_kernel(a, 0); | |
| | | return t; | |
| } | | } | |
| if (a <= -CUDART_LN2_X_1075) { | | t = ((unsigned int)i >> 31) ? CUDART_ZERO : CUDART_INF; | |
| return CUDART_ZERO; | | if (__cuda___isnan(a)) { | |
| | | t = a + a; | |
| } | | } | |
| a = __internal_exp_kernel(a, 0); | | return t; | |
| return a; | | | |
| } | | } | |
| | | | |
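The rewritten __cuda_exp classifies the argument with one look at the high 32 bits: 0x40862e43 is the high word of ~709.78 (about log(DBL_MAX)) and 0xC0874911 that of ~-745.13, beyond which exp underflows to zero; NaN high words fail both compares and fall through to the special-case tail. A host sketch of the test, where double2hiint is a hypothetical stand-in for __double2hiint (link with -lm):

    #include <math.h>
    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    static int double2hiint(double a)        /* stand-in for __double2hiint */
    {
        uint64_t bits;
        memcpy(&bits, &a, sizeof bits);
        return (int)(uint32_t)(bits >> 32);
    }

    int main(void)
    {
        double vals[5] = { 1.0, 700.0, 800.0, -800.0, 0.0 };
        vals[4] = nan("");
        for (int k = 0; k < 5; k++) {
            int i = double2hiint(vals[k]);
            int in_range = ((unsigned)i < 0x40862e43u) || (i < (int)0xC0874911u);
            printf("%-8g hi=0x%08x in_range=%d\n", vals[k], (unsigned)i, in_range);
        }
        return 0;
    }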
| __device_func__(double __cuda_exp2(double a)) | | __device_func__(double __cuda_exp2(double a)) | |
| { | | { | |
| double z; | | double z; | |
| double t; | | double t; | |
| double fac; | | double fac; | |
| int i; | | int i; | |
| | | | |
| if (a >= 1024.0) { | | i = __double2hiint(a); | |
| return CUDART_INF; | | if (((unsigned)i < (unsigned)0x40900000) || ((int)i < (int)0xc090cc00)) { | |
| } | | t = __cuda_rint (a); | |
| if (a < -1075.0) { | | z = a - t; | |
| return CUDART_ZERO; | | i = (int)t; | |
| | | fac = 2.0; | |
| | | if (i <= -1021) { | |
| | | i += 55; | |
| | | fac = CUDART_TWO_TO_M54; | |
| | | } | |
| | | /* 2^z = exp(log(2)*z) */ | |
| | | z = __fma_rn (z, CUDART_LN2_HI, z * CUDART_LN2_LO); | |
| | | t = __internal_expm1_kernel(z); | |
| | | z = __internal_exp2i_kernel(i - 1); | |
| | | t = __fma_rn (t, z, z); | |
| | | t = t * fac; | |
| | | return t; | |
| } | | } | |
| t = __cuda_rint (a); | | t = ((unsigned int)i >> 31) ? CUDART_ZERO : CUDART_INF; | |
| z = a - t; | | if (__cuda___isnan(a)) { | |
| i = (int)t; | | t = a + a; | |
| fac = 2.0; | | | |
| if (i <= -1021) { | | | |
| i += 55; | | | |
| fac = CUDART_TWO_TO_M54; | | | |
| } | | } | |
| /* 2^z = exp(log(2)*z) */ | | | |
| z = __fma_rn (z, CUDART_LN2_HI, z * CUDART_LN2_LO); | | | |
| t = __internal_expm1_kernel(z); | | | |
| z = __hiloint2double((1022 + i) << 20, 0); | | | |
| t = __fma_rn (t, z, z); | | | |
| t = t * fac; | | | |
| return t; | | return t; | |
| } | | } | |
| | | | |
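Both versions build the 2^k scale factor directly from bits: the old code writes (1022 + i) << 20 into the high word via __hiloint2double, and __internal_exp2i_kernel packages the same trick. Viewed as a flat 64-bit word the shift is by 52. A sketch, assuming k stays in the normal exponent range:

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    static double exp2i_sketch(int k)     /* assumes -1022 <= k <= 1023 */
    {
        uint64_t bits = (uint64_t)(1023 + k) << 52;  /* biased exponent field */
        double d;
        memcpy(&d, &bits, sizeof d);      /* 2^k, no FP arithmetic involved */
        return d;
    }

    int main(void)
    {
        printf("%g %g %g\n", exp2i_sketch(0), exp2i_sketch(10), exp2i_sketch(-10));
        return 0;
    }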
| __device_func__(double __cuda_exp10(double a)) | | __device_func__(double __cuda_exp10(double a)) | |
| { | | { | |
| double z; | | double z; | |
| double t; | | double t; | |
| double fac; | | double fac; | |
| int i; | | int i; | |
| | | | |
| if (a >= CUDART_LG2_X_1024) { | | i = __double2hiint(a); | |
| return CUDART_INF; | | if (((unsigned)i < (unsigned)0x40734414) || ((int)i < (int)0xc07439b8)) { | |
| } | | t = __cuda_rint (a * CUDART_L2T); | |
| if (a < -CUDART_LG2_X_1075) { | | i = (int)t; | |
| return CUDART_ZERO; | | z = __fma_rn (t, -CUDART_LG2_HI, a); | |
| | | z = __fma_rn (t, -CUDART_LG2_LO, z); | |
| | | fac = 2.0; | |
| | | if (i <= -1021) { | |
| | | i += 55; | |
| | | fac = CUDART_TWO_TO_M54; | |
| | | } | |
| | | /* 10^z = exp(log(10)*z) */ | |
| | | z = __fma_rn (z, CUDART_LNT_HI, z * CUDART_LNT_LO); | |
| | | t = __internal_expm1_kernel(z); | |
| | | z = __internal_exp2i_kernel(i - 1); | |
| | | t = __fma_rn (t, z, z); | |
| | | t = t * fac; | |
| | | return t; | |
| } | | } | |
| t = __cuda_rint (a * CUDART_L2T); | | t = ((unsigned int)i >> 31) ? CUDART_ZERO : CUDART_INF; | |
| i = (int)t; | | if (__cuda___isnan(a)) { | |
| z = __fma_rn (t, -CUDART_LG2_HI, a); | | t = a + a; | |
| z = __fma_rn (t, -CUDART_LG2_LO, z); | | | |
| fac = 2.0; | | | |
| if (i <= -1021) { | | | |
| i += 55; | | | |
| fac = CUDART_TWO_TO_M54; | | | |
| } | | } | |
| /* 10^z = exp(log(10)*z) */ | | | |
| z = __fma_rn (z, CUDART_LNT_HI, z * CUDART_LNT_LO); | | | |
| t = __internal_expm1_kernel(z); | | | |
| /* exp10(a) = 2^i * 10^z */ | | | |
| z = __hiloint2double((1022 + i) << 20, 0); | | | |
| t = __fma_rn (t, z, z); | | | |
| t = t * fac; | | | |
| return t; | | return t; | |
| } | | } | |
| | | | |
| __device_func__(double __cuda_expm1(double a)) | | __device_func__(double __cuda_expm1(double a)) | |
| { | | { | |
| double t, z, u; | | double t, z, u; | |
| int i, j; | | int i, j, k; | |
| if (a > CUDART_LN2_X_1024) { | | | |
| return CUDART_INF; | | k = __double2hiint(a); | |
| } | | if (((unsigned)k < (unsigned)0x40862e43) || ((int)k < (int)0xc04a8000)) { | |
| if (a < -53.0) { | | t = __cuda_rint (a * CUDART_L2E); | |
| return -1.0; | | i = (int)t; | |
| | | z = __fma_rn (t, -CUDART_LN2_HI, a); | |
| | | z = __fma_rn (t, -CUDART_LN2_LO, z); | |
| | | k = k + k; | |
| | | if ((unsigned)k < (unsigned)0x7fb3e647) { | |
| | | z = a; | |
| | | i = 0; | |
| | | } | |
| | | t = __internal_expm1_kernel(z); | |
| | | j = i; | |
| | | if (i == 1024) j--; | |
| | | u = __internal_exp2i_kernel(j); | |
| | | a = u - 1.0; | |
| | | t = __fma_rn (t, u, a); | |
| | | if (i == 1024) t = t + t; | |
| | | if (k == 0) t = z; /* preserve -0 */ | |
| | | return t; | |
| } | | } | |
| t = __cuda_rint (a * CUDART_L2E); | | t = ((unsigned int)k >> 31) ? -CUDART_ONE : CUDART_INF; | |
| i = (int)t; | | if (__cuda___isnan(a)) { | |
| z = __fma_rn (t, -CUDART_LN2_HI, a); | | t = a + a; | |
| z = __fma_rn (t, -CUDART_LN2_LO, z); | | | |
| if (__cuda_fabs(a) < 0.405465108) { | | | |
| z = a; | | | |
| i = 0; | | | |
| } | | } | |
| j = (i == 1024) ? (i - 1) : i; | | | |
| t = __internal_expm1_kernel(z); | | | |
| u = __hiloint2double((1023 + j) << 20, 0); | | | |
| a = u - 1.0; | | | |
| t = __fma_rn (t, u, a); | | | |
| if (z == 0.0) t = z; /* preserve -0 */ | | | |
| if (i == 1024) t = t + t; | | | |
| return t; | | return t; | |
| } | | } | |
| | | | |
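The reconstruction step is the part worth noting: with a = i*log(2) + z, expm1(a) = 2^i*expm1(z) + (2^i - 1), and evaluating it as fma(t, u, u - 1.0) keeps the cancelling +/-1 terms accurate (the i == 1024 shuffle handles the very top of the range, and the k == 0 test preserves -0). A host sketch of the main path, with libm expm1() in the kernel's role:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        long double ln2 = logl(2.0L);
        double ln2_hi = (double)ln2, ln2_lo = (double)(ln2 - ln2_hi);
        double a = 3.5;
        double t = rint(a * 1.4426950408889634);     /* i = rint(a/log 2) */
        int    i = (int)t;
        double z = fma(t, -ln2_hi, a);
        z = fma(t, -ln2_lo, z);
        if (fabs(a) < 0.405465108) { z = a; i = 0; } /* small a: skip reduction */
        double u = ldexp(1.0, i);                    /* u = 2^i */
        double r = fma(expm1(z), u, u - 1.0);        /* 2^i*expm1(z) + (2^i - 1) */
        printf("%.17g vs %.17g\n", r, expm1(a));
        return 0;
    }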
| __device_func__(double __cuda_cosh(double a)) | | __device_func__(double __cuda_cosh(double a)) | |
| { | | { | |
| double z; | | double z; | |
| if (__cuda___isnan(a)) { | | int i; | |
| | | | |
| | | z = __cuda_fabs(a); | |
| | | i = __double2hiint(z); | |
| | | if ((unsigned)i < (unsigned)0x408633cf) { | |
| | | z = __internal_exp_kernel(z, -2); | |
| | | z = __fma_rn(2.0, z, 0.125 / z); | |
| | | return z; | |
| | | } else { | |
| | | if (z > 0.0) a = CUDART_INF_F; | |
| return a + a; | | return a + a; | |
| } | | } | |
| a = __cuda_fabs(a); | | | |
| z = __internal_exp_kernel(a, -2); | | | |
| z = __fma_rn(2.0, z, 0.125 / z); | | | |
| if (a >= CUDART_LN2_X_1025) { | | | |
| z = CUDART_INF_F; /* overflow -> infinity */ | | | |
| } | | | |
| return z; | | | |
| } | | } | |
| | | | |
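The cosh path evaluates z = exp(|a|)/4 (the -2 scale argument folds the division into the exponent inside the kernel, so the intermediate stays finite a little longer) and then 2*z + 0.125/z = (e^|a| + e^-|a|)/2. A direct C rendering of that identity, valid away from the overflow cutoff:

    #include <math.h>
    #include <stdio.h>

    static double cosh_sketch(double a)
    {
        double z = exp(fabs(a)) * 0.25;   /* device code scales inside exp */
        return fma(2.0, z, 0.125 / z);    /* 2z + 1/(8z) = (e^|a| + e^-|a|)/2 */
    }

    int main(void)
    {
        printf("%.17g vs %.17g\n", cosh_sketch(2.0), cosh(2.0));
        return 0;
    }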
| __device_func__(double __cuda_sinh(double a)) | | __device_func__(double __cuda_sinh(double a)) | |
| { | | { | |
| double s, z; | | double s, z; | |
| s = a; | | s = a; | |
| a = __cuda_fabs(a); | | a = __cuda_fabs(a); | |
| if (a < 1.0) { /* danger of catastrophic cancellation */ | | if (a < 1.0) { /* danger of catastrophic cancellation */ | |
| double a2 = a * a; | | double a2 = a * a; | |
| /* approximate sinh(x) on [0,1] with a polynomial */ | | /* approximate sinh(x) on [0,1] with a polynomial */ | |
| | | | |
| skipping to change at line 957 | | skipping to change at line 977 | |
| t = __fma_rn (t, a, a); | | t = __fma_rn (t, a, a); | |
| a = __cuda_copysign(t, a); | | a = __cuda_copysign(t, a); | |
| } | | } | |
| return a; | | return a; | |
| } | | } | |
| | | | |
| __device_func__(double __internal_atan_kernel(double a)) | | __device_func__(double __internal_atan_kernel(double a)) | |
| { | | { | |
| double t, a2; | | double t, a2; | |
| a2 = a * a; | | a2 = a * a; | |
| t = -2.0258553044438358E-005 ; | | t = -2.0258553044438358E-005 ; | |
| t = __fma_rn (t, a2, 2.2302240345758510E-004); | | t = __fma_rn (t, a2, 2.2302240345758510E-004); | |
| t = __fma_rn (t, a2, -1.1640717779930576E-003); | | t = __fma_rn (t, a2, -1.1640717779930576E-003); | |
| t = __fma_rn (t, a2, 3.8559749383629918E-003); | | t = __fma_rn (t, a2, 3.8559749383629918E-003); | |
| t = __fma_rn (t, a2, -9.1845592187165485E-003); | | t = __fma_rn (t, a2, -9.1845592187165485E-003); | |
| t = __fma_rn (t, a2, 1.6978035834597331E-002); | | t = __fma_rn (t, a2, 1.6978035834597331E-002); | |
| t = __fma_rn (t, a2, -2.5826796814495994E-002); | | t = __fma_rn (t, a2, -2.5826796814495994E-002); | |
| t = __fma_rn (t, a2, 3.4067811082715123E-002); | | t = __fma_rn (t, a2, 3.4067811082715123E-002); | |
| t = __fma_rn (t, a2, -4.0926382420509971E-002); | | t = __fma_rn (t, a2, -4.0926382420509971E-002); | |
| t = __fma_rn (t, a2, 4.6739496199157994E-002); | | t = __fma_rn (t, a2, 4.6739496199157994E-002); | |
| t = __fma_rn (t, a2, -5.2392330054601317E-002); | | t = __fma_rn (t, a2, -5.2392330054601317E-002); | |
| | | | |
| skipping to change at line 1049 | | skipping to change at line 1069 | |
| r = __fma_rn (r, b, 3.038188875134962E-002); | | r = __fma_rn (r, b, 3.038188875134962E-002); | |
| r = __fma_rn (r, b, 4.464285849810986E-002); | | r = __fma_rn (r, b, 4.464285849810986E-002); | |
| r = __fma_rn (r, b, 7.499999998342270E-002); | | r = __fma_rn (r, b, 7.499999998342270E-002); | |
| r = __fma_rn (r, b, 1.666666666667375E-001); | | r = __fma_rn (r, b, 1.666666666667375E-001); | |
| r = r * b; | | r = r * b; | |
| return r; | | return r; | |
| } | | } | |
| | | | |
| __device_func__(double __cuda_asin(double a)) | | __device_func__(double __cuda_asin(double a)) | |
| { | | { | |
| double t0, t1; | | double fa, t0, t1; | |
| t0 = __cuda_fabs(a); | | int ihi, ahi; | |
| if (t0 > 1.0) { | | ahi = __double2hiint(a); | |
| return CUDART_NAN; | | fa = __cuda_fabs(a); | |
| } | | ihi = __double2hiint(fa); | |
| if (t0 > 0.575) { | | if (ihi < 0x3fe26666) { | |
| t1 = __fma_rn (-0.5, t0, 0.5); | | t1 = fa * fa; | |
| | | t1 = __internal_asin_kernel (fa, t1); | |
| | | t1 = __fma_rn (t1, fa, fa); | |
| | | t1 = __cuda_copysign(t1, a); | |
| | | } else { | |
| | | t1 = __fma_rn (-0.5, fa, 0.5); | |
| t0 = __cuda_sqrt (t1); | | t0 = __cuda_sqrt (t1); | |
| t1 = __internal_asin_kernel (t0, t1); | | t1 = __internal_asin_kernel (t0, t1); | |
| t0 = -2.0 * t0; | | t0 = -2.0 * t0; | |
| t1 = __fma_rn (t0, t1, CUDART_PIO2_LO); | | t1 = __fma_rn (t0, t1, CUDART_PIO2_LO); | |
| t0 = t0 + CUDART_PIO4_HI; | | t0 = t0 + CUDART_PIO4_HI; | |
| t1 = t0 + t1; | | t1 = t0 + t1; | |
| t1 = t1 + CUDART_PIO4_HI; | | t1 = t1 + CUDART_PIO4_HI; | |
| } else { | | if (ahi < 0x3ff00000) { | |
| t1 = t0 * t0; | | t1 = __cuda_copysign(t1, a); | |
| t1 = __internal_asin_kernel (t0, t1); | | } | |
| t1 = __fma_rn (t1, t0, t0); | | | |
| } | | } | |
| return __cuda_copysign(t1, a); | | return t1; | |
| } | | } | |
| | | | |
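Both branches feed the same polynomial kernel; the 0x3fe26666 cut (about 0.575) picks between the direct series and the reflection asin(x) = pi/2 - 2*asin(sqrt((1-x)/2)), with pi/2 applied as PIO4_HI + PIO4_HI + PIO2_LO to keep the constant accurate. A sketch of the reflection, with libm asin() standing in for __internal_asin_kernel:

    #include <math.h>
    #include <stdio.h>

    static const double pi_2 = 1.5707963267948966; /* pi/2 rounded to double */

    static double asin_large(double x)   /* assumes 0.575 < x <= 1.0 */
    {
        double t = sqrt(fma(-0.5, x, 0.5));  /* sqrt((1-x)/2) */
        return fma(-2.0, asin(t), pi_2);     /* pi/2 - 2*asin(t) */
    }

    int main(void)
    {
        printf("%.17g vs %.17g\n", asin_large(0.9), asin(0.9));
        return 0;
    }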
| __device_func__(double __cuda_acos(double a)) | | __device_func__(double __cuda_acos(double a)) | |
| { | | { | |
| double t0, t1; | | double t0, t1; | |
| | | int ihi, ahi; | |
| | | | |
| #if !defined(__CUDABE__) | | #if !defined(__CUDABE__) | |
| if (__cuda___isnan(a)) { | | if (__cuda___isnan(a)) { | |
| return a + a; | | return a + a; | |
| } | | } | |
| #endif | | #endif | |
| | | ahi = __double2hiint(a); | |
| t0 = __cuda_fabs (a); | | t0 = __cuda_fabs (a); | |
|
| t1 = __fma_rn (-0.5, t0, 0.5); | | if (ihi < 0x3fe26666) { | |
| t0 = __cuda_sqrt(t1); | | | |
| t1 = __internal_asin_kernel (t0, t1); | | | |
| t0 = __fma_rn (t1, t0, t0); | | | |
| t0 = 2.0 * t0; | | | |
| if (__cuda___signbit(a)) { | | | |
| t0 = __fma_rn (1.0, t0, -CUDART_PI_LO); | | | |
| t0 = CUDART_PI_HI - t0; | | | |
| } | | | |
| } else { | | | |
| t1 = t0 * t0; | | t1 = t0 * t0; | |
| t1 = __internal_asin_kernel (t0, t1); | | t1 = __internal_asin_kernel (t0, t1); | |
| t0 = __fma_rn (t1, t0, t0); | | t0 = __fma_rn (t1, t0, t0); | |
| if (__cuda___signbit(a)) { | | if ((unsigned)ahi >= (unsigned)0x80000000) { | |
| t0 = __fma_rn (1.0, t0, +CUDART_PIO2_LO); | | t0 = __fma_rn (1.0, t0, +CUDART_PIO2_LO); | |
| t0 = CUDART_PIO2_HI + t0; | | t0 = CUDART_PIO2_HI + t0; | |
| } else { | | } else { | |
| t0 = __fma_rn (1.0, t0, -CUDART_PIO2_LO); | | t0 = __fma_rn (1.0, t0, -CUDART_PIO2_LO); | |
| t0 = CUDART_PIO2_HI - t0; | | t0 = CUDART_PIO2_HI - t0; | |
| } | | } | |
| | | } else { | |
| | | t1 = __fma_rn (-0.5, t0, 0.5); | |
| | | t0 = __cuda_sqrt(t1); | |
| | | t1 = __internal_asin_kernel (t0, t1); | |
| | | t0 = __fma_rn (t1, t0, t0); | |
| | | t0 = 2.0 * t0; | |
| | | if ((unsigned)ahi >= (unsigned)0x80000000) { | |
| | | t0 = __fma_rn (1.0, t0, -CUDART_PI_LO); | |
| | | t0 = CUDART_PI_HI - t0; | |
| | | } | |
| } | | } | |
| return t0; | | return t0; | |
| } | | } | |
| | | | |
| __device_func__(double __cuda_acosh(double a)) | | __device_func__(double __cuda_acosh(double a)) | |
| { | | { | |
| double t; | | double t; | |
| #if !defined(__CUDABE__) | | #if !defined(__CUDABE__) | |
| if (__cuda___isnan(a)) { | | if (__cuda___isnan(a)) { | |
| return a + a; | | return a + a; | |
| | | | |
| skipping to change at line 1382 | | skipping to change at line 1410 | |
| r = __fma_rn (r, q, 5.22397760611847340E-003); | | r = __fma_rn (r, q, 5.22397760611847340E-003); | |
| r = __fma_rn (r, q, -2.68661706431114690E-002); | | r = __fma_rn (r, q, -2.68661706431114690E-002); | |
| r = __fma_rn (r, q, 1.12837916709441850E-001); | | r = __fma_rn (r, q, 1.12837916709441850E-001); | |
| r = __fma_rn (r, q, -3.76126389031835210E-001); | | r = __fma_rn (r, q, -3.76126389031835210E-001); | |
| r = __fma_rn (r, q, 1.12837916709551260E+000); | | r = __fma_rn (r, q, 1.12837916709551260E+000); | |
| a = r * a; | | a = r * a; | |
| } | | } | |
| return a; | | return a; | |
| } | | } | |
| | | | |
| | | __device_func__(double __cuda_erfinv(double a)) | |
| | | { | |
| | | double fa, t; | |
| | | | |
| | | fa = fabs(a); | |
| | | if (fa >= 1.0) { | |
| | | t = CUDART_NAN; /* NaN */ | |
| | | if (fa == 1.0) { | |
| | | t = a * CUDART_INF; /* Infinity */ | |
| | | } | |
| | | } else if (fa >= 0.9375) { | |
| | | /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev | |
| | | Approximations for the Inverse of the Error Function. Mathematics of | |
| | | Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 59 | |
| | | */ | |
| | | double p, q; | |
| | | | |
| | | t = __cuda_log1p(-fa); | |
| | | t = __cuda_rsqrt(-t); | |
| | | p = 2.7834010353747001060e-3; | |
| | | p = __fma_rn (p, t, 8.6030097526280260580e-1); | |
| | | p = __fma_rn (p, t, 2.1371214997265515515e+0); | |
| | | p = __fma_rn (p, t, 3.1598519601132090206e+0); | |
| | | p = __fma_rn (p, t, 3.5780402569085996758e+0); | |
| | | p = __fma_rn (p, t, 1.5335297523989890804e+0); | |
| | | p = __fma_rn (p, t, 3.4839207139657522572e-1); | |
| | | p = __fma_rn (p, t, 5.3644861147153648366e-2); | |
| | | p = __fma_rn (p, t, 4.3836709877126095665e-3); | |
| | | p = __fma_rn (p, t, 1.3858518113496718808e-4); | |
| | | p = __fma_rn (p, t, 1.1738352509991666680e-6); | |
| | | q = t+ 2.2859981272422905412e+0; | |
| | | q = __fma_rn (q, t, 4.3859045256449554654e+0); | |
| | | q = __fma_rn (q, t, 4.6632960348736635331e+0); | |
| | | q = __fma_rn (q, t, 3.9846608184671757296e+0); | |
| | | q = __fma_rn (q, t, 1.6068377709719017609e+0); | |
| | | q = __fma_rn (q, t, 3.5609087305900265560e-1); | |
| | | q = __fma_rn (q, t, 5.3963550303200816744e-2); | |
| | | q = __fma_rn (q, t, 4.3873424022706935023e-3); | |
| | | q = __fma_rn (q, t, 1.3858762165532246059e-4); | |
| | | q = __fma_rn (q, t, 1.1738313872397777529e-6); | |
| | | t = p / (q * t); | |
| | | if (a < 0.0) t = -t; | |
| | | } else if (fa >= 0.75) { | |
| | | /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev | |
| | | Approximations for the Inverse of the Error Function. Mathematics of | |
| | | Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 39 | |
| | | */ | |
| | | double p, q; | |
| | | | |
| | | t = __fma_rn (a, a, -.87890625); | |
| | | p = .21489185007307062000e+0; | |
| | | p = __fma_rn (p, t, -.64200071507209448655e+1); | |
| | | p = __fma_rn (p, t, .29631331505876308123e+2); | |
| | | p = __fma_rn (p, t, -.47644367129787181803e+2); | |
| | | p = __fma_rn (p, t, .34810057749357500873e+2); | |
| | | p = __fma_rn (p, t, -.12954198980646771502e+2); | |
| | | p = __fma_rn (p, t, .25349389220714893917e+1); | |
| | | p = __fma_rn (p, t, -.24758242362823355486e+0); | |
| | | p = __fma_rn (p, t, .94897362808681080020e-2); | |
| | | q = t -.12831383833953226499e+2; | |
| | | q = __fma_rn (q, t, .41409991778428888716e+2); | |
| | | q = __fma_rn (q, t, -.53715373448862143349e+2); | |
| | | q = __fma_rn (q, t, .33880176779595142685e+2); | |
| | | q = __fma_rn (q, t, -.11315360624238054876e+2); | |
| | | q = __fma_rn (q, t, .20369295047216351160e+1); | |
| | | q = __fma_rn (q, t, -.18611650627372178511e+0); | |
| | | q = __fma_rn (q, t, .67544512778850945940e-2); | |
| | | p = p / q; | |
| | | t = a * p; | |
| | | } else { | |
| | | /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev | |
| | | Approximations for the Inverse of the Error Function. Mathematics of | |
| | | Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 18 | |
| | | */ | |
| | | double p, q; | |
| | | | |
| | | t = __fma_rn (a, a, -.5625); | |
| | | p = -.23886240104308755900e+2; | |
| | | p = __fma_rn (p, t, .45560204272689128170e+3); | |
| | | p = __fma_rn (p, t, -.22977467176607144887e+4); | |
| | | p = __fma_rn (p, t, .46631433533434331287e+4); | |
| | | p = __fma_rn (p, t, -.43799652308386926161e+4); | |
| | | p = __fma_rn (p, t, .19007153590528134753e+4); | |
| | | p = __fma_rn (p, t, -.30786872642313695280e+3); | |
| | | q = t -.83288327901936570000e+2; | |
| | | q = __fma_rn (q, t, .92741319160935318800e+3); | |
| | | q = __fma_rn (q, t, -.35088976383877264098e+4); | |
| | | q = __fma_rn (q, t, .59039348134843665626e+4); | |
| | | q = __fma_rn (q, t, -.48481635430048872102e+4); | |
| | | q = __fma_rn (q, t, .18997769186453057810e+4); | |
| | | q = __fma_rn (q, t, -.28386514725366621129e+3); | |
| | | p = p / q; | |
| | | t = a * p; | |
| | | } | |
| | | return t; | |
| | | } | |
| | | | |
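The new __cuda_erfinv covers (-1, 1) with three rational minimax approximations from Blair, Edwards and Johnson (Tables 18, 39 and 59), switching at 0.75 and 0.9375 and using t = rsqrt(-log1p(-|a|)) as the variable in the tail region. As an independent cross-check only (not the device method), one can invert libm's erf with Newton's method, using d/dx erf(x) = 2/sqrt(pi)*exp(-x*x):

    #include <math.h>
    #include <stdio.h>

    static double erfinv_newton(double y)     /* assumes -1 < y < 1 */
    {
        double x = 0.0;
        for (int k = 0; k < 50; k++) {
            double err  = erf(x) - y;
            double step = err / (1.1283791670955126 * exp(-x * x));
            x -= step;
            if (fabs(step) < 1e-15 * (fabs(x) + 1e-300)) break;
        }
        return x;
    }

    int main(void)
    {
        double y = 0.5;
        double x = erfinv_newton(y);
        printf("erfinv(%.3f) ~= %.17g, erf back = %.17g\n", y, x, erf(x));
        return 0;
    }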
| | | __device_func__(double __cuda_erfcinv(double a)) | |
| | | { | |
| | | double t; | |
| | | #if !defined(__CUDABE__) | |
| | | if (__cuda___isnan(a)) return a + a; | |
| | | #endif | |
| | | if (a <= CUDART_ZERO) { | |
| | | t = CUDART_NAN; | |
| | | if (a == CUDART_ZERO) { | |
| | | t = (1.0 - a) * CUDART_INF; | |
| | | } | |
| | | } | |
| | | else if (a >= 0.0625) { | |
| | | t = __cuda_erfinv (1.0 - a); | |
| | | } | |
| | | else if (a >= 1e-100) { | |
| | | /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev | |
| | | Approximations for the Inverse of the Error Function. Mathematics of | |
| | | Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 59 | |
| | | */ | |
| | | double p, q; | |
| | | t = __cuda_log(a); | |
| | | t = __cuda_rsqrt(-t); | |
| | | p = 2.7834010353747001060e-3; | |
| | | p = __fma_rn (p, t, 8.6030097526280260580e-1); | |
| | | p = __fma_rn (p, t, 2.1371214997265515515e+0); | |
| | | p = __fma_rn (p, t, 3.1598519601132090206e+0); | |
| | | p = __fma_rn (p, t, 3.5780402569085996758e+0); | |
| | | p = __fma_rn (p, t, 1.5335297523989890804e+0); | |
| | | p = __fma_rn (p, t, 3.4839207139657522572e-1); | |
| | | p = __fma_rn (p, t, 5.3644861147153648366e-2); | |
| | | p = __fma_rn (p, t, 4.3836709877126095665e-3); | |
| | | p = __fma_rn (p, t, 1.3858518113496718808e-4); | |
| | | p = __fma_rn (p, t, 1.1738352509991666680e-6); | |
| | | q = t+ 2.2859981272422905412e+0; | |
| | | q = __fma_rn (q, t, 4.3859045256449554654e+0); | |
| | | q = __fma_rn (q, t, 4.6632960348736635331e+0); | |
| | | q = __fma_rn (q, t, 3.9846608184671757296e+0); | |
| | | q = __fma_rn (q, t, 1.6068377709719017609e+0); | |
| | | q = __fma_rn (q, t, 3.5609087305900265560e-1); | |
| | | q = __fma_rn (q, t, 5.3963550303200816744e-2); | |
| | | q = __fma_rn (q, t, 4.3873424022706935023e-3); | |
| | | q = __fma_rn (q, t, 1.3858762165532246059e-4); | |
| | | q = __fma_rn (q, t, 1.1738313872397777529e-6); | |
| | | t = p / (q * t); | |
| | | } | |
| | | else { | |
| | | /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev | |
| | | Approximations for the Inverse of the Error Function. Mathematics of | |
| | | Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 82 | |
| | | */ | |
| | | double p, q; | |
| | | t = __cuda_log(a); | |
| | | t = __cuda_rsqrt(-t); | |
| | | p = 6.9952990607058154858e-1; | |
| | | p = __fma_rn (p, t, 1.9507620287580568829e+0); | |
| | | p = __fma_rn (p, t, 8.2810030904462690216e-1); | |
| | | p = __fma_rn (p, t, 1.1279046353630280005e-1); | |
| | | p = __fma_rn (p, t, 6.0537914739162189689e-3); | |
| | | p = __fma_rn (p, t, 1.3714329569665128933e-4); | |
| | | p = __fma_rn (p, t, 1.2964481560643197452e-6); | |
| | | p = __fma_rn (p, t, 4.6156006321345332510e-9); | |
| | | p = __fma_rn (p, t, 4.5344689563209398450e-12); | |
| | | q = t+ 1.5771922386662040546e+0; | |
| | | q = __fma_rn (q, t, 2.1238242087454993542e+0); | |
| | | q = __fma_rn (q, t, 8.4001814918178042919e-1); | |
| | | q = __fma_rn (q, t, 1.1311889334355782065e-1); | |
| | | q = __fma_rn (q, t, 6.0574830550097140404e-3); | |
| | | q = __fma_rn (q, t, 1.3715891988350205065e-4); | |
| | | q = __fma_rn (q, t, 1.2964671850944981713e-6); | |
| | | q = __fma_rn (q, t, 4.6156017600933592558e-9); | |
| | | q = __fma_rn (q, t, 4.5344687377088206783e-12); | |
| | | t = p / (q * t); | |
| | | } | |
| | | return t; | |
| | | } | |
| | | | |
| __device_func__(double __cuda_erfc(double a)) | | __device_func__(double __cuda_erfc(double a)) | |
| { | | { | |
| double p, q, h, l; | | double p, q, h, l; | |
| if (__cuda___isnan(a)) { | | int ahi; | |
| return a + a; | | | |
| } | | ahi = __double2hiint(a); | |
| if (a <= 0.55) { | | if (ahi < (int)0x3fe80000) { | |
| return 1.0 - __cuda_erf(a); | | return 1.0 - __cuda_erf(a); | |
| } | | } | |
| if (a > 27.3) { | | if (a > 27.3) { | |
| return 0.0; | | return 0.0; | |
| } | | } | |
| if (a <= 5.0) { | | if (ahi < (int)0x40140000) { | |
| p = 5.64189549785304440E-001; | | p = 5.64189549785304440E-001; | |
| p = __fma_rn (p, a, 8.17405083437083490E+000); | | p = __fma_rn (p, a, 8.17405083437083490E+000); | |
| p = __fma_rn (p, a, 5.68958722557864720E+001); | | p = __fma_rn (p, a, 5.68958722557864720E+001); | |
| p = __fma_rn (p, a, 2.42568747802647010E+002); | | p = __fma_rn (p, a, 2.42568747802647010E+002); | |
| p = __fma_rn (p, a, 6.80381374390412930E+002); | | p = __fma_rn (p, a, 6.80381374390412930E+002); | |
| p = __fma_rn (p, a, 1.25873132236024590E+003); | | p = __fma_rn (p, a, 1.25873132236024590E+003); | |
| p = __fma_rn (p, a, 1.43925353963809330E+003); | | p = __fma_rn (p, a, 1.43925353963809330E+003); | |
| p = __fma_rn (p, a, 8.15949420587659230E+002); | | p = __fma_rn (p, a, 8.15949420587659230E+002); | |
| q = a+ 1.44881247113239940E+001; | | q = a+ 1.44881247113239940E+001; | |
| q = __fma_rn (q, a, 1.01345387970210510E+002); | | q = __fma_rn (q, a, 1.01345387970210510E+002); | |
| | | | |
| skipping to change at line 1426 | | skipping to change at line 1628 | |
| p = __fma_rn (p, a, 1.22570382896313600E+001); | | p = __fma_rn (p, a, 1.22570382896313600E+001); | |
| p = __fma_rn (p, a, 6.01884641114116460E+000); | | p = __fma_rn (p, a, 6.01884641114116460E+000); | |
| q = a+ 3.62871917534986780E+000; | | q = a+ 3.62871917534986780E+000; | |
| q = __fma_rn (q, a, 1.24663395327043550E+001); | | q = __fma_rn (q, a, 1.24663395327043550E+001); | |
| q = __fma_rn (q, a, 2.13927672803974790E+001); | | q = __fma_rn (q, a, 2.13927672803974790E+001); | |
| q = __fma_rn (q, a, 2.72082423532866070E+001); | | q = __fma_rn (q, a, 2.72082423532866070E+001); | |
| q = __fma_rn (q, a, 1.86422906830006700E+001); | | q = __fma_rn (q, a, 1.86422906830006700E+001); | |
| q = __fma_rn (q, a, 6.13809834548870550E+000); | | q = __fma_rn (q, a, 6.13809834548870550E+000); | |
| } | | } | |
| p = p / q; | | p = p / q; | |
| h = -a * a; | | h = a * a; | |
| l = __fma_rn (-a, a, -h); | | l = __fma_rn (a, a, -h); | |
| q = __internal_exp_kernel(h, 0); | | q = __internal_exp_kernel(-h, 0); | |
| q = __fma_rn (q, l, q); | | q = __fma_rn (l, -q, q); | |
| p = p * q; | | p = p * q; | |
| return p; | | return p; | |
| } | | } | |
| | | | |
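The tail of __cuda_erfc shows a compensated exp(-a*a): h = a*a discards low-order bits of the square, l = fma(a, a, -h) recovers them exactly, and exp(-a*a) = exp(-h)*exp(-l) is approximated by exp(-h)*(1 - l) with one FMA; the sign rearrangement in the diff applies the same correction either way. A host sketch:

    #include <math.h>
    #include <stdio.h>

    static double exp_neg_sq(double a)
    {
        double h = a * a;
        double l = fma(a, a, -h);     /* exact residual of the squaring */
        double q = exp(-h);
        return fma(l, -q, q);         /* q * (1 - l) */
    }

    int main(void)
    {
        double a = 5.25;
        printf("%.17g vs naive %.17g\n", exp_neg_sq(a), exp(-a * a));
        return 0;
    }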
| /* approximate 1.0/(a*gamma(a)) on [-0.5,0.5] */ | | /* approximate 1.0/(a*gamma(a)) on [-0.5,0.5] */ | |
| __device_func__(double __internal_tgamma_kernel(double a)) | | __device_func__(double __internal_tgamma_kernel(double a)) | |
| { | | { | |
| double t; | | double t; | |
| t = -4.42689340712524750E-010; | | t = -4.42689340712524750E-010; | |
| t = __fma_rn (t, a, -2.02665918466589540E-007); | | t = __fma_rn (t, a, -2.02665918466589540E-007); | |
| | | | |
End of changes. 47 change blocks. 225 lines changed or deleted, 428 lines changed or added.