cuda: headers diff between 2.0 and 2.1 versions

	__cudaFatFormat.h	__cudaFatFormat.h
	/*	/*

	* Copyright 1993-2007 NVIDIA Corporation. All rights reserved.	* Copyright 1993-2008 NVIDIA Corporation. All rights reserved.
	*	*
	* NOTICE TO USER:	* NOTICE TO USER:
	*	*
	* This source code is subject to NVIDIA ownership rights under U.S. and	* This source code is subject to NVIDIA ownership rights under U.S. and
	* international Copyright laws. Users and possessors of this source code	* international Copyright laws. Users and possessors of this source code
	* are hereby granted a nonexclusive, royalty-free license to use this code	* are hereby granted a nonexclusive, royalty-free license to use this code
	* in individual and commercial software.	* in individual and commercial software.
	*	*
	* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE	* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
	* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR	* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR

	skipping to change at line 133	skipping to change at line 133
	char* debug;	char* debug;
	} __cudaFatDebugEntry;	} __cudaFatDebugEntry;

	typedef enum {	typedef enum {
	__cudaFatDontSearchFlag = (1 << 0),	__cudaFatDontSearchFlag = (1 << 0),
	__cudaFatDontCacheFlag = (1 << 1),	__cudaFatDontCacheFlag = (1 << 1),
	__cudaFatSassDebugFlag = (1 << 2)	__cudaFatSassDebugFlag = (1 << 2)
	} __cudaFatCudaBinaryFlag;	} __cudaFatCudaBinaryFlag;

	/*	/*

		* Imported/exported symbol descriptor, needed for
		* __cudaFat binary linking. Not much information is needed,
		* because this is only an index: full symbol information
		* is contained by the binaries.
		*/
		typedef struct {
		char* name;
		} __cudaFatSymbol;

		/*
	* Fat binary container.	* Fat binary container.
	* A mix of ptx intermediate programs and cubins,	* A mix of ptx intermediate programs and cubins,
	* plus a global identifier that can be used for	* plus a global identifier that can be used for
	* further lookup in a translation cache or a resource	* further lookup in a translation cache or a resource
	* file. This key is a checksum over the device text.	* file. This key is a checksum over the device text.
	* The ptx and cubin array are each terminated with	* The ptx and cubin array are each terminated with

	* entries that have Null components.	* entries that have NULL components.
	*/	*/

	typedef struct {	typedef struct __cudaFatCudaBinaryRec {
	unsigned long magic;	unsigned long magic;
	unsigned long version;	unsigned long version;
	unsigned long gpuInfoVersion;	unsigned long gpuInfoVersion;
	char* key;	char* key;
	char* ident;	char* ident;
	char* usageMode;	char* usageMode;
	__cudaFatPtxEntry *ptx;	__cudaFatPtxEntry *ptx;
	__cudaFatCubinEntry *cubin;	__cudaFatCubinEntry *cubin;
	__cudaFatDebugEntry *debug;	__cudaFatDebugEntry *debug;
	void* debugInfo;	void* debugInfo;
	unsigned int flags;	unsigned int flags;
		__cudaFatSymbol *exported;
		__cudaFatSymbol *imported;
		struct __cudaFatCudaBinaryRec *dependends;
	} __cudaFatCudaBinary;	} __cudaFatCudaBinary;

	/*	/*
	* Current version and magic numbers:	* Current version and magic numbers:
	*/	*/

	#define __cudaFatVERSION 0x00000002	#define __cudaFatVERSION 0x00000003
	#define __cudaFatMAGIC 0x1ee55a01	#define __cudaFatMAGIC 0x1ee55a01

	/*	/*
	* Version history log:	* Version history log:
	* 1 : __cudaFatDebugEntry field added to __cudaFatCudaBinary struct	* 1 : __cudaFatDebugEntry field added to __cudaFatCudaBinary struct
	* 2 : flags and debugInfo field added.	* 2 : flags and debugInfo field added.

		* 3 : import/export symbol list
	*/	*/

	/--------------------------------- Functions ----------------------------- ---/	/--------------------------------- Functions ----------------------------- ---/


		typedef enum {
		__cudaFatAvoidPTX,
		__cudaFatPreferBestCode
		} __cudaFatCompilationPolicy;

	/*	/*
	* Function : Select a load image from the __cudaFat binary	* Function : Select a load image from the __cudaFat binary
	* that will run on the specified GPU.	* that will run on the specified GPU.
	* Parameters : binary (I) Fat binary	* Parameters : binary (I) Fat binary

		* policy (I) Parameter influencing the selection proces
		s in case no
		* fully matching cubin can be found, but ins
		tead a choice can
		* be made between ptx compilation or selecti
		on of a
		* cubin for a less capable GPU.
	* gpuName (I) Name of target GPU	* gpuName (I) Name of target GPU

	* cubin (O) Returned cubin text string, or Null when	* cubin (O) Returned cubin text string, or NULL when
	* no matching cubin for the specified gpu	* no matching cubin for the specified gpu
	* could be found.	* could be found.

	* dbgInfo (O) If this parameter is not Null upon entry, then	* dbgInfo (O) If this parameter is not NULL upon entry, then
	* the name of a file containing debug inform ation	* the name of a file containing debug inform ation

	* on the returned cubin will be returned, or Null	* on the returned cubin will be returned, or NULL
	* will be returned when cubin or such debug info	* will be returned when cubin or such debug info
	* cannot be found.	* cannot be found.
	*/	*/

	void fatGetCubinForGpu( __cudaFatCudaBinary binary, char gpuName, char* *	void fatGetCubinForGpuWithPolicy( __cudaFatCudaBinary *binary, __cudaFatCom
	cubin, char* *dbgInfoFile );	pilationPolicy policy, char* gpuName, char* cubin, char *dbgInfoFile );

		#define fatGetCubinForGpu(binary,gpuName,cubin,dbgInfoFile) \
		fatGetCubinForGpuWithPolicy(binary,__cudaFatAvoidPTX,gpuName,cubi
		n,dbgInfoFile)

	/*	/*
	* Function : Free information previously obtained via function fatG etCubinForGpu.	* Function : Free information previously obtained via function fatG etCubinForGpu.
	* Parameters : cubin (I) Cubin text string to free	* Parameters : cubin (I) Cubin text string to free

	* dbgInfo (I) Debug info filename to free, or Null	* dbgInfo (I) Debug info filename to free, or NULL
	*/	*/
	void fatFreeCubin( char* cubin, char* dbgInfoFile );	void fatFreeCubin( char* cubin, char* dbgInfoFile );

	#ifdef __cplusplus	#ifdef __cplusplus
	}	}
	#endif	#endif

	#endif	#endif

End of changes. 13 change blocks.
	21 lines changed or deleted	51 lines changed or added

	channel_descriptor.h	channel_descriptor.h

	skipping to change at line 39	skipping to change at line 39
	* source code with only those rights set forth herein.	* source code with only those rights set forth herein.
	*	*
	* Any use of this source code in individual and commercial software must	* Any use of this source code in individual and commercial software must
	* include, in the user documentation and internal comments to the code,	* include, in the user documentation and internal comments to the code,
	* the above Disclaimer and U.S. Government End Users Notice.	* the above Disclaimer and U.S. Government End Users Notice.
	*/	*/

	#if !defined(__CHANNEL_DESCRIPTOR_H__)	#if !defined(__CHANNEL_DESCRIPTOR_H__)
	#define __CHANNEL_DESCRIPTOR_H__	#define __CHANNEL_DESCRIPTOR_H__


		#if defined(__cplusplus)

	/************************************************************************ ***	/************************************************************************ ***
	* *	* *
	* *	* *
	* *	* *
	************************************************************************* **/	************************************************************************* **/


	#include "builtin_types.h"	#include "driver_types.h"
	#include "cuda_runtime_api.h"	#include "cuda_runtime_api.h"
	#include "host_defines.h"	#include "host_defines.h"

		#include "vector_types.h"

	/************************************************************************ ***	/************************************************************************ ***
	* *	* *
	* *	* *
	* *	* *
	************************************************************************* **/	************************************************************************* **/


	#if defined(__cplusplus)

	template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChann elDesc(void)	template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChann elDesc(void)
	{	{

	return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindUnsigned);	return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
	}	}

	template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc< char>(void)	template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc< char>(void)
	{	{
	int e = (int)sizeof(char) * 8;	int e = (int)sizeof(char) * 8;

	#if __SIGNED_CHARS__	#if __SIGNED_CHARS__
	return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);	return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
	#else	#else
	return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);	return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);

End of changes. 5 change blocks.
	4 lines changed or deleted	5 lines changed or added

	cuComplex.h	cuComplex.h

	skipping to change at line 245	skipping to change at line 245
	}	}

	/*	/*
	* We would like to call hypotf(), but it's not available on all platforms.	* We would like to call hypotf(), but it's not available on all platforms.
	* This discrete implementation guards against intermediate underflow and	* This discrete implementation guards against intermediate underflow and
	* overflow by scaling. Otherwise we would lose half the exponent range.	* overflow by scaling. Otherwise we would lose half the exponent range.
	* There are various ways of doing guarded computation. For now chose the	* There are various ways of doing guarded computation. For now chose the
	* simplest and fastest solution, however this may suffer from inaccuracies	* simplest and fastest solution, however this may suffer from inaccuracies
	* if sqrt and division are not IEEE compliant.	* if sqrt and division are not IEEE compliant.
	*/	*/

	__host__ static __inline__ float cuCabsf (cuFloatComplex x)	__host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x)
	{	{
	float a = cuCrealf(x);	float a = cuCrealf(x);
	float b = cuCimagf(x);	float b = cuCimagf(x);
	float v, w, t;	float v, w, t;
	a = (float)fabs(a);	a = (float)fabs(a);
	b = (float)fabs(b);	b = (float)fabs(b);
	if (a > b) {	if (a > b) {
	v = a;	v = a;
	w = b;	w = b;
	} else {	} else {

End of changes. 1 change blocks.
	1 lines changed or deleted	1 lines changed or added

	cublas.h	cublas.h

	skipping to change at line 78	skipping to change at line 78
	* Currently, only a subset of the BLAS core functions is implemented.	* Currently, only a subset of the BLAS core functions is implemented.
	*	*
	* The interface to the CUBLAS library is the header file cublas.h.	* The interface to the CUBLAS library is the header file cublas.h.
	* Applications using CUBLAS need to link against the DSO cublas.so	* Applications using CUBLAS need to link against the DSO cublas.so
	* (Linux) or the DLL cublas.dll (Win32).	* (Linux) or the DLL cublas.dll (Win32).
	*/	*/

	#if !defined(CUBLAS_H_)	#if !defined(CUBLAS_H_)
	#define CUBLAS_H_	#define CUBLAS_H_


		#ifdef __MULTI_CORE__
		#error CUBLAS not supported on multicore
		#endif

	#ifndef CUBLASAPI	#ifndef CUBLASAPI
	#ifdef _WIN32	#ifdef _WIN32
	#define CUBLASAPI __stdcall	#define CUBLASAPI __stdcall
	#else	#else
	#define CUBLASAPI	#define CUBLASAPI
	#endif	#endif
	#endif	#endif

	#if defined(__cplusplus)	#if defined(__cplusplus)
	extern "C" {	extern "C" {

End of changes. 1 change blocks.
	0 lines changed or deleted	4 lines changed or added

	cuda.h	cuda.h

	skipping to change at line 46	skipping to change at line 46
	// ------------------------------------------------------------------------ ----	// ------------------------------------------------------------------------ ----
	//	//
	// Main public header file for the CompUte Device Api	// Main public header file for the CompUte Device Api
	//	//
	// ------------------------------------------------------------------------ ----	// ------------------------------------------------------------------------ ----

	#ifndef __cuda_cuda_h__	#ifndef __cuda_cuda_h__
	#define __cuda_cuda_h__	#define __cuda_cuda_h__

	/* CUDA API version number */	/* CUDA API version number */

	#define CUDA_VERSION 2000 /* 2.0 */	#define CUDA_VERSION 2010 /* 2.1 */

	#ifdef __cplusplus	#ifdef __cplusplus
	extern "C" {	extern "C" {
	#endif	#endif
	typedef unsigned int CUdeviceptr;	typedef unsigned int CUdeviceptr;

	typedef int CUdevice;	typedef int CUdevice;
	typedef struct CUctx_st *CUcontext;	typedef struct CUctx_st *CUcontext;
	typedef struct CUmod_st *CUmodule;	typedef struct CUmod_st *CUmodule;
	typedef struct CUfunc_st *CUfunction;	typedef struct CUfunc_st *CUfunction;

	skipping to change at line 132	skipping to change at line 132
	CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, // Deprecated, us e CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK	CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, // Deprecated, us e CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK
	CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9,	CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9,
	CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,	CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,
	CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11,	CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11,
	CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12,	CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12,
	CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, // Deprecated, us e CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK	CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, // Deprecated, us e CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK
	CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,	CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,
	CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14,	CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14,

	CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,	CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,

	CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16	CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16,
		CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17
	} CUdevice_attribute;	} CUdevice_attribute;

	//	//
	// Legacy device properties	// Legacy device properties
	//	//
	typedef struct CUdevprop_st {	typedef struct CUdevprop_st {
	int maxThreadsPerBlock;	int maxThreadsPerBlock;
	int maxThreadsDim[3];	int maxThreadsDim[3];
	int maxGridSize[3];	int maxGridSize[3];
	int sharedMemPerBlock;	int sharedMemPerBlock;

	skipping to change at line 160	skipping to change at line 161

	//	//
	// Memory types	// Memory types
	//	//
	typedef enum CUmemorytype_enum {	typedef enum CUmemorytype_enum {
	CU_MEMORYTYPE_HOST = 0x01,	CU_MEMORYTYPE_HOST = 0x01,
	CU_MEMORYTYPE_DEVICE = 0x02,	CU_MEMORYTYPE_DEVICE = 0x02,
	CU_MEMORYTYPE_ARRAY = 0x03	CU_MEMORYTYPE_ARRAY = 0x03
	} CUmemorytype;	} CUmemorytype;


		//
		// Online compiler options
		//
		typedef enum CUjit_option_enum
		{
		// CU_JIT_MAX_REGISTERS - Max number of registers that a thread may use
		.
		CU_JIT_MAX_REGISTERS = 0,

		// CU_JIT_THREADS_PER_BLOCK -
		// IN: Specifies minimum number of threads per block to target compilat
		ion for
		// OUT: Returns the number of threads the compiler actually targeted.
		This
		// restricts the resource utilization fo the compiler (e.g. max registe
		rs) such
		// that a block with the given number of threads should be able to laun
		ch based
		// on register limitations. Note, this option does not currently take
		into
		// account any other resource limitations, such as shared memory utiliz
		ation.
		CU_JIT_THREADS_PER_BLOCK,

		// CU_JIT_WALL_TIME - returns a float value in the option of the wall c
		lock
		// time, in milliseconds, spent creating the cubin
		CU_JIT_WALL_TIME,

		// CU_JIT_INFO_LUG_BUFFER - pointer to a buffer in which to print any l
		og
		// messsages from PTXAS that are informational in nature
		CU_JIT_INFO_LOG_BUFFER,

		// CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES -
		// IN: Log buffer size in bytes. Log messages will be capped at this s
		ize
		// (including null terminator)
		// OUT: Amount of log buffer filled with messages
		CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,

		// CU_JIT_ERROR_LOG_BUFFER - pointer to a buffer in which to print any
		log
		// messages from PTXAS that reflect errors
		CU_JIT_ERROR_LOG_BUFFER,

		// CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES -
		// IN: Log buffer size in bytes. Log messages will be capped at this s
		ize
		// (including null terminator)
		// OUT: Amount of log buffer filled with messages
		CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,

		// CU_JIT_OPTIMIZATION_LEVEL - level of optimizations to apply to gener
		ated
		// code (0 - 4), with 4 being the default and highest level of optimiza
		tions.
		CU_JIT_OPTIMIZATION_LEVEL,

		// CU_JIT_TARGET_FROM_CU_CONTEXT - no option value required. Determine
		s
		// the target based on the current attached context (default)
		CU_JIT_TARGET_FROM_CUCONTEXT,

		// CU_JIT_TARGET - target is chosen based on supplied CUjit_target_enum
		.
		CU_JIT_TARGET,

		// CU_JIT_FALLBACK_STRATEGY - specifies choice of fallback strategy if
		// matching cubin is not found. Choice is based on supplied
		// CUjit_fallback_enum.
		CU_JIT_FALLBACK_STRATEGY

		} CUjit_option;

		//
		// Online compilation targets
		//
		typedef enum CUjit_target_enum
		{
		CU_TARGET_COMPUTE_10 = 0,
		CU_TARGET_COMPUTE_11,
		CU_TARGET_COMPUTE_12,
		CU_TARGET_COMPUTE_13
		} CUjit_target;

		//
		// Cubin matching fallback strategies
		//
		typedef enum CUjit_fallback_enum
		{
		// prefer to compile ptx
		CU_PREFER_PTX = 0,

		// prefer to fall back to compatible binary code
		CU_PREFER_BINARY

		} CUjit_fallback;

	/************************************	/************************************
	**	**
	** Error codes	** Error codes
	**	**
	***********************************/	***********************************/

	typedef enum cudaError_enum {	typedef enum cudaError_enum {

	CUDA_SUCCESS = 0,	CUDA_SUCCESS = 0,
	CUDA_ERROR_INVALID_VALUE = 1,	CUDA_ERROR_INVALID_VALUE = 1,

	skipping to change at line 253	skipping to change at line 337
	CUresult CUDAAPI cuCtxSynchronize(void);	CUresult CUDAAPI cuCtxSynchronize(void);

	/************************************	/************************************
	**	**
	** Module management	** Module management
	**	**
	***********************************/	***********************************/

	CUresult CUDAAPI cuModuleLoad(CUmodule module, const char fname);	CUresult CUDAAPI cuModuleLoad(CUmodule module, const char fname);
	CUresult CUDAAPI cuModuleLoadData(CUmodule module, const void image) ;	CUresult CUDAAPI cuModuleLoadData(CUmodule module, const void image) ;

		CUresult CUDAAPI cuModuleLoadDataEx(CUmodule module, const void imag e, unsigned int numOptions, CUjit_option options, void *optionValues);
	CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule module, const void f atCubin);	CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule module, const void f atCubin);
	CUresult CUDAAPI cuModuleUnload(CUmodule hmod);	CUresult CUDAAPI cuModuleUnload(CUmodule hmod);
	CUresult CUDAAPI cuModuleGetFunction(CUfunction hfunc, CUmodule hmod, const char name);	CUresult CUDAAPI cuModuleGetFunction(CUfunction hfunc, CUmodule hmod, const char name);
	CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr dptr, unsigned int by tes, CUmodule hmod, const char *name);	CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr dptr, unsigned int by tes, CUmodule hmod, const char *name);
	CUresult CUDAAPI cuModuleGetTexRef(CUtexref pTexRef, CUmodule hmod, c onst char name);	CUresult CUDAAPI cuModuleGetTexRef(CUtexref pTexRef, CUmodule hmod, c onst char name);

	/************************************	/************************************
	**	**
	** Memory management	** Memory management
	**	**

End of changes. 4 change blocks.
	2 lines changed or deleted	103 lines changed or added

	cuda_runtime.h	cuda_runtime.h

	skipping to change at line 63	skipping to change at line 63
	#include "builtin_types.h"	#include "builtin_types.h"
	#include "channel_descriptor.h"	#include "channel_descriptor.h"
	#include "cuda_runtime_api.h"	#include "cuda_runtime_api.h"
	#include "driver_functions.h"	#include "driver_functions.h"
	#include "host_defines.h"	#include "host_defines.h"
	#include "vector_functions.h"	#include "vector_functions.h"

	#if defined(__CUDACC__)	#if defined(__CUDACC__)

	#include "common_functions.h"	#include "common_functions.h"

		#include "cuda_texture_types.h"
	#include "device_functions.h"	#include "device_functions.h"
	#include "device_launch_parameters.h"	#include "device_launch_parameters.h"

	#endif /* __CUDACC__ */	#endif /* __CUDACC__ */

	#if defined(__cplusplus)	#if defined(__cplusplus)

	/************************************************************************ ***	/************************************************************************ ***
	* *	* *
	* *	* *

End of changes. 1 change blocks.
	0 lines changed or deleted	1 lines changed or added

	cuda_runtime_api.h	cuda_runtime_api.h

	skipping to change at line 41	skipping to change at line 41
	* Any use of this source code in individual and commercial software must	* Any use of this source code in individual and commercial software must
	* include, in the user documentation and internal comments to the code,	* include, in the user documentation and internal comments to the code,
	* the above Disclaimer and U.S. Government End Users Notice.	* the above Disclaimer and U.S. Government End Users Notice.
	*/	*/

	#if !defined(__CUDA_RUNTIME_API_H__)	#if !defined(__CUDA_RUNTIME_API_H__)
	#define __CUDA_RUNTIME_API_H__	#define __CUDA_RUNTIME_API_H__

	/************************************************************************ ***	/************************************************************************ ***
	* *	* *

	* CUDA runtime API version number 2.0 *	* CUDA runtime API version number 2.1 *
	* *	* *
	************************************************************************* **/	************************************************************************* **/

	#define CUDART_VERSION \	#define CUDART_VERSION \

	2000	2010

	/************************************************************************ ***	/************************************************************************ ***
	* *	* *
	* *	* *
	* *	* *
	************************************************************************* **/	************************************************************************* **/

	#include "host_defines.h"	#include "host_defines.h"
	#include "builtin_types.h"	#include "builtin_types.h"


End of changes. 2 change blocks.
	2 lines changed or deleted	2 lines changed or added

	cufft.h	cufft.h
	/*	/*

	* Copyright 1993-2007 NVIDIA Corporation. All rights reserved.	* Copyright 1993-2008 NVIDIA Corporation. All rights reserved.
	*	*
	* NOTICE TO USER:	* NOTICE TO USER:
	*	*
	* This source code is subject to NVIDIA ownership rights under U.S. and	* This source code is subject to NVIDIA ownership rights under U.S. and
	* international Copyright laws. Users and possessors of this source code	* international Copyright laws. Users and possessors of this source code
	* are hereby granted a nonexclusive, royalty-free license to use this code	* are hereby granted a nonexclusive, royalty-free license to use this code
	* in individual and commercial software.	* in individual and commercial software.
	*	*
	* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE	* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
	* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR	* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR

	skipping to change at line 45	skipping to change at line 45

	/*	/*
	* cufft.h	* cufft.h
	* Public header file for the NVIDIA Cuda FFT library (CUFFT)	* Public header file for the NVIDIA Cuda FFT library (CUFFT)
	*/	*/

	#ifndef _CUFFT_H_	#ifndef _CUFFT_H_
	#define _CUFFT_H_	#define _CUFFT_H_

	#include <stdio.h>	#include <stdio.h>

	#include <cuComplex.h>	#include "cuComplex.h"

		#ifdef __MULTI_CORE__
		#error CUFFT not supported on multicore
		#endif

	#ifndef CUFFTAPI	#ifndef CUFFTAPI
	#ifdef _WIN32	#ifdef _WIN32
	#define CUFFTAPI __stdcall	#define CUFFTAPI __stdcall
	#else	#else
	#define CUFFTAPI	#define CUFFTAPI
	#endif	#endif
	#endif	#endif

	#ifdef __cplusplus	#ifdef __cplusplus

End of changes. 2 change blocks.
	2 lines changed or deleted	6 lines changed or added

	device_functions.h	device_functions.h

	skipping to change at line 505	skipping to change at line 505

	__device_func__(int __internal_float2int(float a, enum cudaRoundMode rndMod e))	__device_func__(int __internal_float2int(float a, enum cudaRoundMode rndMod e))
	{	{
	return (int)__internal_float2ll_kernel(a, 2147483647LL, -2147483648LL, 0L L, rndMode);	return (int)__internal_float2ll_kernel(a, 2147483647LL, -2147483648LL, 0L L, rndMode);
	}	}

	__device_func__(int __float2int_rz(float a))	__device_func__(int __float2int_rz(float a))
	{	{
	#if defined(__MULTI_CORE__)	#if defined(__MULTI_CORE__)
	return (int)a;	return (int)a;

	#else /* __MULTI_CORE__) */	#else /* __MULTI_CORE__ */
	return __internal_float2int(a, cudaRoundZero);	return __internal_float2int(a, cudaRoundZero);

	#endif /* __MULTI_CORE__) */	#endif /* __MULTI_CORE__ */
	}	}

	__device_func__(int __float2int_ru(float a))	__device_func__(int __float2int_ru(float a))
	{	{
	return __internal_float2int(a, cudaRoundPosInf);	return __internal_float2int(a, cudaRoundPosInf);
	}	}

	__device_func__(int __float2int_rd(float a))	__device_func__(int __float2int_rd(float a))
	{	{
	return __internal_float2int(a, cudaRoundMinInf);	return __internal_float2int(a, cudaRoundMinInf);

	skipping to change at line 534	skipping to change at line 534

	__device_func__(long long int __internal_float2ll(float a, enum cudaRoundMo de rndMode))	__device_func__(long long int __internal_float2ll(float a, enum cudaRoundMo de rndMode))
	{	{
	return __internal_float2ll_kernel(a, 9223372036854775807LL, -922337203685 4775807LL -1LL, -9223372036854775807LL -1LL, rndMode);	return __internal_float2ll_kernel(a, 9223372036854775807LL, -922337203685 4775807LL -1LL, -9223372036854775807LL -1LL, rndMode);
	}	}

	__device_func__(long long int __float2ll_rz(float a))	__device_func__(long long int __float2ll_rz(float a))
	{	{
	#if defined(__MULTI_CORE__)	#if defined(__MULTI_CORE__)
	return (long long int)a;	return (long long int)a;

	#else /* __MULTI_CORE__) */	#else /* __MULTI_CORE__ */
	return __internal_float2ll(a, cudaRoundZero);	return __internal_float2ll(a, cudaRoundZero);

	#endif /* __MULTI_CORE__) */	#endif /* __MULTI_CORE__ */
	}	}

	__device_func__(long long int __float2ll_ru(float a))	__device_func__(long long int __float2ll_ru(float a))
	{	{
	return __internal_float2ll(a, cudaRoundPosInf);	return __internal_float2ll(a, cudaRoundPosInf);
	}	}

	__device_func__(long long int __float2ll_rd(float a))	__device_func__(long long int __float2ll_rd(float a))
	{	{
	return __internal_float2ll(a, cudaRoundMinInf);	return __internal_float2ll(a, cudaRoundMinInf);

	skipping to change at line 589	skipping to change at line 589

	__device_func__(unsigned int __internal_float2uint(float a, enum cudaRoundM ode rndMode))	__device_func__(unsigned int __internal_float2uint(float a, enum cudaRoundM ode rndMode))
	{	{
	return (unsigned int)__internal_float2ull_kernel(a, 4294967295U, 0U, rndM ode);	return (unsigned int)__internal_float2ull_kernel(a, 4294967295U, 0U, rndM ode);
	}	}

	__device_func__(unsigned int __float2uint_rz(float a))	__device_func__(unsigned int __float2uint_rz(float a))
	{	{
	#if defined(__MULTI_CORE__)	#if defined(__MULTI_CORE__)
	return (unsigned int)a;	return (unsigned int)a;

	#else /* __MULTI_CORE__) */	#else /* __MULTI_CORE__ */
	return __internal_float2uint(a, cudaRoundZero);	return __internal_float2uint(a, cudaRoundZero);

	#endif /* __MULTI_CORE__) */	#endif /* __MULTI_CORE__ */
	}	}

	__device_func__(unsigned int __float2uint_ru(float a))	__device_func__(unsigned int __float2uint_ru(float a))
	{	{
	return __internal_float2uint(a, cudaRoundPosInf);	return __internal_float2uint(a, cudaRoundPosInf);
	}	}

	__device_func__(unsigned int __float2uint_rd(float a))	__device_func__(unsigned int __float2uint_rd(float a))
	{	{
	return __internal_float2uint(a, cudaRoundMinInf);	return __internal_float2uint(a, cudaRoundMinInf);

	skipping to change at line 618	skipping to change at line 618

	__device_func__(unsigned long long int __internal_float2ull(float a, enum c udaRoundMode rndMode))	__device_func__(unsigned long long int __internal_float2ull(float a, enum c udaRoundMode rndMode))
	{	{
	return __internal_float2ull_kernel(a, 18446744073709551615ULL, 9223372036 854775808ULL, rndMode);	return __internal_float2ull_kernel(a, 18446744073709551615ULL, 9223372036 854775808ULL, rndMode);
	}	}

	__device_func__(unsigned long long int __float2ull_rz(float a))	__device_func__(unsigned long long int __float2ull_rz(float a))
	{	{
	#if defined(__MULTI_CORE__)	#if defined(__MULTI_CORE__)
	return (unsigned long long int)a;	return (unsigned long long int)a;

	#else /* __MULTI_CORE__) */	#else /* __MULTI_CORE__ */
	return __internal_float2ull(a, cudaRoundZero);	return __internal_float2ull(a, cudaRoundZero);

	#endif /* __MULTI_CORE__) */	#endif /* __MULTI_CORE__ */
	}	}

	__device_func__(unsigned long long int __float2ull_ru(float a))	__device_func__(unsigned long long int __float2ull_ru(float a))
	{	{
	return __internal_float2ull(a, cudaRoundPosInf);	return __internal_float2ull(a, cudaRoundPosInf);
	}	}

	__device_func__(unsigned long long int __float2ull_rd(float a))	__device_func__(unsigned long long int __float2ull_rd(float a))
	{	{
	return __internal_float2ull(a, cudaRoundMinInf);	return __internal_float2ull(a, cudaRoundMinInf);

	skipping to change at line 726	skipping to change at line 726

	__device_func__(float __int2float_rd(int a))	__device_func__(float __int2float_rd(int a))
	{	{
	return __internal_int2float_kernel(a, cudaRoundMinInf);	return __internal_int2float_kernel(a, cudaRoundMinInf);
	}	}

	__device_func__(float __int2float_rn(int a))	__device_func__(float __int2float_rn(int a))
	{	{
	#if defined(__MULTI_CORE__)	#if defined(__MULTI_CORE__)
	return (float)a;	return (float)a;

	#else /* __MULTI_CORE__) */	#else /* __MULTI_CORE__ */
	return __internal_int2float_kernel(a, cudaRoundNearest);	return __internal_int2float_kernel(a, cudaRoundNearest);

	#endif /* __MULTI_CORE__) */	#endif /* __MULTI_CORE__ */
	}	}

	__device_func__(float __internal_uint2float_kernel(unsigned int a, enum cud aRoundMode rndMode))	__device_func__(float __internal_uint2float_kernel(unsigned int a, enum cud aRoundMode rndMode))
	{	{
	volatile union {	volatile union {
	float f;	float f;
	unsigned int i;	unsigned int i;
	} res;	} res;
	int shift;	int shift;
	unsigned int t;	unsigned int t;

	skipping to change at line 773	skipping to change at line 773

	__device_func__(float __uint2float_rd(unsigned int a))	__device_func__(float __uint2float_rd(unsigned int a))
	{	{
	return __internal_uint2float_kernel(a, cudaRoundMinInf);	return __internal_uint2float_kernel(a, cudaRoundMinInf);
	}	}

	__device_func__(float __uint2float_rn(unsigned int a))	__device_func__(float __uint2float_rn(unsigned int a))
	{	{
	#if defined(__MULTI_CORE__)	#if defined(__MULTI_CORE__)
	return (float)a;	return (float)a;

	#else /* __MULTI_CORE__) */	#else /* __MULTI_CORE__ */
	return __internal_uint2float_kernel(a, cudaRoundNearest);	return __internal_uint2float_kernel(a, cudaRoundNearest);

	#endif /* __MULTI_CORE__) */	#endif /* __MULTI_CORE__ */
	}	}

	__device_func__(float __ll2float_rn(long long int a))	__device_func__(float __ll2float_rn(long long int a))
	{	{
	return (float)a;	return (float)a;
	}	}

	__device_func__(float __ull2float_rn(unsigned long long int a))	__device_func__(float __ull2float_rn(unsigned long long int a))
	{	{
	#if defined(__MULTI_CORE__)	#if defined(__MULTI_CORE__)
	return (float)a;	return (float)a;

	#else /* __MULTI_CORE__) */	#else /* __MULTI_CORE__ */
	unsigned long long int temp;	unsigned long long int temp;
	unsigned int res, t;	unsigned int res, t;
	int shift;	int shift;
	if (a == 0ULL) return 0.0f;	if (a == 0ULL) return 0.0f;
	temp = a;	temp = a;
	shift = __internal_normalize64(&temp);	shift = __internal_normalize64(&temp);
	temp = (temp >> 8) \| ((temp & 0xffULL) ? 1ULL : 0ULL);	temp = (temp >> 8) \| ((temp & 0xffULL) ? 1ULL : 0ULL);
	res = (unsigned int)(temp >> 32);	res = (unsigned int)(temp >> 32);
	t = (unsigned int)temp;	t = (unsigned int)temp;
	res += (127 + 62 - shift) << 23; /* add in exponent */	res += (127 + 62 - shift) << 23; /* add in exponent */
	res += t == 0x80000000 ? res & 1 : t >> 31;	res += t == 0x80000000 ? res & 1 : t >> 31;
	return __int_as_float(res);	return __int_as_float(res);

	#endif /* __MULTI_CORE__) */	#endif /* __MULTI_CORE__ */
	}	}

	__device_func__(float __internal_fmul_kernel(float a, float b, int rndNeare st))	__device_func__(float __internal_fmul_kernel(float a, float b, int rndNeare st))
	{	{
	unsigned long long product;	unsigned long long product;
	volatile union {	volatile union {
	float f;	float f;
	unsigned int i;	unsigned int i;
	} xx, yy;	} xx, yy;
	unsigned expo_x, expo_y;	unsigned expo_x, expo_y;

	skipping to change at line 1069	skipping to change at line 1069
	expo_x = (unsigned int)(-((int)expo_x));	expo_x = (unsigned int)(-((int)expo_x));
	temp = xx.i << expo_y \| ((temp) ? 1 : 0);	temp = xx.i << expo_y \| ((temp) ? 1 : 0);
	xx.i = yy.i \| (xx.i >> expo_x);	xx.i = yy.i \| (xx.i >> expo_x);
	xx.i += (((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31))	xx.i += (((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31))
	&& rndNearest);	&& rndNearest);
	return xx.f;	return xx.f;
	} else {	} else {
	/* handle special cases separately */	/* handle special cases separately */
	if (!(yy.i << 1)) {	if (!(yy.i << 1)) {
	if (xx.i == 0x80000000) {	if (xx.i == 0x80000000) {

	xx.i = yy.i;	xx.i = yy.i;
		}
		if ((xx.i << 1) > 0xff000000) {
		xx.i \|= 0x00400000;
	}	}
	return xx.f;	return xx.f;
	}	}
	if ((expo_y != 254) && (expo_x != 254)) {	if ((expo_y != 254) && (expo_x != 254)) {
	/* remove sign bits */	/* remove sign bits */
	if (expo_x == (unsigned int) -1) {	if (expo_x == (unsigned int) -1) {
	temp = xx.i & 0x80000000;	temp = xx.i & 0x80000000;
	xx.i = xx.i << 8;	xx.i = xx.i << 8;
	while (!(xx.i & 0x80000000)) {	while (!(xx.i & 0x80000000)) {
	xx.i <<= 1;	xx.i <<= 1;

	skipping to change at line 1155	skipping to change at line 1158
	return __internal_fmul_kernel(a, b, 1);	return __internal_fmul_kernel(a, b, 1);
	}	}

	__device_func__(void __brkpt(int c))	__device_func__(void __brkpt(int c))
	{	{
	/* TODO */	/* TODO */
	}	}

	#if defined(__MULTI_CORE__)	#if defined(__MULTI_CORE__)


	__device_func__(void __syncthreads(void))	#define __syncthreads() \
	{	__builtin___syncthreads()
	#undef __syncthreads
	__syncthreads();
	#define __syncthreads \
	__syncthreads_wrapper
	}

	#else /* __MULTI_CORE__ */	#else /* __MULTI_CORE__ */

	extern int CUDARTAPI __cudaSynchronizeThreads(void*, void);	extern int CUDARTAPI __cudaSynchronizeThreads(void*, void);

	#if defined(__GNUC__)	#if defined(__GNUC__)

	__device_func__(inline __attribute__((always_inline)) void __syncthreads(vo id))	__device_func__(inline __attribute__((always_inline)) void __syncthreads(vo id))
	{	{
	volatile int _ = 0;	volatile int _ = 0;

	skipping to change at line 1262	skipping to change at line 1260
	a *= .25f;	a *= .25f;
	b *= .25f;	b *= .25f;
	}	}
	return __fdividef(a, b);	return __fdividef(a, b);
	}	}

	__device_func__(float __tanf(float a))	__device_func__(float __tanf(float a))
	{	{
	#if defined(__MULTI_CORE__)	#if defined(__MULTI_CORE__)
	return tanf(a);	return tanf(a);

	#else /* __MULTI_CORE__) */	#else /* __MULTI_CORE__ */
	return __sinf(a) / __cosf(a);	return __sinf(a) / __cosf(a);

	#endif /* __MULTI_CORE__) */	#endif /* __MULTI_CORE__ */
	}	}

	__device_func__(void __sincosf(float a, float sptr, float cptr))	__device_func__(void __sincosf(float a, float sptr, float cptr))
	{	{
	#if defined(__MULTI_CORE__)	#if defined(__MULTI_CORE__)
	sincosf(a, sptr, cptr);	sincosf(a, sptr, cptr);

	#else /* __MULTI_CORE__) */	#else /* __MULTI_CORE__ */
	*sptr = __sinf(a);	*sptr = __sinf(a);
	*cptr = __cosf(a);	*cptr = __cosf(a);

	#endif /* __MULTI_CORE__) */	#endif /* __MULTI_CORE__ */
	}	}

	__device_func__(float __expf(float a))	__device_func__(float __expf(float a))
	{	{
	#if defined(__MULTI_CORE__)	#if defined(__MULTI_CORE__)
	return expf(a);	return expf(a);

	#else /* __MULTI_CORE__) */	#else /* __MULTI_CORE__ */
	return __cuda_exp2f(a * CUDART_L2E_F);	return __cuda_exp2f(a * CUDART_L2E_F);

	#endif /* __MULTI_CORE__) */	#endif /* __MULTI_CORE__ */
	}	}

	__device_func__(float __exp10f(float a))	__device_func__(float __exp10f(float a))
	{	{
	#if defined(__MULTI_CORE__)	#if defined(__MULTI_CORE__)
	return exp10f(a);	return exp10f(a);

	#else /* __MULTI_CORE__) */	#else /* __MULTI_CORE__ */
	return __cuda_exp2f(a * CUDART_L2T_F);	return __cuda_exp2f(a * CUDART_L2T_F);

	#endif /* __MULTI_CORE__) */	#endif /* __MULTI_CORE__ */
	}	}

	__device_func__(float __log10f(float a))	__device_func__(float __log10f(float a))
	{	{
	#if defined(__MULTI_CORE__)	#if defined(__MULTI_CORE__)
	return log10f(a);	return log10f(a);

	#else /* __MULTI_CORE__) */	#else /* __MULTI_CORE__ */
	return CUDART_LG2_F * __log2f(a);	return CUDART_LG2_F * __log2f(a);

	#endif /* __MULTI_CORE__) */	#endif /* __MULTI_CORE__ */
	}	}

	__device_func__(float __logf(float a))	__device_func__(float __logf(float a))
	{	{
	#if defined(__MULTI_CORE__)	#if defined(__MULTI_CORE__)
	return logf(a);	return logf(a);

	#else /* __MULTI_CORE__) */	#else /* __MULTI_CORE__ */
	return CUDART_LN2_F * __log2f(a);	return CUDART_LN2_F * __log2f(a);

	#endif /* __MULTI_CORE__) */	#endif /* __MULTI_CORE__ */
	}	}

	__device_func__(float __powf(float a, float b))	__device_func__(float __powf(float a, float b))
	{	{
	#if defined(__MULTI_CORE__)	#if defined(__MULTI_CORE__)
	return powf(a, b);	return powf(a, b);

	#else /* __MULTI_CORE__) */	#else /* __MULTI_CORE__ */
	return __cuda_exp2f(b * __log2f(a));	return __cuda_exp2f(b * __log2f(a));

	#endif /* __MULTI_CORE__) */	#endif /* __MULTI_CORE__ */
	}	}

	__device_func__(float fdividef(float a, float b))	__device_func__(float fdividef(float a, float b))
	{	{
	#if defined(__MULTI_CORE__)	#if defined(__MULTI_CORE__)
	return a / b;	return a / b;
	#elif defined(__USE_FAST_MATH__)	#elif defined(__USE_FAST_MATH__)
	return __fdividef(a, b);	return __fdividef(a, b);

	#else /* __USE_FAST_MATH__ */	#else /* __MULTI_CORE__ */
	return __internal_accurate_fdividef(a, b);	return __internal_accurate_fdividef(a, b);

	#endif /* __USE_FAST_MATH__ */	#endif /* __MULTI_CORE__ */
	}	}

	__device_func__(int __clz(int a))	__device_func__(int __clz(int a))
	{	{
	return (a)?(158-(__float_as_int(__uint2float_rz((unsigned int)a))>>23)):3 2;	return (a)?(158-(__float_as_int(__uint2float_rz((unsigned int)a))>>23)):3 2;
	}	}

	__device_func__(int __ffs(int a))	__device_func__(int __ffs(int a))
	{	{
	return 32 - __clz (a & -a);	return 32 - __clz (a & -a);
	}	}

	__device_func__(int __popc(unsigned int a))	__device_func__(int __popc(unsigned int a))
	{	{

	unsigned int s = 033333333333;	a = a - ((a >> 1) & 0x55555555);
	unsigned int t = 030707070707;	a = (a & 0x33333333) + ((a >> 2) & 0x33333333);
	unsigned int n;	a = (a + (a >> 4)) & 0x0f0f0f0f;
	n = (a >> 1) & s;	a = ((__umul24(a, 0x808080) << 1) + a) >> 24;
	a = a - n;
	n = (n >> 1) & s;
	a = a - n;
	n = (a >> 3) + a;
	a = n & t;
	t = (t << 2) & t; /* t = 0x04104104 */
	a = (a >> 30) + ((a * t) >> 26);
	return a;	return a;
	}	}

	__device_func__(int __clzll(long long int a))	__device_func__(int __clzll(long long int a))
	{	{

	int ahi = ((int)(a >> 32));	int ahi = ((int)((unsigned long long)a >> 32));
	int alo = ((int)(a & 0xffffffffULL));	int alo = ((int)((unsigned long long)a & 0xffffffffULL));
	int res;	int res;
	if (ahi) {	if (ahi) {
	res = 0;	res = 0;
	} else {	} else {
	res = 32;	res = 32;
	ahi = alo;	ahi = alo;
	}	}
	res = res + __clz(ahi);	res = res + __clz(ahi);
	return res;	return res;
	}	}

	__device_func__(int __ffsll(long long int a))	__device_func__(int __ffsll(long long int a))
	{	{
	return 64 - __clzll (a & -a);	return 64 - __clzll (a & -a);
	}	}

	__device_func__(int __popcll(unsigned long long int a))	__device_func__(int __popcll(unsigned long long int a))
	{	{
	unsigned int ahi = ((unsigned int)(a >> 32));	unsigned int ahi = ((unsigned int)(a >> 32));
	unsigned int alo = ((unsigned int)(a & 0xffffffffULL));	unsigned int alo = ((unsigned int)(a & 0xffffffffULL));

	return __popc(ahi) + __popc(alo);	alo = alo - ((alo >> 1) & 0x55555555);
		alo = (alo & 0x33333333) + ((alo >> 2) & 0x33333333);
		ahi = ahi - ((ahi >> 1) & 0x55555555);
		ahi = (ahi & 0x33333333) + ((ahi >> 2) & 0x33333333);
		alo = alo + ahi;
		alo = (alo & 0x0f0f0f0f) + ((alo >> 4) & 0x0f0f0f0f);
		alo = ((__umul24(alo, 0x808080) << 1) + alo) >> 24;
		return alo;
	}	}

	#if defined(CUDA_DOUBLE_MATH_FUNCTIONS) && defined(CUDA_FLOAT_MATH_FUNCTION S)	#if defined(CUDA_DOUBLE_MATH_FUNCTIONS) && defined(CUDA_FLOAT_MATH_FUNCTION S)

	#error -- conflicting mode for double math routines	#error -- conflicting mode for double math routines

	#endif /* CUDA_DOUBLE_MATH_FUNCTIONS && CUDA_FLOAT_MATH_FUNCTIONS */	#endif /* CUDA_DOUBLE_MATH_FUNCTIONS && CUDA_FLOAT_MATH_FUNCTIONS */

	#if defined(CUDA_FLOAT_MATH_FUNCTIONS)	#if defined(CUDA_FLOAT_MATH_FUNCTIONS)


End of changes. 35 change blocks.
	52 lines changed or deleted	50 lines changed or added

	device_launch_parameters.h	device_launch_parameters.h

	skipping to change at line 36	skipping to change at line 36
	* and is provided to the U.S. Government only as a commercial end item.	* and is provided to the U.S. Government only as a commercial end item.
	* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through	* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
	* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the	* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
	* source code with only those rights set forth herein.	* source code with only those rights set forth herein.
	*	*
	* Any use of this source code in individual and commercial software must	* Any use of this source code in individual and commercial software must
	* include, in the user documentation and internal comments to the code,	* include, in the user documentation and internal comments to the code,
	* the above Disclaimer and U.S. Government End Users Notice.	* the above Disclaimer and U.S. Government End Users Notice.
	*/	*/


	#if !defined(__DEVICE_LAUCH_PARAMETERS_H__)	#if !defined(__DEVICE_LAUNCH_PARAMETERS_H__)
	#define __DEVICE_LAUCH_PARAMETERS_H__	#define __DEVICE_LAUNCH_PARAMETERS_H__

	#include "vector_types.h"	#include "vector_types.h"

	#if !defined(__STORAGE__)	#if !defined(__STORAGE__)

	#define __STORAGE__ \	#define __STORAGE__ \
	extern const	extern const

	#endif /* __STORAGE__ */	#endif /* __STORAGE__ */


	skipping to change at line 104	skipping to change at line 104

	#endif /* __cudaGet_gridDim */	#endif /* __cudaGet_gridDim */

	#if !defined(__cudaGet_warpSize)	#if !defined(__cudaGet_warpSize)

	#define __cudaGet_warpSize() \	#define __cudaGet_warpSize() \
	warpSize	warpSize

	#endif /* __cudaGet_warpSize */	#endif /* __cudaGet_warpSize */


	#endif /* !__DEVICE_LAUCH_PARAMETERS_H__ */	#endif /* !__DEVICE_LAUNCH_PARAMETERS_H__ */

End of changes. 2 change blocks.
	2 lines changed or deleted	2 lines changed or added

	device_runtime.h	device_runtime.h

	skipping to change at line 42	skipping to change at line 42
	* include, in the user documentation and internal comments to the code,	* include, in the user documentation and internal comments to the code,
	* the above Disclaimer and U.S. Government End Users Notice.	* the above Disclaimer and U.S. Government End Users Notice.
	*/	*/

	#if !defined(__CUDA_INTERNAL_COMPILATION__)	#if !defined(__CUDA_INTERNAL_COMPILATION__)

	#define __CUDA_INTERNAL_COMPILATION__	#define __CUDA_INTERNAL_COMPILATION__

	#endif /* !__CUDA_INTERNAL_COMPILATION__ */	#endif /* !__CUDA_INTERNAL_COMPILATION__ */


	#include "../host_defines.h"	#include "host_defines.h"


	#define __texture__(type, pref, dim) \
	__##pref##dim##texture
	#define __no_sc__	#define __no_sc__
	#define __empty_array(s) \	#define __empty_array(s) \
	s	s
	#define __unsized_shared_data(name, type_post) \	#define __unsized_shared_data(name, type_post) \
	__unsized##name __unsized##type_post	__unsized##name __unsized##type_post
	#define __sized_shared_data(name, type) \	#define __sized_shared_data(name, type) \
	__sized##name type	__sized##name type
	#define __sized__shared_var(name, s, type) \	#define __sized__shared_var(name, s, type) \
	name	name

	/TEXTURE_TYPE/	/TEXTURE_TYPE/

	typedef const void* __i1texture;	typedef const void *__texture_type__;
	/TEXTURE_TYPE/	/SURFACE_TYPE/
	typedef const void* __i2texture;	typedef const void *__surface_type__;
	/TEXTURE_TYPE/
	typedef const void* __i3texture;
	/TEXTURE_TYPE/
	typedef const void* __u1texture;
	/TEXTURE_TYPE/
	typedef const void* __u2texture;
	/TEXTURE_TYPE/
	typedef const void* __u3texture;
	/TEXTURE_TYPE/
	typedef const void* __f1texture;
	/TEXTURE_TYPE/
	typedef const void* __f2texture;
	/TEXTURE_TYPE/
	typedef const void* __f3texture;

	#if defined(__CUDABE__) /* cudabe compiler */	#if defined(__CUDABE__) /* cudabe compiler */

	#define __pad__(f)	#define __pad__(f)
	#define __text__ \	#define __text__ \
	__attribute__((__texture__))	__attribute__((__texture__))

		#define __surf__ \
		__attribute__((__surface__))
	#define ___device__(sc) \	#define ___device__(sc) \
	static	static
	#define __in__(cdecl, decl) \	#define __in__(cdecl, decl) \
	__shared__ cdecl	__shared__ cdecl
	#define __in_type__(cdecl, decl) \	#define __in_type__(cdecl, decl) \
	cdecl	cdecl
	#define __texture_var(name) \	#define __texture_var(name) \
	name	name
	#define __shared_var(name, s, type) \	#define __shared_var(name, s, type) \
	name	name

	skipping to change at line 116	skipping to change at line 102
	#define __cdecl	#define __cdecl
	#undef __w64	#undef __w64
	#define __w64	#define __w64

	#elif defined(__CUDACC__) /* cudafe compiler */	#elif defined(__CUDACC__) /* cudafe compiler */

	#define __loc_sc__(loc, sc) \	#define __loc_sc__(loc, sc) \
	sc loc	sc loc
	#define __pad__(f)	#define __pad__(f)
	#define __text__	#define __text__

		#define __surf__
	#define ___device__(sc) \	#define ___device__(sc) \
	sc __device__	sc __device__
	#define __in__(cdecl, decl) \	#define __in__(cdecl, decl) \
	decl	decl
	#define __in_type__(cdecl, decl) \	#define __in_type__(cdecl, decl) \
	decl	decl
	#define __texture_var(name) \	#define __texture_var(name) \
	name	name
	#define __shared_var(name, s, type) \	#define __shared_var(name, s, type) \
	name	name

	skipping to change at line 152	skipping to change at line 139
	struct uint3;	struct uint3;
	extern struct uint3* CUDARTAPI __cudaGetBlockIdxPtr(void);	extern struct uint3* CUDARTAPI __cudaGetBlockIdxPtr(void);
	extern void* CUDARTAPI __cudaGetSharedMem(void*);	extern void* CUDARTAPI __cudaGetSharedMem(void*);

	#endif /* __MULTI_CORE__ \|\| __multi_core__ */	#endif /* __MULTI_CORE__ \|\| __multi_core__ */

	#if defined (__multi_core__)	#if defined (__multi_core__)

	#if defined(__GNUC__)	#if defined(__GNUC__)


		#if defined(__cplusplus)

		extern void *alloca(size_t) throw();

		#else /* __cplusplus */

	extern void *alloca(size_t);	extern void *alloca(size_t);


		#endif /* __cplusplus */

	#define __cuda_alloc(s) \	#define __cuda_alloc(s) \
	alloca(s)	alloca(s)

	#else /* __GNUC__ */	#else /* __GNUC__ */

	extern void *_alloca(size_t);	extern void *_alloca(size_t);

	#define __cuda_alloc(s) \	#define __cuda_alloc(s) \
	_alloca(s)	_alloca(s)

	#endif /* __GNUC__ */	#endif /* __GNUC__ */

	#endif /* __multi_core__ */	#endif /* __multi_core__ */

	#if defined (__MULTI_CORE__)	#if defined (__MULTI_CORE__)


	#define __syncthreads \	#define ___device__(sc) \
	__syncthreads_wrapper	static
	#define __pad__(f) \	#define __pad__(f) \
	f	f
	#define __text__	#define __text__

		#define __surf__
	#define __cudaGet_blockIdx() \	#define __cudaGet_blockIdx() \
	(*__cudaGetBlockIdxPtr())	(*__cudaGetBlockIdxPtr())
	#define __shared_var(name, s, type) \	#define __shared_var(name, s, type) \
	(s type __cudaGetSharedMem((void*)(&(name))))	(s type __cudaGetSharedMem((void*)(&(name))))
	#define __var_used__ \	#define __var_used__ \
	__attribute__((__used__))	__attribute__((__used__))

		#define __storage_auto__shared__ \
		auto

	#undef __cdecl	#undef __cdecl
	#define __cdecl	#define __cdecl
	#undef __w64	#undef __w64
	#define __w64	#define __w64

	#else /* __MULTI_CORE__ */	#else /* __MULTI_CORE__ */


	#define __shared_var(name, s, type) \
	name

	#endif /* __MULTI_CORE__ */

	#define ___device__(sc) \	#define ___device__(sc) \
	static __device__	static __device__

	#define __in__(cdecl, decl) \	#define __shared_var(name, s, type) \
	decl
	#define __in_type__(cdecl, decl) \
	decl
	#define __texture_var(name) \
	__texture_##name
	#define __cuda_host_device_name(name) \
	__cuda_host_device_##name
	#define __val_param(name) \
	name	name

	#define __copy_param(local_decl, param)
	#define __unsized_array_size
	#define __unsized__shared_var(name, s, type) \
	(*name)
	#define __unsized__empty_array(s)


	#if defined(__APPLE__)	#if defined(__APPLE__) \|\| defined(__ICC)

	#define __STORAGE__ \	#define __STORAGE__ \
	__attribute__((__weak__))	__attribute__((__weak__))

	#elif defined(__GNUC__)	#elif defined(__GNUC__)

	#define __STORAGE__ \	#define __STORAGE__ \
	__attribute__((__common__))	__attribute__((__common__))

	#elif defined(__cplusplus)	#elif defined(__cplusplus)

	#define __STORAGE__ \	#define __STORAGE__ \
	__declspec(selectany)	__declspec(selectany)

	#pragma warning(disable: 4099 4190)	#pragma warning(disable: 4099 4190)


	#else /* __cplusplus */	#else /* __APPLE__ \|\| __ICC */

	#define __STORAGE__	#define __STORAGE__


	#endif /* __APPLE__ */	#endif /* __APPLE__ \|\| __ICC */

		#endif /* __MULTI_CORE__ */

		#define __in__(cdecl, decl) \
		decl
		#define __in_type__(cdecl, decl) \
		decl
		#define __texture_var(name) \
		__texture_##name
		#define __cuda_host_device_name(name) \
		__cuda_host_device_##name
		#define __val_param(name) \
		name
		#define __copy_param(local_decl, param)
		#define __unsized_array_size
		#define __unsized__shared_var(name, s, type) \
		(*name)
		#define __unsized__empty_array(s)

	/* this is compiled with a host compiler for device emulation */	/* this is compiled with a host compiler for device emulation */
	#define __device_emulation	#define __device_emulation

	#endif /* __CUDABE__ */	#endif /* __CUDABE__ */


	#include "../builtin_types.h"	#include "builtin_types.h"
	#include "../device_launch_parameters.h"	#include "device_launch_parameters.h"
	#include "storage_class.h"	#include "storage_class.h"

End of changes. 17 change blocks.
	46 lines changed or deleted	44 lines changed or added

	driver_types.h	driver_types.h

	skipping to change at line 96	skipping to change at line 96
	cudaErrorSynchronizationError,	cudaErrorSynchronizationError,
	cudaErrorInvalidFilterSetting,	cudaErrorInvalidFilterSetting,
	cudaErrorInvalidNormSetting,	cudaErrorInvalidNormSetting,
	cudaErrorMixedDeviceExecution,	cudaErrorMixedDeviceExecution,
	cudaErrorCudartUnloading,	cudaErrorCudartUnloading,
	cudaErrorUnknown,	cudaErrorUnknown,
	cudaErrorNotYetImplemented,	cudaErrorNotYetImplemented,
	cudaErrorMemoryValueTooLarge,	cudaErrorMemoryValueTooLarge,
	cudaErrorInvalidResourceHandle,	cudaErrorInvalidResourceHandle,
	cudaErrorNotReady,	cudaErrorNotReady,

		cudaErrorInsufficientDriver,
		cudaErrorSetOnActiveProcess,
	cudaErrorStartupFailure = 0x7f,	cudaErrorStartupFailure = 0x7f,
	cudaErrorApiFailureBase = 10000	cudaErrorApiFailureBase = 10000
	};	};

	/DEVICE_BUILTIN/	/DEVICE_BUILTIN/
	enum cudaChannelFormatKind	enum cudaChannelFormatKind
	{	{
	cudaChannelFormatKindSigned,	cudaChannelFormatKindSigned,
	cudaChannelFormatKindUnsigned,	cudaChannelFormatKindUnsigned,

	cudaChannelFormatKindFloat	cudaChannelFormatKindFloat,
		cudaChannelFormatKindNone
	};	};

	/DEVICE_BUILTIN/	/DEVICE_BUILTIN/
	struct cudaChannelFormatDesc	struct cudaChannelFormatDesc
	{	{
	int x;	int x;
	int y;	int y;
	int z;	int z;
	int w;	int w;
	enum cudaChannelFormatKind f;	enum cudaChannelFormatKind f;

	skipping to change at line 189	skipping to change at line 192
	int maxThreadsPerBlock;	int maxThreadsPerBlock;
	int maxThreadsDim[3];	int maxThreadsDim[3];
	int maxGridSize[3];	int maxGridSize[3];
	int clockRate;	int clockRate;
	size_t totalConstMem;	size_t totalConstMem;
	int major;	int major;
	int minor;	int minor;
	size_t textureAlignment;	size_t textureAlignment;
	int deviceOverlap;	int deviceOverlap;
	int multiProcessorCount;	int multiProcessorCount;

	int __cudaReserved[40];	int kernelExecTimeoutEnabled;
		int __cudaReserved[39];
	};	};


	#define cudaDevicePropDontCare \	#define cudaDevicePropDontCare \
	{ \	{ \
	{'\0'}, /* char name[256]; */ \	{'\0'}, /* char name[256]; */ \
	0, /* size_t totalGlobalMem; */ \	0, /* size_t totalGlobalMem; */ \
	0, /* size_t sharedMemPerBlock; */ \	0, /* size_t sharedMemPerBlock; */ \
	0, /* int regsPerBlock; */ \	0, /* int regsPerBlock; */ \
	0, /* int warpSize; */ \	0, /* int warpSize; */ \
	0, /* size_t memPitch; */ \	0, /* size_t memPitch; */ \
	0, /* int maxThreadsPerBlock; */ \	0, /* int maxThreadsPerBlock; */ \
	{0, 0, 0}, /* int maxThreadsDim[3]; */ \	{0, 0, 0}, /* int maxThreadsDim[3]; */ \
	{0, 0, 0}, /* int maxGridSize[3]; */ \	{0, 0, 0}, /* int maxGridSize[3]; */ \
	0, /* int clockRate; */ \	0, /* int clockRate; */ \
	0, /* size_t totalConstMem; */ \	0, /* size_t totalConstMem; */ \
	-1, /* int major; */ \	-1, /* int major; */ \
	-1, /* int minor; */ \	-1, /* int minor; */ \
	0, /* size_t textureAlignment; */ \	0, /* size_t textureAlignment; */ \
	-1, /* int deviceOverlap; */ \	-1, /* int deviceOverlap; */ \
	0 /* int multiProcessorCount; */ \	0, /* int multiProcessorCount; */ \
		0 /* int kernelExecTimeoutEnabled */ \
	}	}

	/************************************************************************ ***	/************************************************************************ ***
	* *	* *
	* SHORTHAND TYPE DEFINITION USED BY RUNTIME API *	* SHORTHAND TYPE DEFINITION USED BY RUNTIME API *
	* *	* *
	************************************************************************* **/	************************************************************************* **/

	/DEVICE_BUILTIN/	/DEVICE_BUILTIN/
	typedef enum cudaError cudaError_t;	typedef enum cudaError cudaError_t;

End of changes. 4 change blocks.
	20 lines changed or deleted	25 lines changed or added

	host_config.h	host_config.h

	skipping to change at line 65	skipping to change at line 65
	#include <bits/c++config.h> /* get _GLIBCXX_ATOMIC_BUILTINS */	#include <bits/c++config.h> /* get _GLIBCXX_ATOMIC_BUILTINS */

	#if _GLIBCXX_ATOMIC_BUILTINS == 1	#if _GLIBCXX_ATOMIC_BUILTINS == 1

	#undef _GLIBCXX_ATOMIC_BUILTINS /* for missing __sync_fetch_and_add */	#undef _GLIBCXX_ATOMIC_BUILTINS /* for missing __sync_fetch_and_add */

	#endif /* _GLIBCXX_ATOMIC_BUILTINS == 1 */	#endif /* _GLIBCXX_ATOMIC_BUILTINS == 1 */

	#elif defined(_WIN32)	#elif defined(_WIN32)


	#if _MSC_VER == 1400	#if _MSC_VER >= 1400

		#undef _USE_DECLSPECS_FOR_SAL
		#define _USE_DECLSPECS_FOR_SAL \
		1

	#if !defined(_CRT_NONSTDC_NO_WARNINGS)	#if !defined(_CRT_NONSTDC_NO_WARNINGS)

	#define _CRT_NONSTDC_NO_WARNINGS /* to suppress warnings */	#define _CRT_NONSTDC_NO_WARNINGS /* to suppress warnings */

	#endif /* _CRT_NONSTDC_NO_WARNINGS */	#endif /* _CRT_NONSTDC_NO_WARNINGS */

	#if !defined(_CRT_SECURE_NO_WARNINGS)	#if !defined(_CRT_SECURE_NO_WARNINGS)

	#define _CRT_SECURE_NO_WARNINGS /* to suppress warnings */	#define _CRT_SECURE_NO_WARNINGS /* to suppress warnings */

	#endif /* _CRT_SECURE_NO_WARNINGS */	#endif /* _CRT_SECURE_NO_WARNINGS */


	#endif /* _MSC_VER == 1400 */	#endif /* _MSC_VER >= 1400 */

	#if !defined(NOMINMAX)	#if !defined(NOMINMAX)

	#define NOMINMAX /* min and max are part of cuda runtime */	#define NOMINMAX /* min and max are part of cuda runtime */

	#endif /* !NOMINMAX */	#endif /* !NOMINMAX */

	#define __THROW	#define __THROW


	class type_info; /* forward declaration */	/* forward declarations for windows C++ header files */

		#include <stddef.h>

		class type_info;

		#if !defined(_CRTIMP)
		#if defined(_DLL)
		#define _CRTIMP \
		__declspec(dllimport)
		#else /* _DLL */
		#define _CRTIMP
		#endif /* _DLL */
		#endif /* !_CRTIMP */
		#if defined(_DEBUG)
		#if !defined(_NATIVE_WCHAR_T_DEFINED) && defined(_M_CEE_PURE)
		extern "C++"
		#else /* !_NATIVE_WCHAR_T_DEFINED && _M_CEE_PURE */
		extern "C"
		#endif /* !_NATIVE_WCHAR_T_DEFINED && _M_CEE_PURE */
		_CRTIMP void __cdecl _invalid_parameter(const wchar_t, const wchar_t, con
		st wchar_t*, unsigned int, uintptr_t);
		#else /* _DEBUG */
		extern "C" _CRTIMP void __cdecl _invalid_parameter_noinfo(void);
		#endif /* DEBUG */

		namespace std
		{
		struct _Secure_char_traits_tag;
		namespace _Traits_helper
		{
		template<class T> inline typename T::char_type *copy_s(typename T::char
		_type, size_t, const typename T::char_type, size_t, _Secure_char_traits_t
		ag);
		template<class T> inline typename T::char_type *move_s(typename T::char
		_type, size_t, const typename T::char_type, size_t, _Secure_char_traits_t
		ag);
		}
		}

		namespace stdext
		{
		template<class U, class V, class W> inline V unchecked_uninitialized_copy
		(U, U, V, W&);
		}

	#endif /* __GNUC__ */	#endif /* __GNUC__ */

	#endif /* __CUDACC__ */	#endif /* __CUDACC__ */

	#endif /* !__HOST_CONFIG_H__ */	#endif /* !__HOST_CONFIG_H__ */

End of changes. 3 change blocks.
	3 lines changed or deleted	51 lines changed or added

	host_defines.h	host_defines.h

	skipping to change at line 49	skipping to change at line 49
	#if !defined(__GNUC__) && !defined(_WIN32)	#if !defined(__GNUC__) && !defined(_WIN32)

	#error --- !!! UNSUPPORTED COMPILER !!! ---	#error --- !!! UNSUPPORTED COMPILER !!! ---

	#elif defined(__GNUC__)	#elif defined(__GNUC__)

	#define __no_return__ \	#define __no_return__ \
	__attribute__((__noreturn__))	__attribute__((__noreturn__))
	#define __noinline__ \	#define __noinline__ \
	__attribute__((__noinline__))	__attribute__((__noinline__))

		#define __forceinline__ \
		__inline__ __attribute__((__always_inline__))
	#define __align__(n) \	#define __align__(n) \
	__attribute__((__aligned__(n)))	__attribute__((__aligned__(n)))
	#define __thread__ \	#define __thread__ \
	__thread	__thread
	#define __import__	#define __import__
	#define __export__	#define __export__
	#define __location__(a) \	#define __location__(a) \
	__loc__(__attribute__((a)))	__loc__(__attribute__((a)))
	#define CUDARTAPI	#define CUDARTAPI


	skipping to change at line 78	skipping to change at line 80
	#define __restrict__	#define __restrict__

	#endif /* _MSC_VER >= 1400 */	#endif /* _MSC_VER >= 1400 */

	#define __inline__ \	#define __inline__ \
	__inline	__inline
	#define __no_return__ \	#define __no_return__ \
	__declspec(noreturn)	__declspec(noreturn)
	#define __noinline__ \	#define __noinline__ \
	__declspec(noinline)	__declspec(noinline)

		#define __forceinline__ \
		__forceinline
	#define __align__(n) \	#define __align__(n) \
	__declspec(align(n))	__declspec(align(n))
	#define __thread__ \	#define __thread__ \
	__declspec(thread)	__declspec(thread)
	#define __import__ \	#define __import__ \
	__declspec(dllimport)	__declspec(dllimport)
	#define __export__ \	#define __export__ \
	__declspec(dllexport)	__declspec(dllexport)
	#define __location__(a) \	#define __location__(a) \
	__loc__(__declspec(a))	__loc__(__declspec(a))
	#define CUDARTAPI \	#define CUDARTAPI \
	__stdcall	__stdcall

	#endif /* !__GNUC__ && !_WIN32 */	#endif /* !__GNUC__ && !_WIN32 */


	#if defined(__CUDACC__) \|\| defined(__CUDABE__)	#if defined(__CUDACC__) \|\| defined(__CUDABE__) \|\| defined (__MULTI_CORE__)

	#define __loc__(a) \	#define __loc__(a) \
	a	a
	#define __builtin_align__(a) \	#define __builtin_align__(a) \
	__align__(a)	__align__(a)


	#else /* __CUDACC__ \|\| __CUDABE__ */	#else /* __CUDACC__ \|\| __CUDABE__ \|\| __MULTI_CORE__ */

	#define __loc__(a)	#define __loc__(a)
	#define __builtin_align__(a)	#define __builtin_align__(a)


	#endif /* __CUDACC__ \|\| __CUDABE__ */	#endif /* __CUDACC__ \|\| __CUDABE__ \|\| __MULTI_CORE__ */

	#define __device__ \	#define __device__ \
	__location__(__device__)	__location__(__device__)
	#define __host__ \	#define __host__ \
	__location__(__host__)	__location__(__host__)
	#define __global__ \	#define __global__ \
	__location__(__global__)	__location__(__global__)
	#define __shared__ \	#define __shared__ \
	__location__(__shared__)	__location__(__shared__)
	#define __constant__ \	#define __constant__ \

End of changes. 5 change blocks.
	3 lines changed or deleted	7 lines changed or added

	host_runtime.h	host_runtime.h

	skipping to change at line 76	skipping to change at line 76

	#endif /* !__cplusplus */	#endif /* !__cplusplus */

	#include "cuda_runtime_api.h"	#include "cuda_runtime_api.h"
	#include "storage_class.h"	#include "storage_class.h"

	#else /* !__CUDA_INTERNAL_COMPILATION__ */	#else /* !__CUDA_INTERNAL_COMPILATION__ */

	#include "host_defines.h"	#include "host_defines.h"


	#define __cudaRegisterBinary() \	#define __cudaRegisterBinary()
	__cudaFatCubinHandle = __cudaRegisterFatBinary((void*)__cudaFatCubi	\
	n)	__cudaFatCubinHandle = __cudaRegisterFatBinary((void*)__cudaFatCubi
		n); \
		atexit(__cudaUnregisterBinaryUtil)
	#define __cudaRegisterVariable(var, ext, size, constant, global) \	#define __cudaRegisterVariable(var, ext, size, constant, global) \
	__cudaRegisterVar(__cudaFatCubinHandle, (char)&__host##var, (char )__device##var, __name##var, ext, size, constant, global)	__cudaRegisterVar(__cudaFatCubinHandle, (char)&__host##var, (char )__device##var, __name##var, ext, size, constant, global)
	#define __cudaRegisterGlobalTexture(tex, dim, norm, ext) \	#define __cudaRegisterGlobalTexture(tex, dim, norm, ext) \
	__cudaRegisterTexture(__cudaFatCubinHandle, (const struct textureRe ference*)&tex, __tex_var(tex), #tex, dim, norm, ext)	__cudaRegisterTexture(__cudaFatCubinHandle, (const struct textureRe ference*)&tex, __tex_var(tex), #tex, dim, norm, ext)
	#define __cudaRegisterUnsizedShared(var) \	#define __cudaRegisterUnsizedShared(var) \
	__cudaRegisterShared(__cudaFatCubinHandle, (void**)__device_var(var ))	__cudaRegisterShared(__cudaFatCubinHandle, (void**)__device_var(var ))
	#define __cudaRegisterSharedVariable(var, size, align, sc) \	#define __cudaRegisterSharedVariable(var, size, align, sc) \
	__cudaRegisterSharedVar(__cudaFatCubinHandle, (void**)__device_var( var), size, align, sc)	__cudaRegisterSharedVar(__cudaFatCubinHandle, (void**)__device_var( var), size, align, sc)
	#define __cudaRegisterEntry(fun, thread_limit) \	#define __cudaRegisterEntry(fun, thread_limit) \
	__cudaRegisterFunction(__cudaFatCubinHandle, (const char)__device_ stub_##fun, (char)__device_fun(fun), #fun, thread_limit, __ids)	__cudaRegisterFunction(__cudaFatCubinHandle, (const char)__device_ stub_##fun, (char)__device_fun(fun), #fun, thread_limit, __ids)


		#define __cudaInitArgBlock(arg) \
		char __[256]; \
		(char*)&arg = __
	#define __cudaSetupArg(arg, offset) \	#define __cudaSetupArg(arg, offset) \

	if (cudaSetupArgument((void)(char)&arg, sizeof(arg), (size_t)&off set.arg - (size_t)&offset) != cudaSuccess) \	if (cudaSetupArgument((void)(char)&arg, sizeof(arg), (size_t)&off set->arg - (size_t)offset) != cudaSuccess) \
	return	return
	#define __cudaLaunch(fun) \	#define __cudaLaunch(fun) \
	{ volatile static char *__f; __f = fun; (void)cudaLaunch(fun); }	{ volatile static char *__f; __f = fun; (void)cudaLaunch(fun); }

	#if defined(__cplusplus)	#if defined(__cplusplus)
	extern "C" {	extern "C" {
	#endif /* __cplusplus */	#endif /* __cplusplus */

	extern void** CUDARTAPI __cudaRegisterFatBinary(	extern void** CUDARTAPI __cudaRegisterFatBinary(
	void *fatCubin	void *fatCubin

	skipping to change at line 158	skipping to change at line 162
	uint3 *bid,	uint3 *bid,
	dim3 *bDim,	dim3 *bDim,
	dim3 *gDim,	dim3 *gDim,
	int *wSize	int *wSize
	);	);

	#if defined(__cplusplus)	#if defined(__cplusplus)
	}	}
	#endif /* __cplusplus */	#endif /* __cplusplus */


	static void **__cudaFatCubinHandle;	#if defined(__GNUC__) && defined(__cplusplus)

	static void __cudaUnregisterBinaryUtil(void)
	{
	__cudaUnregisterFatBinary(__cudaFatCubinHandle);
	}

	#if defined(__GNUC__)

	__attribute__((__destructor__)) static void __cudaUnregisterBinary(void)
	{
	__cudaUnregisterBinaryUtil();
	}

	#else /* __GNUC__ */

	#if defined(_WIN64)


	#pragma section(".CRT$XPU", read)	extern int atexit(void(*)(void)) throw();


	#else /* _WIN64 */	#else /* __GNUC__ && __cplusplus */


	#pragma section(".CRT$XPU")	extern int atexit(void(*)(void));


	#endif /* _WIN64 */	#endif /* __GNUC__ && __cplusplus */


	__declspec(allocate(".CRT$XPU"))	static void **__cudaFatCubinHandle;
	static void (__cdecl *__cudaUnregister[])(void) = {__cudaUnregisterBinaryUt
	il};


	#endif /* __GNUC__ */	static void __cudaUnregisterBinaryUtil(void)
		{
		__cudaUnregisterFatBinary(__cudaFatCubinHandle);
		}

	#if defined(__device_emulation)	#if defined(__device_emulation)

	#define __device_fun(fun) \	#define __device_fun(fun) \
	__device_wrapper_##fun	__device_wrapper_##fun
	#define __device_var(var) \	#define __device_var(var) \
	(char*)&var	(char*)&var
	#define __tex_var(var) \	#define __tex_var(var) \
	&__texture_var(var)	&__texture_var(var)
	#define __cudaFatCubin \	#define __cudaFatCubin \

End of changes. 10 change blocks.
	29 lines changed or deleted	19 lines changed or added

	math_functions.h	math_functions.h

	skipping to change at line 413	skipping to change at line 413
	/DEVICE_BUILTIN/	/DEVICE_BUILTIN/
	extern __host__ __device__ float fmaf(float, float, float) __THROW;	extern __host__ __device__ float fmaf(float, float, float) __THROW;

	}	}

	#include <math.h>	#include <math.h>
	#include <stdlib.h>	#include <stdlib.h>
	#include <cmath>	#include <cmath>
	#include <cstdlib>	#include <cstdlib>


	using namespace std;

	/************************************************************************ ***	/************************************************************************ ***
	* *	* *
	* *	* *
	* *	* *
	************************************************************************* **/	************************************************************************* **/

	#if defined(__GNUC__)	#if defined(__GNUC__)


	#define __cuda_std_begin \
	namespace std {

	#define __cuda_std_end \
	}

	/* these are here to avoid warnings on the call graph.	/* these are here to avoid warnings on the call graph.
	long double is not supported on the device */	long double is not supported on the device */
	/DEVICE_BUILTIN/	/DEVICE_BUILTIN/
	extern __host__ __device__ int __signbitl(long double) __THROW;	extern __host__ __device__ int __signbitl(long double) __THROW;
	/DEVICE_BUILTIN/	/DEVICE_BUILTIN/
	extern __host__ __device__ int __isinfl(long double) __THROW;	extern __host__ __device__ int __isinfl(long double) __THROW;
	/DEVICE_BUILTIN/	/DEVICE_BUILTIN/
	extern __host__ __device__ int __isnanl(long double) __THROW;	extern __host__ __device__ int __isnanl(long double) __THROW;

	#if defined(__APPLE__)	#if defined(__APPLE__)

	skipping to change at line 482	skipping to change at line 474
	{	{
	extern __host__ __device__ long long int abs(long long int);	extern __host__ __device__ long long int abs(long long int);
	}	}

	namespace std	namespace std
	{	{
	template<typename T> extern __host__ __device__ T __pow_helper(T, int);	template<typename T> extern __host__ __device__ T __pow_helper(T, int);
	template<typename T> extern __host__ __device__ T __cmath_power(T, unsign ed int);	template<typename T> extern __host__ __device__ T __cmath_power(T, unsign ed int);
	}	}


		using std::abs;
		using std::fabs;
		using std::ceil;
		using std::floor;
		using std::sqrt;
		using std::pow;
		using std::log;
		using std::log10;
		using std::fmod;
		using std::modf;
		using std::exp;
		using std::frexp;
		using std::ldexp;
		using std::asin;
		using std::sin;
		using std::sinh;
		using std::acos;
		using std::cos;
		using std::cosh;
		using std::atan;
		using std::atan2;
		using std::tan;
		using std::tanh;

	#elif defined(_WIN32)	#elif defined(_WIN32)

	static __inline__ __host__ __device__ long long int abs(long long int a)	static __inline__ __host__ __device__ long long int abs(long long int a)
	{	{
	return llabs(a);	return llabs(a);
	}	}

	static __inline__ __host__ __device__ int signbit(double a)	static __inline__ __host__ __device__ int signbit(double a)
	{	{
	return __signbit(a);	return __signbit(a);

	skipping to change at line 529	skipping to change at line 545
	static __inline__ __host__ __device__ int isfinite(double a)	static __inline__ __host__ __device__ int isfinite(double a)
	{	{
	return __finite(a);	return __finite(a);
	}	}

	static __inline__ __host__ __device__ int isfinite(float a)	static __inline__ __host__ __device__ int isfinite(float a)
	{	{
	return __finitef(a);	return __finitef(a);
	}	}


	#define __cuda_std_begin
	#define __cuda_std_end

	template<class T> extern __host__ __device__ T _Pow_int(T, int);	template<class T> extern __host__ __device__ T _Pow_int(T, int);

	#endif /* !_WIN32 */	#endif /* !_WIN32 */


	__cuda_std_begin	#if defined(__GNUC__)
		namespace std {
		#endif /* __GNUC__ */

	extern __host__ __device__ long int abs(long int);	extern __host__ __device__ long int abs(long int);
	extern __host__ __device__ float abs(float);	extern __host__ __device__ float abs(float);
	extern __host__ __device__ double abs(double);	extern __host__ __device__ double abs(double);
	extern __host__ __device__ float fabs(float);	extern __host__ __device__ float fabs(float);
	extern __host__ __device__ float ceil(float);	extern __host__ __device__ float ceil(float);
	extern __host__ __device__ float floor(float);	extern __host__ __device__ float floor(float);
	extern __host__ __device__ float sqrt(float);	extern __host__ __device__ float sqrt(float);
	extern __host__ __device__ float pow(float, float);	extern __host__ __device__ float pow(float, float);
	extern __host__ __device__ float pow(float, int);	extern __host__ __device__ float pow(float, int);

	skipping to change at line 566	skipping to change at line 581
	extern __host__ __device__ float sin(float);	extern __host__ __device__ float sin(float);
	extern __host__ __device__ float sinh(float);	extern __host__ __device__ float sinh(float);
	extern __host__ __device__ float acos(float);	extern __host__ __device__ float acos(float);
	extern __host__ __device__ float cos(float);	extern __host__ __device__ float cos(float);
	extern __host__ __device__ float cosh(float);	extern __host__ __device__ float cosh(float);
	extern __host__ __device__ float atan(float);	extern __host__ __device__ float atan(float);
	extern __host__ __device__ float atan2(float, float);	extern __host__ __device__ float atan2(float, float);
	extern __host__ __device__ float tan(float);	extern __host__ __device__ float tan(float);
	extern __host__ __device__ float tanh(float);	extern __host__ __device__ float tanh(float);


	__cuda_std_end	#if defined(__GNUC__)
		}
		#endif /* __GNUC__ */

	static __inline__ __host__ __device__ float logb(float a)	static __inline__ __host__ __device__ float logb(float a)
	{	{
	return logbf(a);	return logbf(a);
	}	}

	static __inline__ __host__ __device__ int ilogb(float a)	static __inline__ __host__ __device__ int ilogb(float a)
	{	{
	return ilogbf(a);	return ilogbf(a);
	}	}

	skipping to change at line 827	skipping to change at line 844

	#elif !defined(__CUDACC__)	#elif !defined(__CUDACC__)

	#include "crt/func_macro.h"	#include "crt/func_macro.h"

	#define INT_MAX \	#define INT_MAX \
	((int)((unsigned int)-1 >> 1))	((int)((unsigned int)-1 >> 1))

	#if defined(__GNUC__)	#if defined(__GNUC__)


	extern int __cuda_error_not_implememted(void);	__func__(int __cuda_error_not_implememted(void));

	#define __cuda___signbitl(a) \	#define __cuda___signbitl(a) \
	__cuda_error_not_implememted()	__cuda_error_not_implememted()
	#define __cuda___isinfl(a) \	#define __cuda___isinfl(a) \
	__cuda_error_not_implememted()	__cuda_error_not_implememted()
	#define __cuda___isnanl(a) \	#define __cuda___isnanl(a) \
	__cuda_error_not_implememted()	__cuda_error_not_implememted()

	#if defined(__APPLE__)	#if defined(__APPLE__)


	skipping to change at line 989	skipping to change at line 1006
	} cvt;	} cvt;

	cvt.d = a;	cvt.d = a;

	return cvt.l < 0ll;	return cvt.l < 0ll;
	}	}

	__func__(double copysign(double a, double b))	__func__(double copysign(double a, double b))
	{	{
	volatile union {	volatile union {

	double d;	double d;
	unsigned long long int l;	unsigned long long int l;
	} cvta, cvtb;	} cvta, cvtb;


	cvta.d = a;	cvta.d = a;
	cvtb.d = b;	cvtb.d = b;
	cvta.l = (cvta.l & 0x7fffffffffffffffULL) \| (cvtb.l & 0x8000000000000000U LL);	cvta.l = (cvta.l & 0x7fffffffffffffffULL) \| (cvtb.l & 0x8000000000000000U LL);


	return cvta.d;	return cvta.d;
	}	}

	__func__(int __signbitf(float a))	__func__(int __signbitf(float a))
	{	{
	return __cuda___signbitf(a);	return __cuda___signbitf(a);
	}	}

	__func__(float copysignf(float a, float b))	__func__(float copysignf(float a, float b))
	{	{

	skipping to change at line 1178	skipping to change at line 1197
	#if defined(__LP64__)	#if defined(__LP64__)
	return (long int)__cuda_llrintf(a);	return (long int)__cuda_llrintf(a);
	#else /* __LP64__ */	#else /* __LP64__ */
	return (long int)__float2int_rn(a);	return (long int)__float2int_rn(a);
	#endif /* __LP64__ */	#endif /* __LP64__ */
	#endif /* __MULTI_CORE__ */	#endif /* __MULTI_CORE__ */
	}	}

	__device_func__(float __cuda_nearbyintf(float a))	__device_func__(float __cuda_nearbyintf(float a))
	{	{

	#if defined(__CUDABE__)	#if defined(__MULTI_CORE__)
		return nearbyintf(a);
		#elif defined(__CUDABE__)
	return roundf(a);	return roundf(a);
	#else /* __CUDABE__ */	#else /* __CUDABE__ */
	return __internal_nearbyintf(a);	return __internal_nearbyintf(a);
	#endif /* __CUDABE__ */	#endif /* __CUDABE__ */
	}	}

	__device_func__(float __cuda_fmaxf(float a, float b))	__device_func__(float __cuda_fmaxf(float a, float b))
	{	{
	#if defined(__MULTI_CORE__)	#if defined(__MULTI_CORE__)
	return fmaxf(a, b);	return fmaxf(a, b);

	skipping to change at line 1447	skipping to change at line 1468
	res.y = e = r + s;	res.y = e = r + s;
	res.x = (r - e) + s;	res.x = (r - e) + s;
	return res;	return res;
	}	}

	__device_func__(float __internal_accurate_log2f(float a))	__device_func__(float __internal_accurate_log2f(float a))
	{	{
	return CUDART_L2E_F * __internal_accurate_logf(a);	return CUDART_L2E_F * __internal_accurate_logf(a);
	}	}


		/* Based on: Guillaume Da Gra
		* Operators on Graphics Hardware. RNC'7, pp. 23-32, 2006.
		*/
		__device_func__(float2 __internal_dsmul (float2 x, float2 y))
		{
		float2 z;
		#if !defined(__CUDABE__)
		volatile float up, vp, u1, u2, v1, v2, mh, ml;
		#else
		float up, vp, u1, u2, v1, v2, mh, ml;
		#endif /* defined(__CUDABE__) */
		up = x.y * 4097.0f;
		u1 = (x.y - up) + up;
		u2 = x.y - u1;
		vp = y.y * 4097.0f;
		v1 = (y.y - vp) + vp;
		v2 = y.y - v1;
		mh = __fmul_rn(x.y,y.y);
		ml = (((u1 * v1 - mh) + u1 * v2) + u2 * v1) + u2 * v2;
		ml = (__fmul_rn(x.y,y.x) + __fmul_rn(x.x,y.y)) + ml;
		z.y = up = mh + ml;
		z.x = (mh - up) + ml;
		return z;
		}

	/* 160 bits of 2/PI for Payne-Hanek style argument reduction. */	/* 160 bits of 2/PI for Payne-Hanek style argument reduction. */
	static __constant__ unsigned int __cudart_i2opi_f [] = {	static __constant__ unsigned int __cudart_i2opi_f [] = {
	0x3c439041,	0x3c439041,
	0xdb629599,	0xdb629599,
	0xf534ddc0,	0xf534ddc0,
	0xfc2757d1,	0xfc2757d1,
	0x4e441529,	0x4e441529,
	0xa2f9836e,	0xa2f9836e,
	};	};


	skipping to change at line 2048	skipping to change at line 2094
	}	}
	return t;	return t;
	#endif /* __MULTI_CORE__ */	#endif /* __MULTI_CORE__ */
	}	}

	__device_func__(float __cuda_acoshf(float a))	__device_func__(float __cuda_acoshf(float a))
	{	{
	#if defined(__MULTI_CORE__)	#if defined(__MULTI_CORE__)
	return acoshf(a);	return acoshf(a);
	#else /* __MULTI_CORE__ */	#else /* __MULTI_CORE__ */

	float s, t;	float t;

	t = a - 1.0f;	t = a - 1.0f;
	if (__cuda_fabsf(t) > CUDART_TWO_TO_23_F) {	if (__cuda_fabsf(t) > CUDART_TWO_TO_23_F) {
	/* for large a, acosh = log(2a) /	/* for large a, acosh = log(2a) /
	return CUDART_LN2_F + __internal_accurate_logf(a);	return CUDART_LN2_F + __internal_accurate_logf(a);
	} else {	} else {

	s = a + 1.0f;	t = t + __cuda_sqrtf(a * t + t);
	t = t + __cuda_sqrtf(s * t);
	return __cuda_log1pf(t);	return __cuda_log1pf(t);
	}	}
	#endif /* __MULTI_CORE__ */	#endif /* __MULTI_CORE__ */
	}	}

	__device_func__(float __cuda_asinhf(float a))	__device_func__(float __cuda_asinhf(float a))
	{	{
	#if defined(__MULTI_CORE__)	#if defined(__MULTI_CORE__)
	return asinhf(a);	return asinhf(a);
	#else /* __MULTI_CORE__ */	#else /* __MULTI_CORE__ */

	skipping to change at line 2880	skipping to change at line 2925
	}	}

	__device_func__(float __internal_accurate_powf(float a, float b))	__device_func__(float __internal_accurate_powf(float a, float b))
	{	{
	float2 loga, prod;	float2 loga, prod;
	#if !defined(__CUDABE__) && defined(_MSC_VER) && !defined(_WIN64)	#if !defined(__CUDABE__) && defined(_MSC_VER) && !defined(_WIN64)
	volatile float t;	volatile float t;
	#else	#else
	float t;	float t;
	#endif	#endif

	#if !defined(__CUDABE__) && defined(__linux__) && !defined(__LP64__)
	volatile float up, vp, u1, u2, v1, v2, mh, ml;
	#else
	float up, vp, u1, u2, v1, v2, mh, ml;
	#endif

	/* compute log(a) in double-single format*/	/* compute log(a) in double-single format*/
	loga = __internal_log_ep(a);	loga = __internal_log_ep(a);

	/* prevent overflow during extended precision multiply */	/* prevent overflow during extended precision multiply */
	if (__cuda_fabsf(b) > 1.0e34f) b *= 1.220703125e-4f;	if (__cuda_fabsf(b) > 1.0e34f) b *= 1.220703125e-4f;

	/* compute b * log(a) in double-single format */	prod.y = b;
	up = loga.y * 4097.0f;	prod.x = 0.0f;
	u1 = (loga.y - up) + up;	prod = __internal_dsmul (prod, loga);
	u2 = loga.y - u1;
	vp = b * 4097.0f;	/* prevent intermediate overflow in exponentiation */
	v1 = (b - vp) + vp;	if (__float_as_int(prod.y) == 0x42b17218) {
	v2 = b - v1;	prod.y = __int_as_float(__float_as_int(prod.y) - 1);
	mh = __fmul_rn(loga.y, b);	prod.x = prod.x + __int_as_float(0x37000000);
	ml = (((u1 * v1 - mh) + u1 * v2) + u2 * v1) + u2 * v2;	}
	ml = __fmul_rn(loga.x, b) + ml;
	prod.y = up = mh + ml;
	prod.x = (mh - up) + ml;

	/* compute pow(a,b) = exp(blog(a)) /	/* compute pow(a,b) = exp(blog(a)) /
	t = __cuda_expf(prod.y);	t = __cuda_expf(prod.y);
	/* prevent -INF + INF = NaN */	/* prevent -INF + INF = NaN */
	if (t != CUDART_INF_F) {	if (t != CUDART_INF_F) {
	/* if prod.x is much smaller than prod.y, then exp(prod.y+prod.x) ~=	/* if prod.x is much smaller than prod.y, then exp(prod.y+prod.x) ~=
	* exp(prod.y) + prod.x * exp(prod.y)	* exp(prod.y) + prod.x * exp(prod.y)
	*/	*/
	t = t * prod.x + t;	t = t * prod.x + t;
	}	}

End of changes. 16 change blocks.
	36 lines changed or deleted	73 lines changed or added

	math_functions_dbl_ptx3.h	math_functions_dbl_ptx3.h

	/*	/*
	* Copyright 1993-2008 NVIDIA Corporation. All rights reserved.	* Copyright 1993-2008 NVIDIA Corporation. All rights reserved.
	*	*
	* NOTICE TO USER:	* NOTICE TO USER:
	*	*
	* This source code is subject to NVIDIA ownership rights under U.S. and	* This source code is subject to NVIDIA ownership rights under U.S. and
	* international Copyright laws. Users and possessors of this source code	* international Copyright laws. Users and possessors of this source code
	* are hereby granted a nonexclusive, royalty-free license to use this code	* are hereby granted a nonexclusive, royalty-free license to use this code
	* in individual and commercial software.	* in individual and commercial software.
	*	*
	* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE	* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE

	skipping to change at line 221	skipping to change at line 221
	};	};

	__device_func__(double __internal_trig_reduction_kerneld(double a, int *qua drant))	__device_func__(double __internal_trig_reduction_kerneld(double a, int *qua drant))
	{	{
	double j;	double j;
	int q;	int q;
	if (__cuda_fabs(a) > CUDART_TRIG_PLOSS) {	if (__cuda_fabs(a) > CUDART_TRIG_PLOSS) {
	/* Payne-Hanek style argument reduction. */	/* Payne-Hanek style argument reduction. */
	unsigned long long int ia;	unsigned long long int ia;
	unsigned long long int s;	unsigned long long int s;

	unsigned long long int result[19];	unsigned long long int result[5];
	unsigned long long int phi, plo;	unsigned long long int phi, plo;
	unsigned long long int hi, lo;	unsigned long long int hi, lo;
	unsigned int e;	unsigned int e;
	int idx;	int idx;

	ia = __double_as_longlong(a);	ia = __double_as_longlong(a);
	s = ia & 0x8000000000000000ULL;	s = ia & 0x8000000000000000ULL;
	e = (unsigned int)(((ia >> 52) & 0x7ff) - 1024);	e = (unsigned int)(((ia >> 52) & 0x7ff) - 1024);
	ia = (ia << 11) \| 0x8000000000000000ULL;	ia = (ia << 11) \| 0x8000000000000000ULL;
	/* compute x * 2/pi */	/* compute x * 2/pi */

	skipping to change at line 233	skipping to change at line 233
	unsigned long long int hi, lo;	unsigned long long int hi, lo;
	unsigned int e;	unsigned int e;
	int idx;	int idx;

	ia = __double_as_longlong(a);	ia = __double_as_longlong(a);
	s = ia & 0x8000000000000000ULL;	s = ia & 0x8000000000000000ULL;
	e = (unsigned int)(((ia >> 52) & 0x7ff) - 1024);	e = (unsigned int)(((ia >> 52) & 0x7ff) - 1024);
	ia = (ia << 11) \| 0x8000000000000000ULL;	ia = (ia << 11) \| 0x8000000000000000ULL;
	/* compute x * 2/pi */	/* compute x * 2/pi */
	idx = 16 - (e >> 6);	idx = 16 - (e >> 6);


	hi = 0;	hi = 0;
	#if defined(__CUDABE__)	#if defined(__CUDABE__)
	#pragma unroll 1	#pragma unroll 1
	#endif /* __CUDABE__ */	#endif /* __CUDABE__ */

	for (q = 0; q < 18; q++) {	for (q = (idx-1); q < min(18,idx+3); q++) {
	plo = __cudart_i2opi_d[q] * ia;	plo = __cudart_i2opi_d[q] * ia;
	phi = __umul64hi (__cudart_i2opi_d[q], ia);	phi = __umul64hi (__cudart_i2opi_d[q], ia);
	lo = hi + plo;	lo = hi + plo;
	hi = phi + (lo < plo);	hi = phi + (lo < plo);

	result[q] = lo;	result[q-(idx-1)] = lo;
	}	}

	result[q] = hi;	result[q-(idx-1)] = hi;
	e = e & 63;	e = e & 63;
	/* shift result such that hi:lo<127:126> are the least significant	/* shift result such that hi:lo<127:126> are the least significant
	integer bits, and hi:lo<125:0> are the fractional bits of the result	integer bits, and hi:lo<125:0> are the fractional bits of the result
	*/	*/

	hi = result[idx+2];	hi = result[3];
	lo = result[idx+1];	lo = result[2];
	if (e) {	if (e) {
	q = 64 - e;	q = 64 - e;
	hi = (hi << e) \| (lo >> q);	hi = (hi << e) \| (lo >> q);

	lo = (lo << e) \| (result[idx] >> q);	lo = (lo << e) \| (result[1] >> q);
	}	}
	q = (int)(hi >> 62);	q = (int)(hi >> 62);
	/* fraction */	/* fraction */
	hi = (hi << 2) \| (lo >> 62);	hi = (hi << 2) \| (lo >> 62);
	lo = (lo << 2);	lo = (lo << 2);
	e = (hi \| (lo > 0)) > 0x8000000000000000ULL; /* fraction >= 0.5 */	e = (hi \| (lo > 0)) > 0x8000000000000000ULL; /* fraction >= 0.5 */
	q += e;	q += e;
	if (s) q = -q;	if (s) q = -q;
	if (e) {	if (e) {
	unsigned long long int t;	unsigned long long int t;

	skipping to change at line 338	skipping to change at line 337
	z = __fma_rn (z, x2, 2.087588480545065E-009);	z = __fma_rn (z, x2, 2.087588480545065E-009);
	z = __fma_rn (z, x2, -2.755731555403950E-007);	z = __fma_rn (z, x2, -2.755731555403950E-007);
	z = __fma_rn (z, x2, 2.480158729365970E-005);	z = __fma_rn (z, x2, 2.480158729365970E-005);
	z = __fma_rn (z, x2, -1.388888888888074E-003);	z = __fma_rn (z, x2, -1.388888888888074E-003);
	z = __fma_rn (z, x2, 4.166666666666664E-002);	z = __fma_rn (z, x2, 4.166666666666664E-002);
	z = __fma_rn (z, x2, -5.000000000000000E-001);	z = __fma_rn (z, x2, -5.000000000000000E-001);
	z = __fma_rn (z, x2, 1.000000000000000E+000);	z = __fma_rn (z, x2, 1.000000000000000E+000);
	return z;	return z;
	}	}


	/* approximate tangent on -pi/2...+pi/2 ??? */	/* approximate tangent on -pi/4...+pi/4 */
	__device_func__(double __internal_tan_kerneld(double x))	__device_func__(double __internal_tan_kerneld(double x, int i))
	{	{

	double x2, z;	double x2, z, q;
	x2 = x * x;	x2 = x * x;

	z = 2.08720267406905700E-005;	z = 9.8006287203286300E-006;
	z = __fma_rn (z, x2, -4.49127882369154970E-005);	z = __fma_rn (z, x2, -2.4279526494179897E-005);
	z = __fma_rn (z, x2, 8.95189835887631370E-005);	z = __fma_rn (z, x2, 4.8644173130937162E-005);
	z = __fma_rn (z, x2, -2.64581074021952680E-005);	z = __fma_rn (z, x2, -2.5640012693782273E-005);
	z = __fma_rn (z, x2, 1.36576090383605350E-004);	z = __fma_rn (z, x2, 6.7223984330880073E-005);
	z = __fma_rn (z, x2, 2.22355572141495130E-004);	z = __fma_rn (z, x2, 8.3559287318211639E-005);
	z = __fma_rn (z, x2, 5.95072833555129420E-004);	z = __fma_rn (z, x2, 2.4375039850848564E-004);
	z = __fma_rn (z, x2, 1.45475607963143160E-003);	z = __fma_rn (z, x2, 5.8886487754856672E-004);
	z = __fma_rn (z, x2, 3.59228938375760580E-003);	z = __fma_rn (z, x2, 1.4560454844672040E-003);
	z = __fma_rn (z, x2, 8.86321913102899350E-003);	z = __fma_rn (z, x2, 3.5921008885857180E-003);
	z = __fma_rn (z, x2, 2.18694896113231960E-002);	z = __fma_rn (z, x2, 8.8632379218613715E-003);
	z = __fma_rn (z, x2, 5.39682539265723680E-002);	z = __fma_rn (z, x2, 2.1869488399337889E-002);
	z = __fma_rn (z, x2, 1.33333333334148210E-001);	z = __fma_rn (z, x2, 5.3968253972902704E-002);
	z = __fma_rn (z, x2, 3.33333333333327760E-001);	z = __fma_rn (z, x2, 1.3333333333325342E-001);
		z = __fma_rn (z, x2, 3.3333333333333381E-001);
	z = z * x2;	z = z * x2;

	z = __fma_rn (z, x, x);	q = __fma_rn (z, x, x);
	return z;	if (i) {
		double s = q - x;
		double w = __fma_rn (z, x, -s); // tail of q
		z = 1.0 / q;
		z = -z;
		s = __fma_rn (q, z, 1.0);
		q = __fma_rn (z, __fma_rn (z, w, s), z);
		}
		return q;
	}	}

	__device_func__(double __cuda_sqrt(double a))	__device_func__(double __cuda_sqrt(double a))
	{	{
	return sqrt(a);	return sqrt(a);
	}	}

	__device_func__(double __cuda_rsqrt(double a))	__device_func__(double __cuda_rsqrt(double a))
	{	{
	#if !defined(__CUDABE__)	#if !defined(__CUDABE__)

	skipping to change at line 417	skipping to change at line 425
	t = __fma_rn (t, a2, 1.111112158368149E-001/256.0);	t = __fma_rn (t, a2, 1.111112158368149E-001/256.0);
	t = __fma_rn (t, a2, 1.428571416261528E-001/64.0);	t = __fma_rn (t, a2, 1.428571416261528E-001/64.0);
	t = __fma_rn (t, a2, 2.000000000069858E-001/16.0);	t = __fma_rn (t, a2, 2.000000000069858E-001/16.0);
	t = __fma_rn (t, a2, 3.333333333333198E-001/4.0);	t = __fma_rn (t, a2, 3.333333333333198E-001/4.0);
	t = t * a2;	t = t * a2;
	t = __fma_rn (t, a, a_2);	t = __fma_rn (t, a, a_2);
	t = t + a_1;	t = t + a_1;
	return t;	return t;
	}	}


		__device_func__(double __internal_exp2i_kernel(int b))
		{
		return __hiloint2double((b + 1023) << 20, 0);
		}

		__device_func__(double __internal_half(double a))
		{
		unsigned int ihi, ilo;
		ilo = __double2loint(a);
		ihi = __double2hiint(a);
		return __hiloint2double(ihi - 0x00100000, ilo);
		}

		__device_func__(double __internal_twice(double a))
		{
		unsigned int ihi, ilo;
		ilo = __double2loint(a);
		ihi = __double2hiint(a);
		return __hiloint2double(ihi + 0x00100000, ilo);
		}

	__device_func__(double __cuda_sin(double a))	__device_func__(double __cuda_sin(double a))
	{	{
	double z;	double z;
	int i;	int i;
	if (__cuda___isinf(a) \|\| (a == CUDART_ZERO)) {	if (__cuda___isinf(a) \|\| (a == CUDART_ZERO)) {
	return __dmul_rn(a, CUDART_ZERO);	return __dmul_rn(a, CUDART_ZERO);
	}	}
	z = __internal_trig_reduction_kerneld(a, &i);	z = __internal_trig_reduction_kerneld(a, &i);
	/* here, abs(z) <= pi/4, and i has the quadrant */	/* here, abs(z) <= pi/4, and i has the quadrant */
	if (i & 1) {	if (i & 1) {

	skipping to change at line 502	skipping to change at line 531

	__device_func__(double __cuda_tan(double a))	__device_func__(double __cuda_tan(double a))
	{	{
	double z;	double z;
	int i;	int i;
	if (__cuda___isinf(a)) {	if (__cuda___isinf(a)) {
	return CUDART_NAN;	return CUDART_NAN;
	}	}
	z = __internal_trig_reduction_kerneld(a, &i);	z = __internal_trig_reduction_kerneld(a, &i);
	/* here, abs(z) <= pi/4, and i has the quadrant */	/* here, abs(z) <= pi/4, and i has the quadrant */

	z = __internal_tan_kerneld(z);	z = __internal_tan_kerneld(z, i & 1);
	if (i & 1) {
	z = -1.0 / z;
	}
	return z;	return z;
	}	}

	__device_func__(double __cuda_log(double a))	__device_func__(double __cuda_log(double a))
	{	{
	double m, f, g, u, v, tmp, q, ulo, log_lo, log_hi;	double m, f, g, u, v, tmp, q, ulo, log_lo, log_hi;
	int ihi, ilo;	int ihi, ilo;
	int e = 0;	int e = 0;

	ihi = __double2hiint(a);	ihi = __double2hiint(a);

	skipping to change at line 545	skipping to change at line 571
	e = -54;	e = -54;
	ihi = __double2hiint(a);	ihi = __double2hiint(a);
	ilo = __double2loint(a);	ilo = __double2loint(a);
	}	}
	/* a = m * 2^e. m <= sqrt(2): log2(a) = log2(m) + e.	/* a = m * 2^e. m <= sqrt(2): log2(a) = log2(m) + e.
	* m > sqrt(2): log2(a) = log2(m/2) + (e+1)	* m > sqrt(2): log2(a) = log2(m/2) + (e+1)
	*/	*/
	e += ((ihi >> 20) & 0x7ff) - 1023;	e += ((ihi >> 20) & 0x7ff) - 1023;
	m = __hiloint2double ((ihi & 0x800fffff) \| 0x3ff00000, ilo);	m = __hiloint2double ((ihi & 0x800fffff) \| 0x3ff00000, ilo);
	if (m > CUDART_SQRT_TWO) {	if (m > CUDART_SQRT_TWO) {

	m = m * 0.5;	m = __internal_half(m);
	e = e + 1;	e = e + 1;
	}	}
	/* log((1+m)/(1-m)) = 2atanh(m). log(m) = 2atanh ((m-1)/(m+1)) */	/* log((1+m)/(1-m)) = 2atanh(m). log(m) = 2atanh ((m-1)/(m+1)) */
	f = m - 1.0;	f = m - 1.0;
	g = m + 1.0;	g = m + 1.0;
	g = 1.0 / g;	g = 1.0 / g;
	u = f * g;	u = f * g;
	u = u + u;	u = u + u;
	/* u = 2.0 * (m - 1.0) / (m + 1.0) */	/* u = 2.0 * (m - 1.0) / (m + 1.0) */
	v = u * u;	v = u * u;
	q = 6.7261411553826339E-2/65536.0;	q = 6.7261411553826339E-2/65536.0;
	q = __fma_rn (q, v, 6.6133829643643394E-2/16384.0);	q = __fma_rn (q, v, 6.6133829643643394E-2/16384.0);
	q = __fma_rn (q, v, 7.6940931149150890E-2/4096.0);	q = __fma_rn (q, v, 7.6940931149150890E-2/4096.0);
	q = __fma_rn (q, v, 9.0908745692137444E-2/1024.0);	q = __fma_rn (q, v, 9.0908745692137444E-2/1024.0);
	q = __fma_rn (q, v, 1.1111111499059706E-1/256.0);	q = __fma_rn (q, v, 1.1111111499059706E-1/256.0);
	q = __fma_rn (q, v, 1.4285714283305975E-1/64.0);	q = __fma_rn (q, v, 1.4285714283305975E-1/64.0);
	q = __fma_rn (q, v, 2.0000000000007223E-1/16.0);	q = __fma_rn (q, v, 2.0000000000007223E-1/16.0);
	q = __fma_rn (q, v, 3.3333333333333326E-1/4.0);	q = __fma_rn (q, v, 3.3333333333333326E-1/4.0);

	tmp = 2.0 * (f - u);	tmp = __internal_twice (f - u);
	tmp = __fma_rn (-u, f, tmp); // tmp = remainder of division	tmp = __fma_rn (-u, f, tmp); // tmp = remainder of division
	ulo = g * tmp; // less significant quotient bits	ulo = g * tmp; // less significant quotient bits
	/* u + ulo = 2.0 * (m - 1.0) / (m + 1.0) to more than double precision */	/* u + ulo = 2.0 * (m - 1.0) / (m + 1.0) to more than double precision */
	q = q * v;	q = q * v;
	q = q * u;	q = q * u;
	/* log_hi + log_lo = log(m) to more than double precision */	/* log_hi + log_lo = log(m) to more than double precision */
	log_hi = u;	log_hi = u;
	log_lo = ulo + q;	log_lo = ulo + q;
	/* log_hi + log_lo = log(m)+elog(2)=log(a) to more than double precision /	/* log_hi + log_lo = log(m)+elog(2)=log(a) to more than double precision /
	q = __fma_rn ( e, CUDART_LN2_HI, log_hi);	q = __fma_rn ( e, CUDART_LN2_HI, log_hi);
	tmp = __fma_rn (-e, CUDART_LN2_HI, q);	tmp = __fma_rn (-e, CUDART_LN2_HI, q);
	tmp = tmp - log_hi;	tmp = tmp - log_hi;
	log_hi = q;	log_hi = q;
	log_lo = log_lo - tmp;	log_lo = log_lo - tmp;
	log_lo = __fma_rn (e, CUDART_LN2_LO, log_lo);	log_lo = __fma_rn (e, CUDART_LN2_LO, log_lo);
	return log_hi + log_lo;	return log_hi + log_lo;
	}	}

	/* Requires \|x.y\| > \|y.y\|. 8 DP operations */	/* Requires \|x.y\| > \|y.y\|. 8 DP operations */

	__device_func__(double2 ddadd_xgty (double2 x, double2 y))	__device_func__(double2 __internal_ddadd_xgty (double2 x, double2 y))
	{	{
	double2 z;	double2 z;
	#if defined(__GNUC__) && !defined(__CUDABE__)	#if defined(__GNUC__) && !defined(__CUDABE__)
	volatile double r, s, e;	volatile double r, s, e;
	#else	#else
	double r, s, e;	double r, s, e;
	#endif	#endif
	r = x.y + y.y;	r = x.y + y.y;
	e = x.y - r;	e = x.y - r;
	s = ((e + y.y) + y.x) + x.x;	s = ((e + y.y) + y.x) + x.x;
	z.y = e = r + s;	z.y = e = r + s;
	z.x = (r - e) + s;	z.x = (r - e) + s;
	return z;	return z;
	}	}

	/* Take full advantage of FMA. Only 8 DP operations */	/* Take full advantage of FMA. Only 8 DP operations */

	__device_func__(double2 ddmul (double2 x, double2 y))	__device_func__(double2 __internal_ddmul (double2 x, double2 y))
	{	{
	#if defined(__GNUC__) && !defined(__CUDABE__)	#if defined(__GNUC__) && !defined(__CUDABE__)
	volatile double e;	volatile double e;
	#else	#else
	double e;	double e;
	#endif	#endif
	double2 t, z;	double2 t, z;
	t.y = x.y * y.y;	t.y = x.y * y.y;
	t.x = __fma_rn (x.y, y.y, -t.y);	t.x = __fma_rn (x.y, y.y, -t.y);
	t.x = __fma_rn (x.x, y.x, t.x);	t.x = __fma_rn (x.x, y.x, t.x);

	skipping to change at line 644	skipping to change at line 670
	expo = (ihi >> 20) & 0x7ff;	expo = (ihi >> 20) & 0x7ff;
	expo -= 54;	expo -= 54;
	}	}
	expo -= 1023;	expo -= 1023;
	/* log(a) = log(m*2^expo) =	/* log(a) = log(m*2^expo) =
	log(m) + log(2)*expo, if m < sqrt(2),	log(m) + log(2)*expo, if m < sqrt(2),
	log(m0.5) + log(2)(expo+1), if m >= sqrt(2)	log(m0.5) + log(2)(expo+1), if m >= sqrt(2)
	*/	*/
	m = __hiloint2double((ihi & 0x800fffff) \| 0x3ff00000, ilo);	m = __hiloint2double((ihi & 0x800fffff) \| 0x3ff00000, ilo);
	if (m > CUDART_SQRT_TWO) {	if (m > CUDART_SQRT_TWO) {

	m = m * 0.5;	m = __internal_half(m);
	expo = expo + 1;	expo = expo + 1;
	}	}
	/* compute log(m) with extended precision using an algorithm derived from	/* compute log(m) with extended precision using an algorithm derived from
	* P.T.P. Tang, "Table Driven Implementation of the Logarithm Function",	* P.T.P. Tang, "Table Driven Implementation of the Logarithm Function",
	* TOMS, Vol. 16., No. 4, December 1990, pp. 378-400. A modified polynomi al	* TOMS, Vol. 16., No. 4, December 1990, pp. 378-400. A modified polynomi al
	* approximation to atanh(x) on the interval [-0.1716, 0.1716] is utilize d.	* approximation to atanh(x) on the interval [-0.1716, 0.1716] is utilize d.
	*/	*/
	f = m - 1.0;	f = m - 1.0;
	g = m + 1.0;	g = m + 1.0;
	g = 1.0 / g;	g = 1.0 / g;

	skipping to change at line 668	skipping to change at line 694
	v = u * u;	v = u * u;
	q = 6.6253631649203309E-2/65536.0;	q = 6.6253631649203309E-2/65536.0;
	q = __fma_rn (q, v, 6.6250935587260612E-2/16384.0);	q = __fma_rn (q, v, 6.6250935587260612E-2/16384.0);
	q = __fma_rn (q, v, 7.6935437806732829E-2/4096.0);	q = __fma_rn (q, v, 7.6935437806732829E-2/4096.0);
	q = __fma_rn (q, v, 9.0908878711093280E-2/1024.0);	q = __fma_rn (q, v, 9.0908878711093280E-2/1024.0);
	q = __fma_rn (q, v, 1.1111111322892790E-1/256.0);	q = __fma_rn (q, v, 1.1111111322892790E-1/256.0);
	q = __fma_rn (q, v, 1.4285714284546502E-1/64.0);	q = __fma_rn (q, v, 1.4285714284546502E-1/64.0);
	q = __fma_rn (q, v, 2.0000000000003113E-1/16.0);	q = __fma_rn (q, v, 2.0000000000003113E-1/16.0);
	q = q * v;	q = q * v;
	/* u + ulo = 2.0 * (m - 1.0) / (m + 1.0) to more than double precision */	/* u + ulo = 2.0 * (m - 1.0) / (m + 1.0) to more than double precision */

	tmp = 2.0 * (f - u);	tmp = __internal_twice (f - u);
	tmp = __fma_rn (-u, f, tmp); // tmp = remainder of division	tmp = __fma_rn (-u, f, tmp); // tmp = remainder of division
	ulo = g * tmp; // less significand quotient bits	ulo = g * tmp; // less significand quotient bits
	/* switch to double-double at this point */	/* switch to double-double at this point */
	qq.y = q;	qq.y = q;
	qq.x = 0.0;	qq.x = 0.0;
	uu.y = u;	uu.y = u;
	uu.x = ulo;	uu.x = ulo;
	cc.y = 3.3333333333333331E-1/4.0;	cc.y = 3.3333333333333331E-1/4.0;
	cc.x = -9.8201492846582465E-18/4.0;	cc.x = -9.8201492846582465E-18/4.0;

	qq = ddadd_xgty (cc, qq);	qq = __internal_ddadd_xgty (cc, qq);
	/* compute log(m) in double-double format */	/* compute log(m) in double-double format */

	qq = ddmul(qq, uu);	qq = __internal_ddmul(qq, uu);
	qq = ddmul(qq, uu);	qq = __internal_ddmul(qq, uu);
	qq = ddmul(qq, uu);	qq = __internal_ddmul(qq, uu);
	uu = ddadd_xgty (uu, qq);	uu = __internal_ddadd_xgty (uu, qq);
	u = uu.y;	u = uu.y;
	ulo = uu.x;	ulo = uu.x;
	/* log(2)expo in double-double format /	/* log(2)expo in double-double format /
	tt.y = expo * 6.9314718055966296e-001; /* multiplication is exact */	tt.y = expo * 6.9314718055966296e-001; /* multiplication is exact */
	tt.x = expo * 2.8235290563031577e-013;	tt.x = expo * 2.8235290563031577e-013;
	/* log(a) = log(m) + log(2)expo; if expo != 0, \|log(2)expo\| > \|log(m)\| */	/* log(a) = log(m) + log(2)expo; if expo != 0, \|log(2)expo\| > \|log(m)\| */

	res = ddadd_xgty (tt, uu);	res = __internal_ddadd_xgty (tt, uu);
	return res;	return res;
	}	}

	__device_func__(double __cuda_log2(double a))	__device_func__(double __cuda_log2(double a))
	{	{
	double t;	double t;
	t = __cuda_log(a);	t = __cuda_log(a);
	return __fma_rn (t, CUDART_L2E_HI, t * CUDART_L2E_LO);	return __fma_rn (t, CUDART_L2E_HI, t * CUDART_L2E_LO);
	}	}


	skipping to change at line 884	skipping to change at line 910
	z = 1.632386098183803E-010;	z = 1.632386098183803E-010;
	z = __fma_rn (z, a2, 2.504854501385687E-008);	z = __fma_rn (z, a2, 2.504854501385687E-008);
	z = __fma_rn (z, a2, 2.755734274788706E-006);	z = __fma_rn (z, a2, 2.755734274788706E-006);
	z = __fma_rn (z, a2, 1.984126976294102E-004);	z = __fma_rn (z, a2, 1.984126976294102E-004);
	z = __fma_rn (z, a2, 8.333333333452911E-003);	z = __fma_rn (z, a2, 8.333333333452911E-003);
	z = __fma_rn (z, a2, 1.666666666666606E-001);	z = __fma_rn (z, a2, 1.666666666666606E-001);
	z = z * a2;	z = z * a2;
	z = __fma_rn (z, a, a);	z = __fma_rn (z, a, a);
	} else if (a < 2.0) { /* work around accuracy issue in vicinity of 1.4 */	} else if (a < 2.0) { /* work around accuracy issue in vicinity of 1.4 */
	z = __cuda_expm1(a);	z = __cuda_expm1(a);

	z = 0.5 * (z + z / (z + 1.0));	z = __internal_half (z + z / (z + 1.0));
	} else {	} else {
	z = __internal_exp_kernel(a, -1);	z = __internal_exp_kernel(a, -1);
	z = z + (1.0 / (-4.0 * z));	z = z + (1.0 / (-4.0 * z));
	if (a >= CUDART_LN2_X_1025) {	if (a >= CUDART_LN2_X_1025) {
	z = CUDART_INF; /* overflow -> infinity */	z = CUDART_INF; /* overflow -> infinity */
	}	}
	}	}
	z = __cuda_copysign(z, s);	z = __cuda_copysign(z, s);
	return z;	return z;
	}	}

	skipping to change at line 931	skipping to change at line 957
	t = __fma_rn (t, a, a);	t = __fma_rn (t, a, a);
	a = __cuda_copysign(t, a);	a = __cuda_copysign(t, a);
	}	}
	return a;	return a;
	}	}

	__device_func__(double __internal_atan_kernel(double a))	__device_func__(double __internal_atan_kernel(double a))
	{	{
	double t, a2;	double t, a2;
	a2 = a * a;	a2 = a * a;

	t = 3.078869985415414E-005;	t = -2.0258553044438358E-005 ;
	t = __fma_rn (t, a2, -3.242465497855030E-004);	t = __fma_rn (t, a2, 2.2302240345758510E-004);
	t = __fma_rn (t, a2, 1.616330732831762E-003);	t = __fma_rn (t, a2, -1.1640717779930576E-003);
	t = __fma_rn (t, a2, -5.109062682125205E-003);	t = __fma_rn (t, a2, 3.8559749383629918E-003);
	t = __fma_rn (t, a2, 1.161935628866633E-002);	t = __fma_rn (t, a2, -9.1845592187165485E-003);
	t = __fma_rn (t, a2, -2.056787727035855E-002);	t = __fma_rn (t, a2, 1.6978035834597331E-002);
	t = __fma_rn (t, a2, 3.014410555893613E-002);	t = __fma_rn (t, a2, -2.5826796814495994E-002);
	t = __fma_rn (t, a2, -3.868025408479192E-002);	t = __fma_rn (t, a2, 3.4067811082715123E-002);
	t = __fma_rn (t, a2, 4.575089047336649E-002);	t = __fma_rn (t, a2, -4.0926382420509971E-002);
	t = __fma_rn (t, a2, -5.205914168425584E-002);	t = __fma_rn (t, a2, 4.6739496199157994E-002);
	t = __fma_rn (t, a2, 5.868796955641944E-002);	t = __fma_rn (t, a2, -5.2392330054601317E-002);
	t = __fma_rn (t, a2, -6.664240989090695E-002);	t = __fma_rn (t, a2, 5.8773077721790849E-002);
	t = __fma_rn (t, a2, 7.691989357768611E-002);	t = __fma_rn (t, a2, -6.6658603633512573E-002);
	t = __fma_rn (t, a2, -9.090879690584197E-002);	t = __fma_rn (t, a2, 7.6922129305867837E-002);
	t = __fma_rn (t, a2, 1.111110931160943E-001);	t = __fma_rn (t, a2, -9.0909012354005225E-002);
	t = __fma_rn (t, a2, -1.428571421925286E-001);	t = __fma_rn (t, a2, 1.1111110678749424E-001);
	t = __fma_rn (t, a2, 1.999999999874377E-001);	t = __fma_rn (t, a2, -1.4285714271334815E-001);
	t = __fma_rn (t, a2, -3.333333333332495E-001);	t = __fma_rn (t, a2, 1.9999999999755019E-001);
		t = __fma_rn (t, a2, -3.3333333333331860E-001);
	t = t * a2;	t = t * a2;
	t = __fma_rn (t, a, a);	t = __fma_rn (t, a, a);
	return t;	return t;
	}	}

	__device_func__(double __cuda_atan2(double a, double b))	__device_func__(double __cuda_atan2(double a, double b))
	{	{
	double t0, t1, t3;	double t0, t1, t3;
	if (__cuda___isnan(a) \|\| __cuda___isnan(b)) {	if (__cuda___isnan(a) \|\| __cuda___isnan(b)) {
	return a + b;	return a + b;
	}	}
	/* reduce arguments to first octant */	/* reduce arguments to first octant */
	/* r = (\|x\| < \|y\|) ? (\|x\| / \|y\|) : (\|y\| / \|x\|) */	/* r = (\|x\| < \|y\|) ? (\|x\| / \|y\|) : (\|y\| / \|x\|) */
	t3 = __cuda_fabs(b);	t3 = __cuda_fabs(b);
	t1 = __cuda_fabs(a);	t1 = __cuda_fabs(a);
	if (t3 == 0.0 && t1 == 0.0) {	if (t3 == 0.0 && t1 == 0.0) {
	t3 = __cuda___signbit(b) ? CUDART_PI : 0;	t3 = __cuda___signbit(b) ? CUDART_PI : 0;
	} else if (__cuda___isinf(t3) && __cuda___isinf(t1)) {	} else if (__cuda___isinf(t3) && __cuda___isinf(t1)) {
	t3 = __cuda___signbit(b) ? CUDART_3PIO4 : CUDART_PIO4;	t3 = __cuda___signbit(b) ? CUDART_3PIO4 : CUDART_PIO4;
	} else {	} else {

	/* can't use min, max because they do not propagate NaNs */	t0 = __cuda_fmax (t1, t3);
	if (t3 < t1) {	t1 = __cuda_fmin (t1, t3);
	t0 = t1;
	t1 = t3;
	} else {
	t0 = t3;
	t1 = t1;
	}
	t3 = t1 / t0;	t3 = t1 / t0;
	t3 = __internal_atan_kernel(t3);	t3 = __internal_atan_kernel(t3);
	/* Map result according to octant. */	/* Map result according to octant. */
	if (__cuda_fabs(a) > __cuda_fabs(b)) t3 = CUDART_PIO2 - t3;	if (__cuda_fabs(a) > __cuda_fabs(b)) t3 = CUDART_PIO2 - t3;
	if (b < 0.0) t3 = CUDART_PI - t3;	if (b < 0.0) t3 = CUDART_PI - t3;
	}	}
	t3 = __cuda_copysign(t3, a);	t3 = __cuda_copysign(t3, a);
	return t3;	return t3;
	}	}


	skipping to change at line 1023	skipping to change at line 1044
	r = __fma_rn (r, b, 1.745227928732326E-002);	r = __fma_rn (r, b, 1.745227928732326E-002);
	r = __fma_rn (r, b, 1.000422754245580E-002);	r = __fma_rn (r, b, 1.000422754245580E-002);
	r = __fma_rn (r, b, 1.418108777515123E-002);	r = __fma_rn (r, b, 1.418108777515123E-002);
	r = __fma_rn (r, b, 1.733194598980628E-002);	r = __fma_rn (r, b, 1.733194598980628E-002);
	r = __fma_rn (r, b, 2.237350511593569E-002);	r = __fma_rn (r, b, 2.237350511593569E-002);
	r = __fma_rn (r, b, 3.038188875134962E-002);	r = __fma_rn (r, b, 3.038188875134962E-002);
	r = __fma_rn (r, b, 4.464285849810986E-002);	r = __fma_rn (r, b, 4.464285849810986E-002);
	r = __fma_rn (r, b, 7.499999998342270E-002);	r = __fma_rn (r, b, 7.499999998342270E-002);
	r = __fma_rn (r, b, 1.666666666667375E-001);	r = __fma_rn (r, b, 1.666666666667375E-001);
	r = r * b;	r = r * b;

	r = __fma_rn (r, a, a);
	return r;	return r;
	}	}

	__device_func__(double __cuda_asin(double a))	__device_func__(double __cuda_asin(double a))
	{	{
	double t0, t1;	double t0, t1;
	t0 = __cuda_fabs(a);	t0 = __cuda_fabs(a);
	if (t0 > 1.0) {	if (t0 > 1.0) {
	return CUDART_NAN;	return CUDART_NAN;
	}	}
	if (t0 > 0.575) {	if (t0 > 0.575) {
	t1 = __fma_rn (-0.5, t0, 0.5);	t1 = __fma_rn (-0.5, t0, 0.5);
	t0 = __cuda_sqrt (t1);	t0 = __cuda_sqrt (t1);
	t1 = __internal_asin_kernel (t0, t1);	t1 = __internal_asin_kernel (t0, t1);

	t1 = __fma_rn (2.0, t1, -CUDART_PIO2_LO);	t0 = -2.0 * t0;
	t1 = CUDART_PIO2_HI - t1;	t1 = __fma_rn (t0, t1, CUDART_PIO2_LO);
		t0 = t0 + CUDART_PIO4_HI;
		t1 = t0 + t1;
		t1 = t1 + CUDART_PIO4_HI;
	} else {	} else {
	t1 = t0 * t0;	t1 = t0 * t0;
	t1 = __internal_asin_kernel (t0, t1);	t1 = __internal_asin_kernel (t0, t1);

		t1 = __fma_rn (t1, t0, t0);
	}	}
	return __cuda_copysign(t1, a);	return __cuda_copysign(t1, a);
	}	}

	__device_func__(double __cuda_acos(double a))	__device_func__(double __cuda_acos(double a))
	{	{
	double t0, t1;	double t0, t1;
	#if !defined(__CUDABE__)	#if !defined(__CUDABE__)
	if (__cuda___isnan(a)) {	if (__cuda___isnan(a)) {
	return a + a;	return a + a;
	}	}
	#endif	#endif
	t0 = __cuda_fabs (a);	t0 = __cuda_fabs (a);
	if (t0 > 0.575) {	if (t0 > 0.575) {
	t1 = __fma_rn (-0.5, t0, 0.5);	t1 = __fma_rn (-0.5, t0, 0.5);
	t0 = __cuda_sqrt(t1);	t0 = __cuda_sqrt(t1);

	t0 = __internal_asin_kernel (t0, t1);	t1 = __internal_asin_kernel (t0, t1);
		t0 = __fma_rn (t1, t0, t0);
	t0 = 2.0 * t0;	t0 = 2.0 * t0;
	if (__cuda___signbit(a)) {	if (__cuda___signbit(a)) {
	t0 = __fma_rn (1.0, t0, -CUDART_PI_LO);	t0 = __fma_rn (1.0, t0, -CUDART_PI_LO);
	t0 = CUDART_PI_HI - t0;	t0 = CUDART_PI_HI - t0;
	}	}
	} else {	} else {
	t1 = t0 * t0;	t1 = t0 * t0;

	t0 = __internal_asin_kernel (t0, t1);	t1 = __internal_asin_kernel (t0, t1);
		t0 = __fma_rn (t1, t0, t0);
	if (__cuda___signbit(a)) {	if (__cuda___signbit(a)) {
	t0 = __fma_rn (1.0, t0, +CUDART_PIO2_LO);	t0 = __fma_rn (1.0, t0, +CUDART_PIO2_LO);
	t0 = CUDART_PIO2_HI + t0;	t0 = CUDART_PIO2_HI + t0;
	} else {	} else {
	t0 = __fma_rn (1.0, t0, -CUDART_PIO2_LO);	t0 = __fma_rn (1.0, t0, -CUDART_PIO2_LO);
	t0 = CUDART_PIO2_HI - t0;	t0 = CUDART_PIO2_HI - t0;
	}	}
	}	}
	return t0;	return t0;
	}	}

	__device_func__(double __cuda_acosh(double a))	__device_func__(double __cuda_acosh(double a))
	{	{

	double s, t;	double t;
	#if !defined(__CUDABE__)	#if !defined(__CUDABE__)
	if (__cuda___isnan(a)) {	if (__cuda___isnan(a)) {
	return a + a;	return a + a;
	}	}
	#endif	#endif
	t = a - 1.0;	t = a - 1.0;
	if (__cuda_fabs(t) > CUDART_TWO_TO_52) {	if (__cuda_fabs(t) > CUDART_TWO_TO_52) {
	/* for large a, acosh = log(2a) /	/* for large a, acosh = log(2a) /
	return CUDART_LN2 + __cuda_log(a);	return CUDART_LN2 + __cuda_log(a);
	} else {	} else {

	s = a + 1.0;	t = t + __cuda_sqrt(__fma_rn(a, t, t));
	t = t + __cuda_sqrt(s * t);
	return __cuda_log1p(t);	return __cuda_log1p(t);
	}	}
	}	}

	__device_func__(double __cuda_asinh(double a))	__device_func__(double __cuda_asinh(double a))
	{	{
	#if SLIGHTLY_MORE_ACCURATE_BUT_SLOWER	#if SLIGHTLY_MORE_ACCURATE_BUT_SLOWER
	double fa, oofa, t;	double fa, oofa, t;
	fa = __cuda_fabs(a);	fa = __cuda_fabs(a);
	if (fa > 8.9884657373828596e+307) { /* prevent intermediate underflow */	if (fa > 8.9884657373828596e+307) { /* prevent intermediate underflow */

	skipping to change at line 1449	skipping to change at line 1474
	z = __fma_rn (z, x, -5.9216643735369393e-004);	z = __fma_rn (z, x, -5.9216643735369393e-004);
	z = __fma_rn (z, x, 6.9728137583658571e-005);	z = __fma_rn (z, x, 6.9728137583658571e-005);
	z = __fma_rn (z, x, 7.8403922172006662e-004);	z = __fma_rn (z, x, 7.8403922172006662e-004);
	z = __fma_rn (z, x, -2.2947209362139917e-004);	z = __fma_rn (z, x, -2.2947209362139917e-004);
	z = __fma_rn (z, x, -2.6813271604938273e-003);	z = __fma_rn (z, x, -2.6813271604938273e-003);
	z = __fma_rn (z, x, 3.4722222222222220e-003);	z = __fma_rn (z, x, 3.4722222222222220e-003);
	z = __fma_rn (z, x, 8.3333333333333329e-002);	z = __fma_rn (z, x, 8.3333333333333329e-002);
	z = __fma_rn (z, x, 1.0000000000000000e+000);	z = __fma_rn (z, x, 1.0000000000000000e+000);
	return z;	return z;
	}	}

	/* 11 DP operations */
	__device_func__(double2 dbldbladd (double2 x, double2 y))
	{
	double2 z;
	#if defined(__GNUC__) && !defined(__CUDABE__)
	volatile double t1, t2, e;
	#else
	double t1, t2, e;
	#endif
	t1 = x.y + y.y;
	e = t1 - x.y;
	t2 = ((y.y - e) + (x.y - (t1 - e))) + x.x + y.x;
	z.y = e = t1 + t2;
	z.x = t2 - (e - t1);
	return z;
	}

	__device_func__(double __internal_tgamma_stirling(double a))	__device_func__(double __internal_tgamma_stirling(double a))
	{	{
	if (a < 1.7162437695630274e+002) {	if (a < 1.7162437695630274e+002) {
	#if defined(__GNUC__) && !defined(__CUDABE__)	#if defined(__GNUC__) && !defined(__CUDABE__)
	volatile double t_hi, t_lo, e;	volatile double t_hi, t_lo, e;
	#else	#else
	double t_hi, t_lo, e;	double t_hi, t_lo, e;
	#endif	#endif
	double2 loga, prod;	double2 loga, prod;

	skipping to change at line 1491	skipping to change at line 1500
	/* compute (a - 0.5) * log(a) in double-double format */	/* compute (a - 0.5) * log(a) in double-double format */
	t_hi = loga.y * b;	t_hi = loga.y * b;
	t_lo = __fma_rn (loga.y, b, -t_hi);	t_lo = __fma_rn (loga.y, b, -t_hi);
	t_lo = __fma_rn (loga.x, b, t_lo);	t_lo = __fma_rn (loga.x, b, t_lo);
	prod.y = e = t_hi + t_lo;	prod.y = e = t_hi + t_lo;
	prod.x = (t_hi - e) + t_lo;	prod.x = (t_hi - e) + t_lo;

	/* compute (a - 0.5) * log(a) - a in double-double format */	/* compute (a - 0.5) * log(a) - a in double-double format */
	loga.y = -a;	loga.y = -a;
	loga.x = 0.0;	loga.x = 0.0;

	prod = dbldbladd (prod, loga);	prod = __internal_ddadd_xgty (prod, loga);

	/* compute pow(a,b) = exp(blog(a)) /	/* compute pow(a,b) = exp(blog(a)) /
	a = __cuda_exp(prod.y);	a = __cuda_exp(prod.y);
	/* prevent -INF + INF = NaN */	/* prevent -INF + INF = NaN */
	if (!__cuda___isinf(a)) {	if (!__cuda___isinf(a)) {
	/* if prod.x is much smaller than prod.y, then exp(prod.y + prod.x) ~ =	/* if prod.x is much smaller than prod.y, then exp(prod.y + prod.x) ~ =
	* exp(prod.y) + prod.x * exp(prod.y)	* exp(prod.y) + prod.x * exp(prod.y)
	*/	*/
	a = __fma_rn (a, prod.x, a);	a = __fma_rn (a, prod.x, a);
	}	}

	skipping to change at line 1515	skipping to change at line 1524
	return CUDART_INF;	return CUDART_INF;
	}	}
	}	}

	__device_func__(double __cuda_tgamma(double a))	__device_func__(double __cuda_tgamma(double a))
	{	{
	double s, xx, x = a;	double s, xx, x = a;
	if (__cuda___isnan(a)) {	if (__cuda___isnan(a)) {
	return a + a;	return a + a;
	}	}

	if (fabs(x) < 15.0) {	if (__cuda_fabs(x) < 15.0) {
	/* Based on: Kraemer, W.: "Berechnung der Gammafunktion G(x) fuer reel le	/* Based on: Kraemer, W.: "Berechnung der Gammafunktion G(x) fuer reel le
	* Punkt- und Intervallargumente". Zeitschrift fuer angewandte Mathema tik	* Punkt- und Intervallargumente". Zeitschrift fuer angewandte Mathema tik
	* und Mechanik, Vol. 70 (1990), No. 6, pp. 581-584	* und Mechanik, Vol. 70 (1990), No. 6, pp. 581-584
	*/	*/
	if (x >= 0.0) {	if (x >= 0.0) {
	s = 1.0;	s = 1.0;
	xx = x;	xx = x;
	while (xx > 1.5) {	while (xx > 1.5) {
	s = __fma_rn(s, xx, -s);	s = __fma_rn(s, xx, -s);
	xx = xx - 1.0;	xx = xx - 1.0;

	skipping to change at line 1566	skipping to change at line 1575
	if (x == __cuda_floor(x)) {	if (x == __cuda_floor(x)) {
	return CUDART_NAN;	return CUDART_NAN;
	}	}
	if (x < -185.0) {	if (x < -185.0) {
	int negative;	int negative;
	x = __cuda_floor(x);	x = __cuda_floor(x);
	negative = ((x - (2.0 * __cuda_floor(0.5 * x))) == 1.0);	negative = ((x - (2.0 * __cuda_floor(0.5 * x))) == 1.0);
	return negative ? CUDART_NEG_ZERO : CUDART_ZERO;	return negative ? CUDART_NEG_ZERO : CUDART_ZERO;
	}	}
	/* compute sin(pix) accurately /	/* compute sin(pix) accurately /

	xx = __cuda_rint (2.0 * x);	xx = __cuda_rint (__internal_twice(x));
	quot = (int)xx;	quot = (int)xx;
	xx = __fma_rn (-0.5, xx, x);	xx = __fma_rn (-0.5, xx, x);
	xx = xx * CUDART_PI;	xx = xx * CUDART_PI;
	if (quot & 1) {	if (quot & 1) {
	xx = __internal_cos_kerneld (xx);	xx = __internal_cos_kerneld (xx);
	} else {	} else {
	xx = __internal_sin_kerneld (xx);	xx = __internal_sin_kerneld (xx);
	}	}
	if (quot & 2) {	if (quot & 2) {
	xx = -xx;	xx = -xx;
	}	}

	x = fabs (x);	x = __cuda_fabs (x);
	s = __cuda_exp (-x);	s = __cuda_exp (-x);
	t = x - 0.5;	t = x - 0.5;

	if (x > 140.0) t = 0.5 * t;	if (x > 140.0) t = __internal_half(t);
	t = __cuda_pow (x, t);	t = __cuda_pow (x, t);
	if (x > 140.0) s = s * t;	if (x > 140.0) s = s * t;
	s = s * __internal_stirling_poly (x);	s = s * __internal_stirling_poly (x);
	s = s * x;	s = s * x;
	s = s * xx;	s = s * xx;
	s = 1.0 / s;	s = 1.0 / s;
	s = __fma_rn (s, CUDART_SQRT_PIO2_HI, CUDART_SQRT_PIO2_LO * s);	s = __fma_rn (s, CUDART_SQRT_PIO2_HI, CUDART_SQRT_PIO2_LO * s);
	s = s / t;	s = s / t;
	return s;	return s;
	}	}

	skipping to change at line 1617	skipping to change at line 1626
	*/	*/
	s = 1.0 / a;	s = 1.0 / a;
	t = s * s;	t = s * s;
	sum = -0.1633436431e-2;	sum = -0.1633436431e-2;
	sum = __fma_rn (sum, t, 0.83645878922e-3);	sum = __fma_rn (sum, t, 0.83645878922e-3);
	sum = __fma_rn (sum, t, -0.5951896861197e-3);	sum = __fma_rn (sum, t, -0.5951896861197e-3);
	sum = __fma_rn (sum, t, 0.793650576493454e-3);	sum = __fma_rn (sum, t, 0.793650576493454e-3);
	sum = __fma_rn (sum, t, -0.277777777735865004e-2);	sum = __fma_rn (sum, t, -0.277777777735865004e-2);
	sum = __fma_rn (sum, t, 0.833333333333331018375e-1);	sum = __fma_rn (sum, t, 0.833333333333331018375e-1);
	sum = __fma_rn (sum, s, 0.918938533204672);	sum = __fma_rn (sum, s, 0.918938533204672);

	s = 0.5 * __cuda_log (a);	s = __internal_half(__cuda_log (a));
	t = a - 0.5;	t = a - 0.5;
	s = s * t;	s = s * t;
	t = s - a;	t = s - a;
	s = s + sum;	s = s + sum;
	t = t + s;	t = t + s;
	return t;	return t;
	} else {	} else {
	a = a - 3.0;	a = a - 3.0;
	s = -4.02412642744125560E+003;	s = -4.02412642744125560E+003;
	s = __fma_rn (s, a, -2.97693796998962000E+005);	s = __fma_rn (s, a, -2.97693796998962000E+005);

	skipping to change at line 1744	skipping to change at line 1753
	if (quot & 1) {	if (quot & 1) {
	i = __internal_cos_kerneld(i);	i = __internal_cos_kerneld(i);
	} else {	} else {
	i = __internal_sin_kerneld(i);	i = __internal_sin_kerneld(i);
	}	}
	i = __cuda_fabs(i);	i = __cuda_fabs(i);
	t = __cuda_log(CUDART_PI / (i * a)) - t;	t = __cuda_log(CUDART_PI / (i * a)) - t;
	return t;	return t;
	}	}


	__device_func__(double __internal_exp2i_kernel(int b))
	{
	return __hiloint2double((b + 1023) << 20, 0);
	}

	__device_func__(double __cuda_ldexp(double a, int b))	__device_func__(double __cuda_ldexp(double a, int b))
	{	{
	double fa = __cuda_fabs (a);	double fa = __cuda_fabs (a);
	if ((fa == CUDART_ZERO) \|\| (fa == CUDART_INF) \|\| (!(fa <= CUDART_INF))) {	if ((fa == CUDART_ZERO) \|\| (fa == CUDART_INF) \|\| (!(fa <= CUDART_INF))) {
	return a + a;	return a + a;
	}	}
	if (b == 0) {	if (b == 0) {
	return a;	return a;
	}	}
	if (b > 2200) b = 2200;	if (b > 2200) b = 2200;

End of changes. 41 change blocks.
	106 lines changed or deleted	110 lines changed or added

	sm_11_atomic_functions.h	sm_11_atomic_functions.h

	skipping to change at line 214	skipping to change at line 214
	}	}

	#endif /* !CUDA_NO_SM_12_ATOMIC_INTRINSICS */	#endif /* !CUDA_NO_SM_12_ATOMIC_INTRINSICS */

	#elif !defined(__CUDACC__)	#elif !defined(__CUDACC__)

	#include "crt/func_macro.h"	#include "crt/func_macro.h"

	#if !defined(__CUDABE__)	#if !defined(__CUDABE__)


		#if defined(__MULTI_CORE__)

		#define __iAtomicAdd(address, val) \
		__builtin___iAtomicAdd(address, val)
		#define __uAtomicAdd(address, val) \
		__builtin___uAtomicAdd(address, val)
		#define __iAtomicExch(address, val) \
		__builtin___iAtomicExch(address, val)
		#define __uAtomicExch(address, val) \
		__builtin___uAtomicExch(address, val)
		#define __fAtomicExch(address, val) \
		__builtin___fAtomicExch(address, val)
		#define __iAtomicMin(address, val) \
		__builtin___iAtomicMin(address, val)
		#define __uAtomicMin(address, val) \
		__builtin___uAtomicMin(address, val)
		#define __iAtomicMax(address, val) \
		__builtin___iAtomicMax(address, val)
		#define __uAtomicMax(address, val) \
		__builtin___uAtomicMax(address, val)
		#define __uAtomicInc(address, val) \
		__builtin___uAtomicInc(address, val)
		#define __uAtomicDec(address, val) \
		__builtin___uAtomicDec(address, val)
		#define __iAtomicAnd(address, val) \
		__builtin___iAtomicAnd(address, val)
		#define __uAtomicAnd(address, val) \
		__builtin___uAtomicAnd(address, val)
		#define __iAtomicOr(address, val) \
		__builtin___iAtomicOr(address, val)
		#define __uAtomicOr(address, val) \
		__builtin___uAtomicOr(address, val)
		#define __iAtomicXor(address, val) \
		__builtin___iAtomicXor(address, val)
		#define __uAtomicXor(address, val) \
		__builtin___uAtomicXor(address, val)
		#define __iAtomicCAS(address, compare, val) \
		__builtin___iAtomicCAS(address, compare, val)
		#define __uAtomicCAS(address, compare, val) \
		__builtin___uAtomicCAS(address, compare, val)

		#else /* __MULTI_CORE__ */

	__device_func__(int __iAtomicAdd(int *address, int val))	__device_func__(int __iAtomicAdd(int *address, int val))
	{	{
	int old = *address;	int old = *address;

	*address = old + val;	*address = old + val;

	return old;	return old;
	}	}

	__device_func__(unsigned int __uAtomicAdd(unsigned int *address, unsigned i nt val))	__device_func__(unsigned int __uAtomicAdd(unsigned int *address, unsigned i nt val))

	skipping to change at line 385	skipping to change at line 428

	__device_func__(unsigned int __uAtomicCAS(unsigned int *address, unsigned i nt compare, unsigned int val))	__device_func__(unsigned int __uAtomicCAS(unsigned int *address, unsigned i nt compare, unsigned int val))
	{	{
	unsigned int old = *address;	unsigned int old = *address;

	*address = old == compare ? val : old;	*address = old == compare ? val : old;

	return old;	return old;
	}	}


		#endif /* __MULTI_CORE__ */

	#endif /* !__CUDABE__ */	#endif /* !__CUDABE__ */

	#endif /* __cplusplus && __CUDACC__ */	#endif /* __cplusplus && __CUDACC__ */

	#endif /* !__SM_11_ATOMIC_FUNCTIONS_H__ */	#endif /* !__SM_11_ATOMIC_FUNCTIONS_H__ */

End of changes. 2 change blocks.
	0 lines changed or deleted	45 lines changed or added

	sm_12_atomic_functions.h	sm_12_atomic_functions.h

	skipping to change at line 107	skipping to change at line 107
	}	}

	#endif /* !CUDA_NO_SM_12_ATOMIC_INTRINSICS */	#endif /* !CUDA_NO_SM_12_ATOMIC_INTRINSICS */

	#elif !defined(__CUDACC__)	#elif !defined(__CUDACC__)

	#include "crt/func_macro.h"	#include "crt/func_macro.h"

	#if !defined(__CUDABE__)	#if !defined(__CUDABE__)


		#if defined(__MULTI_CORE__)

		#define __ullAtomicAdd(address, val) \
		__builtin___ullAtomicAdd(address, val)
		#define __ullAtomicExch(address, val) \
		__builtin___ullAtomicExch(address, val)
		#define __ullAtomicCAS(address, compare, val) \
		__builtin___ullAtomicCAS(address, compare, val)

		#else /* __MULTI_CORE__ */

	__device_func__(unsigned long long int __ullAtomicAdd(unsigned long long in t *address, unsigned long long int val))	__device_func__(unsigned long long int __ullAtomicAdd(unsigned long long in t *address, unsigned long long int val))
	{	{
	unsigned long long int old = *address;	unsigned long long int old = *address;

	*address = old + val;	*address = old + val;

	return old;	return old;
	}	}

	__device_func__(unsigned long long int __ullAtomicExch(unsigned long long i nt *address, unsigned long long int val))	__device_func__(unsigned long long int __ullAtomicExch(unsigned long long i nt *address, unsigned long long int val))

	skipping to change at line 134	skipping to change at line 145

	__device_func__(unsigned long long int __ullAtomicCAS(unsigned long long in t *address, unsigned long long int compare, unsigned long long int val))	__device_func__(unsigned long long int __ullAtomicCAS(unsigned long long in t *address, unsigned long long int compare, unsigned long long int val))
	{	{
	unsigned long long int old = *address;	unsigned long long int old = *address;

	*address = old == compare ? val : old;	*address = old == compare ? val : old;

	return old;	return old;
	}	}


		#endif /* __MULTI_CORE__ */

	__device_func__(int __any(int cond))	__device_func__(int __any(int cond))
	{	{
	return cond;	return cond;
	}	}

	__device_func__(int __all(int cond))	__device_func__(int __all(int cond))
	{	{
	return cond;	return cond;
	}	}


End of changes. 2 change blocks.
	0 lines changed or deleted	13 lines changed or added

	sm_13_double_functions.h	sm_13_double_functions.h

	skipping to change at line 631	skipping to change at line 631
	{	{
	return __internal_ull2double_kernel(a, cudaRoundPosInf);	return __internal_ull2double_kernel(a, cudaRoundPosInf);
	}	}

	#endif /* !__CUDABE__ */	#endif /* !__CUDABE__ */

	#if !defined(__CUDABE__) \|\| defined(CUDA_NO_SM_13_DOUBLE_INTRINSICS)	#if !defined(__CUDABE__) \|\| defined(CUDA_NO_SM_13_DOUBLE_INTRINSICS)

	__device_func__(double __internal_fma_kernel(double x, double y, double z, enum cudaRoundMode rndMode))	__device_func__(double __internal_fma_kernel(double x, double y, double z, enum cudaRoundMode rndMode))
	{	{

		#ifdef __MULTI_CORE__
		volatile
		#endif /* __MULTI_CORE__ */
	struct {	struct {
	unsigned int lo;	unsigned int lo;
	unsigned int hi;	unsigned int hi;
	} xx, yy, zz, ww;	} xx, yy, zz, ww;
	unsigned int s, t, u, prod0, prod1, prod2, prod3, expo_x, expo_y, expo_z;	unsigned int s, t, u, prod0, prod1, prod2, prod3, expo_x, expo_y, expo_z;

	xx.hi = __double2hiint(x);	xx.hi = __double2hiint(x);
	xx.lo = __double2loint(x);	xx.lo = __double2loint(x);
	yy.hi = __double2hiint(y);	yy.hi = __double2hiint(y);
	yy.lo = __double2loint(y);	yy.lo = __double2loint(y);

End of changes. 1 change blocks.
	0 lines changed or deleted	3 lines changed or added

	storage_class.h	storage_class.h

	skipping to change at line 48	skipping to change at line 48

	#if !defined(__var_used__)	#if !defined(__var_used__)

	#define __var_used__	#define __var_used__

	#endif /* __var_used__ */	#endif /* __var_used__ */

	#if !defined(__loc_sc__)	#if !defined(__loc_sc__)

	#define __loc_sc__(loc, sc) \	#define __loc_sc__(loc, sc) \

	__storage##_##sc##loc __var_used__ loc	__storage##_##sc##loc loc

	#endif /* !__loc_sc__ */	#endif /* !__loc_sc__ */


	#define __storage___device__ \	#if !defined(__storage___device__)
	static	#define __storage___device__ static __var_used__
	#define __storage_extern__device__ \	#endif /* __storage___device__ */
	static
	#define __storage_auto__device__ \
	@@@ COMPILER @@@ ERROR @@@
	#define __storage_static__device__ \
	static


	#define __storage___constant__ \	#if !defined(__storage_extern__device__)
	static	#define __storage_extern__device__ static __var_used__
	#define __storage_extern__constant__ \	#endif /* __storage_extern__device__ */
	static
	#define __storage_auto__constant__ \
	@@@ COMPILER @@@ ERROR @@@
	#define __storage_static__constant__ \
	static


	#define __storage___shared__ \	#if !defined(__storage_auto__device__)
	static	#define __storage_auto__device__ @@@ COMPILER @@@ ERROR @@@
	#define __storage_extern__shared__ \	#endif /* __storage_auto__device__ */
	static
	#define __storage_auto__shared__ \
	static
	#define __storage_static__shared__ \
	static


	#define __storage___text__ \	#if !defined(__storage_static__device__)
	static	#define __storage_static__device__ static __var_used__
	#define __storage_extern__text__ \	#endif /* __storage_static__device__ */
	static
	#define __storage_auto__text__ \	#if !defined(__storage___constant__)
	@@@ COMPILER @@@ ERROR @@@	#define __storage___constant__ static __var_used__
	#define __storage_static__text__ \	#endif /* __storage___constant__ */
	static
		#if !defined(__storage_extern__constant__)
		#define __storage_extern__constant__ static __var_used__
		#endif /* __storage_extern__constant__ */

		#if !defined(__storage_auto__constant__)
		#define __storage_auto__constant__ @@@ COMPILER @@@ ERROR @@@
		#endif /* __storage_auto__constant__ */

		#if !defined(__storage_static__constant__)
		#define __storage_static__constant__ static __var_used__
		#endif /* __storage_static__constant__ */

		#if !defined(__storage___shared__)
		#define __storage___shared__ static __var_used__
		#endif /* __storage___shared__ */

		#if !defined(__storage_extern__shared__)
		#define __storage_extern__shared__ static __var_used__
		#endif /* __storage_extern__shared__ */

		#if !defined(__storage_auto__shared__)
		#define __storage_auto__shared__ static
		#endif /* __storage_auto__shared__ */

		#if !defined(__storage_static__shared__)
		#define __storage_static__shared__ static __var_used__
		#endif /* __storage_static__shared__ */

		#if !defined(__storage___text__)
		#define __storage___text__ static __var_used__
		#endif /* __storage___text__ */

		#if !defined(__storage_extern__text__)
		#define __storage_extern__text__ static __var_used__
		#endif /* __storage_extern__text__ */

		#if !defined(__storage_auto__text__)
		#define __storage_auto__text__ @@@ COMPILER @@@ ERROR @@@
		#endif /* __storage_auto__text__ */

		#if !defined(__storage_static__text__)
		#define __storage_static__text__ static __var_used__
		#endif /* __storage_static__text__ */

		#if !defined(__storage___surf__)
		#define __storage___surf__ static __var_used__
		#endif /* __storage___surf__ */

		#if !defined(__storage_extern__surf__)
		#define __storage_extern__surf__ static __var_used__
		#endif /* __storage_extern__surf__ */

		#if !defined(__storage_auto__surf__)
		#define __storage_auto__surf__ @@@ COMPILER @@@ ERROR @@@
		#endif /* __storage_auto__surf__ */

		#if !defined(__storage_static__surf__)
		#define __storage_static__surf__ static __var_used__
		#endif /* __storage_static__surf__ */

	#endif /* !__STORAGE_CLASS_H__ */	#endif /* !__STORAGE_CLASS_H__ */

End of changes. 5 change blocks.
	33 lines changed or deleted	77 lines changed or added

	texture_fetch_functions.h	texture_fetch_functions.h

	skipping to change at line 47	skipping to change at line 47
	#define __TEXTURE_FETCH_FUNCTIONS_H__	#define __TEXTURE_FETCH_FUNCTIONS_H__

	#if defined(__cplusplus) && defined(__CUDACC__)	#if defined(__cplusplus) && defined(__CUDACC__)

	/************************************************************************ ***	/************************************************************************ ***
	* *	* *
	* *	* *
	* *	* *
	************************************************************************* **/	************************************************************************* **/


		#include "cuda_texture_types.h"
	#include "host_defines.h"	#include "host_defines.h"
	#include "texture_types.h"	#include "texture_types.h"
	#include "vector_functions.h"	#include "vector_functions.h"
	#include "vector_types.h"	#include "vector_types.h"

	/************************************************************************ ***	/************************************************************************ ***
	* *	* *
	* *	* *
	* *	* *
	************************************************************************* **/	************************************************************************* **/

	skipping to change at line 1881	skipping to change at line 1882
	return make_float4(w.x, w.y, w.z, w.w);	return make_float4(w.x, w.y, w.z, w.w);
	}	}

	#elif !defined(__CUDACC__)	#elif !defined(__CUDACC__)

	#include "host_defines.h"	#include "host_defines.h"
	#include "crt/func_macro.h"	#include "crt/func_macro.h"

	#if defined(__CUDABE__)	#if defined(__CUDABE__)


	extern uint4 __utexfetchi1D(__i1texture, int4);	extern uint4 __utexfetchi1D(const void*, int4);
	extern int4 __itexfetchi1D(__i2texture, int4);	extern int4 __itexfetchi1D(const void*, int4);
	extern float4 __ftexfetchi1D(__i3texture, int4);	extern float4 __ftexfetchi1D(const void*, int4);
	extern uint4 __utexfetch1D(__u1texture, float4);	extern uint4 __utexfetch1D(const void*, float4);
	extern int4 __itexfetch1D(__u2texture, float4);	extern int4 __itexfetch1D(const void*, float4);
	extern float4 __ftexfetch1D(__u3texture, float4);	extern float4 __ftexfetch1D(const void*, float4);
	extern uint4 __utexfetch2D(__f1texture, float4);	extern uint4 __utexfetch2D(const void*, float4);
	extern int4 __itexfetch2D(__f2texture, float4);	extern int4 __itexfetch2D(const void*, float4);
	extern float4 __ftexfetch2D(__f3texture, float4);	extern float4 __ftexfetch2D(const void*, float4);
	extern uint4 __utexfetch3D(__f1texture, float4);	extern uint4 __utexfetch3D(const void*, float4);
	extern int4 __itexfetch3D(__f2texture, float4);	extern int4 __itexfetch3D(const void*, float4);
	extern float4 __ftexfetch3D(__f3texture, float4);	extern float4 __ftexfetch3D(const void*, float4);

	#define __utexfetchi(t, i) \	#define __utexfetchi(t, i) \
	__utexfetchi1D(t, i)	__utexfetchi1D(t, i)
	#define __itexfetchi(t, i) \	#define __itexfetchi(t, i) \
	__itexfetchi1D(t, i)	__itexfetchi1D(t, i)
	#define __ftexfetchi(t, i) \	#define __ftexfetchi(t, i) \
	__ftexfetchi1D(t, i)	__ftexfetchi1D(t, i)
	#define __utexfetch(t, i, d) \	#define __utexfetch(t, i, d) \
	__utexfetch##d##D(t, i)	__utexfetch##d##D(t, i)
	#define __itexfetch(t, i, d) \	#define __itexfetch(t, i, d) \

End of changes. 2 change blocks.
	12 lines changed or deleted	13 lines changed or added

	texture_types.h	texture_types.h

	skipping to change at line 45	skipping to change at line 45

	#if !defined(__TEXTURE_TYPES_H__)	#if !defined(__TEXTURE_TYPES_H__)
	#define __TEXTURE_TYPES_H__	#define __TEXTURE_TYPES_H__

	/************************************************************************ ***	/************************************************************************ ***
	* *	* *
	* *	* *
	* *	* *
	************************************************************************* **/	************************************************************************* **/


		#include "driver_types.h"

		/**************************************************************************
		*****
		*
		*
		*
		*
		*
		*
		***************************************************************************
		****/

	/DEVICE_BUILTIN/	/DEVICE_BUILTIN/
	enum cudaTextureAddressMode	enum cudaTextureAddressMode
	{	{
	cudaAddressModeWrap,	cudaAddressModeWrap,
	cudaAddressModeClamp	cudaAddressModeClamp
	};	};

	/DEVICE_BUILTIN/	/DEVICE_BUILTIN/
	enum cudaTextureFilterMode	enum cudaTextureFilterMode
	{	{

	skipping to change at line 76	skipping to change at line 84
	/DEVICE_BUILTIN/	/DEVICE_BUILTIN/
	struct textureReference	struct textureReference
	{	{
	int normalized;	int normalized;
	enum cudaTextureFilterMode filterMode;	enum cudaTextureFilterMode filterMode;
	enum cudaTextureAddressMode addressMode[3];	enum cudaTextureAddressMode addressMode[3];
	struct cudaChannelFormatDesc channelDesc;	struct cudaChannelFormatDesc channelDesc;
	int __cudaReserved[16];	int __cudaReserved[16];
	};	};


	#if defined(__cplusplus) && defined(__CUDACC__)

	/**************************************************************************
	*****
	*
	*
	*
	*
	*
	*
	***************************************************************************
	****/

	#include "host_defines.h"
	#include "channel_descriptor.h"

	/**************************************************************************
	*****
	*
	*
	*
	*
	*
	*
	***************************************************************************
	****/

	/TEXTURE_TYPE/
	template<class T, int dim = 1, enum cudaTextureReadMode = cudaReadModeEleme
	ntType>
	struct texture : public textureReference
	{
	__host__ texture(int norm = 0,
	enum cudaTextureFilterMode fMode = cudaFilterModePoint,
	enum cudaTextureAddressMode aMode = cudaAddressModeClamp
	)
	{
	normalized = norm;
	filterMode = fMode;
	addressMode[0] = aMode;
	addressMode[1] = aMode;
	addressMode[2] = aMode;
	channelDesc = cudaCreateChannelDesc<T>();
	}

	__host__ texture(int norm,
	enum cudaTextureFilterMode fMode,
	enum cudaTextureAddressMode aMode,
	struct cudaChannelFormatDesc desc)
	{
	normalized = norm;
	filterMode = fMode;
	addressMode[0] = aMode;
	addressMode[1] = aMode;
	addressMode[2] = aMode;
	channelDesc = desc;
	}
	};

	#endif /* __cplusplus && __CUDACC__ */

	#endif /* !__TEXTURE_TYPES_H__ */	#endif /* !__TEXTURE_TYPES_H__ */

End of changes. 2 change blocks.
	61 lines changed or deleted	13 lines changed or added

	vector_functions.h	vector_functions.h

	skipping to change at line 238	skipping to change at line 238
	static __inline__ __host__ __device__ float3 make_float3(float x, float y, float z)	static __inline__ __host__ __device__ float3 make_float3(float x, float y, float z)
	{	{
	float3 t; t.x = x; t.y = y; t.z = z; return t;	float3 t; t.x = x; t.y = y; t.z = z; return t;
	}	}

	static __inline__ __host__ __device__ float4 make_float4(float x, float y, float z, float w)	static __inline__ __host__ __device__ float4 make_float4(float x, float y, float z, float w)
	{	{
	float4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;	float4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
	}	}


		static __inline__ __host__ __device__ longlong1 make_longlong1(long long in
		t x)
		{
		longlong1 t; t.x = x; return t;
		}

		static __inline__ __host__ __device__ ulonglong1 make_ulonglong1(unsigned l
		ong long int x)
		{
		ulonglong1 t; t.x = x; return t;
		}

		static __inline__ __host__ __device__ longlong2 make_longlong2(long long in
		t x, long long int y)
		{
		longlong2 t; t.x = x; t.y = y; return t;
		}

		static __inline__ __host__ __device__ ulonglong2 make_ulonglong2(unsigned l
		ong long int x, unsigned long long int y)
		{
		ulonglong2 t; t.x = x; t.y = y; return t;
		}

	static __inline__ __host__ __device__ double1 make_double1(double x)	static __inline__ __host__ __device__ double1 make_double1(double x)
	{	{
	double1 t; t.x = x; return t;	double1 t; t.x = x; return t;
	}	}

	static __inline__ __host__ __device__ double2 make_double2(double x, double y)	static __inline__ __host__ __device__ double2 make_double2(double x, double y)
	{	{
	double2 t; t.x = x; t.y = y; return t;	double2 t; t.x = x; t.y = y; return t;
	}	}


End of changes. 1 change blocks.
	0 lines changed or deleted	24 lines changed or added

	vector_types.h	vector_types.h

	skipping to change at line 286	skipping to change at line 286
	float x, y, z;	float x, y, z;
	};	};

	/DEVICE_BUILTIN/	/DEVICE_BUILTIN/
	struct __builtin_align__(16) float4	struct __builtin_align__(16) float4
	{	{
	float x, y, z, w;	float x, y, z, w;
	};	};

	/DEVICE_BUILTIN/	/DEVICE_BUILTIN/

		struct longlong1
		{
		long long int x;
		};

		/DEVICE_BUILTIN/
		struct ulonglong1
		{
		unsigned long long int x;
		};

		/DEVICE_BUILTIN/
		struct __builtin_align__(16) longlong2
		{
		long long int x, y;
		};

		/DEVICE_BUILTIN/
		struct __builtin_align__(16) ulonglong2
		{
		unsigned long long int x, y;
		};

		/DEVICE_BUILTIN/
	struct double1	struct double1
	{	{
	double x;	double x;
	};	};

	/DEVICE_BUILTIN/	/DEVICE_BUILTIN/
	struct __builtin_align__(16) double2	struct __builtin_align__(16) double2
	{	{
	double x, y;	double x, y;
	};	};

	skipping to change at line 376	skipping to change at line 400
	typedef struct ulong4 ulong4;	typedef struct ulong4 ulong4;
	/DEVICE_BUILTIN/	/DEVICE_BUILTIN/
	typedef struct float1 float1;	typedef struct float1 float1;
	/DEVICE_BUILTIN/	/DEVICE_BUILTIN/
	typedef struct float2 float2;	typedef struct float2 float2;
	/DEVICE_BUILTIN/	/DEVICE_BUILTIN/
	typedef struct float3 float3;	typedef struct float3 float3;
	/DEVICE_BUILTIN/	/DEVICE_BUILTIN/
	typedef struct float4 float4;	typedef struct float4 float4;
	/DEVICE_BUILTIN/	/DEVICE_BUILTIN/

		typedef struct longlong1 longlong1;
		/DEVICE_BUILTIN/
		typedef struct ulonglong1 ulonglong1;
		/DEVICE_BUILTIN/
		typedef struct longlong2 longlong2;
		/DEVICE_BUILTIN/
		typedef struct ulonglong2 ulonglong2;
		/DEVICE_BUILTIN/
	typedef struct double1 double1;	typedef struct double1 double1;
	/DEVICE_BUILTIN/	/DEVICE_BUILTIN/
	typedef struct double2 double2;	typedef struct double2 double2;

	/************************************************************************ ***	/************************************************************************ ***
	* *	* *
	* *	* *
	* *	* *
	************************************************************************* **/	************************************************************************* **/


End of changes. 2 change blocks.
	0 lines changed or deleted	32 lines changed or added

This html diff was produced by rfcdiff 1.41. The latest version is available from http://tools.ietf.org/tools/rfcdiff/

	math_constants.h	math_constants.h

	skipping to change at line 97	skipping to change at line 97

	#endif /* !CUDA_NO_SM_13_DOUBLE_INTRINSICS */	#endif /* !CUDA_NO_SM_13_DOUBLE_INTRINSICS */

	#define CUDART_ZERO 0.0	#define CUDART_ZERO 0.0
	#define CUDART_ONE 1.0	#define CUDART_ONE 1.0
	#define CUDART_SQRT_TWO 1.4142135623730951e+0	#define CUDART_SQRT_TWO 1.4142135623730951e+0
	#define CUDART_SQRT_HALF 7.0710678118654757e-1	#define CUDART_SQRT_HALF 7.0710678118654757e-1
	#define CUDART_THIRD 3.3333333333333333e-1	#define CUDART_THIRD 3.3333333333333333e-1
	#define CUDART_TWOTHIRD 6.6666666666666667e-1	#define CUDART_TWOTHIRD 6.6666666666666667e-1
	#define CUDART_PIO4 7.8539816339744828e-1	#define CUDART_PIO4 7.8539816339744828e-1

		#define CUDART_PIO4_HI 7.8539816339744828e-1
		#define CUDART_PIO4_LO 3.0616169978683830e-17
	#define CUDART_PIO2 1.5707963267948966e+0	#define CUDART_PIO2 1.5707963267948966e+0
	#define CUDART_PIO2_HI 1.5707963267948966e+0	#define CUDART_PIO2_HI 1.5707963267948966e+0
	#define CUDART_PIO2_LO 6.1232339957367660e-17	#define CUDART_PIO2_LO 6.1232339957367660e-17
	#define CUDART_3PIO4 2.3561944901923448e+0	#define CUDART_3PIO4 2.3561944901923448e+0
	#define CUDART_2_OVER_PI 6.3661977236758138e-1	#define CUDART_2_OVER_PI 6.3661977236758138e-1
	#define CUDART_PI 3.1415926535897931e+0	#define CUDART_PI 3.1415926535897931e+0
	#define CUDART_PI_HI 3.1415926535897931e+0	#define CUDART_PI_HI 3.1415926535897931e+0
	#define CUDART_PI_LO 1.2246467991473532e-16	#define CUDART_PI_LO 1.2246467991473532e-16
	#define CUDART_SQRT_2PI_HI 2.5066282746310007e+0	#define CUDART_SQRT_2PI_HI 2.5066282746310007e+0
	#define CUDART_SQRT_2PI_LO (-1.8328579980459167e-16)	#define CUDART_SQRT_2PI_LO (-1.8328579980459167e-16)

End of changes. 1 change blocks.
	0 lines changed or deleted	2 lines changed or added