| device_functions.h | | device_functions.h | |
| | | | |
| skipping to change at line 139 | | skipping to change at line 139 | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __log10f(float) __THROW; | | extern __device__ float __log10f(float) __THROW; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __logf(float) __THROW; | | extern __device__ float __logf(float) __THROW; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __powf(float, float) __THROW; | | extern __device__ float __powf(float, float) __THROW; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| | | extern __device__ int __float2int_rn(float); | |
| | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __float2int_rz(float); | | extern __device__ int __float2int_rz(float); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __float2int_ru(float); | | extern __device__ int __float2int_ru(float); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __float2int_rd(float); | | extern __device__ int __float2int_rd(float); | |
|
| /*DEVICE_BUILTIN*/ | | | |
| extern __device__ int __float2int_rn(float); | | | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| | | extern __device__ unsigned int __float2uint_rn(float); | |
| | | /*DEVICE_BUILTIN*/ | |
| extern __device__ unsigned int __float2uint_rz(float); | | extern __device__ unsigned int __float2uint_rz(float); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ unsigned int __float2uint_ru(float); | | extern __device__ unsigned int __float2uint_ru(float); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ unsigned int __float2uint_rd(float); | | extern __device__ unsigned int __float2uint_rd(float); | |
|
| /*DEVICE_BUILTIN*/ | | | |
| extern __device__ unsigned int __float2uint_rn(float); | | | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| | | extern __device__ float __int2float_rn(int); | |
| | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __int2float_rz(int); | | extern __device__ float __int2float_rz(int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __int2float_ru(int); | | extern __device__ float __int2float_ru(int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __int2float_rd(int); | | extern __device__ float __int2float_rd(int); | |
|
| /*DEVICE_BUILTIN*/ | | | |
| extern __device__ float __int2float_rn(int); | | | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| | | extern __device__ float __uint2float_rn(unsigned int); | |
| | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __uint2float_rz(unsigned int); | | extern __device__ float __uint2float_rz(unsigned int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __uint2float_ru(unsigned int); | | extern __device__ float __uint2float_ru(unsigned int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __uint2float_rd(unsigned int); | | extern __device__ float __uint2float_rd(unsigned int); | |
|
| /*DEVICE_BUILTIN*/ | | | |
| extern __device__ float __uint2float_rn(unsigned int); | | | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| | | extern __device__ long long int __float2ll_rn(float); | |
| | | /*DEVICE_BUILTIN*/ | |
| extern __device__ long long int __float2ll_rz(float); | | extern __device__ long long int __float2ll_rz(float); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| extern __device__ long long int __float2ll_rn(float); | | extern __device__ long long int __float2ll_ru(float); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ long long int __float2ll_rd(float); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| | | extern __device__ unsigned long long int __float2ull_rn(float); | |
| | | /*DEVICE_BUILTIN*/ | |
| extern __device__ unsigned long long int __float2ull_rz(float); | | extern __device__ unsigned long long int __float2ull_rz(float); | |
|
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ unsigned long long int __float2ull_ru(float); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ unsigned long long int __float2ull_rd(float); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __ll2float_rn(long long int); | | extern __device__ float __ll2float_rn(long long int); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __ull2float_rn(unsigned long long int); | | extern __device__ float __ull2float_rn(unsigned long long int); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| | | extern __device__ unsigned short __float2half_rn(float); | |
| | | /*DEVICE_BUILTIN*/ | |
| | | extern __device__ float __half2float(unsigned short); | |
| | | | |
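The newly added __float2int_rn (together with the _rz/_ru/_rd variants around it) exposes all four IEEE rounding directions for float-to-int conversion. A minimal sketch of how they differ, assuming a CUDA toolchain (kernel and buffer names are illustrative):

    /* Hedged sketch: the four rounding directions on the same input.
       For x = 2.5f the results are {2, 2, 3, 2}: round-to-nearest-even
       picks the even neighbor, _rz truncates, _ru/_rd round up/down. */
    __global__ void float2int_demo(float x, int *out)
    {
        out[0] = __float2int_rn(x);  /* nearest-even */
        out[1] = __float2int_rz(x);  /* toward zero  */
        out[2] = __float2int_ru(x);  /* toward +inf  */
        out[3] = __float2int_rd(x);  /* toward -inf  */
    }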
| | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __fadd_rn(float, float); | | extern __device__ float __fadd_rn(float, float); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __fadd_rz(float, float); | | extern __device__ float __fadd_rz(float, float); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __fadd_ru(float, float); | | extern __device__ float __fadd_ru(float, float); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __fadd_rd(float, float); | | extern __device__ float __fadd_rd(float, float); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ float __fmul_rn(float, float); | | extern __device__ float __fmul_rn(float, float); | |
| | | | |
| skipping to change at line 260 | | skipping to change at line 275 | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __clzll(long long int); | | extern __device__ int __clzll(long long int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __ffsll(long long int); | | extern __device__ int __ffsll(long long int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __popcll(unsigned long long int); | | extern __device__ int __popcll(unsigned long long int); | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ unsigned long long int __brevll(unsigned long long int); | | extern __device__ unsigned long long int __brevll(unsigned long long int); | |
| | | | |
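The 64-bit bit-manipulation intrinsics declared here mirror their 32-bit counterparts. A hedged usage sketch (kernel name illustrative):

    /* Hedged sketch: for v = 1ULL the results are clz = 63, ffs = 1
       (1-based index of the least significant set bit), popc = 1, and
       __brevll mirrors the word, giving 0x8000000000000000. */
    __global__ void bits64_demo(unsigned long long v, int *out,
                                unsigned long long *rev)
    {
        out[0] = __clzll((long long)v);   /* count leading zeros */
        out[1] = __ffsll((long long)v);   /* find first set bit  */
        out[2] = __popcll(v);             /* population count    */
        *rev   = __brevll(v);             /* bit reversal        */
    }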
|
| #if !defined(CUDA_NO_SM_13_DOUBLE_INTRINSICS) | | #if (__CUDA_ARCH__ >= 130) | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ int __double2int_rz(double); | | extern __device__ int __double2int_rz(double); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ unsigned int __double2uint_rz(double); | | extern __device__ unsigned int __double2uint_rz(double); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ long long int __double2ll_rz(double); | | extern __device__ long long int __double2ll_rz(double); | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| extern __device__ unsigned long long int __double2ull_rz(double); | | extern __device__ unsigned long long int __double2ull_rz(double); | |
| | | | |
|
| #endif /* ! CUDA_NO_SM_13_DOUBLE_INTRINSICS */ | | #endif /* __CUDA_ARCH__ >= 130 */ | |
| | | | |
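The guard change above replaces the user-controlled opt-out macro with an architecture test: the double-precision intrinsics are now visible only when compiling for sm_13 or newer. A hedged sketch of what this means for device code (the fallback branch is illustrative):

    /* Hedged sketch: __double2int_rz is available only under the new
       guard; on older targets doubles are demoted to float anyway. */
    __global__ void d2i_demo(double d, int *out)
    {
    #if (__CUDA_ARCH__ >= 130)
        *out = __double2int_rz(d);   /* true double -> int, toward zero */
    #else
        *out = (int)(float)d;        /* pre-sm_13 fallback (illustrative) */
    #endif
    }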
| } | | } | |
| | | | |
| /******************************************************************************* | | /******************************************************************************* | |
| *                                                                             * | | *                                                                             * | |
| *                                                                             * | | *                                                                             * | |
| *                                                                             * | | *                                                                             * | |
| *******************************************************************************/ | | *******************************************************************************/ | |
| | | | |
| static __inline__ __device__ int mulhi(int a, int b) | | static __inline__ __device__ int mulhi(int a, int b) | |
| | | | |
| skipping to change at line 751 | | skipping to change at line 766 | |
| inc = abs(rem0) < abs(rem1); | | inc = abs(rem0) < abs(rem1); | |
| resi = ((expo_res << 23) + r + inc); | | resi = ((expo_res << 23) + r + inc); | |
| if (resi != 0x00800000) resi = 0; | | if (resi != 0x00800000) resi = 0; | |
| return __int_as_float(sign | resi); | | return __int_as_float(sign | resi); | |
| } | | } | |
| } | | } | |
| if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | | if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | |
| divisor *= 0.25f; | | divisor *= 0.25f; | |
| dividend *= 0.25f; | | dividend *= 0.25f; | |
| } | | } | |
|
| return dividend / divisor; | | return __fdividef (dividend, divisor); | |
| } | | } | |
| | | | |
| __device_func__(float __fdiv_rz (float dividend, float divisor)) | | __device_func__(float __fdiv_rz (float dividend, float divisor)) | |
| { | | { | |
| unsigned long long prod; | | unsigned long long prod; | |
| unsigned r, f, x, y, expox, expoy, sign; | | unsigned r, f, x, y, expox, expoy, sign; | |
| unsigned expo_res; | | unsigned expo_res; | |
| unsigned resi, cvtxi, cvtyi; | | unsigned resi, cvtxi, cvtyi; | |
| float t; | | float t; | |
| | | | |
| | | | |
| skipping to change at line 819 | | skipping to change at line 834 | |
| if (rem1 < 0) r--; | | if (rem1 < 0) r--; | |
| resi = ((expo_res << 23) + r); | | resi = ((expo_res << 23) + r); | |
| if (resi != 0x00800000) resi = 0; | | if (resi != 0x00800000) resi = 0; | |
| return __int_as_float(sign | resi); | | return __int_as_float(sign | resi); | |
| } | | } | |
| } | | } | |
| if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | | if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | |
| divisor *= 0.25f; | | divisor *= 0.25f; | |
| dividend *= 0.25f; | | dividend *= 0.25f; | |
| } | | } | |
|
| return dividend / divisor; | | return __fdividef (dividend, divisor); | |
| } | | } | |
| | | | |
| __device_func__(float __fdiv_ru (float dividend, float divisor)) | | __device_func__(float __fdiv_ru (float dividend, float divisor)) | |
| { | | { | |
| unsigned long long prod; | | unsigned long long prod; | |
| unsigned r, f, x, y, expox, expoy, sign; | | unsigned r, f, x, y, expox, expoy, sign; | |
| unsigned expo_res; | | unsigned expo_res; | |
| unsigned resi, cvtxi, cvtyi; | | unsigned resi, cvtxi, cvtyi; | |
| float t; | | float t; | |
| | | | |
| | | | |
| skipping to change at line 889 | | skipping to change at line 904 | |
| if ((rem1 > 0) && (!sign)) r++; | | if ((rem1 > 0) && (!sign)) r++; | |
| resi = ((expo_res << 23) + r); | | resi = ((expo_res << 23) + r); | |
| if (resi != 0x00800000) resi = 0; | | if (resi != 0x00800000) resi = 0; | |
| return __int_as_float(sign | resi); | | return __int_as_float(sign | resi); | |
| } | | } | |
| } | | } | |
| if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | | if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | |
| divisor *= 0.25f; | | divisor *= 0.25f; | |
| dividend *= 0.25f; | | dividend *= 0.25f; | |
| } | | } | |
|
| return dividend / divisor; | | return __fdividef (dividend, divisor); | |
| } | | } | |
| | | | |
| __device_func__(float __fdiv_rd (float dividend, float divisor)) | | __device_func__(float __fdiv_rd (float dividend, float divisor)) | |
| { | | { | |
| unsigned long long prod; | | unsigned long long prod; | |
| unsigned r, f, x, y, expox, expoy, sign; | | unsigned r, f, x, y, expox, expoy, sign; | |
| unsigned expo_res; | | unsigned expo_res; | |
| unsigned resi, cvtxi, cvtyi; | | unsigned resi, cvtxi, cvtyi; | |
| float t; | | float t; | |
| | | | |
| | | | |
| skipping to change at line 959 | | skipping to change at line 974 | |
| if ((rem1 > 0) && (sign)) r++; | | if ((rem1 > 0) && (sign)) r++; | |
| resi = ((expo_res << 23) + r); | | resi = ((expo_res << 23) + r); | |
| if (resi != 0x00800000) resi = 0; | | if (resi != 0x00800000) resi = 0; | |
| return __int_as_float(sign | resi); | | return __int_as_float(sign | resi); | |
| } | | } | |
| } | | } | |
| if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | | if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | |
| divisor *= 0.25f; | | divisor *= 0.25f; | |
| dividend *= 0.25f; | | dividend *= 0.25f; | |
| } | | } | |
|
| return dividend / divisor; | | return __fdividef (dividend, divisor); | |
| } | | } | |
| | | | |
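Each of the __fdiv_* kernels above ends with the same slow path: when the divisor's magnitude exceeds 2^126, both operands are pre-scaled by 0.25f before the fall-through divide (now routed through __fdividef). The scale is exact because it only decrements exponents, so the quotient is unchanged while the divisor is pulled back into the range where a reciprocal cannot underflow. A minimal sketch of the shared idea, with illustrative names:

    /* Hedged sketch of the slow-path scaling trick used above: a 0.25f
       pre-scale is exact in binary FP, so dividend/divisor is unchanged,
       but the divisor leaves the range |b| > 2^126 where a reciprocal-
       based divide such as __fdividef would underflow to zero. */
    static __device__ float scaled_div(float dividend, float divisor)
    {
        if (fabsf(divisor) > CUDART_TWO_TO_126_F) {
            divisor  *= 0.25f;
            dividend *= 0.25f;
        }
        return __fdividef(dividend, divisor);
    }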
| __device_func__(float __fadd_ru (float a, float b)) | | __device_func__(float __fadd_ru (float a, float b)) | |
| { | | { | |
| unsigned int expo_x, expo_y; | | unsigned int expo_x, expo_y; | |
| unsigned int xxi, yyi, temp; | | unsigned int xxi, yyi, temp; | |
| | | | |
| xxi = __float_as_int(a); | | xxi = __float_as_int(a); | |
| yyi = __float_as_int(b); | | yyi = __float_as_int(b); | |
| | | | |
| | | | |
| skipping to change at line 2921 | | skipping to change at line 2937 | |
| while ((int)cvty.i >= 0) { | | while ((int)cvty.i >= 0) { | |
| expoy--; | | expoy--; | |
| cvty.i = cvty.i + cvty.i; | | cvty.i = cvty.i + cvty.i; | |
| } | | } | |
| cvty.i = cvty.i >> 8; | | cvty.i = cvty.i >> 8; | |
| } | | } | |
| goto divide; | | goto divide; | |
| } | | } | |
| } | | } | |
| | | | |
|
| __device_func__(float __internal_fmul_kernel2 (float a, float b, | | __device_func__(float __internal_fmul_kernel (float a, float b, | |
| enum cudaRoundMode mode)) | | enum cudaRoundMode mode)) | |
| { | | { | |
| unsigned long long product; | | unsigned long long product; | |
| volatile union __cudart_FloatUintCvt xx, yy; | | volatile union __cudart_FloatUintCvt xx, yy; | |
|
| | | | |
| unsigned expo_x, expo_y; | | unsigned expo_x, expo_y; | |
| | | | |
| xx.f = a; | | xx.f = a; | |
| yy.f = b; | | yy.f = b; | |
| | | | |
| expo_y = 0xFF; | | expo_y = 0xFF; | |
| expo_x = expo_y & (xx.i >> 23); | | expo_x = expo_y & (xx.i >> 23); | |
| expo_x = expo_x - 1; | | expo_x = expo_x - 1; | |
| expo_y = expo_y & (yy.i >> 23); | | expo_y = expo_y & (yy.i >> 23); | |
| expo_y = expo_y - 1; | | expo_y = expo_y - 1; | |
| | | | |
| skipping to change at line 3418 | | skipping to change at line 3433 | |
| } else if (mode == cudaRoundPosInf) { | | } else if (mode == cudaRoundPosInf) { | |
| xx = xx + (!expo_y && temp); | | xx = xx + (!expo_y && temp); | |
| } else if (mode == cudaRoundMinInf) { | | } else if (mode == cudaRoundMinInf) { | |
| xx = xx + (expo_y && temp); | | xx = xx + (expo_y && temp); | |
| } | | } | |
| xx = expo_y + xx; /* add in sign bit */ | | xx = expo_y + xx; /* add in sign bit */ | |
| cvt.i = xx; | | cvt.i = xx; | |
| return cvt.f; | | return cvt.f; | |
| } | | } | |
| | | | |
|
| /* NOTE: Does not currently support round-to-nearest, round-to-zero */ | | | |
| __device_func__(float __internal_fadd_kernel2 (float a, float b, | | __device_func__(float __internal_fadd_kernel (float a, float b, | |
| enum cudaRoundMode mode)) | | enum cudaRoundMode mode)) | |
| { | | { | |
| volatile union __cudart_FloatUintCvt xx, yy; | | volatile union __cudart_FloatUintCvt xx, yy; | |
| unsigned int expo_x; | | unsigned int expo_x; | |
| unsigned int expo_y; | | unsigned int expo_y; | |
| unsigned int temp; | | unsigned int temp; | |
| | | | |
| xx.f = a; | | xx.f = a; | |
| yy.f = b; | | yy.f = b; | |
| | | | |
| /* make bigger operand the augend */ | | /* make bigger operand the augend */ | |
| | | | |
| skipping to change at line 3466 | | skipping to change at line 3480 | |
| | | | |
| if ((int)temp < 0) { | | if ((int)temp < 0) { | |
| /* signs differ, effective subtraction */ | | /* signs differ, effective subtraction */ | |
| temp = 32 - expo_y; | | temp = 32 - expo_y; | |
| temp = (expo_y) ? (yy.i << temp) : 0; | | temp = (expo_y) ? (yy.i << temp) : 0; | |
| temp = (unsigned)(-((int)temp)); | | temp = (unsigned)(-((int)temp)); | |
| xx.i = xx.i - (yy.i >> expo_y) - (temp ? 1 : 0); | | xx.i = xx.i - (yy.i >> expo_y) - (temp ? 1 : 0); | |
| if (xx.i & 0x00800000) { | | if (xx.i & 0x00800000) { | |
| if (expo_x <= 0xFD) { | | if (expo_x <= 0xFD) { | |
| xx.i = xx.i + (expo_x << 23); | | xx.i = xx.i + (expo_x << 23); | |
|
| if (mode == cudaRoundMinInf) { | | if (mode == cudaRoundNearest) { | |
| xx.i += (temp && (xx.i & 0x80000000)); | | if (temp < 0x80000000) return xx.f; | |
| | | xx.i += ((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31)); | |
| | | } else if (mode == cudaRoundZero) { | |
| } else if (mode == cudaRoundPosInf) { | | } else if (mode == cudaRoundPosInf) { | |
| xx.i += (temp && !(xx.i & 0x80000000)); | | xx.i += (temp && !(xx.i & 0x80000000)); | |
|
| | | } else if (mode == cudaRoundMinInf) { | |
| | | xx.i += (temp && (xx.i & 0x80000000)); | |
| } | | } | |
| return xx.f; | | return xx.f; | |
| } | | } | |
| } else { | | } else { | |
| if ((temp | (xx.i << 1)) == 0) { | | if ((temp | (xx.i << 1)) == 0) { | |
| /* operands cancelled, resulting in a clean zero */ | | /* operands cancelled, resulting in a clean zero */ | |
| if (mode == cudaRoundMinInf) { | | if (mode == cudaRoundMinInf) { | |
| xx.i = 0x80000000; | | xx.i = 0x80000000; | |
|
| } else if (mode == cudaRoundPosInf) { | | } else { | |
| xx.i = 0; | | xx.i = 0; | |
| } | | } | |
| return xx.f; | | return xx.f; | |
| } | | } | |
| /* normalize result */ | | /* normalize result */ | |
| yy.i = xx.i & 0x80000000; | | yy.i = xx.i & 0x80000000; | |
| do { | | do { | |
| xx.i = (xx.i << 1) | (temp >> 31); | | xx.i = (xx.i << 1) | (temp >> 31); | |
| temp <<= 1; | | temp <<= 1; | |
| expo_x--; | | expo_x--; | |
| | | | |
| skipping to change at line 3500 | | skipping to change at line 3518 | |
| xx.i = xx.i | yy.i; | | xx.i = xx.i | yy.i; | |
| } | | } | |
| } else { | | } else { | |
| /* signs are the same, effective addition */ | | /* signs are the same, effective addition */ | |
| temp = 32 - expo_y; | | temp = 32 - expo_y; | |
| temp = (expo_y) ? (yy.i << temp) : 0; | | temp = (expo_y) ? (yy.i << temp) : 0; | |
| xx.i = xx.i + (yy.i >> expo_y); | | xx.i = xx.i + (yy.i >> expo_y); | |
| if (!(xx.i & 0x01000000)) { | | if (!(xx.i & 0x01000000)) { | |
| if (expo_x <= 0xFD) { | | if (expo_x <= 0xFD) { | |
| xx.i = xx.i + (expo_x << 23); | | xx.i = xx.i + (expo_x << 23); | |
|
| if (mode == cudaRoundMinInf) { | | if (mode == cudaRoundNearest) { | |
| xx.i += (temp && (xx.i & 0x80000000)); | | if (temp < 0x80000000) return xx.f; | |
| | | xx.i += ((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31)); | |
| | | } else if (mode == cudaRoundZero) { | |
| } else if (mode == cudaRoundPosInf) { | | } else if (mode == cudaRoundPosInf) { | |
| xx.i += (temp && !(xx.i & 0x80000000)); | | xx.i += (temp && !(xx.i & 0x80000000)); | |
|
| | | } else if (mode == cudaRoundMinInf) { | |
| | | xx.i += (temp && (xx.i & 0x80000000)); | |
| } | | } | |
| return xx.f; | | return xx.f; | |
| } | | } | |
| } else { | | } else { | |
| /* normalize result */ | | /* normalize result */ | |
| temp = (xx.i << 31) | (temp >> 1); | | temp = (xx.i << 31) | (temp >> 1); | |
| xx.i = ((xx.i & 0x80000000) | (xx.i >> 1)) & ~0x40000000; | | xx.i = ((xx.i & 0x80000000) | (xx.i >> 1)) & ~0x40000000; | |
| expo_x++; | | expo_x++; | |
| } | | } | |
| } | | } | |
| if (expo_x <= 0xFD) { | | if (expo_x <= 0xFD) { | |
|
| if (mode == cudaRoundMinInf) { | | xx.i = xx.i + (expo_x << 23); | |
| xx.i += (temp && (xx.i & 0x80000000)); | | if (mode == cudaRoundNearest) { | |
| | | if (temp < 0x80000000) return xx.f; | |
| | | xx.i += ((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31)); | |
| | | } else if (mode == cudaRoundZero) { | |
| } else if (mode == cudaRoundPosInf) { | | } else if (mode == cudaRoundPosInf) { | |
| xx.i += (temp && !(xx.i & 0x80000000)); | | xx.i += (temp && !(xx.i & 0x80000000)); | |
|
| | | } else if (mode == cudaRoundMinInf) { | |
| | | xx.i += (temp && (xx.i & 0x80000000)); | |
| } | | } | |
|
| xx.i = xx.i + (expo_x << 23); | | | |
| return xx.f; | | return xx.f; | |
| } | | } | |
| if ((int)expo_x >= 254) { | | if ((int)expo_x >= 254) { | |
| /* overflow: return infinity or largest normal */ | | /* overflow: return infinity or largest normal */ | |
| temp = xx.i & 0x80000000; | | temp = xx.i & 0x80000000; | |
|
| if (mode == cudaRoundMinInf) { | | if (mode == cudaRoundNearest) { | |
| | | xx.i = (temp) | 0x7f800000; | |
| | | } else if (mode == cudaRoundZero) { | |
| | | xx.i = (temp) | 0x7f7fffff; | |
| | | } else if (mode == cudaRoundMinInf) { | |
| xx.i = (temp ? 0xFF800000 : 0x7f7fffff); | | xx.i = (temp ? 0xFF800000 : 0x7f7fffff); | |
| } else if (mode == cudaRoundPosInf) { | | } else if (mode == cudaRoundPosInf) { | |
| xx.i = (temp ? 0xff7fffff : 0x7F800000); | | xx.i = (temp ? 0xff7fffff : 0x7F800000); | |
| } | | } | |
| return xx.f; | | return xx.f; | |
| } | | } | |
| /* underflow: denormal, or smallest normal */ | | /* underflow: denormal, or smallest normal */ | |
| expo_y = expo_x + 32; | | expo_y = expo_x + 32; | |
| yy.i = xx.i & 0x80000000; | | yy.i = xx.i & 0x80000000; | |
| xx.i = xx.i & ~0xff000000; | | xx.i = xx.i & ~0xff000000; | |
| expo_x = (unsigned)(-((int)expo_x)); | | expo_x = (unsigned)(-((int)expo_x)); | |
| temp = xx.i << expo_y | ((temp) ? 1 : 0); | | temp = xx.i << expo_y | ((temp) ? 1 : 0); | |
| xx.i = yy.i | (xx.i >> expo_x); | | xx.i = yy.i | (xx.i >> expo_x); | |
|
| if (mode == cudaRoundMinInf) { | | if (mode == cudaRoundNearest) { | |
| xx.i += (temp && yy.i); | | xx.i += (temp == 0x80000000) ? (xx.i & 1) : (temp >> 31); | |
| | | } else if (mode == cudaRoundZero) { | |
| } else if (mode == cudaRoundPosInf) { | | } else if (mode == cudaRoundPosInf) { | |
| xx.i += (temp && !yy.i); | | xx.i += (temp && !yy.i); | |
|
| | | } else if (mode == cudaRoundMinInf) { | |
| | | xx.i += (temp && yy.i); | |
| } | | } | |
| return xx.f; | | return xx.f; | |
| } else { | | } else { | |
| /* handle special cases separately */ | | /* handle special cases separately */ | |
| if (!(yy.i << 1)) { | | if (!(yy.i << 1)) { | |
| if (mode == cudaRoundMinInf) { | | if (mode == cudaRoundMinInf) { | |
| if (!(xx.i << 1)) { | | if (!(xx.i << 1)) { | |
| xx.i = xx.i | yy.i; | | xx.i = xx.i | yy.i; | |
| } | | } | |
|
| } else if (mode == cudaRoundPosInf) { | | } else { | |
| if (xx.i == 0x80000000) { | | if (xx.i == 0x80000000) { | |
| xx.i = yy.i; | | xx.i = yy.i; | |
| } | | } | |
| } | | } | |
| if ((xx.i << 1) > 0xff000000) { | | if ((xx.i << 1) > 0xff000000) { | |
| xx.i |= 0x00400000; | | xx.i |= 0x00400000; | |
| } | | } | |
| return xx.f; | | return xx.f; | |
| } | | } | |
| if ((expo_y != 254) && (expo_x != 254)) { | | if ((expo_y != 254) && (expo_x != 254)) { | |
| | | | |
| skipping to change at line 3680 | | skipping to change at line 3713 | |
| return __internal_fdiv_kernel (a, b, cudaRoundMinInf); | | return __internal_fdiv_kernel (a, b, cudaRoundMinInf); | |
| } | | } | |
| | | | |
| __device_func__(float __fdiv_ru (float a, float b)) | | __device_func__(float __fdiv_ru (float a, float b)) | |
| { | | { | |
| return __internal_fdiv_kernel (a, b, cudaRoundPosInf); | | return __internal_fdiv_kernel (a, b, cudaRoundPosInf); | |
| } | | } | |
| | | | |
| __device_func__(float __fadd_rd (float a, float b)) | | __device_func__(float __fadd_rd (float a, float b)) | |
| { | | { | |
|
| return __internal_fadd_kernel2 (a, b, cudaRoundMinInf); | | return __internal_fadd_kernel (a, b, cudaRoundMinInf); | |
| } | | } | |
| | | | |
| __device_func__(float __fadd_ru (float a, float b)) | | __device_func__(float __fadd_ru (float a, float b)) | |
| { | | { | |
|
| return __internal_fadd_kernel2 (a, b, cudaRoundPosInf); | | return __internal_fadd_kernel (a, b, cudaRoundPosInf); | |
| } | | } | |
| | | | |
| __device_func__(float __fmul_rd (float a, float b)) | | __device_func__(float __fmul_rd (float a, float b)) | |
| { | | { | |
|
| return __internal_fmul_kernel2 (a, b, cudaRoundMinInf); | | return __internal_fmul_kernel (a, b, cudaRoundMinInf); | |
| } | | } | |
| | | | |
| __device_func__(float __fmul_ru (float a, float b)) | | __device_func__(float __fmul_ru (float a, float b)) | |
| { | | { | |
|
| return __internal_fmul_kernel2 (a, b, cudaRoundPosInf); | | return __internal_fmul_kernel (a, b, cudaRoundPosInf); | |
| } | | } | |
| | | | |
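With the kernels renamed and driven by cudaRoundMode, the __fadd_rd/__fadd_ru and __fmul_rd/__fmul_ru wrappers now select true directed rounding. A hedged sketch of the observable effect:

    /* Hedged sketch: directed rounding brackets the exact result. With
       b = 2^-24 (half an ulp of 1.0f) the exact sum 1 + 2^-24 is not
       representable, so the two modes return adjacent floats. */
    __global__ void dirround_demo(float *out)
    {
        float a = 1.0f;
        float b = 1.0f / 16777216.0f;   /* exactly 2^-24 */
        out[0] = __fadd_rd(a, b);       /* 1.0f          (toward -inf) */
        out[1] = __fadd_ru(a, b);       /* 1.0f + 2^-23  (toward +inf) */
    }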
| __device_func__(float __fmaf_rn (float a, float b, float c)) | | __device_func__(float __fmaf_rn (float a, float b, float c)) | |
| { | | { | |
| return __internal_fmaf_kernel (a, b, c, cudaRoundNearest); | | return __internal_fmaf_kernel (a, b, c, cudaRoundNearest); | |
| } | | } | |
| | | | |
| __device_func__(float __fmaf_rz (float a, float b, float c)) | | __device_func__(float __fmaf_rz (float a, float b, float c)) | |
| { | | { | |
| return __internal_fmaf_kernel (a, b, c, cudaRoundZero); | | return __internal_fmaf_kernel (a, b, c, cudaRoundZero); | |
| | | | |
| skipping to change at line 3777 | | skipping to change at line 3810 | |
| { | | { | |
| long long int res; | | long long int res; | |
| res = __umul64hi(a, b); | | res = __umul64hi(a, b); | |
| if (a < 0LL) res = res - b; | | if (a < 0LL) res = res - b; | |
| if (b < 0LL) res = res - a; | | if (b < 0LL) res = res - a; | |
| return res; | | return res; | |
| } | | } | |
| | | | |
| __device_func__(float __saturatef(float a)) | | __device_func__(float __saturatef(float a)) | |
| { | | { | |
|
| if (__cuda___isnanf(a)) return 0.0f; // update of PTX spec 10/15/2008 | | if (__cuda___isnanf(a)) return 0.0f; /* update of PTX spec 10/15/2008 */ | |
| return a >= 1.0f ? 1.0f : a <= 0.0f ? 0.0f : a; | | return a >= 1.0f ? 1.0f : a <= 0.0f ? 0.0f : a; | |
| } | | } | |
| | | | |
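__saturatef clamps its argument to [0,1]; per the PTX-spec note carried in the comment above, NaN inputs now map to 0.0f. A hedged sketch of the contract:

    /* Hedged sketch of the __saturatef behavior shown above. */
    __global__ void saturate_demo(float *out)
    {
        out[0] = __saturatef(-0.5f);                       /* 0.0f  */
        out[1] = __saturatef(0.25f);                       /* 0.25f */
        out[2] = __saturatef(3.0f);                        /* 1.0f  */
        out[3] = __saturatef(__int_as_float(0x7fffffff));  /* NaN -> 0.0f */
    }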
| __device_func__(unsigned int __sad(int a, int b, unsigned int c)) | | __device_func__(unsigned int __sad(int a, int b, unsigned int c)) | |
| { | | { | |
| long long int diff = (long long int)a - (long long int)b; | | long long int diff = (long long int)a - (long long int)b; | |
| | | | |
| return (unsigned int)(__cuda_llabs(diff) + (long long int)c); | | return (unsigned int)(__cuda_llabs(diff) + (long long int)c); | |
| } | | } | |
| | | | |
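__sad computes |a - b| + c; the emulation above widens to 64 bits so the difference itself cannot overflow. A hedged example:

    /* Hedged sketch: |(-3) - 5| + 10 = 18. The 64-bit intermediate keeps
       INT_MIN - INT_MAX style differences from wrapping. */
    __global__ void sad_demo(unsigned int *out)
    {
        *out = __sad(-3, 5, 10u);   /* 18 */
    }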
| | | | |
| skipping to change at line 4106 | | skipping to change at line 4139 | |
| { | | { | |
| volatile union __cudart_FloatUintCvt res; | | volatile union __cudart_FloatUintCvt res; | |
| int shift; | | int shift; | |
| unsigned int t; | | unsigned int t; | |
| res.i = a; | | res.i = a; | |
| if (a == 0) return res.f; | | if (a == 0) return res.f; | |
| shift = __internal_normalize((unsigned int*)&res.i); | | shift = __internal_normalize((unsigned int*)&res.i); | |
| t = res.i << 24; | | t = res.i << 24; | |
| res.i = (res.i >> 8); | | res.i = (res.i >> 8); | |
| res.i += (127 + 30 - shift) << 23; | | res.i += (127 + 30 - shift) << 23; | |
|
| if ((rndMode == cudaRoundNearest) && (t >= 0x80000000)) { | | if (rndMode == cudaRoundNearest) { | |
| res.i += (t == 0x80000000) ? (res.i & 1) : (t >> 31); | | res.i += (t == 0x80000000) ? (res.i & 1) : (t >> 31); | |
| } | | } | |
| else if ((rndMode == cudaRoundPosInf) && t) { | | else if ((rndMode == cudaRoundPosInf) && t) { | |
| res.i++; | | res.i++; | |
| } | | } | |
| return res.f; | | return res.f; | |
| } | | } | |
| | | | |
| __device_func__(float __uint2float_rz(unsigned int a)) | | __device_func__(float __uint2float_rz(unsigned int a)) | |
| { | | { | |
| | | | |
| skipping to change at line 4144 | | skipping to change at line 4177 | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
| return __internal_uint2float_kernel(a, cudaRoundNearest); | | return __internal_uint2float_kernel(a, cudaRoundNearest); | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
| __device_func__(float __ll2float_rn(long long int a)) | | __device_func__(float __ll2float_rn(long long int a)) | |
| { | | { | |
| return (float)a; | | return (float)a; | |
| } | | } | |
| | | | |
|
| __device_func__(float __ull2float_rn(unsigned long long int a)) | | __device_func__(float __internal_ull2float_kernel(unsigned long long int a, enum cudaRoundMode rndMode)) | |
| { | | { | |
|
| #if defined(__MULTI_CORE__) | | | |
| return (float)a; | | | |
| #else /* __MULTI_CORE__ */ | | | |
| unsigned long long int temp; | | unsigned long long int temp; | |
| unsigned int res, t; | | unsigned int res, t; | |
| int shift; | | int shift; | |
| if (a == 0ULL) return 0.0f; | | if (a == 0ULL) return 0.0f; | |
| temp = a; | | temp = a; | |
| shift = __internal_normalize64(&temp); | | shift = __internal_normalize64(&temp); | |
| temp = (temp >> 8) | ((temp & 0xffULL) ? 1ULL : 0ULL); | | temp = (temp >> 8) | ((temp & 0xffULL) ? 1ULL : 0ULL); | |
| res = (unsigned int)(temp >> 32); | | res = (unsigned int)(temp >> 32); | |
| t = (unsigned int)temp; | | t = (unsigned int)temp; | |
| res += (127 + 62 - shift) << 23; /* add in exponent */ | | res += (127 + 62 - shift) << 23; /* add in exponent */ | |
|
| res += t == 0x80000000 ? res & 1 : t >> 31; | | if (rndMode == cudaRoundNearest) { | |
| | | res += (t == 0x80000000) ? (res & 1) : (t >> 31); | |
| | | } else if (rndMode == cudaRoundPosInf) { | |
| | | res += (t != 0); | |
| | | } | |
| return __int_as_float(res); | | return __int_as_float(res); | |
|
| | | } | |
| | | | |
| | | __device_func__(float __ull2float_rn(unsigned long long int a)) | |
| | | { | |
| | | #if defined(__MULTI_CORE__) | |
| | | return (float)a; | |
| | | #else /* __MULTI_CORE__ */ | |
| | | return __internal_ull2float_kernel(a, cudaRoundNearest); | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
|
| __device_func__(float __internal_fmul_kernel(float a, float b, int rndNearest)) | | __device_func__(unsigned short __float2half_rn(float f)) | |
| { | | { | |
|
| unsigned long long product; | | unsigned int x = __float_as_int (f); | |
| volatile union __cudart_FloatUintCvt xx, yy; | | unsigned int u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1; | |
| unsigned expo_x, expo_y; | | unsigned int sign, exponent, mantissa; | |
| | | | |
|
| xx.f = a; | | /* Get rid of +NaN/-NaN case first. */ | |
| yy.f = b; | | if (u > 0x7f800000) { | |
| | | return 0x7fff; | |
| | | } | |
| | | | |
|
| expo_y = 0xFF; | | sign = ((x >> 16) & 0x8000); | |
| expo_x = expo_y & (xx.i >> 23); | | | |
| expo_x = expo_x - 1; | | | |
| expo_y = expo_y & (yy.i >> 23); | | | |
| expo_y = expo_y - 1; | | | |
| | | | |
|
| if ((expo_x <= 0xFD) && | | /* Get rid of +Inf/-Inf, +0/-0. */ | |
| (expo_y <= 0xFD)) { | | if (u > 0x477fefff) { | |
| multiply: | | return sign | 0x7c00; | |
| expo_x = expo_x + expo_y; | | } | |
| expo_y = xx.i ^ yy.i; | | if (u < 0x33000001) { | |
| xx.i = xx.i & 0x00ffffff; | | return sign | 0x0000; | |
| yy.i = yy.i << 8; | | | |
| xx.i = xx.i | 0x00800000; | | | |
| yy.i = yy.i | 0x80000000; | | | |
| /* compute product */ | | | |
| product = ((unsigned long long)xx.i) * yy.i; | | | |
| expo_x = expo_x - 127 + 2; | | | |
| expo_y = expo_y & 0x80000000; | | | |
| xx.i = (unsigned int)(product >> 32); | | | |
| yy.i = (unsigned int)(product & 0xffffffff); | | | |
| /* normalize mantissa */ | | | |
| if (xx.i < 0x00800000) { | | | |
| xx.i = (xx.i << 1) | (yy.i >> 31); | | | |
| yy.i = (yy.i << 1); | | | |
| expo_x--; | | | |
| } | | | |
| if (expo_x <= 0xFD) { | | | |
| xx.i = xx.i | expo_y; /* OR in sign bit */ | | | |
| xx.i = xx.i + (expo_x << 23); /* add in exponent */ | | | |
| /* round result to nearest or even */ | | | |
| if (yy.i < 0x80000000) return xx.f; | | | |
| xx.i += (((yy.i == 0x80000000) ? (xx.i & 1) : (yy.i >> 31)) | | | |
| && rndNearest); | | | |
| return xx.f; | | | |
| } else if ((int)expo_x >= 254) { | | | |
| /* overflow: return infinity */ | | | |
| xx.i = (expo_y | 0x7F800000) - (!rndNearest); | | | |
| return xx.f; | | | |
| } else { | | | |
| /* zero, denormal, or smallest normal */ | | | |
| expo_x = ((unsigned int)-((int)expo_x)); | | | |
| if (expo_x > 25) { | | | |
| /* massive underflow: return 0 */ | | | |
| xx.i = expo_y; | | | |
| return xx.f; | | | |
| } else { | | | |
| yy.i = (xx.i << (32 - expo_x)) | ((yy.i) ? 1 : 0); | | | |
| xx.i = expo_y + (xx.i >> expo_x); | | | |
| xx.i += (((yy.i == 0x80000000) ? (xx.i & 1) : (yy.i >> 31)) | | | |
| && rndNearest); | | | |
| return xx.f; | | | |
| } | | | |
| } | | | |
| } else { | | | |
| product = xx.i ^ yy.i; | | | |
| product = product & 0x80000000; | | | |
| if (!(xx.i & 0x7fffffff)) { | | | |
| if (expo_y != 254) { | | | |
| xx.i = (unsigned int)product; | | | |
| return xx.f; | | | |
| } | | | |
| expo_y = yy.i << 1; | | | |
| if (expo_y == 0xFF000000) { | | | |
| xx.i = expo_y | 0x00C00000; | | | |
| } else { | | | |
| xx.i = yy.i | 0x00400000; | | | |
| } | | | |
| return xx.f; | | | |
| } | | | |
| if (!(yy.i & 0x7fffffff)) { | | | |
| if (expo_x != 254) { | | | |
| xx.i = (unsigned int)product; | | | |
| return xx.f; | | | |
| } | | | |
| expo_x = xx.i << 1; | | | |
| if (expo_x == 0xFF000000) { | | | |
| xx.i = expo_x | 0x00C00000; | | | |
| } else { | | | |
| xx.i = xx.i | 0x00400000; | | | |
| } | | | |
| return xx.f; | | | |
| } | | | |
| if ((expo_y != 254) && (expo_x != 254)) { | | | |
| expo_y++; | | | |
| expo_x++; | | | |
| if (expo_x == 0) { | | | |
| expo_y |= xx.i & 0x80000000; | | | |
| /* | | | |
| * If both operands are denormals, we only need to normalize | | | |
| * one of them as the result will be either a denormal or zero. | | | |
| */ | | | |
| xx.i = xx.i << 8; | | | |
| while (!(xx.i & 0x80000000)) { | | | |
| xx.i <<= 1; | | | |
| expo_x--; | | | |
| } | | | |
| xx.i = (xx.i >> 8) | (expo_y & 0x80000000); | | | |
| expo_y &= ~0x80000000; | | | |
| expo_y--; | | | |
| goto multiply; | | | |
| } | | | |
| if (expo_y == 0) { | | | |
| expo_x |= yy.i & 0x80000000; | | | |
| yy.i = yy.i << 8; | | | |
| while (!(yy.i & 0x80000000)) { | | | |
| yy.i <<= 1; | | | |
| expo_y--; | | | |
| } | | | |
| yy.i = (yy.i >> 8) | (expo_x & 0x80000000); | | | |
| expo_x &= ~0x80000000; | | | |
| expo_x--; | | | |
| goto multiply; | | | |
| } | | | |
| } | | | |
| expo_x = xx.i << 1; | | | |
| expo_y = yy.i << 1; | | | |
| /* if x is NaN, return x */ | | | |
| if (expo_x > 0xFF000000) { | | | |
| /* cvt any SNaNs to QNaNs */ | | | |
| xx.i = xx.i | 0x00400000; | | | |
| return xx.f; | | | |
| } | | | |
| /* if y is NaN, return y */ | | | |
| if (expo_y > 0xFF000000) { | | | |
| /* cvt any SNaNs to QNaNs */ | | | |
| xx.i = yy.i | 0x00400000; | | | |
| return xx.f; | | | |
| } | | | |
| xx.i = (unsigned int)product | 0x7f800000; | | | |
| return xx.f; | | | |
| } | | } | |
|
| } | | | |
| | | | |
| __device_func__(float __internal_fadd_kernel(float a, float b, int rndNearest)) | | | |
| { | | | |
| volatile union __cudart_FloatUintCvt xx, yy; | | | |
| unsigned int expo_x; | | | |
| unsigned int expo_y; | | | |
| unsigned int temp; | | | |
| | | | |
|
| xx.f = a; | | exponent = ((u >> 23) & 0xff); | |
| yy.f = b; | | mantissa = (u & 0x7fffff); | |
| | | | |
|
| /* make bigger operand the augend */ | | if (exponent > 0x70) { | |
| expo_y = yy.i << 1; | | shift = 13; | |
| if (expo_y > (xx.i << 1)) { | | exponent -= 0x70; | |
| expo_y = xx.i; | | } else { | |
| xx.i = yy.i; | | shift = 0x7e - exponent; | |
| yy.i = expo_y; | | exponent = 0; | |
| | | mantissa |= 0x800000; | |
| } | | } | |
|
| | | lsb = (1 << shift); | |
| | | lsb_s1 = (lsb >> 1); | |
| | | lsb_m1 = (lsb - 1); | |
| | | | |
|
| temp = 0xff; | | /* Round to nearest even. */ | |
| expo_x = temp & (xx.i >> 23); | | remainder = (mantissa & lsb_m1); | |
| expo_x = expo_x - 1; | | mantissa >>= shift; | |
| expo_y = temp & (yy.i >> 23); | | if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) { | |
| expo_y = expo_y - 1; | | ++mantissa; | |
| | | if (!(mantissa & 0x3ff)) { | |
| | | ++exponent; | |
| | | mantissa = 0; | |
| | | } | |
| | | } | |
| | | | |
|
| if ((expo_x <= 0xFD) && | | return sign | (exponent << 10) | mantissa; | |
| (expo_y <= 0xFD)) { | | } | |
| | | | |
|
| add: | | __device_func__(float __half2float(unsigned short h)) | |
| expo_y = expo_x - expo_y; | | { | |
| if (expo_y > 25) { | | unsigned int sign = ((h >> 15) & 1); | |
| expo_y = 31; | | unsigned int exponent = ((h >> 10) & 0x1f); | |
| } | | unsigned int mantissa = ((h & 0x3ff) << 13); | |
| temp = xx.i ^ yy.i; | | | |
| xx.i = xx.i & ~0x7f000000; | | | |
| xx.i = xx.i | 0x00800000; | | | |
| yy.i = yy.i & ~0xff000000; | | | |
| yy.i = yy.i | 0x00800000; | | | |
| | | | |
|
| if ((int)temp < 0) { | | if (exponent == 0x1f) { /* NaN or Inf */ | |
| /* signs differ, effective subtraction */ | | mantissa = (mantissa | |
| temp = 32 - expo_y; | | ? (sign = 0, 0x7fffff) | |
| temp = (expo_y) ? (yy.i << temp) : 0; | | : 0); | |
| temp = (unsigned int)(-((int)temp)); | | exponent = 0xff; | |
| xx.i = xx.i - (yy.i >> expo_y) - (temp ? 1 : 0); | | } else if (!exponent) { /* Denorm or Zero */ | |
| if (xx.i & 0x00800000) { | | if (mantissa) { | |
| if (expo_x <= 0xFD) { | | unsigned int msb; | |
| xx.i = xx.i & ~0x00800000; /* lop off integer bit */ | | exponent = 0x71; | |
| xx.i = (xx.i + (expo_x << 23)) + 0x00800000; | | do { | |
| if (temp < 0x80000000) return xx.f; | | msb = (mantissa & 0x400000); | |
| xx.i += (((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31)) | | mantissa <<= 1; /* normalize */ | |
| && rndNearest); | | --exponent; | |
| return xx.f; | | } while (!msb); | |
| } | | mantissa &= 0x7fffff; /* 1.mantissa is implicit */ | |
| } else { | | | |
| if ((temp | (xx.i << 1)) == 0) { | | | |
| /* operands cancelled, resulting in a clean zero */ | | | |
| xx.i = 0; | | | |
| return xx.f; | | | |
| } | | | |
| /* normalize result */ | | | |
| yy.i = xx.i & 0x80000000; | | | |
| do { | | | |
| xx.i = (xx.i << 1) | (temp >> 31); | | | |
| temp <<= 1; | | | |
| expo_x--; | | | |
| } while (!(xx.i & 0x00800000)); | | | |
| xx.i = xx.i | yy.i; | | | |
| } | | | |
| } else { | | | |
| /* signs are the same, effective addition */ | | | |
| temp = 32 - expo_y; | | | |
| temp = (expo_y) ? (yy.i << temp) : 0; | | | |
| xx.i = xx.i + (yy.i >> expo_y); | | | |
| if (!(xx.i & 0x01000000)) { | | | |
| if (expo_x <= 0xFD) { | | | |
| expo_y = xx.i & 1; | | | |
| xx.i = xx.i + (expo_x << 23); | | | |
| if (temp < 0x80000000) return xx.f; | | | |
| xx.i += (((temp == 0x80000000) ? expo_y : (temp >> 31)) | | | |
| && rndNearest); | | | |
| return xx.f; | | | |
| } | | | |
| } else { | | | |
| /* normalize result */ | | | |
| temp = (xx.i << 31) | (temp >> 1); | | | |
| /* not ANSI compliant: xx.i = (((int)xx.i)>>1) & ~0x40000000 */ | | | |
| xx.i = ((xx.i & 0x80000000) | (xx.i >> 1)) & ~0x40000000; | | | |
| expo_x++; | | | |
| } | | | |
| } | | | |
| if (expo_x <= 0xFD) { | | | |
| expo_y = xx.i & 1; | | | |
| xx.i += (((temp == 0x80000000) ? expo_y : (temp >> 31)) | | | |
| && rndNearest); | | | |
| xx.i = xx.i + (expo_x << 23); | | | |
| return xx.f; | | | |
| } | | | |
| if ((int)expo_x >= 254) { | | | |
| /* overflow: return infinity */ | | | |
| xx.i = ((xx.i & 0x80000000) | 0x7f800000) - (!rndNearest); | | | |
| return xx.f; | | | |
| } | | } | |
|
| /* underflow: denormal, or smallest normal */ | | | |
| expo_y = expo_x + 32; | | | |
| yy.i = xx.i & 0x80000000; | | | |
| xx.i = xx.i & ~0xff000000; | | | |
| | | | |
| expo_x = (unsigned int)(-((int)expo_x)); | | | |
| temp = xx.i << expo_y | ((temp) ? 1 : 0); | | | |
| xx.i = yy.i | (xx.i >> expo_x); | | | |
| xx.i += (((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31)) | | | |
| && rndNearest); | | | |
| return xx.f; | | | |
| } else { | | } else { | |
|
| /* handle special cases separately */ | | exponent += 0x70; | |
| if (!(yy.i << 1)) { | | | |
| if (xx.i == 0x80000000) { | | | |
| xx.i = yy.i; | | | |
| } | | | |
| if ((xx.i << 1) > 0xff000000) { | | | |
| xx.i |= 0x00400000; | | | |
| } | | | |
| return xx.f; | | | |
| } | | | |
| if ((expo_y != 254) && (expo_x != 254)) { | | | |
| /* remove sign bits */ | | | |
| if (expo_x == (unsigned int) -1) { | | | |
| temp = xx.i & 0x80000000; | | | |
| xx.i = xx.i << 8; | | | |
| while (!(xx.i & 0x80000000)) { | | | |
| xx.i <<= 1; | | | |
| expo_x--; | | | |
| } | | | |
| expo_x++; | | | |
| xx.i = (xx.i >> 8) | temp; | | | |
| } | | | |
| if (expo_y == (unsigned int) -1) { | | | |
| temp = yy.i & 0x80000000; | | | |
| yy.i = yy.i << 8; | | | |
| while (!(yy.i & 0x80000000)) { | | | |
| yy.i <<= 1; | | | |
| expo_y--; | | | |
| } | | | |
| expo_y++; | | | |
| yy.i = (yy.i >> 8) | temp; | | | |
| } | | | |
| goto add; | | | |
| } | | | |
| expo_x = xx.i << 1; | | | |
| expo_y = yy.i << 1; | | | |
| /* if x is NaN, return x */ | | | |
| if (expo_x > 0xff000000) { | | | |
| /* cvt any SNaNs to QNaNs */ | | | |
| xx.i = xx.i | 0x00400000; | | | |
| return xx.f; | | | |
| } | | | |
| /* if y is NaN, return y */ | | | |
| if (expo_y > 0xff000000) { | | | |
| /* cvt any SNaNs to QNaNs */ | | | |
| xx.i = yy.i | 0x00400000; | | | |
| return xx.f; | | | |
| } | | | |
| if ((expo_x == 0xff000000) && (expo_y == 0xff000000)) { | | | |
| /* | | | |
| * subtraction of infinities with the same sign, and addition of | | | |
| * infinities of unlike sign is undefined: return NaN INDEFINITE | | | |
| */ | | | |
| expo_x = xx.i ^ yy.i; | | | |
| xx.i = xx.i | ((expo_x) ? 0xffc00000 : 0); | | | |
| return xx.f; | | | |
| } | | | |
| /* handle infinities */ | | | |
| if (expo_y == 0xff000000) { | | | |
| xx.i = yy.i; | | | |
| } | | | |
| return xx.f; | | | |
| } | | } | |
|
| | | | |
| | | return __int_as_float ((sign << 31) | (exponent << 23) | mantissa); | |
| } | | } | |
| | | | |
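The new __float2half_rn/__half2float pair implements IEEE 754 half precision in software: 1 sign bit, 5 exponent bits (bias 15), 10 mantissa bits, with round-to-nearest-even and denormal handling as coded above. A hedged round-trip sketch:

    /* Hedged sketch: fp16 keeps only 10 mantissa bits, so 0.1f does not
       survive a round trip. 0.1f encodes as 0x2E66, which decodes to
       0.0999755859375f. */
    __global__ void half_demo(float *out)
    {
        unsigned short h = __float2half_rn(0.1f);  /* 0x2E66 */
        out[0] = __half2float(h);                  /* 0.0999755859375f */
    }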
| __device_func__(float __fadd_rz(float a, float b)) | | __device_func__(float __fadd_rz(float a, float b)) | |
| { | | { | |
|
| return __internal_fadd_kernel(a, b, 0); | | return __internal_fadd_kernel(a, b, cudaRoundZero); | |
| } | | } | |
| | | | |
| __device_func__(float __fmul_rz(float a, float b)) | | __device_func__(float __fmul_rz(float a, float b)) | |
| { | | { | |
|
| return __internal_fmul_kernel(a, b, 0); | | return __internal_fmul_kernel(a, b, cudaRoundZero); | |
| } | | } | |
| | | | |
| __device_func__(float __fadd_rn(float a, float b)) | | __device_func__(float __fadd_rn(float a, float b)) | |
| { | | { | |
|
| return __internal_fadd_kernel(a, b, 1); | | return __internal_fadd_kernel(a, b, cudaRoundNearest); | |
| } | | } | |
| | | | |
| __device_func__(float __fmul_rn(float a, float b)) | | __device_func__(float __fmul_rn(float a, float b)) | |
| { | | { | |
|
| return __internal_fmul_kernel(a, b, 1); | | return __internal_fmul_kernel(a, b, cudaRoundNearest); | |
| } | | } | |
| | | | |
| __device_func__(void __brkpt(int c)) | | __device_func__(void __brkpt(int c)) | |
| { | | { | |
| /* TODO */ | | /* TODO */ | |
| } | | } | |
| | | | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| | | | |
| #define __syncthreads() \ | | #define __syncthreads() \ | |
| | | | |
| skipping to change at line 4543 | | skipping to change at line 4342 | |
| #endif /* __GNUC__ */ | | #endif /* __GNUC__ */ | |
| | | | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| | | | |
| __device_func__(void __prof_trigger(int a)) | | __device_func__(void __prof_trigger(int a)) | |
| { | | { | |
| } | | } | |
| | | | |
| __device_func__(void __threadfence(void)) | | __device_func__(void __threadfence(void)) | |
| { | | { | |
|
| | | __syncthreads(); | |
| } | | } | |
| | | | |
| __device_func__(void __threadfence_block(void)) | | __device_func__(void __threadfence_block(void)) | |
| { | | { | |
|
| | | __syncthreads(); | |
| } | | } | |
| | | | |
| #if defined(__GNUC__) | | #if defined(__GNUC__) | |
| | | | |
| __device_func__(void __trap(void)) | | __device_func__(void __trap(void)) | |
| { | | { | |
| __builtin_trap(); | | __builtin_trap(); | |
| } | | } | |
| | | | |
| #elif defined(_WIN32) | | #elif defined(_WIN32) | |
| | | | |
| skipping to change at line 4572 | | skipping to change at line 4373 | |
| | | | |
| #endif /* __GNUC__ */ | | #endif /* __GNUC__ */ | |
| | | | |
| #endif /* __CUDABE__ */ | | #endif /* __CUDABE__ */ | |
| | | | |
| /******************************************************************************* | | /******************************************************************************* | |
| *                                                                             * | | *                                                                             * | |
| * DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS      * | | * DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS      * | |
| *                                                                             * | | *                                                                             * | |
| *******************************************************************************/ | | *******************************************************************************/ | |
|
| | | #if !defined(__CUDABE__) | |
| __device_func__(float __fdividef(float a, float b)) | | __device_func__(float __fdividef(float a, float b)) | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| return a / b; | | return a / b; | |
|
| #elif defined(__CUDABE__) | | | |
| return a / b; | | | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
|
| | | volatile float aa = a; | |
| | | volatile float bb = b; | |
| /* match range restrictions of the device function */ | | /* match range restrictions of the device function */ | |
|
| if (__cuda_fabsf(b) > CUDART_TWO_TO_126_F) { | | if (__cuda_fabsf(bb) > CUDART_TWO_TO_126_F) { | |
| if (__cuda_fabsf(a) <= CUDART_NORM_HUGE_F) { | | if (__cuda_fabsf(aa) <= CUDART_NORM_HUGE_F) { | |
| return ((a / b) / CUDART_NORM_HUGE_F) / CUDART_NORM_HUGE_F; | | return ((aa / bb) / CUDART_NORM_HUGE_F) / CUDART_NORM_HUGE_F; | |
| } else { | | } else { | |
|
| return __int_as_float(0xffc00000); | | bb = 1.0f / bb; | |
| | | bb = bb / CUDART_NORM_HUGE_F; | |
| | | return aa * bb; | |
| } | | } | |
| } else { | | } else { | |
|
| return a / b; | | return aa / bb; | |
| } | | } | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
|
| | | #endif /* !defined(__CUDABE__) */ | |
| | | | |
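The reworked emulation of __fdividef above reproduces the device fast path's range restriction: for finite numerators with |b| > 2^126 the quotient is deliberately flushed toward zero (via the double division by CUDART_NORM_HUGE_F), instead of unconditionally returning NaN as before. A hedged sketch of the visible behavior:

    /* Hedged sketch of the fast-divide range restriction emulated above:
       once |b| exceeds 2^126 the quotient flushes to zero for finite a. */
    __global__ void fdividef_demo(float *out)
    {
        out[0] = __fdividef(1.0f, 2.0f);        /* 0.5f                     */
        out[1] = __fdividef(1.0e30f, 1.0e38f);  /* 0.0f: 1e38 > 2^126       */
    }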
| __device_func__(float __sinf(float a)) | | __device_func__(float __sinf(float a)) | |
| { | | { | |
|
| | | #if !defined(__CUDABE__) | |
| | | if ((__float_as_int(a) << 1) == 0xff000000) { | |
| | | return __fadd_rn (a, -a); /* return NaN */ | |
| | | } | |
| | | #endif /* !defined(__CUDABE__) */ | |
| return sinf(a); | | return sinf(a); | |
| } | | } | |
| | | | |
| __device_func__(float __cosf(float a)) | | __device_func__(float __cosf(float a)) | |
| { | | { | |
|
| | | #if !defined(__CUDABE__) | |
| | | if ((__float_as_int(a) << 1) == 0xff000000) { | |
| | | return __fadd_rn (a, -a); /* return NaN */ | |
| | | } | |
| | | #endif /* !defined(__CUDABE__) */ | |
| return cosf(a); | | return cosf(a); | |
| } | | } | |
| | | | |
| __device_func__(float __log2f(float a)) | | __device_func__(float __log2f(float a)) | |
| { | | { | |
| return log2f(a); | | return log2f(a); | |
| } | | } | |
| | | | |
| /******************************************************************************* | | /******************************************************************************* | |
| *                                                                             * | | *                                                                             * | |
| * SHARED HOST AND DEVICE IMPLEMENTATIONS                                      * | | * SHARED HOST AND DEVICE IMPLEMENTATIONS                                      * | |
| *                                                                             * | | *                                                                             * | |
| *******************************************************************************/ | | *******************************************************************************/ | |
| | | | |
| __device_func__(float __internal_accurate_fdividef(float a, float b)) | | __device_func__(float __internal_accurate_fdividef(float a, float b)) | |
| { | | { | |
|
| if (__cuda_fabsf(b) > CUDART_TWO_TO_126_F) { | | return a / b; | |
| a *= .25f; | | | |
| b *= .25f; | | | |
| } | | | |
| return __fdividef(a, b); | | | |
| } | | } | |
| | | | |
| __device_func__(float __tanf(float a)) | | __device_func__(float __tanf(float a)) | |
| { | | { | |
| #if defined(__MULTI_CORE__) | | #if defined(__MULTI_CORE__) | |
| return tanf(a); | | return tanf(a); | |
| #else /* __MULTI_CORE__ */ | | #else /* __MULTI_CORE__ */ | |
| return __fdividef (__sinf(a), __cosf(a)); | | return __fdividef (__sinf(a), __cosf(a)); | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| } | | } | |
| | | | |
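__tanf remains the quotient of the two fast-math kernels, so its accuracy degrades where __cosf approaches zero. A hedged sketch:

    /* Hedged sketch: __tanf(a) == __fdividef(__sinf(a), __cosf(a)), so
       error grows near odd multiples of pi/2 where the denominator
       vanishes. */
    __global__ void tanf_demo(float *out)
    {
        out[0] = __tanf(0.785398163f);  /* ~1.0f at pi/4 */
        out[1] = __tanf(1.57f);         /* large, low accuracy near pi/2 */
    }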
End of changes. 76 change blocks. 367 lines changed or deleted, 175 lines changed or added.
| device_runtime.h | | device_runtime.h | |
| /* | | /* | |
|
| * Copyright 1993-2009 NVIDIA Corporation. All rights reserved. | | * Copyright 1993-2008 NVIDIA Corporation. All rights reserved. | |
| * | | * | |
| * NOTICE TO USER: | | * NOTICE TO USER: | |
| * | | * | |
| * This source code is subject to NVIDIA ownership rights under U.S. and | | * This source code is subject to NVIDIA ownership rights under U.S. and | |
| * international Copyright laws. Users and possessors of this source code | | * international Copyright laws. Users and possessors of this source code | |
| * are hereby granted a nonexclusive, royalty-free license to use this code | | * are hereby granted a nonexclusive, royalty-free license to use this code | |
| * in individual and commercial software. | | * in individual and commercial software. | |
| * | | * | |
| * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | |
| * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | |
| | | | |
| skipping to change at line 56 | | skipping to change at line 56 | |
| s | | s | |
| #define __unsized_shared_data(name, type_post) \ | | #define __unsized_shared_data(name, type_post) \ | |
| __unsized##name __unsized##type_post | | __unsized##name __unsized##type_post | |
| #define __sized_shared_data(name, type) \ | | #define __sized_shared_data(name, type) \ | |
| __sized##name type | | __sized##name type | |
| #define __sized__shared_var(name, s, type) \ | | #define __sized__shared_var(name, s, type) \ | |
| name | | name | |
| | | | |
| /*TEXTURE_TYPE*/ | | /*TEXTURE_TYPE*/ | |
| typedef const void *__texture_type__; | | typedef const void *__texture_type__; | |
|
| | | /*SURFACE_TYPE*/ | |
| | | typedef const void *__surface_type__; | |
| | | | |
| #if defined(__CUDABE__) /* cudabe compiler */ | | #if defined(__CUDABE__) /* cudabe compiler */ | |
| | | | |
| #define __pad__(f) | | #define __pad__(f) | |
| #define __text__ \ | | #define __text__ \ | |
| __attribute__((__texture__)) | | __attribute__((__texture__)) | |
|
| | | #define __surf__ \ | |
| | | __attribute__((__surface__)) | |
| #define ___device__(sc) \ | | #define ___device__(sc) \ | |
| static | | static | |
| #define __in__(cdecl, decl) \ | | #define __in__(cdecl, decl) \ | |
| __shared__ cdecl | | __shared__ cdecl | |
| #define __in_type__(cdecl, decl) \ | | #define __in_type__(cdecl, decl) \ | |
| cdecl | | cdecl | |
| #define __texture_var(name) \ | | #define __texture_var(name) \ | |
| name | | name | |
| #define __shared_var(name, s, type) \ | | #define __shared_var(name, s, type) \ | |
| name | | name | |
| | | | |
| skipping to change at line 86 | | skipping to change at line 90 | |
| #define __copy_param(local_decl, param) \ | | #define __copy_param(local_decl, param) \ | |
| local_decl = param | | local_decl = param | |
| #define __unsized_array_size \ | | #define __unsized_array_size \ | |
| [] | | [] | |
| #define __unsized__shared_var(name, s, type) \ | | #define __unsized__shared_var(name, s, type) \ | |
| name | | name | |
| #define __unsized__empty_array(s) \ | | #define __unsized__empty_array(s) \ | |
| s | | s | |
| #define __var_used__ \ | | #define __var_used__ \ | |
| __attribute__((__used__)) | | __attribute__((__used__)) | |
|
| | | #define __storage_extern_unsized__shared__ \ | |
| | | extern | |
| | | | |
| #undef __cdecl | | #undef __cdecl | |
| #define __cdecl | | #define __cdecl | |
| #undef __w64 | | #undef __w64 | |
| #define __w64 | | #define __w64 | |
| | | | |
| #elif defined(__CUDACC__) /* cudafe compiler */ | | #elif defined(__CUDACC__) /* cudafe compiler */ | |
| | | | |
|
| #define __loc_sc__(loc, sc) \ | | #define __loc_sc__(loc, size, sc) \ | |
| sc loc | | sc loc | |
| #define __pad__(f) | | #define __pad__(f) | |
| #define __text__ | | #define __text__ | |
|
| | | #define __surf__ | |
| #define ___device__(sc) \ | | #define ___device__(sc) \ | |
| sc __device__ | | sc __device__ | |
| #define __in__(cdecl, decl) \ | | #define __in__(cdecl, decl) \ | |
| decl | | decl | |
| #define __in_type__(cdecl, decl) \ | | #define __in_type__(cdecl, decl) \ | |
| decl | | decl | |
| #define __texture_var(name) \ | | #define __texture_var(name) \ | |
| name | | name | |
| #define __shared_var(name, s, type) \ | | #define __shared_var(name, s, type) \ | |
| name | | name | |
| | | | |
| skipping to change at line 127 | | skipping to change at line 134 | |
| #define __unsized__empty_array(s) \ | | #define __unsized__empty_array(s) \ | |
| s | | s | |
| | | | |
| #else /* host compiler (cl, gcc, open64, ...) */ | | #else /* host compiler (cl, gcc, open64, ...) */ | |
| | | | |
| #if defined (__MULTI_CORE__) || defined(__multi_core__) | | #if defined (__MULTI_CORE__) || defined(__multi_core__) | |
| | | | |
| struct uint3; | | struct uint3; | |
| extern struct uint3* CUDARTAPI __cudaGetBlockIdxPtr(void); | | extern struct uint3* CUDARTAPI __cudaGetBlockIdxPtr(void); | |
| extern void* CUDARTAPI __cudaGetSharedMem(void*); | | extern void* CUDARTAPI __cudaGetSharedMem(void*); | |
|
| | | extern void* CUDARTAPI __cudaCmcHostMalloc(size_t); | |
| | | extern size_t CUDARTAPI __cudaCmcGetStackSize(void); | |
| | | | |
| #endif /* __MULTI_CORE__ || __multi_core__ */ | | #endif /* __MULTI_CORE__ || __multi_core__ */ | |
| | | | |
| #if defined (__multi_core__) | | #if defined (__multi_core__) | |
| | | | |
| #if defined(__GNUC__) | | #if defined(__GNUC__) | |
| | | | |
| #if defined(__cplusplus) | | #if defined(__cplusplus) | |
| | | | |
| extern void *alloca(size_t) throw(); | | extern void *alloca(size_t) throw(); | |
| | | | |
| #else /* __cplusplus */ | | #else /* __cplusplus */ | |
| | | | |
| extern void *alloca(size_t); | | extern void *alloca(size_t); | |
| | | | |
| #endif /* __cplusplus */ | | #endif /* __cplusplus */ | |
| | | | |
|
| #define __cuda_alloc(s) \ | | #define __cuda_alloca(s) \ | |
| alloca(s) | | alloca(s) | |
| | | | |
| #else /* __GNUC__ */ | | #else /* __GNUC__ */ | |
| | | | |
| extern void *_alloca(size_t); | | extern void *_alloca(size_t); | |
| | | | |
|
| #define __cuda_alloc(s) \ | | #define __cuda_alloca(s) \ | |
| _alloca(s) | | _alloca(s) | |
| | | | |
| #endif /* __GNUC__ */ | | #endif /* __GNUC__ */ | |
| | | | |
|
| | | /* check if enough stack size remains for alloca to succeed. If so, | |
| | | use faster alloca() to do the allocation. Otherwise, allocate memory | |
| | | using the __cudaCmcHostMalloc() runtime function, which uses the slower | |
| | | malloc path to allocate memory | |
| | | */ | |
| | | #define __cudaCmcTargAlloc(num_bytes, max_stacksize, ptr_counter_stacksize, ptr_ret_sym) \ | |
| | | do { \ | |
| | | if (*(ptr_counter_stacksize) + (num_bytes) >= (max_stacksize)) { \ | |
| | | *(ptr_ret_sym) = __cudaCmcHostMalloc((num_bytes)); \ | |
| | | } else { \ | |
| | | *(ptr_ret_sym) = __cuda_alloca((num_bytes)); \ | |
| | | *(ptr_counter_stacksize) = *(ptr_counter_stacksize) + (num_bytes); \ | |
| | | } \ | |
| | | } while(0) | |
| | | | |
| #endif /* __multi_core__ */ | | #endif /* __multi_core__ */ | |
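The new __cudaCmcTargAlloc macro implements a hybrid allocator for multi-core emulation: it tracks cumulative alloca() usage against a stack budget and diverts to the slower __cudaCmcHostMalloc path once that budget would be exceeded. A hedged, hypothetical usage sketch (the names stack_used, budget, and buf are illustrative, not from the headers):

    /* Hedged sketch: hypothetical caller of the macro defined above. */
    void demo_alloc(size_t nbytes)
    {
        size_t stack_used = 0;                    /* per-thread running total */
        size_t budget = __cudaCmcGetStackSize();  /* runtime's stack limit    */
        void  *buf;

        __cudaCmcTargAlloc(nbytes, budget, &stack_used, &buf);
        /* buf now points at alloca()'d stack space, or at heap memory
           from __cudaCmcHostMalloc() if the stack budget was exhausted. */
    }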
| | | | |
| #if defined (__MULTI_CORE__) | | #if defined (__MULTI_CORE__) | |
| | | | |
| #define ___device__(sc) \ | | #define ___device__(sc) \ | |
| static | | static | |
| #define __pad__(f) \ | | #define __pad__(f) \ | |
| f | | f | |
| #define __text__ | | #define __text__ | |
|
| | | #define __surf__ | |
| #define __cudaGet_blockIdx() \ | | #define __cudaGet_blockIdx() \ | |
| (*__cudaGetBlockIdxPtr()) | | (*__cudaGetBlockIdxPtr()) | |
| #define __shared_var(name, s, type) \ | | #define __shared_var(name, s, type) \ | |
| (s type __cudaGetSharedMem((void*)(&(name)))) | | (s type __cudaGetSharedMem((void*)(&(name)))) | |
| #define __var_used__ \ | | #define __var_used__ \ | |
| __attribute__((__used__)) | | __attribute__((__used__)) | |
| #define __storage_auto__shared__ \ | | #define __storage_auto__shared__ \ | |
| auto | | auto | |
| | | | |
| #undef __cdecl | | #undef __cdecl | |
| | | | |
| skipping to change at line 201 | | skipping to change at line 226 | |
| #elif defined(__GNUC__) | | #elif defined(__GNUC__) | |
| | | | |
| #define __STORAGE__ \ | | #define __STORAGE__ \ | |
| __attribute__((__common__)) | | __attribute__((__common__)) | |
| | | | |
| #elif defined(__cplusplus) | | #elif defined(__cplusplus) | |
| | | | |
| #define __STORAGE__ \ | | #define __STORAGE__ \ | |
| __declspec(selectany) | | __declspec(selectany) | |
| | | | |
|
| #pragma warning(disable: 4099 4190) | | | |
| | | | |
| #else /* __APPLE__ || __ICC */ | | #else /* __APPLE__ || __ICC */ | |
| | | | |
| #define __STORAGE__ | | #define __STORAGE__ | |
| | | | |
| #endif /* __APPLE__ || __ICC */ | | #endif /* __APPLE__ || __ICC */ | |
| | | | |
| #endif /* __MULTI_CORE__ */ | | #endif /* __MULTI_CORE__ */ | |
| | | | |
| #define __in__(cdecl, decl) \ | | #define __in__(cdecl, decl) \ | |
| decl | | decl | |
| | | | |
| skipping to change at line 230 | | skipping to change at line 253 | |
| name | | name | |
| #define __copy_param(local_decl, param) | | #define __copy_param(local_decl, param) | |
| #define __unsized_array_size | | #define __unsized_array_size | |
| #define __unsized__shared_var(name, s, type) \ | | #define __unsized__shared_var(name, s, type) \ | |
| (*name) | | (*name) | |
| #define __unsized__empty_array(s) | | #define __unsized__empty_array(s) | |
| | | | |
| /* this is compiled with a host compiler for device emulation */ | | /* this is compiled with a host compiler for device emulation */ | |
| #define __device_emulation | | #define __device_emulation | |
| | | | |
|
| | | #if defined(__cplusplus) | |
| | | | |
| | | #undef __VECTOR_TYPES_H__ | |
| | | | |
| | | #if defined(_WIN32) | |
| | | | |
| | | #pragma warning(disable: 4190 4522) | |
| | | | |
| | | #endif /* _WIN32 */ | |
| | | | |
| | | #endif /* __cplusplus */ | |
| | | | |
| #endif /* __CUDABE__ */ | | #endif /* __CUDABE__ */ | |
| | | | |
|
| | | #if defined(__cplusplus) | |
| | | | |
| | | static void *__cuda_memcpy(void*, const void*, size_t); | |
| | | | |
| | | /* for C++ compilation of lowered C++ (i.e. C) code */ | |
| | | #define __cuda_assign_operators(tag) \ | |
| | | tag& operator=( tag& a) { __cuda_memcpy(this, &a, sizeof(tag)); return *this; } \ | |
| | | tag& operator=(volatile tag& a) volatile { return *(tag*)this = (tag&)a; } \ | |
| | | tag& operator=( const tag& a) { return *(tag*)this = (tag&)a; } | |
| | | | |
| | | #endif /* __cplusplus */ | |
| | | | |
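Written out by hand, the macro above turns a lowered C struct into an assignable C++ type without relying on compiler-generated copy assignment, which is unavailable for the volatile-qualified uses the emulation path needs. An illustrative expansion for a hypothetical two-field tag:

    /* Hand expansion of __cuda_assign_operators(foo2) for an assumed
       struct foo2 { int x, y; ... }; illustrative only. */
    foo2& operator=( foo2& a) { __cuda_memcpy(this, &a, sizeof(foo2)); return *this; }
    foo2& operator=(volatile foo2& a) volatile { return *(foo2*)this = (foo2&)a; }
    foo2& operator=( const foo2& a) { return *(foo2*)this = (foo2&)a; }

The volatile and const overloads cast the qualifiers away and funnel into the byte-wise __cuda_memcpy copy.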
| #include "builtin_types.h" | | #include "builtin_types.h" | |
| #include "device_launch_parameters.h" | | #include "device_launch_parameters.h" | |
| #include "storage_class.h" | | #include "storage_class.h" | |
| | | | |
End of changes. 14 change blocks. |
| 6 lines changed or deleted | | 58 lines changed or added | |
|
| host_runtime.h | | host_runtime.h | |
| /* | | /* | |
|
| * Copyright 1993-2009 NVIDIA Corporation. All rights reserved. | | * Copyright 1993-2008 NVIDIA Corporation. All rights reserved. | |
| * | | * | |
| * NOTICE TO USER: | | * NOTICE TO USER: | |
| * | | * | |
| * This source code is subject to NVIDIA ownership rights under U.S. and | | * This source code is subject to NVIDIA ownership rights under U.S. and | |
| * international Copyright laws. Users and possessors of this source code | | * international Copyright laws. Users and possessors of this source code | |
| * are hereby granted a nonexclusive, royalty-free license to use this code | | * are hereby granted a nonexclusive, royalty-free license to use this code | |
| * in individual and commercial software. | | * in individual and commercial software. | |
| * | | * | |
| * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | |
| * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | |
| | | | |
| skipping to change at line 43 | | skipping to change at line 43 | |
| * the above Disclaimer and U.S. Government End Users Notice. | | * the above Disclaimer and U.S. Government End Users Notice. | |
| */ | | */ | |
| | | | |
| #if !defined(__CUDA_INTERNAL_COMPILATION__) | | #if !defined(__CUDA_INTERNAL_COMPILATION__) | |
| | | | |
| #define __CUDA_INTERNAL_COMPILATION__ | | #define __CUDA_INTERNAL_COMPILATION__ | |
| #define __glob_pref_var(var) \ | | #define __glob_pref_var(var) \ | |
| __global_##var | | __global_##var | |
| #define __global_var(var) \ | | #define __global_var(var) \ | |
| (*__glob_pref_var(var)) | | (*__glob_pref_var(var)) | |
|
| #define __shadow_pref_var(var) \ | | #define __shadow_var(c, cpp) \ | |
| __shadow_##var | | __shadow_pref_var(c, cpp) | |
| #define __shadow_var(var) \ | | | |
| __shadow_pref_var(var) | | | |
| #define __pad__(f) \ | | | |
| f | | | |
| #define __text__ | | #define __text__ | |
|
| | | #define __surf__ | |
| #define __dv(v) | | #define __dv(v) | |
| | | | |
|
| | | #if defined(_WIN32) && !defined(_WIN64) | |
| | | | |
| | | #define __pad__(f) \ | |
| | | f | |
| | | | |
| | | #else /* _WIN32 && !_WIN64 */ | |
| | | | |
| | | #define __pad__(f) | |
| | | | |
| | | #endif /* _WIN32 && !_WIN64 */ | |
| | | | |
| #if defined(__APPLE__) | | #if defined(__APPLE__) | |
| | | | |
| #define __extern_weak__ \ | | #define __extern_weak__ \ | |
| __weak_import__, | | __weak_import__, | |
| | | | |
| #elif defined(__GNUC__) | | #elif defined(__GNUC__) | |
| | | | |
| #define __extern_weak__ | | #define __extern_weak__ | |
| | | | |
| #endif /* __APPLE__ */ | | #endif /* __APPLE__ */ | |
| | | | |
| #if defined(__cplusplus) | | #if defined(__cplusplus) | |
| | | | |
|
| | | #define __shadow_pref_var(c, cpp) \ | |
| | | cpp##__cuda_shadow_variable__ | |
| #define __device_stub_name(c, cpp) \ | | #define __device_stub_name(c, cpp) \ | |
| cpp | | cpp | |
|
| | | #define __text_var(c, cpp) \ | |
| | | cpp | |
| #define __cppref__ \ | | #define __cppref__ \ | |
| & | | & | |
| | | | |
| #else /* __cplusplus */ | | #else /* __cplusplus */ | |
| | | | |
|
| | | #define __shadow_pref_var(c, cpp) \ | |
| | | c##__cuda_shadow_variable__ | |
| #define __device_stub_name(c, cpp) \ | | #define __device_stub_name(c, cpp) \ | |
| c | | c | |
|
| | | #define __text_var(c, cpp) \ | |
| | | c | |
| #define __cppref__ | | #define __cppref__ | |
| | | | |
| typedef char bool; | | typedef char bool; | |
| | | | |
| #endif /* __cplusplus */ | | #endif /* __cplusplus */ | |
| | | | |
|
| | | #if !defined(__GNUC__) || __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3) | |
| | | | |
| | | #define __specialization_static \ | |
| | | static | |
| | | | |
| | | #else /* !__GNUC__ || __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3) */ | |
| | | | |
| | | #define __specialization_static | |
| | | | |
| | | #endif /* !__GNUC__ || __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3) */ | |
| | | | |
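For context, GCC 4.3 began rejecting storage-class specifiers on explicit template specializations, which is what the version check above guards against. A minimal hedged illustration; the template itself is hypothetical, not from this header:

    /* Hypothetical template; 'static' gives it internal linkage. */
    template <typename T> static void __copy_elem(T *dst, const T *src);

    /* Pre-4.3 GCC accepted 'static' repeated on the specialization;
       GCC >= 4.3 makes that an error, so the macro expands to nothing there. */
    template <> __specialization_static void __copy_elem<int>(int *dst, const int *src)
    {
        *dst = *src;
    }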
| #include "cuda_runtime_api.h" | | #include "cuda_runtime_api.h" | |
| #include "storage_class.h" | | #include "storage_class.h" | |
| | | | |
| #else /* !__CUDA_INTERNAL_COMPILATION__ */ | | #else /* !__CUDA_INTERNAL_COMPILATION__ */ | |
| | | | |
| #include "host_defines.h" | | #include "host_defines.h" | |
| | | | |
| #define __cudaRegisterBinary() \ | | #define __cudaRegisterBinary() \ | |
| __cudaFatCubinHandle = __cudaRegisterFatBinary((void*)__cudaFatCubin); \ | | __cudaFatCubinHandle = __cudaRegisterFatBinary((void*)__cudaFatCubin); \ | |
| atexit(__cudaUnregisterBinaryUtil) | | atexit(__cudaUnregisterBinaryUtil) | |
| #define __cudaRegisterVariable(var, ext, size, constant, global) \ | | #define __cudaRegisterVariable(var, ext, size, constant, global) \ | |
| __cudaRegisterVar(__cudaFatCubinHandle, (char*)&__host##var, (char*)__device##var, __name##var, ext, size, constant, global) | | __cudaRegisterVar(__cudaFatCubinHandle, (char*)&__host##var, (char*)__device##var, __name##var, ext, size, constant, global) | |
| #define __cudaRegisterGlobalTexture(tex, dim, norm, ext) \ | | #define __cudaRegisterGlobalTexture(tex, dim, norm, ext) \ | |
| __cudaRegisterTexture(__cudaFatCubinHandle, (const struct textureReference*)&tex, __tex_var(tex), #tex, dim, norm, ext) | | __cudaRegisterTexture(__cudaFatCubinHandle, (const struct textureReference*)&tex, __tex_var(tex), __name##tex, dim, norm, ext) | |
| | | #define __cudaRegisterGlobalSurface(surf, dim, ext) \ | |
| | | __cudaRegisterSurface(__cudaFatCubinHandle, (const struct surfaceReference*)&surf, __tex_var(surf), __name##surf, dim, ext) | |
| #define __cudaRegisterUnsizedShared(var) \ | | #define __cudaRegisterUnsizedShared(var) \ | |
| __cudaRegisterShared(__cudaFatCubinHandle, (void**)__device_var(var)) | | __cudaRegisterShared(__cudaFatCubinHandle, (void**)__device_var(var)) | |
| #define __cudaRegisterSharedVariable(var, size, align, sc) \ | | #define __cudaRegisterSharedVariable(var, size, align, sc) \ | |
| __cudaRegisterSharedVar(__cudaFatCubinHandle, (void**)__device_var(var), size, align, sc) | | __cudaRegisterSharedVar(__cudaFatCubinHandle, (void**)__device_var(var), size, align, sc) | |
| #define __cudaRegisterEntry(funptr, fun, thread_limit) \ | | #define __cudaRegisterEntry(funptr, fun, thread_limit) \ | |
| __cudaRegisterFunction(__cudaFatCubinHandle, (const char*)funptr, (char*)__device_fun(fun), #fun, thread_limit, __ids) | | __cudaRegisterFunction(__cudaFatCubinHandle, (const char*)funptr, (char*)__device_fun(fun), #fun, thread_limit, __ids) | |
| | | | |
| #define __cudaInitArgBlock(arg) \ | | #define __cudaInitArgBlock(arg) \ | |
| char __[256]; \ | | *(void**)(void*)&arg = (void*)0 | |
| *(char**)&arg = __ | | | |
| #define __cudaSetupArg(arg, offset) \ | | #define __cudaSetupArg(arg, offset) \ | |
| if (cudaSetupArgument((void*)(char*)&arg, sizeof(arg), (size_t)&offset->arg - (size_t)offset) != cudaSuccess) \ | | if (cudaSetupArgument((void*)(char*)&arg, sizeof(arg), (size_t)&offset->arg) != cudaSuccess) \ | |
| return | | return | |
| #define __cudaLaunch(fun) \ | | #define __cudaLaunch(fun) \ | |
| { volatile static char *__f; __f = fun; (void)cudaLaunch(fun); } | | { volatile static char *__f; __f = fun; (void)cudaLaunch(fun); } | |
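Taken together, the macros above are the building blocks of the host stub nvcc generates per kernel. A condensed, hedged sketch for an assumed kernel __global__ void k(int n); the stub body and names follow the macro pattern but are normally written by the compiler, not by hand:

    /* Illustrative host stub for an assumed kernel 'k(int n)',
       using the newer macro forms above. */
    void __device_stub_k(int n)
    {
        struct { int n; } *offset = 0;  /* null pointer, used only for offsetof-style math */
        void *arg;
        __cudaInitArgBlock(arg);
        __cudaSetupArg(n, offset);      /* cudaSetupArgument(&n, sizeof(n), 0) */
        __cudaLaunch((char *)__device_stub_k);
    }

    static void __cudaRegisterAll(void)
    {
        __cudaRegisterBinary();         /* assumes nvcc emitted __cudaFatCubin */
        __cudaRegisterEntry(__device_stub_k, k, -1);
    }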
| | | | |
| #if defined(__cplusplus) | | #if defined(__cplusplus) | |
| extern "C" { | | extern "C" { | |
| #endif /* __cplusplus */ | | #endif /* __cplusplus */ | |
| | | | |
| extern void** CUDARTAPI __cudaRegisterFatBinary( | | extern void** CUDARTAPI __cudaRegisterFatBinary( | |
| void *fatCubin | | void *fatCubin | |
| | | | |
| skipping to change at line 143 | | skipping to change at line 171 | |
| extern void CUDARTAPI __cudaRegisterTexture( | | extern void CUDARTAPI __cudaRegisterTexture( | |
| void **fatCubinHandle, | | void **fatCubinHandle, | |
| const struct textureReference *hostVar, | | const struct textureReference *hostVar, | |
| const void **deviceAddress, | | const void **deviceAddress, | |
| const char *deviceName, | | const char *deviceName, | |
| int dim, | | int dim, | |
| int norm, | | int norm, | |
| int ext | | int ext | |
| ); | | ); | |
| | | | |
|
| | | extern void CUDARTAPI __cudaRegisterSurface( | |
| | | void **fatCubinHandle, | |
| | | const struct surfaceReference *hostVar, | |
| | | const void **deviceAddress, | |
| | | const char *deviceName, | |
| | | int dim, | |
| | | int ext | |
| | | ); | |
| | | | |
| extern void CUDARTAPI __cudaRegisterShared( | | extern void CUDARTAPI __cudaRegisterShared( | |
| void **fatCubinHandle, | | void **fatCubinHandle, | |
| void **devicePtr | | void **devicePtr | |
| ); | | ); | |
| | | | |
| extern void CUDARTAPI __cudaRegisterSharedVar( | | extern void CUDARTAPI __cudaRegisterSharedVar( | |
| void **fatCubinHandle, | | void **fatCubinHandle, | |
| void **devicePtr, | | void **devicePtr, | |
| size_t size, | | size_t size, | |
| size_t alignment, | | size_t alignment, | |
| | | | |
| skipping to change at line 192 | | skipping to change at line 229 | |
| | | | |
| static void **__cudaFatCubinHandle; | | static void **__cudaFatCubinHandle; | |
| | | | |
| static void __cudaUnregisterBinaryUtil(void) | | static void __cudaUnregisterBinaryUtil(void) | |
| { | | { | |
| __cudaUnregisterFatBinary(__cudaFatCubinHandle); | | __cudaUnregisterFatBinary(__cudaFatCubinHandle); | |
| } | | } | |
| | | | |
| #if defined(__device_emulation) | | #if defined(__device_emulation) | |
| | | | |
|
| | | #if defined(__cplusplus) && !defined(__multi_core__) | |
| | | | |
| | | #define __cuda_emu__ \ | |
| | | __cuda_emu:: | |
| | | | |
| | | #else /* __cplusplus */ | |
| | | | |
| | | #define __cuda_emu__ | |
| | | | |
| | | #endif /* __cplusplus */ | |
| | | | |
| #define __device_fun(fun) \ | | #define __device_fun(fun) \ | |
|
| __device_wrapper_##fun | | __cuda_emu__ __device_wrapper_##fun | |
| #define __device_var(var) \ | | #define __device_var(var) \ | |
|
| (char*)&var | | (char*)&__cuda_emu__ var | |
| #define __tex_var(var) \ | | #define __tex_var(var) \ | |
|
| &__texture_var(var) | | &__cuda_emu__ __texture_var(var) | |
| #define __cudaFatCubin \ | | #define __cudaFatCubin \ | |
| 0 | | 0 | |
| | | | |
| #if defined(__multi_core__) | | #if defined(__multi_core__) | |
| | | | |
| #define __ids \ | | #define __ids \ | |
| (uint3*)0, (uint3*)0, &blockDim, &gridDim, &warpSize | | (uint3*)0, (uint3*)0, &blockDim, &gridDim, &warpSize | |
| | | | |
| #else /* __multi_core__ */ | | #else /* __multi_core__ */ | |
| | | | |
| #define __ids \ | | #define __ids \ | |
|
| &threadIdx, &blockIdx, &blockDim, &gridDim, &warpSize | | (uint3*)&__cuda_emu__ threadIdx, (uint3*)&__cuda_emu__ blockIdx, (dim3*)&__cuda_emu__ blockDim, (dim3*)&__cuda_emu__ gridDim, &__cuda_emu__ warpSize | |
| | | | |
| #endif /* __multi_core__ */ | | #endif /* __multi_core__ */ | |
| | | | |
| #else /* __device_emulation */ | | #else /* __device_emulation */ | |
| | | | |
| #define __device_fun(fun) \ | | #define __device_fun(fun) \ | |
| #fun | | #fun | |
| #define __device_var(var) \ | | #define __device_var(var) \ | |
| #var | | #var | |
| #define __tex_var(var) \ | | #define __tex_var(var) \ | |
| | | | |
| skipping to change at line 237 | | skipping to change at line 285 | |
| | | | |
| #endif /* __device_emulation */ | | #endif /* __device_emulation */ | |
| | | | |
| /* UTILITY MACROS */ | | /* UTILITY MACROS */ | |
| #define __device__global_var(var) \ | | #define __device__global_var(var) \ | |
| __device_var(var) | | __device_var(var) | |
| #define __name__global_var(var) \ | | #define __name__global_var(var) \ | |
| #var | | #var | |
| #define __host__global_var(var) \ | | #define __host__global_var(var) \ | |
| __glob_pref_var(var) | | __glob_pref_var(var) | |
|
| #define __device__shadow_var(var) \ | | #define __device__shadow_var(c, cpp) \ | |
| __device_var(var) | | __device_var(c) | |
| #define __name__shadow_var(var) \ | | #define __name__shadow_var(c, cpp) \ | |
| #var | | #c | |
| #define __host__shadow_var(var) \ | | #define __name__text_var(c, cpp) \ | |
| __shadow_pref_var(var) | | #c | |
| | | #define __host__shadow_var(c, cpp) \ | |
| | | __shadow_pref_var(c, cpp) | |
| | | | |
| | | #if defined(_WIN32) && defined(__cplusplus) | |
| | | | |
| | | #pragma warning(disable: 4099) | |
| | | | |
| | | #endif /* _WIN32 && __cplusplus */ | |
| | | | |
| #endif /* !__CUDA_INTERNAL_COMPILATION__ */ | | #endif /* !__CUDA_INTERNAL_COMPILATION__ */ | |
| | | | |
End of changes. 19 change blocks. |
| 22 lines changed or deleted | | 82 lines changed or added | |
|
| math_functions_dbl_ptx3.h | | math_functions_dbl_ptx3.h | |
| | | | |
| skipping to change at line 497 | | skipping to change at line 497 | |
| if (i & 2) { | | if (i & 2) { | |
| z = -z; | | z = -z; | |
| } | | } | |
| return z; | | return z; | |
| } | | } | |
| | | | |
| __device_func__(void __cuda_sincos(double a, double *sptr, double *cptr)) | | __device_func__(void __cuda_sincos(double a, double *sptr, double *cptr)) | |
| { | | { | |
| double t, u, s, c; | | double t, u, s, c; | |
| int i; | | int i; | |
|
| if (__cuda___isinf(a)) { | | t = __cuda_fabs(a); | |
| *sptr = CUDART_NAN; | | if ((t == CUDART_INF) || (t == CUDART_ZERO)) { | |
| *cptr = CUDART_NAN; | | s = __dmul_rn (a, CUDART_ZERO); /* generate NaN, zero */ | |
| return; | | c = 1.0 + s; /* generate NaN, one */ | |
| } | | *sptr = s; | |
| if (a == CUDART_ZERO) { | | *cptr = c; | |
| *sptr = a; | | | |
| *cptr = 1.0; | | | |
| return; | | return; | |
| } | | } | |
| t = __internal_trig_reduction_kerneld(a, &i); | | t = __internal_trig_reduction_kerneld(a, &i); | |
| u = __internal_cos_kerneld(t); | | u = __internal_cos_kerneld(t); | |
| t = __internal_sin_kerneld(t); | | t = __internal_sin_kerneld(t); | |
| if (i & 1) { | | if (i & 1) { | |
| s = u; | | s = u; | |
| c = t; | | c = t; | |
| } else { | | } else { | |
| s = t; | | s = t; | |
| | | | |
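The rewritten prologue above folds the explicit isinf/zero branches into one arithmetic idiom: a * 0.0 is NaN for an infinite a and a correctly signed zero otherwise, and adding 1.0 then yields the matching cosine. The same IEEE-754 trick in a standalone host sketch:

    /* IEEE-754: inf * 0 -> NaN, +/-0 * 0 -> +/-0; NaN + 1 -> NaN, +/-0 + 1 -> 1. */
    static void special_sincos(double a, double *s, double *c)
    {
        double z = a * 0.0;  /* NaN at +/-inf, signed zero at +/-0 */
        *s = z;              /* sin: NaN at inf, +/-0 at +/-0      */
        *c = 1.0 + z;        /* cos: NaN at inf, exactly 1.0 at 0  */
    }

This assumes strict IEEE semantics; an optimizer allowed to fold a * 0.0 to 0.0 would break it, which is one reason the device code spells it __dmul_rn.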
| skipping to change at line 561 | | skipping to change at line 559 | |
| /* normalize denormals */ | | /* normalize denormals */ | |
| if ((unsigned)ihi < (unsigned)0x00100000) { | | if ((unsigned)ihi < (unsigned)0x00100000) { | |
| a = a * CUDART_TWO_TO_54; | | a = a * CUDART_TWO_TO_54; | |
| e -= 54; | | e -= 54; | |
| ihi = __double2hiint(a); | | ihi = __double2hiint(a); | |
| ilo = __double2loint(a); | | ilo = __double2loint(a); | |
| } | | } | |
| /* a = m * 2^e. m <= sqrt(2): log2(a) = log2(m) + e. | | /* a = m * 2^e. m <= sqrt(2): log2(a) = log2(m) + e. | |
| * m > sqrt(2): log2(a) = log2(m/2) + (e+1) | | * m > sqrt(2): log2(a) = log2(m/2) + (e+1) | |
| */ | | */ | |
|
| e += ((ihi >> 20) & 0x7ff); | | e += (ihi >> 20); | |
| ihi = (ihi & 0x800fffff) | 0x3ff00000; | | ihi = (ihi & 0x800fffff) | 0x3ff00000; | |
| m = __hiloint2double (ihi, ilo); | | m = __hiloint2double (ihi, ilo); | |
| if ((unsigned)ihi > (unsigned)0x3ff6a09e) { | | if ((unsigned)ihi > (unsigned)0x3ff6a09e) { | |
| m = __internal_half(m); | | m = __internal_half(m); | |
| e = e + 1; | | e = e + 1; | |
| } | | } | |
| /* log((1+m)/(1-m)) = 2*atanh(m). log(m) = 2*atanh ((m-1)/(m+1)) */ | | /* log((1+m)/(1-m)) = 2*atanh(m). log(m) = 2*atanh ((m-1)/(m+1)) */ | |
| f = m - 1.0; | | f = m - 1.0; | |
| g = m + 1.0; | | g = m + 1.0; | |
| g = 1.0 / g; | | g = 1.0 / g; | |
| | | | |
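In the hunk above the & 0x7ff mask was dropped: the inputs reaching this path are positive and finite after denormal normalization, so the high word's sign bit is clear and ihi >> 20 already equals the biased exponent. A hedged sketch of the same hi/lo-word decomposition as a standalone helper built on the __double2hiint family (simplified, not the header's exact code):

    /* Split a positive, normalized double into m in [1,2) and e with
       a = m * 2^e. */
    __device__ double split_pos_double(double a, int *e)
    {
        int ihi = __double2hiint(a);
        int ilo = __double2loint(a);
        *e  = (ihi >> 20) - 1023;               /* sign bit is 0, so no mask needed */
        ihi = (ihi & 0x800fffff) | 0x3ff00000;  /* overwrite exponent field with the bias */
        return __hiloint2double(ihi, ilo);
    }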
| skipping to change at line 1590 | | skipping to change at line 1588 | |
| } | | } | |
| return t; | | return t; | |
| } | | } | |
| | | | |
| __device_func__(double __cuda_erfc(double a)) | | __device_func__(double __cuda_erfc(double a)) | |
| { | | { | |
| double p, q, h, l; | | double p, q, h, l; | |
| int ahi; | | int ahi; | |
| | | | |
| ahi = __double2hiint(a); | | ahi = __double2hiint(a); | |
|
| if (ahi < (int)0x3fe80000) { | | if (ahi < (int)0x3fe80000) { /* 0.75 */ | |
| return 1.0 - __cuda_erf(a); | | return 1.0 - __cuda_erf(a); | |
| } | | } | |
| if (a > 27.3) { | | if (a > 27.3) { | |
| return 0.0; | | return 0.0; | |
| } | | } | |
|
| if (ahi < (int)0x40140000) { | | if (ahi < (int)0x40140000) { /* 5.0 */ | |
| p = 5.64189549785304440E-001; | | /* max error 7 ulps on [0.75, 5.0] */ | |
| p = __fma_rn (p, a, 8.17405083437083490E+000); | | p = 5.6418956292134603E-001; | |
| p = __fma_rn (p, a, 5.68958722557864720E+001); | | p = __fma_rn (p, a, 7.9573512229784757E+000); | |
| p = __fma_rn (p, a, 2.42568747802647010E+002); | | p = __fma_rn (p, a, 5.4297984550299049E+001); | |
| p = __fma_rn (p, a, 6.80381374390412930E+002); | | p = __fma_rn (p, a, 2.2775657465890461E+002); | |
| p = __fma_rn (p, a, 1.25873132236024590E+003); | | p = __fma_rn (p, a, 6.2995529536738172E+002); | |
| p = __fma_rn (p, a, 1.43925353963809330E+003); | | p = __fma_rn (p, a, 1.1508293767713837E+003); | |
| p = __fma_rn (p, a, 8.15949420587659230E+002); | | p = __fma_rn (p, a, 1.3002167301542784E+003); | |
| q = a+ 1.44881247113239940E+001; | | p = __fma_rn (p, a, 7.2716547570180592E+002); | |
| q = __fma_rn (q, a, 1.01345387970210510E+002); | | q = a+ 1.4104035812651274E+001; | |
| q = __fma_rn (q, a, 4.37184684964009650E+002); | | q = __fma_rn (q, a, 9.6740724349422138E+001); | |
| q = __fma_rn (q, a, 1.25588209225251330E+003); | | q = __fma_rn (q, a, 4.1073916054746462E+002); | |
| q = __fma_rn (q, a, 2.43864015012647630E+003); | | q = __fma_rn (q, a, 1.1641974580374074E+003); | |
| q = __fma_rn (q, a, 3.10570469008816280E+003); | | q = __fma_rn (q, a, 2.2344896486798129E+003); | |
| q = __fma_rn (q, a, 2.35995386578953550E+003); | | q = __fma_rn (q, a, 2.8166572432808462E+003); | |
| q = __fma_rn (q, a, 8.15949420692539320E+002); | | q = __fma_rn (q, a, 2.1207350981593036E+003); | |
| | | q = __fma_rn (q, a, 7.2716547619708967E+002); | |
| | | p = p / q; | |
| | | h = a * a; | |
| | | l = __fma_rn (a, a, -h); | |
| | | q = __internal_exp_kernel(-h, 0); | |
| | | q = __fma_rn (l, -q, q); | |
| | | p = p * q; | |
| } else { | | } else { | |
|
| p = 5.64189583545675280E-001; | | /* max error 4 ulps on [5, 27.3] */ | |
| p = __fma_rn (p, a, 2.04728556066513970E+000); | | double ooa, ooasq; | |
| p = __fma_rn (p, a, 6.75128409662943610E+000); | | | |
| p = __fma_rn (p, a, 1.10459345071747900E+001); | | ooa = 1.0 / a; | |
| p = __fma_rn (p, a, 1.22570382896313600E+001); | | ooasq = ooa * ooa; | |
| p = __fma_rn (p, a, 6.01884641114116460E+000); | | p = -4.0025406686930527E+005; | |
| q = a+ 3.62871917534986780E+000; | | p = __fma_rn (p, ooasq, 1.4420582543942123E+005); | |
| q = __fma_rn (q, a, 1.24663395327043550E+001); | | p = __fma_rn (p, ooasq, -2.7664185780951841E+004); | |
| q = __fma_rn (q, a, 2.13927672803974790E+001); | | p = __fma_rn (p, ooasq, 4.1144611644767283E+003); | |
| q = __fma_rn (q, a, 2.72082423532866070E+001); | | p = __fma_rn (p, ooasq, -5.8706000519209351E+002); | |
| q = __fma_rn (q, a, 1.86422906830006700E+001); | | p = __fma_rn (p, ooasq, 9.1490086446323375E+001); | |
| q = __fma_rn (q, a, 6.13809834548870550E+000); | | p = __fma_rn (p, ooasq, -1.6659491387740221E+001); | |
| | | p = __fma_rn (p, ooasq, 3.7024804085481784E+000); | |
| | | p = __fma_rn (p, ooasq, -1.0578553994424316E+000); | |
| | | p = __fma_rn (p, ooasq, 4.2314218745087778E-001); | |
| | | p = __fma_rn (p, ooasq, -2.8209479177354962E-001); | |
| | | p = __fma_rn (p, ooasq, 5.6418958354775606E-001); | |
| | | h = a * a; | |
| | | l = __fma_rn (a, a, -h); | |
| | | q = __internal_exp_kernel(-h, 0); | |
| | | q = __fma_rn (l, -q, q); | |
| | | p = p * ooa; | |
| | | p = p * q; | |
| } | | } | |
|
| p = p / q; | | | |
| h = a * a; | | | |
| l = __fma_rn (a, a, -h); | | | |
| q = __internal_exp_kernel(-h, 0); | | | |
| q = __fma_rn (l, -q, q); | | | |
| p = p * q; | | | |
| return p; | | return p; | |
| } | | } | |
| | | | |
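Both branches of the new __cuda_erfc end with the same compensated-squaring idiom: h = a * a rounds, l = __fma_rn(a, a, -h) recovers the rounding error exactly, and __fma_rn(l, -q, q) then corrects exp(-h) by the first-order factor (1 - l), since exp(-(h + l)) is approximately exp(-h) * (1 - l) for tiny l. A host-side rendering with C99 fma(); the CUDA code uses __internal_exp_kernel where exp() appears here:

    #include <math.h>

    /* exp(-a*a) with the error of squaring 'a' compensated. */
    static double exp_neg_square(double a)
    {
        double h = a * a;          /* rounded square       */
        double l = fma(a, a, -h);  /* exact rounding error */
        double q = exp(-h);
        return fma(l, -q, q);      /* q - l*q ~ exp(-(h+l)) */
    }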
| /* approximate 1.0/(a*gamma(a)) on [-0.5,0.5] */ | | /* approximate 1.0/(a*gamma(a)) on [-0.5,0.5] */ | |
| __device_func__(double __internal_tgamma_kernel(double a)) | | __device_func__(double __internal_tgamma_kernel(double a)) | |
| { | | { | |
| double t; | | double t; | |
| t = -4.42689340712524750E-010; | | t = -4.42689340712524750E-010; | |
| t = __fma_rn (t, a, -2.02665918466589540E-007); | | t = __fma_rn (t, a, -2.02665918466589540E-007); | |
| t = __fma_rn (t, a, 1.13812117211195270E-006); | | t = __fma_rn (t, a, 1.13812117211195270E-006); | |
| | | | |
End of changes. 6 change blocks. |
| 45 lines changed or deleted | | 55 lines changed or added | |
|
| vector_types.h | | vector_types.h | |
| | | | |
| skipping to change at line 53 | | skipping to change at line 53 | |
| ***************************************************************************/ | | ***************************************************************************/ | |
| | | | |
| #include "host_defines.h" | | #include "host_defines.h" | |
| | | | |
| /*************************************************************************** | | /*************************************************************************** | |
| *                                                                         * | | *                                                                         * | |
| *                                                                         * | | *                                                                         * | |
| *                                                                         * | | *                                                                         * | |
| ***************************************************************************/ | | ***************************************************************************/ | |
| | | | |
|
| | | #if !defined(__cuda_assign_operators) | |
| | | | |
| | | #define __cuda_assign_operators(tag) | |
| | | | |
| | | #endif /* !__cuda_assign_operators */ | |
| | | | |
| | | #if !defined(__CUDACC__) && !defined(__CUDABE__) && \ | |
| | | !defined (__MULTI_CORE__) && defined(_WIN32) && !defined(_WIN64) | |
| | | | |
| | | #define __cuda_builtin_vector_align8(tag, ...) \ | |
| | | struct tag { \ | |
| | | union { \ | |
| | | struct { __VA_ARGS__; }; \ | |
| | | struct { long long int :1,:0; }; \ | |
| | | }; \ | |
| | | __cuda_assign_operators(tag) \ | |
| | | } | |
| | | | |
| | | #else /* !__CUDACC__ && !__CUDABE__ && !__MULTI_CORE__ && _WIN32 && !_WIN64 */ | |
| | | | |
| | | #define __cuda_builtin_vector_align8(tag, ...) \ | |
| | | struct __align__(8) tag { \ | |
| | | __VA_ARGS__; \ | |
| | | __cuda_assign_operators(tag) \ | |
| | | } | |
| | | | |
| | | #endif /* !__CUDACC__ && !__CUDABE__ && !__MULTI_CORE__ && _WIN32 && !_WIN64 */ | |
| | | | |
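The Win32 branch above avoids __align__(8) on plain host builds, presumably because MSVC refuses to pass __declspec(align(...)) types by value (error C2719); the anonymous union member of type long long pins the same 8-byte layout while keeping the type passable. Expanded by hand for one tag (illustrative; the two definitions are alternatives, never both live):

    /* __cuda_builtin_vector_align8(int2, int x, y) on 32-bit Windows host: */
    struct int2 {
        union {
            struct { int x, y; };            /* the real members        */
            struct { long long int :1,:0; }; /* forces 8-byte alignment */
        };
        __cuda_assign_operators(int2)
    };

    /* ...and everywhere else: */
    struct __align__(8) int2 {
        int x, y;
        __cuda_assign_operators(int2)
    };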
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct char1 | | struct char1 | |
| { | | { | |
| signed char x; | | signed char x; | |
|
| | | __cuda_assign_operators(char1) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct uchar1 | | struct uchar1 | |
| { | | { | |
| unsigned char x; | | unsigned char x; | |
|
| | | __cuda_assign_operators(uchar1) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| struct __builtin_align__(2) char2 | | struct __align__(2) char2 | |
| { | | { | |
| signed char x, y; | | signed char x, y; | |
|
| | | __cuda_assign_operators(char2) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| struct __builtin_align__(2) uchar2 | | struct __align__(2) uchar2 | |
| { | | { | |
| unsigned char x, y; | | unsigned char x, y; | |
|
| | | __cuda_assign_operators(uchar2) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct char3 | | struct char3 | |
| { | | { | |
| signed char x, y, z; | | signed char x, y, z; | |
|
| | | __cuda_assign_operators(char3) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct uchar3 | | struct uchar3 | |
| { | | { | |
| unsigned char x, y, z; | | unsigned char x, y, z; | |
|
| | | __cuda_assign_operators(uchar3) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| struct __builtin_align__(4) char4 | | struct __align__(4) char4 | |
| { | | { | |
| signed char x, y, z, w; | | signed char x, y, z, w; | |
|
| | | __cuda_assign_operators(char4) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| struct __builtin_align__(4) uchar4 | | struct __align__(4) uchar4 | |
| { | | { | |
| unsigned char x, y, z, w; | | unsigned char x, y, z, w; | |
|
| | | __cuda_assign_operators(uchar4) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct short1 | | struct short1 | |
| { | | { | |
| short x; | | short x; | |
|
| | | __cuda_assign_operators(short1) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct ushort1 | | struct ushort1 | |
| { | | { | |
| unsigned short x; | | unsigned short x; | |
|
| | | __cuda_assign_operators(ushort1) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| struct __builtin_align__(4) short2 | | struct __align__(4) short2 | |
| { | | { | |
| short x, y; | | short x, y; | |
|
| | | __cuda_assign_operators(short2) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| struct __builtin_align__(4) ushort2 | | struct __align__(4) ushort2 | |
| { | | { | |
| unsigned short x, y; | | unsigned short x, y; | |
|
| | | __cuda_assign_operators(ushort2) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct short3 | | struct short3 | |
| { | | { | |
| short x, y, z; | | short x, y, z; | |
|
| | | __cuda_assign_operators(short3) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct ushort3 | | struct ushort3 | |
| { | | { | |
| unsigned short x, y, z; | | unsigned short x, y, z; | |
|
| | | __cuda_assign_operators(ushort3) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| struct __builtin_align__(8) short4 | | __cuda_builtin_vector_align8(short4, short x, y, z, w); | |
| { | | | |
| short x, y, z, w; | | | |
| }; | | | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| struct __builtin_align__(8) ushort4 | | __cuda_builtin_vector_align8(ushort4, unsigned short x, y, z, w); | |
| { | | | |
| unsigned short x, y, z, w; | | | |
| }; | | | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct int1 | | struct int1 | |
| { | | { | |
| int x; | | int x; | |
|
| | | __cuda_assign_operators(int1) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct uint1 | | struct uint1 | |
| { | | { | |
| unsigned int x; | | unsigned int x; | |
|
| | | __cuda_assign_operators(uint1) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| struct __builtin_align__(8) int2 | | __cuda_builtin_vector_align8(int2, int x, y); | |
| { | | | |
| int x, y; | | | |
| }; | | | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| struct __builtin_align__(8) uint2 | | __cuda_builtin_vector_align8(uint2, unsigned int x, y); | |
| { | | | |
| unsigned int x, y; | | | |
| }; | | | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct int3 | | struct int3 | |
| { | | { | |
| int x, y, z; | | int x, y, z; | |
|
| | | __cuda_assign_operators(int3) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct uint3 | | struct uint3 | |
| { | | { | |
| unsigned int x, y, z; | | unsigned int x, y, z; | |
|
| | | __cuda_assign_operators(uint3) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct __builtin_align__(16) int4 | | struct __builtin_align__(16) int4 | |
| { | | { | |
| int x, y, z, w; | | int x, y, z, w; | |
|
| | | __cuda_assign_operators(int4) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct __builtin_align__(16) uint4 | | struct __builtin_align__(16) uint4 | |
| { | | { | |
| unsigned int x, y, z, w; | | unsigned int x, y, z, w; | |
|
| | | __cuda_assign_operators(uint4) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct long1 | | struct long1 | |
| { | | { | |
| long int x; | | long int x; | |
|
| | | __cuda_assign_operators(long1) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct ulong1 | | struct ulong1 | |
| { | | { | |
| unsigned long x; | | unsigned long x; | |
|
| | | __cuda_assign_operators(ulong1) | |
| }; | | }; | |
| | | | |
|
| /*DEVICE_BUILTIN*/ | | | |
| struct | | | |
| #if defined (_WIN32) | | #if defined (_WIN32) | |
|
| __builtin_align__(8) | | | |
| | | /*DEVICE_BUILTIN*/ | |
| | | __cuda_builtin_vector_align8(long2, long int x, y); | |
| | | | |
| | | /*DEVICE_BUILTIN*/ | |
| | | __cuda_builtin_vector_align8(ulong2, unsigned long int x, y); | |
| | | | |
| #else /* _WIN32 */ | | #else /* _WIN32 */ | |
|
| __builtin_align__(2*sizeof(long int)) | | | |
| #endif /* _WIN32 */ | | /*DEVICE_BUILTIN*/ | |
| long2 | | struct __align__(2*sizeof(long int)) long2 | |
| { | | { | |
| long int x, y; | | long int x, y; | |
|
| | | __cuda_assign_operators(long2) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| struct | | struct __align__(2*sizeof(unsigned long int)) ulong2 | |
| #if defined (_WIN32) | | | |
| __builtin_align__(8) | | | |
| #else /* _WIN32 */ | | | |
| __builtin_align__(2*sizeof(unsigned long int)) | | | |
| #endif /* _WIN32 */ | | | |
| ulong2 | | | |
| { | | { | |
| unsigned long int x, y; | | unsigned long int x, y; | |
|
| | | __cuda_assign_operators(ulong2) | |
| }; | | }; | |
| | | | |
|
| | | #endif /* _WIN32 */ | |
| | | | |
| #if !defined(__LP64__) | | #if !defined(__LP64__) | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct long3 | | struct long3 | |
| { | | { | |
| long int x, y, z; | | long int x, y, z; | |
|
| | | __cuda_assign_operators(long3) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct ulong3 | | struct ulong3 | |
| { | | { | |
| unsigned long int x, y, z; | | unsigned long int x, y, z; | |
|
| | | __cuda_assign_operators(ulong3) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct __builtin_align__(16) long4 | | struct __builtin_align__(16) long4 | |
| { | | { | |
| long int x, y, z, w; | | long int x, y, z, w; | |
|
| | | __cuda_assign_operators(long4) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct __builtin_align__(16) ulong4 | | struct __builtin_align__(16) ulong4 | |
| { | | { | |
| unsigned long int x, y, z, w; | | unsigned long int x, y, z, w; | |
|
| | | __cuda_assign_operators(ulong4) | |
| }; | | }; | |
| | | | |
| #endif /* !__LP64__ */ | | #endif /* !__LP64__ */ | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct float1 | | struct float1 | |
| { | | { | |
| float x; | | float x; | |
|
| | | __cuda_assign_operators(float1) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| struct __builtin_align__(8) float2 | | __cuda_builtin_vector_align8(float2, float x, y); | |
| { | | | |
| float x, y; | | | |
| }; | | | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct float3 | | struct float3 | |
| { | | { | |
| float x, y, z; | | float x, y, z; | |
|
| | | __cuda_assign_operators(float3) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct __builtin_align__(16) float4 | | struct __builtin_align__(16) float4 | |
| { | | { | |
| float x, y, z, w; | | float x, y, z, w; | |
|
| | | __cuda_assign_operators(float4) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct longlong1 | | struct longlong1 | |
| { | | { | |
| long long int x; | | long long int x; | |
|
| | | __cuda_assign_operators(longlong1) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct ulonglong1 | | struct ulonglong1 | |
| { | | { | |
| unsigned long long int x; | | unsigned long long int x; | |
|
| | | __cuda_assign_operators(ulonglong1) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct __builtin_align__(16) longlong2 | | struct __builtin_align__(16) longlong2 | |
| { | | { | |
| long long int x, y; | | long long int x, y; | |
|
| | | __cuda_assign_operators(longlong2) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct __builtin_align__(16) ulonglong2 | | struct __builtin_align__(16) ulonglong2 | |
| { | | { | |
| unsigned long long int x, y; | | unsigned long long int x, y; | |
|
| | | __cuda_assign_operators(ulonglong2) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct double1 | | struct double1 | |
| { | | { | |
| double x; | | double x; | |
|
| | | __cuda_assign_operators(double1) | |
| }; | | }; | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| struct __builtin_align__(16) double2 | | struct __builtin_align__(16) double2 | |
| { | | { | |
| double x, y; | | double x, y; | |
|
| | | __cuda_assign_operators(double2) | |
| }; | | }; | |
| | | | |
| /*************************************************************************** | | /*************************************************************************** | |
| *                                                                         * | | *                                                                         * | |
| *                                                                         * | | *                                                                         * | |
| *                                                                         * | | *                                                                         * | |
| ***************************************************************************/ | | ***************************************************************************/ | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| typedef struct char1 char1; | | typedef struct char1 char1; | |
| | | | |
| skipping to change at line 419 | | skipping to change at line 469 | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
| typedef struct double2 double2; | | typedef struct double2 double2; | |
| | | | |
| /*************************************************************************** | | /*************************************************************************** | |
| *                                                                         * | | *                                                                         * | |
| *                                                                         * | | *                                                                         * | |
| *                                                                         * | | *                                                                         * | |
| ***************************************************************************/ | | ***************************************************************************/ | |
| | | | |
| /*DEVICE_BUILTIN*/ | | /*DEVICE_BUILTIN*/ | |
|
| typedef struct dim3 dim3; | | | |
| | | | |
| /*DEVICE_BUILTIN*/ | | | |
| struct dim3 | | struct dim3 | |
| { | | { | |
| unsigned int x, y, z; | | unsigned int x, y, z; | |
| #if defined(__cplusplus) | | #if defined(__cplusplus) | |
|
| dim3(unsigned int x = 1, unsigned int y = 1, unsigned int z = 1) : x(x), y(y), z(z) {} | | __host__ __device__ dim3(unsigned int x = 1, unsigned int y = 1, unsigned int z = 1) : x(x), y(y), z(z) {} | |
| dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {} | | __host__ __device__ dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {} | |
| operator uint3(void) { uint3 t; t.x = x; t.y = y; t.z = z; return t; } | | __host__ __device__ operator uint3(void) { uint3 t; t.x = x; t.y = y; t.z = z; return t; } | |
| #endif /* __cplusplus */ | | #endif /* __cplusplus */ | |
| }; | | }; | |
| | | | |
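The defaulted dim3 constructor above is what lets a launch configuration spell a 1-D shape as a bare integer. A brief usage sketch with a hypothetical kernel:

    /* 'scale' is a hypothetical kernel, not from these headers. */
    __global__ void scale(float *p, float s)
    {
        p[blockIdx.x * blockDim.x + threadIdx.x] *= s;
    }

    void launch_scale(float *dev_p)
    {
        dim3 grid(64);    /* y and z default to 1 */
        dim3 block(256);  /* likewise             */
        scale<<<grid, block>>>(dev_p, 2.0f);  /* 64 * 256 threads */
    }

The uint3 conversion operator goes the other way, letting a dim3 value initialize variables of the built-in index type.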
|
| | | /*DEVICE_BUILTIN*/ | |
| | | typedef struct dim3 dim3; | |
| | | | |
| | | #undef __cuda_assign_operators | |
| | | #undef __cuda_builtin_vector_align8 | |
| | | | |
| #endif /* !__VECTOR_TYPES_H__ */ | | #endif /* !__VECTOR_TYPES_H__ */ | |
| | | | |
End of changes. 57 change blocks. |
| 46 lines changed or deleted | | 102 lines changed or added | |
|