__cudaFatFormat.h   __cudaFatFormat.h 
/* /*
* Copyright 1993-2008 NVIDIA Corporation. All rights reserved. * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 128 skipping to change at line 128
* for Cubin entries (ptx files compiled in debug mode * for Cubin entries (ptx files compiled in debug mode
* will contain their own debugging information) * will contain their own debugging information)
*/ */
typedef struct __cudaFatDebugEntryRec { typedef struct __cudaFatDebugEntryRec {
char* gpuProfileName; char* gpuProfileName;
char* debug; char* debug;
struct __cudaFatDebugEntryRec *next; struct __cudaFatDebugEntryRec *next;
unsigned int size; unsigned int size;
} __cudaFatDebugEntry; } __cudaFatDebugEntry;
typedef struct __cudaFatElfEntryRec {
char* gpuProfileName;
char* elf;
struct __cudaFatElfEntryRec *next;
unsigned int size;
} __cudaFatElfEntry;
typedef enum { typedef enum {
__cudaFatDontSearchFlag = (1 << 0), __cudaFatDontSearchFlag = (1 << 0),
__cudaFatDontCacheFlag = (1 << 1), __cudaFatDontCacheFlag = (1 << 1),
__cudaFatSassDebugFlag = (1 << 2) __cudaFatSassDebugFlag = (1 << 2)
} __cudaFatCudaBinaryFlag; } __cudaFatCudaBinaryFlag;
/* /*
* Imported/exported symbol descriptor, needed for * Imported/exported symbol descriptor, needed for
* __cudaFat binary linking. Not much information is needed, * __cudaFat binary linking. Not much information is needed,
* because this is only an index: full symbol information * because this is only an index: full symbol information
skipping to change at line 170 skipping to change at line 177
char* usageMode; char* usageMode;
__cudaFatPtxEntry *ptx; __cudaFatPtxEntry *ptx;
__cudaFatCubinEntry *cubin; __cudaFatCubinEntry *cubin;
__cudaFatDebugEntry *debug; __cudaFatDebugEntry *debug;
void* debugInfo; void* debugInfo;
unsigned int flags; unsigned int flags;
__cudaFatSymbol *exported; __cudaFatSymbol *exported;
__cudaFatSymbol *imported; __cudaFatSymbol *imported;
struct __cudaFatCudaBinaryRec *dependends; struct __cudaFatCudaBinaryRec *dependends;
unsigned int characteristic; unsigned int characteristic;
__cudaFatElfEntry *elf;
} __cudaFatCudaBinary; } __cudaFatCudaBinary;
/* /*
* Current version and magic numbers: * Current version and magic numbers:
*/ */
#define __cudaFatVERSION 0x00000003 #define __cudaFatVERSION 0x00000004
#define __cudaFatMAGIC 0x1ee55a01 #define __cudaFatMAGIC 0x1ee55a01
/* /*
* Version history log: * Version history log:
* 1 : __cudaFatDebugEntry field added to __cudaFatCudaBinary struct * 1 : __cudaFatDebugEntry field added to __cudaFatCudaBinary struct
* 2 : flags and debugInfo field added. * 2 : flags and debugInfo field added.
* 3 : import/export symbol list * 3 : import/export symbol list
* 4 : characteristic added * 4 : characteristic added, elf added
*/ */
/*--------------------------------- Functions ----------------------------- ---*/ /*--------------------------------- Functions ----------------------------- ---*/
typedef enum { typedef enum {
__cudaFatAvoidPTX, __cudaFatAvoidPTX,
__cudaFatPreferBestCode __cudaFatPreferBestCode,
__cudaFatForcePTX
} __cudaFatCompilationPolicy; } __cudaFatCompilationPolicy;
/* /*
* Function : Select a load image from the __cudaFat binary * Function : Select a load image from the __cudaFat binary
* that will run on the specified GPU. * that will run on the specified GPU.
* Parameters : binary (I) Fat binary * Parameters : binary (I) Fat binary
* policy (I) Parameter influencing the selection proces s in case no * policy (I) Parameter influencing the selection proces s in case no
* fully matching cubin can be found, but ins tead a choice can * fully matching cubin can be found, but ins tead a choice can
* be made between ptx compilation or selecti on of a * be made between ptx compilation or selecti on of a
* cubin for a less capable GPU. * cubin for a less capable GPU.
skipping to change at line 217 skipping to change at line 226
* on the returned cubin will be returned, or NULL * on the returned cubin will be returned, or NULL
* will be returned when cubin or such debug info * will be returned when cubin or such debug info
* cannot be found. * cannot be found.
*/ */
void fatGetCubinForGpuWithPolicy( __cudaFatCudaBinary *binary, __cudaFatCom pilationPolicy policy, char* gpuName, char* *cubin, char* *dbgInfoFile ); void fatGetCubinForGpuWithPolicy( __cudaFatCudaBinary *binary, __cudaFatCom pilationPolicy policy, char* gpuName, char* *cubin, char* *dbgInfoFile );
#define fatGetCubinForGpu(binary,gpuName,cubin,dbgInfoFile) \ #define fatGetCubinForGpu(binary,gpuName,cubin,dbgInfoFile) \
fatGetCubinForGpuWithPolicy(binary,__cudaFatAvoidPTX,gpuName,cubi n,dbgInfoFile) fatGetCubinForGpuWithPolicy(binary,__cudaFatAvoidPTX,gpuName,cubi n,dbgInfoFile)
/* /*
* Function : Check if a binary will be JITed for the specified targ
et architecture
* Parameters : binary (I) Fat binary
* policy (I) Compilation policy, as described by fatGet
CubinForGpuWithPolicy
* gpuName (I) Name of target GPU
* ptx (O) PTX string to be JITed
* Function Result : True if the given binary will be JITed; otherwise, Fal
se
*/
unsigned char fatCheckJitForGpuWithPolicy( __cudaFatCudaBinary *binary, __c
udaFatCompilationPolicy policy, char* gpuName, char* *ptx );
#define fatCheckJitForGpu(binary,gpuName,ptx) \
fatCheckJitForGpuWithPolicy(binary,__cudaFatAvoidPTX,gpuName,ptx)
/*
* Function : Free information previously obtained via function fatG etCubinForGpu. * Function : Free information previously obtained via function fatG etCubinForGpu.
* Parameters : cubin (I) Cubin text string to free * Parameters : cubin (I) Cubin text string to free
* dbgInfo (I) Debug info filename to free, or NULL * dbgInfo (I) Debug info filename to free, or NULL
*/ */
void fatFreeCubin( char* cubin, char* dbgInfoFile ); void fatFreeCubin( char* cubin, char* dbgInfoFile );
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
 End of changes. 7 change blocks. 
4 lines changed or deleted 30 lines changed or added


 builtin_types.h   builtin_types.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 End of changes. 1 change blocks. 
1 lines changed or deleted 1 lines changed or added


 channel_descriptor.h   channel_descriptor.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 98 skipping to change at line 98
* \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, const struct cudaArray*, const struct cudaChannelFormatDesc&) "cudaBindText ureToArray (High level)", * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, const struct cudaArray*, const struct cudaChannelFormatDesc&) "cudaBindText ureToArray (High level)",
* \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, const struct cudaArray*) "cudaBindTextureToArray (High level, inherited cha nnel descriptor)", * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, const struct cudaArray*) "cudaBindTextureToArray (High level, inherited cha nnel descriptor)",
* \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cuda UnbindTexture (High level)", * \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cuda UnbindTexture (High level)",
* \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, d im, readMode>&) "cudaGetTextureAlignmentOffset (High level)" * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, d im, readMode>&) "cudaGetTextureAlignmentOffset (High level)"
*/ */
template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChann elDesc(void) template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChann elDesc(void)
{ {
return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone); return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
} }
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf(
void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
}
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf1
(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
}
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf2
(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
}
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf4
(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc< char>(void) template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc< char>(void)
{ {
int e = (int)sizeof(char) * 8; int e = (int)sizeof(char) * 8;
#if __SIGNED_CHARS__ #if __SIGNED_CHARS__
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
#else #else
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
#endif #endif
} }
 End of changes. 2 change blocks. 
1 lines changed or deleted 33 lines changed or added


 common_functions.h   common_functions.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 56 skipping to change at line 56
#include "host_defines.h" #include "host_defines.h"
#include <time.h> #include <time.h>
#include <string.h> #include <string.h>
extern "C" extern "C"
{ {
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern _CRTIMP __host__ __device__ clock_t clock(void) __THROW; extern _CRTIMP __host__ __device__ clock_t __cdecl clock(void) __THROW;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __host__ __device__ void *memset(void *s, int c, size_t n) __THROW; extern __host__ __device__ void * __cdecl memset(void *s, int c, s ize_t n) __THROW;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __host__ __device__ void *memcpy(void *d, const void *s, size_t n) _ _THROW; extern __host__ __device__ void * __cdecl memcpy(void *d, const vo id *s, size_t n) __THROW;
} }
#elif !defined(__CUDACC__) #elif !defined(__CUDACC__)
#include "crt/func_macro.h" #include "crt/func_macro.h"
__device_func__(clock_t __cuda_clock(void)) __device_func__(clock_t __cuda_clock(void))
{ {
return clock(); return clock();
} }
__device_func__(void *__cuda_memset(void *s, int c, size_t n)) __device_func__(void *__cuda_memset(void *s, int c, size_t n))
{ {
return memset(s, c, n); char *p = (char*)s;
while (n--) *p++ = (char)c;
return s;
} }
__device_func__(void *__cuda_memcpy(void *d, const void *s, size_t n)) __device_func__(void *__cuda_memcpy(void *d, const void *s, size_t n))
{ {
return memcpy(d, s, n); char *p = (char*)d;
const char *r = (const char*)s;
while (n--) *p++ = *r++;
return d;
} }
#endif /* __cplusplus && __CUDACC__ */ #endif /* __cplusplus && __CUDACC__ */
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
 End of changes. 6 change blocks. 
6 lines changed or deleted 15 lines changed or added


 common_types.h   common_types.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 End of changes. 1 change blocks. 
1 lines changed or deleted 1 lines changed or added


 cuComplex.h   cuComplex.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 43 skipping to change at line 43
* the above Disclaimer and U.S. Government End Users Notice. * the above Disclaimer and U.S. Government End Users Notice.
*/ */
#if !defined(CU_COMPLEX_H_) #if !defined(CU_COMPLEX_H_)
#define CU_COMPLEX_H_ #define CU_COMPLEX_H_
#if defined(__cplusplus) #if defined(__cplusplus)
extern "C" { extern "C" {
#endif /* __cplusplus */ #endif /* __cplusplus */
#include <math.h> /* import fabs, sqrt */ #include <math.h> /* import fabsf, sqrt */
#include "vector_types.h" #include "vector_types.h"
/* versions for hosts without native support for 'complex' */ /* versions for hosts without native support for 'complex' */
#if (!defined(__CUDACC__) && defined(CU_USE_NATIVE_COMPLEX)) #if (!defined(__CUDACC__) && defined(CU_USE_NATIVE_COMPLEX))
#include <complex.h> #include <complex.h>
/* wrapper functions around C99 native complex support. NOTE: Untested! */ /* wrapper functions around C99 native complex support. NOTE: Untested! */
/* -- Single Precision -- */ /* -- Single Precision -- */
skipping to change at line 223 skipping to change at line 223
/* This implementation guards against intermediate underflow and overflow /* This implementation guards against intermediate underflow and overflow
* by scaling. Such guarded implementations are usually the default for * by scaling. Such guarded implementations are usually the default for
* complex library implementations, with some also offering an unguarded, * complex library implementations, with some also offering an unguarded,
* faster version. * faster version.
*/ */
__host__ __device__ static __inline__ cuFloatComplex cuCdivf (cuFloatComple x x, __host__ __device__ static __inline__ cuFloatComplex cuCdivf (cuFloatComple x x,
cuFloatComple x y) cuFloatComple x y)
{ {
cuFloatComplex quot; cuFloatComplex quot;
float s = ((float)fabs((double)cuCrealf(y))) + float s = fabsf(cuCrealf(y)) + fabsf(cuCimagf(y));
((float)fabs((double)cuCimagf(y)));
float oos = 1.0f / s; float oos = 1.0f / s;
float ars = cuCrealf(x) * oos; float ars = cuCrealf(x) * oos;
float ais = cuCimagf(x) * oos; float ais = cuCimagf(x) * oos;
float brs = cuCrealf(y) * oos; float brs = cuCrealf(y) * oos;
float bis = cuCimagf(y) * oos; float bis = cuCimagf(y) * oos;
s = (brs * brs) + (bis * bis); s = (brs * brs) + (bis * bis);
oos = 1.0f / s; oos = 1.0f / s;
quot = make_cuFloatComplex (((ars * brs) + (ais * bis)) * oos, quot = make_cuFloatComplex (((ars * brs) + (ais * bis)) * oos,
((ais * brs) - (ars * bis)) * oos); ((ais * brs) - (ars * bis)) * oos);
return quot; return quot;
skipping to change at line 250 skipping to change at line 249
* overflow by scaling. Otherwise we would lose half the exponent range. * overflow by scaling. Otherwise we would lose half the exponent range.
* There are various ways of doing guarded computation. For now chose the * There are various ways of doing guarded computation. For now chose the
* simplest and fastest solution, however this may suffer from inaccuracies * simplest and fastest solution, however this may suffer from inaccuracies
* if sqrt and division are not IEEE compliant. * if sqrt and division are not IEEE compliant.
*/ */
__host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x) __host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x)
{ {
float a = cuCrealf(x); float a = cuCrealf(x);
float b = cuCimagf(x); float b = cuCimagf(x);
float v, w, t; float v, w, t;
a = (float)fabs(a); a = fabsf(a);
b = (float)fabs(b); b = fabsf(b);
if (a > b) { if (a > b) {
v = a; v = a;
w = b; w = b;
} else { } else {
v = b; v = b;
w = a; w = a;
} }
t = w / v; t = w / v;
t = 1.0f + t * t; t = 1.0f + t * t;
t = v * (float)sqrt(t); t = v * sqrtf(t);
if ((v == 0.0f) || (v > 3.402823466e38f) || (w > 3.402823466e38f)) { if ((v == 0.0f) || (v > 3.402823466e38f) || (w > 3.402823466e38f)) {
t = v + w; t = v + w;
} }
return t; return t;
} }
/* Double precision */ /* Double precision */
typedef double2 cuDoubleComplex; typedef double2 cuDoubleComplex;
__host__ __device__ static __inline__ double cuCreal (cuDoubleComplex x) __host__ __device__ static __inline__ double cuCreal (cuDoubleComplex x)
skipping to change at line 404 skipping to change at line 403
{ {
return make_cuDoubleComplex ((double)cuCrealf(c), (double)cuCimagf(c)); return make_cuDoubleComplex ((double)cuCrealf(c), (double)cuCimagf(c));
} }
__host__ __device__ static __inline__ cuFloatComplex cuComplexDoubleToFloat __host__ __device__ static __inline__ cuFloatComplex cuComplexDoubleToFloat
(cuDoubleComplex c) (cuDoubleComplex c)
{ {
return make_cuFloatComplex ((float)cuCreal(c), (float)cuCimag(c)); return make_cuFloatComplex ((float)cuCreal(c), (float)cuCimag(c));
} }
__host__ __device__ static __inline__ cuComplex cuCfmaf( cuComplex x, cuCo
mplex y, cuComplex d)
{
float real_res;
float imag_res;
real_res = (cuCrealf(x) * cuCrealf(y)) + cuCrealf(d);
imag_res = (cuCrealf(x) * cuCimagf(y)) + cuCimagf(d);
real_res = -(cuCimagf(x) * cuCimagf(y)) + real_res;
imag_res = (cuCimagf(x) * cuCrealf(y)) + imag_res;
return make_cuComplex(real_res, imag_res);
}
__host__ __device__ static __inline__ cuDoubleComplex cuCfma( cuDoubleComp
lex x, cuDoubleComplex y, cuDoubleComplex d)
{
double real_res;
double imag_res;
real_res = (cuCreal(x) * cuCreal(y)) + cuCreal(d);
imag_res = (cuCreal(x) * cuCimag(y)) + cuCimag(d);
real_res = -(cuCimag(x) * cuCimag(y)) + real_res;
imag_res = (cuCimag(x) * cuCreal(y)) + imag_res;
return make_cuDoubleComplex(real_res, imag_res);
}
#endif /* !defined(CU_COMPLEX_H_) */ #endif /* !defined(CU_COMPLEX_H_) */
 End of changes. 6 change blocks. 
7 lines changed or deleted 36 lines changed or added


 cublas.h   cublas.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 554 skipping to change at line 554
* Error Status * Error Status
* ------------ * ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/ */
void CUBLASAPI cublasSrot (int n, float *x, int incx, float *y, int incy, void CUBLASAPI cublasSrot (int n, float *x, int incx, float *y, int incy,
float sc, float ss); float sc, float ss);
/* /*
* void * void
* cublasSrotg (float *sa, float *sb, float *sc, float *ss) * cublasSrotg (float *host_sa, float *host_sb, float *host_sc, float *host _ss)
* *
* constructs the Givens tranformation * constructs the Givens tranformation
* *
* ( sc ss ) * ( sc ss )
* G = ( ) , sc^2 + ss^2 = 1, * G = ( ) , sc^2 + ss^2 = 1,
* (-ss sc ) * (-ss sc )
* *
* which zeros the second entry of the 2-vector transpose(sa, sb). * which zeros the second entry of the 2-vector transpose(sa, sb).
* *
* The quantity r = (+/-) sqrt (sa^2 + sb^2) overwrites sa in storage. The * The quantity r = (+/-) sqrt (sa^2 + sb^2) overwrites sa in storage. The
* value of sb is overwritten by a value z which allows sc and ss to be * value of sb is overwritten by a value z which allows sc and ss to be
* recovered by the following algorithm: * recovered by the following algorithm:
* *
* if z=1 set sc = 0.0 and ss = 1.0 * if z=1 set sc = 0.0 and ss = 1.0
* if abs(z) < 1 set sc = sqrt(1-z^2) and ss = z * if abs(z) < 1 set sc = sqrt(1-z^2) and ss = z
* if abs(z) > 1 set sc = 1/z and ss = sqrt(1-sc^2) * if abs(z) > 1 set sc = 1/z and ss = sqrt(1-sc^2)
* *
* The function srot (n, x, incx, y, incy, sc, ss) normally is called next * The function srot (n, x, incx, y, incy, sc, ss) normally is called next
* to apply the transformation to a 2 x n matrix. * to apply the transformation to a 2 x n matrix.
* Note that is function is provided for completeness and run exclusively
* on the Host.
* *
* Input * Input
* ----- * -----
* sa single precision scalar * sa single precision scalar
* sb single precision scalar * sb single precision scalar
* *
* Output * Output
* ------ * ------
* sa single precision r * sa single precision r
* sb single precision z * sb single precision z
* sc single precision result * sc single precision result
* ss single precision result * ss single precision result
* *
* Reference: http://www.netlib.org/blas/srotg.f * Reference: http://www.netlib.org/blas/srotg.f
* *
* This function does not set any error status. * This function does not set any error status.
*/ */
void CUBLASAPI cublasSrotg (float *sa, float *sb, float *sc, float *ss); void CUBLASAPI cublasSrotg (float *host_sa, float *host_sb, float *host_sc, float *host_ss);
/* /*
* void * void
* cublasSrotm (int n, float *x, int incx, float *y, int incy, * cublasSrotm (int n, float *x, int incx, float *y, int incy,
* const float* sparam) * const float* sparam)
* *
* applies the modified Givens transformation, h, to the 2 x n matrix * applies the modified Givens transformation, h, to the 2 x n matrix
* *
* ( transpose(x) ) * ( transpose(x) )
* ( transpose(y) ) * ( transpose(y) )
skipping to change at line 644 skipping to change at line 646
* Error Status * Error Status
* ------------ * ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/ */
void CUBLASAPI cublasSrotm(int n, float *x, int incx, float *y, int incy, void CUBLASAPI cublasSrotm(int n, float *x, int incx, float *y, int incy,
const float* sparam); const float* sparam);
/* /*
* void * void
* cublasSrotmg (float *psd1, float *psd2, float *psx1, const float *psy1, * cublasSrotmg (float *host_psd1, float *host_psd2, float *host_psx1, cons
* float *sparam) t float *host_psy1,
* float *host_sparam)
* *
* constructs the modified Givens transformation matrix h which zeros * constructs the modified Givens transformation matrix h which zeros
* the second component of the 2-vector transpose(sqrt(sd1)*sx1,sqrt(sd2)*s y1). * the second component of the 2-vector transpose(sqrt(sd1)*sx1,sqrt(sd2)*s y1).
* With sparam[0] = sflag, h has one of the following forms: * With sparam[0] = sflag, h has one of the following forms:
* *
* sflag = -1.0f sflag = 0.0f sflag = 1.0f sflag = -2.0f * sflag = -1.0f sflag = 0.0f sflag = 1.0f sflag = -2.0f
* *
* (sh00 sh01) (1.0f sh01) (sh00 1.0f) (1.0f 0.0f) * (sh00 sh01) (1.0f sh01) (sh00 1.0f) (1.0f 0.0f)
* h = ( ) ( ) ( ) ( ) * h = ( ) ( ) ( ) ( )
* (sh10 sh11) (sh10 1.0f) (-1.0f sh11) (0.0f 1.0f) * (sh10 sh11) (sh10 1.0f) (-1.0f sh11) (0.0f 1.0f)
* *
* sparam[1] through sparam[4] contain sh00, sh10, sh01, sh11, * sparam[1] through sparam[4] contain sh00, sh10, sh01, sh11,
* respectively. Values of 1.0f, -1.0f, or 0.0f implied by the value * respectively. Values of 1.0f, -1.0f, or 0.0f implied by the value
* of sflag are not stored in sparam. * of sflag are not stored in sparam.
* Note that is function is provided for completeness and run exclusively
* on the Host.
* *
* Input * Input
* ----- * -----
* sd1 single precision scalar * sd1 single precision scalar
* sd2 single precision scalar * sd2 single precision scalar
* sx1 single precision scalar * sx1 single precision scalar
* sy1 single precision scalar * sy1 single precision scalar
* *
* Output * Output
* ------ * ------
skipping to change at line 682 skipping to change at line 686
* sx1 changed to represent the effect of the transformation * sx1 changed to represent the effect of the transformation
* sparam 5-element vector. sparam[0] is sflag described above. sparam[1] * sparam 5-element vector. sparam[0] is sflag described above. sparam[1]
* through sparam[4] contain the 2x2 rotation matrix h: sparam[1] * through sparam[4] contain the 2x2 rotation matrix h: sparam[1]
* contains sh00, sparam[2] contains sh10, sparam[3] contains sh01, * contains sh00, sparam[2] contains sh10, sparam[3] contains sh01,
* and sprams[4] contains sh11. * and sprams[4] contains sh11.
* *
* Reference: http://www.netlib.org/blas/srotmg.f * Reference: http://www.netlib.org/blas/srotmg.f
* *
* This functions does not set any error status. * This functions does not set any error status.
*/ */
void CUBLASAPI cublasSrotmg (float *sd1, float *sd2, float *sx1, void CUBLASAPI cublasSrotmg (float *host_sd1, float *host_sd2, float *host_
const float *sy1, float* sparam); sx1,
const float *host_sy1, float* host_sparam);
/* /*
* void * void
* sscal (int n, float alpha, float *x, int incx) * sscal (int n, float alpha, float *x, int incx)
* *
* replaces single precision vector x with single precision alpha * x. For i * replaces single precision vector x with single precision alpha * x. For i
* = 0 to n - 1, it replaces x[ix + i * incx] with alpha * x[ix + i * incx] , * = 0 to n - 1, it replaces x[ix + i * incx] with alpha * x[ix + i * incx] ,
* where ix = 1 if incx >= 0, else ix = 1 + (1 - n) * incx. * where ix = 1 if incx >= 0, else ix = 1 + (1 - n) * incx.
* *
* Input * Input
skipping to change at line 819 skipping to change at line 823
* Error Status * Error Status
* ------------ * ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/ */
void CUBLASAPI cublasCcopy (int n, const cuComplex *x, int incx, cuComplex *y, void CUBLASAPI cublasCcopy (int n, const cuComplex *x, int incx, cuComplex *y,
int incy); int incy);
/* /*
* void * void
* cublasZcopy (int n, const cuDoubleComplex *x, int incx, cuDoubleComplex
*y, int incy)
*
* copies the double-complex vector x to the double-complex vector y. For
* i = 0 to n-1, copies x[lx + i * incx] to y[ly + i * incy], where lx = 1
if
* incx >= 0, else lx = 1 + (1 - n) * incx, and ly is defined in a similar
* way using incy.
*
* Input
* -----
* n number of elements in input vectors
* x double-complex vector with n elements
* incx storage spacing between elements of x
* y double-complex vector with n elements
* incy storage spacing between elements of y
*
* Output
* ------
* y contains double complex vector x
*
* Reference: http://www.netlib.org/blas/zcopy.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasZcopy (int n, const cuDoubleComplex *x, int incx, cuDo
ubleComplex *y,
int incy);
/*
* void
* cublasCscal (int n, cuComplex alpha, cuComplex *x, int incx) * cublasCscal (int n, cuComplex alpha, cuComplex *x, int incx)
* *
* replaces single-complex vector x with single-complex alpha * x. For i * replaces single-complex vector x with single-complex alpha * x. For i
* = 0 to n - 1, it replaces x[ix + i * incx] with alpha * x[ix + i * incx] , * = 0 to n - 1, it replaces x[ix + i * incx] with alpha * x[ix + i * incx] ,
* where ix = 1 if incx >= 0, else ix = 1 + (1 - n) * incx. * where ix = 1 if incx >= 0, else ix = 1 + (1 - n) * incx.
* *
* Input * Input
* ----- * -----
* n number of elements in input vectors * n number of elements in input vectors
* alpha single-complex scalar multiplier * alpha single-complex scalar multiplier
skipping to change at line 849 skipping to change at line 886
* *
* Error Status * Error Status
* ------------ * ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/ */
void CUBLASAPI cublasCscal (int n, cuComplex alpha, cuComplex *x, int incx) ; void CUBLASAPI cublasCscal (int n, cuComplex alpha, cuComplex *x, int incx) ;
/* /*
* void * void
* cublasCrotg (cuComplex *ca, cuComplex cb, float *sc, cuComplex *cs) * cublasCrotg (cuComplex *host_ca, cuComplex cb, float *host_sc, cuComplex *host_cs)
* *
* constructs the complex Givens transformation * constructs the complex Givens transformation
* *
* ( sc cs ) * ( sc cs )
* G = ( ) , sc^2 + cabs(cs)^2 = 1, * G = ( ) , sc^2 + cabs(cs)^2 = 1,
* (-cs sc ) * (-cs sc )
* *
* which zeros the second entry of the complex 2-vector transpose(ca, cb). * which zeros the second entry of the complex 2-vector transpose(ca, cb).
* *
* The quantity ca/cabs(ca)*norm(ca,cb) overwrites ca in storage. The * The quantity ca/cabs(ca)*norm(ca,cb) overwrites ca in storage. The
* function crot (n, x, incx, y, incy, sc, cs) is normally called next * function crot (n, x, incx, y, incy, sc, cs) is normally called next
* to apply the transformation to a 2 x n matrix. * to apply the transformation to a 2 x n matrix.
* Note that this function is provided for completeness and runs exclusively
* on the Host.
* *
* Input * Input
* ----- * -----
* ca single-precision complex precision scalar * ca single-precision complex precision scalar
* cb single-precision complex scalar * cb single-precision complex scalar
* *
* Output * Output
* ------ * ------
* ca single-precision complex ca/cabs(ca)*norm(ca,cb) * ca single-precision complex ca/cabs(ca)*norm(ca,cb)
* sc single-precision cosine component of rotation matrix * sc single-precision cosine component of rotation matrix
* cs single-precision complex sine component of rotation matrix * cs single-precision complex sine component of rotation matrix
* *
* Reference: http://www.netlib.org/blas/crotg.f * Reference: http://www.netlib.org/blas/crotg.f
* *
* This function does not set any error status. * This function does not set any error status.
*/ */
__host__ void CUBLASAPI cublasCrotg (cuComplex *pca, cuComplex cb, float *p __host__ void CUBLASAPI cublasCrotg (cuComplex *host_ca, cuComplex cb, floa
sc, t *host_sc,
cuComplex *pcs); cuComplex *host_cs);
/* /*
* void * void
* cublasCrot (int n, cuComplex *x, int incx, cuComplex *y, int incy, float sc, * cublasCrot (int n, cuComplex *x, int incx, cuComplex *y, int incy, float sc,
* cuComplex cs) * cuComplex cs)
* *
* multiplies a 2x2 matrix ( sc cs) with the 2xn matrix ( transpose(x ) ) * multiplies a 2x2 matrix ( sc cs) with the 2xn matrix ( transpose(x ) )
* (-conj(cs) sc) ( transpose(y ) ) * (-conj(cs) sc) ( transpose(y ) )
* *
* The elements of x are in x[lx + i * incx], i = 0 ... n - 1, where lx = 1 if * The elements of x are in x[lx + i * incx], i = 0 ... n - 1, where lx = 1 if
skipping to change at line 1024 skipping to change at line 1063
* *
* Error Status * Error Status
* ------------ * ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/ */
void CUBLASAPI cublasCswap (int n, cuComplex *x, int incx, cuComplex *y, void CUBLASAPI cublasCswap (int n, cuComplex *x, int incx, cuComplex *y,
int incy); int incy);
/* /*
* void
* cublasZswap (int n, const cuDoubleComplex *x, int incx, cuDoubleComplex
*y, int incy)
*
* interchanges the double-complex vector x with the double-complex vector
y.
* For i = 0 to n-1, interchanges x[lx + i * incx] with y[ly + i * incy], w
here
* lx = 1 if incx >= 0, else lx = 1 + (1 - n) * incx, and ly is defined in
a
* similar way using incy.
*
* Input
* -----
* n number of elements in input vectors
* x double-complex vector with n elements
* incx storage spacing between elements of x
* y double-complex vector with n elements
* incy storage spacing between elements of y
*
* Output
* ------
* x contains double-complex vector y
* y contains double-complex vector x
*
* Reference: http://www.netlib.org/blas/zswap.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasZswap (int n, cuDoubleComplex *x, int incx, cuDoubleCo
mplex *y,
int incy);
/*
* cuComplex * cuComplex
* cdotu (int n, const cuComplex *x, int incx, const cuComplex *y, int incy ) * cdotu (int n, const cuComplex *x, int incx, const cuComplex *y, int incy )
* *
* computes the dot product of two single-complex vectors. It returns the * computes the dot product of two single-complex vectors. It returns the
* dot product of the single-complex vectors x and y if successful, and com plex * dot product of the single-complex vectors x and y if successful, and com plex
* zero otherwise. It computes the sum for i = 0 to n - 1 of x[lx + i * inc x] * * zero otherwise. It computes the sum for i = 0 to n - 1 of x[lx + i * inc x] *
* y[ly + i * incy], where lx = 1 if incx >= 0, else lx = 1 + (1 - n) * inc x; * y[ly + i * incy], where lx = 1 if incx >= 0, else lx = 1 + (1 - n) * inc x;
* ly is defined in a similar way using incy. * ly is defined in a similar way using incy.
* *
* Input * Input
skipping to change at line 1212 skipping to change at line 1285
* Error Status * Error Status
* ------------ * ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/ */
float CUBLASAPI cublasScnrm2 (int n, const cuComplex *x, int incx); float CUBLASAPI cublasScnrm2 (int n, const cuComplex *x, int incx);
/* ----------------- CUBLAS double-complex BLAS1 functions ---------------- - */ /* ----------------- CUBLAS double-complex BLAS1 functions ---------------- - */
/* /*
* void
* cublasZaxpy (int n, cuDoubleComplex alpha, const cuDoubleComplex *x, int
incx,
* cuDoubleComplex *y, int incy)
*
* multiplies double-complex vector x by double-complex scalar alpha and ad
ds
* the result to double-complex vector y; that is, it overwrites double-com
plex
* y with double-complex alpha * x + y. For i = 0 to n - 1, it replaces
* y[ly + i * incy] with alpha * x[lx + i * incx] + y[ly + i * incy], where
* lx = 0 if incx >= 0, else lx = 1 + (1 - n) * incx, and ly is defined in
a
* similar way using incy.
*
* Input
* -----
* n number of elements in input vectors
* alpha double-complex scalar multiplier
* x double-complex vector with n elements
* incx storage spacing between elements of x
* y double-complex vector with n elements
* incy storage spacing between elements of y
*
* Output
* ------
* y double-complex result (unchanged if n <= 0)
*
* Reference: http://www.netlib.org/blas/zaxpy.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasZaxpy (int n, cuDoubleComplex alpha, const cuDoubleCom
plex *x,
int incx, cuDoubleComplex *y, int incy);
/*
* cuDoubleComplex * cuDoubleComplex
* zdotu (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, int incy) * zdotu (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, int incy)
* *
* computes the dot product of two double-complex vectors. It returns the * computes the dot product of two double-complex vectors. It returns the
* dot product of the double-complex vectors x and y if successful, and dou ble-complex * dot product of the double-complex vectors x and y if successful, and dou ble-complex
* zero otherwise. It computes the sum for i = 0 to n - 1 of x[lx + i * inc x] * * zero otherwise. It computes the sum for i = 0 to n - 1 of x[lx + i * inc x] *
* y[ly + i * incy], where lx = 1 if incx >= 0, else lx = 1 + (1 - n) * inc x; * y[ly + i * incy], where lx = 1 if incx >= 0, else lx = 1 + (1 - n) * inc x;
* ly is defined in a similar way using incy. * ly is defined in a similar way using incy.
* *
* Input * Input
skipping to change at line 1240 skipping to change at line 1350
* ------ * ------
* returns double-complex dot product (zero if n <= 0) * returns double-complex dot product (zero if n <= 0)
* *
* Reference: http://www.netlib.org/blas/zdotu.f * Reference: http://www.netlib.org/blas/zdotu.f
* *
* Error status for this function can be retrieved via cublasGetError(). * Error status for this function can be retrieved via cublasGetError().
* *
* Error Status * Error Status
* ------------ * ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to execute on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to execute on GPU
*/ */
cuDoubleComplex CUBLASAPI cublasZdotu (int n, const cuDoubleComplex *x, int incx, cuDoubleComplex CUBLASAPI cublasZdotu (int n, const cuDoubleComplex *x, int incx,
const cuDoubleComplex *y, int incy); const cuDoubleComplex *y, int incy);
/* /*
* cuDoubleComplex
* cublasZdotc (int n, const cuDoubleComplex *x, int incx, const cuDoubleCo
mplex *y, int incy)
*
* computes the dot product of two double-precision complex vectors. It ret
urns the
* dot product of the double-precision complex vectors conjugate(x) and y i
f successful,
* and double-precision complex zero otherwise. It computes the
* sum for i = 0 to n - 1 of conjugate(x[lx + i * incx]) * y[ly + i * incy
],
* where lx = 1 if incx >= 0, else lx = 1 + (1 - n) * incx;
* ly is defined in a similar way using incy.
*
* Input
* -----
* n number of elements in input vectors
* x double-precision complex vector with n elements
* incx storage spacing between elements of x
* y double-precision complex vector with n elements
* incy storage spacing between elements of y
*
* Output
* ------
* returns double-complex dot product (zero if n <= 0)
*
* Reference: http://www.netlib.org/blas/zdotc.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to execute on GPU
*/
cuDoubleComplex CUBLASAPI cublasZdotc( int n, const cuDoubleComplex *x, int
incx,
const cuDoubleComplex *y, int incy )
;
/*
* void * void
* cublasZscal (int n, cuComplex alpha, cuComplex *x, int incx) * cublasZscal (int n, cuComplex alpha, cuComplex *x, int incx)
* *
* replaces double-complex vector x with double-complex alpha * x. For i * replaces double-complex vector x with double-complex alpha * x. For i
* = 0 to n - 1, it replaces x[ix + i * incx] with alpha * x[ix + i * incx] , * = 0 to n - 1, it replaces x[ix + i * incx] with alpha * x[ix + i * incx] ,
* where ix = 1 if incx >= 0, else ix = 1 + (1 - n) * incx. * where ix = 1 if incx >= 0, else ix = 1 + (1 - n) * incx.
* *
* Input * Input
* ----- * -----
* n number of elements in input vectors * n number of elements in input vectors
skipping to change at line 1275 skipping to change at line 1422
* *
* Error status for this function can be retrieved via cublasGetError(). * Error status for this function can be retrieved via cublasGetError().
* *
* Error Status * Error Status
* ------------ * ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/ */
void CUBLASAPI cublasZscal (int n, cuDoubleComplex alpha, cuDoubleComplex * x, int incx); void CUBLASAPI cublasZscal (int n, cuDoubleComplex alpha, cuDoubleComplex * x, int incx);
/*
* void
* cublasZdscal (int n, double alpha, cuDoubleComplex *x, int incx)
*
* replaces double-complex vector x with double-complex alpha * x. For i
* = 0 to n - 1, it replaces x[ix + i * incx] with alpha * x[ix + i * incx]
,
* where ix = 1 if incx >= 0, else ix = 1 + (1 - n) * incx.
*
* Input
* -----
* n number of elements in input vectors
* alpha double precision scalar multiplier
* x double-complex vector with n elements
* incx storage spacing between elements of x
*
* Output
* ------
* x double-complex result (unchanged if n <= 0 or incx <= 0)
*
* Reference: http://www.netlib.org/blas/zdscal.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasZdscal (int n, double alpha, cuDoubleComplex *x,
int incx);
/*
* double
* cublasDznrm2 (int n, const cuDoubleComplex *x, int incx)
*
* computes the Euclidean norm of the double precision complex n-vector x.
This code
* uses simple scaling to avoid intermediate underflow and overflow.
*
* Input
* -----
* n number of elements in input vector
* x double-complex vector with n elements
* incx storage spacing between elements of x
*
* Output
* ------
* returns Euclidean norm (0 if n <= 0 or incx <= 0, or if an error occurs)
*
* Reference: http://www.netlib.org/blas/dznrm2.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
double CUBLASAPI cublasDznrm2 (int n, const cuDoubleComplex *x, int incx);
/*
* void
* cublasZrotg (cuDoubleComplex *host_ca, cuDoubleComplex cb, double *host_
sc, double *host_cs)
*
* constructs the complex Givens transformation
*
* ( sc cs )
* G = ( ) , sc^2 + cabs(cs)^2 = 1,
* (-cs sc )
*
* which zeros the second entry of the complex 2-vector transpose(ca, cb).
*
* The quantity ca/cabs(ca)*norm(ca,cb) overwrites ca in storage. The
* function crot (n, x, incx, y, incy, sc, cs) is normally called next
* to apply the transformation to a 2 x n matrix.
* Note that this function is provided for completeness and runs exclusively
* on the Host.
*
* Input
* -----
* ca double-precision complex precision scalar
* cb double-precision complex scalar
*
* Output
* ------
* ca double-precision complex ca/cabs(ca)*norm(ca,cb)
* sc double-precision cosine component of rotation matrix
* cs double-precision complex sine component of rotation matrix
*
* Reference: http://www.netlib.org/blas/zrotg.f
*
* This function does not set any error status.
*/
void CUBLASAPI cublasZrotg (cuDoubleComplex *host_ca, cuDoubleComplex cb, d
ouble *host_sc,
cuDoubleComplex *host_cs);
/*
* cublasZrot (int n, cuDoubleComplex *x, int incx, cuDoubleComplex *y, int
incy, double sc,
* cuDoubleComplex cs)
*
* multiplies a 2x2 matrix ( sc cs) with the 2xn matrix ( transpose(x
) )
* (-conj(cs) sc) ( transpose(y
) )
*
* The elements of x are in x[lx + i * incx], i = 0 ... n - 1, where lx = 1
if
* incx >= 0, else lx = 1 + (1 - n) * incx, and similarly for y using ly an
d
* incy.
*
* Input
* -----
* n number of elements in input vectors
* x double-precision complex vector with n elements
* incx storage spacing between elements of x
* y double-precision complex vector with n elements
* incy storage spacing between elements of y
* sc double-precision cosine component of rotation matrix
* cs double-precision complex sine component of rotation matrix
*
* Output
* ------
* x rotated double-precision complex vector x (unchanged if n <= 0)
* y rotated double-precision complex vector y (unchanged if n <= 0)
*
* Reference: http://netlib.org/lapack/explore-html/zrot.f.html
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasZrot (int n, cuDoubleComplex *x, int incx,
cuDoubleComplex *y, int incy, double sc,
cuDoubleComplex cs);
/*
* void
* zdrot (int n, cuDoubleComplex *x, int incx, cuDoubleComplex *y, int incy, doub
le c,
* double s)
*
* multiplies a 2x2 matrix ( c s) with the 2xn matrix ( transpose(x) )
* (-s c) ( transpose(y) )
*
* The elements of x are in x[lx + i * incx], i = 0 ... n - 1, where lx = 1
if
* incx >= 0, else lx = 1 + (1 - n) * incx, and similarly for y using ly an
d
* incy.
*
* Input
* -----
* n number of elements in input vectors
* x double-precision complex vector with n elements
* incx storage spacing between elements of x
* y double-precision complex vector with n elements
* incy storage spacing between elements of y
* c cosine component of rotation matrix
* s sine component of rotation matrix
*
* Output
* ------
* x rotated vector x (unchanged if n <= 0)
* y rotated vector y (unchanged if n <= 0)
*
* Reference http://www.netlib.org/blas/zdrot.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasZdrot (int n, cuDoubleComplex *x, int incx,
cuDoubleComplex *y, int incy, double c, double
s);
/*
* int
* cublasIzamax (int n, const double *x, int incx)
*
* finds the smallest index of the element having maximum absolute value
* in double-complex vector x; that is, the result is the first i, i = 0
* to n - 1 that maximizes abs(real(x[1+i*incx]))+abs(imag(x[1 + i * incx])
).
*
* Input
* -----
* n number of elements in input vector
* x double-complex vector with n elements
* incx storage spacing between elements of x
*
* Output
* ------
* returns the smallest index (0 if n <= 0 or incx <= 0)
*
* Reference: http://www.netlib.org/blas/izamax.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
int CUBLASAPI cublasIzamax (int n, const cuDoubleComplex *x, int incx);
/*
* int
* cublasIzamin (int n, const cuDoubleComplex *x, int incx)
*
* finds the smallest index of the element having minimum absolute value
* in double-complex vector x; that is, the result is the first i, i = 0
* to n - 1 that minimizes abs(real(x[1+i*incx]))+abs(imag(x[1 + i * incx])
).
*
* Input
* -----
* n number of elements in input vector
* x double-complex vector with n elements
* incx storage spacing between elements of x
*
* Output
* ------
* returns the smallest index (0 if n <= 0 or incx <= 0)
*
* Reference: Analogous to IZAMAX, see there.
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
int CUBLASAPI cublasIzamin (int n, const cuDoubleComplex *x, int incx);
/*
* double
* cublasDzasum (int n, const cuDoubleComplex *x, int incx)
*
* takes the sum of the absolute values of a complex vector and returns a
* double precision result. Note that this is not the L1 norm of the vector
.
* The result is the sum from 0 to n-1 of abs(real(x[ix+i*incx])) +
* abs(imag(x(ix+i*incx))), where ix = 1 if incx >= 0, else ix = 1+(1-n)*in
cx.
*
* Input
* -----
* n number of elements in input vector
* x double-complex vector with n elements
* incx storage spacing between elements of x
*
* Output
* ------
* returns the double precision sum of absolute values of real and imaginar
y
* parts (0 if n <= 0 or incx <= 0, or if an error occurs)
*
* Reference: http://www.netlib.org/blas/dzasum.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
double CUBLASAPI cublasDzasum (int n, const cuDoubleComplex *x, int incx);
/* --------------- CUBLAS single precision BLAS2 functions --------------- - */ /* --------------- CUBLAS single precision BLAS2 functions --------------- - */
/* /*
* void * void
* cublasSgbmv (char trans, int m, int n, int kl, int ku, float alpha, * cublasSgbmv (char trans, int m, int n, int kl, int ku, float alpha,
* const float *A, int lda, const float *x, int incx, float be ta, * const float *A, int lda, const float *x, int incx, float be ta,
* float *y, int incy) * float *y, int incy)
* *
* performs one of the matrix-vector operations * performs one of the matrix-vector operations
* *
skipping to change at line 1893 skipping to change at line 2309
* ------ * ------
* x updated according to x = op(A) * x * x updated according to x = op(A) * x
* *
* Reference: http://www.netlib.org/blas/stbmv.f * Reference: http://www.netlib.org/blas/stbmv.f
* *
* Error status for this function can be retrieved via cublasGetError(). * Error status for this function can be retrieved via cublasGetError().
* *
* Error Status * Error Status
* ------------ * ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d
* CUBLAS_STATUS_INVALID_VALUE if n < 0, n > 4070, k < 0, or incx == 0 * CUBLAS_STATUS_INVALID_VALUE if n < 0, k < 0, or incx == 0
* CUBLAS_STATUS_ALLOC_FAILED if function cannot allocate enough intern
al scratch vector memory
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/ */
void CUBLASAPI cublasStbmv (char uplo, char trans, char diag, int n, int k, void CUBLASAPI cublasStbmv (char uplo, char trans, char diag, int n, int k,
const float *A, int lda, float *x, int incx); const float *A, int lda, float *x, int incx);
/* /*
* void cublasStbsv (char uplo, char trans, char diag, int n, int k, * void cublasStbsv (char uplo, char trans, char diag, int n, int k,
* const float *A, int lda, float *X, int incx) * const float *A, int lda, float *X, int incx)
* *
* solves one of the systems of equations op(A)*x = b, where op(A) is eithe r * solves one of the systems of equations op(A)*x = b, where op(A) is eithe r
skipping to change at line 1954 skipping to change at line 2371
* ------ * ------
* x updated to contain the solution vector x that solves op(A) * x = b. * x updated to contain the solution vector x that solves op(A) * x = b.
* *
* Reference: http://www.netlib.org/blas/stbsv.f * Reference: http://www.netlib.org/blas/stbsv.f
* *
* Error status for this function can be retrieved via cublasGetError(). * Error status for this function can be retrieved via cublasGetError().
* *
* Error Status * Error Status
* ------------ * ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d
* CUBLAS_STATUS_INVALID_VALUE if incx == 0, n < 0, or n > 4070 * CUBLAS_STATUS_INVALID_VALUE if incx == 0, n < 0 or n > 4070
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/ */
void CUBLASAPI cublasStbsv (char uplo, char trans, char diag, int n, int k, void CUBLASAPI cublasStbsv (char uplo, char trans, char diag, int n, int k,
const float *A, int lda, float *x, int incx); const float *A, int lda, float *x, int incx);
/* /*
* void * void
* cublasStpmv (char uplo, char trans, char diag, int n, const float *AP, * cublasStpmv (char uplo, char trans, char diag, int n, const float *AP,
* float *x, int incx); * float *x, int incx);
* *
skipping to change at line 2007 skipping to change at line 2424
* x updated according to x = op(A) * x, * x updated according to x = op(A) * x,
* *
* Reference: http://www.netlib.org/blas/stpmv.f * Reference: http://www.netlib.org/blas/stpmv.f
* *
* Error status for this function can be retrieved via cublasGetError(). * Error status for this function can be retrieved via cublasGetError().
* *
* Error Status * Error Status
* ------------ * ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d
* CUBLAS_STATUS_INVALID_VALUE if incx == 0 or if n < 0 * CUBLAS_STATUS_INVALID_VALUE if incx == 0 or if n < 0
* CUBLAS_STATUS_ALLOC_FAILED if function cannot allocate enough intern al scratch vector memory
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/ */
void CUBLASAPI cublasStpmv (char uplo, char trans, char diag, int n, void CUBLASAPI cublasStpmv (char uplo, char trans, char diag, int n,
const float *AP, float *x, int incx); const float *AP, float *x, int incx);
/* /*
* void * void
* cublasStpsv (char uplo, char trans, char diag, int n, const float *AP, * cublasStpsv (char uplo, char trans, char diag, int n, const float *AP,
* float *X, int incx) * float *X, int incx)
* *
skipping to change at line 2129 skipping to change at line 2547
/* /*
* void * void
* cublasStrsv (char uplo, char trans, char diag, int n, const float *A, * cublasStrsv (char uplo, char trans, char diag, int n, const float *A,
* int lda, float *x, int incx) * int lda, float *x, int incx)
* *
* solves a system of equations op(A) * x = b, where op(A) is either A or * solves a system of equations op(A) * x = b, where op(A) is either A or
* transpose(A). b and x are single precision vectors consisting of n * transpose(A). b and x are single precision vectors consisting of n
* elements, and A is an n x n matrix composed of a unit or non-unit, upper * elements, and A is an n x n matrix composed of a unit or non-unit, upper
* or lower triangular matrix. Matrix A is stored in column major format, * or lower triangular matrix. Matrix A is stored in column major format,
* and lda is the leading dimension of the two-diemnsional array containing * and lda is the leading dimension of the two-dimensional array containing
* A. * A.
* *
* No test for singularity or near-singularity is included in this function . * No test for singularity or near-singularity is included in this function .
* Such tests must be performed before calling this function. * Such tests must be performed before calling this function.
* *
* Input * Input
* ----- * -----
* uplo specifies whether the matrix data is stored in the upper or the * uplo specifies whether the matrix data is stored in the upper or the
* lower triangular part of array A. If uplo = 'U' or 'u', then only * lower triangular part of array A. If uplo = 'U' or 'u', then only
* the upper triangular part of A may be referenced. If uplo = 'L' o r * the upper triangular part of A may be referenced. If uplo = 'L' o r
* 'l', then only the lower triangular part of A may be referenced. * 'l', then only the lower triangular part of A may be referenced.
* trans specifies op(A). If transa = 'n' or 'N', op(A) = A. If transa = ' t', * trans specifies op(A). If transa = 'n' or 'N', op(A) = A. If transa = ' t',
* 'T', 'c', or 'C', op(A) = transpose(A) * 'T', 'c', or 'C', op(A) = transpose(A)
* diag specifies whether or not A is a unit triangular matrix like so: * diag specifies whether or not A is a unit triangular matrix like so:
* if diag = 'U' or 'u', A is assumed to be unit triangular. If * if diag = 'U' or 'u', A is assumed to be unit triangular. If
* diag = 'N' or 'n', then A is not assumed to be unit triangular. * diag = 'N' or 'n', then A is not assumed to be unit triangular.
* n specifies the number of rows and columns of the matrix A. It * n specifies the number of rows and columns of the matrix A. It
* must be at least 0. In the current implementation n must be <= * must be at least 0.
* 4070.
* A is a single precision array of dimensions (lda, n). If uplo = 'U' * A is a single precision array of dimensions (lda, n). If uplo = 'U'
* or 'u', then A must contains the upper triangular part of a symme tric * or 'u', then A must contains the upper triangular part of a symme tric
* matrix, and the strictly lower triangular parts is not referenced . * matrix, and the strictly lower triangular parts is not referenced .
* If uplo = 'L' or 'l', then A contains the lower triangular part o f * If uplo = 'L' or 'l', then A contains the lower triangular part o f
* a symmetric matrix, and the strictly upper triangular part is not * a symmetric matrix, and the strictly upper triangular part is not
* referenced. * referenced.
* lda is the leading dimension of the two-dimensional array containing A. * lda is the leading dimension of the two-dimensional array containing A.
* lda must be at least max(1, n). * lda must be at least max(1, n).
* x single precision array of length at least (1 + (n - 1) * abs(incx )). * x single precision array of length at least (1 + (n - 1) * abs(incx )).
* On entry, x contains the n element right-hand side vector b. On e xit, * On entry, x contains the n element right-hand side vector b. On e xit,
skipping to change at line 2174 skipping to change at line 2591
* ------ * ------
* x updated to contain the solution vector x that solves op(A) * x = b. * x updated to contain the solution vector x that solves op(A) * x = b.
* *
* Reference: http://www.netlib.org/blas/strsv.f * Reference: http://www.netlib.org/blas/strsv.f
* *
* Error status for this function can be retrieved via cublasGetError(). * Error status for this function can be retrieved via cublasGetError().
* *
* Error Status * Error Status
* ------------ * ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d
* CUBLAS_STATUS_INVALID_VALUE if incx == 0 or if n < 0 or n > 4070 * CUBLAS_STATUS_INVALID_VALUE if incx == 0 or if n < 0
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/ */
/* Single precision triangular solve op(A)*x = b; x is overwritten with the solution. */
void CUBLASAPI cublasStrsv (char uplo, char trans, char diag, int n,
                            const float *A, int lda, float *x, int incx);
/* ----------------- CUBLAS double complex BLAS2 functions ----------------
- */
/*
* void
* cublasZtrmv (char uplo, char trans, char diag, int n, const cuDoubleComp
lex *A,
* int lda, cuDoubleComplex *x, int incx);
*
* performs one of the matrix-vector operations x = op(A) * x,
* where op(A) = A, or op(A) = transpose(A) or op(A) = conjugate(transpose(
A)).
* x is an n-element double precision complex vector, and
* A is an n x n, unit or non-unit, upper or lower, triangular matrix compo
sed
* of double precision complex elements.
*
* Input
* -----
* uplo specifies whether the matrix A is an upper or lower triangular
* matrix. If uplo = 'U' or 'u', then A is an upper triangular matri
x.
* If uplo = 'L' or 'l', then A is a lower triangular matrix.
* trans specifies op(A). If trans = 'n' or 'N', op(A) = A. If trans = 't'
or
* 'T', op(A) = transpose(A). If trans = 'c' or 'C', op(A) =
* conjugate(transpose(A)).
* diag specifies whether or not matrix A is unit triangular. If diag = '
U'
* or 'u', A is assumed to be unit triangular. If diag = 'N' or 'n',
A
* is not assumed to be unit triangular.
* n specifies the number of rows and columns of the matrix A. n must
be
* at least zero.
* A double precision array of dimension (lda, n). If uplo = 'U' or 'u
',
* the leading n x n upper triangular part of the array A must conta
in
* the upper triangular matrix and the strictly lower triangular par
t
* of A is not referenced. If uplo = 'L' or 'l', the leading n x n l
ower
* triangular part of the array A must contain the lower triangular
* matrix and the strictly upper triangular part of A is not referen
ced.
* When diag = 'U' or 'u', the diagonal elements of A are not refere
nced
 * either, but are assumed to be unity.
* lda is the leading dimension of A. It must be at least max (1, n).
* x double precision array of length at least (1 + (n - 1) * abs(incx
) ).
* On entry, x contains the source vector. On exit, x is overwritten
* with the result vector.
* incx specifies the storage spacing for elements of x. incx must not be
* zero.
*
* Output
* ------
* x updated according to x = op(A) * x,
*
* Reference: http://www.netlib.org/blas/ztrmv.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if incx == 0 or if n < 0
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* Double precision complex triangular matrix-vector multiply: x = op(A) * x. */
void CUBLASAPI cublasZtrmv (char uplo, char trans, char diag, int n,
                            const cuDoubleComplex *A, int lda,
                            cuDoubleComplex *x, int incx);
/*
* void
* cublasZgbmv (char trans, int m, int n, int kl, int ku, cuDoubleComplex a
lpha,
* const cuDoubleComplex *A, int lda, const cuDoubleComplex *x
, int incx, cuDoubleComplex beta,
* cuDoubleComplex *y, int incy);
*
* performs one of the matrix-vector operations
*
* y = alpha*op(A)*x + beta*y, op(A)=A or op(A) = transpose(A)
*
* alpha and beta are double precision complex scalars. x and y are double
precision
* complex vectors. A is an m by n band matrix consisting of double precisi
on complex elements
* with kl sub-diagonals and ku super-diagonals.
*
* Input
* -----
* trans specifies op(A). If trans == 'N' or 'n', op(A) = A. If trans == '
T',
* or 't', op(A) = transpose(A). If trans == 'C' or 'c',
* op(A) = conjugate(transpose(A)).
* m specifies the number of rows of the matrix A. m must be at least
* zero.
* n specifies the number of columns of the matrix A. n must be at lea
st
* zero.
* kl specifies the number of sub-diagonals of matrix A. It must be at
* least zero.
* ku specifies the number of super-diagonals of matrix A. It must be a
t
* least zero.
* alpha double precision complex scalar multiplier applied to op(A).
* A double precision complex array of dimensions (lda, n). The leadin
g
* (kl + ku + 1) x n part of the array A must contain the band matri
x A,
* supplied column by column, with the leading diagonal of the matri
x
* in row (ku + 1) of the array, the first super-diagonal starting a
t
* position 2 in row ku, the first sub-diagonal starting at position
1
* in row (ku + 2), and so on. Elements in the array A that do not
* correspond to elements in the band matrix (such as the top left
* ku x ku triangle) are not referenced.
* lda leading dimension of A. lda must be at least (kl + ku + 1).
* x double precision complex array of length at least (1+(n-1)*abs(in
cx)) when
* trans == 'N' or 'n' and at least (1+(m-1)*abs(incx)) otherwise.
* incx specifies the increment for the elements of x. incx must not be z
ero.
* beta double precision complex scalar multiplier applied to vector y. I
f beta is
* zero, y is not read.
* y double precision complex array of length at least (1+(m-1)*abs(in
cy)) when
* trans == 'N' or 'n' and at least (1+(n-1)*abs(incy)) otherwise. I
f
* beta is zero, y is not read.
* incy On entry, incy specifies the increment for the elements of y. inc
y
* must not be zero.
*
* Output
* ------
* y updated according to y = alpha*op(A)*x + beta*y
*
* Reference: http://www.netlib.org/blas/zgbmv.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0, or if incx or incy == 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* Double precision complex banded matrix-vector multiply: y = alpha*op(A)*x + beta*y. */
void CUBLASAPI cublasZgbmv (char trans, int m, int n, int kl, int ku,
                            cuDoubleComplex alpha, const cuDoubleComplex *A,
                            int lda, const cuDoubleComplex *x, int incx,
                            cuDoubleComplex beta, cuDoubleComplex *y, int incy);
/*
* void
* cublasZtbmv (char uplo, char trans, char diag, int n, int k, const cuDou
bleComplex *A,
* int lda, cuDoubleComplex *x, int incx)
*
* performs one of the matrix-vector operations x = op(A) * x, where op(A)
= A,
* op(A) = transpose(A) or op(A) = conjugate(transpose(A)). x is an n-eleme
nt
* double precision complex vector, and A is an n x n, unit or non-unit, up
per
* or lower triangular band matrix composed of double precision complex ele
ments.
*
* Input
* -----
* uplo specifies whether the matrix A is an upper or lower triangular ba
nd
* matrix. If uplo == 'U' or 'u', A is an upper triangular band matr
ix.
* If uplo == 'L' or 'l', A is a lower triangular band matrix.
* trans specifies op(A). If transa == 'N' or 'n', op(A) = A. If trans ==
'T',
* or 't', op(A) = transpose(A). If trans == 'C' or 'c',
* op(A) = conjugate(transpose(A)).
* diag specifies whether or not matrix A is unit triangular. If diag ==
'U'
* or 'u', A is assumed to be unit triangular. If diag == 'N' or 'n'
, A
* is not assumed to be unit triangular.
* n specifies the number of rows and columns of the matrix A. n must
be
* at least zero.
* k specifies the number of super- or sub-diagonals. If uplo == 'U' o
r
* 'u', k specifies the number of super-diagonals. If uplo == 'L' or
* 'l', k specifies the number of sub-diagonals. k must at least be
* zero.
* A double precision complex array of dimension (lda, n). If uplo ==
'U' or 'u',
* the leading (k + 1) x n part of the array A must contain the uppe
r
* triangular band matrix, supplied column by column, with the leadi
ng
* diagonal of the matrix in row (k + 1) of the array, the first
* super-diagonal starting at position 2 in row k, and so on. The to
p
* left k x k triangle of the array A is not referenced. If uplo ==
'L'
 * or 'l', the leading (k + 1) x n part of the array A must contain
the
* lower triangular band matrix, supplied column by column, with the
* leading diagonal of the matrix in row 1 of the array, the first
 * sub-diagonal starting at position 1 in row 2, and so on. The botto
m
* right k x k triangle of the array is not referenced.
* lda is the leading dimension of A. It must be at least (k + 1).
* x double precision complex array of length at least (1 + (n - 1) *
abs(incx)).
* On entry, x contains the source vector. On exit, x is overwritten
* with the result vector.
* incx specifies the storage spacing for elements of x. incx must not be
* zero.
*
* Output
* ------
* x updated according to x = op(A) * x
*
* Reference: http://www.netlib.org/blas/ztbmv.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n or k < 0, or if incx == 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* Double precision complex triangular band matrix-vector multiply: x = op(A) * x. */
void CUBLASAPI cublasZtbmv (char uplo, char trans, char diag, int n, int k,
                            const cuDoubleComplex *A, int lda,
                            cuDoubleComplex *x, int incx);
/*
* void cublasZtbsv (char uplo, char trans, char diag, int n, int k,
* const cuDoubleComplex *A, int lda, cuDoubleComplex *X,
int incx)
*
* solves one of the systems of equations op(A)*x = b, where op(A) is eithe
r
* op(A) = A , op(A) = transpose(A) or op(A) = conjugate(transpose(A)).
* b and x are n element vectors, and A is an n x n unit or non-unit,
* upper or lower triangular band matrix with k + 1 diagonals. No test
* for singularity or near-singularity is included in this function.
* Such tests must be performed before calling this function.
*
* Input
* -----
* uplo specifies whether the matrix is an upper or lower triangular band
* matrix as follows: If uplo == 'U' or 'u', A is an upper triangula
r
* band matrix. If uplo == 'L' or 'l', A is a lower triangular band
* matrix.
* trans specifies op(A). If trans == 'N' or 'n', op(A) = A. If trans == '
T',
* 't', op(A) = transpose(A). If trans == 'C' or 'c',
* op(A) = conjugate(transpose(A)).
* diag specifies whether A is unit triangular. If diag == 'U' or 'u', A
is
 * assumed to be unit triangular; that is, diagonal elements are not
* read and are assumed to be unity. If diag == 'N' or 'n', A is not
* assumed to be unit triangular.
* n specifies the number of rows and columns of the matrix A. n must
be
* at least zero.
* k specifies the number of super- or sub-diagonals. If uplo == 'U' o
r
* 'u', k specifies the number of super-diagonals. If uplo == 'L' or
* 'l', k specifies the number of sub-diagonals. k must at least be
* zero.
* A double precision complex array of dimension (lda, n). If uplo ==
'U' or 'u',
* the leading (k + 1) x n part of the array A must contain the uppe
r
* triangular band matrix, supplied column by column, with the leadi
ng
* diagonal of the matrix in row (k + 1) of the array, the first sup
er-
* diagonal starting at position 2 in row k, and so on. The top left
* k x k triangle of the array A is not referenced. If uplo == 'L' o
r
 * 'l', the leading (k + 1) x n part of the array A must contain th
e
* lower triangular band matrix, supplied column by column, with the
* leading diagonal of the matrix in row 1 of the array, the first
* sub-diagonal starting at position 1 in row 2, and so on. The bott
om
* right k x k triangle of the array is not referenced.
* x double precision complex array of length at least (1+(n-1)*abs(in
cx)).
* incx storage spacing between elements of x. It must not be zero.
*
* Output
* ------
* x updated to contain the solution vector x that solves op(A) * x =
b.
*
* Reference: http://www.netlib.org/blas/ztbsv.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if incx == 0, n < 0 or n > 1016
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* Double precision complex triangular band solve op(A)*x = b; no singularity test. */
void CUBLASAPI cublasZtbsv (char uplo, char trans, char diag, int n, int k,
                            const cuDoubleComplex *A, int lda,
                            cuDoubleComplex *x, int incx);
/*
* void
* cublasZhemv (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComp
lex *A, int lda,
* const cuDoubleComplex *x, int incx, cuDoubleComplex beta, c
uDoubleComplex *y, int incy)
*
* performs the matrix-vector operation
*
* y = alpha*A*x + beta*y
*
* Alpha and beta are double precision complex scalars, and x and y are dou
ble
* precision complex vectors, each with n elements. A is a hermitian n x n
matrix
* consisting of double precision complex elements that is stored in either
upper or
* lower storage mode.
*
* Input
* -----
* uplo specifies whether the upper or lower triangular part of the array
A
* is to be referenced. If uplo == 'U' or 'u', the hermitian matrix
A
* is stored in upper storage mode, i.e. only the upper triangular p
art
* of A is to be referenced while the lower triangular part of A is
to
* be inferred. If uplo == 'L' or 'l', the hermitian matrix A is sto
red
* in lower storage mode, i.e. only the lower triangular part of A i
s
* to be referenced while the upper triangular part of A is to be
* inferred.
* n specifies the number of rows and the number of columns of the
* hermitian matrix A. n must be at least zero.
* alpha double precision complex scalar multiplier applied to A*x.
* A double precision complex array of dimensions (lda, n). If uplo ==
'U' or 'u',
* the leading n x n upper triangular part of the array A must conta
in
* the upper triangular part of the hermitian matrix and the strictl
y
* lower triangular part of A is not referenced. If uplo == 'L' or '
l',
* the leading n x n lower triangular part of the array A must conta
in
* the lower triangular part of the hermitian matrix and the strictl
y
* upper triangular part of A is not referenced. The imaginary parts
* of the diagonal elements need not be set, they are assumed to be
zero.
* lda leading dimension of A. It must be at least max (1, n).
* x double precision complex array of length at least (1 + (n - 1) *
abs(incx)).
* incx storage spacing between elements of x. incx must not be zero.
* beta double precision complex scalar multiplier applied to vector y.
* y double precision complex array of length at least (1 + (n - 1) *
abs(incy)).
* If beta is zero, y is not read.
* incy storage spacing between elements of y. incy must not be zero.
*
* Output
* ------
* y updated according to y = alpha*A*x + beta*y
*
* Reference: http://www.netlib.org/blas/zhemv.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0, or if incx or incy == 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* Double precision complex Hermitian matrix-vector multiply: y = alpha*A*x + beta*y. */
void CUBLASAPI cublasZhemv (char uplo, int n, cuDoubleComplex alpha,
                            const cuDoubleComplex *A, int lda,
                            const cuDoubleComplex *x, int incx,
                            cuDoubleComplex beta, cuDoubleComplex *y, int incy);
/*
* void
* cublasZhpmv (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComp
lex *AP, const cuDoubleComplex *x,
* int incx, cuDoubleComplex beta, cuDoubleComplex *y, int inc
y)
*
* performs the matrix-vector operation
*
* y = alpha * A * x + beta * y
*
* Alpha and beta are double precision complex scalars, and x and y are dou
ble
* precision complex vectors with n elements. A is an hermitian n x n matri
x
* consisting of double precision complex elements that is supplied in pack
ed form.
*
* Input
* -----
* uplo specifies whether the matrix data is stored in the upper or the l
ower
* triangular part of array AP. If uplo == 'U' or 'u', then the uppe
r
* triangular part of A is supplied in AP. If uplo == 'L' or 'l', th
en
* the lower triangular part of A is supplied in AP.
* n specifies the number of rows and columns of the matrix A. It must
be
* at least zero.
* alpha double precision complex scalar multiplier applied to A*x.
* AP double precision complex array with at least ((n * (n + 1)) / 2)
elements. If
* uplo == 'U' or 'u', the array AP contains the upper triangular pa
rt
* of the hermitian matrix A, packed sequentially, column by column;
 * that is, if i <= j, then A[i,j] is stored in AP[i+(j*(j+1)/2)]. I
f
* uplo == 'L' or 'L', the array AP contains the lower triangular pa
rt
* of the hermitian matrix A, packed sequentially, column by column;
* that is, if i >= j, then A[i,j] is stored in AP[i+((2*n-j+1)*j)/2
].
* The imaginary parts of the diagonal elements need not be set, the
y
* are assumed to be zero.
* x double precision complex array of length at least (1 + (n - 1) *
abs(incx)).
* incx storage spacing between elements of x. incx must not be zero.
* beta double precision complex scalar multiplier applied to vector y;
* y double precision array of length at least (1 + (n - 1) * abs(incy
)).
* If beta is zero, y is not read.
* incy storage spacing between elements of y. incy must not be zero.
*
* Output
* ------
* y updated according to y = alpha*A*x + beta*y
*
* Reference: http://www.netlib.org/blas/zhpmv.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0, or if incx or incy == 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* Double precision complex packed Hermitian matrix-vector multiply: y = alpha*A*x + beta*y. */
void CUBLASAPI cublasZhpmv (char uplo, int n, cuDoubleComplex alpha,
                            const cuDoubleComplex *AP,
                            const cuDoubleComplex *x, int incx,
                            cuDoubleComplex beta, cuDoubleComplex *y, int incy);
/* ----------------- CUBLAS double complex BLAS3 functions ---------------- - */ /* ----------------- CUBLAS double complex BLAS3 functions ---------------- - */
/* /*
* cublasZgemv (char trans, int m, int n, cuDoubleComplex alpha, const cuDo ubleComplex *A, int lda, * cublasZgemv (char trans, int m, int n, cuDoubleComplex alpha, const cuDo ubleComplex *A, int lda,
* const cuDoubleComplex *x, int incx, cuDoubleComplex beta, c uDoubleComplex *y, int incy) * const cuDoubleComplex *x, int incx, cuDoubleComplex beta, c uDoubleComplex *y, int incy)
* *
* performs one of the matrix-vector operations * performs one of the matrix-vector operations
* *
* y = alpha * op(A) * x + beta * y, * y = alpha * op(A) * x + beta * y,
* *
skipping to change at line 2243 skipping to change at line 3033
* Error Status * Error Status
* ------------ * ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d
* CUBLAS_STATUS_INVALID_VALUE if m or n are < 0, or if incx or incy == 0 * CUBLAS_STATUS_INVALID_VALUE if m or n are < 0, or if incx or incy == 0
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/ */
/* Double precision complex general matrix-vector multiply: y = alpha*op(A)*x + beta*y. */
void CUBLASAPI cublasZgemv (char trans, int m, int n, cuDoubleComplex alpha,
                            const cuDoubleComplex *A, int lda,
                            const cuDoubleComplex *x, int incx,
                            cuDoubleComplex beta, cuDoubleComplex *y, int incy);
/*
* void
* cublasZtpmv (char uplo, char trans, char diag, int n, const cuDoubleComp
lex *AP,
* cuDoubleComplex *x, int incx);
*
* performs one of the matrix-vector operations x = op(A) * x, where op(A)
= A,
* op(A) = transpose(A) or op(A) = conjugate(transpose(A)) . x is an n elem
ent
* double precision complex vector, and A is an n x n, unit or non-unit, up
per
* or lower triangular matrix composed of double precision complex elements
.
*
* Input
* -----
* uplo specifies whether the matrix A is an upper or lower triangular
* matrix. If uplo == 'U' or 'u', then A is an upper triangular matr
ix.
* If uplo == 'L' or 'l', then A is a lower triangular matrix.
* trans specifies op(A). If transa == 'N' or 'n', op(A) = A. If trans ==
'T',
* or 't', op(A) = transpose(A). If trans == 'C' or 'c',
* op(A) = conjugate(transpose(A)).
*
* diag specifies whether or not matrix A is unit triangular. If diag ==
'U'
* or 'u', A is assumed to be unit triangular. If diag == 'N' or 'n'
, A
* is not assumed to be unit triangular.
* n specifies the number of rows and columns of the matrix A. n must
be
* at least zero. In the current implementation n must not exceed 40
70.
* AP double precision complex array with at least ((n * (n + 1)) / 2)
elements. If
* uplo == 'U' or 'u', the array AP contains the upper triangular pa
rt
* of the symmetric matrix A, packed sequentially, column by column;
* that is, if i <= j, then A[i,j] is stored in AP[i+(j*(j+1)/2)]. I
f
* uplo == 'L' or 'L', the array AP contains the lower triangular pa
rt
* of the symmetric matrix A, packed sequentially, column by column;
* that is, if i >= j, then A[i,j] is stored in AP[i+((2*n-j+1)*j)/2
].
* x double precision complex array of length at least (1 + (n - 1) *
abs(incx)).
* On entry, x contains the source vector. On exit, x is overwritten
* with the result vector.
* incx specifies the storage spacing for elements of x. incx must not be
* zero.
*
* Output
* ------
* x updated according to x = op(A) * x,
*
* Reference: http://www.netlib.org/blas/ztpmv.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if incx == 0 or n < 0
* CUBLAS_STATUS_ALLOC_FAILED if function cannot allocate enough intern
al scratch vector memory
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* Double precision complex packed triangular matrix-vector multiply: x = op(A) * x. */
void CUBLASAPI cublasZtpmv (char uplo, char trans, char diag, int n,
                            const cuDoubleComplex *AP,
                            cuDoubleComplex *x, int incx);
/*
* void
* cublasZtpsv (char uplo, char trans, char diag, int n, const cuDoubleComp
lex *AP,
* cuDoubleComplex *X, int incx)
*
* solves one of the systems of equations op(A)*x = b, where op(A) is eithe
r
 * op(A) = A , op(A) = transpose(A) or op(A) = conjugate(transpose(A)). b and
* x are n element complex vectors, and A is an n x n unit or non-unit,
* upper or lower triangular matrix. No test for singularity or near-singul
arity
* is included in this routine. Such tests must be performed before calling
this routine.
*
* Input
* -----
* uplo specifies whether the matrix is an upper or lower triangular matr
ix
 * as follows: If uplo == 'U' or 'u', A is an upper triangular matri
x.
* If uplo == 'L' or 'l', A is a lower triangular matrix.
* trans specifies op(A). If trans == 'N' or 'n', op(A) = A. If trans == '
T'
* or 't', op(A) = transpose(A). If trans == 'C' or 'c', op(A) =
* conjugate(transpose(A)).
* diag specifies whether A is unit triangular. If diag == 'U' or 'u', A
is
 * assumed to be unit triangular; that is, diagonal elements are not
* read and are assumed to be unity. If diag == 'N' or 'n', A is not
* assumed to be unit triangular.
* n specifies the number of rows and columns of the matrix A. n must
be
* at least zero.
* AP double precision complex array with at least ((n*(n+1))/2) elemen
ts.
* If uplo == 'U' or 'u', the array AP contains the upper triangular
* matrix A, packed sequentially, column by column; that is, if i <=
j, then
 * A[i,j] is stored in AP[i+(j*(j+1)/2)]. If uplo == 'L' or 'L', the
* array AP contains the lower triangular matrix A, packed sequentia
lly,
* column by column; that is, if i >= j, then A[i,j] is stored in
* AP[i+((2*n-j+1)*j)/2]. When diag = 'U' or 'u', the diagonal eleme
nts
* of A are not referenced and are assumed to be unity.
* x double precision complex array of length at least (1+(n-1)*abs(in
cx)).
* incx storage spacing between elements of x. It must not be zero.
*
* Output
* ------
* x updated to contain the solution vector x that solves op(A) * x =
b.
*
* Reference: http://www.netlib.org/blas/ztpsv.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if incx == 0 or if n < 0 or n > 2035
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* Double precision complex packed triangular solve op(A)*x = b; no singularity test. */
void CUBLASAPI cublasZtpsv (char uplo, char trans, char diag, int n,
                            const cuDoubleComplex *AP,
                            cuDoubleComplex *x, int incx);
/* ----------------- CUBLAS single complex BLAS2 functions ---------------- - */ /* ----------------- CUBLAS single complex BLAS2 functions ---------------- - */
/*
* cublasCgemv (char trans, int m, int n, cuComplex alpha, const cuComplex
*A,
* int lda, const cuComplex *x, int incx, cuComplex beta, cuCo
mplex *y,
* int incy)
*
* performs one of the matrix-vector operations
*
* y = alpha * op(A) * x + beta * y,
*
* where op(A) is one of
*
* op(A) = A or op(A) = transpose(A) or op(A) = conjugate(transpose(
A))
*
 * where alpha and beta are single precision complex scalars, x and y are single
 * precision complex vectors, and A is an m x n matrix consisting of single
 * precision complex elements. Matrix A is stored in column major format, and lda is the lead
ing
* dimension of the two-dimensional array in which A is stored.
*
* Input
* -----
 * trans  specifies op(A). If trans = 'n' or 'N', op(A) = A. If
 *        trans = 't' or 'T', op(A) = transpose(A). If trans = 'c' or 'C',
* op(A) = conjugate(transpose(A))
* m specifies the number of rows of the matrix A. m must be at least
* zero.
* n specifies the number of columns of the matrix A. n must be at lea
st
* zero.
* alpha single precision scalar multiplier applied to op(A).
* A single precision array of dimensions (lda, n) if trans = 'n' or
* 'N'), and of dimensions (lda, m) otherwise. lda must be at least
* max(1, m) and at least max(1, n) otherwise.
* lda leading dimension of two-dimensional array used to store matrix A
* x single precision array of length at least (1 + (n - 1) * abs(incx
))
* when trans = 'N' or 'n' and at least (1 + (m - 1) * abs(incx))
* otherwise.
* incx specifies the storage spacing between elements of x. incx must no
t
* be zero.
* beta single precision scalar multiplier applied to vector y. If beta
* is zero, y is not read.
* y single precision array of length at least (1 + (m - 1) * abs(incy
))
* when trans = 'N' or 'n' and at least (1 + (n - 1) * abs(incy))
* otherwise.
* incy specifies the storage spacing between elements of y. incy must no
t
* be zero.
*
* Output
* ------
* y updated according to alpha * op(A) * x + beta * y
*
* Reference: http://www.netlib.org/blas/cgemv.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if m or n are < 0, or if incx or incy ==
0
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* Single precision complex general matrix-vector multiply: y = alpha*op(A)*x + beta*y. */
void CUBLASAPI cublasCgemv (char trans, int m, int n, cuComplex alpha,
                            const cuComplex *A, int lda,
                            const cuComplex *x, int incx,
                            cuComplex beta, cuComplex *y, int incy);
/*
* void
* cublasCgbmv (char trans, int m, int n, int kl, int ku, cuComplex alpha,
* const cuComplex *A, int lda, const cuComplex *x, int incx,
cuComplex beta,
* cuComplex *y, int incy);
*
* performs one of the matrix-vector operations
*
* y = alpha*op(A)*x + beta*y, op(A)=A or op(A) = transpose(A)
*
* alpha and beta are single precision complex scalars. x and y are single
precision
* complex vectors. A is an m by n band matrix consisting of single precisi
on complex elements
* with kl sub-diagonals and ku super-diagonals.
*
* Input
* -----
* trans specifies op(A). If trans == 'N' or 'n', op(A) = A. If trans == '
T',
* or 't', op(A) = transpose(A). If trans == 'C' or 'c',
* op(A) = conjugate(transpose(A)).
* m specifies the number of rows of the matrix A. m must be at least
* zero.
* n specifies the number of columns of the matrix A. n must be at lea
st
* zero.
* kl specifies the number of sub-diagonals of matrix A. It must be at
* least zero.
* ku specifies the number of super-diagonals of matrix A. It must be a
t
* least zero.
* alpha single precision complex scalar multiplier applied to op(A).
* A single precision complex array of dimensions (lda, n). The leadin
g
* (kl + ku + 1) x n part of the array A must contain the band matri
x A,
* supplied column by column, with the leading diagonal of the matri
x
* in row (ku + 1) of the array, the first super-diagonal starting a
t
* position 2 in row ku, the first sub-diagonal starting at position
1
* in row (ku + 2), and so on. Elements in the array A that do not
* correspond to elements in the band matrix (such as the top left
* ku x ku triangle) are not referenced.
* lda leading dimension of A. lda must be at least (kl + ku + 1).
* x single precision complex array of length at least (1+(n-1)*abs(in
cx)) when
* trans == 'N' or 'n' and at least (1+(m-1)*abs(incx)) otherwise.
* incx specifies the increment for the elements of x. incx must not be z
ero.
* beta single precision complex scalar multiplier applied to vector y. I
f beta is
* zero, y is not read.
* y single precision complex array of length at least (1+(m-1)*abs(in
cy)) when
* trans == 'N' or 'n' and at least (1+(n-1)*abs(incy)) otherwise. I
f
* beta is zero, y is not read.
* incy On entry, incy specifies the increment for the elements of y. inc
y
* must not be zero.
*
* Output
* ------
* y updated according to y = alpha*op(A)*x + beta*y
*
* Reference: http://www.netlib.org/blas/cgbmv.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0, or if incx or incy == 0
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* Single precision complex banded matrix-vector multiply: y = alpha*op(A)*x + beta*y. */
void CUBLASAPI cublasCgbmv (char trans, int m, int n, int kl, int ku,
                            cuComplex alpha, const cuComplex *A, int lda,
                            const cuComplex *x, int incx,
                            cuComplex beta, cuComplex *y, int incy);
/*
* void
* cublasChemv (char uplo, int n, cuComplex alpha, const cuComplex *A, int
lda,
* const cuComplex *x, int incx, cuComplex beta, cuComplex *y,
int incy)
*
* performs the matrix-vector operation
*
* y = alpha*A*x + beta*y
*
* Alpha and beta are single precision complex scalars, and x and y are sin
gle
* precision complex vectors, each with n elements. A is a hermitian n x n
matrix
* consisting of single precision complex elements that is stored in either
upper or
* lower storage mode.
*
* Input
* -----
* uplo specifies whether the upper or lower triangular part of the array
A
* is to be referenced. If uplo == 'U' or 'u', the hermitian matrix
A
* is stored in upper storage mode, i.e. only the upper triangular p
art
* of A is to be referenced while the lower triangular part of A is
to
* be inferred. If uplo == 'L' or 'l', the hermitian matrix A is sto
red
* in lower storage mode, i.e. only the lower triangular part of A i
s
* to be referenced while the upper triangular part of A is to be
* inferred.
* n specifies the number of rows and the number of columns of the
* hermitian matrix A. n must be at least zero.
* alpha single precision complex scalar multiplier applied to A*x.
* A single precision complex array of dimensions (lda, n). If uplo ==
'U' or 'u',
* the leading n x n upper triangular part of the array A must conta
in
* the upper triangular part of the hermitian matrix and the strictl
y
* lower triangular part of A is not referenced. If uplo == 'L' or '
l',
* the leading n x n lower triangular part of the array A must conta
in
* the lower triangular part of the hermitian matrix and the strictl
y
* upper triangular part of A is not referenced. The imaginary parts
* of the diagonal elements need not be set, they are assumed to be
zero.
* lda leading dimension of A. It must be at least max (1, n).
* x single precision complex array of length at least (1 + (n - 1) *
abs(incx)).
* incx storage spacing between elements of x. incx must not be zero.
* beta single precision complex scalar multiplier applied to vector y.
* y single precision complex array of length at least (1 + (n - 1) *
abs(incy)).
* If beta is zero, y is not read.
* incy storage spacing between elements of y. incy must not be zero.
*
* Output
* ------
* y updated according to y = alpha*A*x + beta*y
*
* Reference: http://www.netlib.org/blas/chemv.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0, or if incx or incy == 0
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasChemv (char uplo, int n, cuComplex alpha,
                            const cuComplex *A, int lda, const cuComplex *x,
                            int incx, cuComplex beta, cuComplex *y, int incy);
/*
 * void
 * cublasChbmv (char uplo, int n, int k, cuComplex alpha, const cuComplex *A, int lda,
 *              const cuComplex *x, int incx, cuComplex beta, cuComplex *y, int incy)
 *
 * performs the matrix-vector operation
 *
 *     y := alpha*A*x + beta*y
 *
 * alpha and beta are single precision complex scalars. x and y are single precision
 * complex vectors with n elements. A is an n by n hermitian band matrix consisting
 * of single precision complex elements, with k super-diagonals and the same number
 * of subdiagonals.
 *
 * Input
 * -----
 * uplo   specifies whether the upper or lower triangular part of the hermitian
 *        band matrix A is being supplied. If uplo == 'U' or 'u', the upper
 *        triangular part is being supplied. If uplo == 'L' or 'l', the lower
 *        triangular part is being supplied.
 * n      specifies the number of rows and the number of columns of the
 *        hermitian matrix A. n must be at least zero.
 * k      specifies the number of super-diagonals of matrix A. Since the matrix
 *        is hermitian, this is also the number of sub-diagonals. k must be at
 *        least zero.
 * alpha  single precision complex scalar multiplier applied to A*x.
 * A      single precision complex array of dimensions (lda, n). When uplo == 'U' or
 *        'u', the leading (k + 1) x n part of array A must contain the upper
 *        triangular band of the hermitian matrix, supplied column by column,
 *        with the leading diagonal of the matrix in row (k+1) of the array,
 *        the first super-diagonal starting at position 2 in row k, and so on.
 *        The top left k x k triangle of the array A is not referenced. When
 *        uplo == 'L' or 'l', the leading (k + 1) x n part of the array A must
 *        contain the lower triangular band part of the hermitian matrix,
 *        supplied column by column, with the leading diagonal of the matrix in
 *        row 1 of the array, the first sub-diagonal starting at position 1 in
 *        row 2, and so on. The bottom right k x k triangle of the array A is
 *        not referenced. The imaginary parts of the diagonal elements need
 *        not be set, they are assumed to be zero.
 * lda    leading dimension of A. lda must be at least (k + 1).
 * x      single precision complex array of length at least (1 + (n - 1) * abs(incx)).
 * incx   storage spacing between elements of x. incx must not be zero.
 * beta   single precision complex scalar multiplier applied to vector y. If beta is
 *        zero, y is not read.
 * y      single precision complex array of length at least (1 + (n - 1) * abs(incy)).
 *        If beta is zero, y is not read.
 * incy   storage spacing between elements of y. incy must not be zero.
 *
 * Output
 * ------
 * y      updated according to alpha*A*x + beta*y
 *
 * Reference: http://www.netlib.org/blas/chbmv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if k or n < 0, or if incx or incy == 0
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasChbmv (char uplo, int n, int k, cuComplex alpha,
                            const cuComplex *A, int lda, const cuComplex *x,
                            int incx, cuComplex beta, cuComplex *y, int incy);
/*
 * cublasChpmv: single precision complex hermitian packed matrix-vector
 * operation; analogous to cublasChemv, with matrix A supplied in packed
 * form in AP. Reference: http://www.netlib.org/blas/chpmv.f
 * NOTE(review): no documentation block was present for this prototype in the
 * original header chunk; see the netlib reference for the parameter contract.
 */
void CUBLASAPI cublasChpmv (char uplo, int n, cuComplex alpha,
                            const cuComplex *AP, const cuComplex *x, int incx,
                            cuComplex beta, cuComplex *y, int incy);
/*
 * void
 * cublasCtrmv (char uplo, char trans, char diag, int n, const cuComplex *A,
 *              int lda, cuComplex *x, int incx);
 *
 * performs one of the matrix-vector operations x = op(A) * x,
 * where op(A) = A, or op(A) = transpose(A) or op(A) = conjugate(transpose(A)).
 * x is an n-element single precision complex vector, and
 * A is an n x n, unit or non-unit, upper or lower, triangular matrix composed
 * of single precision complex elements.
 *
 * Input
 * -----
 * uplo   specifies whether the matrix A is an upper or lower triangular
 *        matrix. If uplo = 'U' or 'u', then A is an upper triangular matrix.
 *        If uplo = 'L' or 'l', then A is a lower triangular matrix.
 * trans  specifies op(A). If trans = 'n' or 'N', op(A) = A. If trans = 't' or
 *        'T', op(A) = transpose(A). If trans = 'c' or 'C', op(A) =
 *        conjugate(transpose(A)).
 * diag   specifies whether or not matrix A is unit triangular. If diag = 'U'
 *        or 'u', A is assumed to be unit triangular. If diag = 'N' or 'n', A
 *        is not assumed to be unit triangular.
 * n      specifies the number of rows and columns of the matrix A. n must be
 *        at least zero.
 * A      single precision array of dimension (lda, n). If uplo = 'U' or 'u',
 *        the leading n x n upper triangular part of the array A must contain
 *        the upper triangular matrix and the strictly lower triangular part
 *        of A is not referenced. If uplo = 'L' or 'l', the leading n x n lower
 *        triangular part of the array A must contain the lower triangular
 *        matrix and the strictly upper triangular part of A is not referenced.
 *        When diag = 'U' or 'u', the diagonal elements of A are not referenced
 *        either, but are assumed to be unity.
 * lda    is the leading dimension of A. It must be at least max (1, n).
 * x      single precision array of length at least (1 + (n - 1) * abs(incx)).
 *        On entry, x contains the source vector. On exit, x is overwritten
 *        with the result vector.
 * incx   specifies the storage spacing for elements of x. incx must not be
 *        zero.
 *
 * Output
 * ------
 * x      updated according to x = op(A) * x,
 *
 * Reference: http://www.netlib.org/blas/ctrmv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if incx == 0 or if n < 0
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasCtrmv (char uplo, char trans, char diag, int n,
                            const cuComplex *A, int lda, cuComplex *x,
                            int incx);
/*
 * void
 * cublasCtbmv (char uplo, char trans, char diag, int n, int k, const cuComplex *A,
 *              int lda, cuComplex *x, int incx)
 *
 * performs one of the matrix-vector operations x = op(A) * x, where op(A) = A,
 * op(A) = transpose(A) or op(A) = conjugate(transpose(A)). x is an n-element
 * single precision complex vector, and A is an n x n, unit or non-unit, upper
 * or lower triangular band matrix composed of single precision complex elements.
 *
 * Input
 * -----
 * uplo   specifies whether the matrix A is an upper or lower triangular band
 *        matrix. If uplo == 'U' or 'u', A is an upper triangular band matrix.
 *        If uplo == 'L' or 'l', A is a lower triangular band matrix.
 * trans  specifies op(A). If transa == 'N' or 'n', op(A) = A. If trans == 'T',
 *        or 't', op(A) = transpose(A). If trans == 'C' or 'c',
 *        op(A) = conjugate(transpose(A)).
 * diag   specifies whether or not matrix A is unit triangular. If diag == 'U'
 *        or 'u', A is assumed to be unit triangular. If diag == 'N' or 'n', A
 *        is not assumed to be unit triangular.
 * n      specifies the number of rows and columns of the matrix A. n must be
 *        at least zero.
 * k      specifies the number of super- or sub-diagonals. If uplo == 'U' or
 *        'u', k specifies the number of super-diagonals. If uplo == 'L' or
 *        'l', k specifies the number of sub-diagonals. k must at least be
 *        zero.
 * A      single precision complex array of dimension (lda, n). If uplo == 'U' or 'u',
 *        the leading (k + 1) x n part of the array A must contain the upper
 *        triangular band matrix, supplied column by column, with the leading
 *        diagonal of the matrix in row (k + 1) of the array, the first
 *        super-diagonal starting at position 2 in row k, and so on. The top
 *        left k x k triangle of the array A is not referenced. If uplo == 'L'
 *        or 'l', the leading (k + 1) x n part of the array A must contain the
 *        lower triangular band matrix, supplied column by column, with the
 *        leading diagonal of the matrix in row 1 of the array, the first
 *        sub-diagonal starting at position 1 in row 2, and so on. The bottom
 *        right k x k triangle of the array is not referenced.
 * lda    is the leading dimension of A. It must be at least (k + 1).
 * x      single precision complex array of length at least (1 + (n - 1) * abs(incx)).
 *        On entry, x contains the source vector. On exit, x is overwritten
 *        with the result vector.
 * incx   specifies the storage spacing for elements of x. incx must not be
 *        zero.
 *
 * Output
 * ------
 * x      updated according to x = op(A) * x
 *
 * Reference: http://www.netlib.org/blas/ctbmv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if n or k < 0, or if incx == 0
 * CUBLAS_STATUS_ALLOC_FAILED     if function cannot allocate enough internal scratch vector memory
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasCtbmv (char uplo, char trans, char diag, int n, int k,
                            const cuComplex *A, int lda, cuComplex *x,
                            int incx);
/*
 * void
 * cublasCtpmv (char uplo, char trans, char diag, int n, const cuComplex *AP,
 *              cuComplex *x, int incx);
 *
 * performs one of the matrix-vector operations x = op(A) * x, where op(A) = A,
 * op(A) = transpose(A) or op(A) = conjugate(transpose(A)). x is an n element
 * single precision complex vector, and A is an n x n, unit or non-unit, upper
 * or lower triangular matrix composed of single precision complex elements.
 *
 * Input
 * -----
 * uplo   specifies whether the matrix A is an upper or lower triangular
 *        matrix. If uplo == 'U' or 'u', then A is an upper triangular matrix.
 *        If uplo == 'L' or 'l', then A is a lower triangular matrix.
 * trans  specifies op(A). If transa == 'N' or 'n', op(A) = A. If trans == 'T',
 *        or 't', op(A) = transpose(A). If trans == 'C' or 'c',
 *        op(A) = conjugate(transpose(A)).
 *
 * diag   specifies whether or not matrix A is unit triangular. If diag == 'U'
 *        or 'u', A is assumed to be unit triangular. If diag == 'N' or 'n', A
 *        is not assumed to be unit triangular.
 * n      specifies the number of rows and columns of the matrix A. n must be
 *        at least zero. In the current implementation n must not exceed 4070.
 * AP     single precision complex array with at least ((n * (n + 1)) / 2) elements. If
 *        uplo == 'U' or 'u', the array AP contains the upper triangular part
 *        of the symmetric matrix A, packed sequentially, column by column;
 *        that is, if i <= j, then A[i,j] is stored in AP[i+(j*(j+1)/2)]. If
 *        uplo == 'L' or 'L', the array AP contains the lower triangular part
 *        of the symmetric matrix A, packed sequentially, column by column;
 *        that is, if i >= j, then A[i,j] is stored in AP[i+((2*n-j+1)*j)/2].
 * x      single precision complex array of length at least (1 + (n - 1) * abs(incx)).
 *        On entry, x contains the source vector. On exit, x is overwritten
 *        with the result vector.
 * incx   specifies the storage spacing for elements of x. incx must not be
 *        zero.
 *
 * Output
 * ------
 * x      updated according to x = op(A) * x,
 *
 * Reference: http://www.netlib.org/blas/ctpmv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if incx == 0 or n < 0
 * CUBLAS_STATUS_ALLOC_FAILED     if function cannot allocate enough internal scratch vector memory
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasCtpmv (char uplo, char trans, char diag, int n,
                            const cuComplex *AP, cuComplex *x, int incx);
/*
 * void
 * cublasCtrsv (char uplo, char trans, char diag, int n, const cuComplex *A,
 *              int lda, cuComplex *x, int incx)
 *
 * solves a system of equations op(A) * x = b, where op(A) is either A,
 * transpose(A) or conjugate(transpose(A)). b and x are single precision
 * complex vectors consisting of n elements, and A is an n x n matrix
 * composed of a unit or non-unit, upper or lower triangular matrix.
 * Matrix A is stored in column major format, and lda is the leading
 * dimension of the two-dimensional array containing A.
 *
 * No test for singularity or near-singularity is included in this function.
 * Such tests must be performed before calling this function.
 *
 * Input
 * -----
 * uplo   specifies whether the matrix data is stored in the upper or the
 *        lower triangular part of array A. If uplo = 'U' or 'u', then only
 *        the upper triangular part of A may be referenced. If uplo = 'L' or
 *        'l', then only the lower triangular part of A may be referenced.
 * trans  specifies op(A). If transa = 'n' or 'N', op(A) = A. If transa = 't',
 *        'T', 'c', or 'C', op(A) = transpose(A)
 * diag   specifies whether or not A is a unit triangular matrix like so:
 *        if diag = 'U' or 'u', A is assumed to be unit triangular. If
 *        diag = 'N' or 'n', then A is not assumed to be unit triangular.
 * n      specifies the number of rows and columns of the matrix A. It
 *        must be at least 0.
 * A      is a single precision complex array of dimensions (lda, n). If uplo = 'U'
 *        or 'u', then A must contain the upper triangular part of a symmetric
 *        matrix, and the strictly lower triangular part is not referenced.
 *        If uplo = 'L' or 'l', then A contains the lower triangular part of
 *        a symmetric matrix, and the strictly upper triangular part is not
 *        referenced.
 * lda    is the leading dimension of the two-dimensional array containing A.
 *        lda must be at least max(1, n).
 * x      single precision complex array of length at least (1 + (n - 1) * abs(incx)).
 *        On entry, x contains the n element right-hand side vector b. On exit,
 *        it is overwritten with the solution vector x.
 * incx   specifies the storage spacing between elements of x. incx must not
 *        be zero.
 *
 * Output
 * ------
 * x      updated to contain the solution vector x that solves op(A) * x = b.
 *
 * Reference: http://www.netlib.org/blas/ctrsv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if incx == 0 or if n < 0
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasCtrsv (char uplo, char trans, char diag, int n,
                            const cuComplex *A, int lda, cuComplex *x,
                            int incx);
/*
 * void cublasCtbsv (char uplo, char trans, char diag, int n, int k,
 *                   const cuComplex *A, int lda, cuComplex *X, int incx)
 *
 * solves one of the systems of equations op(A)*x = b, where op(A) is either
 * op(A) = A, op(A) = transpose(A) or op(A) = conjugate(transpose(A)).
 * b and x are n element vectors, and A is an n x n unit or non-unit,
 * upper or lower triangular band matrix with k + 1 diagonals. No test
 * for singularity or near-singularity is included in this function.
 * Such tests must be performed before calling this function.
 *
 * Input
 * -----
 * uplo   specifies whether the matrix is an upper or lower triangular band
 *        matrix as follows: If uplo == 'U' or 'u', A is an upper triangular
 *        band matrix. If uplo == 'L' or 'l', A is a lower triangular band
 *        matrix.
 * trans  specifies op(A). If trans == 'N' or 'n', op(A) = A. If trans == 'T',
 *        't', op(A) = transpose(A). If trans == 'C' or 'c',
 *        op(A) = conjugate(transpose(A)).
 * diag   specifies whether A is unit triangular. If diag == 'U' or 'u', A is
 *        assumed to be unit triangular; that is, diagonal elements are not
 *        read and are assumed to be unity. If diag == 'N' or 'n', A is not
 *        assumed to be unit triangular.
 * n      specifies the number of rows and columns of the matrix A. n must be
 *        at least zero.
 * k      specifies the number of super- or sub-diagonals. If uplo == 'U' or
 *        'u', k specifies the number of super-diagonals. If uplo == 'L' or
 *        'l', k specifies the number of sub-diagonals. k must at least be
 *        zero.
 * A      single precision complex array of dimension (lda, n). If uplo == 'U' or 'u',
 *        the leading (k + 1) x n part of the array A must contain the upper
 *        triangular band matrix, supplied column by column, with the leading
 *        diagonal of the matrix in row (k + 1) of the array, the first super-
 *        diagonal starting at position 2 in row k, and so on. The top left
 *        k x k triangle of the array A is not referenced. If uplo == 'L' or
 *        'l', the leading (k + 1) x n part of the array A must contain the
 *        lower triangular band matrix, supplied column by column, with the
 *        leading diagonal of the matrix in row 1 of the array, the first
 *        sub-diagonal starting at position 1 in row 2, and so on. The bottom
 *        right k x k triangle of the array is not referenced.
 * x      single precision complex array of length at least (1+(n-1)*abs(incx)).
 * incx   storage spacing between elements of x. It must not be zero.
 *
 * Output
 * ------
 * x      updated to contain the solution vector x that solves op(A) * x = b.
 *
 * Reference: http://www.netlib.org/blas/ctbsv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if incx == 0, n < 0 or n > 2035
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasCtbsv (char uplo, char trans, char diag, int n, int k,
                            const cuComplex *A, int lda, cuComplex *x,
                            int incx);
/*
 * void
 * cublasCtpsv (char uplo, char trans, char diag, int n, const cuComplex *AP,
 *              cuComplex *X, int incx)
 *
 * solves one of the systems of equations op(A)*x = b, where op(A) is either
 * op(A) = A, op(A) = transpose(A) or op(A) = conjugate(transpose(A)). b and
 * x are n element complex vectors, and A is an n x n unit or non-unit,
 * upper or lower triangular matrix. No test for singularity or near-singularity
 * is included in this routine. Such tests must be performed before calling this routine.
 *
 * Input
 * -----
 * uplo   specifies whether the matrix is an upper or lower triangular matrix
 *        as follows: If uplo == 'U' or 'u', A is an upper triangular matrix.
 *        If uplo == 'L' or 'l', A is a lower triangular matrix.
 * trans  specifies op(A). If trans == 'N' or 'n', op(A) = A. If trans == 'T'
 *        or 't', op(A) = transpose(A). If trans == 'C' or 'c', op(A) =
 *        conjugate(transpose(A)).
 * diag   specifies whether A is unit triangular. If diag == 'U' or 'u', A is
 *        assumed to be unit triangular; that is, diagonal elements are not
 *        read and are assumed to be unity. If diag == 'N' or 'n', A is not
 *        assumed to be unit triangular.
 * n      specifies the number of rows and columns of the matrix A. n must be
 *        at least zero.
 * AP     single precision complex array with at least ((n*(n+1))/2) elements.
 *        If uplo == 'U' or 'u', the array AP contains the upper triangular
 *        matrix A, packed sequentially, column by column; that is, if i <= j, then
 *        A[i,j] is stored in AP[i+(j*(j+1)/2)]. If uplo == 'L' or 'L', the
 *        array AP contains the lower triangular matrix A, packed sequentially,
 *        column by column; that is, if i >= j, then A[i,j] is stored in
 *        AP[i+((2*n-j+1)*j)/2]. When diag = 'U' or 'u', the diagonal elements
 *        of A are not referenced and are assumed to be unity.
 * x      single precision complex array of length at least (1+(n-1)*abs(incx)).
 * incx   storage spacing between elements of x. It must not be zero.
 *
 * Output
 * ------
 * x      updated to contain the solution vector x that solves op(A) * x = b.
 *
 * Reference: http://www.netlib.org/blas/ctpsv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if incx == 0 or if n < 0 or n > 2035
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasCtpsv (char uplo, char trans, char diag, int n,
                            const cuComplex *AP, cuComplex *x, int incx);
/*
 * cublasCgeru (int m, int n, cuComplex alpha, const cuComplex *x, int incx,
 *              const cuComplex *y, int incy, cuComplex *A, int lda)
 *
 * performs the symmetric rank 1 operation
 *
 *     A = alpha * x * transpose(y) + A,
 *
 * where alpha is a single precision complex scalar, x is an m element single
 * precision complex vector, y is an n element single precision complex vector, and A
 * is an m by n matrix consisting of single precision complex elements. Matrix A
 * is stored in column major format, and lda is the leading dimension of
 * the two-dimensional array used to store A.
 *
 * Input
 * -----
 * m      specifies the number of rows of the matrix A. It must be at least
 *        zero.
 * n      specifies the number of columns of the matrix A. It must be at
 *        least zero.
 * alpha  single precision complex scalar multiplier applied to x * transpose(y)
 * x      single precision complex array of length at least (1 + (m - 1) * abs(incx))
 * incx   specifies the storage spacing between elements of x. incx must not
 *        be zero.
 * y      single precision complex array of length at least (1 + (n - 1) * abs(incy))
 * incy   specifies the storage spacing between elements of y. incy must not
 *        be zero.
 * A      single precision complex array of dimensions (lda, n).
 * lda    leading dimension of two-dimensional array used to store matrix A
 *
 * Output
 * ------
 * A      updated according to A = alpha * x * transpose(y) + A
 *
 * Reference: http://www.netlib.org/blas/cgeru.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if m < 0, n < 0, incx == 0, incy == 0
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasCgeru (int m, int n, cuComplex alpha, const cuComplex *x,
                            int incx, const cuComplex *y, int incy,
                            cuComplex *A, int lda);
/*
 * cublasCgerc (int m, int n, cuComplex alpha, const cuComplex *x, int incx,
 *              const cuComplex *y, int incy, cuComplex *A, int lda)
 *
 * performs the symmetric rank 1 operation
 *
 *     A = alpha * x * conjugate(transpose(y)) + A,
 *
 * where alpha is a single precision complex scalar, x is an m element single
 * precision complex vector, y is an n element single precision complex vector, and A
 * is an m by n matrix consisting of single precision complex elements. Matrix A
 * is stored in column major format, and lda is the leading dimension of
 * the two-dimensional array used to store A.
 *
 * Input
 * -----
 * m      specifies the number of rows of the matrix A. It must be at least
 *        zero.
 * n      specifies the number of columns of the matrix A. It must be at
 *        least zero.
 * alpha  single precision complex scalar multiplier applied to
 *        x * conjugate(transpose(y))
 * x      single precision complex array of length at least (1 + (m - 1) * abs(incx))
 * incx   specifies the storage spacing between elements of x. incx must not
 *        be zero.
 * y      single precision complex array of length at least (1 + (n - 1) * abs(incy))
 * incy   specifies the storage spacing between elements of y. incy must not
 *        be zero.
 * A      single precision complex array of dimensions (lda, n).
 * lda    leading dimension of two-dimensional array used to store matrix A
 *
 * Output
 * ------
 * A      updated according to A = alpha * x * conjugate(transpose(y)) + A
 *
 * Reference: http://www.netlib.org/blas/cgerc.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if m < 0, n < 0, incx == 0, incy == 0
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasCgerc (int m, int n, cuComplex alpha, const cuComplex *x,
                            int incx, const cuComplex *y, int incy,
                            cuComplex *A, int lda);
/*
 * void
 * cublasCher (char uplo, int n, float alpha, const cuComplex *x, int incx,
 *             cuComplex *A, int lda)
 *
 * performs the hermitian rank 1 operation
 *
 *     A = alpha * x * conjugate(transpose(x)) + A,
 *
 * where alpha is a single precision real scalar, x is an n element single
 * precision complex vector and A is an n x n hermitian matrix consisting of
 * single precision complex elements. Matrix A is stored in column major format,
 * and lda is the leading dimension of the two-dimensional array
 * containing A.
 *
 * Input
 * -----
 * uplo   specifies whether the matrix data is stored in the upper or
 *        the lower triangular part of array A. If uplo = 'U' or 'u',
 *        then only the upper triangular part of A may be referenced.
 *        If uplo = 'L' or 'l', then only the lower triangular part of
 *        A may be referenced.
 * n      specifies the number of rows and columns of the matrix A. It
 *        must be at least 0.
 * alpha  single precision real scalar multiplier applied to
 *        x * conjugate(transpose(x))
 * x      single precision complex array of length at least (1 + (n - 1) * abs(incx))
 * incx   specifies the storage spacing between elements of x. incx must
 *        not be zero.
 * A      single precision complex array of dimensions (lda, n). If uplo = 'U' or
 *        'u', then A must contain the upper triangular part of a hermitian
 *        matrix, and the strictly lower triangular part is not referenced.
 *        If uplo = 'L' or 'l', then A contains the lower triangular part
 *        of a hermitian matrix, and the strictly upper triangular part is
 *        not referenced. The imaginary parts of the diagonal elements need
 *        not be set, they are assumed to be zero, and on exit they
 *        are set to zero.
 * lda    leading dimension of the two-dimensional array containing A. lda
 *        must be at least max(1, n).
 *
 * Output
 * ------
 * A      updated according to A = alpha * x * conjugate(transpose(x)) + A
 *
 * Reference: http://www.netlib.org/blas/cher.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if n < 0, or incx == 0
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasCher (char uplo, int n, float alpha,
                           const cuComplex *x, int incx, cuComplex *A,
                           int lda);
/*
 * void
 * cublasChpr (char uplo, int n, float alpha, const cuComplex *x, int incx,
 *             cuComplex *AP)
 *
 * performs the hermitian rank 1 operation
 *
 *     A = alpha * x * conjugate(transpose(x)) + A,
 *
 * where alpha is a single precision real scalar and x is an n element single
 * precision complex vector. A is a hermitian n x n matrix consisting of single
 * precision complex elements that is supplied in packed form.
 *
 * Input
 * -----
 * uplo   specifies whether the matrix data is stored in the upper or the lower
 *        triangular part of array AP. If uplo == 'U' or 'u', then the upper
 *        triangular part of A is supplied in AP. If uplo == 'L' or 'l', then
 *        the lower triangular part of A is supplied in AP.
 * n      specifies the number of rows and columns of the matrix A. It must be
 *        at least zero.
 * alpha  single precision real scalar multiplier applied to x * conjugate(transpose(x)).
 * x      single precision array of length at least (1 + (n - 1) * abs(incx)).
 * incx   storage spacing between elements of x. incx must not be zero.
 * AP     single precision complex array with at least ((n * (n + 1)) / 2) elements. If
 *        uplo == 'U' or 'u', the array AP contains the upper triangular part
 *        of the hermitian matrix A, packed sequentially, column by column;
 *        that is, if i <= j, then A[i,j] is stored in AP[i+(j*(j+1)/2)]. If
 *        uplo == 'L' or 'L', the array AP contains the lower triangular part
 *        of the hermitian matrix A, packed sequentially, column by column;
 *        that is, if i >= j, then A[i,j] is stored in AP[i+((2*n-j+1)*j)/2].
 *        The imaginary parts of the diagonal elements need not be set, they
 *        are assumed to be zero, and on exit they are set to zero.
 *
 * Output
 * ------
 * A      updated according to A = alpha * x * conjugate(transpose(x)) + A
 *
 * Reference: http://www.netlib.org/blas/chpr.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if n < 0, or incx == 0
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasChpr (char uplo, int n, float alpha,
                           const cuComplex *x, int incx, cuComplex *AP);
/*
* void
* cublasChpr2 (char uplo, int n, cuComplex alpha, const cuComplex *x, int
incx,
* const cuComplex *y, int incy, cuComplex *AP)
*
* performs the hermitian rank 2 operation
*
* A = alpha*x*conjugate(transpose(y)) + conjugate(alpha)*y*conjugate(tr
anspose(x)) + A,
*
* where alpha is a single precision complex scalar, and x and y are n elem
ent single
* precision complex vectors. A is a hermitian n x n matrix consisting of s
ingle
* precision complex elements that is supplied in packed form.
*
* Input
* -----
* uplo specifies whether the matrix data is stored in the upper or the l
ower
* triangular part of array A. If uplo == 'U' or 'u', then only the
* upper triangular part of A may be referenced and the lower triang
ular
* part of A is inferred. If uplo == 'L' or 'l', then only the lower
* triangular part of A may be referenced and the upper triangular p
art
* of A is inferred.
* n specifies the number of rows and columns of the matrix A. It must
be
* at least zero.
* alpha single precision complex scalar multiplier applied to x * conjuga
te(transpose(y)) +
* y * conjugate(transpose(x)).
* x single precision complex array of length at least (1 + (n - 1) *
abs (incx)).
* incx storage spacing between elements of x. incx must not be zero.
* y single precision complex array of length at least (1 + (n - 1) *
abs (incy)).
* incy storage spacing between elements of y. incy must not be zero.
* AP single precision complex array with at least ((n * (n + 1)) / 2)
elements. If
* uplo == 'U' or 'u', the array AP contains the upper triangular pa
rt
* of the hermitian matrix A, packed sequentially, column by column;
* that is, if i <= j, then A[i,j] is stored is AP[i+(j*(j+1)/2)]. I
f
* uplo == 'L' or 'L', the array AP contains the lower triangular pa
rt
* of the hermitian matrix A, packed sequentially, column by column;
* that is, if i >= j, then A[i,j] is stored in AP[i+((2*n-j+1)*j)/2
].
* The imaginary parts of the diagonal elements need not be set, the
y
* are assumed to be zero, and on exit they are set to zero.
*
* Output
* ------
* A updated according to A = alpha*x*conjugate(transpose(y))
* + conjugate(alpha)*y*conjugate(transpose(x
))+A
*
* Reference: http://www.netlib.org/blas/chpr2.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0, incx == 0, incy == 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* CHPR2: hermitian rank-2 update of A held in packed storage AP,
 * A = alpha*x*conjugate(transpose(y)) + conjugate(alpha)*y*conjugate(transpose(x)) + A.
 * Full contract is documented in the comment block above. */
void CUBLASAPI cublasChpr2 (char uplo, int n, cuComplex alpha,
                            const cuComplex *x, int incx, const cuComplex *y,
                            int incy, cuComplex *AP);
/*
 * void cublasCher2 (char uplo, int n, cuComplex alpha, const cuComplex *x, int incx,
 *                   const cuComplex *y, int incy, cuComplex *A, int lda)
 *
 * performs the hermitian rank 2 operation
 *
 *    A = alpha*x*conjugate(transpose(y)) + conjugate(alpha)*y*conjugate(transpose(x)) + A,
 *
 * where alpha is a single precision complex scalar, x and y are n element single
 * precision complex vectors and A is an n by n hermitian matrix consisting of single
 * precision complex elements.
 *
 * Input
 * -----
 * uplo   specifies whether the matrix data is stored in the upper or the lower
 *        triangular part of array A. If uplo == 'U' or 'u', then only the
 *        upper triangular part of A may be referenced and the lower triangular
 *        part of A is inferred. If uplo == 'L' or 'l', then only the lower
 *        triangular part of A may be referenced and the upper triangular part
 *        of A is inferred.
 * n      specifies the number of rows and columns of the matrix A. It must be
 *        at least zero.
 * alpha  single precision complex scalar multiplier applied to
 *        x * conjugate(transpose(y)) + y * conjugate(transpose(x)).
 * x      single precision complex array of length at least (1 + (n - 1) * abs (incx)).
 * incx   storage spacing between elements of x. incx must not be zero.
 * y      single precision complex array of length at least (1 + (n - 1) * abs (incy)).
 * incy   storage spacing between elements of y. incy must not be zero.
 * A      single precision complex array of dimensions (lda, n). If uplo == 'U' or 'u',
 *        then A must contain the upper triangular part of a hermitian matrix,
 *        and the strictly lower triangular part is not referenced. If uplo ==
 *        'L' or 'l', then A contains the lower triangular part of a hermitian
 *        matrix, and the strictly upper triangular part is not referenced.
 *        The imaginary parts of the diagonal elements need not be set,
 *        they are assumed to be zero, and on exit they are set to zero.
 *
 * lda    leading dimension of A. It must be at least max(1, n).
 *
 * Output
 * ------
 * A      updated according to A = alpha*x*conjugate(transpose(y))
 *        + conjugate(alpha)*y*conjugate(transpose(x)) + A
 *
 * Reference: http://www.netlib.org/blas/cher2.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if n < 0, incx == 0, incy == 0
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
/* CHER2: hermitian rank-2 update (full storage),
 * A = alpha*x*conjugate(transpose(y)) + conjugate(alpha)*y*conjugate(transpose(x)) + A. */
void CUBLASAPI cublasCher2 (char uplo, int n, cuComplex alpha,
                            const cuComplex *x, int incx, const cuComplex *y,
                            int incy, cuComplex *A, int lda);
/* CHPR2: same hermitian rank-2 update with A held in packed storage AP. */
void CUBLASAPI cublasChpr2 (char uplo, int n, cuComplex alpha,
                            const cuComplex *x, int incx, const cuComplex *y,
                            int incy, cuComplex *AP);
/* ---------------- CUBLAS single precision BLAS3 functions ---------------- */
/*
 * [lines omitted by the diff viewer — this is the tail of the cublasSsyrk
 *  documentation]
 *        otherwise the leading k x n part of the array must contain the
 *        matrix A.
 * lda    leading dimension of A. When trans == 'N' or 'n' then lda must be at
 *        least max(1, n). Otherwise lda must be at least max(1, k).
 * beta   single precision scalar multiplier applied to C. If beta is zero, C
 *        does not have to be a valid input
 * C      single precision array of dimensions (ldc, n). If uplo == 'U' or 'u',
 *        the leading n x n triangular part of the array C must contain the
 *        upper triangular part of the symmetric matrix C and the strictly
 *        lower triangular part of C is not referenced. On exit, the upper
 *        triangular part of C is overwritten by the upper triangular part of
 *        the updated matrix. If uplo == 'L' or 'l', the leading n x n
 *        triangular part of the array C must contain the lower triangular part
 *        of the symmetric matrix C and the strictly upper triangular part of C
 *        is not referenced. On exit, the lower triangular part of C is
 *        overwritten by the lower triangular part of the updated matrix.
 * ldc    leading dimension of C. It must be at least max(1, n).
 *
 * Output
 * ------
 * C      updated according to C = alpha * A * transpose(A) + beta * C, or C =
 *        alpha * transpose(A) * A + beta * C
 *
 * Reference: http://www.netlib.org/blas/ssyrk.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 * [lines omitted by the diff viewer — this is the tail of the cublasSsyr2k
 *  documentation]
 *        otherwise the leading k x n part of the array must contain the matrix
 *        B.
 * ldb    leading dimension of B. When trans == 'N' or 'n' then ldb must be at
 *        least max(1, n). Otherwise ldb must be at least max(1, k).
 * beta   single precision scalar multiplier applied to C. If beta is zero, C
 *        does not have to be a valid input.
 * C      single precision array of dimensions (ldc, n). If uplo == 'U' or 'u',
 *        the leading n x n triangular part of the array C must contain the
 *        upper triangular part of the symmetric matrix C and the strictly
 *        lower triangular part of C is not referenced. On exit, the upper
 *        triangular part of C is overwritten by the upper triangular part of
 *        the updated matrix. If uplo == 'L' or 'l', the leading n x n
 *        triangular part of the array C must contain the lower triangular part
 *        of the symmetric matrix C and the strictly upper triangular part of C
 *        is not referenced. On exit, the lower triangular part of C is
 *        overwritten by the lower triangular part of the updated matrix.
 * ldc    leading dimension of C. Must be at least max(1, n).
 *
 * Output
 * ------
 * C      updated according to alpha*A*transpose(B) + alpha*B*transpose(A) +
 *        beta*C or alpha*transpose(A)*B + alpha*transpose(B)*A + beta*C
 *
 * Reference: http://www.netlib.org/blas/ssyr2k.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 * [lines omitted by the diff viewer — this is the tail of the cublasCgemm
 *  documentation]
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if any of m, n, or k are < 0
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
/* CGEMM: general matrix-matrix multiply, C = alpha * op(A) * op(B) + beta * C,
 * single precision complex; op(X) selected by transa/transb. */
void CUBLASAPI cublasCgemm (char transa, char transb, int m, int n, int k,
                            cuComplex alpha, const cuComplex *A, int lda,
                            const cuComplex *B, int ldb, cuComplex beta,
                            cuComplex *C, int ldc);
/*
* void
* cublasCsymm (char side, char uplo, int m, int n, cuComplex alpha,
* const cuComplex *A, int lda, const cuComplex *B, int ldb,
* cuComplex beta, cuComplex *C, int ldc);
*
* performs one of the matrix-matrix operations
*
* C = alpha * A * B + beta * C, or
* C = alpha * B * A + beta * C,
*
* where alpha and beta are single precision complex scalars, A is a symmet
ric matrix
* consisting of single precision complex elements and stored in either low
er or upper
* storage mode, and B and C are m x n matrices consisting of single precis
ion
* complex elements.
*
* Input
* -----
* side specifies whether the symmetric matrix A appears on the left side
* hand side or right hand side of matrix B, as follows. If side ==
'L'
* or 'l', then C = alpha * A * B + beta * C. If side = 'R' or 'r',
* then C = alpha * B * A + beta * C.
* uplo specifies whether the symmetric matrix A is stored in upper or lo
wer
* storage mode, as follows. If uplo == 'U' or 'u', only the upper
* triangular part of the symmetric matrix is to be referenced, and
the
* elements of the strictly lower triangular part are to be infered
from
* those in the upper triangular part. If uplo == 'L' or 'l', only t
he
* lower triangular part of the symmetric matrix is to be referenced
,
* and the elements of the strictly upper triangular part are to be
* infered from those in the lower triangular part.
* m specifies the number of rows of the matrix C, and the number of r
ows
* of matrix B. It also specifies the dimensions of symmetric matrix
A
* when side == 'L' or 'l'. m must be at least zero.
* n specifies the number of columns of the matrix C, and the number o
f
* columns of matrix B. It also specifies the dimensions of symmetri
c
* matrix A when side == 'R' or 'r'. n must be at least zero.
* alpha single precision scalar multiplier applied to A * B, or B * A
* A single precision array of dimensions (lda, ka), where ka is m whe
n
* side == 'L' or 'l' and is n otherwise. If side == 'L' or 'l' the
* leading m x m part of array A must contain the symmetric matrix,
* such that when uplo == 'U' or 'u', the leading m x m part stores
the
* upper triangular part of the symmetric matrix, and the strictly l
ower
* triangular part of A is not referenced, and when uplo == 'U' or '
u',
* the leading m x m part stores the lower triangular part of the
* symmetric matrix and the strictly upper triangular part is not
* referenced. If side == 'R' or 'r' the leading n x n part of array
A
* must contain the symmetric matrix, such that when uplo == 'U' or
'u',
* the leading n x n part stores the upper triangular part of the
* symmetric matrix and the strictly lower triangular part of A is n
ot
* referenced, and when uplo == 'U' or 'u', the leading n x n part
* stores the lower triangular part of the symmetric matrix and the
* strictly upper triangular part is not referenced.
* lda leading dimension of A. When side == 'L' or 'l', it must be at le
ast
* max(1, m) and at least max(1, n) otherwise.
* B single precision array of dimensions (ldb, n). On entry, the lead
ing
* m x n part of the array contains the matrix B.
* ldb leading dimension of B. It must be at least max (1, m).
* beta single precision scalar multiplier applied to C. If beta is zero,
C
* does not have to be a valid input
* C single precision array of dimensions (ldc, n)
* ldc leading dimension of C. Must be at least max(1, m)
*
* Output
* ------
* C updated according to C = alpha * A * B + beta * C, or C = alpha *
* B * A + beta * C
*
* Reference: http://www.netlib.org/blas/csymm.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if m or n are < 0
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* CSYMM: C = alpha*A*B + beta*C (side == 'L'/'l') or alpha*B*A + beta*C
 * (side == 'R'/'r'), with A a symmetric single precision complex matrix.
 * Full contract is documented in the comment block above. */
void CUBLASAPI cublasCsymm (char side, char uplo, int m, int n,
                            cuComplex alpha, const cuComplex *A, int lda,
                            const cuComplex *B, int ldb, cuComplex beta,
                            cuComplex *C, int ldc);
/*
* void
* cublasChemm (char side, char uplo, int m, int n, cuComplex alpha,
* const cuComplex *A, int lda, const cuComplex *B, int ldb,
* cuComplex beta, cuComplex *C, int ldc);
*
* performs one of the matrix-matrix operations
*
* C = alpha * A * B + beta * C, or
* C = alpha * B * A + beta * C,
*
* where alpha and beta are single precision complex scalars, A is a hermit
ian matrix
* consisting of single precision complex elements and stored in either low
er or upper
* storage mode, and B and C are m x n matrices consisting of single precis
ion
* complex elements.
*
* Input
* -----
* side specifies whether the hermitian matrix A appears on the left side
* hand side or right hand side of matrix B, as follows. If side ==
'L'
* or 'l', then C = alpha * A * B + beta * C. If side = 'R' or 'r',
* then C = alpha * B * A + beta * C.
* uplo specifies whether the hermitian matrix A is stored in upper or lo
wer
* storage mode, as follows. If uplo == 'U' or 'u', only the upper
* triangular part of the hermitian matrix is to be referenced, and
the
* elements of the strictly lower triangular part are to be infered
from
* those in the upper triangular part. If uplo == 'L' or 'l', only t
he
* lower triangular part of the hermitian matrix is to be referenced
,
* and the elements of the strictly upper triangular part are to be
* infered from those in the lower triangular part.
* m specifies the number of rows of the matrix C, and the number of r
ows
* of matrix B. It also specifies the dimensions of hermitian matrix
A
* when side == 'L' or 'l'. m must be at least zero.
* n specifies the number of columns of the matrix C, and the number o
f
* columns of matrix B. It also specifies the dimensions of hermitia
n
* matrix A when side == 'R' or 'r'. n must be at least zero.
* alpha single precision complex scalar multiplier applied to A * B, or B
* A
* A single precision complex array of dimensions (lda, ka), where ka
is m when
* side == 'L' or 'l' and is n otherwise. If side == 'L' or 'l' the
* leading m x m part of array A must contain the hermitian matrix,
* such that when uplo == 'U' or 'u', the leading m x m part stores
the
* upper triangular part of the hermitian matrix, and the strictly l
ower
* triangular part of A is not referenced, and when uplo == 'U' or '
u',
* the leading m x m part stores the lower triangular part of the
* hermitian matrix and the strictly upper triangular part is not
* referenced. If side == 'R' or 'r' the leading n x n part of array
A
* must contain the hermitian matrix, such that when uplo == 'U' or
'u',
* the leading n x n part stores the upper triangular part of the
* hermitian matrix and the strictly lower triangular part of A is n
ot
* referenced, and when uplo == 'U' or 'u', the leading n x n part
* stores the lower triangular part of the hermitian matrix and the
* strictly upper triangular part is not referenced. The imaginary p
arts
* of the diagonal elements need not be set, they are assumed to be
zero.
* lda leading dimension of A. When side == 'L' or 'l', it must be at le
ast
* max(1, m) and at least max(1, n) otherwise.
* B single precision complex array of dimensions (ldb, n). On entry,
the leading
* m x n part of the array contains the matrix B.
* ldb leading dimension of B. It must be at least max (1, m).
* beta single precision complex scalar multiplier applied to C. If beta
is zero, C
* does not have to be a valid input
* C single precision complex array of dimensions (ldc, n)
* ldc leading dimension of C. Must be at least max(1, m)
*
* Output
* ------
* C updated according to C = alpha * A * B + beta * C, or C = alpha *
* B * A + beta * C
*
* Reference: http://www.netlib.org/blas/chemm.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if m or n are < 0
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* CHEMM: C = alpha*A*B + beta*C (side == 'L'/'l') or alpha*B*A + beta*C
 * (side == 'R'/'r'), with A a hermitian single precision complex matrix.
 * Full contract is documented in the comment block above. */
void CUBLASAPI cublasChemm (char side, char uplo, int m, int n,
                            cuComplex alpha, const cuComplex *A, int lda,
                            const cuComplex *B, int ldb, cuComplex beta,
                            cuComplex *C, int ldc);
/*
* void
* cublasCsyrk (char uplo, char trans, int n, int k, cuComplex alpha,
* const cuComplex *A, int lda, cuComplex beta, cuComplex *C,
int ldc)
*
* performs one of the symmetric rank k operations
*
* C = alpha * A * transpose(A) + beta * C, or
* C = alpha * transpose(A) * A + beta * C.
*
* Alpha and beta are single precision complex scalars. C is an n x n symme
tric matrix
* consisting of single precision complex elements and stored in either low
er or
* upper storage mode. A is a matrix consisting of single precision complex
elements
* with dimension of n x k in the first case, and k x n in the second case.
*
* Input
* -----
* uplo specifies whether the symmetric matrix C is stored in upper or lo
wer
* storage mode as follows. If uplo == 'U' or 'u', only the upper
* triangular part of the symmetric matrix is to be referenced, and
the
* elements of the strictly lower triangular part are to be infered
from
* those in the upper triangular part. If uplo == 'L' or 'l', only t
he
* lower triangular part of the symmetric matrix is to be referenced
,
* and the elements of the strictly upper triangular part are to be
* infered from those in the lower triangular part.
* trans specifies the operation to be performed. If trans == 'N' or 'n',
C =
* alpha * transpose(A) + beta * C. If trans == 'T', 't', 'C', or 'c
',
* C = transpose(A) * A + beta * C.
* n specifies the number of rows and the number columns of matrix C.
If
* trans == 'N' or 'n', n specifies the number of rows of matrix A.
If
* trans == 'T', 't', 'C', or 'c', n specifies the columns of matrix
A.
* n must be at least zero.
* k If trans == 'N' or 'n', k specifies the number of rows of matrix
A.
* If trans == 'T', 't', 'C', or 'c', k specifies the number of rows
of
* matrix A. k must be at least zero.
* alpha single precision complex scalar multiplier applied to A * transpo
se(A) or
* transpose(A) * A.
* A single precision complex array of dimensions (lda, ka), where ka
is k when
* trans == 'N' or 'n', and is n otherwise. When trans == 'N' or 'n'
,
* the leading n x k part of array A must contain the matrix A,
* otherwise the leading k x n part of the array must contains the
* matrix A.
* lda leading dimension of A. When trans == 'N' or 'n' then lda must be
at
* least max(1, n). Otherwise lda must be at least max(1, k).
* beta single precision complex scalar multiplier applied to C. If beta
izs zero, C
* does not have to be a valid input
* C single precision complex array of dimensions (ldc, n). If uplo =
'U' or 'u',
* the leading n x n triangular part of the array C must contain the
* upper triangular part of the symmetric matrix C and the strictly
* lower triangular part of C is not referenced. On exit, the upper
* triangular part of C is overwritten by the upper triangular part
of
* the updated matrix. If uplo = 'L' or 'l', the leading n x n
* triangular part of the array C must contain the lower triangular
part
* of the symmetric matrix C and the strictly upper triangular part
of C
* is not referenced. On exit, the lower triangular part of C is
* overwritten by the lower triangular part of the updated matrix.
* ldc leading dimension of C. It must be at least max(1, n).
*
* Output
* ------
* C updated according to C = alpha * A * transpose(A) + beta * C, or
C =
* alpha * transpose(A) * A + beta * C
*
* Reference: http://www.netlib.org/blas/csyrk.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0 or k < 0
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* CSYRK: symmetric rank-k update, C = alpha*A*transpose(A) + beta*C or
 * C = alpha*transpose(A)*A + beta*C, selected by trans; C is symmetric. */
void CUBLASAPI cublasCsyrk (char uplo, char trans, int n, int k,
                            cuComplex alpha, const cuComplex *A, int lda,
                            cuComplex beta, cuComplex *C, int ldc);
/*
* void
* cublasCherk (char uplo, char trans, int n, int k, float alpha,
* const cuComplex *A, int lda, float beta, cuComplex *C, int
ldc)
*
* performs one of the hermitian rank k operations
*
* C = alpha * A * conjugate(transpose(A)) + beta * C, or
* C = alpha * conjugate(transpose(A)) * A + beta * C.
*
* Alpha and beta are single precision real scalars. C is an n x n hermitia
n matrix
* consisting of single precision complex elements and stored in either low
er or
* upper storage mode. A is a matrix consisting of single precision complex
elements
* with dimension of n x k in the first case, and k x n in the second case.
*
* Input
* -----
* uplo specifies whether the hermitian matrix C is stored in upper or lo
wer
* storage mode as follows. If uplo == 'U' or 'u', only the upper
* triangular part of the hermitian matrix is to be referenced, and
the
* elements of the strictly lower triangular part are to be infered
from
* those in the upper triangular part. If uplo == 'L' or 'l', only t
he
* lower triangular part of the hermitian matrix is to be referenced
,
* and the elements of the strictly upper triangular part are to be
* infered from those in the lower triangular part.
* trans specifies the operation to be performed. If trans == 'N' or 'n',
C =
* alpha * A * conjugate(transpose(A)) + beta * C. If trans == 'T',
't', 'C', or 'c',
* C = alpha * conjugate(transpose(A)) * A + beta * C.
* n specifies the number of rows and the number columns of matrix C.
If
* trans == 'N' or 'n', n specifies the number of rows of matrix A.
If
* trans == 'T', 't', 'C', or 'c', n specifies the columns of matrix
A.
* n must be at least zero.
* k If trans == 'N' or 'n', k specifies the number of columns of matr
ix A.
* If trans == 'T', 't', 'C', or 'c', k specifies the number of rows
of
* matrix A. k must be at least zero.
* alpha single precision scalar multiplier applied to A * conjugate(trans
pose(A)) or
* conjugate(transpose(A)) * A.
* A single precision complex array of dimensions (lda, ka), where ka
is k when
* trans == 'N' or 'n', and is n otherwise. When trans == 'N' or 'n'
,
* the leading n x k part of array A must contain the matrix A,
* otherwise the leading k x n part of the array must contains the
* matrix A.
* lda leading dimension of A. When trans == 'N' or 'n' then lda must be
at
* least max(1, n). Otherwise lda must be at least max(1, k).
* beta single precision scalar multiplier applied to C. If beta is zero,
C
* does not have to be a valid input.
* C single precision complex array of dimensions (ldc, n). If uplo =
'U' or 'u',
* the leading n x n triangular part of the array C must contain the
* upper triangular part of the hermitian matrix C and the strictly
* lower triangular part of C is not referenced. On exit, the upper
* triangular part of C is overwritten by the upper triangular part
of
* the updated matrix. If uplo = 'L' or 'l', the leading n x n
* triangular part of the array C must contain the lower triangular
part
* of the hermitian matrix C and the strictly upper triangular part
of C
* is not referenced. On exit, the lower triangular part of C is
* overwritten by the lower triangular part of the updated matrix.
* The imaginary parts of the diagonal elements need
* not be set, they are assumed to be zero, and on exit they
* are set to zero.
* ldc leading dimension of C. It must be at least max(1, n).
*
* Output
* ------
* C updated according to C = alpha * A * conjugate(transpose(A)) + be
ta * C, or C =
* alpha * conjugate(transpose(A)) * A + beta * C
*
* Reference: http://www.netlib.org/blas/cherk.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0 or k < 0
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* CHERK: hermitian rank-k update, C = alpha*A*conjugate(transpose(A)) + beta*C
 * or C = alpha*conjugate(transpose(A))*A + beta*C, selected by trans.
 * alpha and beta are REAL (float) scalars, per the BLAS cherk interface and the
 * documentation block above. */
void CUBLASAPI cublasCherk (char uplo, char trans, int n, int k,
                            float alpha, const cuComplex *A, int lda,
                            float beta, cuComplex *C, int ldc);
/*
* void
* cublasCsyr2k (char uplo, char trans, int n, int k, cuComplex alpha,
* const cuComplex *A, int lda, const cuComplex *B, int ldb,
* cuComplex beta, cuComplex *C, int ldc)
*
* performs one of the symmetric rank 2k operations
*
* C = alpha * A * transpose(B) + alpha * B * transpose(A) + beta * C, o
r
* C = alpha * transpose(A) * B + alpha * transpose(B) * A + beta * C.
*
* Alpha and beta are single precision complex scalars. C is an n x n symme
tric matrix
* consisting of single precision complex elements and stored in either low
er or upper
* storage mode. A and B are matrices consisting of single precision comple
x elements
* with dimension of n x k in the first case, and k x n in the second case.
*
* Input
* -----
* uplo specifies whether the symmetric matrix C is stored in upper or lo
wer
* storage mode, as follows. If uplo == 'U' or 'u', only the upper
* triangular part of the symmetric matrix is to be referenced, and
the
* elements of the strictly lower triangular part are to be infered
from
* those in the upper triangular part. If uplo == 'L' or 'l', only t
he
* lower triangular part of the symmetric matrix is to be references
,
* and the elements of the strictly upper triangular part are to be
* infered from those in the lower triangular part.
* trans specifies the operation to be performed. If trans == 'N' or 'n',
* C = alpha * A * transpose(B) + alpha * B * transpose(A) + beta *
C,
* If trans == 'T', 't', 'C', or 'c', C = alpha * transpose(A) * B +
* alpha * transpose(B) * A + beta * C.
* n specifies the number of rows and the number columns of matrix C.
If
* trans == 'N' or 'n', n specifies the number of rows of matrix A.
If
* trans == 'T', 't', 'C', or 'c', n specifies the columns of matrix
A.
* n must be at least zero.
* k If trans == 'N' or 'n', k specifies the number of rows of matrix
A.
* If trans == 'T', 't', 'C', or 'c', k specifies the number of rows
of
* matrix A. k must be at least zero.
* alpha single precision complex scalar multiplier.
* A single precision complex array of dimensions (lda, ka), where ka
is k when
* trans == 'N' or 'n', and is n otherwise. When trans == 'N' or 'n'
,
* the leading n x k part of array A must contain the matrix A,
* otherwise the leading k x n part of the array must contain the ma
trix
* A.
* lda leading dimension of A. When trans == 'N' or 'n' then lda must be
at
* least max(1, n). Otherwise lda must be at least max(1,k).
* B single precision complex array of dimensions (lda, kb), where kb
is k when
* trans == 'N' or 'n', and is n otherwise. When trans == 'N' or 'n'
,
* the leading n x k part of array B must contain the matrix B,
* otherwise the leading k x n part of the array must contain the ma
trix
* B.
* ldb leading dimension of N. When trans == 'N' or 'n' then ldb must be
at
* least max(1, n). Otherwise ldb must be at least max(1, k).
* beta single precision complex scalar multiplier applied to C. If beta
is zero, C
* does not have to be a valid input.
* C single precision complex array of dimensions (ldc, n). If uplo ==
'U' or 'u',
* the leading n x n triangular part of the array C must contain the
* upper triangular part of the symmetric matrix C and the strictly
* lower triangular part of C is not referenced. On exit, the upper
* triangular part of C is overwritten by the upper triangular part
of
* the updated matrix. If uplo == 'L' or 'l', the leading n x n
* triangular part of the array C must contain the lower triangular
part
* of the symmetric matrix C and the strictly upper triangular part
of C
* is not referenced. On exit, the lower triangular part of C is
* overwritten by the lower triangular part of the updated matrix.
* ldc leading dimension of C. Must be at least max(1, n).
*
* Output
* ------
* C updated according to alpha*A*transpose(B) + alpha*B*transpose(A)
+
* beta*C or alpha*transpose(A)*B + alpha*transpose(B)*A + beta*C
*
* Reference: http://www.netlib.org/blas/csyr2k.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0 or k < 0
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* CSYR2K: symmetric rank-2k update, C = alpha*A*transpose(B) +
 * alpha*B*transpose(A) + beta*C, or the transposed form selected by trans;
 * C is symmetric. */
void CUBLASAPI cublasCsyr2k (char uplo, char trans, int n, int k,
                             cuComplex alpha, const cuComplex *A, int lda,
                             const cuComplex *B, int ldb, cuComplex beta,
                             cuComplex *C, int ldc);
/*
* void
* cublasCher2k (char uplo, char trans, int n, int k, cuComplex alpha,
* const cuComplex *A, int lda, const cuComplex *B, int ldb,
* float beta, cuComplex *C, int ldc)
*
* performs one of the hermitian rank 2k operations
*
* C = alpha * A * conjugate(transpose(B))
* + conjugate(alpha) * B * conjugate(transpose(A))
* + beta * C ,
* or
* C = alpha * conjugate(transpose(A)) * B
* + conjugate(alpha) * conjugate(transpose(B)) * A
* + beta * C.
*
* Alpha is single precision complex scalar whereas Beta is a single
* precision real scalar.
* C is an n x n hermitian matrix consisting of single precision complex el
ements
* and stored in either lower or upper storage mode. A and B are matrices c
onsisting
* of single precision complex elements with dimension of n x k in the firs
t case,
* and k x n in the second case.
*
* Input
* -----
* uplo   specifies whether the hermitian matrix C is stored in upper or
*        lower storage mode, as follows. If uplo == 'U' or 'u', only the
*        upper triangular part of the hermitian matrix is to be referenced,
*        and the elements of the strictly lower triangular part are to be
*        inferred from those in the upper triangular part. If uplo == 'L'
*        or 'l', only the lower triangular part of the hermitian matrix is
*        to be referenced, and the elements of the strictly upper triangular
*        part are to be inferred from those in the lower triangular part.
* trans specifies the operation to be performed. If trans == 'N' or 'n',
* C = alpha * A * conjugate(transpose(B))
* + conjugate(alpha) * B * conjugate(transpose(A))
* + beta * C .
* If trans == 'T', 't', 'C', or 'c',
* C = alpha * conjugate(transpose(A)) * B
* + conjugate(alpha) * conjugate(transpose(B)) * A
* + beta * C.
* n      specifies the number of rows and the number of columns of matrix C.
*        If trans == 'N' or 'n', n specifies the number of rows of matrix A.
*        If trans == 'T', 't', 'C', or 'c', n specifies the number of columns
*        of matrix A. n must be at least zero.
* k      If trans == 'N' or 'n', k specifies the number of columns of matrix
*        A. If trans == 'T', 't', 'C', or 'c', k specifies the number of rows
*        of matrix A. k must be at least zero.
* alpha single precision complex scalar multiplier.
* A single precision complex array of dimensions (lda, ka), where ka
is k when
* trans == 'N' or 'n', and is n otherwise. When trans == 'N' or 'n'
,
* the leading n x k part of array A must contain the matrix A,
* otherwise the leading k x n part of the array must contain the ma
trix
* A.
* lda leading dimension of A. When trans == 'N' or 'n' then lda must be
at
* least max(1, n). Otherwise lda must be at least max(1,k).
* B      single precision complex array of dimensions (ldb, kb), where kb
is k when
* trans == 'N' or 'n', and is n otherwise. When trans == 'N' or 'n'
,
* the leading n x k part of array B must contain the matrix B,
* otherwise the leading k x n part of the array must contain the ma
trix
* B.
* ldb    leading dimension of B. When trans == 'N' or 'n' then ldb must be
at
* least max(1, n). Otherwise ldb must be at least max(1, k).
* beta single precision scalar multiplier applied to C. If beta is zero,
C
* does not have to be a valid input.
* C single precision complex array of dimensions (ldc, n). If uplo ==
'U' or 'u',
* the leading n x n triangular part of the array C must contain the
* upper triangular part of the hermitian matrix C and the strictly
* lower triangular part of C is not referenced. On exit, the upper
* triangular part of C is overwritten by the upper triangular part
of
* the updated matrix. If uplo == 'L' or 'l', the leading n x n
* triangular part of the array C must contain the lower triangular
part
* of the hermitian matrix C and the strictly upper triangular part
of C
* is not referenced. On exit, the lower triangular part of C is
* overwritten by the lower triangular part of the updated matrix.
* The imaginary parts of the diagonal elements need
* not be set, they are assumed to be zero, and on exit they
* are set to zero.
* ldc leading dimension of C. Must be at least max(1, n).
*
* Output
* ------
* C updated according to alpha*A*conjugate(transpose(B)) +
* + conjugate(alpha)*B*conjugate(transpose(A)) + beta*C or
* alpha*conjugate(transpose(A))*B + conjugate(alpha)*conjugate(tran
spose(B))*A
* + beta*C.
*
* Reference: http://www.netlib.org/blas/cher2k.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0 or k < 0
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasCher2k (char uplo, char trans, int n, int k, void CUBLASAPI cublasCher2k (char uplo, char trans, int n, int k,
cuComplex alpha, const cuComplex *A, int lda, cuComplex alpha, const cuComplex *A, int lda,
const cuComplex *B, int ldb, cuComplex beta, const cuComplex *B, int ldb, float beta,
cuComplex *C, int ldc); cuComplex *C, int ldc);
/*
* void
* cublasCtrmm (char side, char uplo, char transa, char diag, int m, int n,
* cuComplex alpha, const cuComplex *A, int lda, const cuCompl
ex *B,
* int ldb)
*
* performs one of the matrix-matrix operations
*
* B = alpha * op(A) * B, or B = alpha * B * op(A)
*
* where alpha is a single-precision complex scalar, B is an m x n matrix c
omposed
* of single precision complex elements, and A is a unit or non-unit, upper
or lower,
* triangular matrix composed of single precision complex elements. op(A) i
s one of
*
* op(A) = A , op(A) = transpose(A) or op(A) = conjugate(transpose(A))
*
* Matrices A and B are stored in column major format, and lda and ldb are
* the leading dimensions of the two-dimensional arrays that contain A and
* B, respectively.
*
* Input
* -----
* side specifies whether op(A) multiplies B from the left or right.
* If side = 'L' or 'l', then B = alpha * op(A) * B. If side =
* 'R' or 'r', then B = alpha * B * op(A).
* uplo specifies whether the matrix A is an upper or lower triangular
* matrix. If uplo = 'U' or 'u', A is an upper triangular matrix.
* If uplo = 'L' or 'l', A is a lower triangular matrix.
* transa specifies the form of op(A) to be used in the matrix
* multiplication. If transa = 'N' or 'n', then op(A) = A. If
* transa = 'T' or 't', then op(A) = transpose(A).
* If transa = 'C' or 'c', then op(A) = conjugate(transpose(A)).
* diag specifies whether or not A is unit triangular. If diag = 'U'
* or 'u', A is assumed to be unit triangular. If diag = 'N' or
* 'n', A is not assumed to be unit triangular.
* m the number of rows of matrix B. m must be at least zero.
* n the number of columns of matrix B. n must be at least zero.
* alpha single precision complex scalar multiplier applied to op(A)*B, or
* B*op(A), respectively. If alpha is zero no accesses are made
* to matrix A, and no read accesses are made to matrix B.
* A single precision complex array of dimensions (lda, k). k = m if s
ide =
* 'L' or 'l', k = n if side = 'R' or 'r'. If uplo = 'U' or 'u'
* the leading k x k upper triangular part of the array A must
* contain the upper triangular matrix, and the strictly lower
* triangular part of A is not referenced. If uplo = 'L' or 'l'
* the leading k x k lower triangular part of the array A must
* contain the lower triangular matrix, and the strictly upper
* triangular part of A is not referenced. When diag = 'U' or 'u'
*          the diagonal elements of A are not referenced and are assumed
* to be unity.
* lda leading dimension of A. When side = 'L' or 'l', it must be at
* least max(1,m) and at least max(1,n) otherwise
* B single precision complex array of dimensions (ldb, n). On entry,
the
* leading m x n part of the array contains the matrix B. It is
* overwritten with the transformed matrix on exit.
* ldb leading dimension of B. It must be at least max (1, m).
*
* Output
* ------
* B updated according to B = alpha * op(A) * B or B = alpha * B * op
(A)
*
* Reference: http://www.netlib.org/blas/ctrmm.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if m or n < 0
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasCtrmm (char side, char uplo, char transa, char diag, void CUBLASAPI cublasCtrmm (char side, char uplo, char transa, char diag,
int m, int n, cuComplex alpha, const cuComplex *A, int m, int n, cuComplex alpha, const cuComplex *A,
int lda, cuComplex *B, int ldb); int lda, cuComplex *B, int ldb);
/*
* void
* cublasCtrsm (char side, char uplo, char transa, char diag, int m, int n,
* cuComplex alpha, const cuComplex *A, int lda,
* cuComplex *B, int ldb)
*
* solves one of the matrix equations
*
* op(A) * X = alpha * B, or X * op(A) = alpha * B,
*
* where alpha is a single precision complex scalar, and X and B are m x n
matrices
* that are composed of single precision complex elements. A is a unit or n
on-unit,
* upper or lower triangular matrix, and op(A) is one of
*
* op(A) = A or op(A) = transpose(A) or op( A ) = conj( A' ).
*
* The result matrix X overwrites input matrix B; that is, on exit the resu
lt
* is stored in B. Matrices A and B are stored in column major format, and
* lda and ldb are the leading dimensions of the two-dimensional arrays that
* contain A and B, respectively.
*
* Input
* -----
* side specifies whether op(A) appears on the left or right of X as
* follows: side = 'L' or 'l' indicates solve op(A) * X = alpha * B.
* side = 'R' or 'r' indicates solve X * op(A) = alpha * B.
* uplo specifies whether the matrix A is an upper or lower triangular
* matrix as follows: uplo = 'U' or 'u' indicates A is an upper
* triangular matrix. uplo = 'L' or 'l' indicates A is a lower
* triangular matrix.
* transa specifies the form of op(A) to be used in matrix multiplication
*          as follows: If transa = 'N' or 'n', then op(A) = A. If transa =
* 'T', 't', 'C', or 'c', then op(A) = transpose(A).
* diag specifies whether or not A is a unit triangular matrix like so:
* if diag = 'U' or 'u', A is assumed to be unit triangular. If
* diag = 'N' or 'n', then A is not assumed to be unit triangular.
* m specifies the number of rows of B. m must be at least zero.
* n specifies the number of columns of B. n must be at least zero.
* alpha is a single precision complex scalar to be multiplied with B. Whe
n alpha is
* zero, then A is not referenced and B need not be set before entry
.
* A is a single precision complex array of dimensions (lda, k), where
k is
* m when side = 'L' or 'l', and is n when side = 'R' or 'r'. If
* uplo = 'U' or 'u', the leading k x k upper triangular part of
* the array A must contain the upper triangular matrix and the
* strictly lower triangular matrix of A is not referenced. When
* uplo = 'L' or 'l', the leading k x k lower triangular part of
* the array A must contain the lower triangular matrix and the
* strictly upper triangular part of A is not referenced. Note that
* when diag = 'U' or 'u', the diagonal elements of A are not
* referenced, and are assumed to be unity.
* lda is the leading dimension of the two dimensional array containing
A.
* When side = 'L' or 'l' then lda must be at least max(1, m), when
* side = 'R' or 'r' then lda must be at least max(1, n).
* B is a single precision complex array of dimensions (ldb, n). ldb m
ust be
* at least max (1,m). The leading m x n part of the array B must
* contain the right-hand side matrix B. On exit B is overwritten
* by the solution matrix X.
* ldb is the leading dimension of the two dimensional array containing
B.
* ldb must be at least max(1, m).
*
* Output
* ------
* B contains the solution matrix X satisfying op(A) * X = alpha * B,
* or X * op(A) = alpha * B
*
* Reference: http://www.netlib.org/blas/ctrsm.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if m or n < 0
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasCtrsm (char side, char uplo, char transa, char diag, void CUBLASAPI cublasCtrsm (char side, char uplo, char transa, char diag,
int m, int n, cuComplex alpha, const cuComplex *A, int m, int n, cuComplex alpha, const cuComplex *A,
int lda, cuComplex *B, int ldb); int lda, cuComplex *B, int ldb);
void CUBLASAPI cublasXerbla (const char *srName, int info); void CUBLASAPI cublasXerbla (const char *srName, int info);
/* ---------------- CUBLAS double-precision BLAS1 functions --------------- - */ /* ---------------- CUBLAS double-precision BLAS1 functions --------------- - */
/* /*
* double * double
* cublasDasum (int n, const double *x, int incx) * cublasDasum (int n, const double *x, int incx)
* *
* computes the sum of the absolute values of the elements of double * computes the sum of the absolute values of the elements of double
* precision vector x; that is, the result is the sum from i = 0 to n - 1 o f * precision vector x; that is, the result is the sum from i = 0 to n - 1 o f
skipping to change at line 3068 skipping to change at line 5474
* ------------ * ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support * CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/ */
void CUBLASAPI cublasDrot (int n, double *x, int incx, double *y, int incy, void CUBLASAPI cublasDrot (int n, double *x, int incx, double *y, int incy,
double sc, double ss); double sc, double ss);
/* /*
* void * void
* cublasDrotg (double *sa, double *sb, double *sc, double *ss) * cublasDrotg (double *host_sa, double *host_sb, double *host_sc, double * host_ss)
* *
* constructs the Givens tranformation * constructs the Givens tranformation
* *
* ( sc ss ) * ( sc ss )
* G = ( ) , sc^2 + ss^2 = 1, * G = ( ) , sc^2 + ss^2 = 1,
* (-ss sc ) * (-ss sc )
* *
* which zeros the second entry of the 2-vector transpose(sa, sb). * which zeros the second entry of the 2-vector transpose(sa, sb).
* *
* The quantity r = (+/-) sqrt (sa^2 + sb^2) overwrites sa in storage. The * The quantity r = (+/-) sqrt (sa^2 + sb^2) overwrites sa in storage. The
* value of sb is overwritten by a value z which allows sc and ss to be * value of sb is overwritten by a value z which allows sc and ss to be
* recovered by the following algorithm: * recovered by the following algorithm:
* *
* if z=1 set sc = 0.0 and ss = 1.0 * if z=1 set sc = 0.0 and ss = 1.0
* if abs(z) < 1 set sc = sqrt(1-z^2) and ss = z * if abs(z) < 1 set sc = sqrt(1-z^2) and ss = z
* if abs(z) > 1 set sc = 1/z and ss = sqrt(1-sc^2) * if abs(z) > 1 set sc = 1/z and ss = sqrt(1-sc^2)
* *
* The function drot (n, x, incx, y, incy, sc, ss) normally is called next * The function drot (n, x, incx, y, incy, sc, ss) normally is called next
* to apply the transformation to a 2 x n matrix. * to apply the transformation to a 2 x n matrix.
* Note that this function is provided for completeness and runs exclusively
* on the Host.
* *
* Input * Input
* ----- * -----
* sa double-precision scalar * sa double-precision scalar
* sb double-precision scalar * sb double-precision scalar
* *
* Output * Output
* ------ * ------
* sa double-precision r * sa double-precision r
* sb double-precision z * sb double-precision z
* sc double-precision result * sc double-precision result
* ss double-precision result * ss double-precision result
* *
* Reference: http://www.netlib.org/blas/drotg.f * Reference: http://www.netlib.org/blas/drotg.f
* *
* This function does not set any error status. * This function does not set any error status.
*/ */
void CUBLASAPI cublasDrotg (double *sa, double *sb, double *sc, double *ss) ; void CUBLASAPI cublasDrotg (double *host_sa, double *host_sb, double *host_ sc, double *host_ss);
/* /*
* void * void
* cublasDrotm (int n, double *x, int incx, double *y, int incy, * cublasDrotm (int n, double *x, int incx, double *y, int incy,
* const double* sparam) * const double* sparam)
* *
* applies the modified Givens transformation, h, to the 2 x n matrix * applies the modified Givens transformation, h, to the 2 x n matrix
* *
* ( transpose(x) ) * ( transpose(x) )
* ( transpose(y) ) * ( transpose(y) )
skipping to change at line 3127 skipping to change at line 5535
* The elements of x are in x[lx + i * incx], i = 0 to n-1, where lx = 1 if * The elements of x are in x[lx + i * incx], i = 0 to n-1, where lx = 1 if
* incx >= 0, else lx = 1 + (1 - n) * incx, and similarly for y using ly an d * incx >= 0, else lx = 1 + (1 - n) * incx, and similarly for y using ly an d
* incy. With sparam[0] = sflag, h has one of the following forms: * incy. With sparam[0] = sflag, h has one of the following forms:
* *
* sflag = -1.0 sflag = 0.0 sflag = 1.0 sflag = -2.0 * sflag = -1.0 sflag = 0.0 sflag = 1.0 sflag = -2.0
* *
* (sh00 sh01) (1.0 sh01) (sh00 1.0) (1.0 0.0) * (sh00 sh01) (1.0 sh01) (sh00 1.0) (1.0 0.0)
* h = ( ) ( ) ( ) ( ) * h = ( ) ( ) ( ) ( )
* (sh10 sh11) (sh10 1.0) (-1.0 sh11) (0.0 1.0) * (sh10 sh11) (sh10 1.0) (-1.0 sh11) (0.0 1.0)
* *
* Note that this function is provided for completeness and runs exclusively
* on the Host.
*
* Input * Input
* ----- * -----
* n number of elements in input vectors * n number of elements in input vectors
* x double-precision vector with n elements * x double-precision vector with n elements
* incx storage spacing between elements of x * incx storage spacing between elements of x
* y double-precision vector with n elements * y double-precision vector with n elements
* incy storage spacing between elements of y * incy storage spacing between elements of y
* sparam 5-element vector. sparam[0] is sflag described above. sparam[1] * sparam 5-element vector. sparam[0] is sflag described above. sparam[1]
* through sparam[4] contain the 2x2 rotation matrix h: sparam[1] * through sparam[4] contain the 2x2 rotation matrix h: sparam[1]
* contains sh00, sparam[2] contains sh10, sparam[3] contains sh01, * contains sh00, sparam[2] contains sh10, sparam[3] contains sh01,
skipping to change at line 3159 skipping to change at line 5570
* ------------ * ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support * CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/ */
void CUBLASAPI cublasDrotm(int n, double *x, int incx, double *y, int incy, void CUBLASAPI cublasDrotm(int n, double *x, int incx, double *y, int incy,
const double* sparam); const double* sparam);
/* /*
* void * void
* cublasDrotmg (double *psd1, double *psd2, double *psx1, const double *ps * cublasDrotmg (double *host_sd1, double *host_sd2, double *host_sx1, cons
y1, t double *host_sy1,
* double *sparam) * double *host_sparam)
* *
* constructs the modified Givens transformation matrix h which zeros * constructs the modified Givens transformation matrix h which zeros
* the second component of the 2-vector transpose(sqrt(sd1)*sx1,sqrt(sd2)*s y1). * the second component of the 2-vector transpose(sqrt(sd1)*sx1,sqrt(sd2)*s y1).
* With sparam[0] = sflag, h has one of the following forms: * With sparam[0] = sflag, h has one of the following forms:
* *
* sflag = -1.0 sflag = 0.0 sflag = 1.0 sflag = -2.0 * sflag = -1.0 sflag = 0.0 sflag = 1.0 sflag = -2.0
* *
* (sh00 sh01) (1.0 sh01) (sh00 1.0) (1.0 0.0) * (sh00 sh01) (1.0 sh01) (sh00 1.0) (1.0 0.0)
* h = ( ) ( ) ( ) ( ) * h = ( ) ( ) ( ) ( )
* (sh10 sh11) (sh10 1.0) (-1.0 sh11) (0.0 1.0) * (sh10 sh11) (sh10 1.0) (-1.0 sh11) (0.0 1.0)
* *
* sparam[1] through sparam[4] contain sh00, sh10, sh01, sh11, * sparam[1] through sparam[4] contain sh00, sh10, sh01, sh11,
* respectively. Values of 1.0, -1.0, or 0.0 implied by the value * respectively. Values of 1.0, -1.0, or 0.0 implied by the value
* of sflag are not stored in sparam. * of sflag are not stored in sparam.
* Note that this function is provided for completeness and runs exclusively
* on the Host.
* *
* Input * Input
* ----- * -----
* sd1 single precision scalar * sd1 single precision scalar
* sd2 single precision scalar * sd2 single precision scalar
* sx1 single precision scalar * sx1 single precision scalar
* sy1 single precision scalar * sy1 single precision scalar
* *
* Output * Output
* ------ * ------
skipping to change at line 3197 skipping to change at line 5610
* sx1 changed to represent the effect of the transformation * sx1 changed to represent the effect of the transformation
* sparam 5-element vector. sparam[0] is sflag described above. sparam[1] * sparam 5-element vector. sparam[0] is sflag described above. sparam[1]
* through sparam[4] contain the 2x2 rotation matrix h: sparam[1] * through sparam[4] contain the 2x2 rotation matrix h: sparam[1]
* contains sh00, sparam[2] contains sh10, sparam[3] contains sh01, * contains sh00, sparam[2] contains sh10, sparam[3] contains sh01,
* and sparam[4] contains sh11.
* *
* Reference: http://www.netlib.org/blas/drotmg.f * Reference: http://www.netlib.org/blas/drotmg.f
* *
* This functions does not set any error status. * This functions does not set any error status.
*/ */
void CUBLASAPI cublasDrotmg (double *sd1, double *sd2, double *sx1, void CUBLASAPI cublasDrotmg (double *host_sd1, double *host_sd2, double *ho
const double *sy1, double* sparam); st_sx1,
const double *host_sy1, double* host_sparam);
/* /*
* void * void
* cublasDscal (int n, double alpha, double *x, int incx) * cublasDscal (int n, double alpha, double *x, int incx)
* *
* replaces double-precision vector x with double-precision alpha * x. For * replaces double-precision vector x with double-precision alpha * x. For
* i = 0 to n-1, it replaces x[lx + i * incx] with alpha * x[lx + i * incx] , * i = 0 to n-1, it replaces x[lx + i * incx] with alpha * x[lx + i * incx] ,
* where lx = 1 if incx >= 0, else lx = 1 + (1 - n) * incx. * where lx = 1 if incx >= 0, else lx = 1 + (1 - n) * incx.
* *
* Input * Input
skipping to change at line 3493 skipping to change at line 5906
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d
* CUBLAS_STATUS_INVALID_VALUE if n < 0, or incx == 0 * CUBLAS_STATUS_INVALID_VALUE if n < 0, or incx == 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support * CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/ */
void CUBLASAPI cublasDsyr (char uplo, int n, double alpha, void CUBLASAPI cublasDsyr (char uplo, int n, double alpha,
const double *x, int incx, double *A, const double *x, int incx, double *A,
int lda); int lda);
/* /*
* void cublasDsyr2 (char uplo, int n, double alpha, const double *x, int i
ncx,
* const double *y, int incy, double *A, int lda)
*
* performs the symmetric rank 2 operation
*
* A = alpha*x*transpose(y) + alpha*y*transpose(x) + A,
*
* where alpha is a double precision scalar, x and y are n element double
* precision vector and A is an n by n symmetric matrix consisting of doubl
e
* precision elements.
*
* Input
* -----
* uplo specifies whether the matrix data is stored in the upper or the l
ower
* triangular part of array A. If uplo == 'U' or 'u', then only the
* upper triangular part of A may be referenced and the lower triang
ular
* part of A is inferred. If uplo == 'L' or 'l', then only the lower
* triangular part of A may be referenced and the upper triangular p
art
* of A is inferred.
* n specifies the number of rows and columns of the matrix A. It must
be
* at least zero.
* alpha double precision scalar multiplier applied to x * transpose(y) +
* y * transpose(x).
* x double precision array of length at least (1 + (n - 1) * abs (inc
x)).
* incx storage spacing between elements of x. incx must not be zero.
* y double precision array of length at least (1 + (n - 1) * abs (inc
y)).
* incy storage spacing between elements of y. incy must not be zero.
* A double precision array of dimensions (lda, n). If uplo == 'U' or
'u',
* then A must contains the upper triangular part of a symmetric mat
rix,
* and the strictly lower triangular parts is not referenced. If upl
o ==
* 'L' or 'l', then A contains the lower triangular part of a symmet
ric
* matrix, and the strictly upper triangular part is not referenced.
* lda leading dimension of A. It must be at least max(1, n).
*
* Output
* ------
* A updated according to A = alpha*x*transpose(y)+alpha*y*transpose(x
)+A
*
* Reference: http://www.netlib.org/blas/dsyr2.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0, incx == 0, incy == 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasDsyr2 (char uplo, int n, double alpha,
const double *x, int incx, const double *y,
int incy, double *A, int lda);
/*
* void
* cublasDspr (char uplo, int n, double alpha, const double *x, int incx,
* double *AP)
*
* performs the symmetric rank 1 operation
*
* A = alpha * x * transpose(x) + A,
*
* where alpha is a double precision scalar and x is an n element double
* precision vector. A is a symmetric n x n matrix consisting of double
* precision elements that is supplied in packed form.
*
* Input
* -----
* uplo specifies whether the matrix data is stored in the upper or the l
ower
* triangular part of array AP. If uplo == 'U' or 'u', then the uppe
r
* triangular part of A is supplied in AP. If uplo == 'L' or 'l', th
en
* the lower triangular part of A is supplied in AP.
* n specifies the number of rows and columns of the matrix A. It must
be
* at least zero.
* alpha double precision scalar multiplier applied to x * transpose(x).
* x double precision array of length at least (1 + (n - 1) * abs(incx
)).
* incx storage spacing between elements of x. incx must not be zero.
* AP     double precision array with at least ((n * (n + 1)) / 2) elements.
*        If uplo == 'U' or 'u', the array AP contains the upper triangular
*        part of the symmetric matrix A, packed sequentially, column by
*        column; that is, if i <= j, then A[i,j] is stored in
*        AP[i+(j*(j+1)/2)]. If uplo == 'L' or 'l', the array AP contains
*        the lower triangular part of the symmetric matrix A, packed
*        sequentially, column by column; that is, if i >= j, then A[i,j] is
*        stored in AP[i+((2*n-j+1)*j)/2].
*
* Output
* ------
* A updated according to A = alpha * x * transpose(x) + A
*
* Reference: http://www.netlib.org/blas/dspr.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0, or incx == 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasDspr (char uplo, int n, double alpha,
const double *x, int incx, double *AP);
/*
* void
* cublasDspr2 (char uplo, int n, double alpha, const double *x, int incx,
* const double *y, int incy, double *AP)
*
* performs the symmetric rank 2 operation
*
* A = alpha*x*transpose(y) + alpha*y*transpose(x) + A,
*
* where alpha is a double precision scalar, and x and y are n element doub
le
* precision vectors. A is a symmetric n x n matrix consisting of double
* precision elements that is supplied in packed form.
*
* Input
* -----
* uplo specifies whether the matrix data is stored in the upper or the l
ower
* triangular part of array A. If uplo == 'U' or 'u', then only the
* upper triangular part of A may be referenced and the lower triang
ular
* part of A is inferred. If uplo == 'L' or 'l', then only the lower
* triangular part of A may be referenced and the upper triangular p
art
* of A is inferred.
* n specifies the number of rows and columns of the matrix A. It must
be
* at least zero.
* alpha double precision scalar multiplier applied to x * transpose(y) +
* y * transpose(x).
* x double precision array of length at least (1 + (n - 1) * abs (inc
x)).
* incx storage spacing between elements of x. incx must not be zero.
* y double precision array of length at least (1 + (n - 1) * abs (inc
y)).
* incy storage spacing between elements of y. incy must not be zero.
* AP     double precision array with at least ((n * (n + 1)) / 2) elements.
*        If uplo == 'U' or 'u', the array AP contains the upper triangular
*        part of the symmetric matrix A, packed sequentially, column by
*        column; that is, if i <= j, then A[i,j] is stored in
*        AP[i+(j*(j+1)/2)]. If uplo == 'L' or 'l', the array AP contains
*        the lower triangular part of the symmetric matrix A, packed
*        sequentially, column by column; that is, if i >= j, then A[i,j] is
*        stored in AP[i+((2*n-j+1)*j)/2].
*
* Output
* ------
* A updated according to A = alpha*x*transpose(y)+alpha*y*transpose(x
)+A
*
* Reference: http://www.netlib.org/blas/dspr2.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0, incx == 0, incy == 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* Packed symmetric rank-2 update, A = alpha*(x*y' + y*x') + A, with A
   supplied and updated in packed storage AP; see the description above. */
void CUBLASAPI cublasDspr2 (char uplo, int n, double alpha,
                            const double *x, int incx, const double *y,
                            int incy, double *AP);
/*
 * void
 * cublasDtrsv (char uplo, char trans, char diag, int n, const double *A,
 *              int lda, double *x, int incx)
 *
 * solves a system of equations op(A) * x = b, where op(A) is either A or
 * transpose(A). b and x are double precision vectors consisting of n
 * elements, and A is an n x n matrix composed of a unit or non-unit, upper
 * or lower triangular matrix. Matrix A is stored in column major format,
 * and lda is the leading dimension of the two-dimensional array containing
 * A.
 *
 * No test for singularity or near-singularity is included in this function.
 * Such tests must be performed before calling this function.
 *
 * Input
 * -----
 * uplo   specifies whether the matrix data is stored in the upper or the
 *        lower triangular part of array A. If uplo = 'U' or 'u', then only
 *        the upper triangular part of A may be referenced. If uplo = 'L' or
 *        'l', then only the lower triangular part of A may be referenced.
 * trans  specifies op(A). If transa = 'n' or 'N', op(A) = A. If transa = 't',
 *        'T', 'c', or 'C', op(A) = transpose(A)
 * diag   specifies whether or not A is a unit triangular matrix like so:
 *        if diag = 'U' or 'u', A is assumed to be unit triangular. If
 *        diag = 'N' or 'n', then A is not assumed to be unit triangular.
 * n      specifies the number of rows and columns of the matrix A. It
 *        must be at least 0.
 * A      is a double precision array of dimensions (lda, n). If uplo = 'U'
 *        or 'u', then A must contain the upper triangular part of a symmetric
 *        matrix, and the strictly lower triangular part is not referenced.
 *        If uplo = 'L' or 'l', then A contains the lower triangular part of
 *        a symmetric matrix, and the strictly upper triangular part is not
 *        referenced.
 * lda    is the leading dimension of the two-dimensional array containing A.
 *        lda must be at least max(1, n).
 * x      double precision array of length at least (1 + (n - 1) * abs(incx)).
 *        On entry, x contains the n element right-hand side vector b. On exit,
skipping to change at line 3546 skipping to change at line 6117
 * ------
 * x      updated to contain the solution vector x that solves op(A) * x = b.
 *
 * Reference: http://www.netlib.org/blas/dtrsv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if incx == 0 or if n < 0
 * CUBLAS_STATUS_ARCH_MISMATCH    if invoked on device without DP support
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
/* Triangular solve op(A) * x = b, in place in x; see the description above. */
void CUBLASAPI cublasDtrsv (char uplo, char trans, char diag, int n,
                            const double *A, int lda, double *x,
                            int incx);
/*
 * void
 * cublasDtrmv (char uplo, char trans, char diag, int n, const double *A,
 *              int lda, double *x, int incx);
 *
 * performs one of the matrix-vector operations x = op(A) * x, where op(A) =
 * A, or op(A) = transpose(A). x is an n-element double precision vector, and
 * A is an n x n, unit or non-unit, upper or lower, triangular matrix composed
 * of double precision elements.
 *
 * Input
 * -----
 * uplo   specifies whether the matrix A is an upper or lower triangular
 *        matrix. If uplo = 'U' or 'u', then A is an upper triangular matrix.
 *        If uplo = 'L' or 'l', then A is a lower triangular matrix.
 * trans  specifies op(A). If transa = 'N' or 'n', op(A) = A. If trans = 'T',
 *        't', 'C', or 'c', op(A) = transpose(A)
 * diag   specifies whether or not matrix A is unit triangular. If diag = 'U'
 *        or 'u', A is assumed to be unit triangular. If diag = 'N' or 'n', A
 *        is not assumed to be unit triangular.
 * n      specifies the number of rows and columns of the matrix A. n must be
 *        at least zero.
 * A      double precision array of dimension (lda, n). If uplo = 'U' or 'u',
 *        the leading n x n upper triangular part of the array A must contain
 *        the upper triangular matrix and the strictly lower triangular part
 *        of A is not referenced. If uplo = 'L' or 'l', the leading n x n
 *        lower triangular part of the array A must contain the lower
 *        triangular matrix and the strictly upper triangular part of A is not
 *        referenced. When diag = 'U' or 'u', the diagonal elements of A are
 *        not referenced either, but are assumed to be unity.
 * lda    is the leading dimension of A. It must be at least max (1, n).
 * x      double precision array of length at least (1 + (n - 1) * abs(incx)).
 *        On entry, x contains the source vector. On exit, x is overwritten
 *        with the result vector.
 * incx   specifies the storage spacing for elements of x. incx must not be
 *        zero.
 *
 * Output
 * ------
 * x      updated according to x = op(A) * x,
 *
 * Reference: http://www.netlib.org/blas/dtrmv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if incx == 0 or if n < 0
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasDtrmv (char uplo, char trans, char diag, int n,
                            const double *A, int lda, double *x, int incx);
/*
 * void
 * cublasDgbmv (char trans, int m, int n, int kl, int ku, double alpha,
 *              const double *A, int lda, const double *x, int incx,
 *              double beta, double *y, int incy);
 *
 * performs one of the matrix-vector operations
 *
 *    y = alpha*op(A)*x + beta*y,  op(A)=A or op(A) = transpose(A)
 *
 * alpha and beta are double precision scalars. x and y are double precision
 * vectors. A is an m by n band matrix consisting of double precision elements
 * with kl sub-diagonals and ku super-diagonals.
 *
 * Input
 * -----
 * trans  specifies op(A). If trans == 'N' or 'n', op(A) = A. If trans == 'T',
 *        't', 'C', or 'c', op(A) = transpose(A)
 * m      specifies the number of rows of the matrix A. m must be at least
 *        zero.
 * n      specifies the number of columns of the matrix A. n must be at least
 *        zero.
 * kl     specifies the number of sub-diagonals of matrix A. It must be at
 *        least zero.
 * ku     specifies the number of super-diagonals of matrix A. It must be at
 *        least zero.
 * alpha  double precision scalar multiplier applied to op(A).
 * A      double precision array of dimensions (lda, n). The leading
 *        (kl + ku + 1) x n part of the array A must contain the band matrix A,
 *        supplied column by column, with the leading diagonal of the matrix
 *        in row (ku + 1) of the array, the first super-diagonal starting at
 *        position 2 in row ku, the first sub-diagonal starting at position 1
 *        in row (ku + 2), and so on. Elements in the array A that do not
 *        correspond to elements in the band matrix (such as the top left
 *        ku x ku triangle) are not referenced.
 * lda    leading dimension of A. lda must be at least (kl + ku + 1).
 * x      double precision array of length at least (1+(n-1)*abs(incx)) when
 *        trans == 'N' or 'n' and at least (1+(m-1)*abs(incx)) otherwise.
 * incx   specifies the increment for the elements of x. incx must not be
 *        zero.
 * beta   double precision scalar multiplier applied to vector y. If beta is
 *        zero, y is not read.
 * y      double precision array of length at least (1+(m-1)*abs(incy)) when
 *        trans == 'N' or 'n' and at least (1+(n-1)*abs(incy)) otherwise. If
 *        beta is zero, y is not read.
 * incy   On entry, incy specifies the increment for the elements of y. incy
 *        must not be zero.
 *
 * Output
 * ------
 * y      updated according to y = alpha*op(A)*x + beta*y
 *
 * Reference: http://www.netlib.org/blas/dgbmv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if n < 0, or if incx or incy == 0
 * CUBLAS_STATUS_ARCH_MISMATCH    if invoked on device without DP support
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasDgbmv (char trans, int m, int n, int kl, int ku,
                            double alpha, const double *A, int lda,
                            const double *x, int incx, double beta,
                            double *y, int incy);
/*
 * void
 * cublasDtbmv (char uplo, char trans, char diag, int n, int k, const double *A,
 *              int lda, double *x, int incx)
 *
 * performs one of the matrix-vector operations x = op(A) * x, where op(A) = A,
 * or op(A) = transpose(A). x is an n-element double precision vector, and A is
 * an n x n, unit or non-unit, upper or lower triangular band matrix composed
 * of double precision elements.
 *
 * Input
 * -----
 * uplo   specifies whether the matrix A is an upper or lower triangular band
 *        matrix. If uplo == 'U' or 'u', A is an upper triangular band matrix.
 *        If uplo == 'L' or 'l', A is a lower triangular band matrix.
 * trans  specifies op(A). If transa == 'N' or 'n', op(A) = A. If trans == 'T',
 *        't', 'C', or 'c', op(A) = transpose(A)
 * diag   specifies whether or not matrix A is unit triangular. If diag == 'U'
 *        or 'u', A is assumed to be unit triangular. If diag == 'N' or 'n', A
 *        is not assumed to be unit triangular.
 * n      specifies the number of rows and columns of the matrix A. n must be
 *        at least zero.
 * k      specifies the number of super- or sub-diagonals. If uplo == 'U' or
 *        'u', k specifies the number of super-diagonals. If uplo == 'L' or
 *        'l', k specifies the number of sub-diagonals. k must at least be
 *        zero.
 * A      double precision array of dimension (lda, n). If uplo == 'U' or 'u',
 *        the leading (k + 1) x n part of the array A must contain the upper
 *        triangular band matrix, supplied column by column, with the leading
 *        diagonal of the matrix in row (k + 1) of the array, the first
 *        super-diagonal starting at position 2 in row k, and so on. The top
 *        left k x k triangle of the array A is not referenced. If uplo == 'L'
 *        or 'l', the leading (k + 1) x n part of the array A must contain the
 *        lower triangular band matrix, supplied column by column, with the
 *        leading diagonal of the matrix in row 1 of the array, the first
 *        sub-diagonal starting at position 1 in row 2, and so on. The bottom
 *        right k x k triangle of the array is not referenced.
 * lda    is the leading dimension of A. It must be at least (k + 1).
 * x      double precision array of length at least (1 + (n - 1) * abs(incx)).
 *        On entry, x contains the source vector. On exit, x is overwritten
 *        with the result vector.
 * incx   specifies the storage spacing for elements of x. incx must not be
 *        zero.
 *
 * Output
 * ------
 * x      updated according to x = op(A) * x
 *
 * Reference: http://www.netlib.org/blas/dtbmv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if n or k < 0, or if incx == 0
 * CUBLAS_STATUS_ALLOC_FAILED     if function cannot allocate enough internal scratch vector memory
 * CUBLAS_STATUS_ARCH_MISMATCH    if invoked on device without DP support
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasDtbmv (char uplo, char trans, char diag, int n,
                            int k, const double *A, int lda, double *x,
                            int incx);
/*
 * void
 * cublasDtpmv (char uplo, char trans, char diag, int n, const double *AP,
 *              double *x, int incx);
 *
 * performs one of the matrix-vector operations x = op(A) * x, where op(A) = A,
 * or op(A) = transpose(A). x is an n element double precision vector, and A
 * is an n x n, unit or non-unit, upper or lower triangular matrix composed
 * of double precision elements.
 *
 * Input
 * -----
 * uplo   specifies whether the matrix A is an upper or lower triangular
 *        matrix. If uplo == 'U' or 'u', then A is an upper triangular matrix.
 *        If uplo == 'L' or 'l', then A is a lower triangular matrix.
 * trans  specifies op(A). If transa == 'N' or 'n', op(A) = A. If trans == 'T',
 *        't', 'C', or 'c', op(A) = transpose(A)
 * diag   specifies whether or not matrix A is unit triangular. If diag == 'U'
 *        or 'u', A is assumed to be unit triangular. If diag == 'N' or 'n', A
 *        is not assumed to be unit triangular.
 * n      specifies the number of rows and columns of the matrix A. n must be
 *        at least zero. In the current implementation n must not exceed 4070.
 * AP     double precision array with at least ((n * (n + 1)) / 2) elements.
 *        If uplo == 'U' or 'u', the array AP contains the upper triangular
 *        part of the symmetric matrix A, packed sequentially, column by
 *        column; that is, if i <= j, then A[i,j] is stored in
 *        AP[i+(j*(j+1)/2)]. If uplo == 'L' or 'l', the array AP contains the
 *        lower triangular part of the symmetric matrix A, packed
 *        sequentially, column by column; that is, if i >= j, then A[i,j] is
 *        stored in AP[i+((2*n-j+1)*j)/2].
 * x      double precision array of length at least (1 + (n - 1) * abs(incx)).
 *        On entry, x contains the source vector. On exit, x is overwritten
 *        with the result vector.
 * incx   specifies the storage spacing for elements of x. incx must not be
 *        zero.
 *
 * Output
 * ------
 * x      updated according to x = op(A) * x,
 *
 * Reference: http://www.netlib.org/blas/dtpmv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if incx == 0 or n < 0
 * CUBLAS_STATUS_ALLOC_FAILED     if function cannot allocate enough internal scratch vector memory
 * CUBLAS_STATUS_ARCH_MISMATCH    if invoked on device without DP support
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasDtpmv (char uplo, char trans, char diag, int n,
                            const double *AP, double *x, int incx);
/*
 * void
 * cublasDtpsv (char uplo, char trans, char diag, int n, const double *AP,
 *              double *X, int incx)
 *
 * solves one of the systems of equations op(A)*x = b, where op(A) is either
 * op(A) = A or op(A) = transpose(A). b and x are n element vectors, and A is
 * an n x n unit or non-unit, upper or lower triangular matrix. No test for
 * singularity or near-singularity is included in this routine. Such tests
 * must be performed before calling this routine.
 *
 * Input
 * -----
 * uplo   specifies whether the matrix is an upper or lower triangular matrix
 *        as follows: If uplo == 'U' or 'u', A is an upper triangular matrix.
 *        If uplo == 'L' or 'l', A is a lower triangular matrix.
 * trans  specifies op(A). If trans == 'N' or 'n', op(A) = A. If trans == 'T',
 *        't', 'C', or 'c', op(A) = transpose(A).
 * diag   specifies whether A is unit triangular. If diag == 'U' or 'u', A is
 *        assumed to be unit triangular; that is, diagonal elements are not
 *        read and are assumed to be unity. If diag == 'N' or 'n', A is not
 *        assumed to be unit triangular.
 * n      specifies the number of rows and columns of the matrix A. n must be
 *        at least zero.
 * AP     double precision array with at least ((n*(n+1))/2) elements. If uplo
 *        == 'U' or 'u', the array AP contains the upper triangular matrix A,
 *        packed sequentially, column by column; that is, if i <= j, then
 *        A[i,j] is stored in AP[i+(j*(j+1)/2)]. If uplo == 'L' or 'l', the
 *        array AP contains the lower triangular matrix A, packed sequentially,
 *        column by column; that is, if i >= j, then A[i,j] is stored in
 *        AP[i+((2*n-j+1)*j)/2]. When diag = 'U' or 'u', the diagonal elements
 *        of A are not referenced and are assumed to be unity.
 * x      double precision array of length at least (1+(n-1)*abs(incx)).
 * incx   storage spacing between elements of x. It must not be zero.
 *
 * Output
 * ------
 * x      updated to contain the solution vector x that solves op(A) * x = b.
 *
 * Reference: http://www.netlib.org/blas/dtpsv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if incx == 0 or if n < 0 or n > 2035
 * CUBLAS_STATUS_ARCH_MISMATCH    if invoked on device without DP support
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasDtpsv (char uplo, char trans, char diag, int n,
                            const double *AP, double *x, int incx);
/*
 * void cublasDtbsv (char uplo, char trans, char diag, int n, int k,
 *                   const double *A, int lda, double *X, int incx)
 *
 * solves one of the systems of equations op(A)*x = b, where op(A) is either
 * op(A) = A or op(A) = transpose(A). b and x are n element vectors, and A is
 * an n x n unit or non-unit, upper or lower triangular band matrix with k + 1
 * diagonals. No test for singularity or near-singularity is included in this
 * function. Such tests must be performed before calling this function.
 *
 * Input
 * -----
 * uplo   specifies whether the matrix is an upper or lower triangular band
 *        matrix as follows: If uplo == 'U' or 'u', A is an upper triangular
 *        band matrix. If uplo == 'L' or 'l', A is a lower triangular band
 *        matrix.
 * trans  specifies op(A). If trans == 'N' or 'n', op(A) = A. If trans == 'T',
 *        't', 'C', or 'c', op(A) = transpose(A).
 * diag   specifies whether A is unit triangular. If diag == 'U' or 'u', A is
 *        assumed to be unit triangular; that is, diagonal elements are not
 *        read and are assumed to be unity. If diag == 'N' or 'n', A is not
 *        assumed to be unit triangular.
 * n      specifies the number of rows and columns of the matrix A. n must be
 *        at least zero.
 * k      specifies the number of super- or sub-diagonals. If uplo == 'U' or
 *        'u', k specifies the number of super-diagonals. If uplo == 'L' or
 *        'l', k specifies the number of sub-diagonals. k must at least be
 *        zero.
 * A      double precision array of dimension (lda, n). If uplo == 'U' or 'u',
 *        the leading (k + 1) x n part of the array A must contain the upper
 *        triangular band matrix, supplied column by column, with the leading
 *        diagonal of the matrix in row (k + 1) of the array, the first super-
 *        diagonal starting at position 2 in row k, and so on. The top left
 *        k x k triangle of the array A is not referenced. If uplo == 'L' or
 *        'l', the leading (k + 1) x n part of the array A must contain the
 *        lower triangular band matrix, supplied column by column, with the
 *        leading diagonal of the matrix in row 1 of the array, the first
 *        sub-diagonal starting at position 1 in row 2, and so on. The bottom
 *        right k x k triangle of the array is not referenced.
 * x      double precision array of length at least (1+(n-1)*abs(incx)).
 * incx   storage spacing between elements of x. It must not be zero.
 *
 * Output
 * ------
 * x      updated to contain the solution vector x that solves op(A) * x = b.
 *
 * Reference: http://www.netlib.org/blas/dtbsv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if incx == 0, n < 0 or n > 2035
 * CUBLAS_STATUS_ARCH_MISMATCH    if invoked on device without DP support
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasDtbsv (char uplo, char trans, char diag, int n,
                            int k, const double *A, int lda, double *x,
                            int incx);
/*
 * void
 * cublasDsymv (char uplo, int n, double alpha, const double *A, int lda,
 *              const double *x, int incx, double beta, double *y, int incy)
 *
 * performs the matrix-vector operation
 *
 *     y = alpha*A*x + beta*y
 *
 * Alpha and beta are double precision scalars, and x and y are double
 * precision vectors, each with n elements. A is a symmetric n x n matrix
 * consisting of double precision elements that is stored in either upper or
 * lower storage mode.
 *
 * Input
 * -----
 * uplo   specifies whether the upper or lower triangular part of the array A
 *        is to be referenced. If uplo == 'U' or 'u', the symmetric matrix A
 *        is stored in upper storage mode, i.e. only the upper triangular part
 *        of A is to be referenced while the lower triangular part of A is to
 *        be inferred. If uplo == 'L' or 'l', the symmetric matrix A is stored
 *        in lower storage mode, i.e. only the lower triangular part of A is
 *        to be referenced while the upper triangular part of A is to be
 *        inferred.
 * n      specifies the number of rows and the number of columns of the
 *        symmetric matrix A. n must be at least zero.
 * alpha  double precision scalar multiplier applied to A*x.
 * A      double precision array of dimensions (lda, n). If uplo == 'U' or 'u',
 *        the leading n x n upper triangular part of the array A must contain
 *        the upper triangular part of the symmetric matrix and the strictly
 *        lower triangular part of A is not referenced. If uplo == 'L' or 'l',
 *        the leading n x n lower triangular part of the array A must contain
 *        the lower triangular part of the symmetric matrix and the strictly
 *        upper triangular part of A is not referenced.
 * lda    leading dimension of A. It must be at least max (1, n).
 * x      double precision array of length at least (1 + (n - 1) * abs(incx)).
 * incx   storage spacing between elements of x. incx must not be zero.
 * beta   double precision scalar multiplier applied to vector y.
 * y      double precision array of length at least (1 + (n - 1) * abs(incy)).
 *        If beta is zero, y is not read.
 * incy   storage spacing between elements of y. incy must not be zero.
 *
 * Output
 * ------
 * y      updated according to y = alpha*A*x + beta*y
 *
 * Reference: http://www.netlib.org/blas/dsymv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if n < 0, or if incx or incy == 0
 * CUBLAS_STATUS_ARCH_MISMATCH    if invoked on device without DP support
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasDsymv (char uplo, int n, double alpha,
                            const double *A, int lda, const double *x,
                            int incx, double beta, double *y, int incy);
/*
 * void
 * cublasDsbmv (char uplo, int n, int k, double alpha, const double *A, int lda,
 *              const double *x, int incx, double beta, double *y, int incy)
 *
 * performs the matrix-vector operation
 *
 *     y := alpha*A*x + beta*y
 *
 * alpha and beta are double precision scalars. x and y are double precision
 * vectors with n elements. A is an n by n symmetric band matrix consisting
 * of double precision elements, with k super-diagonals and the same number
 * of subdiagonals.
 *
 * Input
 * -----
 * uplo   specifies whether the upper or lower triangular part of the symmetric
 *        band matrix A is being supplied. If uplo == 'U' or 'u', the upper
 *        triangular part is being supplied. If uplo == 'L' or 'l', the lower
 *        triangular part is being supplied.
 * n      specifies the number of rows and the number of columns of the
 *        symmetric matrix A. n must be at least zero.
 * k      specifies the number of super-diagonals of matrix A. Since the matrix
 *        is symmetric, this is also the number of sub-diagonals. k must be at
 *        least zero.
 * alpha  double precision scalar multiplier applied to A*x.
 * A      double precision array of dimensions (lda, n). When uplo == 'U' or
 *        'u', the leading (k + 1) x n part of array A must contain the upper
 *        triangular band of the symmetric matrix, supplied column by column,
 *        with the leading diagonal of the matrix in row (k+1) of the array,
 *        the first super-diagonal starting at position 2 in row k, and so on.
 *        The top left k x k triangle of the array A is not referenced. When
 *        uplo == 'L' or 'l', the leading (k + 1) x n part of the array A must
 *        contain the lower triangular band part of the symmetric matrix,
 *        supplied column by column, with the leading diagonal of the matrix in
 *        row 1 of the array, the first sub-diagonal starting at position 1 in
 *        row 2, and so on. The bottom right k x k triangle of the array A is
 *        not referenced.
 * lda    leading dimension of A. lda must be at least (k + 1).
 * x      double precision array of length at least (1 + (n - 1) * abs(incx)).
 * incx   storage spacing between elements of x. incx must not be zero.
 * beta   double precision scalar multiplier applied to vector y. If beta is
 *        zero, y is not read.
 * y      double precision array of length at least (1 + (n - 1) * abs(incy)).
 *        If beta is zero, y is not read.
 * incy   storage spacing between elements of y. incy must not be zero.
 *
 * Output
 * ------
 * y      updated according to alpha*A*x + beta*y
 *
 * Reference: http://www.netlib.org/blas/dsbmv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if k or n < 0, or if incx or incy == 0
 * CUBLAS_STATUS_ARCH_MISMATCH    if invoked on device without DP support
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasDsbmv (char uplo, int n, int k, double alpha,
                            const double *A, int lda, const double *x,
                            int incx, double beta, double *y, int incy);
/*
 * void
 * cublasDspmv (char uplo, int n, double alpha, const double *AP, const double *x,
 *              int incx, double beta, double *y, int incy)
 *
 * performs the matrix-vector operation
 *
 *    y = alpha * A * x + beta * y
 *
 * Alpha and beta are double precision scalars, and x and y are double
 * precision vectors with n elements. A is a symmetric n x n matrix
 * consisting of double precision elements that is supplied in packed form.
 *
 * Input
 * -----
 * uplo   specifies whether the matrix data is stored in the upper or the
 *        lower triangular part of array AP. If uplo == 'U' or 'u', then the
 *        upper triangular part of A is supplied in AP. If uplo == 'L' or 'l',
 *        then the lower triangular part of A is supplied in AP.
 * n      specifies the number of rows and columns of the matrix A. It must
 *        be at least zero.
 * alpha  double precision scalar multiplier applied to A*x.
 * AP     double precision array with at least ((n * (n + 1)) / 2) elements.
 *        If uplo == 'U' or 'u', the array AP contains the upper triangular
 *        part of the symmetric matrix A, packed sequentially, column by
 *        column; that is, if i <= j, then A[i,j] is stored in
 *        AP[i+(j*(j+1)/2)]. If uplo == 'L' or 'l', the array AP contains the
 *        lower triangular part of the symmetric matrix A, packed
 *        sequentially, column by column; that is, if i >= j, then A[i,j] is
 *        stored in AP[i+((2*n-j+1)*j)/2].
 * x      double precision array of length at least (1 + (n - 1) * abs(incx)).
 * incx   storage spacing between elements of x. incx must not be zero.
 * beta   double precision scalar multiplier applied to vector y.
 * y      double precision array of length at least (1 + (n - 1) * abs(incy)).
 *        If beta is zero, y is not read.
 * incy   storage spacing between elements of y. incy must not be zero.
 *
 * Output
 * ------
 * y      updated according to y = alpha*A*x + beta*y
 *
 * Reference: http://www.netlib.org/blas/dspmv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if n < 0, or if incx or incy == 0
 * CUBLAS_STATUS_ARCH_MISMATCH    if invoked on device without DP support
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasDspmv (char uplo, int n, double alpha,
                            const double *AP, const double *x,
                            int incx, double beta, double *y, int incy);
/* ---------------- CUBLAS double precision BLAS3 functions ---------------- */
/*
 * void
 * cublasDgemm (char transa, char transb, int m, int n, int k, double alpha,
 *              const double *A, int lda, const double *B, int ldb,
 *              double beta, double *C, int ldc)
 *
 * computes the product of matrix A and matrix B, multiplies the result
 * by scalar alpha, and adds the sum to the product of matrix C and
skipping to change at line 3945 skipping to change at line 7051
 * CUBLAS_STATUS_ARCH_MISMATCH    if invoked on device without DP support
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
/* Symmetric matrix-matrix multiply, C = alpha*A*B + beta*C or
   C = alpha*B*A + beta*C; see the description above. */
void CUBLASAPI cublasDsymm (char side, char uplo, int m, int n,
                            double alpha, const double *A, int lda,
                            const double *B, int ldb, double beta,
                            double *C, int ldc);
/* /*
* void * void
* cublasZsymm (char side, char uplo, int m, int n, cuDoubleComplex alpha,
* const cuDoubleComplex *A, int lda, const cuDoubleComplex *B
, int ldb,
* cuDoubleComplex beta, cuDoubleComplex *C, int ldc);
*
* performs one of the matrix-matrix operations
*
* C = alpha * A * B + beta * C, or
* C = alpha * B * A + beta * C,
*
* where alpha and beta are double precision complex scalars, A is a symmet
ric matrix
* consisting of double precision complex elements and stored in either low
er or upper
* storage mode, and B and C are m x n matrices consisting of double precis
ion
* complex elements.
*
* Input
* -----
* side specifies whether the symmetric matrix A appears on the left side
* hand side or right hand side of matrix B, as follows. If side ==
'L'
* or 'l', then C = alpha * A * B + beta * C. If side = 'R' or 'r',
* then C = alpha * B * A + beta * C.
* uplo specifies whether the symmetric matrix A is stored in upper or lo
wer
* storage mode, as follows. If uplo == 'U' or 'u', only the upper
* triangular part of the symmetric matrix is to be referenced, and
the
* elements of the strictly lower triangular part are to be infered
from
* those in the upper triangular part. If uplo == 'L' or 'l', only t
he
* lower triangular part of the symmetric matrix is to be referenced
,
* and the elements of the strictly upper triangular part are to be
* infered from those in the lower triangular part.
* m specifies the number of rows of the matrix C, and the number of r
ows
* of matrix B. It also specifies the dimensions of symmetric matrix
A
* when side == 'L' or 'l'. m must be at least zero.
* n specifies the number of columns of the matrix C, and the number o
f
* columns of matrix B. It also specifies the dimensions of symmetri
c
* matrix A when side == 'R' or 'r'. n must be at least zero.
* alpha double precision scalar multiplier applied to A * B, or B * A
* A double precision array of dimensions (lda, ka), where ka is m whe
n
* side == 'L' or 'l' and is n otherwise. If side == 'L' or 'l' the
* leading m x m part of array A must contain the symmetric matrix,
* such that when uplo == 'U' or 'u', the leading m x m part stores
the
* upper triangular part of the symmetric matrix, and the strictly l
ower
* triangular part of A is not referenced, and when uplo == 'U' or '
u',
* the leading m x m part stores the lower triangular part of the
* symmetric matrix and the strictly upper triangular part is not
* referenced. If side == 'R' or 'r' the leading n x n part of array
A
* must contain the symmetric matrix, such that when uplo == 'U' or
'u',
* the leading n x n part stores the upper triangular part of the
* symmetric matrix and the strictly lower triangular part of A is n
ot
* referenced, and when uplo == 'U' or 'u', the leading n x n part
* stores the lower triangular part of the symmetric matrix and the
* strictly upper triangular part is not referenced.
* lda leading dimension of A. When side == 'L' or 'l', it must be at le
ast
* max(1, m) and at least max(1, n) otherwise.
* B double precision array of dimensions (ldb, n). On entry, the lead
ing
* m x n part of the array contains the matrix B.
* ldb leading dimension of B. It must be at least max (1, m).
* beta double precision scalar multiplier applied to C. If beta is zero,
C
* does not have to be a valid input
* C double precision array of dimensions (ldc, n)
* ldc leading dimension of C. Must be at least max(1, m)
*
* Output
* ------
* C updated according to C = alpha * A * B + beta * C, or C = alpha *
* B * A + beta * C
*
* Reference: http://www.netlib.org/blas/zsymm.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if m or n are < 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* cublasZsymm: double-precision complex symmetric matrix-matrix multiply.
 * Computes C = alpha*A*B + beta*C (side == 'L'/'l') or
 * C = alpha*B*A + beta*C (side == 'R'/'r'); see the comment block above.
 * (Restored: the extracted text split identifiers across line breaks.) */
void CUBLASAPI cublasZsymm (char side, char uplo, int m, int n,
                            cuDoubleComplex alpha, const cuDoubleComplex *A,
                            int lda, const cuDoubleComplex *B, int ldb,
                            cuDoubleComplex beta, cuDoubleComplex *C, int ldc);
/*
* void
* cublasDsyrk (char uplo, char trans, int n, int k, double alpha, * cublasDsyrk (char uplo, char trans, int n, int k, double alpha,
* const double *A, int lda, double beta, double *C, int ldc) * const double *A, int lda, double beta, double *C, int ldc)
* *
* performs one of the symmetric rank k operations * performs one of the symmetric rank k operations
* *
* C = alpha * A * transpose(A) + beta * C, or * C = alpha * A * transpose(A) + beta * C, or
* C = alpha * transpose(A) * A + beta * C. * C = alpha * transpose(A) * A + beta * C.
* *
* Alpha and beta are double precision scalars. C is an n x n symmetric mat rix * Alpha and beta are double precision scalars. C is an n x n symmetric mat rix
* consisting of double precision elements and stored in either lower or * consisting of double precision elements and stored in either lower or
 * [... documentation elided in this extract ...]
* otherwise the leading k x n part of the array must contains the * otherwise the leading k x n part of the array must contains the
* matrix A. * matrix A.
* lda leading dimension of A. When trans == 'N' or 'n' then lda must be at * lda leading dimension of A. When trans == 'N' or 'n' then lda must be at
* least max(1, n). Otherwise lda must be at least max(1, k). * least max(1, n). Otherwise lda must be at least max(1, k).
 * beta   double precision scalar multiplier applied to C. If beta is zero, C
* does not have to be a valid input * does not have to be a valid input
* C double precision array of dimensions (ldc, n). If uplo = 'U' or ' u', * C double precision array of dimensions (ldc, n). If uplo = 'U' or ' u',
* the leading n x n triangular part of the array C must contain the * the leading n x n triangular part of the array C must contain the
* upper triangular part of the symmetric matrix C and the strictly * upper triangular part of the symmetric matrix C and the strictly
* lower triangular part of C is not referenced. On exit, the upper * lower triangular part of C is not referenced. On exit, the upper
 *        triangular part of C is overwritten by the upper triangular part of
* the updated matrix. If uplo = 'L' or 'l', the leading n x n * the updated matrix. If uplo = 'L' or 'l', the leading n x n
* triangular part of the array C must contain the lower triangular part * triangular part of the array C must contain the lower triangular part
* of the symmetric matrix C and the strictly upper triangular part of C * of the symmetric matrix C and the strictly upper triangular part of C
* is not referenced. On exit, the lower triangular part of C is * is not referenced. On exit, the lower triangular part of C is
 *        overwritten by the lower triangular part of the updated matrix.
* ldc leading dimension of C. It must be at least max(1, n). * ldc leading dimension of C. It must be at least max(1, n).
* *
* Output * Output
* ------ * ------
* C updated according to C = alpha * A * transpose(A) + beta * C, or C = * C updated according to C = alpha * A * transpose(A) + beta * C, or C =
* alpha * transpose(A) * A + beta * C * alpha * transpose(A) * A + beta * C
* *
* Reference: http://www.netlib.org/blas/dsyrk.f * Reference: http://www.netlib.org/blas/dsyrk.f
* *
* Error status for this function can be retrieved via cublasGetError(). * Error status for this function can be retrieved via cublasGetError().
 * [... documentation elided in this extract ...]
* otherwise the leading k x n part of the array must contains the * otherwise the leading k x n part of the array must contains the
* matrix A. * matrix A.
* lda leading dimension of A. When trans == 'N' or 'n' then lda must be at * lda leading dimension of A. When trans == 'N' or 'n' then lda must be at
* least max(1, n). Otherwise lda must be at least max(1, k). * least max(1, n). Otherwise lda must be at least max(1, k).
 * beta   double precision complex scalar multiplier applied to C. If beta is zero, C
* does not have to be a valid input * does not have to be a valid input
* C double precision complex array of dimensions (ldc, n). If uplo = 'U' or 'u', * C double precision complex array of dimensions (ldc, n). If uplo = 'U' or 'u',
* the leading n x n triangular part of the array C must contain the * the leading n x n triangular part of the array C must contain the
* upper triangular part of the symmetric matrix C and the strictly * upper triangular part of the symmetric matrix C and the strictly
* lower triangular part of C is not referenced. On exit, the upper * lower triangular part of C is not referenced. On exit, the upper
 *        triangular part of C is overwritten by the upper triangular part of
* the updated matrix. If uplo = 'L' or 'l', the leading n x n * the updated matrix. If uplo = 'L' or 'l', the leading n x n
* triangular part of the array C must contain the lower triangular part * triangular part of the array C must contain the lower triangular part
* of the symmetric matrix C and the strictly upper triangular part of C * of the symmetric matrix C and the strictly upper triangular part of C
* is not referenced. On exit, the lower triangular part of C is * is not referenced. On exit, the lower triangular part of C is
 *        overwritten by the lower triangular part of the updated matrix.
* ldc leading dimension of C. It must be at least max(1, n). * ldc leading dimension of C. It must be at least max(1, n).
* *
* Output * Output
* ------ * ------
* C updated according to C = alpha * A * transpose(A) + beta * C, or C = * C updated according to C = alpha * A * transpose(A) + beta * C, or C =
* alpha * transpose(A) * A + beta * C * alpha * transpose(A) * A + beta * C
* *
* Reference: http://www.netlib.org/blas/zsyrk.f * Reference: http://www.netlib.org/blas/zsyrk.f
* *
* Error status for this function can be retrieved via cublasGetError(). * Error status for this function can be retrieved via cublasGetError().
 * [... documentation elided in this extract ...]
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d
* CUBLAS_STATUS_INVALID_VALUE if n < 0 or k < 0 * CUBLAS_STATUS_INVALID_VALUE if n < 0 or k < 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support * CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/ */
/* cublasZsyrk: double-precision complex symmetric rank-k update.
 * Computes C = alpha*A*transpose(A) + beta*C (trans == 'N'/'n') or
 * C = alpha*transpose(A)*A + beta*C; see the comment block above.
 * (Restored: the extracted text carried the declaration twice per line.) */
void CUBLASAPI cublasZsyrk (char uplo, char trans, int n, int k,
                            cuDoubleComplex alpha,
                            const cuDoubleComplex *A, int lda,
                            cuDoubleComplex beta,
                            cuDoubleComplex *C, int ldc);
/*
* void
* cublasZsyr2k (char uplo, char trans, int n, int k, cuDoubleComplex alpha
,
* const cuDoubleComplex *A, int lda, const cuDoubleComplex *
B, int ldb,
* cuDoubleComplex beta, cuDoubleComplex *C, int ldc)
*
* performs one of the symmetric rank 2k operations
*
* C = alpha * A * transpose(B) + alpha * B * transpose(A) + beta * C, o
r
* C = alpha * transpose(A) * B + alpha * transpose(B) * A + beta * C.
*
* Alpha and beta are double precision complex scalars. C is an n x n symme
tric matrix
* consisting of double precision complex elements and stored in either low
er or upper
* storage mode. A and B are matrices consisting of double precision comple
x elements
* with dimension of n x k in the first case, and k x n in the second case.
*
* Input
* -----
* uplo specifies whether the symmetric matrix C is stored in upper or lo
wer
* storage mode, as follows. If uplo == 'U' or 'u', only the upper
* triangular part of the symmetric matrix is to be referenced, and
the
* elements of the strictly lower triangular part are to be infered
from
 *          those in the upper triangular part. If uplo == 'L' or 'l', only the
 *          lower triangular part of the symmetric matrix is to be referenced,
 *          and the elements of the strictly upper triangular part are to be
 *          inferred from those in the lower triangular part.
* trans specifies the operation to be performed. If trans == 'N' or 'n',
* C = alpha * A * transpose(B) + alpha * B * transpose(A) + beta *
C,
* If trans == 'T', 't', 'C', or 'c', C = alpha * transpose(A) * B +
* alpha * transpose(B) * A + beta * C.
* n specifies the number of rows and the number columns of matrix C.
If
* trans == 'N' or 'n', n specifies the number of rows of matrix A.
If
* trans == 'T', 't', 'C', or 'c', n specifies the columns of matrix
A.
* n must be at least zero.
* k If trans == 'N' or 'n', k specifies the number of rows of matrix
A.
* If trans == 'T', 't', 'C', or 'c', k specifies the number of rows
of
* matrix A. k must be at least zero.
* alpha double precision scalar multiplier.
* A double precision array of dimensions (lda, ka), where ka is k whe
n
* trans == 'N' or 'n', and is n otherwise. When trans == 'N' or 'n'
,
* the leading n x k part of array A must contain the matrix A,
* otherwise the leading k x n part of the array must contain the ma
trix
* A.
* lda leading dimension of A. When trans == 'N' or 'n' then lda must be
at
* least max(1, n). Otherwise lda must be at least max(1,k).
* B double precision array of dimensions (lda, kb), where kb is k whe
n
* trans == 'N' or 'n', and is n otherwise. When trans == 'N' or 'n'
,
* the leading n x k part of array B must contain the matrix B,
* otherwise the leading k x n part of the array must contain the ma
trix
* B.
 * ldb    leading dimension of B. When trans == 'N' or 'n' then ldb must be at
 *        least max(1, n). Otherwise ldb must be at least max(1, k).
* beta double precision scalar multiplier applied to C. If beta is zero,
C
* does not have to be a valid input.
* C double precision array of dimensions (ldc, n). If uplo == 'U' or
'u',
* the leading n x n triangular part of the array C must contain the
* upper triangular part of the symmetric matrix C and the strictly
* lower triangular part of C is not referenced. On exit, the upper
* triangular part of C is overwritten by the upper triangular part
of
* the updated matrix. If uplo == 'L' or 'l', the leading n x n
* triangular part of the array C must contain the lower triangular
part
* of the symmetric matrix C and the strictly upper triangular part
of C
* is not referenced. On exit, the lower triangular part of C is
* overwritten by the lower triangular part of the updated matrix.
* ldc leading dimension of C. Must be at least max(1, n).
*
* Output
* ------
* C updated according to alpha*A*transpose(B) + alpha*B*transpose(A)
+
* beta*C or alpha*transpose(A)*B + alpha*transpose(B)*A + beta*C
*
* Reference: http://www.netlib.org/blas/zsyr2k.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0 or k < 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* cublasZsyr2k: double-precision complex symmetric rank-2k update.
 * Computes C = alpha*A*transpose(B) + alpha*B*transpose(A) + beta*C
 * (trans == 'N'/'n') or the transposed variant; see the comment block above.
 * (Restored: the extracted text split identifiers across line breaks.) */
void CUBLASAPI cublasZsyr2k (char uplo, char trans, int n, int k,
                             cuDoubleComplex alpha, const cuDoubleComplex *A,
                             int lda, const cuDoubleComplex *B, int ldb,
                             cuDoubleComplex beta, cuDoubleComplex *C, int ldc);
/*
* void
* cublasZher2k (char uplo, char trans, int n, int k, cuDoubleComplex alpha
,
* const cuDoubleComplex *A, int lda, const cuDoubleComplex *
B, int ldb,
* double beta, cuDoubleComplex *C, int ldc)
*
* performs one of the hermitian rank 2k operations
*
* C = alpha * A * conjugate(transpose(B))
* + conjugate(alpha) * B * conjugate(transpose(A))
* + beta * C ,
* or
* C = alpha * conjugate(transpose(A)) * B
* + conjugate(alpha) * conjugate(transpose(B)) * A
* + beta * C.
*
* Alpha is double precision complex scalar whereas Beta is a double precis
ion real scalar.
* C is an n x n hermitian matrix consisting of double precision complex el
ements and
* stored in either lower or upper storage mode. A and B are matrices consi
sting of
* double precision complex elements with dimension of n x k in the first c
ase,
* and k x n in the second case.
*
* Input
* -----
* uplo specifies whether the hermitian matrix C is stored in upper or lo
wer
* storage mode, as follows. If uplo == 'U' or 'u', only the upper
* triangular part of the hermitian matrix is to be referenced, and
the
* elements of the strictly lower triangular part are to be infered
from
 *          those in the upper triangular part. If uplo == 'L' or 'l', only the
 *          lower triangular part of the hermitian matrix is to be referenced,
 *          and the elements of the strictly upper triangular part are to be
 *          inferred from those in the lower triangular part.
* trans specifies the operation to be performed. If trans == 'N' or 'n',
* C = alpha * A * conjugate(transpose(B))
* + conjugate(alpha) * B * conjugate(transpose(A))
* + beta * C .
* If trans == 'T', 't', 'C', or 'c',
* C = alpha * conjugate(transpose(A)) * B
* + conjugate(alpha) * conjugate(transpose(B)) * A
* + beta * C.
* n specifies the number of rows and the number columns of matrix C.
If
* trans == 'N' or 'n', n specifies the number of rows of matrix A.
If
* trans == 'T', 't', 'C', or 'c', n specifies the columns of matrix
A.
* n must be at least zero.
* k If trans == 'N' or 'n', k specifies the number of rows of matrix
A.
* If trans == 'T', 't', 'C', or 'c', k specifies the number of rows
of
* matrix A. k must be at least zero.
* alpha double precision scalar multiplier.
* A double precision array of dimensions (lda, ka), where ka is k whe
n
* trans == 'N' or 'n', and is n otherwise. When trans == 'N' or 'n'
,
* the leading n x k part of array A must contain the matrix A,
* otherwise the leading k x n part of the array must contain the ma
trix
* A.
* lda leading dimension of A. When trans == 'N' or 'n' then lda must be
at
* least max(1, n). Otherwise lda must be at least max(1,k).
* B double precision array of dimensions (lda, kb), where kb is k whe
n
* trans == 'N' or 'n', and is n otherwise. When trans == 'N' or 'n'
,
* the leading n x k part of array B must contain the matrix B,
* otherwise the leading k x n part of the array must contain the ma
trix
* B.
 * ldb    leading dimension of B. When trans == 'N' or 'n' then ldb must be at
 *        least max(1, n). Otherwise ldb must be at least max(1, k).
* beta double precision scalar multiplier applied to C. If beta is zero,
C
* does not have to be a valid input.
* C double precision array of dimensions (ldc, n). If uplo == 'U' or
'u',
* the leading n x n triangular part of the array C must contain the
* upper triangular part of the hermitian matrix C and the strictly
* lower triangular part of C is not referenced. On exit, the upper
* triangular part of C is overwritten by the upper triangular part
of
* the updated matrix. If uplo == 'L' or 'l', the leading n x n
* triangular part of the array C must contain the lower triangular
part
* of the hermitian matrix C and the strictly upper triangular part
of C
* is not referenced. On exit, the lower triangular part of C is
* overwritten by the lower triangular part of the updated matrix.
* The imaginary parts of the diagonal elements need
* not be set, they are assumed to be zero, and on exit they
* are set to zero.
* ldc leading dimension of C. Must be at least max(1, n).
*
* Output
* ------
* C updated according to alpha*A*conjugate(transpose(B)) +
* + conjugate(alpha)*B*conjugate(transpose(A)) + beta*C or
* alpha*conjugate(transpose(A))*B + conjugate(alpha)*conjugate(tran
spose(B))*A
* + beta*C.
*
* Reference: http://www.netlib.org/blas/zher2k.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0 or k < 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* cublasZher2k: double-precision complex hermitian rank-2k update.
 * Note alpha is complex while beta is real (double), matching the
 * documentation block above, which gives the exact operation per trans.
 * (Restored: the extracted text split identifiers across line breaks.) */
void CUBLASAPI cublasZher2k (char uplo, char trans, int n, int k,
                             cuDoubleComplex alpha, const cuDoubleComplex *A,
                             int lda, const cuDoubleComplex *B, int ldb,
                             double beta, cuDoubleComplex *C, int ldc);
/*
* void
* cublasZher (char uplo, int n, double alpha, const cuDoubleComplex *x, in
t incx,
* cuDoubleComplex *A, int lda)
*
* performs the hermitian rank 1 operation
*
* A = alpha * x * conjugate(transpose(x)) + A,
*
* where alpha is a double precision real scalar, x is an n element double
* precision complex vector and A is an n x n hermitian matrix consisting o
f
* double precision complex elements. Matrix A is stored in column major fo
rmat,
* and lda is the leading dimension of the two-dimensional array
* containing A.
*
* Input
* -----
* uplo specifies whether the matrix data is stored in the upper or
* the lower triangular part of array A. If uplo = 'U' or 'u',
* then only the upper triangular part of A may be referenced.
* If uplo = 'L' or 'l', then only the lower triangular part of
* A may be referenced.
* n specifies the number of rows and columns of the matrix A. It
* must be at least 0.
* alpha double precision real scalar multiplier applied to
* x * conjugate(transpose(x))
* x double precision complex array of length at least (1 + (n - 1) *
abs(incx))
* incx specifies the storage spacing between elements of x. incx must
* not be zero.
* A double precision complex array of dimensions (lda, n). If uplo =
'U' or
* 'u', then A must contain the upper triangular part of a hermitian
* matrix, and the strictly lower triangular part is not referenced.
* If uplo = 'L' or 'l', then A contains the lower triangular part
* of a hermitian matrix, and the strictly upper triangular part is
* not referenced. The imaginary parts of the diagonal elements need
* not be set, they are assumed to be zero, and on exit they
* are set to zero.
* lda leading dimension of the two-dimensional array containing A. lda
* must be at least max(1, n).
*
* Output
* ------
* A updated according to A = alpha * x * conjugate(transpose(x)) + A
*
* Reference: http://www.netlib.org/blas/zher.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0, or incx == 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* cublasZher: hermitian rank-1 update
 * A = alpha * x * conjugate(transpose(x)) + A, with real scalar alpha;
 * see the comment block above for parameter details.
 * (Restored: the extracted text split identifiers across line breaks.) */
void CUBLASAPI cublasZher (char uplo, int n, double alpha,
                           const cuDoubleComplex *x, int incx,
                           cuDoubleComplex *A, int lda);
/*
* void
* cublasZher (char uplo, int n, double alpha, const cuDoubleComplex *x, in
t incx,
* cuDoubleComplex *A, int lda)
*
* performs the hermitian rank 1 operation
*
 *     A = alpha * x * conjugate(transpose(x)) + A,
*
* where alpha is a double precision real scalar, x is an n element double
* precision complex vector and A is an n x n hermitian matrix consisting o
f
* double precision complex elements. Matrix A is stored in column major fo
rmat,
* and lda is the leading dimension of the two-dimensional array
* containing A.
*
* Input
* -----
* uplo specifies whether the matrix data is stored in the upper or
* the lower triangular part of array A. If uplo = 'U' or 'u',
* then only the upper triangular part of A may be referenced.
* If uplo = 'L' or 'l', then only the lower triangular part of
* A may be referenced.
* n specifies the number of rows and columns of the matrix A. It
* must be at least 0.
* alpha double precision real scalar multiplier applied to
* x * conjugate(transpose(x))
* x double precision complex array of length at least (1 + (n - 1) *
abs(incx))
* incx specifies the storage spacing between elements of x. incx must
* not be zero.
* A double precision complex array of dimensions (lda, n). If uplo =
'U' or
* 'u', then A must contain the upper triangular part of a hermitian
* matrix, and the strictly lower triangular part is not referenced.
* If uplo = 'L' or 'l', then A contains the lower triangular part
* of a hermitian matrix, and the strictly upper triangular part is
* not referenced. The imaginary parts of the diagonal elements need
* not be set, they are assumed to be zero, and on exit they
* are set to zero.
* lda leading dimension of the two-dimensional array containing A. lda
* must be at least max(1, n).
*
* Output
* ------
 * A      updated according to A = alpha * x * conjugate(transpose(x)) + A
*
* Reference: http://www.netlib.org/blas/zher.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0, or incx == 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* NOTE(review): duplicate declaration of cublasZher, identical to the one
 * above (a repeated identical prototype is legal C). Kept to preserve the
 * original file layout; presumably an artifact of the header's history.
 * (Restored: the extracted text split identifiers across line breaks.) */
void CUBLASAPI cublasZher (char uplo, int n, double alpha,
                           const cuDoubleComplex *x, int incx,
                           cuDoubleComplex *A, int lda);
/*
* void
* cublasZhpr (char uplo, int n, double alpha, const cuDoubleComplex *x, in
t incx,
* cuDoubleComplex *AP)
*
* performs the hermitian rank 1 operation
*
* A = alpha * x * conjugate(transpose(x)) + A,
*
* where alpha is a double precision real scalar and x is an n element doub
le
* precision complex vector. A is a hermitian n x n matrix consisting of do
uble
* precision complex elements that is supplied in packed form.
*
* Input
* -----
* uplo specifies whether the matrix data is stored in the upper or the l
ower
* triangular part of array AP. If uplo == 'U' or 'u', then the uppe
r
* triangular part of A is supplied in AP. If uplo == 'L' or 'l', th
en
* the lower triangular part of A is supplied in AP.
* n specifies the number of rows and columns of the matrix A. It must
be
* at least zero.
* alpha double precision real scalar multiplier applied to x * conjugate(
transpose(x)).
* x double precision array of length at least (1 + (n - 1) * abs(incx
)).
* incx storage spacing between elements of x. incx must not be zero.
* AP double precision complex array with at least ((n * (n + 1)) / 2)
elements. If
* uplo == 'U' or 'u', the array AP contains the upper triangular pa
rt
* of the hermitian matrix A, packed sequentially, column by column;
 *          that is, if i <= j, then A[i,j] is stored in AP[i+(j*(j+1)/2)]. If
* uplo == 'L' or 'L', the array AP contains the lower triangular pa
rt
* of the hermitian matrix A, packed sequentially, column by column;
* that is, if i >= j, then A[i,j] is stored in AP[i+((2*n-j+1)*j)/2
].
* The imaginary parts of the diagonal elements need not be set, the
y
* are assumed to be zero, and on exit they are set to zero.
*
* Output
* ------
* A updated according to A = alpha * x * conjugate(transpose(x)) + A
*
* Reference: http://www.netlib.org/blas/zhpr.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0, or incx == 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* cublasZhpr: hermitian rank-1 update of a packed matrix:
 * A = alpha * x * conjugate(transpose(x)) + A, with A supplied in packed
 * storage AP; see the comment block above.
 * (Restored: the extracted text split identifiers across line breaks.) */
void CUBLASAPI cublasZhpr (char uplo, int n, double alpha,
                           const cuDoubleComplex *x, int incx,
                           cuDoubleComplex *AP);
/*
* void
* cublasZhpr2 (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComp
lex *x, int incx,
* const cuDoubleComplex *y, int incy, cuDoubleComplex *AP)
*
* performs the hermitian rank 2 operation
*
* A = alpha*x*conjugate(transpose(y)) + conjugate(alpha)*y*conjugate(tr
anspose(x)) + A,
*
* where alpha is a double precision complex scalar, and x and y are n elem
ent double
* precision complex vectors. A is a hermitian n x n matrix consisting of d
ouble
* precision complex elements that is supplied in packed form.
*
* Input
* -----
* uplo specifies whether the matrix data is stored in the upper or the l
ower
* triangular part of array A. If uplo == 'U' or 'u', then only the
* upper triangular part of A may be referenced and the lower triang
ular
* part of A is inferred. If uplo == 'L' or 'l', then only the lower
* triangular part of A may be referenced and the upper triangular p
art
* of A is inferred.
* n specifies the number of rows and columns of the matrix A. It must
be
* at least zero.
* alpha double precision complex scalar multiplier applied to x * conjuga
te(transpose(y)) +
* y * conjugate(transpose(x)).
* x double precision complex array of length at least (1 + (n - 1) *
abs (incx)).
* incx storage spacing between elements of x. incx must not be zero.
* y double precision complex array of length at least (1 + (n - 1) *
abs (incy)).
* incy storage spacing between elements of y. incy must not be zero.
* AP double precision complex array with at least ((n * (n + 1)) / 2)
elements. If
* uplo == 'U' or 'u', the array AP contains the upper triangular pa
rt
* of the hermitian matrix A, packed sequentially, column by column;
 *          that is, if i <= j, then A[i,j] is stored in AP[i+(j*(j+1)/2)]. If
* uplo == 'L' or 'L', the array AP contains the lower triangular pa
rt
* of the hermitian matrix A, packed sequentially, column by column;
* that is, if i >= j, then A[i,j] is stored in AP[i+((2*n-j+1)*j)/2
].
* The imaginary parts of the diagonal elements need not be set, the
y
* are assumed to be zero, and on exit they are set to zero.
*
* Output
* ------
* A updated according to A = alpha*x*conjugate(transpose(y))
* + conjugate(alpha)*y*conjugate(transpose(x
))+A
*
* Reference: http://www.netlib.org/blas/zhpr2.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0, incx == 0, incy == 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* cublasZhpr2: hermitian rank-2 update of a packed matrix:
 * A = alpha*x*conjugate(transpose(y)) + conjugate(alpha)*y*conjugate(transpose(x)) + A,
 * with A supplied in packed storage AP; see the comment block above.
 * (Restored: the extracted text split identifiers across line breaks.) */
void CUBLASAPI cublasZhpr2 (char uplo, int n, cuDoubleComplex alpha,
                            const cuDoubleComplex *x, int incx,
                            const cuDoubleComplex *y, int incy,
                            cuDoubleComplex *AP);
/*
* void cublasZher2 (char uplo, int n, cuDoubleComplex alpha, const cuDoubl
eComplex *x, int incx,
* const cuDoubleComplex *y, int incy, cuDoubleComplex *A
, int lda)
*
* performs the hermitian rank 2 operation
*
* A = alpha*x*conjugate(transpose(y)) + conjugate(alpha)*y*conjugate(tr
anspose(x)) + A,
*
* where alpha is a double precision complex scalar, x and y are n element
double
* precision complex vector and A is an n by n hermitian matrix consisting
of double
* precision complex elements.
*
* Input
* -----
* uplo specifies whether the matrix data is stored in the upper or the l
ower
* triangular part of array A. If uplo == 'U' or 'u', then only the
* upper triangular part of A may be referenced and the lower triang
ular
* part of A is inferred. If uplo == 'L' or 'l', then only the lower
* triangular part of A may be referenced and the upper triangular p
art
* of A is inferred.
* n specifies the number of rows and columns of the matrix A. It must
be
* at least zero.
* alpha double precision complex scalar multiplier applied to x * conjuga
te(transpose(y)) +
* y * conjugate(transpose(x)).
* x double precision array of length at least (1 + (n - 1) * abs (inc
x)).
* incx storage spacing between elements of x. incx must not be zero.
* y double precision array of length at least (1 + (n - 1) * abs (inc
y)).
* incy storage spacing between elements of y. incy must not be zero.
* A double precision complex array of dimensions (lda, n). If uplo ==
'U' or 'u',
* then A must contains the upper triangular part of a hermitian mat
rix,
* and the strictly lower triangular parts is not referenced. If upl
o ==
* 'L' or 'l', then A contains the lower triangular part of a hermit
ian
* matrix, and the strictly upper triangular part is not referenced.
* The imaginary parts of the diagonal elements need not be set,
* they are assumed to be zero, and on exit they are set to zero.
*
* lda leading dimension of A. It must be at least max(1, n).
*
* Output
* ------
* A updated according to A = alpha*x*conjugate(transpose(y))
* + conjugate(alpha)*y*conjugate(transpose(x
))+A
*
* Reference: http://www.netlib.org/blas/zher2.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0, incx == 0, incy == 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasZher2 (char uplo, int n, cuDoubleComplex alpha,
const cuDoubleComplex *x, int incx, const cuDo
ubleComplex *y,
int incy, cuDoubleComplex *A, int lda);
/* /*
* void * void
* cublasDsyr2k (char uplo, char trans, int n, int k, double alpha, * cublasDsyr2k (char uplo, char trans, int n, int k, double alpha,
* const double *A, int lda, const double *B, int ldb, * const double *A, int lda, const double *B, int ldb,
* double beta, double *C, int ldc) * double beta, double *C, int ldc)
* *
* performs one of the symmetric rank 2k operations * performs one of the symmetric rank 2k operations
* *
* C = alpha * A * transpose(B) + alpha * B * transpose(A) + beta * C, o r * C = alpha * A * transpose(B) + alpha * B * transpose(A) + beta * C, o r
skipping to change at line 4159 skipping to change at line 7821
* otherwise the leading k x n part of the array must contain the ma trix * otherwise the leading k x n part of the array must contain the ma trix
* B. * B.
* ldb leading dimension of N. When trans == 'N' or 'n' then ldb must be at * ldb leading dimension of N. When trans == 'N' or 'n' then ldb must be at
* least max(1, n). Otherwise ldb must be at least max(1, k). * least max(1, n). Otherwise ldb must be at least max(1, k).
* beta double precision scalar multiplier applied to C. If beta is zero, C * beta double precision scalar multiplier applied to C. If beta is zero, C
* does not have to be a valid input. * does not have to be a valid input.
* C double precision array of dimensions (ldc, n). If uplo == 'U' or 'u', * C double precision array of dimensions (ldc, n). If uplo == 'U' or 'u',
* the leading n x n triangular part of the array C must contain the * the leading n x n triangular part of the array C must contain the
* upper triangular part of the symmetric matrix C and the strictly * upper triangular part of the symmetric matrix C and the strictly
* lower triangular part of C is not referenced. On exit, the upper * lower triangular part of C is not referenced. On exit, the upper
* triangular part of C is overwritten by the upper trinagular part of * triangular part of C is overwritten by the upper triangular part of
* the updated matrix. If uplo == 'L' or 'l', the leading n x n * the updated matrix. If uplo == 'L' or 'l', the leading n x n
* triangular part of the array C must contain the lower triangular part * triangular part of the array C must contain the lower triangular part
* of the symmetric matrix C and the strictly upper triangular part of C * of the symmetric matrix C and the strictly upper triangular part of C
* is not referenced. On exit, the lower triangular part of C is * is not referenced. On exit, the lower triangular part of C is
* overwritten by the lower trinagular part of the updated matrix. * overwritten by the lower triangular part of the updated matrix.
* ldc leading dimension of C. Must be at least max(1, n). * ldc leading dimension of C. Must be at least max(1, n).
* *
* Output * Output
* ------ * ------
* C updated according to alpha*A*transpose(B) + alpha*B*transpose(A) + * C updated according to alpha*A*transpose(B) + alpha*B*transpose(A) +
* beta*C or alpha*transpose(A)*B + alpha*transpose(B)*A + beta*C * beta*C or alpha*transpose(A)*B + alpha*transpose(B)*A + beta*C
* *
* Reference: http://www.netlib.org/blas/dsyr2k.f * Reference: http://www.netlib.org/blas/dsyr2k.f
* *
* Error status for this function can be retrieved via cublasGetError(). * Error status for this function can be retrieved via cublasGetError().
skipping to change at line 4256 skipping to change at line 7918
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support * CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/ */
void CUBLASAPI cublasZgemm (char transa, char transb, int m, int n, void CUBLASAPI cublasZgemm (char transa, char transb, int m, int n,
int k, cuDoubleComplex alpha, int k, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda, const cuDoubleComplex *A, int lda,
const cuDoubleComplex *B, int ldb, const cuDoubleComplex *B, int ldb,
cuDoubleComplex beta, cuDoubleComplex *C, cuDoubleComplex beta, cuDoubleComplex *C,
int ldc); int ldc);
/*
* void
* cublasZtrmm (char side, char uplo, char transa, char diag, int m, int n,
* cuDoubleComplex alpha, const cuDoubleComplex *A, int lda, c
onst cuDoubleComplex *B,
* int ldb)
*
* performs one of the matrix-matrix operations
*
* B = alpha * op(A) * B, or B = alpha * B * op(A)
*
* where alpha is a double-precision complex scalar, B is an m x n matrix c
omposed
* of double precision complex elements, and A is a unit or non-unit, upper
or lower,
* triangular matrix composed of double precision complex elements. op(A) i
s one of
*
* op(A) = A , op(A) = transpose(A) or op(A) = conjugate(transpose(A))
*
* Matrices A and B are stored in column major format, and lda and ldb are
* the leading dimensions of the two-dimensional arrays that contain A and
* B, respectively.
*
* Input
* -----
* side specifies whether op(A) multiplies B from the left or right.
* If side = 'L' or 'l', then B = alpha * op(A) * B. If side =
* 'R' or 'r', then B = alpha * B * op(A).
* uplo specifies whether the matrix A is an upper or lower triangular
* matrix. If uplo = 'U' or 'u', A is an upper triangular matrix.
* If uplo = 'L' or 'l', A is a lower triangular matrix.
* transa specifies the form of op(A) to be used in the matrix
* multiplication. If transa = 'N' or 'n', then op(A) = A. If
* transa = 'T' or 't', then op(A) = transpose(A).
* If transa = 'C' or 'c', then op(A) = conjugate(transpose(A)).
* diag specifies whether or not A is unit triangular. If diag = 'U'
* or 'u', A is assumed to be unit triangular. If diag = 'N' or
* 'n', A is not assumed to be unit triangular.
* m the number of rows of matrix B. m must be at least zero.
* n the number of columns of matrix B. n must be at least zero.
* alpha double precision complex scalar multiplier applied to op(A)*B, or
* B*op(A), respectively. If alpha is zero no accesses are made
* to matrix A, and no read accesses are made to matrix B.
* A double precision complex array of dimensions (lda, k). k = m if s
ide =
* 'L' or 'l', k = n if side = 'R' or 'r'. If uplo = 'U' or 'u'
* the leading k x k upper triangular part of the array A must
* contain the upper triangular matrix, and the strictly lower
* triangular part of A is not referenced. If uplo = 'L' or 'l'
* the leading k x k lower triangular part of the array A must
* contain the lower triangular matrix, and the strictly upper
* triangular part of A is not referenced. When diag = 'U' or 'u'
* the diagonal elements of A are not referenced and are assumed
* to be unity.
* lda leading dimension of A. When side = 'L' or 'l', it must be at
* least max(1,m) and at least max(1,n) otherwise
* B double precision complex array of dimensions (ldb, n). On entry,
the
* leading m x n part of the array contains the matrix B. It is
* overwritten with the transformed matrix on exit.
* ldb leading dimension of B. It must be at least max (1, m).
*
* Output
* ------
* B updated according to B = alpha * op(A) * B or B = alpha * B * op
(A)
*
* Reference: http://www.netlib.org/blas/ztrmm.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if m or n < 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasZtrmm (char side, char uplo, char transa,
char diag, int m, int n, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda, cuDoubleComp
lex *B,
int ldb);
/*
* cublasZgeru (int m, int n, cuDoubleComplex alpha, const cuDoubleComplex
*x, int incx,
* const cuDoubleComplex *y, int incy, cuDoubleComplex *A, int
lda)
*
* performs the rank 1 operation
*
* A = alpha * x * transpose(y) + A,
*
* where alpha is a double precision complex scalar, x is an m element doub
le
* precision complex vector, y is an n element double precision complex vec
tor, and A
* is an m by n matrix consisting of double precision complex elements. Mat
rix A
* is stored in column major format, and lda is the leading dimension of
* the two-dimensional array used to store A.
*
* Input
* -----
* m specifies the number of rows of the matrix A. It must be at least
* zero.
* n specifies the number of columns of the matrix A. It must be at
* least zero.
* alpha double precision complex scalar multiplier applied to x * transpo
se(y)
* x double precision complex array of length at least (1 + (m - 1) *
abs(incx))
* incx specifies the storage spacing between elements of x. incx must no
t
* be zero.
* y double precision complex array of length at least (1 + (n - 1) *
abs(incy))
* incy specifies the storage spacing between elements of y. incy must no
t
* be zero.
* A double precision complex array of dimensions (lda, n).
* lda leading dimension of two-dimensional array used to store matrix A
*
* Output
* ------
* A updated according to A = alpha * x * transpose(y) + A
*
* Reference: http://www.netlib.org/blas/zgeru.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if m < 0, n < 0, incx == 0, incy == 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasZgeru (int m, int n, cuDoubleComplex alpha,
const cuDoubleComplex *x, int incx, const cuDou
bleComplex *y,
int incy, cuDoubleComplex *A, int lda);
/*
* cublasZgerc (int m, int n, cuDoubleComplex alpha, const cuDoubleComplex
*x, int incx,
* const cuDoubleComplex *y, int incy, cuDoubleComplex *A, int
lda)
*
* performs the rank 1 operation
*
* A = alpha * x * conjugate(transpose(y)) + A,
*
* where alpha is a double precision complex scalar, x is an m element doub
le
* precision complex vector, y is an n element double precision complex vec
tor, and A
* is an m by n matrix consisting of double precision complex elements. Mat
rix A
* is stored in column major format, and lda is the leading dimension of
* the two-dimensional array used to store A.
*
* Input
* -----
* m specifies the number of rows of the matrix A. It must be at least
* zero.
* n specifies the number of columns of the matrix A. It must be at
* least zero.
* alpha double precision complex scalar multiplier applied to x * conjuga
te(transpose(y))
* x double precision array of length at least (1 + (m - 1) * abs(incx
))
* incx specifies the storage spacing between elements of x. incx must no
t
* be zero.
* y double precision complex array of length at least (1 + (n - 1) *
abs(incy))
* incy specifies the storage spacing between elements of y. incy must no
t
* be zero.
* A double precision complex array of dimensions (lda, n).
* lda leading dimension of two-dimensional array used to store matrix A
*
* Output
* ------
* A updated according to A = alpha * x * conjugate(transpose(y)) + A
*
* Reference: http://www.netlib.org/blas/zgerc.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if m < 0, n < 0, incx == 0, incy == 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasZgerc (int m, int n, cuDoubleComplex alpha,
const cuDoubleComplex *x, int incx, const cuDou
bleComplex *y,
int incy, cuDoubleComplex *A, int lda);
/*
* void
* cublasZherk (char uplo, char trans, int n, int k, double alpha,
* const cuDoubleComplex *A, int lda, double beta, cuDoubleCom
plex *C, int ldc)
*
* performs one of the hermitian rank k operations
*
* C = alpha * A * conjugate(transpose(A)) + beta * C, or
* C = alpha * conjugate(transpose(A)) * A + beta * C.
*
* Alpha and beta are double precision scalars. C is an n x n hermitian mat
rix
* consisting of double precision complex elements and stored in either low
er or
* upper storage mode. A is a matrix consisting of double precision complex
elements
* with dimension of n x k in the first case, and k x n in the second case.
*
* Input
* -----
* uplo specifies whether the hermitian matrix C is stored in upper or lo
wer
* storage mode as follows. If uplo == 'U' or 'u', only the upper
* triangular part of the hermitian matrix is to be referenced, and
the
* elements of the strictly lower triangular part are to be inferred from
* those in the upper triangular part. If uplo == 'L' or 'l', only t
he
* lower triangular part of the hermitian matrix is to be referenced
,
* and the elements of the strictly upper triangular part are to be
* inferred from those in the lower triangular part.
* trans specifies the operation to be performed. If trans == 'N' or 'n',
C =
* alpha * A * conjugate(transpose(A)) + beta * C. If trans == 'T',
't', 'C', or 'c',
* C = alpha * conjugate(transpose(A)) * A + beta * C.
* n specifies the number of rows and the number of columns of matrix C. If
* trans == 'N' or 'n', n specifies the number of rows of matrix A.
If
* trans == 'T', 't', 'C', or 'c', n specifies the columns of matrix
A.
* n must be at least zero.
* k If trans == 'N' or 'n', k specifies the number of columns of matr
ix A.
* If trans == 'T', 't', 'C', or 'c', k specifies the number of rows
of
* matrix A. k must be at least zero.
* alpha double precision scalar multiplier applied to A * conjugate(trans
pose(A)) or
* conjugate(transpose(A)) * A.
* A double precision complex array of dimensions (lda, ka), where ka
is k when
* trans == 'N' or 'n', and is n otherwise. When trans == 'N' or 'n'
,
* the leading n x k part of array A must contain the matrix A,
* otherwise the leading k x n part of the array must contain the
* matrix A.
* lda leading dimension of A. When trans == 'N' or 'n' then lda must be
at
* least max(1, n). Otherwise lda must be at least max(1, k).
* beta double precision scalar multiplier applied to C. If beta is zero,
C
* does not have to be a valid input
* C double precision complex array of dimensions (ldc, n). If uplo =
'U' or 'u',
* the leading n x n triangular part of the array C must contain the
* upper triangular part of the hermitian matrix C and the strictly
* lower triangular part of C is not referenced. On exit, the upper
* triangular part of C is overwritten by the upper triangular part
of
* the updated matrix. If uplo = 'L' or 'l', the leading n x n
* triangular part of the array C must contain the lower triangular
part
* of the hermitian matrix C and the strictly upper triangular part
of C
* is not referenced. On exit, the lower triangular part of C is
* overwritten by the lower triangular part of the updated matrix.
* The imaginary parts of the diagonal elements need
* not be set, they are assumed to be zero, and on exit they
* are set to zero.
* ldc leading dimension of C. It must be at least max(1, n).
*
* Output
* ------
* C updated according to C = alpha * A * conjugate(transpose(A)) + be
ta * C, or C =
* alpha * conjugate(transpose(A)) * A + beta * C
*
* Reference: http://www.netlib.org/blas/zherk.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0 or k < 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasZherk (char uplo, char trans, int n, int k,
double alpha,
const cuDoubleComplex *A, int lda,
double beta,
cuDoubleComplex *C, int ldc);
/*
* void
* cublasZhemm (char side, char uplo, int m, int n, cuDoubleComplex alpha,
* const cuDoubleComplex *A, int lda, const cuDoubleComplex *B
, int ldb,
* cuDoubleComplex beta, cuDoubleComplex *C, int ldc);
*
* performs one of the matrix-matrix operations
*
* C = alpha * A * B + beta * C, or
* C = alpha * B * A + beta * C,
*
* where alpha and beta are double precision complex scalars, A is a hermit
ian matrix
* consisting of double precision complex elements and stored in either low
er or upper
* storage mode, and B and C are m x n matrices consisting of double precis
ion
* complex elements.
*
* Input
* -----
* side specifies whether the hermitian matrix A appears on the left
* hand side or right hand side of matrix B, as follows. If side ==
'L'
* or 'l', then C = alpha * A * B + beta * C. If side = 'R' or 'r',
* then C = alpha * B * A + beta * C.
* uplo specifies whether the hermitian matrix A is stored in upper or lo
wer
* storage mode, as follows. If uplo == 'U' or 'u', only the upper
* triangular part of the hermitian matrix is to be referenced, and
the
* elements of the strictly lower triangular part are to be inferred from
* those in the upper triangular part. If uplo == 'L' or 'l', only t
he
* lower triangular part of the hermitian matrix is to be referenced
,
* and the elements of the strictly upper triangular part are to be
* inferred from those in the lower triangular part.
* m specifies the number of rows of the matrix C, and the number of r
ows
* of matrix B. It also specifies the dimensions of hermitian matrix
A
* when side == 'L' or 'l'. m must be at least zero.
* n specifies the number of columns of the matrix C, and the number o
f
* columns of matrix B. It also specifies the dimensions of hermitia
n
* matrix A when side == 'R' or 'r'. n must be at least zero.
* alpha double precision scalar multiplier applied to A * B, or B * A
* A double precision complex array of dimensions (lda, ka), where ka
is m when
* side == 'L' or 'l' and is n otherwise. If side == 'L' or 'l' the
* leading m x m part of array A must contain the hermitian matrix,
* such that when uplo == 'U' or 'u', the leading m x m part stores
the
* upper triangular part of the hermitian matrix, and the strictly l
ower
* triangular part of A is not referenced, and when uplo == 'L' or 'l',
* the leading m x m part stores the lower triangular part of the
* hermitian matrix and the strictly upper triangular part is not
* referenced. If side == 'R' or 'r' the leading n x n part of array
A
* must contain the hermitian matrix, such that when uplo == 'U' or
'u',
* the leading n x n part stores the upper triangular part of the
* hermitian matrix and the strictly lower triangular part of A is n
ot
* referenced, and when uplo == 'L' or 'l', the leading n x n part
* stores the lower triangular part of the hermitian matrix and the
* strictly upper triangular part is not referenced. The imaginary p
arts
* of the diagonal elements need not be set, they are assumed to be
zero.
*
* lda leading dimension of A. When side == 'L' or 'l', it must be at le
ast
* max(1, m) and at least max(1, n) otherwise.
* B double precision complex array of dimensions (ldb, n). On entry,
the leading
* m x n part of the array contains the matrix B.
* ldb leading dimension of B. It must be at least max (1, m).
* beta double precision complex scalar multiplier applied to C. If beta
is zero, C
* does not have to be a valid input
* C double precision complex array of dimensions (ldc, n)
* ldc leading dimension of C. Must be at least max(1, m)
*
* Output
* ------
* C updated according to C = alpha * A * B + beta * C, or C = alpha *
* B * A + beta * C
*
* Reference: http://www.netlib.org/blas/zhemm.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if m or n are < 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasZhemm (char side, char uplo, int m, int n,
cuDoubleComplex alpha, const cuDoubleComplex *A
, int lda,
const cuDoubleComplex *B, int ldb, cuDoubleComp
lex beta,
cuDoubleComplex *C, int ldc);
/*
* void
* cublasZtrsv (char uplo, char trans, char diag, int n, const cuDoubleComp
lex *A,
* int lda, cuDoubleComplex *x, int incx)
*
* solves a system of equations op(A) * x = b, where op(A) is either A,
* transpose(A) or conjugate(transpose(A)). b and x are double precision
* complex vectors consisting of n elements, and A is an n x n matrix
* composed of a unit or non-unit, upper or lower triangular matrix.
* Matrix A is stored in column major format, and lda is the leading
* dimension of the two-dimensional array containing A.
*
* No test for singularity or near-singularity is included in this function
.
* Such tests must be performed before calling this function.
*
* Input
* -----
* uplo specifies whether the matrix data is stored in the upper or the
* lower triangular part of array A. If uplo = 'U' or 'u', then only
* the upper triangular part of A may be referenced. If uplo = 'L' o
r
* 'l', then only the lower triangular part of A may be referenced.
* trans specifies op(A). If transa = 'n' or 'N', op(A) = A. If transa = 't'
* or 'T', op(A) = transpose(A). If transa = 'c' or 'C',
* op(A) = conjugate(transpose(A))
* diag specifies whether or not A is a unit triangular matrix like so:
* if diag = 'U' or 'u', A is assumed to be unit triangular. If
* diag = 'N' or 'n', then A is not assumed to be unit triangular.
* n specifies the number of rows and columns of the matrix A. It
* must be at least 0.
* A is a double precision complex array of dimensions (lda, n). If up
lo = 'U'
* or 'u', then A must contain the upper triangular part of a symmetric
* matrix, and the strictly lower triangular part is not referenced.
* If uplo = 'L' or 'l', then A contains the lower triangular part o
f
* a symmetric matrix, and the strictly upper triangular part is not
* referenced.
* lda is the leading dimension of the two-dimensional array containing
A.
* lda must be at least max(1, n).
* x double precision complex array of length at least (1 + (n - 1) *
abs(incx)).
* On entry, x contains the n element right-hand side vector b. On e
xit,
* it is overwritten with the solution vector x.
* incx specifies the storage spacing between elements of x. incx must no
t
* be zero.
*
* Output
* ------
* x updated to contain the solution vector x that solves op(A) * x =
b.
*
* Reference: http://www.netlib.org/blas/ztrsv.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if incx == 0 or if n < 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasZtrsv (char uplo, char trans, char diag, int n,
const cuDoubleComplex *A, int lda, cuDoubleComp
lex *x,
int incx);
/*
* void
* cublasZhbmv (char uplo, int n, int k, cuDoubleComplex alpha, const cuDou
bleComplex *A, int lda,
* const cuDoubleComplex *x, int incx, cuDoubleComplex beta, c
uDoubleComplex *y, int incy)
*
* performs the matrix-vector operation
*
* y := alpha*A*x + beta*y
*
* alpha and beta are double precision complex scalars. x and y are double
precision
* complex vectors with n elements. A is an n by n hermitian band matrix co
nsisting
* of double precision complex elements, with k super-diagonals and the sam
e number
* of subdiagonals.
*
* Input
* -----
* uplo specifies whether the upper or lower triangular part of the hermi
tian
* band matrix A is being supplied. If uplo == 'U' or 'u', the upper
* triangular part is being supplied. If uplo == 'L' or 'l', the low
er
* triangular part is being supplied.
* n specifies the number of rows and the number of columns of the
* hermitian matrix A. n must be at least zero.
* k specifies the number of super-diagonals of matrix A. Since the ma
trix
* is hermitian, this is also the number of sub-diagonals. k must be
at
* least zero.
* alpha double precision complex scalar multiplier applied to A*x.
* A double precision complex array of dimensions (lda, n). When uplo
== 'U' or
* 'u', the leading (k + 1) x n part of array A must contain the upp
er
* triangular band of the hermitian matrix, supplied column by colum
n,
* with the leading diagonal of the matrix in row (k+1) of the array
,
* the first super-diagonal starting at position 2 in row k, and so
on.
* The top left k x k triangle of the array A is not referenced. Whe
n
* uplo == 'L' or 'l', the leading (k + 1) x n part of the array A m
ust
* contain the lower triangular band part of the hermitian matrix,
* supplied column by column, with the leading diagonal of the matri
x in
* row 1 of the array, the first sub-diagonal starting at position 1
in
* row 2, and so on. The bottom right k x k triangle of the array A
is
* not referenced. The imaginary parts of the diagonal elements need
* not be set, they are assumed to be zero.
* lda leading dimension of A. lda must be at least (k + 1).
* x double precision complex array of length at least (1 + (n - 1) *
abs(incx)).
* incx storage spacing between elements of x. incx must not be zero.
* beta double precision complex scalar multiplier applied to vector y. I
f beta is
* zero, y is not read.
* y double precision complex array of length at least (1 + (n - 1) *
abs(incy)).
* If beta is zero, y is not read.
* incy storage spacing between elements of y. incy must not be zero.
*
* Output
* ------
* y updated according to alpha*A*x + beta*y
*
* Reference: http://www.netlib.org/blas/zhbmv.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if k or n < 0, or if incx or incy == 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasZhbmv (char uplo, int n, int k, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda, const cuDoub
leComplex *x,
int incx, cuDoubleComplex beta, cuDoubleComplex
*y, int incy);
#if defined(__cplusplus) #if defined(__cplusplus)
} }
#endif /* __cplusplus */ #endif /* __cplusplus */
#endif /* !defined(CUBLAS_H_) */ #endif /* !defined(CUBLAS_H_) */
 End of changes. 74 change blocks. 
44 lines changed or deleted 5308 lines changed or added


 cuda.h   cuda.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 57 skipping to change at line 57
/** /**
* \defgroup CUDA_TYPES Data types used by CUDA driver * \defgroup CUDA_TYPES Data types used by CUDA driver
* \ingroup CUDA_DRIVER * \ingroup CUDA_DRIVER
* @{ * @{
*/ */
/** /**
* CUDA API version number * CUDA API version number
*/ */
#define CUDA_VERSION 2030 /* 2.3 */ #define CUDA_VERSION 3000 /* 3.0 */
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
typedef unsigned int CUdeviceptr; ///< CUDA device pointer typedef unsigned int CUdeviceptr; ///< CUDA device pointer
typedef int CUdevice; ///< CUDA device typedef int CUdevice; ///< CUDA device
typedef struct CUctx_st *CUcontext; ///< CUDA context typedef struct CUctx_st *CUcontext; ///< CUDA context
typedef struct CUmod_st *CUmodule; ///< CUDA module typedef struct CUmod_st *CUmodule; ///< CUDA module
typedef struct CUfunc_st *CUfunction; ///< CUDA function typedef struct CUfunc_st *CUfunction; ///< CUDA function
typedef struct CUarray_st *CUarray; ///< CUDA array typedef struct CUarray_st *CUarray; ///< CUDA array
typedef struct CUtexref_st *CUtexref; ///< CUDA texture reference typedef struct CUtexref_st *CUtexref; ///< CUDA texture reference
typedef struct CUevent_st *CUevent; ///< CUDA event typedef struct CUevent_st *CUevent; ///< CUDA event
typedef struct CUstream_st *CUstream; ///< CUDA stream typedef struct CUstream_st *CUstream; ///< CUDA stream
typedef struct CUgraphicsResource_st *CUgraphicsResource; ///< CUDA gra
phics interop resource
typedef struct CUuuid_st { ///< CUDA definition of UUID
char bytes[16];
} CUuuid;
/************************************ /************************************
** **
** Enums ** Enums
** **
***********************************/ ***********************************/
/** /**
* Context creation flags * Context creation flags
*/ */
typedef enum CUctx_flags_enum { typedef enum CUctx_flags_enum {
CU_CTX_SCHED_AUTO = 0, ///< Automatic scheduling CU_CTX_SCHED_AUTO = 0, ///< Automatic scheduling
CU_CTX_SCHED_SPIN = 1, ///< Set spin as default scheduling CU_CTX_SCHED_SPIN = 1, ///< Set spin as default scheduling
CU_CTX_SCHED_YIELD = 2, ///< Set yield as default scheduling CU_CTX_SCHED_YIELD = 2, ///< Set yield as default scheduling
CU_CTX_SCHED_MASK = 0x3, CU_CTX_SCHED_MASK = 0x3,
CU_CTX_BLOCKING_SYNC = 4, ///< Use blocking synchronization CU_CTX_BLOCKING_SYNC = 4, ///< Use blocking synchronization
CU_CTX_MAP_HOST = 8, ///< Support mapped pinned allocations CU_CTX_MAP_HOST = 8, ///< Support mapped pinned allocations
CU_CTX_LMEM_RESIZE_TO_MAX = 16, ///< Keep local memory allocation after launch CU_CTX_LMEM_RESIZE_TO_MAX = 16, ///< Keep local memory allocation after launch
CU_CTX_FLAGS_MASK = 0x1f, CU_CTX_FLAGS_MASK = 0x1f
} CUctx_flags; } CUctx_flags;
/** /**
* Event creation flags * Event creation flags
*/ */
typedef enum CUevent_flags_enum { typedef enum CUevent_flags_enum {
CU_EVENT_DEFAULT = 0, ///< Default event flag CU_EVENT_DEFAULT = 0, ///< Default event flag
CU_EVENT_BLOCKING_SYNC = 1, ///< Event uses blocking synchronization CU_EVENT_BLOCKING_SYNC = 1 ///< Event uses blocking synchronization
} CUevent_flags; } CUevent_flags;
/** /**
* Array formats * Array formats
*/ */
typedef enum CUarray_format_enum { typedef enum CUarray_format_enum {
CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, ///< Unsigned 8-bit integers CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, ///< Unsigned 8-bit integers
CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, ///< Unsigned 16-bit integers CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, ///< Unsigned 16-bit integers
CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, ///< Unsigned 32-bit integers CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, ///< Unsigned 32-bit integers
CU_AD_FORMAT_SIGNED_INT8 = 0x08, ///< Signed 8-bit integers CU_AD_FORMAT_SIGNED_INT8 = 0x08, ///< Signed 8-bit integers
skipping to change at line 121 skipping to change at line 126
CU_AD_FORMAT_HALF = 0x10, ///< 16-bit floating point CU_AD_FORMAT_HALF = 0x10, ///< 16-bit floating point
CU_AD_FORMAT_FLOAT = 0x20 ///< 32-bit floating point CU_AD_FORMAT_FLOAT = 0x20 ///< 32-bit floating point
} CUarray_format; } CUarray_format;
/** /**
* Texture reference addressing modes * Texture reference addressing modes
*/ */
typedef enum CUaddress_mode_enum { typedef enum CUaddress_mode_enum {
CU_TR_ADDRESS_MODE_WRAP = 0, ///< Wrapping address mode CU_TR_ADDRESS_MODE_WRAP = 0, ///< Wrapping address mode
CU_TR_ADDRESS_MODE_CLAMP = 1, ///< Clamp to edge address mode CU_TR_ADDRESS_MODE_CLAMP = 1, ///< Clamp to edge address mode
CU_TR_ADDRESS_MODE_MIRROR = 2, ///< Mirror address mode CU_TR_ADDRESS_MODE_MIRROR = 2 ///< Mirror address mode
} CUaddress_mode; } CUaddress_mode;
/** /**
* Texture reference filtering modes * Texture reference filtering modes
*/ */
typedef enum CUfilter_mode_enum { typedef enum CUfilter_mode_enum {
CU_TR_FILTER_MODE_POINT = 0, ///< Point filter mode CU_TR_FILTER_MODE_POINT = 0, ///< Point filter mode
CU_TR_FILTER_MODE_LINEAR = 1 ///< Linear filter mode CU_TR_FILTER_MODE_LINEAR = 1 ///< Linear filter mode
} CUfilter_mode; } CUfilter_mode;
skipping to change at line 158 skipping to change at line 163
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, ///< Maximum number of 32-bit registers available per block CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, ///< Maximum number of 32-bit registers available per block
CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, ///< Deprecated, use CU _DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, ///< Deprecated, use CU _DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, ///< Peak clock frequen cy in kilohertz CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, ///< Peak clock frequen cy in kilohertz
CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, ///< Alignment requirem ent for textures CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, ///< Alignment requirem ent for textures
CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, ///< Device can possibl y copy memory and execute a kernel concurrently CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, ///< Device can possibl y copy memory and execute a kernel concurrently
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, ///< Number of multipro cessors on device CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, ///< Number of multipro cessors on device
CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, ///< Specifies whether there is a run time limit on kernels CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, ///< Specifies whether there is a run time limit on kernels
CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, ///< Device is integrat ed with host memory CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, ///< Device is integrat ed with host memory
CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, ///< Device can map hos t memory into CUDA address space CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, ///< Device can map hos t memory into CUDA address space
CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20 ///< Compute mode (See CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, ///< Compute mode (See
::CUcomputemode for details) ::CUcomputemode for details)
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, ///< Maximum 1D textu
re width
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, ///< Maximum 2D textu
re width
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23,///< Maximum 2D textu
re height
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, ///< Maximum 3D textu
re width
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25,///< Maximum 3D textu
re height
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, ///< Maximum 3D textu
re depth
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, ///< Maximum te
xture array width
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28,///< Maximum te
xture array height
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, ///< Maximu
m slices in a texture array
CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, ///< Alignment requirement
for surfaces
CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, ///< Device can possibly e
xecute multiple kernels concurrently
CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32 ///< Device has ECC support enable
d
} CUdevice_attribute; } CUdevice_attribute;
/** /**
* Legacy device properties * Legacy device properties
*/ */
typedef struct CUdevprop_st { typedef struct CUdevprop_st {
int maxThreadsPerBlock; ///< Maximum number of threads per block int maxThreadsPerBlock; ///< Maximum number of threads per block
int maxThreadsDim[3]; ///< Maximum size of each dimension of a bl ock int maxThreadsDim[3]; ///< Maximum size of each dimension of a bl ock
int maxGridSize[3]; ///< Maximum size of each dimension of a gr id int maxGridSize[3]; ///< Maximum size of each dimension of a gr id
int sharedMemPerBlock; ///< Shared memory available per block in b ytes int sharedMemPerBlock; ///< Shared memory available per block in b ytes
skipping to change at line 211 skipping to change at line 228
/** /**
* The size in bytes of thread local memory used by this function. * The size in bytes of thread local memory used by this function.
*/ */
CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3,
/** /**
* The number of registers used by each thread of this function. * The number of registers used by each thread of this function.
*/ */
CU_FUNC_ATTRIBUTE_NUM_REGS = 4, CU_FUNC_ATTRIBUTE_NUM_REGS = 4,
/**
* The PTX virtual architecture version for which the function was comp
iled.
*/
CU_FUNC_ATTRIBUTE_PTX_VERSION = 5,
/**
* The binary version for which the function was compiled.
*/
CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6,
CU_FUNC_ATTRIBUTE_MAX CU_FUNC_ATTRIBUTE_MAX
} CUfunction_attribute; } CUfunction_attribute;
/** /**
* Function cache configurations
*/
typedef enum CUfunc_cache_enum {
CU_FUNC_CACHE_PREFER_NONE = 0x00,
CU_FUNC_CACHE_PREFER_SHARED = 0x01,
CU_FUNC_CACHE_PREFER_L1 = 0x02
} CUfunc_cache;
/**
* Memory types * Memory types
*/ */
typedef enum CUmemorytype_enum { typedef enum CUmemorytype_enum {
CU_MEMORYTYPE_HOST = 0x01, ///< Host memory CU_MEMORYTYPE_HOST = 0x01, ///< Host memory
CU_MEMORYTYPE_DEVICE = 0x02, ///< Device memory CU_MEMORYTYPE_DEVICE = 0x02, ///< Device memory
CU_MEMORYTYPE_ARRAY = 0x03 ///< Array memory CU_MEMORYTYPE_ARRAY = 0x03 ///< Array memory
} CUmemorytype; } CUmemorytype;
/** /**
* Compute Modes * Compute Modes
skipping to change at line 238 skipping to change at line 274
CU_COMPUTEMODE_EXCLUSIVE = 1, ///< Compute-exclusive mode (Only on e context can be present on this device at a time) CU_COMPUTEMODE_EXCLUSIVE = 1, ///< Compute-exclusive mode (Only on e context can be present on this device at a time)
CU_COMPUTEMODE_PROHIBITED = 2 ///< Compute-prohibited mode (No con texts can be created on this device at this time) CU_COMPUTEMODE_PROHIBITED = 2 ///< Compute-prohibited mode (No con texts can be created on this device at this time)
} CUcomputemode; } CUcomputemode;
/** /**
* Online compiler options * Online compiler options
*/ */
typedef enum CUjit_option_enum typedef enum CUjit_option_enum
{ {
/** /**
* Max number of registers that a thread may use. * Max number of registers that a thread may use.\n
* Option type: unsigned int
*/ */
CU_JIT_MAX_REGISTERS = 0, CU_JIT_MAX_REGISTERS = 0,
/** /**
* IN: Specifies minimum number of threads per block to target compilat ion * IN: Specifies minimum number of threads per block to target compilat ion
* for\n * for\n
* OUT: Returns the number of threads the compiler actually targeted. * OUT: Returns the number of threads the compiler actually targeted.
* This restricts the resource utilization fo the compiler (e.g. max * This restricts the resource utilization fo the compiler (e.g. max
* registers) such that a block with the given number of threads should be * registers) such that a block with the given number of threads should be
* able to launch based on register limitations. Note, this option does not * able to launch based on register limitations. Note, this option does not
* currently take into account any other resource limitations, such as * currently take into account any other resource limitations, such as
* shared memory utilization. * shared memory utilization.\n
* Option type: unsigned int
*/ */
CU_JIT_THREADS_PER_BLOCK, CU_JIT_THREADS_PER_BLOCK,
/** /**
* Returns a float value in the option of the wall clock time, in * Returns a float value in the option of the wall clock time, in
* milliseconds, spent creating the cubin * milliseconds, spent creating the cubin\n
* Option type: float
*/ */
CU_JIT_WALL_TIME, CU_JIT_WALL_TIME,
/** /**
* Pointer to a buffer in which to print any log messsages from PTXAS * Pointer to a buffer in which to print any log messsages from PTXAS
* that are informational in nature * that are informational in nature (the buffer size is specified via
* option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES) \n
* Option type: char*
*/ */
CU_JIT_INFO_LOG_BUFFER, CU_JIT_INFO_LOG_BUFFER,
/** /**
* IN: Log buffer size in bytes. Log messages will be capped at this s ize * IN: Log buffer size in bytes. Log messages will be capped at this s ize
* (including null terminator)\n * (including null terminator)\n
* OUT: Amount of log buffer filled with messages * OUT: Amount of log buffer filled with messages\n
* Option type: unsigned int
*/ */
CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
/** /**
* Pointer to a buffer in which to print any log messages from PTXAS th at * Pointer to a buffer in which to print any log messages from PTXAS th at
* reflect errors * reflect errors (the buffer size is specified via option
* ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n
* Option type: char*
*/ */
CU_JIT_ERROR_LOG_BUFFER, CU_JIT_ERROR_LOG_BUFFER,
/** /**
* IN: Log buffer size in bytes. Log messages will be capped at this s ize * IN: Log buffer size in bytes. Log messages will be capped at this s ize
* (including null terminator)\n * (including null terminator)\n
* OUT: Amount of log buffer filled with messages * OUT: Amount of log buffer filled with messages\n
* Option type: unsigned int
*/ */
CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
/** /**
* Level of optimizations to apply to generated code (0 - 4), with 4 * Level of optimizations to apply to generated code (0 - 4), with 4
* being the default and highest level of optimizations. * being the default and highest level of optimizations.\n
* Option type: unsigned int
*/ */
CU_JIT_OPTIMIZATION_LEVEL, CU_JIT_OPTIMIZATION_LEVEL,
/** /**
* No option value required. Determines the target based on the current * No option value required. Determines the target based on the current
* attached context (default) * attached context (default)\n
* Option type: No option value needed
*/ */
CU_JIT_TARGET_FROM_CUCONTEXT, CU_JIT_TARGET_FROM_CUCONTEXT,
/** /**
* Target is chosen based on supplied CUjit_target_enum. * Target is chosen based on supplied ::CUjit_target_enum.\n
* Option type: unsigned int for enumerated type ::CUjit_target_enum
*/ */
CU_JIT_TARGET, CU_JIT_TARGET,
/** /**
* Specifies choice of fallback strategy if matching cubin is not found . * Specifies choice of fallback strategy if matching cubin is not found .
* Choice is based on supplied CUjit_fallback_enum. * Choice is based on supplied ::CUjit_fallback_enum.\n
* Option type: unsigned int for enumerated type ::CUjit_fallback_enum
*/ */
CU_JIT_FALLBACK_STRATEGY CU_JIT_FALLBACK_STRATEGY
} CUjit_option; } CUjit_option;
/** /**
* Online compilation targets * Online compilation targets
*/ */
typedef enum CUjit_target_enum typedef enum CUjit_target_enum
{ {
CU_TARGET_COMPUTE_10 = 0, ///< Compute device class 1.0 CU_TARGET_COMPUTE_10 = 0, ///< Compute device class 1.0
CU_TARGET_COMPUTE_11, ///< Compute device class 1.1 CU_TARGET_COMPUTE_11, ///< Compute device class 1.1
CU_TARGET_COMPUTE_12, ///< Compute device class 1.2 CU_TARGET_COMPUTE_12, ///< Compute device class 1.2
CU_TARGET_COMPUTE_13 ///< Compute device class 1.3 CU_TARGET_COMPUTE_13, ///< Compute device class 1.3
CU_TARGET_COMPUTE_20 ///< Compute device class 2.0
} CUjit_target; } CUjit_target;
/** /**
* Cubin matching fallback strategies * Cubin matching fallback strategies
*/ */
typedef enum CUjit_fallback_enum typedef enum CUjit_fallback_enum
{ {
/** Prefer to compile ptx */ /** Prefer to compile ptx */
CU_PREFER_PTX = 0, CU_PREFER_PTX = 0,
/** Prefer to fall back to compatible binary code */ /** Prefer to fall back to compatible binary code */
CU_PREFER_BINARY CU_PREFER_BINARY
} CUjit_fallback; } CUjit_fallback;
/**
* Flags to register a graphics resource
*/
typedef enum CUgraphicsRegisterFlags_enum {
CU_GRAPHICS_REGISTER_FLAGS_NONE = 0x00
} CUgraphicsRegisterFlags;
/**
* Flags for mapping and unmapping interop resources
*/
typedef enum CUgraphicsMapResourceFlags_enum {
CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE = 0x00,
CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01,
CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02
} CUgraphicsMapResourceFlags;
/**
* Array indices for cube faces
*/
typedef enum CUarray_cubemap_face_enum {
CU_CUBEMAP_FACE_POSITIVE_X = 0x00, ///< Positive X face of cubemap
CU_CUBEMAP_FACE_NEGATIVE_X = 0x01, ///< Negative X face of cubemap
CU_CUBEMAP_FACE_POSITIVE_Y = 0x02, ///< Positive Y face of cubemap
CU_CUBEMAP_FACE_NEGATIVE_Y = 0x03, ///< Negative Y face of cubemap
CU_CUBEMAP_FACE_POSITIVE_Z = 0x04, ///< Positive Z face of cubemap
CU_CUBEMAP_FACE_NEGATIVE_Z = 0x05 ///< Negative Z face of cubemap
} CUarray_cubemap_face;
/************************************ /************************************
** **
** Error codes ** Error codes
** **
***********************************/ ***********************************/
/** /**
* Error codes * Error codes
*/ */
typedef enum cudaError_enum { typedef enum cudaError_enum {
skipping to change at line 365 skipping to change at line 443
CUDA_ERROR_INVALID_IMAGE = 200, ///< Invalid kernel image CUDA_ERROR_INVALID_IMAGE = 200, ///< Invalid kernel image
CUDA_ERROR_INVALID_CONTEXT = 201, ///< Invalid context CUDA_ERROR_INVALID_CONTEXT = 201, ///< Invalid context
CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202, ///< Context already curren t CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202, ///< Context already curren t
CUDA_ERROR_MAP_FAILED = 205, ///< Map failed CUDA_ERROR_MAP_FAILED = 205, ///< Map failed
CUDA_ERROR_UNMAP_FAILED = 206, ///< Unmap failed CUDA_ERROR_UNMAP_FAILED = 206, ///< Unmap failed
CUDA_ERROR_ARRAY_IS_MAPPED = 207, ///< Array is mapped CUDA_ERROR_ARRAY_IS_MAPPED = 207, ///< Array is mapped
CUDA_ERROR_ALREADY_MAPPED = 208, ///< Already mapped CUDA_ERROR_ALREADY_MAPPED = 208, ///< Already mapped
CUDA_ERROR_NO_BINARY_FOR_GPU = 209, ///< No binary for GPU CUDA_ERROR_NO_BINARY_FOR_GPU = 209, ///< No binary for GPU
CUDA_ERROR_ALREADY_ACQUIRED = 210, ///< Already acquired CUDA_ERROR_ALREADY_ACQUIRED = 210, ///< Already acquired
CUDA_ERROR_NOT_MAPPED = 211, ///< Not mapped CUDA_ERROR_NOT_MAPPED = 211, ///< Not mapped
CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212, ///< Mapped resource not a
vailable for access as an array
CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213, ///< Mapped resource not a
vailable for access as a pointer
CUDA_ERROR_ECC_UNCORRECTABLE = 214, ///< Uncorrectable ECC erro
r detected
CUDA_ERROR_INVALID_SOURCE = 300, ///< Invalid source CUDA_ERROR_INVALID_SOURCE = 300, ///< Invalid source
CUDA_ERROR_FILE_NOT_FOUND = 301, ///< File not found CUDA_ERROR_FILE_NOT_FOUND = 301, ///< File not found
CUDA_ERROR_INVALID_HANDLE = 400, ///< Invalid handle CUDA_ERROR_INVALID_HANDLE = 400, ///< Invalid handle
CUDA_ERROR_NOT_FOUND = 500, ///< Not found CUDA_ERROR_NOT_FOUND = 500, ///< Not found
CUDA_ERROR_NOT_READY = 600, ///< CUDA not ready CUDA_ERROR_NOT_READY = 600, ///< CUDA not ready
CUDA_ERROR_LAUNCH_FAILED = 700, ///< Launch failed CUDA_ERROR_LAUNCH_FAILED = 700, ///< Launch failed
CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701, ///< Launch exceeded resour ces CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701, ///< Launch exceeded resour ces
CUDA_ERROR_LAUNCH_TIMEOUT = 702, ///< Launch exceeded timeou t CUDA_ERROR_LAUNCH_TIMEOUT = 702, ///< Launch exceeded timeou t
CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703, ///< Launch with incomp atible texturing CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703, ///< Launch with incomp atible texturing
CUDA_ERROR_POINTER_IS_64BIT = 800, ///< Attempted to retrieve
64-bit pointer via 32-bit API function
CUDA_ERROR_SIZE_IS_64BIT = 801, ///< Attempted to retrieve
64-bit size via 32-bit API function
CUDA_ERROR_UNKNOWN = 999 ///< Unknown error CUDA_ERROR_UNKNOWN = 999 ///< Unknown error
} CUresult; } CUresult;
/** /**
* If set, host memory is portable between CUDA contexts. * If set, host memory is portable between CUDA contexts.
* Flag for ::cuMemHostAlloc() * Flag for ::cuMemHostAlloc()
*/ */
#define CU_MEMHOSTALLOC_PORTABLE 0x01 #define CU_MEMHOSTALLOC_PORTABLE 0x01
/** /**
skipping to change at line 492 skipping to change at line 576
unsigned int Height; ///< Height of 3D array unsigned int Height; ///< Height of 3D array
unsigned int Depth; ///< Depth of 3D array unsigned int Depth; ///< Depth of 3D array
CUarray_format Format; ///< Array format CUarray_format Format; ///< Array format
unsigned int NumChannels; ///< Channels per array element unsigned int NumChannels; ///< Channels per array element
unsigned int Flags; ///< Flags unsigned int Flags; ///< Flags
} CUDA_ARRAY3D_DESCRIPTOR; } CUDA_ARRAY3D_DESCRIPTOR;
// if set, the CUDA array contains an array of 2D slices
// and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies
// the number of slices, not the depth of a 3D array.
#define CUDA_ARRAY3D_2DARRAY 0x01
/** /**
* Override the texref format with a format inferred from the array. * Override the texref format with a format inferred from the array.
* Flag for ::cuTexRefSetArray() * Flag for ::cuTexRefSetArray()
*/ */
#define CU_TRSA_OVERRIDE_FORMAT 0x01 #define CU_TRSA_OVERRIDE_FORMAT 0x01
/** /**
* Read the texture as integers rather than promoting the values to floats * Read the texture as integers rather than promoting the values to floats
* in the range [0,1]. * in the range [0,1].
* Flag for ::cuTexRefSetFlags() * Flag for ::cuTexRefSetFlags()
skipping to change at line 663 skipping to change at line 752
** **
***********************************/ ***********************************/
// 1D functions // 1D functions
// system <-> device memory // system <-> device memory
CUresult CUDAAPI cuMemcpyHtoDAsync (CUdeviceptr dstDevice, CUresult CUDAAPI cuMemcpyHtoDAsync (CUdeviceptr dstDevice,
const void *srcHost, unsigned int ByteCount, CUstream hStream ) ; const void *srcHost, unsigned int ByteCount, CUstream hStream ) ;
CUresult CUDAAPI cuMemcpyDtoHAsync (void *dstHost, CUresult CUDAAPI cuMemcpyDtoHAsync (void *dstHost,
CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream ); CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream );
// device <-> device memory
CUresult CUDAAPI cuMemcpyDtoDAsync (CUdeviceptr dstDevice,
CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream
);
// system <-> array memory // system <-> array memory
CUresult CUDAAPI cuMemcpyHtoAAsync( CUarray dstArray, unsigned int dstIndex, CUresult CUDAAPI cuMemcpyHtoAAsync( CUarray dstArray, unsigned int dstIndex,
const void *pSrc, unsigned int ByteCount, CUstream hStream ); const void *pSrc, unsigned int ByteCount, CUstream hStream );
CUresult CUDAAPI cuMemcpyAtoHAsync( void *dstHost, CUarray srcArra y, unsigned int srcIndex, CUresult CUDAAPI cuMemcpyAtoHAsync( void *dstHost, CUarray srcArra y, unsigned int srcIndex,
unsigned int ByteCount, CUstream hStream ); unsigned int ByteCount, CUstream hStream );
// 2D memcpy // 2D memcpy
CUresult CUDAAPI cuMemcpy2DAsync( const CUDA_MEMCPY2D *pCopy, CUst ream hStream ); CUresult CUDAAPI cuMemcpy2DAsync( const CUDA_MEMCPY2D *pCopy, CUst ream hStream );
// 3D memcpy // 3D memcpy
skipping to change at line 697 skipping to change at line 790
/************************************ /************************************
** **
** Function management ** Function management
** **
***********************************/ ***********************************/
CUresult CUDAAPI cuFuncSetBlockShape (CUfunction hfunc, int x, int y, i nt z); CUresult CUDAAPI cuFuncSetBlockShape (CUfunction hfunc, int x, int y, i nt z);
CUresult CUDAAPI cuFuncSetSharedSize (CUfunction hfunc, unsigned int by tes); CUresult CUDAAPI cuFuncSetSharedSize (CUfunction hfunc, unsigned int by tes);
CUresult CUDAAPI cuFuncGetAttribute (int *pi, CUfunction_attribute attr ib, CUfunction hfunc); CUresult CUDAAPI cuFuncGetAttribute (int *pi, CUfunction_attribute attr ib, CUfunction hfunc);
CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache co nfig);
/************************************ /************************************
** **
** Array management ** Array management
** **
***********************************/ ***********************************/
CUresult CUDAAPI cuArrayCreate( CUarray *pHandle, const CUDA_ARRAY_DES CRIPTOR *pAllocateArray ); CUresult CUDAAPI cuArrayCreate( CUarray *pHandle, const CUDA_ARRAY_DES CRIPTOR *pAllocateArray );
CUresult CUDAAPI cuArrayGetDescriptor( CUDA_ARRAY_DESCRIPTOR *pArrayDe scriptor, CUarray hArray ); CUresult CUDAAPI cuArrayGetDescriptor( CUDA_ARRAY_DESCRIPTOR *pArrayDe scriptor, CUarray hArray );
CUresult CUDAAPI cuArrayDestroy( CUarray hArray ); CUresult CUDAAPI cuArrayDestroy( CUarray hArray );
skipping to change at line 743 skipping to change at line 837
/************************************ /************************************
** **
** Parameter management ** Parameter management
** **
***********************************/ ***********************************/
CUresult CUDAAPI cuParamSetSize (CUfunction hfunc, unsigned int numbyt es); CUresult CUDAAPI cuParamSetSize (CUfunction hfunc, unsigned int numbyt es);
CUresult CUDAAPI cuParamSeti (CUfunction hfunc, int offset, unsigne d int value); CUresult CUDAAPI cuParamSeti (CUfunction hfunc, int offset, unsigne d int value);
CUresult CUDAAPI cuParamSetf (CUfunction hfunc, int offset, float v alue); CUresult CUDAAPI cuParamSetf (CUfunction hfunc, int offset, float v alue);
CUresult CUDAAPI cuParamSetv (CUfunction hfunc, int offset, void * ptr, unsigned int numbytes); CUresult CUDAAPI cuParamSetv (CUfunction hfunc, int offset, void *p tr, unsigned int numbytes);
CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtex ref hTexRef); CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtex ref hTexRef);
/************************************ /************************************
** **
** Launch functions ** Launch functions
** **
***********************************/ ***********************************/
CUresult CUDAAPI cuLaunch ( CUfunction f ); CUresult CUDAAPI cuLaunch ( CUfunction f );
CUresult CUDAAPI cuLaunchGrid (CUfunction f, int grid_width, int grid_h eight); CUresult CUDAAPI cuLaunchGrid (CUfunction f, int grid_width, int grid_h eight);
skipping to change at line 778 skipping to change at line 872
/************************************ /************************************
** **
** Streams ** Streams
** **
***********************************/ ***********************************/
CUresult CUDAAPI cuStreamCreate( CUstream *phStream, unsigned int Flag s ); CUresult CUDAAPI cuStreamCreate( CUstream *phStream, unsigned int Flag s );
CUresult CUDAAPI cuStreamQuery( CUstream hStream ); CUresult CUDAAPI cuStreamQuery( CUstream hStream );
CUresult CUDAAPI cuStreamSynchronize( CUstream hStream ); CUresult CUDAAPI cuStreamSynchronize( CUstream hStream );
CUresult CUDAAPI cuStreamDestroy( CUstream hStream ); CUresult CUDAAPI cuStreamDestroy( CUstream hStream );
/************************************
**
** Graphics interop
**
***********************************/
CUresult CUDAAPI cuGraphicsUnregisterResource(CUgraphicsResource resour
ce);
CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray( CUarray *pArray,
CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel
);
CUresult CUDAAPI cuGraphicsResourceGetMappedPointer( CUdeviceptr *pDevP
tr, unsigned int *pSize, CUgraphicsResource resource );
CUresult CUDAAPI cuGraphicsResourceSetMapFlags( CUgraphicsResource reso
urce, unsigned int flags );
CUresult CUDAAPI cuGraphicsMapResources( unsigned int count, CUgraphics
Resource *resources, CUstream hStream );
CUresult CUDAAPI cuGraphicsUnmapResources( unsigned int count, CUgraphi
csResource *resources, CUstream hStream );
/************************************
**
** Export tables
**
***********************************/
CUresult CUDAAPI cuGetExportTable( const void **ppExportTable, const CU
uuid *pExportTableId );
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
#endif /* __cuda_cuda_h__ */ #endif /* __cuda_cuda_h__ */
 End of changes. 29 change blocks. 
20 lines changed or deleted 161 lines changed or added


 cudaGL.h   cudaGL.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 43 skipping to change at line 43
* the above Disclaimer and U.S. Government End Users Notice. * the above Disclaimer and U.S. Government End Users Notice.
*/ */
#ifndef CUDAGL_H #ifndef CUDAGL_H
#define CUDAGL_H #define CUDAGL_H
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
CUresult CUDAAPI cuGLCtxCreate( CUcontext *pCtx, unsigned int Flags, CUdevi
ce device );
CUresult CUDAAPI cuGraphicsGLRegisterBuffer( CUgraphicsResource *pCudaResou
rce, GLuint buffer, unsigned int Flags );
CUresult CUDAAPI cuGraphicsGLRegisterImage( CUgraphicsResource *pCudaResour
ce, GLuint image, GLenum target, unsigned int Flags );
#if defined(_WIN32)
#if !defined(WGL_NV_gpu_affinity)
typedef void* HGPUNV;
#endif
CUresult CUDAAPI cuWGLGetDevice( CUdevice *pDevice, HGPUNV hGpu );
#endif
// //
// Flags to map or unmap a resource // CUDA 2.x compatibility API. These functions are deprecated, please use t he ones above.
// //
// Flags to map or unmap a resource
typedef enum CUGLmap_flags_enum { typedef enum CUGLmap_flags_enum {
CU_GL_MAP_RESOURCE_FLAGS_NONE = 0x00, CU_GL_MAP_RESOURCE_FLAGS_NONE = 0x00,
CU_GL_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01, CU_GL_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01,
CU_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02, CU_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02,
} CUGLmap_flags; } CUGLmap_flags;
CUresult CUDAAPI cuGLInit(void); CUresult CUDAAPI cuGLInit(void);
CUresult CUDAAPI cuGLCtxCreate( CUcontext *pCtx, unsigned int Flags, CUdevi CUresult CUDAAPI cuGLRegisterBufferObject( GLuint buffer );
ce device ); CUresult CUDAAPI cuGLMapBufferObject( CUdeviceptr *dptr, unsigned int *size
CUresult CUDAAPI cuGLRegisterBufferObject( GLuint bufferobj ); , GLuint buffer );
CUresult CUDAAPI cuGLMapBufferObject( CUdeviceptr *dptr, unsigned int *size CUresult CUDAAPI cuGLUnmapBufferObject( GLuint buffer );
, GLuint bufferobj ); CUresult CUDAAPI cuGLUnregisterBufferObject( GLuint buffer );
CUresult CUDAAPI cuGLUnmapBufferObject( GLuint bufferobj );
CUresult CUDAAPI cuGLUnregisterBufferObject( GLuint bufferobj );
CUresult CUDAAPI cuGLSetBufferObjectMapFlags( GLuint bufferobj, unsigned in
t Flags );
CUresult CUDAAPI cuGLMapBufferObjectAsync( CUdeviceptr *dptr, unsigned int
*size, GLuint bufferobj, CUstream hStream );
CUresult CUDAAPI cuGLUnmapBufferObjectAsync( GLuint bufferobj, CUstream hSt
ream );
#if defined(_WIN32) CUresult CUDAAPI cuGLSetBufferObjectMapFlags( GLuint buffer, unsigned int F
#if !defined(WGL_NV_gpu_affinity) lags );
typedef void* HGPUNV; CUresult CUDAAPI cuGLMapBufferObjectAsync( CUdeviceptr *dptr, unsigned int
#endif *size, GLuint buffer, CUstream hStream );
CUresult CUDAAPI cuWGLGetDevice( CUdevice *pDevice, HGPUNV hGpu ); CUresult CUDAAPI cuGLUnmapBufferObjectAsync( GLuint buffer, CUstream hStrea
#endif m );
#ifdef __cplusplus #ifdef __cplusplus
}; };
#endif #endif
#endif #endif
 End of changes. 6 change blocks. 
22 lines changed or deleted 29 lines changed or added


 cuda_gl_interop.h   cuda_gl_interop.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 67 skipping to change at line 67
#if defined(__cplusplus) #if defined(__cplusplus)
extern "C" { extern "C" {
#endif /* __cplusplus */ #endif /* __cplusplus */
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaGLSetGLDevice(int device);
extern __host__ cudaError_t CUDARTAPI cudaGraphicsGLRegisterImage(struct cu
daGraphicsResource **resource, GLuint image, GLenum target, unsigned int Fl
ags);
extern __host__ cudaError_t CUDARTAPI cudaGraphicsGLRegisterBuffer(struct c
udaGraphicsResource **resource, GLuint buffer, unsigned int Flags);
#ifdef _WIN32
#ifndef WGL_NV_gpu_affinity
typedef void* HGPUNV;
#endif
extern __host__ cudaError_t CUDARTAPI cudaWGLGetDevice(int *device, HGPUNV
hGpu);
#endif
/** /**
* CUDA GL Map Flags * CUDA GL Map Flags
*/ */
enum cudaGLMapFlags enum cudaGLMapFlags
{ {
cudaGLMapFlagsNone = 0, ///< Default; Assume resource can be rea d/written cudaGLMapFlagsNone = 0, ///< Default; Assume resource can be rea d/written
cudaGLMapFlagsReadOnly = 1, ///< CUDA kernels will not write to this resource cudaGLMapFlagsReadOnly = 1, ///< CUDA kernels will not write to this resource
cudaGLMapFlagsWriteDiscard = 2, ///< CUDA kernels will only write to and will not read from this resource cudaGLMapFlagsWriteDiscard = 2, ///< CUDA kernels will only write to and will not read from this resource
}; };
extern __host__ cudaError_t CUDARTAPI cudaGLSetGLDevice(int device);
extern __host__ cudaError_t CUDARTAPI cudaGLRegisterBufferObject(GLuint buf Obj); extern __host__ cudaError_t CUDARTAPI cudaGLRegisterBufferObject(GLuint buf Obj);
extern __host__ cudaError_t CUDARTAPI cudaGLMapBufferObject(void **devPtr, GLuint bufObj); extern __host__ cudaError_t CUDARTAPI cudaGLMapBufferObject(void **devPtr, GLuint bufObj);
extern __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObject(GLuint bufObj ); extern __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObject(GLuint bufObj );
extern __host__ cudaError_t CUDARTAPI cudaGLUnregisterBufferObject(GLuint b ufObj); extern __host__ cudaError_t CUDARTAPI cudaGLUnregisterBufferObject(GLuint b ufObj);
extern __host__ cudaError_t CUDARTAPI cudaGLSetBufferObjectMapFlags(GLuint bufObj, unsigned int flags); extern __host__ cudaError_t CUDARTAPI cudaGLSetBufferObjectMapFlags(GLuint bufObj, unsigned int flags);
extern __host__ cudaError_t CUDARTAPI cudaGLMapBufferObjectAsync(void **dev Ptr, GLuint bufObj, cudaStream_t stream); extern __host__ cudaError_t CUDARTAPI cudaGLMapBufferObjectAsync(void **dev Ptr, GLuint bufObj, cudaStream_t stream);
extern __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObjectAsync(GLuint b ufObj, cudaStream_t stream); extern __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObjectAsync(GLuint b ufObj, cudaStream_t stream);
#ifdef _WIN32
#ifndef WGL_NV_gpu_affinity
typedef void* HGPUNV;
#endif
extern __host__ cudaError_t CUDARTAPI cudaWGLGetDevice(int *device, HGPUNV
hGpu);
#endif
#if defined(__cplusplus) #if defined(__cplusplus)
} }
#endif /* __cplusplus */ #endif /* __cplusplus */
#endif /* __CUDA_GL_INTEROP_H__ */ #endif /* __CUDA_GL_INTEROP_H__ */
 End of changes. 4 change blocks. 
10 lines changed or deleted 16 lines changed or added


 cuda_runtime.h   cuda_runtime.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 199 skipping to change at line 199
enum cudaMemcpyKind kind = cudaMemcpyHostToDevice enum cudaMemcpyKind kind = cudaMemcpyHostToDevice
) )
{ {
return cudaMemcpyToSymbol((const char*)&symbol, src, count, offset, kind) ; return cudaMemcpyToSymbol((const char*)&symbol, src, count, offset, kind) ;
} }
static __inline__ __host__ cudaError_t cudaMemcpyToSymbolAsync( static __inline__ __host__ cudaError_t cudaMemcpyToSymbolAsync(
char *symbol, char *symbol,
const void *src, const void *src,
size_t count, size_t count,
size_t offset, size_t offset = 0,
enum cudaMemcpyKind kind, enum cudaMemcpyKind kind = cudaMemcpyHostToDevice,
cudaStream_t stream cudaStream_t stream = 0
) )
{ {
return cudaMemcpyToSymbolAsync((const char*)symbol, src, count, offset, k ind, stream); return cudaMemcpyToSymbolAsync((const char*)symbol, src, count, offset, k ind, stream);
} }
template<class T> template<class T>
__inline__ __host__ cudaError_t cudaMemcpyToSymbolAsync( __inline__ __host__ cudaError_t cudaMemcpyToSymbolAsync(
const T &symbol, const T &symbol,
const void *src, const void *src,
size_t count, size_t count,
size_t offset, size_t offset = 0,
enum cudaMemcpyKind kind, enum cudaMemcpyKind kind = cudaMemcpyHostToDevice,
cudaStream_t stream cudaStream_t stream = 0
) )
{ {
return cudaMemcpyToSymbolAsync((const char*)&symbol, src, count, offset, kind, stream); return cudaMemcpyToSymbolAsync((const char*)&symbol, src, count, offset, kind, stream);
} }
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
skipping to change at line 253 skipping to change at line 253
enum cudaMemcpyKind kind = cudaMemcpyDeviceToHost enum cudaMemcpyKind kind = cudaMemcpyDeviceToHost
) )
{ {
return cudaMemcpyFromSymbol(dst, (const char*)&symbol, count, offset, kin d); return cudaMemcpyFromSymbol(dst, (const char*)&symbol, count, offset, kin d);
} }
static __inline__ __host__ cudaError_t cudaMemcpyFromSymbolAsync( static __inline__ __host__ cudaError_t cudaMemcpyFromSymbolAsync(
void *dst, void *dst,
char *symbol, char *symbol,
size_t count, size_t count,
size_t offset, size_t offset = 0,
enum cudaMemcpyKind kind, enum cudaMemcpyKind kind = cudaMemcpyDeviceToHost,
cudaStream_t stream cudaStream_t stream = 0
) )
{ {
return cudaMemcpyFromSymbolAsync(dst, (const char*)symbol, count, offset, kind, stream); return cudaMemcpyFromSymbolAsync(dst, (const char*)symbol, count, offset, kind, stream);
} }
template<class T> template<class T>
__inline__ __host__ cudaError_t cudaMemcpyFromSymbolAsync( __inline__ __host__ cudaError_t cudaMemcpyFromSymbolAsync(
void *dst, void *dst,
const T &symbol, const T &symbol,
size_t count, size_t count,
size_t offset, size_t offset = 0,
enum cudaMemcpyKind kind, enum cudaMemcpyKind kind = cudaMemcpyDeviceToHost,
cudaStream_t stream cudaStream_t stream = 0
) )
{ {
return cudaMemcpyFromSymbolAsync(dst, (const char*)&symbol, count, offset , kind, stream); return cudaMemcpyFromSymbolAsync(dst, (const char*)&symbol, count, offset , kind, stream);
} }
static __inline__ __host__ cudaError_t cudaGetSymbolAddress( static __inline__ __host__ cudaError_t cudaGetSymbolAddress(
void **devPtr, void **devPtr,
char *symbol char *symbol
) )
{ {
skipping to change at line 678 skipping to change at line 678
/** @} */ /* END CUDART_HIGHLEVEL */ /** @} */ /* END CUDART_HIGHLEVEL */
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
/** /**
* \ingroup CUDART_HIGHLEVEL * \ingroup CUDART_HIGHLEVEL
* \brief Sets the preferred cache configuration for a device function
*
* On devices where the L1 cache and shared memory use the same hardware
* resources, this sets through \p cacheConfig the preferred cache configur
ation
* for the function specified via \p func. This is only a preference. The
* runtime will use the requested configuration if possible, but it is free
to
* choose a different configuration if required to execute \p func.
*
* \p func can either be a pointer to a function that executes
* on the device, or it can be a character string specifying the
* fully-decorated (C++) name for a function that executes on the device.
* The parameter specified by \p func must be declared as a \p __global__
* function. If the specified function does not exist,
* then ::cudaErrorInvalidDeviceFunction is returned.
*
* This setting does nothing on devices where the size of the L1 cache and
* shared memory are fixed.
*
* Switching between configuration modes may insert a device-side
* synchronization point for streamed kernel launches.
*
* \param func - Device char string naming device function
* \param cacheConfig - Cache configuration mode
*
* \return
* ::cudaSuccess,
* ::cudaErrorInitializationError,
* ::cudaErrorInvalidDeviceFunction
* \notefnerr
*
* \sa ::cudaConfigureCall,
* \ref ::cudaFuncSetCacheConfig(const char*, enum cudaFuncCache) "cudaFunc
SetCacheConfig (C API)",
* \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGe
tAttributes (C++ API)",
* \ref ::cudaLaunch(const char*) "cudaLaunch (C API)",
* ::cudaSetDoubleForDevice,
* ::cudaSetDoubleForHost,
* \ref ::cudaSetupArgument(T,size_t) "cudaSetupArgument (C++ API)"
*/
template<class T>
__inline__ __host__ cudaError_t cudaFuncSetCacheConfig(
T *func,
enum cudaFuncCache cacheConfig
)
{
return cudaFuncSetCacheConfig((const char*)func, cacheConfig);
}
/**
* \ingroup CUDART_HIGHLEVEL
* \brief \hl Launches a device function * \brief \hl Launches a device function
* *
* Launches the function \p entry on the device. The parameter \p entry can * Launches the function \p entry on the device. The parameter \p entry can
* either be a function that executes on the device, or it can be a charact er * either be a function that executes on the device, or it can be a charact er
* string, naming a function that executes on the device. The parameter * string, naming a function that executes on the device. The parameter
* specified by \p entry must be declared as a \p __global__ function. * specified by \p entry must be declared as a \p __global__ function.
* \ref ::cudaLaunch(T*) "cudaLaunch()" must be preceded by a call to * \ref ::cudaLaunch(T*) "cudaLaunch()" must be preceded by a call to
* ::cudaConfigureCall() since it pops the data that was pushed by * ::cudaConfigureCall() since it pops the data that was pushed by
* ::cudaConfigureCall() from the execution stack. * ::cudaConfigureCall() from the execution stack.
* *
skipping to change at line 702 skipping to change at line 751
* ::cudaSuccess, * ::cudaSuccess,
* ::cudaErrorInvalidDeviceFunction, * ::cudaErrorInvalidDeviceFunction,
* ::cudaErrorInvalidConfiguration, * ::cudaErrorInvalidConfiguration,
* ::cudaErrorLaunchFailure, * ::cudaErrorLaunchFailure,
* ::cudaErrorPriorLaunchFailure, * ::cudaErrorPriorLaunchFailure,
* ::cudaErrorLaunchTimeout, * ::cudaErrorLaunchTimeout,
* ::cudaErrorLaunchOutOfResources * ::cudaErrorLaunchOutOfResources
* \notefnerr * \notefnerr
* *
* \sa ::cudaConfigureCall, * \sa ::cudaConfigureCall,
* \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheC onfig (C++ API)",
* \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGe tAttributes (C++ API)", * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGe tAttributes (C++ API)",
* \ref ::cudaLaunch(const char*) "cudaLaunch (C API)", * \ref ::cudaLaunch(const char*) "cudaLaunch (C API)",
* ::cudaSetDoubleForDevice, * ::cudaSetDoubleForDevice,
* ::cudaSetDoubleForHost, * ::cudaSetDoubleForHost,
* \ref ::cudaSetupArgument(T,size_t) "cudaSetupArgument (C++ API)" * \ref ::cudaSetupArgument(T,size_t) "cudaSetupArgument (C++ API)"
*/ */
template<class T> template<class T>
__inline__ __host__ cudaError_t cudaLaunch( __inline__ __host__ cudaError_t cudaLaunch(
T *entry T *entry
) )
{ {
return cudaLaunch((const char*)entry); return cudaLaunch((const char*)entry);
} }
/** /**
* \ingroup CUDART_HIGHLEVEL * \ingroup CUDART_HIGHLEVEL
* \brief \hl Find out attributes for a given function * \brief \hl Find out attributes for a given function
* *
* This function obtains the attributes of a function specified via \p entr y. * This function obtains the attributes of a function specified via \p entr y.
* The parameter \p entry can either be a function that executes on the * The parameter \p entry can either be a pointer to a function that execut
* device, or it can be a character string, naming a function that executes es
on * on the device, or it can be a character string specifying the
* the device. The parameter specified by \p entry must be declared as a * fully-decorated (C++) name of a function that executes on the device. Th
* \p __global__ function. The fetched attributes are placed in \p attr. If e
* the specified function does not exist, then ::cudaErrorInvalidDeviceFunc * parameter specified by \p entry must be declared as a \p __global__
tion * function. The fetched attributes are placed in \p attr. If the specified
* is returned. * function does not exist, then ::cudaErrorInvalidDeviceFunction is return
ed.
* *
* \param attr - Return pointer to function's attributes * \param attr - Return pointer to function's attributes
* \param entry - Function to get attributes of * \param entry - Function to get attributes of
* *
* \return * \return
* ::cudaSuccess, * ::cudaSuccess,
* ::cudaErrorInitializationError, * ::cudaErrorInitializationError,
* ::cudaErrorInvalidDeviceFunction * ::cudaErrorInvalidDeviceFunction
* \notefnerr * \notefnerr
* *
* \sa ::cudaConfigureCall, * \sa ::cudaConfigureCall,
* \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheC onfig (C++ API)",
* \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const char*) "c udaFuncGetAttributes (C API)", * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const char*) "c udaFuncGetAttributes (C API)",
* \ref ::cudaLaunch(T*) "cudaLaunch (C++ API)", * \ref ::cudaLaunch(T*) "cudaLaunch (C++ API)",
* ::cudaSetDoubleForDevice, * ::cudaSetDoubleForDevice,
* ::cudaSetDoubleForHost, * ::cudaSetDoubleForHost,
* \ref ::cudaSetupArgument(T,size_t) "cudaSetupArgument (C++ API)" * \ref ::cudaSetupArgument(T,size_t) "cudaSetupArgument (C++ API)"
*/ */
template<class T> template<class T>
__inline__ __host__ cudaError_t cudaFuncGetAttributes( __inline__ __host__ cudaError_t cudaFuncGetAttributes(
struct cudaFuncAttributes *attr, struct cudaFuncAttributes *attr,
T *entry T *entry
 End of changes. 9 change blocks. 
21 lines changed or deleted 77 lines changed or added


 cuda_runtime_api.h   cuda_runtime_api.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 41 skipping to change at line 41
* Any use of this source code in individual and commercial software must * Any use of this source code in individual and commercial software must
* include, in the user documentation and internal comments to the code, * include, in the user documentation and internal comments to the code,
* the above Disclaimer and U.S. Government End Users Notice. * the above Disclaimer and U.S. Government End Users Notice.
*/ */
#if !defined(__CUDA_RUNTIME_API_H__) #if !defined(__CUDA_RUNTIME_API_H__)
#define __CUDA_RUNTIME_API_H__ #define __CUDA_RUNTIME_API_H__
/************************************************************************** ***** /************************************************************************** *****
* * * *
* CUDA Runtime API Version 2.3 * * CUDA Runtime API Version 3.0 *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#define CUDART_VERSION 2030 #define CUDART_VERSION 3000
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#include "host_defines.h" #include "host_defines.h"
#include "builtin_types.h" #include "builtin_types.h"
skipping to change at line 93 skipping to change at line 93
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaMalloc3D(struct cudaPitchedPtr* p itchedDevPtr, struct cudaExtent extent); extern __host__ cudaError_t CUDARTAPI cudaMalloc3D(struct cudaPitchedPtr* p itchedDevPtr, struct cudaExtent extent);
extern __host__ cudaError_t CUDARTAPI cudaMalloc3DArray(struct cudaArray** arrayPtr, const struct cudaChannelFormatDesc* desc, struct cudaExtent exten t); extern __host__ cudaError_t CUDARTAPI cudaMalloc3DArray(struct cudaArray** arrayPtr, const struct cudaChannelFormatDesc* desc, struct cudaExtent exten t);
extern __host__ cudaError_t CUDARTAPI cudaMemset3D(struct cudaPitchedPtr pi tchedDevPtr, int value, struct cudaExtent extent); extern __host__ cudaError_t CUDARTAPI cudaMemset3D(struct cudaPitchedPtr pi tchedDevPtr, int value, struct cudaExtent extent);
extern __host__ cudaError_t CUDARTAPI cudaMemcpy3D(const struct cudaMemcpy3 DParms *p); extern __host__ cudaError_t CUDARTAPI cudaMemcpy3D(const struct cudaMemcpy3 DParms *p);
extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMe mcpy3DParms *p, cudaStream_t stream); extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMe mcpy3DParms *p, cudaStream_t stream __dv(0));
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size ); extern __host__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size );
extern __host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t siz e); extern __host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t siz e);
extern __host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t height); extern __host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t height);
skipping to change at line 119 skipping to change at line 119
extern __host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t by tes, unsigned int flags); extern __host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t by tes, unsigned int flags);
extern __host__ cudaError_t CUDARTAPI cudaHostGetDevicePointer(void **pDevi ce, void *pHost, unsigned int flags); extern __host__ cudaError_t CUDARTAPI cudaHostGetDevicePointer(void **pDevi ce, void *pHost, unsigned int flags);
extern __host__ cudaError_t CUDARTAPI cudaHostGetFlags(unsigned int *pFlags , void *pHost); extern __host__ cudaError_t CUDARTAPI cudaHostGetFlags(unsigned int *pFlags , void *pHost);
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaMemGetInfo(size_t *free, size_t * total);
extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src , size_t count, enum cudaMemcpyKind kind); extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src , size_t count, enum cudaMemcpyKind kind);
extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArray(struct cudaArray *d st, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cud aMemcpyKind kind); extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArray(struct cudaArray *d st, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cud aMemcpyKind kind);
extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArray(void *dst, const struct cudaArray *src, size_t wOffset, size_t hOffset, size_t count, enum c udaMemcpyKind kind); extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArray(void *dst, const struct cudaArray *src, size_t wOffset, size_t hOffset, size_t count, enum c udaMemcpyKind kind);
extern __host__ cudaError_t CUDARTAPI cudaMemcpyArrayToArray(struct cudaArr ay *dst, size_t wOffsetDst, size_t hOffsetDst, const struct cudaArray *src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum cudaMemcpyKind ki nd __dv(cudaMemcpyDeviceToDevice)); extern __host__ cudaError_t CUDARTAPI cudaMemcpyArrayToArray(struct cudaArr ay *dst, size_t wOffsetDst, size_t hOffsetDst, const struct cudaArray *src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum cudaMemcpyKind ki nd __dv(cudaMemcpyDeviceToDevice));
extern __host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch , const void *src, size_t spitch, size_t width, size_t height, enum cudaMem cpyKind kind); extern __host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch , const void *src, size_t spitch, size_t width, size_t height, enum cudaMem cpyKind kind);
extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArray(struct cudaArray *dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_ t width, size_t height, enum cudaMemcpyKind kind); extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArray(struct cudaArray *dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_ t width, size_t height, enum cudaMemcpyKind kind);
extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArray(void *dst, size _t dpitch, const struct cudaArray *src, size_t wOffset, size_t hOffset, siz e_t width, size_t height, enum cudaMemcpyKind kind); extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArray(void *dst, size _t dpitch, const struct cudaArray *src, size_t wOffset, size_t hOffset, siz e_t width, size_t height, enum cudaMemcpyKind kind);
extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DArrayToArray(struct cudaA rray *dst, size_t wOffsetDst, size_t hOffsetDst, const struct cudaArray *sr c, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)); extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DArrayToArray(struct cudaA rray *dst, size_t wOffsetDst, size_t hOffsetDst, const struct cudaArray *sr c, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice));
extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(const char *symbol , const void *src, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice)); extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(const char *symbol , const void *src, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice));
extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbol(void *dst, const char *symbol, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kin d __dv(cudaMemcpyDeviceToHost)); extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbol(void *dst, const char *symbol, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kin d __dv(cudaMemcpyDeviceToHost));
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void extern __host__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void
*src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream); *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0))
extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArrayAsync(struct cudaArr ;
ay *dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enu extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArrayAsync(struct cudaArr
m cudaMemcpyKind kind, cudaStream_t stream); ay *dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enu
extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArrayAsync(void *dst, c m cudaMemcpyKind kind, cudaStream_t stream __dv(0));
onst struct cudaArray *src, size_t wOffset, size_t hOffset, size_t count, e extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArrayAsync(void *dst, c
num cudaMemcpyKind kind, cudaStream_t stream); onst struct cudaArray *src, size_t wOffset, size_t hOffset, size_t count, e
extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t d num cudaMemcpyKind kind, cudaStream_t stream __dv(0));
pitch, const void *src, size_t spitch, size_t width, size_t height, enum cu extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t d
daMemcpyKind kind, cudaStream_t stream); pitch, const void *src, size_t spitch, size_t width, size_t height, enum cu
extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArrayAsync(struct cudaA daMemcpyKind kind, cudaStream_t stream __dv(0));
rray *dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArrayAsync(struct cudaA
size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream) rray *dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch,
; size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream
extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync(void *dst, __dv(0));
size_t dpitch, const struct cudaArray *src, size_t wOffset, size_t hOffset extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync(void *dst,
, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t strea size_t dpitch, const struct cudaArray *src, size_t wOffset, size_t hOffset
m); , size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t strea
extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync(const char *s m __dv(0));
ymbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind ki extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync(const char *s
nd, cudaStream_t stream); ymbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind ki
extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync(void *dst, nd, cudaStream_t stream __dv(0));
const char *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind, extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync(void *dst,
cudaStream_t stream); const char *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind,
cudaStream_t stream __dv(0));
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value, s ize_t count); extern __host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value, s ize_t count);
extern __host__ cudaError_t CUDARTAPI cudaMemset2D(void *devPtr, size_t pit ch, int value, size_t width, size_t height); extern __host__ cudaError_t CUDARTAPI cudaMemset2D(void *devPtr, size_t pit ch, int value, size_t width, size_t height);
skipping to change at line 216 skipping to change at line 217
extern __host__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error) ; extern __host__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error) ;
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem __dv(0), cudaStream_t stream __dv(0)); extern __host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem __dv(0), cudaStream_t stream __dv(0));
extern __host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg, si ze_t size, size_t offset); extern __host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg, si ze_t size, size_t offset);
extern __host__ cudaError_t CUDARTAPI cudaFuncSetCacheConfig(const char *fu nc, enum cudaFuncCache cacheConfig);
extern __host__ cudaError_t CUDARTAPI cudaLaunch(const char *entry); extern __host__ cudaError_t CUDARTAPI cudaLaunch(const char *entry);
extern __host__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFunc Attributes *attr, const char *func); extern __host__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFunc Attributes *attr, const char *func);
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStrea m); extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStrea m);
skipping to change at line 238 skipping to change at line 240
extern __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream); extern __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream);
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaEventCreate(cudaEvent_t *event); extern __host__ cudaError_t CUDARTAPI cudaEventCreate(cudaEvent_t *event);
extern __host__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, int flags); extern __host__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, int flags);
extern __host__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cu daStream_t stream); extern __host__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cu daStream_t stream __dv(0));
extern __host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event); extern __host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event);
extern __host__ cudaError_t CUDARTAPI cudaEventSynchronize(cudaEvent_t even t); extern __host__ cudaError_t CUDARTAPI cudaEventSynchronize(cudaEvent_t even t);
extern __host__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event); extern __host__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event);
extern __host__ cudaError_t CUDARTAPI cudaEventElapsedTime(float *ms, cudaE vent_t start, cudaEvent_t end); extern __host__ cudaError_t CUDARTAPI cudaEventElapsedTime(float *ms, cudaE vent_t start, cudaEvent_t end);
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
skipping to change at line 271 skipping to change at line 273
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersi on); extern __host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersi on);
extern __host__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVer sion); extern __host__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVer sion);
/**************************************************************************
*****
*
*
*
*
*
*
***************************************************************************
****/
extern __host__ cudaError_t CUDARTAPI cudaGraphicsUnregisterResource(struct
cudaGraphicsResource *resource);
extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceSetMapFlags(struc
t cudaGraphicsResource *resource, unsigned int flags);
extern __host__ cudaError_t CUDARTAPI cudaGraphicsMapResources(int count, s
truct cudaGraphicsResource **resources, cudaStream_t stream __dv(0));
extern __host__ cudaError_t CUDARTAPI cudaGraphicsUnmapResources(int count,
struct cudaGraphicsResource **resources, cudaStream_t stream __dv(0));
extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedPointer(
void **devPtr, size_t *size, struct cudaGraphicsResource *resource);
extern __host__ cudaError_t CUDARTAPI cudaGraphicsSubResourceGetMappedArray
(struct cudaArray **arrayPtr, struct cudaGraphicsResource *resource, unsign
ed int arrayIndex, unsigned int mipLevel);
#if defined(__cplusplus) #if defined(__cplusplus)
} }
#endif /* __cplusplus */ #endif /* __cplusplus */
#undef __dv #undef __dv
#endif /* !__CUDA_RUNTIME_API_H__ */ #endif /* !__CUDA_RUNTIME_API_H__ */
 End of changes. 9 change blocks. 
30 lines changed or deleted 58 lines changed or added


 cuda_texture_types.h   cuda_texture_types.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 End of changes. 1 change blocks. 
1 lines changed or deleted 1 lines changed or added


 cufft.h   cufft.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 105 skipping to change at line 105
CUFFT_C2R = 0x2c, // Complex (interleaved) to Real CUFFT_C2R = 0x2c, // Complex (interleaved) to Real
CUFFT_C2C = 0x29, // Complex to Complex, interleaved CUFFT_C2C = 0x29, // Complex to Complex, interleaved
CUFFT_D2Z = 0x6a, // Double to Double-Complex CUFFT_D2Z = 0x6a, // Double to Double-Complex
CUFFT_Z2D = 0x6c, // Double-Complex to Double CUFFT_Z2D = 0x6c, // Double-Complex to Double
CUFFT_Z2Z = 0x69 // Double-Complex to Double-Complex CUFFT_Z2Z = 0x69 // Double-Complex to Double-Complex
} cufftType; } cufftType;
cufftResult CUFFTAPI cufftPlan1d(cufftHandle *plan, cufftResult CUFFTAPI cufftPlan1d(cufftHandle *plan,
int nx, int nx,
cufftType type, cufftType type,
int batch); int batch /* deprecated - use cufftPlanMan y */);
cufftResult CUFFTAPI cufftPlan2d(cufftHandle *plan, cufftResult CUFFTAPI cufftPlan2d(cufftHandle *plan,
int nx, int ny, int nx, int ny,
cufftType type); cufftType type);
cufftResult CUFFTAPI cufftPlan3d(cufftHandle *plan, cufftResult CUFFTAPI cufftPlan3d(cufftHandle *plan,
int nx, int ny, int nz, int nx, int ny, int nz,
cufftType type); cufftType type);
cufftResult CUFFTAPI cufftPlanMany(cufftHandle *plan,
int rank,
int *n,
int *inembed, int istride, int idist,
// Unused: pass "NULL, 1, 0"
int *onembed, int ostride, int odist,
// Unused: pass "NULL, 1, 0"
cufftType type,
int batch);
cufftResult CUFFTAPI cufftDestroy(cufftHandle plan); cufftResult CUFFTAPI cufftDestroy(cufftHandle plan);
cufftResult CUFFTAPI cufftExecC2C(cufftHandle plan, cufftResult CUFFTAPI cufftExecC2C(cufftHandle plan,
cufftComplex *idata, cufftComplex *idata,
cufftComplex *odata, cufftComplex *odata,
int direction); int direction);
cufftResult CUFFTAPI cufftExecR2C(cufftHandle plan, cufftResult CUFFTAPI cufftExecR2C(cufftHandle plan,
cufftReal *idata, cufftReal *idata,
cufftComplex *odata); cufftComplex *odata);
skipping to change at line 143 skipping to change at line 151
int direction); int direction);
cufftResult CUFFTAPI cufftExecD2Z(cufftHandle plan, cufftResult CUFFTAPI cufftExecD2Z(cufftHandle plan,
cufftDoubleReal *idata, cufftDoubleReal *idata,
cufftDoubleComplex *odata); cufftDoubleComplex *odata);
cufftResult CUFFTAPI cufftExecZ2D(cufftHandle plan, cufftResult CUFFTAPI cufftExecZ2D(cufftHandle plan,
cufftDoubleComplex *idata, cufftDoubleComplex *idata,
cufftDoubleReal *odata); cufftDoubleReal *odata);
cufftResult CUFFTAPI cufftSetStream(cufftHandle p, cufftResult CUFFTAPI cufftSetStream(cufftHandle plan,
cudaStream_t stream); cudaStream_t stream);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
#endif /* _CUFFT_H_ */ #endif /* _CUFFT_H_ */
 End of changes. 4 change blocks. 
3 lines changed or deleted 13 lines changed or added


 device_functions.h   device_functions.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 194 skipping to change at line 194
extern __device__ unsigned long long int __float2ull_rn(float); extern __device__ unsigned long long int __float2ull_rn(float);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ unsigned long long int __float2ull_rz(float); extern __device__ unsigned long long int __float2ull_rz(float);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ unsigned long long int __float2ull_ru(float); extern __device__ unsigned long long int __float2ull_ru(float);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ unsigned long long int __float2ull_rd(float); extern __device__ unsigned long long int __float2ull_rd(float);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __ll2float_rn(long long int); extern __device__ float __ll2float_rn(long long int);
/*DEVICE_BUILTIN*/
extern __device__ float __ll2float_rz(long long int);
/*DEVICE_BUILTIN*/
extern __device__ float __ll2float_ru(long long int);
/*DEVICE_BUILTIN*/
extern __device__ float __ll2float_rd(long long int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __ull2float_rn(unsigned long long int); extern __device__ float __ull2float_rn(unsigned long long int);
/*DEVICE_BUILTIN*/
extern __device__ float __ull2float_rz(unsigned long long
int);
/*DEVICE_BUILTIN*/
extern __device__ float __ull2float_ru(unsigned long long
int);
/*DEVICE_BUILTIN*/
extern __device__ float __ull2float_rd(unsigned long long
int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ unsigned short __float2half_rn(float); extern __device__ unsigned short __float2half_rn(float);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __half2float(unsigned short); extern __device__ float __half2float(unsigned short);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __fadd_rn(float, float); extern __device__ float __fadd_rn(float, float);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __fadd_rz(float, float); extern __device__ float __fadd_rz(float, float);
skipping to change at line 275 skipping to change at line 287
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __clzll(long long int); extern __device__ int __clzll(long long int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __ffsll(long long int); extern __device__ int __ffsll(long long int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __popcll(unsigned long long int); extern __device__ int __popcll(unsigned long long int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ unsigned long long int __brevll(unsigned long long int); extern __device__ unsigned long long int __brevll(unsigned long long int);
#if (__CUDA_ARCH__ >= 130) #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 130
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __double2int_rz(double); extern __device__ int __double2int_rz(double);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ unsigned int __double2uint_rz(double); extern __device__ unsigned int __double2uint_rz(double);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ long long int __double2ll_rz(double); extern __device__ long long int __double2ll_rz(double);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ unsigned long long int __double2ull_rz(double); extern __device__ unsigned long long int __double2ull_rz(double);
#endif /* __CUDA_ARCH__ >= 130 */ #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 130 */
} }
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
static __inline__ __device__ int mulhi(int a, int b) static __inline__ __device__ int mulhi(int a, int b)
skipping to change at line 445 skipping to change at line 457
#elif !defined(__CUDACC__) #elif !defined(__CUDACC__)
#include "crt/func_macro.h" #include "crt/func_macro.h"
#include "host_defines.h" #include "host_defines.h"
#include "math_constants.h" #include "math_constants.h"
#if defined(__CUDABE__) #if defined(__CUDABE__)
#if (__CUDA_ARCH__ < 200)
__device_func__(float __frcp_rn (float x)) __device_func__(float __frcp_rn (float x))
{ {
unsigned int expo; unsigned int expo;
unsigned f, y; unsigned f, y;
unsigned int argi; unsigned int argi;
float t; float t;
argi = __float_as_int(x); argi = __float_as_int(x);
expo = (argi >> 23); expo = (argi >> 23);
expo = expo & 0xff; expo = expo & 0xff;
skipping to change at line 2381 skipping to change at line 2395
return __int_as_float(xx); return __int_as_float(xx);
} }
/* subnormal */ /* subnormal */
expo_x = ((unsigned int)-((int)expo_x)); expo_x = ((unsigned int)-((int)expo_x));
xx += (temp && expo_y); xx += (temp && expo_y);
xx = (xx >> expo_x); xx = (xx >> expo_x);
if ((expo_x > 25) || (xx != 0x00800000)) xx = 0; if ((expo_x > 25) || (xx != 0x00800000)) xx = 0;
return __int_as_float(expo_y | xx); return __int_as_float(expo_y | xx);
} }
#endif /* __CUDA_ARCH__ < 200 */
#else /* defined(__CUDABE__) */ #else /* defined(__CUDABE__) */
#include "common_types.h" #include "common_types.h"
static __device__ const unsigned char __internal_rcpTab[128] = static __device__ const unsigned char __internal_rcpTab[128] =
{ {
0xff, 0xfd, 0xfb, 0xf9, 0xf7, 0xf5, 0xf4, 0xf2, 0xff, 0xfd, 0xfb, 0xf9, 0xf7, 0xf5, 0xf4, 0xf2,
0xf0, 0xee, 0xed, 0xeb, 0xe9, 0xe8, 0xe6, 0xe4, 0xf0, 0xee, 0xed, 0xeb, 0xe9, 0xe8, 0xe6, 0xe4,
0xe3, 0xe1, 0xe0, 0xde, 0xdd, 0xdb, 0xda, 0xd8, 0xe3, 0xe1, 0xe0, 0xde, 0xdd, 0xdb, 0xda, 0xd8,
0xd7, 0xd5, 0xd4, 0xd3, 0xd1, 0xd0, 0xcf, 0xcd, 0xd7, 0xd5, 0xd4, 0xd3, 0xd1, 0xd0, 0xcf, 0xcd,
skipping to change at line 3830 skipping to change at line 3846
__device_func__(unsigned int __usad(unsigned int a, unsigned int b, unsigne d int c)) __device_func__(unsigned int __usad(unsigned int a, unsigned int b, unsigne d int c))
{ {
long long int diff = (long long int)a - (long long int)b; long long int diff = (long long int)a - (long long int)b;
return (unsigned int)(__cuda_llabs(diff) + (long long int)c); return (unsigned int)(__cuda_llabs(diff) + (long long int)c);
} }
__device_func__(int __mul24(int a, int b)) __device_func__(int __mul24(int a, int b))
{ {
#if !defined(__MULTI_CORE__)
a &= 0xffffff; a &= 0xffffff;
a = (a & 0x800000) != 0 ? a | ~0xffffff : a; a = (a & 0x800000) != 0 ? a | ~0xffffff : a;
b &= 0xffffff; b &= 0xffffff;
b = (b & 0x800000) != 0 ? b | ~0xffffff : b; b = (b & 0x800000) != 0 ? b | ~0xffffff : b;
#endif /* !__MULTI_CORE__ */
return a * b; return a * b;
} }
__device_func__(unsigned int __umul24(unsigned int a, unsigned int b)) __device_func__(unsigned int __umul24(unsigned int a, unsigned int b))
{ {
#if !defined(__MULTI_CORE__)
a &= 0xffffff; a &= 0xffffff;
b &= 0xffffff; b &= 0xffffff;
#endif /* !__MULTI_CORE__ */
return a * b; return a * b;
} }
__device_func__(float __int_as_float(int a)) __device_func__(float __int_as_float(int a))
{ {
volatile union __cudart_FloatIntCvt u; volatile union __cudart_FloatIntCvt u;
u.i = a; u.i = a;
return u.f; return u.f;
skipping to change at line 3903 skipping to change at line 3915
return (long long int)res; return (long long int)res;
} }
__device_func__(int __internal_float2int(float a, enum cudaRoundMode rndMod e)) __device_func__(int __internal_float2int(float a, enum cudaRoundMode rndMod e))
{ {
return (int)__internal_float2ll_kernel(a, 2147483647LL, -2147483648LL, 0L L, rndMode); return (int)__internal_float2ll_kernel(a, 2147483647LL, -2147483648LL, 0L L, rndMode);
} }
__device_func__(int __float2int_rz(float a)) __device_func__(int __float2int_rz(float a))
{ {
#if defined(__MULTI_CORE__)
return (int)a;
#else /* __MULTI_CORE__ */
return __internal_float2int(a, cudaRoundZero); return __internal_float2int(a, cudaRoundZero);
#endif /* __MULTI_CORE__ */
} }
__device_func__(int __float2int_ru(float a)) __device_func__(int __float2int_ru(float a))
{ {
return __internal_float2int(a, cudaRoundPosInf); return __internal_float2int(a, cudaRoundPosInf);
} }
__device_func__(int __float2int_rd(float a)) __device_func__(int __float2int_rd(float a))
{ {
return __internal_float2int(a, cudaRoundMinInf); return __internal_float2int(a, cudaRoundMinInf);
skipping to change at line 3932 skipping to change at line 3940
return __internal_float2int(a, cudaRoundNearest); return __internal_float2int(a, cudaRoundNearest);
} }
__device_func__(long long int __internal_float2ll(float a, enum cudaRoundMo de rndMode)) __device_func__(long long int __internal_float2ll(float a, enum cudaRoundMo de rndMode))
{ {
return __internal_float2ll_kernel(a, 9223372036854775807LL, -922337203685 4775807LL -1LL, -9223372036854775807LL -1LL, rndMode); return __internal_float2ll_kernel(a, 9223372036854775807LL, -922337203685 4775807LL -1LL, -9223372036854775807LL -1LL, rndMode);
} }
__device_func__(long long int __float2ll_rz(float a)) __device_func__(long long int __float2ll_rz(float a))
{ {
#if defined(__MULTI_CORE__)
return (long long int)a;
#else /* __MULTI_CORE__ */
return __internal_float2ll(a, cudaRoundZero); return __internal_float2ll(a, cudaRoundZero);
#endif /* __MULTI_CORE__ */
} }
__device_func__(long long int __float2ll_ru(float a)) __device_func__(long long int __float2ll_ru(float a))
{ {
return __internal_float2ll(a, cudaRoundPosInf); return __internal_float2ll(a, cudaRoundPosInf);
} }
__device_func__(long long int __float2ll_rd(float a)) __device_func__(long long int __float2ll_rd(float a))
{ {
return __internal_float2ll(a, cudaRoundMinInf); return __internal_float2ll(a, cudaRoundMinInf);
skipping to change at line 3987 skipping to change at line 3991
return res; return res;
} }
__device_func__(unsigned int __internal_float2uint(float a, enum cudaRoundM ode rndMode)) __device_func__(unsigned int __internal_float2uint(float a, enum cudaRoundM ode rndMode))
{ {
return (unsigned int)__internal_float2ull_kernel(a, 4294967295U, 0U, rndM ode); return (unsigned int)__internal_float2ull_kernel(a, 4294967295U, 0U, rndM ode);
} }
__device_func__(unsigned int __float2uint_rz(float a)) __device_func__(unsigned int __float2uint_rz(float a))
{ {
#if defined(__MULTI_CORE__)
return (unsigned int)a;
#else /* __MULTI_CORE__ */
return __internal_float2uint(a, cudaRoundZero); return __internal_float2uint(a, cudaRoundZero);
#endif /* __MULTI_CORE__ */
} }
__device_func__(unsigned int __float2uint_ru(float a)) __device_func__(unsigned int __float2uint_ru(float a))
{ {
return __internal_float2uint(a, cudaRoundPosInf); return __internal_float2uint(a, cudaRoundPosInf);
} }
__device_func__(unsigned int __float2uint_rd(float a)) __device_func__(unsigned int __float2uint_rd(float a))
{ {
return __internal_float2uint(a, cudaRoundMinInf); return __internal_float2uint(a, cudaRoundMinInf);
skipping to change at line 4016 skipping to change at line 4016
return __internal_float2uint(a, cudaRoundNearest); return __internal_float2uint(a, cudaRoundNearest);
} }
__device_func__(unsigned long long int __internal_float2ull(float a, enum c udaRoundMode rndMode)) __device_func__(unsigned long long int __internal_float2ull(float a, enum c udaRoundMode rndMode))
{ {
return __internal_float2ull_kernel(a, 18446744073709551615ULL, 9223372036 854775808ULL, rndMode); return __internal_float2ull_kernel(a, 18446744073709551615ULL, 9223372036 854775808ULL, rndMode);
} }
__device_func__(unsigned long long int __float2ull_rz(float a)) __device_func__(unsigned long long int __float2ull_rz(float a))
{ {
#if defined(__MULTI_CORE__)
return (unsigned long long int)a;
#else /* __MULTI_CORE__ */
return __internal_float2ull(a, cudaRoundZero); return __internal_float2ull(a, cudaRoundZero);
#endif /* __MULTI_CORE__ */
} }
__device_func__(unsigned long long int __float2ull_ru(float a)) __device_func__(unsigned long long int __float2ull_ru(float a))
{ {
return __internal_float2ull(a, cudaRoundPosInf); return __internal_float2ull(a, cudaRoundPosInf);
} }
__device_func__(unsigned long long int __float2ull_rd(float a)) __device_func__(unsigned long long int __float2ull_rd(float a))
{ {
return __internal_float2ull(a, cudaRoundMinInf); return __internal_float2ull(a, cudaRoundMinInf);
skipping to change at line 4121 skipping to change at line 4117
return __internal_int2float_kernel(a, cudaRoundPosInf); return __internal_int2float_kernel(a, cudaRoundPosInf);
} }
__device_func__(float __int2float_rd(int a)) __device_func__(float __int2float_rd(int a))
{ {
return __internal_int2float_kernel(a, cudaRoundMinInf); return __internal_int2float_kernel(a, cudaRoundMinInf);
} }
__device_func__(float __int2float_rn(int a)) __device_func__(float __int2float_rn(int a))
{ {
#if defined(__MULTI_CORE__)
return (float)a;
#else /* __MULTI_CORE__ */
return __internal_int2float_kernel(a, cudaRoundNearest); return __internal_int2float_kernel(a, cudaRoundNearest);
#endif /* __MULTI_CORE__ */
} }
__device_func__(float __internal_uint2float_kernel(unsigned int a, enum cud aRoundMode rndMode)) __device_func__(float __internal_uint2float_kernel(unsigned int a, enum cud aRoundMode rndMode))
{ {
volatile union __cudart_FloatUintCvt res; volatile union __cudart_FloatUintCvt res;
int shift; int shift;
unsigned int t; unsigned int t;
res.i = a; res.i = a;
if (a == 0) return res.f; if (a == 0) return res.f;
shift = __internal_normalize((unsigned int*)&res.i); shift = __internal_normalize((unsigned int*)&res.i);
skipping to change at line 4165 skipping to change at line 4157
return __internal_uint2float_kernel(a, cudaRoundPosInf); return __internal_uint2float_kernel(a, cudaRoundPosInf);
} }
__device_func__(float __uint2float_rd(unsigned int a)) __device_func__(float __uint2float_rd(unsigned int a))
{ {
return __internal_uint2float_kernel(a, cudaRoundMinInf); return __internal_uint2float_kernel(a, cudaRoundMinInf);
} }
__device_func__(float __uint2float_rn(unsigned int a)) __device_func__(float __uint2float_rn(unsigned int a))
{ {
#if defined(__MULTI_CORE__)
return (float)a;
#else /* __MULTI_CORE__ */
return __internal_uint2float_kernel(a, cudaRoundNearest); return __internal_uint2float_kernel(a, cudaRoundNearest);
#endif /* __MULTI_CORE__ */
}
__device_func__(float __ll2float_rn(long long int a))
{
return (float)a;
} }
__device_func__(float __internal_ull2float_kernel(unsigned long long int a, enum cudaRoundMode rndMode)) __device_func__(float __internal_ull2float_kernel(unsigned long long int a, enum cudaRoundMode rndMode))
{ {
unsigned long long int temp; unsigned long long int temp;
unsigned int res, t; unsigned int res, t;
int shift; int shift;
if (a == 0ULL) return 0.0f; if (a == 0ULL) return 0.0f;
temp = a; temp = a;
shift = __internal_normalize64(&temp); shift = __internal_normalize64(&temp);
skipping to change at line 4197 skipping to change at line 4180
t = (unsigned int)temp; t = (unsigned int)temp;
res += (127 + 62 - shift) << 23; /* add in exponent */ res += (127 + 62 - shift) << 23; /* add in exponent */
if (rndMode == cudaRoundNearest) { if (rndMode == cudaRoundNearest) {
res += (t == 0x80000000) ? (res & 1) : (t >> 31); res += (t == 0x80000000) ? (res & 1) : (t >> 31);
} else if (rndMode == cudaRoundPosInf) { } else if (rndMode == cudaRoundPosInf) {
res += (t != 0); res += (t != 0);
} }
return __int_as_float(res); return __int_as_float(res);
} }
__device_func__(float __internal_ll2float_kernel(long long int a, enum cuda
RoundMode rndMode))
{
unsigned long long int temp;
volatile float res = 0.0f;
if (a < 0LL) {
temp = (~((unsigned long long int)a)) + 1ULL;
if (rndMode == cudaRoundPosInf) {
rndMode = cudaRoundMinInf;
} else if (rndMode == cudaRoundMinInf) {
rndMode = cudaRoundPosInf;
}
} else {
temp = (unsigned long long int)a;
}
res = __internal_ull2float_kernel (temp, rndMode);
if (a < 0LL) {
res = -res;
}
return res;
}
__device_func__(float __ll2float_rn(long long int a))
{
return __internal_ll2float_kernel(a, cudaRoundNearest);
}
__device_func__(float __ll2float_rz(long long int a))
{
return __internal_ll2float_kernel(a, cudaRoundZero);
}
__device_func__(float __ll2float_ru(long long int a))
{
return __internal_ll2float_kernel(a, cudaRoundPosInf);
}
__device_func__(float __ll2float_rd(long long int a))
{
return __internal_ll2float_kernel(a, cudaRoundMinInf);
}
__device_func__(float __ull2float_rn(unsigned long long int a)) __device_func__(float __ull2float_rn(unsigned long long int a))
{ {
#if defined(__MULTI_CORE__)
return (float)a;
#else /* __MULTI_CORE__ */
return __internal_ull2float_kernel(a, cudaRoundNearest); return __internal_ull2float_kernel(a, cudaRoundNearest);
#endif /* __MULTI_CORE__ */ }
__device_func__(float __ull2float_rz(unsigned long long int a))
{
return __internal_ull2float_kernel(a, cudaRoundZero);
}
__device_func__(float __ull2float_ru(unsigned long long int a))
{
return __internal_ull2float_kernel(a, cudaRoundPosInf);
}
__device_func__(float __ull2float_rd(unsigned long long int a))
{
return __internal_ull2float_kernel(a, cudaRoundMinInf);
} }
__device_func__(unsigned short __float2half_rn(float f)) __device_func__(unsigned short __float2half_rn(float f))
{ {
unsigned int x = __float_as_int (f); unsigned int x = __float_as_int (f);
unsigned int u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1; unsigned int u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1;
unsigned int sign, exponent, mantissa; unsigned int sign, exponent, mantissa;
/* Get rid of +NaN/-NaN case first. */ /* Get rid of +NaN/-NaN case first. */
if (u > 0x7f800000) { if (u > 0x7f800000) {
skipping to change at line 4310 skipping to change at line 4346
__device_func__(float __fmul_rn(float a, float b)) __device_func__(float __fmul_rn(float a, float b))
{ {
return __internal_fmul_kernel(a, b, cudaRoundNearest); return __internal_fmul_kernel(a, b, cudaRoundNearest);
} }
__device_func__(void __brkpt(int c)) __device_func__(void __brkpt(int c))
{ {
/* TODO */ /* TODO */
} }
#if defined(__MULTI_CORE__) #if defined(__cplusplus)
extern "C" {
#define __syncthreads() \ #endif /* __cplusplus */
__builtin___syncthreads()
#else /* __MULTI_CORE__ */
extern int CUDARTAPI __cudaSynchronizeThreads(void**, void*); extern int CUDARTAPI __cudaSynchronizeThreads(void**, void*);
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#if defined(__GNUC__) #if defined(__GNUC__)
__device_func__(inline __attribute__((always_inline)) void __syncthreads(vo id)) __device_func__(inline __attribute__((always_inline)) void __syncthreads(vo id))
{ {
volatile int _ = 0; volatile int _ = 0;
L: if (__cudaSynchronizeThreads((void**)&&L, (void*)&_)) goto L; L: if (__cudaSynchronizeThreads((void**)&&L, (void*)&_)) goto L;
} }
#elif defined(_WIN32) #elif defined(_WIN32)
#define __syncthreads() \ #define __syncthreads() \
(void)__cudaSynchronizeThreads((void**)0, (void*)0) (void)__cudaSynchronizeThreads((void**)0, (void*)0)
#endif /* __GNUC__ */ #endif /* __GNUC__ */
#endif /* __MULTI_CORE__ */
__device_func__(void __prof_trigger(int a)) __device_func__(void __prof_trigger(int a))
{ {
} }
__device_func__(void __threadfence(void)) __device_func__(void __threadfence(void))
{ {
__syncthreads(); __syncthreads();
} }
__device_func__(void __threadfence_block(void)) __device_func__(void __threadfence_block(void))
skipping to change at line 4376 skipping to change at line 4411
#endif /* __CUDABE__ */ #endif /* __CUDABE__ */
/************************************************************************** ***** /************************************************************************** *****
* * * *
* DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS * * DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#if !defined(__CUDABE__) #if !defined(__CUDABE__)
__device_func__(float __fdividef(float a, float b)) __device_func__(float __fdividef(float a, float b))
{ {
#if defined(__MULTI_CORE__)
return a / b;
#else /* __MULTI_CORE__ */
volatile float aa = a; volatile float aa = a;
volatile float bb = b; volatile float bb = b;
/* match range restrictions of the device function */ /* match range restrictions of the device function */
if (__cuda_fabsf(bb) > CUDART_TWO_TO_126_F) { if (__cuda_fabsf(bb) > CUDART_TWO_TO_126_F) {
if (__cuda_fabsf(aa) <= CUDART_NORM_HUGE_F) { if (__cuda_fabsf(aa) <= CUDART_NORM_HUGE_F) {
return ((aa / bb) / CUDART_NORM_HUGE_F) / CUDART_NORM_HUGE_F; return ((aa / bb) / CUDART_NORM_HUGE_F) / CUDART_NORM_HUGE_F;
} else { } else {
bb = 1.0f / bb; bb = 1.0f / bb;
bb = bb / CUDART_NORM_HUGE_F; bb = bb / CUDART_NORM_HUGE_F;
return aa * bb; return aa * bb;
} }
} else { } else {
return aa / bb; return aa / bb;
} }
#endif /* __MULTI_CORE__ */
} }
#endif /* !defined(__CUDABE__) */ #endif /* !defined(__CUDABE__) */
__device_func__(float __sinf(float a)) __device_func__(float __sinf(float a))
{ {
#if !defined(__CUDABE__) #if !defined(__CUDABE__)
if ((__float_as_int(a) << 1) == 0xff000000) { if ((__float_as_int(a) << 1) == 0xff000000) {
return __fadd_rn (a, -a); /* return NaN */ return __fadd_rn (a, -a); /* return NaN */
} }
#endif /* !defined(__CUDABE__) */ #endif /* !defined(__CUDABE__) */
skipping to change at line 4427 skipping to change at line 4458
__device_func__(float __log2f(float a)) __device_func__(float __log2f(float a))
{ {
return log2f(a); return log2f(a);
} }
/************************************************************************** ***** /************************************************************************** *****
* * * *
* SHARED HOST AND DEVICE IMPLEMENTATIONS * * SHARED HOST AND DEVICE IMPLEMENTATIONS *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
__device_func__(float __internal_accurate_fdividef(float a, float b))
{
return a / b;
}
__device_func__(float __tanf(float a)) __device_func__(float __tanf(float a))
{ {
#if defined(__MULTI_CORE__)
return tanf(a);
#else /* __MULTI_CORE__ */
return __fdividef (__sinf(a), __cosf(a)); return __fdividef (__sinf(a), __cosf(a));
#endif /* __MULTI_CORE__ */
} }
__device_func__(void __sincosf(float a, float *sptr, float *cptr)) __device_func__(void __sincosf(float a, float *sptr, float *cptr))
{ {
#if defined(__MULTI_CORE__)
sincosf(a, sptr, cptr);
#else /* __MULTI_CORE__ */
*sptr = __sinf(a); *sptr = __sinf(a);
*cptr = __cosf(a); *cptr = __cosf(a);
#endif /* __MULTI_CORE__ */
} }
__device_func__(float __expf(float a)) __device_func__(float __expf(float a))
{ {
#if defined(__MULTI_CORE__)
return expf(a);
#else /* __MULTI_CORE__ */
return __cuda_exp2f(a * CUDART_L2E_F); return __cuda_exp2f(a * CUDART_L2E_F);
#endif /* __MULTI_CORE__ */
} }
__device_func__(float __exp10f(float a)) __device_func__(float __exp10f(float a))
{ {
#if defined(__MULTI_CORE__)
return exp10f(a);
#else /* __MULTI_CORE__ */
return __cuda_exp2f(a * CUDART_L2T_F); return __cuda_exp2f(a * CUDART_L2T_F);
#endif /* __MULTI_CORE__ */
} }
__device_func__(float __log10f(float a)) __device_func__(float __log10f(float a))
{ {
#if defined(__MULTI_CORE__)
return log10f(a);
#else /* __MULTI_CORE__ */
return CUDART_LG2_F * __log2f(a); return CUDART_LG2_F * __log2f(a);
#endif /* __MULTI_CORE__ */
} }
__device_func__(float __logf(float a)) __device_func__(float __logf(float a))
{ {
#if defined(__MULTI_CORE__)
return logf(a);
#else /* __MULTI_CORE__ */
return CUDART_LN2_F * __log2f(a); return CUDART_LN2_F * __log2f(a);
#endif /* __MULTI_CORE__ */
} }
__device_func__(float __powf(float a, float b)) __device_func__(float __powf(float a, float b))
{ {
#if defined(__MULTI_CORE__)
return powf(a, b);
#else /* __MULTI_CORE__ */
return __cuda_exp2f(b * __log2f(a)); return __cuda_exp2f(b * __log2f(a));
#endif /* __MULTI_CORE__ */
} }
__device_func__(float fdividef(float a, float b)) __device_func__(float fdividef(float a, float b))
{ {
#if defined(__MULTI_CORE__) #if defined(__USE_FAST_MATH__) && !defined(__CUDA_PREC_DIV)
return a / b;
#elif defined(__USE_FAST_MATH__)
return __fdividef(a, b); return __fdividef(a, b);
#else /* __MULTI_CORE__ */ #else /* __USE_FAST_MATH__ && !__CUDA_PREC_DIV */
return __internal_accurate_fdividef(a, b); return a / b;
#endif /* __MULTI_CORE__ */ #endif /* __USE_FAST_MATH__ && !__CUDA_PREC_DIV */
} }
#if !defined(__CUDABE__) || (__CUDA_ARCH__ < 200)
__device_func__(int __clz(int a)) __device_func__(int __clz(int a))
{ {
return (a)?(158-(__float_as_int(__uint2float_rz((unsigned int)a))>>23)):3 2; return (a)?(158-(__float_as_int(__uint2float_rz((unsigned int)a))>>23)):3 2;
} }
__device_func__(int __clzll(long long int a)) __device_func__(int __clzll(long long int a))
{ {
int ahi = ((int)((unsigned long long)a >> 32)); int ahi = ((int)((unsigned long long)a >> 32));
int alo = ((int)((unsigned long long)a & 0xffffffffULL)); int alo = ((int)((unsigned long long)a & 0xffffffffULL));
int res; int res;
skipping to change at line 4553 skipping to change at line 4550
ahi = ahi - ((ahi >> 1) & 0x55555555); ahi = ahi - ((ahi >> 1) & 0x55555555);
ahi = (ahi & 0x33333333) + ((ahi >> 2) & 0x33333333); ahi = (ahi & 0x33333333) + ((ahi >> 2) & 0x33333333);
alo = alo + ahi; alo = alo + ahi;
alo = (alo & 0x0f0f0f0f) + ((alo >> 4) & 0x0f0f0f0f); alo = (alo & 0x0f0f0f0f) + ((alo >> 4) & 0x0f0f0f0f);
alo = ((__umul24(alo, 0x808080) << 1) + alo) >> 24; alo = ((__umul24(alo, 0x808080) << 1) + alo) >> 24;
return alo; return alo;
} }
__device_func__(unsigned int __brev(unsigned int a)) __device_func__(unsigned int __brev(unsigned int a))
{ {
a = ((a >> 1) & 0x55555555) + ((a & 0x55555555) << 1); /* Use Knuth's algorithm from http://www.hackersdelight.org/revisions.pdf
a = ((a >> 2) & 0x33333333) + ((a & 0x33333333) << 2); */
a = ((a >> 4) & 0x0F0F0F0F) + ((a & 0x0F0F0F0F) << 4); unsigned int t;
a = ((a >> 8) & 0x00FF00FF) + ((a & 0x00FF00FF) << 8); a = (a << 15) | (a >> 17);
a = ( a >> 16 ) + ( a << 16); t = (a ^ (a >> 10)) & 0x003f801f;
a = (t + (t << 10)) ^ a;
t = (a ^ (a >> 4)) & 0x0e038421;
a = (t + (t << 4)) ^ a;
t = (a ^ (a >> 2)) & 0x22488842;
a = (t + (t << 2)) ^ a;
return a; return a;
} }
__device_func__(unsigned long long int __brevll(unsigned long long int a)) __device_func__(unsigned long long int __brevll(unsigned long long int a))
{ {
unsigned int hi = (unsigned int)(a >> 32); unsigned int hi = (unsigned int)(a >> 32);
unsigned int lo = (unsigned int)(a & 0xffffffffULL); unsigned int lo = (unsigned int)(a & 0xffffffffULL);
unsigned int t; unsigned int t;
t = __brev(lo); t = __brev(lo);
lo = __brev(hi); lo = __brev(hi);
return ((unsigned long long int)t << 32) + (unsigned long long int)lo; return ((unsigned long long int)t << 32) + (unsigned long long int)lo;
} }
#endif /* __CUDABE__ || __CUDA_ARCH__ < 200 */
__device_func__(int __ffs(int a)) __device_func__(int __ffs(int a))
{ {
return 32 - __clz (a & -a); return 32 - __clz (a & -a);
} }
__device_func__(int __ffsll(long long int a)) __device_func__(int __ffsll(long long int a))
{ {
return 64 - __clzll (a & -a); return 64 - __clzll (a & -a);
} }
skipping to change at line 4669 skipping to change at line 4672
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#include "sm_11_atomic_functions.h" #include "sm_11_atomic_functions.h"
#include "sm_12_atomic_functions.h" #include "sm_12_atomic_functions.h"
#include "sm_13_double_functions.h" #include "sm_13_double_functions.h"
#include "sm_20_atomic_functions.h"
#include "sm_20_intrinsics.h"
#include "texture_fetch_functions.h" #include "texture_fetch_functions.h"
#endif /* !__DEVICE_FUNCTIONS_H__ */ #endif /* !__DEVICE_FUNCTIONS_H__ */
 End of changes. 52 change blocks. 
97 lines changed or deleted 107 lines changed or added


 device_launch_parameters.h   device_launch_parameters.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 End of changes. 1 change blocks. 
1 lines changed or deleted 1 lines changed or added


 device_runtime.h   device_runtime.h 
skipping to change at line 69 skipping to change at line 69
#if defined(__CUDABE__) /* cudabe compiler */ #if defined(__CUDABE__) /* cudabe compiler */
#define __pad__(f) #define __pad__(f)
#define __text__ \ #define __text__ \
__attribute__((__texture__)) __attribute__((__texture__))
#define __surf__ \ #define __surf__ \
__attribute__((__surface__)) __attribute__((__surface__))
#define ___device__(sc) \ #define ___device__(sc) \
static static
#define __in__(cdecl, decl) \ #define __in__(cdecl, decl) \
__shared__ cdecl cdecl
#define __in_type__(cdecl, decl) \ #define __in_type__(cdecl, decl) \
cdecl cdecl
#define __texture_var(name) \ #define __texture_var(name) \
name name
#define __shared_var(name, s, type) \ #define __shared_var(name, s, type) \
name name
#define __cuda_host_device_name(name) \
name
#define __val_param(name) \ #define __val_param(name) \
__val_param##name __val_param##name
#define __copy_param(local_decl, param) \ #define __copy_param(local_decl, param) \
local_decl = param local_decl = param
#define __unsized_array_size \ #define __unsized_array_size \
[] []
#define __unsized__shared_var(name, s, type) \ #define __unsized__shared_var(name, s, type) \
name name
#define __unsized__empty_array(s) \ #define __unsized__empty_array(s) \
s s
#define __var_used__ \ #define __var_used__ \
__attribute__((__used__)) __attribute__((__used__))
#define __storage_extern_unsized__shared__ \ #define __storage_extern_unsized__shared__ \
extern extern
#define __cxa_vec_util(n, num, size, f) \
int i; for (i = 0; i < num; i++) f(n + i)
#define __cxa_vec_ctor(n, num, size, c, d) \
({ __cxa_vec_util(n, num, size, c); (void)0; })
#define __cxa_vec_dtor(n, num, size, d) \
{ __cxa_vec_util(n, num, size, d); }
#undef __cdecl #undef __cdecl
#define __cdecl #define __cdecl
#undef __w64 #undef __w64
#define __w64 #define __w64
#elif defined(__CUDACC__) /* cudafe compiler */ #elif defined(__CUDACC__) /* cudafe compiler */
#define __loc_sc__(loc, size, sc) \ #define __loc_sc__(loc, size, sc) \
sc loc sc loc
skipping to change at line 115 skipping to change at line 119
#define ___device__(sc) \ #define ___device__(sc) \
sc __device__ sc __device__
#define __in__(cdecl, decl) \ #define __in__(cdecl, decl) \
decl decl
#define __in_type__(cdecl, decl) \ #define __in_type__(cdecl, decl) \
decl decl
#define __texture_var(name) \ #define __texture_var(name) \
name name
#define __shared_var(name, s, type) \ #define __shared_var(name, s, type) \
name name
#define __cuda_host_device_name(name) \
name
#define __val_param(name) \ #define __val_param(name) \
name name
#define __copy_param(local_decl, param) #define __copy_param(local_decl, param)
#define __unsized_array_size \ #define __unsized_array_size \
[] []
#define __unsized__shared_var(name, s, type) \ #define __unsized__shared_var(name, s, type) \
name name
#define __unsized__empty_array(s) \ #define __unsized__empty_array(s) \
s s
skipping to change at line 240 skipping to change at line 242
#endif /* __APPLE__ || __ICC */ #endif /* __APPLE__ || __ICC */
#endif /* __MULTI_CORE__ */ #endif /* __MULTI_CORE__ */
#define __in__(cdecl, decl) \ #define __in__(cdecl, decl) \
decl decl
#define __in_type__(cdecl, decl) \ #define __in_type__(cdecl, decl) \
decl decl
#define __texture_var(name) \ #define __texture_var(name) \
__texture_##name __texture_##name
#define __cuda_host_device_name(name) \
__cuda_host_device_##name
#define __val_param(name) \ #define __val_param(name) \
name name
#define __copy_param(local_decl, param) #define __copy_param(local_decl, param)
#define __unsized_array_size #define __unsized_array_size
#define __unsized__shared_var(name, s, type) \ #define __unsized__shared_var(name, s, type) \
(*name) (*name)
#define __unsized__empty_array(s) #define __unsized__empty_array(s)
#define __cxa_vec_ctor(n, num, size, c, d) \
__cxa_vec_util((void*)n, num, size, (void (*)(void*))c)
#define __cxa_vec_dtor(n, num, size, d) \
__cxa_vec_util((void*)n, num, size, (void (*)(void*))d)
static void __cxa_vec_util(void *n, size_t num, size_t size, void (*f)(void
*))
{
size_t i;
for (i = 0; i < num; i++) {
f((void*)((char*)n + i * size));
}
}
/* this is compiled with a host compiler for device emulation */ /* this is compiled with a host compiler for device emulation */
#define __device_emulation #define __device_emulation
#if defined(__cplusplus) #if defined(__cplusplus)
#undef __VECTOR_TYPES_H__ #undef __VECTOR_TYPES_H__
#if defined(_WIN32) #if defined(_WIN32)
 End of changes. 6 change blocks. 
7 lines changed or deleted 21 lines changed or added


 device_types.h   device_types.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 End of changes. 1 change blocks. 
1 lines changed or deleted 1 lines changed or added


 driver_functions.h   driver_functions.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 End of changes. 1 change blocks. 
1 lines changed or deleted 1 lines changed or added


 driver_types.h   driver_types.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 76 skipping to change at line 76
#define cudaHostAllocWriteCombined 4 ///< Write-combined memory #define cudaHostAllocWriteCombined 4 ///< Write-combined memory
#define cudaEventDefault 0 ///< Default event flag #define cudaEventDefault 0 ///< Default event flag
#define cudaEventBlockingSync 1 ///< Event uses blocking synchroniz ation #define cudaEventBlockingSync 1 ///< Event uses blocking synchroniz ation
#define cudaDeviceScheduleAuto 0 ///< Device flag - Automatic schedu ling #define cudaDeviceScheduleAuto 0 ///< Device flag - Automatic schedu ling
#define cudaDeviceScheduleSpin 1 ///< Device flag - Spin default sch eduling #define cudaDeviceScheduleSpin 1 ///< Device flag - Spin default sch eduling
#define cudaDeviceScheduleYield 2 ///< Device flag - Yield default sc heduling #define cudaDeviceScheduleYield 2 ///< Device flag - Yield default sc heduling
#define cudaDeviceBlockingSync 4 ///< Device flag - Use blocking syn chronization #define cudaDeviceBlockingSync 4 ///< Device flag - Use blocking syn chronization
#define cudaDeviceMapHost 8 ///< Device flag - Support mapped p inned allocations #define cudaDeviceMapHost 8 ///< Device flag - Support mapped p inned allocations
#define cudaDeviceMask 0xf ///< Device flags mask #define cudaDeviceLmemResizeToMax 16 ///< Device flag - Keep local memor
y allocation after launch
#define cudaDeviceMask 0x1f ///< Device flags mask
#endif /* !__CUDA_INTERNAL_COMPILATION__ */ #endif /* !__CUDA_INTERNAL_COMPILATION__ */
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
/** /**
skipping to change at line 130 skipping to change at line 131
cudaErrorMixedDeviceExecution = 28, ///< Mixed device execu tion cudaErrorMixedDeviceExecution = 28, ///< Mixed device execu tion
cudaErrorCudartUnloading = 29, ///< CUDA runtime unloa ding cudaErrorCudartUnloading = 29, ///< CUDA runtime unloa ding
cudaErrorUnknown = 30, ///< Unknown error cond ition cudaErrorUnknown = 30, ///< Unknown error cond ition
cudaErrorNotYetImplemented = 31, ///< Function not yet i mplemented cudaErrorNotYetImplemented = 31, ///< Function not yet i mplemented
cudaErrorMemoryValueTooLarge = 32, ///< Memory value too l arge cudaErrorMemoryValueTooLarge = 32, ///< Memory value too l arge
cudaErrorInvalidResourceHandle = 33, ///< Invalid resource h andle cudaErrorInvalidResourceHandle = 33, ///< Invalid resource h andle
cudaErrorNotReady = 34, ///< Not ready error cudaErrorNotReady = 34, ///< Not ready error
cudaErrorInsufficientDriver = 35, ///< CUDA runtime is ne wer than driver cudaErrorInsufficientDriver = 35, ///< CUDA runtime is ne wer than driver
cudaErrorSetOnActiveProcess = 36, ///< Set on active proc ess error cudaErrorSetOnActiveProcess = 36, ///< Set on active proc ess error
cudaErrorNoDevice = 38, ///< No available CUDA device cudaErrorNoDevice = 38, ///< No available CUDA device
cudaErrorECCUncorrectable = 39, ///< Uncorrectable ECC error detected
cudaErrorStartupFailure = 0x7f, ///< Startup failure cudaErrorStartupFailure = 0x7f, ///< Startup failure
cudaErrorApiFailureBase = 10000 ///< API failure base cudaErrorApiFailureBase = 10000 ///< API failure base
}; };
/** /**
* Channel format kind * Channel format kind
*/ */
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
enum cudaChannelFormatKind enum cudaChannelFormatKind
{ {
skipping to change at line 179 skipping to change at line 181
enum cudaMemcpyKind enum cudaMemcpyKind
{ {
cudaMemcpyHostToHost = 0, ///< Host -> Host cudaMemcpyHostToHost = 0, ///< Host -> Host
cudaMemcpyHostToDevice = 1, ///< Host -> Device cudaMemcpyHostToDevice = 1, ///< Host -> Device
cudaMemcpyDeviceToHost = 2, ///< Device -> Host cudaMemcpyDeviceToHost = 2, ///< Device -> Host
cudaMemcpyDeviceToDevice = 3 ///< Device -> Device cudaMemcpyDeviceToDevice = 3 ///< Device -> Device
}; };
/** /**
* CUDA Pitched memory pointer * CUDA Pitched memory pointer
* \sa ::make_cudaPitchedPtr
*/ */
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct cudaPitchedPtr struct cudaPitchedPtr
{ {
void *ptr; ///< Pointer to allocated memory void *ptr; ///< Pointer to allocated memory
size_t pitch; ///< Pitch of allocated memory in bytes size_t pitch; ///< Pitch of allocated memory in bytes
size_t xsize; ///< Logical width of allocation in elements size_t xsize; ///< Logical width of allocation in elements
size_t ysize; ///< Logical height of allocation in elements size_t ysize; ///< Logical height of allocation in elements
}; };
/** /**
* CUDA extent * CUDA extent
* \sa ::make_cudaExtent
*/ */
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct cudaExtent struct cudaExtent
{ {
size_t width; ///< Width in bytes size_t width; ///< Width in bytes
size_t height; ///< Height in bytes size_t height; ///< Height in bytes
size_t depth; ///< Depth in bytes size_t depth; ///< Depth in bytes
}; };
/** /**
* CUDA 3D position * CUDA 3D position
* \sa ::make_cudaPos
*/ */
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct cudaPos struct cudaPos
{ {
size_t x; ///< x size_t x; ///< x
size_t y; ///< y size_t y; ///< y
size_t z; ///< z size_t z; ///< z
}; };
/** /**
skipping to change at line 230 skipping to change at line 235
struct cudaArray *dstArray; ///< Destination memory address struct cudaArray *dstArray; ///< Destination memory address
struct cudaPos dstPos; ///< Destination position offset struct cudaPos dstPos; ///< Destination position offset
struct cudaPitchedPtr dstPtr; ///< Pitched destination memory address struct cudaPitchedPtr dstPtr; ///< Pitched destination memory address
struct cudaExtent extent; ///< Requested memory copy size struct cudaExtent extent; ///< Requested memory copy size
enum cudaMemcpyKind kind; ///< Type of transfer enum cudaMemcpyKind kind; ///< Type of transfer
}; };
/** /**
* CUDA graphics interop resource
*/
/*DEVICE_BUILTIN*/
struct cudaGraphicsResource;
/**
* CUDA graphics interop register flags
*/
/*DEVICE_BUILTIN*/
enum cudaGraphicsRegisterFlags
{
cudaGraphicsRegisterFlagsNone = 0, ///< Default
};
/**
* CUDA graphics interop map flags
*/
/*DEVICE_BUILTIN*/
enum cudaGraphicsMapFlags
{
cudaGraphicsMapFlagsNone = 0, ///< Default; Assume resource can
be read/written
cudaGraphicsMapFlagsReadOnly = 1, ///< CUDA will not write to this r
esource
cudaGraphicsMapFlagsWriteDiscard = 2, ///< CUDA will only write to and w
ill not read from this resource
};
/**
* CUDA graphics interop array indices for cube maps
*/
/*DEVICE_BUILTIN*/
enum cudaGraphicsCubeFace {
cudaGraphicsCubeFacePositiveX = 0x00, ///< Positive X face of cubemap
cudaGraphicsCubeFaceNegativeX = 0x01, ///< Negative X face of cubemap
cudaGraphicsCubeFacePositiveY = 0x02, ///< Positive Y face of cubemap
cudaGraphicsCubeFaceNegativeY = 0x03, ///< Negative Y face of cubemap
cudaGraphicsCubeFacePositiveZ = 0x04, ///< Positive Z face of cubemap
cudaGraphicsCubeFaceNegativeZ = 0x05, ///< Negative Z face of cubemap
};
/**
* CUDA function attributes * CUDA function attributes
*/ */
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct cudaFuncAttributes struct cudaFuncAttributes
{ {
size_t sharedSizeBytes; ///< Size of shared memory in bytes size_t sharedSizeBytes; ///< Size of shared memory in bytes
size_t constSizeBytes; ///< Size of constant memory in bytes size_t constSizeBytes; ///< Size of constant memory in bytes
size_t localSizeBytes; ///< Size of local memory in bytes size_t localSizeBytes; ///< Size of local memory in bytes
int maxThreadsPerBlock; ///< Maximum number of threads per block int maxThreadsPerBlock; ///< Maximum number of threads per block
int numRegs; ///< Number of registers used int numRegs; ///< Number of registers used
int __cudaReserved[8]; /** \brief PTX virtual architecture version for which the function was
* compiled. This value is the major PTX version * 10 + the minor PTX
* version, so a PTX version 1.3 function would return the value 13.
* For device emulation kernels, this is set to 9999.
*/
int ptxVersion;
/** \brief Binary architecture version for which the function was compil
ed.
* This value is the major binary version * 10 + the minor binary versi
on,
* so a binary version 1.3 function would return the value 13.
* For device emulation kernels, this is set to 9999.
*/
int binaryVersion;
int __cudaReserved[6];
};
/**
* CUDA function cache configurations
*/
/*DEVICE_BUILTIN*/
enum cudaFuncCache
{
cudaFuncCachePreferNone = 0, ///< Default function cache configurati
on, no preference
cudaFuncCachePreferShared = 1, ///< Prefer larger shared memory and sm
aller L1 cache
cudaFuncCachePreferL1 = 2 ///< Prefer larger L1 cache and smaller
shared memory
}; };
/** /**
* CUDA device compute modes * CUDA device compute modes
*/ */
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
enum cudaComputeMode enum cudaComputeMode
{ {
cudaComputeModeDefault = 0, ///< Default compute mode (Multiple thr eads can use ::cudaSetDevice() with this device) cudaComputeModeDefault = 0, ///< Default compute mode (Multiple thr eads can use ::cudaSetDevice() with this device)
cudaComputeModeExclusive = 1, ///< Compute-exclusive mode (Only one t hread will be able to use ::cudaSetDevice() with this device) cudaComputeModeExclusive = 1, ///< Compute-exclusive mode (Only one t hread will be able to use ::cudaSetDevice() with this device)
skipping to change at line 280 skipping to change at line 347
size_t totalConstMem; ///< Constant memory available on devic e in bytes size_t totalConstMem; ///< Constant memory available on devic e in bytes
int major; ///< Major compute capability int major; ///< Major compute capability
int minor; ///< Minor compute capability int minor; ///< Minor compute capability
size_t textureAlignment; ///< Alignment requirement for textures size_t textureAlignment; ///< Alignment requirement for textures
int deviceOverlap; ///< Device can concurrently copy memor y and execute a kernel int deviceOverlap; ///< Device can concurrently copy memor y and execute a kernel
int multiProcessorCount; ///< Number of multiprocessors on devic e int multiProcessorCount; ///< Number of multiprocessors on devic e
int kernelExecTimeoutEnabled; ///< Specified whether there is a run t ime limit on kernels int kernelExecTimeoutEnabled; ///< Specified whether there is a run t ime limit on kernels
int integrated; ///< Device is integrated as opposed to discrete int integrated; ///< Device is integrated as opposed to discrete
int canMapHostMemory; ///< Device can map host memory with cu daHostAlloc/cudaHostGetDevicePointer int canMapHostMemory; ///< Device can map host memory with cu daHostAlloc/cudaHostGetDevicePointer
int computeMode; ///< Compute mode (See ::cudaComputeMod e) int computeMode; ///< Compute mode (See ::cudaComputeMod e)
int __cudaReserved[36]; int maxTexture1D; ///< Maximum 1D texture size
int maxTexture2D[2]; ///< Maximum 2D texture dimensions
int maxTexture3D[3]; ///< Maximum 3D texture dimensions
int maxTexture2DArray[3]; ///< Maximum 2D texture array dimension
s
int concurrentKernels; ///< Device can possibly execute multip
le kernels concurrently
int __cudaReserved[26];
}; };
#define cudaDevicePropDontCare \ #define cudaDevicePropDontCare \
{ \ { \
{'\0'}, /* char name[256]; */ \ {'\0'}, /* char name[256]; */ \
0, /* size_t totalGlobalMem; */ \ 0, /* size_t totalGlobalMem; */ \
0, /* size_t sharedMemPerBlock; */ \ 0, /* size_t sharedMemPerBlock; */ \
0, /* int regsPerBlock; */ \ 0, /* int regsPerBlock; */ \
0, /* int warpSize; */ \ 0, /* int warpSize; */ \
0, /* size_t memPitch; */ \ 0, /* size_t memPitch; */ \
skipping to change at line 305 skipping to change at line 377
0, /* size_t totalConstMem; */ \ 0, /* size_t totalConstMem; */ \
-1, /* int major; */ \ -1, /* int major; */ \
-1, /* int minor; */ \ -1, /* int minor; */ \
0, /* size_t textureAlignment; */ \ 0, /* size_t textureAlignment; */ \
-1, /* int deviceOverlap; */ \ -1, /* int deviceOverlap; */ \
0, /* int multiProcessorCount; */ \ 0, /* int multiProcessorCount; */ \
0, /* int kernelExecTimeoutEnabled */ \ 0, /* int kernelExecTimeoutEnabled */ \
0, /* int integrated */ \ 0, /* int integrated */ \
0, /* int canMapHostMemory */ \ 0, /* int canMapHostMemory */ \
0, /* int computeMode */ \ 0, /* int computeMode */ \
0, /* int maxTexture1D */ \
{0, 0}, /* int maxTexture2D[2] */ \
{0, 0, 0}, /* int maxTexture3D[3] */ \
{0, 0, 0}, /* int maxTexture2DArray[3] */ \
0 /* int concurrentKernels */ \
} ///< Empty device properties } ///< Empty device properties
/************************************************************************** ***** /************************************************************************** *****
* * * *
* SHORTHAND TYPE DEFINITION USED BY RUNTIME API * * SHORTHAND TYPE DEFINITION USED BY RUNTIME API *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
/** /**
* CUDA Error types * CUDA Error types
 End of changes. 10 change blocks. 
4 lines changed or deleted 92 lines changed or added


 func_macro.h   func_macro.h 
skipping to change at line 57 skipping to change at line 57
#if !defined(__CUDA_INTERNAL_COMPILATION__) #if !defined(__CUDA_INTERNAL_COMPILATION__)
#error -- incorrect inclusion of a cudart header file #error -- incorrect inclusion of a cudart header file
#endif /* !__CUDA_INTERNAL_COMPILATION__ */ #endif /* !__CUDA_INTERNAL_COMPILATION__ */
#if defined(__cplusplus) && defined(__device_emulation) && !defined(__multi _core__) #if defined(__cplusplus) && defined(__device_emulation) && !defined(__multi _core__)
#define __begin_host_func \ #define __begin_host_func \
}} }
#define __end_host_func \ #define __end_host_func \
namespace __cuda_emu { extern "C" { namespace __cuda_emu {
#define __host_device_call(f) \ #define __host_device_call(f) \
__cuda_emu::f __cuda_emu::f
#else /* __cplusplus && __device_emulation && !__multi_core__ */ #else /* __cplusplus && __device_emulation && !__multi_core__ */
#define __begin_host_func #define __begin_host_func
#define __end_host_func #define __end_host_func
#define __host_device_call(f) \ #define __host_device_call(f) \
f f
 End of changes. 2 change blocks. 
2 lines changed or deleted 2 lines changed or added


 host_config.h   host_config.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 52 skipping to change at line 52
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#if defined(__CUDACC__) #if defined(__CUDACC__)
#if defined(__APPLE__) #if defined(__APPLE__)
#define _CRTIMP #define _CRTIMP
#define __THROW #define __THROW
#if defined(__MULTI_CORE__) #if defined(__BLOCKS__)
#error multicore not supported for MacOs #undef __BLOCKS__
#endif /* __MULTI_CORE__ */ #endif /* __BLOCKS__ */
#elif defined(__GNUC__) #elif defined(__GNUC__)
#define _CRTIMP #define _CRTIMP
#if defined(__MULTI_CORE__) && __GNUC__ > 3 #include <features.h> /* for __THROW */
#error multicore not supported for gcc 4.x
#endif /* __MULTI_CORE__ & __GNUC__ > 3 */
#include <features.h> /* for __THROW */
#include <bits/c++config.h> /* for _GLIBCXX_ATOMIC_BUILTINS */
#if _GLIBCXX_ATOMIC_BUILTINS == 1
#undef _GLIBCXX_ATOMIC_BUILTINS /* for missing __sync_fetch_and_add */
#endif /* _GLIBCXX_ATOMIC_BUILTINS == 1 */
#elif defined(_WIN32) #elif defined(_WIN32)
#if defined(__MULTI_CORE__) && _MSC_VER != 1400 #if _MSC_VER >= 1400
#error multicore support available only for VC8
#endif /* __MULTI_CORE__ & _MSC_VER != 1400 */
#if _MSC_VER >= 1500 #if _MSC_VER >= 1500
#undef _USE_DECLSPECS_FOR_SAL #undef _USE_DECLSPECS_FOR_SAL
#define _USE_DECLSPECS_FOR_SAL \ #define _USE_DECLSPECS_FOR_SAL \
1 1
#endif /* _MSC_VER >= 1500 */ #endif /* _MSC_VER >= 1500 */
#if _MSC_VER >= 1400
#if !defined(_CRT_NONSTDC_NO_WARNINGS) #if !defined(_CRT_NONSTDC_NO_WARNINGS)
#define _CRT_NONSTDC_NO_WARNINGS /* to suppress warnings */ #define _CRT_NONSTDC_NO_WARNINGS /* to suppress warnings */
#endif /* _CRT_NONSTDC_NO_WARNINGS */ #endif /* !_CRT_NONSTDC_NO_WARNINGS */
#if !defined(_CRT_SECURE_NO_WARNINGS) #if !defined(_CRT_SECURE_NO_WARNINGS)
#define _CRT_SECURE_NO_WARNINGS /* to suppress warnings */ #define _CRT_SECURE_NO_WARNINGS /* to suppress warnings */
#endif /* _CRT_SECURE_NO_WARNINGS */ #endif /* !_CRT_SECURE_NO_WARNINGS */
#endif /* _MSC_VER >= 1400 */ #endif /* _MSC_VER >= 1400 */
#if !defined(NOMINMAX) #if !defined(NOMINMAX)
#define NOMINMAX /* min and max are part of cuda runtime */ #define NOMINMAX /* min and max are part of cuda runtime */
#endif /* !NOMINMAX */ #endif /* !NOMINMAX */
#include <crtdefs.h> /* for _CRTIMP */ #include <crtdefs.h> /* for _CRTIMP */
 End of changes. 9 change blocks. 
27 lines changed or deleted 8 lines changed or added


 host_defines.h   host_defines.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 46 skipping to change at line 46
#if !defined(__HOST_DEFINES_H__) #if !defined(__HOST_DEFINES_H__)
#define __HOST_DEFINES_H__ #define __HOST_DEFINES_H__
#if !defined(__GNUC__) && !defined(_WIN32) #if !defined(__GNUC__) && !defined(_WIN32)
#error --- !!! UNSUPPORTED COMPILER !!! --- #error --- !!! UNSUPPORTED COMPILER !!! ---
#elif defined(__GNUC__) #elif defined(__GNUC__)
#define __no_return__ \ #define __no_return__ \
__attribute__((__noreturn__)) __attribute__((noreturn))
#define __noinline__ \ #define __noinline__ \
__attribute__((__noinline__)) __attribute__((noinline))
#define __forceinline__ \ #define __forceinline__ \
__inline__ __attribute__((__always_inline__)) __inline__ __attribute__((always_inline))
#define __align__(n) \ #define __align__(n) \
__attribute__((__aligned__(n))) __attribute__((aligned(n)))
#define __thread__ \ #define __thread__ \
__thread __thread
#define __import__ #define __import__
#define __export__ #define __export__
#define __cdecl
#define __annotate__(a) \
__attribute__((a))
#define __location__(a) \ #define __location__(a) \
__loc__(__attribute__((a))) __annotate__(a)
#define CUDARTAPI #define CUDARTAPI
#elif defined(_WIN32) #elif defined(_WIN32)
#if _MSC_VER >= 1400 #if _MSC_VER >= 1400
#define __restrict__ \ #define __restrict__ \
__restrict __restrict
#else /* _MSC_VER >= 1400 */ #else /* _MSC_VER >= 1400 */
skipping to change at line 90 skipping to change at line 93
#define __forceinline__ \ #define __forceinline__ \
__forceinline __forceinline
#define __align__(n) \ #define __align__(n) \
__declspec(align(n)) __declspec(align(n))
#define __thread__ \ #define __thread__ \
__declspec(thread) __declspec(thread)
#define __import__ \ #define __import__ \
__declspec(dllimport) __declspec(dllimport)
#define __export__ \ #define __export__ \
__declspec(dllexport) __declspec(dllexport)
#define __annotate__(a) \
__declspec(a)
#define __location__(a) \ #define __location__(a) \
__loc__(__declspec(a)) __annotate__(__##a##__)
#define CUDARTAPI \ #define CUDARTAPI \
__stdcall __stdcall
#endif /* !__GNUC__ && !_WIN32 */ #endif /* !__GNUC__ && !_WIN32 */
#if defined(__CUDACC__) || defined(__CUDABE__) || defined (__MULTI_CORE__) #if !defined(__CUDACC__) && !defined(__CUDABE__)
#define __loc__(a) \
a
#else /* __CUDACC__ || __CUDABE__ || __MULTI_CORE__ */
#define __loc__(a) #undef __annotate__
#define __annotate__(a)
#endif /* __CUDACC__ || __CUDABE__ || __MULTI_CORE__ */ #endif /* !__CUDACC__ && !__CUDABE__ */
#if defined(__CUDACC__) || defined(__CUDABE__) || defined (__MULTI_CORE__) || \ #if defined(__CUDACC__) || defined(__CUDABE__) || \
defined(__GNUC__) || defined(_WIN64) defined(__GNUC__) || defined(_WIN64)
#define __builtin_align__(a) \ #define __builtin_align__(a) \
__align__(a) __align__(a)
#else /* __CUDACC__ || __CUDABE__ || __MULTI_CORE__ || __GNUC__ || _WIN64 * / #else /* __CUDACC__ || __CUDABE__ || __GNUC__ || _WIN64 */
#define __builtin_align__(a) #define __builtin_align__(a)
#endif /* __CUDACC__ || __CUDABE__ || __MULTI_CORE__ || __GNUC__ || _WIN64 */ #endif /* __CUDACC__ || __CUDABE__ || __GNUC__ || _WIN64 */
#define __device__ \ #define __device__ \
__location__(__device__) __location__(device)
#define __host__ \ #define __host__ \
__location__(__host__) __location__(host)
#define __global__ \ #define __global__ \
__location__(__global__) __location__(global)
#define __shared__ \ #define __shared__ \
__location__(__shared__) __location__(shared)
#define __constant__ \ #define __constant__ \
__location__(__constant__) __location__(constant)
#define __launch_bounds__(...) \ #define __launch_bounds__(...) \
__location__(__launch_bounds__(__VA_ARGS__)) __annotate__(launch_bounds(__VA_ARGS__))
#endif /* !__HOST_DEFINES_H__ */ #endif /* !__HOST_DEFINES_H__ */
 End of changes. 21 change blocks. 
24 lines changed or deleted 25 lines changed or added


 host_runtime.h   host_runtime.h 
skipping to change at line 48 skipping to change at line 48
#define __CUDA_INTERNAL_COMPILATION__ #define __CUDA_INTERNAL_COMPILATION__
#define __glob_pref_var(var) \ #define __glob_pref_var(var) \
__global_##var __global_##var
#define __global_var(var) \ #define __global_var(var) \
(*__glob_pref_var(var)) (*__glob_pref_var(var))
#define __shadow_var(c, cpp) \ #define __shadow_var(c, cpp) \
__shadow_pref_var(c, cpp) __shadow_pref_var(c, cpp)
#define __text__ #define __text__
#define __surf__ #define __surf__
#define __dv(v) #define __dv(v)
#define __name__shadow_var(c, cpp) \
__pick(#c, #cpp)
#define __name__text_var(c, cpp) \
__pick(#c, #cpp)
#define __shadow_pref_var(c, cpp) \
__pick(c##__cuda_shadow_variable__, cpp##__cuda_shadow_variable__)
#define __device_stub_name(c, cpp) \
__pick(c, cpp)
#define __text_var(c, cpp) \
__pick(c, cpp)
#define __cppref__ \
__pick(, &)
#if defined(_WIN32) && !defined(_WIN64) #if defined(_WIN32) && !defined(_WIN64)
#define __pad__(f) \ #define __pad__(f) \
f f
#else /* _WIN32 && !_WIN64 */ #else /* _WIN32 && !_WIN64 */
#define __pad__(f) #define __pad__(f)
skipping to change at line 73 skipping to change at line 85
__weak_import__, __weak_import__,
#elif defined(__GNUC__) #elif defined(__GNUC__)
#define __extern_weak__ #define __extern_weak__
#endif /* __APPLE__ */ #endif /* __APPLE__ */
#if defined(__cplusplus) #if defined(__cplusplus)
#define __shadow_pref_var(c, cpp) \ #define __pick(c, cpp) \
cpp##__cuda_shadow_variable__
#define __device_stub_name(c, cpp) \
cpp
#define __text_var(c, cpp) \
cpp cpp
#define __cppref__ \
&
#else /* __cplusplus */ #else /* __cplusplus */
#define __shadow_pref_var(c, cpp) \ #define __pick(c, cpp) \
c##__cuda_shadow_variable__
#define __device_stub_name(c, cpp) \
c
#define __text_var(c, cpp) \
c c
#define __cppref__
typedef char bool; typedef char bool;
#endif /* __cplusplus */ #endif /* __cplusplus */
#if !defined(__GNUC__) || __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3) #if !defined(__GNUC__) || __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3)
#define __specialization_static \ #define __specialization_static \
static static
skipping to change at line 128 skipping to change at line 129
__cudaRegisterVar(__cudaFatCubinHandle, (char*)&__host##var, (char* )__device##var, __name##var, ext, size, constant, global) __cudaRegisterVar(__cudaFatCubinHandle, (char*)&__host##var, (char* )__device##var, __name##var, ext, size, constant, global)
#define __cudaRegisterGlobalTexture(tex, dim, norm, ext) \ #define __cudaRegisterGlobalTexture(tex, dim, norm, ext) \
__cudaRegisterTexture(__cudaFatCubinHandle, (const struct textureRe ference*)&tex, __tex_var(tex), __name##tex, dim, norm, ext) __cudaRegisterTexture(__cudaFatCubinHandle, (const struct textureRe ference*)&tex, __tex_var(tex), __name##tex, dim, norm, ext)
#define __cudaRegisterGlobalSurface(surf, dim, ext) \ #define __cudaRegisterGlobalSurface(surf, dim, ext) \
__cudaRegisterSurface(__cudaFatCubinHandle, (const struct surfaceRe ference*)&surf, __tex_var(surf), __name##surf, dim, ext) __cudaRegisterSurface(__cudaFatCubinHandle, (const struct surfaceRe ference*)&surf, __tex_var(surf), __name##surf, dim, ext)
#define __cudaRegisterUnsizedShared(var) \ #define __cudaRegisterUnsizedShared(var) \
__cudaRegisterShared(__cudaFatCubinHandle, (void**)__device_var(var )) __cudaRegisterShared(__cudaFatCubinHandle, (void**)__device_var(var ))
#define __cudaRegisterSharedVariable(var, size, align, sc) \ #define __cudaRegisterSharedVariable(var, size, align, sc) \
__cudaRegisterSharedVar(__cudaFatCubinHandle, (void**)__device_var( var), size, align, sc) __cudaRegisterSharedVar(__cudaFatCubinHandle, (void**)__device_var( var), size, align, sc)
#define __cudaRegisterEntry(funptr, fun, thread_limit) \ #define __cudaRegisterEntry(funptr, fun, thread_limit) \
__cudaRegisterFunction(__cudaFatCubinHandle, (const char*)funptr, ( char*)__device_fun(fun), #fun, thread_limit, __ids) __cudaRegisterFunction(__cudaFatCubinHandle, (const char*)funptr, ( char*)__device_fun(fun), #fun, __cuda_tl__(thread_limit), __ids)
#define __cudaInitArgBlock(arg) \ #define __cudaInitArgBlock(arg) \
*(void**)(void*)&arg = (void*)0 *(void**)(void*)&arg = (void*)0
#define __cudaSetupArg(arg, offset) \ #define __cudaSetupArg(arg, offset) \
if (cudaSetupArgument((void*)(char*)&arg, sizeof(arg), (size_t)&off set->arg) != cudaSuccess) \ if (cudaSetupArgument((void*)(char*)&arg, sizeof(arg), (size_t)&off set->arg) != cudaSuccess) \
return return
#define __cudaLaunch(fun) \ #define __cudaLaunch(fun) \
{ volatile static char *__f; __f = fun; (void)cudaLaunch(fun); } { volatile static char *__f; __f = fun; (void)cudaLaunch(fun); }
#if defined(__cplusplus) #if defined(__cplusplus)
skipping to change at line 216 skipping to change at line 217
#if defined(__cplusplus) #if defined(__cplusplus)
} }
#endif /* __cplusplus */ #endif /* __cplusplus */
#if defined(__GNUC__) && defined(__cplusplus) #if defined(__GNUC__) && defined(__cplusplus)
extern int atexit(void(*)(void)) throw(); extern int atexit(void(*)(void)) throw();
#else /* __GNUC__ && __cplusplus */ #else /* __GNUC__ && __cplusplus */
extern int atexit(void(*)(void)); extern int __cdecl atexit(void(__cdecl *)(void));
#endif /* __GNUC__ && __cplusplus */ #endif /* __GNUC__ && __cplusplus */
static void **__cudaFatCubinHandle; static void **__cudaFatCubinHandle;
static void __cudaUnregisterBinaryUtil(void) static void __cdecl __cudaUnregisterBinaryUtil(void)
{ {
__cudaUnregisterFatBinary(__cudaFatCubinHandle); __cudaUnregisterFatBinary(__cudaFatCubinHandle);
} }
#if defined(__device_emulation) #if defined(__device_emulation)
#if defined(__cplusplus) && !defined(__multi_core__) #if defined(__cplusplus) && !defined(__multi_core__)
#define __cuda_emu__ \ #define __cuda_emu__ \
__cuda_emu:: __cuda_emu::
#else /* __cplusplus */ #else /* __cplusplus */
#define __cuda_emu__ #define __cuda_emu__
#endif /* __cplusplus */ #endif /* __cplusplus */
#define __device_fun(fun) \ #define __device_fun(fun) \
__cuda_emu__ __device_wrapper_##fun __cuda_emu__ __device_wrapper_##fun
#define __device_var(var) \ #define __device_var(var) \
(char*)&__cuda_emu__ var &__cuda_emu__ var
#define __tex_var(var) \ #define __tex_var(var) \
&__cuda_emu__ __texture_var(var) &__cuda_emu__ __texture_var(var)
#define __cudaFatCubin \ #define __cudaFatCubin \
0 0
#define __cuda_tl__(l) \
l
#if defined(__multi_core__) #if defined(__multi_core__)
#define __ids \ #define __ids \
(uint3*)0, (uint3*)0, &blockDim, &gridDim, &warpSize (uint3*)0, (uint3*)0, &blockDim, &gridDim, &warpSize
#else /* __multi_core__ */ #else /* __multi_core__ */
#define __ids \ #define __ids \
(uint3*)&__cuda_emu__ threadIdx, (uint3*)&__cuda_emu__ blockIdx, (d im3*)&__cuda_emu__ blockDim, (dim3*)&__cuda_emu__ gridDim, &__cuda_emu__ wa rpSize (uint3*)&__cuda_emu__ threadIdx, (uint3*)&__cuda_emu__ blockIdx, (d im3*)&__cuda_emu__ blockDim, (dim3*)&__cuda_emu__ gridDim, &__cuda_emu__ wa rpSize
skipping to change at line 270 skipping to change at line 273
#else /* __device_emulation */ #else /* __device_emulation */
#define __device_fun(fun) \ #define __device_fun(fun) \
#fun #fun
#define __device_var(var) \ #define __device_var(var) \
#var #var
#define __tex_var(var) \ #define __tex_var(var) \
0 0
#define __cudaFatCubin \ #define __cudaFatCubin \
(&__fatDeviceText) &__fatDeviceText
#define __cuda_tl__(l) \
-1
#define __ids \ #define __ids \
(uint3*)0, (uint3*)0, (dim3*)0, (dim3*)0, (int*)0 (uint3*)0, (uint3*)0, (dim3*)0, (dim3*)0, (int*)0
#include "common_functions.h" #include "common_functions.h"
#endif /* __device_emulation */ #endif /* __device_emulation */
/* UTILITY MACROS */ /* UTILITY MACROS */
#define __device__global_var(var) \ #define __device__global_var(var) \
__device_var(var) __device_var(var)
#define __name__global_var(var) \ #define __name__global_var(var) \
#var #var
#define __host__global_var(var) \ #define __host__global_var(var) \
__glob_pref_var(var) __glob_pref_var(var)
#define __device__shadow_var(c, cpp) \ #define __device__shadow_var(c, cpp) \
__device_var(c) __device_var(c)
#define __name__shadow_var(c, cpp) \
#c
#define __name__text_var(c, cpp) \
#c
#define __host__shadow_var(c, cpp) \ #define __host__shadow_var(c, cpp) \
__shadow_pref_var(c, cpp) __shadow_pref_var(c, cpp)
#if defined(_WIN32) && defined(__cplusplus) #if defined(_WIN32)
#if defined(__cplusplus)
#pragma warning(disable: 4099) #pragma warning(disable: 4099)
#endif /* _WIN32 && __cplusplus */ #endif /* __cplusplus */
#if !defined(_WIN64)
#pragma warning(disable: 4408)
#endif /* !_WIN64 */
#endif /* _WIN32 */
#endif /* !__CUDA_INTERNAL_COMPILATION__ */ #endif /* !__CUDA_INTERNAL_COMPILATION__ */
 End of changes. 14 change blocks. 
24 lines changed or deleted 35 lines changed or added


 math_constants.h   math_constants.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 107 skipping to change at line 107
#define CUDART_PIO4_HI 7.8539816339744828e-1 #define CUDART_PIO4_HI 7.8539816339744828e-1
#define CUDART_PIO4_LO 3.0616169978683830e-17 #define CUDART_PIO4_LO 3.0616169978683830e-17
#define CUDART_PIO2 1.5707963267948966e+0 #define CUDART_PIO2 1.5707963267948966e+0
#define CUDART_PIO2_HI 1.5707963267948966e+0 #define CUDART_PIO2_HI 1.5707963267948966e+0
#define CUDART_PIO2_LO 6.1232339957367660e-17 #define CUDART_PIO2_LO 6.1232339957367660e-17
#define CUDART_3PIO4 2.3561944901923448e+0 #define CUDART_3PIO4 2.3561944901923448e+0
#define CUDART_2_OVER_PI 6.3661977236758138e-1 #define CUDART_2_OVER_PI 6.3661977236758138e-1
#define CUDART_PI 3.1415926535897931e+0 #define CUDART_PI 3.1415926535897931e+0
#define CUDART_PI_HI 3.1415926535897931e+0 #define CUDART_PI_HI 3.1415926535897931e+0
#define CUDART_PI_LO 1.2246467991473532e-16 #define CUDART_PI_LO 1.2246467991473532e-16
#define CUDART_SQRT_2PI 2.5066282746310007e+0
#define CUDART_SQRT_2PI_HI 2.5066282746310007e+0 #define CUDART_SQRT_2PI_HI 2.5066282746310007e+0
#define CUDART_SQRT_2PI_LO (-1.8328579980459167e-16) #define CUDART_SQRT_2PI_LO (-1.8328579980459167e-16)
#define CUDART_SQRT_PIO2 1.2533141373155003e+0
#define CUDART_SQRT_PIO2_HI 1.2533141373155003e+0 #define CUDART_SQRT_PIO2_HI 1.2533141373155003e+0
#define CUDART_SQRT_PIO2_LO (-9.1642899902295834e-17) #define CUDART_SQRT_PIO2_LO (-9.1642899902295834e-17)
#define CUDART_L2E 1.4426950408889634e+0 #define CUDART_L2E 1.4426950408889634e+0
#define CUDART_L2E_HI 1.4426950408889634e+0 #define CUDART_L2E_HI 1.4426950408889634e+0
#define CUDART_L2E_LO 2.0355273740931033e-17 #define CUDART_L2E_LO 2.0355273740931033e-17
#define CUDART_L2T 3.3219280948873622e+0 #define CUDART_L2T 3.3219280948873622e+0
#define CUDART_LG2 3.0102999566398120e-1 #define CUDART_LG2 3.0102999566398120e-1
#define CUDART_LG2_HI 3.0102999566398120e-1 #define CUDART_LG2_HI 3.0102999566398120e-1
#define CUDART_LG2_LO (-2.8037281277851704e-18) #define CUDART_LG2_LO (-2.8037281277851704e-18)
#define CUDART_LGE 4.3429448190325182e-1 #define CUDART_LGE 4.3429448190325182e-1
 End of changes. 3 change blocks. 
1 lines changed or deleted 3 lines changed or added


 math_functions_dbl_ptx1.h   math_functions_dbl_ptx1.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 115 skipping to change at line 115
__device_func__(double __cuda_copysign(double a, double b)) __device_func__(double __cuda_copysign(double a, double b))
{ {
return (double)__cuda_copysignf((float)a, (float)b); return (double)__cuda_copysignf((float)a, (float)b);
} }
__device_func__(double __cuda_sin(double a)) __device_func__(double __cuda_sin(double a))
{ {
return (double)__cuda_sinf((float)a); return (double)__cuda_sinf((float)a);
} }
__device_func__(double __cuda_sinpi(double a))
{
return (double)__cuda_sinpif((float)a);
}
__device_func__(double __cuda_cos(double a)) __device_func__(double __cuda_cos(double a))
{ {
return (double)__cuda_cosf((float)a); return (double)__cuda_cosf((float)a);
} }
__device_func__(void __cuda_sincos(double a, double *sptr, double *cptr)) __device_func__(void __cuda_sincos(double a, double *sptr, double *cptr))
{ {
float fs, fc; float fs, fc;
__cuda_sincosf((float)a, &fs, &fc); __cuda_sincosf((float)a, &fs, &fc);
skipping to change at line 235 skipping to change at line 240
__device_func__(double __cuda_hypot(double a, double b)) __device_func__(double __cuda_hypot(double a, double b))
{ {
return (double)__cuda_hypotf((float)a, (float)b); return (double)__cuda_hypotf((float)a, (float)b);
} }
__device_func__(double __cuda_cbrt(double a)) __device_func__(double __cuda_cbrt(double a))
{ {
return (double)__cuda_cbrtf((float)a); return (double)__cuda_cbrtf((float)a);
} }
__device_func__(double __cuda_rcbrt(double a))
{
return (double)__cuda_rcbrtf((float)a);
}
__device_func__(double __cuda_erf(double a)) __device_func__(double __cuda_erf(double a))
{ {
return (double)__cuda_erff((float)a); return (double)__cuda_erff((float)a);
} }
__device_func__(double __cuda_erfinv(double a)) __device_func__(double __cuda_erfinv(double a))
{ {
return (double)__cuda_erfinvf((float)a); return (double)__cuda_erfinvf((float)a);
} }
 End of changes. 3 change blocks. 
1 lines changed or deleted 11 lines changed or added


 math_functions_dbl_ptx3.h   math_functions_dbl_ptx3.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 197 skipping to change at line 197
{ {
int alo, ahi, bhi; int alo, ahi, bhi;
bhi = __double2hiint(b); bhi = __double2hiint(b);
alo = __double2loint(a); alo = __double2loint(a);
ahi = __double2hiint(a); ahi = __double2hiint(a);
ahi = (bhi & 0x80000000) | (ahi & ~0x80000000); ahi = (bhi & 0x80000000) | (ahi & ~0x80000000);
return __hiloint2double(ahi, alo); return __hiloint2double(ahi, alo);
} }
/* like copysign, but requires that argument a is postive */
__device_func__(double __internal_copysign_pos(double a, double b))
{
int alo, ahi, bhi;
bhi = __double2hiint(b);
alo = __double2loint(a);
ahi = __double2hiint(a);
ahi = (bhi & 0x80000000) | ahi;
return __hiloint2double(ahi, alo);
}
/* 1152 bits of 2/PI for Payne-Hanek style argument reduction. */ /* 1152 bits of 2/PI for Payne-Hanek style argument reduction. */
static __constant__ unsigned long long int __cudart_i2opi_d [] = { static __constant__ unsigned long long int __cudart_i2opi_d [] = {
0x6bfb5fb11f8d5d08ULL, 0x6bfb5fb11f8d5d08ULL,
0x3d0739f78a5292eaULL, 0x3d0739f78a5292eaULL,
0x7527bac7ebe5f17bULL, 0x7527bac7ebe5f17bULL,
0x4f463f669e5fea2dULL, 0x4f463f669e5fea2dULL,
0x6d367ecf27cb09b7ULL, 0x6d367ecf27cb09b7ULL,
0xef2f118b5a0a6d1fULL, 0xef2f118b5a0a6d1fULL,
0x1ff897ffde05980fULL, 0x1ff897ffde05980fULL,
0x9c845f8bbdf9283bULL, 0x9c845f8bbdf9283bULL,
skipping to change at line 472 skipping to change at line 484
z = __internal_cos_kerneld(z); z = __internal_cos_kerneld(z);
} else { } else {
z = __internal_sin_kerneld(z); z = __internal_sin_kerneld(z);
} }
if (i & 2) { if (i & 2) {
z = -z; z = -z;
} }
return z; return z;
} }
__device_func__(double __cuda_sinpi(double a))
{
double z;
double fi;
int i;
if (__cuda___isinf(a) || (a == CUDART_ZERO)) {
return __dmul_rn(a, CUDART_ZERO);
}
/* IEEE-754: sinPi(+n) is +0 and sinPi(-n) is -0 for positive integers n.
*/
if (a == __cuda_trunc(a)) {
return __longlong_as_double(__double_as_longlong(a)&0x8000000000000000U
LL);
}
fi = __cuda_rint (a * 2.0);
z = __fma_rn (fi, -0.5, a);
z = __fma_rn (z, CUDART_PI_HI, z * CUDART_PI_LO);
i = (int)(((long long)fi) & 3);
if (i & 1) {
z = __internal_cos_kerneld(z);
} else {
z = __internal_sin_kerneld(z);
}
if (i & 2) {
z = -z;
}
return z;
}
__device_func__(double __cuda_cos(double a)) __device_func__(double __cuda_cos(double a))
{ {
double z; double z;
int i; int i;
if (__cuda___isinf(a)) { if (__cuda___isinf(a)) {
return CUDART_NAN; return CUDART_NAN;
} }
z = __internal_trig_reduction_kerneld(a, &i); z = __internal_trig_reduction_kerneld(a, &i);
/* here, abs(z) <= pi/4, and i has the quadrant */ /* here, abs(z) <= pi/4, and i has the quadrant */
i++; i++;
skipping to change at line 531 skipping to change at line 571
} }
*sptr = s; *sptr = s;
*cptr = c; *cptr = c;
} }
__device_func__(double __cuda_tan(double a)) __device_func__(double __cuda_tan(double a))
{ {
double z; double z;
int i; int i;
if (__cuda___isinf(a)) { if (__cuda___isinf(a)) {
return CUDART_NAN; return __dadd_rn (a, -a); /* return NaN */
} }
z = __internal_trig_reduction_kerneld(a, &i); z = __internal_trig_reduction_kerneld(a, &i);
/* here, abs(z) <= pi/4, and i has the quadrant */ /* here, abs(z) <= pi/4, and i has the quadrant */
z = __internal_tan_kerneld(z, i & 1); z = __internal_tan_kerneld(z, i & 1);
return z; return z;
} }
__device_func__(double __cuda_log(double a)) __device_func__(double __cuda_log(double a))
{ {
double m, f, g, u, v, tmp, q, ulo, log_lo, log_hi; double m, f, g, u, v, tmp, q, ulo, log_lo, log_hi;
skipping to change at line 582 skipping to change at line 622
/* u = 2.0 * (m - 1.0) / (m + 1.0) */ /* u = 2.0 * (m - 1.0) / (m + 1.0) */
v = u * u; v = u * u;
q = 6.7261411553826339E-2/65536.0; q = 6.7261411553826339E-2/65536.0;
q = __fma_rn (q, v, 6.6133829643643394E-2/16384.0); q = __fma_rn (q, v, 6.6133829643643394E-2/16384.0);
q = __fma_rn (q, v, 7.6940931149150890E-2/4096.0); q = __fma_rn (q, v, 7.6940931149150890E-2/4096.0);
q = __fma_rn (q, v, 9.0908745692137444E-2/1024.0); q = __fma_rn (q, v, 9.0908745692137444E-2/1024.0);
q = __fma_rn (q, v, 1.1111111499059706E-1/256.0); q = __fma_rn (q, v, 1.1111111499059706E-1/256.0);
q = __fma_rn (q, v, 1.4285714283305975E-1/64.0); q = __fma_rn (q, v, 1.4285714283305975E-1/64.0);
q = __fma_rn (q, v, 2.0000000000007223E-1/16.0); q = __fma_rn (q, v, 2.0000000000007223E-1/16.0);
q = __fma_rn (q, v, 3.3333333333333326E-1/4.0); q = __fma_rn (q, v, 3.3333333333333326E-1/4.0);
tmp = __internal_twice (f - u); tmp = 2.0 * (f - u);
tmp = __fma_rn (-u, f, tmp); // tmp = remainder of division tmp = __fma_rn (-u, f, tmp); // tmp = remainder of division
ulo = g * tmp; // less significant quotient bits ulo = g * tmp; // less significant quotient bits
/* u + ulo = 2.0 * (m - 1.0) / (m + 1.0) to more than double precision */ /* u + ulo = 2.0 * (m - 1.0) / (m + 1.0) to more than double precision */
q = q * v; q = q * v;
q = q * u; q = q * u;
/* log_hi + log_lo = log(m) to more than double precision */ /* log_hi + log_lo = log(m) to more than double precision */
log_hi = u; log_hi = u;
log_lo = ulo + q; log_lo = ulo + q;
/* log_hi + log_lo = log(m)+e*log(2)=log(a) to more than double precisi on*/ /* log_hi + log_lo = log(m)+e*log(2)=log(a) to more than double precisi on*/
q = __fma_rn ( e, CUDART_LN2_HI, log_hi); q = __fma_rn ( e, CUDART_LN2_HI, log_hi);
skipping to change at line 700 skipping to change at line 740
v = u * u; v = u * u;
q = 6.6253631649203309E-2/65536.0; q = 6.6253631649203309E-2/65536.0;
q = __fma_rn (q, v, 6.6250935587260612E-2/16384.0); q = __fma_rn (q, v, 6.6250935587260612E-2/16384.0);
q = __fma_rn (q, v, 7.6935437806732829E-2/4096.0); q = __fma_rn (q, v, 7.6935437806732829E-2/4096.0);
q = __fma_rn (q, v, 9.0908878711093280E-2/1024.0); q = __fma_rn (q, v, 9.0908878711093280E-2/1024.0);
q = __fma_rn (q, v, 1.1111111322892790E-1/256.0); q = __fma_rn (q, v, 1.1111111322892790E-1/256.0);
q = __fma_rn (q, v, 1.4285714284546502E-1/64.0); q = __fma_rn (q, v, 1.4285714284546502E-1/64.0);
q = __fma_rn (q, v, 2.0000000000003113E-1/16.0); q = __fma_rn (q, v, 2.0000000000003113E-1/16.0);
q = q * v; q = q * v;
/* u + ulo = 2.0 * (m - 1.0) / (m + 1.0) to more than double precision */ /* u + ulo = 2.0 * (m - 1.0) / (m + 1.0) to more than double precision */
tmp = __internal_twice (f - u); tmp = 2.0 * (f - u);
tmp = __fma_rn (-u, f, tmp); // tmp = remainder of division tmp = __fma_rn (-u, f, tmp); // tmp = remainder of division
ulo = g * tmp; // less significand quotient bits ulo = g * tmp; // less significand quotient bits
/* switch to double-double at this point */ /* switch to double-double at this point */
qq.y = q; qq.y = q;
qq.x = 0.0; qq.x = 0.0;
uu.y = u; uu.y = u;
uu.x = ulo; uu.x = ulo;
cc.y = 3.3333333333333331E-1/4.0; cc.y = 3.3333333333333331E-1/4.0;
cc.x = -9.8201492846582465E-18/4.0; cc.x = -9.8201492846582465E-18/4.0;
qq = __internal_ddadd_xgty (cc, qq); qq = __internal_ddadd_xgty (cc, qq);
skipping to change at line 936 skipping to change at line 976
} else if (a < 2.0) { /* work around accuracy issue in vicinity of 1.4 */ } else if (a < 2.0) { /* work around accuracy issue in vicinity of 1.4 */
z = __cuda_expm1(a); z = __cuda_expm1(a);
z = __internal_half (z + z / (z + 1.0)); z = __internal_half (z + z / (z + 1.0));
} else { } else {
z = __internal_exp_kernel(a, -1); z = __internal_exp_kernel(a, -1);
z = z + (1.0 / (-4.0 * z)); z = z + (1.0 / (-4.0 * z));
if (a >= CUDART_LN2_X_1025) { if (a >= CUDART_LN2_X_1025) {
z = CUDART_INF; /* overflow -> infinity */ z = CUDART_INF; /* overflow -> infinity */
} }
} }
z = __cuda_copysign(z, s); z = __internal_copysign_pos(z, s);
return z; return z;
} }
__device_func__(double __cuda_tanh(double a)) __device_func__(double __cuda_tanh(double a))
{ {
double t; double t;
t = __cuda_fabs(a); t = __cuda_fabs(a);
if (t >= 0.55) { if (t >= 0.55) {
double s; double s;
s = 1.0 - 2.0 / (__internal_exp_kernel(2.0 * t, 0) + 1.0); s = 1.0 - 2.0 / (__internal_exp_kernel(2.0 * t, 0) + 1.0);
if (t > 350.0) { if (t > 350.0) {
s = 1.0; /* overflow -> 1.0 */ s = 1.0; /* overflow -> 1.0 */
} }
a = __cuda_copysign(s, a); a = __internal_copysign_pos(s, a);
} else { } else {
double a2; double a2;
a2 = a * a; a2 = a * a;
t = 5.102147717274194E-005; t = 5.102147717274194E-005;
t = __fma_rn (t, a2, -2.103023983278533E-004); t = __fma_rn (t, a2, -2.103023983278533E-004);
t = __fma_rn (t, a2, 5.791370145050539E-004); t = __fma_rn (t, a2, 5.791370145050539E-004);
t = __fma_rn (t, a2, -1.453216755611004E-003); t = __fma_rn (t, a2, -1.453216755611004E-003);
t = __fma_rn (t, a2, 3.591719696944118E-003); t = __fma_rn (t, a2, 3.591719696944118E-003);
t = __fma_rn (t, a2, -8.863194503940334E-003); t = __fma_rn (t, a2, -8.863194503940334E-003);
t = __fma_rn (t, a2, 2.186948597477980E-002); t = __fma_rn (t, a2, 2.186948597477980E-002);
t = __fma_rn (t, a2, -5.396825387607743E-002); t = __fma_rn (t, a2, -5.396825387607743E-002);
t = __fma_rn (t, a2, 1.333333333316870E-001); t = __fma_rn (t, a2, 1.333333333316870E-001);
t = __fma_rn (t, a2, -3.333333333333232E-001); t = __fma_rn (t, a2, -3.333333333333232E-001);
t = t * a2; t = t * a2;
t = __fma_rn (t, a, a); t = __fma_rn (t, a, a);
a = __cuda_copysign(t, a); a = __internal_copysign_pos(t, a);
} }
return a; return a;
} }
__device_func__(double __internal_atan_kernel(double a)) __device_func__(double __internal_atan_kernel(double a))
{ {
double t, a2; double t, a2;
a2 = a * a; a2 = a * a;
t = -2.0258553044438358E-005 ; t = -2.0258553044438358E-005 ;
t = __fma_rn (t, a2, 2.2302240345758510E-004); t = __fma_rn (t, a2, 2.2302240345758510E-004);
skipping to change at line 1022 skipping to change at line 1062
t3 = __cuda___signbit(b) ? CUDART_3PIO4 : CUDART_PIO4; t3 = __cuda___signbit(b) ? CUDART_3PIO4 : CUDART_PIO4;
} else { } else {
t0 = __cuda_fmax (t1, t3); t0 = __cuda_fmax (t1, t3);
t1 = __cuda_fmin (t1, t3); t1 = __cuda_fmin (t1, t3);
t3 = t1 / t0; t3 = t1 / t0;
t3 = __internal_atan_kernel(t3); t3 = __internal_atan_kernel(t3);
/* Map result according to octant. */ /* Map result according to octant. */
if (__cuda_fabs(a) > __cuda_fabs(b)) t3 = CUDART_PIO2 - t3; if (__cuda_fabs(a) > __cuda_fabs(b)) t3 = CUDART_PIO2 - t3;
if (b < 0.0) t3 = CUDART_PI - t3; if (b < 0.0) t3 = CUDART_PI - t3;
} }
t3 = __cuda_copysign(t3, a); t3 = __internal_copysign_pos(t3, a);
return t3; return t3;
} }
__device_func__(double __cuda_atan(double a)) __device_func__(double __cuda_atan(double a))
{ {
double t0, t1; double t0, t1;
/* reduce argument to first octant */ /* reduce argument to first octant */
t0 = __cuda_fabs(a); t0 = __cuda_fabs(a);
t1 = t0; t1 = t0;
if (t0 > 1.0) { if (t0 > 1.0) {
t1 = 1.0 / t1; t1 = 1.0 / t1;
} }
/* approximate atan(r) in first octant */ /* approximate atan(r) in first octant */
t1 = __internal_atan_kernel(t1); t1 = __internal_atan_kernel(t1);
/* map result according to octant. */ /* map result according to octant. */
if (t0 > 1.0) { if (t0 > 1.0) {
t1 = CUDART_PIO2 - t1; t1 = CUDART_PIO2 - t1;
} }
return __cuda_copysign (t1, a); return __internal_copysign_pos(t1, a);
} }
/* b should be the square of a */ /* b should be the square of a */
__device_func__(double __internal_asin_kernel(double a, double b)) __device_func__(double __internal_asin_kernel(double a, double b))
{ {
double r; double r;
r = 6.259798167646803E-002; r = 6.259798167646803E-002;
r = __fma_rn (r, b, -7.620591484676952E-002); r = __fma_rn (r, b, -7.620591484676952E-002);
r = __fma_rn (r, b, 6.686894879337643E-002); r = __fma_rn (r, b, 6.686894879337643E-002);
r = __fma_rn (r, b, -1.787828218369301E-002); r = __fma_rn (r, b, -1.787828218369301E-002);
skipping to change at line 1076 skipping to change at line 1116
{ {
double fa, t0, t1; double fa, t0, t1;
int ihi, ahi; int ihi, ahi;
ahi = __double2hiint(a); ahi = __double2hiint(a);
fa = __cuda_fabs(a); fa = __cuda_fabs(a);
ihi = __double2hiint(fa); ihi = __double2hiint(fa);
if (ihi < 0x3fe26666) { if (ihi < 0x3fe26666) {
t1 = fa * fa; t1 = fa * fa;
t1 = __internal_asin_kernel (fa, t1); t1 = __internal_asin_kernel (fa, t1);
t1 = __fma_rn (t1, fa, fa); t1 = __fma_rn (t1, fa, fa);
t1 = __cuda_copysign(t1, a); t1 = __internal_copysign_pos(t1, a);
} else { } else {
t1 = __fma_rn (-0.5, fa, 0.5); t1 = __fma_rn (-0.5, fa, 0.5);
t0 = __cuda_sqrt (t1); t0 = __cuda_sqrt (t1);
t1 = __internal_asin_kernel (t0, t1); t1 = __internal_asin_kernel (t0, t1);
t0 = -2.0 * t0; t0 = -2.0 * t0;
t1 = __fma_rn (t0, t1, CUDART_PIO2_LO); t1 = __fma_rn (t0, t1, CUDART_PIO2_LO);
t0 = t0 + CUDART_PIO4_HI; t0 = t0 + CUDART_PIO4_HI;
t1 = t0 + t1; t1 = t0 + t1;
t1 = t1 + CUDART_PIO4_HI; t1 = t1 + CUDART_PIO4_HI;
if (ahi < 0x3ff00000) { if (ahi < 0x3ff00000) {
t1 = __cuda_copysign(t1, a); t1 = __internal_copysign_pos(t1, a);
} }
} }
return t1; return t1;
} }
__device_func__(double __cuda_acos(double a)) __device_func__(double __cuda_acos(double a))
{ {
double t0, t1; double t0, t1;
int ihi, ahi; int ihi, ahi;
skipping to change at line 1151 skipping to change at line 1191
/* for large a, acosh = log(2*a) */ /* for large a, acosh = log(2*a) */
return CUDART_LN2 + __cuda_log(a); return CUDART_LN2 + __cuda_log(a);
} else { } else {
t = t + __cuda_sqrt(__fma_rn(a, t, t)); t = t + __cuda_sqrt(__fma_rn(a, t, t));
return __cuda_log1p(t); return __cuda_log1p(t);
} }
} }
__device_func__(double __cuda_asinh(double a)) __device_func__(double __cuda_asinh(double a))
{ {
#if SLIGHTLY_MORE_ACCURATE_BUT_SLOWER
double fa, oofa, t;
fa = __cuda_fabs(a);
if (fa > 8.9884657373828596e+307) { /* prevent intermediate underflow */
t = CUDART_LN2 + __cuda_log(fa);
} else {
oofa = 1.0 / fa;
t = fa + fa / (oofa + __cuda_sqrt(__fma_rn(oofa, oofa, 1.0)));
t = __cuda_log1p(t);
}
#else
double fa, t; double fa, t;
fa = __cuda_fabs(a); fa = __cuda_fabs(a);
if (fa > 1.0e153) { if (__double2hiint(fa) >= 0x5ff00000) { /* prevent intermediate underflow */
t = CUDART_LN2 + __cuda_log(fa); t = CUDART_LN2 + __cuda_log(fa);
} else { } else {
t = fa * fa; t = fa * fa;
t = __cuda_log1p (fa + t / (1.0 + __cuda_sqrt(1.0 + t))); t = __cuda_log1p (fa + t / (1.0 + __cuda_sqrt(1.0 + t)));
} }
#endif return __internal_copysign_pos(t, a);
return __cuda_copysign(t, a);
} }
__device_func__(double __cuda_atanh(double a)) __device_func__(double __cuda_atanh(double a))
{ {
double fa, t; double fa, t;
#if !defined(__CUDABE__) #if !defined(__CUDABE__)
if (__cuda___isnan(a)) { if (__cuda___isnan(a)) {
return a + a; return a + a;
} }
#endif #endif
skipping to change at line 1198 skipping to change at line 1226
} }
#endif #endif
if (__cuda___signbit(a)) { if (__cuda___signbit(a)) {
t = -t; t = -t;
} }
return t; return t;
} }
__device_func__(double __cuda_hypot(double a, double b)) __device_func__(double __cuda_hypot(double a, double b))
{ {
double v, w, t; double v, w, t, fa, fb;
if (__cuda___isinf(a) || __cuda___isinf(b)) {
return CUDART_INF; fa = __cuda_fabs(a);
} fb = __cuda_fabs(b);
if (__cuda___isnan(a) || __cuda___isnan(b)) { v = __cuda_fmax(fa, fb);
return a + b; w = __cuda_fmin(fa, fb);
}
a = __cuda_fabs(a);
b = __cuda_fabs(b);
v = __cuda_fmax(a, b);
w = __cuda_fmin(a, b);
t = w / v; t = w / v;
t = __fma_rn (t, t, 1.0); t = __fma_rn (t, t, 1.0);
t = v * __cuda_sqrt(t); t = v * __cuda_sqrt(t);
if (v == 0.0) { if (v == 0.0) {
t = v + w; t = v + w; /* fixup for zero divide */
}
if ((!(fa <= CUDART_INF)) || (!(fb <= CUDART_INF))) {
t = a + b; /* fixup for NaNs */
}
if (v == CUDART_INF) {
t = v + w; /* fixup for infinities */
} }
return t; return t;
} }
__device_func__(double __cuda_cbrt(double a)) __device_func__(double __cuda_cbrt(double a))
{ {
float s; float s;
double t, r; double t, r;
int ilo, ihi, expo, nexpo, denorm; int ilo, ihi, expo, nexpo, denorm;
if ((a == 0.0) || !(__cuda___finite(a))) { if ((a == 0.0) || !(__cuda___finite(a))) {
skipping to change at line 1245 skipping to change at line 1274
ilo = __double2loint(t); ilo = __double2loint(t);
ihi = __double2hiint(t); ihi = __double2hiint(t);
expo = ((int)((unsigned int)ihi >> 20) & 0x7ff); expo = ((int)((unsigned int)ihi >> 20) & 0x7ff);
} }
/* scale into float range */ /* scale into float range */
nexpo = __float2int_rn(CUDART_THIRD_F * (float)(expo - 1022)); nexpo = __float2int_rn(CUDART_THIRD_F * (float)(expo - 1022));
ihi -= (3 * nexpo) << 20; ihi -= (3 * nexpo) << 20;
r = __hiloint2double(ihi, ilo); r = __hiloint2double(ihi, ilo);
/* initial approximation */ /* initial approximation */
s = (float)r; s = (float)r;
t = __cuda_exp2f(CUDART_THIRD_F * __log2f(s)); t = __cuda_exp2f(-CUDART_THIRD_F * __log2f(s)); /* approximate invcbrt
/* refine approximation */ */
t = t - (t - (r / (t * t))) * CUDART_THIRD; t = __fma_rn(__fma_rn(t*t,-r*t,1.0), CUDART_THIRD*t, t);/* refine invcbrt
t = t - (t - (r / (t * t))) * CUDART_THIRD; */
t = r * t * t; /* approximate cbrt
*/
t = __fma_rn(t - (r / (t * t)), -CUDART_THIRD, t); /* refine cbrt
*/
/* scale result back into double range */ /* scale result back into double range */
ilo = __double2loint(t); ilo = __double2loint(t);
ihi = __double2hiint(t); ihi = __double2hiint(t);
ihi += (nexpo - denorm) << 20; ihi += (nexpo - denorm) << 20;
t = __hiloint2double(ihi, ilo); t = __hiloint2double(ihi, ilo);
if (__cuda___signbit(a)) { if (__cuda___signbit(a)) {
t = -t; t = -t;
}
return t;
}
__device_func__(double __cuda_rcbrt(double a))
{
float s;
double t, r;
int ilo, ihi, expo, nexpo, denorm;
if ((a == 0.0) || !(__cuda___finite(a))) {
return 1.0 / a;
}
t = __cuda_fabs(a);
ilo = __double2loint(t);
ihi = __double2hiint(t);
expo = ((int)((unsigned int)ihi >> 20) & 0x7ff);
denorm = 0;
if (expo == 0) {
/* denormal */
t = t * CUDART_TWO_TO_54;
denorm = 18;
ilo = __double2loint(t);
ihi = __double2hiint(t);
expo = ((int)((unsigned int)ihi >> 20) & 0x7ff);
}
/* scale into float range */
nexpo = __float2int_rn(CUDART_THIRD_F * (float)(expo - 1022));
ihi -= (3 * nexpo) << 20;
r = __hiloint2double(ihi, ilo);
/* initial approximation */
s = (float)r;
t = __cuda_exp2f(-CUDART_THIRD_F * __log2f(s)); /* approximate invcbrt
*/
t = __fma_rn(__fma_rn(t*t,-r*t,1.0), CUDART_THIRD*t, t);/* refine invcbrt
*/
t = __fma_rn(__fma_rn(t*t,-r*t,1.0), CUDART_THIRD*t, t);/* refine invcbrt
*/
/* scale result back into double range */
ilo = __double2loint(t);
ihi = __double2hiint(t);
ihi += (-(nexpo - denorm)) << 20;
t = __hiloint2double(ihi, ilo);
if (__cuda___signbit(a)) {
t = -t;
} }
return t; return t;
} }
__device_func__(double __internal_accurate_pow(double a, double b)) __device_func__(double __internal_accurate_pow(double a, double b))
{ {
double2 loga; double2 loga;
double2 prod; double2 prod;
double t_hi, t_lo; double t_hi, t_lo;
double tmp; double tmp;
#if !defined(__CUDABE__) && defined(__linux__) && !defined(__LP64__) #if !defined(__CUDABE__) && defined(__linux__) && !defined(__LP64__)
volatile double e; volatile
#else
double e;
#endif #endif
double e;
/* compute log(a) in double-double format*/ /* compute log(a) in double-double format*/
loga = __internal_log_ext_prec(a); loga = __internal_log_ext_prec(a);
/* prevent overflow during extended precision multiply */ /* prevent overflow during extended precision multiply */
if (__cuda_fabs(b) > 1e304) b *= 1.220703125e-4; if (__cuda_fabs(b) > 1e304) b *= 1.220703125e-4;
/* compute b * log(a) in double-double format */ /* compute b * log(a) in double-double format */
t_hi = loga.y * b; t_hi = loga.y * b;
t_lo = __fma_rn (loga.y, b, -t_hi); t_lo = __fma_rn (loga.y, b, -t_hi);
t_lo = __fma_rn (loga.x, b, t_lo); t_lo = __fma_rn (loga.x, b, t_lo);
skipping to change at line 1319 skipping to change at line 1389
if (__cuda___isinf(b)) { if (__cuda___isinf(b)) {
if (a == -1.0) { if (a == -1.0) {
return 1.0; return 1.0;
} }
t = __cuda_fabs(a) > 1.0 ? CUDART_INF : CUDART_ZERO; t = __cuda_fabs(a) > 1.0 ? CUDART_INF : CUDART_ZERO;
if (b < CUDART_ZERO) { if (b < CUDART_ZERO) {
t = 1.0 / t; t = 1.0 / t;
} }
return t; return t;
} }
bIsOddInteger = (b - (2.0 * __cuda_floor(0.5 * b))) == 1.0; bIsOddInteger = __cuda_fabs(b - (2.0f * __cuda_trunc(0.5 * b))) == 1.0;
if (a == CUDART_ZERO) { if (a == CUDART_ZERO) {
t = bIsOddInteger ? a : CUDART_ZERO; t = bIsOddInteger ? a : CUDART_ZERO;
if (b < CUDART_ZERO) { if (b < CUDART_ZERO) {
t = 1.0 / t; t = 1.0 / t;
} }
return t; return t;
} }
if (a == -CUDART_INF) { if (a == -CUDART_INF) {
t = (b < CUDART_ZERO) ? -1.0/a : -a; t = (b < CUDART_ZERO) ? -1.0/a : -a;
if (bIsOddInteger) { if (bIsOddInteger) {
skipping to change at line 1348 skipping to change at line 1418
t = __internal_accurate_pow(t, b); t = __internal_accurate_pow(t, b);
if ((a < CUDART_ZERO) && bIsOddInteger) { if ((a < CUDART_ZERO) && bIsOddInteger) {
t = __longlong_as_double(__double_as_longlong(t) ^ 0x8000000000000000UL L); t = __longlong_as_double(__double_as_longlong(t) ^ 0x8000000000000000UL L);
} }
return t; return t;
} }
__device_func__(double __cuda_erf(double a)) __device_func__(double __cuda_erf(double a))
{ {
double t, r, q; double t, r, q;
#if !defined(__CUDABE__)
if (__cuda___isnan(a)) {
return a + a;
}
#endif
t = __cuda_fabs(a); t = __cuda_fabs(a);
if (t >= 1.0) { if (t >= 1.0) {
r = -1.28836351230756500E-019; r = -1.28836351230756500E-019;
r = __fma_rn (r, t, 1.30597472161093370E-017); r = __fma_rn (r, t, 1.30597472161093370E-017);
r = __fma_rn (r, t, -6.33924401259620500E-016); r = __fma_rn (r, t, -6.33924401259620500E-016);
r = __fma_rn (r, t, 1.96231865908940140E-014); r = __fma_rn (r, t, 1.96231865908940140E-014);
r = __fma_rn (r, t, -4.35272243559990750E-013); r = __fma_rn (r, t, -4.35272243559990750E-013);
r = __fma_rn (r, t, 7.37083927929352150E-012); r = __fma_rn (r, t, 7.37083927929352150E-012);
r = __fma_rn (r, t, -9.91402142550461630E-011); r = __fma_rn (r, t, -9.91402142550461630E-011);
r = __fma_rn (r, t, 1.08817017167760820E-009); r = __fma_rn (r, t, 1.08817017167760820E-009);
skipping to change at line 1388 skipping to change at line 1454
r = __fma_rn (r, t, 4.99394435612628580E-001); r = __fma_rn (r, t, 4.99394435612628580E-001);
r = __fma_rn (r, t, -7.52014596480123030E-001); r = __fma_rn (r, t, -7.52014596480123030E-001);
r = __fma_rn (r, t, 9.99933138314926250E-001); r = __fma_rn (r, t, 9.99933138314926250E-001);
r = __fma_rn (r, t, -1.12836725321102670E+000); r = __fma_rn (r, t, -1.12836725321102670E+000);
r = __fma_rn (r, t, 9.99998988715182450E-001); r = __fma_rn (r, t, 9.99998988715182450E-001);
q = __internal_exp_kernel(-t * t, 0); q = __internal_exp_kernel(-t * t, 0);
r = __fma_rn (r, -q, 1.0); r = __fma_rn (r, -q, 1.0);
if (t >= 6.5) { if (t >= 6.5) {
r = 1.0; r = 1.0;
} }
a = __cuda_copysign (r, a); a = __internal_copysign_pos(r, a);
} else { } else {
q = t * t; q = a * a;
r = -7.77946848895991420E-010; r = -7.77946848895991420E-010;
r = __fma_rn (r, q, 1.37109803980285950E-008); r = __fma_rn (r, q, 1.37109803980285950E-008);
r = __fma_rn (r, q, -1.62063137584932240E-007); r = __fma_rn (r, q, -1.62063137584932240E-007);
r = __fma_rn (r, q, 1.64471315712790040E-006); r = __fma_rn (r, q, 1.64471315712790040E-006);
r = __fma_rn (r, q, -1.49247123020098620E-005); r = __fma_rn (r, q, -1.49247123020098620E-005);
r = __fma_rn (r, q, 1.20552935769006260E-004); r = __fma_rn (r, q, 1.20552935769006260E-004);
r = __fma_rn (r, q, -8.54832592931448980E-004); r = __fma_rn (r, q, -8.54832592931448980E-004);
r = __fma_rn (r, q, 5.22397760611847340E-003); r = __fma_rn (r, q, 5.22397760611847340E-003);
r = __fma_rn (r, q, -2.68661706431114690E-002); r = __fma_rn (r, q, -2.68661706431114690E-002);
r = __fma_rn (r, q, 1.12837916709441850E-001); r = __fma_rn (r, q, 1.12837916709441850E-001);
skipping to change at line 1412 skipping to change at line 1478
r = __fma_rn (r, q, 1.12837916709551260E+000); r = __fma_rn (r, q, 1.12837916709551260E+000);
a = r * a; a = r * a;
} }
return a; return a;
} }
__device_func__(double __cuda_erfinv(double a)) __device_func__(double __cuda_erfinv(double a))
{ {
double fa, t; double fa, t;
fa = fabs(a); fa = __cuda_fabs(a);
if (fa >= 1.0) { if (fa >= 1.0) {
t = CUDART_NAN; /* NaN */ t = CUDART_NAN; /* NaN */
if (fa == 1.0) { if (fa == 1.0) {
t = a * CUDART_INF; /* Infinity */ t = a * CUDART_INF; /* Infinity */
} }
} else if (fa >= 0.9375) { } else if (fa >= 0.9375) {
/* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev
Approximations for the Inverse of the Error Function. Mathematics of Approximations for the Inverse of the Error Function. Mathematics of
Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 59 Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 59
*/ */
skipping to change at line 1588 skipping to change at line 1654
} }
return t; return t;
} }
__device_func__(double __cuda_erfc(double a)) __device_func__(double __cuda_erfc(double a))
{ {
double p, q, h, l; double p, q, h, l;
int ahi; int ahi;
ahi = __double2hiint(a); ahi = __double2hiint(a);
if (ahi < (int)0x3fe80000) { /* 0.75 */ if (ahi < (int)0x3fea0400) { /* 1665/2048 */
return 1.0 - __cuda_erf(a); return 1.0 - __cuda_erf(a);
} }
if (a > 27.3) {
return 0.0;
}
if (ahi < (int)0x40140000) { /* 5.0 */ if (ahi < (int)0x40140000) { /* 5.0 */
/* max error 7 ulps on [0.75, 5.0] */ /* On the interval [1665/2048, 5.0] the following approximation is used
p = 5.6418956292134603E-001; :
p = __fma_rn (p, a, 7.9573512229784757E+000); erfc(a) = (1.0 + 1/a * r(1/a)) * 1/a * 0.5 * exp(-a*a), where the ra
p = __fma_rn (p, a, 5.4297984550299049E+001); nge
p = __fma_rn (p, a, 2.2775657465890461E+002); of r(1/a) is approximately [-0.17, 0.11]. r(1/a) is computed by rati
p = __fma_rn (p, a, 6.2995529536738172E+002); onal
p = __fma_rn (p, a, 1.1508293767713837E+003); approximation.
p = __fma_rn (p, a, 1.3002167301542784E+003); */
p = __fma_rn (p, a, 7.2716547570180592E+002); double t;
q = a+ 1.4104035812651274E+001;
q = __fma_rn (q, a, 9.6740724349422138E+001); t = 1.0 / a;
q = __fma_rn (q, a, 4.1073916054746462E+002); p = -1.0000000252849461E+000;
q = __fma_rn (q, a, 1.1641974580374074E+003); p = __fma_rn (p, t, -7.3398971987771156E-001);
q = __fma_rn (q, a, 2.2344896486798129E+003); p = __fma_rn (p, t, -1.4685633784433072E-001);
q = __fma_rn (q, a, 2.8166572432808462E+003); p = __fma_rn (p, t, 1.2963557011001836E-001);
q = __fma_rn (q, a, 2.1207350981593036E+003); p = __fma_rn (p, t, 1.0901177826674287E-001);
q = __fma_rn (q, a, 7.2716547619708967E+002); p = __fma_rn (p, t, 3.9250612663155882E-002);
p = p / q; p = __fma_rn (p, t, 7.5883167167654269E-003);
p = __fma_rn (p, t, 6.6438196820856965E-004);
q = t + 2.7339900293714838E+000;
q = __fma_rn (q, t, 3.3580762542361291E+000);
q = __fma_rn (q, t, 2.4165688909166021E+000);
q = __fma_rn (q, t, 1.1092158770004934E+000);
q = __fma_rn (q, t, 3.2845571970789467E-001);
q = __fma_rn (q, t, 5.9110343116276186E-002);
q = __fma_rn (q, t, 5.1750858802842702E-003);
q = __fma_rn (q, t, 1.2937416364002241E-009);
q = 1.0 / q;
p = p * q;
p = p * t;
h = a * a; h = a * a;
l = __fma_rn (a, a, -h); l = __fma_rn (a, a, -h);
q = __internal_exp_kernel(-h, 0); q = __internal_exp_kernel(-h, -1);
q = __fma_rn (l, -q, q); q = __fma_rn (l, -q, q);
p = p * q; p = __fma_rn (p, q, q);
p = p * t;
} else { } else {
/* max error 4 ulps on [5, 27.3] */ /* max error 4 ulps on [5, 27.3] */
double ooa, ooasq; double ooa, ooasq;
ooa = 1.0 / a; ooa = 1.0 / a;
ooasq = ooa * ooa; ooasq = ooa * ooa;
p = -4.0025406686930527E+005; p = -4.0025406686930527E+005;
p = __fma_rn (p, ooasq, 1.4420582543942123E+005); p = __fma_rn (p, ooasq, 1.4420582543942123E+005);
p = __fma_rn (p, ooasq, -2.7664185780951841E+004); p = __fma_rn (p, ooasq, -2.7664185780951841E+004);
p = __fma_rn (p, ooasq, 4.1144611644767283E+003); p = __fma_rn (p, ooasq, 4.1144611644767283E+003);
skipping to change at line 1642 skipping to change at line 1715
p = __fma_rn (p, ooasq, -1.0578553994424316E+000); p = __fma_rn (p, ooasq, -1.0578553994424316E+000);
p = __fma_rn (p, ooasq, 4.2314218745087778E-001); p = __fma_rn (p, ooasq, 4.2314218745087778E-001);
p = __fma_rn (p, ooasq, -2.8209479177354962E-001); p = __fma_rn (p, ooasq, -2.8209479177354962E-001);
p = __fma_rn (p, ooasq, 5.6418958354775606E-001); p = __fma_rn (p, ooasq, 5.6418958354775606E-001);
h = a * a; h = a * a;
l = __fma_rn (a, a, -h); l = __fma_rn (a, a, -h);
q = __internal_exp_kernel(-h, 0); q = __internal_exp_kernel(-h, 0);
q = __fma_rn (l, -q, q); q = __fma_rn (l, -q, q);
p = p * ooa; p = p * ooa;
p = p * q; p = p * q;
if (a > 27.3) {
p = 0.0;
}
} }
return p; return p;
} }
/* approximate 1.0/(a*gamma(a)) on [-0.5,0.5] */ /* approximate 1.0/(a*gamma(a)) on [-0.5,0.5] */
__device_func__(double __internal_tgamma_kernel(double a)) __device_func__(double __internal_tgamma_kernel(double a))
{ {
double t; double t;
t = -4.42689340712524750E-010; t = -4.42689340712524750E-010;
t = __fma_rn (t, a, -2.02665918466589540E-007); t = __fma_rn (t, a, -2.02665918466589540E-007);
skipping to change at line 1691 skipping to change at line 1767
z = __fma_rn (z, x, 3.4722222222222220e-003); z = __fma_rn (z, x, 3.4722222222222220e-003);
z = __fma_rn (z, x, 8.3333333333333329e-002); z = __fma_rn (z, x, 8.3333333333333329e-002);
z = __fma_rn (z, x, 1.0000000000000000e+000); z = __fma_rn (z, x, 1.0000000000000000e+000);
return z; return z;
} }
__device_func__(double __internal_tgamma_stirling(double a)) __device_func__(double __internal_tgamma_stirling(double a))
{ {
if (a < 1.7162437695630274e+002) { if (a < 1.7162437695630274e+002) {
#if defined(__GNUC__) && !defined(__CUDABE__) #if defined(__GNUC__) && !defined(__CUDABE__)
volatile double t_hi, t_lo, e; volatile
#else
double t_hi, t_lo, e;
#endif #endif
double t_hi, t_lo, e;
double2 loga, prod; double2 loga, prod;
double z = __internal_stirling_poly (a); double z = __internal_stirling_poly (a);
double b = a - 0.5; double b = a - 0.5;
/* compute log(a) in double-double format*/ /* compute log(a) in double-double format*/
loga = __internal_log_ext_prec(a); loga = __internal_log_ext_prec(a);
/* compute (a - 0.5) * log(a) in double-double format */ /* compute (a - 0.5) * log(a) in double-double format */
t_hi = loga.y * b; t_hi = loga.y * b;
t_lo = __fma_rn (loga.y, b, -t_hi); t_lo = __fma_rn (loga.y, b, -t_hi);
skipping to change at line 1759 skipping to change at line 1835
xx = xx - 1.0; xx = xx - 1.0;
} }
xx = __internal_tgamma_kernel (xx); xx = __internal_tgamma_kernel (xx);
if (x < 0.5) { if (x < 0.5) {
xx = xx * x; xx = xx * x;
} }
s = s / xx; s = s / xx;
} else { } else {
xx = x; xx = x;
s = xx; s = xx;
if (x == __cuda_floor(x)) { if (x == __cuda_trunc(x)) {
return CUDART_NAN; return CUDART_NAN;
} }
while (xx < -0.5) { while (xx < -0.5) {
s = __fma_rn (s, xx, s); s = __fma_rn (s, xx, s);
xx = xx + 1.0; xx = xx + 1.0;
} }
xx = __internal_tgamma_kernel (xx); xx = __internal_tgamma_kernel (xx);
s = s * xx; s = s * xx;
s = 1.0 / s; s = 1.0 / s;
} }
return s; return s;
} else { } else {
if (x >= 0.0) { if (x >= 0.0) {
return __internal_tgamma_stirling (x); return __internal_tgamma_stirling (x);
} else { } else {
double t; double t;
int quot; int quot;
if (x == __cuda_floor(x)) { if (x == __cuda_trunc(x)) {
return CUDART_NAN; return CUDART_NAN;
} }
if (x < -185.0) { if (x < -185.0) {
int negative; int negative;
x = __cuda_floor(x); x = __cuda_floor(x);
negative = ((x - (2.0 * __cuda_floor(0.5 * x))) == 1.0); negative = ((x - (2.0 * __cuda_floor(0.5 * x))) == 1.0);
return negative ? CUDART_NEG_ZERO : CUDART_ZERO; return negative ? CUDART_NEG_ZERO : CUDART_ZERO;
} }
/* compute sin(pi*x) accurately */ /* compute sin(pi*x) accurately */
xx = __cuda_rint (__internal_twice(x)); xx = __cuda_rint (__internal_twice(x));
skipping to change at line 1948 skipping to change at line 2024
{ {
double t; double t;
double i; double i;
long long int quot; long long int quot;
if (__cuda___isnan(a)) { if (__cuda___isnan(a)) {
return a + a; return a + a;
} }
t = __internal_lgamma_pos(__cuda_fabs(a)); t = __internal_lgamma_pos(__cuda_fabs(a));
if (a >= 0.0) return t; if (a >= 0.0) return t;
a = __cuda_fabs(a); a = __cuda_fabs(a);
i = __cuda_floor(a); i = __cuda_trunc(a);
if (a == i) return CUDART_INF; /* a is an integer: return infinity */ if (a == i) return CUDART_INF; /* a is an integer: return infinity */
if (a < 1e-19) return -__cuda_log(a); if (a < 1e-19) return -__cuda_log(a);
i = __cuda_rint (2.0 * a); i = __cuda_rint (2.0 * a);
quot = (long long int)i; quot = (long long int)i;
i = __fma_rn (-0.5, i, a); i = __fma_rn (-0.5, i, a);
i = i * CUDART_PI; i = i * CUDART_PI;
if (quot & 1) { if (quot & 1) {
i = __internal_cos_kerneld(i); i = __internal_cos_kerneld(i);
} else { } else {
i = __internal_sin_kerneld(i); i = __internal_sin_kerneld(i);
skipping to change at line 2038 skipping to change at line 2114
return a; return a;
} }
__device_func__(double __cuda_modf(double a, double *b)) __device_func__(double __cuda_modf(double a, double *b))
{ {
double t; double t;
if (__cuda___finite(a)) { if (__cuda___finite(a)) {
t = __cuda_trunc(a); t = __cuda_trunc(a);
*b = t; *b = t;
t = a - t; t = a - t;
return __cuda_copysign(t, a); return __internal_copysign_pos(t, a);
} else if (__cuda___isinf(a)) { } else if (__cuda___isinf(a)) {
t = 0.0; t = 0.0;
*b = a; *b = a;
return __cuda_copysign(t, a); return __internal_copysign_pos(t, a);
} else { } else {
*b = a + a; *b = a + a;
return a + a; return a + a;
} }
} }
__device_func__(double __cuda_fmod(double a, double b)) __device_func__(double __cuda_fmod(double a, double b))
{ {
double orig_a = a; double orig_a = a;
double orig_b = b; double orig_b = b;
skipping to change at line 2086 skipping to change at line 2162
} }
if (scaled_b > a) { if (scaled_b > a) {
scaled_b *= 0.5; scaled_b *= 0.5;
} }
while (scaled_b >= b) { while (scaled_b >= b) {
if (a >= scaled_b) { if (a >= scaled_b) {
a -= scaled_b; a -= scaled_b;
} }
scaled_b *= 0.5; scaled_b *= 0.5;
} }
return __cuda_copysign (a, orig_a); return __internal_copysign_pos(a, orig_a);
} else { } else {
return orig_a; return orig_a;
} }
} }
__device_func__(double __cuda_remainder(double a, double b)) __device_func__(double __cuda_remainder(double a, double b))
{ {
double orig_a; double orig_a;
double twoa = 0.0; double twoa = 0.0;
unsigned int quot0 = 0; /* quotient bit 0 */ unsigned int quot0 = 0; /* quotient bit 0 */
skipping to change at line 2224 skipping to change at line 2300
__device_func__(double __cuda_nextafter(double a, double b)) __device_func__(double __cuda_nextafter(double a, double b))
{ {
unsigned long long int ia; unsigned long long int ia;
unsigned long long int ib; unsigned long long int ib;
ia = __double_as_longlong(a); ia = __double_as_longlong(a);
ib = __double_as_longlong(b); ib = __double_as_longlong(b);
if (__cuda___isnan(a) || __cuda___isnan(b)) return a + b; /* NaN */ if (__cuda___isnan(a) || __cuda___isnan(b)) return a + b; /* NaN */
if (((ia | ib) << 1) == 0ULL) return b; if (((ia | ib) << 1) == 0ULL) return b;
if ((ia + ia) == 0ULL) { if ((ia + ia) == 0ULL) {
return __cuda_copysign (CUDART_MIN_DENORM, b); /* crossover */ return __internal_copysign_pos(CUDART_MIN_DENORM, b); /* crossover */
} }
if ((a < b) && (a < 0.0)) ia--; if ((a < b) && (a < 0.0)) ia--;
if ((a < b) && (a > 0.0)) ia++; if ((a < b) && (a > 0.0)) ia++;
if ((a > b) && (a < 0.0)) ia++; if ((a > b) && (a < 0.0)) ia++;
if ((a > b) && (a > 0.0)) ia--; if ((a > b) && (a > 0.0)) ia--;
a = __longlong_as_double(ia); a = __longlong_as_double(ia);
return a; return a;
} }
__device_func__(double __cuda_nan(const char *s)) __device_func__(double __cuda_nan(const char *tagp))
{ {
unsigned long long i = 0; unsigned long long int i;
int c;
int ovfl = 0; i = __internal_nan_kernel (tagp);
int invld = 0;
if (*s == '0') {
s++;
if ((*s == 'x') || (*s == 'X')) {
s++;
while (*s == '0') s++;
while (*s) {
if (i > 0x0fffffffffffffffULL) {
ovfl = 1;
}
c = (((*s) >= 'A') && ((*s) <= 'F')) ? (*s + 'a' - 'A') : (*s);
if ((c >= 'a') && (c <= 'f')) {
c = c - 'a' + 10;
i = i * 16 + c;
} else if ((c >= '0') && (c <= '9')) {
c = c - '0';
i = i * 16 + c;
} else {
invld = 1;
}
s++;
}
} else {
while (*s == '0') s++;
while (*s) {
if (i > 0x1fffffffffffffffULL) {
ovfl = 1;
}
c = *s;
if ((c >= '0') && (c <= '7')) {
c = c - '0';
i = i * 8 + c;
} else {
invld = 1;
}
s++;
}
}
} else {
while (*s) {
c = *s;
if ((i > 1844674407370955161ULL) ||
((i == 1844674407370955161ULL) && (c > '5'))) {
ovfl = 1;
}
if ((c >= '0') && (c <= '9')) {
c = c - '0';
i = i * 10 + c;
} else {
invld = 1;
}
s++;
}
}
if (ovfl) {
i = ~0ULL;
}
if (invld) {
i = 0ULL;
}
i = (i & 0x000fffffffffffffULL) | 0x7ff8000000000000ULL; i = (i & 0x000fffffffffffffULL) | 0x7ff8000000000000ULL;
return __longlong_as_double(i); return __longlong_as_double(i);
} }
__device_func__(double __cuda_round(double a)) __device_func__(double __cuda_round(double a))
{ {
double fa = __cuda_fabs(a); double fa = __cuda_fabs(a);
if (fa > CUDART_TWO_TO_52) { if (fa >= CUDART_TWO_TO_52) {
return a; return a;
} else { } else {
double u = __cuda_floor(fa + 0.5); double u;
u = __cuda_trunc(fa + 0.5);
if (fa < 0.5) u = 0; if (fa < 0.5) u = 0;
return __cuda_copysign(u, a); u = __internal_copysign_pos(u, a);
return u;
} }
} }
__device_func__(long long int __cuda_llround(double a)) __device_func__(long long int __cuda_llround(double a))
{ {
#if !defined(__CUDABE__) #if !defined(__CUDABE__)
if (a >= 9223372036854775807.0) return 0x7fffffffffffffffLL; if (a >= 9223372036854775807.0) return 0x7fffffffffffffffLL;
if (a <= -9223372036854775808.0) return 0x8000000000000000LL; if (a <= -9223372036854775808.0) return 0x8000000000000000LL;
#endif /* !__CUDABE__ */ #endif /* !__CUDABE__ */
return (long long int)(__cuda_round(a)); return (long long int)(__cuda_round(a));
skipping to change at line 2363 skipping to change at line 2381
if (__cuda___isnan(a)) return -INT_MAX-1; if (__cuda___isnan(a)) return -INT_MAX-1;
if (__cuda___isinf(a)) return INT_MAX; if (__cuda___isinf(a)) return INT_MAX;
if (a == 0.0) return -INT_MAX-1; if (a == 0.0) return -INT_MAX-1;
a = __cuda_fabs(a); a = __cuda_fabs(a);
ilo = __double2loint(a); ilo = __double2loint(a);
ihi = __double2hiint(a); ihi = __double2hiint(a);
i = ((unsigned long long int)ihi) << 32 | (unsigned long long int)ilo; i = ((unsigned long long int)ihi) << 32 | (unsigned long long int)ilo;
if (a >= CUDART_TWO_TO_M1022) { if (a >= CUDART_TWO_TO_M1022) {
return ((int)((ihi >> 20) & 0x7ff)) - 1023; return ((int)((ihi >> 20) & 0x7ff)) - 1023;
} else { } else {
int expo = -1022; return -1011 - __clzll(i);
while (i < 0x0010000000000000ULL) {
expo--;
i = i + i;
}
return expo;
} }
} }
__device_func__(double __cuda_logb(double a)) __device_func__(double __cuda_logb(double a))
{ {
unsigned long long int i; unsigned long long int i;
unsigned int ihi; unsigned int ihi;
unsigned int ilo; unsigned int ilo;
if (__cuda___isnan(a)) return a + a; if (__cuda___isnan(a)) return a + a;
a = __cuda_fabs(a); a = __cuda_fabs(a);
if (a == CUDART_INF) return a; if (a == CUDART_INF) return a;
if (a == 0.0) return -CUDART_INF; if (a == 0.0) return -CUDART_INF;
ilo = __double2loint(a); ilo = __double2loint(a);
ihi = __double2hiint(a); ihi = __double2hiint(a);
i = ((unsigned long long int)ihi) << 32 | (unsigned long long int)ilo; i = ((unsigned long long int)ihi) << 32 | (unsigned long long int)ilo;
if (a >= CUDART_TWO_TO_M1022) { if (a >= CUDART_TWO_TO_M1022) {
return (double)((int)((ihi >> 20) & 0x7ff)) - 1023; return (double)((int)((ihi >> 20) & 0x7ff)) - 1023;
} else { } else {
int expo = -1022; int expo = -1011 - __clzll(i);
while (i < 0x0010000000000000ULL) {
expo--;
i = i + i;
}
return (double)expo; return (double)expo;
} }
} }
__device_func__(double __cuda_fma(double a, double b, double c)) __device_func__(double __cuda_fma(double a, double b, double c))
{ {
return __fma_rn(a, b, c); return __fma_rn(a, b, c);
} }
#if __APPLE__ #if __APPLE__
 End of changes. 49 change blocks. 
166 lines changed or deleted 186 lines changed or added


 sm_11_atomic_functions.h   sm_11_atomic_functions.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 41 skipping to change at line 41
* Any use of this source code in individual and commercial software must * Any use of this source code in individual and commercial software must
* include, in the user documentation and internal comments to the code, * include, in the user documentation and internal comments to the code,
* the above Disclaimer and U.S. Government End Users Notice. * the above Disclaimer and U.S. Government End Users Notice.
*/ */
#if !defined(__SM_11_ATOMIC_FUNCTIONS_H__) #if !defined(__SM_11_ATOMIC_FUNCTIONS_H__)
#define __SM_11_ATOMIC_FUNCTIONS_H__ #define __SM_11_ATOMIC_FUNCTIONS_H__
#if defined(__cplusplus) && defined(__CUDACC__) #if defined(__cplusplus) && defined(__CUDACC__)
#if __CUDA_ARCH__ >= 110 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 110
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#include "host_defines.h" #include "host_defines.h"
extern "C" extern "C"
skipping to change at line 206 skipping to change at line 206
static __inline__ __device__ int atomicCAS(int *address, int compare, int v al) static __inline__ __device__ int atomicCAS(int *address, int compare, int v al)
{ {
return __iAtomicCAS(address, compare, val); return __iAtomicCAS(address, compare, val);
} }
static __inline__ __device__ unsigned int atomicCAS(unsigned int *address, unsigned int compare, unsigned int val) static __inline__ __device__ unsigned int atomicCAS(unsigned int *address, unsigned int compare, unsigned int val)
{ {
return __uAtomicCAS(address, compare, val); return __uAtomicCAS(address, compare, val);
} }
#endif /* __CUDA_ARCH__ >= 110 */ #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 110 */
#elif !defined(__CUDACC__) #elif !defined(__CUDACC__)
#include "crt/func_macro.h" #include "crt/func_macro.h"
#if !defined(__CUDABE__) #if !defined(__CUDABE__)
#if defined(__MULTI_CORE__) #if defined(__cplusplus)
extern "C" {
#define __iAtomicAdd(address, val) \ #endif /* __cplusplus */
__builtin___iAtomicAdd(address, val)
#define __uAtomicAdd(address, val) \
__builtin___uAtomicAdd(address, val)
#define __iAtomicExch(address, val) \
__builtin___iAtomicExch(address, val)
#define __uAtomicExch(address, val) \
__builtin___uAtomicExch(address, val)
#define __fAtomicExch(address, val) \
__builtin___fAtomicExch(address, val)
#define __iAtomicMin(address, val) \
__builtin___iAtomicMin(address, val)
#define __uAtomicMin(address, val) \
__builtin___uAtomicMin(address, val)
#define __iAtomicMax(address, val) \
__builtin___iAtomicMax(address, val)
#define __uAtomicMax(address, val) \
__builtin___uAtomicMax(address, val)
#define __uAtomicInc(address, val) \
__builtin___uAtomicInc(address, val)
#define __uAtomicDec(address, val) \
__builtin___uAtomicDec(address, val)
#define __iAtomicAnd(address, val) \
__builtin___iAtomicAnd(address, val)
#define __uAtomicAnd(address, val) \
__builtin___uAtomicAnd(address, val)
#define __iAtomicOr(address, val) \
__builtin___iAtomicOr(address, val)
#define __uAtomicOr(address, val) \
__builtin___uAtomicOr(address, val)
#define __iAtomicXor(address, val) \
__builtin___iAtomicXor(address, val)
#define __uAtomicXor(address, val) \
__builtin___uAtomicXor(address, val)
#define __iAtomicCAS(address, compare, val) \
__builtin___iAtomicCAS(address, compare, val)
#define __uAtomicCAS(address, compare, val) \
__builtin___uAtomicCAS(address, compare, val)
#else /* __MULTI_CORE__ */
extern void CUDARTAPI __cudaMutexOperation(int lock); extern void CUDARTAPI __cudaMutexOperation(int lock);
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#define __cudaAtomicOperation(code) \ #define __cudaAtomicOperation(code) \
__cudaMutexOperation(1); \ __cudaMutexOperation(1); \
code \ code \
__cudaMutexOperation(0); __cudaMutexOperation(0);
__device_func__(int __iAtomicAdd(int *address, int val)) __device_func__(int __iAtomicAdd(int *address, int val))
{ {
int old; int old;
__cudaAtomicOperation( __cudaAtomicOperation(
skipping to change at line 493 skipping to change at line 458
__cudaAtomicOperation( __cudaAtomicOperation(
old = *address; old = *address;
*address = old == compare ? val : old; *address = old == compare ? val : old;
) )
return old; return old;
} }
#undef __cudaAtomicOperation #undef __cudaAtomicOperation
#endif /* __MULTI_CORE__ */
#endif /* !__CUDABE__ */ #endif /* !__CUDABE__ */
#endif /* __cplusplus && __CUDACC__ */ #endif /* __cplusplus && __CUDACC__ */
#endif /* !__SM_11_ATOMIC_FUNCTIONS_H__ */ #endif /* !__SM_11_ATOMIC_FUNCTIONS_H__ */
 End of changes. 6 change blocks. 
47 lines changed or deleted 10 lines changed or added


 sm_12_atomic_functions.h   sm_12_atomic_functions.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 41 skipping to change at line 41
* Any use of this source code in individual and commercial software must * Any use of this source code in individual and commercial software must
* include, in the user documentation and internal comments to the code, * include, in the user documentation and internal comments to the code,
* the above Disclaimer and U.S. Government End Users Notice. * the above Disclaimer and U.S. Government End Users Notice.
*/ */
#if !defined(__SM_12_ATOMIC_FUNCTIONS_H__) #if !defined(__SM_12_ATOMIC_FUNCTIONS_H__)
#define __SM_12_ATOMIC_FUNCTIONS_H__ #define __SM_12_ATOMIC_FUNCTIONS_H__
#if defined(__cplusplus) && defined(__CUDACC__) #if defined(__cplusplus) && defined(__CUDACC__)
#if __CUDA_ARCH__ >= 120 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 120
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#include "host_defines.h" #include "host_defines.h"
extern "C" extern "C"
skipping to change at line 99 skipping to change at line 99
static __inline__ __device__ bool any(bool cond) static __inline__ __device__ bool any(bool cond)
{ {
return (bool)__any((int)cond); return (bool)__any((int)cond);
} }
static __inline__ __device__ bool all(bool cond) static __inline__ __device__ bool all(bool cond)
{ {
return (bool)__all((int)cond); return (bool)__all((int)cond);
} }
#endif /* __CUDA_ARCH__ >= 120 */ #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 120 */
#elif !defined(__CUDACC__) #elif !defined(__CUDACC__)
#include "crt/func_macro.h" #include "crt/func_macro.h"
#if !defined(__CUDABE__) #if !defined(__CUDABE__)
#if defined(__MULTI_CORE__) #if defined(__cplusplus)
extern "C" {
#define __ullAtomicAdd(address, val) \ #endif /* __cplusplus */
__builtin___ullAtomicAdd(address, val)
#define __ullAtomicExch(address, val) \
__builtin___ullAtomicExch(address, val)
#define __ullAtomicCAS(address, compare, val) \
__builtin___ullAtomicCAS(address, compare, val)
#else /* __MULTI_CORE__ */
extern void CUDARTAPI __cudaMutexOperation(int lock); extern void CUDARTAPI __cudaMutexOperation(int lock);
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#define __cudaAtomicOperation(code) \ #define __cudaAtomicOperation(code) \
__cudaMutexOperation(1); \ __cudaMutexOperation(1); \
code \ code \
__cudaMutexOperation(0); __cudaMutexOperation(0);
__device_func__(unsigned long long int __ullAtomicAdd(unsigned long long in t *address, unsigned long long int val)) __device_func__(unsigned long long int __ullAtomicAdd(unsigned long long in t *address, unsigned long long int val))
{ {
unsigned long long int old; unsigned long long int old;
__cudaAtomicOperation( __cudaAtomicOperation(
skipping to change at line 160 skipping to change at line 157
__cudaAtomicOperation( __cudaAtomicOperation(
old = *address; old = *address;
*address = old == compare ? val : old; *address = old == compare ? val : old;
) )
return old; return old;
} }
#undef __cudaAtomicOperation #undef __cudaAtomicOperation
#endif /* __MULTI_CORE__ */
__device_func__(int __any(int cond)) __device_func__(int __any(int cond))
{ {
return cond; return cond;
} }
__device_func__(int __all(int cond)) __device_func__(int __all(int cond))
{ {
return cond; return cond;
} }
 End of changes. 6 change blocks. 
15 lines changed or deleted 10 lines changed or added


 sm_13_double_functions.h   sm_13_double_functions.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 47 skipping to change at line 47
#define __SM_13_DOUBLE_FUNCTIONS_H__ #define __SM_13_DOUBLE_FUNCTIONS_H__
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#if defined(__cplusplus) && defined(__CUDACC__) #if defined(__cplusplus) && defined(__CUDACC__)
#if __CUDA_ARCH__ >= 130 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 130
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#include "device_types.h" #include "device_types.h"
#include "host_defines.h" #include "host_defines.h"
skipping to change at line 97 skipping to change at line 97
extern __device__ double __dmul_rz(double, double); extern __device__ double __dmul_rz(double, double);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ double __dmul_ru(double, double); extern __device__ double __dmul_ru(double, double);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ double __dmul_rd(double, double); extern __device__ double __dmul_rd(double, double);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __double2float_rn(double); extern __device__ float __double2float_rn(double);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __double2float_rz(double); extern __device__ float __double2float_rz(double);
/*DEVICE_BUILTIN*/
extern __device__ float __double2float_ru(double);
/*DEVICE_BUILTIN*/
extern __device__ float __double2float_rd(double);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __double2int_rn(double); extern __device__ int __double2int_rn(double);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __double2int_ru(double); extern __device__ int __double2int_ru(double);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __double2int_rd(double); extern __device__ int __double2int_rd(double);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ unsigned int __double2uint_rn(double); extern __device__ unsigned int __double2uint_rn(double);
skipping to change at line 252 skipping to change at line 256
static __inline__ __device__ double uint2double(unsigned int a, enum cudaRo undMode mode = cudaRoundNearest) static __inline__ __device__ double uint2double(unsigned int a, enum cudaRo undMode mode = cudaRoundNearest)
{ {
return (double)a; return (double)a;
} }
static __inline__ __device__ double float2double(float a, enum cudaRoundMod e mode = cudaRoundNearest) static __inline__ __device__ double float2double(float a, enum cudaRoundMod e mode = cudaRoundNearest)
{ {
return (double)a; return (double)a;
} }
#endif /* __CUDA_ARCH__ >= 130 */ #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 130 */
#elif !defined(__CUDACC__) #elif !defined(__CUDACC__)
#include "crt/func_macro.h" #include "crt/func_macro.h"
#if !defined(__CUDABE__) #if !defined(__CUDABE__)
/************************************************************************** ***** /************************************************************************** *****
* * * *
* HOST IMPLEMENTATIONS FOR FUNCTIONS * * HOST IMPLEMENTATIONS FOR FUNCTIONS *
skipping to change at line 281 skipping to change at line 286
return u.d; return u.d;
} }
__device_func__(long long int __double_as_longlong(double a)) __device_func__(long long int __double_as_longlong(double a))
{ {
volatile union __cudart_DoubleLonglongCvt u; volatile union __cudart_DoubleLonglongCvt u;
u.d = a; u.d = a;
return u.i; return u.i;
} }
__device_func__(float __internal_double2float_kernel(double a)) /* Note: this kernel does not support round-to-nearest-or-even */
__device_func__(float __internal_double2float_kernel(double a, enum cudaRou
ndMode rndMode))
{ {
volatile union __cudart_DoubleUlonglongCvt xx; volatile union __cudart_DoubleUlonglongCvt xx;
volatile union __cudart_FloatUintCvt res; volatile union __cudart_FloatUintCvt res;
unsigned long long sticky;
int shift; int shift;
xx.d = a; xx.d = a;
if (xx.i == 0) return 0.0f;
res.i = (((unsigned int) (xx.i >> 32)) & 0x80000000); res.i = (((unsigned int) (xx.i >> 32)) & 0x80000000);
if (a == 0.0) {
/* Zero */
return res.f;
}
if ((xx.i & 0x7ff0000000000000ULL) == 0x7ff0000000000000ULL) { if ((xx.i & 0x7ff0000000000000ULL) == 0x7ff0000000000000ULL) {
if ((xx.i & 0x7fffffffffffffffULL) > 0x7ff0000000000000ULL) { if ((xx.i & 0x7fffffffffffffffULL) > 0x7ff0000000000000ULL) {
/* Nan */ /* Nan */
res.i = ((unsigned int)((xx.i >> 32) & 0x80000000) | res.i = ((unsigned int)((xx.i >> 32) & 0x80000000) |
(255U << 23) | 0x00400000 | (255U << 23) | 0x00400000 |
(unsigned int)((xx.i >> (53 - 24)) & 0x007fffff)); (unsigned int)((xx.i >> (53 - 24)) & 0x007fffff));
} else { } else {
/* Inf */ /* Inf */
res.i |= 0x7f800000; res.i |= 0x7f800000;
} }
return res.f; return res.f;
} }
shift = ((int) ((xx.i >> 52) & 0x7ff)) - 1023; shift = ((int) ((xx.i >> 52) & 0x7ff)) - 1023;
/* Overflow */ /* Overflow */
xx.i = (xx.i & 0x000fffffffffffffULL); xx.i = (xx.i & 0x000fffffffffffffULL);
if (shift >= 128) { if (shift >= 128) {
res.i |= 0x7f7fffff; if ((rndMode == cudaRoundZero) ||
((rndMode == cudaRoundMinInf) && !res.i) ||
((rndMode == cudaRoundPosInf) && res.i)) {
res.i |= 0x7f7fffff;
} else {
res.i |= 0x7f800000;
}
return res.f; return res.f;
} }
if (shift <= -127) { if (shift <= -127) {
/* Underflow */
xx.i |= 0x0010000000000000ULL;
if (shift < -180) { if (shift < -180) {
/* Underflow */ sticky = xx.i;
xx.i = 0; xx.i = 0;
} else { } else {
xx.i |= 0x0010000000000000ULL; sticky = xx.i << (64 - (-126 - shift));
xx.i >>= -126 - shift; xx.i >>= (-126 - shift);
} }
} else { sticky |= xx.i << (64 - 29);
res.i |= (unsigned int) (127 + shift) << 23; if ((((rndMode == cudaRoundPosInf) && !res.i) ||
((rndMode == cudaRoundMinInf) && res.i)) &&
sticky) {
res.i += 1;
}
res.i += ((unsigned int) (xx.i >> 29)) & 0x007fffff;
return res.f;
} }
res.i |= ((unsigned int) (xx.i >> 29)) & 0x007fffff; sticky = xx.i << (64 - 29);
xx.i &= 0x1fffffff; if ((((rndMode == cudaRoundPosInf) && !res.i) ||
((rndMode == cudaRoundMinInf) && res.i)) &&
sticky) {
res.i += 1;
}
res.i += ((unsigned int) (xx.i >> 29)) & 0x007fffff;
res.i += (unsigned int) (127 + shift) << 23;
return res.f; return res.f;
} }
__device_func__(double __internal_ll2double_kernel(long long int a, enum cu daRoundMode rndMode)) __device_func__(double __internal_ll2double_kernel(long long int a, enum cu daRoundMode rndMode))
{ {
volatile union __cudart_DoubleUlonglongCvt res; volatile union __cudart_DoubleUlonglongCvt res;
int shift; int shift;
unsigned int t; unsigned int t;
res.i = a; res.i = a;
if (a == 0) return res.d; if (a == 0) return res.d;
skipping to change at line 463 skipping to change at line 493
return cvt.d; return cvt.d;
} }
__device_func__(float __double2float_rn(double a)) __device_func__(float __double2float_rn(double a))
{ {
return (float)a; return (float)a;
} }
__device_func__(float __double2float_rz(double a)) __device_func__(float __double2float_rz(double a))
{ {
return __internal_double2float_kernel(a); return __internal_double2float_kernel(a, cudaRoundZero);
}
__device_func__(float __double2float_ru(double a))
{
return __internal_double2float_kernel(a, cudaRoundPosInf);
}
__device_func__(float __double2float_rd(double a))
{
return __internal_double2float_kernel(a, cudaRoundMinInf);
} }
__device_func__(int __internal_double2int(double a, enum cudaRoundMode rndM ode)) __device_func__(int __internal_double2int(double a, enum cudaRoundMode rndM ode))
{ {
return (int)__internal_double2ll_kernel(a, 2147483647LL, -2147483648LL, - 2147483648LL, rndMode); return (int)__internal_double2ll_kernel(a, 2147483647LL, -2147483648LL, - 2147483648LL, rndMode);
} }
__device_func__(int __double2int_rn(double a)) __device_func__(int __double2int_rn(double a))
{ {
return __internal_double2int(a, cudaRoundNearest); return __internal_double2int(a, cudaRoundNearest);
skipping to change at line 604 skipping to change at line 644
} }
#endif /* !__CUDABE__ */ #endif /* !__CUDABE__ */
#if !defined(__CUDABE__) || __CUDA_ARCH__ < 130 #if !defined(__CUDABE__) || __CUDA_ARCH__ < 130
#include "common_types.h" #include "common_types.h"
__device_func__(double __internal_fma_kernel(double x, double y, double z, enum cudaRoundMode rndMode)) __device_func__(double __internal_fma_kernel(double x, double y, double z, enum cudaRoundMode rndMode))
{ {
#ifdef __MULTI_CORE__
volatile
#endif /* __MULTI_CORE__ */
struct __cudart_UintUint xx, yy, zz, ww; struct __cudart_UintUint xx, yy, zz, ww;
unsigned int s, t, u, prod0, prod1, prod2, prod3, expo_x, expo_y, expo_z; unsigned int s, t, u, prod0, prod1, prod2, prod3, expo_x, expo_y, expo_z;
xx.hi = __double2hiint(x); xx.hi = __double2hiint(x);
xx.lo = __double2loint(x); xx.lo = __double2loint(x);
yy.hi = __double2hiint(y); yy.hi = __double2hiint(y);
yy.lo = __double2loint(y); yy.lo = __double2loint(y);
zz.hi = __double2hiint(z); zz.hi = __double2hiint(z);
zz.lo = __double2loint(z); zz.lo = __double2loint(z);
 End of changes. 16 change blocks. 
17 lines changed or deleted 54 lines changed or added


 texture_fetch_functions.h   texture_fetch_functions.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 1910 skipping to change at line 1910
__ftexfetchi1D(t, i) __ftexfetchi1D(t, i)
#define __utexfetch(t, i, d) \ #define __utexfetch(t, i, d) \
__utexfetch##d##D(t, i) __utexfetch##d##D(t, i)
#define __itexfetch(t, i, d) \ #define __itexfetch(t, i, d) \
__itexfetch##d##D(t, i) __itexfetch##d##D(t, i)
#define __ftexfetch(t, i, d) \ #define __ftexfetch(t, i, d) \
__ftexfetch##d##D(t, i) __ftexfetch##d##D(t, i)
#else /* __CUDABE__ */ #else /* __CUDABE__ */
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
extern void CUDARTAPI __cudaTextureFetch(const void *tex, void *index, int integer, void *val); extern void CUDARTAPI __cudaTextureFetch(const void *tex, void *index, int integer, void *val);
#if defined(__cplusplus)
}
#endif /* __cplusplus */
__device_func__(int4 __itexfetchi(const void *tex, int4 index)) __device_func__(int4 __itexfetchi(const void *tex, int4 index))
{ {
int4 val; int4 val;
__cudaTextureFetch(tex, (void*)&index, 1, (void*)&val); __cudaTextureFetch(tex, (void*)&index, 1, (void*)&val);
return val; return val;
} }
__device_func__(uint4 __utexfetchi(const void *tex, int4 index)) __device_func__(uint4 __utexfetchi(const void *tex, int4 index))
 End of changes. 3 change blocks. 
1 lines changed or deleted 9 lines changed or added


 texture_types.h   texture_types.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 57 skipping to change at line 57
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
enum cudaTextureAddressMode enum cudaTextureAddressMode
{ {
cudaAddressModeWrap, cudaAddressModeWrap,
cudaAddressModeClamp cudaAddressModeClamp,
cudaAddressModeMirror
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
enum cudaTextureFilterMode enum cudaTextureFilterMode
{ {
cudaFilterModePoint, cudaFilterModePoint,
cudaFilterModeLinear cudaFilterModeLinear
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
 End of changes. 2 change blocks. 
2 lines changed or deleted 3 lines changed or added


 vector_functions.h   vector_functions.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 194 skipping to change at line 194
static __inline__ __host__ __device__ long2 make_long2(long int x, long int y) static __inline__ __host__ __device__ long2 make_long2(long int x, long int y)
{ {
long2 t; t.x = x; t.y = y; return t; long2 t; t.x = x; t.y = y; return t;
} }
static __inline__ __host__ __device__ ulong2 make_ulong2(unsigned long int x, unsigned long int y) static __inline__ __host__ __device__ ulong2 make_ulong2(unsigned long int x, unsigned long int y)
{ {
ulong2 t; t.x = x; t.y = y; return t; ulong2 t; t.x = x; t.y = y; return t;
} }
#if !defined(__LP64__)
static __inline__ __host__ __device__ long3 make_long3(long int x, long int y, long int z) static __inline__ __host__ __device__ long3 make_long3(long int x, long int y, long int z)
{ {
long3 t; t.x = x; t.y = y; t.z = z; return t; long3 t; t.x = x; t.y = y; t.z = z; return t;
} }
static __inline__ __host__ __device__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z) static __inline__ __host__ __device__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z)
{ {
ulong3 t; t.x = x; t.y = y; t.z = z; return t; ulong3 t; t.x = x; t.y = y; t.z = z; return t;
} }
static __inline__ __host__ __device__ long4 make_long4(long int x, long int y, long int z, long int w) static __inline__ __host__ __device__ long4 make_long4(long int x, long int y, long int z, long int w)
{ {
long4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; long4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
} }
static __inline__ __host__ __device__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w) static __inline__ __host__ __device__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w)
{ {
ulong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; ulong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
} }
#endif /* !__LP64__ */
static __inline__ __host__ __device__ float1 make_float1(float x) static __inline__ __host__ __device__ float1 make_float1(float x)
{ {
float1 t; t.x = x; return t; float1 t; t.x = x; return t;
} }
static __inline__ __host__ __device__ float2 make_float2(float x, float y) static __inline__ __host__ __device__ float2 make_float2(float x, float y)
{ {
float2 t; t.x = x; t.y = y; return t; float2 t; t.x = x; t.y = y; return t;
} }
skipping to change at line 258 skipping to change at line 254
static __inline__ __host__ __device__ longlong2 make_longlong2(long long in t x, long long int y) static __inline__ __host__ __device__ longlong2 make_longlong2(long long in t x, long long int y)
{ {
longlong2 t; t.x = x; t.y = y; return t; longlong2 t; t.x = x; t.y = y; return t;
} }
static __inline__ __host__ __device__ ulonglong2 make_ulonglong2(unsigned l ong long int x, unsigned long long int y) static __inline__ __host__ __device__ ulonglong2 make_ulonglong2(unsigned l ong long int x, unsigned long long int y)
{ {
ulonglong2 t; t.x = x; t.y = y; return t; ulonglong2 t; t.x = x; t.y = y; return t;
} }
static __inline__ __host__ __device__ longlong3 make_longlong3(long long in
t x, long long int y, long long int z)
{
longlong3 t; t.x = x; t.y = y; t.z = z; return t;
}
static __inline__ __host__ __device__ ulonglong3 make_ulonglong3(unsigned l
ong long int x, unsigned long long int y, unsigned long long int z)
{
ulonglong3 t; t.x = x; t.y = y; t.z = z; return t;
}
static __inline__ __host__ __device__ longlong4 make_longlong4(long long in
t x, long long int y, long long int z, long long int w)
{
longlong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
static __inline__ __host__ __device__ ulonglong4 make_ulonglong4(unsigned l
ong long int x, unsigned long long int y, unsigned long long int z, unsigne
d long long int w)
{
ulonglong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
static __inline__ __host__ __device__ double1 make_double1(double x) static __inline__ __host__ __device__ double1 make_double1(double x)
{ {
double1 t; t.x = x; return t; double1 t; t.x = x; return t;
} }
static __inline__ __host__ __device__ double2 make_double2(double x, double y) static __inline__ __host__ __device__ double2 make_double2(double x, double y)
{ {
double2 t; t.x = x; t.y = y; return t; double2 t; t.x = x; t.y = y; return t;
} }
static __inline__ __host__ __device__ double3 make_double3(double x, double
y, double z)
{
double3 t; t.x = x; t.y = y; t.z = z; return t;
}
static __inline__ __host__ __device__ double4 make_double4(double x, double
y, double z, double w)
{
double4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
#endif /* !__VECTOR_FUNCTIONS_H__ */ #endif /* !__VECTOR_FUNCTIONS_H__ */
 End of changes. 5 change blocks. 
5 lines changed or deleted 38 lines changed or added


 vector_types.h   vector_types.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 60 skipping to change at line 60
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#if !defined(__cuda_assign_operators) #if !defined(__cuda_assign_operators)
#define __cuda_assign_operators(tag) #define __cuda_assign_operators(tag)
#endif /* !__cuda_assign_operators */ #endif /* !__cuda_assign_operators */
#if !defined(__CUDACC__) && !defined(__CUDABE__) && \ #if !defined(__CUDACC__) && !defined(__CUDABE__) && \
!defined (__MULTI_CORE__) && defined(_WIN32) && !defined(_WIN64) defined(_WIN32) && !defined(_WIN64)
#define __cuda_builtin_vector_align8(tag, ...) \ #define __cuda_builtin_vector_align8(tag, ...) \
struct tag { \ struct tag { \
union { \ union { \
struct { __VA_ARGS__; }; \ struct { __VA_ARGS__; }; \
struct { long long int :1,:0; }; \ struct { long long int :1,:0; }; \
}; \ }; \
__cuda_assign_operators(tag) \ __cuda_assign_operators(tag) \
} }
#else /* !__CUDACC__ && !__CUDABE__ && !__MULTI_CORE__ && _WIN32 && !_WIN64 */ #else /* !__CUDACC__ && !__CUDABE__ && _WIN32 && !_WIN64 */
#define __cuda_builtin_vector_align8(tag, ...) \ #define __cuda_builtin_vector_align8(tag, ...) \
struct __align__(8) tag { \ struct __align__(8) tag { \
__VA_ARGS__; \ __VA_ARGS__; \
__cuda_assign_operators(tag) \ __cuda_assign_operators(tag) \
} }
#endif /* !__CUDACC__ && !__CUDABE__ && !__MULTI_CORE__ && _WIN32 && !_WIN6 4 */ #endif /* !__CUDACC__ && !__CUDABE__ && _WIN32 && !_WIN64 */
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct char1 struct char1
{ {
signed char x; signed char x;
__cuda_assign_operators(char1) __cuda_assign_operators(char1)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct uchar1 struct uchar1
skipping to change at line 273 skipping to change at line 273
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __align__(2*sizeof(unsigned long int)) ulong2 struct __align__(2*sizeof(unsigned long int)) ulong2
{ {
unsigned long int x, y; unsigned long int x, y;
__cuda_assign_operators(ulong2) __cuda_assign_operators(ulong2)
}; };
#endif /* _WIN32 */ #endif /* _WIN32 */
#if !defined(__LP64__)
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct long3 struct long3
{ {
long int x, y, z; long int x, y, z;
__cuda_assign_operators(long3) __cuda_assign_operators(long3)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct ulong3 struct ulong3
{ {
skipping to change at line 303 skipping to change at line 301
__cuda_assign_operators(long4) __cuda_assign_operators(long4)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(16) ulong4 struct __builtin_align__(16) ulong4
{ {
unsigned long int x, y, z, w; unsigned long int x, y, z, w;
__cuda_assign_operators(ulong4) __cuda_assign_operators(ulong4)
}; };
#endif /* !__LP64__ */
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct float1 struct float1
{ {
float x; float x;
__cuda_assign_operators(float1) __cuda_assign_operators(float1)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
__cuda_builtin_vector_align8(float2, float x, y); __cuda_builtin_vector_align8(float2, float x, y);
skipping to change at line 358 skipping to change at line 354
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(16) ulonglong2 struct __builtin_align__(16) ulonglong2
{ {
unsigned long long int x, y; unsigned long long int x, y;
__cuda_assign_operators(ulonglong2) __cuda_assign_operators(ulonglong2)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct longlong3
{
long long int x, y, z;
__cuda_assign_operators(longlong3)
};
/*DEVICE_BUILTIN*/
struct ulonglong3
{
unsigned long long int x, y, z;
__cuda_assign_operators(ulonglong3)
};
/*DEVICE_BUILTIN*/
struct __builtin_align__(16) longlong4
{
long long int x, y, z ,w;
__cuda_assign_operators(longlong4)
};
/*DEVICE_BUILTIN*/
struct __builtin_align__(16) ulonglong4
{
unsigned long long int x, y, z, w;
__cuda_assign_operators(ulonglong4)
};
/*DEVICE_BUILTIN*/
struct double1 struct double1
{ {
double x; double x;
__cuda_assign_operators(double1) __cuda_assign_operators(double1)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(16) double2 struct __builtin_align__(16) double2
{ {
double x, y; double x, y;
__cuda_assign_operators(double2) __cuda_assign_operators(double2)
}; };
/*DEVICE_BUILTIN*/
struct double3
{
double x, y, z;
__cuda_assign_operators(double3)
};
/*DEVICE_BUILTIN*/
struct __builtin_align__(16) double4
{
double x, y, z, w;
__cuda_assign_operators(double4)
};
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
typedef struct char1 char1; typedef struct char1 char1;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
typedef struct uchar1 uchar1; typedef struct uchar1 uchar1;
skipping to change at line 458 skipping to change at line 496
typedef struct float4 float4; typedef struct float4 float4;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
typedef struct longlong1 longlong1; typedef struct longlong1 longlong1;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
typedef struct ulonglong1 ulonglong1; typedef struct ulonglong1 ulonglong1;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
typedef struct longlong2 longlong2; typedef struct longlong2 longlong2;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
typedef struct ulonglong2 ulonglong2; typedef struct ulonglong2 ulonglong2;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
typedef struct longlong3 longlong3;
/*DEVICE_BUILTIN*/
typedef struct ulonglong3 ulonglong3;
/*DEVICE_BUILTIN*/
typedef struct longlong4 longlong4;
/*DEVICE_BUILTIN*/
typedef struct ulonglong4 ulonglong4;
/*DEVICE_BUILTIN*/
typedef struct double1 double1; typedef struct double1 double1;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
typedef struct double2 double2; typedef struct double2 double2;
/*DEVICE_BUILTIN*/
typedef struct double3 double3;
/*DEVICE_BUILTIN*/
typedef struct double4 double4;
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct dim3 struct dim3
{ {
unsigned int x, y, z; unsigned int x, y, z;
#if defined(__cplusplus) #if defined(__cplusplus) && !defined(__CUDABE__)
__host__ __device__ dim3(unsigned int x = 1, unsigned int y = 1, unsign ed int z = 1) : x(x), y(y), z(z) {} __host__ __device__ dim3(unsigned int x = 1, unsigned int y = 1, unsign ed int z = 1) : x(x), y(y), z(z) {}
__host__ __device__ dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {} __host__ __device__ dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {}
__host__ __device__ operator uint3(void) { uint3 t; t.x = x; t.y = y; t .z = z; return t; } __host__ __device__ operator uint3(void) { uint3 t; t.x = x; t.y = y; t .z = z; return t; }
#endif /* __cplusplus */ #endif /* __cplusplus && !__CUDABE__ */
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
typedef struct dim3 dim3; typedef struct dim3 dim3;
#undef __cuda_assign_operators #undef __cuda_assign_operators
#undef __cuda_builtin_vector_align8 #undef __cuda_builtin_vector_align8
#endif /* !__VECTOR_TYPES_H__ */ #endif /* !__VECTOR_TYPES_H__ */
 End of changes. 12 change blocks. 
10 lines changed or deleted 60 lines changed or added

This html diff was produced by rfcdiff 1.41. The latest version is available from http://tools.ietf.org/tools/rfcdiff/