__cudaFatFormat.h   __cudaFatFormat.h 
/* /*
* Copyright 1993-2008 NVIDIA Corporation. All rights reserved. * Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 128 skipping to change at line 128
* for Cubin entries (ptx files compiled in debug mode * for Cubin entries (ptx files compiled in debug mode
* will contain their own debugging information) * will contain their own debugging information)
*/ */
typedef struct __cudaFatDebugEntryRec { typedef struct __cudaFatDebugEntryRec {
char* gpuProfileName; char* gpuProfileName;
char* debug; char* debug;
struct __cudaFatDebugEntryRec *next; struct __cudaFatDebugEntryRec *next;
unsigned int size; unsigned int size;
} __cudaFatDebugEntry; } __cudaFatDebugEntry;
typedef struct __cudaFatElfEntryRec {
char* gpuProfileName;
char* elf;
struct __cudaFatElfEntryRec *next;
unsigned int size;
} __cudaFatElfEntry;
typedef enum { typedef enum {
__cudaFatDontSearchFlag = (1 << 0), __cudaFatDontSearchFlag = (1 << 0),
__cudaFatDontCacheFlag = (1 << 1), __cudaFatDontCacheFlag = (1 << 1),
__cudaFatSassDebugFlag = (1 << 2) __cudaFatSassDebugFlag = (1 << 2)
} __cudaFatCudaBinaryFlag; } __cudaFatCudaBinaryFlag;
/* /*
* Imported/exported symbol descriptor, needed for * Imported/exported symbol descriptor, needed for
* __cudaFat binary linking. Not much information is needed, * __cudaFat binary linking. Not much information is needed,
* because this is only an index: full symbol information * because this is only an index: full symbol information
skipping to change at line 170 skipping to change at line 177
char* usageMode; char* usageMode;
__cudaFatPtxEntry *ptx; __cudaFatPtxEntry *ptx;
__cudaFatCubinEntry *cubin; __cudaFatCubinEntry *cubin;
__cudaFatDebugEntry *debug; __cudaFatDebugEntry *debug;
void* debugInfo; void* debugInfo;
unsigned int flags; unsigned int flags;
__cudaFatSymbol *exported; __cudaFatSymbol *exported;
__cudaFatSymbol *imported; __cudaFatSymbol *imported;
struct __cudaFatCudaBinaryRec *dependends; struct __cudaFatCudaBinaryRec *dependends;
unsigned int characteristic; unsigned int characteristic;
__cudaFatElfEntry *elf;
} __cudaFatCudaBinary; } __cudaFatCudaBinary;
/* /*
* Current version and magic numbers: * Current version and magic numbers:
*/ */
#define __cudaFatVERSION 0x00000003 #define __cudaFatVERSION 0x00000004
#define __cudaFatMAGIC 0x1ee55a01 #define __cudaFatMAGIC 0x1ee55a01
/* /*
* Version history log: * Version history log:
* 1 : __cudaFatDebugEntry field added to __cudaFatCudaBinary struct * 1 : __cudaFatDebugEntry field added to __cudaFatCudaBinary struct
* 2 : flags and debugInfo field added. * 2 : flags and debugInfo field added.
* 3 : import/export symbol list * 3 : import/export symbol list
* 4 : characteristic added * 4 : characteristic added, elf added
*/ */
/*--------------------------------- Functions ----------------------------- ---*/ /*--------------------------------- Functions ----------------------------- ---*/
typedef enum { typedef enum {
__cudaFatAvoidPTX, __cudaFatAvoidPTX,
__cudaFatPreferBestCode __cudaFatPreferBestCode,
__cudaFatForcePTX
} __cudaFatCompilationPolicy; } __cudaFatCompilationPolicy;
/* /*
* Function : Select a load image from the __cudaFat binary * Function : Select a load image from the __cudaFat binary
* that will run on the specified GPU. * that will run on the specified GPU.
* Parameters : binary (I) Fat binary * Parameters : binary (I) Fat binary
* policy (I) Parameter influencing the selection proces s in case no * policy (I) Parameter influencing the selection proces s in case no
* fully matching cubin can be found, but ins tead a choice can * fully matching cubin can be found, but ins tead a choice can
* be made between ptx compilation or selecti on of a * be made between ptx compilation or selecti on of a
* cubin for a less capable GPU. * cubin for a less capable GPU.
skipping to change at line 217 skipping to change at line 226
* on the returned cubin will be returned, or NULL * on the returned cubin will be returned, or NULL
* will be returned when cubin or such debug info * will be returned when cubin or such debug info
* cannot be found. * cannot be found.
*/ */
void fatGetCubinForGpuWithPolicy( __cudaFatCudaBinary *binary, __cudaFatCom pilationPolicy policy, char* gpuName, char* *cubin, char* *dbgInfoFile ); void fatGetCubinForGpuWithPolicy( __cudaFatCudaBinary *binary, __cudaFatCom pilationPolicy policy, char* gpuName, char* *cubin, char* *dbgInfoFile );
#define fatGetCubinForGpu(binary,gpuName,cubin,dbgInfoFile) \ #define fatGetCubinForGpu(binary,gpuName,cubin,dbgInfoFile) \
fatGetCubinForGpuWithPolicy(binary,__cudaFatAvoidPTX,gpuName,cubi n,dbgInfoFile) fatGetCubinForGpuWithPolicy(binary,__cudaFatAvoidPTX,gpuName,cubi n,dbgInfoFile)
/* /*
* Function : Check if a binary will be JITed for the specified targ
et architecture
* Parameters : binary (I) Fat binary
* policy (I) Compilation policy, as described by fatGet
CubinForGpuWithPolicy
* gpuName (I) Name of target GPU
* ptx (O) PTX string to be JITed
* Function Result : True if the given binary will be JITed; otherwise, Fal
se
*/
unsigned char fatCheckJitForGpuWithPolicy( __cudaFatCudaBinary *binary, __c
udaFatCompilationPolicy policy, char* gpuName, char* *ptx );
#define fatCheckJitForGpu(binary,gpuName,ptx) \
fatCheckJitForGpuWithPolicy(binary,__cudaFatAvoidPTX,gpuName,ptx)
/*
* Function : Free information previously obtained via function fatG etCubinForGpu. * Function : Free information previously obtained via function fatG etCubinForGpu.
* Parameters : cubin (I) Cubin text string to free * Parameters : cubin (I) Cubin text string to free
* dbgInfo (I) Debug info filename to free, or NULL * dbgInfo (I) Debug info filename to free, or NULL
*/ */
void fatFreeCubin( char* cubin, char* dbgInfoFile ); void fatFreeCubin( char* cubin, char* dbgInfoFile );
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
 End of changes. 7 change blocks. 
4 lines changed or deleted 30 lines changed or added


 builtin_types.h   builtin_types.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 End of changes. 1 change blocks. 
1 lines changed or deleted 1 lines changed or added


 channel_descriptor.h   channel_descriptor.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 98 skipping to change at line 98
* \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, const struct cudaArray*, const struct cudaChannelFormatDesc&) "cudaBindText ureToArray (High level)", * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, const struct cudaArray*, const struct cudaChannelFormatDesc&) "cudaBindText ureToArray (High level)",
* \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, const struct cudaArray*) "cudaBindTextureToArray (High level, inherited cha nnel descriptor)", * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, const struct cudaArray*) "cudaBindTextureToArray (High level, inherited cha nnel descriptor)",
* \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cuda UnbindTexture (High level)", * \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cuda UnbindTexture (High level)",
* \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, d im, readMode>&) "cudaGetTextureAlignmentOffset (High level)" * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, d im, readMode>&) "cudaGetTextureAlignmentOffset (High level)"
*/ */
template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChann elDesc(void) template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChann elDesc(void)
{ {
return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone); return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
} }
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf(
void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
}
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf1
(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
}
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf2
(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
}
static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf4
(void)
{
int e = (int)sizeof(unsigned short) * 8;
return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
}
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc< char>(void) template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc< char>(void)
{ {
int e = (int)sizeof(char) * 8; int e = (int)sizeof(char) * 8;
#if __SIGNED_CHARS__ #if __SIGNED_CHARS__
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
#else #else
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
#endif #endif
} }
 End of changes. 2 change blocks. 
1 lines changed or deleted 33 lines changed or added


 common_functions.h   common_functions.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 56 skipping to change at line 56
#include "host_defines.h" #include "host_defines.h"
#include <time.h> #include <time.h>
#include <string.h> #include <string.h>
extern "C" extern "C"
{ {
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern _CRTIMP __host__ __device__ clock_t clock(void) __THROW; extern _CRTIMP __host__ __device__ clock_t __cdecl clock(void) __THROW;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __host__ __device__ void *memset(void *s, int c, size_t n) __THROW; extern __host__ __device__ void * __cdecl memset(void *s, int c, s ize_t n) __THROW;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __host__ __device__ void *memcpy(void *d, const void *s, size_t n) _ _THROW; extern __host__ __device__ void * __cdecl memcpy(void *d, const vo id *s, size_t n) __THROW;
} }
#elif !defined(__CUDACC__) #elif !defined(__CUDACC__)
#include "crt/func_macro.h" #include "crt/func_macro.h"
__device_func__(clock_t __cuda_clock(void)) __device_func__(clock_t __cuda_clock(void))
{ {
return clock(); return clock();
} }
__device_func__(void *__cuda_memset(void *s, int c, size_t n)) __device_func__(void *__cuda_memset(void *s, int c, size_t n))
{ {
return memset(s, c, n); char *p = (char*)s;
while (n--) *p++ = (char)c;
return s;
} }
__device_func__(void *__cuda_memcpy(void *d, const void *s, size_t n)) __device_func__(void *__cuda_memcpy(void *d, const void *s, size_t n))
{ {
return memcpy(d, s, n); char *p = (char*)d;
const char *r = (const char*)s;
while (n--) *p++ = *r++;
return d;
} }
#endif /* __cplusplus && __CUDACC__ */ #endif /* __cplusplus && __CUDACC__ */
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
 End of changes. 6 change blocks. 
6 lines changed or deleted 15 lines changed or added


 common_types.h   common_types.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 End of changes. 1 change blocks. 
1 lines changed or deleted 1 lines changed or added


 cuComplex.h   cuComplex.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 43 skipping to change at line 43
* the above Disclaimer and U.S. Government End Users Notice. * the above Disclaimer and U.S. Government End Users Notice.
*/ */
#if !defined(CU_COMPLEX_H_) #if !defined(CU_COMPLEX_H_)
#define CU_COMPLEX_H_ #define CU_COMPLEX_H_
#if defined(__cplusplus) #if defined(__cplusplus)
extern "C" { extern "C" {
#endif /* __cplusplus */ #endif /* __cplusplus */
#include <math.h> /* import fabs, sqrt */ #include <math.h> /* import fabsf, sqrt */
#include "vector_types.h" #include "vector_types.h"
/* versions for hosts without native support for 'complex' */ /* versions for hosts without native support for 'complex' */
#if (!defined(__CUDACC__) && defined(CU_USE_NATIVE_COMPLEX)) #if (!defined(__CUDACC__) && defined(CU_USE_NATIVE_COMPLEX))
#include <complex.h> #include <complex.h>
/* wrapper functions around C99 native complex support. NOTE: Untested! */ /* wrapper functions around C99 native complex support. NOTE: Untested! */
/* -- Single Precision -- */ /* -- Single Precision -- */
skipping to change at line 223 skipping to change at line 223
/* This implementation guards against intermediate underflow and overflow /* This implementation guards against intermediate underflow and overflow
* by scaling. Such guarded implementations are usually the default for * by scaling. Such guarded implementations are usually the default for
* complex library implementations, with some also offering an unguarded, * complex library implementations, with some also offering an unguarded,
* faster version. * faster version.
*/ */
__host__ __device__ static __inline__ cuFloatComplex cuCdivf (cuFloatComple x x, __host__ __device__ static __inline__ cuFloatComplex cuCdivf (cuFloatComple x x,
cuFloatComple x y) cuFloatComple x y)
{ {
cuFloatComplex quot; cuFloatComplex quot;
float s = ((float)fabs((double)cuCrealf(y))) + float s = fabsf(cuCrealf(y)) + fabsf(cuCimagf(y));
((float)fabs((double)cuCimagf(y)));
float oos = 1.0f / s; float oos = 1.0f / s;
float ars = cuCrealf(x) * oos; float ars = cuCrealf(x) * oos;
float ais = cuCimagf(x) * oos; float ais = cuCimagf(x) * oos;
float brs = cuCrealf(y) * oos; float brs = cuCrealf(y) * oos;
float bis = cuCimagf(y) * oos; float bis = cuCimagf(y) * oos;
s = (brs * brs) + (bis * bis); s = (brs * brs) + (bis * bis);
oos = 1.0f / s; oos = 1.0f / s;
quot = make_cuFloatComplex (((ars * brs) + (ais * bis)) * oos, quot = make_cuFloatComplex (((ars * brs) + (ais * bis)) * oos,
((ais * brs) - (ars * bis)) * oos); ((ais * brs) - (ars * bis)) * oos);
return quot; return quot;
skipping to change at line 250 skipping to change at line 249
* overflow by scaling. Otherwise we would lose half the exponent range. * overflow by scaling. Otherwise we would lose half the exponent range.
* There are various ways of doing guarded computation. For now chose the * There are various ways of doing guarded computation. For now chose the
* simplest and fastest solution, however this may suffer from inaccuracies * simplest and fastest solution, however this may suffer from inaccuracies
* if sqrt and division are not IEEE compliant. * if sqrt and division are not IEEE compliant.
*/ */
__host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x) __host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x)
{ {
float a = cuCrealf(x); float a = cuCrealf(x);
float b = cuCimagf(x); float b = cuCimagf(x);
float v, w, t; float v, w, t;
a = (float)fabs(a); a = fabsf(a);
b = (float)fabs(b); b = fabsf(b);
if (a > b) { if (a > b) {
v = a; v = a;
w = b; w = b;
} else { } else {
v = b; v = b;
w = a; w = a;
} }
t = w / v; t = w / v;
t = 1.0f + t * t; t = 1.0f + t * t;
t = v * (float)sqrt(t); t = v * sqrtf(t);
if ((v == 0.0f) || (v > 3.402823466e38f) || (w > 3.402823466e38f)) { if ((v == 0.0f) || (v > 3.402823466e38f) || (w > 3.402823466e38f)) {
t = v + w; t = v + w;
} }
return t; return t;
} }
/* Double precision */ /* Double precision */
typedef double2 cuDoubleComplex; typedef double2 cuDoubleComplex;
__host__ __device__ static __inline__ double cuCreal (cuDoubleComplex x) __host__ __device__ static __inline__ double cuCreal (cuDoubleComplex x)
skipping to change at line 404 skipping to change at line 403
{ {
return make_cuDoubleComplex ((double)cuCrealf(c), (double)cuCimagf(c)); return make_cuDoubleComplex ((double)cuCrealf(c), (double)cuCimagf(c));
} }
__host__ __device__ static __inline__ cuFloatComplex cuComplexDoubleToFloat __host__ __device__ static __inline__ cuFloatComplex cuComplexDoubleToFloat
(cuDoubleComplex c) (cuDoubleComplex c)
{ {
return make_cuFloatComplex ((float)cuCreal(c), (float)cuCimag(c)); return make_cuFloatComplex ((float)cuCreal(c), (float)cuCimag(c));
} }
__host__ __device__ static __inline__ cuComplex cuCfmaf( cuComplex x, cuCo
mplex y, cuComplex d)
{
float real_res;
float imag_res;
real_res = (cuCrealf(x) * cuCrealf(y)) + cuCrealf(d);
imag_res = (cuCrealf(x) * cuCimagf(y)) + cuCimagf(d);
real_res = -(cuCimagf(x) * cuCimagf(y)) + real_res;
imag_res = (cuCimagf(x) * cuCrealf(y)) + imag_res;
return make_cuComplex(real_res, imag_res);
}
__host__ __device__ static __inline__ cuDoubleComplex cuCfma( cuDoubleComp
lex x, cuDoubleComplex y, cuDoubleComplex d)
{
double real_res;
double imag_res;
real_res = (cuCreal(x) * cuCreal(y)) + cuCreal(d);
imag_res = (cuCreal(x) * cuCimag(y)) + cuCimag(d);
real_res = -(cuCimag(x) * cuCimag(y)) + real_res;
imag_res = (cuCimag(x) * cuCreal(y)) + imag_res;
return make_cuDoubleComplex(real_res, imag_res);
}
#endif /* !defined(CU_COMPLEX_H_) */ #endif /* !defined(CU_COMPLEX_H_) */
 End of changes. 6 change blocks. 
7 lines changed or deleted 36 lines changed or added


 cublas.h   cublas.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 554 skipping to change at line 554
* Error Status * Error Status
* ------------ * ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/ */
void CUBLASAPI cublasSrot (int n, float *x, int incx, float *y, int incy, void CUBLASAPI cublasSrot (int n, float *x, int incx, float *y, int incy,
float sc, float ss); float sc, float ss);
/* /*
* void * void
* cublasSrotg (float *sa, float *sb, float *sc, float *ss) * cublasSrotg (float *host_sa, float *host_sb, float *host_sc, float *host _ss)
* *
* constructs the Givens tranformation * constructs the Givens tranformation
* *
* ( sc ss ) * ( sc ss )
* G = ( ) , sc^2 + ss^2 = 1, * G = ( ) , sc^2 + ss^2 = 1,
* (-ss sc ) * (-ss sc )
* *
* which zeros the second entry of the 2-vector transpose(sa, sb). * which zeros the second entry of the 2-vector transpose(sa, sb).
* *
* The quantity r = (+/-) sqrt (sa^2 + sb^2) overwrites sa in storage. The * The quantity r = (+/-) sqrt (sa^2 + sb^2) overwrites sa in storage. The
* value of sb is overwritten by a value z which allows sc and ss to be * value of sb is overwritten by a value z which allows sc and ss to be
* recovered by the following algorithm: * recovered by the following algorithm:
* *
* if z=1 set sc = 0.0 and ss = 1.0 * if z=1 set sc = 0.0 and ss = 1.0
* if abs(z) < 1 set sc = sqrt(1-z^2) and ss = z * if abs(z) < 1 set sc = sqrt(1-z^2) and ss = z
* if abs(z) > 1 set sc = 1/z and ss = sqrt(1-sc^2) * if abs(z) > 1 set sc = 1/z and ss = sqrt(1-sc^2)
* *
* The function srot (n, x, incx, y, incy, sc, ss) normally is called next * The function srot (n, x, incx, y, incy, sc, ss) normally is called next
* to apply the transformation to a 2 x n matrix. * to apply the transformation to a 2 x n matrix.
* Note that is function is provided for completeness and run exclusively
* on the Host.
* *
* Input * Input
* ----- * -----
* sa single precision scalar * sa single precision scalar
* sb single precision scalar * sb single precision scalar
* *
* Output * Output
* ------ * ------
* sa single precision r * sa single precision r
* sb single precision z * sb single precision z
* sc single precision result * sc single precision result
* ss single precision result * ss single precision result
* *
* Reference: http://www.netlib.org/blas/srotg.f * Reference: http://www.netlib.org/blas/srotg.f
* *
* This function does not set any error status. * This function does not set any error status.
*/ */
void CUBLASAPI cublasSrotg (float *sa, float *sb, float *sc, float *ss); void CUBLASAPI cublasSrotg (float *host_sa, float *host_sb, float *host_sc, float *host_ss);
/* /*
* void * void
* cublasSrotm (int n, float *x, int incx, float *y, int incy, * cublasSrotm (int n, float *x, int incx, float *y, int incy,
* const float* sparam) * const float* sparam)
* *
* applies the modified Givens transformation, h, to the 2 x n matrix * applies the modified Givens transformation, h, to the 2 x n matrix
* *
* ( transpose(x) ) * ( transpose(x) )
* ( transpose(y) ) * ( transpose(y) )
skipping to change at line 644 skipping to change at line 646
* Error Status * Error Status
* ------------ * ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/ */
void CUBLASAPI cublasSrotm(int n, float *x, int incx, float *y, int incy, void CUBLASAPI cublasSrotm(int n, float *x, int incx, float *y, int incy,
const float* sparam); const float* sparam);
/* /*
* void * void
* cublasSrotmg (float *psd1, float *psd2, float *psx1, const float *psy1, * cublasSrotmg (float *host_psd1, float *host_psd2, float *host_psx1, cons
* float *sparam) t float *host_psy1,
* float *host_sparam)
* *
* constructs the modified Givens transformation matrix h which zeros * constructs the modified Givens transformation matrix h which zeros
* the second component of the 2-vector transpose(sqrt(sd1)*sx1,sqrt(sd2)*s y1). * the second component of the 2-vector transpose(sqrt(sd1)*sx1,sqrt(sd2)*s y1).
* With sparam[0] = sflag, h has one of the following forms: * With sparam[0] = sflag, h has one of the following forms:
* *
* sflag = -1.0f sflag = 0.0f sflag = 1.0f sflag = -2.0f * sflag = -1.0f sflag = 0.0f sflag = 1.0f sflag = -2.0f
* *
* (sh00 sh01) (1.0f sh01) (sh00 1.0f) (1.0f 0.0f) * (sh00 sh01) (1.0f sh01) (sh00 1.0f) (1.0f 0.0f)
* h = ( ) ( ) ( ) ( ) * h = ( ) ( ) ( ) ( )
* (sh10 sh11) (sh10 1.0f) (-1.0f sh11) (0.0f 1.0f) * (sh10 sh11) (sh10 1.0f) (-1.0f sh11) (0.0f 1.0f)
* *
* sparam[1] through sparam[4] contain sh00, sh10, sh01, sh11, * sparam[1] through sparam[4] contain sh00, sh10, sh01, sh11,
* respectively. Values of 1.0f, -1.0f, or 0.0f implied by the value * respectively. Values of 1.0f, -1.0f, or 0.0f implied by the value
* of sflag are not stored in sparam. * of sflag are not stored in sparam.
* Note that is function is provided for completeness and run exclusively
* on the Host.
* *
* Input * Input
* ----- * -----
* sd1 single precision scalar * sd1 single precision scalar
* sd2 single precision scalar * sd2 single precision scalar
* sx1 single precision scalar * sx1 single precision scalar
* sy1 single precision scalar * sy1 single precision scalar
* *
* Output * Output
* ------ * ------
skipping to change at line 682 skipping to change at line 686
* sx1 changed to represent the effect of the transformation * sx1 changed to represent the effect of the transformation
* sparam 5-element vector. sparam[0] is sflag described above. sparam[1] * sparam 5-element vector. sparam[0] is sflag described above. sparam[1]
* through sparam[4] contain the 2x2 rotation matrix h: sparam[1] * through sparam[4] contain the 2x2 rotation matrix h: sparam[1]
* contains sh00, sparam[2] contains sh10, sparam[3] contains sh01, * contains sh00, sparam[2] contains sh10, sparam[3] contains sh01,
* and sprams[4] contains sh11. * and sprams[4] contains sh11.
* *
* Reference: http://www.netlib.org/blas/srotmg.f * Reference: http://www.netlib.org/blas/srotmg.f
* *
* This functions does not set any error status. * This functions does not set any error status.
*/ */
void CUBLASAPI cublasSrotmg (float *sd1, float *sd2, float *sx1, void CUBLASAPI cublasSrotmg (float *host_sd1, float *host_sd2, float *host_
const float *sy1, float* sparam); sx1,
const float *host_sy1, float* host_sparam);
/* /*
* void * void
* sscal (int n, float alpha, float *x, int incx) * sscal (int n, float alpha, float *x, int incx)
* *
* replaces single precision vector x with single precision alpha * x. For i * replaces single precision vector x with single precision alpha * x. For i
* = 0 to n - 1, it replaces x[ix + i * incx] with alpha * x[ix + i * incx] , * = 0 to n - 1, it replaces x[ix + i * incx] with alpha * x[ix + i * incx] ,
* where ix = 1 if incx >= 0, else ix = 1 + (1 - n) * incx. * where ix = 1 if incx >= 0, else ix = 1 + (1 - n) * incx.
* *
* Input * Input
skipping to change at line 819 skipping to change at line 823
* Error Status * Error Status
* ------------ * ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/ */
void CUBLASAPI cublasCcopy (int n, const cuComplex *x, int incx, cuComplex *y, void CUBLASAPI cublasCcopy (int n, const cuComplex *x, int incx, cuComplex *y,
int incy); int incy);
/* /*
* void * void
* cublasZcopy (int n, const cuDoubleComplex *x, int incx, cuDoubleComplex
*y, int incy)
*
* copies the double-complex vector x to the double-complex vector y. For
* i = 0 to n-1, copies x[lx + i * incx] to y[ly + i * incy], where lx = 1
if
* incx >= 0, else lx = 1 + (1 - n) * incx, and ly is defined in a similar
* way using incy.
*
* Input
* -----
* n number of elements in input vectors
* x double-complex vector with n elements
* incx storage spacing between elements of x
* y double-complex vector with n elements
* incy storage spacing between elements of y
*
* Output
* ------
* y contains double complex vector x
*
* Reference: http://www.netlib.org/blas/zcopy.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasZcopy (int n, const cuDoubleComplex *x, int incx, cuDo
ubleComplex *y,
int incy);
/*
* void
* cublasCscal (int n, cuComplex alpha, cuComplex *x, int incx) * cublasCscal (int n, cuComplex alpha, cuComplex *x, int incx)
* *
* replaces single-complex vector x with single-complex alpha * x. For i * replaces single-complex vector x with single-complex alpha * x. For i
* = 0 to n - 1, it replaces x[ix + i * incx] with alpha * x[ix + i * incx] , * = 0 to n - 1, it replaces x[ix + i * incx] with alpha * x[ix + i * incx] ,
* where ix = 1 if incx >= 0, else ix = 1 + (1 - n) * incx. * where ix = 1 if incx >= 0, else ix = 1 + (1 - n) * incx.
* *
* Input * Input
* ----- * -----
* n number of elements in input vectors * n number of elements in input vectors
* alpha single-complex scalar multiplier * alpha single-complex scalar multiplier
skipping to change at line 849 skipping to change at line 886
* *
* Error Status * Error Status
* ------------ * ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/ */
void CUBLASAPI cublasCscal (int n, cuComplex alpha, cuComplex *x, int incx) ; void CUBLASAPI cublasCscal (int n, cuComplex alpha, cuComplex *x, int incx) ;
/* /*
* void * void
* cublasCrotg (cuComplex *ca, cuComplex cb, float *sc, cuComplex *cs) * cublasCrotg (cuComplex *host_ca, cuComplex cb, float *host_sc, cuComplex *host_cs)
* *
* constructs the complex Givens transformation * constructs the complex Givens transformation
* *
* ( sc cs ) * ( sc cs )
* G = ( ) , sc^2 + cabs(cs)^2 = 1, * G = ( ) , sc^2 + cabs(cs)^2 = 1,
* (-cs sc ) * (-cs sc )
* *
* which zeros the second entry of the complex 2-vector transpose(ca, cb). * which zeros the second entry of the complex 2-vector transpose(ca, cb).
* *
* The quantity ca/cabs(ca)*norm(ca,cb) overwrites ca in storage. The * The quantity ca/cabs(ca)*norm(ca,cb) overwrites ca in storage. The
* function crot (n, x, incx, y, incy, sc, cs) is normally called next * function crot (n, x, incx, y, incy, sc, cs) is normally called next
* to apply the transformation to a 2 x n matrix. * to apply the transformation to a 2 x n matrix.
* Note that this function is provided for completeness and runs exclusively
* on the Host.
* *
* Input * Input
* ----- * -----
* ca single-precision complex precision scalar * ca single-precision complex precision scalar
* cb single-precision complex scalar * cb single-precision complex scalar
* *
* Output * Output
* ------ * ------
* ca single-precision complex ca/cabs(ca)*norm(ca,cb) * ca single-precision complex ca/cabs(ca)*norm(ca,cb)
* sc single-precision cosine component of rotation matrix * sc single-precision cosine component of rotation matrix
* cs single-precision complex sine component of rotation matrix * cs single-precision complex sine component of rotation matrix
* *
* Reference: http://www.netlib.org/blas/crotg.f * Reference: http://www.netlib.org/blas/crotg.f
* *
* This function does not set any error status. * This function does not set any error status.
*/ */
__host__ void CUBLASAPI cublasCrotg (cuComplex *pca, cuComplex cb, float *p __host__ void CUBLASAPI cublasCrotg (cuComplex *host_ca, cuComplex cb, floa
sc, t *host_sc,
cuComplex *pcs); cuComplex *host_cs);
/* /*
* void * void
* cublasCrot (int n, cuComplex *x, int incx, cuComplex *y, int incy, float sc, * cublasCrot (int n, cuComplex *x, int incx, cuComplex *y, int incy, float sc,
* cuComplex cs) * cuComplex cs)
* *
* multiplies a 2x2 matrix ( sc cs) with the 2xn matrix ( transpose(x ) ) * multiplies a 2x2 matrix ( sc cs) with the 2xn matrix ( transpose(x ) )
* (-conj(cs) sc) ( transpose(y ) ) * (-conj(cs) sc) ( transpose(y ) )
* *
* The elements of x are in x[lx + i * incx], i = 0 ... n - 1, where lx = 1 if * The elements of x are in x[lx + i * incx], i = 0 ... n - 1, where lx = 1 if
skipping to change at line 1024 skipping to change at line 1063
* *
* Error Status * Error Status
* ------------ * ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/ */
void CUBLASAPI cublasCswap (int n, cuComplex *x, int incx, cuComplex *y, void CUBLASAPI cublasCswap (int n, cuComplex *x, int incx, cuComplex *y,
int incy); int incy);
/* /*
* void
* cublasZswap (int n, const cuDoubleComplex *x, int incx, cuDoubleComplex
*y, int incy)
*
* interchanges the double-complex vector x with the double-complex vector
y.
* For i = 0 to n-1, interchanges x[lx + i * incx] with y[ly + i * incy], w
here
* lx = 1 if incx >= 0, else lx = 1 + (1 - n) * incx, and ly is defined in
a
* similar way using incy.
*
* Input
* -----
* n number of elements in input vectors
* x double-complex vector with n elements
* incx storage spacing between elements of x
* y double-complex vector with n elements
* incy storage spacing between elements of y
*
* Output
* ------
* x contains double-complex vector y
* y contains double-complex vector x
*
* Reference: http://www.netlib.org/blas/zswap.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasZswap (int n, cuDoubleComplex *x, int incx, cuDoubleCo
mplex *y,
int incy);
/*
* cuComplex * cuComplex
* cdotu (int n, const cuComplex *x, int incx, const cuComplex *y, int incy ) * cdotu (int n, const cuComplex *x, int incx, const cuComplex *y, int incy )
* *
* computes the dot product of two single-complex vectors. It returns the * computes the dot product of two single-complex vectors. It returns the
* dot product of the single-complex vectors x and y if successful, and com plex * dot product of the single-complex vectors x and y if successful, and com plex
* zero otherwise. It computes the sum for i = 0 to n - 1 of x[lx + i * inc x] * * zero otherwise. It computes the sum for i = 0 to n - 1 of x[lx + i * inc x] *
* y[ly + i * incy], where lx = 1 if incx >= 0, else lx = 1 + (1 - n) * inc x; * y[ly + i * incy], where lx = 1 if incx >= 0, else lx = 1 + (1 - n) * inc x;
* ly is defined in a similar way using incy. * ly is defined in a similar way using incy.
* *
* Input * Input
skipping to change at line 1212 skipping to change at line 1285
* Error Status * Error Status
* ------------ * ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/ */
float CUBLASAPI cublasScnrm2 (int n, const cuComplex *x, int incx); float CUBLASAPI cublasScnrm2 (int n, const cuComplex *x, int incx);
/* ----------------- CUBLAS double-complex BLAS1 functions ---------------- - */ /* ----------------- CUBLAS double-complex BLAS1 functions ---------------- - */
/* /*
* void
* cublasZaxpy (int n, cuDoubleComplex alpha, const cuDoubleComplex *x, int
incx,
* cuDoubleComplex *y, int incy)
*
* multiplies double-complex vector x by double-complex scalar alpha and ad
ds
* the result to double-complex vector y; that is, it overwrites double-com
plex
* y with double-complex alpha * x + y. For i = 0 to n - 1, it replaces
* y[ly + i * incy] with alpha * x[lx + i * incx] + y[ly + i * incy], where
* lx = 0 if incx >= 0, else lx = 1 + (1 - n) * incx, and ly is defined in
a
* similar way using incy.
*
* Input
* -----
* n number of elements in input vectors
* alpha double-complex scalar multiplier
* x double-complex vector with n elements
* incx storage spacing between elements of x
* y double-complex vector with n elements
* incy storage spacing between elements of y
*
* Output
* ------
* y double-complex result (unchanged if n <= 0)
*
* Reference: http://www.netlib.org/blas/zaxpy.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasZaxpy (int n, cuDoubleComplex alpha, const cuDoubleCom
plex *x,
int incx, cuDoubleComplex *y, int incy);
/*
* cuDoubleComplex * cuDoubleComplex
* zdotu (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, int incy) * zdotu (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, int incy)
* *
* computes the dot product of two double-complex vectors. It returns the * computes the dot product of two double-complex vectors. It returns the
* dot product of the double-complex vectors x and y if successful, and dou ble-complex * dot product of the double-complex vectors x and y if successful, and dou ble-complex
* zero otherwise. It computes the sum for i = 0 to n - 1 of x[lx + i * inc x] * * zero otherwise. It computes the sum for i = 0 to n - 1 of x[lx + i * inc x] *
* y[ly + i * incy], where lx = 1 if incx >= 0, else lx = 1 + (1 - n) * inc x; * y[ly + i * incy], where lx = 1 if incx >= 0, else lx = 1 + (1 - n) * inc x;
* ly is defined in a similar way using incy. * ly is defined in a similar way using incy.
* *
* Input * Input
skipping to change at line 1240 skipping to change at line 1350
* ------ * ------
* returns double-complex dot product (zero if n <= 0) * returns double-complex dot product (zero if n <= 0)
* *
* Reference: http://www.netlib.org/blas/zdotu.f * Reference: http://www.netlib.org/blas/zdotu.f
* *
* Error status for this function can be retrieved via cublasGetError(). * Error status for this function can be retrieved via cublasGetError().
* *
* Error Status * Error Status
* ------------ * ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to execute on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to execute on GPU
*/ */
cuDoubleComplex CUBLASAPI cublasZdotu (int n, const cuDoubleComplex *x, int incx, cuDoubleComplex CUBLASAPI cublasZdotu (int n, const cuDoubleComplex *x, int incx,
const cuDoubleComplex *y, int incy); const cuDoubleComplex *y, int incy);
/* /*
* cuDoubleComplex
* cublasZdotc (int n, const cuDoubleComplex *x, int incx, const cuDoubleCo
mplex *y, int incy)
*
* computes the dot product of two double-precision complex vectors. It ret
urns the
* dot product of the double-precision complex vectors conjugate(x) and y i
f successful,
* and double-precision complex zero otherwise. It computes the
* sum for i = 0 to n - 1 of conjugate(x[lx + i * incx]) * y[ly + i * incy
],
* where lx = 1 if incx >= 0, else lx = 1 + (1 - n) * incx;
* ly is defined in a similar way using incy.
*
* Input
* -----
* n number of elements in input vectors
* x double-precision complex vector with n elements
* incx storage spacing between elements of x
* y double-precision complex vector with n elements
* incy storage spacing between elements of y
*
* Output
* ------
* returns double-complex dot product (zero if n <= 0)
*
* Reference: http://www.netlib.org/blas/zdotc.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to execute on GPU
*/
cuDoubleComplex CUBLASAPI cublasZdotc( int n, const cuDoubleComplex *x, int
incx,
const cuDoubleComplex *y, int incy )
;
/*
* void * void
* cublasZscal (int n, cuComplex alpha, cuComplex *x, int incx) * cublasZscal (int n, cuComplex alpha, cuComplex *x, int incx)
* *
* replaces double-complex vector x with double-complex alpha * x. For i * replaces double-complex vector x with double-complex alpha * x. For i
* = 0 to n - 1, it replaces x[ix + i * incx] with alpha * x[ix + i * incx] , * = 0 to n - 1, it replaces x[ix + i * incx] with alpha * x[ix + i * incx] ,
* where ix = 1 if incx >= 0, else ix = 1 + (1 - n) * incx. * where ix = 1 if incx >= 0, else ix = 1 + (1 - n) * incx.
* *
* Input * Input
* ----- * -----
* n number of elements in input vectors * n number of elements in input vectors
skipping to change at line 1275 skipping to change at line 1422
* *
* Error status for this function can be retrieved via cublasGetError(). * Error status for this function can be retrieved via cublasGetError().
* *
* Error Status * Error Status
* ------------ * ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/ */
void CUBLASAPI cublasZscal (int n, cuDoubleComplex alpha, cuDoubleComplex * x, int incx); void CUBLASAPI cublasZscal (int n, cuDoubleComplex alpha, cuDoubleComplex * x, int incx);
/*
* void
* cublasZdscal (int n, double alpha, cuDoubleComplex *x, int incx)
*
* replaces double-complex vector x with double-complex alpha * x. For i
* = 0 to n - 1, it replaces x[ix + i * incx] with alpha * x[ix + i * incx]
,
* where ix = 1 if incx >= 0, else ix = 1 + (1 - n) * incx.
*
* Input
* -----
* n number of elements in input vectors
* alpha double precision scalar multiplier
* x double-complex vector with n elements
* incx storage spacing between elements of x
*
* Output
* ------
* x double-complex result (unchanged if n <= 0 or incx <= 0)
*
* Reference: http://www.netlib.org/blas/zdscal.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasZdscal (int n, double alpha, cuDoubleComplex *x,
int incx);
/*
* double
* cublasDznrm2 (int n, const cuDoubleComplex *x, int incx)
*
* computes the Euclidean norm of the double precision complex n-vector x.
This code
* uses simple scaling to avoid intermediate underflow and overflow.
*
* Input
* -----
* n number of elements in input vector
* x double-complex vector with n elements
* incx storage spacing between elements of x
*
* Output
* ------
* returns Euclidean norm (0 if n <= 0 or incx <= 0, or if an error occurs)
*
* Reference: http://www.netlib.org/blas/dznrm2.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
double CUBLASAPI cublasDznrm2 (int n, const cuDoubleComplex *x, int incx);
/*
* void
* cublasZrotg (cuDoubleComplex *host_ca, cuDoubleComplex cb, double *host_
sc, double *host_cs)
*
* constructs the complex Givens transformation
*
* ( sc cs )
* G = ( ) , sc^2 + cabs(cs)^2 = 1,
* (-cs sc )
*
* which zeros the second entry of the complex 2-vector transpose(ca, cb).
*
* The quantity ca/cabs(ca)*norm(ca,cb) overwrites ca in storage. The
* function crot (n, x, incx, y, incy, sc, cs) is normally called next
* to apply the transformation to a 2 x n matrix.
* Note that this function is provided for completeness and runs exclusively
* on the Host.
*
* Input
* -----
* ca double-precision complex precision scalar
* cb double-precision complex scalar
*
* Output
* ------
* ca double-precision complex ca/cabs(ca)*norm(ca,cb)
* sc double-precision cosine component of rotation matrix
* cs double-precision complex sine component of rotation matrix
*
* Reference: http://www.netlib.org/blas/zrotg.f
*
* This function does not set any error status.
*/
void CUBLASAPI cublasZrotg (cuDoubleComplex *host_ca, cuDoubleComplex cb, d
ouble *host_sc,
cuDoubleComplex *host_cs);
/*
* cublasZrot (int n, cuDoubleComplex *x, int incx, cuDoubleComplex *y, int
incy, double sc,
* cuDoubleComplex cs)
*
* multiplies a 2x2 matrix ( sc cs) with the 2xn matrix ( transpose(x
) )
* (-conj(cs) sc) ( transpose(y
) )
*
* The elements of x are in x[lx + i * incx], i = 0 ... n - 1, where lx = 1
if
* incx >= 0, else lx = 1 + (1 - n) * incx, and similarly for y using ly an
d
* incy.
*
* Input
* -----
* n number of elements in input vectors
* x double-precision complex vector with n elements
* incx storage spacing between elements of x
* y double-precision complex vector with n elements
* incy storage spacing between elements of y
* sc double-precision cosine component of rotation matrix
* cs double-precision complex sine component of rotation matrix
*
* Output
* ------
* x rotated double-precision complex vector x (unchanged if n <= 0)
* y rotated double-precision complex vector y (unchanged if n <= 0)
*
* Reference: http://netlib.org/lapack/explore-html/zrot.f.html
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasZrot (int n, cuDoubleComplex *x, int incx,
cuDoubleComplex *y, int incy, double sc,
cuDoubleComplex cs);
/*
* void
* zdrot (int n, cuDoubleComplex *x, int incx, cuDoubleComplex *y, int incy, doub
le c,
* double s)
*
* multiplies a 2x2 matrix ( c s) with the 2xn matrix ( transpose(x) )
* (-s c) ( transpose(y) )
*
* The elements of x are in x[lx + i * incx], i = 0 ... n - 1, where lx = 1
if
* incx >= 0, else lx = 1 + (1 - n) * incx, and similarly for y using ly an
d
* incy.
*
* Input
* -----
* n number of elements in input vectors
* x double-precision complex vector with n elements
* incx storage spacing between elements of x
* y double-precision complex vector with n elements
* incy storage spacing between elements of y
* c cosine component of rotation matrix
* s sine component of rotation matrix
*
* Output
* ------
* x rotated vector x (unchanged if n <= 0)
* y rotated vector y (unchanged if n <= 0)
*
* Reference http://www.netlib.org/blas/zdrot.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasZdrot (int n, cuDoubleComplex *x, int incx,
cuDoubleComplex *y, int incy, double c, double
s);
/*
* int
* cublasIzamax (int n, const double *x, int incx)
*
* finds the smallest index of the element having maximum absolute value
* in double-complex vector x; that is, the result is the first i, i = 0
* to n - 1 that maximizes abs(real(x[1+i*incx]))+abs(imag(x[1 + i * incx])
).
*
* Input
* -----
* n number of elements in input vector
* x double-complex vector with n elements
* incx storage spacing between elements of x
*
* Output
* ------
* returns the smallest index (0 if n <= 0 or incx <= 0)
*
* Reference: http://www.netlib.org/blas/izamax.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
int CUBLASAPI cublasIzamax (int n, const cuDoubleComplex *x, int incx);
/*
* int
* cublasIzamin (int n, const cuDoubleComplex *x, int incx)
*
* finds the smallest index of the element having minimum absolute value
* in double-complex vector x; that is, the result is the first i, i = 0
* to n - 1 that minimizes abs(real(x[1+i*incx]))+abs(imag(x[1 + i * incx])
).
*
* Input
* -----
* n number of elements in input vector
* x double-complex vector with n elements
* incx storage spacing between elements of x
*
* Output
* ------
* returns the smallest index (0 if n <= 0 or incx <= 0)
*
* Reference: Analogous to IZAMAX, see there.
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
int CUBLASAPI cublasIzamin (int n, const cuDoubleComplex *x, int incx);
/*
* double
* cublasDzasum (int n, const cuDoubleComplex *x, int incx)
*
* takes the sum of the absolute values of a complex vector and returns a
* double precision result. Note that this is not the L1 norm of the vector
.
* The result is the sum from 0 to n-1 of abs(real(x[ix+i*incx])) +
* abs(imag(x(ix+i*incx))), where ix = 1 if incx >= 0, else ix = 1+(1-n)*in
cx.
*
* Input
* -----
* n number of elements in input vector
* x double-complex vector with n elements
* incx storage spacing between elements of x
*
* Output
* ------
* returns the double precision sum of absolute values of real and imaginar
y
* parts (0 if n <= 0 or incx <= 0, or if an error occurs)
*
* Reference: http://www.netlib.org/blas/dzasum.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
double CUBLASAPI cublasDzasum (int n, const cuDoubleComplex *x, int incx);
/* --------------- CUBLAS single precision BLAS2 functions --------------- - */ /* --------------- CUBLAS single precision BLAS2 functions --------------- - */
/* /*
* void * void
* cublasSgbmv (char trans, int m, int n, int kl, int ku, float alpha, * cublasSgbmv (char trans, int m, int n, int kl, int ku, float alpha,
* const float *A, int lda, const float *x, int incx, float be ta, * const float *A, int lda, const float *x, int incx, float be ta,
* float *y, int incy) * float *y, int incy)
* *
* performs one of the matrix-vector operations * performs one of the matrix-vector operations
* *
skipping to change at line 1893 skipping to change at line 2309
* ------ * ------
* x updated according to x = op(A) * x * x updated according to x = op(A) * x
* *
* Reference: http://www.netlib.org/blas/stbmv.f * Reference: http://www.netlib.org/blas/stbmv.f
* *
* Error status for this function can be retrieved via cublasGetError(). * Error status for this function can be retrieved via cublasGetError().
* *
* Error Status * Error Status
* ------------ * ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d
* CUBLAS_STATUS_INVALID_VALUE if n < 0, n > 4070, k < 0, or incx == 0 * CUBLAS_STATUS_INVALID_VALUE if n < 0, k < 0, or incx == 0
* CUBLAS_STATUS_ALLOC_FAILED if function cannot allocate enough intern
al scratch vector memory
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/ */
void CUBLASAPI cublasStbmv (char uplo, char trans, char diag, int n, int k, void CUBLASAPI cublasStbmv (char uplo, char trans, char diag, int n, int k,
const float *A, int lda, float *x, int incx); const float *A, int lda, float *x, int incx);
/* /*
* void cublasStbsv (char uplo, char trans, char diag, int n, int k, * void cublasStbsv (char uplo, char trans, char diag, int n, int k,
* const float *A, int lda, float *X, int incx) * const float *A, int lda, float *X, int incx)
* *
* solves one of the systems of equations op(A)*x = b, where op(A) is eithe r * solves one of the systems of equations op(A)*x = b, where op(A) is eithe r
skipping to change at line 1954 skipping to change at line 2371
* ------ * ------
* x updated to contain the solution vector x that solves op(A) * x = b. * x updated to contain the solution vector x that solves op(A) * x = b.
* *
* Reference: http://www.netlib.org/blas/stbsv.f * Reference: http://www.netlib.org/blas/stbsv.f
* *
* Error status for this function can be retrieved via cublasGetError(). * Error status for this function can be retrieved via cublasGetError().
* *
* Error Status * Error Status
* ------------ * ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d
* CUBLAS_STATUS_INVALID_VALUE if incx == 0, n < 0, or n > 4070 * CUBLAS_STATUS_INVALID_VALUE if incx == 0, n < 0 or n > 4070
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/ */
void CUBLASAPI cublasStbsv (char uplo, char trans, char diag, int n, int k, void CUBLASAPI cublasStbsv (char uplo, char trans, char diag, int n, int k,
const float *A, int lda, float *x, int incx); const float *A, int lda, float *x, int incx);
/* /*
* void * void
* cublasStpmv (char uplo, char trans, char diag, int n, const float *AP, * cublasStpmv (char uplo, char trans, char diag, int n, const float *AP,
* float *x, int incx); * float *x, int incx);
* *
skipping to change at line 2007 skipping to change at line 2424
* x updated according to x = op(A) * x, * x updated according to x = op(A) * x,
* *
* Reference: http://www.netlib.org/blas/stpmv.f * Reference: http://www.netlib.org/blas/stpmv.f
* *
* Error status for this function can be retrieved via cublasGetError(). * Error status for this function can be retrieved via cublasGetError().
* *
* Error Status * Error Status
* ------------ * ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d
* CUBLAS_STATUS_INVALID_VALUE if incx == 0 or if n < 0 * CUBLAS_STATUS_INVALID_VALUE if incx == 0 or if n < 0
* CUBLAS_STATUS_ALLOC_FAILED if function cannot allocate enough intern al scratch vector memory
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/ */
void CUBLASAPI cublasStpmv (char uplo, char trans, char diag, int n, void CUBLASAPI cublasStpmv (char uplo, char trans, char diag, int n,
const float *AP, float *x, int incx); const float *AP, float *x, int incx);
/* /*
* void * void
* cublasStpsv (char uplo, char trans, char diag, int n, const float *AP, * cublasStpsv (char uplo, char trans, char diag, int n, const float *AP,
* float *X, int incx) * float *X, int incx)
* *
skipping to change at line 2129 skipping to change at line 2547
/* /*
* void * void
* cublasStrsv (char uplo, char trans, char diag, int n, const float *A, * cublasStrsv (char uplo, char trans, char diag, int n, const float *A,
* int lda, float *x, int incx) * int lda, float *x, int incx)
* *
* solves a system of equations op(A) * x = b, where op(A) is either A or * solves a system of equations op(A) * x = b, where op(A) is either A or
* transpose(A). b and x are single precision vectors consisting of n * transpose(A). b and x are single precision vectors consisting of n
* elements, and A is an n x n matrix composed of a unit or non-unit, upper * elements, and A is an n x n matrix composed of a unit or non-unit, upper
* or lower triangular matrix. Matrix A is stored in column major format, * or lower triangular matrix. Matrix A is stored in column major format,
* and lda is the leading dimension of the two-diemnsional array containing * and lda is the leading dimension of the two-dimensional array containing
* A. * A.
* *
* No test for singularity or near-singularity is included in this function . * No test for singularity or near-singularity is included in this function .
* Such tests must be performed before calling this function. * Such tests must be performed before calling this function.
* *
* Input * Input
* ----- * -----
* uplo specifies whether the matrix data is stored in the upper or the * uplo specifies whether the matrix data is stored in the upper or the
* lower triangular part of array A. If uplo = 'U' or 'u', then only * lower triangular part of array A. If uplo = 'U' or 'u', then only
* the upper triangular part of A may be referenced. If uplo = 'L' o r * the upper triangular part of A may be referenced. If uplo = 'L' o r
* 'l', then only the lower triangular part of A may be referenced. * 'l', then only the lower triangular part of A may be referenced.
* trans specifies op(A). If transa = 'n' or 'N', op(A) = A. If transa = ' t', * trans specifies op(A). If transa = 'n' or 'N', op(A) = A. If transa = ' t',
* 'T', 'c', or 'C', op(A) = transpose(A) * 'T', 'c', or 'C', op(A) = transpose(A)
* diag specifies whether or not A is a unit triangular matrix like so: * diag specifies whether or not A is a unit triangular matrix like so:
* if diag = 'U' or 'u', A is assumed to be unit triangular. If * if diag = 'U' or 'u', A is assumed to be unit triangular. If
* diag = 'N' or 'n', then A is not assumed to be unit triangular. * diag = 'N' or 'n', then A is not assumed to be unit triangular.
* n specifies the number of rows and columns of the matrix A. It * n specifies the number of rows and columns of the matrix A. It
* must be at least 0. In the current implementation n must be <= * must be at least 0.
* 4070.
* A is a single precision array of dimensions (lda, n). If uplo = 'U' * A is a single precision array of dimensions (lda, n). If uplo = 'U'
* or 'u', then A must contains the upper triangular part of a symme tric * or 'u', then A must contains the upper triangular part of a symme tric
* matrix, and the strictly lower triangular parts is not referenced . * matrix, and the strictly lower triangular parts is not referenced .
* If uplo = 'L' or 'l', then A contains the lower triangular part o f * If uplo = 'L' or 'l', then A contains the lower triangular part o f
* a symmetric matrix, and the strictly upper triangular part is not * a symmetric matrix, and the strictly upper triangular part is not
* referenced. * referenced.
* lda is the leading dimension of the two-dimensional array containing A. * lda is the leading dimension of the two-dimensional array containing A.
* lda must be at least max(1, n). * lda must be at least max(1, n).
* x single precision array of length at least (1 + (n - 1) * abs(incx )). * x single precision array of length at least (1 + (n - 1) * abs(incx )).
* On entry, x contains the n element right-hand side vector b. On e xit, * On entry, x contains the n element right-hand side vector b. On e xit,
skipping to change at line 2174 skipping to change at line 2591
* ------ * ------
* x updated to contain the solution vector x that solves op(A) * x = b. * x updated to contain the solution vector x that solves op(A) * x = b.
* *
* Reference: http://www.netlib.org/blas/strsv.f * Reference: http://www.netlib.org/blas/strsv.f
* *
* Error status for this function can be retrieved via cublasGetError(). * Error status for this function can be retrieved via cublasGetError().
* *
* Error Status * Error Status
* ------------ * ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d
* CUBLAS_STATUS_INVALID_VALUE if incx == 0 or if n < 0 or n > 4070 * CUBLAS_STATUS_INVALID_VALUE if incx == 0 or if n < 0
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/ */
/* Single precision triangular solve op(A)*x = b; x is overwritten with the solution. */
void CUBLASAPI cublasStrsv (char uplo, char trans, char diag, int n,
                            const float *A, int lda, float *x, int incx);
/* ----------------- CUBLAS double complex BLAS2 functions ----------------
- */
/*
* void
* cublasZtrmv (char uplo, char trans, char diag, int n, const cuDoubleComp
lex *A,
* int lda, cuDoubleComplex *x, int incx);
*
* performs one of the matrix-vector operations x = op(A) * x,
* where op(A) = A, or op(A) = transpose(A) or op(A) = conjugate(transpose(
A)).
* x is an n-element double precision complex vector, and
* A is an n x n, unit or non-unit, upper or lower, triangular matrix compo
sed
* of double precision complex elements.
*
* Input
* -----
* uplo specifies whether the matrix A is an upper or lower triangular
* matrix. If uplo = 'U' or 'u', then A is an upper triangular matri
x.
* If uplo = 'L' or 'l', then A is a lower triangular matrix.
* trans specifies op(A). If trans = 'n' or 'N', op(A) = A. If trans = 't'
or
* 'T', op(A) = transpose(A). If trans = 'c' or 'C', op(A) =
* conjugate(transpose(A)).
* diag specifies whether or not matrix A is unit triangular. If diag = '
U'
* or 'u', A is assumed to be unit triangular. If diag = 'N' or 'n',
A
* is not assumed to be unit triangular.
* n specifies the number of rows and columns of the matrix A. n must
be
* at least zero.
* A double precision array of dimension (lda, n). If uplo = 'U' or 'u
',
* the leading n x n upper triangular part of the array A must conta
in
* the upper triangular matrix and the strictly lower triangular par
t
* of A is not referenced. If uplo = 'L' or 'l', the leading n x n l
ower
* triangular part of the array A must contain the lower triangular
* matrix and the strictly upper triangular part of A is not referen
ced.
* When diag = 'U' or 'u', the diagonal elements of A are not refere
nced
 * either, but are assumed to be unity.
* lda is the leading dimension of A. It must be at least max (1, n).
* x double precision array of length at least (1 + (n - 1) * abs(incx
) ).
* On entry, x contains the source vector. On exit, x is overwritten
* with the result vector.
* incx specifies the storage spacing for elements of x. incx must not be
* zero.
*
* Output
* ------
* x updated according to x = op(A) * x,
*
* Reference: http://www.netlib.org/blas/ztrmv.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if incx == 0 or if n < 0
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* Double precision complex triangular matrix-vector multiply: x = op(A) * x. */
void CUBLASAPI cublasZtrmv (char uplo, char trans, char diag, int n,
                            const cuDoubleComplex *A, int lda,
                            cuDoubleComplex *x, int incx);
/*
* void
* cublasZgbmv (char trans, int m, int n, int kl, int ku, cuDoubleComplex a
lpha,
* const cuDoubleComplex *A, int lda, const cuDoubleComplex *x
, int incx, cuDoubleComplex beta,
* cuDoubleComplex *y, int incy);
*
* performs one of the matrix-vector operations
*
* y = alpha*op(A)*x + beta*y, op(A)=A or op(A) = transpose(A)
*
* alpha and beta are double precision complex scalars. x and y are double
precision
* complex vectors. A is an m by n band matrix consisting of double precisi
on complex elements
* with kl sub-diagonals and ku super-diagonals.
*
* Input
* -----
* trans specifies op(A). If trans == 'N' or 'n', op(A) = A. If trans == '
T',
* or 't', op(A) = transpose(A). If trans == 'C' or 'c',
* op(A) = conjugate(transpose(A)).
* m specifies the number of rows of the matrix A. m must be at least
* zero.
* n specifies the number of columns of the matrix A. n must be at lea
st
* zero.
* kl specifies the number of sub-diagonals of matrix A. It must be at
* least zero.
* ku specifies the number of super-diagonals of matrix A. It must be a
t
* least zero.
* alpha double precision complex scalar multiplier applied to op(A).
* A double precision complex array of dimensions (lda, n). The leadin
g
* (kl + ku + 1) x n part of the array A must contain the band matri
x A,
* supplied column by column, with the leading diagonal of the matri
x
* in row (ku + 1) of the array, the first super-diagonal starting a
t
* position 2 in row ku, the first sub-diagonal starting at position
1
* in row (ku + 2), and so on. Elements in the array A that do not
* correspond to elements in the band matrix (such as the top left
* ku x ku triangle) are not referenced.
* lda leading dimension of A. lda must be at least (kl + ku + 1).
* x double precision complex array of length at least (1+(n-1)*abs(in
cx)) when
* trans == 'N' or 'n' and at least (1+(m-1)*abs(incx)) otherwise.
* incx specifies the increment for the elements of x. incx must not be z
ero.
* beta double precision complex scalar multiplier applied to vector y. I
f beta is
* zero, y is not read.
* y double precision complex array of length at least (1+(m-1)*abs(in
cy)) when
* trans == 'N' or 'n' and at least (1+(n-1)*abs(incy)) otherwise. I
f
* beta is zero, y is not read.
* incy On entry, incy specifies the increment for the elements of y. inc
y
* must not be zero.
*
* Output
* ------
* y updated according to y = alpha*op(A)*x + beta*y
*
* Reference: http://www.netlib.org/blas/zgbmv.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0, or if incx or incy == 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* Double precision complex banded matrix-vector multiply: y = alpha*op(A)*x + beta*y. */
void CUBLASAPI cublasZgbmv (char trans, int m, int n, int kl, int ku,
                            cuDoubleComplex alpha, const cuDoubleComplex *A,
                            int lda, const cuDoubleComplex *x, int incx,
                            cuDoubleComplex beta, cuDoubleComplex *y, int incy);
/*
* void
* cublasZtbmv (char uplo, char trans, char diag, int n, int k, const cuDou
bleComplex *A,
* int lda, cuDoubleComplex *x, int incx)
*
* performs one of the matrix-vector operations x = op(A) * x, where op(A)
= A,
* op(A) = transpose(A) or op(A) = conjugate(transpose(A)). x is an n-eleme
nt
* double precision complex vector, and A is an n x n, unit or non-unit, up
per
* or lower triangular band matrix composed of double precision complex ele
ments.
*
* Input
* -----
* uplo specifies whether the matrix A is an upper or lower triangular ba
nd
* matrix. If uplo == 'U' or 'u', A is an upper triangular band matr
ix.
* If uplo == 'L' or 'l', A is a lower triangular band matrix.
* trans specifies op(A). If transa == 'N' or 'n', op(A) = A. If trans ==
'T',
* or 't', op(A) = transpose(A). If trans == 'C' or 'c',
* op(A) = conjugate(transpose(A)).
* diag specifies whether or not matrix A is unit triangular. If diag ==
'U'
* or 'u', A is assumed to be unit triangular. If diag == 'N' or 'n'
, A
* is not assumed to be unit triangular.
* n specifies the number of rows and columns of the matrix A. n must
be
* at least zero.
* k specifies the number of super- or sub-diagonals. If uplo == 'U' o
r
* 'u', k specifies the number of super-diagonals. If uplo == 'L' or
* 'l', k specifies the number of sub-diagonals. k must at least be
* zero.
* A double precision complex array of dimension (lda, n). If uplo ==
'U' or 'u',
* the leading (k + 1) x n part of the array A must contain the uppe
r
* triangular band matrix, supplied column by column, with the leadi
ng
* diagonal of the matrix in row (k + 1) of the array, the first
* super-diagonal starting at position 2 in row k, and so on. The to
p
* left k x k triangle of the array A is not referenced. If uplo ==
'L'
 * or 'l', the leading (k + 1) x n part of the array A must contain
the
* lower triangular band matrix, supplied column by column, with the
* leading diagonal of the matrix in row 1 of the array, the first
 * sub-diagonal starting at position 1 in row 2, and so on. The botto
m
* right k x k triangle of the array is not referenced.
* lda is the leading dimension of A. It must be at least (k + 1).
* x double precision complex array of length at least (1 + (n - 1) *
abs(incx)).
* On entry, x contains the source vector. On exit, x is overwritten
* with the result vector.
* incx specifies the storage spacing for elements of x. incx must not be
* zero.
*
* Output
* ------
* x updated according to x = op(A) * x
*
* Reference: http://www.netlib.org/blas/ztbmv.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n or k < 0, or if incx == 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* Double precision complex triangular band matrix-vector multiply: x = op(A) * x. */
void CUBLASAPI cublasZtbmv (char uplo, char trans, char diag, int n, int k,
                            const cuDoubleComplex *A, int lda,
                            cuDoubleComplex *x, int incx);
/*
* void cublasZtbsv (char uplo, char trans, char diag, int n, int k,
* const cuDoubleComplex *A, int lda, cuDoubleComplex *X,
int incx)
*
* solves one of the systems of equations op(A)*x = b, where op(A) is eithe
r
* op(A) = A , op(A) = transpose(A) or op(A) = conjugate(transpose(A)).
* b and x are n element vectors, and A is an n x n unit or non-unit,
* upper or lower triangular band matrix with k + 1 diagonals. No test
* for singularity or near-singularity is included in this function.
* Such tests must be performed before calling this function.
*
* Input
* -----
* uplo specifies whether the matrix is an upper or lower triangular band
* matrix as follows: If uplo == 'U' or 'u', A is an upper triangula
r
* band matrix. If uplo == 'L' or 'l', A is a lower triangular band
* matrix.
* trans specifies op(A). If trans == 'N' or 'n', op(A) = A. If trans == '
T',
* 't', op(A) = transpose(A). If trans == 'C' or 'c',
* op(A) = conjugate(transpose(A)).
* diag specifies whether A is unit triangular. If diag == 'U' or 'u', A
is
 * assumed to be unit triangular; that is, diagonal elements are not
* read and are assumed to be unity. If diag == 'N' or 'n', A is not
* assumed to be unit triangular.
* n specifies the number of rows and columns of the matrix A. n must
be
* at least zero.
* k specifies the number of super- or sub-diagonals. If uplo == 'U' o
r
* 'u', k specifies the number of super-diagonals. If uplo == 'L' or
* 'l', k specifies the number of sub-diagonals. k must at least be
* zero.
* A double precision complex array of dimension (lda, n). If uplo ==
'U' or 'u',
* the leading (k + 1) x n part of the array A must contain the uppe
r
* triangular band matrix, supplied column by column, with the leadi
ng
* diagonal of the matrix in row (k + 1) of the array, the first sup
er-
* diagonal starting at position 2 in row k, and so on. The top left
* k x k triangle of the array A is not referenced. If uplo == 'L' o
r
 * 'l', the leading (k + 1) x n part of the array A must contain th
e
* lower triangular band matrix, supplied column by column, with the
* leading diagonal of the matrix in row 1 of the array, the first
* sub-diagonal starting at position 1 in row 2, and so on. The bott
om
* right k x k triangle of the array is not referenced.
* x double precision complex array of length at least (1+(n-1)*abs(in
cx)).
* incx storage spacing between elements of x. It must not be zero.
*
* Output
* ------
* x updated to contain the solution vector x that solves op(A) * x =
b.
*
* Reference: http://www.netlib.org/blas/ztbsv.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if incx == 0, n < 0 or n > 1016
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* Double precision complex triangular band solve op(A)*x = b; no singularity test. */
void CUBLASAPI cublasZtbsv (char uplo, char trans, char diag, int n, int k,
                            const cuDoubleComplex *A, int lda,
                            cuDoubleComplex *x, int incx);
/*
* void
* cublasZhemv (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComp
lex *A, int lda,
* const cuDoubleComplex *x, int incx, cuDoubleComplex beta, c
uDoubleComplex *y, int incy)
*
* performs the matrix-vector operation
*
* y = alpha*A*x + beta*y
*
* Alpha and beta are double precision complex scalars, and x and y are dou
ble
* precision complex vectors, each with n elements. A is a hermitian n x n
matrix
* consisting of double precision complex elements that is stored in either
upper or
* lower storage mode.
*
* Input
* -----
* uplo specifies whether the upper or lower triangular part of the array
A
* is to be referenced. If uplo == 'U' or 'u', the hermitian matrix
A
* is stored in upper storage mode, i.e. only the upper triangular p
art
* of A is to be referenced while the lower triangular part of A is
to
* be inferred. If uplo == 'L' or 'l', the hermitian matrix A is sto
red
* in lower storage mode, i.e. only the lower triangular part of A i
s
* to be referenced while the upper triangular part of A is to be
* inferred.
* n specifies the number of rows and the number of columns of the
* hermitian matrix A. n must be at least zero.
* alpha double precision complex scalar multiplier applied to A*x.
* A double precision complex array of dimensions (lda, n). If uplo ==
'U' or 'u',
* the leading n x n upper triangular part of the array A must conta
in
* the upper triangular part of the hermitian matrix and the strictl
y
* lower triangular part of A is not referenced. If uplo == 'L' or '
l',
* the leading n x n lower triangular part of the array A must conta
in
* the lower triangular part of the hermitian matrix and the strictl
y
* upper triangular part of A is not referenced. The imaginary parts
* of the diagonal elements need not be set, they are assumed to be
zero.
* lda leading dimension of A. It must be at least max (1, n).
* x double precision complex array of length at least (1 + (n - 1) *
abs(incx)).
* incx storage spacing between elements of x. incx must not be zero.
* beta double precision complex scalar multiplier applied to vector y.
* y double precision complex array of length at least (1 + (n - 1) *
abs(incy)).
* If beta is zero, y is not read.
* incy storage spacing between elements of y. incy must not be zero.
*
* Output
* ------
* y updated according to y = alpha*A*x + beta*y
*
* Reference: http://www.netlib.org/blas/zhemv.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0, or if incx or incy == 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* Double precision complex Hermitian matrix-vector multiply: y = alpha*A*x + beta*y. */
void CUBLASAPI cublasZhemv (char uplo, int n, cuDoubleComplex alpha,
                            const cuDoubleComplex *A, int lda,
                            const cuDoubleComplex *x, int incx,
                            cuDoubleComplex beta, cuDoubleComplex *y, int incy);
/*
* void
* cublasZhpmv (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComp
lex *AP, const cuDoubleComplex *x,
* int incx, cuDoubleComplex beta, cuDoubleComplex *y, int inc
y)
*
* performs the matrix-vector operation
*
* y = alpha * A * x + beta * y
*
* Alpha and beta are double precision complex scalars, and x and y are dou
ble
* precision complex vectors with n elements. A is an hermitian n x n matri
x
* consisting of double precision complex elements that is supplied in pack
ed form.
*
* Input
* -----
* uplo specifies whether the matrix data is stored in the upper or the l
ower
* triangular part of array AP. If uplo == 'U' or 'u', then the uppe
r
* triangular part of A is supplied in AP. If uplo == 'L' or 'l', th
en
* the lower triangular part of A is supplied in AP.
* n specifies the number of rows and columns of the matrix A. It must
be
* at least zero.
* alpha double precision complex scalar multiplier applied to A*x.
* AP double precision complex array with at least ((n * (n + 1)) / 2)
elements. If
* uplo == 'U' or 'u', the array AP contains the upper triangular pa
rt
* of the hermitian matrix A, packed sequentially, column by column;
 * that is, if i <= j, then A[i,j] is stored in AP[i+(j*(j+1)/2)]. I
f
* uplo == 'L' or 'L', the array AP contains the lower triangular pa
rt
* of the hermitian matrix A, packed sequentially, column by column;
* that is, if i >= j, then A[i,j] is stored in AP[i+((2*n-j+1)*j)/2
].
* The imaginary parts of the diagonal elements need not be set, the
y
* are assumed to be zero.
* x double precision complex array of length at least (1 + (n - 1) *
abs(incx)).
* incx storage spacing between elements of x. incx must not be zero.
* beta double precision complex scalar multiplier applied to vector y;
* y double precision array of length at least (1 + (n - 1) * abs(incy
)).
* If beta is zero, y is not read.
* incy storage spacing between elements of y. incy must not be zero.
*
* Output
* ------
* y updated according to y = alpha*A*x + beta*y
*
* Reference: http://www.netlib.org/blas/zhpmv.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0, or if incx or incy == 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* Double precision complex packed Hermitian matrix-vector multiply: y = alpha*A*x + beta*y. */
void CUBLASAPI cublasZhpmv (char uplo, int n, cuDoubleComplex alpha,
                            const cuDoubleComplex *AP,
                            const cuDoubleComplex *x, int incx,
                            cuDoubleComplex beta, cuDoubleComplex *y, int incy);
/* ----------------- CUBLAS double complex BLAS3 functions ---------------- - */ /* ----------------- CUBLAS double complex BLAS3 functions ---------------- - */
/* /*
* cublasZgemv (char trans, int m, int n, cuDoubleComplex alpha, const cuDo ubleComplex *A, int lda, * cublasZgemv (char trans, int m, int n, cuDoubleComplex alpha, const cuDo ubleComplex *A, int lda,
* const cuDoubleComplex *x, int incx, cuDoubleComplex beta, c uDoubleComplex *y, int incy) * const cuDoubleComplex *x, int incx, cuDoubleComplex beta, c uDoubleComplex *y, int incy)
* *
* performs one of the matrix-vector operations * performs one of the matrix-vector operations
* *
* y = alpha * op(A) * x + beta * y, * y = alpha * op(A) * x + beta * y,
* *
skipping to change at line 2243 skipping to change at line 3033
* Error Status * Error Status
* ------------ * ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d
* CUBLAS_STATUS_INVALID_VALUE if m or n are < 0, or if incx or incy == 0 * CUBLAS_STATUS_INVALID_VALUE if m or n are < 0, or if incx or incy == 0
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/ */
/* Double precision complex general matrix-vector multiply: y = alpha*op(A)*x + beta*y. */
void CUBLASAPI cublasZgemv (char trans, int m, int n, cuDoubleComplex alpha,
                            const cuDoubleComplex *A, int lda,
                            const cuDoubleComplex *x, int incx,
                            cuDoubleComplex beta, cuDoubleComplex *y, int incy);
/*
* void
* cublasZtpmv (char uplo, char trans, char diag, int n, const cuDoubleComp
lex *AP,
* cuDoubleComplex *x, int incx);
*
* performs one of the matrix-vector operations x = op(A) * x, where op(A)
= A,
* op(A) = transpose(A) or op(A) = conjugate(transpose(A)) . x is an n elem
ent
* double precision complex vector, and A is an n x n, unit or non-unit, up
per
* or lower triangular matrix composed of double precision complex elements
.
*
* Input
* -----
* uplo specifies whether the matrix A is an upper or lower triangular
* matrix. If uplo == 'U' or 'u', then A is an upper triangular matr
ix.
* If uplo == 'L' or 'l', then A is a lower triangular matrix.
* trans specifies op(A). If transa == 'N' or 'n', op(A) = A. If trans ==
'T',
* or 't', op(A) = transpose(A). If trans == 'C' or 'c',
* op(A) = conjugate(transpose(A)).
*
* diag specifies whether or not matrix A is unit triangular. If diag ==
'U'
* or 'u', A is assumed to be unit triangular. If diag == 'N' or 'n'
, A
* is not assumed to be unit triangular.
* n specifies the number of rows and columns of the matrix A. n must
be
* at least zero. In the current implementation n must not exceed 40
70.
* AP double precision complex array with at least ((n * (n + 1)) / 2)
elements. If
* uplo == 'U' or 'u', the array AP contains the upper triangular pa
rt
* of the symmetric matrix A, packed sequentially, column by column;
* that is, if i <= j, then A[i,j] is stored in AP[i+(j*(j+1)/2)]. I
f
* uplo == 'L' or 'L', the array AP contains the lower triangular pa
rt
* of the symmetric matrix A, packed sequentially, column by column;
* that is, if i >= j, then A[i,j] is stored in AP[i+((2*n-j+1)*j)/2
].
* x double precision complex array of length at least (1 + (n - 1) *
abs(incx)).
* On entry, x contains the source vector. On exit, x is overwritten
* with the result vector.
* incx specifies the storage spacing for elements of x. incx must not be
* zero.
*
* Output
* ------
* x updated according to x = op(A) * x,
*
* Reference: http://www.netlib.org/blas/ztpmv.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if incx == 0 or n < 0
* CUBLAS_STATUS_ALLOC_FAILED if function cannot allocate enough intern
al scratch vector memory
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* Double precision complex packed triangular matrix-vector multiply: x = op(A) * x. */
void CUBLASAPI cublasZtpmv (char uplo, char trans, char diag, int n,
                            const cuDoubleComplex *AP,
                            cuDoubleComplex *x, int incx);
/*
* void
* cublasZtpsv (char uplo, char trans, char diag, int n, const cuDoubleComp
lex *AP,
* cuDoubleComplex *X, int incx)
*
* solves one of the systems of equations op(A)*x = b, where op(A) is eithe
r
 * op(A) = A , op(A) = transpose(A) or op(A) = conjugate(transpose(A)). b and
* x are n element complex vectors, and A is an n x n unit or non-unit,
* upper or lower triangular matrix. No test for singularity or near-singul
arity
* is included in this routine. Such tests must be performed before calling
this routine.
*
* Input
* -----
* uplo specifies whether the matrix is an upper or lower triangular matr
ix
 * as follows: If uplo == 'U' or 'u', A is an upper triangular matri
x.
* If uplo == 'L' or 'l', A is a lower triangular matrix.
* trans specifies op(A). If trans == 'N' or 'n', op(A) = A. If trans == '
T'
* or 't', op(A) = transpose(A). If trans == 'C' or 'c', op(A) =
* conjugate(transpose(A)).
* diag specifies whether A is unit triangular. If diag == 'U' or 'u', A
is
 * assumed to be unit triangular; that is, diagonal elements are not
* read and are assumed to be unity. If diag == 'N' or 'n', A is not
* assumed to be unit triangular.
* n specifies the number of rows and columns of the matrix A. n must
be
* at least zero.
* AP double precision complex array with at least ((n*(n+1))/2) elemen
ts.
* If uplo == 'U' or 'u', the array AP contains the upper triangular
* matrix A, packed sequentially, column by column; that is, if i <=
j, then
 * A[i,j] is stored in AP[i+(j*(j+1)/2)]. If uplo == 'L' or 'L', the
* array AP contains the lower triangular matrix A, packed sequentia
lly,
* column by column; that is, if i >= j, then A[i,j] is stored in
* AP[i+((2*n-j+1)*j)/2]. When diag = 'U' or 'u', the diagonal eleme
nts
* of A are not referenced and are assumed to be unity.
* x double precision complex array of length at least (1+(n-1)*abs(in
cx)).
* incx storage spacing between elements of x. It must not be zero.
*
* Output
* ------
* x updated to contain the solution vector x that solves op(A) * x =
b.
*
* Reference: http://www.netlib.org/blas/ztpsv.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if incx == 0 or if n < 0 or n > 2035
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* Double precision complex packed triangular solve op(A)*x = b; no singularity test. */
void CUBLASAPI cublasZtpsv (char uplo, char trans, char diag, int n,
                            const cuDoubleComplex *AP,
                            cuDoubleComplex *x, int incx);
/* ----------------- CUBLAS single complex BLAS2 functions ---------------- - */ /* ----------------- CUBLAS single complex BLAS2 functions ---------------- - */
/*
* cublasCgemv (char trans, int m, int n, cuComplex alpha, const cuComplex
*A,
* int lda, const cuComplex *x, int incx, cuComplex beta, cuCo
mplex *y,
* int incy)
*
* performs one of the matrix-vector operations
*
* y = alpha * op(A) * x + beta * y,
*
* where op(A) is one of
*
* op(A) = A or op(A) = transpose(A) or op(A) = conjugate(transpose(
A))
*
 * where alpha and beta are single precision complex scalars, x and y are single
 * precision complex vectors, and A is an m x n matrix consisting of single
 * precision complex elements. Matrix A is stored in column major format, and lda is the lead
ing
* dimension of the two-dimensional array in which A is stored.
*
* Input
* -----
 * trans  specifies op(A). If trans = 'n' or 'N', op(A) = A. If
 *        trans = 't' or 'T', op(A) = transpose(A). If trans = 'c' or 'C',
* op(A) = conjugate(transpose(A))
* m specifies the number of rows of the matrix A. m must be at least
* zero.
* n specifies the number of columns of the matrix A. n must be at lea
st
* zero.
* alpha single precision scalar multiplier applied to op(A).
* A single precision array of dimensions (lda, n) if trans = 'n' or
* 'N'), and of dimensions (lda, m) otherwise. lda must be at least
* max(1, m) and at least max(1, n) otherwise.
* lda leading dimension of two-dimensional array used to store matrix A
* x single precision array of length at least (1 + (n - 1) * abs(incx
))
* when trans = 'N' or 'n' and at least (1 + (m - 1) * abs(incx))
* otherwise.
* incx specifies the storage spacing between elements of x. incx must no
t
* be zero.
* beta single precision scalar multiplier applied to vector y. If beta
* is zero, y is not read.
* y single precision array of length at least (1 + (m - 1) * abs(incy
))
* when trans = 'N' or 'n' and at least (1 + (n - 1) * abs(incy))
* otherwise.
* incy specifies the storage spacing between elements of y. incy must no
t
* be zero.
*
* Output
* ------
* y updated according to alpha * op(A) * x + beta * y
*
* Reference: http://www.netlib.org/blas/cgemv.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if m or n are < 0, or if incx or incy ==
0
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* Single precision complex general matrix-vector multiply: y = alpha*op(A)*x + beta*y. */
void CUBLASAPI cublasCgemv (char trans, int m, int n, cuComplex alpha,
                            const cuComplex *A, int lda,
                            const cuComplex *x, int incx,
                            cuComplex beta, cuComplex *y, int incy);
/*
* void
* cublasCgbmv (char trans, int m, int n, int kl, int ku, cuComplex alpha,
* const cuComplex *A, int lda, const cuComplex *x, int incx,
cuComplex beta,
* cuComplex *y, int incy);
*
* performs one of the matrix-vector operations
*
* y = alpha*op(A)*x + beta*y, op(A)=A or op(A) = transpose(A)
*
* alpha and beta are single precision complex scalars. x and y are single
precision
* complex vectors. A is an m by n band matrix consisting of single precisi
on complex elements
* with kl sub-diagonals and ku super-diagonals.
*
* Input
* -----
* trans specifies op(A). If trans == 'N' or 'n', op(A) = A. If trans == '
T',
* or 't', op(A) = transpose(A). If trans == 'C' or 'c',
* op(A) = conjugate(transpose(A)).
* m specifies the number of rows of the matrix A. m must be at least
* zero.
* n specifies the number of columns of the matrix A. n must be at lea
st
* zero.
* kl specifies the number of sub-diagonals of matrix A. It must be at
* least zero.
* ku specifies the number of super-diagonals of matrix A. It must be a
t
* least zero.
* alpha single precision complex scalar multiplier applied to op(A).
* A single precision complex array of dimensions (lda, n). The leadin
g
* (kl + ku + 1) x n part of the array A must contain the band matri
x A,
* supplied column by column, with the leading diagonal of the matri
x
* in row (ku + 1) of the array, the first super-diagonal starting a
t
* position 2 in row ku, the first sub-diagonal starting at position
1
* in row (ku + 2), and so on. Elements in the array A that do not
* correspond to elements in the band matrix (such as the top left
* ku x ku triangle) are not referenced.
* lda leading dimension of A. lda must be at least (kl + ku + 1).
* x single precision complex array of length at least (1+(n-1)*abs(in
cx)) when
* trans == 'N' or 'n' and at least (1+(m-1)*abs(incx)) otherwise.
* incx specifies the increment for the elements of x. incx must not be z
ero.
* beta single precision complex scalar multiplier applied to vector y. I
f beta is
* zero, y is not read.
* y single precision complex array of length at least (1+(m-1)*abs(in
cy)) when
* trans == 'N' or 'n' and at least (1+(n-1)*abs(incy)) otherwise. I
f
* beta is zero, y is not read.
* incy On entry, incy specifies the increment for the elements of y. inc
y
* must not be zero.
*
* Output
* ------
* y updated according to y = alpha*op(A)*x + beta*y
*
* Reference: http://www.netlib.org/blas/cgbmv.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0, or if incx or incy == 0
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* Single precision complex banded matrix-vector multiply: y = alpha*op(A)*x + beta*y. */
void CUBLASAPI cublasCgbmv (char trans, int m, int n, int kl, int ku,
                            cuComplex alpha, const cuComplex *A, int lda,
                            const cuComplex *x, int incx,
                            cuComplex beta, cuComplex *y, int incy);
/*
* void
* cublasChemv (char uplo, int n, cuComplex alpha, const cuComplex *A, int
lda,
* const cuComplex *x, int incx, cuComplex beta, cuComplex *y,
int incy)
*
* performs the matrix-vector operation
*
* y = alpha*A*x + beta*y
*
* Alpha and beta are single precision complex scalars, and x and y are sin
gle
* precision complex vectors, each with n elements. A is a hermitian n x n
matrix
* consisting of single precision complex elements that is stored in either
upper or
* lower storage mode.
*
* Input
* -----
* uplo specifies whether the upper or lower triangular part of the array
A
* is to be referenced. If uplo == 'U' or 'u', the hermitian matrix
A
* is stored in upper storage mode, i.e. only the upper triangular p
art
* of A is to be referenced while the lower triangular part of A is
to
* be inferred. If uplo == 'L' or 'l', the hermitian matrix A is sto
red
* in lower storage mode, i.e. only the lower triangular part of A i
s
* to be referenced while the upper triangular part of A is to be
* inferred.
* n specifies the number of rows and the number of columns of the
* hermitian matrix A. n must be at least zero.
* alpha single precision complex scalar multiplier applied to A*x.
* A single precision complex array of dimensions (lda, n). If uplo ==
'U' or 'u',
* the leading n x n upper triangular part of the array A must conta
in
* the upper triangular part of the hermitian matrix and the strictl
y
* lower triangular part of A is not referenced. If uplo == 'L' or '
l',
* the leading n x n lower triangular part of the array A must conta
in
* the lower triangular part of the hermitian matrix and the strictl
y
* upper triangular part of A is not referenced. The imaginary parts
* of the diagonal elements need not be set, they are assumed to be
zero.
* lda leading dimension of A. It must be at least max (1, n).
* x single precision complex array of length at least (1 + (n - 1) *
abs(incx)).
* incx storage spacing between elements of x. incx must not be zero.
* beta single precision complex scalar multiplier applied to vector y.
* y single precision complex array of length at least (1 + (n - 1) *
abs(incy)).
* If beta is zero, y is not read.
* incy storage spacing between elements of y. incy must not be zero.
*
* Output
* ------
* y updated according to y = alpha*A*x + beta*y
*
* Reference: http://www.netlib.org/blas/chemv.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0, or if incx or incy == 0
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasChemv (char uplo, int n, cuComplex alpha,
                            const cuComplex *A, int lda, const cuComplex *x,
                            int incx, cuComplex beta, cuComplex *y, int incy);
/*
 * void
 * cublasChbmv (char uplo, int n, int k, cuComplex alpha, const cuComplex *A, int lda,
 *              const cuComplex *x, int incx, cuComplex beta, cuComplex *y, int incy)
 *
 * performs the matrix-vector operation
 *
 *     y := alpha*A*x + beta*y
 *
 * alpha and beta are single precision complex scalars. x and y are single precision
 * complex vectors with n elements. A is an n by n hermitian band matrix consisting
 * of single precision complex elements, with k super-diagonals and the same number
 * of subdiagonals.
 *
 * Input
 * -----
 * uplo   specifies whether the upper or lower triangular part of the hermitian
 *        band matrix A is being supplied. If uplo == 'U' or 'u', the upper
 *        triangular part is being supplied. If uplo == 'L' or 'l', the lower
 *        triangular part is being supplied.
 * n      specifies the number of rows and the number of columns of the
 *        hermitian matrix A. n must be at least zero.
 * k      specifies the number of super-diagonals of matrix A. Since the matrix
 *        is hermitian, this is also the number of sub-diagonals. k must be at
 *        least zero.
 * alpha  single precision complex scalar multiplier applied to A*x.
 * A      single precision complex array of dimensions (lda, n). When uplo == 'U' or
 *        'u', the leading (k + 1) x n part of array A must contain the upper
 *        triangular band of the hermitian matrix, supplied column by column,
 *        with the leading diagonal of the matrix in row (k+1) of the array,
 *        the first super-diagonal starting at position 2 in row k, and so on.
 *        The top left k x k triangle of the array A is not referenced. When
 *        uplo == 'L' or 'l', the leading (k + 1) x n part of the array A must
 *        contain the lower triangular band part of the hermitian matrix,
 *        supplied column by column, with the leading diagonal of the matrix in
 *        row 1 of the array, the first sub-diagonal starting at position 1 in
 *        row 2, and so on. The bottom right k x k triangle of the array A is
 *        not referenced. The imaginary parts of the diagonal elements need
 *        not be set, they are assumed to be zero.
 * lda    leading dimension of A. lda must be at least (k + 1).
 * x      single precision complex array of length at least (1 + (n - 1) * abs(incx)).
 * incx   storage spacing between elements of x. incx must not be zero.
 * beta   single precision complex scalar multiplier applied to vector y. If beta is
 *        zero, y is not read.
 * y      single precision complex array of length at least (1 + (n - 1) * abs(incy)).
 *        If beta is zero, y is not read.
 * incy   storage spacing between elements of y. incy must not be zero.
 *
 * Output
 * ------
 * y      updated according to alpha*A*x + beta*y
 *
 * Reference: http://www.netlib.org/blas/chbmv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if k or n < 0, or if incx or incy == 0
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasChbmv (char uplo, int n, int k, cuComplex alpha,
                            const cuComplex *A, int lda, const cuComplex *x,
                            int incx, cuComplex beta, cuComplex *y, int incy);
/*
 * cublasChpmv: single precision complex hermitian packed matrix-vector
 * operation; analogous to cublasChemv, with matrix A supplied in packed
 * form in AP. Reference: http://www.netlib.org/blas/chpmv.f
 * NOTE(review): no documentation block was present for this prototype in the
 * original header chunk; see the netlib reference for the parameter contract.
 */
void CUBLASAPI cublasChpmv (char uplo, int n, cuComplex alpha,
                            const cuComplex *AP, const cuComplex *x, int incx,
                            cuComplex beta, cuComplex *y, int incy);
/*
 * void
 * cublasCtrmv (char uplo, char trans, char diag, int n, const cuComplex *A,
 *              int lda, cuComplex *x, int incx);
 *
 * performs one of the matrix-vector operations x = op(A) * x,
 * where op(A) = A, or op(A) = transpose(A) or op(A) = conjugate(transpose(A)).
 * x is an n-element single precision complex vector, and
 * A is an n x n, unit or non-unit, upper or lower, triangular matrix composed
 * of single precision complex elements.
 *
 * Input
 * -----
 * uplo   specifies whether the matrix A is an upper or lower triangular
 *        matrix. If uplo = 'U' or 'u', then A is an upper triangular matrix.
 *        If uplo = 'L' or 'l', then A is a lower triangular matrix.
 * trans  specifies op(A). If trans = 'n' or 'N', op(A) = A. If trans = 't' or
 *        'T', op(A) = transpose(A). If trans = 'c' or 'C', op(A) =
 *        conjugate(transpose(A)).
 * diag   specifies whether or not matrix A is unit triangular. If diag = 'U'
 *        or 'u', A is assumed to be unit triangular. If diag = 'N' or 'n', A
 *        is not assumed to be unit triangular.
 * n      specifies the number of rows and columns of the matrix A. n must be
 *        at least zero.
 * A      single precision array of dimension (lda, n). If uplo = 'U' or 'u',
 *        the leading n x n upper triangular part of the array A must contain
 *        the upper triangular matrix and the strictly lower triangular part
 *        of A is not referenced. If uplo = 'L' or 'l', the leading n x n lower
 *        triangular part of the array A must contain the lower triangular
 *        matrix and the strictly upper triangular part of A is not referenced.
 *        When diag = 'U' or 'u', the diagonal elements of A are not referenced
 *        either, but are assumed to be unity.
 * lda    is the leading dimension of A. It must be at least max (1, n).
 * x      single precision array of length at least (1 + (n - 1) * abs(incx)).
 *        On entry, x contains the source vector. On exit, x is overwritten
 *        with the result vector.
 * incx   specifies the storage spacing for elements of x. incx must not be
 *        zero.
 *
 * Output
 * ------
 * x      updated according to x = op(A) * x,
 *
 * Reference: http://www.netlib.org/blas/ctrmv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if incx == 0 or if n < 0
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasCtrmv (char uplo, char trans, char diag, int n,
                            const cuComplex *A, int lda, cuComplex *x,
                            int incx);
/*
 * void
 * cublasCtbmv (char uplo, char trans, char diag, int n, int k, const cuComplex *A,
 *              int lda, cuComplex *x, int incx)
 *
 * performs one of the matrix-vector operations x = op(A) * x, where op(A) = A,
 * op(A) = transpose(A) or op(A) = conjugate(transpose(A)). x is an n-element
 * single precision complex vector, and A is an n x n, unit or non-unit, upper
 * or lower triangular band matrix composed of single precision complex elements.
 *
 * Input
 * -----
 * uplo   specifies whether the matrix A is an upper or lower triangular band
 *        matrix. If uplo == 'U' or 'u', A is an upper triangular band matrix.
 *        If uplo == 'L' or 'l', A is a lower triangular band matrix.
 * trans  specifies op(A). If transa == 'N' or 'n', op(A) = A. If trans == 'T',
 *        or 't', op(A) = transpose(A). If trans == 'C' or 'c',
 *        op(A) = conjugate(transpose(A)).
 * diag   specifies whether or not matrix A is unit triangular. If diag == 'U'
 *        or 'u', A is assumed to be unit triangular. If diag == 'N' or 'n', A
 *        is not assumed to be unit triangular.
 * n      specifies the number of rows and columns of the matrix A. n must be
 *        at least zero.
 * k      specifies the number of super- or sub-diagonals. If uplo == 'U' or
 *        'u', k specifies the number of super-diagonals. If uplo == 'L' or
 *        'l', k specifies the number of sub-diagonals. k must at least be
 *        zero.
 * A      single precision complex array of dimension (lda, n). If uplo == 'U' or 'u',
 *        the leading (k + 1) x n part of the array A must contain the upper
 *        triangular band matrix, supplied column by column, with the leading
 *        diagonal of the matrix in row (k + 1) of the array, the first
 *        super-diagonal starting at position 2 in row k, and so on. The top
 *        left k x k triangle of the array A is not referenced. If uplo == 'L'
 *        or 'l', the leading (k + 1) x n part of the array A must contain the
 *        lower triangular band matrix, supplied column by column, with the
 *        leading diagonal of the matrix in row 1 of the array, the first
 *        sub-diagonal starting at position 1 in row 2, and so on. The bottom
 *        right k x k triangle of the array is not referenced.
 * lda    is the leading dimension of A. It must be at least (k + 1).
 * x      single precision complex array of length at least (1 + (n - 1) * abs(incx)).
 *        On entry, x contains the source vector. On exit, x is overwritten
 *        with the result vector.
 * incx   specifies the storage spacing for elements of x. incx must not be
 *        zero.
 *
 * Output
 * ------
 * x      updated according to x = op(A) * x
 *
 * Reference: http://www.netlib.org/blas/ctbmv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if n or k < 0, or if incx == 0
 * CUBLAS_STATUS_ALLOC_FAILED     if function cannot allocate enough internal scratch vector memory
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasCtbmv (char uplo, char trans, char diag, int n, int k,
                            const cuComplex *A, int lda, cuComplex *x,
                            int incx);
/*
 * void
 * cublasCtpmv (char uplo, char trans, char diag, int n, const cuComplex *AP,
 *              cuComplex *x, int incx);
 *
 * performs one of the matrix-vector operations x = op(A) * x, where op(A) = A,
 * op(A) = transpose(A) or op(A) = conjugate(transpose(A)). x is an n element
 * single precision complex vector, and A is an n x n, unit or non-unit, upper
 * or lower triangular matrix composed of single precision complex elements.
 *
 * Input
 * -----
 * uplo   specifies whether the matrix A is an upper or lower triangular
 *        matrix. If uplo == 'U' or 'u', then A is an upper triangular matrix.
 *        If uplo == 'L' or 'l', then A is a lower triangular matrix.
 * trans  specifies op(A). If transa == 'N' or 'n', op(A) = A. If trans == 'T',
 *        or 't', op(A) = transpose(A). If trans == 'C' or 'c',
 *        op(A) = conjugate(transpose(A)).
 *
 * diag   specifies whether or not matrix A is unit triangular. If diag == 'U'
 *        or 'u', A is assumed to be unit triangular. If diag == 'N' or 'n', A
 *        is not assumed to be unit triangular.
 * n      specifies the number of rows and columns of the matrix A. n must be
 *        at least zero. In the current implementation n must not exceed 4070.
 * AP     single precision complex array with at least ((n * (n + 1)) / 2) elements. If
 *        uplo == 'U' or 'u', the array AP contains the upper triangular part
 *        of the symmetric matrix A, packed sequentially, column by column;
 *        that is, if i <= j, then A[i,j] is stored in AP[i+(j*(j+1)/2)]. If
 *        uplo == 'L' or 'L', the array AP contains the lower triangular part
 *        of the symmetric matrix A, packed sequentially, column by column;
 *        that is, if i >= j, then A[i,j] is stored in AP[i+((2*n-j+1)*j)/2].
 * x      single precision complex array of length at least (1 + (n - 1) * abs(incx)).
 *        On entry, x contains the source vector. On exit, x is overwritten
 *        with the result vector.
 * incx   specifies the storage spacing for elements of x. incx must not be
 *        zero.
 *
 * Output
 * ------
 * x      updated according to x = op(A) * x,
 *
 * Reference: http://www.netlib.org/blas/ctpmv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if incx == 0 or n < 0
 * CUBLAS_STATUS_ALLOC_FAILED     if function cannot allocate enough internal scratch vector memory
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasCtpmv (char uplo, char trans, char diag, int n,
                            const cuComplex *AP, cuComplex *x, int incx);
/*
 * void
 * cublasCtrsv (char uplo, char trans, char diag, int n, const cuComplex *A,
 *              int lda, cuComplex *x, int incx)
 *
 * solves a system of equations op(A) * x = b, where op(A) is either A,
 * transpose(A) or conjugate(transpose(A)). b and x are single precision
 * complex vectors consisting of n elements, and A is an n x n matrix
 * composed of a unit or non-unit, upper or lower triangular matrix.
 * Matrix A is stored in column major format, and lda is the leading
 * dimension of the two-dimensional array containing A.
 *
 * No test for singularity or near-singularity is included in this function.
 * Such tests must be performed before calling this function.
 *
 * Input
 * -----
 * uplo   specifies whether the matrix data is stored in the upper or the
 *        lower triangular part of array A. If uplo = 'U' or 'u', then only
 *        the upper triangular part of A may be referenced. If uplo = 'L' or
 *        'l', then only the lower triangular part of A may be referenced.
 * trans  specifies op(A). If transa = 'n' or 'N', op(A) = A. If transa = 't',
 *        'T', 'c', or 'C', op(A) = transpose(A)
 * diag   specifies whether or not A is a unit triangular matrix like so:
 *        if diag = 'U' or 'u', A is assumed to be unit triangular. If
 *        diag = 'N' or 'n', then A is not assumed to be unit triangular.
 * n      specifies the number of rows and columns of the matrix A. It
 *        must be at least 0.
 * A      is a single precision complex array of dimensions (lda, n). If uplo = 'U'
 *        or 'u', then A must contain the upper triangular part of a symmetric
 *        matrix, and the strictly lower triangular part is not referenced.
 *        If uplo = 'L' or 'l', then A contains the lower triangular part of
 *        a symmetric matrix, and the strictly upper triangular part is not
 *        referenced.
 * lda    is the leading dimension of the two-dimensional array containing A.
 *        lda must be at least max(1, n).
 * x      single precision complex array of length at least (1 + (n - 1) * abs(incx)).
 *        On entry, x contains the n element right-hand side vector b. On exit,
 *        it is overwritten with the solution vector x.
 * incx   specifies the storage spacing between elements of x. incx must not
 *        be zero.
 *
 * Output
 * ------
 * x      updated to contain the solution vector x that solves op(A) * x = b.
 *
 * Reference: http://www.netlib.org/blas/ctrsv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if incx == 0 or if n < 0
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasCtrsv (char uplo, char trans, char diag, int n,
                            const cuComplex *A, int lda, cuComplex *x,
                            int incx);
/*
 * void cublasCtbsv (char uplo, char trans, char diag, int n, int k,
 *                   const cuComplex *A, int lda, cuComplex *X, int incx)
 *
 * solves one of the systems of equations op(A)*x = b, where op(A) is either
 * op(A) = A, op(A) = transpose(A) or op(A) = conjugate(transpose(A)).
 * b and x are n element vectors, and A is an n x n unit or non-unit,
 * upper or lower triangular band matrix with k + 1 diagonals. No test
 * for singularity or near-singularity is included in this function.
 * Such tests must be performed before calling this function.
 *
 * Input
 * -----
 * uplo   specifies whether the matrix is an upper or lower triangular band
 *        matrix as follows: If uplo == 'U' or 'u', A is an upper triangular
 *        band matrix. If uplo == 'L' or 'l', A is a lower triangular band
 *        matrix.
 * trans  specifies op(A). If trans == 'N' or 'n', op(A) = A. If trans == 'T',
 *        't', op(A) = transpose(A). If trans == 'C' or 'c',
 *        op(A) = conjugate(transpose(A)).
 * diag   specifies whether A is unit triangular. If diag == 'U' or 'u', A is
 *        assumed to be unit triangular; that is, diagonal elements are not
 *        read and are assumed to be unity. If diag == 'N' or 'n', A is not
 *        assumed to be unit triangular.
 * n      specifies the number of rows and columns of the matrix A. n must be
 *        at least zero.
 * k      specifies the number of super- or sub-diagonals. If uplo == 'U' or
 *        'u', k specifies the number of super-diagonals. If uplo == 'L' or
 *        'l', k specifies the number of sub-diagonals. k must at least be
 *        zero.
 * A      single precision complex array of dimension (lda, n). If uplo == 'U' or 'u',
 *        the leading (k + 1) x n part of the array A must contain the upper
 *        triangular band matrix, supplied column by column, with the leading
 *        diagonal of the matrix in row (k + 1) of the array, the first super-
 *        diagonal starting at position 2 in row k, and so on. The top left
 *        k x k triangle of the array A is not referenced. If uplo == 'L' or
 *        'l', the leading (k + 1) x n part of the array A must contain the
 *        lower triangular band matrix, supplied column by column, with the
 *        leading diagonal of the matrix in row 1 of the array, the first
 *        sub-diagonal starting at position 1 in row 2, and so on. The bottom
 *        right k x k triangle of the array is not referenced.
 * x      single precision complex array of length at least (1+(n-1)*abs(incx)).
 * incx   storage spacing between elements of x. It must not be zero.
 *
 * Output
 * ------
 * x      updated to contain the solution vector x that solves op(A) * x = b.
 *
 * Reference: http://www.netlib.org/blas/ctbsv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if incx == 0, n < 0 or n > 2035
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasCtbsv (char uplo, char trans, char diag, int n, int k,
                            const cuComplex *A, int lda, cuComplex *x,
                            int incx);
/*
 * void
 * cublasCtpsv (char uplo, char trans, char diag, int n, const cuComplex *AP,
 *              cuComplex *X, int incx)
 *
 * solves one of the systems of equations op(A)*x = b, where op(A) is either
 * op(A) = A, op(A) = transpose(A) or op(A) = conjugate(transpose(A)). b and
 * x are n element complex vectors, and A is an n x n unit or non-unit,
 * upper or lower triangular matrix. No test for singularity or near-singularity
 * is included in this routine. Such tests must be performed before calling this routine.
 *
 * Input
 * -----
 * uplo   specifies whether the matrix is an upper or lower triangular matrix
 *        as follows: If uplo == 'U' or 'u', A is an upper triangular matrix.
 *        If uplo == 'L' or 'l', A is a lower triangular matrix.
 * trans  specifies op(A). If trans == 'N' or 'n', op(A) = A. If trans == 'T'
 *        or 't', op(A) = transpose(A). If trans == 'C' or 'c', op(A) =
 *        conjugate(transpose(A)).
 * diag   specifies whether A is unit triangular. If diag == 'U' or 'u', A is
 *        assumed to be unit triangular; that is, diagonal elements are not
 *        read and are assumed to be unity. If diag == 'N' or 'n', A is not
 *        assumed to be unit triangular.
 * n      specifies the number of rows and columns of the matrix A. n must be
 *        at least zero.
 * AP     single precision complex array with at least ((n*(n+1))/2) elements.
 *        If uplo == 'U' or 'u', the array AP contains the upper triangular
 *        matrix A, packed sequentially, column by column; that is, if i <= j, then
 *        A[i,j] is stored in AP[i+(j*(j+1)/2)]. If uplo == 'L' or 'L', the
 *        array AP contains the lower triangular matrix A, packed sequentially,
 *        column by column; that is, if i >= j, then A[i,j] is stored in
 *        AP[i+((2*n-j+1)*j)/2]. When diag = 'U' or 'u', the diagonal elements
 *        of A are not referenced and are assumed to be unity.
 * x      single precision complex array of length at least (1+(n-1)*abs(incx)).
 * incx   storage spacing between elements of x. It must not be zero.
 *
 * Output
 * ------
 * x      updated to contain the solution vector x that solves op(A) * x = b.
 *
 * Reference: http://www.netlib.org/blas/ctpsv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if incx == 0 or if n < 0 or n > 2035
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasCtpsv (char uplo, char trans, char diag, int n,
                            const cuComplex *AP, cuComplex *x, int incx);
/*
 * cublasCgeru (int m, int n, cuComplex alpha, const cuComplex *x, int incx,
 *              const cuComplex *y, int incy, cuComplex *A, int lda)
 *
 * performs the symmetric rank 1 operation
 *
 *     A = alpha * x * transpose(y) + A,
 *
 * where alpha is a single precision complex scalar, x is an m element single
 * precision complex vector, y is an n element single precision complex vector, and A
 * is an m by n matrix consisting of single precision complex elements. Matrix A
 * is stored in column major format, and lda is the leading dimension of
 * the two-dimensional array used to store A.
 *
 * Input
 * -----
 * m      specifies the number of rows of the matrix A. It must be at least
 *        zero.
 * n      specifies the number of columns of the matrix A. It must be at
 *        least zero.
 * alpha  single precision complex scalar multiplier applied to x * transpose(y)
 * x      single precision complex array of length at least (1 + (m - 1) * abs(incx))
 * incx   specifies the storage spacing between elements of x. incx must not
 *        be zero.
 * y      single precision complex array of length at least (1 + (n - 1) * abs(incy))
 * incy   specifies the storage spacing between elements of y. incy must not
 *        be zero.
 * A      single precision complex array of dimensions (lda, n).
 * lda    leading dimension of two-dimensional array used to store matrix A
 *
 * Output
 * ------
 * A      updated according to A = alpha * x * transpose(y) + A
 *
 * Reference: http://www.netlib.org/blas/cgeru.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if m < 0, n < 0, incx == 0, incy == 0
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasCgeru (int m, int n, cuComplex alpha, const cuComplex *x,
                            int incx, const cuComplex *y, int incy,
                            cuComplex *A, int lda);
/*
 * cublasCgerc (int m, int n, cuComplex alpha, const cuComplex *x, int incx,
 *              const cuComplex *y, int incy, cuComplex *A, int lda)
 *
 * performs the symmetric rank 1 operation
 *
 *     A = alpha * x * conjugate(transpose(y)) + A,
 *
 * where alpha is a single precision complex scalar, x is an m element single
 * precision complex vector, y is an n element single precision complex vector, and A
 * is an m by n matrix consisting of single precision complex elements. Matrix A
 * is stored in column major format, and lda is the leading dimension of
 * the two-dimensional array used to store A.
 *
 * Input
 * -----
 * m      specifies the number of rows of the matrix A. It must be at least
 *        zero.
 * n      specifies the number of columns of the matrix A. It must be at
 *        least zero.
 * alpha  single precision complex scalar multiplier applied to
 *        x * conjugate(transpose(y))
 * x      single precision complex array of length at least (1 + (m - 1) * abs(incx))
 * incx   specifies the storage spacing between elements of x. incx must not
 *        be zero.
 * y      single precision complex array of length at least (1 + (n - 1) * abs(incy))
 * incy   specifies the storage spacing between elements of y. incy must not
 *        be zero.
 * A      single precision complex array of dimensions (lda, n).
 * lda    leading dimension of two-dimensional array used to store matrix A
 *
 * Output
 * ------
 * A      updated according to A = alpha * x * conjugate(transpose(y)) + A
 *
 * Reference: http://www.netlib.org/blas/cgerc.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if m < 0, n < 0, incx == 0, incy == 0
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasCgerc (int m, int n, cuComplex alpha, const cuComplex *x,
                            int incx, const cuComplex *y, int incy,
                            cuComplex *A, int lda);
/*
 * void
 * cublasCher (char uplo, int n, float alpha, const cuComplex *x, int incx,
 *             cuComplex *A, int lda)
 *
 * performs the hermitian rank 1 operation
 *
 *     A = alpha * x * conjugate(transpose(x)) + A,
 *
 * where alpha is a single precision real scalar, x is an n element single
 * precision complex vector and A is an n x n hermitian matrix consisting of
 * single precision complex elements. Matrix A is stored in column major format,
 * and lda is the leading dimension of the two-dimensional array
 * containing A.
 *
 * Input
 * -----
 * uplo   specifies whether the matrix data is stored in the upper or
 *        the lower triangular part of array A. If uplo = 'U' or 'u',
 *        then only the upper triangular part of A may be referenced.
 *        If uplo = 'L' or 'l', then only the lower triangular part of
 *        A may be referenced.
 * n      specifies the number of rows and columns of the matrix A. It
 *        must be at least 0.
 * alpha  single precision real scalar multiplier applied to
 *        x * conjugate(transpose(x))
 * x      single precision complex array of length at least (1 + (n - 1) * abs(incx))
 * incx   specifies the storage spacing between elements of x. incx must
 *        not be zero.
 * A      single precision complex array of dimensions (lda, n). If uplo = 'U' or
 *        'u', then A must contain the upper triangular part of a hermitian
 *        matrix, and the strictly lower triangular part is not referenced.
 *        If uplo = 'L' or 'l', then A contains the lower triangular part
 *        of a hermitian matrix, and the strictly upper triangular part is
 *        not referenced. The imaginary parts of the diagonal elements need
 *        not be set, they are assumed to be zero, and on exit they
 *        are set to zero.
 * lda    leading dimension of the two-dimensional array containing A. lda
 *        must be at least max(1, n).
 *
 * Output
 * ------
 * A      updated according to A = alpha * x * conjugate(transpose(x)) + A
 *
 * Reference: http://www.netlib.org/blas/cher.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if n < 0, or incx == 0
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasCher (char uplo, int n, float alpha,
                           const cuComplex *x, int incx, cuComplex *A,
                           int lda);
/*
 * void
 * cublasChpr (char uplo, int n, float alpha, const cuComplex *x, int incx,
 *             cuComplex *AP)
 *
 * performs the hermitian rank 1 operation
 *
 *     A = alpha * x * conjugate(transpose(x)) + A,
 *
 * where alpha is a single precision real scalar and x is an n element single
 * precision complex vector. A is a hermitian n x n matrix consisting of single
 * precision complex elements that is supplied in packed form.
 *
 * Input
 * -----
 * uplo   specifies whether the matrix data is stored in the upper or the lower
 *        triangular part of array AP. If uplo == 'U' or 'u', then the upper
 *        triangular part of A is supplied in AP. If uplo == 'L' or 'l', then
 *        the lower triangular part of A is supplied in AP.
 * n      specifies the number of rows and columns of the matrix A. It must be
 *        at least zero.
 * alpha  single precision real scalar multiplier applied to x * conjugate(transpose(x)).
 * x      single precision array of length at least (1 + (n - 1) * abs(incx)).
 * incx   storage spacing between elements of x. incx must not be zero.
 * AP     single precision complex array with at least ((n * (n + 1)) / 2) elements. If
 *        uplo == 'U' or 'u', the array AP contains the upper triangular part
 *        of the hermitian matrix A, packed sequentially, column by column;
 *        that is, if i <= j, then A[i,j] is stored in AP[i+(j*(j+1)/2)]. If
 *        uplo == 'L' or 'L', the array AP contains the lower triangular part
 *        of the hermitian matrix A, packed sequentially, column by column;
 *        that is, if i >= j, then A[i,j] is stored in AP[i+((2*n-j+1)*j)/2].
 *        The imaginary parts of the diagonal elements need not be set, they
 *        are assumed to be zero, and on exit they are set to zero.
 *
 * Output
 * ------
 * A      updated according to A = alpha * x * conjugate(transpose(x)) + A
 *
 * Reference: http://www.netlib.org/blas/chpr.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if n < 0, or incx == 0
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasChpr (char uplo, int n, float alpha,
                           const cuComplex *x, int incx, cuComplex *AP);
/*
* void
* cublasChpr2 (char uplo, int n, cuComplex alpha, const cuComplex *x, int
incx,
* const cuComplex *y, int incy, cuComplex *AP)
*
* performs the hermitian rank 2 operation
*
* A = alpha*x*conjugate(transpose(y)) + conjugate(alpha)*y*conjugate(tr
anspose(x)) + A,
*
* where alpha is a single precision complex scalar, and x and y are n elem
ent single
* precision complex vectors. A is a hermitian n x n matrix consisting of s
ingle
* precision complex elements that is supplied in packed form.
*
* Input
* -----
* uplo specifies whether the matrix data is stored in the upper or the l
ower
* triangular part of array A. If uplo == 'U' or 'u', then only the
* upper triangular part of A may be referenced and the lower triang
ular
* part of A is inferred. If uplo == 'L' or 'l', then only the lower
* triangular part of A may be referenced and the upper triangular p
art
* of A is inferred.
* n specifies the number of rows and columns of the matrix A. It must
be
* at least zero.
* alpha single precision complex scalar multiplier applied to x * conjuga
te(transpose(y)) +
* y * conjugate(transpose(x)).
* x single precision complex array of length at least (1 + (n - 1) *
abs (incx)).
* incx storage spacing between elements of x. incx must not be zero.
* y single precision complex array of length at least (1 + (n - 1) *
abs (incy)).
* incy storage spacing between elements of y. incy must not be zero.
* AP single precision complex array with at least ((n * (n + 1)) / 2)
elements. If
* uplo == 'U' or 'u', the array AP contains the upper triangular pa
rt
* of the hermitian matrix A, packed sequentially, column by column;
* that is, if i <= j, then A[i,j] is stored is AP[i+(j*(j+1)/2)]. I
f
* uplo == 'L' or 'L', the array AP contains the lower triangular pa
rt
* of the hermitian matrix A, packed sequentially, column by column;
* that is, if i >= j, then A[i,j] is stored in AP[i+((2*n-j+1)*j)/2
].
* The imaginary parts of the diagonal elements need not be set, the
y
* are assumed to be zero, and on exit they are set to zero.
*
* Output
* ------
* A updated according to A = alpha*x*conjugate(transpose(y))
* + conjugate(alpha)*y*conjugate(transpose(x
))+A
*
* Reference: http://www.netlib.org/blas/chpr2.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0, incx == 0, incy == 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* CHPR2: hermitian rank-2 update of A held in packed storage AP,
 * A = alpha*x*conjugate(transpose(y)) + conjugate(alpha)*y*conjugate(transpose(x)) + A.
 * Full contract is documented in the comment block above. */
void CUBLASAPI cublasChpr2 (char uplo, int n, cuComplex alpha,
                            const cuComplex *x, int incx, const cuComplex *y,
                            int incy, cuComplex *AP);
/*
 * void cublasCher2 (char uplo, int n, cuComplex alpha, const cuComplex *x, int incx,
 *                   const cuComplex *y, int incy, cuComplex *A, int lda)
 *
 * performs the hermitian rank 2 operation
 *
 *    A = alpha*x*conjugate(transpose(y)) + conjugate(alpha)*y*conjugate(transpose(x)) + A,
 *
 * where alpha is a single precision complex scalar, x and y are n element single
 * precision complex vectors and A is an n by n hermitian matrix consisting of single
 * precision complex elements.
 *
 * Input
 * -----
 * uplo   specifies whether the matrix data is stored in the upper or the lower
 *        triangular part of array A. If uplo == 'U' or 'u', then only the
 *        upper triangular part of A may be referenced and the lower triangular
 *        part of A is inferred. If uplo == 'L' or 'l', then only the lower
 *        triangular part of A may be referenced and the upper triangular part
 *        of A is inferred.
 * n      specifies the number of rows and columns of the matrix A. It must be
 *        at least zero.
 * alpha  single precision complex scalar multiplier applied to
 *        x * conjugate(transpose(y)) + y * conjugate(transpose(x)).
 * x      single precision complex array of length at least (1 + (n - 1) * abs (incx)).
 * incx   storage spacing between elements of x. incx must not be zero.
 * y      single precision complex array of length at least (1 + (n - 1) * abs (incy)).
 * incy   storage spacing between elements of y. incy must not be zero.
 * A      single precision complex array of dimensions (lda, n). If uplo == 'U' or 'u',
 *        then A must contain the upper triangular part of a hermitian matrix,
 *        and the strictly lower triangular part is not referenced. If uplo ==
 *        'L' or 'l', then A contains the lower triangular part of a hermitian
 *        matrix, and the strictly upper triangular part is not referenced.
 *        The imaginary parts of the diagonal elements need not be set,
 *        they are assumed to be zero, and on exit they are set to zero.
 *
 * lda    leading dimension of A. It must be at least max(1, n).
 *
 * Output
 * ------
 * A      updated according to A = alpha*x*conjugate(transpose(y))
 *        + conjugate(alpha)*y*conjugate(transpose(x)) + A
 *
 * Reference: http://www.netlib.org/blas/cher2.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if n < 0, incx == 0, incy == 0
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
/* CHER2: hermitian rank-2 update (full storage),
 * A = alpha*x*conjugate(transpose(y)) + conjugate(alpha)*y*conjugate(transpose(x)) + A. */
void CUBLASAPI cublasCher2 (char uplo, int n, cuComplex alpha,
                            const cuComplex *x, int incx, const cuComplex *y,
                            int incy, cuComplex *A, int lda);
/* CHPR2: same hermitian rank-2 update with A held in packed storage AP. */
void CUBLASAPI cublasChpr2 (char uplo, int n, cuComplex alpha,
                            const cuComplex *x, int incx, const cuComplex *y,
                            int incy, cuComplex *AP);
/* ---------------- CUBLAS single precision BLAS3 functions ---------------- */
/*
 * [lines omitted by the diff viewer — this is the tail of the cublasSsyrk
 *  documentation]
 *        otherwise the leading k x n part of the array must contain the
 *        matrix A.
 * lda    leading dimension of A. When trans == 'N' or 'n' then lda must be at
 *        least max(1, n). Otherwise lda must be at least max(1, k).
 * beta   single precision scalar multiplier applied to C. If beta is zero, C
 *        does not have to be a valid input
 * C      single precision array of dimensions (ldc, n). If uplo == 'U' or 'u',
 *        the leading n x n triangular part of the array C must contain the
 *        upper triangular part of the symmetric matrix C and the strictly
 *        lower triangular part of C is not referenced. On exit, the upper
 *        triangular part of C is overwritten by the upper triangular part of
 *        the updated matrix. If uplo == 'L' or 'l', the leading n x n
 *        triangular part of the array C must contain the lower triangular part
 *        of the symmetric matrix C and the strictly upper triangular part of C
 *        is not referenced. On exit, the lower triangular part of C is
 *        overwritten by the lower triangular part of the updated matrix.
 * ldc    leading dimension of C. It must be at least max(1, n).
 *
 * Output
 * ------
 * C      updated according to C = alpha * A * transpose(A) + beta * C, or C =
 *        alpha * transpose(A) * A + beta * C
 *
 * Reference: http://www.netlib.org/blas/ssyrk.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 * [lines omitted by the diff viewer — this is the tail of the cublasSsyr2k
 *  documentation]
 *        otherwise the leading k x n part of the array must contain the matrix
 *        B.
 * ldb    leading dimension of B. When trans == 'N' or 'n' then ldb must be at
 *        least max(1, n). Otherwise ldb must be at least max(1, k).
 * beta   single precision scalar multiplier applied to C. If beta is zero, C
 *        does not have to be a valid input.
 * C      single precision array of dimensions (ldc, n). If uplo == 'U' or 'u',
 *        the leading n x n triangular part of the array C must contain the
 *        upper triangular part of the symmetric matrix C and the strictly
 *        lower triangular part of C is not referenced. On exit, the upper
 *        triangular part of C is overwritten by the upper triangular part of
 *        the updated matrix. If uplo == 'L' or 'l', the leading n x n
 *        triangular part of the array C must contain the lower triangular part
 *        of the symmetric matrix C and the strictly upper triangular part of C
 *        is not referenced. On exit, the lower triangular part of C is
 *        overwritten by the lower triangular part of the updated matrix.
 * ldc    leading dimension of C. Must be at least max(1, n).
 *
 * Output
 * ------
 * C      updated according to alpha*A*transpose(B) + alpha*B*transpose(A) +
 *        beta*C or alpha*transpose(A)*B + alpha*transpose(B)*A + beta*C
 *
 * Reference: http://www.netlib.org/blas/ssyr2k.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 * [lines omitted by the diff viewer — this is the tail of the cublasCgemm
 *  documentation]
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if any of m, n, or k are < 0
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
/* CGEMM: general matrix-matrix multiply, C = alpha * op(A) * op(B) + beta * C,
 * single precision complex; op(X) selected by transa/transb. */
void CUBLASAPI cublasCgemm (char transa, char transb, int m, int n, int k,
                            cuComplex alpha, const cuComplex *A, int lda,
                            const cuComplex *B, int ldb, cuComplex beta,
                            cuComplex *C, int ldc);
/*
* void
* cublasCsymm (char side, char uplo, int m, int n, cuComplex alpha,
* const cuComplex *A, int lda, const cuComplex *B, int ldb,
* cuComplex beta, cuComplex *C, int ldc);
*
* performs one of the matrix-matrix operations
*
* C = alpha * A * B + beta * C, or
* C = alpha * B * A + beta * C,
*
* where alpha and beta are single precision complex scalars, A is a symmet
ric matrix
* consisting of single precision complex elements and stored in either low
er or upper
* storage mode, and B and C are m x n matrices consisting of single precis
ion
* complex elements.
*
* Input
* -----
* side specifies whether the symmetric matrix A appears on the left side
* hand side or right hand side of matrix B, as follows. If side ==
'L'
* or 'l', then C = alpha * A * B + beta * C. If side = 'R' or 'r',
* then C = alpha * B * A + beta * C.
* uplo specifies whether the symmetric matrix A is stored in upper or lo
wer
* storage mode, as follows. If uplo == 'U' or 'u', only the upper
* triangular part of the symmetric matrix is to be referenced, and
the
* elements of the strictly lower triangular part are to be infered
from
* those in the upper triangular part. If uplo == 'L' or 'l', only t
he
* lower triangular part of the symmetric matrix is to be referenced
,
* and the elements of the strictly upper triangular part are to be
* infered from those in the lower triangular part.
* m specifies the number of rows of the matrix C, and the number of r
ows
* of matrix B. It also specifies the dimensions of symmetric matrix
A
* when side == 'L' or 'l'. m must be at least zero.
* n specifies the number of columns of the matrix C, and the number o
f
* columns of matrix B. It also specifies the dimensions of symmetri
c
* matrix A when side == 'R' or 'r'. n must be at least zero.
* alpha single precision scalar multiplier applied to A * B, or B * A
* A single precision array of dimensions (lda, ka), where ka is m whe
n
* side == 'L' or 'l' and is n otherwise. If side == 'L' or 'l' the
* leading m x m part of array A must contain the symmetric matrix,
* such that when uplo == 'U' or 'u', the leading m x m part stores
the
* upper triangular part of the symmetric matrix, and the strictly l
ower
* triangular part of A is not referenced, and when uplo == 'U' or '
u',
* the leading m x m part stores the lower triangular part of the
* symmetric matrix and the strictly upper triangular part is not
* referenced. If side == 'R' or 'r' the leading n x n part of array
A
* must contain the symmetric matrix, such that when uplo == 'U' or
'u',
* the leading n x n part stores the upper triangular part of the
* symmetric matrix and the strictly lower triangular part of A is n
ot
* referenced, and when uplo == 'U' or 'u', the leading n x n part
* stores the lower triangular part of the symmetric matrix and the
* strictly upper triangular part is not referenced.
* lda leading dimension of A. When side == 'L' or 'l', it must be at le
ast
* max(1, m) and at least max(1, n) otherwise.
* B single precision array of dimensions (ldb, n). On entry, the lead
ing
* m x n part of the array contains the matrix B.
* ldb leading dimension of B. It must be at least max (1, m).
* beta single precision scalar multiplier applied to C. If beta is zero,
C
* does not have to be a valid input
* C single precision array of dimensions (ldc, n)
* ldc leading dimension of C. Must be at least max(1, m)
*
* Output
* ------
* C updated according to C = alpha * A * B + beta * C, or C = alpha *
* B * A + beta * C
*
* Reference: http://www.netlib.org/blas/csymm.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if m or n are < 0
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* CSYMM: C = alpha*A*B + beta*C (side == 'L'/'l') or alpha*B*A + beta*C
 * (side == 'R'/'r'), with A a symmetric single precision complex matrix.
 * Full contract is documented in the comment block above. */
void CUBLASAPI cublasCsymm (char side, char uplo, int m, int n,
                            cuComplex alpha, const cuComplex *A, int lda,
                            const cuComplex *B, int ldb, cuComplex beta,
                            cuComplex *C, int ldc);
/*
* void
* cublasChemm (char side, char uplo, int m, int n, cuComplex alpha,
* const cuComplex *A, int lda, const cuComplex *B, int ldb,
* cuComplex beta, cuComplex *C, int ldc);
*
* performs one of the matrix-matrix operations
*
* C = alpha * A * B + beta * C, or
* C = alpha * B * A + beta * C,
*
* where alpha and beta are single precision complex scalars, A is a hermit
ian matrix
* consisting of single precision complex elements and stored in either low
er or upper
* storage mode, and B and C are m x n matrices consisting of single precis
ion
* complex elements.
*
* Input
* -----
* side specifies whether the hermitian matrix A appears on the left side
* hand side or right hand side of matrix B, as follows. If side ==
'L'
* or 'l', then C = alpha * A * B + beta * C. If side = 'R' or 'r',
* then C = alpha * B * A + beta * C.
* uplo specifies whether the hermitian matrix A is stored in upper or lo
wer
* storage mode, as follows. If uplo == 'U' or 'u', only the upper
* triangular part of the hermitian matrix is to be referenced, and
the
* elements of the strictly lower triangular part are to be infered
from
* those in the upper triangular part. If uplo == 'L' or 'l', only t
he
* lower triangular part of the hermitian matrix is to be referenced
,
* and the elements of the strictly upper triangular part are to be
* infered from those in the lower triangular part.
* m specifies the number of rows of the matrix C, and the number of r
ows
* of matrix B. It also specifies the dimensions of hermitian matrix
A
* when side == 'L' or 'l'. m must be at least zero.
* n specifies the number of columns of the matrix C, and the number o
f
* columns of matrix B. It also specifies the dimensions of hermitia
n
* matrix A when side == 'R' or 'r'. n must be at least zero.
* alpha single precision complex scalar multiplier applied to A * B, or B
* A
* A single precision complex array of dimensions (lda, ka), where ka
is m when
* side == 'L' or 'l' and is n otherwise. If side == 'L' or 'l' the
* leading m x m part of array A must contain the hermitian matrix,
* such that when uplo == 'U' or 'u', the leading m x m part stores
the
* upper triangular part of the hermitian matrix, and the strictly l
ower
* triangular part of A is not referenced, and when uplo == 'U' or '
u',
* the leading m x m part stores the lower triangular part of the
* hermitian matrix and the strictly upper triangular part is not
* referenced. If side == 'R' or 'r' the leading n x n part of array
A
* must contain the hermitian matrix, such that when uplo == 'U' or
'u',
* the leading n x n part stores the upper triangular part of the
* hermitian matrix and the strictly lower triangular part of A is n
ot
* referenced, and when uplo == 'U' or 'u', the leading n x n part
* stores the lower triangular part of the hermitian matrix and the
* strictly upper triangular part is not referenced. The imaginary p
arts
* of the diagonal elements need not be set, they are assumed to be
zero.
* lda leading dimension of A. When side == 'L' or 'l', it must be at le
ast
* max(1, m) and at least max(1, n) otherwise.
* B single precision complex array of dimensions (ldb, n). On entry,
the leading
* m x n part of the array contains the matrix B.
* ldb leading dimension of B. It must be at least max (1, m).
* beta single precision complex scalar multiplier applied to C. If beta
is zero, C
* does not have to be a valid input
* C single precision complex array of dimensions (ldc, n)
* ldc leading dimension of C. Must be at least max(1, m)
*
* Output
* ------
* C updated according to C = alpha * A * B + beta * C, or C = alpha *
* B * A + beta * C
*
* Reference: http://www.netlib.org/blas/chemm.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if m or n are < 0
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* CHEMM: C = alpha*A*B + beta*C (side == 'L'/'l') or alpha*B*A + beta*C
 * (side == 'R'/'r'), with A a hermitian single precision complex matrix.
 * Full contract is documented in the comment block above. */
void CUBLASAPI cublasChemm (char side, char uplo, int m, int n,
                            cuComplex alpha, const cuComplex *A, int lda,
                            const cuComplex *B, int ldb, cuComplex beta,
                            cuComplex *C, int ldc);
/*
* void
* cublasCsyrk (char uplo, char trans, int n, int k, cuComplex alpha,
* const cuComplex *A, int lda, cuComplex beta, cuComplex *C,
int ldc)
*
* performs one of the symmetric rank k operations
*
* C = alpha * A * transpose(A) + beta * C, or
* C = alpha * transpose(A) * A + beta * C.
*
* Alpha and beta are single precision complex scalars. C is an n x n symme
tric matrix
* consisting of single precision complex elements and stored in either low
er or
* upper storage mode. A is a matrix consisting of single precision complex
elements
* with dimension of n x k in the first case, and k x n in the second case.
*
* Input
* -----
* uplo specifies whether the symmetric matrix C is stored in upper or lo
wer
* storage mode as follows. If uplo == 'U' or 'u', only the upper
* triangular part of the symmetric matrix is to be referenced, and
the
* elements of the strictly lower triangular part are to be infered
from
* those in the upper triangular part. If uplo == 'L' or 'l', only t
he
* lower triangular part of the symmetric matrix is to be referenced
,
* and the elements of the strictly upper triangular part are to be
* infered from those in the lower triangular part.
* trans specifies the operation to be performed. If trans == 'N' or 'n',
C =
* alpha * transpose(A) + beta * C. If trans == 'T', 't', 'C', or 'c
',
* C = transpose(A) * A + beta * C.
* n specifies the number of rows and the number columns of matrix C.
If
* trans == 'N' or 'n', n specifies the number of rows of matrix A.
If
* trans == 'T', 't', 'C', or 'c', n specifies the columns of matrix
A.
* n must be at least zero.
* k If trans == 'N' or 'n', k specifies the number of rows of matrix
A.
* If trans == 'T', 't', 'C', or 'c', k specifies the number of rows
of
* matrix A. k must be at least zero.
* alpha single precision complex scalar multiplier applied to A * transpo
se(A) or
* transpose(A) * A.
* A single precision complex array of dimensions (lda, ka), where ka
is k when
* trans == 'N' or 'n', and is n otherwise. When trans == 'N' or 'n'
,
* the leading n x k part of array A must contain the matrix A,
* otherwise the leading k x n part of the array must contains the
* matrix A.
* lda leading dimension of A. When trans == 'N' or 'n' then lda must be
at
* least max(1, n). Otherwise lda must be at least max(1, k).
* beta single precision complex scalar multiplier applied to C. If beta
izs zero, C
* does not have to be a valid input
* C single precision complex array of dimensions (ldc, n). If uplo =
'U' or 'u',
* the leading n x n triangular part of the array C must contain the
* upper triangular part of the symmetric matrix C and the strictly
* lower triangular part of C is not referenced. On exit, the upper
* triangular part of C is overwritten by the upper triangular part
of
* the updated matrix. If uplo = 'L' or 'l', the leading n x n
* triangular part of the array C must contain the lower triangular
part
* of the symmetric matrix C and the strictly upper triangular part
of C
* is not referenced. On exit, the lower triangular part of C is
* overwritten by the lower triangular part of the updated matrix.
* ldc leading dimension of C. It must be at least max(1, n).
*
* Output
* ------
* C updated according to C = alpha * A * transpose(A) + beta * C, or
C =
* alpha * transpose(A) * A + beta * C
*
* Reference: http://www.netlib.org/blas/csyrk.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0 or k < 0
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* CSYRK: symmetric rank-k update, C = alpha*A*transpose(A) + beta*C or
 * C = alpha*transpose(A)*A + beta*C, selected by trans; C is symmetric. */
void CUBLASAPI cublasCsyrk (char uplo, char trans, int n, int k,
                            cuComplex alpha, const cuComplex *A, int lda,
                            cuComplex beta, cuComplex *C, int ldc);
/*
* void
* cublasCherk (char uplo, char trans, int n, int k, float alpha,
* const cuComplex *A, int lda, float beta, cuComplex *C, int
ldc)
*
* performs one of the hermitian rank k operations
*
* C = alpha * A * conjugate(transpose(A)) + beta * C, or
* C = alpha * conjugate(transpose(A)) * A + beta * C.
*
* Alpha and beta are single precision real scalars. C is an n x n hermitia
n matrix
* consisting of single precision complex elements and stored in either low
er or
* upper storage mode. A is a matrix consisting of single precision complex
elements
* with dimension of n x k in the first case, and k x n in the second case.
*
* Input
* -----
* uplo specifies whether the hermitian matrix C is stored in upper or lo
wer
* storage mode as follows. If uplo == 'U' or 'u', only the upper
* triangular part of the hermitian matrix is to be referenced, and
the
* elements of the strictly lower triangular part are to be infered
from
* those in the upper triangular part. If uplo == 'L' or 'l', only t
he
* lower triangular part of the hermitian matrix is to be referenced
,
* and the elements of the strictly upper triangular part are to be
* infered from those in the lower triangular part.
* trans specifies the operation to be performed. If trans == 'N' or 'n',
C =
* alpha * A * conjugate(transpose(A)) + beta * C. If trans == 'T',
't', 'C', or 'c',
* C = alpha * conjugate(transpose(A)) * A + beta * C.
* n specifies the number of rows and the number columns of matrix C.
If
* trans == 'N' or 'n', n specifies the number of rows of matrix A.
If
* trans == 'T', 't', 'C', or 'c', n specifies the columns of matrix
A.
* n must be at least zero.
* k If trans == 'N' or 'n', k specifies the number of columns of matr
ix A.
* If trans == 'T', 't', 'C', or 'c', k specifies the number of rows
of
* matrix A. k must be at least zero.
* alpha single precision scalar multiplier applied to A * conjugate(trans
pose(A)) or
* conjugate(transpose(A)) * A.
* A single precision complex array of dimensions (lda, ka), where ka
is k when
* trans == 'N' or 'n', and is n otherwise. When trans == 'N' or 'n'
,
* the leading n x k part of array A must contain the matrix A,
* otherwise the leading k x n part of the array must contains the
* matrix A.
* lda leading dimension of A. When trans == 'N' or 'n' then lda must be
at
* least max(1, n). Otherwise lda must be at least max(1, k).
* beta single precision scalar multiplier applied to C. If beta is zero,
C
* does not have to be a valid input.
* C single precision complex array of dimensions (ldc, n). If uplo =
'U' or 'u',
* the leading n x n triangular part of the array C must contain the
* upper triangular part of the hermitian matrix C and the strictly
* lower triangular part of C is not referenced. On exit, the upper
* triangular part of C is overwritten by the upper triangular part
of
* the updated matrix. If uplo = 'L' or 'l', the leading n x n
* triangular part of the array C must contain the lower triangular
part
* of the hermitian matrix C and the strictly upper triangular part
of C
* is not referenced. On exit, the lower triangular part of C is
* overwritten by the lower triangular part of the updated matrix.
* The imaginary parts of the diagonal elements need
* not be set, they are assumed to be zero, and on exit they
* are set to zero.
* ldc leading dimension of C. It must be at least max(1, n).
*
* Output
* ------
* C updated according to C = alpha * A * conjugate(transpose(A)) + be
ta * C, or C =
* alpha * conjugate(transpose(A)) * A + beta * C
*
* Reference: http://www.netlib.org/blas/cherk.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0 or k < 0
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* CHERK: hermitian rank-k update, C = alpha*A*conjugate(transpose(A)) + beta*C
 * or C = alpha*conjugate(transpose(A))*A + beta*C, selected by trans.
 * alpha and beta are REAL (float) scalars, per the BLAS cherk interface and the
 * documentation block above. */
void CUBLASAPI cublasCherk (char uplo, char trans, int n, int k,
                            float alpha, const cuComplex *A, int lda,
                            float beta, cuComplex *C, int ldc);
/*
* void
* cublasCsyr2k (char uplo, char trans, int n, int k, cuComplex alpha,
* const cuComplex *A, int lda, const cuComplex *B, int ldb,
* cuComplex beta, cuComplex *C, int ldc)
*
* performs one of the symmetric rank 2k operations
*
* C = alpha * A * transpose(B) + alpha * B * transpose(A) + beta * C, o
r
* C = alpha * transpose(A) * B + alpha * transpose(B) * A + beta * C.
*
* Alpha and beta are single precision complex scalars. C is an n x n symme
tric matrix
* consisting of single precision complex elements and stored in either low
er or upper
* storage mode. A and B are matrices consisting of single precision comple
x elements
* with dimension of n x k in the first case, and k x n in the second case.
*
* Input
* -----
* uplo specifies whether the symmetric matrix C is stored in upper or lo
wer
* storage mode, as follows. If uplo == 'U' or 'u', only the upper
* triangular part of the symmetric matrix is to be referenced, and
the
* elements of the strictly lower triangular part are to be infered
from
* those in the upper triangular part. If uplo == 'L' or 'l', only t
he
* lower triangular part of the symmetric matrix is to be references
,
* and the elements of the strictly upper triangular part are to be
* infered from those in the lower triangular part.
* trans specifies the operation to be performed. If trans == 'N' or 'n',
* C = alpha * A * transpose(B) + alpha * B * transpose(A) + beta *
C,
* If trans == 'T', 't', 'C', or 'c', C = alpha * transpose(A) * B +
* alpha * transpose(B) * A + beta * C.
* n specifies the number of rows and the number columns of matrix C.
If
* trans == 'N' or 'n', n specifies the number of rows of matrix A.
If
* trans == 'T', 't', 'C', or 'c', n specifies the columns of matrix
A.
* n must be at least zero.
* k If trans == 'N' or 'n', k specifies the number of rows of matrix
A.
* If trans == 'T', 't', 'C', or 'c', k specifies the number of rows
of
* matrix A. k must be at least zero.
* alpha single precision complex scalar multiplier.
* A single precision complex array of dimensions (lda, ka), where ka
is k when
* trans == 'N' or 'n', and is n otherwise. When trans == 'N' or 'n'
,
* the leading n x k part of array A must contain the matrix A,
* otherwise the leading k x n part of the array must contain the ma
trix
* A.
* lda leading dimension of A. When trans == 'N' or 'n' then lda must be
at
* least max(1, n). Otherwise lda must be at least max(1,k).
* B single precision complex array of dimensions (lda, kb), where kb
is k when
* trans == 'N' or 'n', and is n otherwise. When trans == 'N' or 'n'
,
* the leading n x k part of array B must contain the matrix B,
* otherwise the leading k x n part of the array must contain the ma
trix
* B.
* ldb leading dimension of N. When trans == 'N' or 'n' then ldb must be
at
* least max(1, n). Otherwise ldb must be at least max(1, k).
* beta single precision complex scalar multiplier applied to C. If beta
is zero, C
* does not have to be a valid input.
* C single precision complex array of dimensions (ldc, n). If uplo ==
'U' or 'u',
* the leading n x n triangular part of the array C must contain the
* upper triangular part of the symmetric matrix C and the strictly
* lower triangular part of C is not referenced. On exit, the upper
* triangular part of C is overwritten by the upper triangular part
of
* the updated matrix. If uplo == 'L' or 'l', the leading n x n
* triangular part of the array C must contain the lower triangular
part
* of the symmetric matrix C and the strictly upper triangular part
of C
* is not referenced. On exit, the lower triangular part of C is
* overwritten by the lower triangular part of the updated matrix.
* ldc leading dimension of C. Must be at least max(1, n).
*
* Output
* ------
* C updated according to alpha*A*transpose(B) + alpha*B*transpose(A)
+
* beta*C or alpha*transpose(A)*B + alpha*transpose(B)*A + beta*C
*
* Reference: http://www.netlib.org/blas/csyr2k.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0 or k < 0
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* CSYR2K: symmetric rank-2k update, C = alpha*A*transpose(B) +
 * alpha*B*transpose(A) + beta*C, or the transposed form selected by trans;
 * C is symmetric. */
void CUBLASAPI cublasCsyr2k (char uplo, char trans, int n, int k,
                             cuComplex alpha, const cuComplex *A, int lda,
                             const cuComplex *B, int ldb, cuComplex beta,
                             cuComplex *C, int ldc);
/*
* void
* cublasCher2k (char uplo, char trans, int n, int k, cuComplex alpha,
* const cuComplex *A, int lda, const cuComplex *B, int ldb,
* float beta, cuComplex *C, int ldc)
*
* performs one of the hermitian rank 2k operations
*
* C = alpha * A * conjugate(transpose(B))
* + conjugate(alpha) * B * conjugate(transpose(A))
* + beta * C ,
* or
* C = alpha * conjugate(transpose(A)) * B
* + conjugate(alpha) * conjugate(transpose(B)) * A
* + beta * C.
*
* Alpha is single precision complex scalar whereas Beta is a single
* precision real scalar.
* C is an n x n hermitian matrix consisting of single precision complex el
ements
* and stored in either lower or upper storage mode. A and B are matrices c
onsisting
* of single precision complex elements with dimension of n x k in the firs
t case,
* and k x n in the second case.
*
* Input
* -----
* uplo   specifies whether the hermitian matrix C is stored in upper or
*        lower storage mode, as follows. If uplo == 'U' or 'u', only the
*        upper triangular part of the hermitian matrix is to be referenced,
*        and the elements of the strictly lower triangular part are to be
*        inferred from those in the upper triangular part. If uplo == 'L'
*        or 'l', only the lower triangular part of the hermitian matrix is
*        to be referenced, and the elements of the strictly upper triangular
*        part are to be inferred from those in the lower triangular part.
* trans specifies the operation to be performed. If trans == 'N' or 'n',
* C = alpha * A * conjugate(transpose(B))
* + conjugate(alpha) * B * conjugate(transpose(A))
* + beta * C .
* If trans == 'T', 't', 'C', or 'c',
* C = alpha * conjugate(transpose(A)) * B
* + conjugate(alpha) * conjugate(transpose(B)) * A
* + beta * C.
* n      specifies the number of rows and the number of columns of matrix C.
*        If trans == 'N' or 'n', n specifies the number of rows of matrix A.
*        If trans == 'T', 't', 'C', or 'c', n specifies the number of columns
*        of matrix A. n must be at least zero.
* k      If trans == 'N' or 'n', k specifies the number of columns of matrix
*        A. If trans == 'T', 't', 'C', or 'c', k specifies the number of rows
*        of matrix A. k must be at least zero.
* alpha single precision complex scalar multiplier.
* A single precision complex array of dimensions (lda, ka), where ka
is k when
* trans == 'N' or 'n', and is n otherwise. When trans == 'N' or 'n'
,
* the leading n x k part of array A must contain the matrix A,
* otherwise the leading k x n part of the array must contain the ma
trix
* A.
* lda leading dimension of A. When trans == 'N' or 'n' then lda must be
at
* least max(1, n). Otherwise lda must be at least max(1,k).
* B      single precision complex array of dimensions (ldb, kb), where kb
is k when
* trans == 'N' or 'n', and is n otherwise. When trans == 'N' or 'n'
,
* the leading n x k part of array B must contain the matrix B,
* otherwise the leading k x n part of the array must contain the ma
trix
* B.
* ldb    leading dimension of B. When trans == 'N' or 'n' then ldb must be
at
* least max(1, n). Otherwise ldb must be at least max(1, k).
* beta single precision scalar multiplier applied to C. If beta is zero,
C
* does not have to be a valid input.
* C single precision complex array of dimensions (ldc, n). If uplo ==
'U' or 'u',
* the leading n x n triangular part of the array C must contain the
* upper triangular part of the hermitian matrix C and the strictly
* lower triangular part of C is not referenced. On exit, the upper
* triangular part of C is overwritten by the upper triangular part
of
* the updated matrix. If uplo == 'L' or 'l', the leading n x n
* triangular part of the array C must contain the lower triangular
part
* of the hermitian matrix C and the strictly upper triangular part
of C
* is not referenced. On exit, the lower triangular part of C is
* overwritten by the lower triangular part of the updated matrix.
* The imaginary parts of the diagonal elements need
* not be set, they are assumed to be zero, and on exit they
* are set to zero.
* ldc leading dimension of C. Must be at least max(1, n).
*
* Output
* ------
* C updated according to alpha*A*conjugate(transpose(B)) +
* + conjugate(alpha)*B*conjugate(transpose(A)) + beta*C or
* alpha*conjugate(transpose(A))*B + conjugate(alpha)*conjugate(tran
spose(B))*A
* + beta*C.
*
* Reference: http://www.netlib.org/blas/cher2k.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0 or k < 0
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasCher2k (char uplo, char trans, int n, int k, void CUBLASAPI cublasCher2k (char uplo, char trans, int n, int k,
cuComplex alpha, const cuComplex *A, int lda, cuComplex alpha, const cuComplex *A, int lda,
const cuComplex *B, int ldb, cuComplex beta, const cuComplex *B, int ldb, float beta,
cuComplex *C, int ldc); cuComplex *C, int ldc);
/*
* void
* cublasCtrmm (char side, char uplo, char transa, char diag, int m, int n,
* cuComplex alpha, const cuComplex *A, int lda, const cuCompl
ex *B,
* int ldb)
*
* performs one of the matrix-matrix operations
*
* B = alpha * op(A) * B, or B = alpha * B * op(A)
*
* where alpha is a single-precision complex scalar, B is an m x n matrix c
omposed
* of single precision complex elements, and A is a unit or non-unit, upper
or lower,
* triangular matrix composed of single precision complex elements. op(A) i
s one of
*
* op(A) = A , op(A) = transpose(A) or op(A) = conjugate(transpose(A))
*
* Matrices A and B are stored in column major format, and lda and ldb are
* the leading dimensions of the two-dimensional arrays that contain A and
* B, respectively.
*
* Input
* -----
* side specifies whether op(A) multiplies B from the left or right.
* If side = 'L' or 'l', then B = alpha * op(A) * B. If side =
* 'R' or 'r', then B = alpha * B * op(A).
* uplo specifies whether the matrix A is an upper or lower triangular
* matrix. If uplo = 'U' or 'u', A is an upper triangular matrix.
* If uplo = 'L' or 'l', A is a lower triangular matrix.
* transa specifies the form of op(A) to be used in the matrix
* multiplication. If transa = 'N' or 'n', then op(A) = A. If
* transa = 'T' or 't', then op(A) = transpose(A).
* If transa = 'C' or 'c', then op(A) = conjugate(transpose(A)).
* diag specifies whether or not A is unit triangular. If diag = 'U'
* or 'u', A is assumed to be unit triangular. If diag = 'N' or
* 'n', A is not assumed to be unit triangular.
* m the number of rows of matrix B. m must be at least zero.
* n the number of columns of matrix B. n must be at least zero.
* alpha single precision complex scalar multiplier applied to op(A)*B, or
* B*op(A), respectively. If alpha is zero no accesses are made
* to matrix A, and no read accesses are made to matrix B.
* A single precision complex array of dimensions (lda, k). k = m if s
ide =
* 'L' or 'l', k = n if side = 'R' or 'r'. If uplo = 'U' or 'u'
* the leading k x k upper triangular part of the array A must
* contain the upper triangular matrix, and the strictly lower
* triangular part of A is not referenced. If uplo = 'L' or 'l'
* the leading k x k lower triangular part of the array A must
* contain the lower triangular matrix, and the strictly upper
* triangular part of A is not referenced. When diag = 'U' or 'u'
*          the diagonal elements of A are not referenced and are assumed
* to be unity.
* lda leading dimension of A. When side = 'L' or 'l', it must be at
* least max(1,m) and at least max(1,n) otherwise
* B single precision complex array of dimensions (ldb, n). On entry,
the
* leading m x n part of the array contains the matrix B. It is
* overwritten with the transformed matrix on exit.
* ldb leading dimension of B. It must be at least max (1, m).
*
* Output
* ------
* B updated according to B = alpha * op(A) * B or B = alpha * B * op
(A)
*
* Reference: http://www.netlib.org/blas/ctrmm.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if m or n < 0
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasCtrmm (char side, char uplo, char transa, char diag, void CUBLASAPI cublasCtrmm (char side, char uplo, char transa, char diag,
int m, int n, cuComplex alpha, const cuComplex *A, int m, int n, cuComplex alpha, const cuComplex *A,
int lda, cuComplex *B, int ldb); int lda, cuComplex *B, int ldb);
/*
* void
* cublasCtrsm (char side, char uplo, char transa, char diag, int m, int n,
* cuComplex alpha, const cuComplex *A, int lda,
* cuComplex *B, int ldb)
*
* solves one of the matrix equations
*
* op(A) * X = alpha * B, or X * op(A) = alpha * B,
*
* where alpha is a single precision complex scalar, and X and B are m x n
matrices
* that are composed of single precision complex elements. A is a unit or n
on-unit,
* upper or lower triangular matrix, and op(A) is one of
*
* op(A) = A or op(A) = transpose(A) or op( A ) = conj( A' ).
*
* The result matrix X overwrites input matrix B; that is, on exit the resu
lt
* is stored in B. Matrices A and B are stored in column major format, and
* lda and ldb are the leading dimensions of the two-dimensional arrays that
* contain A and B, respectively.
*
* Input
* -----
* side specifies whether op(A) appears on the left or right of X as
* follows: side = 'L' or 'l' indicates solve op(A) * X = alpha * B.
* side = 'R' or 'r' indicates solve X * op(A) = alpha * B.
* uplo specifies whether the matrix A is an upper or lower triangular
* matrix as follows: uplo = 'U' or 'u' indicates A is an upper
* triangular matrix. uplo = 'L' or 'l' indicates A is a lower
* triangular matrix.
* transa specifies the form of op(A) to be used in matrix multiplication
*          as follows: If transa = 'N' or 'n', then op(A) = A. If transa =
* 'T', 't', 'C', or 'c', then op(A) = transpose(A).
* diag specifies whether or not A is a unit triangular matrix like so:
* if diag = 'U' or 'u', A is assumed to be unit triangular. If
* diag = 'N' or 'n', then A is not assumed to be unit triangular.
* m specifies the number of rows of B. m must be at least zero.
* n specifies the number of columns of B. n must be at least zero.
* alpha is a single precision complex scalar to be multiplied with B. Whe
n alpha is
* zero, then A is not referenced and B need not be set before entry
.
* A is a single precision complex array of dimensions (lda, k), where
k is
* m when side = 'L' or 'l', and is n when side = 'R' or 'r'. If
* uplo = 'U' or 'u', the leading k x k upper triangular part of
* the array A must contain the upper triangular matrix and the
* strictly lower triangular matrix of A is not referenced. When
* uplo = 'L' or 'l', the leading k x k lower triangular part of
* the array A must contain the lower triangular matrix and the
* strictly upper triangular part of A is not referenced. Note that
* when diag = 'U' or 'u', the diagonal elements of A are not
* referenced, and are assumed to be unity.
* lda is the leading dimension of the two dimensional array containing
A.
* When side = 'L' or 'l' then lda must be at least max(1, m), when
* side = 'R' or 'r' then lda must be at least max(1, n).
* B is a single precision complex array of dimensions (ldb, n). ldb m
ust be
* at least max (1,m). The leading m x n part of the array B must
* contain the right-hand side matrix B. On exit B is overwritten
* by the solution matrix X.
* ldb is the leading dimension of the two dimensional array containing
B.
* ldb must be at least max(1, m).
*
* Output
* ------
* B contains the solution matrix X satisfying op(A) * X = alpha * B,
* or X * op(A) = alpha * B
*
* Reference: http://www.netlib.org/blas/ctrsm.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if m or n < 0
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasCtrsm (char side, char uplo, char transa, char diag, void CUBLASAPI cublasCtrsm (char side, char uplo, char transa, char diag,
int m, int n, cuComplex alpha, const cuComplex *A, int m, int n, cuComplex alpha, const cuComplex *A,
int lda, cuComplex *B, int ldb); int lda, cuComplex *B, int ldb);
void CUBLASAPI cublasXerbla (const char *srName, int info); void CUBLASAPI cublasXerbla (const char *srName, int info);
/* ---------------- CUBLAS double-precision BLAS1 functions --------------- - */ /* ---------------- CUBLAS double-precision BLAS1 functions --------------- - */
/* /*
* double * double
* cublasDasum (int n, const double *x, int incx) * cublasDasum (int n, const double *x, int incx)
* *
* computes the sum of the absolute values of the elements of double * computes the sum of the absolute values of the elements of double
* precision vector x; that is, the result is the sum from i = 0 to n - 1 o f * precision vector x; that is, the result is the sum from i = 0 to n - 1 o f
skipping to change at line 3068 skipping to change at line 5474
* ------------ * ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support * CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/ */
void CUBLASAPI cublasDrot (int n, double *x, int incx, double *y, int incy, void CUBLASAPI cublasDrot (int n, double *x, int incx, double *y, int incy,
double sc, double ss); double sc, double ss);
/* /*
* void * void
* cublasDrotg (double *sa, double *sb, double *sc, double *ss) * cublasDrotg (double *host_sa, double *host_sb, double *host_sc, double * host_ss)
* *
* constructs the Givens tranformation * constructs the Givens tranformation
* *
* ( sc ss ) * ( sc ss )
* G = ( ) , sc^2 + ss^2 = 1, * G = ( ) , sc^2 + ss^2 = 1,
* (-ss sc ) * (-ss sc )
* *
* which zeros the second entry of the 2-vector transpose(sa, sb). * which zeros the second entry of the 2-vector transpose(sa, sb).
* *
* The quantity r = (+/-) sqrt (sa^2 + sb^2) overwrites sa in storage. The * The quantity r = (+/-) sqrt (sa^2 + sb^2) overwrites sa in storage. The
* value of sb is overwritten by a value z which allows sc and ss to be * value of sb is overwritten by a value z which allows sc and ss to be
* recovered by the following algorithm: * recovered by the following algorithm:
* *
* if z=1 set sc = 0.0 and ss = 1.0 * if z=1 set sc = 0.0 and ss = 1.0
* if abs(z) < 1 set sc = sqrt(1-z^2) and ss = z * if abs(z) < 1 set sc = sqrt(1-z^2) and ss = z
* if abs(z) > 1 set sc = 1/z and ss = sqrt(1-sc^2) * if abs(z) > 1 set sc = 1/z and ss = sqrt(1-sc^2)
* *
* The function drot (n, x, incx, y, incy, sc, ss) normally is called next * The function drot (n, x, incx, y, incy, sc, ss) normally is called next
* to apply the transformation to a 2 x n matrix. * to apply the transformation to a 2 x n matrix.
* Note that this function is provided for completeness and runs exclusively
* on the Host.
* *
* Input * Input
* ----- * -----
* sa double-precision scalar * sa double-precision scalar
* sb double-precision scalar * sb double-precision scalar
* *
* Output * Output
* ------ * ------
* sa double-precision r * sa double-precision r
* sb double-precision z * sb double-precision z
* sc double-precision result * sc double-precision result
* ss double-precision result * ss double-precision result
* *
* Reference: http://www.netlib.org/blas/drotg.f * Reference: http://www.netlib.org/blas/drotg.f
* *
* This function does not set any error status. * This function does not set any error status.
*/ */
void CUBLASAPI cublasDrotg (double *sa, double *sb, double *sc, double *ss) ; void CUBLASAPI cublasDrotg (double *host_sa, double *host_sb, double *host_ sc, double *host_ss);
/* /*
* void * void
* cublasDrotm (int n, double *x, int incx, double *y, int incy, * cublasDrotm (int n, double *x, int incx, double *y, int incy,
* const double* sparam) * const double* sparam)
* *
* applies the modified Givens transformation, h, to the 2 x n matrix * applies the modified Givens transformation, h, to the 2 x n matrix
* *
* ( transpose(x) ) * ( transpose(x) )
* ( transpose(y) ) * ( transpose(y) )
skipping to change at line 3127 skipping to change at line 5535
* The elements of x are in x[lx + i * incx], i = 0 to n-1, where lx = 1 if * The elements of x are in x[lx + i * incx], i = 0 to n-1, where lx = 1 if
* incx >= 0, else lx = 1 + (1 - n) * incx, and similarly for y using ly an d * incx >= 0, else lx = 1 + (1 - n) * incx, and similarly for y using ly an d
* incy. With sparam[0] = sflag, h has one of the following forms: * incy. With sparam[0] = sflag, h has one of the following forms:
* *
* sflag = -1.0 sflag = 0.0 sflag = 1.0 sflag = -2.0 * sflag = -1.0 sflag = 0.0 sflag = 1.0 sflag = -2.0
* *
* (sh00 sh01) (1.0 sh01) (sh00 1.0) (1.0 0.0) * (sh00 sh01) (1.0 sh01) (sh00 1.0) (1.0 0.0)
* h = ( ) ( ) ( ) ( ) * h = ( ) ( ) ( ) ( )
* (sh10 sh11) (sh10 1.0) (-1.0 sh11) (0.0 1.0) * (sh10 sh11) (sh10 1.0) (-1.0 sh11) (0.0 1.0)
* *
* Note that this function is provided for completeness and runs exclusively
* on the Host.
*
* Input * Input
* ----- * -----
* n number of elements in input vectors * n number of elements in input vectors
* x double-precision vector with n elements * x double-precision vector with n elements
* incx storage spacing between elements of x * incx storage spacing between elements of x
* y double-precision vector with n elements * y double-precision vector with n elements
* incy storage spacing between elements of y * incy storage spacing between elements of y
* sparam 5-element vector. sparam[0] is sflag described above. sparam[1] * sparam 5-element vector. sparam[0] is sflag described above. sparam[1]
* through sparam[4] contain the 2x2 rotation matrix h: sparam[1] * through sparam[4] contain the 2x2 rotation matrix h: sparam[1]
* contains sh00, sparam[2] contains sh10, sparam[3] contains sh01, * contains sh00, sparam[2] contains sh10, sparam[3] contains sh01,
skipping to change at line 3159 skipping to change at line 5570
* ------------ * ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support * CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/ */
void CUBLASAPI cublasDrotm(int n, double *x, int incx, double *y, int incy, void CUBLASAPI cublasDrotm(int n, double *x, int incx, double *y, int incy,
const double* sparam); const double* sparam);
/* /*
* void * void
* cublasDrotmg (double *psd1, double *psd2, double *psx1, const double *ps * cublasDrotmg (double *host_sd1, double *host_sd2, double *host_sx1, cons
y1, t double *host_sy1,
* double *sparam) * double *host_sparam)
* *
* constructs the modified Givens transformation matrix h which zeros * constructs the modified Givens transformation matrix h which zeros
* the second component of the 2-vector transpose(sqrt(sd1)*sx1,sqrt(sd2)*s y1). * the second component of the 2-vector transpose(sqrt(sd1)*sx1,sqrt(sd2)*s y1).
* With sparam[0] = sflag, h has one of the following forms: * With sparam[0] = sflag, h has one of the following forms:
* *
* sflag = -1.0 sflag = 0.0 sflag = 1.0 sflag = -2.0 * sflag = -1.0 sflag = 0.0 sflag = 1.0 sflag = -2.0
* *
* (sh00 sh01) (1.0 sh01) (sh00 1.0) (1.0 0.0) * (sh00 sh01) (1.0 sh01) (sh00 1.0) (1.0 0.0)
* h = ( ) ( ) ( ) ( ) * h = ( ) ( ) ( ) ( )
* (sh10 sh11) (sh10 1.0) (-1.0 sh11) (0.0 1.0) * (sh10 sh11) (sh10 1.0) (-1.0 sh11) (0.0 1.0)
* *
* sparam[1] through sparam[4] contain sh00, sh10, sh01, sh11, * sparam[1] through sparam[4] contain sh00, sh10, sh01, sh11,
* respectively. Values of 1.0, -1.0, or 0.0 implied by the value * respectively. Values of 1.0, -1.0, or 0.0 implied by the value
* of sflag are not stored in sparam. * of sflag are not stored in sparam.
* Note that this function is provided for completeness and runs exclusively
* on the Host.
* *
* Input * Input
* ----- * -----
* sd1 single precision scalar * sd1 single precision scalar
* sd2 single precision scalar * sd2 single precision scalar
* sx1 single precision scalar * sx1 single precision scalar
* sy1 single precision scalar * sy1 single precision scalar
* *
* Output * Output
* ------ * ------
skipping to change at line 3197 skipping to change at line 5610
* sx1 changed to represent the effect of the transformation * sx1 changed to represent the effect of the transformation
* sparam 5-element vector. sparam[0] is sflag described above. sparam[1] * sparam 5-element vector. sparam[0] is sflag described above. sparam[1]
* through sparam[4] contain the 2x2 rotation matrix h: sparam[1] * through sparam[4] contain the 2x2 rotation matrix h: sparam[1]
* contains sh00, sparam[2] contains sh10, sparam[3] contains sh01, * contains sh00, sparam[2] contains sh10, sparam[3] contains sh01,
* and sparam[4] contains sh11.
* *
* Reference: http://www.netlib.org/blas/drotmg.f * Reference: http://www.netlib.org/blas/drotmg.f
* *
* This functions does not set any error status. * This functions does not set any error status.
*/ */
void CUBLASAPI cublasDrotmg (double *sd1, double *sd2, double *sx1, void CUBLASAPI cublasDrotmg (double *host_sd1, double *host_sd2, double *ho
const double *sy1, double* sparam); st_sx1,
const double *host_sy1, double* host_sparam);
/* /*
* void * void
* cublasDscal (int n, double alpha, double *x, int incx) * cublasDscal (int n, double alpha, double *x, int incx)
* *
* replaces double-precision vector x with double-precision alpha * x. For * replaces double-precision vector x with double-precision alpha * x. For
* i = 0 to n-1, it replaces x[lx + i * incx] with alpha * x[lx + i * incx] , * i = 0 to n-1, it replaces x[lx + i * incx] with alpha * x[lx + i * incx] ,
* where lx = 1 if incx >= 0, else lx = 1 + (1 - n) * incx. * where lx = 1 if incx >= 0, else lx = 1 + (1 - n) * incx.
* *
* Input * Input
skipping to change at line 3493 skipping to change at line 5906
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d
* CUBLAS_STATUS_INVALID_VALUE if n < 0, or incx == 0 * CUBLAS_STATUS_INVALID_VALUE if n < 0, or incx == 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support * CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/ */
void CUBLASAPI cublasDsyr (char uplo, int n, double alpha, void CUBLASAPI cublasDsyr (char uplo, int n, double alpha,
const double *x, int incx, double *A, const double *x, int incx, double *A,
int lda); int lda);
/* /*
* void cublasDsyr2 (char uplo, int n, double alpha, const double *x, int i
ncx,
* const double *y, int incy, double *A, int lda)
*
* performs the symmetric rank 2 operation
*
* A = alpha*x*transpose(y) + alpha*y*transpose(x) + A,
*
* where alpha is a double precision scalar, x and y are n element double
* precision vector and A is an n by n symmetric matrix consisting of doubl
e
* precision elements.
*
* Input
* -----
* uplo specifies whether the matrix data is stored in the upper or the l
ower
* triangular part of array A. If uplo == 'U' or 'u', then only the
* upper triangular part of A may be referenced and the lower triang
ular
* part of A is inferred. If uplo == 'L' or 'l', then only the lower
* triangular part of A may be referenced and the upper triangular p
art
* of A is inferred.
* n specifies the number of rows and columns of the matrix A. It must
be
* at least zero.
* alpha double precision scalar multiplier applied to x * transpose(y) +
* y * transpose(x).
* x double precision array of length at least (1 + (n - 1) * abs (inc
x)).
* incx storage spacing between elements of x. incx must not be zero.
* y double precision array of length at least (1 + (n - 1) * abs (inc
y)).
* incy storage spacing between elements of y. incy must not be zero.
* A double precision array of dimensions (lda, n). If uplo == 'U' or
'u',
* then A must contains the upper triangular part of a symmetric mat
rix,
* and the strictly lower triangular parts is not referenced. If upl
o ==
* 'L' or 'l', then A contains the lower triangular part of a symmet
ric
* matrix, and the strictly upper triangular part is not referenced.
* lda leading dimension of A. It must be at least max(1, n).
*
* Output
* ------
* A updated according to A = alpha*x*transpose(y)+alpha*y*transpose(x
)+A
*
* Reference: http://www.netlib.org/blas/dsyr2.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0, incx == 0, incy == 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasDsyr2 (char uplo, int n, double alpha,
const double *x, int incx, const double *y,
int incy, double *A, int lda);
/*
* void
* cublasDspr (char uplo, int n, double alpha, const double *x, int incx,
* double *AP)
*
* performs the symmetric rank 1 operation
*
* A = alpha * x * transpose(x) + A,
*
* where alpha is a double precision scalar and x is an n element double
* precision vector. A is a symmetric n x n matrix consisting of double
* precision elements that is supplied in packed form.
*
* Input
* -----
* uplo specifies whether the matrix data is stored in the upper or the l
ower
* triangular part of array AP. If uplo == 'U' or 'u', then the uppe
r
* triangular part of A is supplied in AP. If uplo == 'L' or 'l', th
en
* the lower triangular part of A is supplied in AP.
* n specifies the number of rows and columns of the matrix A. It must
be
* at least zero.
* alpha double precision scalar multiplier applied to x * transpose(x).
* x double precision array of length at least (1 + (n - 1) * abs(incx
)).
* incx storage spacing between elements of x. incx must not be zero.
* AP     double precision array with at least ((n * (n + 1)) / 2) elements.
*        If uplo == 'U' or 'u', the array AP contains the upper triangular
*        part of the symmetric matrix A, packed sequentially, column by
*        column; that is, if i <= j, then A[i,j] is stored in
*        AP[i+(j*(j+1)/2)]. If uplo == 'L' or 'l', the array AP contains
*        the lower triangular part of the symmetric matrix A, packed
*        sequentially, column by column; that is, if i >= j, then A[i,j] is
*        stored in AP[i+((2*n-j+1)*j)/2].
*
* Output
* ------
* A updated according to A = alpha * x * transpose(x) + A
*
* Reference: http://www.netlib.org/blas/dspr.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0, or incx == 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasDspr (char uplo, int n, double alpha,
const double *x, int incx, double *AP);
/*
* void
* cublasDspr2 (char uplo, int n, double alpha, const double *x, int incx,
* const double *y, int incy, double *AP)
*
* performs the symmetric rank 2 operation
*
* A = alpha*x*transpose(y) + alpha*y*transpose(x) + A,
*
* where alpha is a double precision scalar, and x and y are n element doub
le
* precision vectors. A is a symmetric n x n matrix consisting of double
* precision elements that is supplied in packed form.
*
* Input
* -----
* uplo specifies whether the matrix data is stored in the upper or the l
ower
* triangular part of array A. If uplo == 'U' or 'u', then only the
* upper triangular part of A may be referenced and the lower triang
ular
* part of A is inferred. If uplo == 'L' or 'l', then only the lower
* triangular part of A may be referenced and the upper triangular p
art
* of A is inferred.
* n specifies the number of rows and columns of the matrix A. It must
be
* at least zero.
* alpha double precision scalar multiplier applied to x * transpose(y) +
* y * transpose(x).
* x double precision array of length at least (1 + (n - 1) * abs (inc
x)).
* incx storage spacing between elements of x. incx must not be zero.
* y double precision array of length at least (1 + (n - 1) * abs (inc
y)).
* incy storage spacing between elements of y. incy must not be zero.
* AP     double precision array with at least ((n * (n + 1)) / 2) elements.
*        If uplo == 'U' or 'u', the array AP contains the upper triangular
*        part of the symmetric matrix A, packed sequentially, column by
*        column; that is, if i <= j, then A[i,j] is stored in
*        AP[i+(j*(j+1)/2)]. If uplo == 'L' or 'l', the array AP contains
*        the lower triangular part of the symmetric matrix A, packed
*        sequentially, column by column; that is, if i >= j, then A[i,j] is
*        stored in AP[i+((2*n-j+1)*j)/2].
*
* Output
* ------
* A updated according to A = alpha*x*transpose(y)+alpha*y*transpose(x
)+A
*
* Reference: http://www.netlib.org/blas/dspr2.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0, incx == 0, incy == 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* Packed symmetric rank-2 update, A = alpha*(x*y' + y*x') + A, with A
   supplied and updated in packed storage AP; see the description above. */
void CUBLASAPI cublasDspr2 (char uplo, int n, double alpha,
                            const double *x, int incx, const double *y,
                            int incy, double *AP);
/*
 * void
 * cublasDtrsv (char uplo, char trans, char diag, int n, const double *A,
 *              int lda, double *x, int incx)
 *
 * solves a system of equations op(A) * x = b, where op(A) is either A or
 * transpose(A). b and x are double precision vectors consisting of n
 * elements, and A is an n x n matrix composed of a unit or non-unit, upper
 * or lower triangular matrix. Matrix A is stored in column major format,
 * and lda is the leading dimension of the two-dimensional array containing
 * A.
 *
 * No test for singularity or near-singularity is included in this function.
 * Such tests must be performed before calling this function.
 *
 * Input
 * -----
 * uplo   specifies whether the matrix data is stored in the upper or the
 *        lower triangular part of array A. If uplo = 'U' or 'u', then only
 *        the upper triangular part of A may be referenced. If uplo = 'L' or
 *        'l', then only the lower triangular part of A may be referenced.
 * trans  specifies op(A). If transa = 'n' or 'N', op(A) = A. If transa = 't',
 *        'T', 'c', or 'C', op(A) = transpose(A)
 * diag   specifies whether or not A is a unit triangular matrix like so:
 *        if diag = 'U' or 'u', A is assumed to be unit triangular. If
 *        diag = 'N' or 'n', then A is not assumed to be unit triangular.
 * n      specifies the number of rows and columns of the matrix A. It
 *        must be at least 0.
 * A      is a double precision array of dimensions (lda, n). If uplo = 'U'
 *        or 'u', then A must contain the upper triangular part of a symmetric
 *        matrix, and the strictly lower triangular part is not referenced.
 *        If uplo = 'L' or 'l', then A contains the lower triangular part of
 *        a symmetric matrix, and the strictly upper triangular part is not
 *        referenced.
 * lda    is the leading dimension of the two-dimensional array containing A.
 *        lda must be at least max(1, n).
 * x      double precision array of length at least (1 + (n - 1) * abs(incx)).
 *        On entry, x contains the n element right-hand side vector b. On exit,
skipping to change at line 3546 skipping to change at line 6117
 * ------
 * x      updated to contain the solution vector x that solves op(A) * x = b.
 *
 * Reference: http://www.netlib.org/blas/dtrsv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if incx == 0 or if n < 0
 * CUBLAS_STATUS_ARCH_MISMATCH    if invoked on device without DP support
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
/* Triangular solve op(A) * x = b, in place in x; see the description above. */
void CUBLASAPI cublasDtrsv (char uplo, char trans, char diag, int n,
                            const double *A, int lda, double *x,
                            int incx);
/*
 * void
 * cublasDtrmv (char uplo, char trans, char diag, int n, const double *A,
 *              int lda, double *x, int incx);
 *
 * performs one of the matrix-vector operations x = op(A) * x, where op(A) =
 * A, or op(A) = transpose(A). x is an n-element double precision vector, and
 * A is an n x n, unit or non-unit, upper or lower, triangular matrix composed
 * of double precision elements.
 *
 * Input
 * -----
 * uplo   specifies whether the matrix A is an upper or lower triangular
 *        matrix. If uplo = 'U' or 'u', then A is an upper triangular matrix.
 *        If uplo = 'L' or 'l', then A is a lower triangular matrix.
 * trans  specifies op(A). If transa = 'N' or 'n', op(A) = A. If trans = 'T',
 *        't', 'C', or 'c', op(A) = transpose(A)
 * diag   specifies whether or not matrix A is unit triangular. If diag = 'U'
 *        or 'u', A is assumed to be unit triangular. If diag = 'N' or 'n', A
 *        is not assumed to be unit triangular.
 * n      specifies the number of rows and columns of the matrix A. n must be
 *        at least zero.
 * A      double precision array of dimension (lda, n). If uplo = 'U' or 'u',
 *        the leading n x n upper triangular part of the array A must contain
 *        the upper triangular matrix and the strictly lower triangular part
 *        of A is not referenced. If uplo = 'L' or 'l', the leading n x n
 *        lower triangular part of the array A must contain the lower
 *        triangular matrix and the strictly upper triangular part of A is not
 *        referenced. When diag = 'U' or 'u', the diagonal elements of A are
 *        not referenced either, but are assumed to be unity.
 * lda    is the leading dimension of A. It must be at least max (1, n).
 * x      double precision array of length at least (1 + (n - 1) * abs(incx)).
 *        On entry, x contains the source vector. On exit, x is overwritten
 *        with the result vector.
 * incx   specifies the storage spacing for elements of x. incx must not be
 *        zero.
 *
 * Output
 * ------
 * x      updated according to x = op(A) * x,
 *
 * Reference: http://www.netlib.org/blas/dtrmv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if incx == 0 or if n < 0
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasDtrmv (char uplo, char trans, char diag, int n,
                            const double *A, int lda, double *x, int incx);
/*
 * void
 * cublasDgbmv (char trans, int m, int n, int kl, int ku, double alpha,
 *              const double *A, int lda, const double *x, int incx,
 *              double beta, double *y, int incy);
 *
 * performs one of the matrix-vector operations
 *
 *    y = alpha*op(A)*x + beta*y,  op(A)=A or op(A) = transpose(A)
 *
 * alpha and beta are double precision scalars. x and y are double precision
 * vectors. A is an m by n band matrix consisting of double precision elements
 * with kl sub-diagonals and ku super-diagonals.
 *
 * Input
 * -----
 * trans  specifies op(A). If trans == 'N' or 'n', op(A) = A. If trans == 'T',
 *        't', 'C', or 'c', op(A) = transpose(A)
 * m      specifies the number of rows of the matrix A. m must be at least
 *        zero.
 * n      specifies the number of columns of the matrix A. n must be at least
 *        zero.
 * kl     specifies the number of sub-diagonals of matrix A. It must be at
 *        least zero.
 * ku     specifies the number of super-diagonals of matrix A. It must be at
 *        least zero.
 * alpha  double precision scalar multiplier applied to op(A).
 * A      double precision array of dimensions (lda, n). The leading
 *        (kl + ku + 1) x n part of the array A must contain the band matrix A,
 *        supplied column by column, with the leading diagonal of the matrix
 *        in row (ku + 1) of the array, the first super-diagonal starting at
 *        position 2 in row ku, the first sub-diagonal starting at position 1
 *        in row (ku + 2), and so on. Elements in the array A that do not
 *        correspond to elements in the band matrix (such as the top left
 *        ku x ku triangle) are not referenced.
 * lda    leading dimension of A. lda must be at least (kl + ku + 1).
 * x      double precision array of length at least (1+(n-1)*abs(incx)) when
 *        trans == 'N' or 'n' and at least (1+(m-1)*abs(incx)) otherwise.
 * incx   specifies the increment for the elements of x. incx must not be
 *        zero.
 * beta   double precision scalar multiplier applied to vector y. If beta is
 *        zero, y is not read.
 * y      double precision array of length at least (1+(m-1)*abs(incy)) when
 *        trans == 'N' or 'n' and at least (1+(n-1)*abs(incy)) otherwise. If
 *        beta is zero, y is not read.
 * incy   On entry, incy specifies the increment for the elements of y. incy
 *        must not be zero.
 *
 * Output
 * ------
 * y      updated according to y = alpha*op(A)*x + beta*y
 *
 * Reference: http://www.netlib.org/blas/dgbmv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if n < 0, or if incx or incy == 0
 * CUBLAS_STATUS_ARCH_MISMATCH    if invoked on device without DP support
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasDgbmv (char trans, int m, int n, int kl, int ku,
                            double alpha, const double *A, int lda,
                            const double *x, int incx, double beta,
                            double *y, int incy);
/*
 * void
 * cublasDtbmv (char uplo, char trans, char diag, int n, int k, const double *A,
 *              int lda, double *x, int incx)
 *
 * performs one of the matrix-vector operations x = op(A) * x, where op(A) = A,
 * or op(A) = transpose(A). x is an n-element double precision vector, and A is
 * an n x n, unit or non-unit, upper or lower triangular band matrix composed
 * of double precision elements.
 *
 * Input
 * -----
 * uplo   specifies whether the matrix A is an upper or lower triangular band
 *        matrix. If uplo == 'U' or 'u', A is an upper triangular band matrix.
 *        If uplo == 'L' or 'l', A is a lower triangular band matrix.
 * trans  specifies op(A). If transa == 'N' or 'n', op(A) = A. If trans == 'T',
 *        't', 'C', or 'c', op(A) = transpose(A)
 * diag   specifies whether or not matrix A is unit triangular. If diag == 'U'
 *        or 'u', A is assumed to be unit triangular. If diag == 'N' or 'n', A
 *        is not assumed to be unit triangular.
 * n      specifies the number of rows and columns of the matrix A. n must be
 *        at least zero.
 * k      specifies the number of super- or sub-diagonals. If uplo == 'U' or
 *        'u', k specifies the number of super-diagonals. If uplo == 'L' or
 *        'l', k specifies the number of sub-diagonals. k must at least be
 *        zero.
 * A      double precision array of dimension (lda, n). If uplo == 'U' or 'u',
 *        the leading (k + 1) x n part of the array A must contain the upper
 *        triangular band matrix, supplied column by column, with the leading
 *        diagonal of the matrix in row (k + 1) of the array, the first
 *        super-diagonal starting at position 2 in row k, and so on. The top
 *        left k x k triangle of the array A is not referenced. If uplo == 'L'
 *        or 'l', the leading (k + 1) x n part of the array A must contain the
 *        lower triangular band matrix, supplied column by column, with the
 *        leading diagonal of the matrix in row 1 of the array, the first
 *        sub-diagonal starting at position 1 in row 2, and so on. The bottom
 *        right k x k triangle of the array is not referenced.
 * lda    is the leading dimension of A. It must be at least (k + 1).
 * x      double precision array of length at least (1 + (n - 1) * abs(incx)).
 *        On entry, x contains the source vector. On exit, x is overwritten
 *        with the result vector.
 * incx   specifies the storage spacing for elements of x. incx must not be
 *        zero.
 *
 * Output
 * ------
 * x      updated according to x = op(A) * x
 *
 * Reference: http://www.netlib.org/blas/dtbmv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if n or k < 0, or if incx == 0
 * CUBLAS_STATUS_ALLOC_FAILED     if function cannot allocate enough internal scratch vector memory
 * CUBLAS_STATUS_ARCH_MISMATCH    if invoked on device without DP support
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasDtbmv (char uplo, char trans, char diag, int n,
                            int k, const double *A, int lda, double *x,
                            int incx);
/*
 * void
 * cublasDtpmv (char uplo, char trans, char diag, int n, const double *AP,
 *              double *x, int incx);
 *
 * performs one of the matrix-vector operations x = op(A) * x, where op(A) = A,
 * or op(A) = transpose(A). x is an n element double precision vector, and A
 * is an n x n, unit or non-unit, upper or lower triangular matrix composed
 * of double precision elements.
 *
 * Input
 * -----
 * uplo   specifies whether the matrix A is an upper or lower triangular
 *        matrix. If uplo == 'U' or 'u', then A is an upper triangular matrix.
 *        If uplo == 'L' or 'l', then A is a lower triangular matrix.
 * trans  specifies op(A). If transa == 'N' or 'n', op(A) = A. If trans == 'T',
 *        't', 'C', or 'c', op(A) = transpose(A)
 * diag   specifies whether or not matrix A is unit triangular. If diag == 'U'
 *        or 'u', A is assumed to be unit triangular. If diag == 'N' or 'n', A
 *        is not assumed to be unit triangular.
 * n      specifies the number of rows and columns of the matrix A. n must be
 *        at least zero. In the current implementation n must not exceed 4070.
 * AP     double precision array with at least ((n * (n + 1)) / 2) elements.
 *        If uplo == 'U' or 'u', the array AP contains the upper triangular
 *        part of the symmetric matrix A, packed sequentially, column by
 *        column; that is, if i <= j, then A[i,j] is stored in
 *        AP[i+(j*(j+1)/2)]. If uplo == 'L' or 'l', the array AP contains the
 *        lower triangular part of the symmetric matrix A, packed
 *        sequentially, column by column; that is, if i >= j, then A[i,j] is
 *        stored in AP[i+((2*n-j+1)*j)/2].
 * x      double precision array of length at least (1 + (n - 1) * abs(incx)).
 *        On entry, x contains the source vector. On exit, x is overwritten
 *        with the result vector.
 * incx   specifies the storage spacing for elements of x. incx must not be
 *        zero.
 *
 * Output
 * ------
 * x      updated according to x = op(A) * x,
 *
 * Reference: http://www.netlib.org/blas/dtpmv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if incx == 0 or n < 0
 * CUBLAS_STATUS_ALLOC_FAILED     if function cannot allocate enough internal scratch vector memory
 * CUBLAS_STATUS_ARCH_MISMATCH    if invoked on device without DP support
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasDtpmv (char uplo, char trans, char diag, int n,
                            const double *AP, double *x, int incx);
/*
 * void
 * cublasDtpsv (char uplo, char trans, char diag, int n, const double *AP,
 *              double *X, int incx)
 *
 * solves one of the systems of equations op(A)*x = b, where op(A) is either
 * op(A) = A or op(A) = transpose(A). b and x are n element vectors, and A is
 * an n x n unit or non-unit, upper or lower triangular matrix. No test for
 * singularity or near-singularity is included in this routine. Such tests
 * must be performed before calling this routine.
 *
 * Input
 * -----
 * uplo   specifies whether the matrix is an upper or lower triangular matrix
 *        as follows: If uplo == 'U' or 'u', A is an upper triangular matrix.
 *        If uplo == 'L' or 'l', A is a lower triangular matrix.
 * trans  specifies op(A). If trans == 'N' or 'n', op(A) = A. If trans == 'T',
 *        't', 'C', or 'c', op(A) = transpose(A).
 * diag   specifies whether A is unit triangular. If diag == 'U' or 'u', A is
 *        assumed to be unit triangular; that is, diagonal elements are not
 *        read and are assumed to be unity. If diag == 'N' or 'n', A is not
 *        assumed to be unit triangular.
 * n      specifies the number of rows and columns of the matrix A. n must be
 *        at least zero.
 * AP     double precision array with at least ((n*(n+1))/2) elements. If uplo
 *        == 'U' or 'u', the array AP contains the upper triangular matrix A,
 *        packed sequentially, column by column; that is, if i <= j, then
 *        A[i,j] is stored in AP[i+(j*(j+1)/2)]. If uplo == 'L' or 'l', the
 *        array AP contains the lower triangular matrix A, packed sequentially,
 *        column by column; that is, if i >= j, then A[i,j] is stored in
 *        AP[i+((2*n-j+1)*j)/2]. When diag = 'U' or 'u', the diagonal elements
 *        of A are not referenced and are assumed to be unity.
 * x      double precision array of length at least (1+(n-1)*abs(incx)).
 * incx   storage spacing between elements of x. It must not be zero.
 *
 * Output
 * ------
 * x      updated to contain the solution vector x that solves op(A) * x = b.
 *
 * Reference: http://www.netlib.org/blas/dtpsv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if incx == 0 or if n < 0 or n > 2035
 * CUBLAS_STATUS_ARCH_MISMATCH    if invoked on device without DP support
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasDtpsv (char uplo, char trans, char diag, int n,
                            const double *AP, double *x, int incx);
/*
 * void cublasDtbsv (char uplo, char trans, char diag, int n, int k,
 *                   const double *A, int lda, double *X, int incx)
 *
 * solves one of the systems of equations op(A)*x = b, where op(A) is either
 * op(A) = A or op(A) = transpose(A). b and x are n element vectors, and A is
 * an n x n unit or non-unit, upper or lower triangular band matrix with k + 1
 * diagonals. No test for singularity or near-singularity is included in this
 * function. Such tests must be performed before calling this function.
 *
 * Input
 * -----
 * uplo   specifies whether the matrix is an upper or lower triangular band
 *        matrix as follows: If uplo == 'U' or 'u', A is an upper triangular
 *        band matrix. If uplo == 'L' or 'l', A is a lower triangular band
 *        matrix.
 * trans  specifies op(A). If trans == 'N' or 'n', op(A) = A. If trans == 'T',
 *        't', 'C', or 'c', op(A) = transpose(A).
 * diag   specifies whether A is unit triangular. If diag == 'U' or 'u', A is
 *        assumed to be unit triangular; that is, diagonal elements are not
 *        read and are assumed to be unity. If diag == 'N' or 'n', A is not
 *        assumed to be unit triangular.
 * n      specifies the number of rows and columns of the matrix A. n must be
 *        at least zero.
 * k      specifies the number of super- or sub-diagonals. If uplo == 'U' or
 *        'u', k specifies the number of super-diagonals. If uplo == 'L' or
 *        'l', k specifies the number of sub-diagonals. k must at least be
 *        zero.
 * A      double precision array of dimension (lda, n). If uplo == 'U' or 'u',
 *        the leading (k + 1) x n part of the array A must contain the upper
 *        triangular band matrix, supplied column by column, with the leading
 *        diagonal of the matrix in row (k + 1) of the array, the first super-
 *        diagonal starting at position 2 in row k, and so on. The top left
 *        k x k triangle of the array A is not referenced. If uplo == 'L' or
 *        'l', the leading (k + 1) x n part of the array A must contain the
 *        lower triangular band matrix, supplied column by column, with the
 *        leading diagonal of the matrix in row 1 of the array, the first
 *        sub-diagonal starting at position 1 in row 2, and so on. The bottom
 *        right k x k triangle of the array is not referenced.
 * x      double precision array of length at least (1+(n-1)*abs(incx)).
 * incx   storage spacing between elements of x. It must not be zero.
 *
 * Output
 * ------
 * x      updated to contain the solution vector x that solves op(A) * x = b.
 *
 * Reference: http://www.netlib.org/blas/dtbsv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if incx == 0, n < 0 or n > 2035
 * CUBLAS_STATUS_ARCH_MISMATCH    if invoked on device without DP support
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasDtbsv (char uplo, char trans, char diag, int n,
                            int k, const double *A, int lda, double *x,
                            int incx);
/*
 * void
 * cublasDsymv (char uplo, int n, double alpha, const double *A, int lda,
 *              const double *x, int incx, double beta, double *y, int incy)
 *
 * performs the matrix-vector operation
 *
 *     y = alpha*A*x + beta*y
 *
 * Alpha and beta are double precision scalars, and x and y are double
 * precision vectors, each with n elements. A is a symmetric n x n matrix
 * consisting of double precision elements that is stored in either upper or
 * lower storage mode.
 *
 * Input
 * -----
 * uplo   specifies whether the upper or lower triangular part of the array A
 *        is to be referenced. If uplo == 'U' or 'u', the symmetric matrix A
 *        is stored in upper storage mode, i.e. only the upper triangular part
 *        of A is to be referenced while the lower triangular part of A is to
 *        be inferred. If uplo == 'L' or 'l', the symmetric matrix A is stored
 *        in lower storage mode, i.e. only the lower triangular part of A is
 *        to be referenced while the upper triangular part of A is to be
 *        inferred.
 * n      specifies the number of rows and the number of columns of the
 *        symmetric matrix A. n must be at least zero.
 * alpha  double precision scalar multiplier applied to A*x.
 * A      double precision array of dimensions (lda, n). If uplo == 'U' or 'u',
 *        the leading n x n upper triangular part of the array A must contain
 *        the upper triangular part of the symmetric matrix and the strictly
 *        lower triangular part of A is not referenced. If uplo == 'L' or 'l',
 *        the leading n x n lower triangular part of the array A must contain
 *        the lower triangular part of the symmetric matrix and the strictly
 *        upper triangular part of A is not referenced.
 * lda    leading dimension of A. It must be at least max (1, n).
 * x      double precision array of length at least (1 + (n - 1) * abs(incx)).
 * incx   storage spacing between elements of x. incx must not be zero.
 * beta   double precision scalar multiplier applied to vector y.
 * y      double precision array of length at least (1 + (n - 1) * abs(incy)).
 *        If beta is zero, y is not read.
 * incy   storage spacing between elements of y. incy must not be zero.
 *
 * Output
 * ------
 * y      updated according to y = alpha*A*x + beta*y
 *
 * Reference: http://www.netlib.org/blas/dsymv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if n < 0, or if incx or incy == 0
 * CUBLAS_STATUS_ARCH_MISMATCH    if invoked on device without DP support
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasDsymv (char uplo, int n, double alpha,
                            const double *A, int lda, const double *x,
                            int incx, double beta, double *y, int incy);
/*
 * void
 * cublasDsbmv (char uplo, int n, int k, double alpha, const double *A, int lda,
 *              const double *x, int incx, double beta, double *y, int incy)
 *
 * performs the matrix-vector operation
 *
 *     y := alpha*A*x + beta*y
 *
 * alpha and beta are double precision scalars. x and y are double precision
 * vectors with n elements. A is an n by n symmetric band matrix consisting
 * of double precision elements, with k super-diagonals and the same number
 * of subdiagonals.
 *
 * Input
 * -----
 * uplo   specifies whether the upper or lower triangular part of the symmetric
 *        band matrix A is being supplied. If uplo == 'U' or 'u', the upper
 *        triangular part is being supplied. If uplo == 'L' or 'l', the lower
 *        triangular part is being supplied.
 * n      specifies the number of rows and the number of columns of the
 *        symmetric matrix A. n must be at least zero.
 * k      specifies the number of super-diagonals of matrix A. Since the matrix
 *        is symmetric, this is also the number of sub-diagonals. k must be at
 *        least zero.
 * alpha  double precision scalar multiplier applied to A*x.
 * A      double precision array of dimensions (lda, n). When uplo == 'U' or
 *        'u', the leading (k + 1) x n part of array A must contain the upper
 *        triangular band of the symmetric matrix, supplied column by column,
 *        with the leading diagonal of the matrix in row (k+1) of the array,
 *        the first super-diagonal starting at position 2 in row k, and so on.
 *        The top left k x k triangle of the array A is not referenced. When
 *        uplo == 'L' or 'l', the leading (k + 1) x n part of the array A must
 *        contain the lower triangular band part of the symmetric matrix,
 *        supplied column by column, with the leading diagonal of the matrix in
 *        row 1 of the array, the first sub-diagonal starting at position 1 in
 *        row 2, and so on. The bottom right k x k triangle of the array A is
 *        not referenced.
 * lda    leading dimension of A. lda must be at least (k + 1).
 * x      double precision array of length at least (1 + (n - 1) * abs(incx)).
 * incx   storage spacing between elements of x. incx must not be zero.
 * beta   double precision scalar multiplier applied to vector y. If beta is
 *        zero, y is not read.
 * y      double precision array of length at least (1 + (n - 1) * abs(incy)).
 *        If beta is zero, y is not read.
 * incy   storage spacing between elements of y. incy must not be zero.
 *
 * Output
 * ------
 * y      updated according to alpha*A*x + beta*y
 *
 * Reference: http://www.netlib.org/blas/dsbmv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if k or n < 0, or if incx or incy == 0
 * CUBLAS_STATUS_ARCH_MISMATCH    if invoked on device without DP support
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasDsbmv (char uplo, int n, int k, double alpha,
                            const double *A, int lda, const double *x,
                            int incx, double beta, double *y, int incy);
/*
 * void
 * cublasDspmv (char uplo, int n, double alpha, const double *AP, const double *x,
 *              int incx, double beta, double *y, int incy)
 *
 * performs the matrix-vector operation
 *
 *    y = alpha * A * x + beta * y
 *
 * Alpha and beta are double precision scalars, and x and y are double
 * precision vectors with n elements. A is a symmetric n x n matrix
 * consisting of double precision elements that is supplied in packed form.
 *
 * Input
 * -----
 * uplo   specifies whether the matrix data is stored in the upper or the
 *        lower triangular part of array AP. If uplo == 'U' or 'u', then the
 *        upper triangular part of A is supplied in AP. If uplo == 'L' or 'l',
 *        then the lower triangular part of A is supplied in AP.
 * n      specifies the number of rows and columns of the matrix A. It must
 *        be at least zero.
 * alpha  double precision scalar multiplier applied to A*x.
 * AP     double precision array with at least ((n * (n + 1)) / 2) elements.
 *        If uplo == 'U' or 'u', the array AP contains the upper triangular
 *        part of the symmetric matrix A, packed sequentially, column by
 *        column; that is, if i <= j, then A[i,j] is stored in
 *        AP[i+(j*(j+1)/2)]. If uplo == 'L' or 'l', the array AP contains the
 *        lower triangular part of the symmetric matrix A, packed
 *        sequentially, column by column; that is, if i >= j, then A[i,j] is
 *        stored in AP[i+((2*n-j+1)*j)/2].
 * x      double precision array of length at least (1 + (n - 1) * abs(incx)).
 * incx   storage spacing between elements of x. incx must not be zero.
 * beta   double precision scalar multiplier applied to vector y.
 * y      double precision array of length at least (1 + (n - 1) * abs(incy)).
 *        If beta is zero, y is not read.
 * incy   storage spacing between elements of y. incy must not be zero.
 *
 * Output
 * ------
 * y      updated according to y = alpha*A*x + beta*y
 *
 * Reference: http://www.netlib.org/blas/dspmv.f
 *
 * Error status for this function can be retrieved via cublasGetError().
 *
 * Error Status
 * ------------
 * CUBLAS_STATUS_NOT_INITIALIZED  if CUBLAS library has not been initialized
 * CUBLAS_STATUS_INVALID_VALUE    if n < 0, or if incx or incy == 0
 * CUBLAS_STATUS_ARCH_MISMATCH    if invoked on device without DP support
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
void CUBLASAPI cublasDspmv (char uplo, int n, double alpha,
                            const double *AP, const double *x,
                            int incx, double beta, double *y, int incy);
/* ---------------- CUBLAS double precision BLAS3 functions ---------------- */
/*
 * void
 * cublasDgemm (char transa, char transb, int m, int n, int k, double alpha,
 *              const double *A, int lda, const double *B, int ldb,
 *              double beta, double *C, int ldc)
 *
 * computes the product of matrix A and matrix B, multiplies the result
 * by scalar alpha, and adds the sum to the product of matrix C and
skipping to change at line 3945 skipping to change at line 7051
 * CUBLAS_STATUS_ARCH_MISMATCH    if invoked on device without DP support
 * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
 */
/* Symmetric matrix-matrix multiply, C = alpha*A*B + beta*C or
   C = alpha*B*A + beta*C; see the description above. */
void CUBLASAPI cublasDsymm (char side, char uplo, int m, int n,
                            double alpha, const double *A, int lda,
                            const double *B, int ldb, double beta,
                            double *C, int ldc);
/* /*
* void * void
* cublasZsymm (char side, char uplo, int m, int n, cuDoubleComplex alpha,
* const cuDoubleComplex *A, int lda, const cuDoubleComplex *B
, int ldb,
* cuDoubleComplex beta, cuDoubleComplex *C, int ldc);
*
* performs one of the matrix-matrix operations
*
* C = alpha * A * B + beta * C, or
* C = alpha * B * A + beta * C,
*
* where alpha and beta are double precision complex scalars, A is a symmet
ric matrix
* consisting of double precision complex elements and stored in either low
er or upper
* storage mode, and B and C are m x n matrices consisting of double precis
ion
* complex elements.
*
* Input
* -----
* side specifies whether the symmetric matrix A appears on the left side
* hand side or right hand side of matrix B, as follows. If side ==
'L'
* or 'l', then C = alpha * A * B + beta * C. If side = 'R' or 'r',
* then C = alpha * B * A + beta * C.
* uplo specifies whether the symmetric matrix A is stored in upper or lo
wer
* storage mode, as follows. If uplo == 'U' or 'u', only the upper
* triangular part of the symmetric matrix is to be referenced, and
the
* elements of the strictly lower triangular part are to be infered
from
* those in the upper triangular part. If uplo == 'L' or 'l', only t
he
* lower triangular part of the symmetric matrix is to be referenced
,
* and the elements of the strictly upper triangular part are to be
* infered from those in the lower triangular part.
* m specifies the number of rows of the matrix C, and the number of r
ows
* of matrix B. It also specifies the dimensions of symmetric matrix
A
* when side == 'L' or 'l'. m must be at least zero.
* n specifies the number of columns of the matrix C, and the number o
f
* columns of matrix B. It also specifies the dimensions of symmetri
c
* matrix A when side == 'R' or 'r'. n must be at least zero.
* alpha double precision scalar multiplier applied to A * B, or B * A
* A double precision array of dimensions (lda, ka), where ka is m whe
n
* side == 'L' or 'l' and is n otherwise. If side == 'L' or 'l' the
* leading m x m part of array A must contain the symmetric matrix,
* such that when uplo == 'U' or 'u', the leading m x m part stores
the
* upper triangular part of the symmetric matrix, and the strictly l
ower
* triangular part of A is not referenced, and when uplo == 'U' or '
u',
* the leading m x m part stores the lower triangular part of the
* symmetric matrix and the strictly upper triangular part is not
* referenced. If side == 'R' or 'r' the leading n x n part of array
A
* must contain the symmetric matrix, such that when uplo == 'U' or
'u',
* the leading n x n part stores the upper triangular part of the
* symmetric matrix and the strictly lower triangular part of A is n
ot
* referenced, and when uplo == 'U' or 'u', the leading n x n part
* stores the lower triangular part of the symmetric matrix and the
* strictly upper triangular part is not referenced.
* lda leading dimension of A. When side == 'L' or 'l', it must be at le
ast
* max(1, m) and at least max(1, n) otherwise.
* B double precision array of dimensions (ldb, n). On entry, the lead
ing
* m x n part of the array contains the matrix B.
* ldb leading dimension of B. It must be at least max (1, m).
* beta double precision scalar multiplier applied to C. If beta is zero,
C
* does not have to be a valid input
* C double precision array of dimensions (ldc, n)
* ldc leading dimension of C. Must be at least max(1, m)
*
* Output
* ------
* C updated according to C = alpha * A * B + beta * C, or C = alpha *
* B * A + beta * C
*
* Reference: http://www.netlib.org/blas/zsymm.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if m or n are < 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* cublasZsymm: double-precision complex symmetric matrix-matrix multiply.
 * Computes C = alpha*A*B + beta*C (side == 'L'/'l') or
 * C = alpha*B*A + beta*C (side == 'R'/'r'); see the comment block above.
 * (Restored: the extracted text split identifiers across line breaks.) */
void CUBLASAPI cublasZsymm (char side, char uplo, int m, int n,
                            cuDoubleComplex alpha, const cuDoubleComplex *A,
                            int lda, const cuDoubleComplex *B, int ldb,
                            cuDoubleComplex beta, cuDoubleComplex *C, int ldc);
/*
* void
* cublasDsyrk (char uplo, char trans, int n, int k, double alpha, * cublasDsyrk (char uplo, char trans, int n, int k, double alpha,
* const double *A, int lda, double beta, double *C, int ldc) * const double *A, int lda, double beta, double *C, int ldc)
* *
* performs one of the symmetric rank k operations * performs one of the symmetric rank k operations
* *
* C = alpha * A * transpose(A) + beta * C, or * C = alpha * A * transpose(A) + beta * C, or
* C = alpha * transpose(A) * A + beta * C. * C = alpha * transpose(A) * A + beta * C.
* *
* Alpha and beta are double precision scalars. C is an n x n symmetric mat rix * Alpha and beta are double precision scalars. C is an n x n symmetric mat rix
* consisting of double precision elements and stored in either lower or * consisting of double precision elements and stored in either lower or
 * [... documentation elided in this extract ...]
* otherwise the leading k x n part of the array must contains the * otherwise the leading k x n part of the array must contains the
* matrix A. * matrix A.
* lda leading dimension of A. When trans == 'N' or 'n' then lda must be at * lda leading dimension of A. When trans == 'N' or 'n' then lda must be at
* least max(1, n). Otherwise lda must be at least max(1, k). * least max(1, n). Otherwise lda must be at least max(1, k).
 * beta   double precision scalar multiplier applied to C. If beta is zero, C
* does not have to be a valid input * does not have to be a valid input
* C double precision array of dimensions (ldc, n). If uplo = 'U' or ' u', * C double precision array of dimensions (ldc, n). If uplo = 'U' or ' u',
* the leading n x n triangular part of the array C must contain the * the leading n x n triangular part of the array C must contain the
* upper triangular part of the symmetric matrix C and the strictly * upper triangular part of the symmetric matrix C and the strictly
* lower triangular part of C is not referenced. On exit, the upper * lower triangular part of C is not referenced. On exit, the upper
 *        triangular part of C is overwritten by the upper triangular part of
* the updated matrix. If uplo = 'L' or 'l', the leading n x n * the updated matrix. If uplo = 'L' or 'l', the leading n x n
* triangular part of the array C must contain the lower triangular part * triangular part of the array C must contain the lower triangular part
* of the symmetric matrix C and the strictly upper triangular part of C * of the symmetric matrix C and the strictly upper triangular part of C
* is not referenced. On exit, the lower triangular part of C is * is not referenced. On exit, the lower triangular part of C is
 *        overwritten by the lower triangular part of the updated matrix.
* ldc leading dimension of C. It must be at least max(1, n). * ldc leading dimension of C. It must be at least max(1, n).
* *
* Output * Output
* ------ * ------
* C updated according to C = alpha * A * transpose(A) + beta * C, or C = * C updated according to C = alpha * A * transpose(A) + beta * C, or C =
* alpha * transpose(A) * A + beta * C * alpha * transpose(A) * A + beta * C
* *
* Reference: http://www.netlib.org/blas/dsyrk.f * Reference: http://www.netlib.org/blas/dsyrk.f
* *
* Error status for this function can be retrieved via cublasGetError(). * Error status for this function can be retrieved via cublasGetError().
 * [... documentation elided in this extract ...]
* otherwise the leading k x n part of the array must contains the * otherwise the leading k x n part of the array must contains the
* matrix A. * matrix A.
* lda leading dimension of A. When trans == 'N' or 'n' then lda must be at * lda leading dimension of A. When trans == 'N' or 'n' then lda must be at
* least max(1, n). Otherwise lda must be at least max(1, k). * least max(1, n). Otherwise lda must be at least max(1, k).
 * beta   double precision complex scalar multiplier applied to C. If beta is zero, C
* does not have to be a valid input * does not have to be a valid input
* C double precision complex array of dimensions (ldc, n). If uplo = 'U' or 'u', * C double precision complex array of dimensions (ldc, n). If uplo = 'U' or 'u',
* the leading n x n triangular part of the array C must contain the * the leading n x n triangular part of the array C must contain the
* upper triangular part of the symmetric matrix C and the strictly * upper triangular part of the symmetric matrix C and the strictly
* lower triangular part of C is not referenced. On exit, the upper * lower triangular part of C is not referenced. On exit, the upper
 *        triangular part of C is overwritten by the upper triangular part of
* the updated matrix. If uplo = 'L' or 'l', the leading n x n * the updated matrix. If uplo = 'L' or 'l', the leading n x n
* triangular part of the array C must contain the lower triangular part * triangular part of the array C must contain the lower triangular part
* of the symmetric matrix C and the strictly upper triangular part of C * of the symmetric matrix C and the strictly upper triangular part of C
* is not referenced. On exit, the lower triangular part of C is * is not referenced. On exit, the lower triangular part of C is
 *        overwritten by the lower triangular part of the updated matrix.
* ldc leading dimension of C. It must be at least max(1, n). * ldc leading dimension of C. It must be at least max(1, n).
* *
* Output * Output
* ------ * ------
* C updated according to C = alpha * A * transpose(A) + beta * C, or C = * C updated according to C = alpha * A * transpose(A) + beta * C, or C =
* alpha * transpose(A) * A + beta * C * alpha * transpose(A) * A + beta * C
* *
* Reference: http://www.netlib.org/blas/zsyrk.f * Reference: http://www.netlib.org/blas/zsyrk.f
* *
* Error status for this function can be retrieved via cublasGetError(). * Error status for this function can be retrieved via cublasGetError().
 * [... documentation elided in this extract ...]
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize d
* CUBLAS_STATUS_INVALID_VALUE if n < 0 or k < 0 * CUBLAS_STATUS_INVALID_VALUE if n < 0 or k < 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support * CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/ */
/* cublasZsyrk: double-precision complex symmetric rank-k update.
 * Computes C = alpha*A*transpose(A) + beta*C (trans == 'N'/'n') or
 * C = alpha*transpose(A)*A + beta*C; see the comment block above.
 * (Restored: the extracted text carried the declaration twice per line.) */
void CUBLASAPI cublasZsyrk (char uplo, char trans, int n, int k,
                            cuDoubleComplex alpha,
                            const cuDoubleComplex *A, int lda,
                            cuDoubleComplex beta,
                            cuDoubleComplex *C, int ldc);
/*
* void
* cublasZsyr2k (char uplo, char trans, int n, int k, cuDoubleComplex alpha
,
* const cuDoubleComplex *A, int lda, const cuDoubleComplex *
B, int ldb,
* cuDoubleComplex beta, cuDoubleComplex *C, int ldc)
*
* performs one of the symmetric rank 2k operations
*
* C = alpha * A * transpose(B) + alpha * B * transpose(A) + beta * C, o
r
* C = alpha * transpose(A) * B + alpha * transpose(B) * A + beta * C.
*
* Alpha and beta are double precision complex scalars. C is an n x n symme
tric matrix
* consisting of double precision complex elements and stored in either low
er or upper
* storage mode. A and B are matrices consisting of double precision comple
x elements
* with dimension of n x k in the first case, and k x n in the second case.
*
* Input
* -----
* uplo specifies whether the symmetric matrix C is stored in upper or lo
wer
* storage mode, as follows. If uplo == 'U' or 'u', only the upper
* triangular part of the symmetric matrix is to be referenced, and
the
* elements of the strictly lower triangular part are to be infered
from
 *          those in the upper triangular part. If uplo == 'L' or 'l', only the
 *          lower triangular part of the symmetric matrix is to be referenced,
 *          and the elements of the strictly upper triangular part are to be
 *          inferred from those in the lower triangular part.
* trans specifies the operation to be performed. If trans == 'N' or 'n',
* C = alpha * A * transpose(B) + alpha * B * transpose(A) + beta *
C,
* If trans == 'T', 't', 'C', or 'c', C = alpha * transpose(A) * B +
* alpha * transpose(B) * A + beta * C.
* n specifies the number of rows and the number columns of matrix C.
If
* trans == 'N' or 'n', n specifies the number of rows of matrix A.
If
* trans == 'T', 't', 'C', or 'c', n specifies the columns of matrix
A.
* n must be at least zero.
* k If trans == 'N' or 'n', k specifies the number of rows of matrix
A.
* If trans == 'T', 't', 'C', or 'c', k specifies the number of rows
of
* matrix A. k must be at least zero.
* alpha double precision scalar multiplier.
* A double precision array of dimensions (lda, ka), where ka is k whe
n
* trans == 'N' or 'n', and is n otherwise. When trans == 'N' or 'n'
,
* the leading n x k part of array A must contain the matrix A,
* otherwise the leading k x n part of the array must contain the ma
trix
* A.
* lda leading dimension of A. When trans == 'N' or 'n' then lda must be
at
* least max(1, n). Otherwise lda must be at least max(1,k).
* B double precision array of dimensions (lda, kb), where kb is k whe
n
* trans == 'N' or 'n', and is n otherwise. When trans == 'N' or 'n'
,
* the leading n x k part of array B must contain the matrix B,
* otherwise the leading k x n part of the array must contain the ma
trix
* B.
 * ldb    leading dimension of B. When trans == 'N' or 'n' then ldb must be at
 *        least max(1, n). Otherwise ldb must be at least max(1, k).
* beta double precision scalar multiplier applied to C. If beta is zero,
C
* does not have to be a valid input.
* C double precision array of dimensions (ldc, n). If uplo == 'U' or
'u',
* the leading n x n triangular part of the array C must contain the
* upper triangular part of the symmetric matrix C and the strictly
* lower triangular part of C is not referenced. On exit, the upper
* triangular part of C is overwritten by the upper triangular part
of
* the updated matrix. If uplo == 'L' or 'l', the leading n x n
* triangular part of the array C must contain the lower triangular
part
* of the symmetric matrix C and the strictly upper triangular part
of C
* is not referenced. On exit, the lower triangular part of C is
* overwritten by the lower triangular part of the updated matrix.
* ldc leading dimension of C. Must be at least max(1, n).
*
* Output
* ------
* C updated according to alpha*A*transpose(B) + alpha*B*transpose(A)
+
* beta*C or alpha*transpose(A)*B + alpha*transpose(B)*A + beta*C
*
* Reference: http://www.netlib.org/blas/zsyr2k.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0 or k < 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* cublasZsyr2k: double-precision complex symmetric rank-2k update.
 * Computes C = alpha*A*transpose(B) + alpha*B*transpose(A) + beta*C
 * (trans == 'N'/'n') or the transposed variant; see the comment block above.
 * (Restored: the extracted text split identifiers across line breaks.) */
void CUBLASAPI cublasZsyr2k (char uplo, char trans, int n, int k,
                             cuDoubleComplex alpha, const cuDoubleComplex *A,
                             int lda, const cuDoubleComplex *B, int ldb,
                             cuDoubleComplex beta, cuDoubleComplex *C, int ldc);
/*
* void
* cublasZher2k (char uplo, char trans, int n, int k, cuDoubleComplex alpha
,
* const cuDoubleComplex *A, int lda, const cuDoubleComplex *
B, int ldb,
* double beta, cuDoubleComplex *C, int ldc)
*
* performs one of the hermitian rank 2k operations
*
* C = alpha * A * conjugate(transpose(B))
* + conjugate(alpha) * B * conjugate(transpose(A))
* + beta * C ,
* or
* C = alpha * conjugate(transpose(A)) * B
* + conjugate(alpha) * conjugate(transpose(B)) * A
* + beta * C.
*
* Alpha is double precision complex scalar whereas Beta is a double precis
ion real scalar.
* C is an n x n hermitian matrix consisting of double precision complex el
ements and
* stored in either lower or upper storage mode. A and B are matrices consi
sting of
* double precision complex elements with dimension of n x k in the first c
ase,
* and k x n in the second case.
*
* Input
* -----
* uplo specifies whether the hermitian matrix C is stored in upper or lo
wer
* storage mode, as follows. If uplo == 'U' or 'u', only the upper
* triangular part of the hermitian matrix is to be referenced, and
the
* elements of the strictly lower triangular part are to be infered
from
 *          those in the upper triangular part. If uplo == 'L' or 'l', only the
 *          lower triangular part of the hermitian matrix is to be referenced,
 *          and the elements of the strictly upper triangular part are to be
 *          inferred from those in the lower triangular part.
* trans specifies the operation to be performed. If trans == 'N' or 'n',
* C = alpha * A * conjugate(transpose(B))
* + conjugate(alpha) * B * conjugate(transpose(A))
* + beta * C .
* If trans == 'T', 't', 'C', or 'c',
* C = alpha * conjugate(transpose(A)) * B
* + conjugate(alpha) * conjugate(transpose(B)) * A
* + beta * C.
* n specifies the number of rows and the number columns of matrix C.
If
* trans == 'N' or 'n', n specifies the number of rows of matrix A.
If
* trans == 'T', 't', 'C', or 'c', n specifies the columns of matrix
A.
* n must be at least zero.
* k If trans == 'N' or 'n', k specifies the number of rows of matrix
A.
* If trans == 'T', 't', 'C', or 'c', k specifies the number of rows
of
* matrix A. k must be at least zero.
* alpha double precision scalar multiplier.
* A double precision array of dimensions (lda, ka), where ka is k whe
n
* trans == 'N' or 'n', and is n otherwise. When trans == 'N' or 'n'
,
* the leading n x k part of array A must contain the matrix A,
* otherwise the leading k x n part of the array must contain the ma
trix
* A.
* lda leading dimension of A. When trans == 'N' or 'n' then lda must be
at
* least max(1, n). Otherwise lda must be at least max(1,k).
* B double precision array of dimensions (lda, kb), where kb is k whe
n
* trans == 'N' or 'n', and is n otherwise. When trans == 'N' or 'n'
,
* the leading n x k part of array B must contain the matrix B,
* otherwise the leading k x n part of the array must contain the ma
trix
* B.
 * ldb    leading dimension of B. When trans == 'N' or 'n' then ldb must be at
 *        least max(1, n). Otherwise ldb must be at least max(1, k).
* beta double precision scalar multiplier applied to C. If beta is zero,
C
* does not have to be a valid input.
* C double precision array of dimensions (ldc, n). If uplo == 'U' or
'u',
* the leading n x n triangular part of the array C must contain the
* upper triangular part of the hermitian matrix C and the strictly
* lower triangular part of C is not referenced. On exit, the upper
* triangular part of C is overwritten by the upper triangular part
of
* the updated matrix. If uplo == 'L' or 'l', the leading n x n
* triangular part of the array C must contain the lower triangular
part
* of the hermitian matrix C and the strictly upper triangular part
of C
* is not referenced. On exit, the lower triangular part of C is
* overwritten by the lower triangular part of the updated matrix.
* The imaginary parts of the diagonal elements need
* not be set, they are assumed to be zero, and on exit they
* are set to zero.
* ldc leading dimension of C. Must be at least max(1, n).
*
* Output
* ------
* C updated according to alpha*A*conjugate(transpose(B)) +
* + conjugate(alpha)*B*conjugate(transpose(A)) + beta*C or
* alpha*conjugate(transpose(A))*B + conjugate(alpha)*conjugate(tran
spose(B))*A
* + beta*C.
*
* Reference: http://www.netlib.org/blas/zher2k.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0 or k < 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* cublasZher2k: double-precision complex hermitian rank-2k update.
 * Note alpha is complex while beta is real (double), matching the
 * documentation block above, which gives the exact operation per trans.
 * (Restored: the extracted text split identifiers across line breaks.) */
void CUBLASAPI cublasZher2k (char uplo, char trans, int n, int k,
                             cuDoubleComplex alpha, const cuDoubleComplex *A,
                             int lda, const cuDoubleComplex *B, int ldb,
                             double beta, cuDoubleComplex *C, int ldc);
/*
* void
* cublasZher (char uplo, int n, double alpha, const cuDoubleComplex *x, in
t incx,
* cuDoubleComplex *A, int lda)
*
* performs the hermitian rank 1 operation
*
* A = alpha * x * conjugate(transpose(x)) + A,
*
* where alpha is a double precision real scalar, x is an n element double
* precision complex vector and A is an n x n hermitian matrix consisting o
f
* double precision complex elements. Matrix A is stored in column major fo
rmat,
* and lda is the leading dimension of the two-dimensional array
* containing A.
*
* Input
* -----
* uplo specifies whether the matrix data is stored in the upper or
* the lower triangular part of array A. If uplo = 'U' or 'u',
* then only the upper triangular part of A may be referenced.
* If uplo = 'L' or 'l', then only the lower triangular part of
* A may be referenced.
* n specifies the number of rows and columns of the matrix A. It
* must be at least 0.
* alpha double precision real scalar multiplier applied to
* x * conjugate(transpose(x))
* x double precision complex array of length at least (1 + (n - 1) *
abs(incx))
* incx specifies the storage spacing between elements of x. incx must
* not be zero.
* A double precision complex array of dimensions (lda, n). If uplo =
'U' or
* 'u', then A must contain the upper triangular part of a hermitian
* matrix, and the strictly lower triangular part is not referenced.
* If uplo = 'L' or 'l', then A contains the lower triangular part
* of a hermitian matrix, and the strictly upper triangular part is
* not referenced. The imaginary parts of the diagonal elements need
* not be set, they are assumed to be zero, and on exit they
* are set to zero.
* lda leading dimension of the two-dimensional array containing A. lda
* must be at least max(1, n).
*
* Output
* ------
* A updated according to A = alpha * x * conjugate(transpose(x)) + A
*
* Reference: http://www.netlib.org/blas/zher.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0, or incx == 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* cublasZher: hermitian rank-1 update
 * A = alpha * x * conjugate(transpose(x)) + A, with real scalar alpha;
 * see the comment block above for parameter details.
 * (Restored: the extracted text split identifiers across line breaks.) */
void CUBLASAPI cublasZher (char uplo, int n, double alpha,
                           const cuDoubleComplex *x, int incx,
                           cuDoubleComplex *A, int lda);
/*
* void
* cublasZher (char uplo, int n, double alpha, const cuDoubleComplex *x, in
t incx,
* cuDoubleComplex *A, int lda)
*
* performs the hermitian rank 1 operation
*
 *     A = alpha * x * conjugate(transpose(x)) + A,
*
* where alpha is a double precision real scalar, x is an n element double
* precision complex vector and A is an n x n hermitian matrix consisting o
f
* double precision complex elements. Matrix A is stored in column major fo
rmat,
* and lda is the leading dimension of the two-dimensional array
* containing A.
*
* Input
* -----
* uplo specifies whether the matrix data is stored in the upper or
* the lower triangular part of array A. If uplo = 'U' or 'u',
* then only the upper triangular part of A may be referenced.
* If uplo = 'L' or 'l', then only the lower triangular part of
* A may be referenced.
* n specifies the number of rows and columns of the matrix A. It
* must be at least 0.
* alpha double precision real scalar multiplier applied to
* x * conjugate(transpose(x))
* x double precision complex array of length at least (1 + (n - 1) *
abs(incx))
* incx specifies the storage spacing between elements of x. incx must
* not be zero.
* A double precision complex array of dimensions (lda, n). If uplo =
'U' or
* 'u', then A must contain the upper triangular part of a hermitian
* matrix, and the strictly lower triangular part is not referenced.
* If uplo = 'L' or 'l', then A contains the lower triangular part
* of a hermitian matrix, and the strictly upper triangular part is
* not referenced. The imaginary parts of the diagonal elements need
* not be set, they are assumed to be zero, and on exit they
* are set to zero.
* lda leading dimension of the two-dimensional array containing A. lda
* must be at least max(1, n).
*
* Output
* ------
 * A      updated according to A = alpha * x * conjugate(transpose(x)) + A
*
* Reference: http://www.netlib.org/blas/zher.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0, or incx == 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* NOTE(review): duplicate declaration of cublasZher, identical to the one
 * above (a repeated identical prototype is legal C). Kept to preserve the
 * original file layout; presumably an artifact of the header's history.
 * (Restored: the extracted text split identifiers across line breaks.) */
void CUBLASAPI cublasZher (char uplo, int n, double alpha,
                           const cuDoubleComplex *x, int incx,
                           cuDoubleComplex *A, int lda);
/*
* void
* cublasZhpr (char uplo, int n, double alpha, const cuDoubleComplex *x, in
t incx,
* cuDoubleComplex *AP)
*
* performs the hermitian rank 1 operation
*
* A = alpha * x * conjugate(transpose(x)) + A,
*
* where alpha is a double precision real scalar and x is an n element doub
le
* precision complex vector. A is a hermitian n x n matrix consisting of do
uble
* precision complex elements that is supplied in packed form.
*
* Input
* -----
* uplo specifies whether the matrix data is stored in the upper or the l
ower
* triangular part of array AP. If uplo == 'U' or 'u', then the uppe
r
* triangular part of A is supplied in AP. If uplo == 'L' or 'l', th
en
* the lower triangular part of A is supplied in AP.
* n specifies the number of rows and columns of the matrix A. It must
be
* at least zero.
* alpha double precision real scalar multiplier applied to x * conjugate(
transpose(x)).
* x double precision array of length at least (1 + (n - 1) * abs(incx
)).
* incx storage spacing between elements of x. incx must not be zero.
* AP double precision complex array with at least ((n * (n + 1)) / 2)
elements. If
* uplo == 'U' or 'u', the array AP contains the upper triangular pa
rt
* of the hermitian matrix A, packed sequentially, column by column;
 *          that is, if i <= j, then A[i,j] is stored in AP[i+(j*(j+1)/2)]. If
* uplo == 'L' or 'L', the array AP contains the lower triangular pa
rt
* of the hermitian matrix A, packed sequentially, column by column;
* that is, if i >= j, then A[i,j] is stored in AP[i+((2*n-j+1)*j)/2
].
* The imaginary parts of the diagonal elements need not be set, the
y
* are assumed to be zero, and on exit they are set to zero.
*
* Output
* ------
* A updated according to A = alpha * x * conjugate(transpose(x)) + A
*
* Reference: http://www.netlib.org/blas/zhpr.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0, or incx == 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* cublasZhpr: hermitian rank-1 update of a packed matrix:
 * A = alpha * x * conjugate(transpose(x)) + A, with A supplied in packed
 * storage AP; see the comment block above.
 * (Restored: the extracted text split identifiers across line breaks.) */
void CUBLASAPI cublasZhpr (char uplo, int n, double alpha,
                           const cuDoubleComplex *x, int incx,
                           cuDoubleComplex *AP);
/*
* void
* cublasZhpr2 (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComp
lex *x, int incx,
* const cuDoubleComplex *y, int incy, cuDoubleComplex *AP)
*
* performs the hermitian rank 2 operation
*
* A = alpha*x*conjugate(transpose(y)) + conjugate(alpha)*y*conjugate(tr
anspose(x)) + A,
*
* where alpha is a double precision complex scalar, and x and y are n elem
ent double
* precision complex vectors. A is a hermitian n x n matrix consisting of d
ouble
* precision complex elements that is supplied in packed form.
*
* Input
* -----
* uplo specifies whether the matrix data is stored in the upper or the l
ower
* triangular part of array A. If uplo == 'U' or 'u', then only the
* upper triangular part of A may be referenced and the lower triang
ular
* part of A is inferred. If uplo == 'L' or 'l', then only the lower
* triangular part of A may be referenced and the upper triangular p
art
* of A is inferred.
* n specifies the number of rows and columns of the matrix A. It must
be
* at least zero.
* alpha double precision complex scalar multiplier applied to x * conjuga
te(transpose(y)) +
* y * conjugate(transpose(x)).
* x double precision complex array of length at least (1 + (n - 1) *
abs (incx)).
* incx storage spacing between elements of x. incx must not be zero.
* y double precision complex array of length at least (1 + (n - 1) *
abs (incy)).
* incy storage spacing between elements of y. incy must not be zero.
* AP double precision complex array with at least ((n * (n + 1)) / 2)
elements. If
* uplo == 'U' or 'u', the array AP contains the upper triangular pa
rt
* of the hermitian matrix A, packed sequentially, column by column;
 *          that is, if i <= j, then A[i,j] is stored in AP[i+(j*(j+1)/2)]. If
* uplo == 'L' or 'L', the array AP contains the lower triangular pa
rt
* of the hermitian matrix A, packed sequentially, column by column;
* that is, if i >= j, then A[i,j] is stored in AP[i+((2*n-j+1)*j)/2
].
* The imaginary parts of the diagonal elements need not be set, the
y
* are assumed to be zero, and on exit they are set to zero.
*
* Output
* ------
* A updated according to A = alpha*x*conjugate(transpose(y))
* + conjugate(alpha)*y*conjugate(transpose(x
))+A
*
* Reference: http://www.netlib.org/blas/zhpr2.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0, incx == 0, incy == 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
/* cublasZhpr2: hermitian rank-2 update of a packed matrix:
 * A = alpha*x*conjugate(transpose(y)) + conjugate(alpha)*y*conjugate(transpose(x)) + A,
 * with A supplied in packed storage AP; see the comment block above.
 * (Restored: the extracted text split identifiers across line breaks.) */
void CUBLASAPI cublasZhpr2 (char uplo, int n, cuDoubleComplex alpha,
                            const cuDoubleComplex *x, int incx,
                            const cuDoubleComplex *y, int incy,
                            cuDoubleComplex *AP);
/*
* void cublasZher2 (char uplo, int n, cuDoubleComplex alpha, const cuDoubl
eComplex *x, int incx,
* const cuDoubleComplex *y, int incy, cuDoubleComplex *A
, int lda)
*
* performs the hermitian rank 2 operation
*
* A = alpha*x*conjugate(transpose(y)) + conjugate(alpha)*y*conjugate(tr
anspose(x)) + A,
*
* where alpha is a double precision complex scalar, x and y are n element
double
* precision complex vector and A is an n by n hermitian matrix consisting
of double
* precision complex elements.
*
* Input
* -----
* uplo specifies whether the matrix data is stored in the upper or the l
ower
* triangular part of array A. If uplo == 'U' or 'u', then only the
* upper triangular part of A may be referenced and the lower triang
ular
* part of A is inferred. If uplo == 'L' or 'l', then only the lower
* triangular part of A may be referenced and the upper triangular p
art
* of A is inferred.
* n specifies the number of rows and columns of the matrix A. It must
be
* at least zero.
* alpha double precision complex scalar multiplier applied to x * conjuga
te(transpose(y)) +
* y * conjugate(transpose(x)).
* x double precision array of length at least (1 + (n - 1) * abs (inc
x)).
* incx storage spacing between elements of x. incx must not be zero.
* y double precision array of length at least (1 + (n - 1) * abs (inc
y)).
* incy storage spacing between elements of y. incy must not be zero.
* A double precision complex array of dimensions (lda, n). If uplo ==
'U' or 'u',
* then A must contains the upper triangular part of a hermitian mat
rix,
* and the strictly lower triangular parts is not referenced. If upl
o ==
* 'L' or 'l', then A contains the lower triangular part of a hermit
ian
* matrix, and the strictly upper triangular part is not referenced.
* The imaginary parts of the diagonal elements need not be set,
* they are assumed to be zero, and on exit they are set to zero.
*
* lda leading dimension of A. It must be at least max(1, n).
*
* Output
* ------
* A updated according to A = alpha*x*conjugate(transpose(y))
* + conjugate(alpha)*y*conjugate(transpose(x
))+A
*
* Reference: http://www.netlib.org/blas/zher2.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0, incx == 0, incy == 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasZher2 (char uplo, int n, cuDoubleComplex alpha,
const cuDoubleComplex *x, int incx, const cuDo
ubleComplex *y,
int incy, cuDoubleComplex *A, int lda);
/* /*
* void * void
* cublasDsyr2k (char uplo, char trans, int n, int k, double alpha, * cublasDsyr2k (char uplo, char trans, int n, int k, double alpha,
* const double *A, int lda, const double *B, int ldb, * const double *A, int lda, const double *B, int ldb,
* double beta, double *C, int ldc) * double beta, double *C, int ldc)
* *
* performs one of the symmetric rank 2k operations * performs one of the symmetric rank 2k operations
* *
* C = alpha * A * transpose(B) + alpha * B * transpose(A) + beta * C, o r * C = alpha * A * transpose(B) + alpha * B * transpose(A) + beta * C, o r
skipping to change at line 4159 skipping to change at line 7821
* otherwise the leading k x n part of the array must contain the ma trix * otherwise the leading k x n part of the array must contain the ma trix
* B. * B.
* ldb leading dimension of N. When trans == 'N' or 'n' then ldb must be at * ldb leading dimension of N. When trans == 'N' or 'n' then ldb must be at
* least max(1, n). Otherwise ldb must be at least max(1, k). * least max(1, n). Otherwise ldb must be at least max(1, k).
* beta double precision scalar multiplier applied to C. If beta is zero, C * beta double precision scalar multiplier applied to C. If beta is zero, C
* does not have to be a valid input. * does not have to be a valid input.
* C double precision array of dimensions (ldc, n). If uplo == 'U' or 'u', * C double precision array of dimensions (ldc, n). If uplo == 'U' or 'u',
* the leading n x n triangular part of the array C must contain the * the leading n x n triangular part of the array C must contain the
* upper triangular part of the symmetric matrix C and the strictly * upper triangular part of the symmetric matrix C and the strictly
* lower triangular part of C is not referenced. On exit, the upper * lower triangular part of C is not referenced. On exit, the upper
* triangular part of C is overwritten by the upper trinagular part of * triangular part of C is overwritten by the upper triangular part of
* the updated matrix. If uplo == 'L' or 'l', the leading n x n * the updated matrix. If uplo == 'L' or 'l', the leading n x n
* triangular part of the array C must contain the lower triangular part * triangular part of the array C must contain the lower triangular part
* of the symmetric matrix C and the strictly upper triangular part of C * of the symmetric matrix C and the strictly upper triangular part of C
* is not referenced. On exit, the lower triangular part of C is * is not referenced. On exit, the lower triangular part of C is
* overwritten by the lower trinagular part of the updated matrix. * overwritten by the lower triangular part of the updated matrix.
* ldc leading dimension of C. Must be at least max(1, n). * ldc leading dimension of C. Must be at least max(1, n).
* *
* Output * Output
* ------ * ------
* C updated according to alpha*A*transpose(B) + alpha*B*transpose(A) + * C updated according to alpha*A*transpose(B) + alpha*B*transpose(A) +
* beta*C or alpha*transpose(A)*B + alpha*transpose(B)*A + beta*C * beta*C or alpha*transpose(A)*B + alpha*transpose(B)*A + beta*C
* *
* Reference: http://www.netlib.org/blas/dsyr2k.f * Reference: http://www.netlib.org/blas/dsyr2k.f
* *
* Error status for this function can be retrieved via cublasGetError(). * Error status for this function can be retrieved via cublasGetError().
skipping to change at line 4256 skipping to change at line 7918
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support * CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU * CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/ */
void CUBLASAPI cublasZgemm (char transa, char transb, int m, int n, void CUBLASAPI cublasZgemm (char transa, char transb, int m, int n,
int k, cuDoubleComplex alpha, int k, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda, const cuDoubleComplex *A, int lda,
const cuDoubleComplex *B, int ldb, const cuDoubleComplex *B, int ldb,
cuDoubleComplex beta, cuDoubleComplex *C, cuDoubleComplex beta, cuDoubleComplex *C,
int ldc); int ldc);
/*
* void
* cublasZtrmm (char side, char uplo, char transa, char diag, int m, int n,
* cuDoubleComplex alpha, const cuDoubleComplex *A, int lda, c
onst cuDoubleComplex *B,
* int ldb)
*
* performs one of the matrix-matrix operations
*
* B = alpha * op(A) * B, or B = alpha * B * op(A)
*
* where alpha is a double-precision complex scalar, B is an m x n matrix c
omposed
* of double precision complex elements, and A is a unit or non-unit, upper
or lower,
* triangular matrix composed of double precision complex elements. op(A) i
s one of
*
* op(A) = A , op(A) = transpose(A) or op(A) = conjugate(transpose(A))
*
* Matrices A and B are stored in column major format, and lda and ldb are
* the leading dimensions of the two-dimensional arrays that contain A and
* B, respectively.
*
* Input
* -----
* side specifies whether op(A) multiplies B from the left or right.
* If side = 'L' or 'l', then B = alpha * op(A) * B. If side =
* 'R' or 'r', then B = alpha * B * op(A).
* uplo specifies whether the matrix A is an upper or lower triangular
* matrix. If uplo = 'U' or 'u', A is an upper triangular matrix.
* If uplo = 'L' or 'l', A is a lower triangular matrix.
* transa specifies the form of op(A) to be used in the matrix
* multiplication. If transa = 'N' or 'n', then op(A) = A. If
* transa = 'T' or 't', then op(A) = transpose(A).
* If transa = 'C' or 'c', then op(A) = conjugate(transpose(A)).
* diag specifies whether or not A is unit triangular. If diag = 'U'
* or 'u', A is assumed to be unit triangular. If diag = 'N' or
* 'n', A is not assumed to be unit triangular.
* m the number of rows of matrix B. m must be at least zero.
* n the number of columns of matrix B. n must be at least zero.
* alpha double precision complex scalar multiplier applied to op(A)*B, or
* B*op(A), respectively. If alpha is zero no accesses are made
* to matrix A, and no read accesses are made to matrix B.
* A double precision complex array of dimensions (lda, k). k = m if s
ide =
* 'L' or 'l', k = n if side = 'R' or 'r'. If uplo = 'U' or 'u'
* the leading k x k upper triangular part of the array A must
* contain the upper triangular matrix, and the strictly lower
* triangular part of A is not referenced. If uplo = 'L' or 'l'
* the leading k x k lower triangular part of the array A must
* contain the lower triangular matrix, and the strictly upper
* triangular part of A is not referenced. When diag = 'U' or 'u'
* the diagonal elements of A are not referenced and are assumed
* to be unity.
* lda leading dimension of A. When side = 'L' or 'l', it must be at
* least max(1,m) and at least max(1,n) otherwise
* B double precision complex array of dimensions (ldb, n). On entry,
the
* leading m x n part of the array contains the matrix B. It is
* overwritten with the transformed matrix on exit.
* ldb leading dimension of B. It must be at least max (1, m).
*
* Output
* ------
* B updated according to B = alpha * op(A) * B or B = alpha * B * op
(A)
*
* Reference: http://www.netlib.org/blas/ztrmm.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if m or n < 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasZtrmm (char side, char uplo, char transa,
char diag, int m, int n, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda, cuDoubleComp
lex *B,
int ldb);
/*
* cublasZgeru (int m, int n, cuDoubleComplex alpha, const cuDoubleComplex
*x, int incx,
* const cuDoubleComplex *y, int incy, cuDoubleComplex *A, int
lda)
*
* performs the rank 1 operation
*
* A = alpha * x * transpose(y) + A,
*
* where alpha is a double precision complex scalar, x is an m element doub
le
* precision complex vector, y is an n element double precision complex vec
tor, and A
* is an m by n matrix consisting of double precision complex elements. Mat
rix A
* is stored in column major format, and lda is the leading dimension of
* the two-dimensional array used to store A.
*
* Input
* -----
* m specifies the number of rows of the matrix A. It must be at least
* zero.
* n specifies the number of columns of the matrix A. It must be at
* least zero.
* alpha double precision complex scalar multiplier applied to x * transpo
se(y)
* x double precision complex array of length at least (1 + (m - 1) *
abs(incx))
* incx specifies the storage spacing between elements of x. incx must no
t
* be zero.
* y double precision complex array of length at least (1 + (n - 1) *
abs(incy))
* incy specifies the storage spacing between elements of y. incy must no
t
* be zero.
* A double precision complex array of dimensions (lda, n).
* lda leading dimension of two-dimensional array used to store matrix A
*
* Output
* ------
* A updated according to A = alpha * x * transpose(y) + A
*
* Reference: http://www.netlib.org/blas/zgeru.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if m < 0, n < 0, incx == 0, incy == 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasZgeru (int m, int n, cuDoubleComplex alpha,
const cuDoubleComplex *x, int incx, const cuDou
bleComplex *y,
int incy, cuDoubleComplex *A, int lda);
/*
* cublasZgerc (int m, int n, cuDoubleComplex alpha, const cuDoubleComplex
*x, int incx,
* const cuDoubleComplex *y, int incy, cuDoubleComplex *A, int
lda)
*
* performs the rank 1 operation
*
* A = alpha * x * conjugate(transpose(y)) + A,
*
* where alpha is a double precision complex scalar, x is an m element doub
le
* precision complex vector, y is an n element double precision complex vec
tor, and A
* is an m by n matrix consisting of double precision complex elements. Mat
rix A
* is stored in column major format, and lda is the leading dimension of
* the two-dimensional array used to store A.
*
* Input
* -----
* m specifies the number of rows of the matrix A. It must be at least
* zero.
* n specifies the number of columns of the matrix A. It must be at
* least zero.
* alpha double precision complex scalar multiplier applied to x * conjuga
te(transpose(y))
* x double precision array of length at least (1 + (m - 1) * abs(incx
))
* incx specifies the storage spacing between elements of x. incx must no
t
* be zero.
* y double precision complex array of length at least (1 + (n - 1) *
abs(incy))
* incy specifies the storage spacing between elements of y. incy must no
t
* be zero.
* A double precision complex array of dimensions (lda, n).
* lda leading dimension of two-dimensional array used to store matrix A
*
* Output
* ------
* A updated according to A = alpha * x * conjugate(transpose(y)) + A
*
* Reference: http://www.netlib.org/blas/zgerc.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if m < 0, n < 0, incx == 0, incy == 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasZgerc (int m, int n, cuDoubleComplex alpha,
const cuDoubleComplex *x, int incx, const cuDou
bleComplex *y,
int incy, cuDoubleComplex *A, int lda);
/*
* void
* cublasZherk (char uplo, char trans, int n, int k, double alpha,
* const cuDoubleComplex *A, int lda, double beta, cuDoubleCom
plex *C, int ldc)
*
* performs one of the hermitian rank k operations
*
* C = alpha * A * conjugate(transpose(A)) + beta * C, or
* C = alpha * conjugate(transpose(A)) * A + beta * C.
*
* Alpha and beta are double precision scalars. C is an n x n hermitian mat
rix
* consisting of double precision complex elements and stored in either low
er or
* upper storage mode. A is a matrix consisting of double precision complex
elements
* with dimension of n x k in the first case, and k x n in the second case.
*
* Input
* -----
* uplo specifies whether the hermitian matrix C is stored in upper or lo
wer
* storage mode as follows. If uplo == 'U' or 'u', only the upper
* triangular part of the hermitian matrix is to be referenced, and
the
* elements of the strictly lower triangular part are to be inferred from
* those in the upper triangular part. If uplo == 'L' or 'l', only t
he
* lower triangular part of the hermitian matrix is to be referenced
,
* and the elements of the strictly upper triangular part are to be
* inferred from those in the lower triangular part.
* trans specifies the operation to be performed. If trans == 'N' or 'n',
C =
* alpha * A * conjugate(transpose(A)) + beta * C. If trans == 'T',
't', 'C', or 'c',
* C = alpha * conjugate(transpose(A)) * A + beta * C.
* n specifies the number of rows and the number of columns of matrix C. If
* trans == 'N' or 'n', n specifies the number of rows of matrix A.
If
* trans == 'T', 't', 'C', or 'c', n specifies the columns of matrix
A.
* n must be at least zero.
* k If trans == 'N' or 'n', k specifies the number of columns of matr
ix A.
* If trans == 'T', 't', 'C', or 'c', k specifies the number of rows
of
* matrix A. k must be at least zero.
* alpha double precision scalar multiplier applied to A * conjugate(trans
pose(A)) or
* conjugate(transpose(A)) * A.
* A double precision complex array of dimensions (lda, ka), where ka
is k when
* trans == 'N' or 'n', and is n otherwise. When trans == 'N' or 'n'
,
* the leading n x k part of array A must contain the matrix A,
* otherwise the leading k x n part of the array must contain the
* matrix A.
* lda leading dimension of A. When trans == 'N' or 'n' then lda must be
at
* least max(1, n). Otherwise lda must be at least max(1, k).
* beta double precision scalar multiplier applied to C. If beta is zero,
C
* does not have to be a valid input
* C double precision complex array of dimensions (ldc, n). If uplo =
'U' or 'u',
* the leading n x n triangular part of the array C must contain the
* upper triangular part of the hermitian matrix C and the strictly
* lower triangular part of C is not referenced. On exit, the upper
* triangular part of C is overwritten by the upper triangular part
of
* the updated matrix. If uplo = 'L' or 'l', the leading n x n
* triangular part of the array C must contain the lower triangular
part
* of the hermitian matrix C and the strictly upper triangular part
of C
* is not referenced. On exit, the lower triangular part of C is
* overwritten by the lower triangular part of the updated matrix.
* The imaginary parts of the diagonal elements need
* not be set, they are assumed to be zero, and on exit they
* are set to zero.
* ldc leading dimension of C. It must be at least max(1, n).
*
* Output
* ------
* C updated according to C = alpha * A * conjugate(transpose(A)) + be
ta * C, or C =
* alpha * conjugate(transpose(A)) * A + beta * C
*
* Reference: http://www.netlib.org/blas/zherk.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if n < 0 or k < 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasZherk (char uplo, char trans, int n, int k,
double alpha,
const cuDoubleComplex *A, int lda,
double beta,
cuDoubleComplex *C, int ldc);
/*
* void
* cublasZhemm (char side, char uplo, int m, int n, cuDoubleComplex alpha,
* const cuDoubleComplex *A, int lda, const cuDoubleComplex *B
, int ldb,
* cuDoubleComplex beta, cuDoubleComplex *C, int ldc);
*
* performs one of the matrix-matrix operations
*
* C = alpha * A * B + beta * C, or
* C = alpha * B * A + beta * C,
*
* where alpha and beta are double precision complex scalars, A is a hermit
ian matrix
* consisting of double precision complex elements and stored in either low
er or upper
* storage mode, and B and C are m x n matrices consisting of double precis
ion
* complex elements.
*
* Input
* -----
* side specifies whether the hermitian matrix A appears on the left
* hand side or right hand side of matrix B, as follows. If side ==
'L'
* or 'l', then C = alpha * A * B + beta * C. If side = 'R' or 'r',
* then C = alpha * B * A + beta * C.
* uplo specifies whether the hermitian matrix A is stored in upper or lo
wer
* storage mode, as follows. If uplo == 'U' or 'u', only the upper
* triangular part of the hermitian matrix is to be referenced, and
the
* elements of the strictly lower triangular part are to be inferred from
* those in the upper triangular part. If uplo == 'L' or 'l', only t
he
* lower triangular part of the hermitian matrix is to be referenced
,
* and the elements of the strictly upper triangular part are to be
* inferred from those in the lower triangular part.
* m specifies the number of rows of the matrix C, and the number of r
ows
* of matrix B. It also specifies the dimensions of hermitian matrix
A
* when side == 'L' or 'l'. m must be at least zero.
* n specifies the number of columns of the matrix C, and the number o
f
* columns of matrix B. It also specifies the dimensions of hermitia
n
* matrix A when side == 'R' or 'r'. n must be at least zero.
* alpha double precision scalar multiplier applied to A * B, or B * A
* A double precision complex array of dimensions (lda, ka), where ka
is m when
* side == 'L' or 'l' and is n otherwise. If side == 'L' or 'l' the
* leading m x m part of array A must contain the hermitian matrix,
* such that when uplo == 'U' or 'u', the leading m x m part stores
the
* upper triangular part of the hermitian matrix, and the strictly l
ower
* triangular part of A is not referenced, and when uplo == 'L' or 'l',
* the leading m x m part stores the lower triangular part of the
* hermitian matrix and the strictly upper triangular part is not
* referenced. If side == 'R' or 'r' the leading n x n part of array
A
* must contain the hermitian matrix, such that when uplo == 'U' or
'u',
* the leading n x n part stores the upper triangular part of the
* hermitian matrix and the strictly lower triangular part of A is n
ot
* referenced, and when uplo == 'L' or 'l', the leading n x n part
* stores the lower triangular part of the hermitian matrix and the
* strictly upper triangular part is not referenced. The imaginary p
arts
* of the diagonal elements need not be set, they are assumed to be
zero.
*
* lda leading dimension of A. When side == 'L' or 'l', it must be at le
ast
* max(1, m) and at least max(1, n) otherwise.
* B double precision complex array of dimensions (ldb, n). On entry,
the leading
* m x n part of the array contains the matrix B.
* ldb leading dimension of B. It must be at least max (1, m).
* beta double precision complex scalar multiplier applied to C. If beta
is zero, C
* does not have to be a valid input
* C double precision complex array of dimensions (ldc, n)
* ldc leading dimension of C. Must be at least max(1, m)
*
* Output
* ------
* C updated according to C = alpha * A * B + beta * C, or C = alpha *
* B * A + beta * C
*
* Reference: http://www.netlib.org/blas/zhemm.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if m or n are < 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasZhemm (char side, char uplo, int m, int n,
cuDoubleComplex alpha, const cuDoubleComplex *A
, int lda,
const cuDoubleComplex *B, int ldb, cuDoubleComp
lex beta,
cuDoubleComplex *C, int ldc);
/*
* void
* cublasZtrsv (char uplo, char trans, char diag, int n, const cuDoubleComp
lex *A,
* int lda, cuDoubleComplex *x, int incx)
*
* solves a system of equations op(A) * x = b, where op(A) is either A,
* transpose(A) or conjugate(transpose(A)). b and x are double precision
* complex vectors consisting of n elements, and A is an n x n matrix
* composed of a unit or non-unit, upper or lower triangular matrix.
* Matrix A is stored in column major format, and lda is the leading
* dimension of the two-dimensional array containing A.
*
* No test for singularity or near-singularity is included in this function
.
* Such tests must be performed before calling this function.
*
* Input
* -----
* uplo specifies whether the matrix data is stored in the upper or the
* lower triangular part of array A. If uplo = 'U' or 'u', then only
* the upper triangular part of A may be referenced. If uplo = 'L' o
r
* 'l', then only the lower triangular part of A may be referenced.
* trans specifies op(A). If transa = 'n' or 'N', op(A) = A. If transa = 't'
* or 'T', op(A) = transpose(A). If transa = 'c' or 'C',
* op(A) = conjugate(transpose(A))
* diag specifies whether or not A is a unit triangular matrix like so:
* if diag = 'U' or 'u', A is assumed to be unit triangular. If
* diag = 'N' or 'n', then A is not assumed to be unit triangular.
* n specifies the number of rows and columns of the matrix A. It
* must be at least 0.
* A is a double precision complex array of dimensions (lda, n). If up
lo = 'U'
* or 'u', then A must contain the upper triangular part of a symmetric
* matrix, and the strictly lower triangular part is not referenced.
* If uplo = 'L' or 'l', then A contains the lower triangular part o
f
* a symmetric matrix, and the strictly upper triangular part is not
* referenced.
* lda is the leading dimension of the two-dimensional array containing
A.
* lda must be at least max(1, n).
* x double precision complex array of length at least (1 + (n - 1) *
abs(incx)).
* On entry, x contains the n element right-hand side vector b. On e
xit,
* it is overwritten with the solution vector x.
* incx specifies the storage spacing between elements of x. incx must no
t
* be zero.
*
* Output
* ------
* x updated to contain the solution vector x that solves op(A) * x =
b.
*
* Reference: http://www.netlib.org/blas/ztrsv.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if incx == 0 or if n < 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasZtrsv (char uplo, char trans, char diag, int n,
const cuDoubleComplex *A, int lda, cuDoubleComp
lex *x,
int incx);
/*
* void
* cublasZhbmv (char uplo, int n, int k, cuDoubleComplex alpha, const cuDou
bleComplex *A, int lda,
* const cuDoubleComplex *x, int incx, cuDoubleComplex beta, c
uDoubleComplex *y, int incy)
*
* performs the matrix-vector operation
*
* y := alpha*A*x + beta*y
*
* alpha and beta are double precision complex scalars. x and y are double
precision
* complex vectors with n elements. A is an n by n hermitian band matrix co
nsisting
* of double precision complex elements, with k super-diagonals and the sam
e number
* of subdiagonals.
*
* Input
* -----
* uplo specifies whether the upper or lower triangular part of the hermi
tian
* band matrix A is being supplied. If uplo == 'U' or 'u', the upper
* triangular part is being supplied. If uplo == 'L' or 'l', the low
er
* triangular part is being supplied.
* n specifies the number of rows and the number of columns of the
* hermitian matrix A. n must be at least zero.
* k specifies the number of super-diagonals of matrix A. Since the ma
trix
* is hermitian, this is also the number of sub-diagonals. k must be
at
* least zero.
* alpha double precision complex scalar multiplier applied to A*x.
* A double precision complex array of dimensions (lda, n). When uplo
== 'U' or
* 'u', the leading (k + 1) x n part of array A must contain the upp
er
* triangular band of the hermitian matrix, supplied column by colum
n,
* with the leading diagonal of the matrix in row (k+1) of the array
,
* the first super-diagonal starting at position 2 in row k, and so
on.
* The top left k x k triangle of the array A is not referenced. Whe
n
* uplo == 'L' or 'l', the leading (k + 1) x n part of the array A m
ust
* contain the lower triangular band part of the hermitian matrix,
* supplied column by column, with the leading diagonal of the matri
x in
* row 1 of the array, the first sub-diagonal starting at position 1
in
* row 2, and so on. The bottom right k x k triangle of the array A
is
* not referenced. The imaginary parts of the diagonal elements need
* not be set, they are assumed to be zero.
* lda leading dimension of A. lda must be at least (k + 1).
* x double precision complex array of length at least (1 + (n - 1) *
abs(incx)).
* incx storage spacing between elements of x. incx must not be zero.
* beta double precision complex scalar multiplier applied to vector y. I
f beta is
* zero, y is not read.
* y double precision complex array of length at least (1 + (n - 1) *
abs(incy)).
* If beta is zero, y is not read.
* incy storage spacing between elements of y. incy must not be zero.
*
* Output
* ------
* y updated according to alpha*A*x + beta*y
*
* Reference: http://www.netlib.org/blas/zhbmv.f
*
* Error status for this function can be retrieved via cublasGetError().
*
* Error Status
* ------------
* CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialize
d
* CUBLAS_STATUS_INVALID_VALUE if k or n < 0, or if incx or incy == 0
* CUBLAS_STATUS_ARCH_MISMATCH if invoked on device without DP support
* CUBLAS_STATUS_EXECUTION_FAILED if function failed to launch on GPU
*/
void CUBLASAPI cublasZhbmv (char uplo, int n, int k, cuDoubleComplex alpha,
const cuDoubleComplex *A, int lda, const cuDoub
leComplex *x,
int incx, cuDoubleComplex beta, cuDoubleComplex
*y, int incy);
#if defined(__cplusplus) #if defined(__cplusplus)
} }
#endif /* __cplusplus */ #endif /* __cplusplus */
#endif /* !defined(CUBLAS_H_) */ #endif /* !defined(CUBLAS_H_) */
 End of changes. 74 change blocks. 
44 lines changed or deleted 5308 lines changed or added


 cuda.h   cuda.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 57 skipping to change at line 57
/** /**
* \defgroup CUDA_TYPES Data types used by CUDA driver * \defgroup CUDA_TYPES Data types used by CUDA driver
* \ingroup CUDA_DRIVER * \ingroup CUDA_DRIVER
* @{ * @{
*/ */
/** /**
* CUDA API version number * CUDA API version number
*/ */
#define CUDA_VERSION 2030 /* 2.3 */ #define CUDA_VERSION 3000 /* 3.0 */
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
typedef unsigned int CUdeviceptr; ///< CUDA device pointer typedef unsigned int CUdeviceptr; ///< CUDA device pointer
typedef int CUdevice; ///< CUDA device typedef int CUdevice; ///< CUDA device
typedef struct CUctx_st *CUcontext; ///< CUDA context typedef struct CUctx_st *CUcontext; ///< CUDA context
typedef struct CUmod_st *CUmodule; ///< CUDA module typedef struct CUmod_st *CUmodule; ///< CUDA module
typedef struct CUfunc_st *CUfunction; ///< CUDA function typedef struct CUfunc_st *CUfunction; ///< CUDA function
typedef struct CUarray_st *CUarray; ///< CUDA array typedef struct CUarray_st *CUarray; ///< CUDA array
typedef struct CUtexref_st *CUtexref; ///< CUDA texture reference typedef struct CUtexref_st *CUtexref; ///< CUDA texture reference
typedef struct CUevent_st *CUevent; ///< CUDA event typedef struct CUevent_st *CUevent; ///< CUDA event
typedef struct CUstream_st *CUstream; ///< CUDA stream typedef struct CUstream_st *CUstream; ///< CUDA stream
typedef struct CUgraphicsResource_st *CUgraphicsResource; ///< CUDA gra
phics interop resource
typedef struct CUuuid_st { ///< CUDA definition of UUID
char bytes[16];
} CUuuid;
/************************************ /************************************
** **
** Enums ** Enums
** **
***********************************/ ***********************************/
/** /**
* Context creation flags * Context creation flags
*/ */
typedef enum CUctx_flags_enum { typedef enum CUctx_flags_enum {
CU_CTX_SCHED_AUTO = 0, ///< Automatic scheduling CU_CTX_SCHED_AUTO = 0, ///< Automatic scheduling
CU_CTX_SCHED_SPIN = 1, ///< Set spin as default scheduling CU_CTX_SCHED_SPIN = 1, ///< Set spin as default scheduling
CU_CTX_SCHED_YIELD = 2, ///< Set yield as default scheduling CU_CTX_SCHED_YIELD = 2, ///< Set yield as default scheduling
CU_CTX_SCHED_MASK = 0x3, CU_CTX_SCHED_MASK = 0x3,
CU_CTX_BLOCKING_SYNC = 4, ///< Use blocking synchronization CU_CTX_BLOCKING_SYNC = 4, ///< Use blocking synchronization
CU_CTX_MAP_HOST = 8, ///< Support mapped pinned allocations CU_CTX_MAP_HOST = 8, ///< Support mapped pinned allocations
CU_CTX_LMEM_RESIZE_TO_MAX = 16, ///< Keep local memory allocation after launch CU_CTX_LMEM_RESIZE_TO_MAX = 16, ///< Keep local memory allocation after launch
CU_CTX_FLAGS_MASK = 0x1f, CU_CTX_FLAGS_MASK = 0x1f
} CUctx_flags; } CUctx_flags;
/** /**
* Event creation flags * Event creation flags
*/ */
typedef enum CUevent_flags_enum { typedef enum CUevent_flags_enum {
CU_EVENT_DEFAULT = 0, ///< Default event flag CU_EVENT_DEFAULT = 0, ///< Default event flag
CU_EVENT_BLOCKING_SYNC = 1, ///< Event uses blocking synchronization CU_EVENT_BLOCKING_SYNC = 1 ///< Event uses blocking synchronization
} CUevent_flags; } CUevent_flags;
/** /**
* Array formats * Array formats
*/ */
typedef enum CUarray_format_enum { typedef enum CUarray_format_enum {
CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, ///< Unsigned 8-bit integers CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, ///< Unsigned 8-bit integers
CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, ///< Unsigned 16-bit integers CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, ///< Unsigned 16-bit integers
CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, ///< Unsigned 32-bit integers CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, ///< Unsigned 32-bit integers
CU_AD_FORMAT_SIGNED_INT8 = 0x08, ///< Signed 8-bit integers CU_AD_FORMAT_SIGNED_INT8 = 0x08, ///< Signed 8-bit integers
skipping to change at line 121 skipping to change at line 126
CU_AD_FORMAT_HALF = 0x10, ///< 16-bit floating point CU_AD_FORMAT_HALF = 0x10, ///< 16-bit floating point
CU_AD_FORMAT_FLOAT = 0x20 ///< 32-bit floating point CU_AD_FORMAT_FLOAT = 0x20 ///< 32-bit floating point
} CUarray_format; } CUarray_format;
/** /**
* Texture reference addressing modes * Texture reference addressing modes
*/ */
typedef enum CUaddress_mode_enum { typedef enum CUaddress_mode_enum {
CU_TR_ADDRESS_MODE_WRAP = 0, ///< Wrapping address mode CU_TR_ADDRESS_MODE_WRAP = 0, ///< Wrapping address mode
CU_TR_ADDRESS_MODE_CLAMP = 1, ///< Clamp to edge address mode CU_TR_ADDRESS_MODE_CLAMP = 1, ///< Clamp to edge address mode
CU_TR_ADDRESS_MODE_MIRROR = 2, ///< Mirror address mode CU_TR_ADDRESS_MODE_MIRROR = 2 ///< Mirror address mode
} CUaddress_mode; } CUaddress_mode;
/** /**
* Texture reference filtering modes * Texture reference filtering modes
*/ */
typedef enum CUfilter_mode_enum { typedef enum CUfilter_mode_enum {
CU_TR_FILTER_MODE_POINT = 0, ///< Point filter mode CU_TR_FILTER_MODE_POINT = 0, ///< Point filter mode
CU_TR_FILTER_MODE_LINEAR = 1 ///< Linear filter mode CU_TR_FILTER_MODE_LINEAR = 1 ///< Linear filter mode
} CUfilter_mode; } CUfilter_mode;
skipping to change at line 158 skipping to change at line 163
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, ///< Maximum number of 32-bit registers available per block CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, ///< Maximum number of 32-bit registers available per block
CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, ///< Deprecated, use CU _DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, ///< Deprecated, use CU _DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, ///< Peak clock frequen cy in kilohertz CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, ///< Peak clock frequen cy in kilohertz
CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, ///< Alignment requirem ent for textures CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, ///< Alignment requirem ent for textures
CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, ///< Device can possibl y copy memory and execute a kernel concurrently CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, ///< Device can possibl y copy memory and execute a kernel concurrently
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, ///< Number of multipro cessors on device CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, ///< Number of multipro cessors on device
CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, ///< Specifies whether there is a run time limit on kernels CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, ///< Specifies whether there is a run time limit on kernels
CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, ///< Device is integrat ed with host memory CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, ///< Device is integrat ed with host memory
CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, ///< Device can map hos t memory into CUDA address space CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, ///< Device can map hos t memory into CUDA address space
CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20 ///< Compute mode (See CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, ///< Compute mode (See
::CUcomputemode for details) ::CUcomputemode for details)
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, ///< Maximum 1D textu
re width
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, ///< Maximum 2D textu
re width
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23,///< Maximum 2D textu
re height
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, ///< Maximum 3D textu
re width
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25,///< Maximum 3D textu
re height
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, ///< Maximum 3D textu
re depth
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, ///< Maximum te
xture array width
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28,///< Maximum te
xture array height
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, ///< Maximu
m slices in a texture array
CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, ///< Alignment requirement
for surfaces
CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, ///< Device can possibly e
xecute multiple kernels concurrently
CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32 ///< Device has ECC support enable
d
} CUdevice_attribute; } CUdevice_attribute;
/** /**
* Legacy device properties * Legacy device properties
*/ */
typedef struct CUdevprop_st { typedef struct CUdevprop_st {
int maxThreadsPerBlock; ///< Maximum number of threads per block int maxThreadsPerBlock; ///< Maximum number of threads per block
int maxThreadsDim[3]; ///< Maximum size of each dimension of a bl ock int maxThreadsDim[3]; ///< Maximum size of each dimension of a bl ock
int maxGridSize[3]; ///< Maximum size of each dimension of a gr id int maxGridSize[3]; ///< Maximum size of each dimension of a gr id
int sharedMemPerBlock; ///< Shared memory available per block in b ytes int sharedMemPerBlock; ///< Shared memory available per block in b ytes
skipping to change at line 211 skipping to change at line 228
/** /**
* The size in bytes of thread local memory used by this function. * The size in bytes of thread local memory used by this function.
*/ */
CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3,
/** /**
* The number of registers used by each thread of this function. * The number of registers used by each thread of this function.
*/ */
CU_FUNC_ATTRIBUTE_NUM_REGS = 4, CU_FUNC_ATTRIBUTE_NUM_REGS = 4,
/**
* The PTX virtual architecture version for which the function was comp
iled.
*/
CU_FUNC_ATTRIBUTE_PTX_VERSION = 5,
/**
* The binary version for which the function was compiled.
*/
CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6,
CU_FUNC_ATTRIBUTE_MAX CU_FUNC_ATTRIBUTE_MAX
} CUfunction_attribute; } CUfunction_attribute;
/** /**
* Function cache configurations
*/
typedef enum CUfunc_cache_enum {
CU_FUNC_CACHE_PREFER_NONE = 0x00,
CU_FUNC_CACHE_PREFER_SHARED = 0x01,
CU_FUNC_CACHE_PREFER_L1 = 0x02
} CUfunc_cache;
/**
* Memory types * Memory types
*/ */
typedef enum CUmemorytype_enum { typedef enum CUmemorytype_enum {
CU_MEMORYTYPE_HOST = 0x01, ///< Host memory CU_MEMORYTYPE_HOST = 0x01, ///< Host memory
CU_MEMORYTYPE_DEVICE = 0x02, ///< Device memory CU_MEMORYTYPE_DEVICE = 0x02, ///< Device memory
CU_MEMORYTYPE_ARRAY = 0x03 ///< Array memory CU_MEMORYTYPE_ARRAY = 0x03 ///< Array memory
} CUmemorytype; } CUmemorytype;
/** /**
* Compute Modes * Compute Modes
skipping to change at line 238 skipping to change at line 274
CU_COMPUTEMODE_EXCLUSIVE = 1, ///< Compute-exclusive mode (Only on e context can be present on this device at a time) CU_COMPUTEMODE_EXCLUSIVE = 1, ///< Compute-exclusive mode (Only on e context can be present on this device at a time)
CU_COMPUTEMODE_PROHIBITED = 2 ///< Compute-prohibited mode (No con texts can be created on this device at this time) CU_COMPUTEMODE_PROHIBITED = 2 ///< Compute-prohibited mode (No con texts can be created on this device at this time)
} CUcomputemode; } CUcomputemode;
/** /**
* Online compiler options * Online compiler options
*/ */
typedef enum CUjit_option_enum typedef enum CUjit_option_enum
{ {
/** /**
* Max number of registers that a thread may use. * Max number of registers that a thread may use.\n
* Option type: unsigned int
*/ */
CU_JIT_MAX_REGISTERS = 0, CU_JIT_MAX_REGISTERS = 0,
/** /**
* IN: Specifies minimum number of threads per block to target compilat ion * IN: Specifies minimum number of threads per block to target compilat ion
* for\n * for\n
* OUT: Returns the number of threads the compiler actually targeted. * OUT: Returns the number of threads the compiler actually targeted.
* This restricts the resource utilization fo the compiler (e.g. max * This restricts the resource utilization fo the compiler (e.g. max
* registers) such that a block with the given number of threads should be * registers) such that a block with the given number of threads should be
* able to launch based on register limitations. Note, this option does not * able to launch based on register limitations. Note, this option does not
* currently take into account any other resource limitations, such as * currently take into account any other resource limitations, such as
* shared memory utilization. * shared memory utilization.\n
* Option type: unsigned int
*/ */
CU_JIT_THREADS_PER_BLOCK, CU_JIT_THREADS_PER_BLOCK,
/** /**
* Returns a float value in the option of the wall clock time, in * Returns a float value in the option of the wall clock time, in
* milliseconds, spent creating the cubin * milliseconds, spent creating the cubin\n
* Option type: float
*/ */
CU_JIT_WALL_TIME, CU_JIT_WALL_TIME,
/** /**
* Pointer to a buffer in which to print any log messsages from PTXAS * Pointer to a buffer in which to print any log messsages from PTXAS
* that are informational in nature * that are informational in nature (the buffer size is specified via
* option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES) \n
* Option type: char*
*/ */
CU_JIT_INFO_LOG_BUFFER, CU_JIT_INFO_LOG_BUFFER,
/** /**
* IN: Log buffer size in bytes. Log messages will be capped at this s ize * IN: Log buffer size in bytes. Log messages will be capped at this s ize
* (including null terminator)\n * (including null terminator)\n
* OUT: Amount of log buffer filled with messages * OUT: Amount of log buffer filled with messages\n
* Option type: unsigned int
*/ */
CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
/** /**
* Pointer to a buffer in which to print any log messages from PTXAS th at * Pointer to a buffer in which to print any log messages from PTXAS th at
* reflect errors * reflect errors (the buffer size is specified via option
* ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n
* Option type: char*
*/ */
CU_JIT_ERROR_LOG_BUFFER, CU_JIT_ERROR_LOG_BUFFER,
/** /**
* IN: Log buffer size in bytes. Log messages will be capped at this s ize * IN: Log buffer size in bytes. Log messages will be capped at this s ize
* (including null terminator)\n * (including null terminator)\n
* OUT: Amount of log buffer filled with messages * OUT: Amount of log buffer filled with messages\n
* Option type: unsigned int
*/ */
CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
/** /**
* Level of optimizations to apply to generated code (0 - 4), with 4 * Level of optimizations to apply to generated code (0 - 4), with 4
* being the default and highest level of optimizations. * being the default and highest level of optimizations.\n
* Option type: unsigned int
*/ */
CU_JIT_OPTIMIZATION_LEVEL, CU_JIT_OPTIMIZATION_LEVEL,
/** /**
* No option value required. Determines the target based on the current * No option value required. Determines the target based on the current
* attached context (default) * attached context (default)\n
* Option type: No option value needed
*/ */
CU_JIT_TARGET_FROM_CUCONTEXT, CU_JIT_TARGET_FROM_CUCONTEXT,
/** /**
* Target is chosen based on supplied CUjit_target_enum. * Target is chosen based on supplied ::CUjit_target_enum.\n
* Option type: unsigned int for enumerated type ::CUjit_target_enum
*/ */
CU_JIT_TARGET, CU_JIT_TARGET,
/** /**
* Specifies choice of fallback strategy if matching cubin is not found . * Specifies choice of fallback strategy if matching cubin is not found .
* Choice is based on supplied CUjit_fallback_enum. * Choice is based on supplied ::CUjit_fallback_enum.\n
* Option type: unsigned int for enumerated type ::CUjit_fallback_enum
*/ */
CU_JIT_FALLBACK_STRATEGY CU_JIT_FALLBACK_STRATEGY
} CUjit_option; } CUjit_option;
/** /**
* Online compilation targets * Online compilation targets
*/ */
typedef enum CUjit_target_enum typedef enum CUjit_target_enum
{ {
CU_TARGET_COMPUTE_10 = 0, ///< Compute device class 1.0 CU_TARGET_COMPUTE_10 = 0, ///< Compute device class 1.0
CU_TARGET_COMPUTE_11, ///< Compute device class 1.1 CU_TARGET_COMPUTE_11, ///< Compute device class 1.1
CU_TARGET_COMPUTE_12, ///< Compute device class 1.2 CU_TARGET_COMPUTE_12, ///< Compute device class 1.2
CU_TARGET_COMPUTE_13 ///< Compute device class 1.3 CU_TARGET_COMPUTE_13, ///< Compute device class 1.3
CU_TARGET_COMPUTE_20 ///< Compute device class 2.0
} CUjit_target; } CUjit_target;
/** /**
* Cubin matching fallback strategies * Cubin matching fallback strategies
*/ */
typedef enum CUjit_fallback_enum typedef enum CUjit_fallback_enum
{ {
/** Prefer to compile ptx */ /** Prefer to compile ptx */
CU_PREFER_PTX = 0, CU_PREFER_PTX = 0,
/** Prefer to fall back to compatible binary code */ /** Prefer to fall back to compatible binary code */
CU_PREFER_BINARY CU_PREFER_BINARY
} CUjit_fallback; } CUjit_fallback;
/**
* Flags to register a graphics resource
*/
typedef enum CUgraphicsRegisterFlags_enum {
CU_GRAPHICS_REGISTER_FLAGS_NONE = 0x00
} CUgraphicsRegisterFlags;
/**
* Flags for mapping and unmapping interop resources
*/
typedef enum CUgraphicsMapResourceFlags_enum {
CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE = 0x00,
CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01,
CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02
} CUgraphicsMapResourceFlags;
/**
* Array indices for cube faces
*/
typedef enum CUarray_cubemap_face_enum {
CU_CUBEMAP_FACE_POSITIVE_X = 0x00, ///< Positive X face of cubemap
CU_CUBEMAP_FACE_NEGATIVE_X = 0x01, ///< Negative X face of cubemap
CU_CUBEMAP_FACE_POSITIVE_Y = 0x02, ///< Positive Y face of cubemap
CU_CUBEMAP_FACE_NEGATIVE_Y = 0x03, ///< Negative Y face of cubemap
CU_CUBEMAP_FACE_POSITIVE_Z = 0x04, ///< Positive Z face of cubemap
CU_CUBEMAP_FACE_NEGATIVE_Z = 0x05 ///< Negative Z face of cubemap
} CUarray_cubemap_face;
/************************************ /************************************
** **
** Error codes ** Error codes
** **
***********************************/ ***********************************/
/** /**
* Error codes * Error codes
*/ */
typedef enum cudaError_enum { typedef enum cudaError_enum {
skipping to change at line 365 skipping to change at line 443
CUDA_ERROR_INVALID_IMAGE = 200, ///< Invalid kernel image CUDA_ERROR_INVALID_IMAGE = 200, ///< Invalid kernel image
CUDA_ERROR_INVALID_CONTEXT = 201, ///< Invalid context CUDA_ERROR_INVALID_CONTEXT = 201, ///< Invalid context
CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202, ///< Context already curren t CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202, ///< Context already curren t
CUDA_ERROR_MAP_FAILED = 205, ///< Map failed CUDA_ERROR_MAP_FAILED = 205, ///< Map failed
CUDA_ERROR_UNMAP_FAILED = 206, ///< Unmap failed CUDA_ERROR_UNMAP_FAILED = 206, ///< Unmap failed
CUDA_ERROR_ARRAY_IS_MAPPED = 207, ///< Array is mapped CUDA_ERROR_ARRAY_IS_MAPPED = 207, ///< Array is mapped
CUDA_ERROR_ALREADY_MAPPED = 208, ///< Already mapped CUDA_ERROR_ALREADY_MAPPED = 208, ///< Already mapped
CUDA_ERROR_NO_BINARY_FOR_GPU = 209, ///< No binary for GPU CUDA_ERROR_NO_BINARY_FOR_GPU = 209, ///< No binary for GPU
CUDA_ERROR_ALREADY_ACQUIRED = 210, ///< Already acquired CUDA_ERROR_ALREADY_ACQUIRED = 210, ///< Already acquired
CUDA_ERROR_NOT_MAPPED = 211, ///< Not mapped CUDA_ERROR_NOT_MAPPED = 211, ///< Not mapped
CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212, ///< Mapped resource not a
vailable for access as an array
CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213, ///< Mapped resource not a
vailable for access as a pointer
CUDA_ERROR_ECC_UNCORRECTABLE = 214, ///< Uncorrectable ECC erro
r detected
CUDA_ERROR_INVALID_SOURCE = 300, ///< Invalid source CUDA_ERROR_INVALID_SOURCE = 300, ///< Invalid source
CUDA_ERROR_FILE_NOT_FOUND = 301, ///< File not found CUDA_ERROR_FILE_NOT_FOUND = 301, ///< File not found
CUDA_ERROR_INVALID_HANDLE = 400, ///< Invalid handle CUDA_ERROR_INVALID_HANDLE = 400, ///< Invalid handle
CUDA_ERROR_NOT_FOUND = 500, ///< Not found CUDA_ERROR_NOT_FOUND = 500, ///< Not found
CUDA_ERROR_NOT_READY = 600, ///< CUDA not ready CUDA_ERROR_NOT_READY = 600, ///< CUDA not ready
CUDA_ERROR_LAUNCH_FAILED = 700, ///< Launch failed CUDA_ERROR_LAUNCH_FAILED = 700, ///< Launch failed
CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701, ///< Launch exceeded resour ces CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701, ///< Launch exceeded resour ces
CUDA_ERROR_LAUNCH_TIMEOUT = 702, ///< Launch exceeded timeou t CUDA_ERROR_LAUNCH_TIMEOUT = 702, ///< Launch exceeded timeou t
CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703, ///< Launch with incomp atible texturing CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703, ///< Launch with incomp atible texturing
CUDA_ERROR_POINTER_IS_64BIT = 800, ///< Attempted to retrieve
64-bit pointer via 32-bit API function
CUDA_ERROR_SIZE_IS_64BIT = 801, ///< Attempted to retrieve
64-bit size via 32-bit API function
CUDA_ERROR_UNKNOWN = 999 ///< Unknown error CUDA_ERROR_UNKNOWN = 999 ///< Unknown error
} CUresult; } CUresult;
/** /**
* If set, host memory is portable between CUDA contexts. * If set, host memory is portable between CUDA contexts.
* Flag for ::cuMemHostAlloc() * Flag for ::cuMemHostAlloc()
*/ */
#define CU_MEMHOSTALLOC_PORTABLE 0x01 #define CU_MEMHOSTALLOC_PORTABLE 0x01
/** /**
skipping to change at line 492 skipping to change at line 576
unsigned int Height; ///< Height of 3D array unsigned int Height; ///< Height of 3D array
unsigned int Depth; ///< Depth of 3D array unsigned int Depth; ///< Depth of 3D array
CUarray_format Format; ///< Array format CUarray_format Format; ///< Array format
unsigned int NumChannels; ///< Channels per array element unsigned int NumChannels; ///< Channels per array element
unsigned int Flags; ///< Flags unsigned int Flags; ///< Flags
} CUDA_ARRAY3D_DESCRIPTOR; } CUDA_ARRAY3D_DESCRIPTOR;
// if set, the CUDA array contains an array of 2D slices
// and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies
// the number of slices, not the depth of a 3D array.
#define CUDA_ARRAY3D_2DARRAY 0x01
/** /**
* Override the texref format with a format inferred from the array. * Override the texref format with a format inferred from the array.
* Flag for ::cuTexRefSetArray() * Flag for ::cuTexRefSetArray()
*/ */
#define CU_TRSA_OVERRIDE_FORMAT 0x01 #define CU_TRSA_OVERRIDE_FORMAT 0x01
/** /**
* Read the texture as integers rather than promoting the values to floats * Read the texture as integers rather than promoting the values to floats
* in the range [0,1]. * in the range [0,1].
* Flag for ::cuTexRefSetFlags() * Flag for ::cuTexRefSetFlags()
skipping to change at line 663 skipping to change at line 752
** **
***********************************/ ***********************************/
// 1D functions // 1D functions
// system <-> device memory // system <-> device memory
CUresult CUDAAPI cuMemcpyHtoDAsync (CUdeviceptr dstDevice, CUresult CUDAAPI cuMemcpyHtoDAsync (CUdeviceptr dstDevice,
const void *srcHost, unsigned int ByteCount, CUstream hStream ) ; const void *srcHost, unsigned int ByteCount, CUstream hStream ) ;
CUresult CUDAAPI cuMemcpyDtoHAsync (void *dstHost, CUresult CUDAAPI cuMemcpyDtoHAsync (void *dstHost,
CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream ); CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream );
// device <-> device memory
CUresult CUDAAPI cuMemcpyDtoDAsync (CUdeviceptr dstDevice,
CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream
);
// system <-> array memory // system <-> array memory
CUresult CUDAAPI cuMemcpyHtoAAsync( CUarray dstArray, unsigned int dstIndex, CUresult CUDAAPI cuMemcpyHtoAAsync( CUarray dstArray, unsigned int dstIndex,
const void *pSrc, unsigned int ByteCount, CUstream hStream ); const void *pSrc, unsigned int ByteCount, CUstream hStream );
CUresult CUDAAPI cuMemcpyAtoHAsync( void *dstHost, CUarray srcArra y, unsigned int srcIndex, CUresult CUDAAPI cuMemcpyAtoHAsync( void *dstHost, CUarray srcArra y, unsigned int srcIndex,
unsigned int ByteCount, CUstream hStream ); unsigned int ByteCount, CUstream hStream );
// 2D memcpy // 2D memcpy
CUresult CUDAAPI cuMemcpy2DAsync( const CUDA_MEMCPY2D *pCopy, CUst ream hStream ); CUresult CUDAAPI cuMemcpy2DAsync( const CUDA_MEMCPY2D *pCopy, CUst ream hStream );
// 3D memcpy // 3D memcpy
skipping to change at line 697 skipping to change at line 790
/************************************ /************************************
** **
** Function management ** Function management
** **
***********************************/ ***********************************/
CUresult CUDAAPI cuFuncSetBlockShape (CUfunction hfunc, int x, int y, i nt z); CUresult CUDAAPI cuFuncSetBlockShape (CUfunction hfunc, int x, int y, i nt z);
CUresult CUDAAPI cuFuncSetSharedSize (CUfunction hfunc, unsigned int by tes); CUresult CUDAAPI cuFuncSetSharedSize (CUfunction hfunc, unsigned int by tes);
CUresult CUDAAPI cuFuncGetAttribute (int *pi, CUfunction_attribute attr ib, CUfunction hfunc); CUresult CUDAAPI cuFuncGetAttribute (int *pi, CUfunction_attribute attr ib, CUfunction hfunc);
CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache co nfig);
/************************************ /************************************
** **
** Array management ** Array management
** **
***********************************/ ***********************************/
CUresult CUDAAPI cuArrayCreate( CUarray *pHandle, const CUDA_ARRAY_DES CRIPTOR *pAllocateArray ); CUresult CUDAAPI cuArrayCreate( CUarray *pHandle, const CUDA_ARRAY_DES CRIPTOR *pAllocateArray );
CUresult CUDAAPI cuArrayGetDescriptor( CUDA_ARRAY_DESCRIPTOR *pArrayDe scriptor, CUarray hArray ); CUresult CUDAAPI cuArrayGetDescriptor( CUDA_ARRAY_DESCRIPTOR *pArrayDe scriptor, CUarray hArray );
CUresult CUDAAPI cuArrayDestroy( CUarray hArray ); CUresult CUDAAPI cuArrayDestroy( CUarray hArray );
skipping to change at line 743 skipping to change at line 837
/************************************ /************************************
** **
** Parameter management ** Parameter management
** **
***********************************/ ***********************************/
CUresult CUDAAPI cuParamSetSize (CUfunction hfunc, unsigned int numbyt es); CUresult CUDAAPI cuParamSetSize (CUfunction hfunc, unsigned int numbyt es);
CUresult CUDAAPI cuParamSeti (CUfunction hfunc, int offset, unsigne d int value); CUresult CUDAAPI cuParamSeti (CUfunction hfunc, int offset, unsigne d int value);
CUresult CUDAAPI cuParamSetf (CUfunction hfunc, int offset, float v alue); CUresult CUDAAPI cuParamSetf (CUfunction hfunc, int offset, float v alue);
CUresult CUDAAPI cuParamSetv (CUfunction hfunc, int offset, void * ptr, unsigned int numbytes); CUresult CUDAAPI cuParamSetv (CUfunction hfunc, int offset, void *p tr, unsigned int numbytes);
CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtex ref hTexRef); CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtex ref hTexRef);
/************************************ /************************************
** **
** Launch functions ** Launch functions
** **
***********************************/ ***********************************/
CUresult CUDAAPI cuLaunch ( CUfunction f ); CUresult CUDAAPI cuLaunch ( CUfunction f );
CUresult CUDAAPI cuLaunchGrid (CUfunction f, int grid_width, int grid_h eight); CUresult CUDAAPI cuLaunchGrid (CUfunction f, int grid_width, int grid_h eight);
skipping to change at line 778 skipping to change at line 872
/************************************ /************************************
** **
** Streams ** Streams
** **
***********************************/ ***********************************/
CUresult CUDAAPI cuStreamCreate( CUstream *phStream, unsigned int Flag s ); CUresult CUDAAPI cuStreamCreate( CUstream *phStream, unsigned int Flag s );
CUresult CUDAAPI cuStreamQuery( CUstream hStream ); CUresult CUDAAPI cuStreamQuery( CUstream hStream );
CUresult CUDAAPI cuStreamSynchronize( CUstream hStream ); CUresult CUDAAPI cuStreamSynchronize( CUstream hStream );
CUresult CUDAAPI cuStreamDestroy( CUstream hStream ); CUresult CUDAAPI cuStreamDestroy( CUstream hStream );
/************************************
**
** Graphics interop
**
***********************************/
CUresult CUDAAPI cuGraphicsUnregisterResource(CUgraphicsResource resour
ce);
CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray( CUarray *pArray,
CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel
);
CUresult CUDAAPI cuGraphicsResourceGetMappedPointer( CUdeviceptr *pDevP
tr, unsigned int *pSize, CUgraphicsResource resource );
CUresult CUDAAPI cuGraphicsResourceSetMapFlags( CUgraphicsResource reso
urce, unsigned int flags );
CUresult CUDAAPI cuGraphicsMapResources( unsigned int count, CUgraphics
Resource *resources, CUstream hStream );
CUresult CUDAAPI cuGraphicsUnmapResources( unsigned int count, CUgraphi
csResource *resources, CUstream hStream );
/************************************
**
** Export tables
**
***********************************/
CUresult CUDAAPI cuGetExportTable( const void **ppExportTable, const CU
uuid *pExportTableId );
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
#endif /* __cuda_cuda_h__ */ #endif /* __cuda_cuda_h__ */
 End of changes. 29 change blocks. 
20 lines changed or deleted 161 lines changed or added


 cudaGL.h   cudaGL.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 43 skipping to change at line 43
* the above Disclaimer and U.S. Government End Users Notice. * the above Disclaimer and U.S. Government End Users Notice.
*/ */
#ifndef CUDAGL_H #ifndef CUDAGL_H
#define CUDAGL_H #define CUDAGL_H
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
CUresult CUDAAPI cuGLCtxCreate( CUcontext *pCtx, unsigned int Flags, CUdevi
ce device );
CUresult CUDAAPI cuGraphicsGLRegisterBuffer( CUgraphicsResource *pCudaResou
rce, GLuint buffer, unsigned int Flags );
CUresult CUDAAPI cuGraphicsGLRegisterImage( CUgraphicsResource *pCudaResour
ce, GLuint image, GLenum target, unsigned int Flags );
#if defined(_WIN32)
#if !defined(WGL_NV_gpu_affinity)
typedef void* HGPUNV;
#endif
CUresult CUDAAPI cuWGLGetDevice( CUdevice *pDevice, HGPUNV hGpu );
#endif
// //
// Flags to map or unmap a resource // CUDA 2.x compatibility API. These functions are deprecated, please use t he ones above.
// //
// Flags to map or unmap a resource
typedef enum CUGLmap_flags_enum { typedef enum CUGLmap_flags_enum {
CU_GL_MAP_RESOURCE_FLAGS_NONE = 0x00, CU_GL_MAP_RESOURCE_FLAGS_NONE = 0x00,
CU_GL_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01, CU_GL_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01,
CU_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02, CU_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02,
} CUGLmap_flags; } CUGLmap_flags;
CUresult CUDAAPI cuGLInit(void); CUresult CUDAAPI cuGLInit(void);
CUresult CUDAAPI cuGLCtxCreate( CUcontext *pCtx, unsigned int Flags, CUdevi CUresult CUDAAPI cuGLRegisterBufferObject( GLuint buffer );
ce device ); CUresult CUDAAPI cuGLMapBufferObject( CUdeviceptr *dptr, unsigned int *size
CUresult CUDAAPI cuGLRegisterBufferObject( GLuint bufferobj ); , GLuint buffer );
CUresult CUDAAPI cuGLMapBufferObject( CUdeviceptr *dptr, unsigned int *size CUresult CUDAAPI cuGLUnmapBufferObject( GLuint buffer );
, GLuint bufferobj ); CUresult CUDAAPI cuGLUnregisterBufferObject( GLuint buffer );
CUresult CUDAAPI cuGLUnmapBufferObject( GLuint bufferobj );
CUresult CUDAAPI cuGLUnregisterBufferObject( GLuint bufferobj );
CUresult CUDAAPI cuGLSetBufferObjectMapFlags( GLuint bufferobj, unsigned in
t Flags );
CUresult CUDAAPI cuGLMapBufferObjectAsync( CUdeviceptr *dptr, unsigned int
*size, GLuint bufferobj, CUstream hStream );
CUresult CUDAAPI cuGLUnmapBufferObjectAsync( GLuint bufferobj, CUstream hSt
ream );
#if defined(_WIN32) CUresult CUDAAPI cuGLSetBufferObjectMapFlags( GLuint buffer, unsigned int F
#if !defined(WGL_NV_gpu_affinity) lags );
typedef void* HGPUNV; CUresult CUDAAPI cuGLMapBufferObjectAsync( CUdeviceptr *dptr, unsigned int
#endif *size, GLuint buffer, CUstream hStream );
CUresult CUDAAPI cuWGLGetDevice( CUdevice *pDevice, HGPUNV hGpu ); CUresult CUDAAPI cuGLUnmapBufferObjectAsync( GLuint buffer, CUstream hStrea
#endif m );
#ifdef __cplusplus #ifdef __cplusplus
}; };
#endif #endif
#endif #endif
 End of changes. 6 change blocks. 
22 lines changed or deleted 29 lines changed or added


 cuda_gl_interop.h   cuda_gl_interop.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 67 skipping to change at line 67
#if defined(__cplusplus) #if defined(__cplusplus)
extern "C" { extern "C" {
#endif /* __cplusplus */ #endif /* __cplusplus */
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaGLSetGLDevice(int device);
extern __host__ cudaError_t CUDARTAPI cudaGraphicsGLRegisterImage(struct cu
daGraphicsResource **resource, GLuint image, GLenum target, unsigned int Fl
ags);
extern __host__ cudaError_t CUDARTAPI cudaGraphicsGLRegisterBuffer(struct c
udaGraphicsResource **resource, GLuint buffer, unsigned int Flags);
#ifdef _WIN32
#ifndef WGL_NV_gpu_affinity
typedef void* HGPUNV;
#endif
extern __host__ cudaError_t CUDARTAPI cudaWGLGetDevice(int *device, HGPUNV
hGpu);
#endif
/** /**
* CUDA GL Map Flags * CUDA GL Map Flags
*/ */
enum cudaGLMapFlags enum cudaGLMapFlags
{ {
cudaGLMapFlagsNone = 0, ///< Default; Assume resource can be rea d/written cudaGLMapFlagsNone = 0, ///< Default; Assume resource can be rea d/written
cudaGLMapFlagsReadOnly = 1, ///< CUDA kernels will not write to this resource cudaGLMapFlagsReadOnly = 1, ///< CUDA kernels will not write to this resource
cudaGLMapFlagsWriteDiscard = 2, ///< CUDA kernels will only write to and will not read from this resource cudaGLMapFlagsWriteDiscard = 2, ///< CUDA kernels will only write to and will not read from this resource
}; };
extern __host__ cudaError_t CUDARTAPI cudaGLSetGLDevice(int device);
extern __host__ cudaError_t CUDARTAPI cudaGLRegisterBufferObject(GLuint buf Obj); extern __host__ cudaError_t CUDARTAPI cudaGLRegisterBufferObject(GLuint buf Obj);
extern __host__ cudaError_t CUDARTAPI cudaGLMapBufferObject(void **devPtr, GLuint bufObj); extern __host__ cudaError_t CUDARTAPI cudaGLMapBufferObject(void **devPtr, GLuint bufObj);
extern __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObject(GLuint bufObj ); extern __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObject(GLuint bufObj );
extern __host__ cudaError_t CUDARTAPI cudaGLUnregisterBufferObject(GLuint b ufObj); extern __host__ cudaError_t CUDARTAPI cudaGLUnregisterBufferObject(GLuint b ufObj);
extern __host__ cudaError_t CUDARTAPI cudaGLSetBufferObjectMapFlags(GLuint bufObj, unsigned int flags); extern __host__ cudaError_t CUDARTAPI cudaGLSetBufferObjectMapFlags(GLuint bufObj, unsigned int flags);
extern __host__ cudaError_t CUDARTAPI cudaGLMapBufferObjectAsync(void **dev Ptr, GLuint bufObj, cudaStream_t stream); extern __host__ cudaError_t CUDARTAPI cudaGLMapBufferObjectAsync(void **dev Ptr, GLuint bufObj, cudaStream_t stream);
extern __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObjectAsync(GLuint b ufObj, cudaStream_t stream); extern __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObjectAsync(GLuint b ufObj, cudaStream_t stream);
#ifdef _WIN32
#ifndef WGL_NV_gpu_affinity
typedef void* HGPUNV;
#endif
extern __host__ cudaError_t CUDARTAPI cudaWGLGetDevice(int *device, HGPUNV
hGpu);
#endif
#if defined(__cplusplus) #if defined(__cplusplus)
} }
#endif /* __cplusplus */ #endif /* __cplusplus */
#endif /* __CUDA_GL_INTEROP_H__ */ #endif /* __CUDA_GL_INTEROP_H__ */
 End of changes. 4 change blocks. 
10 lines changed or deleted 16 lines changed or added


 cuda_runtime.h   cuda_runtime.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 199 skipping to change at line 199
enum cudaMemcpyKind kind = cudaMemcpyHostToDevice enum cudaMemcpyKind kind = cudaMemcpyHostToDevice
) )
{ {
return cudaMemcpyToSymbol((const char*)&symbol, src, count, offset, kind) ; return cudaMemcpyToSymbol((const char*)&symbol, src, count, offset, kind) ;
} }
static __inline__ __host__ cudaError_t cudaMemcpyToSymbolAsync( static __inline__ __host__ cudaError_t cudaMemcpyToSymbolAsync(
char *symbol, char *symbol,
const void *src, const void *src,
size_t count, size_t count,
size_t offset, size_t offset = 0,
enum cudaMemcpyKind kind, enum cudaMemcpyKind kind = cudaMemcpyHostToDevice,
cudaStream_t stream cudaStream_t stream = 0
) )
{ {
return cudaMemcpyToSymbolAsync((const char*)symbol, src, count, offset, k ind, stream); return cudaMemcpyToSymbolAsync((const char*)symbol, src, count, offset, k ind, stream);
} }
template<class T> template<class T>
__inline__ __host__ cudaError_t cudaMemcpyToSymbolAsync( __inline__ __host__ cudaError_t cudaMemcpyToSymbolAsync(
const T &symbol, const T &symbol,
const void *src, const void *src,
size_t count, size_t count,
size_t offset, size_t offset = 0,
enum cudaMemcpyKind kind, enum cudaMemcpyKind kind = cudaMemcpyHostToDevice,
cudaStream_t stream cudaStream_t stream = 0
) )
{ {
return cudaMemcpyToSymbolAsync((const char*)&symbol, src, count, offset, kind, stream); return cudaMemcpyToSymbolAsync((const char*)&symbol, src, count, offset, kind, stream);
} }
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
skipping to change at line 253 skipping to change at line 253
enum cudaMemcpyKind kind = cudaMemcpyDeviceToHost enum cudaMemcpyKind kind = cudaMemcpyDeviceToHost
) )
{ {
return cudaMemcpyFromSymbol(dst, (const char*)&symbol, count, offset, kin d); return cudaMemcpyFromSymbol(dst, (const char*)&symbol, count, offset, kin d);
} }
static __inline__ __host__ cudaError_t cudaMemcpyFromSymbolAsync( static __inline__ __host__ cudaError_t cudaMemcpyFromSymbolAsync(
void *dst, void *dst,
char *symbol, char *symbol,
size_t count, size_t count,
size_t offset, size_t offset = 0,
enum cudaMemcpyKind kind, enum cudaMemcpyKind kind = cudaMemcpyDeviceToHost,
cudaStream_t stream cudaStream_t stream = 0
) )
{ {
return cudaMemcpyFromSymbolAsync(dst, (const char*)symbol, count, offset, kind, stream); return cudaMemcpyFromSymbolAsync(dst, (const char*)symbol, count, offset, kind, stream);
} }
template<class T> template<class T>
__inline__ __host__ cudaError_t cudaMemcpyFromSymbolAsync( __inline__ __host__ cudaError_t cudaMemcpyFromSymbolAsync(
void *dst, void *dst,
const T &symbol, const T &symbol,
size_t count, size_t count,
size_t offset, size_t offset = 0,
enum cudaMemcpyKind kind, enum cudaMemcpyKind kind = cudaMemcpyDeviceToHost,
cudaStream_t stream cudaStream_t stream = 0
) )
{ {
return cudaMemcpyFromSymbolAsync(dst, (const char*)&symbol, count, offset , kind, stream); return cudaMemcpyFromSymbolAsync(dst, (const char*)&symbol, count, offset , kind, stream);
} }
static __inline__ __host__ cudaError_t cudaGetSymbolAddress( static __inline__ __host__ cudaError_t cudaGetSymbolAddress(
void **devPtr, void **devPtr,
char *symbol char *symbol
) )
{ {
skipping to change at line 678 skipping to change at line 678
/** @} */ /* END CUDART_HIGHLEVEL */ /** @} */ /* END CUDART_HIGHLEVEL */
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
/** /**
* \ingroup CUDART_HIGHLEVEL * \ingroup CUDART_HIGHLEVEL
* \brief Sets the preferred cache configuration for a device function
*
* On devices where the L1 cache and shared memory use the same hardware
* resources, this sets through \p cacheConfig the preferred cache configur
ation
* for the function specified via \p func. This is only a preference. The
* runtime will use the requested configuration if possible, but it is free
to
* choose a different configuration if required to execute \p func.
*
* \p func can either be a pointer to a function that executes
* on the device, or it can be a character string specifying the
* fully-decorated (C++) name for a function that executes on the device.
* The parameter specified by \p func must be declared as a \p __global__
* function. If the specified function does not exist,
* then ::cudaErrorInvalidDeviceFunction is returned.
*
* This setting does nothing on devices where the size of the L1 cache and
* shared memory are fixed.
*
* Switching between configuration modes may insert a device-side
* synchronization point for streamed kernel launches.
*
* \param func - Device char string naming device function
* \param cacheConfig - Cache configuration mode
*
* \return
* ::cudaSuccess,
* ::cudaErrorInitializationError,
* ::cudaErrorInvalidDeviceFunction
* \notefnerr
*
* \sa ::cudaConfigureCall,
* \ref ::cudaFuncSetCacheConfig(const char*, enum cudaFuncCache) "cudaFunc
SetCacheConfig (C API)",
* \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGe
tAttributes (C++ API)",
* \ref ::cudaLaunch(const char*) "cudaLaunch (C API)",
* ::cudaSetDoubleForDevice,
* ::cudaSetDoubleForHost,
* \ref ::cudaSetupArgument(T,size_t) "cudaSetupArgument (C++ API)"
*/
template<class T>
__inline__ __host__ cudaError_t cudaFuncSetCacheConfig(
T *func,
enum cudaFuncCache cacheConfig
)
{
return cudaFuncSetCacheConfig((const char*)func, cacheConfig);
}
/**
* \ingroup CUDART_HIGHLEVEL
* \brief \hl Launches a device function * \brief \hl Launches a device function
* *
* Launches the function \p entry on the device. The parameter \p entry can * Launches the function \p entry on the device. The parameter \p entry can
* either be a function that executes on the device, or it can be a charact er * either be a function that executes on the device, or it can be a charact er
* string, naming a function that executes on the device. The parameter * string, naming a function that executes on the device. The parameter
* specified by \p entry must be declared as a \p __global__ function. * specified by \p entry must be declared as a \p __global__ function.
* \ref ::cudaLaunch(T*) "cudaLaunch()" must be preceded by a call to * \ref ::cudaLaunch(T*) "cudaLaunch()" must be preceded by a call to
* ::cudaConfigureCall() since it pops the data that was pushed by * ::cudaConfigureCall() since it pops the data that was pushed by
* ::cudaConfigureCall() from the execution stack. * ::cudaConfigureCall() from the execution stack.
* *
skipping to change at line 702 skipping to change at line 751
* ::cudaSuccess, * ::cudaSuccess,
* ::cudaErrorInvalidDeviceFunction, * ::cudaErrorInvalidDeviceFunction,
* ::cudaErrorInvalidConfiguration, * ::cudaErrorInvalidConfiguration,
* ::cudaErrorLaunchFailure, * ::cudaErrorLaunchFailure,
* ::cudaErrorPriorLaunchFailure, * ::cudaErrorPriorLaunchFailure,
* ::cudaErrorLaunchTimeout, * ::cudaErrorLaunchTimeout,
* ::cudaErrorLaunchOutOfResources * ::cudaErrorLaunchOutOfResources
* \notefnerr * \notefnerr
* *
* \sa ::cudaConfigureCall, * \sa ::cudaConfigureCall,
* \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheC onfig (C++ API)",
* \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGe tAttributes (C++ API)", * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGe tAttributes (C++ API)",
* \ref ::cudaLaunch(const char*) "cudaLaunch (C API)", * \ref ::cudaLaunch(const char*) "cudaLaunch (C API)",
* ::cudaSetDoubleForDevice, * ::cudaSetDoubleForDevice,
* ::cudaSetDoubleForHost, * ::cudaSetDoubleForHost,
* \ref ::cudaSetupArgument(T,size_t) "cudaSetupArgument (C++ API)" * \ref ::cudaSetupArgument(T,size_t) "cudaSetupArgument (C++ API)"
*/ */
template<class T> template<class T>
__inline__ __host__ cudaError_t cudaLaunch( __inline__ __host__ cudaError_t cudaLaunch(
T *entry T *entry
) )
{ {
return cudaLaunch((const char*)entry); return cudaLaunch((const char*)entry);
} }
/** /**
* \ingroup CUDART_HIGHLEVEL * \ingroup CUDART_HIGHLEVEL
* \brief \hl Find out attributes for a given function * \brief \hl Find out attributes for a given function
* *
* This function obtains the attributes of a function specified via \p entr y. * This function obtains the attributes of a function specified via \p entr y.
* The parameter \p entry can either be a function that executes on the * The parameter \p entry can either be a pointer to a function that execut
* device, or it can be a character string, naming a function that executes es
on * on the device, or it can be a character string specifying the
* the device. The parameter specified by \p entry must be declared as a * fully-decorated (C++) name of a function that executes on the device. Th
* \p __global__ function. The fetched attributes are placed in \p attr. If e
* the specified function does not exist, then ::cudaErrorInvalidDeviceFunc * parameter specified by \p entry must be declared as a \p __global__
tion * function. The fetched attributes are placed in \p attr. If the specified
* is returned. * function does not exist, then ::cudaErrorInvalidDeviceFunction is return
ed.
* *
* \param attr - Return pointer to function's attributes * \param attr - Return pointer to function's attributes
* \param entry - Function to get attributes of * \param entry - Function to get attributes of
* *
* \return * \return
* ::cudaSuccess, * ::cudaSuccess,
* ::cudaErrorInitializationError, * ::cudaErrorInitializationError,
* ::cudaErrorInvalidDeviceFunction * ::cudaErrorInvalidDeviceFunction
* \notefnerr * \notefnerr
* *
* \sa ::cudaConfigureCall, * \sa ::cudaConfigureCall,
* \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheC onfig (C++ API)",
* \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const char*) "c udaFuncGetAttributes (C API)", * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const char*) "c udaFuncGetAttributes (C API)",
* \ref ::cudaLaunch(T*) "cudaLaunch (C++ API)", * \ref ::cudaLaunch(T*) "cudaLaunch (C++ API)",
* ::cudaSetDoubleForDevice, * ::cudaSetDoubleForDevice,
* ::cudaSetDoubleForHost, * ::cudaSetDoubleForHost,
* \ref ::cudaSetupArgument(T,size_t) "cudaSetupArgument (C++ API)" * \ref ::cudaSetupArgument(T,size_t) "cudaSetupArgument (C++ API)"
*/ */
template<class T> template<class T>
__inline__ __host__ cudaError_t cudaFuncGetAttributes( __inline__ __host__ cudaError_t cudaFuncGetAttributes(
struct cudaFuncAttributes *attr, struct cudaFuncAttributes *attr,
T *entry T *entry
 End of changes. 9 change blocks. 
21 lines changed or deleted 77 lines changed or added


 cuda_runtime_api.h   cuda_runtime_api.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 41 skipping to change at line 41
* Any use of this source code in individual and commercial software must * Any use of this source code in individual and commercial software must
* include, in the user documentation and internal comments to the code, * include, in the user documentation and internal comments to the code,
* the above Disclaimer and U.S. Government End Users Notice. * the above Disclaimer and U.S. Government End Users Notice.
*/ */
#if !defined(__CUDA_RUNTIME_API_H__) #if !defined(__CUDA_RUNTIME_API_H__)
#define __CUDA_RUNTIME_API_H__ #define __CUDA_RUNTIME_API_H__
/************************************************************************** ***** /************************************************************************** *****
* * * *
* CUDA Runtime API Version 2.3 * * CUDA Runtime API Version 3.0 *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#define CUDART_VERSION 2030 #define CUDART_VERSION 3000
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#include "host_defines.h" #include "host_defines.h"
#include "builtin_types.h" #include "builtin_types.h"
skipping to change at line 93 skipping to change at line 93
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaMalloc3D(struct cudaPitchedPtr* p itchedDevPtr, struct cudaExtent extent); extern __host__ cudaError_t CUDARTAPI cudaMalloc3D(struct cudaPitchedPtr* p itchedDevPtr, struct cudaExtent extent);
extern __host__ cudaError_t CUDARTAPI cudaMalloc3DArray(struct cudaArray** arrayPtr, const struct cudaChannelFormatDesc* desc, struct cudaExtent exten t); extern __host__ cudaError_t CUDARTAPI cudaMalloc3DArray(struct cudaArray** arrayPtr, const struct cudaChannelFormatDesc* desc, struct cudaExtent exten t);
extern __host__ cudaError_t CUDARTAPI cudaMemset3D(struct cudaPitchedPtr pi tchedDevPtr, int value, struct cudaExtent extent); extern __host__ cudaError_t CUDARTAPI cudaMemset3D(struct cudaPitchedPtr pi tchedDevPtr, int value, struct cudaExtent extent);
extern __host__ cudaError_t CUDARTAPI cudaMemcpy3D(const struct cudaMemcpy3 DParms *p); extern __host__ cudaError_t CUDARTAPI cudaMemcpy3D(const struct cudaMemcpy3 DParms *p);
extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMe mcpy3DParms *p, cudaStream_t stream); extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMe mcpy3DParms *p, cudaStream_t stream __dv(0));
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size ); extern __host__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size );
extern __host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t siz e); extern __host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t siz e);
extern __host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t height); extern __host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t height);
skipping to change at line 119 skipping to change at line 119
extern __host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t by tes, unsigned int flags); extern __host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t by tes, unsigned int flags);
extern __host__ cudaError_t CUDARTAPI cudaHostGetDevicePointer(void **pDevi ce, void *pHost, unsigned int flags); extern __host__ cudaError_t CUDARTAPI cudaHostGetDevicePointer(void **pDevi ce, void *pHost, unsigned int flags);
extern __host__ cudaError_t CUDARTAPI cudaHostGetFlags(unsigned int *pFlags , void *pHost); extern __host__ cudaError_t CUDARTAPI cudaHostGetFlags(unsigned int *pFlags , void *pHost);
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaMemGetInfo(size_t *free, size_t * total);
extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src , size_t count, enum cudaMemcpyKind kind); extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src , size_t count, enum cudaMemcpyKind kind);
extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArray(struct cudaArray *d st, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cud aMemcpyKind kind); extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArray(struct cudaArray *d st, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cud aMemcpyKind kind);
extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArray(void *dst, const struct cudaArray *src, size_t wOffset, size_t hOffset, size_t count, enum c udaMemcpyKind kind); extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArray(void *dst, const struct cudaArray *src, size_t wOffset, size_t hOffset, size_t count, enum c udaMemcpyKind kind);
extern __host__ cudaError_t CUDARTAPI cudaMemcpyArrayToArray(struct cudaArr ay *dst, size_t wOffsetDst, size_t hOffsetDst, const struct cudaArray *src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum cudaMemcpyKind ki nd __dv(cudaMemcpyDeviceToDevice)); extern __host__ cudaError_t CUDARTAPI cudaMemcpyArrayToArray(struct cudaArr ay *dst, size_t wOffsetDst, size_t hOffsetDst, const struct cudaArray *src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum cudaMemcpyKind ki nd __dv(cudaMemcpyDeviceToDevice));
extern __host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch , const void *src, size_t spitch, size_t width, size_t height, enum cudaMem cpyKind kind); extern __host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch , const void *src, size_t spitch, size_t width, size_t height, enum cudaMem cpyKind kind);
extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArray(struct cudaArray *dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_ t width, size_t height, enum cudaMemcpyKind kind); extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArray(struct cudaArray *dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_ t width, size_t height, enum cudaMemcpyKind kind);
extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArray(void *dst, size _t dpitch, const struct cudaArray *src, size_t wOffset, size_t hOffset, siz e_t width, size_t height, enum cudaMemcpyKind kind); extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArray(void *dst, size _t dpitch, const struct cudaArray *src, size_t wOffset, size_t hOffset, siz e_t width, size_t height, enum cudaMemcpyKind kind);
extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DArrayToArray(struct cudaA rray *dst, size_t wOffsetDst, size_t hOffsetDst, const struct cudaArray *sr c, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)); extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DArrayToArray(struct cudaA rray *dst, size_t wOffsetDst, size_t hOffsetDst, const struct cudaArray *sr c, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice));
extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(const char *symbol , const void *src, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice)); extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(const char *symbol , const void *src, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice));
extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbol(void *dst, const char *symbol, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kin d __dv(cudaMemcpyDeviceToHost)); extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbol(void *dst, const char *symbol, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kin d __dv(cudaMemcpyDeviceToHost));
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void extern __host__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void
*src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream); *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0))
extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArrayAsync(struct cudaArr ;
ay *dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enu extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArrayAsync(struct cudaArr
m cudaMemcpyKind kind, cudaStream_t stream); ay *dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enu
extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArrayAsync(void *dst, c m cudaMemcpyKind kind, cudaStream_t stream __dv(0));
onst struct cudaArray *src, size_t wOffset, size_t hOffset, size_t count, e extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArrayAsync(void *dst, c
num cudaMemcpyKind kind, cudaStream_t stream); onst struct cudaArray *src, size_t wOffset, size_t hOffset, size_t count, e
extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t d num cudaMemcpyKind kind, cudaStream_t stream __dv(0));
pitch, const void *src, size_t spitch, size_t width, size_t height, enum cu extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t d
daMemcpyKind kind, cudaStream_t stream); pitch, const void *src, size_t spitch, size_t width, size_t height, enum cu
extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArrayAsync(struct cudaA daMemcpyKind kind, cudaStream_t stream __dv(0));
rray *dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArrayAsync(struct cudaA
size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream) rray *dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch,
; size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream
extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync(void *dst, __dv(0));
size_t dpitch, const struct cudaArray *src, size_t wOffset, size_t hOffset extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync(void *dst,
, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t strea size_t dpitch, const struct cudaArray *src, size_t wOffset, size_t hOffset
m); , size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t strea
extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync(const char *s m __dv(0));
ymbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind ki extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync(const char *s
nd, cudaStream_t stream); ymbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind ki
extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync(void *dst, nd, cudaStream_t stream __dv(0));
const char *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind, extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync(void *dst,
cudaStream_t stream); const char *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind,
cudaStream_t stream __dv(0));
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value, s ize_t count); extern __host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value, s ize_t count);
extern __host__ cudaError_t CUDARTAPI cudaMemset2D(void *devPtr, size_t pit ch, int value, size_t width, size_t height); extern __host__ cudaError_t CUDARTAPI cudaMemset2D(void *devPtr, size_t pit ch, int value, size_t width, size_t height);
skipping to change at line 216 skipping to change at line 217
extern __host__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error) ; extern __host__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error) ;
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem __dv(0), cudaStream_t stream __dv(0)); extern __host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem __dv(0), cudaStream_t stream __dv(0));
extern __host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg, si ze_t size, size_t offset); extern __host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg, si ze_t size, size_t offset);
extern __host__ cudaError_t CUDARTAPI cudaFuncSetCacheConfig(const char *fu nc, enum cudaFuncCache cacheConfig);
extern __host__ cudaError_t CUDARTAPI cudaLaunch(const char *entry); extern __host__ cudaError_t CUDARTAPI cudaLaunch(const char *entry);
extern __host__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFunc Attributes *attr, const char *func); extern __host__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFunc Attributes *attr, const char *func);
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStrea m); extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStrea m);
skipping to change at line 238 skipping to change at line 240
extern __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream); extern __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream);
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaEventCreate(cudaEvent_t *event); extern __host__ cudaError_t CUDARTAPI cudaEventCreate(cudaEvent_t *event);
extern __host__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, int flags); extern __host__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, int flags);
extern __host__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cu daStream_t stream); extern __host__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cu daStream_t stream __dv(0));
extern __host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event); extern __host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event);
extern __host__ cudaError_t CUDARTAPI cudaEventSynchronize(cudaEvent_t even t); extern __host__ cudaError_t CUDARTAPI cudaEventSynchronize(cudaEvent_t even t);
extern __host__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event); extern __host__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event);
extern __host__ cudaError_t CUDARTAPI cudaEventElapsedTime(float *ms, cudaE vent_t start, cudaEvent_t end); extern __host__ cudaError_t CUDARTAPI cudaEventElapsedTime(float *ms, cudaE vent_t start, cudaEvent_t end);
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
skipping to change at line 271 skipping to change at line 273
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersi on); extern __host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersi on);
extern __host__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVer sion); extern __host__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVer sion);
/**************************************************************************
*****
*
*
*
*
*
*
***************************************************************************
****/
extern __host__ cudaError_t CUDARTAPI cudaGraphicsUnregisterResource(struct
cudaGraphicsResource *resource);
extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceSetMapFlags(struc
t cudaGraphicsResource *resource, unsigned int flags);
extern __host__ cudaError_t CUDARTAPI cudaGraphicsMapResources(int count, s
truct cudaGraphicsResource **resources, cudaStream_t stream __dv(0));
extern __host__ cudaError_t CUDARTAPI cudaGraphicsUnmapResources(int count,
struct cudaGraphicsResource **resources, cudaStream_t stream __dv(0));
extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedPointer(
void **devPtr, size_t *size, struct cudaGraphicsResource *resource);
extern __host__ cudaError_t CUDARTAPI cudaGraphicsSubResourceGetMappedArray
(struct cudaArray **arrayPtr, struct cudaGraphicsResource *resource, unsign
ed int arrayIndex, unsigned int mipLevel);
#if defined(__cplusplus) #if defined(__cplusplus)
} }
#endif /* __cplusplus */ #endif /* __cplusplus */
#undef __dv #undef __dv
#endif /* !__CUDA_RUNTIME_API_H__ */ #endif /* !__CUDA_RUNTIME_API_H__ */
 End of changes. 9 change blocks. 
30 lines changed or deleted 58 lines changed or added


 cuda_texture_types.h   cuda_texture_types.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 End of changes. 1 change blocks. 
1 lines changed or deleted 1 lines changed or added


 cufft.h   cufft.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 105 skipping to change at line 105
CUFFT_C2R = 0x2c, // Complex (interleaved) to Real CUFFT_C2R = 0x2c, // Complex (interleaved) to Real
CUFFT_C2C = 0x29, // Complex to Complex, interleaved CUFFT_C2C = 0x29, // Complex to Complex, interleaved
CUFFT_D2Z = 0x6a, // Double to Double-Complex CUFFT_D2Z = 0x6a, // Double to Double-Complex
CUFFT_Z2D = 0x6c, // Double-Complex to Double CUFFT_Z2D = 0x6c, // Double-Complex to Double
CUFFT_Z2Z = 0x69 // Double-Complex to Double-Complex CUFFT_Z2Z = 0x69 // Double-Complex to Double-Complex
} cufftType; } cufftType;
cufftResult CUFFTAPI cufftPlan1d(cufftHandle *plan, cufftResult CUFFTAPI cufftPlan1d(cufftHandle *plan,
int nx, int nx,
cufftType type, cufftType type,
int batch); int batch /* deprecated - use cufftPlanMan y */);
cufftResult CUFFTAPI cufftPlan2d(cufftHandle *plan, cufftResult CUFFTAPI cufftPlan2d(cufftHandle *plan,
int nx, int ny, int nx, int ny,
cufftType type); cufftType type);
cufftResult CUFFTAPI cufftPlan3d(cufftHandle *plan, cufftResult CUFFTAPI cufftPlan3d(cufftHandle *plan,
int nx, int ny, int nz, int nx, int ny, int nz,
cufftType type); cufftType type);
cufftResult CUFFTAPI cufftPlanMany(cufftHandle *plan,
int rank,
int *n,
int *inembed, int istride, int idist,
// Unused: pass "NULL, 1, 0"
int *onembed, int ostride, int odist,
// Unused: pass "NULL, 1, 0"
cufftType type,
int batch);
cufftResult CUFFTAPI cufftDestroy(cufftHandle plan); cufftResult CUFFTAPI cufftDestroy(cufftHandle plan);
cufftResult CUFFTAPI cufftExecC2C(cufftHandle plan, cufftResult CUFFTAPI cufftExecC2C(cufftHandle plan,
cufftComplex *idata, cufftComplex *idata,
cufftComplex *odata, cufftComplex *odata,
int direction); int direction);
cufftResult CUFFTAPI cufftExecR2C(cufftHandle plan, cufftResult CUFFTAPI cufftExecR2C(cufftHandle plan,
cufftReal *idata, cufftReal *idata,
cufftComplex *odata); cufftComplex *odata);
skipping to change at line 143 skipping to change at line 151
int direction); int direction);
cufftResult CUFFTAPI cufftExecD2Z(cufftHandle plan, cufftResult CUFFTAPI cufftExecD2Z(cufftHandle plan,
cufftDoubleReal *idata, cufftDoubleReal *idata,
cufftDoubleComplex *odata); cufftDoubleComplex *odata);
cufftResult CUFFTAPI cufftExecZ2D(cufftHandle plan, cufftResult CUFFTAPI cufftExecZ2D(cufftHandle plan,
cufftDoubleComplex *idata, cufftDoubleComplex *idata,
cufftDoubleReal *odata); cufftDoubleReal *odata);
cufftResult CUFFTAPI cufftSetStream(cufftHandle p, cufftResult CUFFTAPI cufftSetStream(cufftHandle plan,
cudaStream_t stream); cudaStream_t stream);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
#endif /* _CUFFT_H_ */ #endif /* _CUFFT_H_ */
 End of changes. 4 change blocks. 
3 lines changed or deleted 13 lines changed or added


 device_functions.h   device_functions.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 194 skipping to change at line 194
extern __device__ unsigned long long int __float2ull_rn(float); extern __device__ unsigned long long int __float2ull_rn(float);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ unsigned long long int __float2ull_rz(float); extern __device__ unsigned long long int __float2ull_rz(float);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ unsigned long long int __float2ull_ru(float); extern __device__ unsigned long long int __float2ull_ru(float);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ unsigned long long int __float2ull_rd(float); extern __device__ unsigned long long int __float2ull_rd(float);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __ll2float_rn(long long int); extern __device__ float __ll2float_rn(long long int);
/*DEVICE_BUILTIN*/
extern __device__ float __ll2float_rz(long long int);
/*DEVICE_BUILTIN*/
extern __device__ float __ll2float_ru(long long int);
/*DEVICE_BUILTIN*/
extern __device__ float __ll2float_rd(long long int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __ull2float_rn(unsigned long long int); extern __device__ float __ull2float_rn(unsigned long long int);
/*DEVICE_BUILTIN*/
extern __device__ float __ull2float_rz(unsigned long long
int);
/*DEVICE_BUILTIN*/
extern __device__ float __ull2float_ru(unsigned long long
int);
/*DEVICE_BUILTIN*/
extern __device__ float __ull2float_rd(unsigned long long
int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ unsigned short __float2half_rn(float); extern __device__ unsigned short __float2half_rn(float);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __half2float(unsigned short); extern __device__ float __half2float(unsigned short);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __fadd_rn(float, float); extern __device__ float __fadd_rn(float, float);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __fadd_rz(float, float); extern __device__ float __fadd_rz(float, float);
skipping to change at line 275 skipping to change at line 287
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __clzll(long long int); extern __device__ int __clzll(long long int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __ffsll(long long int); extern __device__ int __ffsll(long long int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __popcll(unsigned long long int); extern __device__ int __popcll(unsigned long long int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ unsigned long long int __brevll(unsigned long long int); extern __device__ unsigned long long int __brevll(unsigned long long int);
#if (__CUDA_ARCH__ >= 130) #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 130
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __double2int_rz(double); extern __device__ int __double2int_rz(double);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ unsigned int __double2uint_rz(double); extern __device__ unsigned int __double2uint_rz(double);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ long long int __double2ll_rz(double); extern __device__ long long int __double2ll_rz(double);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ unsigned long long int __double2ull_rz(double); extern __device__ unsigned long long int __double2ull_rz(double);
#endif /* __CUDA_ARCH__ >= 130 */ #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 130 */
} }
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
static __inline__ __device__ int mulhi(int a, int b) static __inline__ __device__ int mulhi(int a, int b)
skipping to change at line 445 skipping to change at line 457
#elif !defined(__CUDACC__) #elif !defined(__CUDACC__)
#include "crt/func_macro.h" #include "crt/func_macro.h"
#include "host_defines.h" #include "host_defines.h"
#include "math_constants.h" #include "math_constants.h"
#if defined(__CUDABE__) #if defined(__CUDABE__)
#if (__CUDA_ARCH__ < 200)
__device_func__(float __frcp_rn (float x)) __device_func__(float __frcp_rn (float x))
{ {
unsigned int expo; unsigned int expo;
unsigned f, y; unsigned f, y;
unsigned int argi; unsigned int argi;
float t; float t;
argi = __float_as_int(x); argi = __float_as_int(x);
expo = (argi >> 23); expo = (argi >> 23);
expo = expo & 0xff; expo = expo & 0xff;
skipping to change at line 2381 skipping to change at line 2395
return __int_as_float(xx); return __int_as_float(xx);
} }
/* subnormal */ /* subnormal */
expo_x = ((unsigned int)-((int)expo_x)); expo_x = ((unsigned int)-((int)expo_x));
xx += (temp && expo_y); xx += (temp && expo_y);
xx = (xx >> expo_x); xx = (xx >> expo_x);
if ((expo_x > 25) || (xx != 0x00800000)) xx = 0; if ((expo_x > 25) || (xx != 0x00800000)) xx = 0;
return __int_as_float(expo_y | xx); return __int_as_float(expo_y | xx);
} }
#endif /* __CUDA_ARCH__ < 200 */
#else /* defined(__CUDABE__) */ #else /* defined(__CUDABE__) */
#include "common_types.h" #include "common_types.h"
static __device__ const unsigned char __internal_rcpTab[128] = static __device__ const unsigned char __internal_rcpTab[128] =
{ {
0xff, 0xfd, 0xfb, 0xf9, 0xf7, 0xf5, 0xf4, 0xf2, 0xff, 0xfd, 0xfb, 0xf9, 0xf7, 0xf5, 0xf4, 0xf2,
0xf0, 0xee, 0xed, 0xeb, 0xe9, 0xe8, 0xe6, 0xe4, 0xf0, 0xee, 0xed, 0xeb, 0xe9, 0xe8, 0xe6, 0xe4,
0xe3, 0xe1, 0xe0, 0xde, 0xdd, 0xdb, 0xda, 0xd8, 0xe3, 0xe1, 0xe0, 0xde, 0xdd, 0xdb, 0xda, 0xd8,
0xd7, 0xd5, 0xd4, 0xd3, 0xd1, 0xd0, 0xcf, 0xcd, 0xd7, 0xd5, 0xd4, 0xd3, 0xd1, 0xd0, 0xcf, 0xcd,
skipping to change at line 3830 skipping to change at line 3846
__device_func__(unsigned int __usad(unsigned int a, unsigned int b, unsigne d int c)) __device_func__(unsigned int __usad(unsigned int a, unsigned int b, unsigne d int c))
{ {
long long int diff = (long long int)a - (long long int)b; long long int diff = (long long int)a - (long long int)b;
return (unsigned int)(__cuda_llabs(diff) + (long long int)c); return (unsigned int)(__cuda_llabs(diff) + (long long int)c);
} }
__device_func__(int __mul24(int a, int b)) __device_func__(int __mul24(int a, int b))
{ {
#if !defined(__MULTI_CORE__)
a &= 0xffffff; a &= 0xffffff;
a = (a & 0x800000) != 0 ? a | ~0xffffff : a; a = (a & 0x800000) != 0 ? a | ~0xffffff : a;
b &= 0xffffff; b &= 0xffffff;
b = (b & 0x800000) != 0 ? b | ~0xffffff : b; b = (b & 0x800000) != 0 ? b | ~0xffffff : b;
#endif /* !__MULTI_CORE__ */
return a * b; return a * b;
} }
__device_func__(unsigned int __umul24(unsigned int a, unsigned int b)) __device_func__(unsigned int __umul24(unsigned int a, unsigned int b))
{ {
#if !defined(__MULTI_CORE__)
a &= 0xffffff; a &= 0xffffff;
b &= 0xffffff; b &= 0xffffff;
#endif /* !__MULTI_CORE__ */
return a * b; return a * b;
} }
__device_func__(float __int_as_float(int a)) __device_func__(float __int_as_float(int a))
{ {
volatile union __cudart_FloatIntCvt u; volatile union __cudart_FloatIntCvt u;
u.i = a; u.i = a;
return u.f; return u.f;
skipping to change at line 3903 skipping to change at line 3915
return (long long int)res; return (long long int)res;
} }
__device_func__(int __internal_float2int(float a, enum cudaRoundMode rndMod e)) __device_func__(int __internal_float2int(float a, enum cudaRoundMode rndMod e))
{ {
return (int)__internal_float2ll_kernel(a, 2147483647LL, -2147483648LL, 0L L, rndMode); return (int)__internal_float2ll_kernel(a, 2147483647LL, -2147483648LL, 0L L, rndMode);
} }
__device_func__(int __float2int_rz(float a)) __device_func__(int __float2int_rz(float a))
{ {
#if defined(__MULTI_CORE__)
return (int)a;
#else /* __MULTI_CORE__ */
return __internal_float2int(a, cudaRoundZero); return __internal_float2int(a, cudaRoundZero);
#endif /* __MULTI_CORE__ */
} }
__device_func__(int __float2int_ru(float a)) __device_func__(int __float2int_ru(float a))
{ {
return __internal_float2int(a, cudaRoundPosInf); return __internal_float2int(a, cudaRoundPosInf);
} }
__device_func__(int __float2int_rd(float a)) __device_func__(int __float2int_rd(float a))
{ {
return __internal_float2int(a, cudaRoundMinInf); return __internal_float2int(a, cudaRoundMinInf);
skipping to change at line 3932 skipping to change at line 3940
return __internal_float2int(a, cudaRoundNearest); return __internal_float2int(a, cudaRoundNearest);
} }
__device_func__(long long int __internal_float2ll(float a, enum cudaRoundMo de rndMode)) __device_func__(long long int __internal_float2ll(float a, enum cudaRoundMo de rndMode))
{ {
return __internal_float2ll_kernel(a, 9223372036854775807LL, -922337203685 4775807LL -1LL, -9223372036854775807LL -1LL, rndMode); return __internal_float2ll_kernel(a, 9223372036854775807LL, -922337203685 4775807LL -1LL, -9223372036854775807LL -1LL, rndMode);
} }
__device_func__(long long int __float2ll_rz(float a)) __device_func__(long long int __float2ll_rz(float a))
{ {
#if defined(__MULTI_CORE__)
return (long long int)a;
#else /* __MULTI_CORE__ */
return __internal_float2ll(a, cudaRoundZero); return __internal_float2ll(a, cudaRoundZero);
#endif /* __MULTI_CORE__ */
} }
__device_func__(long long int __float2ll_ru(float a)) __device_func__(long long int __float2ll_ru(float a))
{ {
return __internal_float2ll(a, cudaRoundPosInf); return __internal_float2ll(a, cudaRoundPosInf);
} }
__device_func__(long long int __float2ll_rd(float a)) __device_func__(long long int __float2ll_rd(float a))
{ {
return __internal_float2ll(a, cudaRoundMinInf); return __internal_float2ll(a, cudaRoundMinInf);
skipping to change at line 3987 skipping to change at line 3991
return res; return res;
} }
__device_func__(unsigned int __internal_float2uint(float a, enum cudaRoundM ode rndMode)) __device_func__(unsigned int __internal_float2uint(float a, enum cudaRoundM ode rndMode))
{ {
return (unsigned int)__internal_float2ull_kernel(a, 4294967295U, 0U, rndM ode); return (unsigned int)__internal_float2ull_kernel(a, 4294967295U, 0U, rndM ode);
} }
__device_func__(unsigned int __float2uint_rz(float a)) __device_func__(unsigned int __float2uint_rz(float a))
{ {
#if defined(__MULTI_CORE__)
return (unsigned int)a;
#else /* __MULTI_CORE__ */
return __internal_float2uint(a, cudaRoundZero); return __internal_float2uint(a, cudaRoundZero);
#endif /* __MULTI_CORE__ */
} }
__device_func__(unsigned int __float2uint_ru(float a)) __device_func__(unsigned int __float2uint_ru(float a))
{ {
return __internal_float2uint(a, cudaRoundPosInf); return __internal_float2uint(a, cudaRoundPosInf);
} }
__device_func__(unsigned int __float2uint_rd(float a)) __device_func__(unsigned int __float2uint_rd(float a))
{ {
return __internal_float2uint(a, cudaRoundMinInf); return __internal_float2uint(a, cudaRoundMinInf);
skipping to change at line 4016 skipping to change at line 4016
return __internal_float2uint(a, cudaRoundNearest); return __internal_float2uint(a, cudaRoundNearest);
} }
__device_func__(unsigned long long int __internal_float2ull(float a, enum c udaRoundMode rndMode)) __device_func__(unsigned long long int __internal_float2ull(float a, enum c udaRoundMode rndMode))
{ {
return __internal_float2ull_kernel(a, 18446744073709551615ULL, 9223372036 854775808ULL, rndMode); return __internal_float2ull_kernel(a, 18446744073709551615ULL, 9223372036 854775808ULL, rndMode);
} }
__device_func__(unsigned long long int __float2ull_rz(float a)) __device_func__(unsigned long long int __float2ull_rz(float a))
{ {
#if defined(__MULTI_CORE__)
return (unsigned long long int)a;
#else /* __MULTI_CORE__ */
return __internal_float2ull(a, cudaRoundZero); return __internal_float2ull(a, cudaRoundZero);
#endif /* __MULTI_CORE__ */
} }
__device_func__(unsigned long long int __float2ull_ru(float a)) __device_func__(unsigned long long int __float2ull_ru(float a))
{ {
return __internal_float2ull(a, cudaRoundPosInf); return __internal_float2ull(a, cudaRoundPosInf);
} }
__device_func__(unsigned long long int __float2ull_rd(float a)) __device_func__(unsigned long long int __float2ull_rd(float a))
{ {
return __internal_float2ull(a, cudaRoundMinInf); return __internal_float2ull(a, cudaRoundMinInf);
skipping to change at line 4121 skipping to change at line 4117
return __internal_int2float_kernel(a, cudaRoundPosInf); return __internal_int2float_kernel(a, cudaRoundPosInf);
} }
__device_func__(float __int2float_rd(int a)) __device_func__(float __int2float_rd(int a))
{ {
return __internal_int2float_kernel(a, cudaRoundMinInf); return __internal_int2float_kernel(a, cudaRoundMinInf);
} }
__device_func__(float __int2float_rn(int a)) __device_func__(float __int2float_rn(int a))
{ {
#if defined(__MULTI_CORE__)
return (float)a;
#else /* __MULTI_CORE__ */
return __internal_int2float_kernel(a, cudaRoundNearest); return __internal_int2float_kernel(a, cudaRoundNearest);
#endif /* __MULTI_CORE__ */
} }
__device_func__(float __internal_uint2float_kernel(unsigned int a, enum cud aRoundMode rndMode)) __device_func__(float __internal_uint2float_kernel(unsigned int a, enum cud aRoundMode rndMode))
{ {
volatile union __cudart_FloatUintCvt res; volatile union __cudart_FloatUintCvt res;
int shift; int shift;
unsigned int t; unsigned int t;
res.i = a; res.i = a;
if (a == 0) return res.f; if (a == 0) return res.f;
shift = __internal_normalize((unsigned int*)&res.i); shift = __internal_normalize((unsigned int*)&res.i);
skipping to change at line 4165 skipping to change at line 4157
return __internal_uint2float_kernel(a, cudaRoundPosInf); return __internal_uint2float_kernel(a, cudaRoundPosInf);
} }
__device_func__(float __uint2float_rd(unsigned int a)) __device_func__(float __uint2float_rd(unsigned int a))
{ {
return __internal_uint2float_kernel(a, cudaRoundMinInf); return __internal_uint2float_kernel(a, cudaRoundMinInf);
} }
__device_func__(float __uint2float_rn(unsigned int a)) __device_func__(float __uint2float_rn(unsigned int a))
{ {
#if defined(__MULTI_CORE__)
return (float)a;
#else /* __MULTI_CORE__ */
return __internal_uint2float_kernel(a, cudaRoundNearest); return __internal_uint2float_kernel(a, cudaRoundNearest);
#endif /* __MULTI_CORE__ */
}
__device_func__(float __ll2float_rn(long long int a))
{
return (float)a;
} }
__device_func__(float __internal_ull2float_kernel(unsigned long long int a, enum cudaRoundMode rndMode)) __device_func__(float __internal_ull2float_kernel(unsigned long long int a, enum cudaRoundMode rndMode))
{ {
unsigned long long int temp; unsigned long long int temp;
unsigned int res, t; unsigned int res, t;
int shift; int shift;
if (a == 0ULL) return 0.0f; if (a == 0ULL) return 0.0f;
temp = a; temp = a;
shift = __internal_normalize64(&temp); shift = __internal_normalize64(&temp);
skipping to change at line 4197 skipping to change at line 4180
t = (unsigned int)temp; t = (unsigned int)temp;
res += (127 + 62 - shift) << 23; /* add in exponent */ res += (127 + 62 - shift) << 23; /* add in exponent */
if (rndMode == cudaRoundNearest) { if (rndMode == cudaRoundNearest) {
res += (t == 0x80000000) ? (res & 1) : (t >> 31); res += (t == 0x80000000) ? (res & 1) : (t >> 31);
} else if (rndMode == cudaRoundPosInf) { } else if (rndMode == cudaRoundPosInf) {
res += (t != 0); res += (t != 0);
} }
return __int_as_float(res); return __int_as_float(res);
} }
__device_func__(float __internal_ll2float_kernel(long long int a, enum cuda
RoundMode rndMode))
{
unsigned long long int temp;
volatile float res = 0.0f;
if (a < 0LL) {
temp = (~((unsigned long long int)a)) + 1ULL;
if (rndMode == cudaRoundPosInf) {
rndMode = cudaRoundMinInf;
} else if (rndMode == cudaRoundMinInf) {
rndMode = cudaRoundPosInf;
}
} else {
temp = (unsigned long long int)a;
}
res = __internal_ull2float_kernel (temp, rndMode);
if (a < 0LL) {
res = -res;
}
return res;
}
__device_func__(float __ll2float_rn(long long int a))
{
return __internal_ll2float_kernel(a, cudaRoundNearest);
}
__device_func__(float __ll2float_rz(long long int a))
{
return __internal_ll2float_kernel(a, cudaRoundZero);
}
__device_func__(float __ll2float_ru(long long int a))
{
return __internal_ll2float_kernel(a, cudaRoundPosInf);
}
__device_func__(float __ll2float_rd(long long int a))
{
return __internal_ll2float_kernel(a, cudaRoundMinInf);
}
__device_func__(float __ull2float_rn(unsigned long long int a)) __device_func__(float __ull2float_rn(unsigned long long int a))
{ {
#if defined(__MULTI_CORE__)
return (float)a;
#else /* __MULTI_CORE__ */
return __internal_ull2float_kernel(a, cudaRoundNearest); return __internal_ull2float_kernel(a, cudaRoundNearest);
#endif /* __MULTI_CORE__ */ }
__device_func__(float __ull2float_rz(unsigned long long int a))
{
return __internal_ull2float_kernel(a, cudaRoundZero);
}
__device_func__(float __ull2float_ru(unsigned long long int a))
{
return __internal_ull2float_kernel(a, cudaRoundPosInf);
}
__device_func__(float __ull2float_rd(unsigned long long int a))
{
return __internal_ull2float_kernel(a, cudaRoundMinInf);
} }
__device_func__(unsigned short __float2half_rn(float f)) __device_func__(unsigned short __float2half_rn(float f))
{ {
unsigned int x = __float_as_int (f); unsigned int x = __float_as_int (f);
unsigned int u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1; unsigned int u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1;
unsigned int sign, exponent, mantissa; unsigned int sign, exponent, mantissa;
/* Get rid of +NaN/-NaN case first. */ /* Get rid of +NaN/-NaN case first. */
if (u > 0x7f800000) { if (u > 0x7f800000) {
skipping to change at line 4310 skipping to change at line 4346
__device_func__(float __fmul_rn(float a, float b)) __device_func__(float __fmul_rn(float a, float b))
{ {
return __internal_fmul_kernel(a, b, cudaRoundNearest); return __internal_fmul_kernel(a, b, cudaRoundNearest);
} }
__device_func__(void __brkpt(int c)) __device_func__(void __brkpt(int c))
{ {
/* TODO */ /* TODO */
} }
#if defined(__MULTI_CORE__) #if defined(__cplusplus)
extern "C" {
#define __syncthreads() \ #endif /* __cplusplus */
__builtin___syncthreads()
#else /* __MULTI_CORE__ */
extern int CUDARTAPI __cudaSynchronizeThreads(void**, void*); extern int CUDARTAPI __cudaSynchronizeThreads(void**, void*);
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#if defined(__GNUC__) #if defined(__GNUC__)
__device_func__(inline __attribute__((always_inline)) void __syncthreads(vo id)) __device_func__(inline __attribute__((always_inline)) void __syncthreads(vo id))
{ {
volatile int _ = 0; volatile int _ = 0;
L: if (__cudaSynchronizeThreads((void**)&&L, (void*)&_)) goto L; L: if (__cudaSynchronizeThreads((void**)&&L, (void*)&_)) goto L;
} }
#elif defined(_WIN32) #elif defined(_WIN32)
#define __syncthreads() \ #define __syncthreads() \
(void)__cudaSynchronizeThreads((void**)0, (void*)0) (void)__cudaSynchronizeThreads((void**)0, (void*)0)
#endif /* __GNUC__ */ #endif /* __GNUC__ */
#endif /* __MULTI_CORE__ */
__device_func__(void __prof_trigger(int a)) __device_func__(void __prof_trigger(int a))
{ {
} }
__device_func__(void __threadfence(void)) __device_func__(void __threadfence(void))
{ {
__syncthreads(); __syncthreads();
} }
__device_func__(void __threadfence_block(void)) __device_func__(void __threadfence_block(void))
skipping to change at line 4376 skipping to change at line 4411
#endif /* __CUDABE__ */ #endif /* __CUDABE__ */
/************************************************************************** ***** /************************************************************************** *****
* * * *
* DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS * * DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#if !defined(__CUDABE__) #if !defined(__CUDABE__)
__device_func__(float __fdividef(float a, float b)) __device_func__(float __fdividef(float a, float b))
{ {
#if defined(__MULTI_CORE__)
return a / b;
#else /* __MULTI_CORE__ */
volatile float aa = a; volatile float aa = a;
volatile float bb = b; volatile float bb = b;
/* match range restrictions of the device function */ /* match range restrictions of the device function */
if (__cuda_fabsf(bb) > CUDART_TWO_TO_126_F) { if (__cuda_fabsf(bb) > CUDART_TWO_TO_126_F) {
if (__cuda_fabsf(aa) <= CUDART_NORM_HUGE_F) { if (__cuda_fabsf(aa) <= CUDART_NORM_HUGE_F) {
return ((aa / bb) / CUDART_NORM_HUGE_F) / CUDART_NORM_HUGE_F; return ((aa / bb) / CUDART_NORM_HUGE_F) / CUDART_NORM_HUGE_F;
} else { } else {
bb = 1.0f / bb; bb = 1.0f / bb;
bb = bb / CUDART_NORM_HUGE_F; bb = bb / CUDART_NORM_HUGE_F;
return aa * bb; return aa * bb;
} }
} else { } else {
return aa / bb; return aa / bb;
} }
#endif /* __MULTI_CORE__ */
} }
#endif /* !defined(__CUDABE__) */ #endif /* !defined(__CUDABE__) */
__device_func__(float __sinf(float a)) __device_func__(float __sinf(float a))
{ {
#if !defined(__CUDABE__) #if !defined(__CUDABE__)
if ((__float_as_int(a) << 1) == 0xff000000) { if ((__float_as_int(a) << 1) == 0xff000000) {
return __fadd_rn (a, -a); /* return NaN */ return __fadd_rn (a, -a); /* return NaN */
} }
#endif /* !defined(__CUDABE__) */ #endif /* !defined(__CUDABE__) */
skipping to change at line 4427 skipping to change at line 4458
__device_func__(float __log2f(float a)) __device_func__(float __log2f(float a))
{ {
return log2f(a); return log2f(a);
} }
/************************************************************************** ***** /************************************************************************** *****
* * * *
* SHARED HOST AND DEVICE IMPLEMENTATIONS * * SHARED HOST AND DEVICE IMPLEMENTATIONS *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
__device_func__(float __internal_accurate_fdividef(float a, float b))
{
return a / b;
}
__device_func__(float __tanf(float a)) __device_func__(float __tanf(float a))
{ {
#if defined(__MULTI_CORE__)
return tanf(a);
#else /* __MULTI_CORE__ */
return __fdividef (__sinf(a), __cosf(a)); return __fdividef (__sinf(a), __cosf(a));
#endif /* __MULTI_CORE__ */
} }
__device_func__(void __sincosf(float a, float *sptr, float *cptr)) __device_func__(void __sincosf(float a, float *sptr, float *cptr))
{ {
#if defined(__MULTI_CORE__)
sincosf(a, sptr, cptr);
#else /* __MULTI_CORE__ */
*sptr = __sinf(a); *sptr = __sinf(a);
*cptr = __cosf(a); *cptr = __cosf(a);
#endif /* __MULTI_CORE__ */
} }
__device_func__(float __expf(float a)) __device_func__(float __expf(float a))
{ {
#if defined(__MULTI_CORE__)
return expf(a);
#else /* __MULTI_CORE__ */
return __cuda_exp2f(a * CUDART_L2E_F); return __cuda_exp2f(a * CUDART_L2E_F);
#endif /* __MULTI_CORE__ */
} }
__device_func__(float __exp10f(float a)) __device_func__(float __exp10f(float a))
{ {
#if defined(__MULTI_CORE__)
return exp10f(a);
#else /* __MULTI_CORE__ */
return __cuda_exp2f(a * CUDART_L2T_F); return __cuda_exp2f(a * CUDART_L2T_F);
#endif /* __MULTI_CORE__ */
} }
__device_func__(float __log10f(float a)) __device_func__(float __log10f(float a))
{ {
#if defined(__MULTI_CORE__)
return log10f(a);
#else /* __MULTI_CORE__ */
return CUDART_LG2_F * __log2f(a); return CUDART_LG2_F * __log2f(a);
#endif /* __MULTI_CORE__ */
} }
__device_func__(float __logf(float a)) __device_func__(float __logf(float a))
{ {
#if defined(__MULTI_CORE__)
return logf(a);
#else /* __MULTI_CORE__ */
return CUDART_LN2_F * __log2f(a); return CUDART_LN2_F * __log2f(a);
#endif /* __MULTI_CORE__ */
} }
__device_func__(float __powf(float a, float b)) __device_func__(float __powf(float a, float b))
{ {
#if defined(__MULTI_CORE__)
return powf(a, b);
#else /* __MULTI_CORE__ */
return __cuda_exp2f(b * __log2f(a)); return __cuda_exp2f(b * __log2f(a));
#endif /* __MULTI_CORE__ */
} }
__device_func__(float fdividef(float a, float b)) __device_func__(float fdividef(float a, float b))
{ {
#if defined(__MULTI_CORE__) #if defined(__USE_FAST_MATH__) && !defined(__CUDA_PREC_DIV)
return a / b;
#elif defined(__USE_FAST_MATH__)
return __fdividef(a, b); return __fdividef(a, b);
#else /* __MULTI_CORE__ */ #else /* __USE_FAST_MATH__ && !__CUDA_PREC_DIV */
return __internal_accurate_fdividef(a, b); return a / b;
#endif /* __MULTI_CORE__ */ #endif /* __USE_FAST_MATH__ && !__CUDA_PREC_DIV */
} }
#if !defined(__CUDABE__) || (__CUDA_ARCH__ < 200)
__device_func__(int __clz(int a)) __device_func__(int __clz(int a))
{ {
return (a)?(158-(__float_as_int(__uint2float_rz((unsigned int)a))>>23)):3 2; return (a)?(158-(__float_as_int(__uint2float_rz((unsigned int)a))>>23)):3 2;
} }
__device_func__(int __clzll(long long int a)) __device_func__(int __clzll(long long int a))
{ {
int ahi = ((int)((unsigned long long)a >> 32)); int ahi = ((int)((unsigned long long)a >> 32));
int alo = ((int)((unsigned long long)a & 0xffffffffULL)); int alo = ((int)((unsigned long long)a & 0xffffffffULL));
int res; int res;
skipping to change at line 4553 skipping to change at line 4550
ahi = ahi - ((ahi >> 1) & 0x55555555); ahi = ahi - ((ahi >> 1) & 0x55555555);
ahi = (ahi & 0x33333333) + ((ahi >> 2) & 0x33333333); ahi = (ahi & 0x33333333) + ((ahi >> 2) & 0x33333333);
alo = alo + ahi; alo = alo + ahi;
alo = (alo & 0x0f0f0f0f) + ((alo >> 4) & 0x0f0f0f0f); alo = (alo & 0x0f0f0f0f) + ((alo >> 4) & 0x0f0f0f0f);
alo = ((__umul24(alo, 0x808080) << 1) + alo) >> 24; alo = ((__umul24(alo, 0x808080) << 1) + alo) >> 24;
return alo; return alo;
} }
__device_func__(unsigned int __brev(unsigned int a)) __device_func__(unsigned int __brev(unsigned int a))
{ {
a = ((a >> 1) & 0x55555555) + ((a & 0x55555555) << 1); /* Use Knuth's algorithm from http://www.hackersdelight.org/revisions.pdf
a = ((a >> 2) & 0x33333333) + ((a & 0x33333333) << 2); */
a = ((a >> 4) & 0x0F0F0F0F) + ((a & 0x0F0F0F0F) << 4); unsigned int t;
a = ((a >> 8) & 0x00FF00FF) + ((a & 0x00FF00FF) << 8); a = (a << 15) | (a >> 17);
a = ( a >> 16 ) + ( a << 16); t = (a ^ (a >> 10)) & 0x003f801f;
a = (t + (t << 10)) ^ a;
t = (a ^ (a >> 4)) & 0x0e038421;
a = (t + (t << 4)) ^ a;
t = (a ^ (a >> 2)) & 0x22488842;
a = (t + (t << 2)) ^ a;
return a; return a;
} }
__device_func__(unsigned long long int __brevll(unsigned long long int a)) __device_func__(unsigned long long int __brevll(unsigned long long int a))
{ {
unsigned int hi = (unsigned int)(a >> 32); unsigned int hi = (unsigned int)(a >> 32);
unsigned int lo = (unsigned int)(a & 0xffffffffULL); unsigned int lo = (unsigned int)(a & 0xffffffffULL);
unsigned int t; unsigned int t;
t = __brev(lo); t = __brev(lo);
lo = __brev(hi); lo = __brev(hi);
return ((unsigned long long int)t << 32) + (unsigned long long int)lo; return ((unsigned long long int)t << 32) + (unsigned long long int)lo;
} }
#endif /* __CUDABE__ || __CUDA_ARCH__ < 200 */
__device_func__(int __ffs(int a)) __device_func__(int __ffs(int a))
{ {
return 32 - __clz (a & -a); return 32 - __clz (a & -a);
} }
__device_func__(int __ffsll(long long int a)) __device_func__(int __ffsll(long long int a))
{ {
return 64 - __clzll (a & -a); return 64 - __clzll (a & -a);
} }
skipping to change at line 4669 skipping to change at line 4672
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#include "sm_11_atomic_functions.h" #include "sm_11_atomic_functions.h"
#include "sm_12_atomic_functions.h" #include "sm_12_atomic_functions.h"
#include "sm_13_double_functions.h" #include "sm_13_double_functions.h"
#include "sm_20_atomic_functions.h"
#include "sm_20_intrinsics.h"
#include "texture_fetch_functions.h" #include "texture_fetch_functions.h"
#endif /* !__DEVICE_FUNCTIONS_H__ */ #endif /* !__DEVICE_FUNCTIONS_H__ */
 End of changes. 52 change blocks. 
97 lines changed or deleted 107 lines changed or added


 device_launch_parameters.h   device_launch_parameters.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 End of changes. 1 change blocks. 
1 lines changed or deleted 1 lines changed or added


 device_runtime.h   device_runtime.h 
skipping to change at line 69 skipping to change at line 69
#if defined(__CUDABE__) /* cudabe compiler */ #if defined(__CUDABE__) /* cudabe compiler */
#define __pad__(f) #define __pad__(f)
#define __text__ \ #define __text__ \
__attribute__((__texture__)) __attribute__((__texture__))
#define __surf__ \ #define __surf__ \
__attribute__((__surface__)) __attribute__((__surface__))
#define ___device__(sc) \ #define ___device__(sc) \
static static
#define __in__(cdecl, decl) \ #define __in__(cdecl, decl) \
__shared__ cdecl cdecl
#define __in_type__(cdecl, decl) \ #define __in_type__(cdecl, decl) \
cdecl cdecl
#define __texture_var(name) \ #define __texture_var(name) \
name name
#define __shared_var(name, s, type) \ #define __shared_var(name, s, type) \
name name
#define __cuda_host_device_name(name) \
name
#define __val_param(name) \ #define __val_param(name) \
__val_param##name __val_param##name
#define __copy_param(local_decl, param) \ #define __copy_param(local_decl, param) \
local_decl = param local_decl = param
#define __unsized_array_size \ #define __unsized_array_size \
[] []
#define __unsized__shared_var(name, s, type) \ #define __unsized__shared_var(name, s, type) \
name name
#define __unsized__empty_array(s) \ #define __unsized__empty_array(s) \
s s
#define __var_used__ \ #define __var_used__ \
__attribute__((__used__)) __attribute__((__used__))
#define __storage_extern_unsized__shared__ \ #define __storage_extern_unsized__shared__ \
extern extern
#define __cxa_vec_util(n, num, size, f) \
int i; for (i = 0; i < num; i++) f(n + i)
#define __cxa_vec_ctor(n, num, size, c, d) \
({ __cxa_vec_util(n, num, size, c); (void)0; })
#define __cxa_vec_dtor(n, num, size, d) \
{ __cxa_vec_util(n, num, size, d); }
#undef __cdecl #undef __cdecl
#define __cdecl #define __cdecl
#undef __w64 #undef __w64
#define __w64 #define __w64
#elif defined(__CUDACC__) /* cudafe compiler */ #elif defined(__CUDACC__) /* cudafe compiler */
#define __loc_sc__(loc, size, sc) \ #define __loc_sc__(loc, size, sc) \
sc loc sc loc
skipping to change at line 115 skipping to change at line 119
#define ___device__(sc) \ #define ___device__(sc) \
sc __device__ sc __device__
#define __in__(cdecl, decl) \ #define __in__(cdecl, decl) \
decl decl
#define __in_type__(cdecl, decl) \ #define __in_type__(cdecl, decl) \
decl decl
#define __texture_var(name) \ #define __texture_var(name) \
name name
#define __shared_var(name, s, type) \ #define __shared_var(name, s, type) \
name name
#define __cuda_host_device_name(name) \
name
#define __val_param(name) \ #define __val_param(name) \
name name
#define __copy_param(local_decl, param) #define __copy_param(local_decl, param)
#define __unsized_array_size \ #define __unsized_array_size \
[] []
#define __unsized__shared_var(name, s, type) \ #define __unsized__shared_var(name, s, type) \
name name
#define __unsized__empty_array(s) \ #define __unsized__empty_array(s) \
s s
skipping to change at line 240 skipping to change at line 242
#endif /* __APPLE__ || __ICC */ #endif /* __APPLE__ || __ICC */
#endif /* __MULTI_CORE__ */ #endif /* __MULTI_CORE__ */
#define __in__(cdecl, decl) \ #define __in__(cdecl, decl) \
decl decl
#define __in_type__(cdecl, decl) \ #define __in_type__(cdecl, decl) \
decl decl
#define __texture_var(name) \ #define __texture_var(name) \
__texture_##name __texture_##name
#define __cuda_host_device_name(name) \
__cuda_host_device_##name
#define __val_param(name) \ #define __val_param(name) \
name name
#define __copy_param(local_decl, param) #define __copy_param(local_decl, param)
#define __unsized_array_size #define __unsized_array_size
#define __unsized__shared_var(name, s, type) \ #define __unsized__shared_var(name, s, type) \
(*name) (*name)
#define __unsized__empty_array(s) #define __unsized__empty_array(s)
#define __cxa_vec_ctor(n, num, size, c, d) \
__cxa_vec_util((void*)n, num, size, (void (*)(void*))c)
#define __cxa_vec_dtor(n, num, size, d) \
__cxa_vec_util((void*)n, num, size, (void (*)(void*))d)
static void __cxa_vec_util(void *n, size_t num, size_t size, void (*f)(void
*))
{
size_t i;
for (i = 0; i < num; i++) {
f((void*)((char*)n + i * size));
}
}
/* this is compiled with a host compiler for device emulation */ /* this is compiled with a host compiler for device emulation */
#define __device_emulation #define __device_emulation
#if defined(__cplusplus) #if defined(__cplusplus)
#undef __VECTOR_TYPES_H__ #undef __VECTOR_TYPES_H__
#if defined(_WIN32) #if defined(_WIN32)
 End of changes. 6 change blocks. 
7 lines changed or deleted 21 lines changed or added


 device_types.h   device_types.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 End of changes. 1 change blocks. 
1 lines changed or deleted 1 lines changed or added


 driver_functions.h   driver_functions.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 End of changes. 1 change blocks. 
1 lines changed or deleted 1 lines changed or added


 driver_types.h   driver_types.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 76 skipping to change at line 76
#define cudaHostAllocWriteCombined 4 ///< Write-combined memory #define cudaHostAllocWriteCombined 4 ///< Write-combined memory
#define cudaEventDefault 0 ///< Default event flag #define cudaEventDefault 0 ///< Default event flag
#define cudaEventBlockingSync 1 ///< Event uses blocking synchroniz ation #define cudaEventBlockingSync 1 ///< Event uses blocking synchroniz ation
#define cudaDeviceScheduleAuto 0 ///< Device flag - Automatic schedu ling #define cudaDeviceScheduleAuto 0 ///< Device flag - Automatic schedu ling
#define cudaDeviceScheduleSpin 1 ///< Device flag - Spin default sch eduling #define cudaDeviceScheduleSpin 1 ///< Device flag - Spin default sch eduling
#define cudaDeviceScheduleYield 2 ///< Device flag - Yield default sc heduling #define cudaDeviceScheduleYield 2 ///< Device flag - Yield default sc heduling
#define cudaDeviceBlockingSync 4 ///< Device flag - Use blocking syn chronization #define cudaDeviceBlockingSync 4 ///< Device flag - Use blocking syn chronization
#define cudaDeviceMapHost 8 ///< Device flag - Support mapped p inned allocations #define cudaDeviceMapHost 8 ///< Device flag - Support mapped p inned allocations
#define cudaDeviceMask 0xf ///< Device flags mask #define cudaDeviceLmemResizeToMax 16 ///< Device flag - Keep local memor
y allocation after launch
#define cudaDeviceMask 0x1f ///< Device flags mask
#endif /* !__CUDA_INTERNAL_COMPILATION__ */ #endif /* !__CUDA_INTERNAL_COMPILATION__ */
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
/** /**
skipping to change at line 130 skipping to change at line 131
cudaErrorMixedDeviceExecution = 28, ///< Mixed device execu tion cudaErrorMixedDeviceExecution = 28, ///< Mixed device execu tion
cudaErrorCudartUnloading = 29, ///< CUDA runtime unloa ding cudaErrorCudartUnloading = 29, ///< CUDA runtime unloa ding
cudaErrorUnknown = 30, ///< Unknown error cond ition cudaErrorUnknown = 30, ///< Unknown error cond ition
cudaErrorNotYetImplemented = 31, ///< Function not yet i mplemented cudaErrorNotYetImplemented = 31, ///< Function not yet i mplemented
cudaErrorMemoryValueTooLarge = 32, ///< Memory value too l arge cudaErrorMemoryValueTooLarge = 32, ///< Memory value too l arge
cudaErrorInvalidResourceHandle = 33, ///< Invalid resource h andle cudaErrorInvalidResourceHandle = 33, ///< Invalid resource h andle
cudaErrorNotReady = 34, ///< Not ready error cudaErrorNotReady = 34, ///< Not ready error
cudaErrorInsufficientDriver = 35, ///< CUDA runtime is ne wer than driver cudaErrorInsufficientDriver = 35, ///< CUDA runtime is ne wer than driver
cudaErrorSetOnActiveProcess = 36, ///< Set on active proc ess error cudaErrorSetOnActiveProcess = 36, ///< Set on active proc ess error
cudaErrorNoDevice = 38, ///< No available CUDA device cudaErrorNoDevice = 38, ///< No available CUDA device
cudaErrorECCUncorrectable = 39, ///< Uncorrectable ECC error detected
cudaErrorStartupFailure = 0x7f, ///< Startup failure cudaErrorStartupFailure = 0x7f, ///< Startup failure
cudaErrorApiFailureBase = 10000 ///< API failure base cudaErrorApiFailureBase = 10000 ///< API failure base
}; };
/** /**
* Channel format kind * Channel format kind
*/ */
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
enum cudaChannelFormatKind enum cudaChannelFormatKind
{ {
skipping to change at line 179 skipping to change at line 181
enum cudaMemcpyKind enum cudaMemcpyKind
{ {
cudaMemcpyHostToHost = 0, ///< Host -> Host cudaMemcpyHostToHost = 0, ///< Host -> Host
cudaMemcpyHostToDevice = 1, ///< Host -> Device cudaMemcpyHostToDevice = 1, ///< Host -> Device
cudaMemcpyDeviceToHost = 2, ///< Device -> Host cudaMemcpyDeviceToHost = 2, ///< Device -> Host
cudaMemcpyDeviceToDevice = 3 ///< Device -> Device cudaMemcpyDeviceToDevice = 3 ///< Device -> Device
}; };
/** /**
* CUDA Pitched memory pointer * CUDA Pitched memory pointer
* \sa ::make_cudaPitchedPtr
*/ */
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct cudaPitchedPtr struct cudaPitchedPtr
{ {
void *ptr; ///< Pointer to allocated memory void *ptr; ///< Pointer to allocated memory
size_t pitch; ///< Pitch of allocated memory in bytes size_t pitch; ///< Pitch of allocated memory in bytes
size_t xsize; ///< Logical width of allocation in elements size_t xsize; ///< Logical width of allocation in elements
size_t ysize; ///< Logical height of allocation in elements size_t ysize; ///< Logical height of allocation in elements
}; };
/** /**
* CUDA extent * CUDA extent
* \sa ::make_cudaExtent
*/ */
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct cudaExtent struct cudaExtent
{ {
size_t width; ///< Width in bytes size_t width; ///< Width in bytes
size_t height; ///< Height in bytes size_t height; ///< Height in bytes
size_t depth; ///< Depth in bytes size_t depth; ///< Depth in bytes
}; };
/** /**
* CUDA 3D position * CUDA 3D position
* \sa ::make_cudaPos
*/ */
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct cudaPos struct cudaPos
{ {
size_t x; ///< x size_t x; ///< x
size_t y; ///< y size_t y; ///< y
size_t z; ///< z size_t z; ///< z
}; };
/** /**
skipping to change at line 230 skipping to change at line 235
struct cudaArray *dstArray; ///< Destination memory address struct cudaArray *dstArray; ///< Destination memory address
struct cudaPos dstPos; ///< Destination position offset struct cudaPos dstPos; ///< Destination position offset
struct cudaPitchedPtr dstPtr; ///< Pitched destination memory address struct cudaPitchedPtr dstPtr; ///< Pitched destination memory address
struct cudaExtent extent; ///< Requested memory copy size struct cudaExtent extent; ///< Requested memory copy size
enum cudaMemcpyKind kind; ///< Type of transfer enum cudaMemcpyKind kind; ///< Type of transfer
}; };
/** /**
* CUDA graphics interop resource
*/
/*DEVICE_BUILTIN*/
struct cudaGraphicsResource;
/**
* CUDA graphics interop register flags
*/
/*DEVICE_BUILTIN*/
enum cudaGraphicsRegisterFlags
{
cudaGraphicsRegisterFlagsNone = 0, ///< Default
};
/**
* CUDA graphics interop map flags
*/
/*DEVICE_BUILTIN*/
enum cudaGraphicsMapFlags
{
cudaGraphicsMapFlagsNone = 0, ///< Default; Assume resource can
be read/written
cudaGraphicsMapFlagsReadOnly = 1, ///< CUDA will not write to this r
esource
cudaGraphicsMapFlagsWriteDiscard = 2, ///< CUDA will only write to and w
ill not read from this resource
};
/**
* CUDA graphics interop array indices for cube maps
*/
/*DEVICE_BUILTIN*/
enum cudaGraphicsCubeFace {
cudaGraphicsCubeFacePositiveX = 0x00, ///< Positive X face of cubemap
cudaGraphicsCubeFaceNegativeX = 0x01, ///< Negative X face of cubemap
cudaGraphicsCubeFacePositiveY = 0x02, ///< Positive Y face of cubemap
cudaGraphicsCubeFaceNegativeY = 0x03, ///< Negative Y face of cubemap
cudaGraphicsCubeFacePositiveZ = 0x04, ///< Positive Z face of cubemap
cudaGraphicsCubeFaceNegativeZ = 0x05, ///< Negative Z face of cubemap
};
/**
* CUDA function attributes * CUDA function attributes
*/ */
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct cudaFuncAttributes struct cudaFuncAttributes
{ {
size_t sharedSizeBytes; ///< Size of shared memory in bytes size_t sharedSizeBytes; ///< Size of shared memory in bytes
size_t constSizeBytes; ///< Size of constant memory in bytes size_t constSizeBytes; ///< Size of constant memory in bytes
size_t localSizeBytes; ///< Size of local memory in bytes size_t localSizeBytes; ///< Size of local memory in bytes
int maxThreadsPerBlock; ///< Maximum number of threads per block int maxThreadsPerBlock; ///< Maximum number of threads per block
int numRegs; ///< Number of registers used int numRegs; ///< Number of registers used
int __cudaReserved[8]; /** \brief PTX virtual architecture version for which the function was
* compiled. This value is the major PTX version * 10 + the minor PTX
* version, so a PTX version 1.3 function would return the value 13.
* For device emulation kernels, this is set to 9999.
*/
int ptxVersion;
/** \brief Binary architecture version for which the function was compil
ed.
* This value is the major binary version * 10 + the minor binary versi
on,
* so a binary version 1.3 function would return the value 13.
* For device emulation kernels, this is set to 9999.
*/
int binaryVersion;
int __cudaReserved[6];
};
/**
* CUDA function cache configurations
*/
/*DEVICE_BUILTIN*/
enum cudaFuncCache
{
cudaFuncCachePreferNone = 0, ///< Default function cache configurati
on, no preference
cudaFuncCachePreferShared = 1, ///< Prefer larger shared memory and sm
aller L1 cache
cudaFuncCachePreferL1 = 2 ///< Prefer larger L1 cache and smaller
shared memory
}; };
/** /**
* CUDA device compute modes * CUDA device compute modes
*/ */
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
enum cudaComputeMode enum cudaComputeMode
{ {
cudaComputeModeDefault = 0, ///< Default compute mode (Multiple thr eads can use ::cudaSetDevice() with this device) cudaComputeModeDefault = 0, ///< Default compute mode (Multiple thr eads can use ::cudaSetDevice() with this device)
cudaComputeModeExclusive = 1, ///< Compute-exclusive mode (Only one t hread will be able to use ::cudaSetDevice() with this device) cudaComputeModeExclusive = 1, ///< Compute-exclusive mode (Only one t hread will be able to use ::cudaSetDevice() with this device)
skipping to change at line 280 skipping to change at line 347
size_t totalConstMem; ///< Constant memory available on devic e in bytes size_t totalConstMem; ///< Constant memory available on devic e in bytes
int major; ///< Major compute capability int major; ///< Major compute capability
int minor; ///< Minor compute capability int minor; ///< Minor compute capability
size_t textureAlignment; ///< Alignment requirement for textures size_t textureAlignment; ///< Alignment requirement for textures
int deviceOverlap; ///< Device can concurrently copy memor y and execute a kernel int deviceOverlap; ///< Device can concurrently copy memor y and execute a kernel
int multiProcessorCount; ///< Number of multiprocessors on devic e int multiProcessorCount; ///< Number of multiprocessors on devic e
int kernelExecTimeoutEnabled; ///< Specified whether there is a run t ime limit on kernels int kernelExecTimeoutEnabled; ///< Specified whether there is a run t ime limit on kernels
int integrated; ///< Device is integrated as opposed to discrete int integrated; ///< Device is integrated as opposed to discrete
int canMapHostMemory; ///< Device can map host memory with cu daHostAlloc/cudaHostGetDevicePointer int canMapHostMemory; ///< Device can map host memory with cu daHostAlloc/cudaHostGetDevicePointer
int computeMode; ///< Compute mode (See ::cudaComputeMod e) int computeMode; ///< Compute mode (See ::cudaComputeMod e)
int __cudaReserved[36]; int maxTexture1D; ///< Maximum 1D texture size
int maxTexture2D[2]; ///< Maximum 2D texture dimensions
int maxTexture3D[3]; ///< Maximum 3D texture dimensions
int maxTexture2DArray[3]; ///< Maximum 2D texture array dimension
s
int concurrentKernels; ///< Device can possibly execute multip
le kernels concurrently
int __cudaReserved[26];
}; };
#define cudaDevicePropDontCare \ #define cudaDevicePropDontCare \
{ \ { \
{'\0'}, /* char name[256]; */ \ {'\0'}, /* char name[256]; */ \
0, /* size_t totalGlobalMem; */ \ 0, /* size_t totalGlobalMem; */ \
0, /* size_t sharedMemPerBlock; */ \ 0, /* size_t sharedMemPerBlock; */ \
0, /* int regsPerBlock; */ \ 0, /* int regsPerBlock; */ \
0, /* int warpSize; */ \ 0, /* int warpSize; */ \
0, /* size_t memPitch; */ \ 0, /* size_t memPitch; */ \
skipping to change at line 305 skipping to change at line 377
0, /* size_t totalConstMem; */ \ 0, /* size_t totalConstMem; */ \
-1, /* int major; */ \ -1, /* int major; */ \
-1, /* int minor; */ \ -1, /* int minor; */ \
0, /* size_t textureAlignment; */ \ 0, /* size_t textureAlignment; */ \
-1, /* int deviceOverlap; */ \ -1, /* int deviceOverlap; */ \
0, /* int multiProcessorCount; */ \ 0, /* int multiProcessorCount; */ \
0, /* int kernelExecTimeoutEnabled */ \ 0, /* int kernelExecTimeoutEnabled */ \
0, /* int integrated */ \ 0, /* int integrated */ \
0, /* int canMapHostMemory */ \ 0, /* int canMapHostMemory */ \
0, /* int computeMode */ \ 0, /* int computeMode */ \
0, /* int maxTexture1D */ \
{0, 0}, /* int maxTexture2D[2] */ \
{0, 0, 0}, /* int maxTexture3D[3] */ \
{0, 0, 0}, /* int maxTexture2DArray[3] */ \
0 /* int concurrentKernels */ \
} ///< Empty device properties } ///< Empty device properties
/************************************************************************** ***** /************************************************************************** *****
* * * *
* SHORTHAND TYPE DEFINITION USED BY RUNTIME API * * SHORTHAND TYPE DEFINITION USED BY RUNTIME API *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
/** /**
* CUDA Error types * CUDA Error types
 End of changes. 10 change blocks. 
4 lines changed or deleted 92 lines changed or added


 func_macro.h   func_macro.h 
skipping to change at line 57 skipping to change at line 57
#if !defined(__CUDA_INTERNAL_COMPILATION__) #if !defined(__CUDA_INTERNAL_COMPILATION__)
#error -- incorrect inclusion of a cudart header file #error -- incorrect inclusion of a cudart header file
#endif /* !__CUDA_INTERNAL_COMPILATION__ */ #endif /* !__CUDA_INTERNAL_COMPILATION__ */
#if defined(__cplusplus) && defined(__device_emulation) && !defined(__multi _core__) #if defined(__cplusplus) && defined(__device_emulation) && !defined(__multi _core__)
#define __begin_host_func \ #define __begin_host_func \
}} }
#define __end_host_func \ #define __end_host_func \
namespace __cuda_emu { extern "C" { namespace __cuda_emu {
#define __host_device_call(f) \ #define __host_device_call(f) \
__cuda_emu::f __cuda_emu::f
#else /* __cplusplus && __device_emulation && !__multi_core__ */ #else /* __cplusplus && __device_emulation && !__multi_core__ */
#define __begin_host_func #define __begin_host_func
#define __end_host_func #define __end_host_func
#define __host_device_call(f) \ #define __host_device_call(f) \
f f
 End of changes. 2 change blocks. 
2 lines changed or deleted 2 lines changed or added


 host_config.h   host_config.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 52 skipping to change at line 52
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#if defined(__CUDACC__) #if defined(__CUDACC__)
#if defined(__APPLE__) #if defined(__APPLE__)
#define _CRTIMP #define _CRTIMP
#define __THROW #define __THROW
#if defined(__MULTI_CORE__) #if defined(__BLOCKS__)
#error multicore not supported for MacOs #undef __BLOCKS__
#endif /* __MULTI_CORE__ */ #endif /* __BLOCKS__ */
#elif defined(__GNUC__) #elif defined(__GNUC__)
#define _CRTIMP #define _CRTIMP
#if defined(__MULTI_CORE__) && __GNUC__ > 3 #include <features.h> /* for __THROW */
#error multicore not supported for gcc 4.x
#endif /* __MULTI_CORE__ & __GNUC__ > 3 */
#include <features.h> /* for __THROW */
#include <bits/c++config.h> /* for _GLIBCXX_ATOMIC_BUILTINS */
#if _GLIBCXX_ATOMIC_BUILTINS == 1
#undef _GLIBCXX_ATOMIC_BUILTINS /* for missing __sync_fetch_and_add */
#endif /* _GLIBCXX_ATOMIC_BUILTINS == 1 */
#elif defined(_WIN32) #elif defined(_WIN32)
#if defined(__MULTI_CORE__) && _MSC_VER != 1400 #if _MSC_VER >= 1400
#error multicore support available only for VC8
#endif /* __MULTI_CORE__ & _MSC_VER != 1400 */
#if _MSC_VER >= 1500 #if _MSC_VER >= 1500
#undef _USE_DECLSPECS_FOR_SAL #undef _USE_DECLSPECS_FOR_SAL
#define _USE_DECLSPECS_FOR_SAL \ #define _USE_DECLSPECS_FOR_SAL \
1 1
#endif /* _MSC_VER >= 1500 */ #endif /* _MSC_VER >= 1500 */
#if _MSC_VER >= 1400
#if !defined(_CRT_NONSTDC_NO_WARNINGS) #if !defined(_CRT_NONSTDC_NO_WARNINGS)
#define _CRT_NONSTDC_NO_WARNINGS /* to suppress warnings */ #define _CRT_NONSTDC_NO_WARNINGS /* to suppress warnings */
#endif /* _CRT_NONSTDC_NO_WARNINGS */ #endif /* !_CRT_NONSTDC_NO_WARNINGS */
#if !defined(_CRT_SECURE_NO_WARNINGS) #if !defined(_CRT_SECURE_NO_WARNINGS)
#define _CRT_SECURE_NO_WARNINGS /* to suppress warnings */ #define _CRT_SECURE_NO_WARNINGS /* to suppress warnings */
#endif /* _CRT_SECURE_NO_WARNINGS */ #endif /* !_CRT_SECURE_NO_WARNINGS */
#endif /* _MSC_VER >= 1400 */ #endif /* _MSC_VER >= 1400 */
#if !defined(NOMINMAX) #if !defined(NOMINMAX)
#define NOMINMAX /* min and max are part of cuda runtime */ #define NOMINMAX /* min and max are part of cuda runtime */
#endif /* !NOMINMAX */ #endif /* !NOMINMAX */
#include <crtdefs.h> /* for _CRTIMP */ #include <crtdefs.h> /* for _CRTIMP */
 End of changes. 9 change blocks. 
27 lines changed or deleted 8 lines changed or added


 host_defines.h   host_defines.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 46 skipping to change at line 46
#if !defined(__HOST_DEFINES_H__) #if !defined(__HOST_DEFINES_H__)
#define __HOST_DEFINES_H__ #define __HOST_DEFINES_H__
#if !defined(__GNUC__) && !defined(_WIN32) #if !defined(__GNUC__) && !defined(_WIN32)
#error --- !!! UNSUPPORTED COMPILER !!! --- #error --- !!! UNSUPPORTED COMPILER !!! ---
#elif defined(__GNUC__) #elif defined(__GNUC__)
#define __no_return__ \ #define __no_return__ \
__attribute__((__noreturn__)) __attribute__((noreturn))
#define __noinline__ \ #define __noinline__ \
__attribute__((__noinline__)) __attribute__((noinline))
#define __forceinline__ \ #define __forceinline__ \
__inline__ __attribute__((__always_inline__)) __inline__ __attribute__((always_inline))
#define __align__(n) \ #define __align__(n) \
__attribute__((__aligned__(n))) __attribute__((aligned(n)))
#define __thread__ \ #define __thread__ \
__thread __thread
#define __import__ #define __import__
#define __export__ #define __export__
#define __cdecl
#define __annotate__(a) \
__attribute__((a))
#define __location__(a) \ #define __location__(a) \
__loc__(__attribute__((a))) __annotate__(a)
#define CUDARTAPI #define CUDARTAPI
#elif defined(_WIN32) #elif defined(_WIN32)
#if _MSC_VER >= 1400 #if _MSC_VER >= 1400
#define __restrict__ \ #define __restrict__ \
__restrict __restrict
#else /* _MSC_VER >= 1400 */ #else /* _MSC_VER >= 1400 */
skipping to change at line 90 skipping to change at line 93
#define __forceinline__ \ #define __forceinline__ \
__forceinline __forceinline
#define __align__(n) \ #define __align__(n) \
__declspec(align(n)) __declspec(align(n))
#define __thread__ \ #define __thread__ \
__declspec(thread) __declspec(thread)
#define __import__ \ #define __import__ \
__declspec(dllimport) __declspec(dllimport)
#define __export__ \ #define __export__ \
__declspec(dllexport) __declspec(dllexport)
#define __annotate__(a) \
__declspec(a)
#define __location__(a) \ #define __location__(a) \
__loc__(__declspec(a)) __annotate__(__##a##__)
#define CUDARTAPI \ #define CUDARTAPI \
__stdcall __stdcall
#endif /* !__GNUC__ && !_WIN32 */ #endif /* !__GNUC__ && !_WIN32 */
#if defined(__CUDACC__) || defined(__CUDABE__) || defined (__MULTI_CORE__) #if !defined(__CUDACC__) && !defined(__CUDABE__)
#define __loc__(a) \
a
#else /* __CUDACC__ || __CUDABE__ || __MULTI_CORE__ */
#define __loc__(a) #undef __annotate__
#define __annotate__(a)
#endif /* __CUDACC__ || __CUDABE__ || __MULTI_CORE__ */ #endif /* !__CUDACC__ && !__CUDABE__ */
#if defined(__CUDACC__) || defined(__CUDABE__) || defined (__MULTI_CORE__) || \ #if defined(__CUDACC__) || defined(__CUDABE__) || \
defined(__GNUC__) || defined(_WIN64) defined(__GNUC__) || defined(_WIN64)
#define __builtin_align__(a) \ #define __builtin_align__(a) \
__align__(a) __align__(a)
#else /* __CUDACC__ || __CUDABE__ || __MULTI_CORE__ || __GNUC__ || _WIN64 * / #else /* __CUDACC__ || __CUDABE__ || __GNUC__ || _WIN64 */
#define __builtin_align__(a) #define __builtin_align__(a)
#endif /* __CUDACC__ || __CUDABE__ || __MULTI_CORE__ || __GNUC__ || _WIN64 */ #endif /* __CUDACC__ || __CUDABE__ || __GNUC__ || _WIN64 */
#define __device__ \ #define __device__ \
__location__(__device__) __location__(device)
#define __host__ \ #define __host__ \
__location__(__host__) __location__(host)
#define __global__ \ #define __global__ \
__location__(__global__) __location__(global)
#define __shared__ \ #define __shared__ \
__location__(__shared__) __location__(shared)
#define __constant__ \ #define __constant__ \
__location__(__constant__) __location__(constant)
#define __launch_bounds__(...) \ #define __launch_bounds__(...) \
__location__(__launch_bounds__(__VA_ARGS__)) __annotate__(launch_bounds(__VA_ARGS__))
#endif /* !__HOST_DEFINES_H__ */ #endif /* !__HOST_DEFINES_H__ */
 End of changes. 21 change blocks. 
24 lines changed or deleted 25 lines changed or added


 host_runtime.h   host_runtime.h 
skipping to change at line 48 skipping to change at line 48
#define __CUDA_INTERNAL_COMPILATION__ #define __CUDA_INTERNAL_COMPILATION__
#define __glob_pref_var(var) \ #define __glob_pref_var(var) \
__global_##var __global_##var
#define __global_var(var) \ #define __global_var(var) \
(*__glob_pref_var(var)) (*__glob_pref_var(var))
#define __shadow_var(c, cpp) \ #define __shadow_var(c, cpp) \
__shadow_pref_var(c, cpp) __shadow_pref_var(c, cpp)
#define __text__ #define __text__
#define __surf__ #define __surf__
#define __dv(v) #define __dv(v)
#define __name__shadow_var(c, cpp) \
__pick(#c, #cpp)
#define __name__text_var(c, cpp) \
__pick(#c, #cpp)
#define __shadow_pref_var(c, cpp) \
__pick(c##__cuda_shadow_variable__, cpp##__cuda_shadow_variable__)
#define __device_stub_name(c, cpp) \
__pick(c, cpp)
#define __text_var(c, cpp) \
__pick(c, cpp)
#define __cppref__ \
__pick(, &)
#if defined(_WIN32) && !defined(_WIN64) #if defined(_WIN32) && !defined(_WIN64)
#define __pad__(f) \ #define __pad__(f) \
f f
#else /* _WIN32 && !_WIN64 */ #else /* _WIN32 && !_WIN64 */
#define __pad__(f) #define __pad__(f)
skipping to change at line 73 skipping to change at line 85
__weak_import__, __weak_import__,
#elif defined(__GNUC__) #elif defined(__GNUC__)
#define __extern_weak__ #define __extern_weak__
#endif /* __APPLE__ */ #endif /* __APPLE__ */
#if defined(__cplusplus) #if defined(__cplusplus)
#define __shadow_pref_var(c, cpp) \ #define __pick(c, cpp) \
cpp##__cuda_shadow_variable__
#define __device_stub_name(c, cpp) \
cpp
#define __text_var(c, cpp) \
cpp cpp
#define __cppref__ \
&
#else /* __cplusplus */ #else /* __cplusplus */
#define __shadow_pref_var(c, cpp) \ #define __pick(c, cpp) \
c##__cuda_shadow_variable__
#define __device_stub_name(c, cpp) \
c
#define __text_var(c, cpp) \
c c
#define __cppref__
typedef char bool; typedef char bool;
#endif /* __cplusplus */ #endif /* __cplusplus */
#if !defined(__GNUC__) || __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3) #if !defined(__GNUC__) || __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3)
#define __specialization_static \ #define __specialization_static \
static static
skipping to change at line 128 skipping to change at line 129
__cudaRegisterVar(__cudaFatCubinHandle, (char*)&__host##var, (char* )__device##var, __name##var, ext, size, constant, global) __cudaRegisterVar(__cudaFatCubinHandle, (char*)&__host##var, (char* )__device##var, __name##var, ext, size, constant, global)
#define __cudaRegisterGlobalTexture(tex, dim, norm, ext) \ #define __cudaRegisterGlobalTexture(tex, dim, norm, ext) \
__cudaRegisterTexture(__cudaFatCubinHandle, (const struct textureRe ference*)&tex, __tex_var(tex), __name##tex, dim, norm, ext) __cudaRegisterTexture(__cudaFatCubinHandle, (const struct textureRe ference*)&tex, __tex_var(tex), __name##tex, dim, norm, ext)
#define __cudaRegisterGlobalSurface(surf, dim, ext) \ #define __cudaRegisterGlobalSurface(surf, dim, ext) \
__cudaRegisterSurface(__cudaFatCubinHandle, (const struct surfaceRe ference*)&surf, __tex_var(surf), __name##surf, dim, ext) __cudaRegisterSurface(__cudaFatCubinHandle, (const struct surfaceRe ference*)&surf, __tex_var(surf), __name##surf, dim, ext)
#define __cudaRegisterUnsizedShared(var) \ #define __cudaRegisterUnsizedShared(var) \
__cudaRegisterShared(__cudaFatCubinHandle, (void**)__device_var(var )) __cudaRegisterShared(__cudaFatCubinHandle, (void**)__device_var(var ))
#define __cudaRegisterSharedVariable(var, size, align, sc) \ #define __cudaRegisterSharedVariable(var, size, align, sc) \
__cudaRegisterSharedVar(__cudaFatCubinHandle, (void**)__device_var( var), size, align, sc) __cudaRegisterSharedVar(__cudaFatCubinHandle, (void**)__device_var( var), size, align, sc)
#define __cudaRegisterEntry(funptr, fun, thread_limit) \ #define __cudaRegisterEntry(funptr, fun, thread_limit) \
__cudaRegisterFunction(__cudaFatCubinHandle, (const char*)funptr, ( char*)__device_fun(fun), #fun, thread_limit, __ids) __cudaRegisterFunction(__cudaFatCubinHandle, (const char*)funptr, ( char*)__device_fun(fun), #fun, __cuda_tl__(thread_limit), __ids)
#define __cudaInitArgBlock(arg) \ #define __cudaInitArgBlock(arg) \
*(void**)(void*)&arg = (void*)0 *(void**)(void*)&arg = (void*)0
#define __cudaSetupArg(arg, offset) \ #define __cudaSetupArg(arg, offset) \
if (cudaSetupArgument((void*)(char*)&arg, sizeof(arg), (size_t)&off set->arg) != cudaSuccess) \ if (cudaSetupArgument((void*)(char*)&arg, sizeof(arg), (size_t)&off set->arg) != cudaSuccess) \
return return
#define __cudaLaunch(fun) \ #define __cudaLaunch(fun) \
{ volatile static char *__f; __f = fun; (void)cudaLaunch(fun); } { volatile static char *__f; __f = fun; (void)cudaLaunch(fun); }
#if defined(__cplusplus) #if defined(__cplusplus)
skipping to change at line 216 skipping to change at line 217
#if defined(__cplusplus) #if defined(__cplusplus)
} }
#endif /* __cplusplus */ #endif /* __cplusplus */
#if defined(__GNUC__) && defined(__cplusplus) #if defined(__GNUC__) && defined(__cplusplus)
extern int atexit(void(*)(void)) throw(); extern int atexit(void(*)(void)) throw();
#else /* __GNUC__ && __cplusplus */ #else /* __GNUC__ && __cplusplus */
extern int atexit(void(*)(void)); extern int __cdecl atexit(void(__cdecl *)(void));
#endif /* __GNUC__ && __cplusplus */ #endif /* __GNUC__ && __cplusplus */
static void **__cudaFatCubinHandle; static void **__cudaFatCubinHandle;
static void __cudaUnregisterBinaryUtil(void) static void __cdecl __cudaUnregisterBinaryUtil(void)
{ {
__cudaUnregisterFatBinary(__cudaFatCubinHandle); __cudaUnregisterFatBinary(__cudaFatCubinHandle);
} }
#if defined(__device_emulation) #if defined(__device_emulation)
#if defined(__cplusplus) && !defined(__multi_core__) #if defined(__cplusplus) && !defined(__multi_core__)
#define __cuda_emu__ \ #define __cuda_emu__ \
__cuda_emu:: __cuda_emu::
#else /* __cplusplus */ #else /* __cplusplus */
#define __cuda_emu__ #define __cuda_emu__
#endif /* __cplusplus */ #endif /* __cplusplus */
#define __device_fun(fun) \ #define __device_fun(fun) \
__cuda_emu__ __device_wrapper_##fun __cuda_emu__ __device_wrapper_##fun
#define __device_var(var) \ #define __device_var(var) \
(char*)&__cuda_emu__ var &__cuda_emu__ var
#define __tex_var(var) \ #define __tex_var(var) \
&__cuda_emu__ __texture_var(var) &__cuda_emu__ __texture_var(var)
#define __cudaFatCubin \ #define __cudaFatCubin \
0 0
#define __cuda_tl__(l) \
l
#if defined(__multi_core__) #if defined(__multi_core__)
#define __ids \ #define __ids \
(uint3*)0, (uint3*)0, &blockDim, &gridDim, &warpSize (uint3*)0, (uint3*)0, &blockDim, &gridDim, &warpSize
#else /* __multi_core__ */ #else /* __multi_core__ */
#define __ids \ #define __ids \
(uint3*)&__cuda_emu__ threadIdx, (uint3*)&__cuda_emu__ blockIdx, (d im3*)&__cuda_emu__ blockDim, (dim3*)&__cuda_emu__ gridDim, &__cuda_emu__ wa rpSize (uint3*)&__cuda_emu__ threadIdx, (uint3*)&__cuda_emu__ blockIdx, (d im3*)&__cuda_emu__ blockDim, (dim3*)&__cuda_emu__ gridDim, &__cuda_emu__ wa rpSize
skipping to change at line 270 skipping to change at line 273
#else /* __device_emulation */ #else /* __device_emulation */
#define __device_fun(fun) \ #define __device_fun(fun) \
#fun #fun
#define __device_var(var) \ #define __device_var(var) \
#var #var
#define __tex_var(var) \ #define __tex_var(var) \
0 0
#define __cudaFatCubin \ #define __cudaFatCubin \
(&__fatDeviceText) &__fatDeviceText
#define __cuda_tl__(l) \
-1
#define __ids \ #define __ids \
(uint3*)0, (uint3*)0, (dim3*)0, (dim3*)0, (int*)0 (uint3*)0, (uint3*)0, (dim3*)0, (dim3*)0, (int*)0
#include "common_functions.h" #include "common_functions.h"
#endif /* __device_emulation */ #endif /* __device_emulation */
/* UTILITY MACROS */ /* UTILITY MACROS */
#define __device__global_var(var) \ #define __device__global_var(var) \
__device_var(var) __device_var(var)
#define __name__global_var(var) \ #define __name__global_var(var) \
#var #var
#define __host__global_var(var) \ #define __host__global_var(var) \
__glob_pref_var(var) __glob_pref_var(var)
#define __device__shadow_var(c, cpp) \ #define __device__shadow_var(c, cpp) \
__device_var(c) __device_var(c)
#define __name__shadow_var(c, cpp) \
#c
#define __name__text_var(c, cpp) \
#c
#define __host__shadow_var(c, cpp) \ #define __host__shadow_var(c, cpp) \
__shadow_pref_var(c, cpp) __shadow_pref_var(c, cpp)
#if defined(_WIN32) && defined(__cplusplus) #if defined(_WIN32)
#if defined(__cplusplus)
#pragma warning(disable: 4099) #pragma warning(disable: 4099)
#endif /* _WIN32 && __cplusplus */ #endif /* __cplusplus */
#if !defined(_WIN64)
#pragma warning(disable: 4408)
#endif /* !_WIN64 */
#endif /* _WIN32 */
#endif /* !__CUDA_INTERNAL_COMPILATION__ */ #endif /* !__CUDA_INTERNAL_COMPILATION__ */
 End of changes. 14 change blocks. 
24 lines changed or deleted 35 lines changed or added


 math_constants.h   math_constants.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 107 skipping to change at line 107
#define CUDART_PIO4_HI 7.8539816339744828e-1 #define CUDART_PIO4_HI 7.8539816339744828e-1
#define CUDART_PIO4_LO 3.0616169978683830e-17 #define CUDART_PIO4_LO 3.0616169978683830e-17
#define CUDART_PIO2 1.5707963267948966e+0 #define CUDART_PIO2 1.5707963267948966e+0
#define CUDART_PIO2_HI 1.5707963267948966e+0 #define CUDART_PIO2_HI 1.5707963267948966e+0
#define CUDART_PIO2_LO 6.1232339957367660e-17 #define CUDART_PIO2_LO 6.1232339957367660e-17
#define CUDART_3PIO4 2.3561944901923448e+0 #define CUDART_3PIO4 2.3561944901923448e+0
#define CUDART_2_OVER_PI 6.3661977236758138e-1 #define CUDART_2_OVER_PI 6.3661977236758138e-1
#define CUDART_PI 3.1415926535897931e+0 #define CUDART_PI 3.1415926535897931e+0
#define CUDART_PI_HI 3.1415926535897931e+0 #define CUDART_PI_HI 3.1415926535897931e+0
#define CUDART_PI_LO 1.2246467991473532e-16 #define CUDART_PI_LO 1.2246467991473532e-16
#define CUDART_SQRT_2PI 2.5066282746310007e+0
#define CUDART_SQRT_2PI_HI 2.5066282746310007e+0 #define CUDART_SQRT_2PI_HI 2.5066282746310007e+0
#define CUDART_SQRT_2PI_LO (-1.8328579980459167e-16) #define CUDART_SQRT_2PI_LO (-1.8328579980459167e-16)
#define CUDART_SQRT_PIO2 1.2533141373155003e+0
#define CUDART_SQRT_PIO2_HI 1.2533141373155003e+0 #define CUDART_SQRT_PIO2_HI 1.2533141373155003e+0
#define CUDART_SQRT_PIO2_LO (-9.1642899902295834e-17) #define CUDART_SQRT_PIO2_LO (-9.1642899902295834e-17)
#define CUDART_L2E 1.4426950408889634e+0 #define CUDART_L2E 1.4426950408889634e+0
#define CUDART_L2E_HI 1.4426950408889634e+0 #define CUDART_L2E_HI 1.4426950408889634e+0
#define CUDART_L2E_LO 2.0355273740931033e-17 #define CUDART_L2E_LO 2.0355273740931033e-17
#define CUDART_L2T 3.3219280948873622e+0 #define CUDART_L2T 3.3219280948873622e+0
#define CUDART_LG2 3.0102999566398120e-1 #define CUDART_LG2 3.0102999566398120e-1
#define CUDART_LG2_HI 3.0102999566398120e-1 #define CUDART_LG2_HI 3.0102999566398120e-1
#define CUDART_LG2_LO (-2.8037281277851704e-18) #define CUDART_LG2_LO (-2.8037281277851704e-18)
#define CUDART_LGE 4.3429448190325182e-1 #define CUDART_LGE 4.3429448190325182e-1
 End of changes. 3 change blocks. 
1 lines changed or deleted 3 lines changed or added


 math_functions_dbl_ptx1.h   math_functions_dbl_ptx1.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 115 skipping to change at line 115
__device_func__(double __cuda_copysign(double a, double b)) __device_func__(double __cuda_copysign(double a, double b))
{ {
return (double)__cuda_copysignf((float)a, (float)b); return (double)__cuda_copysignf((float)a, (float)b);
} }
__device_func__(double __cuda_sin(double a)) __device_func__(double __cuda_sin(double a))
{ {
return (double)__cuda_sinf((float)a); return (double)__cuda_sinf((float)a);
} }
__device_func__(double __cuda_sinpi(double a))
{
return (double)__cuda_sinpif((float)a);
}
__device_func__(double __cuda_cos(double a)) __device_func__(double __cuda_cos(double a))
{ {
return (double)__cuda_cosf((float)a); return (double)__cuda_cosf((float)a);
} }
__device_func__(void __cuda_sincos(double a, double *sptr, double *cptr)) __device_func__(void __cuda_sincos(double a, double *sptr, double *cptr))
{ {
float fs, fc; float fs, fc;
__cuda_sincosf((float)a, &fs, &fc); __cuda_sincosf((float)a, &fs, &fc);
skipping to change at line 235 skipping to change at line 240
__device_func__(double __cuda_hypot(double a, double b)) __device_func__(double __cuda_hypot(double a, double b))
{ {
return (double)__cuda_hypotf((float)a, (float)b); return (double)__cuda_hypotf((float)a, (float)b);
} }
__device_func__(double __cuda_cbrt(double a)) __device_func__(double __cuda_cbrt(double a))
{ {
return (double)__cuda_cbrtf((float)a); return (double)__cuda_cbrtf((float)a);
} }
__device_func__(double __cuda_rcbrt(double a))
{
return (double)__cuda_rcbrtf((float)a);
}
__device_func__(double __cuda_erf(double a)) __device_func__(double __cuda_erf(double a))
{ {
return (double)__cuda_erff((float)a); return (double)__cuda_erff((float)a);
} }
__device_func__(double __cuda_erfinv(double a)) __device_func__(double __cuda_erfinv(double a))
{ {
return (double)__cuda_erfinvf((float)a); return (double)__cuda_erfinvf((float)a);
} }
 End of changes. 3 change blocks. 
1 lines changed or deleted 11 lines changed or added


 math_functions_dbl_ptx3.h   math_functions_dbl_ptx3.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 197 skipping to change at line 197
{ {
int alo, ahi, bhi; int alo, ahi, bhi;
bhi = __double2hiint(b); bhi = __double2hiint(b);
alo = __double2loint(a); alo = __double2loint(a);
ahi = __double2hiint(a); ahi = __double2hiint(a);
ahi = (bhi & 0x80000000) | (ahi & ~0x80000000); ahi = (bhi & 0x80000000) | (ahi & ~0x80000000);
return __hiloint2double(ahi, alo); return __hiloint2double(ahi, alo);
} }
/* like copysign, but requires that argument a is postive */
__device_func__(double __internal_copysign_pos(double a, double b))
{
int alo, ahi, bhi;
bhi = __double2hiint(b);
alo = __double2loint(a);
ahi = __double2hiint(a);
ahi = (bhi & 0x80000000) | ahi;
return __hiloint2double(ahi, alo);
}
/* 1152 bits of 2/PI for Payne-Hanek style argument reduction. */ /* 1152 bits of 2/PI for Payne-Hanek style argument reduction. */
static __constant__ unsigned long long int __cudart_i2opi_d [] = { static __constant__ unsigned long long int __cudart_i2opi_d [] = {
0x6bfb5fb11f8d5d08ULL, 0x6bfb5fb11f8d5d08ULL,
0x3d0739f78a5292eaULL, 0x3d0739f78a5292eaULL,
0x7527bac7ebe5f17bULL, 0x7527bac7ebe5f17bULL,
0x4f463f669e5fea2dULL, 0x4f463f669e5fea2dULL,
0x6d367ecf27cb09b7ULL, 0x6d367ecf27cb09b7ULL,
0xef2f118b5a0a6d1fULL, 0xef2f118b5a0a6d1fULL,
0x1ff897ffde05980fULL, 0x1ff897ffde05980fULL,
0x9c845f8bbdf9283bULL, 0x9c845f8bbdf9283bULL,
skipping to change at line 472 skipping to change at line 484
z = __internal_cos_kerneld(z); z = __internal_cos_kerneld(z);
} else { } else {
z = __internal_sin_kerneld(z); z = __internal_sin_kerneld(z);
} }
if (i & 2) { if (i & 2) {
z = -z; z = -z;
} }
return z; return z;
} }
__device_func__(double __cuda_sinpi(double a))
{
double z;
double fi;
int i;
if (__cuda___isinf(a) || (a == CUDART_ZERO)) {
return __dmul_rn(a, CUDART_ZERO);
}
/* IEEE-754: sinPi(+n) is +0 and sinPi(-n) is -0 for positive integers n.
*/
if (a == __cuda_trunc(a)) {
return __longlong_as_double(__double_as_longlong(a)&0x8000000000000000U
LL);
}
fi = __cuda_rint (a * 2.0);
z = __fma_rn (fi, -0.5, a);
z = __fma_rn (z, CUDART_PI_HI, z * CUDART_PI_LO);
i = (int)(((long long)fi) & 3);
if (i & 1) {
z = __internal_cos_kerneld(z);
} else {
z = __internal_sin_kerneld(z);
}
if (i & 2) {
z = -z;
}
return z;
}
__device_func__(double __cuda_cos(double a)) __device_func__(double __cuda_cos(double a))
{ {
double z; double z;
int i; int i;
if (__cuda___isinf(a)) { if (__cuda___isinf(a)) {
return CUDART_NAN; return CUDART_NAN;
} }
z = __internal_trig_reduction_kerneld(a, &i); z = __internal_trig_reduction_kerneld(a, &i);
/* here, abs(z) <= pi/4, and i has the quadrant */ /* here, abs(z) <= pi/4, and i has the quadrant */
i++; i++;
skipping to change at line 531 skipping to change at line 571
} }
*sptr = s; *sptr = s;
*cptr = c; *cptr = c;
} }
__device_func__(double __cuda_tan(double a)) __device_func__(double __cuda_tan(double a))
{ {
double z; double z;
int i; int i;
if (__cuda___isinf(a)) { if (__cuda___isinf(a)) {
return CUDART_NAN; return __dadd_rn (a, -a); /* return NaN */
} }
z = __internal_trig_reduction_kerneld(a, &i); z = __internal_trig_reduction_kerneld(a, &i);
/* here, abs(z) <= pi/4, and i has the quadrant */ /* here, abs(z) <= pi/4, and i has the quadrant */
z = __internal_tan_kerneld(z, i & 1); z = __internal_tan_kerneld(z, i & 1);
return z; return z;
} }
__device_func__(double __cuda_log(double a)) __device_func__(double __cuda_log(double a))
{ {
double m, f, g, u, v, tmp, q, ulo, log_lo, log_hi; double m, f, g, u, v, tmp, q, ulo, log_lo, log_hi;
skipping to change at line 582 skipping to change at line 622
/* u = 2.0 * (m - 1.0) / (m + 1.0) */ /* u = 2.0 * (m - 1.0) / (m + 1.0) */
v = u * u; v = u * u;
q = 6.7261411553826339E-2/65536.0; q = 6.7261411553826339E-2/65536.0;
q = __fma_rn (q, v, 6.6133829643643394E-2/16384.0); q = __fma_rn (q, v, 6.6133829643643394E-2/16384.0);
q = __fma_rn (q, v, 7.6940931149150890E-2/4096.0); q = __fma_rn (q, v, 7.6940931149150890E-2/4096.0);
q = __fma_rn (q, v, 9.0908745692137444E-2/1024.0); q = __fma_rn (q, v, 9.0908745692137444E-2/1024.0);
q = __fma_rn (q, v, 1.1111111499059706E-1/256.0); q = __fma_rn (q, v, 1.1111111499059706E-1/256.0);
q = __fma_rn (q, v, 1.4285714283305975E-1/64.0); q = __fma_rn (q, v, 1.4285714283305975E-1/64.0);
q = __fma_rn (q, v, 2.0000000000007223E-1/16.0); q = __fma_rn (q, v, 2.0000000000007223E-1/16.0);
q = __fma_rn (q, v, 3.3333333333333326E-1/4.0); q = __fma_rn (q, v, 3.3333333333333326E-1/4.0);
tmp = __internal_twice (f - u); tmp = 2.0 * (f - u);
tmp = __fma_rn (-u, f, tmp); // tmp = remainder of division tmp = __fma_rn (-u, f, tmp); // tmp = remainder of division
ulo = g * tmp; // less significant quotient bits ulo = g * tmp; // less significant quotient bits
/* u + ulo = 2.0 * (m - 1.0) / (m + 1.0) to more than double precision */ /* u + ulo = 2.0 * (m - 1.0) / (m + 1.0) to more than double precision */
q = q * v; q = q * v;
q = q * u; q = q * u;
/* log_hi + log_lo = log(m) to more than double precision */ /* log_hi + log_lo = log(m) to more than double precision */
log_hi = u; log_hi = u;
log_lo = ulo + q; log_lo = ulo + q;
/* log_hi + log_lo = log(m)+e*log(2)=log(a) to more than double precisi on*/ /* log_hi + log_lo = log(m)+e*log(2)=log(a) to more than double precisi on*/
q = __fma_rn ( e, CUDART_LN2_HI, log_hi); q = __fma_rn ( e, CUDART_LN2_HI, log_hi);
skipping to change at line 700 skipping to change at line 740
v = u * u; v = u * u;
q = 6.6253631649203309E-2/65536.0; q = 6.6253631649203309E-2/65536.0;
q = __fma_rn (q, v, 6.6250935587260612E-2/16384.0); q = __fma_rn (q, v, 6.6250935587260612E-2/16384.0);
q = __fma_rn (q, v, 7.6935437806732829E-2/4096.0); q = __fma_rn (q, v, 7.6935437806732829E-2/4096.0);
q = __fma_rn (q, v, 9.0908878711093280E-2/1024.0); q = __fma_rn (q, v, 9.0908878711093280E-2/1024.0);
q = __fma_rn (q, v, 1.1111111322892790E-1/256.0); q = __fma_rn (q, v, 1.1111111322892790E-1/256.0);
q = __fma_rn (q, v, 1.4285714284546502E-1/64.0); q = __fma_rn (q, v, 1.4285714284546502E-1/64.0);
q = __fma_rn (q, v, 2.0000000000003113E-1/16.0); q = __fma_rn (q, v, 2.0000000000003113E-1/16.0);
q = q * v; q = q * v;
/* u + ulo = 2.0 * (m - 1.0) / (m + 1.0) to more than double precision */ /* u + ulo = 2.0 * (m - 1.0) / (m + 1.0) to more than double precision */
tmp = __internal_twice (f - u); tmp = 2.0 * (f - u);
tmp = __fma_rn (-u, f, tmp); // tmp = remainder of division tmp = __fma_rn (-u, f, tmp); // tmp = remainder of division
ulo = g * tmp; // less significand quotient bits ulo = g * tmp; // less significand quotient bits
/* switch to double-double at this point */ /* switch to double-double at this point */
qq.y = q; qq.y = q;
qq.x = 0.0; qq.x = 0.0;
uu.y = u; uu.y = u;
uu.x = ulo; uu.x = ulo;
cc.y = 3.3333333333333331E-1/4.0; cc.y = 3.3333333333333331E-1/4.0;
cc.x = -9.8201492846582465E-18/4.0; cc.x = -9.8201492846582465E-18/4.0;
qq = __internal_ddadd_xgty (cc, qq); qq = __internal_ddadd_xgty (cc, qq);
skipping to change at line 936 skipping to change at line 976
} else if (a < 2.0) { /* work around accuracy issue in vicinity of 1.4 */ } else if (a < 2.0) { /* work around accuracy issue in vicinity of 1.4 */
z = __cuda_expm1(a); z = __cuda_expm1(a);
z = __internal_half (z + z / (z + 1.0)); z = __internal_half (z + z / (z + 1.0));
} else { } else {
z = __internal_exp_kernel(a, -1); z = __internal_exp_kernel(a, -1);
z = z + (1.0 / (-4.0 * z)); z = z + (1.0 / (-4.0 * z));
if (a >= CUDART_LN2_X_1025) { if (a >= CUDART_LN2_X_1025) {
z = CUDART_INF; /* overflow -> infinity */ z = CUDART_INF; /* overflow -> infinity */
} }
} }
z = __cuda_copysign(z, s); z = __internal_copysign_pos(z, s);
return z; return z;
} }
__device_func__(double __cuda_tanh(double a)) __device_func__(double __cuda_tanh(double a))
{ {
double t; double t;
t = __cuda_fabs(a); t = __cuda_fabs(a);
if (t >= 0.55) { if (t >= 0.55) {
double s; double s;
s = 1.0 - 2.0 / (__internal_exp_kernel(2.0 * t, 0) + 1.0); s = 1.0 - 2.0 / (__internal_exp_kernel(2.0 * t, 0) + 1.0);
if (t > 350.0) { if (t > 350.0) {
s = 1.0; /* overflow -> 1.0 */ s = 1.0; /* overflow -> 1.0 */
} }
a = __cuda_copysign(s, a); a = __internal_copysign_pos(s, a);
} else { } else {
double a2; double a2;
a2 = a * a; a2 = a * a;
t = 5.102147717274194E-005; t = 5.102147717274194E-005;
t = __fma_rn (t, a2, -2.103023983278533E-004); t = __fma_rn (t, a2, -2.103023983278533E-004);
t = __fma_rn (t, a2, 5.791370145050539E-004); t = __fma_rn (t, a2, 5.791370145050539E-004);
t = __fma_rn (t, a2, -1.453216755611004E-003); t = __fma_rn (t, a2, -1.453216755611004E-003);
t = __fma_rn (t, a2, 3.591719696944118E-003); t = __fma_rn (t, a2, 3.591719696944118E-003);
t = __fma_rn (t, a2, -8.863194503940334E-003); t = __fma_rn (t, a2, -8.863194503940334E-003);
t = __fma_rn (t, a2, 2.186948597477980E-002); t = __fma_rn (t, a2, 2.186948597477980E-002);
t = __fma_rn (t, a2, -5.396825387607743E-002); t = __fma_rn (t, a2, -5.396825387607743E-002);
t = __fma_rn (t, a2, 1.333333333316870E-001); t = __fma_rn (t, a2, 1.333333333316870E-001);
t = __fma_rn (t, a2, -3.333333333333232E-001); t = __fma_rn (t, a2, -3.333333333333232E-001);
t = t * a2; t = t * a2;
t = __fma_rn (t, a, a); t = __fma_rn (t, a, a);
a = __cuda_copysign(t, a); a = __internal_copysign_pos(t, a);
} }
return a; return a;
} }
__device_func__(double __internal_atan_kernel(double a)) __device_func__(double __internal_atan_kernel(double a))
{ {
double t, a2; double t, a2;
a2 = a * a; a2 = a * a;
t = -2.0258553044438358E-005 ; t = -2.0258553044438358E-005 ;
t = __fma_rn (t, a2, 2.2302240345758510E-004); t = __fma_rn (t, a2, 2.2302240345758510E-004);
skipping to change at line 1022 skipping to change at line 1062
t3 = __cuda___signbit(b) ? CUDART_3PIO4 : CUDART_PIO4; t3 = __cuda___signbit(b) ? CUDART_3PIO4 : CUDART_PIO4;
} else { } else {
t0 = __cuda_fmax (t1, t3); t0 = __cuda_fmax (t1, t3);
t1 = __cuda_fmin (t1, t3); t1 = __cuda_fmin (t1, t3);
t3 = t1 / t0; t3 = t1 / t0;
t3 = __internal_atan_kernel(t3); t3 = __internal_atan_kernel(t3);
/* Map result according to octant. */ /* Map result according to octant. */
if (__cuda_fabs(a) > __cuda_fabs(b)) t3 = CUDART_PIO2 - t3; if (__cuda_fabs(a) > __cuda_fabs(b)) t3 = CUDART_PIO2 - t3;
if (b < 0.0) t3 = CUDART_PI - t3; if (b < 0.0) t3 = CUDART_PI - t3;
} }
t3 = __cuda_copysign(t3, a); t3 = __internal_copysign_pos(t3, a);
return t3; return t3;
} }
__device_func__(double __cuda_atan(double a)) __device_func__(double __cuda_atan(double a))
{ {
double t0, t1; double t0, t1;
/* reduce argument to first octant */ /* reduce argument to first octant */
t0 = __cuda_fabs(a); t0 = __cuda_fabs(a);
t1 = t0; t1 = t0;
if (t0 > 1.0) { if (t0 > 1.0) {
t1 = 1.0 / t1; t1 = 1.0 / t1;
} }
/* approximate atan(r) in first octant */ /* approximate atan(r) in first octant */
t1 = __internal_atan_kernel(t1); t1 = __internal_atan_kernel(t1);
/* map result according to octant. */ /* map result according to octant. */
if (t0 > 1.0) { if (t0 > 1.0) {
t1 = CUDART_PIO2 - t1; t1 = CUDART_PIO2 - t1;
} }
return __cuda_copysign (t1, a); return __internal_copysign_pos(t1, a);
} }
/* b should be the square of a */ /* b should be the square of a */
__device_func__(double __internal_asin_kernel(double a, double b)) __device_func__(double __internal_asin_kernel(double a, double b))
{ {
double r; double r;
r = 6.259798167646803E-002; r = 6.259798167646803E-002;
r = __fma_rn (r, b, -7.620591484676952E-002); r = __fma_rn (r, b, -7.620591484676952E-002);
r = __fma_rn (r, b, 6.686894879337643E-002); r = __fma_rn (r, b, 6.686894879337643E-002);
r = __fma_rn (r, b, -1.787828218369301E-002); r = __fma_rn (r, b, -1.787828218369301E-002);
skipping to change at line 1076 skipping to change at line 1116
{ {
double fa, t0, t1; double fa, t0, t1;
int ihi, ahi; int ihi, ahi;
ahi = __double2hiint(a); ahi = __double2hiint(a);
fa = __cuda_fabs(a); fa = __cuda_fabs(a);
ihi = __double2hiint(fa); ihi = __double2hiint(fa);
if (ihi < 0x3fe26666) { if (ihi < 0x3fe26666) {
t1 = fa * fa; t1 = fa * fa;
t1 = __internal_asin_kernel (fa, t1); t1 = __internal_asin_kernel (fa, t1);
t1 = __fma_rn (t1, fa, fa); t1 = __fma_rn (t1, fa, fa);
t1 = __cuda_copysign(t1, a); t1 = __internal_copysign_pos(t1, a);
} else { } else {
t1 = __fma_rn (-0.5, fa, 0.5); t1 = __fma_rn (-0.5, fa, 0.5);
t0 = __cuda_sqrt (t1); t0 = __cuda_sqrt (t1);
t1 = __internal_asin_kernel (t0, t1); t1 = __internal_asin_kernel (t0, t1);
t0 = -2.0 * t0; t0 = -2.0 * t0;
t1 = __fma_rn (t0, t1, CUDART_PIO2_LO); t1 = __fma_rn (t0, t1, CUDART_PIO2_LO);
t0 = t0 + CUDART_PIO4_HI; t0 = t0 + CUDART_PIO4_HI;
t1 = t0 + t1; t1 = t0 + t1;
t1 = t1 + CUDART_PIO4_HI; t1 = t1 + CUDART_PIO4_HI;
if (ahi < 0x3ff00000) { if (ahi < 0x3ff00000) {
t1 = __cuda_copysign(t1, a); t1 = __internal_copysign_pos(t1, a);
} }
} }
return t1; return t1;
} }
__device_func__(double __cuda_acos(double a)) __device_func__(double __cuda_acos(double a))
{ {
double t0, t1; double t0, t1;
int ihi, ahi; int ihi, ahi;
skipping to change at line 1151 skipping to change at line 1191
/* for large a, acosh = log(2*a) */ /* for large a, acosh = log(2*a) */
return CUDART_LN2 + __cuda_log(a); return CUDART_LN2 + __cuda_log(a);
} else { } else {
t = t + __cuda_sqrt(__fma_rn(a, t, t)); t = t + __cuda_sqrt(__fma_rn(a, t, t));
return __cuda_log1p(t); return __cuda_log1p(t);
} }
} }
__device_func__(double __cuda_asinh(double a)) __device_func__(double __cuda_asinh(double a))
{ {
#if SLIGHTLY_MORE_ACCURATE_BUT_SLOWER
double fa, oofa, t;
fa = __cuda_fabs(a);
if (fa > 8.9884657373828596e+307) { /* prevent intermediate underflow */
t = CUDART_LN2 + __cuda_log(fa);
} else {
oofa = 1.0 / fa;
t = fa + fa / (oofa + __cuda_sqrt(__fma_rn(oofa, oofa, 1.0)));
t = __cuda_log1p(t);
}
#else
double fa, t; double fa, t;
fa = __cuda_fabs(a); fa = __cuda_fabs(a);
if (fa > 1.0e153) { if (__double2hiint(fa) >= 0x5ff00000) { /* prevent intermediate underflow */
t = CUDART_LN2 + __cuda_log(fa); t = CUDART_LN2 + __cuda_log(fa);
} else { } else {
t = fa * fa; t = fa * fa;
t = __cuda_log1p (fa + t / (1.0 + __cuda_sqrt(1.0 + t))); t = __cuda_log1p (fa + t / (1.0 + __cuda_sqrt(1.0 + t)));
} }
#endif return __internal_copysign_pos(t, a);
return __cuda_copysign(t, a);
} }
__device_func__(double __cuda_atanh(double a)) __device_func__(double __cuda_atanh(double a))
{ {
double fa, t; double fa, t;
#if !defined(__CUDABE__) #if !defined(__CUDABE__)
if (__cuda___isnan(a)) { if (__cuda___isnan(a)) {
return a + a; return a + a;
} }
#endif #endif
skipping to change at line 1198 skipping to change at line 1226
} }
#endif #endif
if (__cuda___signbit(a)) { if (__cuda___signbit(a)) {
t = -t; t = -t;
} }
return t; return t;
} }
__device_func__(double __cuda_hypot(double a, double b)) __device_func__(double __cuda_hypot(double a, double b))
{ {
double v, w, t; double v, w, t, fa, fb;
if (__cuda___isinf(a) || __cuda___isinf(b)) {
return CUDART_INF; fa = __cuda_fabs(a);
} fb = __cuda_fabs(b);
if (__cuda___isnan(a) || __cuda___isnan(b)) { v = __cuda_fmax(fa, fb);
return a + b; w = __cuda_fmin(fa, fb);
}
a = __cuda_fabs(a);
b = __cuda_fabs(b);
v = __cuda_fmax(a, b);
w = __cuda_fmin(a, b);
t = w / v; t = w / v;
t = __fma_rn (t, t, 1.0); t = __fma_rn (t, t, 1.0);
t = v * __cuda_sqrt(t); t = v * __cuda_sqrt(t);
if (v == 0.0) { if (v == 0.0) {
t = v + w; t = v + w; /* fixup for zero divide */
}
if ((!(fa <= CUDART_INF)) || (!(fb <= CUDART_INF))) {
t = a + b; /* fixup for NaNs */
}
if (v == CUDART_INF) {
t = v + w; /* fixup for infinities */
} }
return t; return t;
} }
__device_func__(double __cuda_cbrt(double a)) __device_func__(double __cuda_cbrt(double a))
{ {
float s; float s;
double t, r; double t, r;
int ilo, ihi, expo, nexpo, denorm; int ilo, ihi, expo, nexpo, denorm;
if ((a == 0.0) || !(__cuda___finite(a))) { if ((a == 0.0) || !(__cuda___finite(a))) {
skipping to change at line 1245 skipping to change at line 1274
ilo = __double2loint(t); ilo = __double2loint(t);
ihi = __double2hiint(t); ihi = __double2hiint(t);
expo = ((int)((unsigned int)ihi >> 20) & 0x7ff); expo = ((int)((unsigned int)ihi >> 20) & 0x7ff);
} }
/* scale into float range */ /* scale into float range */
nexpo = __float2int_rn(CUDART_THIRD_F * (float)(expo - 1022)); nexpo = __float2int_rn(CUDART_THIRD_F * (float)(expo - 1022));
ihi -= (3 * nexpo) << 20; ihi -= (3 * nexpo) << 20;
r = __hiloint2double(ihi, ilo); r = __hiloint2double(ihi, ilo);
/* initial approximation */ /* initial approximation */
s = (float)r; s = (float)r;
t = __cuda_exp2f(CUDART_THIRD_F * __log2f(s)); t = __cuda_exp2f(-CUDART_THIRD_F * __log2f(s)); /* approximate invcbrt
/* refine approximation */ */
t = t - (t - (r / (t * t))) * CUDART_THIRD; t = __fma_rn(__fma_rn(t*t,-r*t,1.0), CUDART_THIRD*t, t);/* refine invcbrt
t = t - (t - (r / (t * t))) * CUDART_THIRD; */
t = r * t * t; /* approximate cbrt
*/
t = __fma_rn(t - (r / (t * t)), -CUDART_THIRD, t); /* refine cbrt
*/
/* scale result back into double range */ /* scale result back into double range */
ilo = __double2loint(t); ilo = __double2loint(t);
ihi = __double2hiint(t); ihi = __double2hiint(t);
ihi += (nexpo - denorm) << 20; ihi += (nexpo - denorm) << 20;
t = __hiloint2double(ihi, ilo); t = __hiloint2double(ihi, ilo);
if (__cuda___signbit(a)) { if (__cuda___signbit(a)) {
t = -t; t = -t;
}
return t;
}
__device_func__(double __cuda_rcbrt(double a))
{
float s;
double t, r;
int ilo, ihi, expo, nexpo, denorm;
if ((a == 0.0) || !(__cuda___finite(a))) {
return 1.0 / a;
}
t = __cuda_fabs(a);
ilo = __double2loint(t);
ihi = __double2hiint(t);
expo = ((int)((unsigned int)ihi >> 20) & 0x7ff);
denorm = 0;
if (expo == 0) {
/* denormal */
t = t * CUDART_TWO_TO_54;
denorm = 18;
ilo = __double2loint(t);
ihi = __double2hiint(t);
expo = ((int)((unsigned int)ihi >> 20) & 0x7ff);
}
/* scale into float range */
nexpo = __float2int_rn(CUDART_THIRD_F * (float)(expo - 1022));
ihi -= (3 * nexpo) << 20;
r = __hiloint2double(ihi, ilo);
/* initial approximation */
s = (float)r;
t = __cuda_exp2f(-CUDART_THIRD_F * __log2f(s)); /* approximate invcbrt
*/
t = __fma_rn(__fma_rn(t*t,-r*t,1.0), CUDART_THIRD*t, t);/* refine invcbrt
*/
t = __fma_rn(__fma_rn(t*t,-r*t,1.0), CUDART_THIRD*t, t);/* refine invcbrt
*/
/* scale result back into double range */
ilo = __double2loint(t);
ihi = __double2hiint(t);
ihi += (-(nexpo - denorm)) << 20;
t = __hiloint2double(ihi, ilo);
if (__cuda___signbit(a)) {
t = -t;
} }
return t; return t;
} }
__device_func__(double __internal_accurate_pow(double a, double b)) __device_func__(double __internal_accurate_pow(double a, double b))
{ {
double2 loga; double2 loga;
double2 prod; double2 prod;
double t_hi, t_lo; double t_hi, t_lo;
double tmp; double tmp;
#if !defined(__CUDABE__) && defined(__linux__) && !defined(__LP64__) #if !defined(__CUDABE__) && defined(__linux__) && !defined(__LP64__)
volatile double e; volatile
#else
double e;
#endif #endif
double e;
/* compute log(a) in double-double format*/ /* compute log(a) in double-double format*/
loga = __internal_log_ext_prec(a); loga = __internal_log_ext_prec(a);
/* prevent overflow during extended precision multiply */ /* prevent overflow during extended precision multiply */
if (__cuda_fabs(b) > 1e304) b *= 1.220703125e-4; if (__cuda_fabs(b) > 1e304) b *= 1.220703125e-4;
/* compute b * log(a) in double-double format */ /* compute b * log(a) in double-double format */
t_hi = loga.y * b; t_hi = loga.y * b;
t_lo = __fma_rn (loga.y, b, -t_hi); t_lo = __fma_rn (loga.y, b, -t_hi);
t_lo = __fma_rn (loga.x, b, t_lo); t_lo = __fma_rn (loga.x, b, t_lo);
skipping to change at line 1319 skipping to change at line 1389
if (__cuda___isinf(b)) { if (__cuda___isinf(b)) {
if (a == -1.0) { if (a == -1.0) {
return 1.0; return 1.0;
} }
t = __cuda_fabs(a) > 1.0 ? CUDART_INF : CUDART_ZERO; t = __cuda_fabs(a) > 1.0 ? CUDART_INF : CUDART_ZERO;
if (b < CUDART_ZERO) { if (b < CUDART_ZERO) {
t = 1.0 / t; t = 1.0 / t;
} }
return t; return t;
} }
bIsOddInteger = (b - (2.0 * __cuda_floor(0.5 * b))) == 1.0; bIsOddInteger = __cuda_fabs(b - (2.0f * __cuda_trunc(0.5 * b))) == 1.0;
if (a == CUDART_ZERO) { if (a == CUDART_ZERO) {
t = bIsOddInteger ? a : CUDART_ZERO; t = bIsOddInteger ? a : CUDART_ZERO;
if (b < CUDART_ZERO) { if (b < CUDART_ZERO) {
t = 1.0 / t; t = 1.0 / t;
} }
return t; return t;
} }
if (a == -CUDART_INF) { if (a == -CUDART_INF) {
t = (b < CUDART_ZERO) ? -1.0/a : -a; t = (b < CUDART_ZERO) ? -1.0/a : -a;
if (bIsOddInteger) { if (bIsOddInteger) {
skipping to change at line 1348 skipping to change at line 1418
t = __internal_accurate_pow(t, b); t = __internal_accurate_pow(t, b);
if ((a < CUDART_ZERO) && bIsOddInteger) { if ((a < CUDART_ZERO) && bIsOddInteger) {
t = __longlong_as_double(__double_as_longlong(t) ^ 0x8000000000000000UL L); t = __longlong_as_double(__double_as_longlong(t) ^ 0x8000000000000000UL L);
} }
return t; return t;
} }
__device_func__(double __cuda_erf(double a)) __device_func__(double __cuda_erf(double a))
{ {
double t, r, q; double t, r, q;
#if !defined(__CUDABE__)
if (__cuda___isnan(a)) {
return a + a;
}
#endif
t = __cuda_fabs(a); t = __cuda_fabs(a);
if (t >= 1.0) { if (t >= 1.0) {
r = -1.28836351230756500E-019; r = -1.28836351230756500E-019;
r = __fma_rn (r, t, 1.30597472161093370E-017); r = __fma_rn (r, t, 1.30597472161093370E-017);
r = __fma_rn (r, t, -6.33924401259620500E-016); r = __fma_rn (r, t, -6.33924401259620500E-016);
r = __fma_rn (r, t, 1.96231865908940140E-014); r = __fma_rn (r, t, 1.96231865908940140E-014);
r = __fma_rn (r, t, -4.35272243559990750E-013); r = __fma_rn (r, t, -4.35272243559990750E-013);
r = __fma_rn (r, t, 7.37083927929352150E-012); r = __fma_rn (r, t, 7.37083927929352150E-012);
r = __fma_rn (r, t, -9.91402142550461630E-011); r = __fma_rn (r, t, -9.91402142550461630E-011);
r = __fma_rn (r, t, 1.08817017167760820E-009); r = __fma_rn (r, t, 1.08817017167760820E-009);
skipping to change at line 1388 skipping to change at line 1454
r = __fma_rn (r, t, 4.99394435612628580E-001); r = __fma_rn (r, t, 4.99394435612628580E-001);
r = __fma_rn (r, t, -7.52014596480123030E-001); r = __fma_rn (r, t, -7.52014596480123030E-001);
r = __fma_rn (r, t, 9.99933138314926250E-001); r = __fma_rn (r, t, 9.99933138314926250E-001);
r = __fma_rn (r, t, -1.12836725321102670E+000); r = __fma_rn (r, t, -1.12836725321102670E+000);
r = __fma_rn (r, t, 9.99998988715182450E-001); r = __fma_rn (r, t, 9.99998988715182450E-001);
q = __internal_exp_kernel(-t * t, 0); q = __internal_exp_kernel(-t * t, 0);
r = __fma_rn (r, -q, 1.0); r = __fma_rn (r, -q, 1.0);
if (t >= 6.5) { if (t >= 6.5) {
r = 1.0; r = 1.0;
} }
a = __cuda_copysign (r, a); a = __internal_copysign_pos(r, a);
} else { } else {
q = t * t; q = a * a;
r = -7.77946848895991420E-010; r = -7.77946848895991420E-010;
r = __fma_rn (r, q, 1.37109803980285950E-008); r = __fma_rn (r, q, 1.37109803980285950E-008);
r = __fma_rn (r, q, -1.62063137584932240E-007); r = __fma_rn (r, q, -1.62063137584932240E-007);
r = __fma_rn (r, q, 1.64471315712790040E-006); r = __fma_rn (r, q, 1.64471315712790040E-006);
r = __fma_rn (r, q, -1.49247123020098620E-005); r = __fma_rn (r, q, -1.49247123020098620E-005);
r = __fma_rn (r, q, 1.20552935769006260E-004); r = __fma_rn (r, q, 1.20552935769006260E-004);
r = __fma_rn (r, q, -8.54832592931448980E-004); r = __fma_rn (r, q, -8.54832592931448980E-004);
r = __fma_rn (r, q, 5.22397760611847340E-003); r = __fma_rn (r, q, 5.22397760611847340E-003);
r = __fma_rn (r, q, -2.68661706431114690E-002); r = __fma_rn (r, q, -2.68661706431114690E-002);
r = __fma_rn (r, q, 1.12837916709441850E-001); r = __fma_rn (r, q, 1.12837916709441850E-001);
skipping to change at line 1412 skipping to change at line 1478
r = __fma_rn (r, q, 1.12837916709551260E+000); r = __fma_rn (r, q, 1.12837916709551260E+000);
a = r * a; a = r * a;
} }
return a; return a;
} }
__device_func__(double __cuda_erfinv(double a)) __device_func__(double __cuda_erfinv(double a))
{ {
double fa, t; double fa, t;
fa = fabs(a); fa = __cuda_fabs(a);
if (fa >= 1.0) { if (fa >= 1.0) {
t = CUDART_NAN; /* NaN */ t = CUDART_NAN; /* NaN */
if (fa == 1.0) { if (fa == 1.0) {
t = a * CUDART_INF; /* Infinity */ t = a * CUDART_INF; /* Infinity */
} }
} else if (fa >= 0.9375) { } else if (fa >= 0.9375) {
/* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev
Approximations for the Inverse of the Error Function. Mathematics of Approximations for the Inverse of the Error Function. Mathematics of
Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 59 Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 59
*/ */
skipping to change at line 1588 skipping to change at line 1654
} }
return t; return t;
} }
__device_func__(double __cuda_erfc(double a)) __device_func__(double __cuda_erfc(double a))
{ {
double p, q, h, l; double p, q, h, l;
int ahi; int ahi;
ahi = __double2hiint(a); ahi = __double2hiint(a);
if (ahi < (int)0x3fe80000) { /* 0.75 */ if (ahi < (int)0x3fea0400) { /* 1665/2048 */
return 1.0 - __cuda_erf(a); return 1.0 - __cuda_erf(a);
} }
if (a > 27.3) {
return 0.0;
}
if (ahi < (int)0x40140000) { /* 5.0 */ if (ahi < (int)0x40140000) { /* 5.0 */
/* max error 7 ulps on [0.75, 5.0] */ /* On the interval [1665/2048, 5.0] the following approximation is used
p = 5.6418956292134603E-001; :
p = __fma_rn (p, a, 7.9573512229784757E+000); erfc(a) = (1.0 + 1/a * r(1/a)) * 1/a * 0.5 * exp(-a*a), where the ra
p = __fma_rn (p, a, 5.4297984550299049E+001); nge
p = __fma_rn (p, a, 2.2775657465890461E+002); of r(1/a) is approximately [-0.17, 0.11]. r(1/a) is computed by rati
p = __fma_rn (p, a, 6.2995529536738172E+002); onal
p = __fma_rn (p, a, 1.1508293767713837E+003); approximation.
p = __fma_rn (p, a, 1.3002167301542784E+003); */
p = __fma_rn (p, a, 7.2716547570180592E+002); double t;
q = a+ 1.4104035812651274E+001;
q = __fma_rn (q, a, 9.6740724349422138E+001); t = 1.0 / a;
q = __fma_rn (q, a, 4.1073916054746462E+002); p = -1.0000000252849461E+000;
q = __fma_rn (q, a, 1.1641974580374074E+003); p = __fma_rn (p, t, -7.3398971987771156E-001);
q = __fma_rn (q, a, 2.2344896486798129E+003); p = __fma_rn (p, t, -1.4685633784433072E-001);
q = __fma_rn (q, a, 2.8166572432808462E+003); p = __fma_rn (p, t, 1.2963557011001836E-001);
q = __fma_rn (q, a, 2.1207350981593036E+003); p = __fma_rn (p, t, 1.0901177826674287E-001);
q = __fma_rn (q, a, 7.2716547619708967E+002); p = __fma_rn (p, t, 3.9250612663155882E-002);
p = p / q; p = __fma_rn (p, t, 7.5883167167654269E-003);
p = __fma_rn (p, t, 6.6438196820856965E-004);
q = t + 2.7339900293714838E+000;
q = __fma_rn (q, t, 3.3580762542361291E+000);
q = __fma_rn (q, t, 2.4165688909166021E+000);
q = __fma_rn (q, t, 1.1092158770004934E+000);
q = __fma_rn (q, t, 3.2845571970789467E-001);
q = __fma_rn (q, t, 5.9110343116276186E-002);
q = __fma_rn (q, t, 5.1750858802842702E-003);
q = __fma_rn (q, t, 1.2937416364002241E-009);
q = 1.0 / q;
p = p * q;
p = p * t;
h = a * a; h = a * a;
l = __fma_rn (a, a, -h); l = __fma_rn (a, a, -h);
q = __internal_exp_kernel(-h, 0); q = __internal_exp_kernel(-h, -1);
q = __fma_rn (l, -q, q); q = __fma_rn (l, -q, q);
p = p * q; p = __fma_rn (p, q, q);
p = p * t;
} else { } else {
/* max error 4 ulps on [5, 27.3] */ /* max error 4 ulps on [5, 27.3] */
double ooa, ooasq; double ooa, ooasq;
ooa = 1.0 / a; ooa = 1.0 / a;
ooasq = ooa * ooa; ooasq = ooa * ooa;
p = -4.0025406686930527E+005; p = -4.0025406686930527E+005;
p = __fma_rn (p, ooasq, 1.4420582543942123E+005); p = __fma_rn (p, ooasq, 1.4420582543942123E+005);
p = __fma_rn (p, ooasq, -2.7664185780951841E+004); p = __fma_rn (p, ooasq, -2.7664185780951841E+004);
p = __fma_rn (p, ooasq, 4.1144611644767283E+003); p = __fma_rn (p, ooasq, 4.1144611644767283E+003);
skipping to change at line 1642 skipping to change at line 1715
p = __fma_rn (p, ooasq, -1.0578553994424316E+000); p = __fma_rn (p, ooasq, -1.0578553994424316E+000);
p = __fma_rn (p, ooasq, 4.2314218745087778E-001); p = __fma_rn (p, ooasq, 4.2314218745087778E-001);
p = __fma_rn (p, ooasq, -2.8209479177354962E-001); p = __fma_rn (p, ooasq, -2.8209479177354962E-001);
p = __fma_rn (p, ooasq, 5.6418958354775606E-001); p = __fma_rn (p, ooasq, 5.6418958354775606E-001);
h = a * a; h = a * a;
l = __fma_rn (a, a, -h); l = __fma_rn (a, a, -h);
q = __internal_exp_kernel(-h, 0); q = __internal_exp_kernel(-h, 0);
q = __fma_rn (l, -q, q); q = __fma_rn (l, -q, q);
p = p * ooa; p = p * ooa;
p = p * q; p = p * q;
if (a > 27.3) {
p = 0.0;
}
} }
return p; return p;
} }
/* approximate 1.0/(a*gamma(a)) on [-0.5,0.5] */ /* approximate 1.0/(a*gamma(a)) on [-0.5,0.5] */
__device_func__(double __internal_tgamma_kernel(double a)) __device_func__(double __internal_tgamma_kernel(double a))
{ {
double t; double t;
t = -4.42689340712524750E-010; t = -4.42689340712524750E-010;
t = __fma_rn (t, a, -2.02665918466589540E-007); t = __fma_rn (t, a, -2.02665918466589540E-007);
skipping to change at line 1691 skipping to change at line 1767
z = __fma_rn (z, x, 3.4722222222222220e-003); z = __fma_rn (z, x, 3.4722222222222220e-003);
z = __fma_rn (z, x, 8.3333333333333329e-002); z = __fma_rn (z, x, 8.3333333333333329e-002);
z = __fma_rn (z, x, 1.0000000000000000e+000); z = __fma_rn (z, x, 1.0000000000000000e+000);
return z; return z;
} }
__device_func__(double __internal_tgamma_stirling(double a)) __device_func__(double __internal_tgamma_stirling(double a))
{ {
if (a < 1.7162437695630274e+002) { if (a < 1.7162437695630274e+002) {
#if defined(__GNUC__) && !defined(__CUDABE__) #if defined(__GNUC__) && !defined(__CUDABE__)
volatile double t_hi, t_lo, e; volatile
#else
double t_hi, t_lo, e;
#endif #endif
double t_hi, t_lo, e;
double2 loga, prod; double2 loga, prod;
double z = __internal_stirling_poly (a); double z = __internal_stirling_poly (a);
double b = a - 0.5; double b = a - 0.5;
/* compute log(a) in double-double format*/ /* compute log(a) in double-double format*/
loga = __internal_log_ext_prec(a); loga = __internal_log_ext_prec(a);
/* compute (a - 0.5) * log(a) in double-double format */ /* compute (a - 0.5) * log(a) in double-double format */
t_hi = loga.y * b; t_hi = loga.y * b;
t_lo = __fma_rn (loga.y, b, -t_hi); t_lo = __fma_rn (loga.y, b, -t_hi);
skipping to change at line 1759 skipping to change at line 1835
xx = xx - 1.0; xx = xx - 1.0;
} }
xx = __internal_tgamma_kernel (xx); xx = __internal_tgamma_kernel (xx);
if (x < 0.5) { if (x < 0.5) {
xx = xx * x; xx = xx * x;
} }
s = s / xx; s = s / xx;
} else { } else {
xx = x; xx = x;
s = xx; s = xx;
if (x == __cuda_floor(x)) { if (x == __cuda_trunc(x)) {
return CUDART_NAN; return CUDART_NAN;
} }
while (xx < -0.5) { while (xx < -0.5) {
s = __fma_rn (s, xx, s); s = __fma_rn (s, xx, s);
xx = xx + 1.0; xx = xx + 1.0;
} }
xx = __internal_tgamma_kernel (xx); xx = __internal_tgamma_kernel (xx);
s = s * xx; s = s * xx;
s = 1.0 / s; s = 1.0 / s;
} }
return s; return s;
} else { } else {
if (x >= 0.0) { if (x >= 0.0) {
return __internal_tgamma_stirling (x); return __internal_tgamma_stirling (x);
} else { } else {
double t; double t;
int quot; int quot;
if (x == __cuda_floor(x)) { if (x == __cuda_trunc(x)) {
return CUDART_NAN; return CUDART_NAN;
} }
if (x < -185.0) { if (x < -185.0) {
int negative; int negative;
x = __cuda_floor(x); x = __cuda_floor(x);
negative = ((x - (2.0 * __cuda_floor(0.5 * x))) == 1.0); negative = ((x - (2.0 * __cuda_floor(0.5 * x))) == 1.0);
return negative ? CUDART_NEG_ZERO : CUDART_ZERO; return negative ? CUDART_NEG_ZERO : CUDART_ZERO;
} }
/* compute sin(pi*x) accurately */ /* compute sin(pi*x) accurately */
xx = __cuda_rint (__internal_twice(x)); xx = __cuda_rint (__internal_twice(x));
skipping to change at line 1948 skipping to change at line 2024
{ {
double t; double t;
double i; double i;
long long int quot; long long int quot;
if (__cuda___isnan(a)) { if (__cuda___isnan(a)) {
return a + a; return a + a;
} }
t = __internal_lgamma_pos(__cuda_fabs(a)); t = __internal_lgamma_pos(__cuda_fabs(a));
if (a >= 0.0) return t; if (a >= 0.0) return t;
a = __cuda_fabs(a); a = __cuda_fabs(a);
i = __cuda_floor(a); i = __cuda_trunc(a);
if (a == i) return CUDART_INF; /* a is an integer: return infinity */ if (a == i) return CUDART_INF; /* a is an integer: return infinity */
if (a < 1e-19) return -__cuda_log(a); if (a < 1e-19) return -__cuda_log(a);
i = __cuda_rint (2.0 * a); i = __cuda_rint (2.0 * a);
quot = (long long int)i; quot = (long long int)i;
i = __fma_rn (-0.5, i, a); i = __fma_rn (-0.5, i, a);
i = i * CUDART_PI; i = i * CUDART_PI;
if (quot & 1) { if (quot & 1) {
i = __internal_cos_kerneld(i); i = __internal_cos_kerneld(i);
} else { } else {
i = __internal_sin_kerneld(i); i = __internal_sin_kerneld(i);
skipping to change at line 2038 skipping to change at line 2114
return a; return a;
} }
__device_func__(double __cuda_modf(double a, double *b)) __device_func__(double __cuda_modf(double a, double *b))
{ {
double t; double t;
if (__cuda___finite(a)) { if (__cuda___finite(a)) {
t = __cuda_trunc(a); t = __cuda_trunc(a);
*b = t; *b = t;
t = a - t; t = a - t;
return __cuda_copysign(t, a); return __internal_copysign_pos(t, a);
} else if (__cuda___isinf(a)) { } else if (__cuda___isinf(a)) {
t = 0.0; t = 0.0;
*b = a; *b = a;
return __cuda_copysign(t, a); return __internal_copysign_pos(t, a);
} else { } else {
*b = a + a; *b = a + a;
return a + a; return a + a;
} }
} }
__device_func__(double __cuda_fmod(double a, double b)) __device_func__(double __cuda_fmod(double a, double b))
{ {
double orig_a = a; double orig_a = a;
double orig_b = b; double orig_b = b;
skipping to change at line 2086 skipping to change at line 2162
} }
if (scaled_b > a) { if (scaled_b > a) {
scaled_b *= 0.5; scaled_b *= 0.5;
} }
while (scaled_b >= b) { while (scaled_b >= b) {
if (a >= scaled_b) { if (a >= scaled_b) {
a -= scaled_b; a -= scaled_b;
} }
scaled_b *= 0.5; scaled_b *= 0.5;
} }
return __cuda_copysign (a, orig_a); return __internal_copysign_pos(a, orig_a);
} else { } else {
return orig_a; return orig_a;
} }
} }
__device_func__(double __cuda_remainder(double a, double b)) __device_func__(double __cuda_remainder(double a, double b))
{ {
double orig_a; double orig_a;
double twoa = 0.0; double twoa = 0.0;
unsigned int quot0 = 0; /* quotient bit 0 */ unsigned int quot0 = 0; /* quotient bit 0 */
skipping to change at line 2224 skipping to change at line 2300
__device_func__(double __cuda_nextafter(double a, double b)) __device_func__(double __cuda_nextafter(double a, double b))
{ {
unsigned long long int ia; unsigned long long int ia;
unsigned long long int ib; unsigned long long int ib;
ia = __double_as_longlong(a); ia = __double_as_longlong(a);
ib = __double_as_longlong(b); ib = __double_as_longlong(b);
if (__cuda___isnan(a) || __cuda___isnan(b)) return a + b; /* NaN */ if (__cuda___isnan(a) || __cuda___isnan(b)) return a + b; /* NaN */
if (((ia | ib) << 1) == 0ULL) return b; if (((ia | ib) << 1) == 0ULL) return b;
if ((ia + ia) == 0ULL) { if ((ia + ia) == 0ULL) {
return __cuda_copysign (CUDART_MIN_DENORM, b); /* crossover */ return __internal_copysign_pos(CUDART_MIN_DENORM, b); /* crossover */
} }
if ((a < b) && (a < 0.0)) ia--; if ((a < b) && (a < 0.0)) ia--;
if ((a < b) && (a > 0.0)) ia++; if ((a < b) && (a > 0.0)) ia++;
if ((a > b) && (a < 0.0)) ia++; if ((a > b) && (a < 0.0)) ia++;
if ((a > b) && (a > 0.0)) ia--; if ((a > b) && (a > 0.0)) ia--;
a = __longlong_as_double(ia); a = __longlong_as_double(ia);
return a; return a;
} }
__device_func__(double __cuda_nan(const char *s)) __device_func__(double __cuda_nan(const char *tagp))
{ {
unsigned long long i = 0; unsigned long long int i;
int c;
int ovfl = 0; i = __internal_nan_kernel (tagp);
int invld = 0;
if (*s == '0') {
s++;
if ((*s == 'x') || (*s == 'X')) {
s++;
while (*s == '0') s++;
while (*s) {
if (i > 0x0fffffffffffffffULL) {
ovfl = 1;
}
c = (((*s) >= 'A') && ((*s) <= 'F')) ? (*s + 'a' - 'A') : (*s);
if ((c >= 'a') && (c <= 'f')) {
c = c - 'a' + 10;
i = i * 16 + c;
} else if ((c >= '0') && (c <= '9')) {
c = c - '0';
i = i * 16 + c;
} else {
invld = 1;
}
s++;
}
} else {
while (*s == '0') s++;
while (*s) {
if (i > 0x1fffffffffffffffULL) {
ovfl = 1;
}
c = *s;
if ((c >= '0') && (c <= '7')) {
c = c - '0';
i = i * 8 + c;
} else {
invld = 1;
}
s++;
}
}
} else {
while (*s) {
c = *s;
if ((i > 1844674407370955161ULL) ||
((i == 1844674407370955161ULL) && (c > '5'))) {
ovfl = 1;
}
if ((c >= '0') && (c <= '9')) {
c = c - '0';
i = i * 10 + c;
} else {
invld = 1;
}
s++;
}
}
if (ovfl) {
i = ~0ULL;
}
if (invld) {
i = 0ULL;
}
i = (i & 0x000fffffffffffffULL) | 0x7ff8000000000000ULL; i = (i & 0x000fffffffffffffULL) | 0x7ff8000000000000ULL;
return __longlong_as_double(i); return __longlong_as_double(i);
} }
__device_func__(double __cuda_round(double a)) __device_func__(double __cuda_round(double a))
{ {
double fa = __cuda_fabs(a); double fa = __cuda_fabs(a);
if (fa > CUDART_TWO_TO_52) { if (fa >= CUDART_TWO_TO_52) {
return a; return a;
} else { } else {
double u = __cuda_floor(fa + 0.5); double u;
u = __cuda_trunc(fa + 0.5);
if (fa < 0.5) u = 0; if (fa < 0.5) u = 0;
return __cuda_copysign(u, a); u = __internal_copysign_pos(u, a);
return u;
} }
} }
__device_func__(long long int __cuda_llround(double a)) __device_func__(long long int __cuda_llround(double a))
{ {
#if !defined(__CUDABE__) #if !defined(__CUDABE__)
if (a >= 9223372036854775807.0) return 0x7fffffffffffffffLL; if (a >= 9223372036854775807.0) return 0x7fffffffffffffffLL;
if (a <= -9223372036854775808.0) return 0x8000000000000000LL; if (a <= -9223372036854775808.0) return 0x8000000000000000LL;
#endif /* !__CUDABE__ */ #endif /* !__CUDABE__ */
return (long long int)(__cuda_round(a)); return (long long int)(__cuda_round(a));
skipping to change at line 2363 skipping to change at line 2381
if (__cuda___isnan(a)) return -INT_MAX-1; if (__cuda___isnan(a)) return -INT_MAX-1;
if (__cuda___isinf(a)) return INT_MAX; if (__cuda___isinf(a)) return INT_MAX;
if (a == 0.0) return -INT_MAX-1; if (a == 0.0) return -INT_MAX-1;
a = __cuda_fabs(a); a = __cuda_fabs(a);
ilo = __double2loint(a); ilo = __double2loint(a);
ihi = __double2hiint(a); ihi = __double2hiint(a);
i = ((unsigned long long int)ihi) << 32 | (unsigned long long int)ilo; i = ((unsigned long long int)ihi) << 32 | (unsigned long long int)ilo;
if (a >= CUDART_TWO_TO_M1022) { if (a >= CUDART_TWO_TO_M1022) {
return ((int)((ihi >> 20) & 0x7ff)) - 1023; return ((int)((ihi >> 20) & 0x7ff)) - 1023;
} else { } else {
int expo = -1022; return -1011 - __clzll(i);
while (i < 0x0010000000000000ULL) {
expo--;
i = i + i;
}
return expo;
} }
} }
__device_func__(double __cuda_logb(double a)) __device_func__(double __cuda_logb(double a))
{ {
unsigned long long int i; unsigned long long int i;
unsigned int ihi; unsigned int ihi;
unsigned int ilo; unsigned int ilo;
if (__cuda___isnan(a)) return a + a; if (__cuda___isnan(a)) return a + a;
a = __cuda_fabs(a); a = __cuda_fabs(a);
if (a == CUDART_INF) return a; if (a == CUDART_INF) return a;
if (a == 0.0) return -CUDART_INF; if (a == 0.0) return -CUDART_INF;
ilo = __double2loint(a); ilo = __double2loint(a);
ihi = __double2hiint(a); ihi = __double2hiint(a);
i = ((unsigned long long int)ihi) << 32 | (unsigned long long int)ilo; i = ((unsigned long long int)ihi) << 32 | (unsigned long long int)ilo;
if (a >= CUDART_TWO_TO_M1022) { if (a >= CUDART_TWO_TO_M1022) {
return (double)((int)((ihi >> 20) & 0x7ff)) - 1023; return (double)((int)((ihi >> 20) & 0x7ff)) - 1023;
} else { } else {
int expo = -1022; int expo = -1011 - __clzll(i);
while (i < 0x0010000000000000ULL) {
expo--;
i = i + i;
}
return (double)expo; return (double)expo;
} }
} }
__device_func__(double __cuda_fma(double a, double b, double c)) __device_func__(double __cuda_fma(double a, double b, double c))
{ {
return __fma_rn(a, b, c); return __fma_rn(a, b, c);
} }
#if __APPLE__ #if __APPLE__
 End of changes. 49 change blocks. 
166 lines changed or deleted 186 lines changed or added


 sm_11_atomic_functions.h   sm_11_atomic_functions.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 41 skipping to change at line 41
* Any use of this source code in individual and commercial software must * Any use of this source code in individual and commercial software must
* include, in the user documentation and internal comments to the code, * include, in the user documentation and internal comments to the code,
* the above Disclaimer and U.S. Government End Users Notice. * the above Disclaimer and U.S. Government End Users Notice.
*/ */
#if !defined(__SM_11_ATOMIC_FUNCTIONS_H__) #if !defined(__SM_11_ATOMIC_FUNCTIONS_H__)
#define __SM_11_ATOMIC_FUNCTIONS_H__ #define __SM_11_ATOMIC_FUNCTIONS_H__
#if defined(__cplusplus) && defined(__CUDACC__) #if defined(__cplusplus) && defined(__CUDACC__)
#if __CUDA_ARCH__ >= 110 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 110
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#include "host_defines.h" #include "host_defines.h"
extern "C" extern "C"
skipping to change at line 206 skipping to change at line 206
static __inline__ __device__ int atomicCAS(int *address, int compare, int v al) static __inline__ __device__ int atomicCAS(int *address, int compare, int v al)
{ {
return __iAtomicCAS(address, compare, val); return __iAtomicCAS(address, compare, val);
} }
static __inline__ __device__ unsigned int atomicCAS(unsigned int *address, unsigned int compare, unsigned int val) static __inline__ __device__ unsigned int atomicCAS(unsigned int *address, unsigned int compare, unsigned int val)
{ {
return __uAtomicCAS(address, compare, val); return __uAtomicCAS(address, compare, val);
} }
#endif /* __CUDA_ARCH__ >= 110 */ #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 110 */
#elif !defined(__CUDACC__) #elif !defined(__CUDACC__)
#include "crt/func_macro.h" #include "crt/func_macro.h"
#if !defined(__CUDABE__) #if !defined(__CUDABE__)
#if defined(__MULTI_CORE__) #if defined(__cplusplus)
extern "C" {
#define __iAtomicAdd(address, val) \ #endif /* __cplusplus */
__builtin___iAtomicAdd(address, val)
#define __uAtomicAdd(address, val) \
__builtin___uAtomicAdd(address, val)
#define __iAtomicExch(address, val) \
__builtin___iAtomicExch(address, val)
#define __uAtomicExch(address, val) \
__builtin___uAtomicExch(address, val)
#define __fAtomicExch(address, val) \
__builtin___fAtomicExch(address, val)
#define __iAtomicMin(address, val) \
__builtin___iAtomicMin(address, val)
#define __uAtomicMin(address, val) \
__builtin___uAtomicMin(address, val)
#define __iAtomicMax(address, val) \
__builtin___iAtomicMax(address, val)
#define __uAtomicMax(address, val) \
__builtin___uAtomicMax(address, val)
#define __uAtomicInc(address, val) \
__builtin___uAtomicInc(address, val)
#define __uAtomicDec(address, val) \
__builtin___uAtomicDec(address, val)
#define __iAtomicAnd(address, val) \
__builtin___iAtomicAnd(address, val)
#define __uAtomicAnd(address, val) \
__builtin___uAtomicAnd(address, val)
#define __iAtomicOr(address, val) \
__builtin___iAtomicOr(address, val)
#define __uAtomicOr(address, val) \
__builtin___uAtomicOr(address, val)
#define __iAtomicXor(address, val) \
__builtin___iAtomicXor(address, val)
#define __uAtomicXor(address, val) \
__builtin___uAtomicXor(address, val)
#define __iAtomicCAS(address, compare, val) \
__builtin___iAtomicCAS(address, compare, val)
#define __uAtomicCAS(address, compare, val) \
__builtin___uAtomicCAS(address, compare, val)
#else /* __MULTI_CORE__ */
extern void CUDARTAPI __cudaMutexOperation(int lock); extern void CUDARTAPI __cudaMutexOperation(int lock);
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#define __cudaAtomicOperation(code) \ #define __cudaAtomicOperation(code) \
__cudaMutexOperation(1); \ __cudaMutexOperation(1); \
code \ code \
__cudaMutexOperation(0); __cudaMutexOperation(0);
__device_func__(int __iAtomicAdd(int *address, int val)) __device_func__(int __iAtomicAdd(int *address, int val))
{ {
int old; int old;
__cudaAtomicOperation( __cudaAtomicOperation(
skipping to change at line 493 skipping to change at line 458
__cudaAtomicOperation( __cudaAtomicOperation(
old = *address; old = *address;
*address = old == compare ? val : old; *address = old == compare ? val : old;
) )
return old; return old;
} }
#undef __cudaAtomicOperation #undef __cudaAtomicOperation
#endif /* __MULTI_CORE__ */
#endif /* !__CUDABE__ */ #endif /* !__CUDABE__ */
#endif /* __cplusplus && __CUDACC__ */ #endif /* __cplusplus && __CUDACC__ */
#endif /* !__SM_11_ATOMIC_FUNCTIONS_H__ */ #endif /* !__SM_11_ATOMIC_FUNCTIONS_H__ */
 End of changes. 6 change blocks. 
47 lines changed or deleted 10 lines changed or added


 sm_12_atomic_functions.h   sm_12_atomic_functions.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 41 skipping to change at line 41
* Any use of this source code in individual and commercial software must * Any use of this source code in individual and commercial software must
* include, in the user documentation and internal comments to the code, * include, in the user documentation and internal comments to the code,
* the above Disclaimer and U.S. Government End Users Notice. * the above Disclaimer and U.S. Government End Users Notice.
*/ */
#if !defined(__SM_12_ATOMIC_FUNCTIONS_H__) #if !defined(__SM_12_ATOMIC_FUNCTIONS_H__)
#define __SM_12_ATOMIC_FUNCTIONS_H__ #define __SM_12_ATOMIC_FUNCTIONS_H__
#if defined(__cplusplus) && defined(__CUDACC__) #if defined(__cplusplus) && defined(__CUDACC__)
#if __CUDA_ARCH__ >= 120 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 120
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#include "host_defines.h" #include "host_defines.h"
extern "C" extern "C"
skipping to change at line 99 skipping to change at line 99
static __inline__ __device__ bool any(bool cond) static __inline__ __device__ bool any(bool cond)
{ {
return (bool)__any((int)cond); return (bool)__any((int)cond);
} }
static __inline__ __device__ bool all(bool cond) static __inline__ __device__ bool all(bool cond)
{ {
return (bool)__all((int)cond); return (bool)__all((int)cond);
} }
#endif /* __CUDA_ARCH__ >= 120 */ #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 120 */
#elif !defined(__CUDACC__) #elif !defined(__CUDACC__)
#include "crt/func_macro.h" #include "crt/func_macro.h"
#if !defined(__CUDABE__) #if !defined(__CUDABE__)
#if defined(__MULTI_CORE__) #if defined(__cplusplus)
extern "C" {
#define __ullAtomicAdd(address, val) \ #endif /* __cplusplus */
__builtin___ullAtomicAdd(address, val)
#define __ullAtomicExch(address, val) \
__builtin___ullAtomicExch(address, val)
#define __ullAtomicCAS(address, compare, val) \
__builtin___ullAtomicCAS(address, compare, val)
#else /* __MULTI_CORE__ */
extern void CUDARTAPI __cudaMutexOperation(int lock); extern void CUDARTAPI __cudaMutexOperation(int lock);
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#define __cudaAtomicOperation(code) \ #define __cudaAtomicOperation(code) \
__cudaMutexOperation(1); \ __cudaMutexOperation(1); \
code \ code \
__cudaMutexOperation(0); __cudaMutexOperation(0);
__device_func__(unsigned long long int __ullAtomicAdd(unsigned long long in t *address, unsigned long long int val)) __device_func__(unsigned long long int __ullAtomicAdd(unsigned long long in t *address, unsigned long long int val))
{ {
unsigned long long int old; unsigned long long int old;
__cudaAtomicOperation( __cudaAtomicOperation(
skipping to change at line 160 skipping to change at line 157
__cudaAtomicOperation( __cudaAtomicOperation(
old = *address; old = *address;
*address = old == compare ? val : old; *address = old == compare ? val : old;
) )
return old; return old;
} }
#undef __cudaAtomicOperation #undef __cudaAtomicOperation
#endif /* __MULTI_CORE__ */
__device_func__(int __any(int cond)) __device_func__(int __any(int cond))
{ {
return cond; return cond;
} }
__device_func__(int __all(int cond)) __device_func__(int __all(int cond))
{ {
return cond; return cond;
} }
 End of changes. 6 change blocks. 
15 lines changed or deleted 10 lines changed or added


 sm_13_double_functions.h   sm_13_double_functions.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 47 skipping to change at line 47
#define __SM_13_DOUBLE_FUNCTIONS_H__ #define __SM_13_DOUBLE_FUNCTIONS_H__
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#if defined(__cplusplus) && defined(__CUDACC__) #if defined(__cplusplus) && defined(__CUDACC__)
#if __CUDA_ARCH__ >= 130 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 130
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#include "device_types.h" #include "device_types.h"
#include "host_defines.h" #include "host_defines.h"
skipping to change at line 97 skipping to change at line 97
extern __device__ double __dmul_rz(double, double); extern __device__ double __dmul_rz(double, double);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ double __dmul_ru(double, double); extern __device__ double __dmul_ru(double, double);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ double __dmul_rd(double, double); extern __device__ double __dmul_rd(double, double);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __double2float_rn(double); extern __device__ float __double2float_rn(double);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __double2float_rz(double); extern __device__ float __double2float_rz(double);
/*DEVICE_BUILTIN*/
extern __device__ float __double2float_ru(double);
/*DEVICE_BUILTIN*/
extern __device__ float __double2float_rd(double);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __double2int_rn(double); extern __device__ int __double2int_rn(double);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __double2int_ru(double); extern __device__ int __double2int_ru(double);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __double2int_rd(double); extern __device__ int __double2int_rd(double);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ unsigned int __double2uint_rn(double); extern __device__ unsigned int __double2uint_rn(double);
skipping to change at line 252 skipping to change at line 256
static __inline__ __device__ double uint2double(unsigned int a, enum cudaRo undMode mode = cudaRoundNearest) static __inline__ __device__ double uint2double(unsigned int a, enum cudaRo undMode mode = cudaRoundNearest)
{ {
return (double)a; return (double)a;
} }
static __inline__ __device__ double float2double(float a, enum cudaRoundMod e mode = cudaRoundNearest) static __inline__ __device__ double float2double(float a, enum cudaRoundMod e mode = cudaRoundNearest)
{ {
return (double)a; return (double)a;
} }
#endif /* __CUDA_ARCH__ >= 130 */ #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 130 */
#elif !defined(__CUDACC__) #elif !defined(__CUDACC__)
#include "crt/func_macro.h" #include "crt/func_macro.h"
#if !defined(__CUDABE__) #if !defined(__CUDABE__)
/************************************************************************** ***** /************************************************************************** *****
* * * *
* HOST IMPLEMENTATIONS FOR FUNCTIONS * * HOST IMPLEMENTATIONS FOR FUNCTIONS *
skipping to change at line 281 skipping to change at line 286
return u.d; return u.d;
} }
__device_func__(long long int __double_as_longlong(double a)) __device_func__(long long int __double_as_longlong(double a))
{ {
volatile union __cudart_DoubleLonglongCvt u; volatile union __cudart_DoubleLonglongCvt u;
u.d = a; u.d = a;
return u.i; return u.i;
} }
__device_func__(float __internal_double2float_kernel(double a)) /* Note: this kernel does not support round-to-nearest-or-even */
__device_func__(float __internal_double2float_kernel(double a, enum cudaRou
ndMode rndMode))
{ {
volatile union __cudart_DoubleUlonglongCvt xx; volatile union __cudart_DoubleUlonglongCvt xx;
volatile union __cudart_FloatUintCvt res; volatile union __cudart_FloatUintCvt res;
unsigned long long sticky;
int shift; int shift;
xx.d = a; xx.d = a;
if (xx.i == 0) return 0.0f;
res.i = (((unsigned int) (xx.i >> 32)) & 0x80000000); res.i = (((unsigned int) (xx.i >> 32)) & 0x80000000);
if (a == 0.0) {
/* Zero */
return res.f;
}
if ((xx.i & 0x7ff0000000000000ULL) == 0x7ff0000000000000ULL) { if ((xx.i & 0x7ff0000000000000ULL) == 0x7ff0000000000000ULL) {
if ((xx.i & 0x7fffffffffffffffULL) > 0x7ff0000000000000ULL) { if ((xx.i & 0x7fffffffffffffffULL) > 0x7ff0000000000000ULL) {
/* Nan */ /* Nan */
res.i = ((unsigned int)((xx.i >> 32) & 0x80000000) | res.i = ((unsigned int)((xx.i >> 32) & 0x80000000) |
(255U << 23) | 0x00400000 | (255U << 23) | 0x00400000 |
(unsigned int)((xx.i >> (53 - 24)) & 0x007fffff)); (unsigned int)((xx.i >> (53 - 24)) & 0x007fffff));
} else { } else {
/* Inf */ /* Inf */
res.i |= 0x7f800000; res.i |= 0x7f800000;
} }
return res.f; return res.f;
} }
shift = ((int) ((xx.i >> 52) & 0x7ff)) - 1023; shift = ((int) ((xx.i >> 52) & 0x7ff)) - 1023;
/* Overflow */ /* Overflow */
xx.i = (xx.i & 0x000fffffffffffffULL); xx.i = (xx.i & 0x000fffffffffffffULL);
if (shift >= 128) { if (shift >= 128) {
res.i |= 0x7f7fffff; if ((rndMode == cudaRoundZero) ||
((rndMode == cudaRoundMinInf) && !res.i) ||
((rndMode == cudaRoundPosInf) && res.i)) {
res.i |= 0x7f7fffff;
} else {
res.i |= 0x7f800000;
}
return res.f; return res.f;
} }
if (shift <= -127) { if (shift <= -127) {
/* Underflow */
xx.i |= 0x0010000000000000ULL;
if (shift < -180) { if (shift < -180) {
/* Underflow */ sticky = xx.i;
xx.i = 0; xx.i = 0;
} else { } else {
xx.i |= 0x0010000000000000ULL; sticky = xx.i << (64 - (-126 - shift));
xx.i >>= -126 - shift; xx.i >>= (-126 - shift);
} }
} else { sticky |= xx.i << (64 - 29);
res.i |= (unsigned int) (127 + shift) << 23; if ((((rndMode == cudaRoundPosInf) && !res.i) ||
((rndMode == cudaRoundMinInf) && res.i)) &&
sticky) {
res.i += 1;
}
res.i += ((unsigned int) (xx.i >> 29)) & 0x007fffff;
return res.f;
} }
res.i |= ((unsigned int) (xx.i >> 29)) & 0x007fffff; sticky = xx.i << (64 - 29);
xx.i &= 0x1fffffff; if ((((rndMode == cudaRoundPosInf) && !res.i) ||
((rndMode == cudaRoundMinInf) && res.i)) &&
sticky) {
res.i += 1;
}
res.i += ((unsigned int) (xx.i >> 29)) & 0x007fffff;
res.i += (unsigned int) (127 + shift) << 23;
return res.f; return res.f;
} }
__device_func__(double __internal_ll2double_kernel(long long int a, enum cu daRoundMode rndMode)) __device_func__(double __internal_ll2double_kernel(long long int a, enum cu daRoundMode rndMode))
{ {
volatile union __cudart_DoubleUlonglongCvt res; volatile union __cudart_DoubleUlonglongCvt res;
int shift; int shift;
unsigned int t; unsigned int t;
res.i = a; res.i = a;
if (a == 0) return res.d; if (a == 0) return res.d;
skipping to change at line 463 skipping to change at line 493
return cvt.d; return cvt.d;
} }
__device_func__(float __double2float_rn(double a)) __device_func__(float __double2float_rn(double a))
{ {
return (float)a; return (float)a;
} }
__device_func__(float __double2float_rz(double a)) __device_func__(float __double2float_rz(double a))
{ {
return __internal_double2float_kernel(a); return __internal_double2float_kernel(a, cudaRoundZero);
}
__device_func__(float __double2float_ru(double a))
{
return __internal_double2float_kernel(a, cudaRoundPosInf);
}
__device_func__(float __double2float_rd(double a))
{
return __internal_double2float_kernel(a, cudaRoundMinInf);
} }
__device_func__(int __internal_double2int(double a, enum cudaRoundMode rndM ode)) __device_func__(int __internal_double2int(double a, enum cudaRoundMode rndM ode))
{ {
return (int)__internal_double2ll_kernel(a, 2147483647LL, -2147483648LL, - 2147483648LL, rndMode); return (int)__internal_double2ll_kernel(a, 2147483647LL, -2147483648LL, - 2147483648LL, rndMode);
} }
__device_func__(int __double2int_rn(double a)) __device_func__(int __double2int_rn(double a))
{ {
return __internal_double2int(a, cudaRoundNearest); return __internal_double2int(a, cudaRoundNearest);
skipping to change at line 604 skipping to change at line 644
} }
#endif /* !__CUDABE__ */ #endif /* !__CUDABE__ */
#if !defined(__CUDABE__) || __CUDA_ARCH__ < 130 #if !defined(__CUDABE__) || __CUDA_ARCH__ < 130
#include "common_types.h" #include "common_types.h"
__device_func__(double __internal_fma_kernel(double x, double y, double z, enum cudaRoundMode rndMode)) __device_func__(double __internal_fma_kernel(double x, double y, double z, enum cudaRoundMode rndMode))
{ {
#ifdef __MULTI_CORE__
volatile
#endif /* __MULTI_CORE__ */
struct __cudart_UintUint xx, yy, zz, ww; struct __cudart_UintUint xx, yy, zz, ww;
unsigned int s, t, u, prod0, prod1, prod2, prod3, expo_x, expo_y, expo_z; unsigned int s, t, u, prod0, prod1, prod2, prod3, expo_x, expo_y, expo_z;
xx.hi = __double2hiint(x); xx.hi = __double2hiint(x);
xx.lo = __double2loint(x); xx.lo = __double2loint(x);
yy.hi = __double2hiint(y); yy.hi = __double2hiint(y);
yy.lo = __double2loint(y); yy.lo = __double2loint(y);
zz.hi = __double2hiint(z); zz.hi = __double2hiint(z);
zz.lo = __double2loint(z); zz.lo = __double2loint(z);
 End of changes. 16 change blocks. 
17 lines changed or deleted 54 lines changed or added


 texture_fetch_functions.h   texture_fetch_functions.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 1910 skipping to change at line 1910
__ftexfetchi1D(t, i) __ftexfetchi1D(t, i)
#define __utexfetch(t, i, d) \ #define __utexfetch(t, i, d) \
__utexfetch##d##D(t, i) __utexfetch##d##D(t, i)
#define __itexfetch(t, i, d) \ #define __itexfetch(t, i, d) \
__itexfetch##d##D(t, i) __itexfetch##d##D(t, i)
#define __ftexfetch(t, i, d) \ #define __ftexfetch(t, i, d) \
__ftexfetch##d##D(t, i) __ftexfetch##d##D(t, i)
#else /* __CUDABE__ */ #else /* __CUDABE__ */
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
extern void CUDARTAPI __cudaTextureFetch(const void *tex, void *index, int integer, void *val); extern void CUDARTAPI __cudaTextureFetch(const void *tex, void *index, int integer, void *val);
#if defined(__cplusplus)
}
#endif /* __cplusplus */
__device_func__(int4 __itexfetchi(const void *tex, int4 index)) __device_func__(int4 __itexfetchi(const void *tex, int4 index))
{ {
int4 val; int4 val;
__cudaTextureFetch(tex, (void*)&index, 1, (void*)&val); __cudaTextureFetch(tex, (void*)&index, 1, (void*)&val);
return val; return val;
} }
__device_func__(uint4 __utexfetchi(const void *tex, int4 index)) __device_func__(uint4 __utexfetchi(const void *tex, int4 index))
 End of changes. 3 change blocks. 
1 lines changed or deleted 9 lines changed or added


 texture_types.h   texture_types.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 57 skipping to change at line 57
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
enum cudaTextureAddressMode enum cudaTextureAddressMode
{ {
cudaAddressModeWrap, cudaAddressModeWrap,
cudaAddressModeClamp cudaAddressModeClamp,
cudaAddressModeMirror
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
enum cudaTextureFilterMode enum cudaTextureFilterMode
{ {
cudaFilterModePoint, cudaFilterModePoint,
cudaFilterModeLinear cudaFilterModeLinear
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
 End of changes. 2 change blocks. 
2 lines changed or deleted 3 lines changed or added


 vector_functions.h   vector_functions.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 194 skipping to change at line 194
static __inline__ __host__ __device__ long2 make_long2(long int x, long int y) static __inline__ __host__ __device__ long2 make_long2(long int x, long int y)
{ {
long2 t; t.x = x; t.y = y; return t; long2 t; t.x = x; t.y = y; return t;
} }
static __inline__ __host__ __device__ ulong2 make_ulong2(unsigned long int x, unsigned long int y) static __inline__ __host__ __device__ ulong2 make_ulong2(unsigned long int x, unsigned long int y)
{ {
ulong2 t; t.x = x; t.y = y; return t; ulong2 t; t.x = x; t.y = y; return t;
} }
#if !defined(__LP64__)
static __inline__ __host__ __device__ long3 make_long3(long int x, long int y, long int z) static __inline__ __host__ __device__ long3 make_long3(long int x, long int y, long int z)
{ {
long3 t; t.x = x; t.y = y; t.z = z; return t; long3 t; t.x = x; t.y = y; t.z = z; return t;
} }
static __inline__ __host__ __device__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z) static __inline__ __host__ __device__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z)
{ {
ulong3 t; t.x = x; t.y = y; t.z = z; return t; ulong3 t; t.x = x; t.y = y; t.z = z; return t;
} }
static __inline__ __host__ __device__ long4 make_long4(long int x, long int y, long int z, long int w) static __inline__ __host__ __device__ long4 make_long4(long int x, long int y, long int z, long int w)
{ {
long4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; long4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
} }
static __inline__ __host__ __device__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w) static __inline__ __host__ __device__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w)
{ {
ulong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; ulong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
} }
#endif /* !__LP64__ */
static __inline__ __host__ __device__ float1 make_float1(float x) static __inline__ __host__ __device__ float1 make_float1(float x)
{ {
float1 t; t.x = x; return t; float1 t; t.x = x; return t;
} }
static __inline__ __host__ __device__ float2 make_float2(float x, float y) static __inline__ __host__ __device__ float2 make_float2(float x, float y)
{ {
float2 t; t.x = x; t.y = y; return t; float2 t; t.x = x; t.y = y; return t;
} }
skipping to change at line 258 skipping to change at line 254
static __inline__ __host__ __device__ longlong2 make_longlong2(long long in t x, long long int y) static __inline__ __host__ __device__ longlong2 make_longlong2(long long in t x, long long int y)
{ {
longlong2 t; t.x = x; t.y = y; return t; longlong2 t; t.x = x; t.y = y; return t;
} }
static __inline__ __host__ __device__ ulonglong2 make_ulonglong2(unsigned l ong long int x, unsigned long long int y) static __inline__ __host__ __device__ ulonglong2 make_ulonglong2(unsigned l ong long int x, unsigned long long int y)
{ {
ulonglong2 t; t.x = x; t.y = y; return t; ulonglong2 t; t.x = x; t.y = y; return t;
} }
static __inline__ __host__ __device__ longlong3 make_longlong3(long long in
t x, long long int y, long long int z)
{
longlong3 t; t.x = x; t.y = y; t.z = z; return t;
}
static __inline__ __host__ __device__ ulonglong3 make_ulonglong3(unsigned l
ong long int x, unsigned long long int y, unsigned long long int z)
{
ulonglong3 t; t.x = x; t.y = y; t.z = z; return t;
}
static __inline__ __host__ __device__ longlong4 make_longlong4(long long in
t x, long long int y, long long int z, long long int w)
{
longlong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
static __inline__ __host__ __device__ ulonglong4 make_ulonglong4(unsigned l
ong long int x, unsigned long long int y, unsigned long long int z, unsigne
d long long int w)
{
ulonglong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
static __inline__ __host__ __device__ double1 make_double1(double x) static __inline__ __host__ __device__ double1 make_double1(double x)
{ {
double1 t; t.x = x; return t; double1 t; t.x = x; return t;
} }
static __inline__ __host__ __device__ double2 make_double2(double x, double y) static __inline__ __host__ __device__ double2 make_double2(double x, double y)
{ {
double2 t; t.x = x; t.y = y; return t; double2 t; t.x = x; t.y = y; return t;
} }
static __inline__ __host__ __device__ double3 make_double3(double x, double
y, double z)
{
double3 t; t.x = x; t.y = y; t.z = z; return t;
}
static __inline__ __host__ __device__ double4 make_double4(double x, double
y, double z, double w)
{
double4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
}
#endif /* !__VECTOR_FUNCTIONS_H__ */ #endif /* !__VECTOR_FUNCTIONS_H__ */
 End of changes. 5 change blocks. 
5 lines changed or deleted 38 lines changed or added


 vector_types.h   vector_types.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 60 skipping to change at line 60
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#if !defined(__cuda_assign_operators) #if !defined(__cuda_assign_operators)
#define __cuda_assign_operators(tag) #define __cuda_assign_operators(tag)
#endif /* !__cuda_assign_operators */ #endif /* !__cuda_assign_operators */
#if !defined(__CUDACC__) && !defined(__CUDABE__) && \ #if !defined(__CUDACC__) && !defined(__CUDABE__) && \
!defined (__MULTI_CORE__) && defined(_WIN32) && !defined(_WIN64) defined(_WIN32) && !defined(_WIN64)
#define __cuda_builtin_vector_align8(tag, ...) \ #define __cuda_builtin_vector_align8(tag, ...) \
struct tag { \ struct tag { \
union { \ union { \
struct { __VA_ARGS__; }; \ struct { __VA_ARGS__; }; \
struct { long long int :1,:0; }; \ struct { long long int :1,:0; }; \
}; \ }; \
__cuda_assign_operators(tag) \ __cuda_assign_operators(tag) \
} }
#else /* !__CUDACC__ && !__CUDABE__ && !__MULTI_CORE__ && _WIN32 && !_WIN64 */ #else /* !__CUDACC__ && !__CUDABE__ && _WIN32 && !_WIN64 */
#define __cuda_builtin_vector_align8(tag, ...) \ #define __cuda_builtin_vector_align8(tag, ...) \
struct __align__(8) tag { \ struct __align__(8) tag { \
__VA_ARGS__; \ __VA_ARGS__; \
__cuda_assign_operators(tag) \ __cuda_assign_operators(tag) \
} }
#endif /* !__CUDACC__ && !__CUDABE__ && !__MULTI_CORE__ && _WIN32 && !_WIN6 4 */ #endif /* !__CUDACC__ && !__CUDABE__ && _WIN32 && !_WIN64 */
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct char1 struct char1
{ {
signed char x; signed char x;
__cuda_assign_operators(char1) __cuda_assign_operators(char1)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct uchar1 struct uchar1
skipping to change at line 273 skipping to change at line 273
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __align__(2*sizeof(unsigned long int)) ulong2 struct __align__(2*sizeof(unsigned long int)) ulong2
{ {
unsigned long int x, y; unsigned long int x, y;
__cuda_assign_operators(ulong2) __cuda_assign_operators(ulong2)
}; };
#endif /* _WIN32 */ #endif /* _WIN32 */
#if !defined(__LP64__)
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct long3 struct long3
{ {
long int x, y, z; long int x, y, z;
__cuda_assign_operators(long3) __cuda_assign_operators(long3)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct ulong3 struct ulong3
{ {
skipping to change at line 303 skipping to change at line 301
__cuda_assign_operators(long4) __cuda_assign_operators(long4)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(16) ulong4 struct __builtin_align__(16) ulong4
{ {
unsigned long int x, y, z, w; unsigned long int x, y, z, w;
__cuda_assign_operators(ulong4) __cuda_assign_operators(ulong4)
}; };
#endif /* !__LP64__ */
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct float1 struct float1
{ {
float x; float x;
__cuda_assign_operators(float1) __cuda_assign_operators(float1)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
__cuda_builtin_vector_align8(float2, float x, y); __cuda_builtin_vector_align8(float2, float x, y);
skipping to change at line 358 skipping to change at line 354
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(16) ulonglong2 struct __builtin_align__(16) ulonglong2
{ {
unsigned long long int x, y; unsigned long long int x, y;
__cuda_assign_operators(ulonglong2) __cuda_assign_operators(ulonglong2)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct longlong3
{
long long int x, y, z;
__cuda_assign_operators(longlong3)
};
/*DEVICE_BUILTIN*/
struct ulonglong3
{
unsigned long long int x, y, z;
__cuda_assign_operators(ulonglong3)
};
/*DEVICE_BUILTIN*/
struct __builtin_align__(16) longlong4
{
long long int x, y, z ,w;
__cuda_assign_operators(longlong4)
};
/*DEVICE_BUILTIN*/
struct __builtin_align__(16) ulonglong4
{
unsigned long long int x, y, z, w;
__cuda_assign_operators(ulonglong4)
};
/*DEVICE_BUILTIN*/
struct double1 struct double1
{ {
double x; double x;
__cuda_assign_operators(double1) __cuda_assign_operators(double1)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(16) double2 struct __builtin_align__(16) double2
{ {
double x, y; double x, y;
__cuda_assign_operators(double2) __cuda_assign_operators(double2)
}; };
/*DEVICE_BUILTIN*/
struct double3
{
double x, y, z;
__cuda_assign_operators(double3)
};
/*DEVICE_BUILTIN*/
struct __builtin_align__(16) double4
{
double x, y, z, w;
__cuda_assign_operators(double4)
};
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
typedef struct char1 char1; typedef struct char1 char1;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
typedef struct uchar1 uchar1; typedef struct uchar1 uchar1;
skipping to change at line 458 skipping to change at line 496
typedef struct float4 float4; typedef struct float4 float4;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
typedef struct longlong1 longlong1; typedef struct longlong1 longlong1;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
typedef struct ulonglong1 ulonglong1; typedef struct ulonglong1 ulonglong1;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
typedef struct longlong2 longlong2; typedef struct longlong2 longlong2;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
typedef struct ulonglong2 ulonglong2; typedef struct ulonglong2 ulonglong2;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
typedef struct longlong3 longlong3;
/*DEVICE_BUILTIN*/
typedef struct ulonglong3 ulonglong3;
/*DEVICE_BUILTIN*/
typedef struct longlong4 longlong4;
/*DEVICE_BUILTIN*/
typedef struct ulonglong4 ulonglong4;
/*DEVICE_BUILTIN*/
typedef struct double1 double1; typedef struct double1 double1;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
typedef struct double2 double2; typedef struct double2 double2;
/*DEVICE_BUILTIN*/
typedef struct double3 double3;
/*DEVICE_BUILTIN*/
typedef struct double4 double4;
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct dim3 struct dim3
{ {
unsigned int x, y, z; unsigned int x, y, z;
#if defined(__cplusplus) #if defined(__cplusplus) && !defined(__CUDABE__)
__host__ __device__ dim3(unsigned int x = 1, unsigned int y = 1, unsign ed int z = 1) : x(x), y(y), z(z) {} __host__ __device__ dim3(unsigned int x = 1, unsigned int y = 1, unsign ed int z = 1) : x(x), y(y), z(z) {}
__host__ __device__ dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {} __host__ __device__ dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {}
__host__ __device__ operator uint3(void) { uint3 t; t.x = x; t.y = y; t .z = z; return t; } __host__ __device__ operator uint3(void) { uint3 t; t.x = x; t.y = y; t .z = z; return t; }
#endif /* __cplusplus */ #endif /* __cplusplus && !__CUDABE__ */
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
typedef struct dim3 dim3; typedef struct dim3 dim3;
#undef __cuda_assign_operators #undef __cuda_assign_operators
#undef __cuda_builtin_vector_align8 #undef __cuda_builtin_vector_align8
#endif /* !__VECTOR_TYPES_H__ */ #endif /* !__VECTOR_TYPES_H__ */
 End of changes. 12 change blocks. 
10 lines changed or deleted 60 lines changed or added

This html diff was produced by rfcdiff 1.41. The latest version is available from http://tools.ietf.org/tools/rfcdiff/