__cudaFatFormat.h | __cudaFatFormat.h | |||
---|---|---|---|---|
/* | /* | |||
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. | * Copyright 1993-2008 NVIDIA Corporation. All rights reserved. | |||
* | * | |||
* NOTICE TO USER: | * NOTICE TO USER: | |||
* | * | |||
* This source code is subject to NVIDIA ownership rights under U.S. and | * This source code is subject to NVIDIA ownership rights under U.S. and | |||
* international Copyright laws. Users and possessors of this source code | * international Copyright laws. Users and possessors of this source code | |||
* are hereby granted a nonexclusive, royalty-free license to use this code | * are hereby granted a nonexclusive, royalty-free license to use this code | |||
* in individual and commercial software. | * in individual and commercial software. | |||
* | * | |||
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | |||
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | |||
skipping to change at line 121 | skipping to change at line 121 | |||
char* gpuProfileName; | char* gpuProfileName; | |||
char* ptx; | char* ptx; | |||
} __cudaFatPtxEntry; | } __cudaFatPtxEntry; | |||
/* | /* | |||
* Debug entry type for __cudaFat binary. | * Debug entry type for __cudaFat binary. | |||
* Such information might, but need not be available | * Such information might, but need not be available | |||
* for Cubin entries (ptx files compiled in debug mode | * for Cubin entries (ptx files compiled in debug mode | |||
* will contain their own debugging information) | * will contain their own debugging information) | |||
*/ | */ | |||
typedef struct { | typedef struct __cudaFatDebugEntryRec { | |||
char* gpuProfileName; | char* gpuProfileName; | |||
char* debug; | char* debug; | |||
struct __cudaFatDebugEntryRec *next; | ||||
unsigned int size; | ||||
} __cudaFatDebugEntry; | } __cudaFatDebugEntry; | |||
typedef enum { | typedef enum { | |||
__cudaFatDontSearchFlag = (1 << 0), | __cudaFatDontSearchFlag = (1 << 0), | |||
__cudaFatDontCacheFlag = (1 << 1), | __cudaFatDontCacheFlag = (1 << 1), | |||
__cudaFatSassDebugFlag = (1 << 2) | __cudaFatSassDebugFlag = (1 << 2) | |||
} __cudaFatCudaBinaryFlag; | } __cudaFatCudaBinaryFlag; | |||
/* | /* | |||
* Imported/exported symbol descriptor, needed for | * Imported/exported symbol descriptor, needed for | |||
End of changes. 2 change blocks. | ||||
4 lines changed or deleted | 6 lines changed or added | |||
cuda.h | cuda.h | |||
---|---|---|---|---|
skipping to change at line 57 | skipping to change at line 57 | |||
/** | /** | |||
* \defgroup CUDA_TYPES Data types used by CUDA driver | * \defgroup CUDA_TYPES Data types used by CUDA driver | |||
* \ingroup CUDA_DRIVER | * \ingroup CUDA_DRIVER | |||
* @{ | * @{ | |||
*/ | */ | |||
/** | /** | |||
* CUDA API version number | * CUDA API version number | |||
*/ | */ | |||
#define CUDA_VERSION 2020 /* 2.2 */ | #define CUDA_VERSION 2030 /* 2.3 */ | |||
#ifdef __cplusplus | #ifdef __cplusplus | |||
extern "C" { | extern "C" { | |||
#endif | #endif | |||
typedef unsigned int CUdeviceptr; ///< CUDA device pointer | typedef unsigned int CUdeviceptr; ///< CUDA device pointer | |||
typedef int CUdevice; ///< CUDA device | typedef int CUdevice; ///< CUDA device | |||
typedef struct CUctx_st *CUcontext; ///< CUDA context | typedef struct CUctx_st *CUcontext; ///< CUDA context | |||
typedef struct CUmod_st *CUmodule; ///< CUDA module | typedef struct CUmod_st *CUmodule; ///< CUDA module | |||
typedef struct CUfunc_st *CUfunction; ///< CUDA function | typedef struct CUfunc_st *CUfunction; ///< CUDA function | |||
skipping to change at line 89 | skipping to change at line 89 | |||
/** | /** | |||
* Context creation flags | * Context creation flags | |||
*/ | */ | |||
typedef enum CUctx_flags_enum { | typedef enum CUctx_flags_enum { | |||
CU_CTX_SCHED_AUTO = 0, ///< Automatic scheduling | CU_CTX_SCHED_AUTO = 0, ///< Automatic scheduling | |||
CU_CTX_SCHED_SPIN = 1, ///< Set spin as default scheduling | CU_CTX_SCHED_SPIN = 1, ///< Set spin as default scheduling | |||
CU_CTX_SCHED_YIELD = 2, ///< Set yield as default scheduling | CU_CTX_SCHED_YIELD = 2, ///< Set yield as default scheduling | |||
CU_CTX_SCHED_MASK = 0x3, | CU_CTX_SCHED_MASK = 0x3, | |||
CU_CTX_BLOCKING_SYNC = 4, ///< Use blocking synchronization | CU_CTX_BLOCKING_SYNC = 4, ///< Use blocking synchronization | |||
CU_CTX_MAP_HOST = 8, ///< Support mapped pinned allocations | CU_CTX_MAP_HOST = 8, ///< Support mapped pinned allocations | |||
CU_CTX_FLAGS_MASK = 0xf, | CU_CTX_LMEM_RESIZE_TO_MAX = 16, ///< Keep local memory allocation after | |||
launch | ||||
CU_CTX_FLAGS_MASK = 0x1f, | ||||
} CUctx_flags; | } CUctx_flags; | |||
/** | /** | |||
* Event creation flags | * Event creation flags | |||
*/ | */ | |||
typedef enum CUevent_flags_enum { | typedef enum CUevent_flags_enum { | |||
CU_EVENT_DEFAULT = 0, ///< Default event flag | CU_EVENT_DEFAULT = 0, ///< Default event flag | |||
CU_EVENT_BLOCKING_SYNC = 1, ///< Event uses blocking synchronization | CU_EVENT_BLOCKING_SYNC = 1, ///< Event uses blocking synchronization | |||
} CUevent_flags; | } CUevent_flags; | |||
skipping to change at line 605 | skipping to change at line 606 | |||
); | ); | |||
CUresult CUDAAPI cuMemFree(CUdeviceptr dptr); | CUresult CUDAAPI cuMemFree(CUdeviceptr dptr); | |||
CUresult CUDAAPI cuMemGetAddressRange( CUdeviceptr *pbase, unsigned int *psize, CUdeviceptr dptr ); | CUresult CUDAAPI cuMemGetAddressRange( CUdeviceptr *pbase, unsigned int *psize, CUdeviceptr dptr ); | |||
CUresult CUDAAPI cuMemAllocHost(void **pp, unsigned int bytesize); | CUresult CUDAAPI cuMemAllocHost(void **pp, unsigned int bytesize); | |||
CUresult CUDAAPI cuMemFreeHost(void *p); | CUresult CUDAAPI cuMemFreeHost(void *p); | |||
CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize, unsigned in t Flags ); | CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize, unsigned in t Flags ); | |||
CUresult CUDAAPI cuMemHostGetDevicePointer( CUdeviceptr *pdptr, void *p , unsigned int Flags ); | CUresult CUDAAPI cuMemHostGetDevicePointer( CUdeviceptr *pdptr, void *p , unsigned int Flags ); | |||
CUresult CUDAAPI cuMemHostGetFlags( unsigned int *pFlags, void *p ); | ||||
/************************************ | /************************************ | |||
** | ** | |||
** Synchronous Memcpy | ** Synchronous Memcpy | |||
** | ** | |||
** Intra-device memcpy's done with these functions may execute in para llel with the CPU, | ** Intra-device memcpy's done with these functions may execute in para llel with the CPU, | |||
** but if host memory is involved, they wait until the copy is done be fore returning. | ** but if host memory is involved, they wait until the copy is done be fore returning. | |||
** | ** | |||
***********************************/ | ***********************************/ | |||
End of changes. 3 change blocks. | ||||
2 lines changed or deleted | 5 lines changed or added | |||
cudaGL.h | cudaGL.h | |||
---|---|---|---|---|
skipping to change at line 43 | skipping to change at line 43 | |||
* the above Disclaimer and U.S. Government End Users Notice. | * the above Disclaimer and U.S. Government End Users Notice. | |||
*/ | */ | |||
#ifndef CUDAGL_H | #ifndef CUDAGL_H | |||
#define CUDAGL_H | #define CUDAGL_H | |||
#ifdef __cplusplus | #ifdef __cplusplus | |||
extern "C" { | extern "C" { | |||
#endif | #endif | |||
// | ||||
// Flags to map or unmap a resource | ||||
// | ||||
typedef enum CUGLmap_flags_enum { | ||||
CU_GL_MAP_RESOURCE_FLAGS_NONE = 0x00, | ||||
CU_GL_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01, | ||||
CU_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02, | ||||
} CUGLmap_flags; | ||||
CUresult CUDAAPI cuGLInit(void); | CUresult CUDAAPI cuGLInit(void); | |||
CUresult CUDAAPI cuGLCtxCreate( CUcontext *pCtx, unsigned int Flags, CUdevi ce device ); | CUresult CUDAAPI cuGLCtxCreate( CUcontext *pCtx, unsigned int Flags, CUdevi ce device ); | |||
CUresult CUDAAPI cuGLRegisterBufferObject( GLuint bufferobj ); | CUresult CUDAAPI cuGLRegisterBufferObject( GLuint bufferobj ); | |||
CUresult CUDAAPI cuGLMapBufferObject( CUdeviceptr *dptr, unsigned int *size , GLuint bufferobj ); | CUresult CUDAAPI cuGLMapBufferObject( CUdeviceptr *dptr, unsigned int *size , GLuint bufferobj ); | |||
CUresult CUDAAPI cuGLUnmapBufferObject( GLuint bufferobj ); | CUresult CUDAAPI cuGLUnmapBufferObject( GLuint bufferobj ); | |||
CUresult CUDAAPI cuGLUnregisterBufferObject( GLuint bufferobj ); | CUresult CUDAAPI cuGLUnregisterBufferObject( GLuint bufferobj ); | |||
CUresult CUDAAPI cuGLSetBufferObjectMapFlags( GLuint bufferobj, unsigned in | ||||
t Flags ); | ||||
CUresult CUDAAPI cuGLMapBufferObjectAsync( CUdeviceptr *dptr, unsigned int | ||||
*size, GLuint bufferobj, CUstream hStream ); | ||||
CUresult CUDAAPI cuGLUnmapBufferObjectAsync( GLuint bufferobj, CUstream hSt | ||||
ream ); | ||||
#if defined(_WIN32) | #if defined(_WIN32) | |||
#if !defined(WGL_NV_gpu_affinity) | #if !defined(WGL_NV_gpu_affinity) | |||
typedef void* HGPUNV; | typedef void* HGPUNV; | |||
#endif | #endif | |||
CUresult CUDAAPI cuWGLGetDevice( CUdevice *pDevice, HGPUNV hGpu ); | CUresult CUDAAPI cuWGLGetDevice( CUdevice *pDevice, HGPUNV hGpu ); | |||
#endif | #endif | |||
#ifdef __cplusplus | #ifdef __cplusplus | |||
}; | }; | |||
#endif | #endif | |||
End of changes. 2 change blocks. | ||||
0 lines changed or deleted | 16 lines changed or added | |||
cuda_gl_interop.h | cuda_gl_interop.h | |||
---|---|---|---|---|
skipping to change at line 67 | skipping to change at line 67 | |||
#if defined(__cplusplus) | #if defined(__cplusplus) | |||
extern "C" { | extern "C" { | |||
#endif /* __cplusplus */ | #endif /* __cplusplus */ | |||
/************************************************************************** ***** | /************************************************************************** ***** | |||
* * | * * | |||
* * | * * | |||
* * | * * | |||
*************************************************************************** ****/ | *************************************************************************** ****/ | |||
/** | ||||
* CUDA GL Map Flags | ||||
*/ | ||||
enum cudaGLMapFlags | ||||
{ | ||||
cudaGLMapFlagsNone = 0, ///< Default; Assume resource can be rea | ||||
d/written | ||||
cudaGLMapFlagsReadOnly = 1, ///< CUDA kernels will not write to this | ||||
resource | ||||
cudaGLMapFlagsWriteDiscard = 2, ///< CUDA kernels will only write to and | ||||
will not read from this resource | ||||
}; | ||||
extern __host__ cudaError_t CUDARTAPI cudaGLSetGLDevice(int device); | extern __host__ cudaError_t CUDARTAPI cudaGLSetGLDevice(int device); | |||
extern __host__ cudaError_t CUDARTAPI cudaGLRegisterBufferObject(GLuint buf Obj); | extern __host__ cudaError_t CUDARTAPI cudaGLRegisterBufferObject(GLuint buf Obj); | |||
extern __host__ cudaError_t CUDARTAPI cudaGLMapBufferObject(void **devPtr, GLuint bufObj); | extern __host__ cudaError_t CUDARTAPI cudaGLMapBufferObject(void **devPtr, GLuint bufObj); | |||
extern __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObject(GLuint bufObj ); | extern __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObject(GLuint bufObj ); | |||
extern __host__ cudaError_t CUDARTAPI cudaGLUnregisterBufferObject(GLuint b ufObj); | extern __host__ cudaError_t CUDARTAPI cudaGLUnregisterBufferObject(GLuint b ufObj); | |||
extern __host__ cudaError_t CUDARTAPI cudaGLSetBufferObjectMapFlags(GLuint | ||||
bufObj, unsigned int flags); | ||||
extern __host__ cudaError_t CUDARTAPI cudaGLMapBufferObjectAsync(void **dev | ||||
Ptr, GLuint bufObj, cudaStream_t stream); | ||||
extern __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObjectAsync(GLuint b | ||||
ufObj, cudaStream_t stream); | ||||
#ifdef _WIN32 | #ifdef _WIN32 | |||
#ifndef WGL_NV_gpu_affinity | #ifndef WGL_NV_gpu_affinity | |||
typedef void* HGPUNV; | typedef void* HGPUNV; | |||
#endif | #endif | |||
extern __host__ cudaError_t CUDARTAPI cudaWGLGetDevice(int *device, HGPUNV hGpu); | extern __host__ cudaError_t CUDARTAPI cudaWGLGetDevice(int *device, HGPUNV hGpu); | |||
#endif | #endif | |||
#if defined(__cplusplus) | #if defined(__cplusplus) | |||
} | } | |||
#endif /* __cplusplus */ | #endif /* __cplusplus */ | |||
End of changes. 2 change blocks. | ||||
0 lines changed or deleted | 21 lines changed or added | |||
cuda_runtime.h | cuda_runtime.h | |||
---|---|---|---|---|
skipping to change at line 94 | skipping to change at line 94 | |||
* \ref ::cudaSetupArgument(T,size_t) "cudaSetupArgument()" must be precede d | * \ref ::cudaSetupArgument(T,size_t) "cudaSetupArgument()" must be precede d | |||
* by a call to ::cudaConfigureCall(). | * by a call to ::cudaConfigureCall(). | |||
* | * | |||
* \param arg - Argument to push for a kernel launch | * \param arg - Argument to push for a kernel launch | |||
* \param offset - Offset in argument stack to push new arg | * \param offset - Offset in argument stack to push new arg | |||
* | * | |||
* \return | * \return | |||
* ::cudaSuccess | * ::cudaSuccess | |||
* \notefnerr | * \notefnerr | |||
* | * | |||
* \sa \ref ::cudaLaunch(T*) "cudaLaunch (C++ API)", | * \sa ::cudaConfigureCall, | |||
* \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGe | ||||
tAttributes (C++ API)", | ||||
* \ref ::cudaLaunch(T*) "cudaLaunch (C++ API)", | ||||
* ::cudaSetDoubleForDevice, | ||||
* ::cudaSetDoubleForHost, | ||||
* \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument (C API)" | * \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument (C API)" | |||
* ::cudaConfigureCall | ||||
*/ | */ | |||
template<class T> | template<class T> | |||
__inline__ __host__ cudaError_t cudaSetupArgument( | __inline__ __host__ cudaError_t cudaSetupArgument( | |||
T arg, | T arg, | |||
size_t offset | size_t offset | |||
) | ) | |||
{ | { | |||
return cudaSetupArgument((const void*)&arg, sizeof(T), offset); | return cudaSetupArgument((const void*)&arg, sizeof(T), offset); | |||
} | } | |||
template<class T> | ||||
__inline__ __host__ cudaError_t cudaHostAlloc( | ||||
T **ptr, | ||||
size_t size, | ||||
unsigned int flags | ||||
) | ||||
{ | ||||
return cudaHostAlloc((void**)(void*)ptr, size, flags); | ||||
} | ||||
template<class T> | ||||
__inline__ __host__ cudaError_t cudaHostGetDevicePointer( | ||||
T **pDevice, | ||||
void *pHost, | ||||
unsigned int flags | ||||
) | ||||
{ | ||||
return cudaHostGetDevicePointer((void**)(void*)pDevice, pHost, flags); | ||||
} | ||||
template<class T> | ||||
__inline__ __host__ cudaError_t cudaMalloc( | ||||
T **devPtr, | ||||
size_t size | ||||
) | ||||
{ | ||||
return cudaMalloc((void**)(void*)devPtr, size); | ||||
} | ||||
template<class T> | ||||
__inline__ __host__ cudaError_t cudaMallocHost( | ||||
T **ptr, | ||||
size_t size | ||||
) | ||||
{ | ||||
return cudaMallocHost((void**)(void*)ptr, size); | ||||
} | ||||
template<class T> | ||||
__inline__ __host__ cudaError_t cudaMallocPitch( | ||||
T **devPtr, | ||||
size_t *pitch, | ||||
size_t width, | ||||
size_t height | ||||
) | ||||
{ | ||||
return cudaMallocPitch((void**)(void*)devPtr, pitch, width, height); | ||||
} | ||||
#if defined(__CUDACC__) | #if defined(__CUDACC__) | |||
/************************************************************************** ***** | /************************************************************************** ***** | |||
* * | * * | |||
* * | * * | |||
* * | * * | |||
*************************************************************************** ****/ | *************************************************************************** ****/ | |||
/** | /** | |||
* \addtogroup CUDART_HIGHLEVEL | * \addtogroup CUDART_HIGHLEVEL | |||
skipping to change at line 305 | skipping to change at line 357 | |||
*/ | */ | |||
template<class T> | template<class T> | |||
__inline__ __host__ cudaError_t cudaGetSymbolSize( | __inline__ __host__ cudaError_t cudaGetSymbolSize( | |||
size_t *size, | size_t *size, | |||
const T &symbol | const T &symbol | |||
) | ) | |||
{ | { | |||
return cudaGetSymbolSize(size, (const char*)&symbol); | return cudaGetSymbolSize(size, (const char*)&symbol); | |||
} | } | |||
/** @} */ /* END CUDART_MEMORY */ | /** @} */ /* END CUDART_HIGHLEVEL */ | |||
/************************************************************************** ***** | /************************************************************************** ***** | |||
* * | * * | |||
* * | * * | |||
* * | * * | |||
*************************************************************************** ****/ | *************************************************************************** ****/ | |||
/** | /** | |||
* \addtogroup CUDART_HIGHLEVEL | * \addtogroup CUDART_HIGHLEVEL | |||
* | * | |||
skipping to change at line 628 | skipping to change at line 680 | |||
/************************************************************************** ***** | /************************************************************************** ***** | |||
* * | * * | |||
* * | * * | |||
* * | * * | |||
*************************************************************************** ****/ | *************************************************************************** ****/ | |||
/** | /** | |||
* \ingroup CUDART_HIGHLEVEL | * \ingroup CUDART_HIGHLEVEL | |||
* \brief \hl Launches a device function | * \brief \hl Launches a device function | |||
* | * | |||
* Launches the function \p entry on the device. \p entry can either be a | * Launches the function \p entry on the device. The parameter \p entry can | |||
* function that executes on the device, or it can be a character string, | * either be a function that executes on the device, or it can be a charact | |||
* naming a function that executes on the device. \p entry must be declared | er | |||
as | * string, naming a function that executes on the device. The parameter | |||
* a \p __global__ function. | * specified by \p entry must be declared as a \p __global__ function. | |||
* \ref ::cudaLaunch(T*) "cudaLaunch()" must be preceded by a call to | * \ref ::cudaLaunch(T*) "cudaLaunch()" must be preceded by a call to | |||
* ::cudaConfigureCall() since it pops the data that was pushed by | * ::cudaConfigureCall() since it pops the data that was pushed by | |||
* ::cudaConfigureCall() from the execution stack. | * ::cudaConfigureCall() from the execution stack. | |||
* | * | |||
* \param entry - Device function pointer or char string naming device func tion | * \param entry - Device function pointer or char string naming device func tion | |||
* to execute | * to execute | |||
* | * | |||
* \return | * \return | |||
* ::cudaSuccess, | * ::cudaSuccess, | |||
* ::cudaErrorInvalidDeviceFunction, | * ::cudaErrorInvalidDeviceFunction, | |||
* ::cudaErrorInvalidConfiguration | * ::cudaErrorInvalidConfiguration, | |||
* ::cudaErrorLaunchFailure, | ||||
* ::cudaErrorPriorLaunchFailure, | ||||
* ::cudaErrorLaunchTimeout, | ||||
* ::cudaErrorLaunchOutOfResources | ||||
* \notefnerr | * \notefnerr | |||
* | * | |||
* \sa ::cudaConfigureCall, | * \sa ::cudaConfigureCall, | |||
* \ref ::cudaSetupArgument(T,size_t) "cudaSetupArgument (C++ API)", | * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGe | |||
* \ref ::cudaLaunch(const char*) "cudaLaunch (C API)" | tAttributes (C++ API)", | |||
* \ref ::cudaLaunch(const char*) "cudaLaunch (C API)", | ||||
* ::cudaSetDoubleForDevice, | ||||
* ::cudaSetDoubleForHost, | ||||
* \ref ::cudaSetupArgument(T,size_t) "cudaSetupArgument (C++ API)" | ||||
*/ | */ | |||
template<class T> | template<class T> | |||
__inline__ __host__ cudaError_t cudaLaunch( | __inline__ __host__ cudaError_t cudaLaunch( | |||
T *entry | T *entry | |||
) | ) | |||
{ | { | |||
return cudaLaunch((const char*)entry); | return cudaLaunch((const char*)entry); | |||
} | } | |||
/** | ||||
* \ingroup CUDART_HIGHLEVEL | ||||
* \brief \hl Find out attributes for a given function | ||||
* | ||||
* This function obtains the attributes of a function specified via \p entr | ||||
y. | ||||
* The parameter \p entry can either be a function that executes on the | ||||
* device, or it can be a character string, naming a function that executes | ||||
on | ||||
* the device. The parameter specified by \p entry must be declared as a | ||||
* \p __global__ function. The fetched attributes are placed in \p attr. If | ||||
* the specified function does not exist, then ::cudaErrorInvalidDeviceFunc | ||||
tion | ||||
* is returned. | ||||
* | ||||
* \param attr - Return pointer to function's attributes | ||||
* \param entry - Function to get attributes of | ||||
* | ||||
* \return | ||||
* ::cudaSuccess, | ||||
* ::cudaErrorInitializationError, | ||||
* ::cudaErrorInvalidDeviceFunction | ||||
* \notefnerr | ||||
* | ||||
* \sa ::cudaConfigureCall, | ||||
* \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const char*) "c | ||||
udaFuncGetAttributes (C API)", | ||||
* \ref ::cudaLaunch(T*) "cudaLaunch (C++ API)", | ||||
* ::cudaSetDoubleForDevice, | ||||
* ::cudaSetDoubleForHost, | ||||
* \ref ::cudaSetupArgument(T,size_t) "cudaSetupArgument (C++ API)" | ||||
*/ | ||||
template<class T> | ||||
__inline__ __host__ cudaError_t cudaFuncGetAttributes( | ||||
struct cudaFuncAttributes *attr, | ||||
T *entry | ||||
) | ||||
{ | ||||
return cudaFuncGetAttributes(attr, (const char*)entry); | ||||
} | ||||
#endif /* __CUDACC__ */ | #endif /* __CUDACC__ */ | |||
#endif /* __cplusplus */ | #endif /* __cplusplus */ | |||
#endif /* !__CUDA_RUNTIME_H__ */ | #endif /* !__CUDA_RUNTIME_H__ */ | |||
End of changes. 8 change blocks. | ||||
11 lines changed or deleted | 113 lines changed or added | |||
cuda_runtime_api.h | cuda_runtime_api.h | |||
---|---|---|---|---|
skipping to change at line 41 | skipping to change at line 41 | |||
* Any use of this source code in individual and commercial software must | * Any use of this source code in individual and commercial software must | |||
* include, in the user documentation and internal comments to the code, | * include, in the user documentation and internal comments to the code, | |||
* the above Disclaimer and U.S. Government End Users Notice. | * the above Disclaimer and U.S. Government End Users Notice. | |||
*/ | */ | |||
#if !defined(__CUDA_RUNTIME_API_H__) | #if !defined(__CUDA_RUNTIME_API_H__) | |||
#define __CUDA_RUNTIME_API_H__ | #define __CUDA_RUNTIME_API_H__ | |||
/************************************************************************** ***** | /************************************************************************** ***** | |||
* * | * * | |||
* CUDA Runtime API Version 2.2 * | * CUDA Runtime API Version 2.3 * | |||
* * | * * | |||
*************************************************************************** ****/ | *************************************************************************** ****/ | |||
#define CUDART_VERSION 2020 | #define CUDART_VERSION 2030 | |||
/************************************************************************** ***** | /************************************************************************** ***** | |||
* * | * * | |||
* * | * * | |||
* * | * * | |||
*************************************************************************** ****/ | *************************************************************************** ****/ | |||
#include "host_defines.h" | #include "host_defines.h" | |||
#include "builtin_types.h" | #include "builtin_types.h" | |||
skipping to change at line 111 | skipping to change at line 111 | |||
extern __host__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size ); | extern __host__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size ); | |||
extern __host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t siz e); | extern __host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t siz e); | |||
extern __host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t height); | extern __host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t height); | |||
extern __host__ cudaError_t CUDARTAPI cudaMallocArray(struct cudaArray **ar ray, const struct cudaChannelFormatDesc *desc, size_t width, size_t height __dv(1)); | extern __host__ cudaError_t CUDARTAPI cudaMallocArray(struct cudaArray **ar ray, const struct cudaChannelFormatDesc *desc, size_t width, size_t height __dv(1)); | |||
extern __host__ cudaError_t CUDARTAPI cudaFree(void *devPtr); | extern __host__ cudaError_t CUDARTAPI cudaFree(void *devPtr); | |||
extern __host__ cudaError_t CUDARTAPI cudaFreeHost(void *ptr); | extern __host__ cudaError_t CUDARTAPI cudaFreeHost(void *ptr); | |||
extern __host__ cudaError_t CUDARTAPI cudaFreeArray(struct cudaArray *array ); | extern __host__ cudaError_t CUDARTAPI cudaFreeArray(struct cudaArray *array ); | |||
extern __host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t by tes, unsigned int flags); | extern __host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t by tes, unsigned int flags); | |||
extern __host__ cudaError_t CUDARTAPI cudaHostGetDevicePointer(void **pDevi ce, void *pHost, unsigned int flags); | extern __host__ cudaError_t CUDARTAPI cudaHostGetDevicePointer(void **pDevi ce, void *pHost, unsigned int flags); | |||
extern __host__ cudaError_t CUDARTAPI cudaHostGetFlags(unsigned int *pFlags , void *pHost); | ||||
/************************************************************************** ***** | /************************************************************************** ***** | |||
* * | * * | |||
* * | * * | |||
* * | * * | |||
*************************************************************************** ****/ | *************************************************************************** ****/ | |||
extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src , size_t count, enum cudaMemcpyKind kind); | extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src , size_t count, enum cudaMemcpyKind kind); | |||
extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArray(struct cudaArray *d st, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cud aMemcpyKind kind); | extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArray(struct cudaArray *d st, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cud aMemcpyKind kind); | |||
extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArray(void *dst, const struct cudaArray *src, size_t wOffset, size_t hOffset, size_t count, enum c udaMemcpyKind kind); | extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArray(void *dst, const struct cudaArray *src, size_t wOffset, size_t hOffset, size_t count, enum c udaMemcpyKind kind); | |||
End of changes. 3 change blocks. | ||||
2 lines changed or deleted | 3 lines changed or added | |||
cufft.h | cufft.h | |||
---|---|---|---|---|
skipping to change at line 45 | skipping to change at line 45 | |||
/* | /* | |||
* cufft.h | * cufft.h | |||
* Public header file for the NVIDIA Cuda FFT library (CUFFT) | * Public header file for the NVIDIA Cuda FFT library (CUFFT) | |||
*/ | */ | |||
#ifndef _CUFFT_H_ | #ifndef _CUFFT_H_ | |||
#define _CUFFT_H_ | #define _CUFFT_H_ | |||
#include <stdio.h> | #include <stdio.h> | |||
#include "cuComplex.h" | #include "cuComplex.h" | |||
#include "driver_types.h" | ||||
#ifndef CUFFTAPI | #ifndef CUFFTAPI | |||
#ifdef _WIN32 | #ifdef _WIN32 | |||
#define CUFFTAPI __stdcall | #define CUFFTAPI __stdcall | |||
#else | #else | |||
#define CUFFTAPI | #define CUFFTAPI | |||
#endif | #endif | |||
#endif | #endif | |||
#ifdef __cplusplus | #ifdef __cplusplus | |||
skipping to change at line 77 | skipping to change at line 78 | |||
CUFFT_SETUP_FAILED = 0x7, | CUFFT_SETUP_FAILED = 0x7, | |||
CUFFT_INVALID_SIZE = 0x8 | CUFFT_INVALID_SIZE = 0x8 | |||
} cufftResult; | } cufftResult; | |||
// CUFFT defines and supports the following data types | // CUFFT defines and supports the following data types | |||
// cufftHandle is a handle type used to store and access CUFFT plans. | // cufftHandle is a handle type used to store and access CUFFT plans. | |||
typedef unsigned int cufftHandle; | typedef unsigned int cufftHandle; | |||
// cufftReal is a single-precision, floating-point real data type. | // cufftReal is a single-precision, floating-point real data type. | |||
// cufftDoubleReal is a double-precision, real data type. | ||||
typedef float cufftReal; | typedef float cufftReal; | |||
typedef double cufftDoubleReal; | ||||
// cufftComplex is a single-precision, floating-point complex data type tha t | // cufftComplex is a single-precision, floating-point complex data type tha t | |||
// consists of interleaved real and imaginary components. | // consists of interleaved real and imaginary components. | |||
// typedef float cufftComplex[2]; | // cufftDoubleComplex is the double-precision equivalent. | |||
typedef cuComplex cufftComplex; | typedef cuComplex cufftComplex; | |||
typedef cuDoubleComplex cufftDoubleComplex; | ||||
// CUFFT transform directions | // CUFFT transform directions | |||
#define CUFFT_FORWARD -1 // Forward FFT | #define CUFFT_FORWARD -1 // Forward FFT | |||
#define CUFFT_INVERSE 1 // Inverse FFT | #define CUFFT_INVERSE 1 // Inverse FFT | |||
// CUFFT supports the following transform types | // CUFFT supports the following transform types | |||
typedef enum cufftType_t { | typedef enum cufftType_t { | |||
CUFFT_R2C = 0x2a, // Real to Complex (interleaved) | CUFFT_R2C = 0x2a, // Real to Complex (interleaved) | |||
CUFFT_C2R = 0x2c, // Complex (interleaved) to Real | CUFFT_C2R = 0x2c, // Complex (interleaved) to Real | |||
CUFFT_C2C = 0x29 // Complex to Complex, interleaved | CUFFT_C2C = 0x29, // Complex to Complex, interleaved | |||
CUFFT_D2Z = 0x6a, // Double to Double-Complex | ||||
CUFFT_Z2D = 0x6c, // Double-Complex to Double | ||||
CUFFT_Z2Z = 0x69 // Double-Complex to Double-Complex | ||||
} cufftType; | } cufftType; | |||
cufftResult CUFFTAPI cufftPlan1d(cufftHandle *plan, | cufftResult CUFFTAPI cufftPlan1d(cufftHandle *plan, | |||
int nx, | int nx, | |||
cufftType type, | cufftType type, | |||
int batch); | int batch); | |||
cufftResult CUFFTAPI cufftPlan2d(cufftHandle *plan, | cufftResult CUFFTAPI cufftPlan2d(cufftHandle *plan, | |||
int nx, int ny, | int nx, int ny, | |||
cufftType type); | cufftType type); | |||
skipping to change at line 123 | skipping to change at line 130 | |||
int direction); | int direction); | |||
cufftResult CUFFTAPI cufftExecR2C(cufftHandle plan, | cufftResult CUFFTAPI cufftExecR2C(cufftHandle plan, | |||
cufftReal *idata, | cufftReal *idata, | |||
cufftComplex *odata); | cufftComplex *odata); | |||
cufftResult CUFFTAPI cufftExecC2R(cufftHandle plan, | cufftResult CUFFTAPI cufftExecC2R(cufftHandle plan, | |||
cufftComplex *idata, | cufftComplex *idata, | |||
cufftReal *odata); | cufftReal *odata); | |||
cufftResult CUFFTAPI cufftExecZ2Z(cufftHandle plan, | ||||
cufftDoubleComplex *idata, | ||||
cufftDoubleComplex *odata, | ||||
int direction); | ||||
cufftResult CUFFTAPI cufftExecD2Z(cufftHandle plan, | ||||
cufftDoubleReal *idata, | ||||
cufftDoubleComplex *odata); | ||||
cufftResult CUFFTAPI cufftExecZ2D(cufftHandle plan, | ||||
cufftDoubleComplex *idata, | ||||
cufftDoubleReal *odata); | ||||
cufftResult CUFFTAPI cufftSetStream(cufftHandle p, | ||||
cudaStream_t stream); | ||||
#ifdef __cplusplus | #ifdef __cplusplus | |||
} | } | |||
#endif | #endif | |||
#endif /* _CUFFT_H_ */ | #endif /* _CUFFT_H_ */ | |||
End of changes. 7 change blocks. | ||||
4 lines changed or deleted | 27 lines changed or added | |||
device_functions.h | device_functions.h | |||
---|---|---|---|---|
skipping to change at line 139 | skipping to change at line 139 | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __device__ float __log10f(float) __THROW; | extern __device__ float __log10f(float) __THROW; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __device__ float __logf(float) __THROW; | extern __device__ float __logf(float) __THROW; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __device__ float __powf(float, float) __THROW; | extern __device__ float __powf(float, float) __THROW; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __device__ int __float2int_rn(float); | ||||
/*DEVICE_BUILTIN*/ | ||||
extern __device__ int __float2int_rz(float); | extern __device__ int __float2int_rz(float); | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __device__ int __float2int_ru(float); | extern __device__ int __float2int_ru(float); | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __device__ int __float2int_rd(float); | extern __device__ int __float2int_rd(float); | |||
/*DEVICE_BUILTIN*/ | ||||
extern __device__ int __float2int_rn(float); | ||||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __device__ unsigned int __float2uint_rn(float); | ||||
/*DEVICE_BUILTIN*/ | ||||
extern __device__ unsigned int __float2uint_rz(float); | extern __device__ unsigned int __float2uint_rz(float); | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __device__ unsigned int __float2uint_ru(float); | extern __device__ unsigned int __float2uint_ru(float); | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __device__ unsigned int __float2uint_rd(float); | extern __device__ unsigned int __float2uint_rd(float); | |||
/*DEVICE_BUILTIN*/ | ||||
extern __device__ unsigned int __float2uint_rn(float); | ||||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __device__ float __int2float_rn(int); | ||||
/*DEVICE_BUILTIN*/ | ||||
extern __device__ float __int2float_rz(int); | extern __device__ float __int2float_rz(int); | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __device__ float __int2float_ru(int); | extern __device__ float __int2float_ru(int); | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __device__ float __int2float_rd(int); | extern __device__ float __int2float_rd(int); | |||
/*DEVICE_BUILTIN*/ | ||||
extern __device__ float __int2float_rn(int); | ||||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __device__ float __uint2float_rn(unsigned int); | ||||
/*DEVICE_BUILTIN*/ | ||||
extern __device__ float __uint2float_rz(unsigned int); | extern __device__ float __uint2float_rz(unsigned int); | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __device__ float __uint2float_ru(unsigned int); | extern __device__ float __uint2float_ru(unsigned int); | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __device__ float __uint2float_rd(unsigned int); | extern __device__ float __uint2float_rd(unsigned int); | |||
/*DEVICE_BUILTIN*/ | ||||
extern __device__ float __uint2float_rn(unsigned int); | ||||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __device__ long long int __float2ll_rn(float); | ||||
/*DEVICE_BUILTIN*/ | ||||
extern __device__ long long int __float2ll_rz(float); | extern __device__ long long int __float2ll_rz(float); | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __device__ long long int __float2ll_rn(float); | extern __device__ long long int __float2ll_ru(float); | |||
/*DEVICE_BUILTIN*/ | ||||
extern __device__ long long int __float2ll_rd(float); | ||||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __device__ unsigned long long int __float2ull_rn(float); | ||||
/*DEVICE_BUILTIN*/ | ||||
extern __device__ unsigned long long int __float2ull_rz(float); | extern __device__ unsigned long long int __float2ull_rz(float); | |||
/*DEVICE_BUILTIN*/ | ||||
extern __device__ unsigned long long int __float2ull_ru(float); | ||||
/*DEVICE_BUILTIN*/ | ||||
extern __device__ unsigned long long int __float2ull_rd(float); | ||||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __device__ float __ll2float_rn(long long int); | extern __device__ float __ll2float_rn(long long int); | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __device__ float __ull2float_rn(unsigned long long int); | extern __device__ float __ull2float_rn(unsigned long long int); | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __device__ unsigned short __float2half_rn(float); | ||||
/*DEVICE_BUILTIN*/ | ||||
extern __device__ float __half2float(unsigned short); | ||||
/*DEVICE_BUILTIN*/ | ||||
extern __device__ float __fadd_rn(float, float); | extern __device__ float __fadd_rn(float, float); | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __device__ float __fadd_rz(float, float); | extern __device__ float __fadd_rz(float, float); | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __device__ float __fadd_ru(float, float); | extern __device__ float __fadd_ru(float, float); | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __device__ float __fadd_rd(float, float); | extern __device__ float __fadd_rd(float, float); | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __device__ float __fmul_rn(float, float); | extern __device__ float __fmul_rn(float, float); | |||
skipping to change at line 260 | skipping to change at line 275 | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __device__ int __clzll(long long int); | extern __device__ int __clzll(long long int); | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __device__ int __ffsll(long long int); | extern __device__ int __ffsll(long long int); | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __device__ int __popcll(unsigned long long int); | extern __device__ int __popcll(unsigned long long int); | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __device__ unsigned long long int __brevll(unsigned long long int); | extern __device__ unsigned long long int __brevll(unsigned long long int); | |||
#if !defined(CUDA_NO_SM_13_DOUBLE_INTRINSICS) | #if (__CUDA_ARCH__ >= 130) | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __device__ int __double2int_rz(double); | extern __device__ int __double2int_rz(double); | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __device__ unsigned int __double2uint_rz(double); | extern __device__ unsigned int __double2uint_rz(double); | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __device__ long long int __double2ll_rz(double); | extern __device__ long long int __double2ll_rz(double); | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __device__ unsigned long long int __double2ull_rz(double); | extern __device__ unsigned long long int __double2ull_rz(double); | |||
#endif /* ! CUDA_NO_SM_13_DOUBLE_INTRINSICS */ | #endif /* __CUDA_ARCH__ >= 130 */ | |||
} | } | |||
/************************************************************************** ***** | /************************************************************************** ***** | |||
* * | * * | |||
* * | * * | |||
* * | * * | |||
*************************************************************************** ****/ | *************************************************************************** ****/ | |||
static __inline__ __device__ int mulhi(int a, int b) | static __inline__ __device__ int mulhi(int a, int b) | |||
skipping to change at line 751 | skipping to change at line 766 | |||
inc = abs(rem0) < abs(rem1); | inc = abs(rem0) < abs(rem1); | |||
resi = ((expo_res << 23) + r + inc); | resi = ((expo_res << 23) + r + inc); | |||
if (resi != 0x00800000) resi = 0; | if (resi != 0x00800000) resi = 0; | |||
return __int_as_float(sign | resi); | return __int_as_float(sign | resi); | |||
} | } | |||
} | } | |||
if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | |||
divisor *= 0.25f; | divisor *= 0.25f; | |||
dividend *= 0.25f; | dividend *= 0.25f; | |||
} | } | |||
return dividend / divisor; | return __fdividef (dividend, divisor); | |||
} | } | |||
__device_func__(float __fdiv_rz (float dividend, float divisor)) | __device_func__(float __fdiv_rz (float dividend, float divisor)) | |||
{ | { | |||
unsigned long long prod; | unsigned long long prod; | |||
unsigned r, f, x, y, expox, expoy, sign; | unsigned r, f, x, y, expox, expoy, sign; | |||
unsigned expo_res; | unsigned expo_res; | |||
unsigned resi, cvtxi, cvtyi; | unsigned resi, cvtxi, cvtyi; | |||
float t; | float t; | |||
skipping to change at line 819 | skipping to change at line 834 | |||
if (rem1 < 0) r--; | if (rem1 < 0) r--; | |||
resi = ((expo_res << 23) + r); | resi = ((expo_res << 23) + r); | |||
if (resi != 0x00800000) resi = 0; | if (resi != 0x00800000) resi = 0; | |||
return __int_as_float(sign | resi); | return __int_as_float(sign | resi); | |||
} | } | |||
} | } | |||
if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | |||
divisor *= 0.25f; | divisor *= 0.25f; | |||
dividend *= 0.25f; | dividend *= 0.25f; | |||
} | } | |||
return dividend / divisor; | return __fdividef (dividend, divisor); | |||
} | } | |||
__device_func__(float __fdiv_ru (float dividend, float divisor)) | __device_func__(float __fdiv_ru (float dividend, float divisor)) | |||
{ | { | |||
unsigned long long prod; | unsigned long long prod; | |||
unsigned r, f, x, y, expox, expoy, sign; | unsigned r, f, x, y, expox, expoy, sign; | |||
unsigned expo_res; | unsigned expo_res; | |||
unsigned resi, cvtxi, cvtyi; | unsigned resi, cvtxi, cvtyi; | |||
float t; | float t; | |||
skipping to change at line 889 | skipping to change at line 904 | |||
if ((rem1 > 0) && (!sign)) r++; | if ((rem1 > 0) && (!sign)) r++; | |||
resi = ((expo_res << 23) + r); | resi = ((expo_res << 23) + r); | |||
if (resi != 0x00800000) resi = 0; | if (resi != 0x00800000) resi = 0; | |||
return __int_as_float(sign | resi); | return __int_as_float(sign | resi); | |||
} | } | |||
} | } | |||
if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | |||
divisor *= 0.25f; | divisor *= 0.25f; | |||
dividend *= 0.25f; | dividend *= 0.25f; | |||
} | } | |||
return dividend / divisor; | return __fdividef (dividend, divisor); | |||
} | } | |||
__device_func__(float __fdiv_rd (float dividend, float divisor)) | __device_func__(float __fdiv_rd (float dividend, float divisor)) | |||
{ | { | |||
unsigned long long prod; | unsigned long long prod; | |||
unsigned r, f, x, y, expox, expoy, sign; | unsigned r, f, x, y, expox, expoy, sign; | |||
unsigned expo_res; | unsigned expo_res; | |||
unsigned resi, cvtxi, cvtyi; | unsigned resi, cvtxi, cvtyi; | |||
float t; | float t; | |||
skipping to change at line 959 | skipping to change at line 974 | |||
if ((rem1 > 0) && (sign)) r++; | if ((rem1 > 0) && (sign)) r++; | |||
resi = ((expo_res << 23) + r); | resi = ((expo_res << 23) + r); | |||
if (resi != 0x00800000) resi = 0; | if (resi != 0x00800000) resi = 0; | |||
return __int_as_float(sign | resi); | return __int_as_float(sign | resi); | |||
} | } | |||
} | } | |||
if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { | |||
divisor *= 0.25f; | divisor *= 0.25f; | |||
dividend *= 0.25f; | dividend *= 0.25f; | |||
} | } | |||
return dividend / divisor; | return __fdividef (dividend, divisor); | |||
} | } | |||
__device_func__(float __fadd_ru (float a, float b)) | __device_func__(float __fadd_ru (float a, float b)) | |||
{ | { | |||
unsigned int expo_x, expo_y; | unsigned int expo_x, expo_y; | |||
unsigned int xxi, yyi, temp; | unsigned int xxi, yyi, temp; | |||
xxi = __float_as_int(a); | xxi = __float_as_int(a); | |||
yyi = __float_as_int(b); | yyi = __float_as_int(b); | |||
skipping to change at line 2921 | skipping to change at line 2937 | |||
while ((int)cvty.i >= 0) { | while ((int)cvty.i >= 0) { | |||
expoy--; | expoy--; | |||
cvty.i = cvty.i + cvty.i; | cvty.i = cvty.i + cvty.i; | |||
} | } | |||
cvty.i = cvty.i >> 8; | cvty.i = cvty.i >> 8; | |||
} | } | |||
goto divide; | goto divide; | |||
} | } | |||
} | } | |||
__device_func__(float __internal_fmul_kernel2 (float a, float b, | __device_func__(float __internal_fmul_kernel (float a, float b, | |||
enum cudaRoundMode mode)) | enum cudaRoundMode mode)) | |||
{ | { | |||
unsigned long long product; | unsigned long long product; | |||
volatile union __cudart_FloatUintCvt xx, yy; | volatile union __cudart_FloatUintCvt xx, yy; | |||
unsigned expo_x, expo_y; | unsigned expo_x, expo_y; | |||
xx.f = a; | xx.f = a; | |||
yy.f = b; | yy.f = b; | |||
expo_y = 0xFF; | expo_y = 0xFF; | |||
expo_x = expo_y & (xx.i >> 23); | expo_x = expo_y & (xx.i >> 23); | |||
expo_x = expo_x - 1; | expo_x = expo_x - 1; | |||
expo_y = expo_y & (yy.i >> 23); | expo_y = expo_y & (yy.i >> 23); | |||
expo_y = expo_y - 1; | expo_y = expo_y - 1; | |||
skipping to change at line 3418 | skipping to change at line 3433 | |||
} else if (mode == cudaRoundPosInf) { | } else if (mode == cudaRoundPosInf) { | |||
xx = xx + (!expo_y && temp); | xx = xx + (!expo_y && temp); | |||
} else if (mode == cudaRoundMinInf) { | } else if (mode == cudaRoundMinInf) { | |||
xx = xx + (expo_y && temp); | xx = xx + (expo_y && temp); | |||
} | } | |||
xx = expo_y + xx; /* add in sign bit */ | xx = expo_y + xx; /* add in sign bit */ | |||
cvt.i = xx; | cvt.i = xx; | |||
return cvt.f; | return cvt.f; | |||
} | } | |||
/* NOTE: Does not currently support round-to-nearest, round-to-zero */ | __device_func__(float __internal_fadd_kernel (float a, float b, | |||
__device_func__(float __internal_fadd_kernel2 (float a, float b, | enum cudaRoundMode mode)) | |||
enum cudaRoundMode mode)) | ||||
{ | { | |||
volatile union __cudart_FloatUintCvt xx, yy; | volatile union __cudart_FloatUintCvt xx, yy; | |||
unsigned int expo_x; | unsigned int expo_x; | |||
unsigned int expo_y; | unsigned int expo_y; | |||
unsigned int temp; | unsigned int temp; | |||
xx.f = a; | xx.f = a; | |||
yy.f = b; | yy.f = b; | |||
/* make bigger operand the augend */ | /* make bigger operand the augend */ | |||
skipping to change at line 3466 | skipping to change at line 3480 | |||
if ((int)temp < 0) { | if ((int)temp < 0) { | |||
/* signs differ, effective subtraction */ | /* signs differ, effective subtraction */ | |||
temp = 32 - expo_y; | temp = 32 - expo_y; | |||
temp = (expo_y) ? (yy.i << temp) : 0; | temp = (expo_y) ? (yy.i << temp) : 0; | |||
temp = (unsigned)(-((int)temp)); | temp = (unsigned)(-((int)temp)); | |||
xx.i = xx.i - (yy.i >> expo_y) - (temp ? 1 : 0); | xx.i = xx.i - (yy.i >> expo_y) - (temp ? 1 : 0); | |||
if (xx.i & 0x00800000) { | if (xx.i & 0x00800000) { | |||
if (expo_x <= 0xFD) { | if (expo_x <= 0xFD) { | |||
xx.i = xx.i + (expo_x << 23); | xx.i = xx.i + (expo_x << 23); | |||
if (mode == cudaRoundMinInf) { | if (mode == cudaRoundNearest) { | |||
xx.i += (temp && (xx.i & 0x80000000)); | if (temp < 0x80000000) return xx.f; | |||
xx.i += ((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31)); | ||||
} else if (mode == cudaRoundZero) { | ||||
} else if (mode == cudaRoundPosInf) { | } else if (mode == cudaRoundPosInf) { | |||
xx.i += (temp && !(xx.i & 0x80000000)); | xx.i += (temp && !(xx.i & 0x80000000)); | |||
} else if (mode == cudaRoundMinInf) { | ||||
xx.i += (temp && (xx.i & 0x80000000)); | ||||
} | } | |||
return xx.f; | return xx.f; | |||
} | } | |||
} else { | } else { | |||
if ((temp | (xx.i << 1)) == 0) { | if ((temp | (xx.i << 1)) == 0) { | |||
/* operands cancelled, resulting in a clean zero */ | /* operands cancelled, resulting in a clean zero */ | |||
if (mode == cudaRoundMinInf) { | if (mode == cudaRoundMinInf) { | |||
xx.i = 0x80000000; | xx.i = 0x80000000; | |||
} else if (mode == cudaRoundPosInf) { | } else { | |||
xx.i = 0; | xx.i = 0; | |||
} | } | |||
return xx.f; | return xx.f; | |||
} | } | |||
/* normalize result */ | /* normalize result */ | |||
yy.i = xx.i & 0x80000000; | yy.i = xx.i & 0x80000000; | |||
do { | do { | |||
xx.i = (xx.i << 1) | (temp >> 31); | xx.i = (xx.i << 1) | (temp >> 31); | |||
temp <<= 1; | temp <<= 1; | |||
expo_x--; | expo_x--; | |||
skipping to change at line 3500 | skipping to change at line 3518 | |||
xx.i = xx.i | yy.i; | xx.i = xx.i | yy.i; | |||
} | } | |||
} else { | } else { | |||
/* signs are the same, effective addition */ | /* signs are the same, effective addition */ | |||
temp = 32 - expo_y; | temp = 32 - expo_y; | |||
temp = (expo_y) ? (yy.i << temp) : 0; | temp = (expo_y) ? (yy.i << temp) : 0; | |||
xx.i = xx.i + (yy.i >> expo_y); | xx.i = xx.i + (yy.i >> expo_y); | |||
if (!(xx.i & 0x01000000)) { | if (!(xx.i & 0x01000000)) { | |||
if (expo_x <= 0xFD) { | if (expo_x <= 0xFD) { | |||
xx.i = xx.i + (expo_x << 23); | xx.i = xx.i + (expo_x << 23); | |||
if (mode == cudaRoundMinInf) { | if (mode == cudaRoundNearest) { | |||
xx.i += (temp && (xx.i & 0x80000000)); | if (temp < 0x80000000) return xx.f; | |||
xx.i += ((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31)); | ||||
} else if (mode == cudaRoundZero) { | ||||
} else if (mode == cudaRoundPosInf) { | } else if (mode == cudaRoundPosInf) { | |||
xx.i += (temp && !(xx.i & 0x80000000)); | xx.i += (temp && !(xx.i & 0x80000000)); | |||
} else if (mode == cudaRoundMinInf) { | ||||
xx.i += (temp && (xx.i & 0x80000000)); | ||||
} | } | |||
return xx.f; | return xx.f; | |||
} | } | |||
} else { | } else { | |||
/* normalize result */ | /* normalize result */ | |||
temp = (xx.i << 31) | (temp >> 1); | temp = (xx.i << 31) | (temp >> 1); | |||
xx.i = ((xx.i & 0x80000000) | (xx.i >> 1)) & ~0x40000000; | xx.i = ((xx.i & 0x80000000) | (xx.i >> 1)) & ~0x40000000; | |||
expo_x++; | expo_x++; | |||
} | } | |||
} | } | |||
if (expo_x <= 0xFD) { | if (expo_x <= 0xFD) { | |||
if (mode == cudaRoundMinInf) { | xx.i = xx.i + (expo_x << 23); | |||
xx.i += (temp && (xx.i & 0x80000000)); | if (mode == cudaRoundNearest) { | |||
if (temp < 0x80000000) return xx.f; | ||||
xx.i += ((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31)); | ||||
} else if (mode == cudaRoundZero) { | ||||
} else if (mode == cudaRoundPosInf) { | } else if (mode == cudaRoundPosInf) { | |||
xx.i += (temp && !(xx.i & 0x80000000)); | xx.i += (temp && !(xx.i & 0x80000000)); | |||
} else if (mode == cudaRoundMinInf) { | ||||
xx.i += (temp && (xx.i & 0x80000000)); | ||||
} | } | |||
xx.i = xx.i + (expo_x << 23); | ||||
return xx.f; | return xx.f; | |||
} | } | |||
if ((int)expo_x >= 254) { | if ((int)expo_x >= 254) { | |||
/* overflow: return infinity or largest normal */ | /* overflow: return infinity or largest normal */ | |||
temp = xx.i & 0x80000000; | temp = xx.i & 0x80000000; | |||
if (mode == cudaRoundMinInf) { | if (mode == cudaRoundNearest) { | |||
xx.i = (temp) | 0x7f800000; | ||||
} else if (mode == cudaRoundZero) { | ||||
xx.i = (temp) | 0x7f7fffff; | ||||
} else if (mode == cudaRoundMinInf) { | ||||
xx.i = (temp ? 0xFF800000 : 0x7f7fffff); | xx.i = (temp ? 0xFF800000 : 0x7f7fffff); | |||
} else if (mode == cudaRoundPosInf) { | } else if (mode == cudaRoundPosInf) { | |||
xx.i = (temp ? 0xff7fffff : 0x7F800000); | xx.i = (temp ? 0xff7fffff : 0x7F800000); | |||
} | } | |||
return xx.f; | return xx.f; | |||
} | } | |||
/* underflow: denormal, or smallest normal */ | /* underflow: denormal, or smallest normal */ | |||
expo_y = expo_x + 32; | expo_y = expo_x + 32; | |||
yy.i = xx.i & 0x80000000; | yy.i = xx.i & 0x80000000; | |||
xx.i = xx.i & ~0xff000000; | xx.i = xx.i & ~0xff000000; | |||
expo_x = (unsigned)(-((int)expo_x)); | expo_x = (unsigned)(-((int)expo_x)); | |||
temp = xx.i << expo_y | ((temp) ? 1 : 0); | temp = xx.i << expo_y | ((temp) ? 1 : 0); | |||
xx.i = yy.i | (xx.i >> expo_x); | xx.i = yy.i | (xx.i >> expo_x); | |||
if (mode == cudaRoundMinInf) { | if (mode == cudaRoundNearest) { | |||
xx.i += (temp && yy.i); | xx.i += (temp == 0x80000000) ? (xx.i & 1) : (temp >> 31); | |||
} else if (mode == cudaRoundZero) { | ||||
} else if (mode == cudaRoundPosInf) { | } else if (mode == cudaRoundPosInf) { | |||
xx.i += (temp && !yy.i); | xx.i += (temp && !yy.i); | |||
} else if (mode == cudaRoundMinInf) { | ||||
xx.i += (temp && yy.i); | ||||
} | } | |||
return xx.f; | return xx.f; | |||
} else { | } else { | |||
/* handle special cases separately */ | /* handle special cases separately */ | |||
if (!(yy.i << 1)) { | if (!(yy.i << 1)) { | |||
if (mode == cudaRoundMinInf) { | if (mode == cudaRoundMinInf) { | |||
if (!(xx.i << 1)) { | if (!(xx.i << 1)) { | |||
xx.i = xx.i | yy.i; | xx.i = xx.i | yy.i; | |||
} | } | |||
} else if (mode == cudaRoundPosInf) { | } else { | |||
if (xx.i == 0x80000000) { | if (xx.i == 0x80000000) { | |||
xx.i = yy.i; | xx.i = yy.i; | |||
} | } | |||
} | } | |||
if ((xx.i << 1) > 0xff000000) { | if ((xx.i << 1) > 0xff000000) { | |||
xx.i |= 0x00400000; | xx.i |= 0x00400000; | |||
} | } | |||
return xx.f; | return xx.f; | |||
} | } | |||
if ((expo_y != 254) && (expo_x != 254)) { | if ((expo_y != 254) && (expo_x != 254)) { | |||
skipping to change at line 3680 | skipping to change at line 3713 | |||
return __internal_fdiv_kernel (a, b, cudaRoundMinInf); | return __internal_fdiv_kernel (a, b, cudaRoundMinInf); | |||
} | } | |||
__device_func__(float __fdiv_ru (float a, float b)) | __device_func__(float __fdiv_ru (float a, float b)) | |||
{ | { | |||
return __internal_fdiv_kernel (a, b, cudaRoundPosInf); | return __internal_fdiv_kernel (a, b, cudaRoundPosInf); | |||
} | } | |||
__device_func__(float __fadd_rd (float a, float b)) | __device_func__(float __fadd_rd (float a, float b)) | |||
{ | { | |||
return __internal_fadd_kernel2 (a, b, cudaRoundMinInf); | return __internal_fadd_kernel (a, b, cudaRoundMinInf); | |||
} | } | |||
__device_func__(float __fadd_ru (float a, float b)) | __device_func__(float __fadd_ru (float a, float b)) | |||
{ | { | |||
return __internal_fadd_kernel2 (a, b, cudaRoundPosInf); | return __internal_fadd_kernel (a, b, cudaRoundPosInf); | |||
} | } | |||
__device_func__(float __fmul_rd (float a, float b)) | __device_func__(float __fmul_rd (float a, float b)) | |||
{ | { | |||
return __internal_fmul_kernel2 (a, b, cudaRoundMinInf); | return __internal_fmul_kernel (a, b, cudaRoundMinInf); | |||
} | } | |||
__device_func__(float __fmul_ru (float a, float b)) | __device_func__(float __fmul_ru (float a, float b)) | |||
{ | { | |||
return __internal_fmul_kernel2 (a, b, cudaRoundPosInf); | return __internal_fmul_kernel (a, b, cudaRoundPosInf); | |||
} | } | |||
__device_func__(float __fmaf_rn (float a, float b, float c)) | __device_func__(float __fmaf_rn (float a, float b, float c)) | |||
{ | { | |||
return __internal_fmaf_kernel (a, b, c, cudaRoundNearest); | return __internal_fmaf_kernel (a, b, c, cudaRoundNearest); | |||
} | } | |||
__device_func__(float __fmaf_rz (float a, float b, float c)) | __device_func__(float __fmaf_rz (float a, float b, float c)) | |||
{ | { | |||
return __internal_fmaf_kernel (a, b, c, cudaRoundZero); | return __internal_fmaf_kernel (a, b, c, cudaRoundZero); | |||
skipping to change at line 3777 | skipping to change at line 3810 | |||
{ | { | |||
long long int res; | long long int res; | |||
res = __umul64hi(a, b); | res = __umul64hi(a, b); | |||
if (a < 0LL) res = res - b; | if (a < 0LL) res = res - b; | |||
if (b < 0LL) res = res - a; | if (b < 0LL) res = res - a; | |||
return res; | return res; | |||
} | } | |||
__device_func__(float __saturatef(float a)) | __device_func__(float __saturatef(float a)) | |||
{ | { | |||
if (__cuda___isnanf(a)) return 0.0f; // update of PTX spec 10/15/2008 | if (__cuda___isnanf(a)) return 0.0f; /* update of PTX spec 10/15/2008 */ | |||
return a >= 1.0f ? 1.0f : a <= 0.0f ? 0.0f : a; | return a >= 1.0f ? 1.0f : a <= 0.0f ? 0.0f : a; | |||
} | } | |||
__device_func__(unsigned int __sad(int a, int b, unsigned int c)) | __device_func__(unsigned int __sad(int a, int b, unsigned int c)) | |||
{ | { | |||
long long int diff = (long long int)a - (long long int)b; | long long int diff = (long long int)a - (long long int)b; | |||
return (unsigned int)(__cuda_llabs(diff) + (long long int)c); | return (unsigned int)(__cuda_llabs(diff) + (long long int)c); | |||
} | } | |||
skipping to change at line 4106 | skipping to change at line 4139 | |||
{ | { | |||
volatile union __cudart_FloatUintCvt res; | volatile union __cudart_FloatUintCvt res; | |||
int shift; | int shift; | |||
unsigned int t; | unsigned int t; | |||
res.i = a; | res.i = a; | |||
if (a == 0) return res.f; | if (a == 0) return res.f; | |||
shift = __internal_normalize((unsigned int*)&res.i); | shift = __internal_normalize((unsigned int*)&res.i); | |||
t = res.i << 24; | t = res.i << 24; | |||
res.i = (res.i >> 8); | res.i = (res.i >> 8); | |||
res.i += (127 + 30 - shift) << 23; | res.i += (127 + 30 - shift) << 23; | |||
if ((rndMode == cudaRoundNearest) && (t >= 0x80000000)) { | if (rndMode == cudaRoundNearest) { | |||
res.i += (t == 0x80000000) ? (res.i & 1) : (t >> 31); | res.i += (t == 0x80000000) ? (res.i & 1) : (t >> 31); | |||
} | } | |||
else if ((rndMode == cudaRoundPosInf) && t) { | else if ((rndMode == cudaRoundPosInf) && t) { | |||
res.i++; | res.i++; | |||
} | } | |||
return res.f; | return res.f; | |||
} | } | |||
__device_func__(float __uint2float_rz(unsigned int a)) | __device_func__(float __uint2float_rz(unsigned int a)) | |||
{ | { | |||
skipping to change at line 4144 | skipping to change at line 4177 | |||
#else /* __MULTI_CORE__ */ | #else /* __MULTI_CORE__ */ | |||
return __internal_uint2float_kernel(a, cudaRoundNearest); | return __internal_uint2float_kernel(a, cudaRoundNearest); | |||
#endif /* __MULTI_CORE__ */ | #endif /* __MULTI_CORE__ */ | |||
} | } | |||
__device_func__(float __ll2float_rn(long long int a)) | __device_func__(float __ll2float_rn(long long int a)) | |||
{ | { | |||
return (float)a; | return (float)a; | |||
} | } | |||
__device_func__(float __ull2float_rn(unsigned long long int a)) | __device_func__(float __internal_ull2float_kernel(unsigned long long int a, enum cudaRoundMode rndMode)) | |||
{ | { | |||
#if defined(__MULTI_CORE__) | ||||
return (float)a; | ||||
#else /* __MULTI_CORE__ */ | ||||
unsigned long long int temp; | unsigned long long int temp; | |||
unsigned int res, t; | unsigned int res, t; | |||
int shift; | int shift; | |||
if (a == 0ULL) return 0.0f; | if (a == 0ULL) return 0.0f; | |||
temp = a; | temp = a; | |||
shift = __internal_normalize64(&temp); | shift = __internal_normalize64(&temp); | |||
temp = (temp >> 8) | ((temp & 0xffULL) ? 1ULL : 0ULL); | temp = (temp >> 8) | ((temp & 0xffULL) ? 1ULL : 0ULL); | |||
res = (unsigned int)(temp >> 32); | res = (unsigned int)(temp >> 32); | |||
t = (unsigned int)temp; | t = (unsigned int)temp; | |||
res += (127 + 62 - shift) << 23; /* add in exponent */ | res += (127 + 62 - shift) << 23; /* add in exponent */ | |||
res += t == 0x80000000 ? res & 1 : t >> 31; | if (rndMode == cudaRoundNearest) { | |||
res += (t == 0x80000000) ? (res & 1) : (t >> 31); | ||||
} else if (rndMode == cudaRoundPosInf) { | ||||
res += (t != 0); | ||||
} | ||||
return __int_as_float(res); | return __int_as_float(res); | |||
} | ||||
__device_func__(float __ull2float_rn(unsigned long long int a)) | ||||
{ | ||||
#if defined(__MULTI_CORE__) | ||||
return (float)a; | ||||
#else /* __MULTI_CORE__ */ | ||||
return __internal_ull2float_kernel(a, cudaRoundNearest); | ||||
#endif /* __MULTI_CORE__ */ | #endif /* __MULTI_CORE__ */ | |||
} | } | |||
__device_func__(float __internal_fmul_kernel(float a, float b, int rndNeare st)) | __device_func__(unsigned short __float2half_rn(float f)) | |||
{ | { | |||
unsigned long long product; | unsigned int x = __float_as_int (f); | |||
volatile union __cudart_FloatUintCvt xx, yy; | unsigned int u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1; | |||
unsigned expo_x, expo_y; | unsigned int sign, exponent, mantissa; | |||
xx.f = a; | /* Get rid of +NaN/-NaN case first. */ | |||
yy.f = b; | if (u > 0x7f800000) { | |||
return 0x7fff; | ||||
} | ||||
expo_y = 0xFF; | sign = ((x >> 16) & 0x8000); | |||
expo_x = expo_y & (xx.i >> 23); | ||||
expo_x = expo_x - 1; | ||||
expo_y = expo_y & (yy.i >> 23); | ||||
expo_y = expo_y - 1; | ||||
if ((expo_x <= 0xFD) && | /* Get rid of +Inf/-Inf, +0/-0. */ | |||
(expo_y <= 0xFD)) { | if (u > 0x477fefff) { | |||
multiply: | return sign | 0x7c00; | |||
expo_x = expo_x + expo_y; | } | |||
expo_y = xx.i ^ yy.i; | if (u < 0x33000001) { | |||
xx.i = xx.i & 0x00ffffff; | return sign | 0x0000; | |||
yy.i = yy.i << 8; | ||||
xx.i = xx.i | 0x00800000; | ||||
yy.i = yy.i | 0x80000000; | ||||
/* compute product */ | ||||
product = ((unsigned long long)xx.i) * yy.i; | ||||
expo_x = expo_x - 127 + 2; | ||||
expo_y = expo_y & 0x80000000; | ||||
xx.i = (unsigned int)(product >> 32); | ||||
yy.i = (unsigned int)(product & 0xffffffff); | ||||
/* normalize mantissa */ | ||||
if (xx.i < 0x00800000) { | ||||
xx.i = (xx.i << 1) | (yy.i >> 31); | ||||
yy.i = (yy.i << 1); | ||||
expo_x--; | ||||
} | ||||
if (expo_x <= 0xFD) { | ||||
xx.i = xx.i | expo_y; /* OR in sign bit */ | ||||
xx.i = xx.i + (expo_x << 23); /* add in exponent */ | ||||
/* round result to nearest or even */ | ||||
if (yy.i < 0x80000000) return xx.f; | ||||
xx.i += (((yy.i == 0x80000000) ? (xx.i & 1) : (yy.i >> 31)) | ||||
&& rndNearest); | ||||
return xx.f; | ||||
} else if ((int)expo_x >= 254) { | ||||
/* overflow: return infinity */ | ||||
xx.i = (expo_y | 0x7F800000) - (!rndNearest); | ||||
return xx.f; | ||||
} else { | ||||
/* zero, denormal, or smallest normal */ | ||||
expo_x = ((unsigned int)-((int)expo_x)); | ||||
if (expo_x > 25) { | ||||
/* massive underflow: return 0 */ | ||||
xx.i = expo_y; | ||||
return xx.f; | ||||
} else { | ||||
yy.i = (xx.i << (32 - expo_x)) | ((yy.i) ? 1 : 0); | ||||
xx.i = expo_y + (xx.i >> expo_x); | ||||
xx.i += (((yy.i == 0x80000000) ? (xx.i & 1) : (yy.i >> 31)) | ||||
&& rndNearest); | ||||
return xx.f; | ||||
} | ||||
} | ||||
} else { | ||||
product = xx.i ^ yy.i; | ||||
product = product & 0x80000000; | ||||
if (!(xx.i & 0x7fffffff)) { | ||||
if (expo_y != 254) { | ||||
xx.i = (unsigned int)product; | ||||
return xx.f; | ||||
} | ||||
expo_y = yy.i << 1; | ||||
if (expo_y == 0xFF000000) { | ||||
xx.i = expo_y | 0x00C00000; | ||||
} else { | ||||
xx.i = yy.i | 0x00400000; | ||||
} | ||||
return xx.f; | ||||
} | ||||
if (!(yy.i & 0x7fffffff)) { | ||||
if (expo_x != 254) { | ||||
xx.i = (unsigned int)product; | ||||
return xx.f; | ||||
} | ||||
expo_x = xx.i << 1; | ||||
if (expo_x == 0xFF000000) { | ||||
xx.i = expo_x | 0x00C00000; | ||||
} else { | ||||
xx.i = xx.i | 0x00400000; | ||||
} | ||||
return xx.f; | ||||
} | ||||
if ((expo_y != 254) && (expo_x != 254)) { | ||||
expo_y++; | ||||
expo_x++; | ||||
if (expo_x == 0) { | ||||
expo_y |= xx.i & 0x80000000; | ||||
/* | ||||
* If both operands are denormals, we only need to normalize | ||||
* one of them as the result will be either a denormal or zero. | ||||
*/ | ||||
xx.i = xx.i << 8; | ||||
while (!(xx.i & 0x80000000)) { | ||||
xx.i <<= 1; | ||||
expo_x--; | ||||
} | ||||
xx.i = (xx.i >> 8) | (expo_y & 0x80000000); | ||||
expo_y &= ~0x80000000; | ||||
expo_y--; | ||||
goto multiply; | ||||
} | ||||
if (expo_y == 0) { | ||||
expo_x |= yy.i & 0x80000000; | ||||
yy.i = yy.i << 8; | ||||
while (!(yy.i & 0x80000000)) { | ||||
yy.i <<= 1; | ||||
expo_y--; | ||||
} | ||||
yy.i = (yy.i >> 8) | (expo_x & 0x80000000); | ||||
expo_x &= ~0x80000000; | ||||
expo_x--; | ||||
goto multiply; | ||||
} | ||||
} | ||||
expo_x = xx.i << 1; | ||||
expo_y = yy.i << 1; | ||||
/* if x is NaN, return x */ | ||||
if (expo_x > 0xFF000000) { | ||||
/* cvt any SNaNs to QNaNs */ | ||||
xx.i = xx.i | 0x00400000; | ||||
return xx.f; | ||||
} | ||||
/* if y is NaN, return y */ | ||||
if (expo_y > 0xFF000000) { | ||||
/* cvt any SNaNs to QNaNs */ | ||||
xx.i = yy.i | 0x00400000; | ||||
return xx.f; | ||||
} | ||||
xx.i = (unsigned int)product | 0x7f800000; | ||||
return xx.f; | ||||
} | } | |||
} | ||||
__device_func__(float __internal_fadd_kernel(float a, float b, int rndNeare | ||||
st)) | ||||
{ | ||||
volatile union __cudart_FloatUintCvt xx, yy; | ||||
unsigned int expo_x; | ||||
unsigned int expo_y; | ||||
unsigned int temp; | ||||
xx.f = a; | exponent = ((u >> 23) & 0xff); | |||
yy.f = b; | mantissa = (u & 0x7fffff); | |||
/* make bigger operand the augend */ | if (exponent > 0x70) { | |||
expo_y = yy.i << 1; | shift = 13; | |||
if (expo_y > (xx.i << 1)) { | exponent -= 0x70; | |||
expo_y = xx.i; | } else { | |||
xx.i = yy.i; | shift = 0x7e - exponent; | |||
yy.i = expo_y; | exponent = 0; | |||
mantissa |= 0x800000; | ||||
} | } | |||
lsb = (1 << shift); | ||||
lsb_s1 = (lsb >> 1); | ||||
lsb_m1 = (lsb - 1); | ||||
temp = 0xff; | /* Round to nearest even. */ | |||
expo_x = temp & (xx.i >> 23); | remainder = (mantissa & lsb_m1); | |||
expo_x = expo_x - 1; | mantissa >>= shift; | |||
expo_y = temp & (yy.i >> 23); | if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) { | |||
expo_y = expo_y - 1; | ++mantissa; | |||
if (!(mantissa & 0x3ff)) { | ||||
++exponent; | ||||
mantissa = 0; | ||||
} | ||||
} | ||||
if ((expo_x <= 0xFD) && | return sign | (exponent << 10) | mantissa; | |||
(expo_y <= 0xFD)) { | } | |||
add: | __device_func__(float __half2float(unsigned short h)) | |||
expo_y = expo_x - expo_y; | { | |||
if (expo_y > 25) { | unsigned int sign = ((h >> 15) & 1); | |||
expo_y = 31; | unsigned int exponent = ((h >> 10) & 0x1f); | |||
} | unsigned int mantissa = ((h & 0x3ff) << 13); | |||
temp = xx.i ^ yy.i; | ||||
xx.i = xx.i & ~0x7f000000; | ||||
xx.i = xx.i | 0x00800000; | ||||
yy.i = yy.i & ~0xff000000; | ||||
yy.i = yy.i | 0x00800000; | ||||
if ((int)temp < 0) { | if (exponent == 0x1f) { /* NaN or Inf */ | |||
/* signs differ, effective subtraction */ | mantissa = (mantissa | |||
temp = 32 - expo_y; | ? (sign = 0, 0x7fffff) | |||
temp = (expo_y) ? (yy.i << temp) : 0; | : 0); | |||
temp = (unsigned int)(-((int)temp)); | exponent = 0xff; | |||
xx.i = xx.i - (yy.i >> expo_y) - (temp ? 1 : 0); | } else if (!exponent) { /* Denorm or Zero */ | |||
if (xx.i & 0x00800000) { | if (mantissa) { | |||
if (expo_x <= 0xFD) { | unsigned int msb; | |||
xx.i = xx.i & ~0x00800000; /* lop off integer bit */ | exponent = 0x71; | |||
xx.i = (xx.i + (expo_x << 23)) + 0x00800000; | do { | |||
if (temp < 0x80000000) return xx.f; | msb = (mantissa & 0x400000); | |||
xx.i += (((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31)) | mantissa <<= 1; /* normalize */ | |||
&& rndNearest); | --exponent; | |||
return xx.f; | } while (!msb); | |||
} | mantissa &= 0x7fffff; /* 1.mantissa is implicit */ | |||
} else { | ||||
if ((temp | (xx.i << 1)) == 0) { | ||||
/* operands cancelled, resulting in a clean zero */ | ||||
xx.i = 0; | ||||
return xx.f; | ||||
} | ||||
/* normalize result */ | ||||
yy.i = xx.i & 0x80000000; | ||||
do { | ||||
xx.i = (xx.i << 1) | (temp >> 31); | ||||
temp <<= 1; | ||||
expo_x--; | ||||
} while (!(xx.i & 0x00800000)); | ||||
xx.i = xx.i | yy.i; | ||||
} | ||||
} else { | ||||
/* signs are the same, effective addition */ | ||||
temp = 32 - expo_y; | ||||
temp = (expo_y) ? (yy.i << temp) : 0; | ||||
xx.i = xx.i + (yy.i >> expo_y); | ||||
if (!(xx.i & 0x01000000)) { | ||||
if (expo_x <= 0xFD) { | ||||
expo_y = xx.i & 1; | ||||
xx.i = xx.i + (expo_x << 23); | ||||
if (temp < 0x80000000) return xx.f; | ||||
xx.i += (((temp == 0x80000000) ? expo_y : (temp >> 31)) | ||||
&& rndNearest); | ||||
return xx.f; | ||||
} | ||||
} else { | ||||
/* normalize result */ | ||||
temp = (xx.i << 31) | (temp >> 1); | ||||
/* not ANSI compliant: xx.i = (((int)xx.i)>>1) & ~0x40000000 */ | ||||
xx.i = ((xx.i & 0x80000000) | (xx.i >> 1)) & ~0x40000000; | ||||
expo_x++; | ||||
} | ||||
} | ||||
if (expo_x <= 0xFD) { | ||||
expo_y = xx.i & 1; | ||||
xx.i += (((temp == 0x80000000) ? expo_y : (temp >> 31)) | ||||
&& rndNearest); | ||||
xx.i = xx.i + (expo_x << 23); | ||||
return xx.f; | ||||
} | ||||
if ((int)expo_x >= 254) { | ||||
/* overflow: return infinity */ | ||||
xx.i = ((xx.i & 0x80000000) | 0x7f800000) - (!rndNearest); | ||||
return xx.f; | ||||
} | } | |||
/* underflow: denormal, or smallest normal */ | ||||
expo_y = expo_x + 32; | ||||
yy.i = xx.i & 0x80000000; | ||||
xx.i = xx.i & ~0xff000000; | ||||
expo_x = (unsigned int)(-((int)expo_x)); | ||||
temp = xx.i << expo_y | ((temp) ? 1 : 0); | ||||
xx.i = yy.i | (xx.i >> expo_x); | ||||
xx.i += (((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31)) | ||||
&& rndNearest); | ||||
return xx.f; | ||||
} else { | } else { | |||
/* handle special cases separately */ | exponent += 0x70; | |||
if (!(yy.i << 1)) { | ||||
if (xx.i == 0x80000000) { | ||||
xx.i = yy.i; | ||||
} | ||||
if ((xx.i << 1) > 0xff000000) { | ||||
xx.i |= 0x00400000; | ||||
} | ||||
return xx.f; | ||||
} | ||||
if ((expo_y != 254) && (expo_x != 254)) { | ||||
/* remove sign bits */ | ||||
if (expo_x == (unsigned int) -1) { | ||||
temp = xx.i & 0x80000000; | ||||
xx.i = xx.i << 8; | ||||
while (!(xx.i & 0x80000000)) { | ||||
xx.i <<= 1; | ||||
expo_x--; | ||||
} | ||||
expo_x++; | ||||
xx.i = (xx.i >> 8) | temp; | ||||
} | ||||
if (expo_y == (unsigned int) -1) { | ||||
temp = yy.i & 0x80000000; | ||||
yy.i = yy.i << 8; | ||||
while (!(yy.i & 0x80000000)) { | ||||
yy.i <<= 1; | ||||
expo_y--; | ||||
} | ||||
expo_y++; | ||||
yy.i = (yy.i >> 8) | temp; | ||||
} | ||||
goto add; | ||||
} | ||||
expo_x = xx.i << 1; | ||||
expo_y = yy.i << 1; | ||||
/* if x is NaN, return x */ | ||||
if (expo_x > 0xff000000) { | ||||
/* cvt any SNaNs to QNaNs */ | ||||
xx.i = xx.i | 0x00400000; | ||||
return xx.f; | ||||
} | ||||
/* if y is NaN, return y */ | ||||
if (expo_y > 0xff000000) { | ||||
/* cvt any SNaNs to QNaNs */ | ||||
xx.i = yy.i | 0x00400000; | ||||
return xx.f; | ||||
} | ||||
if ((expo_x == 0xff000000) && (expo_y == 0xff000000)) { | ||||
/* | ||||
* subtraction of infinities with the same sign, and addition of | ||||
* infinities of unlike sign is undefined: return NaN INDEFINITE | ||||
*/ | ||||
expo_x = xx.i ^ yy.i; | ||||
xx.i = xx.i | ((expo_x) ? 0xffc00000 : 0); | ||||
return xx.f; | ||||
} | ||||
/* handle infinities */ | ||||
if (expo_y == 0xff000000) { | ||||
xx.i = yy.i; | ||||
} | ||||
return xx.f; | ||||
} | } | |||
return __int_as_float ((sign << 31) | (exponent << 23) | mantissa); | ||||
} | } | |||
__device_func__(float __fadd_rz(float a, float b)) | __device_func__(float __fadd_rz(float a, float b)) | |||
{ | { | |||
return __internal_fadd_kernel(a, b, 0); | return __internal_fadd_kernel(a, b, cudaRoundZero); | |||
} | } | |||
__device_func__(float __fmul_rz(float a, float b)) | __device_func__(float __fmul_rz(float a, float b)) | |||
{ | { | |||
return __internal_fmul_kernel(a, b, 0); | return __internal_fmul_kernel(a, b, cudaRoundZero); | |||
} | } | |||
__device_func__(float __fadd_rn(float a, float b)) | __device_func__(float __fadd_rn(float a, float b)) | |||
{ | { | |||
return __internal_fadd_kernel(a, b, 1); | return __internal_fadd_kernel(a, b, cudaRoundNearest); | |||
} | } | |||
__device_func__(float __fmul_rn(float a, float b)) | __device_func__(float __fmul_rn(float a, float b)) | |||
{ | { | |||
return __internal_fmul_kernel(a, b, 1); | return __internal_fmul_kernel(a, b, cudaRoundNearest); | |||
} | } | |||
__device_func__(void __brkpt(int c)) | __device_func__(void __brkpt(int c)) | |||
{ | { | |||
/* TODO */ | /* TODO */ | |||
} | } | |||
#if defined(__MULTI_CORE__) | #if defined(__MULTI_CORE__) | |||
#define __syncthreads() \ | #define __syncthreads() \ | |||
skipping to change at line 4543 | skipping to change at line 4342 | |||
#endif /* __GNUC__ */ | #endif /* __GNUC__ */ | |||
#endif /* __MULTI_CORE__ */ | #endif /* __MULTI_CORE__ */ | |||
__device_func__(void __prof_trigger(int a)) | __device_func__(void __prof_trigger(int a)) | |||
{ | { | |||
} | } | |||
__device_func__(void __threadfence(void)) | __device_func__(void __threadfence(void)) | |||
{ | { | |||
__syncthreads(); | ||||
} | } | |||
__device_func__(void __threadfence_block(void)) | __device_func__(void __threadfence_block(void)) | |||
{ | { | |||
__syncthreads(); | ||||
} | } | |||
#if defined(__GNUC__) | #if defined(__GNUC__) | |||
__device_func__(void __trap(void)) | __device_func__(void __trap(void)) | |||
{ | { | |||
__builtin_trap(); | __builtin_trap(); | |||
} | } | |||
#elif defined(_WIN32) | #elif defined(_WIN32) | |||
skipping to change at line 4572 | skipping to change at line 4373 | |||
#endif /* __GNUC__ */ | #endif /* __GNUC__ */ | |||
#endif /* __CUDABE__ */ | #endif /* __CUDABE__ */ | |||
/************************************************************************** ***** | /************************************************************************** ***** | |||
* * | * * | |||
* DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS * | * DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS * | |||
* * | * * | |||
*************************************************************************** ****/ | *************************************************************************** ****/ | |||
#if !defined(__CUDABE__) | ||||
__device_func__(float __fdividef(float a, float b)) | __device_func__(float __fdividef(float a, float b)) | |||
{ | { | |||
#if defined(__MULTI_CORE__) | #if defined(__MULTI_CORE__) | |||
return a / b; | return a / b; | |||
#elif defined(__CUDABE__) | ||||
return a / b; | ||||
#else /* __MULTI_CORE__ */ | #else /* __MULTI_CORE__ */ | |||
volatile float aa = a; | ||||
volatile float bb = b; | ||||
/* match range restrictions of the device function */ | /* match range restrictions of the device function */ | |||
if (__cuda_fabsf(b) > CUDART_TWO_TO_126_F) { | if (__cuda_fabsf(bb) > CUDART_TWO_TO_126_F) { | |||
if (__cuda_fabsf(a) <= CUDART_NORM_HUGE_F) { | if (__cuda_fabsf(aa) <= CUDART_NORM_HUGE_F) { | |||
return ((a / b) / CUDART_NORM_HUGE_F) / CUDART_NORM_HUGE_F; | return ((aa / bb) / CUDART_NORM_HUGE_F) / CUDART_NORM_HUGE_F; | |||
} else { | } else { | |||
return __int_as_float(0xffc00000); | bb = 1.0f / bb; | |||
bb = bb / CUDART_NORM_HUGE_F; | ||||
return aa * bb; | ||||
} | } | |||
} else { | } else { | |||
return a / b; | return aa / bb; | |||
} | } | |||
#endif /* __MULTI_CORE__ */ | #endif /* __MULTI_CORE__ */ | |||
} | } | |||
#endif /* !defined(__CUDABE__) */ | ||||
__device_func__(float __sinf(float a)) | __device_func__(float __sinf(float a)) | |||
{ | { | |||
#if !defined(__CUDABE__) | ||||
if ((__float_as_int(a) << 1) == 0xff000000) { | ||||
return __fadd_rn (a, -a); /* return NaN */ | ||||
} | ||||
#endif /* !defined(__CUDABE__) */ | ||||
return sinf(a); | return sinf(a); | |||
} | } | |||
__device_func__(float __cosf(float a)) | __device_func__(float __cosf(float a)) | |||
{ | { | |||
#if !defined(__CUDABE__) | ||||
if ((__float_as_int(a) << 1) == 0xff000000) { | ||||
return __fadd_rn (a, -a); /* return NaN */ | ||||
} | ||||
#endif /* !defined(__CUDABE__) */ | ||||
return cosf(a); | return cosf(a); | |||
} | } | |||
__device_func__(float __log2f(float a)) | __device_func__(float __log2f(float a)) | |||
{ | { | |||
return log2f(a); | return log2f(a); | |||
} | } | |||
/************************************************************************** ***** | /************************************************************************** ***** | |||
* * | * * | |||
* SHARED HOST AND DEVICE IMPLEMENTATIONS * | * SHARED HOST AND DEVICE IMPLEMENTATIONS * | |||
* * | * * | |||
*************************************************************************** ****/ | *************************************************************************** ****/ | |||
__device_func__(float __internal_accurate_fdividef(float a, float b)) | __device_func__(float __internal_accurate_fdividef(float a, float b)) | |||
{ | { | |||
if (__cuda_fabsf(b) > CUDART_TWO_TO_126_F) { | return a / b; | |||
a *= .25f; | ||||
b *= .25f; | ||||
} | ||||
return __fdividef(a, b); | ||||
} | } | |||
__device_func__(float __tanf(float a)) | __device_func__(float __tanf(float a)) | |||
{ | { | |||
#if defined(__MULTI_CORE__) | #if defined(__MULTI_CORE__) | |||
return tanf(a); | return tanf(a); | |||
#else /* __MULTI_CORE__ */ | #else /* __MULTI_CORE__ */ | |||
return __fdividef (__sinf(a), __cosf(a)); | return __fdividef (__sinf(a), __cosf(a)); | |||
#endif /* __MULTI_CORE__ */ | #endif /* __MULTI_CORE__ */ | |||
} | } | |||
End of changes. 76 change blocks. | ||||
367 lines changed or deleted | 175 lines changed or added | |||
device_launch_parameters.h | device_launch_parameters.h | |||
---|---|---|---|---|
skipping to change at line 53 | skipping to change at line 53 | |||
#define __STORAGE__ \ | #define __STORAGE__ \ | |||
extern const | extern const | |||
#endif /* __STORAGE__ */ | #endif /* __STORAGE__ */ | |||
#if defined(__cplusplus) | #if defined(__cplusplus) | |||
extern "C" { | extern "C" { | |||
#endif /* __cplusplus */ | #endif /* __cplusplus */ | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct uint3 __STORAGE__ threadIdx; | uint3 __STORAGE__ threadIdx; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct uint3 __STORAGE__ blockIdx; | uint3 __STORAGE__ blockIdx; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct dim3 __STORAGE__ blockDim; | dim3 __STORAGE__ blockDim; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct dim3 __STORAGE__ gridDim; | dim3 __STORAGE__ gridDim; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
int __STORAGE__ warpSize; | int __STORAGE__ warpSize; | |||
#undef __STORAGE__ | #undef __STORAGE__ | |||
#if defined(__cplusplus) | #if defined(__cplusplus) | |||
} | } | |||
#endif /* __cplusplus */ | #endif /* __cplusplus */ | |||
#if !defined(__cudaGet_threadIdx) | #if !defined(__cudaGet_threadIdx) | |||
End of changes. 4 change blocks. | ||||
4 lines changed or deleted | 4 lines changed or added | |||
device_runtime.h | device_runtime.h | |||
---|---|---|---|---|
/* | /* | |||
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. | * Copyright 1993-2008 NVIDIA Corporation. All rights reserved. | |||
* | * | |||
* NOTICE TO USER: | * NOTICE TO USER: | |||
* | * | |||
* This source code is subject to NVIDIA ownership rights under U.S. and | * This source code is subject to NVIDIA ownership rights under U.S. and | |||
* international Copyright laws. Users and possessors of this source code | * international Copyright laws. Users and possessors of this source code | |||
* are hereby granted a nonexclusive, royalty-free license to use this code | * are hereby granted a nonexclusive, royalty-free license to use this code | |||
* in individual and commercial software. | * in individual and commercial software. | |||
* | * | |||
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | |||
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | |||
skipping to change at line 56 | skipping to change at line 56 | |||
s | s | |||
#define __unsized_shared_data(name, type_post) \ | #define __unsized_shared_data(name, type_post) \ | |||
__unsized##name __unsized##type_post | __unsized##name __unsized##type_post | |||
#define __sized_shared_data(name, type) \ | #define __sized_shared_data(name, type) \ | |||
__sized##name type | __sized##name type | |||
#define __sized__shared_var(name, s, type) \ | #define __sized__shared_var(name, s, type) \ | |||
name | name | |||
/*TEXTURE_TYPE*/ | /*TEXTURE_TYPE*/ | |||
typedef const void *__texture_type__; | typedef const void *__texture_type__; | |||
/*SURFACE_TYPE*/ | ||||
typedef const void *__surface_type__; | ||||
#if defined(__CUDABE__) /* cudabe compiler */ | #if defined(__CUDABE__) /* cudabe compiler */ | |||
#define __pad__(f) | #define __pad__(f) | |||
#define __text__ \ | #define __text__ \ | |||
__attribute__((__texture__)) | __attribute__((__texture__)) | |||
#define __surf__ \ | ||||
__attribute__((__surface__)) | ||||
#define ___device__(sc) \ | #define ___device__(sc) \ | |||
static | static | |||
#define __in__(cdecl, decl) \ | #define __in__(cdecl, decl) \ | |||
__shared__ cdecl | __shared__ cdecl | |||
#define __in_type__(cdecl, decl) \ | #define __in_type__(cdecl, decl) \ | |||
cdecl | cdecl | |||
#define __texture_var(name) \ | #define __texture_var(name) \ | |||
name | name | |||
#define __shared_var(name, s, type) \ | #define __shared_var(name, s, type) \ | |||
name | name | |||
skipping to change at line 86 | skipping to change at line 90 | |||
#define __copy_param(local_decl, param) \ | #define __copy_param(local_decl, param) \ | |||
local_decl = param | local_decl = param | |||
#define __unsized_array_size \ | #define __unsized_array_size \ | |||
[] | [] | |||
#define __unsized__shared_var(name, s, type) \ | #define __unsized__shared_var(name, s, type) \ | |||
name | name | |||
#define __unsized__empty_array(s) \ | #define __unsized__empty_array(s) \ | |||
s | s | |||
#define __var_used__ \ | #define __var_used__ \ | |||
__attribute__((__used__)) | __attribute__((__used__)) | |||
#define __storage_extern_unsized__shared__ \ | ||||
extern | ||||
#undef __cdecl | #undef __cdecl | |||
#define __cdecl | #define __cdecl | |||
#undef __w64 | #undef __w64 | |||
#define __w64 | #define __w64 | |||
#elif defined(__CUDACC__) /* cudafe compiler */ | #elif defined(__CUDACC__) /* cudafe compiler */ | |||
#define __loc_sc__(loc, sc) \ | #define __loc_sc__(loc, size, sc) \ | |||
sc loc | sc loc | |||
#define __pad__(f) | #define __pad__(f) | |||
#define __text__ | #define __text__ | |||
#define __surf__ | ||||
#define ___device__(sc) \ | #define ___device__(sc) \ | |||
sc __device__ | sc __device__ | |||
#define __in__(cdecl, decl) \ | #define __in__(cdecl, decl) \ | |||
decl | decl | |||
#define __in_type__(cdecl, decl) \ | #define __in_type__(cdecl, decl) \ | |||
decl | decl | |||
#define __texture_var(name) \ | #define __texture_var(name) \ | |||
name | name | |||
#define __shared_var(name, s, type) \ | #define __shared_var(name, s, type) \ | |||
name | name | |||
skipping to change at line 127 | skipping to change at line 134 | |||
#define __unsized__empty_array(s) \ | #define __unsized__empty_array(s) \ | |||
s | s | |||
#else /* host compiler (cl, gcc, open64, ...) */ | #else /* host compiler (cl, gcc, open64, ...) */ | |||
#if defined (__MULTI_CORE__) || defined(__multi_core__) | #if defined (__MULTI_CORE__) || defined(__multi_core__) | |||
struct uint3; | struct uint3; | |||
extern struct uint3* CUDARTAPI __cudaGetBlockIdxPtr(void); | extern struct uint3* CUDARTAPI __cudaGetBlockIdxPtr(void); | |||
extern void* CUDARTAPI __cudaGetSharedMem(void*); | extern void* CUDARTAPI __cudaGetSharedMem(void*); | |||
extern void* CUDARTAPI __cudaCmcHostMalloc(size_t); | ||||
extern size_t CUDARTAPI __cudaCmcGetStackSize(void); | ||||
#endif /* __MULTI_CORE__ || __multi_core__ */ | #endif /* __MULTI_CORE__ || __multi_core__ */ | |||
#if defined (__multi_core__) | #if defined (__multi_core__) | |||
#if defined(__GNUC__) | #if defined(__GNUC__) | |||
#if defined(__cplusplus) | #if defined(__cplusplus) | |||
extern void *alloca(size_t) throw(); | extern void *alloca(size_t) throw(); | |||
#else /* __cplusplus */ | #else /* __cplusplus */ | |||
extern void *alloca(size_t); | extern void *alloca(size_t); | |||
#endif /* __cplusplus */ | #endif /* __cplusplus */ | |||
#define __cuda_alloc(s) \ | #define __cuda_alloca(s) \ | |||
alloca(s) | alloca(s) | |||
#else /* __GNUC__ */ | #else /* __GNUC__ */ | |||
extern void *_alloca(size_t); | extern void *_alloca(size_t); | |||
#define __cuda_alloc(s) \ | #define __cuda_alloca(s) \ | |||
_alloca(s) | _alloca(s) | |||
#endif /* __GNUC__ */ | #endif /* __GNUC__ */ | |||
/* check if enough stack size remains for alloca to succeed. If so, | ||||
use faster alloca() to do the allocation. Otherwise, allocate memory | ||||
using the __cudaCmcHostMalloc() runtime function, which uses the slower | ||||
malloc path to allocate memory | ||||
*/ | ||||
#define __cudaCmcTargAlloc(num_bytes, max_stacksize, ptr_counter_stacksize, | ||||
ptr_ret_sym) \ | ||||
do { \ | ||||
if (*(ptr_counter_stacksize) + (num_bytes) >= (max_stacksize)) { \ | ||||
*(ptr_ret_sym) = __cudaCmcHostMalloc((num_bytes)); \ | ||||
} else { \ | ||||
*(ptr_ret_sym) = __cuda_alloca((num_bytes)); \ | ||||
*(ptr_counter_stacksize) = *(ptr_counter_stacksize) + (num_bytes); \ | ||||
} \ | ||||
} while(0) | ||||
#endif /* __multi_core__ */ | #endif /* __multi_core__ */ | |||
#if defined (__MULTI_CORE__) | #if defined (__MULTI_CORE__) | |||
#define ___device__(sc) \ | #define ___device__(sc) \ | |||
static | static | |||
#define __pad__(f) \ | #define __pad__(f) \ | |||
f | f | |||
#define __text__ | #define __text__ | |||
#define __surf__ | ||||
#define __cudaGet_blockIdx() \ | #define __cudaGet_blockIdx() \ | |||
(*__cudaGetBlockIdxPtr()) | (*__cudaGetBlockIdxPtr()) | |||
#define __shared_var(name, s, type) \ | #define __shared_var(name, s, type) \ | |||
(s type __cudaGetSharedMem((void*)(&(name)))) | (s type __cudaGetSharedMem((void*)(&(name)))) | |||
#define __var_used__ \ | #define __var_used__ \ | |||
__attribute__((__used__)) | __attribute__((__used__)) | |||
#define __storage_auto__shared__ \ | #define __storage_auto__shared__ \ | |||
auto | auto | |||
#undef __cdecl | #undef __cdecl | |||
skipping to change at line 201 | skipping to change at line 226 | |||
#elif defined(__GNUC__) | #elif defined(__GNUC__) | |||
#define __STORAGE__ \ | #define __STORAGE__ \ | |||
__attribute__((__common__)) | __attribute__((__common__)) | |||
#elif defined(__cplusplus) | #elif defined(__cplusplus) | |||
#define __STORAGE__ \ | #define __STORAGE__ \ | |||
__declspec(selectany) | __declspec(selectany) | |||
#pragma warning(disable: 4099 4190) | ||||
#else /* __APPLE__ || __ICC */ | #else /* __APPLE__ || __ICC */ | |||
#define __STORAGE__ | #define __STORAGE__ | |||
#endif /* __APPLE__ || __ICC */ | #endif /* __APPLE__ || __ICC */ | |||
#endif /* __MULTI_CORE__ */ | #endif /* __MULTI_CORE__ */ | |||
#define __in__(cdecl, decl) \ | #define __in__(cdecl, decl) \ | |||
decl | decl | |||
skipping to change at line 230 | skipping to change at line 253 | |||
name | name | |||
#define __copy_param(local_decl, param) | #define __copy_param(local_decl, param) | |||
#define __unsized_array_size | #define __unsized_array_size | |||
#define __unsized__shared_var(name, s, type) \ | #define __unsized__shared_var(name, s, type) \ | |||
(*name) | (*name) | |||
#define __unsized__empty_array(s) | #define __unsized__empty_array(s) | |||
/* this is compiled with a host compiler for device emulation */ | /* this is compiled with a host compiler for device emulation */ | |||
#define __device_emulation | #define __device_emulation | |||
#if defined(__cplusplus) | ||||
#undef __VECTOR_TYPES_H__ | ||||
#if defined(_WIN32) | ||||
#pragma warning(disable: 4190 4522) | ||||
#endif /* _WIN32 */ | ||||
#endif /* __cplusplus */ | ||||
#endif /* __CUDABE__ */ | #endif /* __CUDABE__ */ | |||
#if defined(__cplusplus) | ||||
static void *__cuda_memcpy(void*, const void*, size_t); | ||||
/* for C++ compilation of lowered C++ (i.e. C) code */ | ||||
#define __cuda_assign_operators(tag) | ||||
\ | ||||
tag& operator=( tag& a) { __cuda_memcpy(this, &a, | ||||
sizeof(tag)); return *this;} \ | ||||
tag& operator=(volatile tag& a) volatile { return *(tag*)this = (ta | ||||
g&)a; } \ | ||||
tag& operator=( const tag& a) { return *(tag*)this = (ta | ||||
g&)a; } \ | ||||
#endif /* __cplusplus */ | ||||
#include "builtin_types.h" | #include "builtin_types.h" | |||
#include "device_launch_parameters.h" | #include "device_launch_parameters.h" | |||
#include "storage_class.h" | #include "storage_class.h" | |||
End of changes. 14 change blocks. | ||||
6 lines changed or deleted | 58 lines changed or added | |||
driver_types.h | driver_types.h | |||
---|---|---|---|---|
skipping to change at line 143 | skipping to change at line 143 | |||
/** | /** | |||
* Channel format kind | * Channel format kind | |||
*/ | */ | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
enum cudaChannelFormatKind | enum cudaChannelFormatKind | |||
{ | { | |||
cudaChannelFormatKindSigned = 0, ///< Signed channel for mat | cudaChannelFormatKindSigned = 0, ///< Signed channel for mat | |||
cudaChannelFormatKindUnsigned = 1, ///< Unsigned channel f ormat | cudaChannelFormatKindUnsigned = 1, ///< Unsigned channel f ormat | |||
cudaChannelFormatKindFloat = 2, ///< Float channel form at | cudaChannelFormatKindFloat = 2, ///< Float channel form at | |||
cudaChannelFormatKindNone = 3, ///< No channel format | cudaChannelFormatKindNone = 3 ///< No channel format | |||
}; | }; | |||
/** | /** | |||
* CUDA Channel format descriptor | * CUDA Channel format descriptor | |||
*/ | */ | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct cudaChannelFormatDesc | struct cudaChannelFormatDesc | |||
{ | { | |||
int x; ///< x | int x; ///< x | |||
int y; ///< y | int y; ///< y | |||
skipping to change at line 185 | skipping to change at line 185 | |||
}; | }; | |||
/** | /** | |||
* CUDA Pitched memory pointer | * CUDA Pitched memory pointer | |||
*/ | */ | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct cudaPitchedPtr | struct cudaPitchedPtr | |||
{ | { | |||
void *ptr; ///< Pointer to allocated memory | void *ptr; ///< Pointer to allocated memory | |||
size_t pitch; ///< Pitch of allocated memory in bytes | size_t pitch; ///< Pitch of allocated memory in bytes | |||
size_t xsize; ///< Logical width of allocation in bytes | size_t xsize; ///< Logical width of allocation in elements | |||
size_t ysize; ///< Logical height of allocation in bytes | size_t ysize; ///< Logical height of allocation in elements | |||
}; | }; | |||
/** | /** | |||
* CUDA extent | * CUDA extent | |||
*/ | */ | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct cudaExtent | struct cudaExtent | |||
{ | { | |||
size_t width; ///< Width in bytes | size_t width; ///< Width in bytes | |||
size_t height; ///< Height in bytes | size_t height; ///< Height in bytes | |||
End of changes. 2 change blocks. | ||||
3 lines changed or deleted | 3 lines changed or added | |||
func_macro.h | func_macro.h | |||
---|---|---|---|---|
/* | /* | |||
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. | * Copyright 1993-2008 NVIDIA Corporation. All rights reserved. | |||
* | * | |||
* NOTICE TO USER: | * NOTICE TO USER: | |||
* | * | |||
* This source code is subject to NVIDIA ownership rights under U.S. and | * This source code is subject to NVIDIA ownership rights under U.S. and | |||
* international Copyright laws. Users and possessors of this source code | * international Copyright laws. Users and possessors of this source code | |||
* are hereby granted a nonexclusive, royalty-free license to use this code | * are hereby granted a nonexclusive, royalty-free license to use this code | |||
* in individual and commercial software. | * in individual and commercial software. | |||
* | * | |||
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | |||
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | |||
skipping to change at line 52 | skipping to change at line 52 | |||
___device__(static) decl | ___device__(static) decl | |||
#define __device_func__(decl) \ | #define __device_func__(decl) \ | |||
___device__(static) decl | ___device__(static) decl | |||
#else /* __CUDABE__ */ | #else /* __CUDABE__ */ | |||
#if !defined(__CUDA_INTERNAL_COMPILATION__) | #if !defined(__CUDA_INTERNAL_COMPILATION__) | |||
#error -- incorrect inclusion of a cudart header file | #error -- incorrect inclusion of a cudart header file | |||
#endif /* __CUDA_INTERNAL_COMPILATION__ */ | #endif /* !__CUDA_INTERNAL_COMPILATION__ */ | |||
#if defined(__cplusplus) && defined(__device_emulation) && !defined(__multi | ||||
_core__) | ||||
#define __begin_host_func \ | ||||
}} | ||||
#define __end_host_func \ | ||||
namespace __cuda_emu { extern "C" { | ||||
#define __host_device_call(f) \ | ||||
__cuda_emu::f | ||||
#else /* __cplusplus && __device_emulation && !__multi_core__ */ | ||||
#define __begin_host_func | ||||
#define __end_host_func | ||||
#define __host_device_call(f) \ | ||||
f | ||||
#endif /* __cplusplus && __device_emulation !__multi_core__ */ | ||||
#if defined(__APPLE__) | #if defined(__APPLE__) | |||
#define __func__(decl) \ | #define __func__(decl) \ | |||
extern __attribute__((__weak_import__, __weak__)) decl; decl | extern __attribute__((__weak_import__, __weak__)) decl; decl | |||
#define __device_func__(decl) \ | ||||
static __attribute__((__unused__)) decl | ||||
#elif defined(__GNUC__) | #elif defined(__GNUC__) | |||
#define __func__(decl) \ | #define __func__(decl) \ | |||
extern __attribute__((__weak__)) decl; decl | extern __attribute__((__weak__)) decl; decl | |||
#define __device_func__(decl) \ | ||||
static __attribute__((__unused__)) decl | ||||
#elif defined(_WIN32) | #elif defined(_WIN32) | |||
#define __func__(decl) \ | #define __func__(decl) \ | |||
static decl | static decl | |||
#endif /* __APPLE__ */ | ||||
#define __device_func__(decl) \ | #define __device_func__(decl) \ | |||
static decl | static decl | |||
#endif /* __APPLE__ */ | ||||
#endif /* CUDABE */ | #endif /* CUDABE */ | |||
#endif /* __FUNC_MACRO_H__ */ | #endif /* __FUNC_MACRO_H__ */ | |||
End of changes. 6 change blocks. | ||||
5 lines changed or deleted | 27 lines changed or added | |||
host_config.h | host_config.h | |||
---|---|---|---|---|
skipping to change at line 119 | skipping to change at line 119 | |||
#if !defined(NOMINMAX) | #if !defined(NOMINMAX) | |||
#define NOMINMAX /* min and max are part of cuda runtime */ | #define NOMINMAX /* min and max are part of cuda runtime */ | |||
#endif /* !NOMINMAX */ | #endif /* !NOMINMAX */ | |||
#include <crtdefs.h> /* for _CRTIMP */ | #include <crtdefs.h> /* for _CRTIMP */ | |||
#define __THROW | #define __THROW | |||
#endif /* __GNUC__ */ | #endif /* __APPLE__ */ | |||
#endif /* __CUDACC__ */ | #endif /* __CUDACC__ */ | |||
#endif /* !__HOST_CONFIG_H__ */ | #endif /* !__HOST_CONFIG_H__ */ | |||
End of changes. 1 change blocks. | ||||
1 lines changed or deleted | 1 lines changed or added | |||
host_defines.h | host_defines.h | |||
---|---|---|---|---|
skipping to change at line 101 | skipping to change at line 101 | |||
__loc__(__declspec(a)) | __loc__(__declspec(a)) | |||
#define CUDARTAPI \ | #define CUDARTAPI \ | |||
__stdcall | __stdcall | |||
#endif /* !__GNUC__ && !_WIN32 */ | #endif /* !__GNUC__ && !_WIN32 */ | |||
#if defined(__CUDACC__) || defined(__CUDABE__) || defined (__MULTI_CORE__) | #if defined(__CUDACC__) || defined(__CUDABE__) || defined (__MULTI_CORE__) | |||
#define __loc__(a) \ | #define __loc__(a) \ | |||
a | a | |||
#define __builtin_align__(a) \ | ||||
__align__(a) | ||||
#else /* __CUDACC__ || __CUDABE__ || __MULTI_CORE__ */ | #else /* __CUDACC__ || __CUDABE__ || __MULTI_CORE__ */ | |||
#define __loc__(a) | #define __loc__(a) | |||
#define __builtin_align__(a) | ||||
#endif /* __CUDACC__ || __CUDABE__ || __MULTI_CORE__ */ | #endif /* __CUDACC__ || __CUDABE__ || __MULTI_CORE__ */ | |||
#if defined(__CUDACC__) || defined(__CUDABE__) || defined (__MULTI_CORE__) | ||||
|| \ | ||||
defined(__GNUC__) || defined(_WIN64) | ||||
#define __builtin_align__(a) \ | ||||
__align__(a) | ||||
#else /* __CUDACC__ || __CUDABE__ || __MULTI_CORE__ || __GNUC__ || _WIN64 * | ||||
/ | ||||
#define __builtin_align__(a) | ||||
#endif /* __CUDACC__ || __CUDABE__ || __MULTI_CORE__ || __GNUC__ || _WIN64 | ||||
*/ | ||||
#define __device__ \ | #define __device__ \ | |||
__location__(__device__) | __location__(__device__) | |||
#define __host__ \ | #define __host__ \ | |||
__location__(__host__) | __location__(__host__) | |||
#define __global__ \ | #define __global__ \ | |||
__location__(__global__) | __location__(__global__) | |||
#define __shared__ \ | #define __shared__ \ | |||
__location__(__shared__) | __location__(__shared__) | |||
#define __constant__ \ | #define __constant__ \ | |||
__location__(__constant__) | __location__(__constant__) | |||
#define __launch_bounds__(t) \ | #define __launch_bounds__(...) \ | |||
__location__(__launch_bounds__(t)) | __location__(__launch_bounds__(__VA_ARGS__)) | |||
#endif /* !__HOST_DEFINES_H__ */ | #endif /* !__HOST_DEFINES_H__ */ | |||
End of changes. 4 change blocks. | ||||
5 lines changed or deleted | 17 lines changed or added | |||
host_runtime.h | host_runtime.h | |||
---|---|---|---|---|
/* | /* | |||
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. | * Copyright 1993-2008 NVIDIA Corporation. All rights reserved. | |||
* | * | |||
* NOTICE TO USER: | * NOTICE TO USER: | |||
* | * | |||
* This source code is subject to NVIDIA ownership rights under U.S. and | * This source code is subject to NVIDIA ownership rights under U.S. and | |||
* international Copyright laws. Users and possessors of this source code | * international Copyright laws. Users and possessors of this source code | |||
* are hereby granted a nonexclusive, royalty-free license to use this code | * are hereby granted a nonexclusive, royalty-free license to use this code | |||
* in individual and commercial software. | * in individual and commercial software. | |||
* | * | |||
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | |||
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | |||
skipping to change at line 43 | skipping to change at line 43 | |||
* the above Disclaimer and U.S. Government End Users Notice. | * the above Disclaimer and U.S. Government End Users Notice. | |||
*/ | */ | |||
#if !defined(__CUDA_INTERNAL_COMPILATION__) | #if !defined(__CUDA_INTERNAL_COMPILATION__) | |||
#define __CUDA_INTERNAL_COMPILATION__ | #define __CUDA_INTERNAL_COMPILATION__ | |||
#define __glob_pref_var(var) \ | #define __glob_pref_var(var) \ | |||
__global_##var | __global_##var | |||
#define __global_var(var) \ | #define __global_var(var) \ | |||
(*__glob_pref_var(var)) | (*__glob_pref_var(var)) | |||
#define __shadow_pref_var(var) \ | #define __shadow_var(c, cpp) \ | |||
__shadow_##var | __shadow_pref_var(c, cpp) | |||
#define __shadow_var(var) \ | ||||
__shadow_pref_var(var) | ||||
#define __pad__(f) \ | ||||
f | ||||
#define __text__ | #define __text__ | |||
#define __surf__ | ||||
#define __dv(v) | #define __dv(v) | |||
#if defined(_WIN32) && !defined(_WIN64) | ||||
#define __pad__(f) \ | ||||
f | ||||
#else /* _WIN32 && !_WIN64 */ | ||||
#define __pad__(f) | ||||
#endif /* _WIN32 && !_WIN64 */ | ||||
#if defined(__APPLE__) | #if defined(__APPLE__) | |||
#define __extern_weak__ \ | #define __extern_weak__ \ | |||
__weak_import__, | __weak_import__, | |||
#elif defined(__GNUC__) | #elif defined(__GNUC__) | |||
#define __extern_weak__ | #define __extern_weak__ | |||
#endif /* __APPLE__ */ | #endif /* __APPLE__ */ | |||
#if defined(__cplusplus) | #if defined(__cplusplus) | |||
#define __shadow_pref_var(c, cpp) \ | ||||
cpp##__cuda_shadow_variable__ | ||||
#define __device_stub_name(c, cpp) \ | #define __device_stub_name(c, cpp) \ | |||
cpp | cpp | |||
#define __text_var(c, cpp) \ | ||||
cpp | ||||
#define __cppref__ \ | #define __cppref__ \ | |||
& | & | |||
#else /* __cplusplus */ | #else /* __cplusplus */ | |||
#define __shadow_pref_var(c, cpp) \ | ||||
c##__cuda_shadow_variable__ | ||||
#define __device_stub_name(c, cpp) \ | #define __device_stub_name(c, cpp) \ | |||
c | c | |||
#define __text_var(c, cpp) \ | ||||
c | ||||
#define __cppref__ | #define __cppref__ | |||
typedef char bool; | typedef char bool; | |||
#endif /* __cplusplus */ | #endif /* __cplusplus */ | |||
#if !defined(__GNUC__) || __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ | ||||
< 3) | ||||
#define __specialization_static \ | ||||
static | ||||
#else /* !__GNUC__ || __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3) | ||||
*/ | ||||
#define __specialization_static | ||||
#endif /* !__GNUC__ || __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 | ||||
) */ | ||||
#include "cuda_runtime_api.h" | #include "cuda_runtime_api.h" | |||
#include "storage_class.h" | #include "storage_class.h" | |||
#else /* !__CUDA_INTERNAL_COMPILATION__ */ | #else /* !__CUDA_INTERNAL_COMPILATION__ */ | |||
#include "host_defines.h" | #include "host_defines.h" | |||
#define __cudaRegisterBinary() \ | #define __cudaRegisterBinary() \ | |||
__cudaFatCubinHandle = __cudaRegisterFatBinary((void*)__cudaFatCubi n); \ | __cudaFatCubinHandle = __cudaRegisterFatBinary((void*)__cudaFatCubi n); \ | |||
atexit(__cudaUnregisterBinaryUtil) | atexit(__cudaUnregisterBinaryUtil) | |||
#define __cudaRegisterVariable(var, ext, size, constant, global) \ | #define __cudaRegisterVariable(var, ext, size, constant, global) \ | |||
__cudaRegisterVar(__cudaFatCubinHandle, (char*)&__host##var, (char* )__device##var, __name##var, ext, size, constant, global) | __cudaRegisterVar(__cudaFatCubinHandle, (char*)&__host##var, (char* )__device##var, __name##var, ext, size, constant, global) | |||
#define __cudaRegisterGlobalTexture(tex, dim, norm, ext) \ | #define __cudaRegisterGlobalTexture(tex, dim, norm, ext) \ | |||
__cudaRegisterTexture(__cudaFatCubinHandle, (const struct textureRe | __cudaRegisterTexture(__cudaFatCubinHandle, (const struct textureRe | |||
ference*)&tex, __tex_var(tex), #tex, dim, norm, ext) | ference*)&tex, __tex_var(tex), __name##tex, dim, norm, ext) | |||
#define __cudaRegisterGlobalSurface(surf, dim, ext) \ | ||||
__cudaRegisterSurface(__cudaFatCubinHandle, (const struct surfaceRe | ||||
ference*)&surf, __tex_var(surf), __name##surf, dim, ext) | ||||
#define __cudaRegisterUnsizedShared(var) \ | #define __cudaRegisterUnsizedShared(var) \ | |||
__cudaRegisterShared(__cudaFatCubinHandle, (void**)__device_var(var )) | __cudaRegisterShared(__cudaFatCubinHandle, (void**)__device_var(var )) | |||
#define __cudaRegisterSharedVariable(var, size, align, sc) \ | #define __cudaRegisterSharedVariable(var, size, align, sc) \ | |||
__cudaRegisterSharedVar(__cudaFatCubinHandle, (void**)__device_var( var), size, align, sc) | __cudaRegisterSharedVar(__cudaFatCubinHandle, (void**)__device_var( var), size, align, sc) | |||
#define __cudaRegisterEntry(funptr, fun, thread_limit) \ | #define __cudaRegisterEntry(funptr, fun, thread_limit) \ | |||
__cudaRegisterFunction(__cudaFatCubinHandle, (const char*)funptr, ( char*)__device_fun(fun), #fun, thread_limit, __ids) | __cudaRegisterFunction(__cudaFatCubinHandle, (const char*)funptr, ( char*)__device_fun(fun), #fun, thread_limit, __ids) | |||
#define __cudaInitArgBlock(arg) \ | #define __cudaInitArgBlock(arg) \ | |||
char __[256]; \ | *(void**)(void*)&arg = (void*)0 | |||
*(char**)&arg = __ | ||||
#define __cudaSetupArg(arg, offset) \ | #define __cudaSetupArg(arg, offset) \ | |||
if (cudaSetupArgument((void*)(char*)&arg, sizeof(arg), (size_t)&off set->arg - (size_t)offset) != cudaSuccess) \ | if (cudaSetupArgument((void*)(char*)&arg, sizeof(arg), (size_t)&off set->arg) != cudaSuccess) \ | |||
return | return | |||
#define __cudaLaunch(fun) \ | #define __cudaLaunch(fun) \ | |||
{ volatile static char *__f; __f = fun; (void)cudaLaunch(fun); } | { volatile static char *__f; __f = fun; (void)cudaLaunch(fun); } | |||
#if defined(__cplusplus) | #if defined(__cplusplus) | |||
extern "C" { | extern "C" { | |||
#endif /* __cplusplus */ | #endif /* __cplusplus */ | |||
extern void** CUDARTAPI __cudaRegisterFatBinary( | extern void** CUDARTAPI __cudaRegisterFatBinary( | |||
void *fatCubin | void *fatCubin | |||
skipping to change at line 143 | skipping to change at line 171 | |||
extern void CUDARTAPI __cudaRegisterTexture( | extern void CUDARTAPI __cudaRegisterTexture( | |||
void **fatCubinHandle, | void **fatCubinHandle, | |||
const struct textureReference *hostVar, | const struct textureReference *hostVar, | |||
const void **deviceAddress, | const void **deviceAddress, | |||
const char *deviceName, | const char *deviceName, | |||
int dim, | int dim, | |||
int norm, | int norm, | |||
int ext | int ext | |||
); | ); | |||
extern void CUDARTAPI __cudaRegisterSurface( | ||||
void **fatCubinHandle, | ||||
const struct surfaceReference *hostVar, | ||||
const void **deviceAddress, | ||||
const char *deviceName, | ||||
int dim, | ||||
int ext | ||||
); | ||||
extern void CUDARTAPI __cudaRegisterShared( | extern void CUDARTAPI __cudaRegisterShared( | |||
void **fatCubinHandle, | void **fatCubinHandle, | |||
void **devicePtr | void **devicePtr | |||
); | ); | |||
extern void CUDARTAPI __cudaRegisterSharedVar( | extern void CUDARTAPI __cudaRegisterSharedVar( | |||
void **fatCubinHandle, | void **fatCubinHandle, | |||
void **devicePtr, | void **devicePtr, | |||
size_t size, | size_t size, | |||
size_t alignment, | size_t alignment, | |||
skipping to change at line 192 | skipping to change at line 229 | |||
static void **__cudaFatCubinHandle; | static void **__cudaFatCubinHandle; | |||
static void __cudaUnregisterBinaryUtil(void) | static void __cudaUnregisterBinaryUtil(void) | |||
{ | { | |||
__cudaUnregisterFatBinary(__cudaFatCubinHandle); | __cudaUnregisterFatBinary(__cudaFatCubinHandle); | |||
} | } | |||
#if defined(__device_emulation) | #if defined(__device_emulation) | |||
#if defined(__cplusplus) && !defined(__multi_core__) | ||||
#define __cuda_emu__ \ | ||||
__cuda_emu:: | ||||
#else /* __cplusplus */ | ||||
#define __cuda_emu__ | ||||
#endif /* __cplusplus */ | ||||
#define __device_fun(fun) \ | #define __device_fun(fun) \ | |||
__device_wrapper_##fun | __cuda_emu__ __device_wrapper_##fun | |||
#define __device_var(var) \ | #define __device_var(var) \ | |||
(char*)&var | (char*)&__cuda_emu__ var | |||
#define __tex_var(var) \ | #define __tex_var(var) \ | |||
&__texture_var(var) | &__cuda_emu__ __texture_var(var) | |||
#define __cudaFatCubin \ | #define __cudaFatCubin \ | |||
0 | 0 | |||
#if defined(__multi_core__) | #if defined(__multi_core__) | |||
#define __ids \ | #define __ids \ | |||
(uint3*)0, (uint3*)0, &blockDim, &gridDim, &warpSize | (uint3*)0, (uint3*)0, &blockDim, &gridDim, &warpSize | |||
#else /* __multi_core__ */ | #else /* __multi_core__ */ | |||
#define __ids \ | #define __ids \ | |||
&threadIdx, &blockIdx, &blockDim, &gridDim, &warpSize | (uint3*)&__cuda_emu__ threadIdx, (uint3*)&__cuda_emu__ blockIdx, (d im3*)&__cuda_emu__ blockDim, (dim3*)&__cuda_emu__ gridDim, &__cuda_emu__ wa rpSize | |||
#endif /* __multi_core__ */ | #endif /* __multi_core__ */ | |||
#else /* __device_emulation */ | #else /* __device_emulation */ | |||
#define __device_fun(fun) \ | #define __device_fun(fun) \ | |||
#fun | #fun | |||
#define __device_var(var) \ | #define __device_var(var) \ | |||
#var | #var | |||
#define __tex_var(var) \ | #define __tex_var(var) \ | |||
skipping to change at line 237 | skipping to change at line 285 | |||
#endif /* __device_emulation */ | #endif /* __device_emulation */ | |||
/* UTILITY MACROS */ | /* UTILITY MACROS */ | |||
#define __device__global_var(var) \ | #define __device__global_var(var) \ | |||
__device_var(var) | __device_var(var) | |||
#define __name__global_var(var) \ | #define __name__global_var(var) \ | |||
#var | #var | |||
#define __host__global_var(var) \ | #define __host__global_var(var) \ | |||
__glob_pref_var(var) | __glob_pref_var(var) | |||
#define __device__shadow_var(var) \ | #define __device__shadow_var(c, cpp) \ | |||
__device_var(var) | __device_var(c) | |||
#define __name__shadow_var(var) \ | #define __name__shadow_var(c, cpp) \ | |||
#var | #c | |||
#define __host__shadow_var(var) \ | #define __name__text_var(c, cpp) \ | |||
__shadow_pref_var(var) | #c | |||
#define __host__shadow_var(c, cpp) \ | ||||
__shadow_pref_var(c, cpp) | ||||
#if defined(_WIN32) && defined(__cplusplus) | ||||
#pragma warning(disable: 4099) | ||||
#endif /* _WIN32 && __cplusplus */ | ||||
#endif /* !__CUDA_INTERNAL_COMPILATION__ */ | #endif /* !__CUDA_INTERNAL_COMPILATION__ */ | |||
End of changes. 19 change blocks. | ||||
22 lines changed or deleted | 82 lines changed or added | |||
math_constants.h | math_constants.h | |||
---|---|---|---|---|
skipping to change at line 74 | skipping to change at line 74 | |||
#define CUDART_NORM_HUGE_F 3.402823466e38f | #define CUDART_NORM_HUGE_F 3.402823466e38f | |||
#define CUDART_TWO_TO_23_F 8388608.0f | #define CUDART_TWO_TO_23_F 8388608.0f | |||
#define CUDART_TWO_TO_24_F 16777216.0f | #define CUDART_TWO_TO_24_F 16777216.0f | |||
#define CUDART_TWO_TO_31_F 2147483648.0f | #define CUDART_TWO_TO_31_F 2147483648.0f | |||
#define CUDART_TWO_TO_32_F 4294967296.0f | #define CUDART_TWO_TO_32_F 4294967296.0f | |||
#define CUDART_REMQUO_BITS_F 3 | #define CUDART_REMQUO_BITS_F 3 | |||
#define CUDART_REMQUO_MASK_F (~((~0)<<CUDART_REMQUO_BITS_F)) | #define CUDART_REMQUO_MASK_F (~((~0)<<CUDART_REMQUO_BITS_F)) | |||
#define CUDART_TRIG_PLOSS_F 48039.0f | #define CUDART_TRIG_PLOSS_F 48039.0f | |||
/* double precision constants */ | /* double precision constants */ | |||
#if !defined(CUDA_NO_SM_13_DOUBLE_INTRINSICS) | #if __CUDA_ARCH__ >= 130 | |||
#define CUDART_INF __longlong_as_double(0x7ff0000000000000ULL) | #define CUDART_INF __longlong_as_double(0x7ff0000000000000ULL) | |||
#define CUDART_NAN __longlong_as_double(0xfff8000000000000ULL) | #define CUDART_NAN __longlong_as_double(0xfff8000000000000ULL) | |||
#define CUDART_NEG_ZERO __longlong_as_double(0x8000000000000000ULL) | #define CUDART_NEG_ZERO __longlong_as_double(0x8000000000000000ULL) | |||
#define CUDART_MIN_DENORM __longlong_as_double(0x0000000000000001ULL) | #define CUDART_MIN_DENORM __longlong_as_double(0x0000000000000001ULL) | |||
#else /* !CUDA_NO_SM_13_DOUBLE_INTRINSICS */ | #else /* __CUDA_ARCH__ >= 130 */ | |||
#define CUDART_INF __hiloint2double(0x7ff00000, 0x00000000) | #define CUDART_INF __hiloint2double(0x7ff00000, 0x00000000) | |||
#define CUDART_NAN __hiloint2double(0xfff80000, 0x00000000) | #define CUDART_NAN __hiloint2double(0xfff80000, 0x00000000) | |||
#define CUDART_NEG_ZERO __hiloint2double(0x80000000, 0x00000000) | #define CUDART_NEG_ZERO __hiloint2double(0x80000000, 0x00000000) | |||
#define CUDART_MIN_DENORM __hiloint2double(0x00000000, 0x00000001) | #define CUDART_MIN_DENORM __hiloint2double(0x00000000, 0x00000001) | |||
#endif /* !CUDA_NO_SM_13_DOUBLE_INTRINSICS */ | #endif /* __CUDA_ARCH__ >= 130 */ | |||
#define CUDART_ZERO 0.0 | #define CUDART_ZERO 0.0 | |||
#define CUDART_ONE 1.0 | #define CUDART_ONE 1.0 | |||
#define CUDART_SQRT_TWO 1.4142135623730951e+0 | #define CUDART_SQRT_TWO 1.4142135623730951e+0 | |||
#define CUDART_SQRT_HALF 7.0710678118654757e-1 | #define CUDART_SQRT_HALF 7.0710678118654757e-1 | |||
#define CUDART_THIRD 3.3333333333333333e-1 | #define CUDART_THIRD 3.3333333333333333e-1 | |||
#define CUDART_TWOTHIRD 6.6666666666666667e-1 | #define CUDART_TWOTHIRD 6.6666666666666667e-1 | |||
#define CUDART_PIO4 7.8539816339744828e-1 | #define CUDART_PIO4 7.8539816339744828e-1 | |||
#define CUDART_PIO4_HI 7.8539816339744828e-1 | #define CUDART_PIO4_HI 7.8539816339744828e-1 | |||
#define CUDART_PIO4_LO 3.0616169978683830e-17 | #define CUDART_PIO4_LO 3.0616169978683830e-17 | |||
End of changes. 3 change blocks. | ||||
3 lines changed or deleted | 3 lines changed or added | |||
math_functions_dbl_ptx3.h | math_functions_dbl_ptx3.h | |||
---|---|---|---|---|
skipping to change at line 497 | skipping to change at line 497 | |||
if (i & 2) { | if (i & 2) { | |||
z = -z; | z = -z; | |||
} | } | |||
return z; | return z; | |||
} | } | |||
__device_func__(void __cuda_sincos(double a, double *sptr, double *cptr)) | __device_func__(void __cuda_sincos(double a, double *sptr, double *cptr)) | |||
{ | { | |||
double t, u, s, c; | double t, u, s, c; | |||
int i; | int i; | |||
if (__cuda___isinf(a)) { | t = __cuda_fabs(a); | |||
*sptr = CUDART_NAN; | if ((t == CUDART_INF) || (t == CUDART_ZERO)) { | |||
*cptr = CUDART_NAN; | s = __dmul_rn (a, CUDART_ZERO); /* generate NaN, zero */ | |||
return; | c = 1.0 + s; /* generate NaN, one */ | |||
} | *sptr = s; | |||
if (a == CUDART_ZERO) { | *cptr = c; | |||
*sptr = a; | ||||
*cptr = 1.0; | ||||
return; | return; | |||
} | } | |||
t = __internal_trig_reduction_kerneld(a, &i); | t = __internal_trig_reduction_kerneld(a, &i); | |||
u = __internal_cos_kerneld(t); | u = __internal_cos_kerneld(t); | |||
t = __internal_sin_kerneld(t); | t = __internal_sin_kerneld(t); | |||
if (i & 1) { | if (i & 1) { | |||
s = u; | s = u; | |||
c = t; | c = t; | |||
} else { | } else { | |||
s = t; | s = t; | |||
skipping to change at line 561 | skipping to change at line 559 | |||
/* normalize denormals */ | /* normalize denormals */ | |||
if ((unsigned)ihi < (unsigned)0x00100000) { | if ((unsigned)ihi < (unsigned)0x00100000) { | |||
a = a * CUDART_TWO_TO_54; | a = a * CUDART_TWO_TO_54; | |||
e -= 54; | e -= 54; | |||
ihi = __double2hiint(a); | ihi = __double2hiint(a); | |||
ilo = __double2loint(a); | ilo = __double2loint(a); | |||
} | } | |||
/* a = m * 2^e. m <= sqrt(2): log2(a) = log2(m) + e. | /* a = m * 2^e. m <= sqrt(2): log2(a) = log2(m) + e. | |||
* m > sqrt(2): log2(a) = log2(m/2) + (e+1) | * m > sqrt(2): log2(a) = log2(m/2) + (e+1) | |||
*/ | */ | |||
e += ((ihi >> 20) & 0x7ff); | e += (ihi >> 20); | |||
ihi = (ihi & 0x800fffff) | 0x3ff00000; | ihi = (ihi & 0x800fffff) | 0x3ff00000; | |||
m = __hiloint2double (ihi, ilo); | m = __hiloint2double (ihi, ilo); | |||
if ((unsigned)ihi > (unsigned)0x3ff6a09e) { | if ((unsigned)ihi > (unsigned)0x3ff6a09e) { | |||
m = __internal_half(m); | m = __internal_half(m); | |||
e = e + 1; | e = e + 1; | |||
} | } | |||
/* log((1+m)/(1-m)) = 2*atanh(m). log(m) = 2*atanh ((m-1)/(m+1)) */ | /* log((1+m)/(1-m)) = 2*atanh(m). log(m) = 2*atanh ((m-1)/(m+1)) */ | |||
f = m - 1.0; | f = m - 1.0; | |||
g = m + 1.0; | g = m + 1.0; | |||
g = 1.0 / g; | g = 1.0 / g; | |||
skipping to change at line 1590 | skipping to change at line 1588 | |||
} | } | |||
return t; | return t; | |||
} | } | |||
__device_func__(double __cuda_erfc(double a)) | __device_func__(double __cuda_erfc(double a)) | |||
{ | { | |||
double p, q, h, l; | double p, q, h, l; | |||
int ahi; | int ahi; | |||
ahi = __double2hiint(a); | ahi = __double2hiint(a); | |||
if (ahi < (int)0x3fe80000) { | if (ahi < (int)0x3fe80000) { /* 0.75 */ | |||
return 1.0 - __cuda_erf(a); | return 1.0 - __cuda_erf(a); | |||
} | } | |||
if (a > 27.3) { | if (a > 27.3) { | |||
return 0.0; | return 0.0; | |||
} | } | |||
if (ahi < (int)0x40140000) { | if (ahi < (int)0x40140000) { /* 5.0 */ | |||
p = 5.64189549785304440E-001; | /* max error 7 ulps on [0.75, 5.0] */ | |||
p = __fma_rn (p, a, 8.17405083437083490E+000); | p = 5.6418956292134603E-001; | |||
p = __fma_rn (p, a, 5.68958722557864720E+001); | p = __fma_rn (p, a, 7.9573512229784757E+000); | |||
p = __fma_rn (p, a, 2.42568747802647010E+002); | p = __fma_rn (p, a, 5.4297984550299049E+001); | |||
p = __fma_rn (p, a, 6.80381374390412930E+002); | p = __fma_rn (p, a, 2.2775657465890461E+002); | |||
p = __fma_rn (p, a, 1.25873132236024590E+003); | p = __fma_rn (p, a, 6.2995529536738172E+002); | |||
p = __fma_rn (p, a, 1.43925353963809330E+003); | p = __fma_rn (p, a, 1.1508293767713837E+003); | |||
p = __fma_rn (p, a, 8.15949420587659230E+002); | p = __fma_rn (p, a, 1.3002167301542784E+003); | |||
q = a+ 1.44881247113239940E+001; | p = __fma_rn (p, a, 7.2716547570180592E+002); | |||
q = __fma_rn (q, a, 1.01345387970210510E+002); | q = a+ 1.4104035812651274E+001; | |||
q = __fma_rn (q, a, 4.37184684964009650E+002); | q = __fma_rn (q, a, 9.6740724349422138E+001); | |||
q = __fma_rn (q, a, 1.25588209225251330E+003); | q = __fma_rn (q, a, 4.1073916054746462E+002); | |||
q = __fma_rn (q, a, 2.43864015012647630E+003); | q = __fma_rn (q, a, 1.1641974580374074E+003); | |||
q = __fma_rn (q, a, 3.10570469008816280E+003); | q = __fma_rn (q, a, 2.2344896486798129E+003); | |||
q = __fma_rn (q, a, 2.35995386578953550E+003); | q = __fma_rn (q, a, 2.8166572432808462E+003); | |||
q = __fma_rn (q, a, 8.15949420692539320E+002); | q = __fma_rn (q, a, 2.1207350981593036E+003); | |||
q = __fma_rn (q, a, 7.2716547619708967E+002); | ||||
p = p / q; | ||||
h = a * a; | ||||
l = __fma_rn (a, a, -h); | ||||
q = __internal_exp_kernel(-h, 0); | ||||
q = __fma_rn (l, -q, q); | ||||
p = p * q; | ||||
} else { | } else { | |||
p = 5.64189583545675280E-001; | /* max error 4 ulps on [5, 27.3] */ | |||
p = __fma_rn (p, a, 2.04728556066513970E+000); | double ooa, ooasq; | |||
p = __fma_rn (p, a, 6.75128409662943610E+000); | ||||
p = __fma_rn (p, a, 1.10459345071747900E+001); | ooa = 1.0 / a; | |||
p = __fma_rn (p, a, 1.22570382896313600E+001); | ooasq = ooa * ooa; | |||
p = __fma_rn (p, a, 6.01884641114116460E+000); | p = -4.0025406686930527E+005; | |||
q = a+ 3.62871917534986780E+000; | p = __fma_rn (p, ooasq, 1.4420582543942123E+005); | |||
q = __fma_rn (q, a, 1.24663395327043550E+001); | p = __fma_rn (p, ooasq, -2.7664185780951841E+004); | |||
q = __fma_rn (q, a, 2.13927672803974790E+001); | p = __fma_rn (p, ooasq, 4.1144611644767283E+003); | |||
q = __fma_rn (q, a, 2.72082423532866070E+001); | p = __fma_rn (p, ooasq, -5.8706000519209351E+002); | |||
q = __fma_rn (q, a, 1.86422906830006700E+001); | p = __fma_rn (p, ooasq, 9.1490086446323375E+001); | |||
q = __fma_rn (q, a, 6.13809834548870550E+000); | p = __fma_rn (p, ooasq, -1.6659491387740221E+001); | |||
p = __fma_rn (p, ooasq, 3.7024804085481784E+000); | ||||
p = __fma_rn (p, ooasq, -1.0578553994424316E+000); | ||||
p = __fma_rn (p, ooasq, 4.2314218745087778E-001); | ||||
p = __fma_rn (p, ooasq, -2.8209479177354962E-001); | ||||
p = __fma_rn (p, ooasq, 5.6418958354775606E-001); | ||||
h = a * a; | ||||
l = __fma_rn (a, a, -h); | ||||
q = __internal_exp_kernel(-h, 0); | ||||
q = __fma_rn (l, -q, q); | ||||
p = p * ooa; | ||||
p = p * q; | ||||
} | } | |||
p = p / q; | ||||
h = a * a; | ||||
l = __fma_rn (a, a, -h); | ||||
q = __internal_exp_kernel(-h, 0); | ||||
q = __fma_rn (l, -q, q); | ||||
p = p * q; | ||||
return p; | return p; | |||
} | } | |||
/* approximate 1.0/(a*gamma(a)) on [-0.5,0.5] */ | /* approximate 1.0/(a*gamma(a)) on [-0.5,0.5] */ | |||
__device_func__(double __internal_tgamma_kernel(double a)) | __device_func__(double __internal_tgamma_kernel(double a)) | |||
{ | { | |||
double t; | double t; | |||
t = -4.42689340712524750E-010; | t = -4.42689340712524750E-010; | |||
t = __fma_rn (t, a, -2.02665918466589540E-007); | t = __fma_rn (t, a, -2.02665918466589540E-007); | |||
t = __fma_rn (t, a, 1.13812117211195270E-006); | t = __fma_rn (t, a, 1.13812117211195270E-006); | |||
End of changes. 6 change blocks. | ||||
45 lines changed or deleted | 55 lines changed or added | |||
sm_11_atomic_functions.h | sm_11_atomic_functions.h | |||
---|---|---|---|---|
skipping to change at line 41 | skipping to change at line 41 | |||
* Any use of this source code in individual and commercial software must | * Any use of this source code in individual and commercial software must | |||
* include, in the user documentation and internal comments to the code, | * include, in the user documentation and internal comments to the code, | |||
* the above Disclaimer and U.S. Government End Users Notice. | * the above Disclaimer and U.S. Government End Users Notice. | |||
*/ | */ | |||
#if !defined(__SM_11_ATOMIC_FUNCTIONS_H__) | #if !defined(__SM_11_ATOMIC_FUNCTIONS_H__) | |||
#define __SM_11_ATOMIC_FUNCTIONS_H__ | #define __SM_11_ATOMIC_FUNCTIONS_H__ | |||
#if defined(__cplusplus) && defined(__CUDACC__) | #if defined(__cplusplus) && defined(__CUDACC__) | |||
#if !defined(CUDA_NO_SM_11_ATOMIC_INTRINSICS) | #if __CUDA_ARCH__ >= 110 | |||
/************************************************************************** ***** | /************************************************************************** ***** | |||
* * | * * | |||
* * | * * | |||
* * | * * | |||
*************************************************************************** ****/ | *************************************************************************** ****/ | |||
#include "host_defines.h" | #include "host_defines.h" | |||
extern "C" | extern "C" | |||
skipping to change at line 206 | skipping to change at line 206 | |||
static __inline__ __device__ int atomicCAS(int *address, int compare, int v al) | static __inline__ __device__ int atomicCAS(int *address, int compare, int v al) | |||
{ | { | |||
return __iAtomicCAS(address, compare, val); | return __iAtomicCAS(address, compare, val); | |||
} | } | |||
static __inline__ __device__ unsigned int atomicCAS(unsigned int *address, unsigned int compare, unsigned int val) | static __inline__ __device__ unsigned int atomicCAS(unsigned int *address, unsigned int compare, unsigned int val) | |||
{ | { | |||
return __uAtomicCAS(address, compare, val); | return __uAtomicCAS(address, compare, val); | |||
} | } | |||
#endif /* !CUDA_NO_SM_12_ATOMIC_INTRINSICS */ | #endif /* __CUDA_ARCH__ >= 110 */ | |||
#elif !defined(__CUDACC__) | #elif !defined(__CUDACC__) | |||
#include "crt/func_macro.h" | #include "crt/func_macro.h" | |||
#if !defined(__CUDABE__) | #if !defined(__CUDABE__) | |||
#if defined(__MULTI_CORE__) | #if defined(__MULTI_CORE__) | |||
#define __iAtomicAdd(address, val) \ | #define __iAtomicAdd(address, val) \ | |||
End of changes. 2 change blocks. | ||||
2 lines changed or deleted | 2 lines changed or added | |||
sm_12_atomic_functions.h | sm_12_atomic_functions.h | |||
---|---|---|---|---|
skipping to change at line 41 | skipping to change at line 41 | |||
* Any use of this source code in individual and commercial software must | * Any use of this source code in individual and commercial software must | |||
* include, in the user documentation and internal comments to the code, | * include, in the user documentation and internal comments to the code, | |||
* the above Disclaimer and U.S. Government End Users Notice. | * the above Disclaimer and U.S. Government End Users Notice. | |||
*/ | */ | |||
#if !defined(__SM_12_ATOMIC_FUNCTIONS_H__) | #if !defined(__SM_12_ATOMIC_FUNCTIONS_H__) | |||
#define __SM_12_ATOMIC_FUNCTIONS_H__ | #define __SM_12_ATOMIC_FUNCTIONS_H__ | |||
#if defined(__cplusplus) && defined(__CUDACC__) | #if defined(__cplusplus) && defined(__CUDACC__) | |||
#if !defined(CUDA_NO_SM_12_ATOMIC_INTRINSICS) | #if __CUDA_ARCH__ >= 120 | |||
/************************************************************************** ***** | /************************************************************************** ***** | |||
* * | * * | |||
* * | * * | |||
* * | * * | |||
*************************************************************************** ****/ | *************************************************************************** ****/ | |||
#include "host_defines.h" | #include "host_defines.h" | |||
extern "C" | extern "C" | |||
skipping to change at line 99 | skipping to change at line 99 | |||
static __inline__ __device__ bool any(bool cond) | static __inline__ __device__ bool any(bool cond) | |||
{ | { | |||
return (bool)__any((int)cond); | return (bool)__any((int)cond); | |||
} | } | |||
static __inline__ __device__ bool all(bool cond) | static __inline__ __device__ bool all(bool cond) | |||
{ | { | |||
return (bool)__all((int)cond); | return (bool)__all((int)cond); | |||
} | } | |||
#endif /* !CUDA_NO_SM_12_ATOMIC_INTRINSICS */ | #endif /* __CUDA_ARCH__ >= 120 */ | |||
#elif !defined(__CUDACC__) | #elif !defined(__CUDACC__) | |||
#include "crt/func_macro.h" | #include "crt/func_macro.h" | |||
#if !defined(__CUDABE__) | #if !defined(__CUDABE__) | |||
#if defined(__MULTI_CORE__) | #if defined(__MULTI_CORE__) | |||
#define __ullAtomicAdd(address, val) \ | #define __ullAtomicAdd(address, val) \ | |||
End of changes. 2 change blocks. | ||||
2 lines changed or deleted | 2 lines changed or added | |||
sm_13_double_functions.h | sm_13_double_functions.h | |||
---|---|---|---|---|
skipping to change at line 47 | skipping to change at line 47 | |||
#define __SM_13_DOUBLE_FUNCTIONS_H__ | #define __SM_13_DOUBLE_FUNCTIONS_H__ | |||
/************************************************************************** ***** | /************************************************************************** ***** | |||
* * | * * | |||
* * | * * | |||
* * | * * | |||
*************************************************************************** ****/ | *************************************************************************** ****/ | |||
#if defined(__cplusplus) && defined(__CUDACC__) | #if defined(__cplusplus) && defined(__CUDACC__) | |||
#if !defined(CUDA_NO_SM_13_DOUBLE_INTRINSICS) | #if __CUDA_ARCH__ >= 130 | |||
/************************************************************************** ***** | /************************************************************************** ***** | |||
* * | * * | |||
* * | * * | |||
* * | * * | |||
*************************************************************************** ****/ | *************************************************************************** ****/ | |||
#include "device_types.h" | #include "device_types.h" | |||
#include "host_defines.h" | #include "host_defines.h" | |||
skipping to change at line 252 | skipping to change at line 252 | |||
static __inline__ __device__ double uint2double(unsigned int a, enum cudaRo undMode mode = cudaRoundNearest) | static __inline__ __device__ double uint2double(unsigned int a, enum cudaRo undMode mode = cudaRoundNearest) | |||
{ | { | |||
return (double)a; | return (double)a; | |||
} | } | |||
static __inline__ __device__ double float2double(float a, enum cudaRoundMod e mode = cudaRoundNearest) | static __inline__ __device__ double float2double(float a, enum cudaRoundMod e mode = cudaRoundNearest) | |||
{ | { | |||
return (double)a; | return (double)a; | |||
} | } | |||
#endif /* !CUDA_NO_SM_13_DOUBLE_INTRINSICS */ | #endif /* __CUDA_ARCH__ >= 130 */ | |||
#elif !defined(__CUDACC__) | #elif !defined(__CUDACC__) | |||
#include "crt/func_macro.h" | #include "crt/func_macro.h" | |||
#if !defined(__CUDABE__) | #if !defined(__CUDABE__) | |||
/************************************************************************** ***** | /************************************************************************** ***** | |||
* * | * * | |||
* HOST IMPLEMENTATIONS FOR FUNCTIONS * | * HOST IMPLEMENTATIONS FOR FUNCTIONS * | |||
skipping to change at line 292 | skipping to change at line 292 | |||
{ | { | |||
volatile union __cudart_DoubleUlonglongCvt xx; | volatile union __cudart_DoubleUlonglongCvt xx; | |||
volatile union __cudart_FloatUintCvt res; | volatile union __cudart_FloatUintCvt res; | |||
int shift; | int shift; | |||
xx.d = a; | xx.d = a; | |||
if (xx.i == 0) return 0.0f; | if (xx.i == 0) return 0.0f; | |||
res.i = (((unsigned int) (xx.i >> 32)) & 0x80000000); | res.i = (((unsigned int) (xx.i >> 32)) & 0x80000000); | |||
if ((xx.i & 0x7ff0000000000000ULL) == 0x7ff0000000000000ULL) { | if ((xx.i & 0x7ff0000000000000ULL) == 0x7ff0000000000000ULL) { | |||
if ((xx.i & 0x7fffffffffffffffULL) > 0x7ff0000000000000ULL) { | if ((xx.i & 0x7fffffffffffffffULL) > 0x7ff0000000000000ULL) { | |||
// Nan | /* Nan */ | |||
res.i = 0x7f8fffff; | res.i = ((unsigned int)((xx.i >> 32) & 0x80000000) | | |||
(255U << 23) | 0x00400000 | | ||||
(unsigned int)((xx.i >> (53 - 24)) & 0x007fffff)); | ||||
} else { | } else { | |||
// Inf | /* Inf */ | |||
res.i |= 0x7f800000; | res.i |= 0x7f800000; | |||
} | } | |||
return res.f; | return res.f; | |||
} | } | |||
shift = ((int) ((xx.i >> 52) & 0x7ff)) - 1023; | shift = ((int) ((xx.i >> 52) & 0x7ff)) - 1023; | |||
// Overflow | /* Overflow */ | |||
xx.i = (xx.i & 0x000fffffffffffffULL); | xx.i = (xx.i & 0x000fffffffffffffULL); | |||
if (shift >= 128) { | if (shift >= 128) { | |||
res.i |= 0x7f7fffff; | res.i |= 0x7f7fffff; | |||
return res.f; | return res.f; | |||
} | } | |||
if (shift <= -127) { | if (shift <= -127) { | |||
if (shift < -180) { | if (shift < -180) { | |||
// Underflow | /* Underflow */ | |||
xx.i = 0; | xx.i = 0; | |||
} else { | } else { | |||
xx.i |= 0x0010000000000000ULL; | xx.i |= 0x0010000000000000ULL; | |||
xx.i >>= 127 + shift; | xx.i >>= -126 - shift; | |||
} | } | |||
} else { | } else { | |||
res.i |= (unsigned int) (127 + shift) << 23; | res.i |= (unsigned int) (127 + shift) << 23; | |||
} | } | |||
res.i |= ((unsigned int) (xx.i >> 29)) & 0x007fffff; | res.i |= ((unsigned int) (xx.i >> 29)) & 0x007fffff; | |||
xx.i &= 0x1fffffff; | xx.i &= 0x1fffffff; | |||
return res.f; | return res.f; | |||
} | } | |||
__device_func__(double __internal_ll2double_kernel(long long int a, enum cu daRoundMode rndMode)) | __device_func__(double __internal_ll2double_kernel(long long int a, enum cu daRoundMode rndMode)) | |||
skipping to change at line 596 | skipping to change at line 598 | |||
return __internal_ull2double_kernel(a, cudaRoundMinInf); | return __internal_ull2double_kernel(a, cudaRoundMinInf); | |||
} | } | |||
__device_func__(double __ull2double_ru(unsigned long long int a)) | __device_func__(double __ull2double_ru(unsigned long long int a)) | |||
{ | { | |||
return __internal_ull2double_kernel(a, cudaRoundPosInf); | return __internal_ull2double_kernel(a, cudaRoundPosInf); | |||
} | } | |||
#endif /* !__CUDABE__ */ | #endif /* !__CUDABE__ */ | |||
#if !defined(__CUDABE__) || defined(CUDA_NO_SM_13_DOUBLE_INTRINSICS) | #if !defined(__CUDABE__) || __CUDA_ARCH__ < 130 | |||
#include "common_types.h" | #include "common_types.h" | |||
__device_func__(double __internal_fma_kernel(double x, double y, double z, enum cudaRoundMode rndMode)) | __device_func__(double __internal_fma_kernel(double x, double y, double z, enum cudaRoundMode rndMode)) | |||
{ | { | |||
#ifdef __MULTI_CORE__ | #ifdef __MULTI_CORE__ | |||
volatile | volatile | |||
#endif /* __MULTI_CORE__ */ | #endif /* __MULTI_CORE__ */ | |||
struct __cudart_UintUint xx, yy, zz, ww; | struct __cudart_UintUint xx, yy, zz, ww; | |||
unsigned int s, t, u, prod0, prod1, prod2, prod3, expo_x, expo_y, expo_z; | unsigned int s, t, u, prod0, prod1, prod2, prod3, expo_x, expo_y, expo_z; | |||
xx.hi = __double2hiint(x); | xx.hi = __double2hiint(x); | |||
xx.lo = __double2loint(x); | xx.lo = __double2loint(x); | |||
skipping to change at line 1113 | skipping to change at line 1117 | |||
__device_func__(double __dadd_rn(double a, double b)) | __device_func__(double __dadd_rn(double a, double b)) | |||
{ | { | |||
return __fma_rn(a, CUDART_ONE, b); | return __fma_rn(a, CUDART_ONE, b); | |||
} | } | |||
__device_func__(double __dmul_rn(double a, double b)) | __device_func__(double __dmul_rn(double a, double b)) | |||
{ | { | |||
return __fma_rn(a, b, CUDART_NEG_ZERO); | return __fma_rn(a, b, CUDART_NEG_ZERO); | |||
} | } | |||
#endif /* !__CUDABE__ || CUDA_NO_SM_13_DOUBLE_INTRINSICS */ | #endif /* !__CUDABE__ || __CUDA_ARCH__ < 130 */ | |||
/************************************************************************** ***** | /************************************************************************** ***** | |||
* * | * * | |||
* HOST / DEVICE IMPLEMENTATIONS FOR FUNCTIONS * | * HOST / DEVICE IMPLEMENTATIONS FOR FUNCTIONS * | |||
* * | * * | |||
*************************************************************************** ****/ | *************************************************************************** ****/ | |||
#endif /* __cplusplus && __CUDACC__ */ | #endif /* __cplusplus && __CUDACC__ */ | |||
#endif /* !__SM_13_DOUBLE_FUNCTIONS_H__ */ | #endif /* !__SM_13_DOUBLE_FUNCTIONS_H__ */ | |||
End of changes. 10 change blocks. | ||||
10 lines changed or deleted | 14 lines changed or added | |||
storage_class.h | storage_class.h | |||
---|---|---|---|---|
/* | /* | |||
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. | * Copyright 1993-2008 NVIDIA Corporation. All rights reserved. | |||
* | * | |||
* NOTICE TO USER: | * NOTICE TO USER: | |||
* | * | |||
* This source code is subject to NVIDIA ownership rights under U.S. and | * This source code is subject to NVIDIA ownership rights under U.S. and | |||
* international Copyright laws. Users and possessors of this source code | * international Copyright laws. Users and possessors of this source code | |||
* are hereby granted a nonexclusive, royalty-free license to use this code | * are hereby granted a nonexclusive, royalty-free license to use this code | |||
* in individual and commercial software. | * in individual and commercial software. | |||
* | * | |||
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | |||
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | |||
skipping to change at line 47 | skipping to change at line 47 | |||
#define __STORAGE_CLASS_H__ | #define __STORAGE_CLASS_H__ | |||
#if !defined(__var_used__) | #if !defined(__var_used__) | |||
#define __var_used__ | #define __var_used__ | |||
#endif /* __var_used__ */ | #endif /* __var_used__ */ | |||
#if !defined(__loc_sc__) | #if !defined(__loc_sc__) | |||
#define __loc_sc__(loc, sc) \ | #define __loc_sc__(loc, size, sc) \ | |||
__storage##_##sc##loc loc | __storage##_##sc##size##loc loc | |||
#endif /* !__loc_sc__ */ | #endif /* !__loc_sc__ */ | |||
#if !defined(__storage___device__) | #if !defined(__storage___device__) | |||
#define __storage___device__ static __var_used__ | #define __storage___device__ static __var_used__ | |||
#endif /* __storage___device__ */ | #endif /* __storage___device__ */ | |||
#if !defined(__storage_extern__device__) | #if !defined(__storage_extern__device__) | |||
#define __storage_extern__device__ static __var_used__ | #define __storage_extern__device__ static __var_used__ | |||
#endif /* __storage_extern__device__ */ | #endif /* __storage_extern__device__ */ | |||
skipping to change at line 100 | skipping to change at line 100 | |||
#endif /* __storage_extern__shared__ */ | #endif /* __storage_extern__shared__ */ | |||
#if !defined(__storage_auto__shared__) | #if !defined(__storage_auto__shared__) | |||
#define __storage_auto__shared__ static | #define __storage_auto__shared__ static | |||
#endif /* __storage_auto__shared__ */ | #endif /* __storage_auto__shared__ */ | |||
#if !defined(__storage_static__shared__) | #if !defined(__storage_static__shared__) | |||
#define __storage_static__shared__ static __var_used__ | #define __storage_static__shared__ static __var_used__ | |||
#endif /* __storage_static__shared__ */ | #endif /* __storage_static__shared__ */ | |||
#if !defined(__storage__unsized__shared__) | ||||
#define __storage__unsized__shared__ @@@ COMPILER @@@ ERROR @@@ | ||||
#endif /* __storage__unsized__shared__ */ | ||||
#if !defined(__storage_extern_unsized__shared__) | ||||
#define __storage_extern_unsized__shared__ static __var_used__ | ||||
#endif /* __storage_extern_unsized__shared__ */ | ||||
#if !defined(__storage_auto_unsized__shared__) | ||||
#define __storage_auto_unsized__shared__ @@@ COMPILER @@@ ERROR @@@ | ||||
#endif /* __storage_auto_unsized__shared__ */ | ||||
#if !defined(__storage_static_unsized__shared__) | ||||
#define __storage_static_unsized__shared__ @@@ COMPILER @@@ ERROR @@@ | ||||
#endif /* __storage_static_unsized__shared__ */ | ||||
#if !defined(__storage___text__) | #if !defined(__storage___text__) | |||
#define __storage___text__ static __var_used__ | #define __storage___text__ static __var_used__ | |||
#endif /* __storage___text__ */ | #endif /* __storage___text__ */ | |||
#if !defined(__storage_extern__text__) | #if !defined(__storage_extern__text__) | |||
#define __storage_extern__text__ static __var_used__ | #define __storage_extern__text__ static __var_used__ | |||
#endif /* __storage_extern__text__ */ | #endif /* __storage_extern__text__ */ | |||
#if !defined(__storage_auto__text__) | #if !defined(__storage_auto__text__) | |||
#define __storage_auto__text__ @@@ COMPILER @@@ ERROR @@@ | #define __storage_auto__text__ @@@ COMPILER @@@ ERROR @@@ | |||
End of changes. 3 change blocks. | ||||
3 lines changed or deleted | 19 lines changed or added | |||
vector_types.h | vector_types.h | |||
---|---|---|---|---|
skipping to change at line 53 | skipping to change at line 53 | |||
*************************************************************************** ****/ | *************************************************************************** ****/ | |||
#include "host_defines.h" | #include "host_defines.h" | |||
/************************************************************************** ***** | /************************************************************************** ***** | |||
* * | * * | |||
* * | * * | |||
* * | * * | |||
*************************************************************************** ****/ | *************************************************************************** ****/ | |||
#if !defined(__cuda_assign_operators) | ||||
#define __cuda_assign_operators(tag) | ||||
#endif /* !__cuda_assign_operators */ | ||||
#if !defined(__CUDACC__) && !defined(__CUDABE__) && \ | ||||
!defined (__MULTI_CORE__) && defined(_WIN32) && !defined(_WIN64) | ||||
#define __cuda_builtin_vector_align8(tag, ...) \ | ||||
struct tag { \ | ||||
union { \ | ||||
struct { __VA_ARGS__; }; \ | ||||
struct { long long int :1,:0; }; \ | ||||
}; \ | ||||
__cuda_assign_operators(tag) \ | ||||
} | ||||
#else /* !__CUDACC__ && !__CUDABE__ && !__MULTI_CORE__ && _WIN32 && !_WIN64 | ||||
*/ | ||||
#define __cuda_builtin_vector_align8(tag, ...) \ | ||||
struct __align__(8) tag { \ | ||||
__VA_ARGS__; \ | ||||
__cuda_assign_operators(tag) \ | ||||
} | ||||
#endif /* !__CUDACC__ && !__CUDABE__ && !__MULTI_CORE__ && _WIN32 && !_WIN6 | ||||
4 */ | ||||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct char1 | struct char1 | |||
{ | { | |||
signed char x; | signed char x; | |||
__cuda_assign_operators(char1) | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct uchar1 | struct uchar1 | |||
{ | { | |||
unsigned char x; | unsigned char x; | |||
__cuda_assign_operators(uchar1) | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct __builtin_align__(2) char2 | struct __align__(2) char2 | |||
{ | { | |||
signed char x, y; | signed char x, y; | |||
__cuda_assign_operators(char2) | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct __builtin_align__(2) uchar2 | struct __align__(2) uchar2 | |||
{ | { | |||
unsigned char x, y; | unsigned char x, y; | |||
__cuda_assign_operators(uchar2) | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct char3 | struct char3 | |||
{ | { | |||
signed char x, y, z; | signed char x, y, z; | |||
__cuda_assign_operators(char3) | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct uchar3 | struct uchar3 | |||
{ | { | |||
unsigned char x, y, z; | unsigned char x, y, z; | |||
__cuda_assign_operators(uchar3) | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct __builtin_align__(4) char4 | struct __align__(4) char4 | |||
{ | { | |||
signed char x, y, z, w; | signed char x, y, z, w; | |||
__cuda_assign_operators(char4) | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct __builtin_align__(4) uchar4 | struct __align__(4) uchar4 | |||
{ | { | |||
unsigned char x, y, z, w; | unsigned char x, y, z, w; | |||
__cuda_assign_operators(uchar4) | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct short1 | struct short1 | |||
{ | { | |||
short x; | short x; | |||
__cuda_assign_operators(short1) | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct ushort1 | struct ushort1 | |||
{ | { | |||
unsigned short x; | unsigned short x; | |||
__cuda_assign_operators(ushort1) | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct __builtin_align__(4) short2 | struct __align__(4) short2 | |||
{ | { | |||
short x, y; | short x, y; | |||
__cuda_assign_operators(short2) | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct __builtin_align__(4) ushort2 | struct __align__(4) ushort2 | |||
{ | { | |||
unsigned short x, y; | unsigned short x, y; | |||
__cuda_assign_operators(ushort2) | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct short3 | struct short3 | |||
{ | { | |||
short x, y, z; | short x, y, z; | |||
__cuda_assign_operators(short3) | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct ushort3 | struct ushort3 | |||
{ | { | |||
unsigned short x, y, z; | unsigned short x, y, z; | |||
__cuda_assign_operators(ushort3) | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct __builtin_align__(8) short4 | __cuda_builtin_vector_align8(short4, short x, y, z, w); | |||
{ | ||||
short x, y, z, w; | ||||
}; | ||||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct __builtin_align__(8) ushort4 | __cuda_builtin_vector_align8(ushort4, unsigned short x, y, z, w); | |||
{ | ||||
unsigned short x, y, z, w; | ||||
}; | ||||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct int1 | struct int1 | |||
{ | { | |||
int x; | int x; | |||
__cuda_assign_operators(int1) | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct uint1 | struct uint1 | |||
{ | { | |||
unsigned int x; | unsigned int x; | |||
__cuda_assign_operators(uint1) | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct __builtin_align__(8) int2 | __cuda_builtin_vector_align8(int2, int x, y); | |||
{ | ||||
int x, y; | ||||
}; | ||||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct __builtin_align__(8) uint2 | __cuda_builtin_vector_align8(uint2, unsigned int x, y); | |||
{ | ||||
unsigned int x, y; | ||||
}; | ||||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct int3 | struct int3 | |||
{ | { | |||
int x, y, z; | int x, y, z; | |||
__cuda_assign_operators(int3) | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct uint3 | struct uint3 | |||
{ | { | |||
unsigned int x, y, z; | unsigned int x, y, z; | |||
__cuda_assign_operators(uint3) | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct __builtin_align__(16) int4 | struct __builtin_align__(16) int4 | |||
{ | { | |||
int x, y, z, w; | int x, y, z, w; | |||
__cuda_assign_operators(int4) | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct __builtin_align__(16) uint4 | struct __builtin_align__(16) uint4 | |||
{ | { | |||
unsigned int x, y, z, w; | unsigned int x, y, z, w; | |||
__cuda_assign_operators(uint4) | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct long1 | struct long1 | |||
{ | { | |||
long int x; | long int x; | |||
__cuda_assign_operators(long1) | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct ulong1 | struct ulong1 | |||
{ | { | |||
unsigned long x; | unsigned long x; | |||
__cuda_assign_operators(ulong1) | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | ||||
struct | ||||
#if defined (_WIN32) | #if defined (_WIN32) | |||
__builtin_align__(8) | ||||
/*DEVICE_BUILTIN*/ | ||||
__cuda_builtin_vector_align8(long2, long int x, y); | ||||
/*DEVICE_BUILTIN*/ | ||||
__cuda_builtin_vector_align8(ulong2, unsigned long int x, y); | ||||
#else /* _WIN32 */ | #else /* _WIN32 */ | |||
__builtin_align__(2*sizeof(long int)) | ||||
#endif /* _WIN32 */ | /*DEVICE_BUILTIN*/ | |||
long2 | struct __align__(2*sizeof(long int)) long2 | |||
{ | { | |||
long int x, y; | long int x, y; | |||
__cuda_assign_operators(long2) | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct | struct __align__(2*sizeof(unsigned long int)) ulong2 | |||
#if defined (_WIN32) | ||||
__builtin_align__(8) | ||||
#else /* _WIN32 */ | ||||
__builtin_align__(2*sizeof(unsigned long int)) | ||||
#endif /* _WIN32 */ | ||||
ulong2 | ||||
{ | { | |||
unsigned long int x, y; | unsigned long int x, y; | |||
__cuda_assign_operators(ulong2) | ||||
}; | }; | |||
#endif /* _WIN32 */ | ||||
#if !defined(__LP64__) | #if !defined(__LP64__) | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct long3 | struct long3 | |||
{ | { | |||
long int x, y, z; | long int x, y, z; | |||
__cuda_assign_operators(long3) | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct ulong3 | struct ulong3 | |||
{ | { | |||
unsigned long int x, y, z; | unsigned long int x, y, z; | |||
__cuda_assign_operators(ulong3) | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct __builtin_align__(16) long4 | struct __builtin_align__(16) long4 | |||
{ | { | |||
long int x, y, z, w; | long int x, y, z, w; | |||
__cuda_assign_operators(long4) | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct __builtin_align__(16) ulong4 | struct __builtin_align__(16) ulong4 | |||
{ | { | |||
unsigned long int x, y, z, w; | unsigned long int x, y, z, w; | |||
__cuda_assign_operators(ulong4) | ||||
}; | }; | |||
#endif /* !__LP64__ */ | #endif /* !__LP64__ */ | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct float1 | struct float1 | |||
{ | { | |||
float x; | float x; | |||
__cuda_assign_operators(float1) | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct __builtin_align__(8) float2 | __cuda_builtin_vector_align8(float2, float x, y); | |||
{ | ||||
float x, y; | ||||
}; | ||||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct float3 | struct float3 | |||
{ | { | |||
float x, y, z; | float x, y, z; | |||
__cuda_assign_operators(float3) | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct __builtin_align__(16) float4 | struct __builtin_align__(16) float4 | |||
{ | { | |||
float x, y, z, w; | float x, y, z, w; | |||
__cuda_assign_operators(float4) | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct longlong1 | struct longlong1 | |||
{ | { | |||
long long int x; | long long int x; | |||
__cuda_assign_operators(longlong1) | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct ulonglong1 | struct ulonglong1 | |||
{ | { | |||
unsigned long long int x; | unsigned long long int x; | |||
__cuda_assign_operators(ulonglong1) | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct __builtin_align__(16) longlong2 | struct __builtin_align__(16) longlong2 | |||
{ | { | |||
long long int x, y; | long long int x, y; | |||
__cuda_assign_operators(longlong2) | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct __builtin_align__(16) ulonglong2 | struct __builtin_align__(16) ulonglong2 | |||
{ | { | |||
unsigned long long int x, y; | unsigned long long int x, y; | |||
__cuda_assign_operators(ulonglong2) | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct double1 | struct double1 | |||
{ | { | |||
double x; | double x; | |||
__cuda_assign_operators(double1) | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct __builtin_align__(16) double2 | struct __builtin_align__(16) double2 | |||
{ | { | |||
double x, y; | double x, y; | |||
__cuda_assign_operators(double2) | ||||
}; | }; | |||
/************************************************************************** ***** | /************************************************************************** ***** | |||
* * | * * | |||
* * | * * | |||
* * | * * | |||
*************************************************************************** ****/ | *************************************************************************** ****/ | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
typedef struct char1 char1; | typedef struct char1 char1; | |||
skipping to change at line 419 | skipping to change at line 469 | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
typedef struct double2 double2; | typedef struct double2 double2; | |||
/************************************************************************** ***** | /************************************************************************** ***** | |||
* * | * * | |||
* * | * * | |||
* * | * * | |||
*************************************************************************** ****/ | *************************************************************************** ****/ | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
typedef struct dim3 dim3; | ||||
/*DEVICE_BUILTIN*/ | ||||
struct dim3 | struct dim3 | |||
{ | { | |||
unsigned int x, y, z; | unsigned int x, y, z; | |||
#if defined(__cplusplus) | #if defined(__cplusplus) | |||
dim3(unsigned int x = 1, unsigned int y = 1, unsigned int z = 1) : x(x) | __host__ __device__ dim3(unsigned int x = 1, unsigned int y = 1, unsign | |||
, y(y), z(z) {} | ed int z = 1) : x(x), y(y), z(z) {} | |||
dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {} | __host__ __device__ dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {} | |||
operator uint3(void) { uint3 t; t.x = x; t.y = y; t.z = z; return t; } | __host__ __device__ operator uint3(void) { uint3 t; t.x = x; t.y = y; t | |||
.z = z; return t; } | ||||
#endif /* __cplusplus */ | #endif /* __cplusplus */ | |||
}; | }; | |||
/*DEVICE_BUILTIN*/ | ||||
typedef struct dim3 dim3; | ||||
#undef __cuda_assign_operators | ||||
#undef __cuda_builtin_vector_align8 | ||||
#endif /* !__VECTOR_TYPES_H__ */ | #endif /* !__VECTOR_TYPES_H__ */ | |||
End of changes. 57 change blocks. | ||||
46 lines changed or deleted | 102 lines changed or added | |||