__cudaFatFormat.h   __cudaFatFormat.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2008 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 121 skipping to change at line 121
char* gpuProfileName; char* gpuProfileName;
char* ptx; char* ptx;
} __cudaFatPtxEntry; } __cudaFatPtxEntry;
/* /*
* Debug entry type for __cudaFat binary. * Debug entry type for __cudaFat binary.
* Such information might, but need not be available * Such information might, but need not be available
* for Cubin entries (ptx files compiled in debug mode * for Cubin entries (ptx files compiled in debug mode
* will contain their own debugging information) * will contain their own debugging information)
*/ */
typedef struct { typedef struct __cudaFatDebugEntryRec {
char* gpuProfileName; char* gpuProfileName;
char* debug; char* debug;
struct __cudaFatDebugEntryRec *next;
unsigned int size;
} __cudaFatDebugEntry; } __cudaFatDebugEntry;
typedef enum { typedef enum {
__cudaFatDontSearchFlag = (1 << 0), __cudaFatDontSearchFlag = (1 << 0),
__cudaFatDontCacheFlag = (1 << 1), __cudaFatDontCacheFlag = (1 << 1),
__cudaFatSassDebugFlag = (1 << 2) __cudaFatSassDebugFlag = (1 << 2)
} __cudaFatCudaBinaryFlag; } __cudaFatCudaBinaryFlag;
/* /*
* Imported/exported symbol descriptor, needed for * Imported/exported symbol descriptor, needed for
 End of changes. 2 change blocks. 
4 lines changed or deleted 6 lines changed or added


 cuda.h   cuda.h 
skipping to change at line 57 skipping to change at line 57
/** /**
* \defgroup CUDA_TYPES Data types used by CUDA driver * \defgroup CUDA_TYPES Data types used by CUDA driver
* \ingroup CUDA_DRIVER * \ingroup CUDA_DRIVER
* @{ * @{
*/ */
/** /**
* CUDA API version number * CUDA API version number
*/ */
#define CUDA_VERSION 2020 /* 2.2 */ #define CUDA_VERSION 2030 /* 2.3 */
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
typedef unsigned int CUdeviceptr; ///< CUDA device pointer typedef unsigned int CUdeviceptr; ///< CUDA device pointer
typedef int CUdevice; ///< CUDA device typedef int CUdevice; ///< CUDA device
typedef struct CUctx_st *CUcontext; ///< CUDA context typedef struct CUctx_st *CUcontext; ///< CUDA context
typedef struct CUmod_st *CUmodule; ///< CUDA module typedef struct CUmod_st *CUmodule; ///< CUDA module
typedef struct CUfunc_st *CUfunction; ///< CUDA function typedef struct CUfunc_st *CUfunction; ///< CUDA function
skipping to change at line 89 skipping to change at line 89
/** /**
* Context creation flags * Context creation flags
*/ */
typedef enum CUctx_flags_enum { typedef enum CUctx_flags_enum {
CU_CTX_SCHED_AUTO = 0, ///< Automatic scheduling CU_CTX_SCHED_AUTO = 0, ///< Automatic scheduling
CU_CTX_SCHED_SPIN = 1, ///< Set spin as default scheduling CU_CTX_SCHED_SPIN = 1, ///< Set spin as default scheduling
CU_CTX_SCHED_YIELD = 2, ///< Set yield as default scheduling CU_CTX_SCHED_YIELD = 2, ///< Set yield as default scheduling
CU_CTX_SCHED_MASK = 0x3, CU_CTX_SCHED_MASK = 0x3,
CU_CTX_BLOCKING_SYNC = 4, ///< Use blocking synchronization CU_CTX_BLOCKING_SYNC = 4, ///< Use blocking synchronization
CU_CTX_MAP_HOST = 8, ///< Support mapped pinned allocations CU_CTX_MAP_HOST = 8, ///< Support mapped pinned allocations
CU_CTX_FLAGS_MASK = 0xf, CU_CTX_LMEM_RESIZE_TO_MAX = 16, ///< Keep local memory allocation after
launch
CU_CTX_FLAGS_MASK = 0x1f,
} CUctx_flags; } CUctx_flags;
/** /**
* Event creation flags * Event creation flags
*/ */
typedef enum CUevent_flags_enum { typedef enum CUevent_flags_enum {
CU_EVENT_DEFAULT = 0, ///< Default event flag CU_EVENT_DEFAULT = 0, ///< Default event flag
CU_EVENT_BLOCKING_SYNC = 1, ///< Event uses blocking synchronization CU_EVENT_BLOCKING_SYNC = 1, ///< Event uses blocking synchronization
} CUevent_flags; } CUevent_flags;
skipping to change at line 605 skipping to change at line 606
); );
CUresult CUDAAPI cuMemFree(CUdeviceptr dptr); CUresult CUDAAPI cuMemFree(CUdeviceptr dptr);
CUresult CUDAAPI cuMemGetAddressRange( CUdeviceptr *pbase, unsigned int *psize, CUdeviceptr dptr ); CUresult CUDAAPI cuMemGetAddressRange( CUdeviceptr *pbase, unsigned int *psize, CUdeviceptr dptr );
CUresult CUDAAPI cuMemAllocHost(void **pp, unsigned int bytesize); CUresult CUDAAPI cuMemAllocHost(void **pp, unsigned int bytesize);
CUresult CUDAAPI cuMemFreeHost(void *p); CUresult CUDAAPI cuMemFreeHost(void *p);
CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize, unsigned in t Flags ); CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize, unsigned in t Flags );
CUresult CUDAAPI cuMemHostGetDevicePointer( CUdeviceptr *pdptr, void *p , unsigned int Flags ); CUresult CUDAAPI cuMemHostGetDevicePointer( CUdeviceptr *pdptr, void *p , unsigned int Flags );
CUresult CUDAAPI cuMemHostGetFlags( unsigned int *pFlags, void *p );
/************************************ /************************************
** **
** Synchronous Memcpy ** Synchronous Memcpy
** **
** Intra-device memcpy's done with these functions may execute in para llel with the CPU, ** Intra-device memcpy's done with these functions may execute in para llel with the CPU,
** but if host memory is involved, they wait until the copy is done be fore returning. ** but if host memory is involved, they wait until the copy is done be fore returning.
** **
***********************************/ ***********************************/
 End of changes. 3 change blocks. 
2 lines changed or deleted 5 lines changed or added


 cudaGL.h   cudaGL.h 
skipping to change at line 43 skipping to change at line 43
* the above Disclaimer and U.S. Government End Users Notice. * the above Disclaimer and U.S. Government End Users Notice.
*/ */
#ifndef CUDAGL_H #ifndef CUDAGL_H
#define CUDAGL_H #define CUDAGL_H
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
//
// Flags to map or unmap a resource
//
typedef enum CUGLmap_flags_enum {
CU_GL_MAP_RESOURCE_FLAGS_NONE = 0x00,
CU_GL_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01,
CU_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02,
} CUGLmap_flags;
CUresult CUDAAPI cuGLInit(void); CUresult CUDAAPI cuGLInit(void);
CUresult CUDAAPI cuGLCtxCreate( CUcontext *pCtx, unsigned int Flags, CUdevi ce device ); CUresult CUDAAPI cuGLCtxCreate( CUcontext *pCtx, unsigned int Flags, CUdevi ce device );
CUresult CUDAAPI cuGLRegisterBufferObject( GLuint bufferobj ); CUresult CUDAAPI cuGLRegisterBufferObject( GLuint bufferobj );
CUresult CUDAAPI cuGLMapBufferObject( CUdeviceptr *dptr, unsigned int *size , GLuint bufferobj ); CUresult CUDAAPI cuGLMapBufferObject( CUdeviceptr *dptr, unsigned int *size , GLuint bufferobj );
CUresult CUDAAPI cuGLUnmapBufferObject( GLuint bufferobj ); CUresult CUDAAPI cuGLUnmapBufferObject( GLuint bufferobj );
CUresult CUDAAPI cuGLUnregisterBufferObject( GLuint bufferobj ); CUresult CUDAAPI cuGLUnregisterBufferObject( GLuint bufferobj );
CUresult CUDAAPI cuGLSetBufferObjectMapFlags( GLuint bufferobj, unsigned in
t Flags );
CUresult CUDAAPI cuGLMapBufferObjectAsync( CUdeviceptr *dptr, unsigned int
*size, GLuint bufferobj, CUstream hStream );
CUresult CUDAAPI cuGLUnmapBufferObjectAsync( GLuint bufferobj, CUstream hSt
ream );
#if defined(_WIN32) #if defined(_WIN32)
#if !defined(WGL_NV_gpu_affinity) #if !defined(WGL_NV_gpu_affinity)
typedef void* HGPUNV; typedef void* HGPUNV;
#endif #endif
CUresult CUDAAPI cuWGLGetDevice( CUdevice *pDevice, HGPUNV hGpu ); CUresult CUDAAPI cuWGLGetDevice( CUdevice *pDevice, HGPUNV hGpu );
#endif #endif
#ifdef __cplusplus #ifdef __cplusplus
}; };
#endif #endif
 End of changes. 2 change blocks. 
0 lines changed or deleted 16 lines changed or added


 cuda_gl_interop.h   cuda_gl_interop.h 
skipping to change at line 67 skipping to change at line 67
#if defined(__cplusplus) #if defined(__cplusplus)
extern "C" { extern "C" {
#endif /* __cplusplus */ #endif /* __cplusplus */
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
/**
* CUDA GL Map Flags
*/
enum cudaGLMapFlags
{
cudaGLMapFlagsNone = 0, ///< Default; Assume resource can be rea
d/written
cudaGLMapFlagsReadOnly = 1, ///< CUDA kernels will not write to this
resource
cudaGLMapFlagsWriteDiscard = 2, ///< CUDA kernels will only write to and
will not read from this resource
};
extern __host__ cudaError_t CUDARTAPI cudaGLSetGLDevice(int device); extern __host__ cudaError_t CUDARTAPI cudaGLSetGLDevice(int device);
extern __host__ cudaError_t CUDARTAPI cudaGLRegisterBufferObject(GLuint buf Obj); extern __host__ cudaError_t CUDARTAPI cudaGLRegisterBufferObject(GLuint buf Obj);
extern __host__ cudaError_t CUDARTAPI cudaGLMapBufferObject(void **devPtr, GLuint bufObj); extern __host__ cudaError_t CUDARTAPI cudaGLMapBufferObject(void **devPtr, GLuint bufObj);
extern __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObject(GLuint bufObj ); extern __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObject(GLuint bufObj );
extern __host__ cudaError_t CUDARTAPI cudaGLUnregisterBufferObject(GLuint b ufObj); extern __host__ cudaError_t CUDARTAPI cudaGLUnregisterBufferObject(GLuint b ufObj);
extern __host__ cudaError_t CUDARTAPI cudaGLSetBufferObjectMapFlags(GLuint
bufObj, unsigned int flags);
extern __host__ cudaError_t CUDARTAPI cudaGLMapBufferObjectAsync(void **dev
Ptr, GLuint bufObj, cudaStream_t stream);
extern __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObjectAsync(GLuint b
ufObj, cudaStream_t stream);
#ifdef _WIN32 #ifdef _WIN32
#ifndef WGL_NV_gpu_affinity #ifndef WGL_NV_gpu_affinity
typedef void* HGPUNV; typedef void* HGPUNV;
#endif #endif
extern __host__ cudaError_t CUDARTAPI cudaWGLGetDevice(int *device, HGPUNV hGpu); extern __host__ cudaError_t CUDARTAPI cudaWGLGetDevice(int *device, HGPUNV hGpu);
#endif #endif
#if defined(__cplusplus) #if defined(__cplusplus)
} }
#endif /* __cplusplus */ #endif /* __cplusplus */
 End of changes. 2 change blocks. 
0 lines changed or deleted 21 lines changed or added


 cuda_runtime.h   cuda_runtime.h 
skipping to change at line 94 skipping to change at line 94
* \ref ::cudaSetupArgument(T,size_t) "cudaSetupArgument()" must be precede d * \ref ::cudaSetupArgument(T,size_t) "cudaSetupArgument()" must be precede d
* by a call to ::cudaConfigureCall(). * by a call to ::cudaConfigureCall().
* *
* \param arg - Argument to push for a kernel launch * \param arg - Argument to push for a kernel launch
* \param offset - Offset in argument stack to push new arg * \param offset - Offset in argument stack to push new arg
* *
* \return * \return
* ::cudaSuccess * ::cudaSuccess
* \notefnerr * \notefnerr
* *
* \sa \ref ::cudaLaunch(T*) "cudaLaunch (C++ API)", * \sa ::cudaConfigureCall,
* \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGe
tAttributes (C++ API)",
* \ref ::cudaLaunch(T*) "cudaLaunch (C++ API)",
* ::cudaSetDoubleForDevice,
* ::cudaSetDoubleForHost,
* \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument (C API)" * \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument (C API)"
* ::cudaConfigureCall
*/ */
template<class T> template<class T>
__inline__ __host__ cudaError_t cudaSetupArgument( __inline__ __host__ cudaError_t cudaSetupArgument(
T arg, T arg,
size_t offset size_t offset
) )
{ {
return cudaSetupArgument((const void*)&arg, sizeof(T), offset); return cudaSetupArgument((const void*)&arg, sizeof(T), offset);
} }
template<class T>
__inline__ __host__ cudaError_t cudaHostAlloc(
T **ptr,
size_t size,
unsigned int flags
)
{
return cudaHostAlloc((void**)(void*)ptr, size, flags);
}
template<class T>
__inline__ __host__ cudaError_t cudaHostGetDevicePointer(
T **pDevice,
void *pHost,
unsigned int flags
)
{
return cudaHostGetDevicePointer((void**)(void*)pDevice, pHost, flags);
}
template<class T>
__inline__ __host__ cudaError_t cudaMalloc(
T **devPtr,
size_t size
)
{
return cudaMalloc((void**)(void*)devPtr, size);
}
template<class T>
__inline__ __host__ cudaError_t cudaMallocHost(
T **ptr,
size_t size
)
{
return cudaMallocHost((void**)(void*)ptr, size);
}
template<class T>
__inline__ __host__ cudaError_t cudaMallocPitch(
T **devPtr,
size_t *pitch,
size_t width,
size_t height
)
{
return cudaMallocPitch((void**)(void*)devPtr, pitch, width, height);
}
#if defined(__CUDACC__) #if defined(__CUDACC__)
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
/** /**
* \addtogroup CUDART_HIGHLEVEL * \addtogroup CUDART_HIGHLEVEL
skipping to change at line 305 skipping to change at line 357
*/ */
template<class T> template<class T>
__inline__ __host__ cudaError_t cudaGetSymbolSize( __inline__ __host__ cudaError_t cudaGetSymbolSize(
size_t *size, size_t *size,
const T &symbol const T &symbol
) )
{ {
return cudaGetSymbolSize(size, (const char*)&symbol); return cudaGetSymbolSize(size, (const char*)&symbol);
} }
/** @} */ /* END CUDART_MEMORY */ /** @} */ /* END CUDART_HIGHLEVEL */
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
/** /**
* \addtogroup CUDART_HIGHLEVEL * \addtogroup CUDART_HIGHLEVEL
* *
skipping to change at line 628 skipping to change at line 680
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
/** /**
* \ingroup CUDART_HIGHLEVEL * \ingroup CUDART_HIGHLEVEL
* \brief \hl Launches a device function * \brief \hl Launches a device function
* *
* Launches the function \p entry on the device. \p entry can either be a * Launches the function \p entry on the device. The parameter \p entry can
* function that executes on the device, or it can be a character string, * either be a function that executes on the device, or it can be a charact
* naming a function that executes on the device. \p entry must be declared er
as * string, naming a function that executes on the device. The parameter
* a \p __global__ function. * specified by \p entry must be declared as a \p __global__ function.
* \ref ::cudaLaunch(T*) "cudaLaunch()" must be preceded by a call to * \ref ::cudaLaunch(T*) "cudaLaunch()" must be preceded by a call to
* ::cudaConfigureCall() since it pops the data that was pushed by * ::cudaConfigureCall() since it pops the data that was pushed by
* ::cudaConfigureCall() from the execution stack. * ::cudaConfigureCall() from the execution stack.
* *
* \param entry - Device function pointer or char string naming device func tion * \param entry - Device function pointer or char string naming device func tion
* to execute * to execute
* *
* \return * \return
* ::cudaSuccess, * ::cudaSuccess,
* ::cudaErrorInvalidDeviceFunction, * ::cudaErrorInvalidDeviceFunction,
* ::cudaErrorInvalidConfiguration * ::cudaErrorInvalidConfiguration,
* ::cudaErrorLaunchFailure,
* ::cudaErrorPriorLaunchFailure,
* ::cudaErrorLaunchTimeout,
* ::cudaErrorLaunchOutOfResources
* \notefnerr * \notefnerr
* *
* \sa ::cudaConfigureCall, * \sa ::cudaConfigureCall,
* \ref ::cudaSetupArgument(T,size_t) "cudaSetupArgument (C++ API)", * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGe
* \ref ::cudaLaunch(const char*) "cudaLaunch (C API)" tAttributes (C++ API)",
* \ref ::cudaLaunch(const char*) "cudaLaunch (C API)",
* ::cudaSetDoubleForDevice,
* ::cudaSetDoubleForHost,
* \ref ::cudaSetupArgument(T,size_t) "cudaSetupArgument (C++ API)"
*/ */
template<class T> template<class T>
__inline__ __host__ cudaError_t cudaLaunch( __inline__ __host__ cudaError_t cudaLaunch(
T *entry T *entry
) )
{ {
return cudaLaunch((const char*)entry); return cudaLaunch((const char*)entry);
} }
/**
* \ingroup CUDART_HIGHLEVEL
* \brief \hl Find out attributes for a given function
*
* This function obtains the attributes of a function specified via \p entr
y.
* The parameter \p entry can either be a function that executes on the
* device, or it can be a character string, naming a function that executes
on
* the device. The parameter specified by \p entry must be declared as a
* \p __global__ function. The fetched attributes are placed in \p attr. If
* the specified function does not exist, then ::cudaErrorInvalidDeviceFunc
tion
* is returned.
*
* \param attr - Return pointer to function's attributes
* \param entry - Function to get attributes of
*
* \return
* ::cudaSuccess,
* ::cudaErrorInitializationError,
* ::cudaErrorInvalidDeviceFunction
* \notefnerr
*
* \sa ::cudaConfigureCall,
* \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const char*) "c
udaFuncGetAttributes (C API)",
* \ref ::cudaLaunch(T*) "cudaLaunch (C++ API)",
* ::cudaSetDoubleForDevice,
* ::cudaSetDoubleForHost,
* \ref ::cudaSetupArgument(T,size_t) "cudaSetupArgument (C++ API)"
*/
template<class T>
__inline__ __host__ cudaError_t cudaFuncGetAttributes(
struct cudaFuncAttributes *attr,
T *entry
)
{
return cudaFuncGetAttributes(attr, (const char*)entry);
}
#endif /* __CUDACC__ */ #endif /* __CUDACC__ */
#endif /* __cplusplus */ #endif /* __cplusplus */
#endif /* !__CUDA_RUNTIME_H__ */ #endif /* !__CUDA_RUNTIME_H__ */
 End of changes. 8 change blocks. 
11 lines changed or deleted 113 lines changed or added


 cuda_runtime_api.h   cuda_runtime_api.h 
skipping to change at line 41 skipping to change at line 41
* Any use of this source code in individual and commercial software must * Any use of this source code in individual and commercial software must
* include, in the user documentation and internal comments to the code, * include, in the user documentation and internal comments to the code,
* the above Disclaimer and U.S. Government End Users Notice. * the above Disclaimer and U.S. Government End Users Notice.
*/ */
#if !defined(__CUDA_RUNTIME_API_H__) #if !defined(__CUDA_RUNTIME_API_H__)
#define __CUDA_RUNTIME_API_H__ #define __CUDA_RUNTIME_API_H__
/************************************************************************** ***** /************************************************************************** *****
* * * *
* CUDA Runtime API Version 2.2 * * CUDA Runtime API Version 2.3 *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#define CUDART_VERSION 2020 #define CUDART_VERSION 2030
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#include "host_defines.h" #include "host_defines.h"
#include "builtin_types.h" #include "builtin_types.h"
skipping to change at line 111 skipping to change at line 111
extern __host__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size ); extern __host__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size );
extern __host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t siz e); extern __host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t siz e);
extern __host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t height); extern __host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t height);
extern __host__ cudaError_t CUDARTAPI cudaMallocArray(struct cudaArray **ar ray, const struct cudaChannelFormatDesc *desc, size_t width, size_t height __dv(1)); extern __host__ cudaError_t CUDARTAPI cudaMallocArray(struct cudaArray **ar ray, const struct cudaChannelFormatDesc *desc, size_t width, size_t height __dv(1));
extern __host__ cudaError_t CUDARTAPI cudaFree(void *devPtr); extern __host__ cudaError_t CUDARTAPI cudaFree(void *devPtr);
extern __host__ cudaError_t CUDARTAPI cudaFreeHost(void *ptr); extern __host__ cudaError_t CUDARTAPI cudaFreeHost(void *ptr);
extern __host__ cudaError_t CUDARTAPI cudaFreeArray(struct cudaArray *array ); extern __host__ cudaError_t CUDARTAPI cudaFreeArray(struct cudaArray *array );
extern __host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t by tes, unsigned int flags); extern __host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t by tes, unsigned int flags);
extern __host__ cudaError_t CUDARTAPI cudaHostGetDevicePointer(void **pDevi ce, void *pHost, unsigned int flags); extern __host__ cudaError_t CUDARTAPI cudaHostGetDevicePointer(void **pDevi ce, void *pHost, unsigned int flags);
extern __host__ cudaError_t CUDARTAPI cudaHostGetFlags(unsigned int *pFlags , void *pHost);
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src , size_t count, enum cudaMemcpyKind kind); extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src , size_t count, enum cudaMemcpyKind kind);
extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArray(struct cudaArray *d st, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cud aMemcpyKind kind); extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArray(struct cudaArray *d st, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cud aMemcpyKind kind);
extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArray(void *dst, const struct cudaArray *src, size_t wOffset, size_t hOffset, size_t count, enum c udaMemcpyKind kind); extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArray(void *dst, const struct cudaArray *src, size_t wOffset, size_t hOffset, size_t count, enum c udaMemcpyKind kind);
 End of changes. 3 change blocks. 
2 lines changed or deleted 3 lines changed or added


 cufft.h   cufft.h 
skipping to change at line 45 skipping to change at line 45
/* /*
* cufft.h * cufft.h
* Public header file for the NVIDIA Cuda FFT library (CUFFT) * Public header file for the NVIDIA Cuda FFT library (CUFFT)
*/ */
#ifndef _CUFFT_H_ #ifndef _CUFFT_H_
#define _CUFFT_H_ #define _CUFFT_H_
#include <stdio.h> #include <stdio.h>
#include "cuComplex.h" #include "cuComplex.h"
#include "driver_types.h"
#ifndef CUFFTAPI #ifndef CUFFTAPI
#ifdef _WIN32 #ifdef _WIN32
#define CUFFTAPI __stdcall #define CUFFTAPI __stdcall
#else #else
#define CUFFTAPI #define CUFFTAPI
#endif #endif
#endif #endif
#ifdef __cplusplus #ifdef __cplusplus
skipping to change at line 77 skipping to change at line 78
CUFFT_SETUP_FAILED = 0x7, CUFFT_SETUP_FAILED = 0x7,
CUFFT_INVALID_SIZE = 0x8 CUFFT_INVALID_SIZE = 0x8
} cufftResult; } cufftResult;
// CUFFT defines and supports the following data types // CUFFT defines and supports the following data types
// cufftHandle is a handle type used to store and access CUFFT plans. // cufftHandle is a handle type used to store and access CUFFT plans.
typedef unsigned int cufftHandle; typedef unsigned int cufftHandle;
// cufftReal is a single-precision, floating-point real data type. // cufftReal is a single-precision, floating-point real data type.
// cufftDoubleReal is a double-precision, real data type.
typedef float cufftReal; typedef float cufftReal;
typedef double cufftDoubleReal;
// cufftComplex is a single-precision, floating-point complex data type tha t // cufftComplex is a single-precision, floating-point complex data type tha t
// consists of interleaved real and imaginary components. // consists of interleaved real and imaginary components.
// typedef float cufftComplex[2]; // cufftDoubleComplex is the double-precision equivalent.
typedef cuComplex cufftComplex; typedef cuComplex cufftComplex;
typedef cuDoubleComplex cufftDoubleComplex;
// CUFFT transform directions // CUFFT transform directions
#define CUFFT_FORWARD -1 // Forward FFT #define CUFFT_FORWARD -1 // Forward FFT
#define CUFFT_INVERSE 1 // Inverse FFT #define CUFFT_INVERSE 1 // Inverse FFT
// CUFFT supports the following transform types // CUFFT supports the following transform types
typedef enum cufftType_t { typedef enum cufftType_t {
CUFFT_R2C = 0x2a, // Real to Complex (interleaved) CUFFT_R2C = 0x2a, // Real to Complex (interleaved)
CUFFT_C2R = 0x2c, // Complex (interleaved) to Real CUFFT_C2R = 0x2c, // Complex (interleaved) to Real
CUFFT_C2C = 0x29 // Complex to Complex, interleaved CUFFT_C2C = 0x29, // Complex to Complex, interleaved
CUFFT_D2Z = 0x6a, // Double to Double-Complex
CUFFT_Z2D = 0x6c, // Double-Complex to Double
CUFFT_Z2Z = 0x69 // Double-Complex to Double-Complex
} cufftType; } cufftType;
cufftResult CUFFTAPI cufftPlan1d(cufftHandle *plan, cufftResult CUFFTAPI cufftPlan1d(cufftHandle *plan,
int nx, int nx,
cufftType type, cufftType type,
int batch); int batch);
cufftResult CUFFTAPI cufftPlan2d(cufftHandle *plan, cufftResult CUFFTAPI cufftPlan2d(cufftHandle *plan,
int nx, int ny, int nx, int ny,
cufftType type); cufftType type);
skipping to change at line 123 skipping to change at line 130
int direction); int direction);
cufftResult CUFFTAPI cufftExecR2C(cufftHandle plan, cufftResult CUFFTAPI cufftExecR2C(cufftHandle plan,
cufftReal *idata, cufftReal *idata,
cufftComplex *odata); cufftComplex *odata);
cufftResult CUFFTAPI cufftExecC2R(cufftHandle plan, cufftResult CUFFTAPI cufftExecC2R(cufftHandle plan,
cufftComplex *idata, cufftComplex *idata,
cufftReal *odata); cufftReal *odata);
cufftResult CUFFTAPI cufftExecZ2Z(cufftHandle plan,
cufftDoubleComplex *idata,
cufftDoubleComplex *odata,
int direction);
cufftResult CUFFTAPI cufftExecD2Z(cufftHandle plan,
cufftDoubleReal *idata,
cufftDoubleComplex *odata);
cufftResult CUFFTAPI cufftExecZ2D(cufftHandle plan,
cufftDoubleComplex *idata,
cufftDoubleReal *odata);
cufftResult CUFFTAPI cufftSetStream(cufftHandle p,
cudaStream_t stream);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
#endif /* _CUFFT_H_ */ #endif /* _CUFFT_H_ */
 End of changes. 7 change blocks. 
4 lines changed or deleted 27 lines changed or added


 device_functions.h   device_functions.h 
skipping to change at line 139 skipping to change at line 139
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __log10f(float) __THROW; extern __device__ float __log10f(float) __THROW;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __logf(float) __THROW; extern __device__ float __logf(float) __THROW;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __powf(float, float) __THROW; extern __device__ float __powf(float, float) __THROW;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __float2int_rn(float);
/*DEVICE_BUILTIN*/
extern __device__ int __float2int_rz(float); extern __device__ int __float2int_rz(float);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __float2int_ru(float); extern __device__ int __float2int_ru(float);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __float2int_rd(float); extern __device__ int __float2int_rd(float);
/*DEVICE_BUILTIN*/
extern __device__ int __float2int_rn(float);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ unsigned int __float2uint_rn(float);
/*DEVICE_BUILTIN*/
extern __device__ unsigned int __float2uint_rz(float); extern __device__ unsigned int __float2uint_rz(float);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ unsigned int __float2uint_ru(float); extern __device__ unsigned int __float2uint_ru(float);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ unsigned int __float2uint_rd(float); extern __device__ unsigned int __float2uint_rd(float);
/*DEVICE_BUILTIN*/
extern __device__ unsigned int __float2uint_rn(float);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __int2float_rn(int);
/*DEVICE_BUILTIN*/
extern __device__ float __int2float_rz(int); extern __device__ float __int2float_rz(int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __int2float_ru(int); extern __device__ float __int2float_ru(int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __int2float_rd(int); extern __device__ float __int2float_rd(int);
/*DEVICE_BUILTIN*/
extern __device__ float __int2float_rn(int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __uint2float_rn(unsigned int);
/*DEVICE_BUILTIN*/
extern __device__ float __uint2float_rz(unsigned int); extern __device__ float __uint2float_rz(unsigned int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __uint2float_ru(unsigned int); extern __device__ float __uint2float_ru(unsigned int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __uint2float_rd(unsigned int); extern __device__ float __uint2float_rd(unsigned int);
/*DEVICE_BUILTIN*/
extern __device__ float __uint2float_rn(unsigned int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ long long int __float2ll_rn(float);
/*DEVICE_BUILTIN*/
extern __device__ long long int __float2ll_rz(float); extern __device__ long long int __float2ll_rz(float);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ long long int __float2ll_rn(float); extern __device__ long long int __float2ll_ru(float);
/*DEVICE_BUILTIN*/
extern __device__ long long int __float2ll_rd(float);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ unsigned long long int __float2ull_rn(float);
/*DEVICE_BUILTIN*/
extern __device__ unsigned long long int __float2ull_rz(float); extern __device__ unsigned long long int __float2ull_rz(float);
/*DEVICE_BUILTIN*/
extern __device__ unsigned long long int __float2ull_ru(float);
/*DEVICE_BUILTIN*/
extern __device__ unsigned long long int __float2ull_rd(float);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __ll2float_rn(long long int); extern __device__ float __ll2float_rn(long long int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __ull2float_rn(unsigned long long int); extern __device__ float __ull2float_rn(unsigned long long int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ unsigned short __float2half_rn(float);
/*DEVICE_BUILTIN*/
extern __device__ float __half2float(unsigned short);
/*DEVICE_BUILTIN*/
extern __device__ float __fadd_rn(float, float); extern __device__ float __fadd_rn(float, float);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __fadd_rz(float, float); extern __device__ float __fadd_rz(float, float);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __fadd_ru(float, float); extern __device__ float __fadd_ru(float, float);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __fadd_rd(float, float); extern __device__ float __fadd_rd(float, float);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ float __fmul_rn(float, float); extern __device__ float __fmul_rn(float, float);
skipping to change at line 260 skipping to change at line 275
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __clzll(long long int); extern __device__ int __clzll(long long int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __ffsll(long long int); extern __device__ int __ffsll(long long int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __popcll(unsigned long long int); extern __device__ int __popcll(unsigned long long int);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ unsigned long long int __brevll(unsigned long long int); extern __device__ unsigned long long int __brevll(unsigned long long int);
#if !defined(CUDA_NO_SM_13_DOUBLE_INTRINSICS) #if (__CUDA_ARCH__ >= 130)
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ int __double2int_rz(double); extern __device__ int __double2int_rz(double);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ unsigned int __double2uint_rz(double); extern __device__ unsigned int __double2uint_rz(double);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ long long int __double2ll_rz(double); extern __device__ long long int __double2ll_rz(double);
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __device__ unsigned long long int __double2ull_rz(double); extern __device__ unsigned long long int __double2ull_rz(double);
#endif /* ! CUDA_NO_SM_13_DOUBLE_INTRINSICS */ #endif /* __CUDA_ARCH__ >= 130 */
} }
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
static __inline__ __device__ int mulhi(int a, int b) static __inline__ __device__ int mulhi(int a, int b)
skipping to change at line 751 skipping to change at line 766
inc = abs(rem0) < abs(rem1); inc = abs(rem0) < abs(rem1);
resi = ((expo_res << 23) + r + inc); resi = ((expo_res << 23) + r + inc);
if (resi != 0x00800000) resi = 0; if (resi != 0x00800000) resi = 0;
return __int_as_float(sign | resi); return __int_as_float(sign | resi);
} }
} }
if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) {
divisor *= 0.25f; divisor *= 0.25f;
dividend *= 0.25f; dividend *= 0.25f;
} }
return dividend / divisor; return __fdividef (dividend, divisor);
} }
__device_func__(float __fdiv_rz (float dividend, float divisor)) __device_func__(float __fdiv_rz (float dividend, float divisor))
{ {
unsigned long long prod; unsigned long long prod;
unsigned r, f, x, y, expox, expoy, sign; unsigned r, f, x, y, expox, expoy, sign;
unsigned expo_res; unsigned expo_res;
unsigned resi, cvtxi, cvtyi; unsigned resi, cvtxi, cvtyi;
float t; float t;
skipping to change at line 819 skipping to change at line 834
if (rem1 < 0) r--; if (rem1 < 0) r--;
resi = ((expo_res << 23) + r); resi = ((expo_res << 23) + r);
if (resi != 0x00800000) resi = 0; if (resi != 0x00800000) resi = 0;
return __int_as_float(sign | resi); return __int_as_float(sign | resi);
} }
} }
if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) {
divisor *= 0.25f; divisor *= 0.25f;
dividend *= 0.25f; dividend *= 0.25f;
} }
return dividend / divisor; return __fdividef (dividend, divisor);
} }
__device_func__(float __fdiv_ru (float dividend, float divisor)) __device_func__(float __fdiv_ru (float dividend, float divisor))
{ {
unsigned long long prod; unsigned long long prod;
unsigned r, f, x, y, expox, expoy, sign; unsigned r, f, x, y, expox, expoy, sign;
unsigned expo_res; unsigned expo_res;
unsigned resi, cvtxi, cvtyi; unsigned resi, cvtxi, cvtyi;
float t; float t;
skipping to change at line 889 skipping to change at line 904
if ((rem1 > 0) && (!sign)) r++; if ((rem1 > 0) && (!sign)) r++;
resi = ((expo_res << 23) + r); resi = ((expo_res << 23) + r);
if (resi != 0x00800000) resi = 0; if (resi != 0x00800000) resi = 0;
return __int_as_float(sign | resi); return __int_as_float(sign | resi);
} }
} }
if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) {
divisor *= 0.25f; divisor *= 0.25f;
dividend *= 0.25f; dividend *= 0.25f;
} }
return dividend / divisor; return __fdividef (dividend, divisor);
} }
__device_func__(float __fdiv_rd (float dividend, float divisor)) __device_func__(float __fdiv_rd (float dividend, float divisor))
{ {
unsigned long long prod; unsigned long long prod;
unsigned r, f, x, y, expox, expoy, sign; unsigned r, f, x, y, expox, expoy, sign;
unsigned expo_res; unsigned expo_res;
unsigned resi, cvtxi, cvtyi; unsigned resi, cvtxi, cvtyi;
float t; float t;
skipping to change at line 959 skipping to change at line 974
if ((rem1 > 0) && (sign)) r++; if ((rem1 > 0) && (sign)) r++;
resi = ((expo_res << 23) + r); resi = ((expo_res << 23) + r);
if (resi != 0x00800000) resi = 0; if (resi != 0x00800000) resi = 0;
return __int_as_float(sign | resi); return __int_as_float(sign | resi);
} }
} }
if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) { if (__cuda_fabsf(divisor) > CUDART_TWO_TO_126_F) {
divisor *= 0.25f; divisor *= 0.25f;
dividend *= 0.25f; dividend *= 0.25f;
} }
return dividend / divisor; return __fdividef (dividend, divisor);
} }
__device_func__(float __fadd_ru (float a, float b)) __device_func__(float __fadd_ru (float a, float b))
{ {
unsigned int expo_x, expo_y; unsigned int expo_x, expo_y;
unsigned int xxi, yyi, temp; unsigned int xxi, yyi, temp;
xxi = __float_as_int(a); xxi = __float_as_int(a);
yyi = __float_as_int(b); yyi = __float_as_int(b);
skipping to change at line 2921 skipping to change at line 2937
while ((int)cvty.i >= 0) { while ((int)cvty.i >= 0) {
expoy--; expoy--;
cvty.i = cvty.i + cvty.i; cvty.i = cvty.i + cvty.i;
} }
cvty.i = cvty.i >> 8; cvty.i = cvty.i >> 8;
} }
goto divide; goto divide;
} }
} }
__device_func__(float __internal_fmul_kernel2 (float a, float b, __device_func__(float __internal_fmul_kernel (float a, float b,
enum cudaRoundMode mode)) enum cudaRoundMode mode))
{ {
unsigned long long product; unsigned long long product;
volatile union __cudart_FloatUintCvt xx, yy; volatile union __cudart_FloatUintCvt xx, yy;
unsigned expo_x, expo_y; unsigned expo_x, expo_y;
xx.f = a; xx.f = a;
yy.f = b; yy.f = b;
expo_y = 0xFF; expo_y = 0xFF;
expo_x = expo_y & (xx.i >> 23); expo_x = expo_y & (xx.i >> 23);
expo_x = expo_x - 1; expo_x = expo_x - 1;
expo_y = expo_y & (yy.i >> 23); expo_y = expo_y & (yy.i >> 23);
expo_y = expo_y - 1; expo_y = expo_y - 1;
skipping to change at line 3418 skipping to change at line 3433
} else if (mode == cudaRoundPosInf) { } else if (mode == cudaRoundPosInf) {
xx = xx + (!expo_y && temp); xx = xx + (!expo_y && temp);
} else if (mode == cudaRoundMinInf) { } else if (mode == cudaRoundMinInf) {
xx = xx + (expo_y && temp); xx = xx + (expo_y && temp);
} }
xx = expo_y + xx; /* add in sign bit */ xx = expo_y + xx; /* add in sign bit */
cvt.i = xx; cvt.i = xx;
return cvt.f; return cvt.f;
} }
/* NOTE: Does not currently support round-to-nearest, round-to-zero */ __device_func__(float __internal_fadd_kernel (float a, float b,
__device_func__(float __internal_fadd_kernel2 (float a, float b, enum cudaRoundMode mode))
enum cudaRoundMode mode))
{ {
volatile union __cudart_FloatUintCvt xx, yy; volatile union __cudart_FloatUintCvt xx, yy;
unsigned int expo_x; unsigned int expo_x;
unsigned int expo_y; unsigned int expo_y;
unsigned int temp; unsigned int temp;
xx.f = a; xx.f = a;
yy.f = b; yy.f = b;
/* make bigger operand the augend */ /* make bigger operand the augend */
skipping to change at line 3466 skipping to change at line 3480
if ((int)temp < 0) { if ((int)temp < 0) {
/* signs differ, effective subtraction */ /* signs differ, effective subtraction */
temp = 32 - expo_y; temp = 32 - expo_y;
temp = (expo_y) ? (yy.i << temp) : 0; temp = (expo_y) ? (yy.i << temp) : 0;
temp = (unsigned)(-((int)temp)); temp = (unsigned)(-((int)temp));
xx.i = xx.i - (yy.i >> expo_y) - (temp ? 1 : 0); xx.i = xx.i - (yy.i >> expo_y) - (temp ? 1 : 0);
if (xx.i & 0x00800000) { if (xx.i & 0x00800000) {
if (expo_x <= 0xFD) { if (expo_x <= 0xFD) {
xx.i = xx.i + (expo_x << 23); xx.i = xx.i + (expo_x << 23);
if (mode == cudaRoundMinInf) { if (mode == cudaRoundNearest) {
xx.i += (temp && (xx.i & 0x80000000)); if (temp < 0x80000000) return xx.f;
xx.i += ((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31));
} else if (mode == cudaRoundZero) {
} else if (mode == cudaRoundPosInf) { } else if (mode == cudaRoundPosInf) {
xx.i += (temp && !(xx.i & 0x80000000)); xx.i += (temp && !(xx.i & 0x80000000));
} else if (mode == cudaRoundMinInf) {
xx.i += (temp && (xx.i & 0x80000000));
} }
return xx.f; return xx.f;
} }
} else { } else {
if ((temp | (xx.i << 1)) == 0) { if ((temp | (xx.i << 1)) == 0) {
/* operands cancelled, resulting in a clean zero */ /* operands cancelled, resulting in a clean zero */
if (mode == cudaRoundMinInf) { if (mode == cudaRoundMinInf) {
xx.i = 0x80000000; xx.i = 0x80000000;
} else if (mode == cudaRoundPosInf) { } else {
xx.i = 0; xx.i = 0;
} }
return xx.f; return xx.f;
} }
/* normalize result */ /* normalize result */
yy.i = xx.i & 0x80000000; yy.i = xx.i & 0x80000000;
do { do {
xx.i = (xx.i << 1) | (temp >> 31); xx.i = (xx.i << 1) | (temp >> 31);
temp <<= 1; temp <<= 1;
expo_x--; expo_x--;
skipping to change at line 3500 skipping to change at line 3518
xx.i = xx.i | yy.i; xx.i = xx.i | yy.i;
} }
} else { } else {
/* signs are the same, effective addition */ /* signs are the same, effective addition */
temp = 32 - expo_y; temp = 32 - expo_y;
temp = (expo_y) ? (yy.i << temp) : 0; temp = (expo_y) ? (yy.i << temp) : 0;
xx.i = xx.i + (yy.i >> expo_y); xx.i = xx.i + (yy.i >> expo_y);
if (!(xx.i & 0x01000000)) { if (!(xx.i & 0x01000000)) {
if (expo_x <= 0xFD) { if (expo_x <= 0xFD) {
xx.i = xx.i + (expo_x << 23); xx.i = xx.i + (expo_x << 23);
if (mode == cudaRoundMinInf) { if (mode == cudaRoundNearest) {
xx.i += (temp && (xx.i & 0x80000000)); if (temp < 0x80000000) return xx.f;
xx.i += ((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31));
} else if (mode == cudaRoundZero) {
} else if (mode == cudaRoundPosInf) { } else if (mode == cudaRoundPosInf) {
xx.i += (temp && !(xx.i & 0x80000000)); xx.i += (temp && !(xx.i & 0x80000000));
} else if (mode == cudaRoundMinInf) {
xx.i += (temp && (xx.i & 0x80000000));
} }
return xx.f; return xx.f;
} }
} else { } else {
/* normalize result */ /* normalize result */
temp = (xx.i << 31) | (temp >> 1); temp = (xx.i << 31) | (temp >> 1);
xx.i = ((xx.i & 0x80000000) | (xx.i >> 1)) & ~0x40000000; xx.i = ((xx.i & 0x80000000) | (xx.i >> 1)) & ~0x40000000;
expo_x++; expo_x++;
} }
} }
if (expo_x <= 0xFD) { if (expo_x <= 0xFD) {
if (mode == cudaRoundMinInf) { xx.i = xx.i + (expo_x << 23);
xx.i += (temp && (xx.i & 0x80000000)); if (mode == cudaRoundNearest) {
if (temp < 0x80000000) return xx.f;
xx.i += ((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31));
} else if (mode == cudaRoundZero) {
} else if (mode == cudaRoundPosInf) { } else if (mode == cudaRoundPosInf) {
xx.i += (temp && !(xx.i & 0x80000000)); xx.i += (temp && !(xx.i & 0x80000000));
} else if (mode == cudaRoundMinInf) {
xx.i += (temp && (xx.i & 0x80000000));
} }
xx.i = xx.i + (expo_x << 23);
return xx.f; return xx.f;
} }
if ((int)expo_x >= 254) { if ((int)expo_x >= 254) {
/* overflow: return infinity or largest normal */ /* overflow: return infinity or largest normal */
temp = xx.i & 0x80000000; temp = xx.i & 0x80000000;
if (mode == cudaRoundMinInf) { if (mode == cudaRoundNearest) {
xx.i = (temp) | 0x7f800000;
} else if (mode == cudaRoundZero) {
xx.i = (temp) | 0x7f7fffff;
} else if (mode == cudaRoundMinInf) {
xx.i = (temp ? 0xFF800000 : 0x7f7fffff); xx.i = (temp ? 0xFF800000 : 0x7f7fffff);
} else if (mode == cudaRoundPosInf) { } else if (mode == cudaRoundPosInf) {
xx.i = (temp ? 0xff7fffff : 0x7F800000); xx.i = (temp ? 0xff7fffff : 0x7F800000);
} }
return xx.f; return xx.f;
} }
/* underflow: denormal, or smallest normal */ /* underflow: denormal, or smallest normal */
expo_y = expo_x + 32; expo_y = expo_x + 32;
yy.i = xx.i & 0x80000000; yy.i = xx.i & 0x80000000;
xx.i = xx.i & ~0xff000000; xx.i = xx.i & ~0xff000000;
expo_x = (unsigned)(-((int)expo_x)); expo_x = (unsigned)(-((int)expo_x));
temp = xx.i << expo_y | ((temp) ? 1 : 0); temp = xx.i << expo_y | ((temp) ? 1 : 0);
xx.i = yy.i | (xx.i >> expo_x); xx.i = yy.i | (xx.i >> expo_x);
if (mode == cudaRoundMinInf) { if (mode == cudaRoundNearest) {
xx.i += (temp && yy.i); xx.i += (temp == 0x80000000) ? (xx.i & 1) : (temp >> 31);
} else if (mode == cudaRoundZero) {
} else if (mode == cudaRoundPosInf) { } else if (mode == cudaRoundPosInf) {
xx.i += (temp && !yy.i); xx.i += (temp && !yy.i);
} else if (mode == cudaRoundMinInf) {
xx.i += (temp && yy.i);
} }
return xx.f; return xx.f;
} else { } else {
/* handle special cases separately */ /* handle special cases separately */
if (!(yy.i << 1)) { if (!(yy.i << 1)) {
if (mode == cudaRoundMinInf) { if (mode == cudaRoundMinInf) {
if (!(xx.i << 1)) { if (!(xx.i << 1)) {
xx.i = xx.i | yy.i; xx.i = xx.i | yy.i;
} }
} else if (mode == cudaRoundPosInf) { } else {
if (xx.i == 0x80000000) { if (xx.i == 0x80000000) {
xx.i = yy.i; xx.i = yy.i;
} }
} }
if ((xx.i << 1) > 0xff000000) { if ((xx.i << 1) > 0xff000000) {
xx.i |= 0x00400000; xx.i |= 0x00400000;
} }
return xx.f; return xx.f;
} }
if ((expo_y != 254) && (expo_x != 254)) { if ((expo_y != 254) && (expo_x != 254)) {
skipping to change at line 3680 skipping to change at line 3713
return __internal_fdiv_kernel (a, b, cudaRoundMinInf); return __internal_fdiv_kernel (a, b, cudaRoundMinInf);
} }
__device_func__(float __fdiv_ru (float a, float b)) __device_func__(float __fdiv_ru (float a, float b))
{ {
return __internal_fdiv_kernel (a, b, cudaRoundPosInf); return __internal_fdiv_kernel (a, b, cudaRoundPosInf);
} }
__device_func__(float __fadd_rd (float a, float b)) __device_func__(float __fadd_rd (float a, float b))
{ {
return __internal_fadd_kernel2 (a, b, cudaRoundMinInf); return __internal_fadd_kernel (a, b, cudaRoundMinInf);
} }
__device_func__(float __fadd_ru (float a, float b)) __device_func__(float __fadd_ru (float a, float b))
{ {
return __internal_fadd_kernel2 (a, b, cudaRoundPosInf); return __internal_fadd_kernel (a, b, cudaRoundPosInf);
} }
__device_func__(float __fmul_rd (float a, float b)) __device_func__(float __fmul_rd (float a, float b))
{ {
return __internal_fmul_kernel2 (a, b, cudaRoundMinInf); return __internal_fmul_kernel (a, b, cudaRoundMinInf);
} }
__device_func__(float __fmul_ru (float a, float b)) __device_func__(float __fmul_ru (float a, float b))
{ {
return __internal_fmul_kernel2 (a, b, cudaRoundPosInf); return __internal_fmul_kernel (a, b, cudaRoundPosInf);
} }
__device_func__(float __fmaf_rn (float a, float b, float c)) __device_func__(float __fmaf_rn (float a, float b, float c))
{ {
return __internal_fmaf_kernel (a, b, c, cudaRoundNearest); return __internal_fmaf_kernel (a, b, c, cudaRoundNearest);
} }
__device_func__(float __fmaf_rz (float a, float b, float c)) __device_func__(float __fmaf_rz (float a, float b, float c))
{ {
return __internal_fmaf_kernel (a, b, c, cudaRoundZero); return __internal_fmaf_kernel (a, b, c, cudaRoundZero);
skipping to change at line 3777 skipping to change at line 3810
{ {
long long int res; long long int res;
res = __umul64hi(a, b); res = __umul64hi(a, b);
if (a < 0LL) res = res - b; if (a < 0LL) res = res - b;
if (b < 0LL) res = res - a; if (b < 0LL) res = res - a;
return res; return res;
} }
__device_func__(float __saturatef(float a)) __device_func__(float __saturatef(float a))
{ {
if (__cuda___isnanf(a)) return 0.0f; // update of PTX spec 10/15/2008 if (__cuda___isnanf(a)) return 0.0f; /* update of PTX spec 10/15/2008 */
return a >= 1.0f ? 1.0f : a <= 0.0f ? 0.0f : a; return a >= 1.0f ? 1.0f : a <= 0.0f ? 0.0f : a;
} }
__device_func__(unsigned int __sad(int a, int b, unsigned int c)) __device_func__(unsigned int __sad(int a, int b, unsigned int c))
{ {
long long int diff = (long long int)a - (long long int)b; long long int diff = (long long int)a - (long long int)b;
return (unsigned int)(__cuda_llabs(diff) + (long long int)c); return (unsigned int)(__cuda_llabs(diff) + (long long int)c);
} }
skipping to change at line 4106 skipping to change at line 4139
{ {
volatile union __cudart_FloatUintCvt res; volatile union __cudart_FloatUintCvt res;
int shift; int shift;
unsigned int t; unsigned int t;
res.i = a; res.i = a;
if (a == 0) return res.f; if (a == 0) return res.f;
shift = __internal_normalize((unsigned int*)&res.i); shift = __internal_normalize((unsigned int*)&res.i);
t = res.i << 24; t = res.i << 24;
res.i = (res.i >> 8); res.i = (res.i >> 8);
res.i += (127 + 30 - shift) << 23; res.i += (127 + 30 - shift) << 23;
if ((rndMode == cudaRoundNearest) && (t >= 0x80000000)) { if (rndMode == cudaRoundNearest) {
res.i += (t == 0x80000000) ? (res.i & 1) : (t >> 31); res.i += (t == 0x80000000) ? (res.i & 1) : (t >> 31);
} }
else if ((rndMode == cudaRoundPosInf) && t) { else if ((rndMode == cudaRoundPosInf) && t) {
res.i++; res.i++;
} }
return res.f; return res.f;
} }
__device_func__(float __uint2float_rz(unsigned int a)) __device_func__(float __uint2float_rz(unsigned int a))
{ {
skipping to change at line 4144 skipping to change at line 4177
#else /* __MULTI_CORE__ */ #else /* __MULTI_CORE__ */
return __internal_uint2float_kernel(a, cudaRoundNearest); return __internal_uint2float_kernel(a, cudaRoundNearest);
#endif /* __MULTI_CORE__ */ #endif /* __MULTI_CORE__ */
} }
__device_func__(float __ll2float_rn(long long int a)) __device_func__(float __ll2float_rn(long long int a))
{ {
return (float)a; return (float)a;
} }
__device_func__(float __ull2float_rn(unsigned long long int a)) __device_func__(float __internal_ull2float_kernel(unsigned long long int a, enum cudaRoundMode rndMode))
{ {
#if defined(__MULTI_CORE__)
return (float)a;
#else /* __MULTI_CORE__ */
unsigned long long int temp; unsigned long long int temp;
unsigned int res, t; unsigned int res, t;
int shift; int shift;
if (a == 0ULL) return 0.0f; if (a == 0ULL) return 0.0f;
temp = a; temp = a;
shift = __internal_normalize64(&temp); shift = __internal_normalize64(&temp);
temp = (temp >> 8) | ((temp & 0xffULL) ? 1ULL : 0ULL); temp = (temp >> 8) | ((temp & 0xffULL) ? 1ULL : 0ULL);
res = (unsigned int)(temp >> 32); res = (unsigned int)(temp >> 32);
t = (unsigned int)temp; t = (unsigned int)temp;
res += (127 + 62 - shift) << 23; /* add in exponent */ res += (127 + 62 - shift) << 23; /* add in exponent */
res += t == 0x80000000 ? res & 1 : t >> 31; if (rndMode == cudaRoundNearest) {
res += (t == 0x80000000) ? (res & 1) : (t >> 31);
} else if (rndMode == cudaRoundPosInf) {
res += (t != 0);
}
return __int_as_float(res); return __int_as_float(res);
}
__device_func__(float __ull2float_rn(unsigned long long int a))
{
#if defined(__MULTI_CORE__)
return (float)a;
#else /* __MULTI_CORE__ */
return __internal_ull2float_kernel(a, cudaRoundNearest);
#endif /* __MULTI_CORE__ */ #endif /* __MULTI_CORE__ */
} }
__device_func__(float __internal_fmul_kernel(float a, float b, int rndNeare st)) __device_func__(unsigned short __float2half_rn(float f))
{ {
unsigned long long product; unsigned int x = __float_as_int (f);
volatile union __cudart_FloatUintCvt xx, yy; unsigned int u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1;
unsigned expo_x, expo_y; unsigned int sign, exponent, mantissa;
xx.f = a; /* Get rid of +NaN/-NaN case first. */
yy.f = b; if (u > 0x7f800000) {
return 0x7fff;
}
expo_y = 0xFF; sign = ((x >> 16) & 0x8000);
expo_x = expo_y & (xx.i >> 23);
expo_x = expo_x - 1;
expo_y = expo_y & (yy.i >> 23);
expo_y = expo_y - 1;
if ((expo_x <= 0xFD) && /* Get rid of +Inf/-Inf, +0/-0. */
(expo_y <= 0xFD)) { if (u > 0x477fefff) {
multiply: return sign | 0x7c00;
expo_x = expo_x + expo_y; }
expo_y = xx.i ^ yy.i; if (u < 0x33000001) {
xx.i = xx.i & 0x00ffffff; return sign | 0x0000;
yy.i = yy.i << 8;
xx.i = xx.i | 0x00800000;
yy.i = yy.i | 0x80000000;
/* compute product */
product = ((unsigned long long)xx.i) * yy.i;
expo_x = expo_x - 127 + 2;
expo_y = expo_y & 0x80000000;
xx.i = (unsigned int)(product >> 32);
yy.i = (unsigned int)(product & 0xffffffff);
/* normalize mantissa */
if (xx.i < 0x00800000) {
xx.i = (xx.i << 1) | (yy.i >> 31);
yy.i = (yy.i << 1);
expo_x--;
}
if (expo_x <= 0xFD) {
xx.i = xx.i | expo_y; /* OR in sign bit */
xx.i = xx.i + (expo_x << 23); /* add in exponent */
/* round result to nearest or even */
if (yy.i < 0x80000000) return xx.f;
xx.i += (((yy.i == 0x80000000) ? (xx.i & 1) : (yy.i >> 31))
&& rndNearest);
return xx.f;
} else if ((int)expo_x >= 254) {
/* overflow: return infinity */
xx.i = (expo_y | 0x7F800000) - (!rndNearest);
return xx.f;
} else {
/* zero, denormal, or smallest normal */
expo_x = ((unsigned int)-((int)expo_x));
if (expo_x > 25) {
/* massive underflow: return 0 */
xx.i = expo_y;
return xx.f;
} else {
yy.i = (xx.i << (32 - expo_x)) | ((yy.i) ? 1 : 0);
xx.i = expo_y + (xx.i >> expo_x);
xx.i += (((yy.i == 0x80000000) ? (xx.i & 1) : (yy.i >> 31))
&& rndNearest);
return xx.f;
}
}
} else {
product = xx.i ^ yy.i;
product = product & 0x80000000;
if (!(xx.i & 0x7fffffff)) {
if (expo_y != 254) {
xx.i = (unsigned int)product;
return xx.f;
}
expo_y = yy.i << 1;
if (expo_y == 0xFF000000) {
xx.i = expo_y | 0x00C00000;
} else {
xx.i = yy.i | 0x00400000;
}
return xx.f;
}
if (!(yy.i & 0x7fffffff)) {
if (expo_x != 254) {
xx.i = (unsigned int)product;
return xx.f;
}
expo_x = xx.i << 1;
if (expo_x == 0xFF000000) {
xx.i = expo_x | 0x00C00000;
} else {
xx.i = xx.i | 0x00400000;
}
return xx.f;
}
if ((expo_y != 254) && (expo_x != 254)) {
expo_y++;
expo_x++;
if (expo_x == 0) {
expo_y |= xx.i & 0x80000000;
/*
* If both operands are denormals, we only need to normalize
* one of them as the result will be either a denormal or zero.
*/
xx.i = xx.i << 8;
while (!(xx.i & 0x80000000)) {
xx.i <<= 1;
expo_x--;
}
xx.i = (xx.i >> 8) | (expo_y & 0x80000000);
expo_y &= ~0x80000000;
expo_y--;
goto multiply;
}
if (expo_y == 0) {
expo_x |= yy.i & 0x80000000;
yy.i = yy.i << 8;
while (!(yy.i & 0x80000000)) {
yy.i <<= 1;
expo_y--;
}
yy.i = (yy.i >> 8) | (expo_x & 0x80000000);
expo_x &= ~0x80000000;
expo_x--;
goto multiply;
}
}
expo_x = xx.i << 1;
expo_y = yy.i << 1;
/* if x is NaN, return x */
if (expo_x > 0xFF000000) {
/* cvt any SNaNs to QNaNs */
xx.i = xx.i | 0x00400000;
return xx.f;
}
/* if y is NaN, return y */
if (expo_y > 0xFF000000) {
/* cvt any SNaNs to QNaNs */
xx.i = yy.i | 0x00400000;
return xx.f;
}
xx.i = (unsigned int)product | 0x7f800000;
return xx.f;
} }
}
__device_func__(float __internal_fadd_kernel(float a, float b, int rndNeare
st))
{
volatile union __cudart_FloatUintCvt xx, yy;
unsigned int expo_x;
unsigned int expo_y;
unsigned int temp;
xx.f = a; exponent = ((u >> 23) & 0xff);
yy.f = b; mantissa = (u & 0x7fffff);
/* make bigger operand the augend */ if (exponent > 0x70) {
expo_y = yy.i << 1; shift = 13;
if (expo_y > (xx.i << 1)) { exponent -= 0x70;
expo_y = xx.i; } else {
xx.i = yy.i; shift = 0x7e - exponent;
yy.i = expo_y; exponent = 0;
mantissa |= 0x800000;
} }
lsb = (1 << shift);
lsb_s1 = (lsb >> 1);
lsb_m1 = (lsb - 1);
temp = 0xff; /* Round to nearest even. */
expo_x = temp & (xx.i >> 23); remainder = (mantissa & lsb_m1);
expo_x = expo_x - 1; mantissa >>= shift;
expo_y = temp & (yy.i >> 23); if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) {
expo_y = expo_y - 1; ++mantissa;
if (!(mantissa & 0x3ff)) {
++exponent;
mantissa = 0;
}
}
if ((expo_x <= 0xFD) && return sign | (exponent << 10) | mantissa;
(expo_y <= 0xFD)) { }
add: __device_func__(float __half2float(unsigned short h))
expo_y = expo_x - expo_y; {
if (expo_y > 25) { unsigned int sign = ((h >> 15) & 1);
expo_y = 31; unsigned int exponent = ((h >> 10) & 0x1f);
} unsigned int mantissa = ((h & 0x3ff) << 13);
temp = xx.i ^ yy.i;
xx.i = xx.i & ~0x7f000000;
xx.i = xx.i | 0x00800000;
yy.i = yy.i & ~0xff000000;
yy.i = yy.i | 0x00800000;
if ((int)temp < 0) { if (exponent == 0x1f) { /* NaN or Inf */
/* signs differ, effective subtraction */ mantissa = (mantissa
temp = 32 - expo_y; ? (sign = 0, 0x7fffff)
temp = (expo_y) ? (yy.i << temp) : 0; : 0);
temp = (unsigned int)(-((int)temp)); exponent = 0xff;
xx.i = xx.i - (yy.i >> expo_y) - (temp ? 1 : 0); } else if (!exponent) { /* Denorm or Zero */
if (xx.i & 0x00800000) { if (mantissa) {
if (expo_x <= 0xFD) { unsigned int msb;
xx.i = xx.i & ~0x00800000; /* lop off integer bit */ exponent = 0x71;
xx.i = (xx.i + (expo_x << 23)) + 0x00800000; do {
if (temp < 0x80000000) return xx.f; msb = (mantissa & 0x400000);
xx.i += (((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31)) mantissa <<= 1; /* normalize */
&& rndNearest); --exponent;
return xx.f; } while (!msb);
} mantissa &= 0x7fffff; /* 1.mantissa is implicit */
} else {
if ((temp | (xx.i << 1)) == 0) {
/* operands cancelled, resulting in a clean zero */
xx.i = 0;
return xx.f;
}
/* normalize result */
yy.i = xx.i & 0x80000000;
do {
xx.i = (xx.i << 1) | (temp >> 31);
temp <<= 1;
expo_x--;
} while (!(xx.i & 0x00800000));
xx.i = xx.i | yy.i;
}
} else {
/* signs are the same, effective addition */
temp = 32 - expo_y;
temp = (expo_y) ? (yy.i << temp) : 0;
xx.i = xx.i + (yy.i >> expo_y);
if (!(xx.i & 0x01000000)) {
if (expo_x <= 0xFD) {
expo_y = xx.i & 1;
xx.i = xx.i + (expo_x << 23);
if (temp < 0x80000000) return xx.f;
xx.i += (((temp == 0x80000000) ? expo_y : (temp >> 31))
&& rndNearest);
return xx.f;
}
} else {
/* normalize result */
temp = (xx.i << 31) | (temp >> 1);
/* not ANSI compliant: xx.i = (((int)xx.i)>>1) & ~0x40000000 */
xx.i = ((xx.i & 0x80000000) | (xx.i >> 1)) & ~0x40000000;
expo_x++;
}
}
if (expo_x <= 0xFD) {
expo_y = xx.i & 1;
xx.i += (((temp == 0x80000000) ? expo_y : (temp >> 31))
&& rndNearest);
xx.i = xx.i + (expo_x << 23);
return xx.f;
}
if ((int)expo_x >= 254) {
/* overflow: return infinity */
xx.i = ((xx.i & 0x80000000) | 0x7f800000) - (!rndNearest);
return xx.f;
} }
/* underflow: denormal, or smallest normal */
expo_y = expo_x + 32;
yy.i = xx.i & 0x80000000;
xx.i = xx.i & ~0xff000000;
expo_x = (unsigned int)(-((int)expo_x));
temp = xx.i << expo_y | ((temp) ? 1 : 0);
xx.i = yy.i | (xx.i >> expo_x);
xx.i += (((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31))
&& rndNearest);
return xx.f;
} else { } else {
/* handle special cases separately */ exponent += 0x70;
if (!(yy.i << 1)) {
if (xx.i == 0x80000000) {
xx.i = yy.i;
}
if ((xx.i << 1) > 0xff000000) {
xx.i |= 0x00400000;
}
return xx.f;
}
if ((expo_y != 254) && (expo_x != 254)) {
/* remove sign bits */
if (expo_x == (unsigned int) -1) {
temp = xx.i & 0x80000000;
xx.i = xx.i << 8;
while (!(xx.i & 0x80000000)) {
xx.i <<= 1;
expo_x--;
}
expo_x++;
xx.i = (xx.i >> 8) | temp;
}
if (expo_y == (unsigned int) -1) {
temp = yy.i & 0x80000000;
yy.i = yy.i << 8;
while (!(yy.i & 0x80000000)) {
yy.i <<= 1;
expo_y--;
}
expo_y++;
yy.i = (yy.i >> 8) | temp;
}
goto add;
}
expo_x = xx.i << 1;
expo_y = yy.i << 1;
/* if x is NaN, return x */
if (expo_x > 0xff000000) {
/* cvt any SNaNs to QNaNs */
xx.i = xx.i | 0x00400000;
return xx.f;
}
/* if y is NaN, return y */
if (expo_y > 0xff000000) {
/* cvt any SNaNs to QNaNs */
xx.i = yy.i | 0x00400000;
return xx.f;
}
if ((expo_x == 0xff000000) && (expo_y == 0xff000000)) {
/*
* subtraction of infinities with the same sign, and addition of
* infinities of unlike sign is undefined: return NaN INDEFINITE
*/
expo_x = xx.i ^ yy.i;
xx.i = xx.i | ((expo_x) ? 0xffc00000 : 0);
return xx.f;
}
/* handle infinities */
if (expo_y == 0xff000000) {
xx.i = yy.i;
}
return xx.f;
} }
return __int_as_float ((sign << 31) | (exponent << 23) | mantissa);
} }
__device_func__(float __fadd_rz(float a, float b)) __device_func__(float __fadd_rz(float a, float b))
{ {
return __internal_fadd_kernel(a, b, 0); return __internal_fadd_kernel(a, b, cudaRoundZero);
} }
__device_func__(float __fmul_rz(float a, float b)) __device_func__(float __fmul_rz(float a, float b))
{ {
return __internal_fmul_kernel(a, b, 0); return __internal_fmul_kernel(a, b, cudaRoundZero);
} }
__device_func__(float __fadd_rn(float a, float b)) __device_func__(float __fadd_rn(float a, float b))
{ {
return __internal_fadd_kernel(a, b, 1); return __internal_fadd_kernel(a, b, cudaRoundNearest);
} }
__device_func__(float __fmul_rn(float a, float b)) __device_func__(float __fmul_rn(float a, float b))
{ {
return __internal_fmul_kernel(a, b, 1); return __internal_fmul_kernel(a, b, cudaRoundNearest);
} }
__device_func__(void __brkpt(int c)) __device_func__(void __brkpt(int c))
{ {
/* TODO */ /* TODO */
} }
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
#define __syncthreads() \ #define __syncthreads() \
skipping to change at line 4543 skipping to change at line 4342
#endif /* __GNUC__ */ #endif /* __GNUC__ */
#endif /* __MULTI_CORE__ */ #endif /* __MULTI_CORE__ */
__device_func__(void __prof_trigger(int a)) __device_func__(void __prof_trigger(int a))
{ {
} }
__device_func__(void __threadfence(void)) __device_func__(void __threadfence(void))
{ {
__syncthreads();
} }
__device_func__(void __threadfence_block(void)) __device_func__(void __threadfence_block(void))
{ {
__syncthreads();
} }
#if defined(__GNUC__) #if defined(__GNUC__)
__device_func__(void __trap(void)) __device_func__(void __trap(void))
{ {
__builtin_trap(); __builtin_trap();
} }
#elif defined(_WIN32) #elif defined(_WIN32)
skipping to change at line 4572 skipping to change at line 4373
#endif /* __GNUC__ */ #endif /* __GNUC__ */
#endif /* __CUDABE__ */ #endif /* __CUDABE__ */
/************************************************************************** ***** /************************************************************************** *****
* * * *
* DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS * * DEVICE IMPLEMENTATIONS FOR FUNCTIONS WITH BUILTIN NVOPENCC OPERATIONS *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#if !defined(__CUDABE__)
__device_func__(float __fdividef(float a, float b)) __device_func__(float __fdividef(float a, float b))
{ {
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
return a / b; return a / b;
#elif defined(__CUDABE__)
return a / b;
#else /* __MULTI_CORE__ */ #else /* __MULTI_CORE__ */
volatile float aa = a;
volatile float bb = b;
/* match range restrictions of the device function */ /* match range restrictions of the device function */
if (__cuda_fabsf(b) > CUDART_TWO_TO_126_F) { if (__cuda_fabsf(bb) > CUDART_TWO_TO_126_F) {
if (__cuda_fabsf(a) <= CUDART_NORM_HUGE_F) { if (__cuda_fabsf(aa) <= CUDART_NORM_HUGE_F) {
return ((a / b) / CUDART_NORM_HUGE_F) / CUDART_NORM_HUGE_F; return ((aa / bb) / CUDART_NORM_HUGE_F) / CUDART_NORM_HUGE_F;
} else { } else {
return __int_as_float(0xffc00000); bb = 1.0f / bb;
bb = bb / CUDART_NORM_HUGE_F;
return aa * bb;
} }
} else { } else {
return a / b; return aa / bb;
} }
#endif /* __MULTI_CORE__ */ #endif /* __MULTI_CORE__ */
} }
#endif /* !defined(__CUDABE__) */
__device_func__(float __sinf(float a)) __device_func__(float __sinf(float a))
{ {
#if !defined(__CUDABE__)
if ((__float_as_int(a) << 1) == 0xff000000) {
return __fadd_rn (a, -a); /* return NaN */
}
#endif /* !defined(__CUDABE__) */
return sinf(a); return sinf(a);
} }
__device_func__(float __cosf(float a)) __device_func__(float __cosf(float a))
{ {
#if !defined(__CUDABE__)
if ((__float_as_int(a) << 1) == 0xff000000) {
return __fadd_rn (a, -a); /* return NaN */
}
#endif /* !defined(__CUDABE__) */
return cosf(a); return cosf(a);
} }
__device_func__(float __log2f(float a)) __device_func__(float __log2f(float a))
{ {
return log2f(a); return log2f(a);
} }
/************************************************************************** ***** /************************************************************************** *****
* * * *
* SHARED HOST AND DEVICE IMPLEMENTATIONS * * SHARED HOST AND DEVICE IMPLEMENTATIONS *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
__device_func__(float __internal_accurate_fdividef(float a, float b)) __device_func__(float __internal_accurate_fdividef(float a, float b))
{ {
if (__cuda_fabsf(b) > CUDART_TWO_TO_126_F) { return a / b;
a *= .25f;
b *= .25f;
}
return __fdividef(a, b);
} }
__device_func__(float __tanf(float a)) __device_func__(float __tanf(float a))
{ {
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
return tanf(a); return tanf(a);
#else /* __MULTI_CORE__ */ #else /* __MULTI_CORE__ */
return __fdividef (__sinf(a), __cosf(a)); return __fdividef (__sinf(a), __cosf(a));
#endif /* __MULTI_CORE__ */ #endif /* __MULTI_CORE__ */
} }
 End of changes. 76 change blocks. 
367 lines changed or deleted 175 lines changed or added


 device_launch_parameters.h   device_launch_parameters.h 
skipping to change at line 53 skipping to change at line 53
#define __STORAGE__ \ #define __STORAGE__ \
extern const extern const
#endif /* __STORAGE__ */ #endif /* __STORAGE__ */
#if defined(__cplusplus) #if defined(__cplusplus)
extern "C" { extern "C" {
#endif /* __cplusplus */ #endif /* __cplusplus */
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct uint3 __STORAGE__ threadIdx; uint3 __STORAGE__ threadIdx;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct uint3 __STORAGE__ blockIdx; uint3 __STORAGE__ blockIdx;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct dim3 __STORAGE__ blockDim; dim3 __STORAGE__ blockDim;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct dim3 __STORAGE__ gridDim; dim3 __STORAGE__ gridDim;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
int __STORAGE__ warpSize; int __STORAGE__ warpSize;
#undef __STORAGE__ #undef __STORAGE__
#if defined(__cplusplus) #if defined(__cplusplus)
} }
#endif /* __cplusplus */ #endif /* __cplusplus */
#if !defined(__cudaGet_threadIdx) #if !defined(__cudaGet_threadIdx)
 End of changes. 4 change blocks. 
4 lines changed or deleted 4 lines changed or added


 device_runtime.h   device_runtime.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2008 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 56 skipping to change at line 56
s s
#define __unsized_shared_data(name, type_post) \ #define __unsized_shared_data(name, type_post) \
__unsized##name __unsized##type_post __unsized##name __unsized##type_post
#define __sized_shared_data(name, type) \ #define __sized_shared_data(name, type) \
__sized##name type __sized##name type
#define __sized__shared_var(name, s, type) \ #define __sized__shared_var(name, s, type) \
name name
/*TEXTURE_TYPE*/ /*TEXTURE_TYPE*/
typedef const void *__texture_type__; typedef const void *__texture_type__;
/*SURFACE_TYPE*/
typedef const void *__surface_type__;
#if defined(__CUDABE__) /* cudabe compiler */ #if defined(__CUDABE__) /* cudabe compiler */
#define __pad__(f) #define __pad__(f)
#define __text__ \ #define __text__ \
__attribute__((__texture__)) __attribute__((__texture__))
#define __surf__ \
__attribute__((__surface__))
#define ___device__(sc) \ #define ___device__(sc) \
static static
#define __in__(cdecl, decl) \ #define __in__(cdecl, decl) \
__shared__ cdecl __shared__ cdecl
#define __in_type__(cdecl, decl) \ #define __in_type__(cdecl, decl) \
cdecl cdecl
#define __texture_var(name) \ #define __texture_var(name) \
name name
#define __shared_var(name, s, type) \ #define __shared_var(name, s, type) \
name name
skipping to change at line 86 skipping to change at line 90
#define __copy_param(local_decl, param) \ #define __copy_param(local_decl, param) \
local_decl = param local_decl = param
#define __unsized_array_size \ #define __unsized_array_size \
[] []
#define __unsized__shared_var(name, s, type) \ #define __unsized__shared_var(name, s, type) \
name name
#define __unsized__empty_array(s) \ #define __unsized__empty_array(s) \
s s
#define __var_used__ \ #define __var_used__ \
__attribute__((__used__)) __attribute__((__used__))
#define __storage_extern_unsized__shared__ \
extern
#undef __cdecl #undef __cdecl
#define __cdecl #define __cdecl
#undef __w64 #undef __w64
#define __w64 #define __w64
#elif defined(__CUDACC__) /* cudafe compiler */ #elif defined(__CUDACC__) /* cudafe compiler */
#define __loc_sc__(loc, sc) \ #define __loc_sc__(loc, size, sc) \
sc loc sc loc
#define __pad__(f) #define __pad__(f)
#define __text__ #define __text__
#define __surf__
#define ___device__(sc) \ #define ___device__(sc) \
sc __device__ sc __device__
#define __in__(cdecl, decl) \ #define __in__(cdecl, decl) \
decl decl
#define __in_type__(cdecl, decl) \ #define __in_type__(cdecl, decl) \
decl decl
#define __texture_var(name) \ #define __texture_var(name) \
name name
#define __shared_var(name, s, type) \ #define __shared_var(name, s, type) \
name name
skipping to change at line 127 skipping to change at line 134
#define __unsized__empty_array(s) \ #define __unsized__empty_array(s) \
s s
#else /* host compiler (cl, gcc, open64, ...) */ #else /* host compiler (cl, gcc, open64, ...) */
#if defined (__MULTI_CORE__) || defined(__multi_core__) #if defined (__MULTI_CORE__) || defined(__multi_core__)
struct uint3; struct uint3;
extern struct uint3* CUDARTAPI __cudaGetBlockIdxPtr(void); extern struct uint3* CUDARTAPI __cudaGetBlockIdxPtr(void);
extern void* CUDARTAPI __cudaGetSharedMem(void*); extern void* CUDARTAPI __cudaGetSharedMem(void*);
extern void* CUDARTAPI __cudaCmcHostMalloc(size_t);
extern size_t CUDARTAPI __cudaCmcGetStackSize(void);
#endif /* __MULTI_CORE__ || __multi_core__ */ #endif /* __MULTI_CORE__ || __multi_core__ */
#if defined (__multi_core__) #if defined (__multi_core__)
#if defined(__GNUC__) #if defined(__GNUC__)
#if defined(__cplusplus) #if defined(__cplusplus)
extern void *alloca(size_t) throw(); extern void *alloca(size_t) throw();
#else /* __cplusplus */ #else /* __cplusplus */
extern void *alloca(size_t); extern void *alloca(size_t);
#endif /* __cplusplus */ #endif /* __cplusplus */
#define __cuda_alloc(s) \ #define __cuda_alloca(s) \
alloca(s) alloca(s)
#else /* __GNUC__ */ #else /* __GNUC__ */
extern void *_alloca(size_t); extern void *_alloca(size_t);
#define __cuda_alloc(s) \ #define __cuda_alloca(s) \
_alloca(s) _alloca(s)
#endif /* __GNUC__ */ #endif /* __GNUC__ */
/* check if enough stack size remains for alloca to succeed. If so,
use faster alloca() to do the allocation. Otherwise, allocate memory
using the __cudaCmcHostMalloc() runtime function, which uses the slower
malloc path to allocate memory
*/
#define __cudaCmcTargAlloc(num_bytes, max_stacksize, ptr_counter_stacksize,
ptr_ret_sym) \
do { \
if (*(ptr_counter_stacksize) + (num_bytes) >= (max_stacksize)) { \
*(ptr_ret_sym) = __cudaCmcHostMalloc((num_bytes)); \
} else { \
*(ptr_ret_sym) = __cuda_alloca((num_bytes)); \
*(ptr_counter_stacksize) = *(ptr_counter_stacksize) + (num_bytes); \
} \
} while(0)
#endif /* __multi_core__ */ #endif /* __multi_core__ */
#if defined (__MULTI_CORE__) #if defined (__MULTI_CORE__)
#define ___device__(sc) \ #define ___device__(sc) \
static static
#define __pad__(f) \ #define __pad__(f) \
f f
#define __text__ #define __text__
#define __surf__
#define __cudaGet_blockIdx() \ #define __cudaGet_blockIdx() \
(*__cudaGetBlockIdxPtr()) (*__cudaGetBlockIdxPtr())
#define __shared_var(name, s, type) \ #define __shared_var(name, s, type) \
(s type __cudaGetSharedMem((void*)(&(name)))) (s type __cudaGetSharedMem((void*)(&(name))))
#define __var_used__ \ #define __var_used__ \
__attribute__((__used__)) __attribute__((__used__))
#define __storage_auto__shared__ \ #define __storage_auto__shared__ \
auto auto
#undef __cdecl #undef __cdecl
skipping to change at line 201 skipping to change at line 226
#elif defined(__GNUC__) #elif defined(__GNUC__)
#define __STORAGE__ \ #define __STORAGE__ \
__attribute__((__common__)) __attribute__((__common__))
#elif defined(__cplusplus) #elif defined(__cplusplus)
#define __STORAGE__ \ #define __STORAGE__ \
__declspec(selectany) __declspec(selectany)
#pragma warning(disable: 4099 4190)
#else /* __APPLE__ || __ICC */ #else /* __APPLE__ || __ICC */
#define __STORAGE__ #define __STORAGE__
#endif /* __APPLE__ || __ICC */ #endif /* __APPLE__ || __ICC */
#endif /* __MULTI_CORE__ */ #endif /* __MULTI_CORE__ */
#define __in__(cdecl, decl) \ #define __in__(cdecl, decl) \
decl decl
skipping to change at line 230 skipping to change at line 253
name name
#define __copy_param(local_decl, param) #define __copy_param(local_decl, param)
#define __unsized_array_size #define __unsized_array_size
#define __unsized__shared_var(name, s, type) \ #define __unsized__shared_var(name, s, type) \
(*name) (*name)
#define __unsized__empty_array(s) #define __unsized__empty_array(s)
/* this is compiled with a host compiler for device emulation */ /* this is compiled with a host compiler for device emulation */
#define __device_emulation #define __device_emulation
#if defined(__cplusplus)
#undef __VECTOR_TYPES_H__
#if defined(_WIN32)
#pragma warning(disable: 4190 4522)
#endif /* _WIN32 */
#endif /* __cplusplus */
#endif /* __CUDABE__ */ #endif /* __CUDABE__ */
#if defined(__cplusplus)
static void *__cuda_memcpy(void*, const void*, size_t);
/* for C++ compilation of lowered C++ (i.e. C) code */
#define __cuda_assign_operators(tag)
\
tag& operator=( tag& a) { __cuda_memcpy(this, &a,
sizeof(tag)); return *this;} \
tag& operator=(volatile tag& a) volatile { return *(tag*)this = (ta
g&)a; } \
tag& operator=( const tag& a) { return *(tag*)this = (ta
g&)a; } \
#endif /* __cplusplus */
#include "builtin_types.h" #include "builtin_types.h"
#include "device_launch_parameters.h" #include "device_launch_parameters.h"
#include "storage_class.h" #include "storage_class.h"
 End of changes. 14 change blocks. 
6 lines changed or deleted 58 lines changed or added


 driver_types.h   driver_types.h 
skipping to change at line 143 skipping to change at line 143
/** /**
* Channel format kind * Channel format kind
*/ */
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
enum cudaChannelFormatKind enum cudaChannelFormatKind
{ {
cudaChannelFormatKindSigned = 0, ///< Signed channel for mat cudaChannelFormatKindSigned = 0, ///< Signed channel for mat
cudaChannelFormatKindUnsigned = 1, ///< Unsigned channel f ormat cudaChannelFormatKindUnsigned = 1, ///< Unsigned channel f ormat
cudaChannelFormatKindFloat = 2, ///< Float channel form at cudaChannelFormatKindFloat = 2, ///< Float channel form at
cudaChannelFormatKindNone = 3, ///< No channel format cudaChannelFormatKindNone = 3 ///< No channel format
}; };
/** /**
* CUDA Channel format descriptor * CUDA Channel format descriptor
*/ */
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct cudaChannelFormatDesc struct cudaChannelFormatDesc
{ {
int x; ///< x int x; ///< x
int y; ///< y int y; ///< y
skipping to change at line 185 skipping to change at line 185
}; };
/** /**
* CUDA Pitched memory pointer * CUDA Pitched memory pointer
*/ */
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct cudaPitchedPtr struct cudaPitchedPtr
{ {
void *ptr; ///< Pointer to allocated memory void *ptr; ///< Pointer to allocated memory
size_t pitch; ///< Pitch of allocated memory in bytes size_t pitch; ///< Pitch of allocated memory in bytes
size_t xsize; ///< Logical width of allocation in bytes size_t xsize; ///< Logical width of allocation in elements
size_t ysize; ///< Logical height of allocation in bytes size_t ysize; ///< Logical height of allocation in elements
}; };
/** /**
* CUDA extent * CUDA extent
*/ */
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct cudaExtent struct cudaExtent
{ {
size_t width; ///< Width in bytes size_t width; ///< Width in bytes
size_t height; ///< Height in bytes size_t height; ///< Height in bytes
 End of changes. 2 change blocks. 
3 lines changed or deleted 3 lines changed or added


 func_macro.h   func_macro.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2008 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 52 skipping to change at line 52
___device__(static) decl ___device__(static) decl
#define __device_func__(decl) \ #define __device_func__(decl) \
___device__(static) decl ___device__(static) decl
#else /* __CUDABE__ */ #else /* __CUDABE__ */
#if !defined(__CUDA_INTERNAL_COMPILATION__) #if !defined(__CUDA_INTERNAL_COMPILATION__)
#error -- incorrect inclusion of a cudart header file #error -- incorrect inclusion of a cudart header file
#endif /* __CUDA_INTERNAL_COMPILATION__ */ #endif /* !__CUDA_INTERNAL_COMPILATION__ */
#if defined(__cplusplus) && defined(__device_emulation) && !defined(__multi
_core__)
#define __begin_host_func \
}}
#define __end_host_func \
namespace __cuda_emu { extern "C" {
#define __host_device_call(f) \
__cuda_emu::f
#else /* __cplusplus && __device_emulation && !__multi_core__ */
#define __begin_host_func
#define __end_host_func
#define __host_device_call(f) \
f
#endif /* __cplusplus && __device_emulation !__multi_core__ */
#if defined(__APPLE__) #if defined(__APPLE__)
#define __func__(decl) \ #define __func__(decl) \
extern __attribute__((__weak_import__, __weak__)) decl; decl extern __attribute__((__weak_import__, __weak__)) decl; decl
#define __device_func__(decl) \
static __attribute__((__unused__)) decl
#elif defined(__GNUC__) #elif defined(__GNUC__)
#define __func__(decl) \ #define __func__(decl) \
extern __attribute__((__weak__)) decl; decl extern __attribute__((__weak__)) decl; decl
#define __device_func__(decl) \
static __attribute__((__unused__)) decl
#elif defined(_WIN32) #elif defined(_WIN32)
#define __func__(decl) \ #define __func__(decl) \
static decl static decl
#endif /* __APPLE__ */
#define __device_func__(decl) \ #define __device_func__(decl) \
static decl static decl
#endif /* __APPLE__ */
#endif /* CUDABE */ #endif /* CUDABE */
#endif /* __FUNC_MACRO_H__ */ #endif /* __FUNC_MACRO_H__ */
 End of changes. 6 change blocks. 
5 lines changed or deleted 27 lines changed or added


 host_config.h   host_config.h 
skipping to change at line 119 skipping to change at line 119
#if !defined(NOMINMAX) #if !defined(NOMINMAX)
#define NOMINMAX /* min and max are part of cuda runtime */ #define NOMINMAX /* min and max are part of cuda runtime */
#endif /* !NOMINMAX */ #endif /* !NOMINMAX */
#include <crtdefs.h> /* for _CRTIMP */ #include <crtdefs.h> /* for _CRTIMP */
#define __THROW #define __THROW
#endif /* __GNUC__ */ #endif /* __APPLE__ */
#endif /* __CUDACC__ */ #endif /* __CUDACC__ */
#endif /* !__HOST_CONFIG_H__ */ #endif /* !__HOST_CONFIG_H__ */
 End of changes. 1 change blocks. 
1 lines changed or deleted 1 lines changed or added


 host_defines.h   host_defines.h 
skipping to change at line 101 skipping to change at line 101
__loc__(__declspec(a)) __loc__(__declspec(a))
#define CUDARTAPI \ #define CUDARTAPI \
__stdcall __stdcall
#endif /* !__GNUC__ && !_WIN32 */ #endif /* !__GNUC__ && !_WIN32 */
#if defined(__CUDACC__) || defined(__CUDABE__) || defined (__MULTI_CORE__) #if defined(__CUDACC__) || defined(__CUDABE__) || defined (__MULTI_CORE__)
#define __loc__(a) \ #define __loc__(a) \
a a
#define __builtin_align__(a) \
__align__(a)
#else /* __CUDACC__ || __CUDABE__ || __MULTI_CORE__ */ #else /* __CUDACC__ || __CUDABE__ || __MULTI_CORE__ */
#define __loc__(a) #define __loc__(a)
#define __builtin_align__(a)
#endif /* __CUDACC__ || __CUDABE__ || __MULTI_CORE__ */ #endif /* __CUDACC__ || __CUDABE__ || __MULTI_CORE__ */
#if defined(__CUDACC__) || defined(__CUDABE__) || defined (__MULTI_CORE__)
|| \
defined(__GNUC__) || defined(_WIN64)
#define __builtin_align__(a) \
__align__(a)
#else /* __CUDACC__ || __CUDABE__ || __MULTI_CORE__ || __GNUC__ || _WIN64 *
/
#define __builtin_align__(a)
#endif /* __CUDACC__ || __CUDABE__ || __MULTI_CORE__ || __GNUC__ || _WIN64
*/
#define __device__ \ #define __device__ \
__location__(__device__) __location__(__device__)
#define __host__ \ #define __host__ \
__location__(__host__) __location__(__host__)
#define __global__ \ #define __global__ \
__location__(__global__) __location__(__global__)
#define __shared__ \ #define __shared__ \
__location__(__shared__) __location__(__shared__)
#define __constant__ \ #define __constant__ \
__location__(__constant__) __location__(__constant__)
#define __launch_bounds__(t) \ #define __launch_bounds__(...) \
__location__(__launch_bounds__(t)) __location__(__launch_bounds__(__VA_ARGS__))
#endif /* !__HOST_DEFINES_H__ */ #endif /* !__HOST_DEFINES_H__ */
 End of changes. 4 change blocks. 
5 lines changed or deleted 17 lines changed or added


 host_runtime.h   host_runtime.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2008 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 43 skipping to change at line 43
* the above Disclaimer and U.S. Government End Users Notice. * the above Disclaimer and U.S. Government End Users Notice.
*/ */
#if !defined(__CUDA_INTERNAL_COMPILATION__) #if !defined(__CUDA_INTERNAL_COMPILATION__)
#define __CUDA_INTERNAL_COMPILATION__ #define __CUDA_INTERNAL_COMPILATION__
#define __glob_pref_var(var) \ #define __glob_pref_var(var) \
__global_##var __global_##var
#define __global_var(var) \ #define __global_var(var) \
(*__glob_pref_var(var)) (*__glob_pref_var(var))
#define __shadow_pref_var(var) \ #define __shadow_var(c, cpp) \
__shadow_##var __shadow_pref_var(c, cpp)
#define __shadow_var(var) \
__shadow_pref_var(var)
#define __pad__(f) \
f
#define __text__ #define __text__
#define __surf__
#define __dv(v) #define __dv(v)
#if defined(_WIN32) && !defined(_WIN64)
#define __pad__(f) \
f
#else /* _WIN32 && !_WIN64 */
#define __pad__(f)
#endif /* _WIN32 && !_WIN64 */
#if defined(__APPLE__) #if defined(__APPLE__)
#define __extern_weak__ \ #define __extern_weak__ \
__weak_import__, __weak_import__,
#elif defined(__GNUC__) #elif defined(__GNUC__)
#define __extern_weak__ #define __extern_weak__
#endif /* __APPLE__ */ #endif /* __APPLE__ */
#if defined(__cplusplus) #if defined(__cplusplus)
#define __shadow_pref_var(c, cpp) \
cpp##__cuda_shadow_variable__
#define __device_stub_name(c, cpp) \ #define __device_stub_name(c, cpp) \
cpp cpp
#define __text_var(c, cpp) \
cpp
#define __cppref__ \ #define __cppref__ \
& &
#else /* __cplusplus */ #else /* __cplusplus */
#define __shadow_pref_var(c, cpp) \
c##__cuda_shadow_variable__
#define __device_stub_name(c, cpp) \ #define __device_stub_name(c, cpp) \
c c
#define __text_var(c, cpp) \
c
#define __cppref__ #define __cppref__
typedef char bool; typedef char bool;
#endif /* __cplusplus */ #endif /* __cplusplus */
#if !defined(__GNUC__) || __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__
< 3)
#define __specialization_static \
static
#else /* !__GNUC__ || __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3)
*/
#define __specialization_static
#endif /* !__GNUC__ || __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3
) */
#include "cuda_runtime_api.h" #include "cuda_runtime_api.h"
#include "storage_class.h" #include "storage_class.h"
#else /* !__CUDA_INTERNAL_COMPILATION__ */ #else /* !__CUDA_INTERNAL_COMPILATION__ */
#include "host_defines.h" #include "host_defines.h"
#define __cudaRegisterBinary() \ #define __cudaRegisterBinary() \
__cudaFatCubinHandle = __cudaRegisterFatBinary((void*)__cudaFatCubi n); \ __cudaFatCubinHandle = __cudaRegisterFatBinary((void*)__cudaFatCubi n); \
atexit(__cudaUnregisterBinaryUtil) atexit(__cudaUnregisterBinaryUtil)
#define __cudaRegisterVariable(var, ext, size, constant, global) \ #define __cudaRegisterVariable(var, ext, size, constant, global) \
__cudaRegisterVar(__cudaFatCubinHandle, (char*)&__host##var, (char* )__device##var, __name##var, ext, size, constant, global) __cudaRegisterVar(__cudaFatCubinHandle, (char*)&__host##var, (char* )__device##var, __name##var, ext, size, constant, global)
#define __cudaRegisterGlobalTexture(tex, dim, norm, ext) \ #define __cudaRegisterGlobalTexture(tex, dim, norm, ext) \
__cudaRegisterTexture(__cudaFatCubinHandle, (const struct textureRe __cudaRegisterTexture(__cudaFatCubinHandle, (const struct textureRe
ference*)&tex, __tex_var(tex), #tex, dim, norm, ext) ference*)&tex, __tex_var(tex), __name##tex, dim, norm, ext)
#define __cudaRegisterGlobalSurface(surf, dim, ext) \
__cudaRegisterSurface(__cudaFatCubinHandle, (const struct surfaceRe
ference*)&surf, __tex_var(surf), __name##surf, dim, ext)
#define __cudaRegisterUnsizedShared(var) \ #define __cudaRegisterUnsizedShared(var) \
__cudaRegisterShared(__cudaFatCubinHandle, (void**)__device_var(var )) __cudaRegisterShared(__cudaFatCubinHandle, (void**)__device_var(var ))
#define __cudaRegisterSharedVariable(var, size, align, sc) \ #define __cudaRegisterSharedVariable(var, size, align, sc) \
__cudaRegisterSharedVar(__cudaFatCubinHandle, (void**)__device_var( var), size, align, sc) __cudaRegisterSharedVar(__cudaFatCubinHandle, (void**)__device_var( var), size, align, sc)
#define __cudaRegisterEntry(funptr, fun, thread_limit) \ #define __cudaRegisterEntry(funptr, fun, thread_limit) \
__cudaRegisterFunction(__cudaFatCubinHandle, (const char*)funptr, ( char*)__device_fun(fun), #fun, thread_limit, __ids) __cudaRegisterFunction(__cudaFatCubinHandle, (const char*)funptr, ( char*)__device_fun(fun), #fun, thread_limit, __ids)
#define __cudaInitArgBlock(arg) \ #define __cudaInitArgBlock(arg) \
char __[256]; \ *(void**)(void*)&arg = (void*)0
*(char**)&arg = __
#define __cudaSetupArg(arg, offset) \ #define __cudaSetupArg(arg, offset) \
if (cudaSetupArgument((void*)(char*)&arg, sizeof(arg), (size_t)&off set->arg - (size_t)offset) != cudaSuccess) \ if (cudaSetupArgument((void*)(char*)&arg, sizeof(arg), (size_t)&off set->arg) != cudaSuccess) \
return return
#define __cudaLaunch(fun) \ #define __cudaLaunch(fun) \
{ volatile static char *__f; __f = fun; (void)cudaLaunch(fun); } { volatile static char *__f; __f = fun; (void)cudaLaunch(fun); }
#if defined(__cplusplus) #if defined(__cplusplus)
extern "C" { extern "C" {
#endif /* __cplusplus */ #endif /* __cplusplus */
extern void** CUDARTAPI __cudaRegisterFatBinary( extern void** CUDARTAPI __cudaRegisterFatBinary(
void *fatCubin void *fatCubin
skipping to change at line 143 skipping to change at line 171
extern void CUDARTAPI __cudaRegisterTexture( extern void CUDARTAPI __cudaRegisterTexture(
void **fatCubinHandle, void **fatCubinHandle,
const struct textureReference *hostVar, const struct textureReference *hostVar,
const void **deviceAddress, const void **deviceAddress,
const char *deviceName, const char *deviceName,
int dim, int dim,
int norm, int norm,
int ext int ext
); );
extern void CUDARTAPI __cudaRegisterSurface(
void **fatCubinHandle,
const struct surfaceReference *hostVar,
const void **deviceAddress,
const char *deviceName,
int dim,
int ext
);
extern void CUDARTAPI __cudaRegisterShared( extern void CUDARTAPI __cudaRegisterShared(
void **fatCubinHandle, void **fatCubinHandle,
void **devicePtr void **devicePtr
); );
extern void CUDARTAPI __cudaRegisterSharedVar( extern void CUDARTAPI __cudaRegisterSharedVar(
void **fatCubinHandle, void **fatCubinHandle,
void **devicePtr, void **devicePtr,
size_t size, size_t size,
size_t alignment, size_t alignment,
skipping to change at line 192 skipping to change at line 229
static void **__cudaFatCubinHandle; static void **__cudaFatCubinHandle;
static void __cudaUnregisterBinaryUtil(void) static void __cudaUnregisterBinaryUtil(void)
{ {
__cudaUnregisterFatBinary(__cudaFatCubinHandle); __cudaUnregisterFatBinary(__cudaFatCubinHandle);
} }
#if defined(__device_emulation) #if defined(__device_emulation)
#if defined(__cplusplus) && !defined(__multi_core__)
#define __cuda_emu__ \
__cuda_emu::
#else /* __cplusplus */
#define __cuda_emu__
#endif /* __cplusplus */
#define __device_fun(fun) \ #define __device_fun(fun) \
__device_wrapper_##fun __cuda_emu__ __device_wrapper_##fun
#define __device_var(var) \ #define __device_var(var) \
(char*)&var (char*)&__cuda_emu__ var
#define __tex_var(var) \ #define __tex_var(var) \
&__texture_var(var) &__cuda_emu__ __texture_var(var)
#define __cudaFatCubin \ #define __cudaFatCubin \
0 0
#if defined(__multi_core__) #if defined(__multi_core__)
#define __ids \ #define __ids \
(uint3*)0, (uint3*)0, &blockDim, &gridDim, &warpSize (uint3*)0, (uint3*)0, &blockDim, &gridDim, &warpSize
#else /* __multi_core__ */ #else /* __multi_core__ */
#define __ids \ #define __ids \
&threadIdx, &blockIdx, &blockDim, &gridDim, &warpSize (uint3*)&__cuda_emu__ threadIdx, (uint3*)&__cuda_emu__ blockIdx, (d im3*)&__cuda_emu__ blockDim, (dim3*)&__cuda_emu__ gridDim, &__cuda_emu__ wa rpSize
#endif /* __multi_core__ */ #endif /* __multi_core__ */
#else /* __device_emulation */ #else /* __device_emulation */
#define __device_fun(fun) \ #define __device_fun(fun) \
#fun #fun
#define __device_var(var) \ #define __device_var(var) \
#var #var
#define __tex_var(var) \ #define __tex_var(var) \
skipping to change at line 237 skipping to change at line 285
#endif /* __device_emulation */ #endif /* __device_emulation */
/* UTILITY MACROS */ /* UTILITY MACROS */
#define __device__global_var(var) \ #define __device__global_var(var) \
__device_var(var) __device_var(var)
#define __name__global_var(var) \ #define __name__global_var(var) \
#var #var
#define __host__global_var(var) \ #define __host__global_var(var) \
__glob_pref_var(var) __glob_pref_var(var)
#define __device__shadow_var(var) \ #define __device__shadow_var(c, cpp) \
__device_var(var) __device_var(c)
#define __name__shadow_var(var) \ #define __name__shadow_var(c, cpp) \
#var #c
#define __host__shadow_var(var) \ #define __name__text_var(c, cpp) \
__shadow_pref_var(var) #c
#define __host__shadow_var(c, cpp) \
__shadow_pref_var(c, cpp)
#if defined(_WIN32) && defined(__cplusplus)
#pragma warning(disable: 4099)
#endif /* _WIN32 && __cplusplus */
#endif /* !__CUDA_INTERNAL_COMPILATION__ */ #endif /* !__CUDA_INTERNAL_COMPILATION__ */
 End of changes. 19 change blocks. 
22 lines changed or deleted 82 lines changed or added


 math_constants.h   math_constants.h 
skipping to change at line 74 skipping to change at line 74
#define CUDART_NORM_HUGE_F 3.402823466e38f #define CUDART_NORM_HUGE_F 3.402823466e38f
#define CUDART_TWO_TO_23_F 8388608.0f #define CUDART_TWO_TO_23_F 8388608.0f
#define CUDART_TWO_TO_24_F 16777216.0f #define CUDART_TWO_TO_24_F 16777216.0f
#define CUDART_TWO_TO_31_F 2147483648.0f #define CUDART_TWO_TO_31_F 2147483648.0f
#define CUDART_TWO_TO_32_F 4294967296.0f #define CUDART_TWO_TO_32_F 4294967296.0f
#define CUDART_REMQUO_BITS_F 3 #define CUDART_REMQUO_BITS_F 3
#define CUDART_REMQUO_MASK_F (~((~0)<<CUDART_REMQUO_BITS_F)) #define CUDART_REMQUO_MASK_F (~((~0)<<CUDART_REMQUO_BITS_F))
#define CUDART_TRIG_PLOSS_F 48039.0f #define CUDART_TRIG_PLOSS_F 48039.0f
/* double precision constants */ /* double precision constants */
#if !defined(CUDA_NO_SM_13_DOUBLE_INTRINSICS) #if __CUDA_ARCH__ >= 130
#define CUDART_INF __longlong_as_double(0x7ff0000000000000ULL) #define CUDART_INF __longlong_as_double(0x7ff0000000000000ULL)
#define CUDART_NAN __longlong_as_double(0xfff8000000000000ULL) #define CUDART_NAN __longlong_as_double(0xfff8000000000000ULL)
#define CUDART_NEG_ZERO __longlong_as_double(0x8000000000000000ULL) #define CUDART_NEG_ZERO __longlong_as_double(0x8000000000000000ULL)
#define CUDART_MIN_DENORM __longlong_as_double(0x0000000000000001ULL) #define CUDART_MIN_DENORM __longlong_as_double(0x0000000000000001ULL)
#else /* !CUDA_NO_SM_13_DOUBLE_INTRINSICS */ #else /* __CUDA_ARCH__ >= 130 */
#define CUDART_INF __hiloint2double(0x7ff00000, 0x00000000) #define CUDART_INF __hiloint2double(0x7ff00000, 0x00000000)
#define CUDART_NAN __hiloint2double(0xfff80000, 0x00000000) #define CUDART_NAN __hiloint2double(0xfff80000, 0x00000000)
#define CUDART_NEG_ZERO __hiloint2double(0x80000000, 0x00000000) #define CUDART_NEG_ZERO __hiloint2double(0x80000000, 0x00000000)
#define CUDART_MIN_DENORM __hiloint2double(0x00000000, 0x00000001) #define CUDART_MIN_DENORM __hiloint2double(0x00000000, 0x00000001)
#endif /* !CUDA_NO_SM_13_DOUBLE_INTRINSICS */ #endif /* __CUDA_ARCH__ >= 130 */
#define CUDART_ZERO 0.0 #define CUDART_ZERO 0.0
#define CUDART_ONE 1.0 #define CUDART_ONE 1.0
#define CUDART_SQRT_TWO 1.4142135623730951e+0 #define CUDART_SQRT_TWO 1.4142135623730951e+0
#define CUDART_SQRT_HALF 7.0710678118654757e-1 #define CUDART_SQRT_HALF 7.0710678118654757e-1
#define CUDART_THIRD 3.3333333333333333e-1 #define CUDART_THIRD 3.3333333333333333e-1
#define CUDART_TWOTHIRD 6.6666666666666667e-1 #define CUDART_TWOTHIRD 6.6666666666666667e-1
#define CUDART_PIO4 7.8539816339744828e-1 #define CUDART_PIO4 7.8539816339744828e-1
#define CUDART_PIO4_HI 7.8539816339744828e-1 #define CUDART_PIO4_HI 7.8539816339744828e-1
#define CUDART_PIO4_LO 3.0616169978683830e-17 #define CUDART_PIO4_LO 3.0616169978683830e-17
 End of changes. 3 change blocks. 
3 lines changed or deleted 3 lines changed or added


 math_functions_dbl_ptx3.h   math_functions_dbl_ptx3.h 
skipping to change at line 497 skipping to change at line 497
if (i & 2) { if (i & 2) {
z = -z; z = -z;
} }
return z; return z;
} }
__device_func__(void __cuda_sincos(double a, double *sptr, double *cptr)) __device_func__(void __cuda_sincos(double a, double *sptr, double *cptr))
{ {
double t, u, s, c; double t, u, s, c;
int i; int i;
if (__cuda___isinf(a)) { t = __cuda_fabs(a);
*sptr = CUDART_NAN; if ((t == CUDART_INF) || (t == CUDART_ZERO)) {
*cptr = CUDART_NAN; s = __dmul_rn (a, CUDART_ZERO); /* generate NaN, zero */
return; c = 1.0 + s; /* generate NaN, one */
} *sptr = s;
if (a == CUDART_ZERO) { *cptr = c;
*sptr = a;
*cptr = 1.0;
return; return;
} }
t = __internal_trig_reduction_kerneld(a, &i); t = __internal_trig_reduction_kerneld(a, &i);
u = __internal_cos_kerneld(t); u = __internal_cos_kerneld(t);
t = __internal_sin_kerneld(t); t = __internal_sin_kerneld(t);
if (i & 1) { if (i & 1) {
s = u; s = u;
c = t; c = t;
} else { } else {
s = t; s = t;
skipping to change at line 561 skipping to change at line 559
/* normalize denormals */ /* normalize denormals */
if ((unsigned)ihi < (unsigned)0x00100000) { if ((unsigned)ihi < (unsigned)0x00100000) {
a = a * CUDART_TWO_TO_54; a = a * CUDART_TWO_TO_54;
e -= 54; e -= 54;
ihi = __double2hiint(a); ihi = __double2hiint(a);
ilo = __double2loint(a); ilo = __double2loint(a);
} }
/* a = m * 2^e. m <= sqrt(2): log2(a) = log2(m) + e. /* a = m * 2^e. m <= sqrt(2): log2(a) = log2(m) + e.
* m > sqrt(2): log2(a) = log2(m/2) + (e+1) * m > sqrt(2): log2(a) = log2(m/2) + (e+1)
*/ */
e += ((ihi >> 20) & 0x7ff); e += (ihi >> 20);
ihi = (ihi & 0x800fffff) | 0x3ff00000; ihi = (ihi & 0x800fffff) | 0x3ff00000;
m = __hiloint2double (ihi, ilo); m = __hiloint2double (ihi, ilo);
if ((unsigned)ihi > (unsigned)0x3ff6a09e) { if ((unsigned)ihi > (unsigned)0x3ff6a09e) {
m = __internal_half(m); m = __internal_half(m);
e = e + 1; e = e + 1;
} }
/* log((1+m)/(1-m)) = 2*atanh(m). log(m) = 2*atanh ((m-1)/(m+1)) */ /* log((1+m)/(1-m)) = 2*atanh(m). log(m) = 2*atanh ((m-1)/(m+1)) */
f = m - 1.0; f = m - 1.0;
g = m + 1.0; g = m + 1.0;
g = 1.0 / g; g = 1.0 / g;
skipping to change at line 1590 skipping to change at line 1588
} }
return t; return t;
} }
__device_func__(double __cuda_erfc(double a)) __device_func__(double __cuda_erfc(double a))
{ {
double p, q, h, l; double p, q, h, l;
int ahi; int ahi;
ahi = __double2hiint(a); ahi = __double2hiint(a);
if (ahi < (int)0x3fe80000) { if (ahi < (int)0x3fe80000) { /* 0.75 */
return 1.0 - __cuda_erf(a); return 1.0 - __cuda_erf(a);
} }
if (a > 27.3) { if (a > 27.3) {
return 0.0; return 0.0;
} }
if (ahi < (int)0x40140000) { if (ahi < (int)0x40140000) { /* 5.0 */
p = 5.64189549785304440E-001; /* max error 7 ulps on [0.75, 5.0] */
p = __fma_rn (p, a, 8.17405083437083490E+000); p = 5.6418956292134603E-001;
p = __fma_rn (p, a, 5.68958722557864720E+001); p = __fma_rn (p, a, 7.9573512229784757E+000);
p = __fma_rn (p, a, 2.42568747802647010E+002); p = __fma_rn (p, a, 5.4297984550299049E+001);
p = __fma_rn (p, a, 6.80381374390412930E+002); p = __fma_rn (p, a, 2.2775657465890461E+002);
p = __fma_rn (p, a, 1.25873132236024590E+003); p = __fma_rn (p, a, 6.2995529536738172E+002);
p = __fma_rn (p, a, 1.43925353963809330E+003); p = __fma_rn (p, a, 1.1508293767713837E+003);
p = __fma_rn (p, a, 8.15949420587659230E+002); p = __fma_rn (p, a, 1.3002167301542784E+003);
q = a+ 1.44881247113239940E+001; p = __fma_rn (p, a, 7.2716547570180592E+002);
q = __fma_rn (q, a, 1.01345387970210510E+002); q = a+ 1.4104035812651274E+001;
q = __fma_rn (q, a, 4.37184684964009650E+002); q = __fma_rn (q, a, 9.6740724349422138E+001);
q = __fma_rn (q, a, 1.25588209225251330E+003); q = __fma_rn (q, a, 4.1073916054746462E+002);
q = __fma_rn (q, a, 2.43864015012647630E+003); q = __fma_rn (q, a, 1.1641974580374074E+003);
q = __fma_rn (q, a, 3.10570469008816280E+003); q = __fma_rn (q, a, 2.2344896486798129E+003);
q = __fma_rn (q, a, 2.35995386578953550E+003); q = __fma_rn (q, a, 2.8166572432808462E+003);
q = __fma_rn (q, a, 8.15949420692539320E+002); q = __fma_rn (q, a, 2.1207350981593036E+003);
q = __fma_rn (q, a, 7.2716547619708967E+002);
p = p / q;
h = a * a;
l = __fma_rn (a, a, -h);
q = __internal_exp_kernel(-h, 0);
q = __fma_rn (l, -q, q);
p = p * q;
} else { } else {
p = 5.64189583545675280E-001; /* max error 4 ulps on [5, 27.3] */
p = __fma_rn (p, a, 2.04728556066513970E+000); double ooa, ooasq;
p = __fma_rn (p, a, 6.75128409662943610E+000);
p = __fma_rn (p, a, 1.10459345071747900E+001); ooa = 1.0 / a;
p = __fma_rn (p, a, 1.22570382896313600E+001); ooasq = ooa * ooa;
p = __fma_rn (p, a, 6.01884641114116460E+000); p = -4.0025406686930527E+005;
q = a+ 3.62871917534986780E+000; p = __fma_rn (p, ooasq, 1.4420582543942123E+005);
q = __fma_rn (q, a, 1.24663395327043550E+001); p = __fma_rn (p, ooasq, -2.7664185780951841E+004);
q = __fma_rn (q, a, 2.13927672803974790E+001); p = __fma_rn (p, ooasq, 4.1144611644767283E+003);
q = __fma_rn (q, a, 2.72082423532866070E+001); p = __fma_rn (p, ooasq, -5.8706000519209351E+002);
q = __fma_rn (q, a, 1.86422906830006700E+001); p = __fma_rn (p, ooasq, 9.1490086446323375E+001);
q = __fma_rn (q, a, 6.13809834548870550E+000); p = __fma_rn (p, ooasq, -1.6659491387740221E+001);
p = __fma_rn (p, ooasq, 3.7024804085481784E+000);
p = __fma_rn (p, ooasq, -1.0578553994424316E+000);
p = __fma_rn (p, ooasq, 4.2314218745087778E-001);
p = __fma_rn (p, ooasq, -2.8209479177354962E-001);
p = __fma_rn (p, ooasq, 5.6418958354775606E-001);
h = a * a;
l = __fma_rn (a, a, -h);
q = __internal_exp_kernel(-h, 0);
q = __fma_rn (l, -q, q);
p = p * ooa;
p = p * q;
} }
p = p / q;
h = a * a;
l = __fma_rn (a, a, -h);
q = __internal_exp_kernel(-h, 0);
q = __fma_rn (l, -q, q);
p = p * q;
return p; return p;
} }
/* approximate 1.0/(a*gamma(a)) on [-0.5,0.5] */ /* approximate 1.0/(a*gamma(a)) on [-0.5,0.5] */
__device_func__(double __internal_tgamma_kernel(double a)) __device_func__(double __internal_tgamma_kernel(double a))
{ {
double t; double t;
t = -4.42689340712524750E-010; t = -4.42689340712524750E-010;
t = __fma_rn (t, a, -2.02665918466589540E-007); t = __fma_rn (t, a, -2.02665918466589540E-007);
t = __fma_rn (t, a, 1.13812117211195270E-006); t = __fma_rn (t, a, 1.13812117211195270E-006);
 End of changes. 6 change blocks. 
45 lines changed or deleted 55 lines changed or added


 sm_11_atomic_functions.h   sm_11_atomic_functions.h 
skipping to change at line 41 skipping to change at line 41
* Any use of this source code in individual and commercial software must * Any use of this source code in individual and commercial software must
* include, in the user documentation and internal comments to the code, * include, in the user documentation and internal comments to the code,
* the above Disclaimer and U.S. Government End Users Notice. * the above Disclaimer and U.S. Government End Users Notice.
*/ */
#if !defined(__SM_11_ATOMIC_FUNCTIONS_H__) #if !defined(__SM_11_ATOMIC_FUNCTIONS_H__)
#define __SM_11_ATOMIC_FUNCTIONS_H__ #define __SM_11_ATOMIC_FUNCTIONS_H__
#if defined(__cplusplus) && defined(__CUDACC__) #if defined(__cplusplus) && defined(__CUDACC__)
#if !defined(CUDA_NO_SM_11_ATOMIC_INTRINSICS) #if __CUDA_ARCH__ >= 110
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#include "host_defines.h" #include "host_defines.h"
extern "C" extern "C"
skipping to change at line 206 skipping to change at line 206
static __inline__ __device__ int atomicCAS(int *address, int compare, int v al) static __inline__ __device__ int atomicCAS(int *address, int compare, int v al)
{ {
return __iAtomicCAS(address, compare, val); return __iAtomicCAS(address, compare, val);
} }
static __inline__ __device__ unsigned int atomicCAS(unsigned int *address, unsigned int compare, unsigned int val) static __inline__ __device__ unsigned int atomicCAS(unsigned int *address, unsigned int compare, unsigned int val)
{ {
return __uAtomicCAS(address, compare, val); return __uAtomicCAS(address, compare, val);
} }
#endif /* !CUDA_NO_SM_12_ATOMIC_INTRINSICS */ #endif /* __CUDA_ARCH__ >= 110 */
#elif !defined(__CUDACC__) #elif !defined(__CUDACC__)
#include "crt/func_macro.h" #include "crt/func_macro.h"
#if !defined(__CUDABE__) #if !defined(__CUDABE__)
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
#define __iAtomicAdd(address, val) \ #define __iAtomicAdd(address, val) \
 End of changes. 2 change blocks. 
2 lines changed or deleted 2 lines changed or added


 sm_12_atomic_functions.h   sm_12_atomic_functions.h 
skipping to change at line 41 skipping to change at line 41
* Any use of this source code in individual and commercial software must * Any use of this source code in individual and commercial software must
* include, in the user documentation and internal comments to the code, * include, in the user documentation and internal comments to the code,
* the above Disclaimer and U.S. Government End Users Notice. * the above Disclaimer and U.S. Government End Users Notice.
*/ */
#if !defined(__SM_12_ATOMIC_FUNCTIONS_H__) #if !defined(__SM_12_ATOMIC_FUNCTIONS_H__)
#define __SM_12_ATOMIC_FUNCTIONS_H__ #define __SM_12_ATOMIC_FUNCTIONS_H__
#if defined(__cplusplus) && defined(__CUDACC__) #if defined(__cplusplus) && defined(__CUDACC__)
#if !defined(CUDA_NO_SM_12_ATOMIC_INTRINSICS) #if __CUDA_ARCH__ >= 120
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#include "host_defines.h" #include "host_defines.h"
extern "C" extern "C"
skipping to change at line 99 skipping to change at line 99
static __inline__ __device__ bool any(bool cond) static __inline__ __device__ bool any(bool cond)
{ {
return (bool)__any((int)cond); return (bool)__any((int)cond);
} }
static __inline__ __device__ bool all(bool cond) static __inline__ __device__ bool all(bool cond)
{ {
return (bool)__all((int)cond); return (bool)__all((int)cond);
} }
#endif /* !CUDA_NO_SM_12_ATOMIC_INTRINSICS */ #endif /* __CUDA_ARCH__ >= 120 */
#elif !defined(__CUDACC__) #elif !defined(__CUDACC__)
#include "crt/func_macro.h" #include "crt/func_macro.h"
#if !defined(__CUDABE__) #if !defined(__CUDABE__)
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
#define __ullAtomicAdd(address, val) \ #define __ullAtomicAdd(address, val) \
 End of changes. 2 change blocks. 
2 lines changed or deleted 2 lines changed or added


 sm_13_double_functions.h   sm_13_double_functions.h 
skipping to change at line 47 skipping to change at line 47
#define __SM_13_DOUBLE_FUNCTIONS_H__ #define __SM_13_DOUBLE_FUNCTIONS_H__
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#if defined(__cplusplus) && defined(__CUDACC__) #if defined(__cplusplus) && defined(__CUDACC__)
#if !defined(CUDA_NO_SM_13_DOUBLE_INTRINSICS) #if __CUDA_ARCH__ >= 130
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#include "device_types.h" #include "device_types.h"
#include "host_defines.h" #include "host_defines.h"
skipping to change at line 252 skipping to change at line 252
static __inline__ __device__ double uint2double(unsigned int a, enum cudaRo undMode mode = cudaRoundNearest) static __inline__ __device__ double uint2double(unsigned int a, enum cudaRo undMode mode = cudaRoundNearest)
{ {
return (double)a; return (double)a;
} }
static __inline__ __device__ double float2double(float a, enum cudaRoundMod e mode = cudaRoundNearest) static __inline__ __device__ double float2double(float a, enum cudaRoundMod e mode = cudaRoundNearest)
{ {
return (double)a; return (double)a;
} }
#endif /* !CUDA_NO_SM_13_DOUBLE_INTRINSICS */ #endif /* __CUDA_ARCH__ >= 130 */
#elif !defined(__CUDACC__) #elif !defined(__CUDACC__)
#include "crt/func_macro.h" #include "crt/func_macro.h"
#if !defined(__CUDABE__) #if !defined(__CUDABE__)
/************************************************************************** ***** /************************************************************************** *****
* * * *
* HOST IMPLEMENTATIONS FOR FUNCTIONS * * HOST IMPLEMENTATIONS FOR FUNCTIONS *
skipping to change at line 292 skipping to change at line 292
{ {
volatile union __cudart_DoubleUlonglongCvt xx; volatile union __cudart_DoubleUlonglongCvt xx;
volatile union __cudart_FloatUintCvt res; volatile union __cudart_FloatUintCvt res;
int shift; int shift;
xx.d = a; xx.d = a;
if (xx.i == 0) return 0.0f; if (xx.i == 0) return 0.0f;
res.i = (((unsigned int) (xx.i >> 32)) & 0x80000000); res.i = (((unsigned int) (xx.i >> 32)) & 0x80000000);
if ((xx.i & 0x7ff0000000000000ULL) == 0x7ff0000000000000ULL) { if ((xx.i & 0x7ff0000000000000ULL) == 0x7ff0000000000000ULL) {
if ((xx.i & 0x7fffffffffffffffULL) > 0x7ff0000000000000ULL) { if ((xx.i & 0x7fffffffffffffffULL) > 0x7ff0000000000000ULL) {
// Nan /* Nan */
res.i = 0x7f8fffff; res.i = ((unsigned int)((xx.i >> 32) & 0x80000000) |
(255U << 23) | 0x00400000 |
(unsigned int)((xx.i >> (53 - 24)) & 0x007fffff));
} else { } else {
// Inf /* Inf */
res.i |= 0x7f800000; res.i |= 0x7f800000;
} }
return res.f; return res.f;
} }
shift = ((int) ((xx.i >> 52) & 0x7ff)) - 1023; shift = ((int) ((xx.i >> 52) & 0x7ff)) - 1023;
// Overflow /* Overflow */
xx.i = (xx.i & 0x000fffffffffffffULL); xx.i = (xx.i & 0x000fffffffffffffULL);
if (shift >= 128) { if (shift >= 128) {
res.i |= 0x7f7fffff; res.i |= 0x7f7fffff;
return res.f; return res.f;
} }
if (shift <= -127) { if (shift <= -127) {
if (shift < -180) { if (shift < -180) {
// Underflow /* Underflow */
xx.i = 0; xx.i = 0;
} else { } else {
xx.i |= 0x0010000000000000ULL; xx.i |= 0x0010000000000000ULL;
xx.i >>= 127 + shift; xx.i >>= -126 - shift;
} }
} else { } else {
res.i |= (unsigned int) (127 + shift) << 23; res.i |= (unsigned int) (127 + shift) << 23;
} }
res.i |= ((unsigned int) (xx.i >> 29)) & 0x007fffff; res.i |= ((unsigned int) (xx.i >> 29)) & 0x007fffff;
xx.i &= 0x1fffffff; xx.i &= 0x1fffffff;
return res.f; return res.f;
} }
__device_func__(double __internal_ll2double_kernel(long long int a, enum cu daRoundMode rndMode)) __device_func__(double __internal_ll2double_kernel(long long int a, enum cu daRoundMode rndMode))
skipping to change at line 596 skipping to change at line 598
return __internal_ull2double_kernel(a, cudaRoundMinInf); return __internal_ull2double_kernel(a, cudaRoundMinInf);
} }
__device_func__(double __ull2double_ru(unsigned long long int a)) __device_func__(double __ull2double_ru(unsigned long long int a))
{ {
return __internal_ull2double_kernel(a, cudaRoundPosInf); return __internal_ull2double_kernel(a, cudaRoundPosInf);
} }
#endif /* !__CUDABE__ */ #endif /* !__CUDABE__ */
#if !defined(__CUDABE__) || defined(CUDA_NO_SM_13_DOUBLE_INTRINSICS) #if !defined(__CUDABE__) || __CUDA_ARCH__ < 130
#include "common_types.h" #include "common_types.h"
__device_func__(double __internal_fma_kernel(double x, double y, double z, enum cudaRoundMode rndMode)) __device_func__(double __internal_fma_kernel(double x, double y, double z, enum cudaRoundMode rndMode))
{ {
#ifdef __MULTI_CORE__ #ifdef __MULTI_CORE__
volatile volatile
#endif /* __MULTI_CORE__ */ #endif /* __MULTI_CORE__ */
struct __cudart_UintUint xx, yy, zz, ww; struct __cudart_UintUint xx, yy, zz, ww;
unsigned int s, t, u, prod0, prod1, prod2, prod3, expo_x, expo_y, expo_z; unsigned int s, t, u, prod0, prod1, prod2, prod3, expo_x, expo_y, expo_z;
xx.hi = __double2hiint(x); xx.hi = __double2hiint(x);
xx.lo = __double2loint(x); xx.lo = __double2loint(x);
skipping to change at line 1113 skipping to change at line 1117
__device_func__(double __dadd_rn(double a, double b)) __device_func__(double __dadd_rn(double a, double b))
{ {
return __fma_rn(a, CUDART_ONE, b); return __fma_rn(a, CUDART_ONE, b);
} }
__device_func__(double __dmul_rn(double a, double b)) __device_func__(double __dmul_rn(double a, double b))
{ {
return __fma_rn(a, b, CUDART_NEG_ZERO); return __fma_rn(a, b, CUDART_NEG_ZERO);
} }
#endif /* !__CUDABE__ || CUDA_NO_SM_13_DOUBLE_INTRINSICS */ #endif /* !__CUDABE__ || __CUDA_ARCH__ < 130 */
/************************************************************************** ***** /************************************************************************** *****
* * * *
* HOST / DEVICE IMPLEMENTATIONS FOR FUNCTIONS * * HOST / DEVICE IMPLEMENTATIONS FOR FUNCTIONS *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#endif /* __cplusplus && __CUDACC__ */ #endif /* __cplusplus && __CUDACC__ */
#endif /* !__SM_13_DOUBLE_FUNCTIONS_H__ */ #endif /* !__SM_13_DOUBLE_FUNCTIONS_H__ */
 End of changes. 10 change blocks. 
10 lines changed or deleted 14 lines changed or added


 storage_class.h   storage_class.h 
/* /*
* Copyright 1993-2009 NVIDIA Corporation. All rights reserved. * Copyright 1993-2008 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 47 skipping to change at line 47
#define __STORAGE_CLASS_H__ #define __STORAGE_CLASS_H__
#if !defined(__var_used__) #if !defined(__var_used__)
#define __var_used__ #define __var_used__
#endif /* __var_used__ */ #endif /* __var_used__ */
#if !defined(__loc_sc__) #if !defined(__loc_sc__)
#define __loc_sc__(loc, sc) \ #define __loc_sc__(loc, size, sc) \
__storage##_##sc##loc loc __storage##_##sc##size##loc loc
#endif /* !__loc_sc__ */ #endif /* !__loc_sc__ */
#if !defined(__storage___device__) #if !defined(__storage___device__)
#define __storage___device__ static __var_used__ #define __storage___device__ static __var_used__
#endif /* __storage___device__ */ #endif /* __storage___device__ */
#if !defined(__storage_extern__device__) #if !defined(__storage_extern__device__)
#define __storage_extern__device__ static __var_used__ #define __storage_extern__device__ static __var_used__
#endif /* __storage_extern__device__ */ #endif /* __storage_extern__device__ */
skipping to change at line 100 skipping to change at line 100
#endif /* __storage_extern__shared__ */ #endif /* __storage_extern__shared__ */
#if !defined(__storage_auto__shared__) #if !defined(__storage_auto__shared__)
#define __storage_auto__shared__ static #define __storage_auto__shared__ static
#endif /* __storage_auto__shared__ */ #endif /* __storage_auto__shared__ */
#if !defined(__storage_static__shared__) #if !defined(__storage_static__shared__)
#define __storage_static__shared__ static __var_used__ #define __storage_static__shared__ static __var_used__
#endif /* __storage_static__shared__ */ #endif /* __storage_static__shared__ */
#if !defined(__storage__unsized__shared__)
#define __storage__unsized__shared__ @@@ COMPILER @@@ ERROR @@@
#endif /* __storage__unsized__shared__ */
#if !defined(__storage_extern_unsized__shared__)
#define __storage_extern_unsized__shared__ static __var_used__
#endif /* __storage_extern_unsized__shared__ */
#if !defined(__storage_auto_unsized__shared__)
#define __storage_auto_unsized__shared__ @@@ COMPILER @@@ ERROR @@@
#endif /* __storage_auto_unsized__shared__ */
#if !defined(__storage_static_unsized__shared__)
#define __storage_static_unsized__shared__ @@@ COMPILER @@@ ERROR @@@
#endif /* __storage_static_unsized__shared__ */
#if !defined(__storage___text__) #if !defined(__storage___text__)
#define __storage___text__ static __var_used__ #define __storage___text__ static __var_used__
#endif /* __storage___text__ */ #endif /* __storage___text__ */
#if !defined(__storage_extern__text__) #if !defined(__storage_extern__text__)
#define __storage_extern__text__ static __var_used__ #define __storage_extern__text__ static __var_used__
#endif /* __storage_extern__text__ */ #endif /* __storage_extern__text__ */
#if !defined(__storage_auto__text__) #if !defined(__storage_auto__text__)
#define __storage_auto__text__ @@@ COMPILER @@@ ERROR @@@ #define __storage_auto__text__ @@@ COMPILER @@@ ERROR @@@
 End of changes. 3 change blocks. 
3 lines changed or deleted 19 lines changed or added


 vector_types.h   vector_types.h 
skipping to change at line 53 skipping to change at line 53
*************************************************************************** ****/ *************************************************************************** ****/
#include "host_defines.h" #include "host_defines.h"
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#if !defined(__cuda_assign_operators)
#define __cuda_assign_operators(tag)
#endif /* !__cuda_assign_operators */
#if !defined(__CUDACC__) && !defined(__CUDABE__) && \
!defined (__MULTI_CORE__) && defined(_WIN32) && !defined(_WIN64)
#define __cuda_builtin_vector_align8(tag, ...) \
struct tag { \
union { \
struct { __VA_ARGS__; }; \
struct { long long int :1,:0; }; \
}; \
__cuda_assign_operators(tag) \
}
#else /* !__CUDACC__ && !__CUDABE__ && !__MULTI_CORE__ && _WIN32 && !_WIN64
*/
#define __cuda_builtin_vector_align8(tag, ...) \
struct __align__(8) tag { \
__VA_ARGS__; \
__cuda_assign_operators(tag) \
}
#endif /* !__CUDACC__ && !__CUDABE__ && !__MULTI_CORE__ && _WIN32 && !_WIN6
4 */
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct char1 struct char1
{ {
signed char x; signed char x;
__cuda_assign_operators(char1)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct uchar1 struct uchar1
{ {
unsigned char x; unsigned char x;
__cuda_assign_operators(uchar1)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(2) char2 struct __align__(2) char2
{ {
signed char x, y; signed char x, y;
__cuda_assign_operators(char2)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(2) uchar2 struct __align__(2) uchar2
{ {
unsigned char x, y; unsigned char x, y;
__cuda_assign_operators(uchar2)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct char3 struct char3
{ {
signed char x, y, z; signed char x, y, z;
__cuda_assign_operators(char3)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct uchar3 struct uchar3
{ {
unsigned char x, y, z; unsigned char x, y, z;
__cuda_assign_operators(uchar3)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(4) char4 struct __align__(4) char4
{ {
signed char x, y, z, w; signed char x, y, z, w;
__cuda_assign_operators(char4)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(4) uchar4 struct __align__(4) uchar4
{ {
unsigned char x, y, z, w; unsigned char x, y, z, w;
__cuda_assign_operators(uchar4)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct short1 struct short1
{ {
short x; short x;
__cuda_assign_operators(short1)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct ushort1 struct ushort1
{ {
unsigned short x; unsigned short x;
__cuda_assign_operators(ushort1)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(4) short2 struct __align__(4) short2
{ {
short x, y; short x, y;
__cuda_assign_operators(short2)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(4) ushort2 struct __align__(4) ushort2
{ {
unsigned short x, y; unsigned short x, y;
__cuda_assign_operators(ushort2)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct short3 struct short3
{ {
short x, y, z; short x, y, z;
__cuda_assign_operators(short3)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct ushort3 struct ushort3
{ {
unsigned short x, y, z; unsigned short x, y, z;
__cuda_assign_operators(ushort3)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(8) short4 __cuda_builtin_vector_align8(short4, short x, y, z, w);
{
short x, y, z, w;
};
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(8) ushort4 __cuda_builtin_vector_align8(ushort4, unsigned short x, y, z, w);
{
unsigned short x, y, z, w;
};
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct int1 struct int1
{ {
int x; int x;
__cuda_assign_operators(int1)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct uint1 struct uint1
{ {
unsigned int x; unsigned int x;
__cuda_assign_operators(uint1)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(8) int2 __cuda_builtin_vector_align8(int2, int x, y);
{
int x, y;
};
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(8) uint2 __cuda_builtin_vector_align8(uint2, unsigned int x, y);
{
unsigned int x, y;
};
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct int3 struct int3
{ {
int x, y, z; int x, y, z;
__cuda_assign_operators(int3)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct uint3 struct uint3
{ {
unsigned int x, y, z; unsigned int x, y, z;
__cuda_assign_operators(uint3)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(16) int4 struct __builtin_align__(16) int4
{ {
int x, y, z, w; int x, y, z, w;
__cuda_assign_operators(int4)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(16) uint4 struct __builtin_align__(16) uint4
{ {
unsigned int x, y, z, w; unsigned int x, y, z, w;
__cuda_assign_operators(uint4)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct long1 struct long1
{ {
long int x; long int x;
__cuda_assign_operators(long1)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct ulong1 struct ulong1
{ {
unsigned long x; unsigned long x;
__cuda_assign_operators(ulong1)
}; };
/*DEVICE_BUILTIN*/
struct
#if defined (_WIN32) #if defined (_WIN32)
__builtin_align__(8)
/*DEVICE_BUILTIN*/
__cuda_builtin_vector_align8(long2, long int x, y);
/*DEVICE_BUILTIN*/
__cuda_builtin_vector_align8(ulong2, unsigned long int x, y);
#else /* _WIN32 */ #else /* _WIN32 */
__builtin_align__(2*sizeof(long int))
#endif /* _WIN32 */ /*DEVICE_BUILTIN*/
long2 struct __align__(2*sizeof(long int)) long2
{ {
long int x, y; long int x, y;
__cuda_assign_operators(long2)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct struct __align__(2*sizeof(unsigned long int)) ulong2
#if defined (_WIN32)
__builtin_align__(8)
#else /* _WIN32 */
__builtin_align__(2*sizeof(unsigned long int))
#endif /* _WIN32 */
ulong2
{ {
unsigned long int x, y; unsigned long int x, y;
__cuda_assign_operators(ulong2)
}; };
#endif /* _WIN32 */
#if !defined(__LP64__) #if !defined(__LP64__)
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct long3 struct long3
{ {
long int x, y, z; long int x, y, z;
__cuda_assign_operators(long3)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct ulong3 struct ulong3
{ {
unsigned long int x, y, z; unsigned long int x, y, z;
__cuda_assign_operators(ulong3)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(16) long4 struct __builtin_align__(16) long4
{ {
long int x, y, z, w; long int x, y, z, w;
__cuda_assign_operators(long4)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(16) ulong4 struct __builtin_align__(16) ulong4
{ {
unsigned long int x, y, z, w; unsigned long int x, y, z, w;
__cuda_assign_operators(ulong4)
}; };
#endif /* !__LP64__ */ #endif /* !__LP64__ */
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct float1 struct float1
{ {
float x; float x;
__cuda_assign_operators(float1)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(8) float2 __cuda_builtin_vector_align8(float2, float x, y);
{
float x, y;
};
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct float3 struct float3
{ {
float x, y, z; float x, y, z;
__cuda_assign_operators(float3)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(16) float4 struct __builtin_align__(16) float4
{ {
float x, y, z, w; float x, y, z, w;
__cuda_assign_operators(float4)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct longlong1 struct longlong1
{ {
long long int x; long long int x;
__cuda_assign_operators(longlong1)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct ulonglong1 struct ulonglong1
{ {
unsigned long long int x; unsigned long long int x;
__cuda_assign_operators(ulonglong1)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(16) longlong2 struct __builtin_align__(16) longlong2
{ {
long long int x, y; long long int x, y;
__cuda_assign_operators(longlong2)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(16) ulonglong2 struct __builtin_align__(16) ulonglong2
{ {
unsigned long long int x, y; unsigned long long int x, y;
__cuda_assign_operators(ulonglong2)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct double1 struct double1
{ {
double x; double x;
__cuda_assign_operators(double1)
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(16) double2 struct __builtin_align__(16) double2
{ {
double x, y; double x, y;
__cuda_assign_operators(double2)
}; };
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
typedef struct char1 char1; typedef struct char1 char1;
skipping to change at line 419 skipping to change at line 469
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
typedef struct double2 double2; typedef struct double2 double2;
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
typedef struct dim3 dim3;
/*DEVICE_BUILTIN*/
struct dim3 struct dim3
{ {
unsigned int x, y, z; unsigned int x, y, z;
#if defined(__cplusplus) #if defined(__cplusplus)
dim3(unsigned int x = 1, unsigned int y = 1, unsigned int z = 1) : x(x) __host__ __device__ dim3(unsigned int x = 1, unsigned int y = 1, unsign
, y(y), z(z) {} ed int z = 1) : x(x), y(y), z(z) {}
dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {} __host__ __device__ dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {}
operator uint3(void) { uint3 t; t.x = x; t.y = y; t.z = z; return t; } __host__ __device__ operator uint3(void) { uint3 t; t.x = x; t.y = y; t
.z = z; return t; }
#endif /* __cplusplus */ #endif /* __cplusplus */
}; };
/*DEVICE_BUILTIN*/
typedef struct dim3 dim3;
#undef __cuda_assign_operators
#undef __cuda_builtin_vector_align8
#endif /* !__VECTOR_TYPES_H__ */ #endif /* !__VECTOR_TYPES_H__ */
 End of changes. 57 change blocks. 
46 lines changed or deleted 102 lines changed or added

This html diff was produced by rfcdiff 1.41. The latest version is available from http://tools.ietf.org/tools/rfcdiff/