__cudaFatFormat.h | __cudaFatFormat.h | |||
---|---|---|---|---|
/* | /* | |||
* Copyright 1993-2007 NVIDIA Corporation. All rights reserved. | * Copyright 1993-2008 NVIDIA Corporation. All rights reserved. | |||
* | * | |||
* NOTICE TO USER: | * NOTICE TO USER: | |||
* | * | |||
* This source code is subject to NVIDIA ownership rights under U.S. and | * This source code is subject to NVIDIA ownership rights under U.S. and | |||
* international Copyright laws. Users and possessors of this source code | * international Copyright laws. Users and possessors of this source code | |||
* are hereby granted a nonexclusive, royalty-free license to use this code | * are hereby granted a nonexclusive, royalty-free license to use this code | |||
* in individual and commercial software. | * in individual and commercial software. | |||
* | * | |||
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | |||
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | |||
skipping to change at line 133 | skipping to change at line 133 | |||
char* debug; | char* debug; | |||
} __cudaFatDebugEntry; | } __cudaFatDebugEntry; | |||
typedef enum { | typedef enum { | |||
__cudaFatDontSearchFlag = (1 << 0), | __cudaFatDontSearchFlag = (1 << 0), | |||
__cudaFatDontCacheFlag = (1 << 1), | __cudaFatDontCacheFlag = (1 << 1), | |||
__cudaFatSassDebugFlag = (1 << 2) | __cudaFatSassDebugFlag = (1 << 2) | |||
} __cudaFatCudaBinaryFlag; | } __cudaFatCudaBinaryFlag; | |||
/* | /* | |||
* Imported/exported symbol descriptor, needed for | ||||
* __cudaFat binary linking. Not much information is needed, | ||||
* because this is only an index: full symbol information | ||||
* is contained by the binaries. | ||||
*/ | ||||
typedef struct { | ||||
char* name; | ||||
} __cudaFatSymbol; | ||||
/* | ||||
* Fat binary container. | * Fat binary container. | |||
* A mix of ptx intermediate programs and cubins, | * A mix of ptx intermediate programs and cubins, | |||
* plus a global identifier that can be used for | * plus a global identifier that can be used for | |||
* further lookup in a translation cache or a resource | * further lookup in a translation cache or a resource | |||
* file. This key is a checksum over the device text. | * file. This key is a checksum over the device text. | |||
* The ptx and cubin array are each terminated with | * The ptx and cubin array are each terminated with | |||
* entries that have Null components. | * entries that have NULL components. | |||
*/ | */ | |||
typedef struct { | typedef struct __cudaFatCudaBinaryRec { | |||
unsigned long magic; | unsigned long magic; | |||
unsigned long version; | unsigned long version; | |||
unsigned long gpuInfoVersion; | unsigned long gpuInfoVersion; | |||
char* key; | char* key; | |||
char* ident; | char* ident; | |||
char* usageMode; | char* usageMode; | |||
__cudaFatPtxEntry *ptx; | __cudaFatPtxEntry *ptx; | |||
__cudaFatCubinEntry *cubin; | __cudaFatCubinEntry *cubin; | |||
__cudaFatDebugEntry *debug; | __cudaFatDebugEntry *debug; | |||
void* debugInfo; | void* debugInfo; | |||
unsigned int flags; | unsigned int flags; | |||
__cudaFatSymbol *exported; | ||||
__cudaFatSymbol *imported; | ||||
struct __cudaFatCudaBinaryRec *dependends; | ||||
} __cudaFatCudaBinary; | } __cudaFatCudaBinary; | |||
/* | /* | |||
* Current version and magic numbers: | * Current version and magic numbers: | |||
*/ | */ | |||
#define __cudaFatVERSION 0x00000002 | #define __cudaFatVERSION 0x00000003 | |||
#define __cudaFatMAGIC 0x1ee55a01 | #define __cudaFatMAGIC 0x1ee55a01 | |||
/* | /* | |||
* Version history log: | * Version history log: | |||
* 1 : __cudaFatDebugEntry field added to __cudaFatCudaBinary struct | * 1 : __cudaFatDebugEntry field added to __cudaFatCudaBinary struct | |||
* 2 : flags and debugInfo field added. | * 2 : flags and debugInfo field added. | |||
* 3 : import/export symbol list | ||||
*/ | */ | |||
/*--------------------------------- Functions ----------------------------- ---*/ | /*--------------------------------- Functions ----------------------------- ---*/ | |||
typedef enum { | ||||
__cudaFatAvoidPTX, | ||||
__cudaFatPreferBestCode | ||||
} __cudaFatCompilationPolicy; | ||||
/* | /* | |||
* Function : Select a load image from the __cudaFat binary | * Function : Select a load image from the __cudaFat binary | |||
* that will run on the specified GPU. | * that will run on the specified GPU. | |||
* Parameters : binary (I) Fat binary | * Parameters : binary (I) Fat binary | |||
* policy (I) Parameter influencing the selection proces | ||||
s in case no | ||||
* fully matching cubin can be found, but ins | ||||
tead a choice can | ||||
* be made between ptx compilation or selecti | ||||
on of a | ||||
* cubin for a less capable GPU. | ||||
* gpuName (I) Name of target GPU | * gpuName (I) Name of target GPU | |||
* cubin (O) Returned cubin text string, or Null when | * cubin (O) Returned cubin text string, or NULL when | |||
* no matching cubin for the specified gpu | * no matching cubin for the specified gpu | |||
* could be found. | * could be found. | |||
* dbgInfo (O) If this parameter is not Null upon entry, then | * dbgInfo (O) If this parameter is not NULL upon entry, then | |||
* the name of a file containing debug inform ation | * the name of a file containing debug inform ation | |||
* on the returned cubin will be returned, or Null | * on the returned cubin will be returned, or NULL | |||
* will be returned when cubin or such debug info | * will be returned when cubin or such debug info | |||
* cannot be found. | * cannot be found. | |||
*/ | */ | |||
void fatGetCubinForGpu( __cudaFatCudaBinary *binary, char* gpuName, char* * | void fatGetCubinForGpuWithPolicy( __cudaFatCudaBinary *binary, __cudaFatCom | |||
cubin, char* *dbgInfoFile ); | pilationPolicy policy, char* gpuName, char* *cubin, char* *dbgInfoFile ); | |||
#define fatGetCubinForGpu(binary,gpuName,cubin,dbgInfoFile) \ | ||||
fatGetCubinForGpuWithPolicy(binary,__cudaFatAvoidPTX,gpuName,cubi | ||||
n,dbgInfoFile) | ||||
/* | /* | |||
* Function : Free information previously obtained via function fatG etCubinForGpu. | * Function : Free information previously obtained via function fatG etCubinForGpu. | |||
* Parameters : cubin (I) Cubin text string to free | * Parameters : cubin (I) Cubin text string to free | |||
* dbgInfo (I) Debug info filename to free, or Null | * dbgInfo (I) Debug info filename to free, or NULL | |||
*/ | */ | |||
void fatFreeCubin( char* cubin, char* dbgInfoFile ); | void fatFreeCubin( char* cubin, char* dbgInfoFile ); | |||
#ifdef __cplusplus | #ifdef __cplusplus | |||
} | } | |||
#endif | #endif | |||
#endif | #endif | |||
End of changes. 13 change blocks. | ||||
21 lines changed or deleted | 51 lines changed or added | |||
channel_descriptor.h | channel_descriptor.h | |||
---|---|---|---|---|
skipping to change at line 39 | skipping to change at line 39 | |||
* source code with only those rights set forth herein. | * source code with only those rights set forth herein. | |||
* | * | |||
* Any use of this source code in individual and commercial software must | * Any use of this source code in individual and commercial software must | |||
* include, in the user documentation and internal comments to the code, | * include, in the user documentation and internal comments to the code, | |||
* the above Disclaimer and U.S. Government End Users Notice. | * the above Disclaimer and U.S. Government End Users Notice. | |||
*/ | */ | |||
#if !defined(__CHANNEL_DESCRIPTOR_H__) | #if !defined(__CHANNEL_DESCRIPTOR_H__) | |||
#define __CHANNEL_DESCRIPTOR_H__ | #define __CHANNEL_DESCRIPTOR_H__ | |||
#if defined(__cplusplus) | ||||
/************************************************************************** ***** | /************************************************************************** ***** | |||
* * | * * | |||
* * | * * | |||
* * | * * | |||
*************************************************************************** ****/ | *************************************************************************** ****/ | |||
#include "builtin_types.h" | #include "driver_types.h" | |||
#include "cuda_runtime_api.h" | #include "cuda_runtime_api.h" | |||
#include "host_defines.h" | #include "host_defines.h" | |||
#include "vector_types.h" | ||||
/************************************************************************** ***** | /************************************************************************** ***** | |||
* * | * * | |||
* * | * * | |||
* * | * * | |||
*************************************************************************** ****/ | *************************************************************************** ****/ | |||
#if defined(__cplusplus) | ||||
template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChann elDesc(void) | template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChann elDesc(void) | |||
{ | { | |||
return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindUnsigned); | return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone); | |||
} | } | |||
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc< char>(void) | template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc< char>(void) | |||
{ | { | |||
int e = (int)sizeof(char) * 8; | int e = (int)sizeof(char) * 8; | |||
#if __SIGNED_CHARS__ | #if __SIGNED_CHARS__ | |||
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); | return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); | |||
#else | #else | |||
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); | return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); | |||
End of changes. 5 change blocks. | ||||
4 lines changed or deleted | 5 lines changed or added | |||
cuComplex.h | cuComplex.h | |||
---|---|---|---|---|
skipping to change at line 245 | skipping to change at line 245 | |||
} | } | |||
/* | /* | |||
* We would like to call hypotf(), but it's not available on all platforms. | * We would like to call hypotf(), but it's not available on all platforms. | |||
* This discrete implementation guards against intermediate underflow and | * This discrete implementation guards against intermediate underflow and | |||
* overflow by scaling. Otherwise we would lose half the exponent range. | * overflow by scaling. Otherwise we would lose half the exponent range. | |||
* There are various ways of doing guarded computation. For now chose the | * There are various ways of doing guarded computation. For now chose the | |||
* simplest and fastest solution, however this may suffer from inaccuracies | * simplest and fastest solution, however this may suffer from inaccuracies | |||
* if sqrt and division are not IEEE compliant. | * if sqrt and division are not IEEE compliant. | |||
*/ | */ | |||
__host__ static __inline__ float cuCabsf (cuFloatComplex x) | __host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x) | |||
{ | { | |||
float a = cuCrealf(x); | float a = cuCrealf(x); | |||
float b = cuCimagf(x); | float b = cuCimagf(x); | |||
float v, w, t; | float v, w, t; | |||
a = (float)fabs(a); | a = (float)fabs(a); | |||
b = (float)fabs(b); | b = (float)fabs(b); | |||
if (a > b) { | if (a > b) { | |||
v = a; | v = a; | |||
w = b; | w = b; | |||
} else { | } else { | |||
End of changes. 1 change blocks. | ||||
1 lines changed or deleted | 1 lines changed or added | |||
cublas.h | cublas.h | |||
---|---|---|---|---|
skipping to change at line 78 | skipping to change at line 78 | |||
* Currently, only a subset of the BLAS core functions is implemented. | * Currently, only a subset of the BLAS core functions is implemented. | |||
* | * | |||
* The interface to the CUBLAS library is the header file cublas.h. | * The interface to the CUBLAS library is the header file cublas.h. | |||
* Applications using CUBLAS need to link against the DSO cublas.so | * Applications using CUBLAS need to link against the DSO cublas.so | |||
* (Linux) or the DLL cublas.dll (Win32). | * (Linux) or the DLL cublas.dll (Win32). | |||
*/ | */ | |||
#if !defined(CUBLAS_H_) | #if !defined(CUBLAS_H_) | |||
#define CUBLAS_H_ | #define CUBLAS_H_ | |||
#ifdef __MULTI_CORE__ | ||||
#error CUBLAS not supported on multicore | ||||
#endif | ||||
#ifndef CUBLASAPI | #ifndef CUBLASAPI | |||
#ifdef _WIN32 | #ifdef _WIN32 | |||
#define CUBLASAPI __stdcall | #define CUBLASAPI __stdcall | |||
#else | #else | |||
#define CUBLASAPI | #define CUBLASAPI | |||
#endif | #endif | |||
#endif | #endif | |||
#if defined(__cplusplus) | #if defined(__cplusplus) | |||
extern "C" { | extern "C" { | |||
End of changes. 1 change blocks. | ||||
0 lines changed or deleted | 4 lines changed or added | |||
cuda.h | cuda.h | |||
---|---|---|---|---|
skipping to change at line 46 | skipping to change at line 46 | |||
// ------------------------------------------------------------------------ ---- | // ------------------------------------------------------------------------ ---- | |||
// | // | |||
// Main public header file for the CompUte Device Api | // Main public header file for the CompUte Device Api | |||
// | // | |||
// ------------------------------------------------------------------------ ---- | // ------------------------------------------------------------------------ ---- | |||
#ifndef __cuda_cuda_h__ | #ifndef __cuda_cuda_h__ | |||
#define __cuda_cuda_h__ | #define __cuda_cuda_h__ | |||
/* CUDA API version number */ | /* CUDA API version number */ | |||
#define CUDA_VERSION 2000 /* 2.0 */ | #define CUDA_VERSION 2010 /* 2.1 */ | |||
#ifdef __cplusplus | #ifdef __cplusplus | |||
extern "C" { | extern "C" { | |||
#endif | #endif | |||
typedef unsigned int CUdeviceptr; | typedef unsigned int CUdeviceptr; | |||
typedef int CUdevice; | typedef int CUdevice; | |||
typedef struct CUctx_st *CUcontext; | typedef struct CUctx_st *CUcontext; | |||
typedef struct CUmod_st *CUmodule; | typedef struct CUmod_st *CUmodule; | |||
typedef struct CUfunc_st *CUfunction; | typedef struct CUfunc_st *CUfunction; | |||
skipping to change at line 132 | skipping to change at line 132 | |||
CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, // Deprecated, us e CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK | CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, // Deprecated, us e CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK | |||
CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, | CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, | |||
CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, | CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, | |||
CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, | CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, | |||
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, | CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, | |||
CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, // Deprecated, us e CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK | CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, // Deprecated, us e CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK | |||
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, | CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, | |||
CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, | CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, | |||
CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, | CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, | |||
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16 | CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, | |||
CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17 | ||||
} CUdevice_attribute; | } CUdevice_attribute; | |||
// | // | |||
// Legacy device properties | // Legacy device properties | |||
// | // | |||
typedef struct CUdevprop_st { | typedef struct CUdevprop_st { | |||
int maxThreadsPerBlock; | int maxThreadsPerBlock; | |||
int maxThreadsDim[3]; | int maxThreadsDim[3]; | |||
int maxGridSize[3]; | int maxGridSize[3]; | |||
int sharedMemPerBlock; | int sharedMemPerBlock; | |||
skipping to change at line 160 | skipping to change at line 161 | |||
// | // | |||
// Memory types | // Memory types | |||
// | // | |||
typedef enum CUmemorytype_enum { | typedef enum CUmemorytype_enum { | |||
CU_MEMORYTYPE_HOST = 0x01, | CU_MEMORYTYPE_HOST = 0x01, | |||
CU_MEMORYTYPE_DEVICE = 0x02, | CU_MEMORYTYPE_DEVICE = 0x02, | |||
CU_MEMORYTYPE_ARRAY = 0x03 | CU_MEMORYTYPE_ARRAY = 0x03 | |||
} CUmemorytype; | } CUmemorytype; | |||
// | ||||
// Online compiler options | ||||
// | ||||
typedef enum CUjit_option_enum | ||||
{ | ||||
// CU_JIT_MAX_REGISTERS - Max number of registers that a thread may use | ||||
. | ||||
CU_JIT_MAX_REGISTERS = 0, | ||||
// CU_JIT_THREADS_PER_BLOCK - | ||||
// IN: Specifies minimum number of threads per block to target compilat | ||||
ion for | ||||
// OUT: Returns the number of threads the compiler actually targeted. | ||||
This | ||||
// restricts the resource utilization fo the compiler (e.g. max registe | ||||
rs) such | ||||
// that a block with the given number of threads should be able to laun | ||||
ch based | ||||
// on register limitations. Note, this option does not currently take | ||||
into | ||||
// account any other resource limitations, such as shared memory utiliz | ||||
ation. | ||||
CU_JIT_THREADS_PER_BLOCK, | ||||
// CU_JIT_WALL_TIME - returns a float value in the option of the wall c | ||||
lock | ||||
// time, in milliseconds, spent creating the cubin | ||||
CU_JIT_WALL_TIME, | ||||
// CU_JIT_INFO_LUG_BUFFER - pointer to a buffer in which to print any l | ||||
og | ||||
// messsages from PTXAS that are informational in nature | ||||
CU_JIT_INFO_LOG_BUFFER, | ||||
// CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES - | ||||
// IN: Log buffer size in bytes. Log messages will be capped at this s | ||||
ize | ||||
// (including null terminator) | ||||
// OUT: Amount of log buffer filled with messages | ||||
CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, | ||||
// CU_JIT_ERROR_LOG_BUFFER - pointer to a buffer in which to print any | ||||
log | ||||
// messages from PTXAS that reflect errors | ||||
CU_JIT_ERROR_LOG_BUFFER, | ||||
// CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES - | ||||
// IN: Log buffer size in bytes. Log messages will be capped at this s | ||||
ize | ||||
// (including null terminator) | ||||
// OUT: Amount of log buffer filled with messages | ||||
CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, | ||||
// CU_JIT_OPTIMIZATION_LEVEL - level of optimizations to apply to gener | ||||
ated | ||||
// code (0 - 4), with 4 being the default and highest level of optimiza | ||||
tions. | ||||
CU_JIT_OPTIMIZATION_LEVEL, | ||||
// CU_JIT_TARGET_FROM_CU_CONTEXT - no option value required. Determine | ||||
s | ||||
// the target based on the current attached context (default) | ||||
CU_JIT_TARGET_FROM_CUCONTEXT, | ||||
// CU_JIT_TARGET - target is chosen based on supplied CUjit_target_enum | ||||
. | ||||
CU_JIT_TARGET, | ||||
// CU_JIT_FALLBACK_STRATEGY - specifies choice of fallback strategy if | ||||
// matching cubin is not found. Choice is based on supplied | ||||
// CUjit_fallback_enum. | ||||
CU_JIT_FALLBACK_STRATEGY | ||||
} CUjit_option; | ||||
// | ||||
// Online compilation targets | ||||
// | ||||
typedef enum CUjit_target_enum | ||||
{ | ||||
CU_TARGET_COMPUTE_10 = 0, | ||||
CU_TARGET_COMPUTE_11, | ||||
CU_TARGET_COMPUTE_12, | ||||
CU_TARGET_COMPUTE_13 | ||||
} CUjit_target; | ||||
// | ||||
// Cubin matching fallback strategies | ||||
// | ||||
typedef enum CUjit_fallback_enum | ||||
{ | ||||
// prefer to compile ptx | ||||
CU_PREFER_PTX = 0, | ||||
// prefer to fall back to compatible binary code | ||||
CU_PREFER_BINARY | ||||
} CUjit_fallback; | ||||
/************************************ | /************************************ | |||
** | ** | |||
** Error codes | ** Error codes | |||
** | ** | |||
***********************************/ | ***********************************/ | |||
typedef enum cudaError_enum { | typedef enum cudaError_enum { | |||
CUDA_SUCCESS = 0, | CUDA_SUCCESS = 0, | |||
CUDA_ERROR_INVALID_VALUE = 1, | CUDA_ERROR_INVALID_VALUE = 1, | |||
skipping to change at line 253 | skipping to change at line 337 | |||
CUresult CUDAAPI cuCtxSynchronize(void); | CUresult CUDAAPI cuCtxSynchronize(void); | |||
/************************************ | /************************************ | |||
** | ** | |||
** Module management | ** Module management | |||
** | ** | |||
***********************************/ | ***********************************/ | |||
CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname); | CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname); | |||
CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image) ; | CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image) ; | |||
CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *imag e, unsigned int numOptions, CUjit_option *options, void **optionValues); | ||||
CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *f atCubin); | CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *f atCubin); | |||
CUresult CUDAAPI cuModuleUnload(CUmodule hmod); | CUresult CUDAAPI cuModuleUnload(CUmodule hmod); | |||
CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name); | CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name); | |||
CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, unsigned int *by tes, CUmodule hmod, const char *name); | CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, unsigned int *by tes, CUmodule hmod, const char *name); | |||
CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, c onst char *name); | CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, c onst char *name); | |||
/************************************ | /************************************ | |||
** | ** | |||
** Memory management | ** Memory management | |||
** | ** | |||
End of changes. 4 change blocks. | ||||
2 lines changed or deleted | 103 lines changed or added | |||
cuda_runtime.h | cuda_runtime.h | |||
---|---|---|---|---|
skipping to change at line 63 | skipping to change at line 63 | |||
#include "builtin_types.h" | #include "builtin_types.h" | |||
#include "channel_descriptor.h" | #include "channel_descriptor.h" | |||
#include "cuda_runtime_api.h" | #include "cuda_runtime_api.h" | |||
#include "driver_functions.h" | #include "driver_functions.h" | |||
#include "host_defines.h" | #include "host_defines.h" | |||
#include "vector_functions.h" | #include "vector_functions.h" | |||
#if defined(__CUDACC__) | #if defined(__CUDACC__) | |||
#include "common_functions.h" | #include "common_functions.h" | |||
#include "cuda_texture_types.h" | ||||
#include "device_functions.h" | #include "device_functions.h" | |||
#include "device_launch_parameters.h" | #include "device_launch_parameters.h" | |||
#endif /* __CUDACC__ */ | #endif /* __CUDACC__ */ | |||
#if defined(__cplusplus) | #if defined(__cplusplus) | |||
/************************************************************************** ***** | /************************************************************************** ***** | |||
* * | * * | |||
* * | * * | |||
End of changes. 1 change blocks. | ||||
0 lines changed or deleted | 1 lines changed or added | |||
cuda_runtime_api.h | cuda_runtime_api.h | |||
---|---|---|---|---|
skipping to change at line 41 | skipping to change at line 41 | |||
* Any use of this source code in individual and commercial software must | * Any use of this source code in individual and commercial software must | |||
* include, in the user documentation and internal comments to the code, | * include, in the user documentation and internal comments to the code, | |||
* the above Disclaimer and U.S. Government End Users Notice. | * the above Disclaimer and U.S. Government End Users Notice. | |||
*/ | */ | |||
#if !defined(__CUDA_RUNTIME_API_H__) | #if !defined(__CUDA_RUNTIME_API_H__) | |||
#define __CUDA_RUNTIME_API_H__ | #define __CUDA_RUNTIME_API_H__ | |||
/************************************************************************** ***** | /************************************************************************** ***** | |||
* * | * * | |||
* CUDA runtime API version number 2.0 * | * CUDA runtime API version number 2.1 * | |||
* * | * * | |||
*************************************************************************** ****/ | *************************************************************************** ****/ | |||
#define CUDART_VERSION \ | #define CUDART_VERSION \ | |||
2000 | 2010 | |||
/************************************************************************** ***** | /************************************************************************** ***** | |||
* * | * * | |||
* * | * * | |||
* * | * * | |||
*************************************************************************** ****/ | *************************************************************************** ****/ | |||
#include "host_defines.h" | #include "host_defines.h" | |||
#include "builtin_types.h" | #include "builtin_types.h" | |||
End of changes. 2 change blocks. | ||||
2 lines changed or deleted | 2 lines changed or added | |||
cufft.h | cufft.h | |||
---|---|---|---|---|
/* | /* | |||
* Copyright 1993-2007 NVIDIA Corporation. All rights reserved. | * Copyright 1993-2008 NVIDIA Corporation. All rights reserved. | |||
* | * | |||
* NOTICE TO USER: | * NOTICE TO USER: | |||
* | * | |||
* This source code is subject to NVIDIA ownership rights under U.S. and | * This source code is subject to NVIDIA ownership rights under U.S. and | |||
* international Copyright laws. Users and possessors of this source code | * international Copyright laws. Users and possessors of this source code | |||
* are hereby granted a nonexclusive, royalty-free license to use this code | * are hereby granted a nonexclusive, royalty-free license to use this code | |||
* in individual and commercial software. | * in individual and commercial software. | |||
* | * | |||
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | |||
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR | |||
skipping to change at line 45 | skipping to change at line 45 | |||
/* | /* | |||
* cufft.h | * cufft.h | |||
* Public header file for the NVIDIA Cuda FFT library (CUFFT) | * Public header file for the NVIDIA Cuda FFT library (CUFFT) | |||
*/ | */ | |||
#ifndef _CUFFT_H_ | #ifndef _CUFFT_H_ | |||
#define _CUFFT_H_ | #define _CUFFT_H_ | |||
#include <stdio.h> | #include <stdio.h> | |||
#include <cuComplex.h> | #include "cuComplex.h" | |||
#ifdef __MULTI_CORE__ | ||||
#error CUFFT not supported on multicore | ||||
#endif | ||||
#ifndef CUFFTAPI | #ifndef CUFFTAPI | |||
#ifdef _WIN32 | #ifdef _WIN32 | |||
#define CUFFTAPI __stdcall | #define CUFFTAPI __stdcall | |||
#else | #else | |||
#define CUFFTAPI | #define CUFFTAPI | |||
#endif | #endif | |||
#endif | #endif | |||
#ifdef __cplusplus | #ifdef __cplusplus | |||
End of changes. 2 change blocks. | ||||
2 lines changed or deleted | 6 lines changed or added | |||
device_functions.h | device_functions.h | |||
---|---|---|---|---|
skipping to change at line 505 | skipping to change at line 505 | |||
__device_func__(int __internal_float2int(float a, enum cudaRoundMode rndMod e)) | __device_func__(int __internal_float2int(float a, enum cudaRoundMode rndMod e)) | |||
{ | { | |||
return (int)__internal_float2ll_kernel(a, 2147483647LL, -2147483648LL, 0L L, rndMode); | return (int)__internal_float2ll_kernel(a, 2147483647LL, -2147483648LL, 0L L, rndMode); | |||
} | } | |||
__device_func__(int __float2int_rz(float a)) | __device_func__(int __float2int_rz(float a)) | |||
{ | { | |||
#if defined(__MULTI_CORE__) | #if defined(__MULTI_CORE__) | |||
return (int)a; | return (int)a; | |||
#else /* __MULTI_CORE__) */ | #else /* __MULTI_CORE__ */ | |||
return __internal_float2int(a, cudaRoundZero); | return __internal_float2int(a, cudaRoundZero); | |||
#endif /* __MULTI_CORE__) */ | #endif /* __MULTI_CORE__ */ | |||
} | } | |||
__device_func__(int __float2int_ru(float a)) | __device_func__(int __float2int_ru(float a)) | |||
{ | { | |||
return __internal_float2int(a, cudaRoundPosInf); | return __internal_float2int(a, cudaRoundPosInf); | |||
} | } | |||
__device_func__(int __float2int_rd(float a)) | __device_func__(int __float2int_rd(float a)) | |||
{ | { | |||
return __internal_float2int(a, cudaRoundMinInf); | return __internal_float2int(a, cudaRoundMinInf); | |||
skipping to change at line 534 | skipping to change at line 534 | |||
__device_func__(long long int __internal_float2ll(float a, enum cudaRoundMo de rndMode)) | __device_func__(long long int __internal_float2ll(float a, enum cudaRoundMo de rndMode)) | |||
{ | { | |||
return __internal_float2ll_kernel(a, 9223372036854775807LL, -922337203685 4775807LL -1LL, -9223372036854775807LL -1LL, rndMode); | return __internal_float2ll_kernel(a, 9223372036854775807LL, -922337203685 4775807LL -1LL, -9223372036854775807LL -1LL, rndMode); | |||
} | } | |||
__device_func__(long long int __float2ll_rz(float a)) | __device_func__(long long int __float2ll_rz(float a)) | |||
{ | { | |||
#if defined(__MULTI_CORE__) | #if defined(__MULTI_CORE__) | |||
return (long long int)a; | return (long long int)a; | |||
#else /* __MULTI_CORE__) */ | #else /* __MULTI_CORE__ */ | |||
return __internal_float2ll(a, cudaRoundZero); | return __internal_float2ll(a, cudaRoundZero); | |||
#endif /* __MULTI_CORE__) */ | #endif /* __MULTI_CORE__ */ | |||
} | } | |||
__device_func__(long long int __float2ll_ru(float a)) | __device_func__(long long int __float2ll_ru(float a)) | |||
{ | { | |||
return __internal_float2ll(a, cudaRoundPosInf); | return __internal_float2ll(a, cudaRoundPosInf); | |||
} | } | |||
__device_func__(long long int __float2ll_rd(float a)) | __device_func__(long long int __float2ll_rd(float a)) | |||
{ | { | |||
return __internal_float2ll(a, cudaRoundMinInf); | return __internal_float2ll(a, cudaRoundMinInf); | |||
skipping to change at line 589 | skipping to change at line 589 | |||
__device_func__(unsigned int __internal_float2uint(float a, enum cudaRoundM ode rndMode)) | __device_func__(unsigned int __internal_float2uint(float a, enum cudaRoundM ode rndMode)) | |||
{ | { | |||
return (unsigned int)__internal_float2ull_kernel(a, 4294967295U, 0U, rndM ode); | return (unsigned int)__internal_float2ull_kernel(a, 4294967295U, 0U, rndM ode); | |||
} | } | |||
__device_func__(unsigned int __float2uint_rz(float a)) | __device_func__(unsigned int __float2uint_rz(float a)) | |||
{ | { | |||
#if defined(__MULTI_CORE__) | #if defined(__MULTI_CORE__) | |||
return (unsigned int)a; | return (unsigned int)a; | |||
#else /* __MULTI_CORE__) */ | #else /* __MULTI_CORE__ */ | |||
return __internal_float2uint(a, cudaRoundZero); | return __internal_float2uint(a, cudaRoundZero); | |||
#endif /* __MULTI_CORE__) */ | #endif /* __MULTI_CORE__ */ | |||
} | } | |||
__device_func__(unsigned int __float2uint_ru(float a)) | __device_func__(unsigned int __float2uint_ru(float a)) | |||
{ | { | |||
return __internal_float2uint(a, cudaRoundPosInf); | return __internal_float2uint(a, cudaRoundPosInf); | |||
} | } | |||
__device_func__(unsigned int __float2uint_rd(float a)) | __device_func__(unsigned int __float2uint_rd(float a)) | |||
{ | { | |||
return __internal_float2uint(a, cudaRoundMinInf); | return __internal_float2uint(a, cudaRoundMinInf); | |||
skipping to change at line 618 | skipping to change at line 618 | |||
__device_func__(unsigned long long int __internal_float2ull(float a, enum c udaRoundMode rndMode)) | __device_func__(unsigned long long int __internal_float2ull(float a, enum c udaRoundMode rndMode)) | |||
{ | { | |||
return __internal_float2ull_kernel(a, 18446744073709551615ULL, 9223372036 854775808ULL, rndMode); | return __internal_float2ull_kernel(a, 18446744073709551615ULL, 9223372036 854775808ULL, rndMode); | |||
} | } | |||
__device_func__(unsigned long long int __float2ull_rz(float a)) | __device_func__(unsigned long long int __float2ull_rz(float a)) | |||
{ | { | |||
#if defined(__MULTI_CORE__) | #if defined(__MULTI_CORE__) | |||
return (unsigned long long int)a; | return (unsigned long long int)a; | |||
#else /* __MULTI_CORE__) */ | #else /* __MULTI_CORE__ */ | |||
return __internal_float2ull(a, cudaRoundZero); | return __internal_float2ull(a, cudaRoundZero); | |||
#endif /* __MULTI_CORE__) */ | #endif /* __MULTI_CORE__ */ | |||
} | } | |||
__device_func__(unsigned long long int __float2ull_ru(float a)) | __device_func__(unsigned long long int __float2ull_ru(float a)) | |||
{ | { | |||
return __internal_float2ull(a, cudaRoundPosInf); | return __internal_float2ull(a, cudaRoundPosInf); | |||
} | } | |||
__device_func__(unsigned long long int __float2ull_rd(float a)) | __device_func__(unsigned long long int __float2ull_rd(float a)) | |||
{ | { | |||
return __internal_float2ull(a, cudaRoundMinInf); | return __internal_float2ull(a, cudaRoundMinInf); | |||
skipping to change at line 726 | skipping to change at line 726 | |||
__device_func__(float __int2float_rd(int a)) | __device_func__(float __int2float_rd(int a)) | |||
{ | { | |||
return __internal_int2float_kernel(a, cudaRoundMinInf); | return __internal_int2float_kernel(a, cudaRoundMinInf); | |||
} | } | |||
__device_func__(float __int2float_rn(int a)) | __device_func__(float __int2float_rn(int a)) | |||
{ | { | |||
#if defined(__MULTI_CORE__) | #if defined(__MULTI_CORE__) | |||
return (float)a; | return (float)a; | |||
#else /* __MULTI_CORE__) */ | #else /* __MULTI_CORE__ */ | |||
return __internal_int2float_kernel(a, cudaRoundNearest); | return __internal_int2float_kernel(a, cudaRoundNearest); | |||
#endif /* __MULTI_CORE__) */ | #endif /* __MULTI_CORE__ */ | |||
} | } | |||
__device_func__(float __internal_uint2float_kernel(unsigned int a, enum cud aRoundMode rndMode)) | __device_func__(float __internal_uint2float_kernel(unsigned int a, enum cud aRoundMode rndMode)) | |||
{ | { | |||
volatile union { | volatile union { | |||
float f; | float f; | |||
unsigned int i; | unsigned int i; | |||
} res; | } res; | |||
int shift; | int shift; | |||
unsigned int t; | unsigned int t; | |||
skipping to change at line 773 | skipping to change at line 773 | |||
__device_func__(float __uint2float_rd(unsigned int a)) | __device_func__(float __uint2float_rd(unsigned int a)) | |||
{ | { | |||
return __internal_uint2float_kernel(a, cudaRoundMinInf); | return __internal_uint2float_kernel(a, cudaRoundMinInf); | |||
} | } | |||
__device_func__(float __uint2float_rn(unsigned int a)) | __device_func__(float __uint2float_rn(unsigned int a)) | |||
{ | { | |||
#if defined(__MULTI_CORE__) | #if defined(__MULTI_CORE__) | |||
return (float)a; | return (float)a; | |||
#else /* __MULTI_CORE__) */ | #else /* __MULTI_CORE__ */ | |||
return __internal_uint2float_kernel(a, cudaRoundNearest); | return __internal_uint2float_kernel(a, cudaRoundNearest); | |||
#endif /* __MULTI_CORE__) */ | #endif /* __MULTI_CORE__ */ | |||
} | } | |||
__device_func__(float __ll2float_rn(long long int a)) | __device_func__(float __ll2float_rn(long long int a)) | |||
{ | { | |||
return (float)a; | return (float)a; | |||
} | } | |||
__device_func__(float __ull2float_rn(unsigned long long int a)) | __device_func__(float __ull2float_rn(unsigned long long int a)) | |||
{ | { | |||
#if defined(__MULTI_CORE__) | #if defined(__MULTI_CORE__) | |||
return (float)a; | return (float)a; | |||
#else /* __MULTI_CORE__) */ | #else /* __MULTI_CORE__ */ | |||
unsigned long long int temp; | unsigned long long int temp; | |||
unsigned int res, t; | unsigned int res, t; | |||
int shift; | int shift; | |||
if (a == 0ULL) return 0.0f; | if (a == 0ULL) return 0.0f; | |||
temp = a; | temp = a; | |||
shift = __internal_normalize64(&temp); | shift = __internal_normalize64(&temp); | |||
temp = (temp >> 8) | ((temp & 0xffULL) ? 1ULL : 0ULL); | temp = (temp >> 8) | ((temp & 0xffULL) ? 1ULL : 0ULL); | |||
res = (unsigned int)(temp >> 32); | res = (unsigned int)(temp >> 32); | |||
t = (unsigned int)temp; | t = (unsigned int)temp; | |||
res += (127 + 62 - shift) << 23; /* add in exponent */ | res += (127 + 62 - shift) << 23; /* add in exponent */ | |||
res += t == 0x80000000 ? res & 1 : t >> 31; | res += t == 0x80000000 ? res & 1 : t >> 31; | |||
return __int_as_float(res); | return __int_as_float(res); | |||
#endif /* __MULTI_CORE__) */ | #endif /* __MULTI_CORE__ */ | |||
} | } | |||
__device_func__(float __internal_fmul_kernel(float a, float b, int rndNeare st)) | __device_func__(float __internal_fmul_kernel(float a, float b, int rndNeare st)) | |||
{ | { | |||
unsigned long long product; | unsigned long long product; | |||
volatile union { | volatile union { | |||
float f; | float f; | |||
unsigned int i; | unsigned int i; | |||
} xx, yy; | } xx, yy; | |||
unsigned expo_x, expo_y; | unsigned expo_x, expo_y; | |||
skipping to change at line 1069 | skipping to change at line 1069 | |||
expo_x = (unsigned int)(-((int)expo_x)); | expo_x = (unsigned int)(-((int)expo_x)); | |||
temp = xx.i << expo_y | ((temp) ? 1 : 0); | temp = xx.i << expo_y | ((temp) ? 1 : 0); | |||
xx.i = yy.i | (xx.i >> expo_x); | xx.i = yy.i | (xx.i >> expo_x); | |||
xx.i += (((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31)) | xx.i += (((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31)) | |||
&& rndNearest); | && rndNearest); | |||
return xx.f; | return xx.f; | |||
} else { | } else { | |||
/* handle special cases separately */ | /* handle special cases separately */ | |||
if (!(yy.i << 1)) { | if (!(yy.i << 1)) { | |||
if (xx.i == 0x80000000) { | if (xx.i == 0x80000000) { | |||
xx.i = yy.i; | xx.i = yy.i; | |||
} | ||||
if ((xx.i << 1) > 0xff000000) { | ||||
xx.i |= 0x00400000; | ||||
} | } | |||
return xx.f; | return xx.f; | |||
} | } | |||
if ((expo_y != 254) && (expo_x != 254)) { | if ((expo_y != 254) && (expo_x != 254)) { | |||
/* remove sign bits */ | /* remove sign bits */ | |||
if (expo_x == (unsigned int) -1) { | if (expo_x == (unsigned int) -1) { | |||
temp = xx.i & 0x80000000; | temp = xx.i & 0x80000000; | |||
xx.i = xx.i << 8; | xx.i = xx.i << 8; | |||
while (!(xx.i & 0x80000000)) { | while (!(xx.i & 0x80000000)) { | |||
xx.i <<= 1; | xx.i <<= 1; | |||
skipping to change at line 1155 | skipping to change at line 1158 | |||
return __internal_fmul_kernel(a, b, 1); | return __internal_fmul_kernel(a, b, 1); | |||
} | } | |||
__device_func__(void __brkpt(int c)) | __device_func__(void __brkpt(int c)) | |||
{ | { | |||
/* TODO */ | /* TODO */ | |||
} | } | |||
#if defined(__MULTI_CORE__) | #if defined(__MULTI_CORE__) | |||
__device_func__(void __syncthreads(void)) | #define __syncthreads() \ | |||
{ | __builtin___syncthreads() | |||
#undef __syncthreads | ||||
__syncthreads(); | ||||
#define __syncthreads \ | ||||
__syncthreads_wrapper | ||||
} | ||||
#else /* __MULTI_CORE__ */ | #else /* __MULTI_CORE__ */ | |||
extern int CUDARTAPI __cudaSynchronizeThreads(void**, void*); | extern int CUDARTAPI __cudaSynchronizeThreads(void**, void*); | |||
#if defined(__GNUC__) | #if defined(__GNUC__) | |||
__device_func__(inline __attribute__((always_inline)) void __syncthreads(vo id)) | __device_func__(inline __attribute__((always_inline)) void __syncthreads(vo id)) | |||
{ | { | |||
volatile int _ = 0; | volatile int _ = 0; | |||
skipping to change at line 1262 | skipping to change at line 1260 | |||
a *= .25f; | a *= .25f; | |||
b *= .25f; | b *= .25f; | |||
} | } | |||
return __fdividef(a, b); | return __fdividef(a, b); | |||
} | } | |||
__device_func__(float __tanf(float a)) | __device_func__(float __tanf(float a)) | |||
{ | { | |||
#if defined(__MULTI_CORE__) | #if defined(__MULTI_CORE__) | |||
return tanf(a); | return tanf(a); | |||
#else /* __MULTI_CORE__) */ | #else /* __MULTI_CORE__ */ | |||
return __sinf(a) / __cosf(a); | return __sinf(a) / __cosf(a); | |||
#endif /* __MULTI_CORE__) */ | #endif /* __MULTI_CORE__ */ | |||
} | } | |||
__device_func__(void __sincosf(float a, float *sptr, float *cptr)) | __device_func__(void __sincosf(float a, float *sptr, float *cptr)) | |||
{ | { | |||
#if defined(__MULTI_CORE__) | #if defined(__MULTI_CORE__) | |||
sincosf(a, sptr, cptr); | sincosf(a, sptr, cptr); | |||
#else /* __MULTI_CORE__) */ | #else /* __MULTI_CORE__ */ | |||
*sptr = __sinf(a); | *sptr = __sinf(a); | |||
*cptr = __cosf(a); | *cptr = __cosf(a); | |||
#endif /* __MULTI_CORE__) */ | #endif /* __MULTI_CORE__ */ | |||
} | } | |||
__device_func__(float __expf(float a)) | __device_func__(float __expf(float a)) | |||
{ | { | |||
#if defined(__MULTI_CORE__) | #if defined(__MULTI_CORE__) | |||
return expf(a); | return expf(a); | |||
#else /* __MULTI_CORE__) */ | #else /* __MULTI_CORE__ */ | |||
return __cuda_exp2f(a * CUDART_L2E_F); | return __cuda_exp2f(a * CUDART_L2E_F); | |||
#endif /* __MULTI_CORE__) */ | #endif /* __MULTI_CORE__ */ | |||
} | } | |||
__device_func__(float __exp10f(float a)) | __device_func__(float __exp10f(float a)) | |||
{ | { | |||
#if defined(__MULTI_CORE__) | #if defined(__MULTI_CORE__) | |||
return exp10f(a); | return exp10f(a); | |||
#else /* __MULTI_CORE__) */ | #else /* __MULTI_CORE__ */ | |||
return __cuda_exp2f(a * CUDART_L2T_F); | return __cuda_exp2f(a * CUDART_L2T_F); | |||
#endif /* __MULTI_CORE__) */ | #endif /* __MULTI_CORE__ */ | |||
} | } | |||
__device_func__(float __log10f(float a)) | __device_func__(float __log10f(float a)) | |||
{ | { | |||
#if defined(__MULTI_CORE__) | #if defined(__MULTI_CORE__) | |||
return log10f(a); | return log10f(a); | |||
#else /* __MULTI_CORE__) */ | #else /* __MULTI_CORE__ */ | |||
return CUDART_LG2_F * __log2f(a); | return CUDART_LG2_F * __log2f(a); | |||
#endif /* __MULTI_CORE__) */ | #endif /* __MULTI_CORE__ */ | |||
} | } | |||
__device_func__(float __logf(float a)) | __device_func__(float __logf(float a)) | |||
{ | { | |||
#if defined(__MULTI_CORE__) | #if defined(__MULTI_CORE__) | |||
return logf(a); | return logf(a); | |||
#else /* __MULTI_CORE__) */ | #else /* __MULTI_CORE__ */ | |||
return CUDART_LN2_F * __log2f(a); | return CUDART_LN2_F * __log2f(a); | |||
#endif /* __MULTI_CORE__) */ | #endif /* __MULTI_CORE__ */ | |||
} | } | |||
__device_func__(float __powf(float a, float b)) | __device_func__(float __powf(float a, float b)) | |||
{ | { | |||
#if defined(__MULTI_CORE__) | #if defined(__MULTI_CORE__) | |||
return powf(a, b); | return powf(a, b); | |||
#else /* __MULTI_CORE__) */ | #else /* __MULTI_CORE__ */ | |||
return __cuda_exp2f(b * __log2f(a)); | return __cuda_exp2f(b * __log2f(a)); | |||
#endif /* __MULTI_CORE__) */ | #endif /* __MULTI_CORE__ */ | |||
} | } | |||
__device_func__(float fdividef(float a, float b)) | __device_func__(float fdividef(float a, float b)) | |||
{ | { | |||
#if defined(__MULTI_CORE__) | #if defined(__MULTI_CORE__) | |||
return a / b; | return a / b; | |||
#elif defined(__USE_FAST_MATH__) | #elif defined(__USE_FAST_MATH__) | |||
return __fdividef(a, b); | return __fdividef(a, b); | |||
#else /* __USE_FAST_MATH__ */ | #else /* __MULTI_CORE__ */ | |||
return __internal_accurate_fdividef(a, b); | return __internal_accurate_fdividef(a, b); | |||
#endif /* __USE_FAST_MATH__ */ | #endif /* __MULTI_CORE__ */ | |||
} | } | |||
__device_func__(int __clz(int a)) | __device_func__(int __clz(int a)) | |||
{ | { | |||
return (a)?(158-(__float_as_int(__uint2float_rz((unsigned int)a))>>23)):3 2; | return (a)?(158-(__float_as_int(__uint2float_rz((unsigned int)a))>>23)):3 2; | |||
} | } | |||
__device_func__(int __ffs(int a)) | __device_func__(int __ffs(int a)) | |||
{ | { | |||
return 32 - __clz (a & -a); | return 32 - __clz (a & -a); | |||
} | } | |||
__device_func__(int __popc(unsigned int a)) | __device_func__(int __popc(unsigned int a)) | |||
{ | { | |||
unsigned int s = 033333333333; | a = a - ((a >> 1) & 0x55555555); | |||
unsigned int t = 030707070707; | a = (a & 0x33333333) + ((a >> 2) & 0x33333333); | |||
unsigned int n; | a = (a + (a >> 4)) & 0x0f0f0f0f; | |||
n = (a >> 1) & s; | a = ((__umul24(a, 0x808080) << 1) + a) >> 24; | |||
a = a - n; | ||||
n = (n >> 1) & s; | ||||
a = a - n; | ||||
n = (a >> 3) + a; | ||||
a = n & t; | ||||
t = (t << 2) & t; /* t = 0x04104104 */ | ||||
a = (a >> 30) + ((a * t) >> 26); | ||||
return a; | return a; | |||
} | } | |||
__device_func__(int __clzll(long long int a)) | __device_func__(int __clzll(long long int a)) | |||
{ | { | |||
int ahi = ((int)(a >> 32)); | int ahi = ((int)((unsigned long long)a >> 32)); | |||
int alo = ((int)(a & 0xffffffffULL)); | int alo = ((int)((unsigned long long)a & 0xffffffffULL)); | |||
int res; | int res; | |||
if (ahi) { | if (ahi) { | |||
res = 0; | res = 0; | |||
} else { | } else { | |||
res = 32; | res = 32; | |||
ahi = alo; | ahi = alo; | |||
} | } | |||
res = res + __clz(ahi); | res = res + __clz(ahi); | |||
return res; | return res; | |||
} | } | |||
__device_func__(int __ffsll(long long int a)) | __device_func__(int __ffsll(long long int a)) | |||
{ | { | |||
return 64 - __clzll (a & -a); | return 64 - __clzll (a & -a); | |||
} | } | |||
__device_func__(int __popcll(unsigned long long int a)) | __device_func__(int __popcll(unsigned long long int a)) | |||
{ | { | |||
unsigned int ahi = ((unsigned int)(a >> 32)); | unsigned int ahi = ((unsigned int)(a >> 32)); | |||
unsigned int alo = ((unsigned int)(a & 0xffffffffULL)); | unsigned int alo = ((unsigned int)(a & 0xffffffffULL)); | |||
return __popc(ahi) + __popc(alo); | alo = alo - ((alo >> 1) & 0x55555555); | |||
alo = (alo & 0x33333333) + ((alo >> 2) & 0x33333333); | ||||
ahi = ahi - ((ahi >> 1) & 0x55555555); | ||||
ahi = (ahi & 0x33333333) + ((ahi >> 2) & 0x33333333); | ||||
alo = alo + ahi; | ||||
alo = (alo & 0x0f0f0f0f) + ((alo >> 4) & 0x0f0f0f0f); | ||||
alo = ((__umul24(alo, 0x808080) << 1) + alo) >> 24; | ||||
return alo; | ||||
} | } | |||
#if defined(CUDA_DOUBLE_MATH_FUNCTIONS) && defined(CUDA_FLOAT_MATH_FUNCTION S) | #if defined(CUDA_DOUBLE_MATH_FUNCTIONS) && defined(CUDA_FLOAT_MATH_FUNCTION S) | |||
#error -- conflicting mode for double math routines | #error -- conflicting mode for double math routines | |||
#endif /* CUDA_DOUBLE_MATH_FUNCTIONS && CUDA_FLOAT_MATH_FUNCTIONS */ | #endif /* CUDA_DOUBLE_MATH_FUNCTIONS && CUDA_FLOAT_MATH_FUNCTIONS */ | |||
#if defined(CUDA_FLOAT_MATH_FUNCTIONS) | #if defined(CUDA_FLOAT_MATH_FUNCTIONS) | |||
End of changes. 35 change blocks. | ||||
52 lines changed or deleted | 50 lines changed or added | |||
device_launch_parameters.h | device_launch_parameters.h | |||
---|---|---|---|---|
skipping to change at line 36 | skipping to change at line 36 | |||
* and is provided to the U.S. Government only as a commercial end item. | * and is provided to the U.S. Government only as a commercial end item. | |||
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through | * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through | |||
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the | * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the | |||
* source code with only those rights set forth herein. | * source code with only those rights set forth herein. | |||
* | * | |||
* Any use of this source code in individual and commercial software must | * Any use of this source code in individual and commercial software must | |||
* include, in the user documentation and internal comments to the code, | * include, in the user documentation and internal comments to the code, | |||
* the above Disclaimer and U.S. Government End Users Notice. | * the above Disclaimer and U.S. Government End Users Notice. | |||
*/ | */ | |||
#if !defined(__DEVICE_LAUCH_PARAMETERS_H__) | #if !defined(__DEVICE_LAUNCH_PARAMETERS_H__) | |||
#define __DEVICE_LAUCH_PARAMETERS_H__ | #define __DEVICE_LAUNCH_PARAMETERS_H__ | |||
#include "vector_types.h" | #include "vector_types.h" | |||
#if !defined(__STORAGE__) | #if !defined(__STORAGE__) | |||
#define __STORAGE__ \ | #define __STORAGE__ \ | |||
extern const | extern const | |||
#endif /* __STORAGE__ */ | #endif /* __STORAGE__ */ | |||
skipping to change at line 104 | skipping to change at line 104 | |||
#endif /* __cudaGet_gridDim */ | #endif /* __cudaGet_gridDim */ | |||
#if !defined(__cudaGet_warpSize) | #if !defined(__cudaGet_warpSize) | |||
#define __cudaGet_warpSize() \ | #define __cudaGet_warpSize() \ | |||
warpSize | warpSize | |||
#endif /* __cudaGet_warpSize */ | #endif /* __cudaGet_warpSize */ | |||
#endif /* !__DEVICE_LAUCH_PARAMETERS_H__ */ | #endif /* !__DEVICE_LAUNCH_PARAMETERS_H__ */ | |||
End of changes. 2 change blocks. | ||||
2 lines changed or deleted | 2 lines changed or added | |||
device_runtime.h | device_runtime.h | |||
---|---|---|---|---|
skipping to change at line 42 | skipping to change at line 42 | |||
* include, in the user documentation and internal comments to the code, | * include, in the user documentation and internal comments to the code, | |||
* the above Disclaimer and U.S. Government End Users Notice. | * the above Disclaimer and U.S. Government End Users Notice. | |||
*/ | */ | |||
#if !defined(__CUDA_INTERNAL_COMPILATION__) | #if !defined(__CUDA_INTERNAL_COMPILATION__) | |||
#define __CUDA_INTERNAL_COMPILATION__ | #define __CUDA_INTERNAL_COMPILATION__ | |||
#endif /* !__CUDA_INTERNAL_COMPILATION__ */ | #endif /* !__CUDA_INTERNAL_COMPILATION__ */ | |||
#include "../host_defines.h" | #include "host_defines.h" | |||
#define __texture__(type, pref, dim) \ | ||||
__##pref##dim##texture | ||||
#define __no_sc__ | #define __no_sc__ | |||
#define __empty_array(s) \ | #define __empty_array(s) \ | |||
s | s | |||
#define __unsized_shared_data(name, type_post) \ | #define __unsized_shared_data(name, type_post) \ | |||
__unsized##name __unsized##type_post | __unsized##name __unsized##type_post | |||
#define __sized_shared_data(name, type) \ | #define __sized_shared_data(name, type) \ | |||
__sized##name type | __sized##name type | |||
#define __sized__shared_var(name, s, type) \ | #define __sized__shared_var(name, s, type) \ | |||
name | name | |||
/*TEXTURE_TYPE*/ | /*TEXTURE_TYPE*/ | |||
typedef const void* __i1texture; | typedef const void *__texture_type__; | |||
/*TEXTURE_TYPE*/ | /*SURFACE_TYPE*/ | |||
typedef const void* __i2texture; | typedef const void *__surface_type__; | |||
/*TEXTURE_TYPE*/ | ||||
typedef const void* __i3texture; | ||||
/*TEXTURE_TYPE*/ | ||||
typedef const void* __u1texture; | ||||
/*TEXTURE_TYPE*/ | ||||
typedef const void* __u2texture; | ||||
/*TEXTURE_TYPE*/ | ||||
typedef const void* __u3texture; | ||||
/*TEXTURE_TYPE*/ | ||||
typedef const void* __f1texture; | ||||
/*TEXTURE_TYPE*/ | ||||
typedef const void* __f2texture; | ||||
/*TEXTURE_TYPE*/ | ||||
typedef const void* __f3texture; | ||||
#if defined(__CUDABE__) /* cudabe compiler */ | #if defined(__CUDABE__) /* cudabe compiler */ | |||
#define __pad__(f) | #define __pad__(f) | |||
#define __text__ \ | #define __text__ \ | |||
__attribute__((__texture__)) | __attribute__((__texture__)) | |||
#define __surf__ \ | ||||
__attribute__((__surface__)) | ||||
#define ___device__(sc) \ | #define ___device__(sc) \ | |||
static | static | |||
#define __in__(cdecl, decl) \ | #define __in__(cdecl, decl) \ | |||
__shared__ cdecl | __shared__ cdecl | |||
#define __in_type__(cdecl, decl) \ | #define __in_type__(cdecl, decl) \ | |||
cdecl | cdecl | |||
#define __texture_var(name) \ | #define __texture_var(name) \ | |||
name | name | |||
#define __shared_var(name, s, type) \ | #define __shared_var(name, s, type) \ | |||
name | name | |||
skipping to change at line 116 | skipping to change at line 102 | |||
#define __cdecl | #define __cdecl | |||
#undef __w64 | #undef __w64 | |||
#define __w64 | #define __w64 | |||
#elif defined(__CUDACC__) /* cudafe compiler */ | #elif defined(__CUDACC__) /* cudafe compiler */ | |||
#define __loc_sc__(loc, sc) \ | #define __loc_sc__(loc, sc) \ | |||
sc loc | sc loc | |||
#define __pad__(f) | #define __pad__(f) | |||
#define __text__ | #define __text__ | |||
#define __surf__ | ||||
#define ___device__(sc) \ | #define ___device__(sc) \ | |||
sc __device__ | sc __device__ | |||
#define __in__(cdecl, decl) \ | #define __in__(cdecl, decl) \ | |||
decl | decl | |||
#define __in_type__(cdecl, decl) \ | #define __in_type__(cdecl, decl) \ | |||
decl | decl | |||
#define __texture_var(name) \ | #define __texture_var(name) \ | |||
name | name | |||
#define __shared_var(name, s, type) \ | #define __shared_var(name, s, type) \ | |||
name | name | |||
skipping to change at line 152 | skipping to change at line 139 | |||
struct uint3; | struct uint3; | |||
extern struct uint3* CUDARTAPI __cudaGetBlockIdxPtr(void); | extern struct uint3* CUDARTAPI __cudaGetBlockIdxPtr(void); | |||
extern void* CUDARTAPI __cudaGetSharedMem(void*); | extern void* CUDARTAPI __cudaGetSharedMem(void*); | |||
#endif /* __MULTI_CORE__ || __multi_core__ */ | #endif /* __MULTI_CORE__ || __multi_core__ */ | |||
#if defined (__multi_core__) | #if defined (__multi_core__) | |||
#if defined(__GNUC__) | #if defined(__GNUC__) | |||
#if defined(__cplusplus) | ||||
extern void *alloca(size_t) throw(); | ||||
#else /* __cplusplus */ | ||||
extern void *alloca(size_t); | extern void *alloca(size_t); | |||
#endif /* __cplusplus */ | ||||
#define __cuda_alloc(s) \ | #define __cuda_alloc(s) \ | |||
alloca(s) | alloca(s) | |||
#else /* __GNUC__ */ | #else /* __GNUC__ */ | |||
extern void *_alloca(size_t); | extern void *_alloca(size_t); | |||
#define __cuda_alloc(s) \ | #define __cuda_alloc(s) \ | |||
_alloca(s) | _alloca(s) | |||
#endif /* __GNUC__ */ | #endif /* __GNUC__ */ | |||
#endif /* __multi_core__ */ | #endif /* __multi_core__ */ | |||
#if defined (__MULTI_CORE__) | #if defined (__MULTI_CORE__) | |||
#define __syncthreads \ | #define ___device__(sc) \ | |||
__syncthreads_wrapper | static | |||
#define __pad__(f) \ | #define __pad__(f) \ | |||
f | f | |||
#define __text__ | #define __text__ | |||
#define __surf__ | ||||
#define __cudaGet_blockIdx() \ | #define __cudaGet_blockIdx() \ | |||
(*__cudaGetBlockIdxPtr()) | (*__cudaGetBlockIdxPtr()) | |||
#define __shared_var(name, s, type) \ | #define __shared_var(name, s, type) \ | |||
(s type __cudaGetSharedMem((void*)(&(name)))) | (s type __cudaGetSharedMem((void*)(&(name)))) | |||
#define __var_used__ \ | #define __var_used__ \ | |||
__attribute__((__used__)) | __attribute__((__used__)) | |||
#define __storage_auto__shared__ \ | ||||
auto | ||||
#undef __cdecl | #undef __cdecl | |||
#define __cdecl | #define __cdecl | |||
#undef __w64 | #undef __w64 | |||
#define __w64 | #define __w64 | |||
#else /* __MULTI_CORE__ */ | #else /* __MULTI_CORE__ */ | |||
#define __shared_var(name, s, type) \ | ||||
name | ||||
#endif /* __MULTI_CORE__ */ | ||||
#define ___device__(sc) \ | #define ___device__(sc) \ | |||
static __device__ | static __device__ | |||
#define __in__(cdecl, decl) \ | #define __shared_var(name, s, type) \ | |||
decl | ||||
#define __in_type__(cdecl, decl) \ | ||||
decl | ||||
#define __texture_var(name) \ | ||||
__texture_##name | ||||
#define __cuda_host_device_name(name) \ | ||||
__cuda_host_device_##name | ||||
#define __val_param(name) \ | ||||
name | name | |||
#define __copy_param(local_decl, param) | ||||
#define __unsized_array_size | ||||
#define __unsized__shared_var(name, s, type) \ | ||||
(*name) | ||||
#define __unsized__empty_array(s) | ||||
#if defined(__APPLE__) | #if defined(__APPLE__) || defined(__ICC) | |||
#define __STORAGE__ \ | #define __STORAGE__ \ | |||
__attribute__((__weak__)) | __attribute__((__weak__)) | |||
#elif defined(__GNUC__) | #elif defined(__GNUC__) | |||
#define __STORAGE__ \ | #define __STORAGE__ \ | |||
__attribute__((__common__)) | __attribute__((__common__)) | |||
#elif defined(__cplusplus) | #elif defined(__cplusplus) | |||
#define __STORAGE__ \ | #define __STORAGE__ \ | |||
__declspec(selectany) | __declspec(selectany) | |||
#pragma warning(disable: 4099 4190) | #pragma warning(disable: 4099 4190) | |||
#else /* __cplusplus */ | #else /* __APPLE__ || __ICC */ | |||
#define __STORAGE__ | #define __STORAGE__ | |||
#endif /* __APPLE__ */ | #endif /* __APPLE__ || __ICC */ | |||
#endif /* __MULTI_CORE__ */ | ||||
#define __in__(cdecl, decl) \ | ||||
decl | ||||
#define __in_type__(cdecl, decl) \ | ||||
decl | ||||
#define __texture_var(name) \ | ||||
__texture_##name | ||||
#define __cuda_host_device_name(name) \ | ||||
__cuda_host_device_##name | ||||
#define __val_param(name) \ | ||||
name | ||||
#define __copy_param(local_decl, param) | ||||
#define __unsized_array_size | ||||
#define __unsized__shared_var(name, s, type) \ | ||||
(*name) | ||||
#define __unsized__empty_array(s) | ||||
/* this is compiled with a host compiler for device emulation */ | /* this is compiled with a host compiler for device emulation */ | |||
#define __device_emulation | #define __device_emulation | |||
#endif /* __CUDABE__ */ | #endif /* __CUDABE__ */ | |||
#include "../builtin_types.h" | #include "builtin_types.h" | |||
#include "../device_launch_parameters.h" | #include "device_launch_parameters.h" | |||
#include "storage_class.h" | #include "storage_class.h" | |||
End of changes. 17 change blocks. | ||||
46 lines changed or deleted | 44 lines changed or added | |||
driver_types.h | driver_types.h | |||
---|---|---|---|---|
skipping to change at line 96 | skipping to change at line 96 | |||
cudaErrorSynchronizationError, | cudaErrorSynchronizationError, | |||
cudaErrorInvalidFilterSetting, | cudaErrorInvalidFilterSetting, | |||
cudaErrorInvalidNormSetting, | cudaErrorInvalidNormSetting, | |||
cudaErrorMixedDeviceExecution, | cudaErrorMixedDeviceExecution, | |||
cudaErrorCudartUnloading, | cudaErrorCudartUnloading, | |||
cudaErrorUnknown, | cudaErrorUnknown, | |||
cudaErrorNotYetImplemented, | cudaErrorNotYetImplemented, | |||
cudaErrorMemoryValueTooLarge, | cudaErrorMemoryValueTooLarge, | |||
cudaErrorInvalidResourceHandle, | cudaErrorInvalidResourceHandle, | |||
cudaErrorNotReady, | cudaErrorNotReady, | |||
cudaErrorInsufficientDriver, | ||||
cudaErrorSetOnActiveProcess, | ||||
cudaErrorStartupFailure = 0x7f, | cudaErrorStartupFailure = 0x7f, | |||
cudaErrorApiFailureBase = 10000 | cudaErrorApiFailureBase = 10000 | |||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
enum cudaChannelFormatKind | enum cudaChannelFormatKind | |||
{ | { | |||
cudaChannelFormatKindSigned, | cudaChannelFormatKindSigned, | |||
cudaChannelFormatKindUnsigned, | cudaChannelFormatKindUnsigned, | |||
cudaChannelFormatKindFloat | cudaChannelFormatKindFloat, | |||
cudaChannelFormatKindNone | ||||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct cudaChannelFormatDesc | struct cudaChannelFormatDesc | |||
{ | { | |||
int x; | int x; | |||
int y; | int y; | |||
int z; | int z; | |||
int w; | int w; | |||
enum cudaChannelFormatKind f; | enum cudaChannelFormatKind f; | |||
skipping to change at line 189 | skipping to change at line 192 | |||
int maxThreadsPerBlock; | int maxThreadsPerBlock; | |||
int maxThreadsDim[3]; | int maxThreadsDim[3]; | |||
int maxGridSize[3]; | int maxGridSize[3]; | |||
int clockRate; | int clockRate; | |||
size_t totalConstMem; | size_t totalConstMem; | |||
int major; | int major; | |||
int minor; | int minor; | |||
size_t textureAlignment; | size_t textureAlignment; | |||
int deviceOverlap; | int deviceOverlap; | |||
int multiProcessorCount; | int multiProcessorCount; | |||
int __cudaReserved[40]; | int kernelExecTimeoutEnabled; | |||
int __cudaReserved[39]; | ||||
}; | }; | |||
#define cudaDevicePropDontCare \ | #define cudaDevicePropDontCare \ | |||
{ \ | { \ | |||
{'\0'}, /* char name[256]; */ \ | {'\0'}, /* char name[256]; */ \ | |||
0, /* size_t totalGlobalMem; */ \ | 0, /* size_t totalGlobalMem; */ \ | |||
0, /* size_t sharedMemPerBlock; */ \ | 0, /* size_t sharedMemPerBlock; */ \ | |||
0, /* int regsPerBlock; */ \ | 0, /* int regsPerBlock; */ \ | |||
0, /* int warpSize; */ \ | 0, /* int warpSize; */ \ | |||
0, /* size_t memPitch; */ \ | 0, /* size_t memPitch; */ \ | |||
0, /* int maxThreadsPerBlock; */ \ | 0, /* int maxThreadsPerBlock; */ \ | |||
{0, 0, 0}, /* int maxThreadsDim[3]; */ \ | {0, 0, 0}, /* int maxThreadsDim[3]; */ \ | |||
{0, 0, 0}, /* int maxGridSize[3]; */ \ | {0, 0, 0}, /* int maxGridSize[3]; */ \ | |||
0, /* int clockRate; */ \ | 0, /* int clockRate; */ \ | |||
0, /* size_t totalConstMem; */ \ | 0, /* size_t totalConstMem; */ \ | |||
-1, /* int major; */ \ | -1, /* int major; */ \ | |||
-1, /* int minor; */ \ | -1, /* int minor; */ \ | |||
0, /* size_t textureAlignment; */ \ | 0, /* size_t textureAlignment; */ \ | |||
-1, /* int deviceOverlap; */ \ | -1, /* int deviceOverlap; */ \ | |||
0 /* int multiProcessorCount; */ \ | 0, /* int multiProcessorCount; */ \ | |||
0 /* int kernelExecTimeoutEnabled */ \ | ||||
} | } | |||
/************************************************************************** ***** | /************************************************************************** ***** | |||
* * | * * | |||
* SHORTHAND TYPE DEFINITION USED BY RUNTIME API * | * SHORTHAND TYPE DEFINITION USED BY RUNTIME API * | |||
* * | * * | |||
*************************************************************************** ****/ | *************************************************************************** ****/ | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
typedef enum cudaError cudaError_t; | typedef enum cudaError cudaError_t; | |||
End of changes. 4 change blocks. | ||||
20 lines changed or deleted | 25 lines changed or added | |||
host_config.h | host_config.h | |||
---|---|---|---|---|
skipping to change at line 65 | skipping to change at line 65 | |||
#include <bits/c++config.h> /* get _GLIBCXX_ATOMIC_BUILTINS */ | #include <bits/c++config.h> /* get _GLIBCXX_ATOMIC_BUILTINS */ | |||
#if _GLIBCXX_ATOMIC_BUILTINS == 1 | #if _GLIBCXX_ATOMIC_BUILTINS == 1 | |||
#undef _GLIBCXX_ATOMIC_BUILTINS /* for missing __sync_fetch_and_add */ | #undef _GLIBCXX_ATOMIC_BUILTINS /* for missing __sync_fetch_and_add */ | |||
#endif /* _GLIBCXX_ATOMIC_BUILTINS == 1 */ | #endif /* _GLIBCXX_ATOMIC_BUILTINS == 1 */ | |||
#elif defined(_WIN32) | #elif defined(_WIN32) | |||
#if _MSC_VER == 1400 | #if _MSC_VER >= 1400 | |||
#undef _USE_DECLSPECS_FOR_SAL | ||||
#define _USE_DECLSPECS_FOR_SAL \ | ||||
1 | ||||
#if !defined(_CRT_NONSTDC_NO_WARNINGS) | #if !defined(_CRT_NONSTDC_NO_WARNINGS) | |||
#define _CRT_NONSTDC_NO_WARNINGS /* to suppress warnings */ | #define _CRT_NONSTDC_NO_WARNINGS /* to suppress warnings */ | |||
#endif /* _CRT_NONSTDC_NO_WARNINGS */ | #endif /* _CRT_NONSTDC_NO_WARNINGS */ | |||
#if !defined(_CRT_SECURE_NO_WARNINGS) | #if !defined(_CRT_SECURE_NO_WARNINGS) | |||
#define _CRT_SECURE_NO_WARNINGS /* to suppress warnings */ | #define _CRT_SECURE_NO_WARNINGS /* to suppress warnings */ | |||
#endif /* _CRT_SECURE_NO_WARNINGS */ | #endif /* _CRT_SECURE_NO_WARNINGS */ | |||
#endif /* _MSC_VER == 1400 */ | #endif /* _MSC_VER >= 1400 */ | |||
#if !defined(NOMINMAX) | #if !defined(NOMINMAX) | |||
#define NOMINMAX /* min and max are part of cuda runtime */ | #define NOMINMAX /* min and max are part of cuda runtime */ | |||
#endif /* !NOMINMAX */ | #endif /* !NOMINMAX */ | |||
#define __THROW | #define __THROW | |||
class type_info; /* forward declaration */ | /* forward declarations for windows C++ header files */ | |||
#include <stddef.h> | ||||
class type_info; | ||||
#if !defined(_CRTIMP) | ||||
#if defined(_DLL) | ||||
#define _CRTIMP \ | ||||
__declspec(dllimport) | ||||
#else /* _DLL */ | ||||
#define _CRTIMP | ||||
#endif /* _DLL */ | ||||
#endif /* !_CRTIMP */ | ||||
#if defined(_DEBUG) | ||||
#if !defined(_NATIVE_WCHAR_T_DEFINED) && defined(_M_CEE_PURE) | ||||
extern "C++" | ||||
#else /* !_NATIVE_WCHAR_T_DEFINED && _M_CEE_PURE */ | ||||
extern "C" | ||||
#endif /* !_NATIVE_WCHAR_T_DEFINED && _M_CEE_PURE */ | ||||
_CRTIMP void __cdecl _invalid_parameter(const wchar_t*, const wchar_t*, con | ||||
st wchar_t*, unsigned int, uintptr_t); | ||||
#else /* _DEBUG */ | ||||
extern "C" _CRTIMP void __cdecl _invalid_parameter_noinfo(void); | ||||
#endif /* DEBUG */ | ||||
namespace std | ||||
{ | ||||
struct _Secure_char_traits_tag; | ||||
namespace _Traits_helper | ||||
{ | ||||
template<class T> inline typename T::char_type *copy_s(typename T::char | ||||
_type*, size_t, const typename T::char_type*, size_t, _Secure_char_traits_t | ||||
ag); | ||||
template<class T> inline typename T::char_type *move_s(typename T::char | ||||
_type*, size_t, const typename T::char_type*, size_t, _Secure_char_traits_t | ||||
ag); | ||||
} | ||||
} | ||||
namespace stdext | ||||
{ | ||||
template<class U, class V, class W> inline V unchecked_uninitialized_copy | ||||
(U, U, V, W&); | ||||
} | ||||
#endif /* __GNUC__ */ | #endif /* __GNUC__ */ | |||
#endif /* __CUDACC__ */ | #endif /* __CUDACC__ */ | |||
#endif /* !__HOST_CONFIG_H__ */ | #endif /* !__HOST_CONFIG_H__ */ | |||
End of changes. 3 change blocks. | ||||
3 lines changed or deleted | 51 lines changed or added | |||
host_defines.h | host_defines.h | |||
---|---|---|---|---|
skipping to change at line 49 | skipping to change at line 49 | |||
#if !defined(__GNUC__) && !defined(_WIN32) | #if !defined(__GNUC__) && !defined(_WIN32) | |||
#error --- !!! UNSUPPORTED COMPILER !!! --- | #error --- !!! UNSUPPORTED COMPILER !!! --- | |||
#elif defined(__GNUC__) | #elif defined(__GNUC__) | |||
#define __no_return__ \ | #define __no_return__ \ | |||
__attribute__((__noreturn__)) | __attribute__((__noreturn__)) | |||
#define __noinline__ \ | #define __noinline__ \ | |||
__attribute__((__noinline__)) | __attribute__((__noinline__)) | |||
#define __forceinline__ \ | ||||
__inline__ __attribute__((__always_inline__)) | ||||
#define __align__(n) \ | #define __align__(n) \ | |||
__attribute__((__aligned__(n))) | __attribute__((__aligned__(n))) | |||
#define __thread__ \ | #define __thread__ \ | |||
__thread | __thread | |||
#define __import__ | #define __import__ | |||
#define __export__ | #define __export__ | |||
#define __location__(a) \ | #define __location__(a) \ | |||
__loc__(__attribute__((a))) | __loc__(__attribute__((a))) | |||
#define CUDARTAPI | #define CUDARTAPI | |||
skipping to change at line 78 | skipping to change at line 80 | |||
#define __restrict__ | #define __restrict__ | |||
#endif /* _MSC_VER >= 1400 */ | #endif /* _MSC_VER >= 1400 */ | |||
#define __inline__ \ | #define __inline__ \ | |||
__inline | __inline | |||
#define __no_return__ \ | #define __no_return__ \ | |||
__declspec(noreturn) | __declspec(noreturn) | |||
#define __noinline__ \ | #define __noinline__ \ | |||
__declspec(noinline) | __declspec(noinline) | |||
#define __forceinline__ \ | ||||
__forceinline | ||||
#define __align__(n) \ | #define __align__(n) \ | |||
__declspec(align(n)) | __declspec(align(n)) | |||
#define __thread__ \ | #define __thread__ \ | |||
__declspec(thread) | __declspec(thread) | |||
#define __import__ \ | #define __import__ \ | |||
__declspec(dllimport) | __declspec(dllimport) | |||
#define __export__ \ | #define __export__ \ | |||
__declspec(dllexport) | __declspec(dllexport) | |||
#define __location__(a) \ | #define __location__(a) \ | |||
__loc__(__declspec(a)) | __loc__(__declspec(a)) | |||
#define CUDARTAPI \ | #define CUDARTAPI \ | |||
__stdcall | __stdcall | |||
#endif /* !__GNUC__ && !_WIN32 */ | #endif /* !__GNUC__ && !_WIN32 */ | |||
#if defined(__CUDACC__) || defined(__CUDABE__) | #if defined(__CUDACC__) || defined(__CUDABE__) || defined (__MULTI_CORE__) | |||
#define __loc__(a) \ | #define __loc__(a) \ | |||
a | a | |||
#define __builtin_align__(a) \ | #define __builtin_align__(a) \ | |||
__align__(a) | __align__(a) | |||
#else /* __CUDACC__ || __CUDABE__ */ | #else /* __CUDACC__ || __CUDABE__ || __MULTI_CORE__ */ | |||
#define __loc__(a) | #define __loc__(a) | |||
#define __builtin_align__(a) | #define __builtin_align__(a) | |||
#endif /* __CUDACC__ || __CUDABE__ */ | #endif /* __CUDACC__ || __CUDABE__ || __MULTI_CORE__ */ | |||
#define __device__ \ | #define __device__ \ | |||
__location__(__device__) | __location__(__device__) | |||
#define __host__ \ | #define __host__ \ | |||
__location__(__host__) | __location__(__host__) | |||
#define __global__ \ | #define __global__ \ | |||
__location__(__global__) | __location__(__global__) | |||
#define __shared__ \ | #define __shared__ \ | |||
__location__(__shared__) | __location__(__shared__) | |||
#define __constant__ \ | #define __constant__ \ | |||
End of changes. 5 change blocks. | ||||
3 lines changed or deleted | 7 lines changed or added | |||
host_runtime.h | host_runtime.h | |||
---|---|---|---|---|
skipping to change at line 76 | skipping to change at line 76 | |||
#endif /* !__cplusplus */ | #endif /* !__cplusplus */ | |||
#include "cuda_runtime_api.h" | #include "cuda_runtime_api.h" | |||
#include "storage_class.h" | #include "storage_class.h" | |||
#else /* !__CUDA_INTERNAL_COMPILATION__ */ | #else /* !__CUDA_INTERNAL_COMPILATION__ */ | |||
#include "host_defines.h" | #include "host_defines.h" | |||
#define __cudaRegisterBinary() \ | #define __cudaRegisterBinary() | |||
__cudaFatCubinHandle = __cudaRegisterFatBinary((void*)__cudaFatCubi | \ | |||
n) | __cudaFatCubinHandle = __cudaRegisterFatBinary((void*)__cudaFatCubi | |||
n); \ | ||||
atexit(__cudaUnregisterBinaryUtil) | ||||
#define __cudaRegisterVariable(var, ext, size, constant, global) \ | #define __cudaRegisterVariable(var, ext, size, constant, global) \ | |||
__cudaRegisterVar(__cudaFatCubinHandle, (char*)&__host##var, (char* )__device##var, __name##var, ext, size, constant, global) | __cudaRegisterVar(__cudaFatCubinHandle, (char*)&__host##var, (char* )__device##var, __name##var, ext, size, constant, global) | |||
#define __cudaRegisterGlobalTexture(tex, dim, norm, ext) \ | #define __cudaRegisterGlobalTexture(tex, dim, norm, ext) \ | |||
__cudaRegisterTexture(__cudaFatCubinHandle, (const struct textureRe ference*)&tex, __tex_var(tex), #tex, dim, norm, ext) | __cudaRegisterTexture(__cudaFatCubinHandle, (const struct textureRe ference*)&tex, __tex_var(tex), #tex, dim, norm, ext) | |||
#define __cudaRegisterUnsizedShared(var) \ | #define __cudaRegisterUnsizedShared(var) \ | |||
__cudaRegisterShared(__cudaFatCubinHandle, (void**)__device_var(var )) | __cudaRegisterShared(__cudaFatCubinHandle, (void**)__device_var(var )) | |||
#define __cudaRegisterSharedVariable(var, size, align, sc) \ | #define __cudaRegisterSharedVariable(var, size, align, sc) \ | |||
__cudaRegisterSharedVar(__cudaFatCubinHandle, (void**)__device_var( var), size, align, sc) | __cudaRegisterSharedVar(__cudaFatCubinHandle, (void**)__device_var( var), size, align, sc) | |||
#define __cudaRegisterEntry(fun, thread_limit) \ | #define __cudaRegisterEntry(fun, thread_limit) \ | |||
__cudaRegisterFunction(__cudaFatCubinHandle, (const char*)__device_ stub_##fun, (char*)__device_fun(fun), #fun, thread_limit, __ids) | __cudaRegisterFunction(__cudaFatCubinHandle, (const char*)__device_ stub_##fun, (char*)__device_fun(fun), #fun, thread_limit, __ids) | |||
#define __cudaInitArgBlock(arg) \ | ||||
char __[256]; \ | ||||
*(char**)&arg = __ | ||||
#define __cudaSetupArg(arg, offset) \ | #define __cudaSetupArg(arg, offset) \ | |||
if (cudaSetupArgument((void*)(char*)&arg, sizeof(arg), (size_t)&off set.arg - (size_t)&offset) != cudaSuccess) \ | if (cudaSetupArgument((void*)(char*)&arg, sizeof(arg), (size_t)&off set->arg - (size_t)offset) != cudaSuccess) \ | |||
return | return | |||
#define __cudaLaunch(fun) \ | #define __cudaLaunch(fun) \ | |||
{ volatile static char *__f; __f = fun; (void)cudaLaunch(fun); } | { volatile static char *__f; __f = fun; (void)cudaLaunch(fun); } | |||
#if defined(__cplusplus) | #if defined(__cplusplus) | |||
extern "C" { | extern "C" { | |||
#endif /* __cplusplus */ | #endif /* __cplusplus */ | |||
extern void** CUDARTAPI __cudaRegisterFatBinary( | extern void** CUDARTAPI __cudaRegisterFatBinary( | |||
void *fatCubin | void *fatCubin | |||
skipping to change at line 158 | skipping to change at line 162 | |||
uint3 *bid, | uint3 *bid, | |||
dim3 *bDim, | dim3 *bDim, | |||
dim3 *gDim, | dim3 *gDim, | |||
int *wSize | int *wSize | |||
); | ); | |||
#if defined(__cplusplus) | #if defined(__cplusplus) | |||
} | } | |||
#endif /* __cplusplus */ | #endif /* __cplusplus */ | |||
static void **__cudaFatCubinHandle; | #if defined(__GNUC__) && defined(__cplusplus) | |||
static void __cudaUnregisterBinaryUtil(void) | ||||
{ | ||||
__cudaUnregisterFatBinary(__cudaFatCubinHandle); | ||||
} | ||||
#if defined(__GNUC__) | ||||
__attribute__((__destructor__)) static void __cudaUnregisterBinary(void) | ||||
{ | ||||
__cudaUnregisterBinaryUtil(); | ||||
} | ||||
#else /* __GNUC__ */ | ||||
#if defined(_WIN64) | ||||
#pragma section(".CRT$XPU", read) | extern int atexit(void(*)(void)) throw(); | |||
#else /* _WIN64 */ | #else /* __GNUC__ && __cplusplus */ | |||
#pragma section(".CRT$XPU") | extern int atexit(void(*)(void)); | |||
#endif /* _WIN64 */ | #endif /* __GNUC__ && __cplusplus */ | |||
__declspec(allocate(".CRT$XPU")) | static void **__cudaFatCubinHandle; | |||
static void (__cdecl *__cudaUnregister[])(void) = {__cudaUnregisterBinaryUt | ||||
il}; | ||||
#endif /* __GNUC__ */ | static void __cudaUnregisterBinaryUtil(void) | |||
{ | ||||
__cudaUnregisterFatBinary(__cudaFatCubinHandle); | ||||
} | ||||
#if defined(__device_emulation) | #if defined(__device_emulation) | |||
#define __device_fun(fun) \ | #define __device_fun(fun) \ | |||
__device_wrapper_##fun | __device_wrapper_##fun | |||
#define __device_var(var) \ | #define __device_var(var) \ | |||
(char*)&var | (char*)&var | |||
#define __tex_var(var) \ | #define __tex_var(var) \ | |||
&__texture_var(var) | &__texture_var(var) | |||
#define __cudaFatCubin \ | #define __cudaFatCubin \ | |||
End of changes. 10 change blocks. | ||||
29 lines changed or deleted | 19 lines changed or added | |||
math_constants.h | math_constants.h | |||
---|---|---|---|---|
skipping to change at line 97 | skipping to change at line 97 | |||
#endif /* !CUDA_NO_SM_13_DOUBLE_INTRINSICS */ | #endif /* !CUDA_NO_SM_13_DOUBLE_INTRINSICS */ | |||
#define CUDART_ZERO 0.0 | #define CUDART_ZERO 0.0 | |||
#define CUDART_ONE 1.0 | #define CUDART_ONE 1.0 | |||
#define CUDART_SQRT_TWO 1.4142135623730951e+0 | #define CUDART_SQRT_TWO 1.4142135623730951e+0 | |||
#define CUDART_SQRT_HALF 7.0710678118654757e-1 | #define CUDART_SQRT_HALF 7.0710678118654757e-1 | |||
#define CUDART_THIRD 3.3333333333333333e-1 | #define CUDART_THIRD 3.3333333333333333e-1 | |||
#define CUDART_TWOTHIRD 6.6666666666666667e-1 | #define CUDART_TWOTHIRD 6.6666666666666667e-1 | |||
#define CUDART_PIO4 7.8539816339744828e-1 | #define CUDART_PIO4 7.8539816339744828e-1 | |||
#define CUDART_PIO4_HI 7.8539816339744828e-1 | ||||
#define CUDART_PIO4_LO 3.0616169978683830e-17 | ||||
#define CUDART_PIO2 1.5707963267948966e+0 | #define CUDART_PIO2 1.5707963267948966e+0 | |||
#define CUDART_PIO2_HI 1.5707963267948966e+0 | #define CUDART_PIO2_HI 1.5707963267948966e+0 | |||
#define CUDART_PIO2_LO 6.1232339957367660e-17 | #define CUDART_PIO2_LO 6.1232339957367660e-17 | |||
#define CUDART_3PIO4 2.3561944901923448e+0 | #define CUDART_3PIO4 2.3561944901923448e+0 | |||
#define CUDART_2_OVER_PI 6.3661977236758138e-1 | #define CUDART_2_OVER_PI 6.3661977236758138e-1 | |||
#define CUDART_PI 3.1415926535897931e+0 | #define CUDART_PI 3.1415926535897931e+0 | |||
#define CUDART_PI_HI 3.1415926535897931e+0 | #define CUDART_PI_HI 3.1415926535897931e+0 | |||
#define CUDART_PI_LO 1.2246467991473532e-16 | #define CUDART_PI_LO 1.2246467991473532e-16 | |||
#define CUDART_SQRT_2PI_HI 2.5066282746310007e+0 | #define CUDART_SQRT_2PI_HI 2.5066282746310007e+0 | |||
#define CUDART_SQRT_2PI_LO (-1.8328579980459167e-16) | #define CUDART_SQRT_2PI_LO (-1.8328579980459167e-16) | |||
End of changes. 1 change blocks. | ||||
0 lines changed or deleted | 2 lines changed or added | |||
math_functions.h | math_functions.h | |||
---|---|---|---|---|
skipping to change at line 413 | skipping to change at line 413 | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __host__ __device__ float fmaf(float, float, float) __THROW; | extern __host__ __device__ float fmaf(float, float, float) __THROW; | |||
} | } | |||
#include <math.h> | #include <math.h> | |||
#include <stdlib.h> | #include <stdlib.h> | |||
#include <cmath> | #include <cmath> | |||
#include <cstdlib> | #include <cstdlib> | |||
using namespace std; | ||||
/************************************************************************** ***** | /************************************************************************** ***** | |||
* * | * * | |||
* * | * * | |||
* * | * * | |||
*************************************************************************** ****/ | *************************************************************************** ****/ | |||
#if defined(__GNUC__) | #if defined(__GNUC__) | |||
#define __cuda_std_begin \ | ||||
namespace std { | ||||
#define __cuda_std_end \ | ||||
} | ||||
/* these are here to avoid warnings on the call graph. | /* these are here to avoid warnings on the call graph. | |||
long double is not supported on the device */ | long double is not supported on the device */ | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __host__ __device__ int __signbitl(long double) __THROW; | extern __host__ __device__ int __signbitl(long double) __THROW; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __host__ __device__ int __isinfl(long double) __THROW; | extern __host__ __device__ int __isinfl(long double) __THROW; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
extern __host__ __device__ int __isnanl(long double) __THROW; | extern __host__ __device__ int __isnanl(long double) __THROW; | |||
#if defined(__APPLE__) | #if defined(__APPLE__) | |||
skipping to change at line 482 | skipping to change at line 474 | |||
{ | { | |||
extern __host__ __device__ long long int abs(long long int); | extern __host__ __device__ long long int abs(long long int); | |||
} | } | |||
namespace std | namespace std | |||
{ | { | |||
template<typename T> extern __host__ __device__ T __pow_helper(T, int); | template<typename T> extern __host__ __device__ T __pow_helper(T, int); | |||
template<typename T> extern __host__ __device__ T __cmath_power(T, unsign ed int); | template<typename T> extern __host__ __device__ T __cmath_power(T, unsign ed int); | |||
} | } | |||
using std::abs; | ||||
using std::fabs; | ||||
using std::ceil; | ||||
using std::floor; | ||||
using std::sqrt; | ||||
using std::pow; | ||||
using std::log; | ||||
using std::log10; | ||||
using std::fmod; | ||||
using std::modf; | ||||
using std::exp; | ||||
using std::frexp; | ||||
using std::ldexp; | ||||
using std::asin; | ||||
using std::sin; | ||||
using std::sinh; | ||||
using std::acos; | ||||
using std::cos; | ||||
using std::cosh; | ||||
using std::atan; | ||||
using std::atan2; | ||||
using std::tan; | ||||
using std::tanh; | ||||
#elif defined(_WIN32) | #elif defined(_WIN32) | |||
static __inline__ __host__ __device__ long long int abs(long long int a) | static __inline__ __host__ __device__ long long int abs(long long int a) | |||
{ | { | |||
return llabs(a); | return llabs(a); | |||
} | } | |||
static __inline__ __host__ __device__ int signbit(double a) | static __inline__ __host__ __device__ int signbit(double a) | |||
{ | { | |||
return __signbit(a); | return __signbit(a); | |||
skipping to change at line 529 | skipping to change at line 545 | |||
static __inline__ __host__ __device__ int isfinite(double a) | static __inline__ __host__ __device__ int isfinite(double a) | |||
{ | { | |||
return __finite(a); | return __finite(a); | |||
} | } | |||
static __inline__ __host__ __device__ int isfinite(float a) | static __inline__ __host__ __device__ int isfinite(float a) | |||
{ | { | |||
return __finitef(a); | return __finitef(a); | |||
} | } | |||
#define __cuda_std_begin | ||||
#define __cuda_std_end | ||||
template<class T> extern __host__ __device__ T _Pow_int(T, int); | template<class T> extern __host__ __device__ T _Pow_int(T, int); | |||
#endif /* !_WIN32 */ | #endif /* !_WIN32 */ | |||
__cuda_std_begin | #if defined(__GNUC__) | |||
namespace std { | ||||
#endif /* __GNUC__ */ | ||||
extern __host__ __device__ long int abs(long int); | extern __host__ __device__ long int abs(long int); | |||
extern __host__ __device__ float abs(float); | extern __host__ __device__ float abs(float); | |||
extern __host__ __device__ double abs(double); | extern __host__ __device__ double abs(double); | |||
extern __host__ __device__ float fabs(float); | extern __host__ __device__ float fabs(float); | |||
extern __host__ __device__ float ceil(float); | extern __host__ __device__ float ceil(float); | |||
extern __host__ __device__ float floor(float); | extern __host__ __device__ float floor(float); | |||
extern __host__ __device__ float sqrt(float); | extern __host__ __device__ float sqrt(float); | |||
extern __host__ __device__ float pow(float, float); | extern __host__ __device__ float pow(float, float); | |||
extern __host__ __device__ float pow(float, int); | extern __host__ __device__ float pow(float, int); | |||
skipping to change at line 566 | skipping to change at line 581 | |||
extern __host__ __device__ float sin(float); | extern __host__ __device__ float sin(float); | |||
extern __host__ __device__ float sinh(float); | extern __host__ __device__ float sinh(float); | |||
extern __host__ __device__ float acos(float); | extern __host__ __device__ float acos(float); | |||
extern __host__ __device__ float cos(float); | extern __host__ __device__ float cos(float); | |||
extern __host__ __device__ float cosh(float); | extern __host__ __device__ float cosh(float); | |||
extern __host__ __device__ float atan(float); | extern __host__ __device__ float atan(float); | |||
extern __host__ __device__ float atan2(float, float); | extern __host__ __device__ float atan2(float, float); | |||
extern __host__ __device__ float tan(float); | extern __host__ __device__ float tan(float); | |||
extern __host__ __device__ float tanh(float); | extern __host__ __device__ float tanh(float); | |||
__cuda_std_end | #if defined(__GNUC__) | |||
} | ||||
#endif /* __GNUC__ */ | ||||
static __inline__ __host__ __device__ float logb(float a) | static __inline__ __host__ __device__ float logb(float a) | |||
{ | { | |||
return logbf(a); | return logbf(a); | |||
} | } | |||
static __inline__ __host__ __device__ int ilogb(float a) | static __inline__ __host__ __device__ int ilogb(float a) | |||
{ | { | |||
return ilogbf(a); | return ilogbf(a); | |||
} | } | |||
skipping to change at line 827 | skipping to change at line 844 | |||
#elif !defined(__CUDACC__) | #elif !defined(__CUDACC__) | |||
#include "crt/func_macro.h" | #include "crt/func_macro.h" | |||
#define INT_MAX \ | #define INT_MAX \ | |||
((int)((unsigned int)-1 >> 1)) | ((int)((unsigned int)-1 >> 1)) | |||
#if defined(__GNUC__) | #if defined(__GNUC__) | |||
extern int __cuda_error_not_implememted(void); | __func__(int __cuda_error_not_implememted(void)); | |||
#define __cuda___signbitl(a) \ | #define __cuda___signbitl(a) \ | |||
__cuda_error_not_implememted() | __cuda_error_not_implememted() | |||
#define __cuda___isinfl(a) \ | #define __cuda___isinfl(a) \ | |||
__cuda_error_not_implememted() | __cuda_error_not_implememted() | |||
#define __cuda___isnanl(a) \ | #define __cuda___isnanl(a) \ | |||
__cuda_error_not_implememted() | __cuda_error_not_implememted() | |||
#if defined(__APPLE__) | #if defined(__APPLE__) | |||
skipping to change at line 989 | skipping to change at line 1006 | |||
} cvt; | } cvt; | |||
cvt.d = a; | cvt.d = a; | |||
return cvt.l < 0ll; | return cvt.l < 0ll; | |||
} | } | |||
__func__(double copysign(double a, double b)) | __func__(double copysign(double a, double b)) | |||
{ | { | |||
volatile union { | volatile union { | |||
double d; | double d; | |||
unsigned long long int l; | unsigned long long int l; | |||
} cvta, cvtb; | } cvta, cvtb; | |||
cvta.d = a; | cvta.d = a; | |||
cvtb.d = b; | cvtb.d = b; | |||
cvta.l = (cvta.l & 0x7fffffffffffffffULL) | (cvtb.l & 0x8000000000000000U LL); | cvta.l = (cvta.l & 0x7fffffffffffffffULL) | (cvtb.l & 0x8000000000000000U LL); | |||
return cvta.d; | return cvta.d; | |||
} | } | |||
__func__(int __signbitf(float a)) | __func__(int __signbitf(float a)) | |||
{ | { | |||
return __cuda___signbitf(a); | return __cuda___signbitf(a); | |||
} | } | |||
__func__(float copysignf(float a, float b)) | __func__(float copysignf(float a, float b)) | |||
{ | { | |||
skipping to change at line 1178 | skipping to change at line 1197 | |||
#if defined(__LP64__) | #if defined(__LP64__) | |||
return (long int)__cuda_llrintf(a); | return (long int)__cuda_llrintf(a); | |||
#else /* __LP64__ */ | #else /* __LP64__ */ | |||
return (long int)__float2int_rn(a); | return (long int)__float2int_rn(a); | |||
#endif /* __LP64__ */ | #endif /* __LP64__ */ | |||
#endif /* __MULTI_CORE__ */ | #endif /* __MULTI_CORE__ */ | |||
} | } | |||
__device_func__(float __cuda_nearbyintf(float a)) | __device_func__(float __cuda_nearbyintf(float a)) | |||
{ | { | |||
#if defined(__CUDABE__) | #if defined(__MULTI_CORE__) | |||
return nearbyintf(a); | ||||
#elif defined(__CUDABE__) | ||||
return roundf(a); | return roundf(a); | |||
#else /* __CUDABE__ */ | #else /* __CUDABE__ */ | |||
return __internal_nearbyintf(a); | return __internal_nearbyintf(a); | |||
#endif /* __CUDABE__ */ | #endif /* __CUDABE__ */ | |||
} | } | |||
__device_func__(float __cuda_fmaxf(float a, float b)) | __device_func__(float __cuda_fmaxf(float a, float b)) | |||
{ | { | |||
#if defined(__MULTI_CORE__) | #if defined(__MULTI_CORE__) | |||
return fmaxf(a, b); | return fmaxf(a, b); | |||
skipping to change at line 1447 | skipping to change at line 1468 | |||
res.y = e = r + s; | res.y = e = r + s; | |||
res.x = (r - e) + s; | res.x = (r - e) + s; | |||
return res; | return res; | |||
} | } | |||
__device_func__(float __internal_accurate_log2f(float a)) | __device_func__(float __internal_accurate_log2f(float a)) | |||
{ | { | |||
return CUDART_L2E_F * __internal_accurate_logf(a); | return CUDART_L2E_F * __internal_accurate_logf(a); | |||
} | } | |||
/* Based on: Guillaume Da Gra | ||||
* Operators on Graphics Hardware. RNC'7, pp. 23-32, 2006. | ||||
*/ | ||||
__device_func__(float2 __internal_dsmul (float2 x, float2 y)) | ||||
{ | ||||
float2 z; | ||||
#if !defined(__CUDABE__) | ||||
volatile float up, vp, u1, u2, v1, v2, mh, ml; | ||||
#else | ||||
float up, vp, u1, u2, v1, v2, mh, ml; | ||||
#endif /* defined(__CUDABE__) */ | ||||
up = x.y * 4097.0f; | ||||
u1 = (x.y - up) + up; | ||||
u2 = x.y - u1; | ||||
vp = y.y * 4097.0f; | ||||
v1 = (y.y - vp) + vp; | ||||
v2 = y.y - v1; | ||||
mh = __fmul_rn(x.y,y.y); | ||||
ml = (((u1 * v1 - mh) + u1 * v2) + u2 * v1) + u2 * v2; | ||||
ml = (__fmul_rn(x.y,y.x) + __fmul_rn(x.x,y.y)) + ml; | ||||
z.y = up = mh + ml; | ||||
z.x = (mh - up) + ml; | ||||
return z; | ||||
} | ||||
/* 160 bits of 2/PI for Payne-Hanek style argument reduction. */ | /* 160 bits of 2/PI for Payne-Hanek style argument reduction. */ | |||
static __constant__ unsigned int __cudart_i2opi_f [] = { | static __constant__ unsigned int __cudart_i2opi_f [] = { | |||
0x3c439041, | 0x3c439041, | |||
0xdb629599, | 0xdb629599, | |||
0xf534ddc0, | 0xf534ddc0, | |||
0xfc2757d1, | 0xfc2757d1, | |||
0x4e441529, | 0x4e441529, | |||
0xa2f9836e, | 0xa2f9836e, | |||
}; | }; | |||
skipping to change at line 2048 | skipping to change at line 2094 | |||
} | } | |||
return t; | return t; | |||
#endif /* __MULTI_CORE__ */ | #endif /* __MULTI_CORE__ */ | |||
} | } | |||
__device_func__(float __cuda_acoshf(float a)) | __device_func__(float __cuda_acoshf(float a)) | |||
{ | { | |||
#if defined(__MULTI_CORE__) | #if defined(__MULTI_CORE__) | |||
return acoshf(a); | return acoshf(a); | |||
#else /* __MULTI_CORE__ */ | #else /* __MULTI_CORE__ */ | |||
float s, t; | float t; | |||
t = a - 1.0f; | t = a - 1.0f; | |||
if (__cuda_fabsf(t) > CUDART_TWO_TO_23_F) { | if (__cuda_fabsf(t) > CUDART_TWO_TO_23_F) { | |||
/* for large a, acosh = log(2*a) */ | /* for large a, acosh = log(2*a) */ | |||
return CUDART_LN2_F + __internal_accurate_logf(a); | return CUDART_LN2_F + __internal_accurate_logf(a); | |||
} else { | } else { | |||
s = a + 1.0f; | t = t + __cuda_sqrtf(a * t + t); | |||
t = t + __cuda_sqrtf(s * t); | ||||
return __cuda_log1pf(t); | return __cuda_log1pf(t); | |||
} | } | |||
#endif /* __MULTI_CORE__ */ | #endif /* __MULTI_CORE__ */ | |||
} | } | |||
__device_func__(float __cuda_asinhf(float a)) | __device_func__(float __cuda_asinhf(float a)) | |||
{ | { | |||
#if defined(__MULTI_CORE__) | #if defined(__MULTI_CORE__) | |||
return asinhf(a); | return asinhf(a); | |||
#else /* __MULTI_CORE__ */ | #else /* __MULTI_CORE__ */ | |||
skipping to change at line 2880 | skipping to change at line 2925 | |||
} | } | |||
__device_func__(float __internal_accurate_powf(float a, float b)) | __device_func__(float __internal_accurate_powf(float a, float b)) | |||
{ | { | |||
float2 loga, prod; | float2 loga, prod; | |||
#if !defined(__CUDABE__) && defined(_MSC_VER) && !defined(_WIN64) | #if !defined(__CUDABE__) && defined(_MSC_VER) && !defined(_WIN64) | |||
volatile float t; | volatile float t; | |||
#else | #else | |||
float t; | float t; | |||
#endif | #endif | |||
#if !defined(__CUDABE__) && defined(__linux__) && !defined(__LP64__) | ||||
volatile float up, vp, u1, u2, v1, v2, mh, ml; | ||||
#else | ||||
float up, vp, u1, u2, v1, v2, mh, ml; | ||||
#endif | ||||
/* compute log(a) in double-single format*/ | /* compute log(a) in double-single format*/ | |||
loga = __internal_log_ep(a); | loga = __internal_log_ep(a); | |||
/* prevent overflow during extended precision multiply */ | /* prevent overflow during extended precision multiply */ | |||
if (__cuda_fabsf(b) > 1.0e34f) b *= 1.220703125e-4f; | if (__cuda_fabsf(b) > 1.0e34f) b *= 1.220703125e-4f; | |||
/* compute b * log(a) in double-single format */ | prod.y = b; | |||
up = loga.y * 4097.0f; | prod.x = 0.0f; | |||
u1 = (loga.y - up) + up; | prod = __internal_dsmul (prod, loga); | |||
u2 = loga.y - u1; | ||||
vp = b * 4097.0f; | /* prevent intermediate overflow in exponentiation */ | |||
v1 = (b - vp) + vp; | if (__float_as_int(prod.y) == 0x42b17218) { | |||
v2 = b - v1; | prod.y = __int_as_float(__float_as_int(prod.y) - 1); | |||
mh = __fmul_rn(loga.y, b); | prod.x = prod.x + __int_as_float(0x37000000); | |||
ml = (((u1 * v1 - mh) + u1 * v2) + u2 * v1) + u2 * v2; | } | |||
ml = __fmul_rn(loga.x, b) + ml; | ||||
prod.y = up = mh + ml; | ||||
prod.x = (mh - up) + ml; | ||||
/* compute pow(a,b) = exp(b*log(a)) */ | /* compute pow(a,b) = exp(b*log(a)) */ | |||
t = __cuda_expf(prod.y); | t = __cuda_expf(prod.y); | |||
/* prevent -INF + INF = NaN */ | /* prevent -INF + INF = NaN */ | |||
if (t != CUDART_INF_F) { | if (t != CUDART_INF_F) { | |||
/* if prod.x is much smaller than prod.y, then exp(prod.y+prod.x) ~= | /* if prod.x is much smaller than prod.y, then exp(prod.y+prod.x) ~= | |||
* exp(prod.y) + prod.x * exp(prod.y) | * exp(prod.y) + prod.x * exp(prod.y) | |||
*/ | */ | |||
t = t * prod.x + t; | t = t * prod.x + t; | |||
} | } | |||
End of changes. 16 change blocks. | ||||
36 lines changed or deleted | 73 lines changed or added | |||
math_functions_dbl_ptx3.h | math_functions_dbl_ptx3.h | |||
---|---|---|---|---|
/* | /* | |||
* Copyright 1993-2008 NVIDIA Corporation. All rights reserved. | * Copyright 1993-2008 NVIDIA Corporation. All rights reserved. | |||
* | * | |||
* NOTICE TO USER: | * NOTICE TO USER: | |||
* | * | |||
* This source code is subject to NVIDIA ownership rights under U.S. and | * This source code is subject to NVIDIA ownership rights under U.S. and | |||
* international Copyright laws. Users and possessors of this source code | * international Copyright laws. Users and possessors of this source code | |||
* are hereby granted a nonexclusive, royalty-free license to use this code | * are hereby granted a nonexclusive, royalty-free license to use this code | |||
* in individual and commercial software. | * in individual and commercial software. | |||
* | * | |||
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE | |||
skipping to change at line 221 | skipping to change at line 221 | |||
}; | }; | |||
__device_func__(double __internal_trig_reduction_kerneld(double a, int *qua drant)) | __device_func__(double __internal_trig_reduction_kerneld(double a, int *qua drant)) | |||
{ | { | |||
double j; | double j; | |||
int q; | int q; | |||
if (__cuda_fabs(a) > CUDART_TRIG_PLOSS) { | if (__cuda_fabs(a) > CUDART_TRIG_PLOSS) { | |||
/* Payne-Hanek style argument reduction. */ | /* Payne-Hanek style argument reduction. */ | |||
unsigned long long int ia; | unsigned long long int ia; | |||
unsigned long long int s; | unsigned long long int s; | |||
unsigned long long int result[19]; | unsigned long long int result[5]; | |||
unsigned long long int phi, plo; | unsigned long long int phi, plo; | |||
unsigned long long int hi, lo; | unsigned long long int hi, lo; | |||
unsigned int e; | unsigned int e; | |||
int idx; | int idx; | |||
ia = __double_as_longlong(a); | ia = __double_as_longlong(a); | |||
s = ia & 0x8000000000000000ULL; | s = ia & 0x8000000000000000ULL; | |||
e = (unsigned int)(((ia >> 52) & 0x7ff) - 1024); | e = (unsigned int)(((ia >> 52) & 0x7ff) - 1024); | |||
ia = (ia << 11) | 0x8000000000000000ULL; | ia = (ia << 11) | 0x8000000000000000ULL; | |||
/* compute x * 2/pi */ | /* compute x * 2/pi */ | |||
skipping to change at line 233 | skipping to change at line 233 | |||
unsigned long long int hi, lo; | unsigned long long int hi, lo; | |||
unsigned int e; | unsigned int e; | |||
int idx; | int idx; | |||
ia = __double_as_longlong(a); | ia = __double_as_longlong(a); | |||
s = ia & 0x8000000000000000ULL; | s = ia & 0x8000000000000000ULL; | |||
e = (unsigned int)(((ia >> 52) & 0x7ff) - 1024); | e = (unsigned int)(((ia >> 52) & 0x7ff) - 1024); | |||
ia = (ia << 11) | 0x8000000000000000ULL; | ia = (ia << 11) | 0x8000000000000000ULL; | |||
/* compute x * 2/pi */ | /* compute x * 2/pi */ | |||
idx = 16 - (e >> 6); | idx = 16 - (e >> 6); | |||
hi = 0; | hi = 0; | |||
#if defined(__CUDABE__) | #if defined(__CUDABE__) | |||
#pragma unroll 1 | #pragma unroll 1 | |||
#endif /* __CUDABE__ */ | #endif /* __CUDABE__ */ | |||
for (q = 0; q < 18; q++) { | for (q = (idx-1); q < min(18,idx+3); q++) { | |||
plo = __cudart_i2opi_d[q] * ia; | plo = __cudart_i2opi_d[q] * ia; | |||
phi = __umul64hi (__cudart_i2opi_d[q], ia); | phi = __umul64hi (__cudart_i2opi_d[q], ia); | |||
lo = hi + plo; | lo = hi + plo; | |||
hi = phi + (lo < plo); | hi = phi + (lo < plo); | |||
result[q] = lo; | result[q-(idx-1)] = lo; | |||
} | } | |||
result[q] = hi; | result[q-(idx-1)] = hi; | |||
e = e & 63; | e = e & 63; | |||
/* shift result such that hi:lo<127:126> are the least significant | /* shift result such that hi:lo<127:126> are the least significant | |||
integer bits, and hi:lo<125:0> are the fractional bits of the result | integer bits, and hi:lo<125:0> are the fractional bits of the result | |||
*/ | */ | |||
hi = result[idx+2]; | hi = result[3]; | |||
lo = result[idx+1]; | lo = result[2]; | |||
if (e) { | if (e) { | |||
q = 64 - e; | q = 64 - e; | |||
hi = (hi << e) | (lo >> q); | hi = (hi << e) | (lo >> q); | |||
lo = (lo << e) | (result[idx] >> q); | lo = (lo << e) | (result[1] >> q); | |||
} | } | |||
q = (int)(hi >> 62); | q = (int)(hi >> 62); | |||
/* fraction */ | /* fraction */ | |||
hi = (hi << 2) | (lo >> 62); | hi = (hi << 2) | (lo >> 62); | |||
lo = (lo << 2); | lo = (lo << 2); | |||
e = (hi | (lo > 0)) > 0x8000000000000000ULL; /* fraction >= 0.5 */ | e = (hi | (lo > 0)) > 0x8000000000000000ULL; /* fraction >= 0.5 */ | |||
q += e; | q += e; | |||
if (s) q = -q; | if (s) q = -q; | |||
if (e) { | if (e) { | |||
unsigned long long int t; | unsigned long long int t; | |||
skipping to change at line 338 | skipping to change at line 337 | |||
z = __fma_rn (z, x2, 2.087588480545065E-009); | z = __fma_rn (z, x2, 2.087588480545065E-009); | |||
z = __fma_rn (z, x2, -2.755731555403950E-007); | z = __fma_rn (z, x2, -2.755731555403950E-007); | |||
z = __fma_rn (z, x2, 2.480158729365970E-005); | z = __fma_rn (z, x2, 2.480158729365970E-005); | |||
z = __fma_rn (z, x2, -1.388888888888074E-003); | z = __fma_rn (z, x2, -1.388888888888074E-003); | |||
z = __fma_rn (z, x2, 4.166666666666664E-002); | z = __fma_rn (z, x2, 4.166666666666664E-002); | |||
z = __fma_rn (z, x2, -5.000000000000000E-001); | z = __fma_rn (z, x2, -5.000000000000000E-001); | |||
z = __fma_rn (z, x2, 1.000000000000000E+000); | z = __fma_rn (z, x2, 1.000000000000000E+000); | |||
return z; | return z; | |||
} | } | |||
/* approximate tangent on -pi/2...+pi/2 ??? */ | /* approximate tangent on -pi/4...+pi/4 */ | |||
__device_func__(double __internal_tan_kerneld(double x)) | __device_func__(double __internal_tan_kerneld(double x, int i)) | |||
{ | { | |||
double x2, z; | double x2, z, q; | |||
x2 = x * x; | x2 = x * x; | |||
z = 2.08720267406905700E-005; | z = 9.8006287203286300E-006; | |||
z = __fma_rn (z, x2, -4.49127882369154970E-005); | z = __fma_rn (z, x2, -2.4279526494179897E-005); | |||
z = __fma_rn (z, x2, 8.95189835887631370E-005); | z = __fma_rn (z, x2, 4.8644173130937162E-005); | |||
z = __fma_rn (z, x2, -2.64581074021952680E-005); | z = __fma_rn (z, x2, -2.5640012693782273E-005); | |||
z = __fma_rn (z, x2, 1.36576090383605350E-004); | z = __fma_rn (z, x2, 6.7223984330880073E-005); | |||
z = __fma_rn (z, x2, 2.22355572141495130E-004); | z = __fma_rn (z, x2, 8.3559287318211639E-005); | |||
z = __fma_rn (z, x2, 5.95072833555129420E-004); | z = __fma_rn (z, x2, 2.4375039850848564E-004); | |||
z = __fma_rn (z, x2, 1.45475607963143160E-003); | z = __fma_rn (z, x2, 5.8886487754856672E-004); | |||
z = __fma_rn (z, x2, 3.59228938375760580E-003); | z = __fma_rn (z, x2, 1.4560454844672040E-003); | |||
z = __fma_rn (z, x2, 8.86321913102899350E-003); | z = __fma_rn (z, x2, 3.5921008885857180E-003); | |||
z = __fma_rn (z, x2, 2.18694896113231960E-002); | z = __fma_rn (z, x2, 8.8632379218613715E-003); | |||
z = __fma_rn (z, x2, 5.39682539265723680E-002); | z = __fma_rn (z, x2, 2.1869488399337889E-002); | |||
z = __fma_rn (z, x2, 1.33333333334148210E-001); | z = __fma_rn (z, x2, 5.3968253972902704E-002); | |||
z = __fma_rn (z, x2, 3.33333333333327760E-001); | z = __fma_rn (z, x2, 1.3333333333325342E-001); | |||
z = __fma_rn (z, x2, 3.3333333333333381E-001); | ||||
z = z * x2; | z = z * x2; | |||
z = __fma_rn (z, x, x); | q = __fma_rn (z, x, x); | |||
return z; | if (i) { | |||
double s = q - x; | ||||
double w = __fma_rn (z, x, -s); // tail of q | ||||
z = 1.0 / q; | ||||
z = -z; | ||||
s = __fma_rn (q, z, 1.0); | ||||
q = __fma_rn (z, __fma_rn (z, w, s), z); | ||||
} | ||||
return q; | ||||
} | } | |||
__device_func__(double __cuda_sqrt(double a)) | __device_func__(double __cuda_sqrt(double a)) | |||
{ | { | |||
return sqrt(a); | return sqrt(a); | |||
} | } | |||
__device_func__(double __cuda_rsqrt(double a)) | __device_func__(double __cuda_rsqrt(double a)) | |||
{ | { | |||
#if !defined(__CUDABE__) | #if !defined(__CUDABE__) | |||
skipping to change at line 417 | skipping to change at line 425 | |||
t = __fma_rn (t, a2, 1.111112158368149E-001/256.0); | t = __fma_rn (t, a2, 1.111112158368149E-001/256.0); | |||
t = __fma_rn (t, a2, 1.428571416261528E-001/64.0); | t = __fma_rn (t, a2, 1.428571416261528E-001/64.0); | |||
t = __fma_rn (t, a2, 2.000000000069858E-001/16.0); | t = __fma_rn (t, a2, 2.000000000069858E-001/16.0); | |||
t = __fma_rn (t, a2, 3.333333333333198E-001/4.0); | t = __fma_rn (t, a2, 3.333333333333198E-001/4.0); | |||
t = t * a2; | t = t * a2; | |||
t = __fma_rn (t, a, a_2); | t = __fma_rn (t, a, a_2); | |||
t = t + a_1; | t = t + a_1; | |||
return t; | return t; | |||
} | } | |||
__device_func__(double __internal_exp2i_kernel(int b)) | ||||
{ | ||||
return __hiloint2double((b + 1023) << 20, 0); | ||||
} | ||||
__device_func__(double __internal_half(double a)) | ||||
{ | ||||
unsigned int ihi, ilo; | ||||
ilo = __double2loint(a); | ||||
ihi = __double2hiint(a); | ||||
return __hiloint2double(ihi - 0x00100000, ilo); | ||||
} | ||||
__device_func__(double __internal_twice(double a)) | ||||
{ | ||||
unsigned int ihi, ilo; | ||||
ilo = __double2loint(a); | ||||
ihi = __double2hiint(a); | ||||
return __hiloint2double(ihi + 0x00100000, ilo); | ||||
} | ||||
__device_func__(double __cuda_sin(double a)) | __device_func__(double __cuda_sin(double a)) | |||
{ | { | |||
double z; | double z; | |||
int i; | int i; | |||
if (__cuda___isinf(a) || (a == CUDART_ZERO)) { | if (__cuda___isinf(a) || (a == CUDART_ZERO)) { | |||
return __dmul_rn(a, CUDART_ZERO); | return __dmul_rn(a, CUDART_ZERO); | |||
} | } | |||
z = __internal_trig_reduction_kerneld(a, &i); | z = __internal_trig_reduction_kerneld(a, &i); | |||
/* here, abs(z) <= pi/4, and i has the quadrant */ | /* here, abs(z) <= pi/4, and i has the quadrant */ | |||
if (i & 1) { | if (i & 1) { | |||
skipping to change at line 502 | skipping to change at line 531 | |||
__device_func__(double __cuda_tan(double a)) | __device_func__(double __cuda_tan(double a)) | |||
{ | { | |||
double z; | double z; | |||
int i; | int i; | |||
if (__cuda___isinf(a)) { | if (__cuda___isinf(a)) { | |||
return CUDART_NAN; | return CUDART_NAN; | |||
} | } | |||
z = __internal_trig_reduction_kerneld(a, &i); | z = __internal_trig_reduction_kerneld(a, &i); | |||
/* here, abs(z) <= pi/4, and i has the quadrant */ | /* here, abs(z) <= pi/4, and i has the quadrant */ | |||
z = __internal_tan_kerneld(z); | z = __internal_tan_kerneld(z, i & 1); | |||
if (i & 1) { | ||||
z = -1.0 / z; | ||||
} | ||||
return z; | return z; | |||
} | } | |||
__device_func__(double __cuda_log(double a)) | __device_func__(double __cuda_log(double a)) | |||
{ | { | |||
double m, f, g, u, v, tmp, q, ulo, log_lo, log_hi; | double m, f, g, u, v, tmp, q, ulo, log_lo, log_hi; | |||
int ihi, ilo; | int ihi, ilo; | |||
int e = 0; | int e = 0; | |||
ihi = __double2hiint(a); | ihi = __double2hiint(a); | |||
skipping to change at line 545 | skipping to change at line 571 | |||
e = -54; | e = -54; | |||
ihi = __double2hiint(a); | ihi = __double2hiint(a); | |||
ilo = __double2loint(a); | ilo = __double2loint(a); | |||
} | } | |||
/* a = m * 2^e. m <= sqrt(2): log2(a) = log2(m) + e. | /* a = m * 2^e. m <= sqrt(2): log2(a) = log2(m) + e. | |||
* m > sqrt(2): log2(a) = log2(m/2) + (e+1) | * m > sqrt(2): log2(a) = log2(m/2) + (e+1) | |||
*/ | */ | |||
e += ((ihi >> 20) & 0x7ff) - 1023; | e += ((ihi >> 20) & 0x7ff) - 1023; | |||
m = __hiloint2double ((ihi & 0x800fffff) | 0x3ff00000, ilo); | m = __hiloint2double ((ihi & 0x800fffff) | 0x3ff00000, ilo); | |||
if (m > CUDART_SQRT_TWO) { | if (m > CUDART_SQRT_TWO) { | |||
m = m * 0.5; | m = __internal_half(m); | |||
e = e + 1; | e = e + 1; | |||
} | } | |||
/* log((1+m)/(1-m)) = 2*atanh(m). log(m) = 2*atanh ((m-1)/(m+1)) */ | /* log((1+m)/(1-m)) = 2*atanh(m). log(m) = 2*atanh ((m-1)/(m+1)) */ | |||
f = m - 1.0; | f = m - 1.0; | |||
g = m + 1.0; | g = m + 1.0; | |||
g = 1.0 / g; | g = 1.0 / g; | |||
u = f * g; | u = f * g; | |||
u = u + u; | u = u + u; | |||
/* u = 2.0 * (m - 1.0) / (m + 1.0) */ | /* u = 2.0 * (m - 1.0) / (m + 1.0) */ | |||
v = u * u; | v = u * u; | |||
q = 6.7261411553826339E-2/65536.0; | q = 6.7261411553826339E-2/65536.0; | |||
q = __fma_rn (q, v, 6.6133829643643394E-2/16384.0); | q = __fma_rn (q, v, 6.6133829643643394E-2/16384.0); | |||
q = __fma_rn (q, v, 7.6940931149150890E-2/4096.0); | q = __fma_rn (q, v, 7.6940931149150890E-2/4096.0); | |||
q = __fma_rn (q, v, 9.0908745692137444E-2/1024.0); | q = __fma_rn (q, v, 9.0908745692137444E-2/1024.0); | |||
q = __fma_rn (q, v, 1.1111111499059706E-1/256.0); | q = __fma_rn (q, v, 1.1111111499059706E-1/256.0); | |||
q = __fma_rn (q, v, 1.4285714283305975E-1/64.0); | q = __fma_rn (q, v, 1.4285714283305975E-1/64.0); | |||
q = __fma_rn (q, v, 2.0000000000007223E-1/16.0); | q = __fma_rn (q, v, 2.0000000000007223E-1/16.0); | |||
q = __fma_rn (q, v, 3.3333333333333326E-1/4.0); | q = __fma_rn (q, v, 3.3333333333333326E-1/4.0); | |||
tmp = 2.0 * (f - u); | tmp = __internal_twice (f - u); | |||
tmp = __fma_rn (-u, f, tmp); // tmp = remainder of division | tmp = __fma_rn (-u, f, tmp); // tmp = remainder of division | |||
ulo = g * tmp; // less significant quotient bits | ulo = g * tmp; // less significant quotient bits | |||
/* u + ulo = 2.0 * (m - 1.0) / (m + 1.0) to more than double precision */ | /* u + ulo = 2.0 * (m - 1.0) / (m + 1.0) to more than double precision */ | |||
q = q * v; | q = q * v; | |||
q = q * u; | q = q * u; | |||
/* log_hi + log_lo = log(m) to more than double precision */ | /* log_hi + log_lo = log(m) to more than double precision */ | |||
log_hi = u; | log_hi = u; | |||
log_lo = ulo + q; | log_lo = ulo + q; | |||
/* log_hi + log_lo = log(m)+e*log(2)=log(a) to more than double precision */ | /* log_hi + log_lo = log(m)+e*log(2)=log(a) to more than double precision */ | |||
q = __fma_rn ( e, CUDART_LN2_HI, log_hi); | q = __fma_rn ( e, CUDART_LN2_HI, log_hi); | |||
tmp = __fma_rn (-e, CUDART_LN2_HI, q); | tmp = __fma_rn (-e, CUDART_LN2_HI, q); | |||
tmp = tmp - log_hi; | tmp = tmp - log_hi; | |||
log_hi = q; | log_hi = q; | |||
log_lo = log_lo - tmp; | log_lo = log_lo - tmp; | |||
log_lo = __fma_rn (e, CUDART_LN2_LO, log_lo); | log_lo = __fma_rn (e, CUDART_LN2_LO, log_lo); | |||
return log_hi + log_lo; | return log_hi + log_lo; | |||
} | } | |||
/* Requires |x.y| > |y.y|. 8 DP operations */ | /* Requires |x.y| > |y.y|. 8 DP operations */ | |||
__device_func__(double2 ddadd_xgty (double2 x, double2 y)) | __device_func__(double2 __internal_ddadd_xgty (double2 x, double2 y)) | |||
{ | { | |||
double2 z; | double2 z; | |||
#if defined(__GNUC__) && !defined(__CUDABE__) | #if defined(__GNUC__) && !defined(__CUDABE__) | |||
volatile double r, s, e; | volatile double r, s, e; | |||
#else | #else | |||
double r, s, e; | double r, s, e; | |||
#endif | #endif | |||
r = x.y + y.y; | r = x.y + y.y; | |||
e = x.y - r; | e = x.y - r; | |||
s = ((e + y.y) + y.x) + x.x; | s = ((e + y.y) + y.x) + x.x; | |||
z.y = e = r + s; | z.y = e = r + s; | |||
z.x = (r - e) + s; | z.x = (r - e) + s; | |||
return z; | return z; | |||
} | } | |||
/* Take full advantage of FMA. Only 8 DP operations */ | /* Take full advantage of FMA. Only 8 DP operations */ | |||
__device_func__(double2 ddmul (double2 x, double2 y)) | __device_func__(double2 __internal_ddmul (double2 x, double2 y)) | |||
{ | { | |||
#if defined(__GNUC__) && !defined(__CUDABE__) | #if defined(__GNUC__) && !defined(__CUDABE__) | |||
volatile double e; | volatile double e; | |||
#else | #else | |||
double e; | double e; | |||
#endif | #endif | |||
double2 t, z; | double2 t, z; | |||
t.y = x.y * y.y; | t.y = x.y * y.y; | |||
t.x = __fma_rn (x.y, y.y, -t.y); | t.x = __fma_rn (x.y, y.y, -t.y); | |||
t.x = __fma_rn (x.x, y.x, t.x); | t.x = __fma_rn (x.x, y.x, t.x); | |||
skipping to change at line 644 | skipping to change at line 670 | |||
expo = (ihi >> 20) & 0x7ff; | expo = (ihi >> 20) & 0x7ff; | |||
expo -= 54; | expo -= 54; | |||
} | } | |||
expo -= 1023; | expo -= 1023; | |||
/* log(a) = log(m*2^expo) = | /* log(a) = log(m*2^expo) = | |||
log(m) + log(2)*expo, if m < sqrt(2), | log(m) + log(2)*expo, if m < sqrt(2), | |||
log(m*0.5) + log(2)*(expo+1), if m >= sqrt(2) | log(m*0.5) + log(2)*(expo+1), if m >= sqrt(2) | |||
*/ | */ | |||
m = __hiloint2double((ihi & 0x800fffff) | 0x3ff00000, ilo); | m = __hiloint2double((ihi & 0x800fffff) | 0x3ff00000, ilo); | |||
if (m > CUDART_SQRT_TWO) { | if (m > CUDART_SQRT_TWO) { | |||
m = m * 0.5; | m = __internal_half(m); | |||
expo = expo + 1; | expo = expo + 1; | |||
} | } | |||
/* compute log(m) with extended precision using an algorithm derived from | /* compute log(m) with extended precision using an algorithm derived from | |||
* P.T.P. Tang, "Table Driven Implementation of the Logarithm Function", | * P.T.P. Tang, "Table Driven Implementation of the Logarithm Function", | |||
* TOMS, Vol. 16., No. 4, December 1990, pp. 378-400. A modified polynomi al | * TOMS, Vol. 16., No. 4, December 1990, pp. 378-400. A modified polynomi al | |||
* approximation to atanh(x) on the interval [-0.1716, 0.1716] is utilize d. | * approximation to atanh(x) on the interval [-0.1716, 0.1716] is utilize d. | |||
*/ | */ | |||
f = m - 1.0; | f = m - 1.0; | |||
g = m + 1.0; | g = m + 1.0; | |||
g = 1.0 / g; | g = 1.0 / g; | |||
skipping to change at line 668 | skipping to change at line 694 | |||
v = u * u; | v = u * u; | |||
q = 6.6253631649203309E-2/65536.0; | q = 6.6253631649203309E-2/65536.0; | |||
q = __fma_rn (q, v, 6.6250935587260612E-2/16384.0); | q = __fma_rn (q, v, 6.6250935587260612E-2/16384.0); | |||
q = __fma_rn (q, v, 7.6935437806732829E-2/4096.0); | q = __fma_rn (q, v, 7.6935437806732829E-2/4096.0); | |||
q = __fma_rn (q, v, 9.0908878711093280E-2/1024.0); | q = __fma_rn (q, v, 9.0908878711093280E-2/1024.0); | |||
q = __fma_rn (q, v, 1.1111111322892790E-1/256.0); | q = __fma_rn (q, v, 1.1111111322892790E-1/256.0); | |||
q = __fma_rn (q, v, 1.4285714284546502E-1/64.0); | q = __fma_rn (q, v, 1.4285714284546502E-1/64.0); | |||
q = __fma_rn (q, v, 2.0000000000003113E-1/16.0); | q = __fma_rn (q, v, 2.0000000000003113E-1/16.0); | |||
q = q * v; | q = q * v; | |||
/* u + ulo = 2.0 * (m - 1.0) / (m + 1.0) to more than double precision */ | /* u + ulo = 2.0 * (m - 1.0) / (m + 1.0) to more than double precision */ | |||
tmp = 2.0 * (f - u); | tmp = __internal_twice (f - u); | |||
tmp = __fma_rn (-u, f, tmp); // tmp = remainder of division | tmp = __fma_rn (-u, f, tmp); // tmp = remainder of division | |||
ulo = g * tmp; // less significand quotient bits | ulo = g * tmp; // less significand quotient bits | |||
/* switch to double-double at this point */ | /* switch to double-double at this point */ | |||
qq.y = q; | qq.y = q; | |||
qq.x = 0.0; | qq.x = 0.0; | |||
uu.y = u; | uu.y = u; | |||
uu.x = ulo; | uu.x = ulo; | |||
cc.y = 3.3333333333333331E-1/4.0; | cc.y = 3.3333333333333331E-1/4.0; | |||
cc.x = -9.8201492846582465E-18/4.0; | cc.x = -9.8201492846582465E-18/4.0; | |||
qq = ddadd_xgty (cc, qq); | qq = __internal_ddadd_xgty (cc, qq); | |||
/* compute log(m) in double-double format */ | /* compute log(m) in double-double format */ | |||
qq = ddmul(qq, uu); | qq = __internal_ddmul(qq, uu); | |||
qq = ddmul(qq, uu); | qq = __internal_ddmul(qq, uu); | |||
qq = ddmul(qq, uu); | qq = __internal_ddmul(qq, uu); | |||
uu = ddadd_xgty (uu, qq); | uu = __internal_ddadd_xgty (uu, qq); | |||
u = uu.y; | u = uu.y; | |||
ulo = uu.x; | ulo = uu.x; | |||
/* log(2)*expo in double-double format */ | /* log(2)*expo in double-double format */ | |||
tt.y = expo * 6.9314718055966296e-001; /* multiplication is exact */ | tt.y = expo * 6.9314718055966296e-001; /* multiplication is exact */ | |||
tt.x = expo * 2.8235290563031577e-013; | tt.x = expo * 2.8235290563031577e-013; | |||
/* log(a) = log(m) + log(2)*expo; if expo != 0, |log(2)*expo| > |log(m)| */ | /* log(a) = log(m) + log(2)*expo; if expo != 0, |log(2)*expo| > |log(m)| */ | |||
res = ddadd_xgty (tt, uu); | res = __internal_ddadd_xgty (tt, uu); | |||
return res; | return res; | |||
} | } | |||
__device_func__(double __cuda_log2(double a)) | __device_func__(double __cuda_log2(double a)) | |||
{ | { | |||
double t; | double t; | |||
t = __cuda_log(a); | t = __cuda_log(a); | |||
return __fma_rn (t, CUDART_L2E_HI, t * CUDART_L2E_LO); | return __fma_rn (t, CUDART_L2E_HI, t * CUDART_L2E_LO); | |||
} | } | |||
skipping to change at line 884 | skipping to change at line 910 | |||
z = 1.632386098183803E-010; | z = 1.632386098183803E-010; | |||
z = __fma_rn (z, a2, 2.504854501385687E-008); | z = __fma_rn (z, a2, 2.504854501385687E-008); | |||
z = __fma_rn (z, a2, 2.755734274788706E-006); | z = __fma_rn (z, a2, 2.755734274788706E-006); | |||
z = __fma_rn (z, a2, 1.984126976294102E-004); | z = __fma_rn (z, a2, 1.984126976294102E-004); | |||
z = __fma_rn (z, a2, 8.333333333452911E-003); | z = __fma_rn (z, a2, 8.333333333452911E-003); | |||
z = __fma_rn (z, a2, 1.666666666666606E-001); | z = __fma_rn (z, a2, 1.666666666666606E-001); | |||
z = z * a2; | z = z * a2; | |||
z = __fma_rn (z, a, a); | z = __fma_rn (z, a, a); | |||
} else if (a < 2.0) { /* work around accuracy issue in vicinity of 1.4 */ | } else if (a < 2.0) { /* work around accuracy issue in vicinity of 1.4 */ | |||
z = __cuda_expm1(a); | z = __cuda_expm1(a); | |||
z = 0.5 * (z + z / (z + 1.0)); | z = __internal_half (z + z / (z + 1.0)); | |||
} else { | } else { | |||
z = __internal_exp_kernel(a, -1); | z = __internal_exp_kernel(a, -1); | |||
z = z + (1.0 / (-4.0 * z)); | z = z + (1.0 / (-4.0 * z)); | |||
if (a >= CUDART_LN2_X_1025) { | if (a >= CUDART_LN2_X_1025) { | |||
z = CUDART_INF; /* overflow -> infinity */ | z = CUDART_INF; /* overflow -> infinity */ | |||
} | } | |||
} | } | |||
z = __cuda_copysign(z, s); | z = __cuda_copysign(z, s); | |||
return z; | return z; | |||
} | } | |||
skipping to change at line 931 | skipping to change at line 957 | |||
t = __fma_rn (t, a, a); | t = __fma_rn (t, a, a); | |||
a = __cuda_copysign(t, a); | a = __cuda_copysign(t, a); | |||
} | } | |||
return a; | return a; | |||
} | } | |||
__device_func__(double __internal_atan_kernel(double a)) | __device_func__(double __internal_atan_kernel(double a)) | |||
{ | { | |||
double t, a2; | double t, a2; | |||
a2 = a * a; | a2 = a * a; | |||
t = 3.078869985415414E-005; | t = -2.0258553044438358E-005 ; | |||
t = __fma_rn (t, a2, -3.242465497855030E-004); | t = __fma_rn (t, a2, 2.2302240345758510E-004); | |||
t = __fma_rn (t, a2, 1.616330732831762E-003); | t = __fma_rn (t, a2, -1.1640717779930576E-003); | |||
t = __fma_rn (t, a2, -5.109062682125205E-003); | t = __fma_rn (t, a2, 3.8559749383629918E-003); | |||
t = __fma_rn (t, a2, 1.161935628866633E-002); | t = __fma_rn (t, a2, -9.1845592187165485E-003); | |||
t = __fma_rn (t, a2, -2.056787727035855E-002); | t = __fma_rn (t, a2, 1.6978035834597331E-002); | |||
t = __fma_rn (t, a2, 3.014410555893613E-002); | t = __fma_rn (t, a2, -2.5826796814495994E-002); | |||
t = __fma_rn (t, a2, -3.868025408479192E-002); | t = __fma_rn (t, a2, 3.4067811082715123E-002); | |||
t = __fma_rn (t, a2, 4.575089047336649E-002); | t = __fma_rn (t, a2, -4.0926382420509971E-002); | |||
t = __fma_rn (t, a2, -5.205914168425584E-002); | t = __fma_rn (t, a2, 4.6739496199157994E-002); | |||
t = __fma_rn (t, a2, 5.868796955641944E-002); | t = __fma_rn (t, a2, -5.2392330054601317E-002); | |||
t = __fma_rn (t, a2, -6.664240989090695E-002); | t = __fma_rn (t, a2, 5.8773077721790849E-002); | |||
t = __fma_rn (t, a2, 7.691989357768611E-002); | t = __fma_rn (t, a2, -6.6658603633512573E-002); | |||
t = __fma_rn (t, a2, -9.090879690584197E-002); | t = __fma_rn (t, a2, 7.6922129305867837E-002); | |||
t = __fma_rn (t, a2, 1.111110931160943E-001); | t = __fma_rn (t, a2, -9.0909012354005225E-002); | |||
t = __fma_rn (t, a2, -1.428571421925286E-001); | t = __fma_rn (t, a2, 1.1111110678749424E-001); | |||
t = __fma_rn (t, a2, 1.999999999874377E-001); | t = __fma_rn (t, a2, -1.4285714271334815E-001); | |||
t = __fma_rn (t, a2, -3.333333333332495E-001); | t = __fma_rn (t, a2, 1.9999999999755019E-001); | |||
t = __fma_rn (t, a2, -3.3333333333331860E-001); | ||||
t = t * a2; | t = t * a2; | |||
t = __fma_rn (t, a, a); | t = __fma_rn (t, a, a); | |||
return t; | return t; | |||
} | } | |||
__device_func__(double __cuda_atan2(double a, double b)) | __device_func__(double __cuda_atan2(double a, double b)) | |||
{ | { | |||
double t0, t1, t3; | double t0, t1, t3; | |||
if (__cuda___isnan(a) || __cuda___isnan(b)) { | if (__cuda___isnan(a) || __cuda___isnan(b)) { | |||
return a + b; | return a + b; | |||
} | } | |||
/* reduce arguments to first octant */ | /* reduce arguments to first octant */ | |||
/* r = (|x| < |y|) ? (|x| / |y|) : (|y| / |x|) */ | /* r = (|x| < |y|) ? (|x| / |y|) : (|y| / |x|) */ | |||
t3 = __cuda_fabs(b); | t3 = __cuda_fabs(b); | |||
t1 = __cuda_fabs(a); | t1 = __cuda_fabs(a); | |||
if (t3 == 0.0 && t1 == 0.0) { | if (t3 == 0.0 && t1 == 0.0) { | |||
t3 = __cuda___signbit(b) ? CUDART_PI : 0; | t3 = __cuda___signbit(b) ? CUDART_PI : 0; | |||
} else if (__cuda___isinf(t3) && __cuda___isinf(t1)) { | } else if (__cuda___isinf(t3) && __cuda___isinf(t1)) { | |||
t3 = __cuda___signbit(b) ? CUDART_3PIO4 : CUDART_PIO4; | t3 = __cuda___signbit(b) ? CUDART_3PIO4 : CUDART_PIO4; | |||
} else { | } else { | |||
/* can't use min, max because they do not propagate NaNs */ | t0 = __cuda_fmax (t1, t3); | |||
if (t3 < t1) { | t1 = __cuda_fmin (t1, t3); | |||
t0 = t1; | ||||
t1 = t3; | ||||
} else { | ||||
t0 = t3; | ||||
t1 = t1; | ||||
} | ||||
t3 = t1 / t0; | t3 = t1 / t0; | |||
t3 = __internal_atan_kernel(t3); | t3 = __internal_atan_kernel(t3); | |||
/* Map result according to octant. */ | /* Map result according to octant. */ | |||
if (__cuda_fabs(a) > __cuda_fabs(b)) t3 = CUDART_PIO2 - t3; | if (__cuda_fabs(a) > __cuda_fabs(b)) t3 = CUDART_PIO2 - t3; | |||
if (b < 0.0) t3 = CUDART_PI - t3; | if (b < 0.0) t3 = CUDART_PI - t3; | |||
} | } | |||
t3 = __cuda_copysign(t3, a); | t3 = __cuda_copysign(t3, a); | |||
return t3; | return t3; | |||
} | } | |||
skipping to change at line 1023 | skipping to change at line 1044 | |||
r = __fma_rn (r, b, 1.745227928732326E-002); | r = __fma_rn (r, b, 1.745227928732326E-002); | |||
r = __fma_rn (r, b, 1.000422754245580E-002); | r = __fma_rn (r, b, 1.000422754245580E-002); | |||
r = __fma_rn (r, b, 1.418108777515123E-002); | r = __fma_rn (r, b, 1.418108777515123E-002); | |||
r = __fma_rn (r, b, 1.733194598980628E-002); | r = __fma_rn (r, b, 1.733194598980628E-002); | |||
r = __fma_rn (r, b, 2.237350511593569E-002); | r = __fma_rn (r, b, 2.237350511593569E-002); | |||
r = __fma_rn (r, b, 3.038188875134962E-002); | r = __fma_rn (r, b, 3.038188875134962E-002); | |||
r = __fma_rn (r, b, 4.464285849810986E-002); | r = __fma_rn (r, b, 4.464285849810986E-002); | |||
r = __fma_rn (r, b, 7.499999998342270E-002); | r = __fma_rn (r, b, 7.499999998342270E-002); | |||
r = __fma_rn (r, b, 1.666666666667375E-001); | r = __fma_rn (r, b, 1.666666666667375E-001); | |||
r = r * b; | r = r * b; | |||
r = __fma_rn (r, a, a); | ||||
return r; | return r; | |||
} | } | |||
__device_func__(double __cuda_asin(double a)) | __device_func__(double __cuda_asin(double a)) | |||
{ | { | |||
double t0, t1; | double t0, t1; | |||
t0 = __cuda_fabs(a); | t0 = __cuda_fabs(a); | |||
if (t0 > 1.0) { | if (t0 > 1.0) { | |||
return CUDART_NAN; | return CUDART_NAN; | |||
} | } | |||
if (t0 > 0.575) { | if (t0 > 0.575) { | |||
t1 = __fma_rn (-0.5, t0, 0.5); | t1 = __fma_rn (-0.5, t0, 0.5); | |||
t0 = __cuda_sqrt (t1); | t0 = __cuda_sqrt (t1); | |||
t1 = __internal_asin_kernel (t0, t1); | t1 = __internal_asin_kernel (t0, t1); | |||
t1 = __fma_rn (2.0, t1, -CUDART_PIO2_LO); | t0 = -2.0 * t0; | |||
t1 = CUDART_PIO2_HI - t1; | t1 = __fma_rn (t0, t1, CUDART_PIO2_LO); | |||
t0 = t0 + CUDART_PIO4_HI; | ||||
t1 = t0 + t1; | ||||
t1 = t1 + CUDART_PIO4_HI; | ||||
} else { | } else { | |||
t1 = t0 * t0; | t1 = t0 * t0; | |||
t1 = __internal_asin_kernel (t0, t1); | t1 = __internal_asin_kernel (t0, t1); | |||
t1 = __fma_rn (t1, t0, t0); | ||||
} | } | |||
return __cuda_copysign(t1, a); | return __cuda_copysign(t1, a); | |||
} | } | |||
__device_func__(double __cuda_acos(double a)) | __device_func__(double __cuda_acos(double a)) | |||
{ | { | |||
double t0, t1; | double t0, t1; | |||
#if !defined(__CUDABE__) | #if !defined(__CUDABE__) | |||
if (__cuda___isnan(a)) { | if (__cuda___isnan(a)) { | |||
return a + a; | return a + a; | |||
} | } | |||
#endif | #endif | |||
t0 = __cuda_fabs (a); | t0 = __cuda_fabs (a); | |||
if (t0 > 0.575) { | if (t0 > 0.575) { | |||
t1 = __fma_rn (-0.5, t0, 0.5); | t1 = __fma_rn (-0.5, t0, 0.5); | |||
t0 = __cuda_sqrt(t1); | t0 = __cuda_sqrt(t1); | |||
t0 = __internal_asin_kernel (t0, t1); | t1 = __internal_asin_kernel (t0, t1); | |||
t0 = __fma_rn (t1, t0, t0); | ||||
t0 = 2.0 * t0; | t0 = 2.0 * t0; | |||
if (__cuda___signbit(a)) { | if (__cuda___signbit(a)) { | |||
t0 = __fma_rn (1.0, t0, -CUDART_PI_LO); | t0 = __fma_rn (1.0, t0, -CUDART_PI_LO); | |||
t0 = CUDART_PI_HI - t0; | t0 = CUDART_PI_HI - t0; | |||
} | } | |||
} else { | } else { | |||
t1 = t0 * t0; | t1 = t0 * t0; | |||
t0 = __internal_asin_kernel (t0, t1); | t1 = __internal_asin_kernel (t0, t1); | |||
t0 = __fma_rn (t1, t0, t0); | ||||
if (__cuda___signbit(a)) { | if (__cuda___signbit(a)) { | |||
t0 = __fma_rn (1.0, t0, +CUDART_PIO2_LO); | t0 = __fma_rn (1.0, t0, +CUDART_PIO2_LO); | |||
t0 = CUDART_PIO2_HI + t0; | t0 = CUDART_PIO2_HI + t0; | |||
} else { | } else { | |||
t0 = __fma_rn (1.0, t0, -CUDART_PIO2_LO); | t0 = __fma_rn (1.0, t0, -CUDART_PIO2_LO); | |||
t0 = CUDART_PIO2_HI - t0; | t0 = CUDART_PIO2_HI - t0; | |||
} | } | |||
} | } | |||
return t0; | return t0; | |||
} | } | |||
__device_func__(double __cuda_acosh(double a)) | __device_func__(double __cuda_acosh(double a)) | |||
{ | { | |||
double s, t; | double t; | |||
#if !defined(__CUDABE__) | #if !defined(__CUDABE__) | |||
if (__cuda___isnan(a)) { | if (__cuda___isnan(a)) { | |||
return a + a; | return a + a; | |||
} | } | |||
#endif | #endif | |||
t = a - 1.0; | t = a - 1.0; | |||
if (__cuda_fabs(t) > CUDART_TWO_TO_52) { | if (__cuda_fabs(t) > CUDART_TWO_TO_52) { | |||
/* for large a, acosh = log(2*a) */ | /* for large a, acosh = log(2*a) */ | |||
return CUDART_LN2 + __cuda_log(a); | return CUDART_LN2 + __cuda_log(a); | |||
} else { | } else { | |||
s = a + 1.0; | t = t + __cuda_sqrt(__fma_rn(a, t, t)); | |||
t = t + __cuda_sqrt(s * t); | ||||
return __cuda_log1p(t); | return __cuda_log1p(t); | |||
} | } | |||
} | } | |||
__device_func__(double __cuda_asinh(double a)) | __device_func__(double __cuda_asinh(double a)) | |||
{ | { | |||
#if SLIGHTLY_MORE_ACCURATE_BUT_SLOWER | #if SLIGHTLY_MORE_ACCURATE_BUT_SLOWER | |||
double fa, oofa, t; | double fa, oofa, t; | |||
fa = __cuda_fabs(a); | fa = __cuda_fabs(a); | |||
if (fa > 8.9884657373828596e+307) { /* prevent intermediate underflow */ | if (fa > 8.9884657373828596e+307) { /* prevent intermediate underflow */ | |||
skipping to change at line 1449 | skipping to change at line 1474 | |||
z = __fma_rn (z, x, -5.9216643735369393e-004); | z = __fma_rn (z, x, -5.9216643735369393e-004); | |||
z = __fma_rn (z, x, 6.9728137583658571e-005); | z = __fma_rn (z, x, 6.9728137583658571e-005); | |||
z = __fma_rn (z, x, 7.8403922172006662e-004); | z = __fma_rn (z, x, 7.8403922172006662e-004); | |||
z = __fma_rn (z, x, -2.2947209362139917e-004); | z = __fma_rn (z, x, -2.2947209362139917e-004); | |||
z = __fma_rn (z, x, -2.6813271604938273e-003); | z = __fma_rn (z, x, -2.6813271604938273e-003); | |||
z = __fma_rn (z, x, 3.4722222222222220e-003); | z = __fma_rn (z, x, 3.4722222222222220e-003); | |||
z = __fma_rn (z, x, 8.3333333333333329e-002); | z = __fma_rn (z, x, 8.3333333333333329e-002); | |||
z = __fma_rn (z, x, 1.0000000000000000e+000); | z = __fma_rn (z, x, 1.0000000000000000e+000); | |||
return z; | return z; | |||
} | } | |||
/* 11 DP operations */ | ||||
__device_func__(double2 dbldbladd (double2 x, double2 y)) | ||||
{ | ||||
double2 z; | ||||
#if defined(__GNUC__) && !defined(__CUDABE__) | ||||
volatile double t1, t2, e; | ||||
#else | ||||
double t1, t2, e; | ||||
#endif | ||||
t1 = x.y + y.y; | ||||
e = t1 - x.y; | ||||
t2 = ((y.y - e) + (x.y - (t1 - e))) + x.x + y.x; | ||||
z.y = e = t1 + t2; | ||||
z.x = t2 - (e - t1); | ||||
return z; | ||||
} | ||||
__device_func__(double __internal_tgamma_stirling(double a)) | __device_func__(double __internal_tgamma_stirling(double a)) | |||
{ | { | |||
if (a < 1.7162437695630274e+002) { | if (a < 1.7162437695630274e+002) { | |||
#if defined(__GNUC__) && !defined(__CUDABE__) | #if defined(__GNUC__) && !defined(__CUDABE__) | |||
volatile double t_hi, t_lo, e; | volatile double t_hi, t_lo, e; | |||
#else | #else | |||
double t_hi, t_lo, e; | double t_hi, t_lo, e; | |||
#endif | #endif | |||
double2 loga, prod; | double2 loga, prod; | |||
skipping to change at line 1491 | skipping to change at line 1500 | |||
/* compute (a - 0.5) * log(a) in double-double format */ | /* compute (a - 0.5) * log(a) in double-double format */ | |||
t_hi = loga.y * b; | t_hi = loga.y * b; | |||
t_lo = __fma_rn (loga.y, b, -t_hi); | t_lo = __fma_rn (loga.y, b, -t_hi); | |||
t_lo = __fma_rn (loga.x, b, t_lo); | t_lo = __fma_rn (loga.x, b, t_lo); | |||
prod.y = e = t_hi + t_lo; | prod.y = e = t_hi + t_lo; | |||
prod.x = (t_hi - e) + t_lo; | prod.x = (t_hi - e) + t_lo; | |||
/* compute (a - 0.5) * log(a) - a in double-double format */ | /* compute (a - 0.5) * log(a) - a in double-double format */ | |||
loga.y = -a; | loga.y = -a; | |||
loga.x = 0.0; | loga.x = 0.0; | |||
prod = dbldbladd (prod, loga); | prod = __internal_ddadd_xgty (prod, loga); | |||
/* compute pow(a,b) = exp(b*log(a)) */ | /* compute pow(a,b) = exp(b*log(a)) */ | |||
a = __cuda_exp(prod.y); | a = __cuda_exp(prod.y); | |||
/* prevent -INF + INF = NaN */ | /* prevent -INF + INF = NaN */ | |||
if (!__cuda___isinf(a)) { | if (!__cuda___isinf(a)) { | |||
/* if prod.x is much smaller than prod.y, then exp(prod.y + prod.x) ~ = | /* if prod.x is much smaller than prod.y, then exp(prod.y + prod.x) ~ = | |||
* exp(prod.y) + prod.x * exp(prod.y) | * exp(prod.y) + prod.x * exp(prod.y) | |||
*/ | */ | |||
a = __fma_rn (a, prod.x, a); | a = __fma_rn (a, prod.x, a); | |||
} | } | |||
skipping to change at line 1515 | skipping to change at line 1524 | |||
return CUDART_INF; | return CUDART_INF; | |||
} | } | |||
} | } | |||
__device_func__(double __cuda_tgamma(double a)) | __device_func__(double __cuda_tgamma(double a)) | |||
{ | { | |||
double s, xx, x = a; | double s, xx, x = a; | |||
if (__cuda___isnan(a)) { | if (__cuda___isnan(a)) { | |||
return a + a; | return a + a; | |||
} | } | |||
if (fabs(x) < 15.0) { | if (__cuda_fabs(x) < 15.0) { | |||
/* Based on: Kraemer, W.: "Berechnung der Gammafunktion G(x) fuer reel le | /* Based on: Kraemer, W.: "Berechnung der Gammafunktion G(x) fuer reel le | |||
* Punkt- und Intervallargumente". Zeitschrift fuer angewandte Mathema tik | * Punkt- und Intervallargumente". Zeitschrift fuer angewandte Mathema tik | |||
* und Mechanik, Vol. 70 (1990), No. 6, pp. 581-584 | * und Mechanik, Vol. 70 (1990), No. 6, pp. 581-584 | |||
*/ | */ | |||
if (x >= 0.0) { | if (x >= 0.0) { | |||
s = 1.0; | s = 1.0; | |||
xx = x; | xx = x; | |||
while (xx > 1.5) { | while (xx > 1.5) { | |||
s = __fma_rn(s, xx, -s); | s = __fma_rn(s, xx, -s); | |||
xx = xx - 1.0; | xx = xx - 1.0; | |||
skipping to change at line 1566 | skipping to change at line 1575 | |||
if (x == __cuda_floor(x)) { | if (x == __cuda_floor(x)) { | |||
return CUDART_NAN; | return CUDART_NAN; | |||
} | } | |||
if (x < -185.0) { | if (x < -185.0) { | |||
int negative; | int negative; | |||
x = __cuda_floor(x); | x = __cuda_floor(x); | |||
negative = ((x - (2.0 * __cuda_floor(0.5 * x))) == 1.0); | negative = ((x - (2.0 * __cuda_floor(0.5 * x))) == 1.0); | |||
return negative ? CUDART_NEG_ZERO : CUDART_ZERO; | return negative ? CUDART_NEG_ZERO : CUDART_ZERO; | |||
} | } | |||
/* compute sin(pi*x) accurately */ | /* compute sin(pi*x) accurately */ | |||
xx = __cuda_rint (2.0 * x); | xx = __cuda_rint (__internal_twice(x)); | |||
quot = (int)xx; | quot = (int)xx; | |||
xx = __fma_rn (-0.5, xx, x); | xx = __fma_rn (-0.5, xx, x); | |||
xx = xx * CUDART_PI; | xx = xx * CUDART_PI; | |||
if (quot & 1) { | if (quot & 1) { | |||
xx = __internal_cos_kerneld (xx); | xx = __internal_cos_kerneld (xx); | |||
} else { | } else { | |||
xx = __internal_sin_kerneld (xx); | xx = __internal_sin_kerneld (xx); | |||
} | } | |||
if (quot & 2) { | if (quot & 2) { | |||
xx = -xx; | xx = -xx; | |||
} | } | |||
x = fabs (x); | x = __cuda_fabs (x); | |||
s = __cuda_exp (-x); | s = __cuda_exp (-x); | |||
t = x - 0.5; | t = x - 0.5; | |||
if (x > 140.0) t = 0.5 * t; | if (x > 140.0) t = __internal_half(t); | |||
t = __cuda_pow (x, t); | t = __cuda_pow (x, t); | |||
if (x > 140.0) s = s * t; | if (x > 140.0) s = s * t; | |||
s = s * __internal_stirling_poly (x); | s = s * __internal_stirling_poly (x); | |||
s = s * x; | s = s * x; | |||
s = s * xx; | s = s * xx; | |||
s = 1.0 / s; | s = 1.0 / s; | |||
s = __fma_rn (s, CUDART_SQRT_PIO2_HI, CUDART_SQRT_PIO2_LO * s); | s = __fma_rn (s, CUDART_SQRT_PIO2_HI, CUDART_SQRT_PIO2_LO * s); | |||
s = s / t; | s = s / t; | |||
return s; | return s; | |||
} | } | |||
skipping to change at line 1617 | skipping to change at line 1626 | |||
*/ | */ | |||
s = 1.0 / a; | s = 1.0 / a; | |||
t = s * s; | t = s * s; | |||
sum = -0.1633436431e-2; | sum = -0.1633436431e-2; | |||
sum = __fma_rn (sum, t, 0.83645878922e-3); | sum = __fma_rn (sum, t, 0.83645878922e-3); | |||
sum = __fma_rn (sum, t, -0.5951896861197e-3); | sum = __fma_rn (sum, t, -0.5951896861197e-3); | |||
sum = __fma_rn (sum, t, 0.793650576493454e-3); | sum = __fma_rn (sum, t, 0.793650576493454e-3); | |||
sum = __fma_rn (sum, t, -0.277777777735865004e-2); | sum = __fma_rn (sum, t, -0.277777777735865004e-2); | |||
sum = __fma_rn (sum, t, 0.833333333333331018375e-1); | sum = __fma_rn (sum, t, 0.833333333333331018375e-1); | |||
sum = __fma_rn (sum, s, 0.918938533204672); | sum = __fma_rn (sum, s, 0.918938533204672); | |||
s = 0.5 * __cuda_log (a); | s = __internal_half(__cuda_log (a)); | |||
t = a - 0.5; | t = a - 0.5; | |||
s = s * t; | s = s * t; | |||
t = s - a; | t = s - a; | |||
s = s + sum; | s = s + sum; | |||
t = t + s; | t = t + s; | |||
return t; | return t; | |||
} else { | } else { | |||
a = a - 3.0; | a = a - 3.0; | |||
s = -4.02412642744125560E+003; | s = -4.02412642744125560E+003; | |||
s = __fma_rn (s, a, -2.97693796998962000E+005); | s = __fma_rn (s, a, -2.97693796998962000E+005); | |||
skipping to change at line 1744 | skipping to change at line 1753 | |||
if (quot & 1) { | if (quot & 1) { | |||
i = __internal_cos_kerneld(i); | i = __internal_cos_kerneld(i); | |||
} else { | } else { | |||
i = __internal_sin_kerneld(i); | i = __internal_sin_kerneld(i); | |||
} | } | |||
i = __cuda_fabs(i); | i = __cuda_fabs(i); | |||
t = __cuda_log(CUDART_PI / (i * a)) - t; | t = __cuda_log(CUDART_PI / (i * a)) - t; | |||
return t; | return t; | |||
} | } | |||
__device_func__(double __internal_exp2i_kernel(int b)) | ||||
{ | ||||
return __hiloint2double((b + 1023) << 20, 0); | ||||
} | ||||
__device_func__(double __cuda_ldexp(double a, int b)) | __device_func__(double __cuda_ldexp(double a, int b)) | |||
{ | { | |||
double fa = __cuda_fabs (a); | double fa = __cuda_fabs (a); | |||
if ((fa == CUDART_ZERO) || (fa == CUDART_INF) || (!(fa <= CUDART_INF))) { | if ((fa == CUDART_ZERO) || (fa == CUDART_INF) || (!(fa <= CUDART_INF))) { | |||
return a + a; | return a + a; | |||
} | } | |||
if (b == 0) { | if (b == 0) { | |||
return a; | return a; | |||
} | } | |||
if (b > 2200) b = 2200; | if (b > 2200) b = 2200; | |||
End of changes. 41 change blocks. | ||||
106 lines changed or deleted | 110 lines changed or added | |||
sm_11_atomic_functions.h | sm_11_atomic_functions.h | |||
---|---|---|---|---|
skipping to change at line 214 | skipping to change at line 214 | |||
} | } | |||
#endif /* !CUDA_NO_SM_12_ATOMIC_INTRINSICS */ | #endif /* !CUDA_NO_SM_12_ATOMIC_INTRINSICS */ | |||
#elif !defined(__CUDACC__) | #elif !defined(__CUDACC__) | |||
#include "crt/func_macro.h" | #include "crt/func_macro.h" | |||
#if !defined(__CUDABE__) | #if !defined(__CUDABE__) | |||
#if defined(__MULTI_CORE__) | ||||
#define __iAtomicAdd(address, val) \ | ||||
__builtin___iAtomicAdd(address, val) | ||||
#define __uAtomicAdd(address, val) \ | ||||
__builtin___uAtomicAdd(address, val) | ||||
#define __iAtomicExch(address, val) \ | ||||
__builtin___iAtomicExch(address, val) | ||||
#define __uAtomicExch(address, val) \ | ||||
__builtin___uAtomicExch(address, val) | ||||
#define __fAtomicExch(address, val) \ | ||||
__builtin___fAtomicExch(address, val) | ||||
#define __iAtomicMin(address, val) \ | ||||
__builtin___iAtomicMin(address, val) | ||||
#define __uAtomicMin(address, val) \ | ||||
__builtin___uAtomicMin(address, val) | ||||
#define __iAtomicMax(address, val) \ | ||||
__builtin___iAtomicMax(address, val) | ||||
#define __uAtomicMax(address, val) \ | ||||
__builtin___uAtomicMax(address, val) | ||||
#define __uAtomicInc(address, val) \ | ||||
__builtin___uAtomicInc(address, val) | ||||
#define __uAtomicDec(address, val) \ | ||||
__builtin___uAtomicDec(address, val) | ||||
#define __iAtomicAnd(address, val) \ | ||||
__builtin___iAtomicAnd(address, val) | ||||
#define __uAtomicAnd(address, val) \ | ||||
__builtin___uAtomicAnd(address, val) | ||||
#define __iAtomicOr(address, val) \ | ||||
__builtin___iAtomicOr(address, val) | ||||
#define __uAtomicOr(address, val) \ | ||||
__builtin___uAtomicOr(address, val) | ||||
#define __iAtomicXor(address, val) \ | ||||
__builtin___iAtomicXor(address, val) | ||||
#define __uAtomicXor(address, val) \ | ||||
__builtin___uAtomicXor(address, val) | ||||
#define __iAtomicCAS(address, compare, val) \ | ||||
__builtin___iAtomicCAS(address, compare, val) | ||||
#define __uAtomicCAS(address, compare, val) \ | ||||
__builtin___uAtomicCAS(address, compare, val) | ||||
#else /* __MULTI_CORE__ */ | ||||
__device_func__(int __iAtomicAdd(int *address, int val)) | __device_func__(int __iAtomicAdd(int *address, int val)) | |||
{ | { | |||
int old = *address; | int old = *address; | |||
*address = old + val; | *address = old + val; | |||
return old; | return old; | |||
} | } | |||
__device_func__(unsigned int __uAtomicAdd(unsigned int *address, unsigned i nt val)) | __device_func__(unsigned int __uAtomicAdd(unsigned int *address, unsigned i nt val)) | |||
skipping to change at line 385 | skipping to change at line 428 | |||
__device_func__(unsigned int __uAtomicCAS(unsigned int *address, unsigned i nt compare, unsigned int val)) | __device_func__(unsigned int __uAtomicCAS(unsigned int *address, unsigned i nt compare, unsigned int val)) | |||
{ | { | |||
unsigned int old = *address; | unsigned int old = *address; | |||
*address = old == compare ? val : old; | *address = old == compare ? val : old; | |||
return old; | return old; | |||
} | } | |||
#endif /* __MULTI_CORE__ */ | ||||
#endif /* !__CUDABE__ */ | #endif /* !__CUDABE__ */ | |||
#endif /* __cplusplus && __CUDACC__ */ | #endif /* __cplusplus && __CUDACC__ */ | |||
#endif /* !__SM_11_ATOMIC_FUNCTIONS_H__ */ | #endif /* !__SM_11_ATOMIC_FUNCTIONS_H__ */ | |||
End of changes. 2 change blocks. | ||||
0 lines changed or deleted | 45 lines changed or added | |||
sm_12_atomic_functions.h | sm_12_atomic_functions.h | |||
---|---|---|---|---|
skipping to change at line 107 | skipping to change at line 107 | |||
} | } | |||
#endif /* !CUDA_NO_SM_12_ATOMIC_INTRINSICS */ | #endif /* !CUDA_NO_SM_12_ATOMIC_INTRINSICS */ | |||
#elif !defined(__CUDACC__) | #elif !defined(__CUDACC__) | |||
#include "crt/func_macro.h" | #include "crt/func_macro.h" | |||
#if !defined(__CUDABE__) | #if !defined(__CUDABE__) | |||
#if defined(__MULTI_CORE__) | ||||
#define __ullAtomicAdd(address, val) \ | ||||
__builtin___ullAtomicAdd(address, val) | ||||
#define __ullAtomicExch(address, val) \ | ||||
__builtin___ullAtomicExch(address, val) | ||||
#define __ullAtomicCAS(address, compare, val) \ | ||||
__builtin___ullAtomicCAS(address, compare, val) | ||||
#else /* __MULTI_CORE__ */ | ||||
__device_func__(unsigned long long int __ullAtomicAdd(unsigned long long in t *address, unsigned long long int val)) | __device_func__(unsigned long long int __ullAtomicAdd(unsigned long long in t *address, unsigned long long int val)) | |||
{ | { | |||
unsigned long long int old = *address; | unsigned long long int old = *address; | |||
*address = old + val; | *address = old + val; | |||
return old; | return old; | |||
} | } | |||
__device_func__(unsigned long long int __ullAtomicExch(unsigned long long i nt *address, unsigned long long int val)) | __device_func__(unsigned long long int __ullAtomicExch(unsigned long long i nt *address, unsigned long long int val)) | |||
skipping to change at line 134 | skipping to change at line 145 | |||
__device_func__(unsigned long long int __ullAtomicCAS(unsigned long long in t *address, unsigned long long int compare, unsigned long long int val)) | __device_func__(unsigned long long int __ullAtomicCAS(unsigned long long in t *address, unsigned long long int compare, unsigned long long int val)) | |||
{ | { | |||
unsigned long long int old = *address; | unsigned long long int old = *address; | |||
*address = old == compare ? val : old; | *address = old == compare ? val : old; | |||
return old; | return old; | |||
} | } | |||
#endif /* __MULTI_CORE__ */ | ||||
__device_func__(int __any(int cond)) | __device_func__(int __any(int cond)) | |||
{ | { | |||
return cond; | return cond; | |||
} | } | |||
__device_func__(int __all(int cond)) | __device_func__(int __all(int cond)) | |||
{ | { | |||
return cond; | return cond; | |||
} | } | |||
End of changes. 2 change blocks. | ||||
0 lines changed or deleted | 13 lines changed or added | |||
sm_13_double_functions.h | sm_13_double_functions.h | |||
---|---|---|---|---|
skipping to change at line 631 | skipping to change at line 631 | |||
{ | { | |||
return __internal_ull2double_kernel(a, cudaRoundPosInf); | return __internal_ull2double_kernel(a, cudaRoundPosInf); | |||
} | } | |||
#endif /* !__CUDABE__ */ | #endif /* !__CUDABE__ */ | |||
#if !defined(__CUDABE__) || defined(CUDA_NO_SM_13_DOUBLE_INTRINSICS) | #if !defined(__CUDABE__) || defined(CUDA_NO_SM_13_DOUBLE_INTRINSICS) | |||
__device_func__(double __internal_fma_kernel(double x, double y, double z, enum cudaRoundMode rndMode)) | __device_func__(double __internal_fma_kernel(double x, double y, double z, enum cudaRoundMode rndMode)) | |||
{ | { | |||
#ifdef __MULTI_CORE__ | ||||
volatile | ||||
#endif /* __MULTI_CORE__ */ | ||||
struct { | struct { | |||
unsigned int lo; | unsigned int lo; | |||
unsigned int hi; | unsigned int hi; | |||
} xx, yy, zz, ww; | } xx, yy, zz, ww; | |||
unsigned int s, t, u, prod0, prod1, prod2, prod3, expo_x, expo_y, expo_z; | unsigned int s, t, u, prod0, prod1, prod2, prod3, expo_x, expo_y, expo_z; | |||
xx.hi = __double2hiint(x); | xx.hi = __double2hiint(x); | |||
xx.lo = __double2loint(x); | xx.lo = __double2loint(x); | |||
yy.hi = __double2hiint(y); | yy.hi = __double2hiint(y); | |||
yy.lo = __double2loint(y); | yy.lo = __double2loint(y); | |||
End of changes. 1 change blocks. | ||||
0 lines changed or deleted | 3 lines changed or added | |||
storage_class.h | storage_class.h | |||
---|---|---|---|---|
skipping to change at line 48 | skipping to change at line 48 | |||
#if !defined(__var_used__) | #if !defined(__var_used__) | |||
#define __var_used__ | #define __var_used__ | |||
#endif /* __var_used__ */ | #endif /* __var_used__ */ | |||
#if !defined(__loc_sc__) | #if !defined(__loc_sc__) | |||
#define __loc_sc__(loc, sc) \ | #define __loc_sc__(loc, sc) \ | |||
__storage##_##sc##loc __var_used__ loc | __storage##_##sc##loc loc | |||
#endif /* !__loc_sc__ */ | #endif /* !__loc_sc__ */ | |||
#define __storage___device__ \ | #if !defined(__storage___device__) | |||
static | #define __storage___device__ static __var_used__ | |||
#define __storage_extern__device__ \ | #endif /* __storage___device__ */ | |||
static | ||||
#define __storage_auto__device__ \ | ||||
@@@ COMPILER @@@ ERROR @@@ | ||||
#define __storage_static__device__ \ | ||||
static | ||||
#define __storage___constant__ \ | #if !defined(__storage_extern__device__) | |||
static | #define __storage_extern__device__ static __var_used__ | |||
#define __storage_extern__constant__ \ | #endif /* __storage_extern__device__ */ | |||
static | ||||
#define __storage_auto__constant__ \ | ||||
@@@ COMPILER @@@ ERROR @@@ | ||||
#define __storage_static__constant__ \ | ||||
static | ||||
#define __storage___shared__ \ | #if !defined(__storage_auto__device__) | |||
static | #define __storage_auto__device__ @@@ COMPILER @@@ ERROR @@@ | |||
#define __storage_extern__shared__ \ | #endif /* __storage_auto__device__ */ | |||
static | ||||
#define __storage_auto__shared__ \ | ||||
static | ||||
#define __storage_static__shared__ \ | ||||
static | ||||
#define __storage___text__ \ | #if !defined(__storage_static__device__) | |||
static | #define __storage_static__device__ static __var_used__ | |||
#define __storage_extern__text__ \ | #endif /* __storage_static__device__ */ | |||
static | ||||
#define __storage_auto__text__ \ | #if !defined(__storage___constant__) | |||
@@@ COMPILER @@@ ERROR @@@ | #define __storage___constant__ static __var_used__ | |||
#define __storage_static__text__ \ | #endif /* __storage___constant__ */ | |||
static | ||||
#if !defined(__storage_extern__constant__) | ||||
#define __storage_extern__constant__ static __var_used__ | ||||
#endif /* __storage_extern__constant__ */ | ||||
#if !defined(__storage_auto__constant__) | ||||
#define __storage_auto__constant__ @@@ COMPILER @@@ ERROR @@@ | ||||
#endif /* __storage_auto__constant__ */ | ||||
#if !defined(__storage_static__constant__) | ||||
#define __storage_static__constant__ static __var_used__ | ||||
#endif /* __storage_static__constant__ */ | ||||
#if !defined(__storage___shared__) | ||||
#define __storage___shared__ static __var_used__ | ||||
#endif /* __storage___shared__ */ | ||||
#if !defined(__storage_extern__shared__) | ||||
#define __storage_extern__shared__ static __var_used__ | ||||
#endif /* __storage_extern__shared__ */ | ||||
#if !defined(__storage_auto__shared__) | ||||
#define __storage_auto__shared__ static | ||||
#endif /* __storage_auto__shared__ */ | ||||
#if !defined(__storage_static__shared__) | ||||
#define __storage_static__shared__ static __var_used__ | ||||
#endif /* __storage_static__shared__ */ | ||||
#if !defined(__storage___text__) | ||||
#define __storage___text__ static __var_used__ | ||||
#endif /* __storage___text__ */ | ||||
#if !defined(__storage_extern__text__) | ||||
#define __storage_extern__text__ static __var_used__ | ||||
#endif /* __storage_extern__text__ */ | ||||
#if !defined(__storage_auto__text__) | ||||
#define __storage_auto__text__ @@@ COMPILER @@@ ERROR @@@ | ||||
#endif /* __storage_auto__text__ */ | ||||
#if !defined(__storage_static__text__) | ||||
#define __storage_static__text__ static __var_used__ | ||||
#endif /* __storage_static__text__ */ | ||||
#if !defined(__storage___surf__) | ||||
#define __storage___surf__ static __var_used__ | ||||
#endif /* __storage___surf__ */ | ||||
#if !defined(__storage_extern__surf__) | ||||
#define __storage_extern__surf__ static __var_used__ | ||||
#endif /* __storage_extern__surf__ */ | ||||
#if !defined(__storage_auto__surf__) | ||||
#define __storage_auto__surf__ @@@ COMPILER @@@ ERROR @@@ | ||||
#endif /* __storage_auto__surf__ */ | ||||
#if !defined(__storage_static__surf__) | ||||
#define __storage_static__surf__ static __var_used__ | ||||
#endif /* __storage_static__surf__ */ | ||||
#endif /* !__STORAGE_CLASS_H__ */ | #endif /* !__STORAGE_CLASS_H__ */ | |||
End of changes. 5 change blocks. | ||||
33 lines changed or deleted | 77 lines changed or added | |||
texture_fetch_functions.h | texture_fetch_functions.h | |||
---|---|---|---|---|
skipping to change at line 47 | skipping to change at line 47 | |||
#define __TEXTURE_FETCH_FUNCTIONS_H__ | #define __TEXTURE_FETCH_FUNCTIONS_H__ | |||
#if defined(__cplusplus) && defined(__CUDACC__) | #if defined(__cplusplus) && defined(__CUDACC__) | |||
/************************************************************************** ***** | /************************************************************************** ***** | |||
* * | * * | |||
* * | * * | |||
* * | * * | |||
*************************************************************************** ****/ | *************************************************************************** ****/ | |||
#include "cuda_texture_types.h" | ||||
#include "host_defines.h" | #include "host_defines.h" | |||
#include "texture_types.h" | #include "texture_types.h" | |||
#include "vector_functions.h" | #include "vector_functions.h" | |||
#include "vector_types.h" | #include "vector_types.h" | |||
/************************************************************************** ***** | /************************************************************************** ***** | |||
* * | * * | |||
* * | * * | |||
* * | * * | |||
*************************************************************************** ****/ | *************************************************************************** ****/ | |||
skipping to change at line 1881 | skipping to change at line 1882 | |||
return make_float4(w.x, w.y, w.z, w.w); | return make_float4(w.x, w.y, w.z, w.w); | |||
} | } | |||
#elif !defined(__CUDACC__) | #elif !defined(__CUDACC__) | |||
#include "host_defines.h" | #include "host_defines.h" | |||
#include "crt/func_macro.h" | #include "crt/func_macro.h" | |||
#if defined(__CUDABE__) | #if defined(__CUDABE__) | |||
extern uint4 __utexfetchi1D(__i1texture, int4); | extern uint4 __utexfetchi1D(const void*, int4); | |||
extern int4 __itexfetchi1D(__i2texture, int4); | extern int4 __itexfetchi1D(const void*, int4); | |||
extern float4 __ftexfetchi1D(__i3texture, int4); | extern float4 __ftexfetchi1D(const void*, int4); | |||
extern uint4 __utexfetch1D(__u1texture, float4); | extern uint4 __utexfetch1D(const void*, float4); | |||
extern int4 __itexfetch1D(__u2texture, float4); | extern int4 __itexfetch1D(const void*, float4); | |||
extern float4 __ftexfetch1D(__u3texture, float4); | extern float4 __ftexfetch1D(const void*, float4); | |||
extern uint4 __utexfetch2D(__f1texture, float4); | extern uint4 __utexfetch2D(const void*, float4); | |||
extern int4 __itexfetch2D(__f2texture, float4); | extern int4 __itexfetch2D(const void*, float4); | |||
extern float4 __ftexfetch2D(__f3texture, float4); | extern float4 __ftexfetch2D(const void*, float4); | |||
extern uint4 __utexfetch3D(__f1texture, float4); | extern uint4 __utexfetch3D(const void*, float4); | |||
extern int4 __itexfetch3D(__f2texture, float4); | extern int4 __itexfetch3D(const void*, float4); | |||
extern float4 __ftexfetch3D(__f3texture, float4); | extern float4 __ftexfetch3D(const void*, float4); | |||
#define __utexfetchi(t, i) \ | #define __utexfetchi(t, i) \ | |||
__utexfetchi1D(t, i) | __utexfetchi1D(t, i) | |||
#define __itexfetchi(t, i) \ | #define __itexfetchi(t, i) \ | |||
__itexfetchi1D(t, i) | __itexfetchi1D(t, i) | |||
#define __ftexfetchi(t, i) \ | #define __ftexfetchi(t, i) \ | |||
__ftexfetchi1D(t, i) | __ftexfetchi1D(t, i) | |||
#define __utexfetch(t, i, d) \ | #define __utexfetch(t, i, d) \ | |||
__utexfetch##d##D(t, i) | __utexfetch##d##D(t, i) | |||
#define __itexfetch(t, i, d) \ | #define __itexfetch(t, i, d) \ | |||
End of changes. 2 change blocks. | ||||
12 lines changed or deleted | 13 lines changed or added | |||
texture_types.h | texture_types.h | |||
---|---|---|---|---|
skipping to change at line 45 | skipping to change at line 45 | |||
#if !defined(__TEXTURE_TYPES_H__) | #if !defined(__TEXTURE_TYPES_H__) | |||
#define __TEXTURE_TYPES_H__ | #define __TEXTURE_TYPES_H__ | |||
/************************************************************************** ***** | /************************************************************************** ***** | |||
* * | * * | |||
* * | * * | |||
* * | * * | |||
*************************************************************************** ****/ | *************************************************************************** ****/ | |||
#include "driver_types.h" | ||||
/************************************************************************** | ||||
***** | ||||
* | ||||
* | ||||
* | ||||
* | ||||
* | ||||
* | ||||
*************************************************************************** | ||||
****/ | ||||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
enum cudaTextureAddressMode | enum cudaTextureAddressMode | |||
{ | { | |||
cudaAddressModeWrap, | cudaAddressModeWrap, | |||
cudaAddressModeClamp | cudaAddressModeClamp | |||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
enum cudaTextureFilterMode | enum cudaTextureFilterMode | |||
{ | { | |||
skipping to change at line 76 | skipping to change at line 84 | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct textureReference | struct textureReference | |||
{ | { | |||
int normalized; | int normalized; | |||
enum cudaTextureFilterMode filterMode; | enum cudaTextureFilterMode filterMode; | |||
enum cudaTextureAddressMode addressMode[3]; | enum cudaTextureAddressMode addressMode[3]; | |||
struct cudaChannelFormatDesc channelDesc; | struct cudaChannelFormatDesc channelDesc; | |||
int __cudaReserved[16]; | int __cudaReserved[16]; | |||
}; | }; | |||
#if defined(__cplusplus) && defined(__CUDACC__) | ||||
/************************************************************************** | ||||
***** | ||||
* | ||||
* | ||||
* | ||||
* | ||||
* | ||||
* | ||||
*************************************************************************** | ||||
****/ | ||||
#include "host_defines.h" | ||||
#include "channel_descriptor.h" | ||||
/************************************************************************** | ||||
***** | ||||
* | ||||
* | ||||
* | ||||
* | ||||
* | ||||
* | ||||
*************************************************************************** | ||||
****/ | ||||
/*TEXTURE_TYPE*/ | ||||
template<class T, int dim = 1, enum cudaTextureReadMode = cudaReadModeEleme | ||||
ntType> | ||||
struct texture : public textureReference | ||||
{ | ||||
__host__ texture(int norm = 0, | ||||
enum cudaTextureFilterMode fMode = cudaFilterModePoint, | ||||
enum cudaTextureAddressMode aMode = cudaAddressModeClamp | ||||
) | ||||
{ | ||||
normalized = norm; | ||||
filterMode = fMode; | ||||
addressMode[0] = aMode; | ||||
addressMode[1] = aMode; | ||||
addressMode[2] = aMode; | ||||
channelDesc = cudaCreateChannelDesc<T>(); | ||||
} | ||||
__host__ texture(int norm, | ||||
enum cudaTextureFilterMode fMode, | ||||
enum cudaTextureAddressMode aMode, | ||||
struct cudaChannelFormatDesc desc) | ||||
{ | ||||
normalized = norm; | ||||
filterMode = fMode; | ||||
addressMode[0] = aMode; | ||||
addressMode[1] = aMode; | ||||
addressMode[2] = aMode; | ||||
channelDesc = desc; | ||||
} | ||||
}; | ||||
#endif /* __cplusplus && __CUDACC__ */ | ||||
#endif /* !__TEXTURE_TYPES_H__ */ | #endif /* !__TEXTURE_TYPES_H__ */ | |||
End of changes. 2 change blocks. | ||||
61 lines changed or deleted | 13 lines changed or added | |||
vector_functions.h | vector_functions.h | |||
---|---|---|---|---|
skipping to change at line 238 | skipping to change at line 238 | |||
static __inline__ __host__ __device__ float3 make_float3(float x, float y, float z) | static __inline__ __host__ __device__ float3 make_float3(float x, float y, float z) | |||
{ | { | |||
float3 t; t.x = x; t.y = y; t.z = z; return t; | float3 t; t.x = x; t.y = y; t.z = z; return t; | |||
} | } | |||
static __inline__ __host__ __device__ float4 make_float4(float x, float y, float z, float w) | static __inline__ __host__ __device__ float4 make_float4(float x, float y, float z, float w) | |||
{ | { | |||
float4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; | float4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; | |||
} | } | |||
static __inline__ __host__ __device__ longlong1 make_longlong1(long long in | ||||
t x) | ||||
{ | ||||
longlong1 t; t.x = x; return t; | ||||
} | ||||
static __inline__ __host__ __device__ ulonglong1 make_ulonglong1(unsigned l | ||||
ong long int x) | ||||
{ | ||||
ulonglong1 t; t.x = x; return t; | ||||
} | ||||
static __inline__ __host__ __device__ longlong2 make_longlong2(long long in | ||||
t x, long long int y) | ||||
{ | ||||
longlong2 t; t.x = x; t.y = y; return t; | ||||
} | ||||
static __inline__ __host__ __device__ ulonglong2 make_ulonglong2(unsigned l | ||||
ong long int x, unsigned long long int y) | ||||
{ | ||||
ulonglong2 t; t.x = x; t.y = y; return t; | ||||
} | ||||
static __inline__ __host__ __device__ double1 make_double1(double x) | static __inline__ __host__ __device__ double1 make_double1(double x) | |||
{ | { | |||
double1 t; t.x = x; return t; | double1 t; t.x = x; return t; | |||
} | } | |||
static __inline__ __host__ __device__ double2 make_double2(double x, double y) | static __inline__ __host__ __device__ double2 make_double2(double x, double y) | |||
{ | { | |||
double2 t; t.x = x; t.y = y; return t; | double2 t; t.x = x; t.y = y; return t; | |||
} | } | |||
End of changes. 1 change blocks. | ||||
0 lines changed or deleted | 24 lines changed or added | |||
vector_types.h | vector_types.h | |||
---|---|---|---|---|
skipping to change at line 286 | skipping to change at line 286 | |||
float x, y, z; | float x, y, z; | |||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct __builtin_align__(16) float4 | struct __builtin_align__(16) float4 | |||
{ | { | |||
float x, y, z, w; | float x, y, z, w; | |||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct longlong1 | ||||
{ | ||||
long long int x; | ||||
}; | ||||
/*DEVICE_BUILTIN*/ | ||||
struct ulonglong1 | ||||
{ | ||||
unsigned long long int x; | ||||
}; | ||||
/*DEVICE_BUILTIN*/ | ||||
struct __builtin_align__(16) longlong2 | ||||
{ | ||||
long long int x, y; | ||||
}; | ||||
/*DEVICE_BUILTIN*/ | ||||
struct __builtin_align__(16) ulonglong2 | ||||
{ | ||||
unsigned long long int x, y; | ||||
}; | ||||
/*DEVICE_BUILTIN*/ | ||||
struct double1 | struct double1 | |||
{ | { | |||
double x; | double x; | |||
}; | }; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
struct __builtin_align__(16) double2 | struct __builtin_align__(16) double2 | |||
{ | { | |||
double x, y; | double x, y; | |||
}; | }; | |||
skipping to change at line 376 | skipping to change at line 400 | |||
typedef struct ulong4 ulong4; | typedef struct ulong4 ulong4; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
typedef struct float1 float1; | typedef struct float1 float1; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
typedef struct float2 float2; | typedef struct float2 float2; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
typedef struct float3 float3; | typedef struct float3 float3; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
typedef struct float4 float4; | typedef struct float4 float4; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
typedef struct longlong1 longlong1; | ||||
/*DEVICE_BUILTIN*/ | ||||
typedef struct ulonglong1 ulonglong1; | ||||
/*DEVICE_BUILTIN*/ | ||||
typedef struct longlong2 longlong2; | ||||
/*DEVICE_BUILTIN*/ | ||||
typedef struct ulonglong2 ulonglong2; | ||||
/*DEVICE_BUILTIN*/ | ||||
typedef struct double1 double1; | typedef struct double1 double1; | |||
/*DEVICE_BUILTIN*/ | /*DEVICE_BUILTIN*/ | |||
typedef struct double2 double2; | typedef struct double2 double2; | |||
/************************************************************************** ***** | /************************************************************************** ***** | |||
* * | * * | |||
* * | * * | |||
* * | * * | |||
*************************************************************************** ****/ | *************************************************************************** ****/ | |||
End of changes. 2 change blocks. | ||||
0 lines changed or deleted | 32 lines changed or added | |||