__cudaFatFormat.h   __cudaFatFormat.h 
/* /*
* Copyright 1993-2007 NVIDIA Corporation. All rights reserved. * Copyright 1993-2008 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 133 skipping to change at line 133
char* debug; char* debug;
} __cudaFatDebugEntry; } __cudaFatDebugEntry;
typedef enum { typedef enum {
__cudaFatDontSearchFlag = (1 << 0), __cudaFatDontSearchFlag = (1 << 0),
__cudaFatDontCacheFlag = (1 << 1), __cudaFatDontCacheFlag = (1 << 1),
__cudaFatSassDebugFlag = (1 << 2) __cudaFatSassDebugFlag = (1 << 2)
} __cudaFatCudaBinaryFlag; } __cudaFatCudaBinaryFlag;
/* /*
* Imported/exported symbol descriptor, needed for
* __cudaFat binary linking. Not much information is needed,
* because this is only an index: full symbol information
* is contained by the binaries.
*/
typedef struct {
char* name;
} __cudaFatSymbol;
/*
* Fat binary container. * Fat binary container.
* A mix of ptx intermediate programs and cubins, * A mix of ptx intermediate programs and cubins,
* plus a global identifier that can be used for * plus a global identifier that can be used for
* further lookup in a translation cache or a resource * further lookup in a translation cache or a resource
* file. This key is a checksum over the device text. * file. This key is a checksum over the device text.
* The ptx and cubin array are each terminated with * The ptx and cubin array are each terminated with
* entries that have Null components. * entries that have NULL components.
*/ */
typedef struct { typedef struct __cudaFatCudaBinaryRec {
unsigned long magic; unsigned long magic;
unsigned long version; unsigned long version;
unsigned long gpuInfoVersion; unsigned long gpuInfoVersion;
char* key; char* key;
char* ident; char* ident;
char* usageMode; char* usageMode;
__cudaFatPtxEntry *ptx; __cudaFatPtxEntry *ptx;
__cudaFatCubinEntry *cubin; __cudaFatCubinEntry *cubin;
__cudaFatDebugEntry *debug; __cudaFatDebugEntry *debug;
void* debugInfo; void* debugInfo;
unsigned int flags; unsigned int flags;
__cudaFatSymbol *exported;
__cudaFatSymbol *imported;
struct __cudaFatCudaBinaryRec *dependends;
} __cudaFatCudaBinary; } __cudaFatCudaBinary;
/* /*
* Current version and magic numbers: * Current version and magic numbers:
*/ */
#define __cudaFatVERSION 0x00000002 #define __cudaFatVERSION 0x00000003
#define __cudaFatMAGIC 0x1ee55a01 #define __cudaFatMAGIC 0x1ee55a01
/* /*
* Version history log: * Version history log:
* 1 : __cudaFatDebugEntry field added to __cudaFatCudaBinary struct * 1 : __cudaFatDebugEntry field added to __cudaFatCudaBinary struct
* 2 : flags and debugInfo field added. * 2 : flags and debugInfo field added.
* 3 : import/export symbol list
*/ */
/*--------------------------------- Functions ----------------------------- ---*/ /*--------------------------------- Functions ----------------------------- ---*/
typedef enum {
__cudaFatAvoidPTX,
__cudaFatPreferBestCode
} __cudaFatCompilationPolicy;
/* /*
* Function : Select a load image from the __cudaFat binary * Function : Select a load image from the __cudaFat binary
* that will run on the specified GPU. * that will run on the specified GPU.
* Parameters : binary (I) Fat binary * Parameters : binary (I) Fat binary
* policy (I) Parameter influencing the selection proces
s in case no
* fully matching cubin can be found, but ins
tead a choice can
* be made between ptx compilation or selecti
on of a
* cubin for a less capable GPU.
* gpuName (I) Name of target GPU * gpuName (I) Name of target GPU
* cubin (O) Returned cubin text string, or Null when * cubin (O) Returned cubin text string, or NULL when
* no matching cubin for the specified gpu * no matching cubin for the specified gpu
* could be found. * could be found.
* dbgInfo (O) If this parameter is not Null upon entry, then * dbgInfo (O) If this parameter is not NULL upon entry, then
* the name of a file containing debug inform ation * the name of a file containing debug inform ation
* on the returned cubin will be returned, or Null * on the returned cubin will be returned, or NULL
* will be returned when cubin or such debug info * will be returned when cubin or such debug info
* cannot be found. * cannot be found.
*/ */
void fatGetCubinForGpu( __cudaFatCudaBinary *binary, char* gpuName, char* * void fatGetCubinForGpuWithPolicy( __cudaFatCudaBinary *binary, __cudaFatCom
cubin, char* *dbgInfoFile ); pilationPolicy policy, char* gpuName, char* *cubin, char* *dbgInfoFile );
#define fatGetCubinForGpu(binary,gpuName,cubin,dbgInfoFile) \
fatGetCubinForGpuWithPolicy(binary,__cudaFatAvoidPTX,gpuName,cubi
n,dbgInfoFile)
/* /*
* Function : Free information previously obtained via function fatG etCubinForGpu. * Function : Free information previously obtained via function fatG etCubinForGpu.
* Parameters : cubin (I) Cubin text string to free * Parameters : cubin (I) Cubin text string to free
* dbgInfo (I) Debug info filename to free, or Null * dbgInfo (I) Debug info filename to free, or NULL
*/ */
void fatFreeCubin( char* cubin, char* dbgInfoFile ); void fatFreeCubin( char* cubin, char* dbgInfoFile );
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
#endif #endif
 End of changes. 13 change blocks. 
21 lines changed or deleted 51 lines changed or added


 channel_descriptor.h   channel_descriptor.h 
skipping to change at line 39 skipping to change at line 39
* source code with only those rights set forth herein. * source code with only those rights set forth herein.
* *
* Any use of this source code in individual and commercial software must * Any use of this source code in individual and commercial software must
* include, in the user documentation and internal comments to the code, * include, in the user documentation and internal comments to the code,
* the above Disclaimer and U.S. Government End Users Notice. * the above Disclaimer and U.S. Government End Users Notice.
*/ */
#if !defined(__CHANNEL_DESCRIPTOR_H__) #if !defined(__CHANNEL_DESCRIPTOR_H__)
#define __CHANNEL_DESCRIPTOR_H__ #define __CHANNEL_DESCRIPTOR_H__
#if defined(__cplusplus)
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#include "builtin_types.h" #include "driver_types.h"
#include "cuda_runtime_api.h" #include "cuda_runtime_api.h"
#include "host_defines.h" #include "host_defines.h"
#include "vector_types.h"
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#if defined(__cplusplus)
template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChann elDesc(void) template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChann elDesc(void)
{ {
return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindUnsigned); return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
} }
template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc< char>(void) template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc< char>(void)
{ {
int e = (int)sizeof(char) * 8; int e = (int)sizeof(char) * 8;
#if __SIGNED_CHARS__ #if __SIGNED_CHARS__
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
#else #else
return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
 End of changes. 5 change blocks. 
4 lines changed or deleted 5 lines changed or added


 cuComplex.h   cuComplex.h 
skipping to change at line 245 skipping to change at line 245
} }
/* /*
* We would like to call hypotf(), but it's not available on all platforms. * We would like to call hypotf(), but it's not available on all platforms.
* This discrete implementation guards against intermediate underflow and * This discrete implementation guards against intermediate underflow and
* overflow by scaling. Otherwise we would lose half the exponent range. * overflow by scaling. Otherwise we would lose half the exponent range.
* There are various ways of doing guarded computation. For now chose the * There are various ways of doing guarded computation. For now chose the
* simplest and fastest solution, however this may suffer from inaccuracies * simplest and fastest solution, however this may suffer from inaccuracies
* if sqrt and division are not IEEE compliant. * if sqrt and division are not IEEE compliant.
*/ */
__host__ static __inline__ float cuCabsf (cuFloatComplex x) __host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x)
{ {
float a = cuCrealf(x); float a = cuCrealf(x);
float b = cuCimagf(x); float b = cuCimagf(x);
float v, w, t; float v, w, t;
a = (float)fabs(a); a = (float)fabs(a);
b = (float)fabs(b); b = (float)fabs(b);
if (a > b) { if (a > b) {
v = a; v = a;
w = b; w = b;
} else { } else {
 End of changes. 1 change blocks. 
1 lines changed or deleted 1 lines changed or added


 cublas.h   cublas.h 
skipping to change at line 78 skipping to change at line 78
* Currently, only a subset of the BLAS core functions is implemented. * Currently, only a subset of the BLAS core functions is implemented.
* *
* The interface to the CUBLAS library is the header file cublas.h. * The interface to the CUBLAS library is the header file cublas.h.
* Applications using CUBLAS need to link against the DSO cublas.so * Applications using CUBLAS need to link against the DSO cublas.so
* (Linux) or the DLL cublas.dll (Win32). * (Linux) or the DLL cublas.dll (Win32).
*/ */
#if !defined(CUBLAS_H_) #if !defined(CUBLAS_H_)
#define CUBLAS_H_ #define CUBLAS_H_
#ifdef __MULTI_CORE__
#error CUBLAS not supported on multicore
#endif
#ifndef CUBLASAPI #ifndef CUBLASAPI
#ifdef _WIN32 #ifdef _WIN32
#define CUBLASAPI __stdcall #define CUBLASAPI __stdcall
#else #else
#define CUBLASAPI #define CUBLASAPI
#endif #endif
#endif #endif
#if defined(__cplusplus) #if defined(__cplusplus)
extern "C" { extern "C" {
 End of changes. 1 change blocks. 
0 lines changed or deleted 4 lines changed or added


 cuda.h   cuda.h 
skipping to change at line 46 skipping to change at line 46
// ------------------------------------------------------------------------ ---- // ------------------------------------------------------------------------ ----
// //
// Main public header file for the CompUte Device Api // Main public header file for the CompUte Device Api
// //
// ------------------------------------------------------------------------ ---- // ------------------------------------------------------------------------ ----
#ifndef __cuda_cuda_h__ #ifndef __cuda_cuda_h__
#define __cuda_cuda_h__ #define __cuda_cuda_h__
/* CUDA API version number */ /* CUDA API version number */
#define CUDA_VERSION 2000 /* 2.0 */ #define CUDA_VERSION 2010 /* 2.1 */
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
typedef unsigned int CUdeviceptr; typedef unsigned int CUdeviceptr;
typedef int CUdevice; typedef int CUdevice;
typedef struct CUctx_st *CUcontext; typedef struct CUctx_st *CUcontext;
typedef struct CUmod_st *CUmodule; typedef struct CUmod_st *CUmodule;
typedef struct CUfunc_st *CUfunction; typedef struct CUfunc_st *CUfunction;
skipping to change at line 132 skipping to change at line 132
CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, // Deprecated, us e CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, // Deprecated, us e CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK
CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9,
CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,
CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11,
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12,
CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, // Deprecated, us e CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, // Deprecated, us e CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,
CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14,
CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16 CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16,
CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17
} CUdevice_attribute; } CUdevice_attribute;
// //
// Legacy device properties // Legacy device properties
// //
typedef struct CUdevprop_st { typedef struct CUdevprop_st {
int maxThreadsPerBlock; int maxThreadsPerBlock;
int maxThreadsDim[3]; int maxThreadsDim[3];
int maxGridSize[3]; int maxGridSize[3];
int sharedMemPerBlock; int sharedMemPerBlock;
skipping to change at line 160 skipping to change at line 161
// //
// Memory types // Memory types
// //
typedef enum CUmemorytype_enum { typedef enum CUmemorytype_enum {
CU_MEMORYTYPE_HOST = 0x01, CU_MEMORYTYPE_HOST = 0x01,
CU_MEMORYTYPE_DEVICE = 0x02, CU_MEMORYTYPE_DEVICE = 0x02,
CU_MEMORYTYPE_ARRAY = 0x03 CU_MEMORYTYPE_ARRAY = 0x03
} CUmemorytype; } CUmemorytype;
//
// Online compiler options
//
typedef enum CUjit_option_enum
{
// CU_JIT_MAX_REGISTERS - Max number of registers that a thread may use
.
CU_JIT_MAX_REGISTERS = 0,
// CU_JIT_THREADS_PER_BLOCK -
// IN: Specifies minimum number of threads per block to target compilat
ion for
// OUT: Returns the number of threads the compiler actually targeted.
This
// restricts the resource utilization fo the compiler (e.g. max registe
rs) such
// that a block with the given number of threads should be able to laun
ch based
// on register limitations. Note, this option does not currently take
into
// account any other resource limitations, such as shared memory utiliz
ation.
CU_JIT_THREADS_PER_BLOCK,
// CU_JIT_WALL_TIME - returns a float value in the option of the wall c
lock
// time, in milliseconds, spent creating the cubin
CU_JIT_WALL_TIME,
// CU_JIT_INFO_LUG_BUFFER - pointer to a buffer in which to print any l
og
// messsages from PTXAS that are informational in nature
CU_JIT_INFO_LOG_BUFFER,
// CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES -
// IN: Log buffer size in bytes. Log messages will be capped at this s
ize
// (including null terminator)
// OUT: Amount of log buffer filled with messages
CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
// CU_JIT_ERROR_LOG_BUFFER - pointer to a buffer in which to print any
log
// messages from PTXAS that reflect errors
CU_JIT_ERROR_LOG_BUFFER,
// CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES -
// IN: Log buffer size in bytes. Log messages will be capped at this s
ize
// (including null terminator)
// OUT: Amount of log buffer filled with messages
CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
// CU_JIT_OPTIMIZATION_LEVEL - level of optimizations to apply to gener
ated
// code (0 - 4), with 4 being the default and highest level of optimiza
tions.
CU_JIT_OPTIMIZATION_LEVEL,
// CU_JIT_TARGET_FROM_CU_CONTEXT - no option value required. Determine
s
// the target based on the current attached context (default)
CU_JIT_TARGET_FROM_CUCONTEXT,
// CU_JIT_TARGET - target is chosen based on supplied CUjit_target_enum
.
CU_JIT_TARGET,
// CU_JIT_FALLBACK_STRATEGY - specifies choice of fallback strategy if
// matching cubin is not found. Choice is based on supplied
// CUjit_fallback_enum.
CU_JIT_FALLBACK_STRATEGY
} CUjit_option;
//
// Online compilation targets
//
typedef enum CUjit_target_enum
{
CU_TARGET_COMPUTE_10 = 0,
CU_TARGET_COMPUTE_11,
CU_TARGET_COMPUTE_12,
CU_TARGET_COMPUTE_13
} CUjit_target;
//
// Cubin matching fallback strategies
//
typedef enum CUjit_fallback_enum
{
// prefer to compile ptx
CU_PREFER_PTX = 0,
// prefer to fall back to compatible binary code
CU_PREFER_BINARY
} CUjit_fallback;
/************************************ /************************************
** **
** Error codes ** Error codes
** **
***********************************/ ***********************************/
typedef enum cudaError_enum { typedef enum cudaError_enum {
CUDA_SUCCESS = 0, CUDA_SUCCESS = 0,
CUDA_ERROR_INVALID_VALUE = 1, CUDA_ERROR_INVALID_VALUE = 1,
skipping to change at line 253 skipping to change at line 337
CUresult CUDAAPI cuCtxSynchronize(void); CUresult CUDAAPI cuCtxSynchronize(void);
/************************************ /************************************
** **
** Module management ** Module management
** **
***********************************/ ***********************************/
CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname); CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname);
CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image) ; CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image) ;
CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *imag e, unsigned int numOptions, CUjit_option *options, void **optionValues);
CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *f atCubin); CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *f atCubin);
CUresult CUDAAPI cuModuleUnload(CUmodule hmod); CUresult CUDAAPI cuModuleUnload(CUmodule hmod);
CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name); CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, unsigned int *by tes, CUmodule hmod, const char *name); CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, unsigned int *by tes, CUmodule hmod, const char *name);
CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, c onst char *name); CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, c onst char *name);
/************************************ /************************************
** **
** Memory management ** Memory management
** **
 End of changes. 4 change blocks. 
2 lines changed or deleted 103 lines changed or added


 cuda_runtime.h   cuda_runtime.h 
skipping to change at line 63 skipping to change at line 63
#include "builtin_types.h" #include "builtin_types.h"
#include "channel_descriptor.h" #include "channel_descriptor.h"
#include "cuda_runtime_api.h" #include "cuda_runtime_api.h"
#include "driver_functions.h" #include "driver_functions.h"
#include "host_defines.h" #include "host_defines.h"
#include "vector_functions.h" #include "vector_functions.h"
#if defined(__CUDACC__) #if defined(__CUDACC__)
#include "common_functions.h" #include "common_functions.h"
#include "cuda_texture_types.h"
#include "device_functions.h" #include "device_functions.h"
#include "device_launch_parameters.h" #include "device_launch_parameters.h"
#endif /* __CUDACC__ */ #endif /* __CUDACC__ */
#if defined(__cplusplus) #if defined(__cplusplus)
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
 End of changes. 1 change blocks. 
0 lines changed or deleted 1 lines changed or added


 cuda_runtime_api.h   cuda_runtime_api.h 
skipping to change at line 41 skipping to change at line 41
* Any use of this source code in individual and commercial software must * Any use of this source code in individual and commercial software must
* include, in the user documentation and internal comments to the code, * include, in the user documentation and internal comments to the code,
* the above Disclaimer and U.S. Government End Users Notice. * the above Disclaimer and U.S. Government End Users Notice.
*/ */
#if !defined(__CUDA_RUNTIME_API_H__) #if !defined(__CUDA_RUNTIME_API_H__)
#define __CUDA_RUNTIME_API_H__ #define __CUDA_RUNTIME_API_H__
/************************************************************************** ***** /************************************************************************** *****
* * * *
* CUDA runtime API version number 2.0 * * CUDA runtime API version number 2.1 *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#define CUDART_VERSION \ #define CUDART_VERSION \
2000 2010
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#include "host_defines.h" #include "host_defines.h"
#include "builtin_types.h" #include "builtin_types.h"
 End of changes. 2 change blocks. 
2 lines changed or deleted 2 lines changed or added


 cufft.h   cufft.h 
/* /*
* Copyright 1993-2007 NVIDIA Corporation. All rights reserved. * Copyright 1993-2008 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
skipping to change at line 45 skipping to change at line 45
/* /*
* cufft.h * cufft.h
* Public header file for the NVIDIA Cuda FFT library (CUFFT) * Public header file for the NVIDIA Cuda FFT library (CUFFT)
*/ */
#ifndef _CUFFT_H_ #ifndef _CUFFT_H_
#define _CUFFT_H_ #define _CUFFT_H_
#include <stdio.h> #include <stdio.h>
#include <cuComplex.h> #include "cuComplex.h"
#ifdef __MULTI_CORE__
#error CUFFT not supported on multicore
#endif
#ifndef CUFFTAPI #ifndef CUFFTAPI
#ifdef _WIN32 #ifdef _WIN32
#define CUFFTAPI __stdcall #define CUFFTAPI __stdcall
#else #else
#define CUFFTAPI #define CUFFTAPI
#endif #endif
#endif #endif
#ifdef __cplusplus #ifdef __cplusplus
 End of changes. 2 change blocks. 
2 lines changed or deleted 6 lines changed or added


 device_functions.h   device_functions.h 
skipping to change at line 505 skipping to change at line 505
__device_func__(int __internal_float2int(float a, enum cudaRoundMode rndMod e)) __device_func__(int __internal_float2int(float a, enum cudaRoundMode rndMod e))
{ {
return (int)__internal_float2ll_kernel(a, 2147483647LL, -2147483648LL, 0L L, rndMode); return (int)__internal_float2ll_kernel(a, 2147483647LL, -2147483648LL, 0L L, rndMode);
} }
__device_func__(int __float2int_rz(float a)) __device_func__(int __float2int_rz(float a))
{ {
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
return (int)a; return (int)a;
#else /* __MULTI_CORE__) */ #else /* __MULTI_CORE__ */
return __internal_float2int(a, cudaRoundZero); return __internal_float2int(a, cudaRoundZero);
#endif /* __MULTI_CORE__) */ #endif /* __MULTI_CORE__ */
} }
__device_func__(int __float2int_ru(float a)) __device_func__(int __float2int_ru(float a))
{ {
return __internal_float2int(a, cudaRoundPosInf); return __internal_float2int(a, cudaRoundPosInf);
} }
__device_func__(int __float2int_rd(float a)) __device_func__(int __float2int_rd(float a))
{ {
return __internal_float2int(a, cudaRoundMinInf); return __internal_float2int(a, cudaRoundMinInf);
skipping to change at line 534 skipping to change at line 534
__device_func__(long long int __internal_float2ll(float a, enum cudaRoundMo de rndMode)) __device_func__(long long int __internal_float2ll(float a, enum cudaRoundMo de rndMode))
{ {
return __internal_float2ll_kernel(a, 9223372036854775807LL, -922337203685 4775807LL -1LL, -9223372036854775807LL -1LL, rndMode); return __internal_float2ll_kernel(a, 9223372036854775807LL, -922337203685 4775807LL -1LL, -9223372036854775807LL -1LL, rndMode);
} }
__device_func__(long long int __float2ll_rz(float a)) __device_func__(long long int __float2ll_rz(float a))
{ {
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
return (long long int)a; return (long long int)a;
#else /* __MULTI_CORE__) */ #else /* __MULTI_CORE__ */
return __internal_float2ll(a, cudaRoundZero); return __internal_float2ll(a, cudaRoundZero);
#endif /* __MULTI_CORE__) */ #endif /* __MULTI_CORE__ */
} }
__device_func__(long long int __float2ll_ru(float a)) __device_func__(long long int __float2ll_ru(float a))
{ {
return __internal_float2ll(a, cudaRoundPosInf); return __internal_float2ll(a, cudaRoundPosInf);
} }
__device_func__(long long int __float2ll_rd(float a)) __device_func__(long long int __float2ll_rd(float a))
{ {
return __internal_float2ll(a, cudaRoundMinInf); return __internal_float2ll(a, cudaRoundMinInf);
skipping to change at line 589 skipping to change at line 589
__device_func__(unsigned int __internal_float2uint(float a, enum cudaRoundM ode rndMode)) __device_func__(unsigned int __internal_float2uint(float a, enum cudaRoundM ode rndMode))
{ {
return (unsigned int)__internal_float2ull_kernel(a, 4294967295U, 0U, rndM ode); return (unsigned int)__internal_float2ull_kernel(a, 4294967295U, 0U, rndM ode);
} }
__device_func__(unsigned int __float2uint_rz(float a)) __device_func__(unsigned int __float2uint_rz(float a))
{ {
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
return (unsigned int)a; return (unsigned int)a;
#else /* __MULTI_CORE__) */ #else /* __MULTI_CORE__ */
return __internal_float2uint(a, cudaRoundZero); return __internal_float2uint(a, cudaRoundZero);
#endif /* __MULTI_CORE__) */ #endif /* __MULTI_CORE__ */
} }
__device_func__(unsigned int __float2uint_ru(float a)) __device_func__(unsigned int __float2uint_ru(float a))
{ {
return __internal_float2uint(a, cudaRoundPosInf); return __internal_float2uint(a, cudaRoundPosInf);
} }
__device_func__(unsigned int __float2uint_rd(float a)) __device_func__(unsigned int __float2uint_rd(float a))
{ {
return __internal_float2uint(a, cudaRoundMinInf); return __internal_float2uint(a, cudaRoundMinInf);
skipping to change at line 618 skipping to change at line 618
__device_func__(unsigned long long int __internal_float2ull(float a, enum c udaRoundMode rndMode)) __device_func__(unsigned long long int __internal_float2ull(float a, enum c udaRoundMode rndMode))
{ {
return __internal_float2ull_kernel(a, 18446744073709551615ULL, 9223372036 854775808ULL, rndMode); return __internal_float2ull_kernel(a, 18446744073709551615ULL, 9223372036 854775808ULL, rndMode);
} }
__device_func__(unsigned long long int __float2ull_rz(float a)) __device_func__(unsigned long long int __float2ull_rz(float a))
{ {
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
return (unsigned long long int)a; return (unsigned long long int)a;
#else /* __MULTI_CORE__) */ #else /* __MULTI_CORE__ */
return __internal_float2ull(a, cudaRoundZero); return __internal_float2ull(a, cudaRoundZero);
#endif /* __MULTI_CORE__) */ #endif /* __MULTI_CORE__ */
} }
__device_func__(unsigned long long int __float2ull_ru(float a)) __device_func__(unsigned long long int __float2ull_ru(float a))
{ {
return __internal_float2ull(a, cudaRoundPosInf); return __internal_float2ull(a, cudaRoundPosInf);
} }
__device_func__(unsigned long long int __float2ull_rd(float a)) __device_func__(unsigned long long int __float2ull_rd(float a))
{ {
return __internal_float2ull(a, cudaRoundMinInf); return __internal_float2ull(a, cudaRoundMinInf);
skipping to change at line 726 skipping to change at line 726
__device_func__(float __int2float_rd(int a)) __device_func__(float __int2float_rd(int a))
{ {
return __internal_int2float_kernel(a, cudaRoundMinInf); return __internal_int2float_kernel(a, cudaRoundMinInf);
} }
__device_func__(float __int2float_rn(int a)) __device_func__(float __int2float_rn(int a))
{ {
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
return (float)a; return (float)a;
#else /* __MULTI_CORE__) */ #else /* __MULTI_CORE__ */
return __internal_int2float_kernel(a, cudaRoundNearest); return __internal_int2float_kernel(a, cudaRoundNearest);
#endif /* __MULTI_CORE__) */ #endif /* __MULTI_CORE__ */
} }
__device_func__(float __internal_uint2float_kernel(unsigned int a, enum cud aRoundMode rndMode)) __device_func__(float __internal_uint2float_kernel(unsigned int a, enum cud aRoundMode rndMode))
{ {
volatile union { volatile union {
float f; float f;
unsigned int i; unsigned int i;
} res; } res;
int shift; int shift;
unsigned int t; unsigned int t;
skipping to change at line 773 skipping to change at line 773
__device_func__(float __uint2float_rd(unsigned int a)) __device_func__(float __uint2float_rd(unsigned int a))
{ {
return __internal_uint2float_kernel(a, cudaRoundMinInf); return __internal_uint2float_kernel(a, cudaRoundMinInf);
} }
__device_func__(float __uint2float_rn(unsigned int a)) __device_func__(float __uint2float_rn(unsigned int a))
{ {
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
return (float)a; return (float)a;
#else /* __MULTI_CORE__) */ #else /* __MULTI_CORE__ */
return __internal_uint2float_kernel(a, cudaRoundNearest); return __internal_uint2float_kernel(a, cudaRoundNearest);
#endif /* __MULTI_CORE__) */ #endif /* __MULTI_CORE__ */
} }
__device_func__(float __ll2float_rn(long long int a)) __device_func__(float __ll2float_rn(long long int a))
{ {
return (float)a; return (float)a;
} }
__device_func__(float __ull2float_rn(unsigned long long int a)) __device_func__(float __ull2float_rn(unsigned long long int a))
{ {
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
return (float)a; return (float)a;
#else /* __MULTI_CORE__) */ #else /* __MULTI_CORE__ */
unsigned long long int temp; unsigned long long int temp;
unsigned int res, t; unsigned int res, t;
int shift; int shift;
if (a == 0ULL) return 0.0f; if (a == 0ULL) return 0.0f;
temp = a; temp = a;
shift = __internal_normalize64(&temp); shift = __internal_normalize64(&temp);
temp = (temp >> 8) | ((temp & 0xffULL) ? 1ULL : 0ULL); temp = (temp >> 8) | ((temp & 0xffULL) ? 1ULL : 0ULL);
res = (unsigned int)(temp >> 32); res = (unsigned int)(temp >> 32);
t = (unsigned int)temp; t = (unsigned int)temp;
res += (127 + 62 - shift) << 23; /* add in exponent */ res += (127 + 62 - shift) << 23; /* add in exponent */
res += t == 0x80000000 ? res & 1 : t >> 31; res += t == 0x80000000 ? res & 1 : t >> 31;
return __int_as_float(res); return __int_as_float(res);
#endif /* __MULTI_CORE__) */ #endif /* __MULTI_CORE__ */
} }
__device_func__(float __internal_fmul_kernel(float a, float b, int rndNeare st)) __device_func__(float __internal_fmul_kernel(float a, float b, int rndNeare st))
{ {
unsigned long long product; unsigned long long product;
volatile union { volatile union {
float f; float f;
unsigned int i; unsigned int i;
} xx, yy; } xx, yy;
unsigned expo_x, expo_y; unsigned expo_x, expo_y;
skipping to change at line 1069 skipping to change at line 1069
expo_x = (unsigned int)(-((int)expo_x)); expo_x = (unsigned int)(-((int)expo_x));
temp = xx.i << expo_y | ((temp) ? 1 : 0); temp = xx.i << expo_y | ((temp) ? 1 : 0);
xx.i = yy.i | (xx.i >> expo_x); xx.i = yy.i | (xx.i >> expo_x);
xx.i += (((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31)) xx.i += (((temp == 0x80000000) ? (xx.i & 1) : (temp >> 31))
&& rndNearest); && rndNearest);
return xx.f; return xx.f;
} else { } else {
/* handle special cases separately */ /* handle special cases separately */
if (!(yy.i << 1)) { if (!(yy.i << 1)) {
if (xx.i == 0x80000000) { if (xx.i == 0x80000000) {
xx.i = yy.i; xx.i = yy.i;
}
if ((xx.i << 1) > 0xff000000) {
xx.i |= 0x00400000;
} }
return xx.f; return xx.f;
} }
if ((expo_y != 254) && (expo_x != 254)) { if ((expo_y != 254) && (expo_x != 254)) {
/* remove sign bits */ /* remove sign bits */
if (expo_x == (unsigned int) -1) { if (expo_x == (unsigned int) -1) {
temp = xx.i & 0x80000000; temp = xx.i & 0x80000000;
xx.i = xx.i << 8; xx.i = xx.i << 8;
while (!(xx.i & 0x80000000)) { while (!(xx.i & 0x80000000)) {
xx.i <<= 1; xx.i <<= 1;
skipping to change at line 1155 skipping to change at line 1158
return __internal_fmul_kernel(a, b, 1); return __internal_fmul_kernel(a, b, 1);
} }
__device_func__(void __brkpt(int c)) __device_func__(void __brkpt(int c))
{ {
/* TODO */ /* TODO */
} }
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
__device_func__(void __syncthreads(void)) #define __syncthreads() \
{ __builtin___syncthreads()
#undef __syncthreads
__syncthreads();
#define __syncthreads \
__syncthreads_wrapper
}
#else /* __MULTI_CORE__ */ #else /* __MULTI_CORE__ */
extern int CUDARTAPI __cudaSynchronizeThreads(void**, void*); extern int CUDARTAPI __cudaSynchronizeThreads(void**, void*);
#if defined(__GNUC__) #if defined(__GNUC__)
__device_func__(inline __attribute__((always_inline)) void __syncthreads(vo id)) __device_func__(inline __attribute__((always_inline)) void __syncthreads(vo id))
{ {
volatile int _ = 0; volatile int _ = 0;
skipping to change at line 1262 skipping to change at line 1260
a *= .25f; a *= .25f;
b *= .25f; b *= .25f;
} }
return __fdividef(a, b); return __fdividef(a, b);
} }
__device_func__(float __tanf(float a)) __device_func__(float __tanf(float a))
{ {
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
return tanf(a); return tanf(a);
#else /* __MULTI_CORE__) */ #else /* __MULTI_CORE__ */
return __sinf(a) / __cosf(a); return __sinf(a) / __cosf(a);
#endif /* __MULTI_CORE__) */ #endif /* __MULTI_CORE__ */
} }
__device_func__(void __sincosf(float a, float *sptr, float *cptr)) __device_func__(void __sincosf(float a, float *sptr, float *cptr))
{ {
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
sincosf(a, sptr, cptr); sincosf(a, sptr, cptr);
#else /* __MULTI_CORE__) */ #else /* __MULTI_CORE__ */
*sptr = __sinf(a); *sptr = __sinf(a);
*cptr = __cosf(a); *cptr = __cosf(a);
#endif /* __MULTI_CORE__) */ #endif /* __MULTI_CORE__ */
} }
__device_func__(float __expf(float a)) __device_func__(float __expf(float a))
{ {
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
return expf(a); return expf(a);
#else /* __MULTI_CORE__) */ #else /* __MULTI_CORE__ */
return __cuda_exp2f(a * CUDART_L2E_F); return __cuda_exp2f(a * CUDART_L2E_F);
#endif /* __MULTI_CORE__) */ #endif /* __MULTI_CORE__ */
} }
__device_func__(float __exp10f(float a)) __device_func__(float __exp10f(float a))
{ {
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
return exp10f(a); return exp10f(a);
#else /* __MULTI_CORE__) */ #else /* __MULTI_CORE__ */
return __cuda_exp2f(a * CUDART_L2T_F); return __cuda_exp2f(a * CUDART_L2T_F);
#endif /* __MULTI_CORE__) */ #endif /* __MULTI_CORE__ */
} }
__device_func__(float __log10f(float a)) __device_func__(float __log10f(float a))
{ {
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
return log10f(a); return log10f(a);
#else /* __MULTI_CORE__) */ #else /* __MULTI_CORE__ */
return CUDART_LG2_F * __log2f(a); return CUDART_LG2_F * __log2f(a);
#endif /* __MULTI_CORE__) */ #endif /* __MULTI_CORE__ */
} }
__device_func__(float __logf(float a)) __device_func__(float __logf(float a))
{ {
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
return logf(a); return logf(a);
#else /* __MULTI_CORE__) */ #else /* __MULTI_CORE__ */
return CUDART_LN2_F * __log2f(a); return CUDART_LN2_F * __log2f(a);
#endif /* __MULTI_CORE__) */ #endif /* __MULTI_CORE__ */
} }
__device_func__(float __powf(float a, float b)) __device_func__(float __powf(float a, float b))
{ {
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
return powf(a, b); return powf(a, b);
#else /* __MULTI_CORE__) */ #else /* __MULTI_CORE__ */
return __cuda_exp2f(b * __log2f(a)); return __cuda_exp2f(b * __log2f(a));
#endif /* __MULTI_CORE__) */ #endif /* __MULTI_CORE__ */
} }
__device_func__(float fdividef(float a, float b)) __device_func__(float fdividef(float a, float b))
{ {
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
return a / b; return a / b;
#elif defined(__USE_FAST_MATH__) #elif defined(__USE_FAST_MATH__)
return __fdividef(a, b); return __fdividef(a, b);
#else /* __USE_FAST_MATH__ */ #else /* __MULTI_CORE__ */
return __internal_accurate_fdividef(a, b); return __internal_accurate_fdividef(a, b);
#endif /* __USE_FAST_MATH__ */ #endif /* __MULTI_CORE__ */
} }
__device_func__(int __clz(int a)) __device_func__(int __clz(int a))
{ {
return (a)?(158-(__float_as_int(__uint2float_rz((unsigned int)a))>>23)):3 2; return (a)?(158-(__float_as_int(__uint2float_rz((unsigned int)a))>>23)):3 2;
} }
__device_func__(int __ffs(int a)) __device_func__(int __ffs(int a))
{ {
return 32 - __clz (a & -a); return 32 - __clz (a & -a);
} }
__device_func__(int __popc(unsigned int a)) __device_func__(int __popc(unsigned int a))
{ {
unsigned int s = 033333333333; a = a - ((a >> 1) & 0x55555555);
unsigned int t = 030707070707; a = (a & 0x33333333) + ((a >> 2) & 0x33333333);
unsigned int n; a = (a + (a >> 4)) & 0x0f0f0f0f;
n = (a >> 1) & s; a = ((__umul24(a, 0x808080) << 1) + a) >> 24;
a = a - n;
n = (n >> 1) & s;
a = a - n;
n = (a >> 3) + a;
a = n & t;
t = (t << 2) & t; /* t = 0x04104104 */
a = (a >> 30) + ((a * t) >> 26);
return a; return a;
} }
__device_func__(int __clzll(long long int a)) __device_func__(int __clzll(long long int a))
{ {
int ahi = ((int)(a >> 32)); int ahi = ((int)((unsigned long long)a >> 32));
int alo = ((int)(a & 0xffffffffULL)); int alo = ((int)((unsigned long long)a & 0xffffffffULL));
int res; int res;
if (ahi) { if (ahi) {
res = 0; res = 0;
} else { } else {
res = 32; res = 32;
ahi = alo; ahi = alo;
} }
res = res + __clz(ahi); res = res + __clz(ahi);
return res; return res;
} }
__device_func__(int __ffsll(long long int a)) __device_func__(int __ffsll(long long int a))
{ {
return 64 - __clzll (a & -a); return 64 - __clzll (a & -a);
} }
__device_func__(int __popcll(unsigned long long int a)) __device_func__(int __popcll(unsigned long long int a))
{ {
unsigned int ahi = ((unsigned int)(a >> 32)); unsigned int ahi = ((unsigned int)(a >> 32));
unsigned int alo = ((unsigned int)(a & 0xffffffffULL)); unsigned int alo = ((unsigned int)(a & 0xffffffffULL));
return __popc(ahi) + __popc(alo); alo = alo - ((alo >> 1) & 0x55555555);
alo = (alo & 0x33333333) + ((alo >> 2) & 0x33333333);
ahi = ahi - ((ahi >> 1) & 0x55555555);
ahi = (ahi & 0x33333333) + ((ahi >> 2) & 0x33333333);
alo = alo + ahi;
alo = (alo & 0x0f0f0f0f) + ((alo >> 4) & 0x0f0f0f0f);
alo = ((__umul24(alo, 0x808080) << 1) + alo) >> 24;
return alo;
} }
#if defined(CUDA_DOUBLE_MATH_FUNCTIONS) && defined(CUDA_FLOAT_MATH_FUNCTION S) #if defined(CUDA_DOUBLE_MATH_FUNCTIONS) && defined(CUDA_FLOAT_MATH_FUNCTION S)
#error -- conflicting mode for double math routines #error -- conflicting mode for double math routines
#endif /* CUDA_DOUBLE_MATH_FUNCTIONS && CUDA_FLOAT_MATH_FUNCTIONS */ #endif /* CUDA_DOUBLE_MATH_FUNCTIONS && CUDA_FLOAT_MATH_FUNCTIONS */
#if defined(CUDA_FLOAT_MATH_FUNCTIONS) #if defined(CUDA_FLOAT_MATH_FUNCTIONS)
 End of changes. 35 change blocks. 
52 lines changed or deleted 50 lines changed or added


 device_launch_parameters.h   device_launch_parameters.h 
skipping to change at line 36 skipping to change at line 36
* and is provided to the U.S. Government only as a commercial end item. * and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein. * source code with only those rights set forth herein.
* *
* Any use of this source code in individual and commercial software must * Any use of this source code in individual and commercial software must
* include, in the user documentation and internal comments to the code, * include, in the user documentation and internal comments to the code,
* the above Disclaimer and U.S. Government End Users Notice. * the above Disclaimer and U.S. Government End Users Notice.
*/ */
#if !defined(__DEVICE_LAUCH_PARAMETERS_H__) #if !defined(__DEVICE_LAUNCH_PARAMETERS_H__)
#define __DEVICE_LAUCH_PARAMETERS_H__ #define __DEVICE_LAUNCH_PARAMETERS_H__
#include "vector_types.h" #include "vector_types.h"
#if !defined(__STORAGE__) #if !defined(__STORAGE__)
#define __STORAGE__ \ #define __STORAGE__ \
extern const extern const
#endif /* __STORAGE__ */ #endif /* __STORAGE__ */
skipping to change at line 104 skipping to change at line 104
#endif /* __cudaGet_gridDim */ #endif /* __cudaGet_gridDim */
#if !defined(__cudaGet_warpSize) #if !defined(__cudaGet_warpSize)
#define __cudaGet_warpSize() \ #define __cudaGet_warpSize() \
warpSize warpSize
#endif /* __cudaGet_warpSize */ #endif /* __cudaGet_warpSize */
#endif /* !__DEVICE_LAUCH_PARAMETERS_H__ */ #endif /* !__DEVICE_LAUNCH_PARAMETERS_H__ */
 End of changes. 2 change blocks. 
2 lines changed or deleted 2 lines changed or added


 device_runtime.h   device_runtime.h 
skipping to change at line 42 skipping to change at line 42
* include, in the user documentation and internal comments to the code, * include, in the user documentation and internal comments to the code,
* the above Disclaimer and U.S. Government End Users Notice. * the above Disclaimer and U.S. Government End Users Notice.
*/ */
#if !defined(__CUDA_INTERNAL_COMPILATION__) #if !defined(__CUDA_INTERNAL_COMPILATION__)
#define __CUDA_INTERNAL_COMPILATION__ #define __CUDA_INTERNAL_COMPILATION__
#endif /* !__CUDA_INTERNAL_COMPILATION__ */ #endif /* !__CUDA_INTERNAL_COMPILATION__ */
#include "../host_defines.h" #include "host_defines.h"
#define __texture__(type, pref, dim) \
__##pref##dim##texture
#define __no_sc__ #define __no_sc__
#define __empty_array(s) \ #define __empty_array(s) \
s s
#define __unsized_shared_data(name, type_post) \ #define __unsized_shared_data(name, type_post) \
__unsized##name __unsized##type_post __unsized##name __unsized##type_post
#define __sized_shared_data(name, type) \ #define __sized_shared_data(name, type) \
__sized##name type __sized##name type
#define __sized__shared_var(name, s, type) \ #define __sized__shared_var(name, s, type) \
name name
/*TEXTURE_TYPE*/ /*TEXTURE_TYPE*/
typedef const void* __i1texture; typedef const void *__texture_type__;
/*TEXTURE_TYPE*/ /*SURFACE_TYPE*/
typedef const void* __i2texture; typedef const void *__surface_type__;
/*TEXTURE_TYPE*/
typedef const void* __i3texture;
/*TEXTURE_TYPE*/
typedef const void* __u1texture;
/*TEXTURE_TYPE*/
typedef const void* __u2texture;
/*TEXTURE_TYPE*/
typedef const void* __u3texture;
/*TEXTURE_TYPE*/
typedef const void* __f1texture;
/*TEXTURE_TYPE*/
typedef const void* __f2texture;
/*TEXTURE_TYPE*/
typedef const void* __f3texture;
#if defined(__CUDABE__) /* cudabe compiler */ #if defined(__CUDABE__) /* cudabe compiler */
#define __pad__(f) #define __pad__(f)
#define __text__ \ #define __text__ \
__attribute__((__texture__)) __attribute__((__texture__))
#define __surf__ \
__attribute__((__surface__))
#define ___device__(sc) \ #define ___device__(sc) \
static static
#define __in__(cdecl, decl) \ #define __in__(cdecl, decl) \
__shared__ cdecl __shared__ cdecl
#define __in_type__(cdecl, decl) \ #define __in_type__(cdecl, decl) \
cdecl cdecl
#define __texture_var(name) \ #define __texture_var(name) \
name name
#define __shared_var(name, s, type) \ #define __shared_var(name, s, type) \
name name
skipping to change at line 116 skipping to change at line 102
#define __cdecl #define __cdecl
#undef __w64 #undef __w64
#define __w64 #define __w64
#elif defined(__CUDACC__) /* cudafe compiler */ #elif defined(__CUDACC__) /* cudafe compiler */
#define __loc_sc__(loc, sc) \ #define __loc_sc__(loc, sc) \
sc loc sc loc
#define __pad__(f) #define __pad__(f)
#define __text__ #define __text__
#define __surf__
#define ___device__(sc) \ #define ___device__(sc) \
sc __device__ sc __device__
#define __in__(cdecl, decl) \ #define __in__(cdecl, decl) \
decl decl
#define __in_type__(cdecl, decl) \ #define __in_type__(cdecl, decl) \
decl decl
#define __texture_var(name) \ #define __texture_var(name) \
name name
#define __shared_var(name, s, type) \ #define __shared_var(name, s, type) \
name name
skipping to change at line 152 skipping to change at line 139
struct uint3; struct uint3;
extern struct uint3* CUDARTAPI __cudaGetBlockIdxPtr(void); extern struct uint3* CUDARTAPI __cudaGetBlockIdxPtr(void);
extern void* CUDARTAPI __cudaGetSharedMem(void*); extern void* CUDARTAPI __cudaGetSharedMem(void*);
#endif /* __MULTI_CORE__ || __multi_core__ */ #endif /* __MULTI_CORE__ || __multi_core__ */
#if defined (__multi_core__) #if defined (__multi_core__)
#if defined(__GNUC__) #if defined(__GNUC__)
#if defined(__cplusplus)
extern void *alloca(size_t) throw();
#else /* __cplusplus */
extern void *alloca(size_t); extern void *alloca(size_t);
#endif /* __cplusplus */
#define __cuda_alloc(s) \ #define __cuda_alloc(s) \
alloca(s) alloca(s)
#else /* __GNUC__ */ #else /* __GNUC__ */
extern void *_alloca(size_t); extern void *_alloca(size_t);
#define __cuda_alloc(s) \ #define __cuda_alloc(s) \
_alloca(s) _alloca(s)
#endif /* __GNUC__ */ #endif /* __GNUC__ */
#endif /* __multi_core__ */ #endif /* __multi_core__ */
#if defined (__MULTI_CORE__) #if defined (__MULTI_CORE__)
#define __syncthreads \ #define ___device__(sc) \
__syncthreads_wrapper static
#define __pad__(f) \ #define __pad__(f) \
f f
#define __text__ #define __text__
#define __surf__
#define __cudaGet_blockIdx() \ #define __cudaGet_blockIdx() \
(*__cudaGetBlockIdxPtr()) (*__cudaGetBlockIdxPtr())
#define __shared_var(name, s, type) \ #define __shared_var(name, s, type) \
(s type __cudaGetSharedMem((void*)(&(name)))) (s type __cudaGetSharedMem((void*)(&(name))))
#define __var_used__ \ #define __var_used__ \
__attribute__((__used__)) __attribute__((__used__))
#define __storage_auto__shared__ \
auto
#undef __cdecl #undef __cdecl
#define __cdecl #define __cdecl
#undef __w64 #undef __w64
#define __w64 #define __w64
#else /* __MULTI_CORE__ */ #else /* __MULTI_CORE__ */
#define __shared_var(name, s, type) \
name
#endif /* __MULTI_CORE__ */
#define ___device__(sc) \ #define ___device__(sc) \
static __device__ static __device__
#define __in__(cdecl, decl) \ #define __shared_var(name, s, type) \
decl
#define __in_type__(cdecl, decl) \
decl
#define __texture_var(name) \
__texture_##name
#define __cuda_host_device_name(name) \
__cuda_host_device_##name
#define __val_param(name) \
name name
#define __copy_param(local_decl, param)
#define __unsized_array_size
#define __unsized__shared_var(name, s, type) \
(*name)
#define __unsized__empty_array(s)
#if defined(__APPLE__) #if defined(__APPLE__) || defined(__ICC)
#define __STORAGE__ \ #define __STORAGE__ \
__attribute__((__weak__)) __attribute__((__weak__))
#elif defined(__GNUC__) #elif defined(__GNUC__)
#define __STORAGE__ \ #define __STORAGE__ \
__attribute__((__common__)) __attribute__((__common__))
#elif defined(__cplusplus) #elif defined(__cplusplus)
#define __STORAGE__ \ #define __STORAGE__ \
__declspec(selectany) __declspec(selectany)
#pragma warning(disable: 4099 4190) #pragma warning(disable: 4099 4190)
#else /* __cplusplus */ #else /* __APPLE__ || __ICC */
#define __STORAGE__ #define __STORAGE__
#endif /* __APPLE__ */ #endif /* __APPLE__ || __ICC */
#endif /* __MULTI_CORE__ */
#define __in__(cdecl, decl) \
decl
#define __in_type__(cdecl, decl) \
decl
#define __texture_var(name) \
__texture_##name
#define __cuda_host_device_name(name) \
__cuda_host_device_##name
#define __val_param(name) \
name
#define __copy_param(local_decl, param)
#define __unsized_array_size
#define __unsized__shared_var(name, s, type) \
(*name)
#define __unsized__empty_array(s)
/* this is compiled with a host compiler for device emulation */ /* this is compiled with a host compiler for device emulation */
#define __device_emulation #define __device_emulation
#endif /* __CUDABE__ */ #endif /* __CUDABE__ */
#include "../builtin_types.h" #include "builtin_types.h"
#include "../device_launch_parameters.h" #include "device_launch_parameters.h"
#include "storage_class.h" #include "storage_class.h"
 End of changes. 17 change blocks. 
46 lines changed or deleted 44 lines changed or added


 driver_types.h   driver_types.h 
skipping to change at line 96 skipping to change at line 96
cudaErrorSynchronizationError, cudaErrorSynchronizationError,
cudaErrorInvalidFilterSetting, cudaErrorInvalidFilterSetting,
cudaErrorInvalidNormSetting, cudaErrorInvalidNormSetting,
cudaErrorMixedDeviceExecution, cudaErrorMixedDeviceExecution,
cudaErrorCudartUnloading, cudaErrorCudartUnloading,
cudaErrorUnknown, cudaErrorUnknown,
cudaErrorNotYetImplemented, cudaErrorNotYetImplemented,
cudaErrorMemoryValueTooLarge, cudaErrorMemoryValueTooLarge,
cudaErrorInvalidResourceHandle, cudaErrorInvalidResourceHandle,
cudaErrorNotReady, cudaErrorNotReady,
cudaErrorInsufficientDriver,
cudaErrorSetOnActiveProcess,
cudaErrorStartupFailure = 0x7f, cudaErrorStartupFailure = 0x7f,
cudaErrorApiFailureBase = 10000 cudaErrorApiFailureBase = 10000
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
enum cudaChannelFormatKind enum cudaChannelFormatKind
{ {
cudaChannelFormatKindSigned, cudaChannelFormatKindSigned,
cudaChannelFormatKindUnsigned, cudaChannelFormatKindUnsigned,
cudaChannelFormatKindFloat cudaChannelFormatKindFloat,
cudaChannelFormatKindNone
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct cudaChannelFormatDesc struct cudaChannelFormatDesc
{ {
int x; int x;
int y; int y;
int z; int z;
int w; int w;
enum cudaChannelFormatKind f; enum cudaChannelFormatKind f;
skipping to change at line 189 skipping to change at line 192
int maxThreadsPerBlock; int maxThreadsPerBlock;
int maxThreadsDim[3]; int maxThreadsDim[3];
int maxGridSize[3]; int maxGridSize[3];
int clockRate; int clockRate;
size_t totalConstMem; size_t totalConstMem;
int major; int major;
int minor; int minor;
size_t textureAlignment; size_t textureAlignment;
int deviceOverlap; int deviceOverlap;
int multiProcessorCount; int multiProcessorCount;
int __cudaReserved[40]; int kernelExecTimeoutEnabled;
int __cudaReserved[39];
}; };
#define cudaDevicePropDontCare \ #define cudaDevicePropDontCare \
{ \ { \
{'\0'}, /* char name[256]; */ \ {'\0'}, /* char name[256]; */ \
0, /* size_t totalGlobalMem; */ \ 0, /* size_t totalGlobalMem; */ \
0, /* size_t sharedMemPerBlock; */ \ 0, /* size_t sharedMemPerBlock; */ \
0, /* int regsPerBlock; */ \ 0, /* int regsPerBlock; */ \
0, /* int warpSize; */ \ 0, /* int warpSize; */ \
0, /* size_t memPitch; */ \ 0, /* size_t memPitch; */ \
0, /* int maxThreadsPerBlock; */ \ 0, /* int maxThreadsPerBlock; */ \
{0, 0, 0}, /* int maxThreadsDim[3]; */ \ {0, 0, 0}, /* int maxThreadsDim[3]; */ \
{0, 0, 0}, /* int maxGridSize[3]; */ \ {0, 0, 0}, /* int maxGridSize[3]; */ \
0, /* int clockRate; */ \ 0, /* int clockRate; */ \
0, /* size_t totalConstMem; */ \ 0, /* size_t totalConstMem; */ \
-1, /* int major; */ \ -1, /* int major; */ \
-1, /* int minor; */ \ -1, /* int minor; */ \
0, /* size_t textureAlignment; */ \ 0, /* size_t textureAlignment; */ \
-1, /* int deviceOverlap; */ \ -1, /* int deviceOverlap; */ \
0 /* int multiProcessorCount; */ \ 0, /* int multiProcessorCount; */ \
0 /* int kernelExecTimeoutEnabled */ \
} }
/************************************************************************** ***** /************************************************************************** *****
* * * *
* SHORTHAND TYPE DEFINITION USED BY RUNTIME API * * SHORTHAND TYPE DEFINITION USED BY RUNTIME API *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
typedef enum cudaError cudaError_t; typedef enum cudaError cudaError_t;
 End of changes. 4 change blocks. 
20 lines changed or deleted 25 lines changed or added


 host_config.h   host_config.h 
skipping to change at line 65 skipping to change at line 65
#include <bits/c++config.h> /* get _GLIBCXX_ATOMIC_BUILTINS */ #include <bits/c++config.h> /* get _GLIBCXX_ATOMIC_BUILTINS */
#if _GLIBCXX_ATOMIC_BUILTINS == 1 #if _GLIBCXX_ATOMIC_BUILTINS == 1
#undef _GLIBCXX_ATOMIC_BUILTINS /* for missing __sync_fetch_and_add */ #undef _GLIBCXX_ATOMIC_BUILTINS /* for missing __sync_fetch_and_add */
#endif /* _GLIBCXX_ATOMIC_BUILTINS == 1 */ #endif /* _GLIBCXX_ATOMIC_BUILTINS == 1 */
#elif defined(_WIN32) #elif defined(_WIN32)
#if _MSC_VER == 1400 #if _MSC_VER >= 1400
#undef _USE_DECLSPECS_FOR_SAL
#define _USE_DECLSPECS_FOR_SAL \
1
#if !defined(_CRT_NONSTDC_NO_WARNINGS) #if !defined(_CRT_NONSTDC_NO_WARNINGS)
#define _CRT_NONSTDC_NO_WARNINGS /* to suppress warnings */ #define _CRT_NONSTDC_NO_WARNINGS /* to suppress warnings */
#endif /* _CRT_NONSTDC_NO_WARNINGS */ #endif /* _CRT_NONSTDC_NO_WARNINGS */
#if !defined(_CRT_SECURE_NO_WARNINGS) #if !defined(_CRT_SECURE_NO_WARNINGS)
#define _CRT_SECURE_NO_WARNINGS /* to suppress warnings */ #define _CRT_SECURE_NO_WARNINGS /* to suppress warnings */
#endif /* _CRT_SECURE_NO_WARNINGS */ #endif /* _CRT_SECURE_NO_WARNINGS */
#endif /* _MSC_VER == 1400 */ #endif /* _MSC_VER >= 1400 */
#if !defined(NOMINMAX) #if !defined(NOMINMAX)
#define NOMINMAX /* min and max are part of cuda runtime */ #define NOMINMAX /* min and max are part of cuda runtime */
#endif /* !NOMINMAX */ #endif /* !NOMINMAX */
#define __THROW #define __THROW
class type_info; /* forward declaration */ /* forward declarations for windows C++ header files */
#include <stddef.h>
class type_info;
#if !defined(_CRTIMP)
#if defined(_DLL)
#define _CRTIMP \
__declspec(dllimport)
#else /* _DLL */
#define _CRTIMP
#endif /* _DLL */
#endif /* !_CRTIMP */
#if defined(_DEBUG)
#if !defined(_NATIVE_WCHAR_T_DEFINED) && defined(_M_CEE_PURE)
extern "C++"
#else /* !_NATIVE_WCHAR_T_DEFINED && _M_CEE_PURE */
extern "C"
#endif /* !_NATIVE_WCHAR_T_DEFINED && _M_CEE_PURE */
_CRTIMP void __cdecl _invalid_parameter(const wchar_t*, const wchar_t*, con
st wchar_t*, unsigned int, uintptr_t);
#else /* _DEBUG */
extern "C" _CRTIMP void __cdecl _invalid_parameter_noinfo(void);
#endif /* DEBUG */
namespace std
{
struct _Secure_char_traits_tag;
namespace _Traits_helper
{
template<class T> inline typename T::char_type *copy_s(typename T::char
_type*, size_t, const typename T::char_type*, size_t, _Secure_char_traits_t
ag);
template<class T> inline typename T::char_type *move_s(typename T::char
_type*, size_t, const typename T::char_type*, size_t, _Secure_char_traits_t
ag);
}
}
namespace stdext
{
template<class U, class V, class W> inline V unchecked_uninitialized_copy
(U, U, V, W&);
}
#endif /* __GNUC__ */ #endif /* __GNUC__ */
#endif /* __CUDACC__ */ #endif /* __CUDACC__ */
#endif /* !__HOST_CONFIG_H__ */ #endif /* !__HOST_CONFIG_H__ */
 End of changes. 3 change blocks. 
3 lines changed or deleted 51 lines changed or added


 host_defines.h   host_defines.h 
skipping to change at line 49 skipping to change at line 49
#if !defined(__GNUC__) && !defined(_WIN32) #if !defined(__GNUC__) && !defined(_WIN32)
#error --- !!! UNSUPPORTED COMPILER !!! --- #error --- !!! UNSUPPORTED COMPILER !!! ---
#elif defined(__GNUC__) #elif defined(__GNUC__)
#define __no_return__ \ #define __no_return__ \
__attribute__((__noreturn__)) __attribute__((__noreturn__))
#define __noinline__ \ #define __noinline__ \
__attribute__((__noinline__)) __attribute__((__noinline__))
#define __forceinline__ \
__inline__ __attribute__((__always_inline__))
#define __align__(n) \ #define __align__(n) \
__attribute__((__aligned__(n))) __attribute__((__aligned__(n)))
#define __thread__ \ #define __thread__ \
__thread __thread
#define __import__ #define __import__
#define __export__ #define __export__
#define __location__(a) \ #define __location__(a) \
__loc__(__attribute__((a))) __loc__(__attribute__((a)))
#define CUDARTAPI #define CUDARTAPI
skipping to change at line 78 skipping to change at line 80
#define __restrict__ #define __restrict__
#endif /* _MSC_VER >= 1400 */ #endif /* _MSC_VER >= 1400 */
#define __inline__ \ #define __inline__ \
__inline __inline
#define __no_return__ \ #define __no_return__ \
__declspec(noreturn) __declspec(noreturn)
#define __noinline__ \ #define __noinline__ \
__declspec(noinline) __declspec(noinline)
#define __forceinline__ \
__forceinline
#define __align__(n) \ #define __align__(n) \
__declspec(align(n)) __declspec(align(n))
#define __thread__ \ #define __thread__ \
__declspec(thread) __declspec(thread)
#define __import__ \ #define __import__ \
__declspec(dllimport) __declspec(dllimport)
#define __export__ \ #define __export__ \
__declspec(dllexport) __declspec(dllexport)
#define __location__(a) \ #define __location__(a) \
__loc__(__declspec(a)) __loc__(__declspec(a))
#define CUDARTAPI \ #define CUDARTAPI \
__stdcall __stdcall
#endif /* !__GNUC__ && !_WIN32 */ #endif /* !__GNUC__ && !_WIN32 */
#if defined(__CUDACC__) || defined(__CUDABE__) #if defined(__CUDACC__) || defined(__CUDABE__) || defined (__MULTI_CORE__)
#define __loc__(a) \ #define __loc__(a) \
a a
#define __builtin_align__(a) \ #define __builtin_align__(a) \
__align__(a) __align__(a)
#else /* __CUDACC__ || __CUDABE__ */ #else /* __CUDACC__ || __CUDABE__ || __MULTI_CORE__ */
#define __loc__(a) #define __loc__(a)
#define __builtin_align__(a) #define __builtin_align__(a)
#endif /* __CUDACC__ || __CUDABE__ */ #endif /* __CUDACC__ || __CUDABE__ || __MULTI_CORE__ */
#define __device__ \ #define __device__ \
__location__(__device__) __location__(__device__)
#define __host__ \ #define __host__ \
__location__(__host__) __location__(__host__)
#define __global__ \ #define __global__ \
__location__(__global__) __location__(__global__)
#define __shared__ \ #define __shared__ \
__location__(__shared__) __location__(__shared__)
#define __constant__ \ #define __constant__ \
 End of changes. 5 change blocks. 
3 lines changed or deleted 7 lines changed or added


 host_runtime.h   host_runtime.h 
skipping to change at line 76 skipping to change at line 76
#endif /* !__cplusplus */ #endif /* !__cplusplus */
#include "cuda_runtime_api.h" #include "cuda_runtime_api.h"
#include "storage_class.h" #include "storage_class.h"
#else /* !__CUDA_INTERNAL_COMPILATION__ */ #else /* !__CUDA_INTERNAL_COMPILATION__ */
#include "host_defines.h" #include "host_defines.h"
#define __cudaRegisterBinary() \ #define __cudaRegisterBinary()
__cudaFatCubinHandle = __cudaRegisterFatBinary((void*)__cudaFatCubi \
n) __cudaFatCubinHandle = __cudaRegisterFatBinary((void*)__cudaFatCubi
n); \
atexit(__cudaUnregisterBinaryUtil)
#define __cudaRegisterVariable(var, ext, size, constant, global) \ #define __cudaRegisterVariable(var, ext, size, constant, global) \
__cudaRegisterVar(__cudaFatCubinHandle, (char*)&__host##var, (char* )__device##var, __name##var, ext, size, constant, global) __cudaRegisterVar(__cudaFatCubinHandle, (char*)&__host##var, (char* )__device##var, __name##var, ext, size, constant, global)
#define __cudaRegisterGlobalTexture(tex, dim, norm, ext) \ #define __cudaRegisterGlobalTexture(tex, dim, norm, ext) \
__cudaRegisterTexture(__cudaFatCubinHandle, (const struct textureRe ference*)&tex, __tex_var(tex), #tex, dim, norm, ext) __cudaRegisterTexture(__cudaFatCubinHandle, (const struct textureRe ference*)&tex, __tex_var(tex), #tex, dim, norm, ext)
#define __cudaRegisterUnsizedShared(var) \ #define __cudaRegisterUnsizedShared(var) \
__cudaRegisterShared(__cudaFatCubinHandle, (void**)__device_var(var )) __cudaRegisterShared(__cudaFatCubinHandle, (void**)__device_var(var ))
#define __cudaRegisterSharedVariable(var, size, align, sc) \ #define __cudaRegisterSharedVariable(var, size, align, sc) \
__cudaRegisterSharedVar(__cudaFatCubinHandle, (void**)__device_var( var), size, align, sc) __cudaRegisterSharedVar(__cudaFatCubinHandle, (void**)__device_var( var), size, align, sc)
#define __cudaRegisterEntry(fun, thread_limit) \ #define __cudaRegisterEntry(fun, thread_limit) \
__cudaRegisterFunction(__cudaFatCubinHandle, (const char*)__device_ stub_##fun, (char*)__device_fun(fun), #fun, thread_limit, __ids) __cudaRegisterFunction(__cudaFatCubinHandle, (const char*)__device_ stub_##fun, (char*)__device_fun(fun), #fun, thread_limit, __ids)
#define __cudaInitArgBlock(arg) \
char __[256]; \
*(char**)&arg = __
#define __cudaSetupArg(arg, offset) \ #define __cudaSetupArg(arg, offset) \
if (cudaSetupArgument((void*)(char*)&arg, sizeof(arg), (size_t)&off set.arg - (size_t)&offset) != cudaSuccess) \ if (cudaSetupArgument((void*)(char*)&arg, sizeof(arg), (size_t)&off set->arg - (size_t)offset) != cudaSuccess) \
return return
#define __cudaLaunch(fun) \ #define __cudaLaunch(fun) \
{ volatile static char *__f; __f = fun; (void)cudaLaunch(fun); } { volatile static char *__f; __f = fun; (void)cudaLaunch(fun); }
#if defined(__cplusplus) #if defined(__cplusplus)
extern "C" { extern "C" {
#endif /* __cplusplus */ #endif /* __cplusplus */
extern void** CUDARTAPI __cudaRegisterFatBinary( extern void** CUDARTAPI __cudaRegisterFatBinary(
void *fatCubin void *fatCubin
skipping to change at line 158 skipping to change at line 162
uint3 *bid, uint3 *bid,
dim3 *bDim, dim3 *bDim,
dim3 *gDim, dim3 *gDim,
int *wSize int *wSize
); );
#if defined(__cplusplus) #if defined(__cplusplus)
} }
#endif /* __cplusplus */ #endif /* __cplusplus */
static void **__cudaFatCubinHandle; #if defined(__GNUC__) && defined(__cplusplus)
static void __cudaUnregisterBinaryUtil(void)
{
__cudaUnregisterFatBinary(__cudaFatCubinHandle);
}
#if defined(__GNUC__)
__attribute__((__destructor__)) static void __cudaUnregisterBinary(void)
{
__cudaUnregisterBinaryUtil();
}
#else /* __GNUC__ */
#if defined(_WIN64)
#pragma section(".CRT$XPU", read) extern int atexit(void(*)(void)) throw();
#else /* _WIN64 */ #else /* __GNUC__ && __cplusplus */
#pragma section(".CRT$XPU") extern int atexit(void(*)(void));
#endif /* _WIN64 */ #endif /* __GNUC__ && __cplusplus */
__declspec(allocate(".CRT$XPU")) static void **__cudaFatCubinHandle;
static void (__cdecl *__cudaUnregister[])(void) = {__cudaUnregisterBinaryUt
il};
#endif /* __GNUC__ */ static void __cudaUnregisterBinaryUtil(void)
{
__cudaUnregisterFatBinary(__cudaFatCubinHandle);
}
#if defined(__device_emulation) #if defined(__device_emulation)
#define __device_fun(fun) \ #define __device_fun(fun) \
__device_wrapper_##fun __device_wrapper_##fun
#define __device_var(var) \ #define __device_var(var) \
(char*)&var (char*)&var
#define __tex_var(var) \ #define __tex_var(var) \
&__texture_var(var) &__texture_var(var)
#define __cudaFatCubin \ #define __cudaFatCubin \
 End of changes. 10 change blocks. 
29 lines changed or deleted 19 lines changed or added


 math_constants.h   math_constants.h 
skipping to change at line 97 skipping to change at line 97
#endif /* !CUDA_NO_SM_13_DOUBLE_INTRINSICS */ #endif /* !CUDA_NO_SM_13_DOUBLE_INTRINSICS */
#define CUDART_ZERO 0.0 #define CUDART_ZERO 0.0
#define CUDART_ONE 1.0 #define CUDART_ONE 1.0
#define CUDART_SQRT_TWO 1.4142135623730951e+0 #define CUDART_SQRT_TWO 1.4142135623730951e+0
#define CUDART_SQRT_HALF 7.0710678118654757e-1 #define CUDART_SQRT_HALF 7.0710678118654757e-1
#define CUDART_THIRD 3.3333333333333333e-1 #define CUDART_THIRD 3.3333333333333333e-1
#define CUDART_TWOTHIRD 6.6666666666666667e-1 #define CUDART_TWOTHIRD 6.6666666666666667e-1
#define CUDART_PIO4 7.8539816339744828e-1 #define CUDART_PIO4 7.8539816339744828e-1
#define CUDART_PIO4_HI 7.8539816339744828e-1
#define CUDART_PIO4_LO 3.0616169978683830e-17
#define CUDART_PIO2 1.5707963267948966e+0 #define CUDART_PIO2 1.5707963267948966e+0
#define CUDART_PIO2_HI 1.5707963267948966e+0 #define CUDART_PIO2_HI 1.5707963267948966e+0
#define CUDART_PIO2_LO 6.1232339957367660e-17 #define CUDART_PIO2_LO 6.1232339957367660e-17
#define CUDART_3PIO4 2.3561944901923448e+0 #define CUDART_3PIO4 2.3561944901923448e+0
#define CUDART_2_OVER_PI 6.3661977236758138e-1 #define CUDART_2_OVER_PI 6.3661977236758138e-1
#define CUDART_PI 3.1415926535897931e+0 #define CUDART_PI 3.1415926535897931e+0
#define CUDART_PI_HI 3.1415926535897931e+0 #define CUDART_PI_HI 3.1415926535897931e+0
#define CUDART_PI_LO 1.2246467991473532e-16 #define CUDART_PI_LO 1.2246467991473532e-16
#define CUDART_SQRT_2PI_HI 2.5066282746310007e+0 #define CUDART_SQRT_2PI_HI 2.5066282746310007e+0
#define CUDART_SQRT_2PI_LO (-1.8328579980459167e-16) #define CUDART_SQRT_2PI_LO (-1.8328579980459167e-16)
 End of changes. 1 change blocks. 
0 lines changed or deleted 2 lines changed or added


 math_functions.h   math_functions.h 
skipping to change at line 413 skipping to change at line 413
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __host__ __device__ float fmaf(float, float, float) __THROW; extern __host__ __device__ float fmaf(float, float, float) __THROW;
} }
#include <math.h> #include <math.h>
#include <stdlib.h> #include <stdlib.h>
#include <cmath> #include <cmath>
#include <cstdlib> #include <cstdlib>
using namespace std;
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#if defined(__GNUC__) #if defined(__GNUC__)
#define __cuda_std_begin \
namespace std {
#define __cuda_std_end \
}
/* these are here to avoid warnings on the call graph. /* these are here to avoid warnings on the call graph.
long double is not supported on the device */ long double is not supported on the device */
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __host__ __device__ int __signbitl(long double) __THROW; extern __host__ __device__ int __signbitl(long double) __THROW;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __host__ __device__ int __isinfl(long double) __THROW; extern __host__ __device__ int __isinfl(long double) __THROW;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
extern __host__ __device__ int __isnanl(long double) __THROW; extern __host__ __device__ int __isnanl(long double) __THROW;
#if defined(__APPLE__) #if defined(__APPLE__)
skipping to change at line 482 skipping to change at line 474
{ {
extern __host__ __device__ long long int abs(long long int); extern __host__ __device__ long long int abs(long long int);
} }
namespace std namespace std
{ {
template<typename T> extern __host__ __device__ T __pow_helper(T, int); template<typename T> extern __host__ __device__ T __pow_helper(T, int);
template<typename T> extern __host__ __device__ T __cmath_power(T, unsign ed int); template<typename T> extern __host__ __device__ T __cmath_power(T, unsign ed int);
} }
using std::abs;
using std::fabs;
using std::ceil;
using std::floor;
using std::sqrt;
using std::pow;
using std::log;
using std::log10;
using std::fmod;
using std::modf;
using std::exp;
using std::frexp;
using std::ldexp;
using std::asin;
using std::sin;
using std::sinh;
using std::acos;
using std::cos;
using std::cosh;
using std::atan;
using std::atan2;
using std::tan;
using std::tanh;
#elif defined(_WIN32) #elif defined(_WIN32)
static __inline__ __host__ __device__ long long int abs(long long int a) static __inline__ __host__ __device__ long long int abs(long long int a)
{ {
return llabs(a); return llabs(a);
} }
static __inline__ __host__ __device__ int signbit(double a) static __inline__ __host__ __device__ int signbit(double a)
{ {
return __signbit(a); return __signbit(a);
skipping to change at line 529 skipping to change at line 545
static __inline__ __host__ __device__ int isfinite(double a) static __inline__ __host__ __device__ int isfinite(double a)
{ {
return __finite(a); return __finite(a);
} }
static __inline__ __host__ __device__ int isfinite(float a) static __inline__ __host__ __device__ int isfinite(float a)
{ {
return __finitef(a); return __finitef(a);
} }
#define __cuda_std_begin
#define __cuda_std_end
template<class T> extern __host__ __device__ T _Pow_int(T, int); template<class T> extern __host__ __device__ T _Pow_int(T, int);
#endif /* !_WIN32 */ #endif /* !_WIN32 */
__cuda_std_begin #if defined(__GNUC__)
namespace std {
#endif /* __GNUC__ */
extern __host__ __device__ long int abs(long int); extern __host__ __device__ long int abs(long int);
extern __host__ __device__ float abs(float); extern __host__ __device__ float abs(float);
extern __host__ __device__ double abs(double); extern __host__ __device__ double abs(double);
extern __host__ __device__ float fabs(float); extern __host__ __device__ float fabs(float);
extern __host__ __device__ float ceil(float); extern __host__ __device__ float ceil(float);
extern __host__ __device__ float floor(float); extern __host__ __device__ float floor(float);
extern __host__ __device__ float sqrt(float); extern __host__ __device__ float sqrt(float);
extern __host__ __device__ float pow(float, float); extern __host__ __device__ float pow(float, float);
extern __host__ __device__ float pow(float, int); extern __host__ __device__ float pow(float, int);
skipping to change at line 566 skipping to change at line 581
extern __host__ __device__ float sin(float); extern __host__ __device__ float sin(float);
extern __host__ __device__ float sinh(float); extern __host__ __device__ float sinh(float);
extern __host__ __device__ float acos(float); extern __host__ __device__ float acos(float);
extern __host__ __device__ float cos(float); extern __host__ __device__ float cos(float);
extern __host__ __device__ float cosh(float); extern __host__ __device__ float cosh(float);
extern __host__ __device__ float atan(float); extern __host__ __device__ float atan(float);
extern __host__ __device__ float atan2(float, float); extern __host__ __device__ float atan2(float, float);
extern __host__ __device__ float tan(float); extern __host__ __device__ float tan(float);
extern __host__ __device__ float tanh(float); extern __host__ __device__ float tanh(float);
__cuda_std_end #if defined(__GNUC__)
}
#endif /* __GNUC__ */
static __inline__ __host__ __device__ float logb(float a) static __inline__ __host__ __device__ float logb(float a)
{ {
return logbf(a); return logbf(a);
} }
static __inline__ __host__ __device__ int ilogb(float a) static __inline__ __host__ __device__ int ilogb(float a)
{ {
return ilogbf(a); return ilogbf(a);
} }
skipping to change at line 827 skipping to change at line 844
#elif !defined(__CUDACC__) #elif !defined(__CUDACC__)
#include "crt/func_macro.h" #include "crt/func_macro.h"
#define INT_MAX \ #define INT_MAX \
((int)((unsigned int)-1 >> 1)) ((int)((unsigned int)-1 >> 1))
#if defined(__GNUC__) #if defined(__GNUC__)
extern int __cuda_error_not_implememted(void); __func__(int __cuda_error_not_implememted(void));
#define __cuda___signbitl(a) \ #define __cuda___signbitl(a) \
__cuda_error_not_implememted() __cuda_error_not_implememted()
#define __cuda___isinfl(a) \ #define __cuda___isinfl(a) \
__cuda_error_not_implememted() __cuda_error_not_implememted()
#define __cuda___isnanl(a) \ #define __cuda___isnanl(a) \
__cuda_error_not_implememted() __cuda_error_not_implememted()
#if defined(__APPLE__) #if defined(__APPLE__)
skipping to change at line 989 skipping to change at line 1006
} cvt; } cvt;
cvt.d = a; cvt.d = a;
return cvt.l < 0ll; return cvt.l < 0ll;
} }
__func__(double copysign(double a, double b)) __func__(double copysign(double a, double b))
{ {
volatile union { volatile union {
double d; double d;
unsigned long long int l; unsigned long long int l;
} cvta, cvtb; } cvta, cvtb;
cvta.d = a; cvta.d = a;
cvtb.d = b; cvtb.d = b;
cvta.l = (cvta.l & 0x7fffffffffffffffULL) | (cvtb.l & 0x8000000000000000U LL); cvta.l = (cvta.l & 0x7fffffffffffffffULL) | (cvtb.l & 0x8000000000000000U LL);
return cvta.d; return cvta.d;
} }
__func__(int __signbitf(float a)) __func__(int __signbitf(float a))
{ {
return __cuda___signbitf(a); return __cuda___signbitf(a);
} }
__func__(float copysignf(float a, float b)) __func__(float copysignf(float a, float b))
{ {
skipping to change at line 1178 skipping to change at line 1197
#if defined(__LP64__) #if defined(__LP64__)
return (long int)__cuda_llrintf(a); return (long int)__cuda_llrintf(a);
#else /* __LP64__ */ #else /* __LP64__ */
return (long int)__float2int_rn(a); return (long int)__float2int_rn(a);
#endif /* __LP64__ */ #endif /* __LP64__ */
#endif /* __MULTI_CORE__ */ #endif /* __MULTI_CORE__ */
} }
__device_func__(float __cuda_nearbyintf(float a)) __device_func__(float __cuda_nearbyintf(float a))
{ {
#if defined(__CUDABE__) #if defined(__MULTI_CORE__)
return nearbyintf(a);
#elif defined(__CUDABE__)
return roundf(a); return roundf(a);
#else /* __CUDABE__ */ #else /* __CUDABE__ */
return __internal_nearbyintf(a); return __internal_nearbyintf(a);
#endif /* __CUDABE__ */ #endif /* __CUDABE__ */
} }
__device_func__(float __cuda_fmaxf(float a, float b)) __device_func__(float __cuda_fmaxf(float a, float b))
{ {
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
return fmaxf(a, b); return fmaxf(a, b);
skipping to change at line 1447 skipping to change at line 1468
res.y = e = r + s; res.y = e = r + s;
res.x = (r - e) + s; res.x = (r - e) + s;
return res; return res;
} }
__device_func__(float __internal_accurate_log2f(float a)) __device_func__(float __internal_accurate_log2f(float a))
{ {
return CUDART_L2E_F * __internal_accurate_logf(a); return CUDART_L2E_F * __internal_accurate_logf(a);
} }
/* Based on: Guillaume Da Gra
* Operators on Graphics Hardware. RNC'7, pp. 23-32, 2006.
*/
__device_func__(float2 __internal_dsmul (float2 x, float2 y))
{
float2 z;
#if !defined(__CUDABE__)
volatile float up, vp, u1, u2, v1, v2, mh, ml;
#else
float up, vp, u1, u2, v1, v2, mh, ml;
#endif /* defined(__CUDABE__) */
up = x.y * 4097.0f;
u1 = (x.y - up) + up;
u2 = x.y - u1;
vp = y.y * 4097.0f;
v1 = (y.y - vp) + vp;
v2 = y.y - v1;
mh = __fmul_rn(x.y,y.y);
ml = (((u1 * v1 - mh) + u1 * v2) + u2 * v1) + u2 * v2;
ml = (__fmul_rn(x.y,y.x) + __fmul_rn(x.x,y.y)) + ml;
z.y = up = mh + ml;
z.x = (mh - up) + ml;
return z;
}
/* 160 bits of 2/PI for Payne-Hanek style argument reduction. */ /* 160 bits of 2/PI for Payne-Hanek style argument reduction. */
static __constant__ unsigned int __cudart_i2opi_f [] = { static __constant__ unsigned int __cudart_i2opi_f [] = {
0x3c439041, 0x3c439041,
0xdb629599, 0xdb629599,
0xf534ddc0, 0xf534ddc0,
0xfc2757d1, 0xfc2757d1,
0x4e441529, 0x4e441529,
0xa2f9836e, 0xa2f9836e,
}; };
skipping to change at line 2048 skipping to change at line 2094
} }
return t; return t;
#endif /* __MULTI_CORE__ */ #endif /* __MULTI_CORE__ */
} }
__device_func__(float __cuda_acoshf(float a)) __device_func__(float __cuda_acoshf(float a))
{ {
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
return acoshf(a); return acoshf(a);
#else /* __MULTI_CORE__ */ #else /* __MULTI_CORE__ */
float s, t; float t;
t = a - 1.0f; t = a - 1.0f;
if (__cuda_fabsf(t) > CUDART_TWO_TO_23_F) { if (__cuda_fabsf(t) > CUDART_TWO_TO_23_F) {
/* for large a, acosh = log(2*a) */ /* for large a, acosh = log(2*a) */
return CUDART_LN2_F + __internal_accurate_logf(a); return CUDART_LN2_F + __internal_accurate_logf(a);
} else { } else {
s = a + 1.0f; t = t + __cuda_sqrtf(a * t + t);
t = t + __cuda_sqrtf(s * t);
return __cuda_log1pf(t); return __cuda_log1pf(t);
} }
#endif /* __MULTI_CORE__ */ #endif /* __MULTI_CORE__ */
} }
__device_func__(float __cuda_asinhf(float a)) __device_func__(float __cuda_asinhf(float a))
{ {
#if defined(__MULTI_CORE__) #if defined(__MULTI_CORE__)
return asinhf(a); return asinhf(a);
#else /* __MULTI_CORE__ */ #else /* __MULTI_CORE__ */
skipping to change at line 2880 skipping to change at line 2925
} }
__device_func__(float __internal_accurate_powf(float a, float b)) __device_func__(float __internal_accurate_powf(float a, float b))
{ {
float2 loga, prod; float2 loga, prod;
#if !defined(__CUDABE__) && defined(_MSC_VER) && !defined(_WIN64) #if !defined(__CUDABE__) && defined(_MSC_VER) && !defined(_WIN64)
volatile float t; volatile float t;
#else #else
float t; float t;
#endif #endif
#if !defined(__CUDABE__) && defined(__linux__) && !defined(__LP64__)
volatile float up, vp, u1, u2, v1, v2, mh, ml;
#else
float up, vp, u1, u2, v1, v2, mh, ml;
#endif
/* compute log(a) in double-single format*/ /* compute log(a) in double-single format*/
loga = __internal_log_ep(a); loga = __internal_log_ep(a);
/* prevent overflow during extended precision multiply */ /* prevent overflow during extended precision multiply */
if (__cuda_fabsf(b) > 1.0e34f) b *= 1.220703125e-4f; if (__cuda_fabsf(b) > 1.0e34f) b *= 1.220703125e-4f;
/* compute b * log(a) in double-single format */ prod.y = b;
up = loga.y * 4097.0f; prod.x = 0.0f;
u1 = (loga.y - up) + up; prod = __internal_dsmul (prod, loga);
u2 = loga.y - u1;
vp = b * 4097.0f; /* prevent intermediate overflow in exponentiation */
v1 = (b - vp) + vp; if (__float_as_int(prod.y) == 0x42b17218) {
v2 = b - v1; prod.y = __int_as_float(__float_as_int(prod.y) - 1);
mh = __fmul_rn(loga.y, b); prod.x = prod.x + __int_as_float(0x37000000);
ml = (((u1 * v1 - mh) + u1 * v2) + u2 * v1) + u2 * v2; }
ml = __fmul_rn(loga.x, b) + ml;
prod.y = up = mh + ml;
prod.x = (mh - up) + ml;
/* compute pow(a,b) = exp(b*log(a)) */ /* compute pow(a,b) = exp(b*log(a)) */
t = __cuda_expf(prod.y); t = __cuda_expf(prod.y);
/* prevent -INF + INF = NaN */ /* prevent -INF + INF = NaN */
if (t != CUDART_INF_F) { if (t != CUDART_INF_F) {
/* if prod.x is much smaller than prod.y, then exp(prod.y+prod.x) ~= /* if prod.x is much smaller than prod.y, then exp(prod.y+prod.x) ~=
* exp(prod.y) + prod.x * exp(prod.y) * exp(prod.y) + prod.x * exp(prod.y)
*/ */
t = t * prod.x + t; t = t * prod.x + t;
} }
 End of changes. 16 change blocks. 
36 lines changed or deleted 73 lines changed or added


 math_functions_dbl_ptx3.h   math_functions_dbl_ptx3.h 
/* /*
* Copyright 1993-2008 NVIDIA Corporation. All rights reserved. * Copyright 1993-2008 NVIDIA Corporation. All rights reserved.
* *
* NOTICE TO USER: * NOTICE TO USER:
* *
* This source code is subject to NVIDIA ownership rights under U.S. and * This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code * international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code * are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software. * in individual and commercial software.
* *
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
skipping to change at line 221 skipping to change at line 221
}; };
__device_func__(double __internal_trig_reduction_kerneld(double a, int *qua drant)) __device_func__(double __internal_trig_reduction_kerneld(double a, int *qua drant))
{ {
double j; double j;
int q; int q;
if (__cuda_fabs(a) > CUDART_TRIG_PLOSS) { if (__cuda_fabs(a) > CUDART_TRIG_PLOSS) {
/* Payne-Hanek style argument reduction. */ /* Payne-Hanek style argument reduction. */
unsigned long long int ia; unsigned long long int ia;
unsigned long long int s; unsigned long long int s;
unsigned long long int result[19]; unsigned long long int result[5];
unsigned long long int phi, plo; unsigned long long int phi, plo;
unsigned long long int hi, lo; unsigned long long int hi, lo;
unsigned int e; unsigned int e;
int idx; int idx;
ia = __double_as_longlong(a); ia = __double_as_longlong(a);
s = ia & 0x8000000000000000ULL; s = ia & 0x8000000000000000ULL;
e = (unsigned int)(((ia >> 52) & 0x7ff) - 1024); e = (unsigned int)(((ia >> 52) & 0x7ff) - 1024);
ia = (ia << 11) | 0x8000000000000000ULL; ia = (ia << 11) | 0x8000000000000000ULL;
/* compute x * 2/pi */ /* compute x * 2/pi */
skipping to change at line 233 skipping to change at line 233
unsigned long long int hi, lo; unsigned long long int hi, lo;
unsigned int e; unsigned int e;
int idx; int idx;
ia = __double_as_longlong(a); ia = __double_as_longlong(a);
s = ia & 0x8000000000000000ULL; s = ia & 0x8000000000000000ULL;
e = (unsigned int)(((ia >> 52) & 0x7ff) - 1024); e = (unsigned int)(((ia >> 52) & 0x7ff) - 1024);
ia = (ia << 11) | 0x8000000000000000ULL; ia = (ia << 11) | 0x8000000000000000ULL;
/* compute x * 2/pi */ /* compute x * 2/pi */
idx = 16 - (e >> 6); idx = 16 - (e >> 6);
hi = 0; hi = 0;
#if defined(__CUDABE__) #if defined(__CUDABE__)
#pragma unroll 1 #pragma unroll 1
#endif /* __CUDABE__ */ #endif /* __CUDABE__ */
for (q = 0; q < 18; q++) { for (q = (idx-1); q < min(18,idx+3); q++) {
plo = __cudart_i2opi_d[q] * ia; plo = __cudart_i2opi_d[q] * ia;
phi = __umul64hi (__cudart_i2opi_d[q], ia); phi = __umul64hi (__cudart_i2opi_d[q], ia);
lo = hi + plo; lo = hi + plo;
hi = phi + (lo < plo); hi = phi + (lo < plo);
result[q] = lo; result[q-(idx-1)] = lo;
} }
result[q] = hi; result[q-(idx-1)] = hi;
e = e & 63; e = e & 63;
/* shift result such that hi:lo<127:126> are the least significant /* shift result such that hi:lo<127:126> are the least significant
integer bits, and hi:lo<125:0> are the fractional bits of the result integer bits, and hi:lo<125:0> are the fractional bits of the result
*/ */
hi = result[idx+2]; hi = result[3];
lo = result[idx+1]; lo = result[2];
if (e) { if (e) {
q = 64 - e; q = 64 - e;
hi = (hi << e) | (lo >> q); hi = (hi << e) | (lo >> q);
lo = (lo << e) | (result[idx] >> q); lo = (lo << e) | (result[1] >> q);
} }
q = (int)(hi >> 62); q = (int)(hi >> 62);
/* fraction */ /* fraction */
hi = (hi << 2) | (lo >> 62); hi = (hi << 2) | (lo >> 62);
lo = (lo << 2); lo = (lo << 2);
e = (hi | (lo > 0)) > 0x8000000000000000ULL; /* fraction >= 0.5 */ e = (hi | (lo > 0)) > 0x8000000000000000ULL; /* fraction >= 0.5 */
q += e; q += e;
if (s) q = -q; if (s) q = -q;
if (e) { if (e) {
unsigned long long int t; unsigned long long int t;
skipping to change at line 338 skipping to change at line 337
z = __fma_rn (z, x2, 2.087588480545065E-009); z = __fma_rn (z, x2, 2.087588480545065E-009);
z = __fma_rn (z, x2, -2.755731555403950E-007); z = __fma_rn (z, x2, -2.755731555403950E-007);
z = __fma_rn (z, x2, 2.480158729365970E-005); z = __fma_rn (z, x2, 2.480158729365970E-005);
z = __fma_rn (z, x2, -1.388888888888074E-003); z = __fma_rn (z, x2, -1.388888888888074E-003);
z = __fma_rn (z, x2, 4.166666666666664E-002); z = __fma_rn (z, x2, 4.166666666666664E-002);
z = __fma_rn (z, x2, -5.000000000000000E-001); z = __fma_rn (z, x2, -5.000000000000000E-001);
z = __fma_rn (z, x2, 1.000000000000000E+000); z = __fma_rn (z, x2, 1.000000000000000E+000);
return z; return z;
} }
/* approximate tangent on -pi/2...+pi/2 ??? */ /* approximate tangent on -pi/4...+pi/4 */
__device_func__(double __internal_tan_kerneld(double x)) __device_func__(double __internal_tan_kerneld(double x, int i))
{ {
double x2, z; double x2, z, q;
x2 = x * x; x2 = x * x;
z = 2.08720267406905700E-005; z = 9.8006287203286300E-006;
z = __fma_rn (z, x2, -4.49127882369154970E-005); z = __fma_rn (z, x2, -2.4279526494179897E-005);
z = __fma_rn (z, x2, 8.95189835887631370E-005); z = __fma_rn (z, x2, 4.8644173130937162E-005);
z = __fma_rn (z, x2, -2.64581074021952680E-005); z = __fma_rn (z, x2, -2.5640012693782273E-005);
z = __fma_rn (z, x2, 1.36576090383605350E-004); z = __fma_rn (z, x2, 6.7223984330880073E-005);
z = __fma_rn (z, x2, 2.22355572141495130E-004); z = __fma_rn (z, x2, 8.3559287318211639E-005);
z = __fma_rn (z, x2, 5.95072833555129420E-004); z = __fma_rn (z, x2, 2.4375039850848564E-004);
z = __fma_rn (z, x2, 1.45475607963143160E-003); z = __fma_rn (z, x2, 5.8886487754856672E-004);
z = __fma_rn (z, x2, 3.59228938375760580E-003); z = __fma_rn (z, x2, 1.4560454844672040E-003);
z = __fma_rn (z, x2, 8.86321913102899350E-003); z = __fma_rn (z, x2, 3.5921008885857180E-003);
z = __fma_rn (z, x2, 2.18694896113231960E-002); z = __fma_rn (z, x2, 8.8632379218613715E-003);
z = __fma_rn (z, x2, 5.39682539265723680E-002); z = __fma_rn (z, x2, 2.1869488399337889E-002);
z = __fma_rn (z, x2, 1.33333333334148210E-001); z = __fma_rn (z, x2, 5.3968253972902704E-002);
z = __fma_rn (z, x2, 3.33333333333327760E-001); z = __fma_rn (z, x2, 1.3333333333325342E-001);
z = __fma_rn (z, x2, 3.3333333333333381E-001);
z = z * x2; z = z * x2;
z = __fma_rn (z, x, x); q = __fma_rn (z, x, x);
return z; if (i) {
double s = q - x;
double w = __fma_rn (z, x, -s); // tail of q
z = 1.0 / q;
z = -z;
s = __fma_rn (q, z, 1.0);
q = __fma_rn (z, __fma_rn (z, w, s), z);
}
return q;
} }
__device_func__(double __cuda_sqrt(double a)) __device_func__(double __cuda_sqrt(double a))
{ {
return sqrt(a); return sqrt(a);
} }
__device_func__(double __cuda_rsqrt(double a)) __device_func__(double __cuda_rsqrt(double a))
{ {
#if !defined(__CUDABE__) #if !defined(__CUDABE__)
skipping to change at line 417 skipping to change at line 425
t = __fma_rn (t, a2, 1.111112158368149E-001/256.0); t = __fma_rn (t, a2, 1.111112158368149E-001/256.0);
t = __fma_rn (t, a2, 1.428571416261528E-001/64.0); t = __fma_rn (t, a2, 1.428571416261528E-001/64.0);
t = __fma_rn (t, a2, 2.000000000069858E-001/16.0); t = __fma_rn (t, a2, 2.000000000069858E-001/16.0);
t = __fma_rn (t, a2, 3.333333333333198E-001/4.0); t = __fma_rn (t, a2, 3.333333333333198E-001/4.0);
t = t * a2; t = t * a2;
t = __fma_rn (t, a, a_2); t = __fma_rn (t, a, a_2);
t = t + a_1; t = t + a_1;
return t; return t;
} }
__device_func__(double __internal_exp2i_kernel(int b))
{
return __hiloint2double((b + 1023) << 20, 0);
}
__device_func__(double __internal_half(double a))
{
unsigned int ihi, ilo;
ilo = __double2loint(a);
ihi = __double2hiint(a);
return __hiloint2double(ihi - 0x00100000, ilo);
}
__device_func__(double __internal_twice(double a))
{
unsigned int ihi, ilo;
ilo = __double2loint(a);
ihi = __double2hiint(a);
return __hiloint2double(ihi + 0x00100000, ilo);
}
__device_func__(double __cuda_sin(double a)) __device_func__(double __cuda_sin(double a))
{ {
double z; double z;
int i; int i;
if (__cuda___isinf(a) || (a == CUDART_ZERO)) { if (__cuda___isinf(a) || (a == CUDART_ZERO)) {
return __dmul_rn(a, CUDART_ZERO); return __dmul_rn(a, CUDART_ZERO);
} }
z = __internal_trig_reduction_kerneld(a, &i); z = __internal_trig_reduction_kerneld(a, &i);
/* here, abs(z) <= pi/4, and i has the quadrant */ /* here, abs(z) <= pi/4, and i has the quadrant */
if (i & 1) { if (i & 1) {
skipping to change at line 502 skipping to change at line 531
__device_func__(double __cuda_tan(double a)) __device_func__(double __cuda_tan(double a))
{ {
double z; double z;
int i; int i;
if (__cuda___isinf(a)) { if (__cuda___isinf(a)) {
return CUDART_NAN; return CUDART_NAN;
} }
z = __internal_trig_reduction_kerneld(a, &i); z = __internal_trig_reduction_kerneld(a, &i);
/* here, abs(z) <= pi/4, and i has the quadrant */ /* here, abs(z) <= pi/4, and i has the quadrant */
z = __internal_tan_kerneld(z); z = __internal_tan_kerneld(z, i & 1);
if (i & 1) {
z = -1.0 / z;
}
return z; return z;
} }
__device_func__(double __cuda_log(double a)) __device_func__(double __cuda_log(double a))
{ {
double m, f, g, u, v, tmp, q, ulo, log_lo, log_hi; double m, f, g, u, v, tmp, q, ulo, log_lo, log_hi;
int ihi, ilo; int ihi, ilo;
int e = 0; int e = 0;
ihi = __double2hiint(a); ihi = __double2hiint(a);
skipping to change at line 545 skipping to change at line 571
e = -54; e = -54;
ihi = __double2hiint(a); ihi = __double2hiint(a);
ilo = __double2loint(a); ilo = __double2loint(a);
} }
/* a = m * 2^e. m <= sqrt(2): log2(a) = log2(m) + e. /* a = m * 2^e. m <= sqrt(2): log2(a) = log2(m) + e.
* m > sqrt(2): log2(a) = log2(m/2) + (e+1) * m > sqrt(2): log2(a) = log2(m/2) + (e+1)
*/ */
e += ((ihi >> 20) & 0x7ff) - 1023; e += ((ihi >> 20) & 0x7ff) - 1023;
m = __hiloint2double ((ihi & 0x800fffff) | 0x3ff00000, ilo); m = __hiloint2double ((ihi & 0x800fffff) | 0x3ff00000, ilo);
if (m > CUDART_SQRT_TWO) { if (m > CUDART_SQRT_TWO) {
m = m * 0.5; m = __internal_half(m);
e = e + 1; e = e + 1;
} }
/* log((1+m)/(1-m)) = 2*atanh(m). log(m) = 2*atanh ((m-1)/(m+1)) */ /* log((1+m)/(1-m)) = 2*atanh(m). log(m) = 2*atanh ((m-1)/(m+1)) */
f = m - 1.0; f = m - 1.0;
g = m + 1.0; g = m + 1.0;
g = 1.0 / g; g = 1.0 / g;
u = f * g; u = f * g;
u = u + u; u = u + u;
/* u = 2.0 * (m - 1.0) / (m + 1.0) */ /* u = 2.0 * (m - 1.0) / (m + 1.0) */
v = u * u; v = u * u;
q = 6.7261411553826339E-2/65536.0; q = 6.7261411553826339E-2/65536.0;
q = __fma_rn (q, v, 6.6133829643643394E-2/16384.0); q = __fma_rn (q, v, 6.6133829643643394E-2/16384.0);
q = __fma_rn (q, v, 7.6940931149150890E-2/4096.0); q = __fma_rn (q, v, 7.6940931149150890E-2/4096.0);
q = __fma_rn (q, v, 9.0908745692137444E-2/1024.0); q = __fma_rn (q, v, 9.0908745692137444E-2/1024.0);
q = __fma_rn (q, v, 1.1111111499059706E-1/256.0); q = __fma_rn (q, v, 1.1111111499059706E-1/256.0);
q = __fma_rn (q, v, 1.4285714283305975E-1/64.0); q = __fma_rn (q, v, 1.4285714283305975E-1/64.0);
q = __fma_rn (q, v, 2.0000000000007223E-1/16.0); q = __fma_rn (q, v, 2.0000000000007223E-1/16.0);
q = __fma_rn (q, v, 3.3333333333333326E-1/4.0); q = __fma_rn (q, v, 3.3333333333333326E-1/4.0);
tmp = 2.0 * (f - u); tmp = __internal_twice (f - u);
tmp = __fma_rn (-u, f, tmp); // tmp = remainder of division tmp = __fma_rn (-u, f, tmp); // tmp = remainder of division
ulo = g * tmp; // less significant quotient bits ulo = g * tmp; // less significant quotient bits
/* u + ulo = 2.0 * (m - 1.0) / (m + 1.0) to more than double precision */ /* u + ulo = 2.0 * (m - 1.0) / (m + 1.0) to more than double precision */
q = q * v; q = q * v;
q = q * u; q = q * u;
/* log_hi + log_lo = log(m) to more than double precision */ /* log_hi + log_lo = log(m) to more than double precision */
log_hi = u; log_hi = u;
log_lo = ulo + q; log_lo = ulo + q;
/* log_hi + log_lo = log(m)+e*log(2)=log(a) to more than double precision */ /* log_hi + log_lo = log(m)+e*log(2)=log(a) to more than double precision */
q = __fma_rn ( e, CUDART_LN2_HI, log_hi); q = __fma_rn ( e, CUDART_LN2_HI, log_hi);
tmp = __fma_rn (-e, CUDART_LN2_HI, q); tmp = __fma_rn (-e, CUDART_LN2_HI, q);
tmp = tmp - log_hi; tmp = tmp - log_hi;
log_hi = q; log_hi = q;
log_lo = log_lo - tmp; log_lo = log_lo - tmp;
log_lo = __fma_rn (e, CUDART_LN2_LO, log_lo); log_lo = __fma_rn (e, CUDART_LN2_LO, log_lo);
return log_hi + log_lo; return log_hi + log_lo;
} }
/* Requires |x.y| > |y.y|. 8 DP operations */ /* Requires |x.y| > |y.y|. 8 DP operations */
__device_func__(double2 ddadd_xgty (double2 x, double2 y)) __device_func__(double2 __internal_ddadd_xgty (double2 x, double2 y))
{ {
double2 z; double2 z;
#if defined(__GNUC__) && !defined(__CUDABE__) #if defined(__GNUC__) && !defined(__CUDABE__)
volatile double r, s, e; volatile double r, s, e;
#else #else
double r, s, e; double r, s, e;
#endif #endif
r = x.y + y.y; r = x.y + y.y;
e = x.y - r; e = x.y - r;
s = ((e + y.y) + y.x) + x.x; s = ((e + y.y) + y.x) + x.x;
z.y = e = r + s; z.y = e = r + s;
z.x = (r - e) + s; z.x = (r - e) + s;
return z; return z;
} }
/* Take full advantage of FMA. Only 8 DP operations */ /* Take full advantage of FMA. Only 8 DP operations */
__device_func__(double2 ddmul (double2 x, double2 y)) __device_func__(double2 __internal_ddmul (double2 x, double2 y))
{ {
#if defined(__GNUC__) && !defined(__CUDABE__) #if defined(__GNUC__) && !defined(__CUDABE__)
volatile double e; volatile double e;
#else #else
double e; double e;
#endif #endif
double2 t, z; double2 t, z;
t.y = x.y * y.y; t.y = x.y * y.y;
t.x = __fma_rn (x.y, y.y, -t.y); t.x = __fma_rn (x.y, y.y, -t.y);
t.x = __fma_rn (x.x, y.x, t.x); t.x = __fma_rn (x.x, y.x, t.x);
skipping to change at line 644 skipping to change at line 670
expo = (ihi >> 20) & 0x7ff; expo = (ihi >> 20) & 0x7ff;
expo -= 54; expo -= 54;
} }
expo -= 1023; expo -= 1023;
/* log(a) = log(m*2^expo) = /* log(a) = log(m*2^expo) =
log(m) + log(2)*expo, if m < sqrt(2), log(m) + log(2)*expo, if m < sqrt(2),
log(m*0.5) + log(2)*(expo+1), if m >= sqrt(2) log(m*0.5) + log(2)*(expo+1), if m >= sqrt(2)
*/ */
m = __hiloint2double((ihi & 0x800fffff) | 0x3ff00000, ilo); m = __hiloint2double((ihi & 0x800fffff) | 0x3ff00000, ilo);
if (m > CUDART_SQRT_TWO) { if (m > CUDART_SQRT_TWO) {
m = m * 0.5; m = __internal_half(m);
expo = expo + 1; expo = expo + 1;
} }
/* compute log(m) with extended precision using an algorithm derived from /* compute log(m) with extended precision using an algorithm derived from
* P.T.P. Tang, "Table Driven Implementation of the Logarithm Function", * P.T.P. Tang, "Table Driven Implementation of the Logarithm Function",
* TOMS, Vol. 16., No. 4, December 1990, pp. 378-400. A modified polynomi al * TOMS, Vol. 16., No. 4, December 1990, pp. 378-400. A modified polynomi al
* approximation to atanh(x) on the interval [-0.1716, 0.1716] is utilize d. * approximation to atanh(x) on the interval [-0.1716, 0.1716] is utilize d.
*/ */
f = m - 1.0; f = m - 1.0;
g = m + 1.0; g = m + 1.0;
g = 1.0 / g; g = 1.0 / g;
skipping to change at line 668 skipping to change at line 694
v = u * u; v = u * u;
q = 6.6253631649203309E-2/65536.0; q = 6.6253631649203309E-2/65536.0;
q = __fma_rn (q, v, 6.6250935587260612E-2/16384.0); q = __fma_rn (q, v, 6.6250935587260612E-2/16384.0);
q = __fma_rn (q, v, 7.6935437806732829E-2/4096.0); q = __fma_rn (q, v, 7.6935437806732829E-2/4096.0);
q = __fma_rn (q, v, 9.0908878711093280E-2/1024.0); q = __fma_rn (q, v, 9.0908878711093280E-2/1024.0);
q = __fma_rn (q, v, 1.1111111322892790E-1/256.0); q = __fma_rn (q, v, 1.1111111322892790E-1/256.0);
q = __fma_rn (q, v, 1.4285714284546502E-1/64.0); q = __fma_rn (q, v, 1.4285714284546502E-1/64.0);
q = __fma_rn (q, v, 2.0000000000003113E-1/16.0); q = __fma_rn (q, v, 2.0000000000003113E-1/16.0);
q = q * v; q = q * v;
/* u + ulo = 2.0 * (m - 1.0) / (m + 1.0) to more than double precision */ /* u + ulo = 2.0 * (m - 1.0) / (m + 1.0) to more than double precision */
tmp = 2.0 * (f - u); tmp = __internal_twice (f - u);
tmp = __fma_rn (-u, f, tmp); // tmp = remainder of division tmp = __fma_rn (-u, f, tmp); // tmp = remainder of division
ulo = g * tmp; // less significand quotient bits ulo = g * tmp; // less significand quotient bits
/* switch to double-double at this point */ /* switch to double-double at this point */
qq.y = q; qq.y = q;
qq.x = 0.0; qq.x = 0.0;
uu.y = u; uu.y = u;
uu.x = ulo; uu.x = ulo;
cc.y = 3.3333333333333331E-1/4.0; cc.y = 3.3333333333333331E-1/4.0;
cc.x = -9.8201492846582465E-18/4.0; cc.x = -9.8201492846582465E-18/4.0;
qq = ddadd_xgty (cc, qq); qq = __internal_ddadd_xgty (cc, qq);
/* compute log(m) in double-double format */ /* compute log(m) in double-double format */
qq = ddmul(qq, uu); qq = __internal_ddmul(qq, uu);
qq = ddmul(qq, uu); qq = __internal_ddmul(qq, uu);
qq = ddmul(qq, uu); qq = __internal_ddmul(qq, uu);
uu = ddadd_xgty (uu, qq); uu = __internal_ddadd_xgty (uu, qq);
u = uu.y; u = uu.y;
ulo = uu.x; ulo = uu.x;
/* log(2)*expo in double-double format */ /* log(2)*expo in double-double format */
tt.y = expo * 6.9314718055966296e-001; /* multiplication is exact */ tt.y = expo * 6.9314718055966296e-001; /* multiplication is exact */
tt.x = expo * 2.8235290563031577e-013; tt.x = expo * 2.8235290563031577e-013;
/* log(a) = log(m) + log(2)*expo; if expo != 0, |log(2)*expo| > |log(m)| */ /* log(a) = log(m) + log(2)*expo; if expo != 0, |log(2)*expo| > |log(m)| */
res = ddadd_xgty (tt, uu); res = __internal_ddadd_xgty (tt, uu);
return res; return res;
} }
__device_func__(double __cuda_log2(double a)) __device_func__(double __cuda_log2(double a))
{ {
double t; double t;
t = __cuda_log(a); t = __cuda_log(a);
return __fma_rn (t, CUDART_L2E_HI, t * CUDART_L2E_LO); return __fma_rn (t, CUDART_L2E_HI, t * CUDART_L2E_LO);
} }
skipping to change at line 884 skipping to change at line 910
z = 1.632386098183803E-010; z = 1.632386098183803E-010;
z = __fma_rn (z, a2, 2.504854501385687E-008); z = __fma_rn (z, a2, 2.504854501385687E-008);
z = __fma_rn (z, a2, 2.755734274788706E-006); z = __fma_rn (z, a2, 2.755734274788706E-006);
z = __fma_rn (z, a2, 1.984126976294102E-004); z = __fma_rn (z, a2, 1.984126976294102E-004);
z = __fma_rn (z, a2, 8.333333333452911E-003); z = __fma_rn (z, a2, 8.333333333452911E-003);
z = __fma_rn (z, a2, 1.666666666666606E-001); z = __fma_rn (z, a2, 1.666666666666606E-001);
z = z * a2; z = z * a2;
z = __fma_rn (z, a, a); z = __fma_rn (z, a, a);
} else if (a < 2.0) { /* work around accuracy issue in vicinity of 1.4 */ } else if (a < 2.0) { /* work around accuracy issue in vicinity of 1.4 */
z = __cuda_expm1(a); z = __cuda_expm1(a);
z = 0.5 * (z + z / (z + 1.0)); z = __internal_half (z + z / (z + 1.0));
} else { } else {
z = __internal_exp_kernel(a, -1); z = __internal_exp_kernel(a, -1);
z = z + (1.0 / (-4.0 * z)); z = z + (1.0 / (-4.0 * z));
if (a >= CUDART_LN2_X_1025) { if (a >= CUDART_LN2_X_1025) {
z = CUDART_INF; /* overflow -> infinity */ z = CUDART_INF; /* overflow -> infinity */
} }
} }
z = __cuda_copysign(z, s); z = __cuda_copysign(z, s);
return z; return z;
} }
skipping to change at line 931 skipping to change at line 957
t = __fma_rn (t, a, a); t = __fma_rn (t, a, a);
a = __cuda_copysign(t, a); a = __cuda_copysign(t, a);
} }
return a; return a;
} }
__device_func__(double __internal_atan_kernel(double a)) __device_func__(double __internal_atan_kernel(double a))
{ {
double t, a2; double t, a2;
a2 = a * a; a2 = a * a;
t = 3.078869985415414E-005; t = -2.0258553044438358E-005 ;
t = __fma_rn (t, a2, -3.242465497855030E-004); t = __fma_rn (t, a2, 2.2302240345758510E-004);
t = __fma_rn (t, a2, 1.616330732831762E-003); t = __fma_rn (t, a2, -1.1640717779930576E-003);
t = __fma_rn (t, a2, -5.109062682125205E-003); t = __fma_rn (t, a2, 3.8559749383629918E-003);
t = __fma_rn (t, a2, 1.161935628866633E-002); t = __fma_rn (t, a2, -9.1845592187165485E-003);
t = __fma_rn (t, a2, -2.056787727035855E-002); t = __fma_rn (t, a2, 1.6978035834597331E-002);
t = __fma_rn (t, a2, 3.014410555893613E-002); t = __fma_rn (t, a2, -2.5826796814495994E-002);
t = __fma_rn (t, a2, -3.868025408479192E-002); t = __fma_rn (t, a2, 3.4067811082715123E-002);
t = __fma_rn (t, a2, 4.575089047336649E-002); t = __fma_rn (t, a2, -4.0926382420509971E-002);
t = __fma_rn (t, a2, -5.205914168425584E-002); t = __fma_rn (t, a2, 4.6739496199157994E-002);
t = __fma_rn (t, a2, 5.868796955641944E-002); t = __fma_rn (t, a2, -5.2392330054601317E-002);
t = __fma_rn (t, a2, -6.664240989090695E-002); t = __fma_rn (t, a2, 5.8773077721790849E-002);
t = __fma_rn (t, a2, 7.691989357768611E-002); t = __fma_rn (t, a2, -6.6658603633512573E-002);
t = __fma_rn (t, a2, -9.090879690584197E-002); t = __fma_rn (t, a2, 7.6922129305867837E-002);
t = __fma_rn (t, a2, 1.111110931160943E-001); t = __fma_rn (t, a2, -9.0909012354005225E-002);
t = __fma_rn (t, a2, -1.428571421925286E-001); t = __fma_rn (t, a2, 1.1111110678749424E-001);
t = __fma_rn (t, a2, 1.999999999874377E-001); t = __fma_rn (t, a2, -1.4285714271334815E-001);
t = __fma_rn (t, a2, -3.333333333332495E-001); t = __fma_rn (t, a2, 1.9999999999755019E-001);
t = __fma_rn (t, a2, -3.3333333333331860E-001);
t = t * a2; t = t * a2;
t = __fma_rn (t, a, a); t = __fma_rn (t, a, a);
return t; return t;
} }
__device_func__(double __cuda_atan2(double a, double b)) __device_func__(double __cuda_atan2(double a, double b))
{ {
double t0, t1, t3; double t0, t1, t3;
if (__cuda___isnan(a) || __cuda___isnan(b)) { if (__cuda___isnan(a) || __cuda___isnan(b)) {
return a + b; return a + b;
} }
/* reduce arguments to first octant */ /* reduce arguments to first octant */
/* r = (|x| < |y|) ? (|x| / |y|) : (|y| / |x|) */ /* r = (|x| < |y|) ? (|x| / |y|) : (|y| / |x|) */
t3 = __cuda_fabs(b); t3 = __cuda_fabs(b);
t1 = __cuda_fabs(a); t1 = __cuda_fabs(a);
if (t3 == 0.0 && t1 == 0.0) { if (t3 == 0.0 && t1 == 0.0) {
t3 = __cuda___signbit(b) ? CUDART_PI : 0; t3 = __cuda___signbit(b) ? CUDART_PI : 0;
} else if (__cuda___isinf(t3) && __cuda___isinf(t1)) { } else if (__cuda___isinf(t3) && __cuda___isinf(t1)) {
t3 = __cuda___signbit(b) ? CUDART_3PIO4 : CUDART_PIO4; t3 = __cuda___signbit(b) ? CUDART_3PIO4 : CUDART_PIO4;
} else { } else {
/* can't use min, max because they do not propagate NaNs */ t0 = __cuda_fmax (t1, t3);
if (t3 < t1) { t1 = __cuda_fmin (t1, t3);
t0 = t1;
t1 = t3;
} else {
t0 = t3;
t1 = t1;
}
t3 = t1 / t0; t3 = t1 / t0;
t3 = __internal_atan_kernel(t3); t3 = __internal_atan_kernel(t3);
/* Map result according to octant. */ /* Map result according to octant. */
if (__cuda_fabs(a) > __cuda_fabs(b)) t3 = CUDART_PIO2 - t3; if (__cuda_fabs(a) > __cuda_fabs(b)) t3 = CUDART_PIO2 - t3;
if (b < 0.0) t3 = CUDART_PI - t3; if (b < 0.0) t3 = CUDART_PI - t3;
} }
t3 = __cuda_copysign(t3, a); t3 = __cuda_copysign(t3, a);
return t3; return t3;
} }
skipping to change at line 1023 skipping to change at line 1044
r = __fma_rn (r, b, 1.745227928732326E-002); r = __fma_rn (r, b, 1.745227928732326E-002);
r = __fma_rn (r, b, 1.000422754245580E-002); r = __fma_rn (r, b, 1.000422754245580E-002);
r = __fma_rn (r, b, 1.418108777515123E-002); r = __fma_rn (r, b, 1.418108777515123E-002);
r = __fma_rn (r, b, 1.733194598980628E-002); r = __fma_rn (r, b, 1.733194598980628E-002);
r = __fma_rn (r, b, 2.237350511593569E-002); r = __fma_rn (r, b, 2.237350511593569E-002);
r = __fma_rn (r, b, 3.038188875134962E-002); r = __fma_rn (r, b, 3.038188875134962E-002);
r = __fma_rn (r, b, 4.464285849810986E-002); r = __fma_rn (r, b, 4.464285849810986E-002);
r = __fma_rn (r, b, 7.499999998342270E-002); r = __fma_rn (r, b, 7.499999998342270E-002);
r = __fma_rn (r, b, 1.666666666667375E-001); r = __fma_rn (r, b, 1.666666666667375E-001);
r = r * b; r = r * b;
r = __fma_rn (r, a, a);
return r; return r;
} }
__device_func__(double __cuda_asin(double a)) __device_func__(double __cuda_asin(double a))
{ {
double t0, t1; double t0, t1;
t0 = __cuda_fabs(a); t0 = __cuda_fabs(a);
if (t0 > 1.0) { if (t0 > 1.0) {
return CUDART_NAN; return CUDART_NAN;
} }
if (t0 > 0.575) { if (t0 > 0.575) {
t1 = __fma_rn (-0.5, t0, 0.5); t1 = __fma_rn (-0.5, t0, 0.5);
t0 = __cuda_sqrt (t1); t0 = __cuda_sqrt (t1);
t1 = __internal_asin_kernel (t0, t1); t1 = __internal_asin_kernel (t0, t1);
t1 = __fma_rn (2.0, t1, -CUDART_PIO2_LO); t0 = -2.0 * t0;
t1 = CUDART_PIO2_HI - t1; t1 = __fma_rn (t0, t1, CUDART_PIO2_LO);
t0 = t0 + CUDART_PIO4_HI;
t1 = t0 + t1;
t1 = t1 + CUDART_PIO4_HI;
} else { } else {
t1 = t0 * t0; t1 = t0 * t0;
t1 = __internal_asin_kernel (t0, t1); t1 = __internal_asin_kernel (t0, t1);
t1 = __fma_rn (t1, t0, t0);
} }
return __cuda_copysign(t1, a); return __cuda_copysign(t1, a);
} }
__device_func__(double __cuda_acos(double a)) __device_func__(double __cuda_acos(double a))
{ {
double t0, t1; double t0, t1;
#if !defined(__CUDABE__) #if !defined(__CUDABE__)
if (__cuda___isnan(a)) { if (__cuda___isnan(a)) {
return a + a; return a + a;
} }
#endif #endif
t0 = __cuda_fabs (a); t0 = __cuda_fabs (a);
if (t0 > 0.575) { if (t0 > 0.575) {
t1 = __fma_rn (-0.5, t0, 0.5); t1 = __fma_rn (-0.5, t0, 0.5);
t0 = __cuda_sqrt(t1); t0 = __cuda_sqrt(t1);
t0 = __internal_asin_kernel (t0, t1); t1 = __internal_asin_kernel (t0, t1);
t0 = __fma_rn (t1, t0, t0);
t0 = 2.0 * t0; t0 = 2.0 * t0;
if (__cuda___signbit(a)) { if (__cuda___signbit(a)) {
t0 = __fma_rn (1.0, t0, -CUDART_PI_LO); t0 = __fma_rn (1.0, t0, -CUDART_PI_LO);
t0 = CUDART_PI_HI - t0; t0 = CUDART_PI_HI - t0;
} }
} else { } else {
t1 = t0 * t0; t1 = t0 * t0;
t0 = __internal_asin_kernel (t0, t1); t1 = __internal_asin_kernel (t0, t1);
t0 = __fma_rn (t1, t0, t0);
if (__cuda___signbit(a)) { if (__cuda___signbit(a)) {
t0 = __fma_rn (1.0, t0, +CUDART_PIO2_LO); t0 = __fma_rn (1.0, t0, +CUDART_PIO2_LO);
t0 = CUDART_PIO2_HI + t0; t0 = CUDART_PIO2_HI + t0;
} else { } else {
t0 = __fma_rn (1.0, t0, -CUDART_PIO2_LO); t0 = __fma_rn (1.0, t0, -CUDART_PIO2_LO);
t0 = CUDART_PIO2_HI - t0; t0 = CUDART_PIO2_HI - t0;
} }
} }
return t0; return t0;
} }
__device_func__(double __cuda_acosh(double a)) __device_func__(double __cuda_acosh(double a))
{ {
double s, t; double t;
#if !defined(__CUDABE__) #if !defined(__CUDABE__)
if (__cuda___isnan(a)) { if (__cuda___isnan(a)) {
return a + a; return a + a;
} }
#endif #endif
t = a - 1.0; t = a - 1.0;
if (__cuda_fabs(t) > CUDART_TWO_TO_52) { if (__cuda_fabs(t) > CUDART_TWO_TO_52) {
/* for large a, acosh = log(2*a) */ /* for large a, acosh = log(2*a) */
return CUDART_LN2 + __cuda_log(a); return CUDART_LN2 + __cuda_log(a);
} else { } else {
s = a + 1.0; t = t + __cuda_sqrt(__fma_rn(a, t, t));
t = t + __cuda_sqrt(s * t);
return __cuda_log1p(t); return __cuda_log1p(t);
} }
} }
__device_func__(double __cuda_asinh(double a)) __device_func__(double __cuda_asinh(double a))
{ {
#if SLIGHTLY_MORE_ACCURATE_BUT_SLOWER #if SLIGHTLY_MORE_ACCURATE_BUT_SLOWER
double fa, oofa, t; double fa, oofa, t;
fa = __cuda_fabs(a); fa = __cuda_fabs(a);
if (fa > 8.9884657373828596e+307) { /* prevent intermediate underflow */ if (fa > 8.9884657373828596e+307) { /* prevent intermediate underflow */
skipping to change at line 1449 skipping to change at line 1474
z = __fma_rn (z, x, -5.9216643735369393e-004); z = __fma_rn (z, x, -5.9216643735369393e-004);
z = __fma_rn (z, x, 6.9728137583658571e-005); z = __fma_rn (z, x, 6.9728137583658571e-005);
z = __fma_rn (z, x, 7.8403922172006662e-004); z = __fma_rn (z, x, 7.8403922172006662e-004);
z = __fma_rn (z, x, -2.2947209362139917e-004); z = __fma_rn (z, x, -2.2947209362139917e-004);
z = __fma_rn (z, x, -2.6813271604938273e-003); z = __fma_rn (z, x, -2.6813271604938273e-003);
z = __fma_rn (z, x, 3.4722222222222220e-003); z = __fma_rn (z, x, 3.4722222222222220e-003);
z = __fma_rn (z, x, 8.3333333333333329e-002); z = __fma_rn (z, x, 8.3333333333333329e-002);
z = __fma_rn (z, x, 1.0000000000000000e+000); z = __fma_rn (z, x, 1.0000000000000000e+000);
return z; return z;
} }
/* 11 DP operations */
__device_func__(double2 dbldbladd (double2 x, double2 y))
{
double2 z;
#if defined(__GNUC__) && !defined(__CUDABE__)
volatile double t1, t2, e;
#else
double t1, t2, e;
#endif
t1 = x.y + y.y;
e = t1 - x.y;
t2 = ((y.y - e) + (x.y - (t1 - e))) + x.x + y.x;
z.y = e = t1 + t2;
z.x = t2 - (e - t1);
return z;
}
__device_func__(double __internal_tgamma_stirling(double a)) __device_func__(double __internal_tgamma_stirling(double a))
{ {
if (a < 1.7162437695630274e+002) { if (a < 1.7162437695630274e+002) {
#if defined(__GNUC__) && !defined(__CUDABE__) #if defined(__GNUC__) && !defined(__CUDABE__)
volatile double t_hi, t_lo, e; volatile double t_hi, t_lo, e;
#else #else
double t_hi, t_lo, e; double t_hi, t_lo, e;
#endif #endif
double2 loga, prod; double2 loga, prod;
skipping to change at line 1491 skipping to change at line 1500
/* compute (a - 0.5) * log(a) in double-double format */ /* compute (a - 0.5) * log(a) in double-double format */
t_hi = loga.y * b; t_hi = loga.y * b;
t_lo = __fma_rn (loga.y, b, -t_hi); t_lo = __fma_rn (loga.y, b, -t_hi);
t_lo = __fma_rn (loga.x, b, t_lo); t_lo = __fma_rn (loga.x, b, t_lo);
prod.y = e = t_hi + t_lo; prod.y = e = t_hi + t_lo;
prod.x = (t_hi - e) + t_lo; prod.x = (t_hi - e) + t_lo;
/* compute (a - 0.5) * log(a) - a in double-double format */ /* compute (a - 0.5) * log(a) - a in double-double format */
loga.y = -a; loga.y = -a;
loga.x = 0.0; loga.x = 0.0;
prod = dbldbladd (prod, loga); prod = __internal_ddadd_xgty (prod, loga);
/* compute pow(a,b) = exp(b*log(a)) */ /* compute pow(a,b) = exp(b*log(a)) */
a = __cuda_exp(prod.y); a = __cuda_exp(prod.y);
/* prevent -INF + INF = NaN */ /* prevent -INF + INF = NaN */
if (!__cuda___isinf(a)) { if (!__cuda___isinf(a)) {
/* if prod.x is much smaller than prod.y, then exp(prod.y + prod.x) ~ = /* if prod.x is much smaller than prod.y, then exp(prod.y + prod.x) ~ =
* exp(prod.y) + prod.x * exp(prod.y) * exp(prod.y) + prod.x * exp(prod.y)
*/ */
a = __fma_rn (a, prod.x, a); a = __fma_rn (a, prod.x, a);
} }
skipping to change at line 1515 skipping to change at line 1524
return CUDART_INF; return CUDART_INF;
} }
} }
__device_func__(double __cuda_tgamma(double a)) __device_func__(double __cuda_tgamma(double a))
{ {
double s, xx, x = a; double s, xx, x = a;
if (__cuda___isnan(a)) { if (__cuda___isnan(a)) {
return a + a; return a + a;
} }
if (fabs(x) < 15.0) { if (__cuda_fabs(x) < 15.0) {
/* Based on: Kraemer, W.: "Berechnung der Gammafunktion G(x) fuer reel le /* Based on: Kraemer, W.: "Berechnung der Gammafunktion G(x) fuer reel le
* Punkt- und Intervallargumente". Zeitschrift fuer angewandte Mathema tik * Punkt- und Intervallargumente". Zeitschrift fuer angewandte Mathema tik
* und Mechanik, Vol. 70 (1990), No. 6, pp. 581-584 * und Mechanik, Vol. 70 (1990), No. 6, pp. 581-584
*/ */
if (x >= 0.0) { if (x >= 0.0) {
s = 1.0; s = 1.0;
xx = x; xx = x;
while (xx > 1.5) { while (xx > 1.5) {
s = __fma_rn(s, xx, -s); s = __fma_rn(s, xx, -s);
xx = xx - 1.0; xx = xx - 1.0;
skipping to change at line 1566 skipping to change at line 1575
if (x == __cuda_floor(x)) { if (x == __cuda_floor(x)) {
return CUDART_NAN; return CUDART_NAN;
} }
if (x < -185.0) { if (x < -185.0) {
int negative; int negative;
x = __cuda_floor(x); x = __cuda_floor(x);
negative = ((x - (2.0 * __cuda_floor(0.5 * x))) == 1.0); negative = ((x - (2.0 * __cuda_floor(0.5 * x))) == 1.0);
return negative ? CUDART_NEG_ZERO : CUDART_ZERO; return negative ? CUDART_NEG_ZERO : CUDART_ZERO;
} }
/* compute sin(pi*x) accurately */ /* compute sin(pi*x) accurately */
xx = __cuda_rint (2.0 * x); xx = __cuda_rint (__internal_twice(x));
quot = (int)xx; quot = (int)xx;
xx = __fma_rn (-0.5, xx, x); xx = __fma_rn (-0.5, xx, x);
xx = xx * CUDART_PI; xx = xx * CUDART_PI;
if (quot & 1) { if (quot & 1) {
xx = __internal_cos_kerneld (xx); xx = __internal_cos_kerneld (xx);
} else { } else {
xx = __internal_sin_kerneld (xx); xx = __internal_sin_kerneld (xx);
} }
if (quot & 2) { if (quot & 2) {
xx = -xx; xx = -xx;
} }
x = fabs (x); x = __cuda_fabs (x);
s = __cuda_exp (-x); s = __cuda_exp (-x);
t = x - 0.5; t = x - 0.5;
if (x > 140.0) t = 0.5 * t; if (x > 140.0) t = __internal_half(t);
t = __cuda_pow (x, t); t = __cuda_pow (x, t);
if (x > 140.0) s = s * t; if (x > 140.0) s = s * t;
s = s * __internal_stirling_poly (x); s = s * __internal_stirling_poly (x);
s = s * x; s = s * x;
s = s * xx; s = s * xx;
s = 1.0 / s; s = 1.0 / s;
s = __fma_rn (s, CUDART_SQRT_PIO2_HI, CUDART_SQRT_PIO2_LO * s); s = __fma_rn (s, CUDART_SQRT_PIO2_HI, CUDART_SQRT_PIO2_LO * s);
s = s / t; s = s / t;
return s; return s;
} }
skipping to change at line 1617 skipping to change at line 1626
*/ */
s = 1.0 / a; s = 1.0 / a;
t = s * s; t = s * s;
sum = -0.1633436431e-2; sum = -0.1633436431e-2;
sum = __fma_rn (sum, t, 0.83645878922e-3); sum = __fma_rn (sum, t, 0.83645878922e-3);
sum = __fma_rn (sum, t, -0.5951896861197e-3); sum = __fma_rn (sum, t, -0.5951896861197e-3);
sum = __fma_rn (sum, t, 0.793650576493454e-3); sum = __fma_rn (sum, t, 0.793650576493454e-3);
sum = __fma_rn (sum, t, -0.277777777735865004e-2); sum = __fma_rn (sum, t, -0.277777777735865004e-2);
sum = __fma_rn (sum, t, 0.833333333333331018375e-1); sum = __fma_rn (sum, t, 0.833333333333331018375e-1);
sum = __fma_rn (sum, s, 0.918938533204672); sum = __fma_rn (sum, s, 0.918938533204672);
s = 0.5 * __cuda_log (a); s = __internal_half(__cuda_log (a));
t = a - 0.5; t = a - 0.5;
s = s * t; s = s * t;
t = s - a; t = s - a;
s = s + sum; s = s + sum;
t = t + s; t = t + s;
return t; return t;
} else { } else {
a = a - 3.0; a = a - 3.0;
s = -4.02412642744125560E+003; s = -4.02412642744125560E+003;
s = __fma_rn (s, a, -2.97693796998962000E+005); s = __fma_rn (s, a, -2.97693796998962000E+005);
skipping to change at line 1744 skipping to change at line 1753
if (quot & 1) { if (quot & 1) {
i = __internal_cos_kerneld(i); i = __internal_cos_kerneld(i);
} else { } else {
i = __internal_sin_kerneld(i); i = __internal_sin_kerneld(i);
} }
i = __cuda_fabs(i); i = __cuda_fabs(i);
t = __cuda_log(CUDART_PI / (i * a)) - t; t = __cuda_log(CUDART_PI / (i * a)) - t;
return t; return t;
} }
__device_func__(double __internal_exp2i_kernel(int b))
{
return __hiloint2double((b + 1023) << 20, 0);
}
__device_func__(double __cuda_ldexp(double a, int b)) __device_func__(double __cuda_ldexp(double a, int b))
{ {
double fa = __cuda_fabs (a); double fa = __cuda_fabs (a);
if ((fa == CUDART_ZERO) || (fa == CUDART_INF) || (!(fa <= CUDART_INF))) { if ((fa == CUDART_ZERO) || (fa == CUDART_INF) || (!(fa <= CUDART_INF))) {
return a + a; return a + a;
} }
if (b == 0) { if (b == 0) {
return a; return a;
} }
if (b > 2200) b = 2200; if (b > 2200) b = 2200;
 End of changes. 41 change blocks. 
106 lines changed or deleted 110 lines changed or added


 sm_11_atomic_functions.h   sm_11_atomic_functions.h 
skipping to change at line 214 skipping to change at line 214
} }
#endif /* !CUDA_NO_SM_12_ATOMIC_INTRINSICS */ #endif /* !CUDA_NO_SM_12_ATOMIC_INTRINSICS */
#elif !defined(__CUDACC__) #elif !defined(__CUDACC__)
#include "crt/func_macro.h" #include "crt/func_macro.h"
#if !defined(__CUDABE__) #if !defined(__CUDABE__)
#if defined(__MULTI_CORE__)
#define __iAtomicAdd(address, val) \
__builtin___iAtomicAdd(address, val)
#define __uAtomicAdd(address, val) \
__builtin___uAtomicAdd(address, val)
#define __iAtomicExch(address, val) \
__builtin___iAtomicExch(address, val)
#define __uAtomicExch(address, val) \
__builtin___uAtomicExch(address, val)
#define __fAtomicExch(address, val) \
__builtin___fAtomicExch(address, val)
#define __iAtomicMin(address, val) \
__builtin___iAtomicMin(address, val)
#define __uAtomicMin(address, val) \
__builtin___uAtomicMin(address, val)
#define __iAtomicMax(address, val) \
__builtin___iAtomicMax(address, val)
#define __uAtomicMax(address, val) \
__builtin___uAtomicMax(address, val)
#define __uAtomicInc(address, val) \
__builtin___uAtomicInc(address, val)
#define __uAtomicDec(address, val) \
__builtin___uAtomicDec(address, val)
#define __iAtomicAnd(address, val) \
__builtin___iAtomicAnd(address, val)
#define __uAtomicAnd(address, val) \
__builtin___uAtomicAnd(address, val)
#define __iAtomicOr(address, val) \
__builtin___iAtomicOr(address, val)
#define __uAtomicOr(address, val) \
__builtin___uAtomicOr(address, val)
#define __iAtomicXor(address, val) \
__builtin___iAtomicXor(address, val)
#define __uAtomicXor(address, val) \
__builtin___uAtomicXor(address, val)
#define __iAtomicCAS(address, compare, val) \
__builtin___iAtomicCAS(address, compare, val)
#define __uAtomicCAS(address, compare, val) \
__builtin___uAtomicCAS(address, compare, val)
#else /* __MULTI_CORE__ */
__device_func__(int __iAtomicAdd(int *address, int val)) __device_func__(int __iAtomicAdd(int *address, int val))
{ {
int old = *address; int old = *address;
*address = old + val; *address = old + val;
return old; return old;
} }
__device_func__(unsigned int __uAtomicAdd(unsigned int *address, unsigned i nt val)) __device_func__(unsigned int __uAtomicAdd(unsigned int *address, unsigned i nt val))
skipping to change at line 385 skipping to change at line 428
__device_func__(unsigned int __uAtomicCAS(unsigned int *address, unsigned i nt compare, unsigned int val)) __device_func__(unsigned int __uAtomicCAS(unsigned int *address, unsigned i nt compare, unsigned int val))
{ {
unsigned int old = *address; unsigned int old = *address;
*address = old == compare ? val : old; *address = old == compare ? val : old;
return old; return old;
} }
#endif /* __MULTI_CORE__ */
#endif /* !__CUDABE__ */ #endif /* !__CUDABE__ */
#endif /* __cplusplus && __CUDACC__ */ #endif /* __cplusplus && __CUDACC__ */
#endif /* !__SM_11_ATOMIC_FUNCTIONS_H__ */ #endif /* !__SM_11_ATOMIC_FUNCTIONS_H__ */
 End of changes. 2 change blocks. 
0 lines changed or deleted 45 lines changed or added


 sm_12_atomic_functions.h   sm_12_atomic_functions.h 
skipping to change at line 107 skipping to change at line 107
} }
#endif /* !CUDA_NO_SM_12_ATOMIC_INTRINSICS */ #endif /* !CUDA_NO_SM_12_ATOMIC_INTRINSICS */
#elif !defined(__CUDACC__) #elif !defined(__CUDACC__)
#include "crt/func_macro.h" #include "crt/func_macro.h"
#if !defined(__CUDABE__) #if !defined(__CUDABE__)
#if defined(__MULTI_CORE__)
#define __ullAtomicAdd(address, val) \
__builtin___ullAtomicAdd(address, val)
#define __ullAtomicExch(address, val) \
__builtin___ullAtomicExch(address, val)
#define __ullAtomicCAS(address, compare, val) \
__builtin___ullAtomicCAS(address, compare, val)
#else /* __MULTI_CORE__ */
__device_func__(unsigned long long int __ullAtomicAdd(unsigned long long in t *address, unsigned long long int val)) __device_func__(unsigned long long int __ullAtomicAdd(unsigned long long in t *address, unsigned long long int val))
{ {
unsigned long long int old = *address; unsigned long long int old = *address;
*address = old + val; *address = old + val;
return old; return old;
} }
__device_func__(unsigned long long int __ullAtomicExch(unsigned long long i nt *address, unsigned long long int val)) __device_func__(unsigned long long int __ullAtomicExch(unsigned long long i nt *address, unsigned long long int val))
skipping to change at line 134 skipping to change at line 145
__device_func__(unsigned long long int __ullAtomicCAS(unsigned long long in t *address, unsigned long long int compare, unsigned long long int val)) __device_func__(unsigned long long int __ullAtomicCAS(unsigned long long in t *address, unsigned long long int compare, unsigned long long int val))
{ {
unsigned long long int old = *address; unsigned long long int old = *address;
*address = old == compare ? val : old; *address = old == compare ? val : old;
return old; return old;
} }
#endif /* __MULTI_CORE__ */
__device_func__(int __any(int cond)) __device_func__(int __any(int cond))
{ {
return cond; return cond;
} }
__device_func__(int __all(int cond)) __device_func__(int __all(int cond))
{ {
return cond; return cond;
} }
 End of changes. 2 change blocks. 
0 lines changed or deleted 13 lines changed or added


 sm_13_double_functions.h   sm_13_double_functions.h 
skipping to change at line 631 skipping to change at line 631
{ {
return __internal_ull2double_kernel(a, cudaRoundPosInf); return __internal_ull2double_kernel(a, cudaRoundPosInf);
} }
#endif /* !__CUDABE__ */ #endif /* !__CUDABE__ */
#if !defined(__CUDABE__) || defined(CUDA_NO_SM_13_DOUBLE_INTRINSICS) #if !defined(__CUDABE__) || defined(CUDA_NO_SM_13_DOUBLE_INTRINSICS)
__device_func__(double __internal_fma_kernel(double x, double y, double z, enum cudaRoundMode rndMode)) __device_func__(double __internal_fma_kernel(double x, double y, double z, enum cudaRoundMode rndMode))
{ {
#ifdef __MULTI_CORE__
volatile
#endif /* __MULTI_CORE__ */
struct { struct {
unsigned int lo; unsigned int lo;
unsigned int hi; unsigned int hi;
} xx, yy, zz, ww; } xx, yy, zz, ww;
unsigned int s, t, u, prod0, prod1, prod2, prod3, expo_x, expo_y, expo_z; unsigned int s, t, u, prod0, prod1, prod2, prod3, expo_x, expo_y, expo_z;
xx.hi = __double2hiint(x); xx.hi = __double2hiint(x);
xx.lo = __double2loint(x); xx.lo = __double2loint(x);
yy.hi = __double2hiint(y); yy.hi = __double2hiint(y);
yy.lo = __double2loint(y); yy.lo = __double2loint(y);
 End of changes. 1 change blocks. 
0 lines changed or deleted 3 lines changed or added


 storage_class.h   storage_class.h 
skipping to change at line 48 skipping to change at line 48
#if !defined(__var_used__) #if !defined(__var_used__)
#define __var_used__ #define __var_used__
#endif /* __var_used__ */ #endif /* __var_used__ */
#if !defined(__loc_sc__) #if !defined(__loc_sc__)
#define __loc_sc__(loc, sc) \ #define __loc_sc__(loc, sc) \
__storage##_##sc##loc __var_used__ loc __storage##_##sc##loc loc
#endif /* !__loc_sc__ */ #endif /* !__loc_sc__ */
#define __storage___device__ \ #if !defined(__storage___device__)
static #define __storage___device__ static __var_used__
#define __storage_extern__device__ \ #endif /* __storage___device__ */
static
#define __storage_auto__device__ \
@@@ COMPILER @@@ ERROR @@@
#define __storage_static__device__ \
static
#define __storage___constant__ \ #if !defined(__storage_extern__device__)
static #define __storage_extern__device__ static __var_used__
#define __storage_extern__constant__ \ #endif /* __storage_extern__device__ */
static
#define __storage_auto__constant__ \
@@@ COMPILER @@@ ERROR @@@
#define __storage_static__constant__ \
static
#define __storage___shared__ \ #if !defined(__storage_auto__device__)
static #define __storage_auto__device__ @@@ COMPILER @@@ ERROR @@@
#define __storage_extern__shared__ \ #endif /* __storage_auto__device__ */
static
#define __storage_auto__shared__ \
static
#define __storage_static__shared__ \
static
#define __storage___text__ \ #if !defined(__storage_static__device__)
static #define __storage_static__device__ static __var_used__
#define __storage_extern__text__ \ #endif /* __storage_static__device__ */
static
#define __storage_auto__text__ \ #if !defined(__storage___constant__)
@@@ COMPILER @@@ ERROR @@@ #define __storage___constant__ static __var_used__
#define __storage_static__text__ \ #endif /* __storage___constant__ */
static
#if !defined(__storage_extern__constant__)
#define __storage_extern__constant__ static __var_used__
#endif /* __storage_extern__constant__ */
#if !defined(__storage_auto__constant__)
#define __storage_auto__constant__ @@@ COMPILER @@@ ERROR @@@
#endif /* __storage_auto__constant__ */
#if !defined(__storage_static__constant__)
#define __storage_static__constant__ static __var_used__
#endif /* __storage_static__constant__ */
#if !defined(__storage___shared__)
#define __storage___shared__ static __var_used__
#endif /* __storage___shared__ */
#if !defined(__storage_extern__shared__)
#define __storage_extern__shared__ static __var_used__
#endif /* __storage_extern__shared__ */
#if !defined(__storage_auto__shared__)
#define __storage_auto__shared__ static
#endif /* __storage_auto__shared__ */
#if !defined(__storage_static__shared__)
#define __storage_static__shared__ static __var_used__
#endif /* __storage_static__shared__ */
#if !defined(__storage___text__)
#define __storage___text__ static __var_used__
#endif /* __storage___text__ */
#if !defined(__storage_extern__text__)
#define __storage_extern__text__ static __var_used__
#endif /* __storage_extern__text__ */
#if !defined(__storage_auto__text__)
#define __storage_auto__text__ @@@ COMPILER @@@ ERROR @@@
#endif /* __storage_auto__text__ */
#if !defined(__storage_static__text__)
#define __storage_static__text__ static __var_used__
#endif /* __storage_static__text__ */
#if !defined(__storage___surf__)
#define __storage___surf__ static __var_used__
#endif /* __storage___surf__ */
#if !defined(__storage_extern__surf__)
#define __storage_extern__surf__ static __var_used__
#endif /* __storage_extern__surf__ */
#if !defined(__storage_auto__surf__)
#define __storage_auto__surf__ @@@ COMPILER @@@ ERROR @@@
#endif /* __storage_auto__surf__ */
#if !defined(__storage_static__surf__)
#define __storage_static__surf__ static __var_used__
#endif /* __storage_static__surf__ */
#endif /* !__STORAGE_CLASS_H__ */ #endif /* !__STORAGE_CLASS_H__ */
 End of changes. 5 change blocks. 
33 lines changed or deleted 77 lines changed or added


 texture_fetch_functions.h   texture_fetch_functions.h 
skipping to change at line 47 skipping to change at line 47
#define __TEXTURE_FETCH_FUNCTIONS_H__ #define __TEXTURE_FETCH_FUNCTIONS_H__
#if defined(__cplusplus) && defined(__CUDACC__) #if defined(__cplusplus) && defined(__CUDACC__)
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#include "cuda_texture_types.h"
#include "host_defines.h" #include "host_defines.h"
#include "texture_types.h" #include "texture_types.h"
#include "vector_functions.h" #include "vector_functions.h"
#include "vector_types.h" #include "vector_types.h"
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
skipping to change at line 1881 skipping to change at line 1882
return make_float4(w.x, w.y, w.z, w.w); return make_float4(w.x, w.y, w.z, w.w);
} }
#elif !defined(__CUDACC__) #elif !defined(__CUDACC__)
#include "host_defines.h" #include "host_defines.h"
#include "crt/func_macro.h" #include "crt/func_macro.h"
#if defined(__CUDABE__) #if defined(__CUDABE__)
extern uint4 __utexfetchi1D(__i1texture, int4); extern uint4 __utexfetchi1D(const void*, int4);
extern int4 __itexfetchi1D(__i2texture, int4); extern int4 __itexfetchi1D(const void*, int4);
extern float4 __ftexfetchi1D(__i3texture, int4); extern float4 __ftexfetchi1D(const void*, int4);
extern uint4 __utexfetch1D(__u1texture, float4); extern uint4 __utexfetch1D(const void*, float4);
extern int4 __itexfetch1D(__u2texture, float4); extern int4 __itexfetch1D(const void*, float4);
extern float4 __ftexfetch1D(__u3texture, float4); extern float4 __ftexfetch1D(const void*, float4);
extern uint4 __utexfetch2D(__f1texture, float4); extern uint4 __utexfetch2D(const void*, float4);
extern int4 __itexfetch2D(__f2texture, float4); extern int4 __itexfetch2D(const void*, float4);
extern float4 __ftexfetch2D(__f3texture, float4); extern float4 __ftexfetch2D(const void*, float4);
extern uint4 __utexfetch3D(__f1texture, float4); extern uint4 __utexfetch3D(const void*, float4);
extern int4 __itexfetch3D(__f2texture, float4); extern int4 __itexfetch3D(const void*, float4);
extern float4 __ftexfetch3D(__f3texture, float4); extern float4 __ftexfetch3D(const void*, float4);
#define __utexfetchi(t, i) \ #define __utexfetchi(t, i) \
__utexfetchi1D(t, i) __utexfetchi1D(t, i)
#define __itexfetchi(t, i) \ #define __itexfetchi(t, i) \
__itexfetchi1D(t, i) __itexfetchi1D(t, i)
#define __ftexfetchi(t, i) \ #define __ftexfetchi(t, i) \
__ftexfetchi1D(t, i) __ftexfetchi1D(t, i)
#define __utexfetch(t, i, d) \ #define __utexfetch(t, i, d) \
__utexfetch##d##D(t, i) __utexfetch##d##D(t, i)
#define __itexfetch(t, i, d) \ #define __itexfetch(t, i, d) \
 End of changes. 2 change blocks. 
12 lines changed or deleted 13 lines changed or added


 texture_types.h   texture_types.h 
skipping to change at line 45 skipping to change at line 45
#if !defined(__TEXTURE_TYPES_H__) #if !defined(__TEXTURE_TYPES_H__)
#define __TEXTURE_TYPES_H__ #define __TEXTURE_TYPES_H__
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
#include "driver_types.h"
/**************************************************************************
*****
*
*
*
*
*
*
***************************************************************************
****/
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
enum cudaTextureAddressMode enum cudaTextureAddressMode
{ {
cudaAddressModeWrap, cudaAddressModeWrap,
cudaAddressModeClamp cudaAddressModeClamp
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
enum cudaTextureFilterMode enum cudaTextureFilterMode
{ {
skipping to change at line 76 skipping to change at line 84
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct textureReference struct textureReference
{ {
int normalized; int normalized;
enum cudaTextureFilterMode filterMode; enum cudaTextureFilterMode filterMode;
enum cudaTextureAddressMode addressMode[3]; enum cudaTextureAddressMode addressMode[3];
struct cudaChannelFormatDesc channelDesc; struct cudaChannelFormatDesc channelDesc;
int __cudaReserved[16]; int __cudaReserved[16];
}; };
#if defined(__cplusplus) && defined(__CUDACC__)
/**************************************************************************
*****
*
*
*
*
*
*
***************************************************************************
****/
#include "host_defines.h"
#include "channel_descriptor.h"
/**************************************************************************
*****
*
*
*
*
*
*
***************************************************************************
****/
/*TEXTURE_TYPE*/
template<class T, int dim = 1, enum cudaTextureReadMode = cudaReadModeEleme
ntType>
struct texture : public textureReference
{
__host__ texture(int norm = 0,
enum cudaTextureFilterMode fMode = cudaFilterModePoint,
enum cudaTextureAddressMode aMode = cudaAddressModeClamp
)
{
normalized = norm;
filterMode = fMode;
addressMode[0] = aMode;
addressMode[1] = aMode;
addressMode[2] = aMode;
channelDesc = cudaCreateChannelDesc<T>();
}
__host__ texture(int norm,
enum cudaTextureFilterMode fMode,
enum cudaTextureAddressMode aMode,
struct cudaChannelFormatDesc desc)
{
normalized = norm;
filterMode = fMode;
addressMode[0] = aMode;
addressMode[1] = aMode;
addressMode[2] = aMode;
channelDesc = desc;
}
};
#endif /* __cplusplus && __CUDACC__ */
#endif /* !__TEXTURE_TYPES_H__ */ #endif /* !__TEXTURE_TYPES_H__ */
 End of changes. 2 change blocks. 
61 lines changed or deleted 13 lines changed or added


 vector_functions.h   vector_functions.h 
skipping to change at line 238 skipping to change at line 238
static __inline__ __host__ __device__ float3 make_float3(float x, float y, float z) static __inline__ __host__ __device__ float3 make_float3(float x, float y, float z)
{ {
float3 t; t.x = x; t.y = y; t.z = z; return t; float3 t; t.x = x; t.y = y; t.z = z; return t;
} }
static __inline__ __host__ __device__ float4 make_float4(float x, float y, float z, float w) static __inline__ __host__ __device__ float4 make_float4(float x, float y, float z, float w)
{ {
float4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; float4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
} }
static __inline__ __host__ __device__ longlong1 make_longlong1(long long in
t x)
{
longlong1 t; t.x = x; return t;
}
static __inline__ __host__ __device__ ulonglong1 make_ulonglong1(unsigned l
ong long int x)
{
ulonglong1 t; t.x = x; return t;
}
static __inline__ __host__ __device__ longlong2 make_longlong2(long long in
t x, long long int y)
{
longlong2 t; t.x = x; t.y = y; return t;
}
static __inline__ __host__ __device__ ulonglong2 make_ulonglong2(unsigned l
ong long int x, unsigned long long int y)
{
ulonglong2 t; t.x = x; t.y = y; return t;
}
static __inline__ __host__ __device__ double1 make_double1(double x) static __inline__ __host__ __device__ double1 make_double1(double x)
{ {
double1 t; t.x = x; return t; double1 t; t.x = x; return t;
} }
static __inline__ __host__ __device__ double2 make_double2(double x, double y) static __inline__ __host__ __device__ double2 make_double2(double x, double y)
{ {
double2 t; t.x = x; t.y = y; return t; double2 t; t.x = x; t.y = y; return t;
} }
 End of changes. 1 change blocks. 
0 lines changed or deleted 24 lines changed or added


 vector_types.h   vector_types.h 
skipping to change at line 286 skipping to change at line 286
float x, y, z; float x, y, z;
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(16) float4 struct __builtin_align__(16) float4
{ {
float x, y, z, w; float x, y, z, w;
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct longlong1
{
long long int x;
};
/*DEVICE_BUILTIN*/
struct ulonglong1
{
unsigned long long int x;
};
/*DEVICE_BUILTIN*/
struct __builtin_align__(16) longlong2
{
long long int x, y;
};
/*DEVICE_BUILTIN*/
struct __builtin_align__(16) ulonglong2
{
unsigned long long int x, y;
};
/*DEVICE_BUILTIN*/
struct double1 struct double1
{ {
double x; double x;
}; };
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
struct __builtin_align__(16) double2 struct __builtin_align__(16) double2
{ {
double x, y; double x, y;
}; };
skipping to change at line 376 skipping to change at line 400
typedef struct ulong4 ulong4; typedef struct ulong4 ulong4;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
typedef struct float1 float1; typedef struct float1 float1;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
typedef struct float2 float2; typedef struct float2 float2;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
typedef struct float3 float3; typedef struct float3 float3;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
typedef struct float4 float4; typedef struct float4 float4;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
typedef struct longlong1 longlong1;
/*DEVICE_BUILTIN*/
typedef struct ulonglong1 ulonglong1;
/*DEVICE_BUILTIN*/
typedef struct longlong2 longlong2;
/*DEVICE_BUILTIN*/
typedef struct ulonglong2 ulonglong2;
/*DEVICE_BUILTIN*/
typedef struct double1 double1; typedef struct double1 double1;
/*DEVICE_BUILTIN*/ /*DEVICE_BUILTIN*/
typedef struct double2 double2; typedef struct double2 double2;
/************************************************************************** ***** /************************************************************************** *****
* * * *
* * * *
* * * *
*************************************************************************** ****/ *************************************************************************** ****/
 End of changes. 2 change blocks. 
0 lines changed or deleted 32 lines changed or added

This html diff was produced by rfcdiff 1.41. The latest version is available from http://tools.ietf.org/tools/rfcdiff/