You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
349 lines
12 KiB
C
349 lines
12 KiB
C
/*
|
|
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
|
|
*
|
|
* NOTICE TO LICENSEE:
|
|
*
|
|
* This source code and/or documentation ("Licensed Deliverables") are
|
|
* subject to NVIDIA intellectual property rights under U.S. and
|
|
* international Copyright laws.
|
|
*
|
|
* These Licensed Deliverables contained herein is PROPRIETARY and
|
|
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
|
|
* conditions of a form of NVIDIA software license agreement by and
|
|
* between NVIDIA and Licensee ("License Agreement") or electronically
|
|
* accepted by Licensee. Notwithstanding any terms or conditions to
|
|
* the contrary in the License Agreement, reproduction or disclosure
|
|
* of the Licensed Deliverables to any third party without the express
|
|
* written consent of NVIDIA is prohibited.
|
|
*
|
|
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
|
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
|
|
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
|
|
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
|
|
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
|
|
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
|
|
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
|
|
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
|
|
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
|
|
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
|
|
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
|
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
|
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
|
* OF THESE LICENSED DELIVERABLES.
|
|
*
|
|
* U.S. Government End Users. These Licensed Deliverables are a
|
|
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
|
|
* 1995), consisting of "commercial computer software" and "commercial
|
|
* computer software documentation" as such terms are used in 48
|
|
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
|
|
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
|
|
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
|
|
* U.S. Government End Users acquire the Licensed Deliverables with
|
|
* only those rights set forth herein.
|
|
*
|
|
* Any use of the Licensed Deliverables in individual and commercial
|
|
* software must include, in the user documentation and internal
|
|
* comments to the code, the above Disclaimer and U.S. Government End
|
|
* Users Notice.
|
|
*/
|
|
|
|
#if !defined(CU_COMPLEX_H_)
|
|
#define CU_COMPLEX_H_
|
|
|
|
#if !defined(__CUDACC_RTC__)
|
|
#if defined(__GNUC__)
|
|
#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)))
|
|
#pragma GCC diagnostic ignored "-Wunused-function"
|
|
#endif
|
|
#endif
|
|
#endif
|
|
|
|
/* When trying to include C header file in C++ Code extern "C" is required
|
|
* But the Standard QNX headers already have ifdef extern in them when compiling C++ Code
|
|
* extern "C" cannot be nested
|
|
* Hence keep the header out of extern "C" block
|
|
*/
|
|
|
|
#if !defined(__CUDACC__)
|
|
#include <math.h> /* import fabsf, sqrt */
|
|
#endif /* !defined(__CUDACC__) */
|
|
|
|
#if defined(__cplusplus)
|
|
extern "C" {
|
|
#endif /* __cplusplus */
|
|
|
|
#include "vector_types.h"
|
|
|
|
typedef float2 cuFloatComplex;
|
|
|
|
__host__ __device__ static __inline__ float cuCrealf (cuFloatComplex x)
|
|
{
|
|
return x.x;
|
|
}
|
|
|
|
__host__ __device__ static __inline__ float cuCimagf (cuFloatComplex x)
|
|
{
|
|
return x.y;
|
|
}
|
|
|
|
__host__ __device__ static __inline__ cuFloatComplex make_cuFloatComplex
|
|
(float r, float i)
|
|
{
|
|
cuFloatComplex res;
|
|
res.x = r;
|
|
res.y = i;
|
|
return res;
|
|
}
|
|
|
|
__host__ __device__ static __inline__ cuFloatComplex cuConjf (cuFloatComplex x)
|
|
{
|
|
return make_cuFloatComplex (cuCrealf(x), -cuCimagf(x));
|
|
}
|
|
__host__ __device__ static __inline__ cuFloatComplex cuCaddf (cuFloatComplex x,
|
|
cuFloatComplex y)
|
|
{
|
|
return make_cuFloatComplex (cuCrealf(x) + cuCrealf(y),
|
|
cuCimagf(x) + cuCimagf(y));
|
|
}
|
|
|
|
__host__ __device__ static __inline__ cuFloatComplex cuCsubf (cuFloatComplex x,
|
|
cuFloatComplex y)
|
|
{
|
|
return make_cuFloatComplex (cuCrealf(x) - cuCrealf(y),
|
|
cuCimagf(x) - cuCimagf(y));
|
|
}
|
|
|
|
/* This implementation could suffer from intermediate overflow even though
|
|
* the final result would be in range. However, various implementations do
|
|
* not guard against this (presumably to avoid losing performance), so we
|
|
* don't do it either to stay competitive.
|
|
*/
|
|
__host__ __device__ static __inline__ cuFloatComplex cuCmulf (cuFloatComplex x,
|
|
cuFloatComplex y)
|
|
{
|
|
cuFloatComplex prod;
|
|
prod = make_cuFloatComplex ((cuCrealf(x) * cuCrealf(y)) -
|
|
(cuCimagf(x) * cuCimagf(y)),
|
|
(cuCrealf(x) * cuCimagf(y)) +
|
|
(cuCimagf(x) * cuCrealf(y)));
|
|
return prod;
|
|
}
|
|
|
|
/* This implementation guards against intermediate underflow and overflow
|
|
* by scaling. Such guarded implementations are usually the default for
|
|
* complex library implementations, with some also offering an unguarded,
|
|
* faster version.
|
|
*/
|
|
__host__ __device__ static __inline__ cuFloatComplex cuCdivf (cuFloatComplex x,
|
|
cuFloatComplex y)
|
|
{
|
|
cuFloatComplex quot;
|
|
float s = fabsf(cuCrealf(y)) + fabsf(cuCimagf(y));
|
|
float oos = 1.0f / s;
|
|
float ars = cuCrealf(x) * oos;
|
|
float ais = cuCimagf(x) * oos;
|
|
float brs = cuCrealf(y) * oos;
|
|
float bis = cuCimagf(y) * oos;
|
|
s = (brs * brs) + (bis * bis);
|
|
oos = 1.0f / s;
|
|
quot = make_cuFloatComplex (((ars * brs) + (ais * bis)) * oos,
|
|
((ais * brs) - (ars * bis)) * oos);
|
|
return quot;
|
|
}
|
|
|
|
/*
|
|
* We would like to call hypotf(), but it's not available on all platforms.
|
|
* This discrete implementation guards against intermediate underflow and
|
|
* overflow by scaling. Otherwise we would lose half the exponent range.
|
|
* There are various ways of doing guarded computation. For now chose the
|
|
* simplest and fastest solution, however this may suffer from inaccuracies
|
|
* if sqrt and division are not IEEE compliant.
|
|
*/
|
|
__host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x)
|
|
{
|
|
float a = cuCrealf(x);
|
|
float b = cuCimagf(x);
|
|
float v, w, t;
|
|
a = fabsf(a);
|
|
b = fabsf(b);
|
|
if (a > b) {
|
|
v = a;
|
|
w = b;
|
|
} else {
|
|
v = b;
|
|
w = a;
|
|
}
|
|
t = w / v;
|
|
t = 1.0f + t * t;
|
|
t = v * sqrtf(t);
|
|
if ((v == 0.0f) || (v > 3.402823466e38f) || (w > 3.402823466e38f)) {
|
|
t = v + w;
|
|
}
|
|
return t;
|
|
}
|
|
|
|
/* Double precision */
|
|
typedef double2 cuDoubleComplex;
|
|
|
|
__host__ __device__ static __inline__ double cuCreal (cuDoubleComplex x)
|
|
{
|
|
return x.x;
|
|
}
|
|
|
|
__host__ __device__ static __inline__ double cuCimag (cuDoubleComplex x)
|
|
{
|
|
return x.y;
|
|
}
|
|
|
|
__host__ __device__ static __inline__ cuDoubleComplex make_cuDoubleComplex
|
|
(double r, double i)
|
|
{
|
|
cuDoubleComplex res;
|
|
res.x = r;
|
|
res.y = i;
|
|
return res;
|
|
}
|
|
|
|
__host__ __device__ static __inline__ cuDoubleComplex cuConj(cuDoubleComplex x)
|
|
{
|
|
return make_cuDoubleComplex (cuCreal(x), -cuCimag(x));
|
|
}
|
|
|
|
__host__ __device__ static __inline__ cuDoubleComplex cuCadd(cuDoubleComplex x,
|
|
cuDoubleComplex y)
|
|
{
|
|
return make_cuDoubleComplex (cuCreal(x) + cuCreal(y),
|
|
cuCimag(x) + cuCimag(y));
|
|
}
|
|
|
|
__host__ __device__ static __inline__ cuDoubleComplex cuCsub(cuDoubleComplex x,
|
|
cuDoubleComplex y)
|
|
{
|
|
return make_cuDoubleComplex (cuCreal(x) - cuCreal(y),
|
|
cuCimag(x) - cuCimag(y));
|
|
}
|
|
|
|
/* This implementation could suffer from intermediate overflow even though
|
|
* the final result would be in range. However, various implementations do
|
|
* not guard against this (presumably to avoid losing performance), so we
|
|
* don't do it either to stay competitive.
|
|
*/
|
|
__host__ __device__ static __inline__ cuDoubleComplex cuCmul(cuDoubleComplex x,
|
|
cuDoubleComplex y)
|
|
{
|
|
cuDoubleComplex prod;
|
|
prod = make_cuDoubleComplex ((cuCreal(x) * cuCreal(y)) -
|
|
(cuCimag(x) * cuCimag(y)),
|
|
(cuCreal(x) * cuCimag(y)) +
|
|
(cuCimag(x) * cuCreal(y)));
|
|
return prod;
|
|
}
|
|
|
|
/* This implementation guards against intermediate underflow and overflow
|
|
* by scaling. Such guarded implementations are usually the default for
|
|
* complex library implementations, with some also offering an unguarded,
|
|
* faster version.
|
|
*/
|
|
__host__ __device__ static __inline__ cuDoubleComplex cuCdiv(cuDoubleComplex x,
|
|
cuDoubleComplex y)
|
|
{
|
|
cuDoubleComplex quot;
|
|
double s = (fabs(cuCreal(y))) + (fabs(cuCimag(y)));
|
|
double oos = 1.0 / s;
|
|
double ars = cuCreal(x) * oos;
|
|
double ais = cuCimag(x) * oos;
|
|
double brs = cuCreal(y) * oos;
|
|
double bis = cuCimag(y) * oos;
|
|
s = (brs * brs) + (bis * bis);
|
|
oos = 1.0 / s;
|
|
quot = make_cuDoubleComplex (((ars * brs) + (ais * bis)) * oos,
|
|
((ais * brs) - (ars * bis)) * oos);
|
|
return quot;
|
|
}
|
|
|
|
/* This implementation guards against intermediate underflow and overflow
|
|
* by scaling. Otherwise we would lose half the exponent range. There are
|
|
* various ways of doing guarded computation. For now chose the simplest
|
|
* and fastest solution, however this may suffer from inaccuracies if sqrt
|
|
* and division are not IEEE compliant.
|
|
*/
|
|
__host__ __device__ static __inline__ double cuCabs (cuDoubleComplex x)
|
|
{
|
|
double a = cuCreal(x);
|
|
double b = cuCimag(x);
|
|
double v, w, t;
|
|
a = fabs(a);
|
|
b = fabs(b);
|
|
if (a > b) {
|
|
v = a;
|
|
w = b;
|
|
} else {
|
|
v = b;
|
|
w = a;
|
|
}
|
|
t = w / v;
|
|
t = 1.0 + t * t;
|
|
t = v * sqrt(t);
|
|
if ((v == 0.0) ||
|
|
(v > 1.79769313486231570e+308) || (w > 1.79769313486231570e+308)) {
|
|
t = v + w;
|
|
}
|
|
return t;
|
|
}
|
|
|
|
#if defined(__cplusplus)
|
|
}
|
|
#endif /* __cplusplus */
|
|
|
|
/* aliases */
|
|
typedef cuFloatComplex cuComplex;
|
|
__host__ __device__ static __inline__ cuComplex make_cuComplex (float x,
|
|
float y)
|
|
{
|
|
return make_cuFloatComplex (x, y);
|
|
}
|
|
|
|
/* float-to-double promotion */
|
|
__host__ __device__ static __inline__ cuDoubleComplex cuComplexFloatToDouble
|
|
(cuFloatComplex c)
|
|
{
|
|
return make_cuDoubleComplex ((double)cuCrealf(c), (double)cuCimagf(c));
|
|
}
|
|
|
|
__host__ __device__ static __inline__ cuFloatComplex cuComplexDoubleToFloat
|
|
(cuDoubleComplex c)
|
|
{
|
|
return make_cuFloatComplex ((float)cuCreal(c), (float)cuCimag(c));
|
|
}
|
|
|
|
|
|
__host__ __device__ static __inline__ cuComplex cuCfmaf( cuComplex x, cuComplex y, cuComplex d)
|
|
{
|
|
float real_res;
|
|
float imag_res;
|
|
|
|
real_res = (cuCrealf(x) * cuCrealf(y)) + cuCrealf(d);
|
|
imag_res = (cuCrealf(x) * cuCimagf(y)) + cuCimagf(d);
|
|
|
|
real_res = -(cuCimagf(x) * cuCimagf(y)) + real_res;
|
|
imag_res = (cuCimagf(x) * cuCrealf(y)) + imag_res;
|
|
|
|
return make_cuComplex(real_res, imag_res);
|
|
}
|
|
|
|
__host__ __device__ static __inline__ cuDoubleComplex cuCfma( cuDoubleComplex x, cuDoubleComplex y, cuDoubleComplex d)
|
|
{
|
|
double real_res;
|
|
double imag_res;
|
|
|
|
real_res = (cuCreal(x) * cuCreal(y)) + cuCreal(d);
|
|
imag_res = (cuCreal(x) * cuCimag(y)) + cuCimag(d);
|
|
|
|
real_res = -(cuCimag(x) * cuCimag(y)) + real_res;
|
|
imag_res = (cuCimag(x) * cuCreal(y)) + imag_res;
|
|
|
|
return make_cuDoubleComplex(real_res, imag_res);
|
|
}
|
|
|
|
#endif /* !defined(CU_COMPLEX_H_) */
|