cloudy  trunk
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
vectorize_hyper.cpp
Go to the documentation of this file.
1 /* This file is part of Cloudy and is copyright (C)1978-2022 by Gary J. Ferland and
2  * others. For conditions of distribution and use see copyright notice in license.txt */
3 #include "cddefines.h"
4 #include "vectorize.h"
5 #include "vectorize_math.h"
6 #include "vectorize_log_core.h"
7 #include "vectorize_hyper_core.h"
8 
9 //
10 // Written by Peter A.M. van Hoof, Royal Observatory of Belgium, Brussels
11 //
12 // this file contains vectorized versions of the single and double variants of the asinh()
13 // function. They are vectorized using AVX instructions, but also make use of AVX2, FMA,
14 // and AVX512 instructions when available. The basic algorithms for calculating the asinh()
15 // functions were somewhat simplified from the openlibm library versions available at
16 // http://openlibm.org/ which is subject to the following copyright:
17 //
18 // ====================================================
19 // Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
20 //
21 // Developed at SunSoft, a Sun Microsystems, Inc. business.
22 // Permission to use, copy, modify, and distribute this
23 // software is freely granted, provided that this notice
24 // is preserved.
25 // ====================================================
26 //
27 // an optimized version vfast_asinh() is also supplied which can only handle arguments in
28 // the domain [0,sqrt(FLT_MAX)] or [0,sqrt(DBL_MAX)].
29 //
30 
31 #ifdef __AVX__
32 
33 #ifdef __AVX512F__
34 
35 inline v8df v1asinhd(v8df x)
36 {
37  v8di ix = _mm512_castpd_si512(x);
38  v8di is = _mm512_and_epi64(ix, asinh_mask2);
39  ix = _mm512_and_epi64(ix, asinh_mask1);
40  x = _mm512_castsi512_pd(ix);
41  __mmask8 invalid = _mm512_cmp_pd_mask(x, dbl_max, _CMP_NLE_UQ);
42  if( ! _mm512_kortestz(invalid, invalid) )
43  throw domain_error( DEMsg("v1asinhd", x, invalid) );
44  v8df rh = v1logd_core(x);
45  rh = _mm512_add_pd(rh, ln_two);
46  v8df xl = _mm512_min_pd(x, asinh_2p28);
47  v8df rl = v1asinhd_core(xl);
48  v8df r = _mm512_setzero_pd();
49  __mmask8 mask = _mm512_cmp_pd_mask(xl, x, _CMP_EQ_OQ);
50  r = _mm512_mask_add_pd(rh, mask, rl, r);
51  v8di ir = _mm512_castpd_si512(r);
52  ir = _mm512_or_epi64(ir, is);
53  return _mm512_castsi512_pd(ir);
54 }
55 
56 inline v8df v1fast_asinhd(v8df x)
57 {
58  __mmask8 invalid1 = _mm512_cmp_pd_mask(x, zero, _CMP_LT_OQ);
59  __mmask8 invalid2 = _mm512_cmp_pd_mask(x, sqrt_dbl_max, _CMP_NLE_UQ);
60  if( ! _mm512_kortestz(invalid1, invalid2) )
61  {
62  __mmask8 invalid = invalid1 | invalid2;
63  throw domain_error( DEMsg("v1fast_asinhd", x, invalid) );
64  }
65  return v1asinhd_core(x);
66 }
67 
68 inline v16sf v1asinhf(v16sf x)
69 {
70  v16si ix = _mm512_castps_si512(x);
71  v16si is = _mm512_and_epi32(ix, asinh_mask2f);
72  ix = _mm512_and_epi32(ix, asinh_mask1f);
73  x = _mm512_castsi512_ps(ix);
74  __mmask16 invalid = _mm512_cmp_ps_mask(x, flt_max, _CMP_NLE_UQ);
75  if( ! _mm512_kortestz(invalid, invalid) )
76  throw domain_error( DEMsg("v1asinhf", x, invalid) );
77  v16sf rh = v1logf_core(x);
78  rh = _mm512_add_ps(rh, ln_twof);
79  v16sf xl = _mm512_min_ps(x, asinhf_2p28);
80  v16sf rl = v1asinhf_core(xl);
81  v16sf r = _mm512_setzero_ps();
82  __mmask16 mask = _mm512_cmp_ps_mask(xl, x, _CMP_EQ_OQ);
83  r = _mm512_mask_add_ps(rh, mask, rl, r);
84  v16si ir = _mm512_castps_si512(r);
85  ir = _mm512_or_epi32(ir, is);
86  return _mm512_castsi512_ps(ir);
87 }
88 
89 inline v16sf v1fast_asinhf(v16sf x)
90 {
91  __mmask16 invalid1 = _mm512_cmp_ps_mask(x, zerof, _CMP_LT_OQ);
92  __mmask16 invalid2 = _mm512_cmp_ps_mask(x, sqrt_flt_max, _CMP_NLE_UQ);
93  if( ! _mm512_kortestz(invalid1, invalid2) )
94  {
95  __mmask16 invalid = invalid1 | invalid2;
96  throw domain_error( DEMsg("v1fast_asinhf", x, invalid) );
97  }
98  return v1asinhf_core(x);
99 }
100 
101 #else
102 
103 inline v4df v1asinhd(v4df x)
104 {
105  v4df signbit = x;
106  v4df mask1 = _mm256_castsi256_pd(asinh_mask1);
107  x = _mm256_and_pd(x, mask1);
108  v4df invalid = _mm256_cmp_pd(x, dbl_max, _CMP_NLE_UQ);
109  if( ! _mm256_testz_pd(invalid, invalid) )
110  throw domain_error( DEMsg("v1asinhd", x, invalid) );
111  mask1 = _mm256_castsi256_pd(asinh_mask2);
112  signbit = _mm256_and_pd(signbit, mask1);
113  // use the approximation for large arguments: asinh(x) ~ log(x) + log(2)
114  // this formula can be safely used up to x = DBL_MAX, but is not correct for x < 2^28
115  v4df rh = v1logd_core(x);
116  rh = _mm256_add_pd(rh, ln_two);
117  // use the expression for small arguments: asinh(x) = log1p(x + x^2/(1 + sqrt(1+x^2)))
118  // this formula is exact, but will overflow for x > sqrt(DBL_MAX), hence the use of xl
119  v4df xl = _mm256_min_pd(x, asinh_2p28);
120  v4df rl = v1asinhd_core(xl);
121  v4df mask = _mm256_cmp_pd(xl, x, _CMP_EQ_OQ);
122  v4df r = _mm256_blendv_pd(rh, rl, mask);
123  return _mm256_or_pd(r, signbit);
124 }
125 
126 inline v4df v1fast_asinhd(v4df x)
127 {
128  v4df invalid1 = _mm256_cmp_pd(x, zero, _CMP_LT_OQ);
129  v4df invalid2 = _mm256_cmp_pd(x, sqrt_dbl_max, _CMP_NLE_UQ);
130  v4df invalid = _mm256_or_pd(invalid1, invalid2);
131  if( ! _mm256_testz_pd(invalid, invalid) )
132  throw domain_error( DEMsg("v1fast_asinhd", x, invalid) );
133  return v1asinhd_core(x);
134 }
135 
136 inline v8sf v1asinhf(v8sf x)
137 {
138  v8sf signbit = x;
139  v8sf mask1 = _mm256_castsi256_ps(asinh_mask1f);
140  x = _mm256_and_ps(x, mask1);
141  v8sf invalid = _mm256_cmp_ps(x, flt_max, _CMP_NLE_UQ);
142  if( ! _mm256_testz_ps(invalid, invalid) )
143  throw domain_error( DEMsg("v1asinhf", x, invalid) );
144  mask1 = _mm256_castsi256_ps(asinh_mask2f);
145  signbit = _mm256_and_ps(signbit, mask1);
146  v8sf rh = v1logf_core(x);
147  rh = _mm256_add_ps(rh, ln_twof);
148  v8sf xl = _mm256_min_ps(x, asinhf_2p28);
149  v8sf rl = v1asinhf_core(xl);
150  v8sf mask = _mm256_cmp_ps(xl, x, _CMP_EQ_OQ);
151  v8sf r = _mm256_blendv_ps(rh, rl, mask);
152  return _mm256_or_ps(r, signbit);
153 }
154 
155 inline v8sf v1fast_asinhf(v8sf x)
156 {
157  v8sf invalid1 = _mm256_cmp_ps(x, zerof, _CMP_LT_OQ);
158  v8sf invalid2 = _mm256_cmp_ps(x, sqrt_flt_max, _CMP_NLE_UQ);
159  v8sf invalid = _mm256_or_ps(invalid1, invalid2);
160  if( ! _mm256_testz_ps(invalid, invalid) )
161  throw domain_error( DEMsg("v1fast_asinhf", x, invalid) );
162  return v1asinhf_core(x);
163 }
164 
165 #endif // __AVX512F__
166 
167 #else
168 
// stub routines for builds without AVX support -- they only exist so the
// vecfun() calls below still compile; they should never actually be called
inline int v1asinhd(int)
{
	return 0;
}
inline int v1fast_asinhd(int)
{
	return 0;
}
inline int v1asinhf(int)
{
	return 0;
}
inline int v1fast_asinhf(int)
{
	return 0;
}
174 
175 #endif // __AVX__
176 
177 // wrapper routines to give math functions C++ linkage
178 // this prevents warnings from the Oracle Studio compiler
179 inline double wr_asinhd(double x)
180 {
181  return asinh(x);
182 }
183 
185 {
186  return asinhf(x);
187 }
188 
189 // Fall-backs for non-AVX hardware -- not actually fast
// scalar fall-back used on non-AVX hardware -- despite the name it is
// simply the library asinh(), not actually faster
inline double fast_asinh(double x)
{
	double r = asinh(x);
	return r;
}
194 
196 {
197  return asinhf(x);
198 }
199 
// vectorized asinh() for an array of doubles: y[i] = asinh(x[i]) over the
// index range given by nlo/nhi -- NOTE(review): confirm inclusive/exclusive
// bounds convention against vecfun() in vectorize.h
void vasinh(const double x[], double y[], long nlo, long nhi)
{
	DEBUG_ENTRY( "vasinh()" );

	// vecfun() combines the scalar wrapper wr_asinhd() with the
	// vectorized kernel v1asinhd() defined above
	vecfun( x, y, nlo, nhi, wr_asinhd, v1asinhd );
}
206 
// vectorized fast asinh() for an array of doubles; per the header comment the
// arguments must lie in [0,sqrt(DBL_MAX)], otherwise the kernel throws domain_error
void vfast_asinh(const double x[], double y[], long nlo, long nhi)
{
	DEBUG_ENTRY( "vfast_asinh()" );

	// scalar fall-back is the plain wrapper; the speed gain is in v1fast_asinhd()
	vecfun( x, y, nlo, nhi, wr_asinhd, v1fast_asinhd );
}
213 
// vectorized asinh() for an array of floats: y[i] = asinhf(x[i]) over the
// index range given by nlo/nhi
void vasinh(const sys_float x[], sys_float y[], long nlo, long nhi)
{
	DEBUG_ENTRY( "vasinh()" );

	// vecfun() combines the scalar wrapper wr_asinhf() with the
	// vectorized kernel v1asinhf() defined above
	vecfun( x, y, nlo, nhi, wr_asinhf, v1asinhf );
}
220 
// vectorized fast asinh() for an array of floats; per the header comment the
// arguments must lie in [0,sqrt(FLT_MAX)], otherwise the kernel throws domain_error
void vfast_asinh(const sys_float x[], sys_float y[], long nlo, long nhi)
{
	DEBUG_ENTRY( "vfast_asinh()" );

	// scalar fall-back is the plain wrapper; the speed gain is in v1fast_asinhf()
	vecfun( x, y, nlo, nhi, wr_asinhf, v1fast_asinhf );
}
227 
// fixed-width convenience overloads: each takes 4 or 8 scalar arguments and
// stores the results in y[0..3] / y[0..7]
// NOTE(review): the V1FUN_PD_* macros presumably gather the scalars, pad with
// the supplied neutral value (0.), and invoke the array version -- see vectorize_math.h

// y[0..3] = asinh(x0..x3)
void vasinh(double *y, double x0, double x1, double x2, double x3)
{
	V1FUN_PD_4(asinh, 0.);
}

// y[0..3] = fast_asinh(x0..x3); arguments must be in [0,sqrt(DBL_MAX)]
void vfast_asinh(double *y, double x0, double x1, double x2, double x3)
{
	V1FUN_PD_4(fast_asinh, 0.);
}

// y[0..7] = asinh(x0..x7)
void vasinh(double *y, double x0, double x1, double x2, double x3, double x4, double x5, double x6, double x7)
{
	V1FUN_PD_8(asinh, 0.);
}

// y[0..7] = fast_asinh(x0..x7); arguments must be in [0,sqrt(DBL_MAX)]
void vfast_asinh(double *y, double x0, double x1, double x2, double x3, double x4, double x5, double x6, double x7)
{
	V1FUN_PD_8(fast_asinh, 0.);
}
247 
249 {
250  V1FUN_PS_4(asinh, 0.f);
251 }
252 
254 {
255  V1FUN_PS_4(fast_asinh, 0.f);
256 }
257 
259  sys_float x6, sys_float x7)
260 {
261  V1FUN_PS_8(asinh, 0.f);
262 }
263 
265  sys_float x6, sys_float x7)
266 {
267  V1FUN_PS_8(fast_asinh, 0.f);
268 }
269 
271  sys_float x6, sys_float x7, sys_float x8, sys_float x9, sys_float x10, sys_float x11, sys_float x12,
272  sys_float x13, sys_float x14, sys_float x15)
273 {
274  V1FUN_PS_16(asinh, 0.f);
275 }
276 
278  sys_float x6, sys_float x7, sys_float x8, sys_float x9, sys_float x10, sys_float x11, sys_float x12,
279  sys_float x13, sys_float x14, sys_float x15)
280 {
281  V1FUN_PS_16(fast_asinh, 0.f);
282 }
#define V1FUN_PD_8(FUN, V)
sys_float wr_asinhf(sys_float x)
int v1fast_asinhf(int)
static double x2[63]
static double x1[83]
int v1asinhd(int)
void vfast_asinh(const double x[], double y[], long nlo, long nhi)
int v1fast_asinhd(int)
sys_float fast_asinhf(sys_float x)
void zero(void)
Definition: zero.cpp:43
void vecfun(const T x[], T y[], long nlo, long nhi, T(*scalfun1)(T), V(*)(V))
static double x0[83]
float sys_float
Definition: cddefines.h:127
int v1asinhf(int)
#define V1FUN_PS_4(FUN, V)
void vasinh(const double x[], double y[], long nlo, long nhi)
#define V1FUN_PD_4(FUN, V)
double wr_asinhd(double x)
#define DEBUG_ENTRY(funcname)
Definition: cddefines.h:723
double fast_asinh(double x)
#define V1FUN_PS_8(FUN, V)
#define V1FUN_PS_16(FUN, V)