cloudy  trunk
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
vectorize_exp.cpp
Go to the documentation of this file.
1 /* This file is part of Cloudy and is copyright (C)1978-2022 by Gary J. Ferland and
2  * others. For conditions of distribution and use see copyright notice in license.txt */
3 #include "cddefines.h"
4 #include "vectorize.h"
5 #include "vectorize_math.h"
6 #include "vectorize_exp_core.h"
7 
8 //
9 // Written by Peter A.M. van Hoof, Royal Observatory of Belgium, Brussels
10 //
11 // This file contains vectorized versions of the single and double variants of the exp()
12 // function. They are vectorized using AVX instructions, but also make use of AVX2, FMA,
13 // and AVX512 instructions when available. The basic algorithms for calculating the exp()
14 // functions were slightly modified from versions taken from the Cephes library written by
15 // Stephen L. Moshier, available at: http://www.netlib.org/cephes/
16 //
17 // The algorithms for calculating the expm1() functions are simplified versions of the
18 // openlibm routines available at http://openlibm.org/ which is subject to the following
19 // copyright:
20 //
21 // ====================================================
22 // Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
23 //
24 // Developed at SunSoft, a Sun Microsystems, Inc. business.
25 // Permission to use, copy, modify, and distribute this
26 // software is freely granted, provided that this notice
27 // is preserved.
28 // ====================================================
29 //
30 // This work was also inspired by similar routines written by Giovanni Garberoglio which
31 // are available at: http://software-lisc.fbk.eu/avx_mathfun/
32 //
33 
34 #ifdef __AVX__
35 
36 #ifdef __AVX512F__
37 
38 inline v8df v1expd(v8df x)
39 {
40  __mmask8 invalid = _mm512_cmp_pd_mask(x, exp_max_arg, _CMP_NLE_UQ);
41  if( ! _mm512_kortestz(invalid, invalid) )
42  throw domain_error( DEMsg("v1expd", x, invalid) );
43  x = _mm512_max_pd(exp_min_arg, x);
44  v8si m = v1expd_core(x);
45  v8df p2n = v1pow2d_core(m);
46  return _mm512_mul_pd(x, p2n);
47 }
48 
49 inline v8df v1exp10d(v8df x)
50 {
51  __mmask8 invalid = _mm512_cmp_pd_mask(x, exp10_max_arg, _CMP_NLE_UQ);
52  if( ! _mm512_kortestz(invalid, invalid) )
53  throw domain_error( DEMsg("v1exp10d", x, invalid) );
54  x = _mm512_max_pd(exp10_min_arg, x);
55  v8df h = _mm512_mul_pd(x, third);
56  v8df y = _mm512_roundscale_pd(h, _MM_FROUND_TRUNC);
57  v8df z = _mm512_mul_pd(y, three);
58  x = _mm512_sub_pd(x, z);
59  x = _mm512_mul_pd(x, ln_ten);
60  x = _mm512_fmadd_pd(z, exp10_c1, x);
61  v8si m1 = v1expd_core(x);
62  y = _mm512_mul_pd(y, ten);
63  v8si m2 = _mm512_cvtpd_epi32(y);
64  v8si m = _mm256_add_epi32(m1, m2);
65  m = _mm256_max_epi32(m, moff);
66  v8df p2n = v1pow2d_core(m);
67  return _mm512_mul_pd(x, p2n);
68 }
69 
70 inline v8df v1expm1d(v8df x)
71 {
72  __mmask8 invalid = _mm512_cmp_pd_mask(x, expm1_max_arg, _CMP_NLE_UQ);
73  if( ! _mm512_kortestz(invalid, invalid) )
74  throw domain_error( DEMsg("v1expm1d", x, invalid) );
75  x = _mm512_max_pd(expm1_min_arg, x);
76  return v1expm1d_core(x);
77 }
78 
79 inline v16sf v1expf(v16sf x)
80 {
81  __mmask16 invalid = _mm512_cmp_ps_mask(x, expf_max_arg, _CMP_NLE_UQ);
82  if( ! _mm512_kortestz(invalid, invalid) )
83  throw domain_error( DEMsg("v1expf", x, invalid) );
84  x = _mm512_max_ps(expf_min_arg, x);
85  v16si n = v1expf_core(x);
86  v16sf p2n = v1pow2f_core(n);
87  return _mm512_mul_ps(x, p2n);
88 }
89 
90 inline v16sf v1exp10f(v16sf x)
91 {
92  __mmask16 invalid = _mm512_cmp_ps_mask(x, exp10f_max_arg, _CMP_NLE_UQ);
93  if( ! _mm512_kortestz(invalid, invalid) )
94  throw domain_error( DEMsg("v1exp10f", x, invalid) );
95  x = _mm512_max_ps(exp10f_min_arg, x);
96  v16sf h = _mm512_mul_ps(x, thirdf);
97  v16sf y = _mm512_roundscale_ps(h, _MM_FROUND_TRUNC);
98  v16sf z = _mm512_mul_ps(y, threef);
99  x = _mm512_sub_ps(x, z);
100  x = _mm512_mul_ps(x, ln_tenf);
101  x = _mm512_fmadd_ps(z, exp10f_c1, x);
102  v16si m1 = v1expf_core(x);
103  y = _mm512_mul_ps(y, tenf);
104  v16si m2 = _mm512_cvtps_epi32(y);
105  v16si m = _mm512_add_epi32(m1, m2);
106  m = _mm512_max_epi32(m, mofff);
107  v16sf p2n = v1pow2f_core(m);
108  return _mm512_mul_ps(x, p2n);
109 }
110 
111 inline v16sf v1expm1f(v16sf x)
112 {
113  __mmask16 invalid = _mm512_cmp_ps_mask(x, expm1f_max_arg, _CMP_NLE_UQ);
114  if( ! _mm512_kortestz(invalid, invalid) )
115  throw domain_error( DEMsg("v1expm1f", x, invalid) );
116  x = _mm512_max_ps(expm1f_min_arg, x);
117  return v1expm1f_core(x);
118 }
119 
120 #else
121 
122 inline v4df v1expd(v4df x)
123 {
124  v4df invalid = _mm256_cmp_pd(x, exp_max_arg, _CMP_NLE_UQ);
125  if( ! _mm256_testz_pd(invalid, invalid) )
126  throw domain_error( DEMsg("v1expd", x, invalid) );
127  x = _mm256_max_pd(exp_min_arg, x);
128  v4si m = v1expd_core(x);
129  v4df p2n = v1pow2d_core(m);
130  return _mm256_mul_pd(x, p2n);
131 }
132 
133 inline v4df v1exp10d(v4df x)
134 {
135  v4df invalid = _mm256_cmp_pd(x, exp10_max_arg, _CMP_NLE_UQ);
136  if( ! _mm256_testz_pd(invalid, invalid) )
137  throw domain_error( DEMsg("v1exp10d", x, invalid) );
138  x = _mm256_max_pd(exp10_min_arg, x);
139  v4df h = _mm256_mul_pd(x, third);
140  v4df y = _mm256_round_pd(h, _MM_FROUND_TRUNC);
141  v4df z = _mm256_mul_pd(y, three);
142  x = _mm256_sub_pd(x, z);
143  x = _mm256_mul_pd(x, ln_ten);
144 #ifdef __FMA__
145  x = _mm256_fmadd_pd(z, exp10_c1, x);
146 #else
147  h = _mm256_mul_pd(z, exp10_c1);
148  x = _mm256_add_pd(h, x);
149 #endif
150  v4si m1 = v1expd_core(x);
151  y = _mm256_mul_pd(y, ten);
152  v4si m2 = _mm256_cvtpd_epi32(y);
153  v4si m = _mm_add_epi32(m1, m2);
154  m = _mm_max_epi32(m, moff);
155  v4df p2n = v1pow2d_core(m);
156  return _mm256_mul_pd(x, p2n);
157 }
158 
159 inline v4df v1expm1d(v4df x)
160 {
161  v4df invalid = _mm256_cmp_pd(x, expm1_max_arg, _CMP_NLE_UQ);
162  if( ! _mm256_testz_pd(invalid, invalid) )
163  throw domain_error( DEMsg("v1expm1d", x, invalid) );
164  x = _mm256_max_pd(expm1_min_arg, x);
165  return v1expm1d_core(x);
166 }
167 
168 inline v8sf v1expf(v8sf x)
169 {
170  v8sf invalid = _mm256_cmp_ps(x, expf_max_arg, _CMP_NLE_UQ);
171  if( ! _mm256_testz_ps(invalid, invalid) )
172  throw domain_error( DEMsg("v1expf", x, invalid) );
173  x = _mm256_max_ps(expf_min_arg, x);
174  v8si n = v1expf_core(x);
175  v8sf p2n = v1pow2f_core(n);
176  return _mm256_mul_ps(x, p2n);
177 }
178 
179 inline v8sf v1exp10f(v8sf x)
180 {
181  v8sf invalid = _mm256_cmp_ps(x, exp10f_max_arg, _CMP_NLE_UQ);
182  if( ! _mm256_testz_ps(invalid, invalid) )
183  throw domain_error( DEMsg("v1exp10f", x, invalid) );
184  x = _mm256_max_ps(exp10f_min_arg, x);
185  v8sf h = _mm256_mul_ps(x, thirdf);
186  v8sf y = _mm256_round_ps(h, _MM_FROUND_TRUNC);
187  v8sf z = _mm256_mul_ps(y, threef);
188  x = _mm256_sub_ps(x, z);
189  x = _mm256_mul_ps(x, ln_tenf);
190 #ifdef __FMA__
191  x = _mm256_fmadd_ps(z, exp10f_c1, x);
192 #else
193  h = _mm256_mul_ps(z, exp10f_c1);
194  x = _mm256_add_ps(h, x);
195 #endif
196  v8si m1 = v1expf_core(x);
197  y = _mm256_mul_ps(y, tenf);
198  v8si m2 = _mm256_cvtps_epi32(y);
199 #ifdef __AVX2__
200  v8si m = _mm256_add_epi32(m1, m2);
201  m = _mm256_max_epi32(m, mofff);
202  v8sf p2n = v1pow2f_core(m);
203 #else
204  v4si m1l = _mm256_extractf128_si256(m1, 0);
205  v4si m2l = _mm256_extractf128_si256(m2, 0);
206  v4si m1u = _mm256_extractf128_si256(m1, 1);
207  v4si m2u = _mm256_extractf128_si256(m2, 1);
208  m1l = _mm_add_epi32(m1l, m2l);
209  m1u = _mm_add_epi32(m1u, m2u);
210  m1l = _mm_max_epi32(m1l, mofff);
211  m1u = _mm_max_epi32(m1u, mofff);
212  m1l = _mm_add_epi32(m1l, offf);
213  m1u = _mm_add_epi32(m1u, offf);
214  m1l = _mm_slli_epi32(m1l, 23);
215  m1u = _mm_slli_epi32(m1u, 23);
216  v8si m = _mm256_setzero_si256();
217  m = _mm256_insertf128_si256(m, m1l, 0);
218  m = _mm256_insertf128_si256(m, m1u, 1);
219  v8sf p2n = _mm256_castsi256_ps(m);
220 #endif
221  return _mm256_mul_ps(x, p2n);
222 }
223 
224 inline v8sf v1expm1f(v8sf x)
225 {
226  v8sf invalid = _mm256_cmp_ps(x, expm1f_max_arg, _CMP_NLE_UQ);
227  if( ! _mm256_testz_ps(invalid, invalid) )
228  throw domain_error( DEMsg("v1expm1f", x, invalid) );
229  x = _mm256_max_ps(expm1f_min_arg, x);
230  return v1expm1f_core(x);
231 }
232 
233 #endif // __AVX512F__
234 
235 #else
236 
237 // stub routines, should never be called
238 inline int v1expd(int) { return 0; }
239 inline int v1exp10d(int) { return 0; }
240 inline int v1expm1d(int) { return 0; }
241 inline int v1expf(int) { return 0; }
242 inline int v1exp10f(int) { return 0; }
243 inline int v1expm1f(int) { return 0; }
244 
245 #endif // __AVX__
246 
247 // wrapper routines to give math functions C++ linkage
248 // this prevents warnings from the Oracle Studio compiler
249 inline double wr_expd(double x)
250 {
251  return exp(x);
252 }
253 
254 inline double wr_expm1d(double x)
255 {
256  return expm1(x);
257 }
258 
260 {
261  return expf(x);
262 }
263 
265 {
266  return expm1f(x);
267 }
268 
269 // wrapper routines needed to resolve overloaded functions
270 // these prevent problems with the Oracle Studio compiler
271 // it cannot handle overloaded functions as a template argument
272 inline double wr_exp10d(double x)
273 {
274  return exp10(x);
275 }
276 
278 {
279  return exp10(x);
280 }
281 
282 
283 void vexp(const double x[], double y[], long nlo, long nhi)
284 {
285  DEBUG_ENTRY( "vexp()" );
286 
287  vecfun( x, y, nlo, nhi, wr_expd, v1expd );
288 }
289 
290 void vexp10(const double x[], double y[], long nlo, long nhi)
291 {
292  DEBUG_ENTRY( "vexp10()" );
293 
294  vecfun( x, y, nlo, nhi, wr_exp10d, v1exp10d );
295 }
296 
297 void vexpm1(const double x[], double y[], long nlo, long nhi)
298 {
299  DEBUG_ENTRY( "vexpm1()" );
300 
301  vecfun( x, y, nlo, nhi, wr_expm1d, v1expm1d );
302 }
303 
304 void vexp(const sys_float x[], sys_float y[], long nlo, long nhi)
305 {
306  DEBUG_ENTRY( "vexp()" );
307 
308  vecfun( x, y, nlo, nhi, wr_expf, v1expf );
309 }
310 
311 void vexp10(const sys_float x[], sys_float y[], long nlo, long nhi)
312 {
313  DEBUG_ENTRY( "vexp10()" );
314 
315  vecfun( x, y, nlo, nhi, wr_exp10f, v1exp10f );
316 }
317 
318 void vexpm1(const sys_float x[], sys_float y[], long nlo, long nhi)
319 {
320  DEBUG_ENTRY( "vexpm1()" );
321 
322  vecfun( x, y, nlo, nhi, wr_expm1f, v1expm1f );
323 }
324 
325 void vexp(double *y, double x0, double x1, double x2, double x3)
326 {
327  V1FUN_PD_4(exp, 0.);
328 }
329 
330 void vexp10(double *y, double x0, double x1, double x2, double x3)
331 {
332  V1FUN_PD_4(exp10, 0.);
333 }
334 
335 void vexpm1(double *y, double x0, double x1, double x2, double x3)
336 {
337  V1FUN_PD_4(expm1, 0.);
338 }
339 
340 void vexp(double *y, double x0, double x1, double x2, double x3, double x4, double x5, double x6, double x7)
341 {
342  V1FUN_PD_8(exp, 0.);
343 }
344 
345 void vexp10(double *y, double x0, double x1, double x2, double x3, double x4, double x5, double x6, double x7)
346 {
347  V1FUN_PD_8(exp10, 0.);
348 }
349 
350 void vexpm1(double *y, double x0, double x1, double x2, double x3, double x4, double x5, double x6, double x7)
351 {
352  V1FUN_PD_8(expm1, 0.);
353 }
354 
356 {
357  V1FUN_PS_4(exp, 0.f);
358 }
359 
361 {
362  V1FUN_PS_4(exp10, 0.f);
363 }
364 
366 {
367  V1FUN_PS_4(expm1, 0.f);
368 }
369 
371  sys_float x6, sys_float x7)
372 {
373  V1FUN_PS_8(exp, 0.f);
374 }
375 
377  sys_float x6, sys_float x7)
378 {
379  V1FUN_PS_8(exp10, 0.f);
380 }
381 
383  sys_float x6, sys_float x7)
384 {
385  V1FUN_PS_8(expm1, 0.f);
386 }
387 
389  sys_float x6, sys_float x7, sys_float x8, sys_float x9, sys_float x10, sys_float x11, sys_float x12,
390  sys_float x13, sys_float x14, sys_float x15)
391 {
392  V1FUN_PS_16(exp, 0.f);
393 }
394 
396  sys_float x6, sys_float x7, sys_float x8, sys_float x9, sys_float x10, sys_float x11, sys_float x12,
397  sys_float x13, sys_float x14, sys_float x15)
398 {
399  V1FUN_PS_16(exp10, 0.f);
400 }
401 
403  sys_float x6, sys_float x7, sys_float x8, sys_float x9, sys_float x10, sys_float x11, sys_float x12,
404  sys_float x13, sys_float x14, sys_float x15)
405 {
406  V1FUN_PS_16(expm1, 0.f);
407 }
#define V1FUN_PD_8(FUN, V)
int v1expm1d(int)
static double x2[63]
double exp10(double x)
Definition: cddefines.h:1368
int v1expd(int)
static double x1[83]
int v1exp10f(int)
sys_float wr_exp10f(sys_float x)
void vexp(const double x[], double y[], long nlo, long nhi)
void vecfun(const T x[], T y[], long nlo, long nhi, T(*scalfun1)(T), V(*)(V))
int v1expf(int)
int v1exp10d(int)
static double x0[83]
float sys_float
Definition: cddefines.h:127
double wr_exp10d(double x)
double wr_expd(double x)
#define V1FUN_PS_4(FUN, V)
void vexp10(const double x[], double y[], long nlo, long nhi)
sys_float wr_expm1f(sys_float x)
#define V1FUN_PD_4(FUN, V)
#define DEBUG_ENTRY(funcname)
Definition: cddefines.h:723
#define V1FUN_PS_8(FUN, V)
sys_float wr_expf(sys_float x)
void vexpm1(const double x[], double y[], long nlo, long nhi)
#define V1FUN_PS_16(FUN, V)
int v1expm1f(int)
double wr_expm1d(double x)