vectorize_exp_core.h
/* This file is part of Cloudy and is copyright (C)1978-2022 by Gary J. Ferland and
 * others. For conditions of distribution and use see copyright notice in license.txt */

#ifndef VECTORIZE_EXP_CORE_H
#define VECTORIZE_EXP_CORE_H

#include "vectorize_math.h"

//
// Written by Peter A.M. van Hoof, Royal Observatory of Belgium, Brussels
//
// This file contains vectorized versions of the single and double variants of the exp()
// function. They are vectorized using AVX instructions, but also make use of AVX2, FMA,
// and AVX512 instructions when available. The basic algorithms for calculating the exp()
// functions were slightly modified from versions taken from the Cephes library written by
// Stephen L. Moshier, available at: http://www.netlib.org/cephes/
//
// The algorithms for calculating the expm1() functions are simplified versions of the
// openlibm routines available at http://openlibm.org/, which are subject to the following
// copyright:
//
// ====================================================
// Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
//
// Developed at SunSoft, a Sun Microsystems, Inc. business.
// Permission to use, copy, modify, and distribute this
// software is freely granted, provided that this notice
// is preserved.
// ====================================================
//
// This work was also inspired by similar routines written by Giovanni Garberoglio which
// are available at: http://software-lisc.fbk.eu/avx_mathfun/
//

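// The cores below share one basic scheme: split x = m*ln(2) + r with a small reduced
// argument r, approximate exp(r) (or the expm1() correction) with a short rational or
// polynomial expression, and scale by 2^m constructed directly in the exponent field.
// The scalar sketch below is for orientation only; the name exp_scalar_sketch and the
// use of std::ldexp() for the final scaling are illustrative choices, not part of Cloudy.

#include <cmath>

inline double exp_scalar_sketch(double x)
{
	// range reduction: m = floor(x/ln(2)), r = x - m*ln(2), with ln(2) split into
	// two parts (cf. exp_c1 and exp_c2 below) to keep r accurate
	double m = std::floor(x*1.4426950408889634); // log2(e)
	double r = (x - m*6.9314575195312500000e-01) - m*1.4286068203094172555e-06;
	// rational approximation exp(r) ~ 1 + 2*r*P(r^2)/(Q(r^2) - r*P(r^2)),
	// with P and Q built from exp_p0..exp_p2 and exp_q0..exp_q3 below
	double rr = r*r;
	double P = (1.2617719307481058306e-04*rr + 3.0299440770744194562e-02)*rr + 1.;
	double Q = ((3.0019850513866445972e-06*rr + 2.5244834034968410830e-03)*rr +
		    2.2726554820815503266e-01)*rr + 2.;
	double er = 1. + 2.*r*P/(Q - r*P);
	// final scaling by 2^m; the vector code instead constructs 2^m directly in
	// v1pow2d_core() by writing the biased exponent field of the result
	return std::ldexp(er, int(m));
}
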
#ifdef __AVX__

VECDI_CONST(exp_max_arg,0x40862e42fefa39ee); // +709.78271289338385941
VECDI_CONST(exp_min_arg,0xc086232bdd7abcd3); // -708.39641853226419244

VECDI_CONST(exp10_max_arg,0x40734413509f79fe); // +308.25471555991668993
VECDI_CONST(exp10_min_arg,0xc0733a7146f72a42); // -307.65265556858878426

VECDI_CONST(expm1_max_arg,0x40862e42fefa39ef); // +709.78271289338397310
VECDI_CONST(expm1_min_arg,0xc042b708872320e2); // -37.429947750237047899

VECI_CONST(off,0x3ff);
VECI_CONST(moff,0xfffffc01);

VECDI_CONST(exp_c1,0xbfe62e4000000000); // -6.9314575195312500000e-01
VECDI_CONST(exp_c2,0xbeb7f7d1cf79abca); // -1.4286068203094172555e-06

VECDI_CONST(exp_p0,0x3f2089cdd5e44be8); // 1.2617719307481058306e-04
VECDI_CONST(exp_p1,0x3f9f06d10cca2c7e); // 3.0299440770744194562e-02
VECDI_CONST(exp_p2,0x3ff0000000000000); // 1.0000000000000000000e+00
VECDI_CONST(exp_q0,0x3ec92eb6bc365fa0); // 3.0019850513866445972e-06
VECDI_CONST(exp_q1,0x3f64ae39b508b6c0); // 2.5244834034968410830e-03
VECDI_CONST(exp_q2,0x3fcd17099887e074); // 2.2726554820815503266e-01
VECDI_CONST(exp_q3,0x4000000000000000); // 2.0000000000000000000e+00

VECDI_CONST(exp10_c1,0xbf8030c37085dc7f); // -7.90550887243868070612e-3

VECDI_CONST(expm1_q1,0xbfa11111111110f4); // -3.33333333333331316428e-02
VECDI_CONST(expm1_q2,0x3f5a01a019fe5585); // 1.58730158725481460165e-03
VECDI_CONST(expm1_q3,0xbf14ce199eaadbb7); // -7.93650757867487942473e-05
VECDI_CONST(expm1_q4,0x3ed0cfca86e65239); // 4.00821782732936239552e-06
VECDI_CONST(expm1_q5,0xbe8afdb76e09c32d); // -2.01099218183624371326e-07

#ifdef __AVX512F__

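// compute 2^m for each lane by adding the exponent bias (off = 0x3ff) and shifting
// the result into the exponent field of an IEEE double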
inline v8df v1pow2d_core(v8si m)
{
	m = _mm256_add_epi32(m, off);
	v8di n = _mm512_cvtepi32_epi64(m);
	n = _mm512_slli_epi64(n, 52);
	return _mm512_castsi512_pd(n);
}

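// exp() core: reduce x = m*ln(2) + r in two steps (exp_c1, exp_c2), replace x by a
// rational approximation of exp(r), and return m so that the caller can apply the
// final 2^m scaling (e.g. with v1pow2d_core)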
inline v8si v1expd_core(v8df& x)
{
	v8df px = _mm512_mul_pd(x, log2e);
	px = _mm512_floor_pd(px);
	v8si m = _mm512_cvtpd_epi32(px);
	x = _mm512_fmadd_pd(px, exp_c1, x);
	x = _mm512_fmadd_pd(px, exp_c2, x);
	v8df xx = _mm512_mul_pd(x, x);
	px = _mm512_fmadd_pd(exp_p0, xx, exp_p1);
	px = _mm512_fmadd_pd(px, xx, exp_p2);
	v8df qx = _mm512_fmadd_pd(exp_q0, xx, exp_q1);
	qx = _mm512_fmadd_pd(qx, xx, exp_q2);
	qx = _mm512_fmadd_pd(qx, xx, exp_q3);
	px = _mm512_mul_pd(x, px);
	xx = _mm512_sub_pd(qx, px);
	x = _mm512_div_pd(px, xx);
	x = _mm512_fmadd_pd(two, x, one);
	return m;
}

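// expm1() core: reduce x = k*ln(2) + r with k rounded to nearest, evaluate the
// openlibm-style correction for expm1(r), and reassemble 2^k*(1 - 2^-k - e);
// k is clamped at 1023 and the remaining factor (py) applied separately so that
// arguments close to expm1_max_arg do not overflow the 2^k construction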
inline v8df v1expm1d_core(v8df x)
{
	v8df px = _mm512_mul_pd(x, log2e);
	px = _mm512_roundscale_pd(px, _MM_FROUND_NINT);
	v8df py = _mm512_min_pd(px, c1023);
	v8si k = _mm512_cvtpd_epi32(py);
	py = _mm512_sub_pd(px, py);
	py = _mm512_add_pd(py, one);
	x = _mm512_fmadd_pd(px, exp_c1, x);
	x = _mm512_fmadd_pd(px, exp_c2, x);
	v8df hfx = _mm512_mul_pd(x, half);
	v8df hxs = _mm512_mul_pd(x, hfx);
	// sign of t is reversed compared to openlibm implementation
	v8df r1 = _mm512_fmadd_pd(expm1_q5, hxs, expm1_q4);
	r1 = _mm512_fmadd_pd(r1, hxs, expm1_q3);
	r1 = _mm512_fmadd_pd(r1, hxs, expm1_q2);
	r1 = _mm512_fmadd_pd(r1, hxs, expm1_q1);
	r1 = _mm512_fmadd_pd(r1, hxs, one);
	v8df t = _mm512_fmadd_pd(r1, hfx, mthree);
	v8df e = _mm512_fmadd_pd(x, t, six);
	v8df h = _mm512_add_pd(r1, t);
	h = _mm512_div_pd(h, e);
	e = _mm512_mul_pd(h, hxs);
	v8df p2n = v1pow2d_core(k);
	v8df p2mn = _mm512_div_pd(one, p2n);
	e = _mm512_mul_pd(e, x);
	e = _mm512_sub_pd(e, hxs);
	e = _mm512_sub_pd(e, x);
	t = _mm512_sub_pd(one, p2mn);
	v8df y = _mm512_sub_pd(t, e);
	y = _mm512_mul_pd(y, py);
	return _mm512_mul_pd(y, p2n);
}

#else

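// compute 2^m by adding the exponent bias (off = 0x3ff) and moving the result into
// the exponent field of an IEEE double; without AVX2 the 64-bit shift is emulated
// with a 32-bit shift and unpack operations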
inline v4df v1pow2d_core(v4si m)
{
	m = _mm_add_epi32(m, off);
#ifdef __AVX2__
	v4di n = _mm256_cvtepi32_epi64(m);
	n = _mm256_slli_epi64(n, 52);
#else
	m = _mm_slli_epi32(m, 20);
	v4si z = _mm_set1_epi32(0);
	v4si nl = _mm_unpacklo_epi32(z, m);
	v8si n = _mm256_set1_epi32(0);
	n = _mm256_insertf128_si256(n, nl, 0);
	v4si nu = _mm_unpackhi_epi32(z, m);
	n = _mm256_insertf128_si256(n, nu, 1);
#endif
	return _mm256_castsi256_pd(n);
}

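// exp() core, AVX/AVX2 version of the same algorithm as above: return m and leave
// the rational approximation of exp(x - m*ln(2)) in x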
inline v4si v1expd_core(v4df& x)
{
	v4df px = _mm256_mul_pd(x, log2e);
	px = _mm256_floor_pd(px);
	v4si m = _mm256_cvtpd_epi32(px);
#ifdef __FMA__
	x = _mm256_fmadd_pd(px, exp_c1, x);
	x = _mm256_fmadd_pd(px, exp_c2, x);
#else
	v4df y = _mm256_mul_pd(px, exp_c1);
	x = _mm256_add_pd(x, y);
	y = _mm256_mul_pd(px, exp_c2);
	x = _mm256_add_pd(x, y);
#endif
	v4df xx = _mm256_mul_pd(x, x);
#ifdef __FMA__
	px = _mm256_fmadd_pd(exp_p0, xx, exp_p1);
	px = _mm256_fmadd_pd(px, xx, exp_p2);
	v4df qx = _mm256_fmadd_pd(exp_q0, xx, exp_q1);
	qx = _mm256_fmadd_pd(qx, xx, exp_q2);
	qx = _mm256_fmadd_pd(qx, xx, exp_q3);
#else
	px = _mm256_mul_pd(exp_p0, xx);
	px = _mm256_add_pd(px, exp_p1);
	px = _mm256_mul_pd(px, xx);
	px = _mm256_add_pd(px, exp_p2);
	v4df qx = _mm256_mul_pd(exp_q0, xx);
	qx = _mm256_add_pd(qx, exp_q1);
	qx = _mm256_mul_pd(qx, xx);
	qx = _mm256_add_pd(qx, exp_q2);
	qx = _mm256_mul_pd(qx, xx);
	qx = _mm256_add_pd(qx, exp_q3);
#endif
	px = _mm256_mul_pd(x, px);
	xx = _mm256_sub_pd(qx, px);
	x = _mm256_div_pd(px, xx);
#ifdef __FMA__
	x = _mm256_fmadd_pd(two, x, one);
#else
	x = _mm256_mul_pd(x, two);
	x = _mm256_add_pd(x, one);
#endif
	return m;
}

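// expm1() core, AVX/AVX2 version of the same algorithm as above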
inline v4df v1expm1d_core(v4df x)
{
	v4df px = _mm256_mul_pd(x, log2e);
	px = _mm256_round_pd(px, _MM_FROUND_NINT);
	v4df py = _mm256_min_pd(px, c1023);
	v4si k = _mm256_cvtpd_epi32(py);
	py = _mm256_sub_pd(px, py);
	py = _mm256_add_pd(py, one);
#ifdef __FMA__
	x = _mm256_fmadd_pd(px, exp_c1, x);
	x = _mm256_fmadd_pd(px, exp_c2, x);
#else
	v4df w = _mm256_mul_pd(px, exp_c1);
	x = _mm256_add_pd(x, w);
	w = _mm256_mul_pd(px, exp_c2);
	x = _mm256_add_pd(x, w);
#endif
	v4df hfx = _mm256_mul_pd(x, half);
	v4df hxs = _mm256_mul_pd(x, hfx);
	// sign of t is reversed compared to openlibm implementation
#ifdef __FMA__
	v4df r1 = _mm256_fmadd_pd(expm1_q5, hxs, expm1_q4);
	r1 = _mm256_fmadd_pd(r1, hxs, expm1_q3);
	r1 = _mm256_fmadd_pd(r1, hxs, expm1_q2);
	r1 = _mm256_fmadd_pd(r1, hxs, expm1_q1);
	r1 = _mm256_fmadd_pd(r1, hxs, one);
	v4df t = _mm256_fmadd_pd(r1, hfx, mthree);
	v4df e = _mm256_fmadd_pd(x, t, six);
#else
	v4df r1 = _mm256_mul_pd(expm1_q5, hxs);
	r1 = _mm256_add_pd(r1, expm1_q4);
	r1 = _mm256_mul_pd(r1, hxs);
	r1 = _mm256_add_pd(r1, expm1_q3);
	r1 = _mm256_mul_pd(r1, hxs);
	r1 = _mm256_add_pd(r1, expm1_q2);
	r1 = _mm256_mul_pd(r1, hxs);
	r1 = _mm256_add_pd(r1, expm1_q1);
	r1 = _mm256_mul_pd(r1, hxs);
	r1 = _mm256_add_pd(r1, one);
	v4df t = _mm256_mul_pd(r1, hfx);
	t = _mm256_add_pd(t, mthree);
	v4df e = _mm256_mul_pd(x, t);
	e = _mm256_add_pd(e, six);
#endif
	v4df h = _mm256_add_pd(r1, t);
	h = _mm256_div_pd(h, e);
	e = _mm256_mul_pd(h, hxs);
	v4df p2n = v1pow2d_core(k);
	v4df p2mn = _mm256_div_pd(one, p2n);
	e = _mm256_mul_pd(e, x);
	e = _mm256_sub_pd(e, hxs);
	e = _mm256_sub_pd(e, x);
	t = _mm256_sub_pd(one, p2mn);
	v4df y = _mm256_sub_pd(t, e);
	y = _mm256_mul_pd(y, py);
	return _mm256_mul_pd(y, p2n);
}

#endif // __AVX512F__

VECFI_CONST(expf_max_arg,0x42b17217); // +88.72283173
VECFI_CONST(expf_min_arg,0xc2aeac51); // -87.33655548

VECFI_CONST(exp10f_max_arg,0x421a209a); // +38.53183746
VECFI_CONST(exp10f_min_arg,0xc217b819); // -37.92978287

VECFI_CONST(expm1f_max_arg,0x42b17217); // +88.72283173
VECFI_CONST(expm1f_min_arg,0xc18aa123); // -17.32868004

#if defined(__AVX2__) || defined(__AVX512F__)
VECII_CONST(offf,0x7f);
VECII_CONST(mofff,0xffffff81);
#else
VECI_CONST(offf,0x7f);
VECI_CONST(mofff,0xffffff81);
#endif

VECFI_CONST(expf_c1,0xbf318000); // -0.693359375f
VECFI_CONST(expf_c2,0x395e8083); // 2.12194440e-4f

VECFI_CONST(expf_p0,0x39506967); // 1.9875691500e-4f
VECFI_CONST(expf_p1,0x3ab743ce); // 1.3981999507e-3f
VECFI_CONST(expf_p2,0x3c088908); // 8.3334519073e-3f
VECFI_CONST(expf_p3,0x3d2aa9c1); // 4.1665795894e-2f
VECFI_CONST(expf_p4,0x3e2aaaaa); // 1.6666665459e-1f
VECFI_CONST(expf_p5,0x3f000000); // 5.0000000000e-1f

VECFI_CONST(exp10f_c1,0xbc01861c); // -7.9055088724e-3

VECFI_CONST(expm1f_q1,0xbd088868); // -3.3333212137e-2
VECFI_CONST(expm1f_q2,0x3acf3010); // 1.5807170421e-3

#ifdef __AVX512F__

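// compute 2^n for each lane by adding the single-precision exponent bias (offf = 0x7f)
// and shifting the result into the exponent field of an IEEE float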
inline v16sf v1pow2f_core(v16si n)
{
	n = _mm512_add_epi32(n, offf);
	n = _mm512_slli_epi32(n, 23);
	return _mm512_castsi512_ps(n);
}

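// expf() core: reduce x = n*ln(2) + r, replace x by a polynomial approximation of
// exp(r), and return n so that the caller can apply the final 2^n scaling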
inline v16si v1expf_core(v16sf& x)
{
	v16sf px = _mm512_mul_ps(x, log2ef);
	px = _mm512_floor_ps(px);
	v16si n = _mm512_cvtps_epi32(px);
	x = _mm512_fmadd_ps(px, expf_c1, x);
	x = _mm512_fmadd_ps(px, expf_c2, x);
	v16sf xx = _mm512_mul_ps(x, x);
	px = _mm512_fmadd_ps(expf_p0, x, expf_p1);
	px = _mm512_fmadd_ps(px, x, expf_p2);
	px = _mm512_fmadd_ps(px, x, expf_p3);
	px = _mm512_fmadd_ps(px, x, expf_p4);
	px = _mm512_fmadd_ps(px, x, expf_p5);
	px = _mm512_mul_ps(px, xx);
	px = _mm512_add_ps(px, x);
	x = _mm512_add_ps(px, onef);
	return n;
}

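// expm1f() core: same scheme as v1expm1d_core, but with a shorter polynomial;
// the exponent is clamped at 127 before the reduction, so no separate rescaling
// factor is needed near expm1f_max_arg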
inline v16sf v1expm1f_core(v16sf x)
{
	v16sf px = _mm512_mul_ps(x, log2ef);
	px = _mm512_roundscale_ps(px, _MM_FROUND_NINT);
	px = _mm512_min_ps(px, c127);
	v16si n = _mm512_cvtps_epi32(px);
	x = _mm512_fmadd_ps(px, expf_c1, x);
	x = _mm512_fmadd_ps(px, expf_c2, x);
	v16sf hfx = _mm512_mul_ps(x, halff);
	v16sf hxs = _mm512_mul_ps(x, hfx);
	// sign of t is reversed compared to openlibm implementation
	v16sf r1 = _mm512_fmadd_ps(expm1f_q2, hxs, expm1f_q1);
	r1 = _mm512_fmadd_ps(r1, hxs, onef);
	v16sf t = _mm512_fmadd_ps(r1, hfx, mthreef);
	v16sf e = _mm512_fmadd_ps(x, t, sixf);
	v16sf h = _mm512_add_ps(r1, t);
	h = _mm512_div_ps(h, e);
	e = _mm512_mul_ps(h, hxs);
	v16sf p2n = v1pow2f_core(n);
	v16sf p2mn = _mm512_div_ps(onef, p2n);
	e = _mm512_mul_ps(e, x);
	e = _mm512_sub_ps(e, hxs);
	e = _mm512_sub_ps(e, x);
	t = _mm512_sub_ps(onef, p2mn);
	v16sf y = _mm512_sub_ps(t, e);
	return _mm512_mul_ps(y, p2n);
}

#else

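// compute 2^n by adding the exponent bias (offf = 0x7f) and shifting the result into
// the exponent field of an IEEE float; without AVX2 the shift is done on the two
// 128-bit halves separately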
inline v8sf v1pow2f_core(v8si n)
{
#ifdef __AVX2__
	n = _mm256_add_epi32(n, offf);
	n = _mm256_slli_epi32(n, 23);
#else
	v4si nl = _mm256_extractf128_si256(n, 0);
	nl = _mm_add_epi32(nl, offf);
	v4si nu = _mm256_extractf128_si256(n, 1);
	nu = _mm_add_epi32(nu, offf);
	nl = _mm_slli_epi32(nl, 23);
	nu = _mm_slli_epi32(nu, 23);
	n = _mm256_insertf128_si256(n, nl, 0);
	n = _mm256_insertf128_si256(n, nu, 1);
#endif
	return _mm256_castsi256_ps(n);
}

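// expf() core, AVX/AVX2 version of the same algorithm as above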
inline v8si v1expf_core(v8sf& x)
{
	v8sf px = _mm256_mul_ps(x, log2ef);
	px = _mm256_floor_ps(px);
	v8si n = _mm256_cvtps_epi32(px);
#ifdef __FMA__
	x = _mm256_fmadd_ps(px, expf_c1, x);
	x = _mm256_fmadd_ps(px, expf_c2, x);
#else
	v8sf y = _mm256_mul_ps(px, expf_c1);
	x = _mm256_add_ps(x, y);
	y = _mm256_mul_ps(px, expf_c2);
	x = _mm256_add_ps(x, y);
#endif
	v8sf xx = _mm256_mul_ps(x, x);
#ifdef __FMA__
	px = _mm256_fmadd_ps(expf_p0, x, expf_p1);
	px = _mm256_fmadd_ps(px, x, expf_p2);
	px = _mm256_fmadd_ps(px, x, expf_p3);
	px = _mm256_fmadd_ps(px, x, expf_p4);
	px = _mm256_fmadd_ps(px, x, expf_p5);
#else
	px = _mm256_mul_ps(expf_p0, x);
	px = _mm256_add_ps(px, expf_p1);
	px = _mm256_mul_ps(px, x);
	px = _mm256_add_ps(px, expf_p2);
	px = _mm256_mul_ps(px, x);
	px = _mm256_add_ps(px, expf_p3);
	px = _mm256_mul_ps(px, x);
	px = _mm256_add_ps(px, expf_p4);
	px = _mm256_mul_ps(px, x);
	px = _mm256_add_ps(px, expf_p5);
#endif
	px = _mm256_mul_ps(px, xx);
	px = _mm256_add_ps(px, x);
	x = _mm256_add_ps(px, onef);
	return n;
}

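// expm1f() core, AVX/AVX2 version of the same algorithm as above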
inline v8sf v1expm1f_core(v8sf x)
{
	v8sf px = _mm256_mul_ps(x, log2ef);
	px = _mm256_round_ps(px, _MM_FROUND_NINT);
	px = _mm256_min_ps(px, c127);
	v8si n = _mm256_cvtps_epi32(px);
#ifdef __FMA__
	x = _mm256_fmadd_ps(px, expf_c1, x);
	x = _mm256_fmadd_ps(px, expf_c2, x);
#else
	v8sf w = _mm256_mul_ps(px, expf_c1);
	x = _mm256_add_ps(x, w);
	w = _mm256_mul_ps(px, expf_c2);
	x = _mm256_add_ps(x, w);
#endif
	v8sf hfx = _mm256_mul_ps(x, halff);
	v8sf hxs = _mm256_mul_ps(x, hfx);
	// sign of t is reversed compared to openlibm implementation
#ifdef __FMA__
	v8sf r1 = _mm256_fmadd_ps(expm1f_q2, hxs, expm1f_q1);
	r1 = _mm256_fmadd_ps(r1, hxs, onef);
	v8sf t = _mm256_fmadd_ps(r1, hfx, mthreef);
	v8sf e = _mm256_fmadd_ps(x, t, sixf);
#else
	v8sf r1 = _mm256_mul_ps(expm1f_q2, hxs);
	r1 = _mm256_add_ps(r1, expm1f_q1);
	r1 = _mm256_mul_ps(r1, hxs);
	r1 = _mm256_add_ps(r1, onef);
	v8sf t = _mm256_mul_ps(r1, hfx);
	t = _mm256_add_ps(t, mthreef);
	v8sf e = _mm256_mul_ps(x, t);
	e = _mm256_add_ps(e, sixf);
#endif
	v8sf h = _mm256_add_ps(r1, t);
	h = _mm256_div_ps(h, e);
	e = _mm256_mul_ps(h, hxs);
	v8sf p2n = v1pow2f_core(n);
	v8sf p2mn = _mm256_div_ps(onef, p2n);
	e = _mm256_mul_ps(e, x);
	e = _mm256_sub_ps(e, hxs);
	e = _mm256_sub_ps(e, x);
	t = _mm256_sub_ps(onef, p2mn);
	v8sf y = _mm256_sub_ps(t, e);
	return _mm256_mul_ps(y, p2n);
}

#endif // __AVX512F__

#endif // __AVX__

#endif