Cloudy
Spectral Synthesis Code for Astrophysics
vectorize_exp_core.h
/* This file is part of Cloudy and is copyright (C)1978-2025 by Gary J. Ferland and
 * others. For conditions of distribution and use see copyright notice in license.txt */

#ifndef VECTORIZE_EXP_CORE_H
#define VECTORIZE_EXP_CORE_H

#include "vectorize_math.h"

//
// Written by Peter A.M. van Hoof, Royal Observatory of Belgium, Brussels
//
// This file contains vectorized versions of the single and double variants of the exp()
// function. They are vectorized using AVX instructions, but also make use of AVX2, FMA,
// and AVX512 instructions when available. The basic algorithms for calculating the exp()
// functions were slightly modified from versions taken from the Cephes library written by
// Stephen L. Moshier, available at: http://www.netlib.org/cephes/
//
// The algorithms for calculating the expm1() functions are simplified versions of the
// openlibm routines available at http://openlibm.org/ which are subject to the following
// copyright:
//
// ====================================================
// Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
//
// Developed at SunSoft, a Sun Microsystems, Inc. business.
// Permission to use, copy, modify, and distribute this
// software is freely granted, provided that this notice
// is preserved.
// ====================================================
//
// This work was also inspired by similar routines written by Giovanni Garberoglio which
// are available at: http://software-lisc.fbk.eu/avx_mathfun/
//
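//
// All cores below follow the same scheme: split the argument as x = n*ln(2) + r
// with r small, evaluate e^r with a polynomial or rational approximation, and
// reconstruct exp(x) = 2^n * e^r by assembling the bit pattern of 2^n directly
// in the floating-point exponent field. As a rough scalar sketch of the double
// precision exp() core (illustration only -- exp_sketch() is not part of
// Cloudy; the coefficients are the ones tabulated below):
//
//   #include <cmath>
//   double exp_sketch(double x)
//   {
//       double px = std::floor(x*1.4426950408889634074); // n = floor(x*log2(e))
//       int n = int(px);
//       x -= px*6.93145751953125e-1;            // r = x - n*ln(2), with ln(2)
//       x -= px*1.4286068203094172555e-6;       // split in two parts for accuracy
//       double xx = x*x;
//       double p = x*((1.2617719307481058306e-4*xx +
//                      3.0299440770744194562e-2)*xx + 1.);  // r*P(r^2)
//       double q = ((3.0019850513866445972e-6*xx +
//                    2.5244834034968410830e-3)*xx +
//                    2.2726554820815503266e-1)*xx + 2.;     // Q(r^2)
//       double er = 1. + 2.*p/(q - p);          // e^r as a rational function
//       return std::ldexp(er, n);               // scale by 2^n
//   }
//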
#ifdef __AVX__

VECDI_CONST(exp_max_arg,0x40862e42fefa39ee); // +709.78271289338385941
VECDI_CONST(exp_min_arg,0xc086232bdd7abcd3); // -708.39641853226419244

VECDI_CONST(exp10_max_arg,0x40734413509f79fe); // +308.25471555991668993
VECDI_CONST(exp10_min_arg,0xc0733a7146f72a42); // -307.65265556858878426

VECDI_CONST(expm1_max_arg,0x40862e42fefa39ef); // +709.78271289338397310
VECDI_CONST(expm1_min_arg,0xc042b708872320e2); // -37.429947750237047899

VECI_CONST(off,0x3ff);       // +1023, the IEEE double precision exponent bias
VECI_CONST(moff,0xfffffc01); // -1023 as a 32-bit two's-complement integer

VECDI_CONST(exp_c1,0xbfe62e4000000000); // -6.9314575195312500000e-01
VECDI_CONST(exp_c2,0xbeb7f7d1cf79abca); // -1.4286068203094172555e-06

VECDI_CONST(exp_p0,0x3f2089cdd5e44be8); // 1.2617719307481058306e-04
VECDI_CONST(exp_p1,0x3f9f06d10cca2c7e); // 3.0299440770744194562e-02
VECDI_CONST(exp_p2,0x3ff0000000000000); // 1.0000000000000000000e+00
VECDI_CONST(exp_q0,0x3ec92eb6bc365fa0); // 3.0019850513866445972e-06
VECDI_CONST(exp_q1,0x3f64ae39b508b6c0); // 2.5244834034968410830e-03
VECDI_CONST(exp_q2,0x3fcd17099887e074); // 2.2726554820815503266e-01
VECDI_CONST(exp_q3,0x4000000000000000); // 2.0000000000000000000e+00

VECDI_CONST(exp10_c1,0xbf8030c37085dc7f); // -7.90550887243868070612e-3

VECDI_CONST(expm1_q1,0xbfa11111111110f4); // -3.33333333333331316428e-02
VECDI_CONST(expm1_q2,0x3f5a01a019fe5585); // 1.58730158725481460165e-03
VECDI_CONST(expm1_q3,0xbf14ce199eaadbb7); // -7.93650757867487942473e-05
VECDI_CONST(expm1_q4,0x3ed0cfca86e65239); // 4.00821782732936239552e-06
VECDI_CONST(expm1_q5,0xbe8afdb76e09c32d); // -2.01099218183624371326e-07
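
// The VEC*_CONST macros (presumably supplied by vectorize_math.h) take the
// exact IEEE-754 bit pattern of each constant; the decimal value quoted in
// the trailing comment can be recovered by reinterpreting the bits, e.g. with
// this standalone check (not part of this header):
//
//   #include <cstdint>
//   #include <cstdio>
//   #include <cstring>
//   int main()
//   {
//       uint64_t bits = 0x40862e42fefa39ee;   // exp_max_arg
//       double d;
//       memcpy(&d, &bits, sizeof(d));         // bit-exact reinterpretation
//       printf("%.17g\n", d);                 // prints 709.78271289338385
//   }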

#ifdef __AVX512F__

inline v8df v1pow2d_core(v8si m)
{
	m = _mm256_add_epi32(m, off);
	v8di n = _mm512_cvtepi32_epi64(m);
	n = _mm512_slli_epi64(n, 52);
	return _mm512_castsi512_pd(n);
}
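
// A scalar sketch of what v1pow2d_core() computes (illustration only, using
// the headers from the check above): for an integer m in the normal range,
// 2^m is obtained by placing the biased exponent m+1023 in bits 52-62 of an
// otherwise zero double:
//
//   double pow2d_sketch(int m)    // assumes -1022 <= m <= 1023
//   {
//       uint64_t bits = uint64_t(m + 1023) << 52;
//       double d;
//       memcpy(&d, &bits, sizeof(d));
//       return d;
//   }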

inline v8si v1expd_core(v8df& x)
{
	v8df px = _mm512_mul_pd(x, log2e);
	px = _mm512_floor_pd(px);
	v8si m = _mm512_cvtpd_epi32(px);
	x = _mm512_fmadd_pd(px, exp_c1, x);
	x = _mm512_fmadd_pd(px, exp_c2, x);
	v8df xx = _mm512_mul_pd(x, x);
	px = _mm512_fmadd_pd(exp_p0, xx, exp_p1);
	px = _mm512_fmadd_pd(px, xx, exp_p2);
	v8df qx = _mm512_fmadd_pd(exp_q0, xx, exp_q1);
	qx = _mm512_fmadd_pd(qx, xx, exp_q2);
	qx = _mm512_fmadd_pd(qx, xx, exp_q3);
	px = _mm512_mul_pd(x, px);
	xx = _mm512_sub_pd(qx, px);
	x = _mm512_div_pd(px, xx);
	x = _mm512_fmadd_pd(two, x, one);
	return m;
}
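
// On return, x holds e^r = 1 + 2*r*P(r^2)/(Q(r^2) - r*P(r^2)) for the reduced
// argument r = x - m*ln(2), and the return value holds m; the caller completes
// exp(x) = 2^m * e^r by scaling with v1pow2d_core(m).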

inline v8df v1expm1d_core(v8df x)
{
	v8df px = _mm512_mul_pd(x, log2e);
	px = _mm512_roundscale_pd(px, _MM_FROUND_NINT);
	v8df py = _mm512_min_pd(px, c1023);
	v8si k = _mm512_cvtpd_epi32(py);
	py = _mm512_sub_pd(px, py);
	py = _mm512_add_pd(py, one);
	x = _mm512_fmadd_pd(px, exp_c1, x);
	x = _mm512_fmadd_pd(px, exp_c2, x);
	v8df hfx = _mm512_mul_pd(x, half);
	v8df hxs = _mm512_mul_pd(x, hfx);
	// sign of t is reversed compared to openlibm implementation
	v8df r1 = _mm512_fmadd_pd(expm1_q5, hxs, expm1_q4);
	r1 = _mm512_fmadd_pd(r1, hxs, expm1_q3);
	r1 = _mm512_fmadd_pd(r1, hxs, expm1_q2);
	r1 = _mm512_fmadd_pd(r1, hxs, expm1_q1);
	r1 = _mm512_fmadd_pd(r1, hxs, one);
	v8df t = _mm512_fmadd_pd(r1, hfx, mthree);
	v8df e = _mm512_fmadd_pd(x, t, six);
	v8df h = _mm512_add_pd(r1, t);
	h = _mm512_div_pd(h, e);
	e = _mm512_mul_pd(h, hxs);
	v8df p2n = v1pow2d_core(k);
	v8df p2mn = _mm512_div_pd(one, p2n);
	e = _mm512_mul_pd(e, x);
	e = _mm512_sub_pd(e, hxs);
	e = _mm512_sub_pd(e, x);
	t = _mm512_sub_pd(one, p2mn);
	v8df y = _mm512_sub_pd(t, e);
	y = _mm512_mul_pd(y, py);
	return _mm512_mul_pd(y, p2n);
}
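
// Reconstruction used above (following openlibm): with r the reduced argument,
// e the accumulated correction term, and k the scaling power,
//   expm1(x) = 2^k * ((1 - 2^-k) - (e - r)) * py,
// where py is normally 1; when round(x*log2(e)) exceeds 1023, k is clamped at
// 1023 and py carries the remaining power of two so that 2^k itself stays
// representable.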

#else

inline v4df v1pow2d_core(v4si m)
{
	m = _mm_add_epi32(m, off);
#ifdef __AVX2__
	v4di n = _mm256_cvtepi32_epi64(m);
	n = _mm256_slli_epi64(n, 52);
#else
	// AVX1 lacks 256-bit integer shifts: shift the biased exponents into
	// place within 32-bit lanes, then interleave with zero low words to
	// form the four 64-bit bit patterns
	m = _mm_slli_epi32(m, 20);
	v4si z = _mm_set1_epi32(0);
	v4si nl = _mm_unpacklo_epi32(z, m);
	v8si n = _mm256_set1_epi32(0);
	n = _mm256_insertf128_si256(n, nl, 0);
	v4si nu = _mm_unpackhi_epi32(z, m);
	n = _mm256_insertf128_si256(n, nu, 1);
#endif
	return _mm256_castsi256_pd(n);
}

inline v4si v1expd_core(v4df& x)
{
	v4df px = _mm256_mul_pd(x, log2e);
	px = _mm256_floor_pd(px);
	v4si m = _mm256_cvtpd_epi32(px);
#ifdef __FMA__
	x = _mm256_fmadd_pd(px, exp_c1, x);
	x = _mm256_fmadd_pd(px, exp_c2, x);
#else
	v4df y = _mm256_mul_pd(px, exp_c1);
	x = _mm256_add_pd(x, y);
	y = _mm256_mul_pd(px, exp_c2);
	x = _mm256_add_pd(x, y);
#endif
	v4df xx = _mm256_mul_pd(x, x);
#ifdef __FMA__
	px = _mm256_fmadd_pd(exp_p0, xx, exp_p1);
	px = _mm256_fmadd_pd(px, xx, exp_p2);
	v4df qx = _mm256_fmadd_pd(exp_q0, xx, exp_q1);
	qx = _mm256_fmadd_pd(qx, xx, exp_q2);
	qx = _mm256_fmadd_pd(qx, xx, exp_q3);
#else
	px = _mm256_mul_pd(exp_p0, xx);
	px = _mm256_add_pd(px, exp_p1);
	px = _mm256_mul_pd(px, xx);
	px = _mm256_add_pd(px, exp_p2);
	v4df qx = _mm256_mul_pd(exp_q0, xx);
	qx = _mm256_add_pd(qx, exp_q1);
	qx = _mm256_mul_pd(qx, xx);
	qx = _mm256_add_pd(qx, exp_q2);
	qx = _mm256_mul_pd(qx, xx);
	qx = _mm256_add_pd(qx, exp_q3);
#endif
	px = _mm256_mul_pd(x, px);
	xx = _mm256_sub_pd(qx, px);
	x = _mm256_div_pd(px, xx);
#ifdef __FMA__
	x = _mm256_fmadd_pd(two, x, one);
#else
	x = _mm256_mul_pd(x, two);
	x = _mm256_add_pd(x, one);
#endif
	return m;
}

inline v4df v1expm1d_core(v4df x)
{
	v4df px = _mm256_mul_pd(x, log2e);
	px = _mm256_round_pd(px, _MM_FROUND_NINT);
	v4df py = _mm256_min_pd(px, c1023);
	v4si k = _mm256_cvtpd_epi32(py);
	py = _mm256_sub_pd(px, py);
	py = _mm256_add_pd(py, one);
#ifdef __FMA__
	x = _mm256_fmadd_pd(px, exp_c1, x);
	x = _mm256_fmadd_pd(px, exp_c2, x);
#else
	v4df w = _mm256_mul_pd(px, exp_c1);
	x = _mm256_add_pd(x, w);
	w = _mm256_mul_pd(px, exp_c2);
	x = _mm256_add_pd(x, w);
#endif
	v4df hfx = _mm256_mul_pd(x, half);
	v4df hxs = _mm256_mul_pd(x, hfx);
	// sign of t is reversed compared to openlibm implementation
#ifdef __FMA__
	v4df r1 = _mm256_fmadd_pd(expm1_q5, hxs, expm1_q4);
	r1 = _mm256_fmadd_pd(r1, hxs, expm1_q3);
	r1 = _mm256_fmadd_pd(r1, hxs, expm1_q2);
	r1 = _mm256_fmadd_pd(r1, hxs, expm1_q1);
	r1 = _mm256_fmadd_pd(r1, hxs, one);
	v4df t = _mm256_fmadd_pd(r1, hfx, mthree);
	v4df e = _mm256_fmadd_pd(x, t, six);
#else
	v4df r1 = _mm256_mul_pd(expm1_q5, hxs);
	r1 = _mm256_add_pd(r1, expm1_q4);
	r1 = _mm256_mul_pd(r1, hxs);
	r1 = _mm256_add_pd(r1, expm1_q3);
	r1 = _mm256_mul_pd(r1, hxs);
	r1 = _mm256_add_pd(r1, expm1_q2);
	r1 = _mm256_mul_pd(r1, hxs);
	r1 = _mm256_add_pd(r1, expm1_q1);
	r1 = _mm256_mul_pd(r1, hxs);
	r1 = _mm256_add_pd(r1, one);
	v4df t = _mm256_mul_pd(r1, hfx);
	t = _mm256_add_pd(t, mthree);
	v4df e = _mm256_mul_pd(x, t);
	e = _mm256_add_pd(e, six);
#endif
	v4df h = _mm256_add_pd(r1, t);
	h = _mm256_div_pd(h, e);
	e = _mm256_mul_pd(h, hxs);
	v4df p2n = v1pow2d_core(k);
	v4df p2mn = _mm256_div_pd(one, p2n);
	e = _mm256_mul_pd(e, x);
	e = _mm256_sub_pd(e, hxs);
	e = _mm256_sub_pd(e, x);
	t = _mm256_sub_pd(one, p2mn);
	v4df y = _mm256_sub_pd(t, e);
	y = _mm256_mul_pd(y, py);
	return _mm256_mul_pd(y, p2n);
}

#endif // __AVX512F__

VECFI_CONST(expf_max_arg,0x42b17217); // +88.72283173
VECFI_CONST(expf_min_arg,0xc2aeac51); // -87.33655548

VECFI_CONST(exp10f_max_arg,0x421a209a); // +38.53183746
VECFI_CONST(exp10f_min_arg,0xc217b819); // -37.92978287

VECFI_CONST(expm1f_max_arg,0x42b17217); // +88.72283173
VECFI_CONST(expm1f_min_arg,0xc18aa123); // -17.32868004

#if defined(__AVX2__) || defined(__AVX512F__)
VECII_CONST(offf,0x7f);        // +127, the IEEE single precision exponent bias
VECII_CONST(mofff,0xffffff81); // -127 as a 32-bit two's-complement integer
#else
VECI_CONST(offf,0x7f);         // +127, the IEEE single precision exponent bias
VECI_CONST(mofff,0xffffff81);  // -127 as a 32-bit two's-complement integer
#endif

VECFI_CONST(expf_c1,0xbf318000); // -0.693359375f
VECFI_CONST(expf_c2,0x395e8083); // 2.12194440e-4f

VECFI_CONST(expf_p0,0x39506967); // 1.9875691500e-4f
VECFI_CONST(expf_p1,0x3ab743ce); // 1.3981999507e-3f
VECFI_CONST(expf_p2,0x3c088908); // 8.3334519073e-3f
VECFI_CONST(expf_p3,0x3d2aa9c1); // 4.1665795894e-2f
VECFI_CONST(expf_p4,0x3e2aaaaa); // 1.6666665459e-1f
VECFI_CONST(expf_p5,0x3f000000); // 5.0000000000e-1f

VECFI_CONST(exp10f_c1,0xbc01861c); // -7.9055088724e-3f

VECFI_CONST(expm1f_q1,0xbd088868); // -3.3333212137e-2f
VECFI_CONST(expm1f_q2,0x3acf3010); // 1.5807170421e-3f

#ifdef __AVX512F__

inline v16sf v1pow2f_core(v16si n)
{
	n = _mm512_add_epi32(n, offf);
	n = _mm512_slli_epi32(n, 23);
	return _mm512_castsi512_ps(n);
}
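
// Scalar sketch (illustration only): 2^n in single precision is the bit
// pattern uint32_t(n + 127) << 23 reinterpreted as a float -- the analogue of
// pow2d_sketch() above with an 8-bit exponent field starting at bit 23.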

inline v16si v1expf_core(v16sf& x)
{
	v16sf px = _mm512_mul_ps(x, log2ef);
	px = _mm512_floor_ps(px);
	v16si n = _mm512_cvtps_epi32(px);
	x = _mm512_fmadd_ps(px, expf_c1, x);
	x = _mm512_fmadd_ps(px, expf_c2, x);
	v16sf xx = _mm512_mul_ps(x, x);
	px = _mm512_fmadd_ps(expf_p0, x, expf_p1);
	px = _mm512_fmadd_ps(px, x, expf_p2);
	px = _mm512_fmadd_ps(px, x, expf_p3);
	px = _mm512_fmadd_ps(px, x, expf_p4);
	px = _mm512_fmadd_ps(px, x, expf_p5);
	px = _mm512_mul_ps(px, xx);
	px = _mm512_add_ps(px, x);
	x = _mm512_add_ps(px, onef);
	return n;
}
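
// On return, x holds e^r = 1 + r + r^2*P(r) for the reduced argument
// r = x - n*ln(2), and the return value holds n; the caller completes
// expf(x) = 2^n * e^r by scaling with v1pow2f_core(n).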

inline v16sf v1expm1f_core(v16sf x)
{
	v16sf px = _mm512_mul_ps(x, log2ef);
	px = _mm512_roundscale_ps(px, _MM_FROUND_NINT);
	px = _mm512_min_ps(px, c127);
	v16si n = _mm512_cvtps_epi32(px);
	x = _mm512_fmadd_ps(px, expf_c1, x);
	x = _mm512_fmadd_ps(px, expf_c2, x);
	v16sf hfx = _mm512_mul_ps(x, halff);
	v16sf hxs = _mm512_mul_ps(x, hfx);
	// sign of t is reversed compared to openlibm implementation
	v16sf r1 = _mm512_fmadd_ps(expm1f_q2, hxs, expm1f_q1);
	r1 = _mm512_fmadd_ps(r1, hxs, onef);
	v16sf t = _mm512_fmadd_ps(r1, hfx, mthreef);
	v16sf e = _mm512_fmadd_ps(x, t, sixf);
	v16sf h = _mm512_add_ps(r1, t);
	h = _mm512_div_ps(h, e);
	e = _mm512_mul_ps(h, hxs);
	v16sf p2n = v1pow2f_core(n);
	v16sf p2mn = _mm512_div_ps(onef, p2n);
	e = _mm512_mul_ps(e, x);
	e = _mm512_sub_ps(e, hxs);
	e = _mm512_sub_ps(e, x);
	t = _mm512_sub_ps(onef, p2mn);
	v16sf y = _mm512_sub_ps(t, e);
	return _mm512_mul_ps(y, p2n);
}

#else

inline v8sf v1pow2f_core(v8si n)
{
#ifdef __AVX2__
	n = _mm256_add_epi32(n, offf);
	n = _mm256_slli_epi32(n, 23);
#else
	// AVX1 lacks 256-bit integer adds and shifts: process the two
	// 128-bit halves separately and reassemble
	v4si nl = _mm256_extractf128_si256(n, 0);
	nl = _mm_add_epi32(nl, offf);
	v4si nu = _mm256_extractf128_si256(n, 1);
	nu = _mm_add_epi32(nu, offf);
	nl = _mm_slli_epi32(nl, 23);
	nu = _mm_slli_epi32(nu, 23);
	n = _mm256_insertf128_si256(n, nl, 0);
	n = _mm256_insertf128_si256(n, nu, 1);
#endif
	return _mm256_castsi256_ps(n);
}

inline v8si v1expf_core(v8sf& x)
{
	v8sf px = _mm256_mul_ps(x, log2ef);
	px = _mm256_floor_ps(px);
	v8si n = _mm256_cvtps_epi32(px);
#ifdef __FMA__
	x = _mm256_fmadd_ps(px, expf_c1, x);
	x = _mm256_fmadd_ps(px, expf_c2, x);
#else
	v8sf y = _mm256_mul_ps(px, expf_c1);
	x = _mm256_add_ps(x, y);
	y = _mm256_mul_ps(px, expf_c2);
	x = _mm256_add_ps(x, y);
#endif
	v8sf xx = _mm256_mul_ps(x, x);
#ifdef __FMA__
	px = _mm256_fmadd_ps(expf_p0, x, expf_p1);
	px = _mm256_fmadd_ps(px, x, expf_p2);
	px = _mm256_fmadd_ps(px, x, expf_p3);
	px = _mm256_fmadd_ps(px, x, expf_p4);
	px = _mm256_fmadd_ps(px, x, expf_p5);
#else
	px = _mm256_mul_ps(expf_p0, x);
	px = _mm256_add_ps(px, expf_p1);
	px = _mm256_mul_ps(px, x);
	px = _mm256_add_ps(px, expf_p2);
	px = _mm256_mul_ps(px, x);
	px = _mm256_add_ps(px, expf_p3);
	px = _mm256_mul_ps(px, x);
	px = _mm256_add_ps(px, expf_p4);
	px = _mm256_mul_ps(px, x);
	px = _mm256_add_ps(px, expf_p5);
#endif
	px = _mm256_mul_ps(px, xx);
	px = _mm256_add_ps(px, x);
	x = _mm256_add_ps(px, onef);
	return n;
}

inline v8sf v1expm1f_core(v8sf x)
{
	v8sf px = _mm256_mul_ps(x, log2ef);
	px = _mm256_round_ps(px, _MM_FROUND_NINT);
	px = _mm256_min_ps(px, c127);
	v8si n = _mm256_cvtps_epi32(px);
#ifdef __FMA__
	x = _mm256_fmadd_ps(px, expf_c1, x);
	x = _mm256_fmadd_ps(px, expf_c2, x);
#else
	v8sf w = _mm256_mul_ps(px, expf_c1);
	x = _mm256_add_ps(x, w);
	w = _mm256_mul_ps(px, expf_c2);
	x = _mm256_add_ps(x, w);
#endif
	v8sf hfx = _mm256_mul_ps(x, halff);
	v8sf hxs = _mm256_mul_ps(x, hfx);
	// sign of t is reversed compared to openlibm implementation
#ifdef __FMA__
	v8sf r1 = _mm256_fmadd_ps(expm1f_q2, hxs, expm1f_q1);
	r1 = _mm256_fmadd_ps(r1, hxs, onef);
	v8sf t = _mm256_fmadd_ps(r1, hfx, mthreef);
	v8sf e = _mm256_fmadd_ps(x, t, sixf);
#else
	v8sf r1 = _mm256_mul_ps(expm1f_q2, hxs);
	r1 = _mm256_add_ps(r1, expm1f_q1);
	r1 = _mm256_mul_ps(r1, hxs);
	r1 = _mm256_add_ps(r1, onef);
	v8sf t = _mm256_mul_ps(r1, hfx);
	t = _mm256_add_ps(t, mthreef);
	v8sf e = _mm256_mul_ps(x, t);
	e = _mm256_add_ps(e, sixf);
#endif
	v8sf h = _mm256_add_ps(r1, t);
	h = _mm256_div_ps(h, e);
	e = _mm256_mul_ps(h, hxs);
	v8sf p2n = v1pow2f_core(n);
	v8sf p2mn = _mm256_div_ps(onef, p2n);
	e = _mm256_mul_ps(e, x);
	e = _mm256_sub_ps(e, hxs);
	e = _mm256_sub_ps(e, x);
	t = _mm256_sub_ps(onef, p2mn);
	v8sf y = _mm256_sub_ps(t, e);
	return _mm256_mul_ps(y, p2n);
}

#endif // __AVX512F__

#endif // __AVX__

#endif // VECTORIZE_EXP_CORE_H