#ifndef VECTORIZE_EXP_CORE_H
#define VECTORIZE_EXP_CORE_H
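
// Core SIMD kernels for the vectorized exp(), exp10(), and expm1() routines.
// Separate implementations are provided for AVX-512 and for AVX/AVX2 targets;
// the wrappers that call these kernels are assumed to live elsewhere in the
// vectorization code and to handle argument clamping and special values.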
// argument limits for the vectorized exp(), exp10(), and expm1()
VECDI_CONST(exp_max_arg,0x40862e42fefa39ee);   // ~ +709.78, largest x for which exp(x) is finite
VECDI_CONST(exp_min_arg,0xc086232bdd7abcd3);   // ~ -708.40, roughly log(DBL_MIN)

VECDI_CONST(exp10_max_arg,0x40734413509f79fe); // ~ +308.25, roughly log10(DBL_MAX)
VECDI_CONST(exp10_min_arg,0xc0733a7146f72a42); // ~ -307.65, roughly log10(DBL_MIN)

VECDI_CONST(expm1_max_arg,0x40862e42fefa39ef); // ~ +709.78
VECDI_CONST(expm1_min_arg,0xc042b708872320e2); // ~ -37.43, below this expm1(x) == -1 in double precision

VECI_CONST(off,0x3ff);       // exponent bias of an IEEE double, 1023
VECI_CONST(moff,0xfffffc01); // -1023 as a 32-bit two's complement value

// two-part (high/low) split of -ln(2) used in the argument reduction
VECDI_CONST(exp_c1,0xbfe62e4000000000);
VECDI_CONST(exp_c2,0xbeb7f7d1cf79abca);

// coefficients of the numerator P(x^2) of the rational approximation in v1expd_core()
VECDI_CONST(exp_p0,0x3f2089cdd5e44be8);
VECDI_CONST(exp_p1,0x3f9f06d10cca2c7e);
VECDI_CONST(exp_p2,0x3ff0000000000000);
// coefficients of the denominator Q(x^2)
VECDI_CONST(exp_q0,0x3ec92eb6bc365fa0);
VECDI_CONST(exp_q1,0x3f64ae39b508b6c0);
VECDI_CONST(exp_q2,0x3fcd17099887e074);
VECDI_CONST(exp_q3,0x4000000000000000);

VECDI_CONST(exp10_c1,0xbf8030c37085dc7f);

// coefficients of the polynomial correction used in v1expm1d_core()
VECDI_CONST(expm1_q1,0xbfa11111111110f4);
VECDI_CONST(expm1_q2,0x3f5a01a019fe5585);
VECDI_CONST(expm1_q3,0xbf14ce199eaadbb7);
VECDI_CONST(expm1_q4,0x3ed0cfca86e65239);
VECDI_CONST(expm1_q5,0xbe8afdb76e09c32d);
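
// The exp kernels below use the classic two-step scheme:
//   1. argument reduction: with m = floor(x*log2(e)), compute r = x - m*ln(2),
//      0 <= r < ln(2), using the two-part constants exp_c1/exp_c2 so that the
//      subtraction stays accurate;
//   2. rational approximation on the reduced interval,
//        exp(r) ~ 1 + 2*r*P(r^2) / ( Q(r^2) - r*P(r^2) ),
//      using the exp_p* and exp_q* coefficients above;
//   3. rescaling by 2^m, which v1pow2d_core() constructs by writing the biased
//      exponent m+1023 directly into the exponent field of an IEEE double.
// v1expd_core() performs steps 1 and 2, leaves exp(r) in its argument, and
// returns the integer exponents m; applying the 2^m scale factor (and the
// range checks against exp_max_arg/exp_min_arg) is presumably left to the
// calling wrappers.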
#ifdef __AVX512F__

// construct 2^m for integer m by placing the biased exponent in the exponent field
inline v8df v1pow2d_core(v8si m)
{
    m = _mm256_add_epi32(m, off);
    v8di n = _mm512_cvtepi32_epi64(m);
    n = _mm512_slli_epi64(n, 52);
    return _mm512_castsi512_pd(n);
}
inline v8si v1expd_core(v8df& x)
{
    v8df px = _mm512_mul_pd(x, log2e);
    px = _mm512_floor_pd(px);
    v8si m = _mm512_cvtpd_epi32(px);
    // subtract m*ln(2) from x in two steps for improved precision
    x = _mm512_fmadd_pd(px, exp_c1, x);
    x = _mm512_fmadd_pd(px, exp_c2, x);
    // rational approximation exp(x) ~ 1 + 2*x*P(x^2)/(Q(x^2) - x*P(x^2))
    v8df xx = _mm512_mul_pd(x, x);
    px = _mm512_fmadd_pd(exp_p0, xx, exp_p1);
    px = _mm512_fmadd_pd(px, xx, exp_p2);
    v8df qx = _mm512_fmadd_pd(exp_q0, xx, exp_q1);
    qx = _mm512_fmadd_pd(qx, xx, exp_q2);
    qx = _mm512_fmadd_pd(qx, xx, exp_q3);
    px = _mm512_mul_pd(x, px);
    xx = _mm512_sub_pd(qx, px);
    x = _mm512_div_pd(px, xx);
    x = _mm512_fmadd_pd(two, x, one);
    return m;
}
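
// The expm1 kernels compute exp(x)-1 while avoiding the cancellation that a
// naive exp(x)-1 would suffer from for small |x|.  The argument is reduced
// with round-to-nearest, x = m*ln(2) + r with |r| <= ln(2)/2, a small
// correction term is evaluated from the expm1_q* coefficients, and the result
// is reassembled around the exactly representable quantity 1 - 2^-m so that
// the subtraction of 1 introduces no additional rounding error.  This follows
// the well-known fdlibm-style expm1 recipe.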
inline v8df v1expm1d_core(v8df x)
{
    v8df px = _mm512_mul_pd(x, log2e);
    px = _mm512_roundscale_pd(px, _MM_FROUND_NINT);
    // exponents above 1023 are capped; the excess is carried in the factor py
    v8df py = _mm512_min_pd(px, c1023);
    v8si k = _mm512_cvtpd_epi32(py);
    py = _mm512_sub_pd(px, py);
    py = _mm512_add_pd(py, one);
    x = _mm512_fmadd_pd(px, exp_c1, x);
    x = _mm512_fmadd_pd(px, exp_c2, x);
    v8df hfx = _mm512_mul_pd(x, half);
    v8df hxs = _mm512_mul_pd(x, hfx);
    v8df r1 = _mm512_fmadd_pd(expm1_q5, hxs, expm1_q4);
    r1 = _mm512_fmadd_pd(r1, hxs, expm1_q3);
    r1 = _mm512_fmadd_pd(r1, hxs, expm1_q2);
    r1 = _mm512_fmadd_pd(r1, hxs, expm1_q1);
    r1 = _mm512_fmadd_pd(r1, hxs, one);
    v8df t = _mm512_fmadd_pd(r1, hfx, mthree);
    v8df e = _mm512_fmadd_pd(x, t, six);
    v8df h = _mm512_add_pd(r1, t);
    h = _mm512_div_pd(h, e);
    e = _mm512_mul_pd(h, hxs);
    v8df p2n = v1pow2d_core(k);
    v8df p2mn = _mm512_div_pd(one, p2n);
    e = _mm512_mul_pd(e, x);
    e = _mm512_sub_pd(e, hxs);
    e = _mm512_sub_pd(e, x);
    t = _mm512_sub_pd(one, p2mn);
    v8df y = _mm512_sub_pd(t, e);
    y = _mm512_mul_pd(y, py);
    return _mm512_mul_pd(y, p2n);
}

#else
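
// 256-bit AVX/AVX2 implementations of the same kernels, used when AVX-512 is
// not available.  Separate code paths are provided with and without FMA.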
inline v4df v1pow2d_core(v4si m)
{
    m = _mm_add_epi32(m, off);
#ifdef __AVX2__
    v4di n = _mm256_cvtepi32_epi64(m);
    n = _mm256_slli_epi64(n, 52);
#else
    // without AVX2, assemble the exponent bit patterns with 128-bit integer ops
    m = _mm_slli_epi32(m, 20);
    v4si z = _mm_set1_epi32(0);
    v4si nl = _mm_unpacklo_epi32(z, m);
    v8si n = _mm256_set1_epi32(0);
    n = _mm256_insertf128_si256(n, nl, 0);
    v4si nu = _mm_unpackhi_epi32(z, m);
    n = _mm256_insertf128_si256(n, nu, 1);
#endif
    return _mm256_castsi256_pd(n);
}
inline v4si v1expd_core(v4df& x)
{
    v4df px = _mm256_mul_pd(x, log2e);
    px = _mm256_floor_pd(px);
    v4si m = _mm256_cvtpd_epi32(px);
#ifdef __FMA__
    x = _mm256_fmadd_pd(px, exp_c1, x);
    x = _mm256_fmadd_pd(px, exp_c2, x);
#else
    v4df y = _mm256_mul_pd(px, exp_c1);
    x = _mm256_add_pd(x, y);
    y = _mm256_mul_pd(px, exp_c2);
    x = _mm256_add_pd(x, y);
#endif
    v4df xx = _mm256_mul_pd(x, x);
#ifdef __FMA__
    px = _mm256_fmadd_pd(exp_p0, xx, exp_p1);
    px = _mm256_fmadd_pd(px, xx, exp_p2);
    v4df qx = _mm256_fmadd_pd(exp_q0, xx, exp_q1);
    qx = _mm256_fmadd_pd(qx, xx, exp_q2);
    qx = _mm256_fmadd_pd(qx, xx, exp_q3);
#else
    px = _mm256_mul_pd(exp_p0, xx);
    px = _mm256_add_pd(px, exp_p1);
    px = _mm256_mul_pd(px, xx);
    px = _mm256_add_pd(px, exp_p2);
    v4df qx = _mm256_mul_pd(exp_q0, xx);
    qx = _mm256_add_pd(qx, exp_q1);
    qx = _mm256_mul_pd(qx, xx);
    qx = _mm256_add_pd(qx, exp_q2);
    qx = _mm256_mul_pd(qx, xx);
    qx = _mm256_add_pd(qx, exp_q3);
#endif
    px = _mm256_mul_pd(x, px);
    xx = _mm256_sub_pd(qx, px);
    x = _mm256_div_pd(px, xx);
#ifdef __FMA__
    x = _mm256_fmadd_pd(two, x, one);
#else
    x = _mm256_mul_pd(x, two);
    x = _mm256_add_pd(x, one);
#endif
    return m;
}
inline v4df v1expm1d_core(v4df x)
{
    v4df px = _mm256_mul_pd(x, log2e);
    px = _mm256_round_pd(px, _MM_FROUND_NINT);
    v4df py = _mm256_min_pd(px, c1023);
    v4si k = _mm256_cvtpd_epi32(py);
    py = _mm256_sub_pd(px, py);
    py = _mm256_add_pd(py, one);
#ifdef __FMA__
    x = _mm256_fmadd_pd(px, exp_c1, x);
    x = _mm256_fmadd_pd(px, exp_c2, x);
#else
    v4df w = _mm256_mul_pd(px, exp_c1);
    x = _mm256_add_pd(x, w);
    w = _mm256_mul_pd(px, exp_c2);
    x = _mm256_add_pd(x, w);
#endif
    v4df hfx = _mm256_mul_pd(x, half);
    v4df hxs = _mm256_mul_pd(x, hfx);
#ifdef __FMA__
    v4df r1 = _mm256_fmadd_pd(expm1_q5, hxs, expm1_q4);
    r1 = _mm256_fmadd_pd(r1, hxs, expm1_q3);
    r1 = _mm256_fmadd_pd(r1, hxs, expm1_q2);
    r1 = _mm256_fmadd_pd(r1, hxs, expm1_q1);
    r1 = _mm256_fmadd_pd(r1, hxs, one);
    v4df t = _mm256_fmadd_pd(r1, hfx, mthree);
    v4df e = _mm256_fmadd_pd(x, t, six);
#else
    v4df r1 = _mm256_mul_pd(expm1_q5, hxs);
    r1 = _mm256_add_pd(r1, expm1_q4);
    r1 = _mm256_mul_pd(r1, hxs);
    r1 = _mm256_add_pd(r1, expm1_q3);
    r1 = _mm256_mul_pd(r1, hxs);
    r1 = _mm256_add_pd(r1, expm1_q2);
    r1 = _mm256_mul_pd(r1, hxs);
    r1 = _mm256_add_pd(r1, expm1_q1);
    r1 = _mm256_mul_pd(r1, hxs);
    r1 = _mm256_add_pd(r1, one);
    v4df t = _mm256_mul_pd(r1, hfx);
    t = _mm256_add_pd(t, mthree);
    v4df e = _mm256_mul_pd(x, t);
    e = _mm256_add_pd(e, six);
#endif
    v4df h = _mm256_add_pd(r1, t);
    h = _mm256_div_pd(h, e);
    e = _mm256_mul_pd(h, hxs);
    v4df p2n = v1pow2d_core(k);
    v4df p2mn = _mm256_div_pd(one, p2n);
    e = _mm256_mul_pd(e, x);
    e = _mm256_sub_pd(e, hxs);
    e = _mm256_sub_pd(e, x);
    t = _mm256_sub_pd(one, p2mn);
    v4df y = _mm256_sub_pd(t, e);
    y = _mm256_mul_pd(y, py);
    return _mm256_mul_pd(y, p2n);
}
#endif // __AVX512F__
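
// Single-precision versions.  The same m*ln(2) reduction is used, but expf is
// evaluated with a plain polynomial, exp(r) ~ 1 + r + r^2*P(r), instead of a
// rational approximation.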
VECFI_CONST(expf_max_arg,0x42b17217);   // ~ +88.72, largest x for which expf(x) is finite
VECFI_CONST(expf_min_arg,0xc2aeac51);   // ~ -87.34, roughly logf(FLT_MIN)

VECFI_CONST(exp10f_max_arg,0x421a209a); // ~ +38.53, roughly log10f(FLT_MAX)
VECFI_CONST(exp10f_min_arg,0xc217b819); // ~ -37.93, roughly log10f(FLT_MIN)

VECFI_CONST(expm1f_max_arg,0x42b17217); // ~ +88.72
VECFI_CONST(expm1f_min_arg,0xc18aa123); // ~ -17.33, below this expm1f(x) == -1 in single precision
#if defined(__AVX2__) || defined(__AVX512F__)
VECII_CONST(offf,0x7f);        // exponent bias of an IEEE float, 127
VECII_CONST(mofff,0xffffff81); // -127 as a 32-bit two's complement value
#else
VECI_CONST(offf,0x7f);
VECI_CONST(mofff,0xffffff81);
#endif

// two-part split of -ln(2) in single precision
VECFI_CONST(expf_c1,0xbf318000);
VECFI_CONST(expf_c2,0x395e8083);

// polynomial coefficients for expf(r) ~ 1 + r + r^2*P(r)
VECFI_CONST(expf_p0,0x39506967);
VECFI_CONST(expf_p1,0x3ab743ce);
VECFI_CONST(expf_p2,0x3c088908);
VECFI_CONST(expf_p3,0x3d2aa9c1);
VECFI_CONST(expf_p4,0x3e2aaaaa);
VECFI_CONST(expf_p5,0x3f000000);

VECFI_CONST(exp10f_c1,0xbc01861c);

// polynomial correction coefficients for v1expm1f_core()
VECFI_CONST(expm1f_q1,0xbd088868);
VECFI_CONST(expm1f_q2,0x3acf3010);
#ifdef __AVX512F__

// construct 2^n for integer n by placing the biased exponent in the exponent field
inline v16sf v1pow2f_core(v16si n)
{
    n = _mm512_add_epi32(n, offf);
    n = _mm512_slli_epi32(n, 23);
    return _mm512_castsi512_ps(n);
}
inline v16si v1expf_core(v16sf& x)
{
    v16sf px = _mm512_mul_ps(x, log2ef);
    px = _mm512_floor_ps(px);
    v16si n = _mm512_cvtps_epi32(px);
    x = _mm512_fmadd_ps(px, expf_c1, x);
    x = _mm512_fmadd_ps(px, expf_c2, x);
    v16sf xx = _mm512_mul_ps(x, x);
    px = _mm512_fmadd_ps(expf_p0, x, expf_p1);
    px = _mm512_fmadd_ps(px, x, expf_p2);
    px = _mm512_fmadd_ps(px, x, expf_p3);
    px = _mm512_fmadd_ps(px, x, expf_p4);
    px = _mm512_fmadd_ps(px, x, expf_p5);
    px = _mm512_mul_ps(px, xx);
    px = _mm512_add_ps(px, x);
    x = _mm512_add_ps(px, onef);
    return n;
}
inline v16sf v1expm1f_core(v16sf x)
{
    v16sf px = _mm512_mul_ps(x, log2ef);
    px = _mm512_roundscale_ps(px, _MM_FROUND_NINT);
    px = _mm512_min_ps(px, c127);
    v16si n = _mm512_cvtps_epi32(px);
    x = _mm512_fmadd_ps(px, expf_c1, x);
    x = _mm512_fmadd_ps(px, expf_c2, x);
    v16sf hfx = _mm512_mul_ps(x, halff);
    v16sf hxs = _mm512_mul_ps(x, hfx);
    v16sf r1 = _mm512_fmadd_ps(expm1f_q2, hxs, expm1f_q1);
    r1 = _mm512_fmadd_ps(r1, hxs, onef);
    v16sf t = _mm512_fmadd_ps(r1, hfx, mthreef);
    v16sf e = _mm512_fmadd_ps(x, t, sixf);
    v16sf h = _mm512_add_ps(r1, t);
    h = _mm512_div_ps(h, e);
    e = _mm512_mul_ps(h, hxs);
    v16sf p2n = v1pow2f_core(n);
    v16sf p2mn = _mm512_div_ps(onef, p2n);
    e = _mm512_mul_ps(e, x);
    e = _mm512_sub_ps(e, hxs);
    e = _mm512_sub_ps(e, x);
    t = _mm512_sub_ps(onef, p2mn);
    v16sf y = _mm512_sub_ps(t, e);
    return _mm512_mul_ps(y, p2n);
}

#else
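
// 256-bit AVX/AVX2 implementations of the single-precision kernels, again
// with separate FMA and non-FMA code paths.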
inline v8sf v1pow2f_core(v8si n)
{
#ifdef __AVX2__
    n = _mm256_add_epi32(n, offf);
    n = _mm256_slli_epi32(n, 23);
#else
    // without AVX2, process the two 128-bit halves separately
    v4si nl = _mm256_extractf128_si256(n, 0);
    nl = _mm_add_epi32(nl, offf);
    v4si nu = _mm256_extractf128_si256(n, 1);
    nu = _mm_add_epi32(nu, offf);
    nl = _mm_slli_epi32(nl, 23);
    nu = _mm_slli_epi32(nu, 23);
    n = _mm256_insertf128_si256(n, nl, 0);
    n = _mm256_insertf128_si256(n, nu, 1);
#endif
    return _mm256_castsi256_ps(n);
}
inline v8si v1expf_core(v8sf& x)
{
    v8sf px = _mm256_mul_ps(x, log2ef);
    px = _mm256_floor_ps(px);
    v8si n = _mm256_cvtps_epi32(px);
#ifdef __FMA__
    x = _mm256_fmadd_ps(px, expf_c1, x);
    x = _mm256_fmadd_ps(px, expf_c2, x);
#else
    v8sf y = _mm256_mul_ps(px, expf_c1);
    x = _mm256_add_ps(x, y);
    y = _mm256_mul_ps(px, expf_c2);
    x = _mm256_add_ps(x, y);
#endif
    v8sf xx = _mm256_mul_ps(x, x);
#ifdef __FMA__
    px = _mm256_fmadd_ps(expf_p0, x, expf_p1);
    px = _mm256_fmadd_ps(px, x, expf_p2);
    px = _mm256_fmadd_ps(px, x, expf_p3);
    px = _mm256_fmadd_ps(px, x, expf_p4);
    px = _mm256_fmadd_ps(px, x, expf_p5);
#else
    px = _mm256_mul_ps(expf_p0, x);
    px = _mm256_add_ps(px, expf_p1);
    px = _mm256_mul_ps(px, x);
    px = _mm256_add_ps(px, expf_p2);
    px = _mm256_mul_ps(px, x);
    px = _mm256_add_ps(px, expf_p3);
    px = _mm256_mul_ps(px, x);
    px = _mm256_add_ps(px, expf_p4);
    px = _mm256_mul_ps(px, x);
    px = _mm256_add_ps(px, expf_p5);
#endif
    px = _mm256_mul_ps(px, xx);
    px = _mm256_add_ps(px, x);
    x = _mm256_add_ps(px, onef);
    return n;
}
inline v8sf v1expm1f_core(v8sf x)
{
    v8sf px = _mm256_mul_ps(x, log2ef);
    px = _mm256_round_ps(px, _MM_FROUND_NINT);
    px = _mm256_min_ps(px, c127);
    v8si n = _mm256_cvtps_epi32(px);
#ifdef __FMA__
    x = _mm256_fmadd_ps(px, expf_c1, x);
    x = _mm256_fmadd_ps(px, expf_c2, x);
#else
    v8sf w = _mm256_mul_ps(px, expf_c1);
    x = _mm256_add_ps(x, w);
    w = _mm256_mul_ps(px, expf_c2);
    x = _mm256_add_ps(x, w);
#endif
    v8sf hfx = _mm256_mul_ps(x, halff);
    v8sf hxs = _mm256_mul_ps(x, hfx);
#ifdef __FMA__
    v8sf r1 = _mm256_fmadd_ps(expm1f_q2, hxs, expm1f_q1);
    r1 = _mm256_fmadd_ps(r1, hxs, onef);
    v8sf t = _mm256_fmadd_ps(r1, hfx, mthreef);
    v8sf e = _mm256_fmadd_ps(x, t, sixf);
#else
    v8sf r1 = _mm256_mul_ps(expm1f_q2, hxs);
    r1 = _mm256_add_ps(r1, expm1f_q1);
    r1 = _mm256_mul_ps(r1, hxs);
    r1 = _mm256_add_ps(r1, onef);
    v8sf t = _mm256_mul_ps(r1, hfx);
    t = _mm256_add_ps(t, mthreef);
    v8sf e = _mm256_mul_ps(x, t);
    e = _mm256_add_ps(e, sixf);
#endif
    v8sf h = _mm256_add_ps(r1, t);
    h = _mm256_div_ps(h, e);
    e = _mm256_mul_ps(h, hxs);
    v8sf p2n = v1pow2f_core(n);
    v8sf p2mn = _mm256_div_ps(onef, p2n);
    e = _mm256_mul_ps(e, x);
    e = _mm256_sub_ps(e, hxs);
    e = _mm256_sub_ps(e, x);
    t = _mm256_sub_ps(onef, p2mn);
    v8sf y = _mm256_sub_ps(t, e);
    return _mm256_mul_ps(y, p2n);
}
#endif // __AVX512F__
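
// A minimal sketch of how a wrapper might combine these kernels into a full
// exp() evaluation (hypothetical helper, assuming an AVX2+FMA build; the real
// wrappers are expected to also clamp the argument against exp_max_arg and
// exp_min_arg and to handle NaN before calling the core routine):
//
//   inline v4df v1expd_sketch(v4df x)
//   {
//       v4si m = v1expd_core(x);                   // x now holds exp(r), r the reduced argument
//       return _mm256_mul_pd(x, v1pow2d_core(m));  // scale by 2^m to obtain exp(x)
//   }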