#ifndef VECTORIZE_MATH_H
#define VECTORIZE_MATH_H

#ifdef __AVX__

#include <immintrin.h>

// shorthand for the packed vector types used below
typedef __m128i v2di;
typedef __m128i v4si;

typedef __m256d v4df;
typedef __m256 v8sf;
typedef __m256i v4di;
typedef __m256i v8si;

#ifdef __AVX512F__

typedef __m512d v8df;
typedef __m512 v16sf;
typedef __m512i v8di;
typedef __m512i v16si;

#define VECD_CONST(name,x) \
ALIGNED(64) static const double __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8df& name = *reinterpret_cast<const v8df*>(__avx_##name)

#define VECDI_CONST(name,x) \
ALIGNED(64) static const uint64 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8df& name = *reinterpret_cast<const v8df*>(__avx_##name)

#define VECF_CONST(name,x) \
ALIGNED(64) static const sys_float __avx_##name[16] = {x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x}; \
static const v16sf& name = *reinterpret_cast<const v16sf*>(__avx_##name)

#define VECFI_CONST(name,x) \
ALIGNED(64) static const uint32 __avx_##name[16] = {x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x}; \
static const v16sf& name = *reinterpret_cast<const v16sf*>(__avx_##name)

#define VECI_CONST(name,x) \
ALIGNED(32) static const uint32 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8si& name = *reinterpret_cast<const v8si*>(__avx_##name)

#define VECII_CONST(name,x) \
ALIGNED(64) static const uint32 __avx_##name[16] = {x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x}; \
static const v16si& name = *reinterpret_cast<const v16si*>(__avx_##name)

#define VECL_CONST(name,x) \
ALIGNED(32) static const uint64 __avx_##name[4] = {x,x,x,x}; \
static const v4di& name = *reinterpret_cast<const v4di*>(__avx_##name)

#define VECLL_CONST(name,x) \
ALIGNED(64) static const uint64 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8di& name = *reinterpret_cast<const v8di*>(__avx_##name)

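// Illustration (a sketch, not used by the code): VECD_CONST(half,0.5) expands to
//
//   ALIGNED(64) static const double __avx_half[8] = {0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5};
//   static const v8df& half = *reinterpret_cast<const v8df*>(__avx_half);
//
// i.e. an aligned array with the value broadcast to all lanes, plus a reference
// that lets the constant be used directly as a v8df operand in AVX-512 intrinsics.
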
// this union gives access to the individual lanes of a packed vector
union vpack
{
    ALIGNED(64) v8df pd;
    ALIGNED(64) v16sf ps;
    double d[8];
    int64 l[8];
    sys_float f[16];
    int32 i[16];
};

inline bool getmaskbit(unsigned int mask, unsigned int n)
{
    return ( ((mask>>n)&1) == 1 );
}

template<class T>
inline string print_pack(const T x[], int npack, unsigned int mask, const string& name)
{
    ostringstream oss;
    for( int i=0; i < npack; ++i )
    {
        // mark the lanes that triggered the domain error
        if( getmaskbit(mask, i) )
            oss << " => ";
        else
            oss << "    ";
        oss << name << "[" << i << "] = " << x[i] << "\n";
    }
    return oss.str();
}

inline string DEMsg(const string& routine, v8df x, __mmask8 mask)
{
    ostringstream oss;
    vpack p;
    oss << "domain error in " << routine << ".\n";
    p.pd = x;
    oss << print_pack(p.d, 8, mask, "x");
    return oss.str();
}

inline string DEMsg(const string& routine, v16sf x, __mmask16 mask)
{
    ostringstream oss;
    vpack p;
    oss << "domain error in " << routine << ".\n";
    p.ps = x;
    oss << print_pack(p.f, 16, mask, "x");
    return oss.str();
}

inline string DEMsg(const string& routine, v8df x, __mmask8 mask1, v8df y, __mmask8 mask2)
{
    ostringstream oss;
    vpack p;
    oss << "domain error in " << routine << ".\n";
    p.pd = x;
    oss << print_pack(p.d, 8, mask1, "x");
    p.pd = y;
    oss << print_pack(p.d, 8, mask2, "y");
    return oss.str();
}

inline string DEMsg(const string& routine, v16sf x, __mmask16 mask1, v16sf y, __mmask16 mask2)
{
    ostringstream oss;
    vpack p;
    oss << "domain error in " << routine << ".\n";
    p.ps = x;
    oss << print_pack(p.f, 16, mask1, "x");
    p.ps = y;
    oss << print_pack(p.f, 16, mask2, "y");
    return oss.str();
}

#else // __AVX512F__

#define VECD_CONST(name,x) \
ALIGNED(32) static const double __avx_##name[4] = {x,x,x,x}; \
static const v4df& name = *reinterpret_cast<const v4df*>(__avx_##name)

#define VECDI_CONST(name,x) \
ALIGNED(32) static const uint64 __avx_##name[4] = {x,x,x,x}; \
static const v4df& name = *reinterpret_cast<const v4df*>(__avx_##name)

#define VECF_CONST(name,x) \
ALIGNED(32) static const sys_float __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8sf& name = *reinterpret_cast<const v8sf*>(__avx_##name)

#define VECFI_CONST(name,x) \
ALIGNED(32) static const uint32 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8sf& name = *reinterpret_cast<const v8sf*>(__avx_##name)

#define VECI_CONST(name,x) \
ALIGNED(16) static const uint32 __avx_##name[4] = {x,x,x,x}; \
static const v4si& name = *reinterpret_cast<const v4si*>(__avx_##name)

#define VECII_CONST(name,x) \
ALIGNED(32) static const uint32 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8si& name = *reinterpret_cast<const v8si*>(__avx_##name)

#define VECL_CONST(name,x) \
ALIGNED(16) static const uint64 __avx_##name[2] = {x,x}; \
static const v2di& name = *reinterpret_cast<const v2di*>(__avx_##name)

#define VECLL_CONST(name,x) \
ALIGNED(32) static const uint64 __avx_##name[4] = {x,x,x,x}; \
static const v4di& name = *reinterpret_cast<const v4di*>(__avx_##name)

// this union gives access to the individual lanes of a packed vector
union vpack
{
    ALIGNED(32) v4df pd;
    ALIGNED(32) v8sf ps;
    double d[4];
    int64 l[4];
    sys_float f[8];
    int32 i[8];
};

template<class T, class U>
inline string print_pack(const T x[], const U m[], int npack, const string& name)
{
    ostringstream oss;
    for( int i=0; i < npack; ++i )
    {
        // a non-zero mask lane flags a value that triggered the domain error
        if( m[i] != 0 )
            oss << " => ";
        else
            oss << "    ";
        oss << name << "[" << i << "] = " << x[i] << "\n";
    }
    return oss.str();
}

inline string DEMsg(const string& routine, v4df x, v4df mask)
{
    ostringstream oss;
    vpack p, m;
    oss << "domain error in " << routine << ".\n";
    p.pd = x;
    m.pd = mask;
    oss << print_pack(p.d, m.l, 4, "x");
    return oss.str();
}

inline string DEMsg(const string& routine, v8sf x, v8sf mask)
{
    ostringstream oss;
    vpack p, m;
    oss << "domain error in " << routine << ".\n";
    p.ps = x;
    m.ps = mask;
    oss << print_pack(p.f, m.i, 8, "x");
    return oss.str();
}

inline string DEMsg(const string& routine, v4df x, v4df mask1, v4df y, v4df mask2)
{
    ostringstream oss;
    vpack p, m;
    oss << "domain error in " << routine << ".\n";
    p.pd = x;
    m.pd = mask1;
    oss << print_pack(p.d, m.l, 4, "x");
    p.pd = y;
    m.pd = mask2;
    oss << print_pack(p.d, m.l, 4, "y");
    return oss.str();
}

inline string DEMsg(const string& routine, v8sf x, v8sf mask1, v8sf y, v8sf mask2)
{
    ostringstream oss;
    vpack p, m;
    oss << "domain error in " << routine << ".\n";
    p.ps = x;
    m.ps = mask1;
    oss << print_pack(p.f, m.i, 8, "x");
    p.ps = y;
    m.ps = mask2;
    oss << print_pack(p.f, m.i, 8, "y");
    return oss.str();
}

#endif // __AVX512F__

VECD_CONST(mthree,-3.);
VECD_CONST(mone,-1.);
VECD_CONST(mhalf,-0.5);
VECD_CONST(zero,0.);
VECDI_CONST(third,0x3fd5555555555555); // 1./3.
VECD_CONST(half,0.5);
VECD_CONST(one,1.);
VECD_CONST(c1p5,1.5);
VECD_CONST(two,2.);
VECD_CONST(three,3.);
VECD_CONST(six,6.);
VECD_CONST(ten,10.);
VECD_CONST(c1023,1023.);

VECDI_CONST(ln_two,0x3fe62e42fefa39ef); // log(2.)
VECDI_CONST(log2e,0x3ff71547652b82fe); // log2(e) = 1./log(2.)
VECDI_CONST(ln_ten,0x40026bb1bbb55516); // log(10.)
VECDI_CONST(log10e,0x3fdbcb7b1526e50e); // log10(e) = 1./log(10.)

VECDI_CONST(dbl_min,0x0010000000000000); // DBL_MIN
VECDI_CONST(dbl_max,0x7fefffffffffffff); // DBL_MAX
VECDI_CONST(sqrt_dbl_max,0x5fefffffffffffff); // sqrt(DBL_MAX)

VECF_CONST(mthreef,-3.f);
VECF_CONST(monef,-1.f);
VECF_CONST(mhalff,-0.5f);
VECF_CONST(zerof,0.f);
VECFI_CONST(thirdf,0x3eaaaaab); // 1.f/3.f
VECF_CONST(halff,0.5f);
VECF_CONST(onef,1.f);
VECF_CONST(twof,2.f);
VECF_CONST(c1p5f,1.5f);
VECF_CONST(threef,3.f);
VECF_CONST(sixf,6.f);
VECF_CONST(tenf,10.f);
VECF_CONST(c127,127.f);

VECFI_CONST(ln_twof,0x3f317218); // logf(2.f)
VECFI_CONST(log2ef,0x3fb8aa3b); // log2f(e) = 1.f/logf(2.f)
VECFI_CONST(ln_tenf,0x40135d8e); // logf(10.f)
VECFI_CONST(log10ef,0x3ede5bd9); // log10f(e) = 1.f/logf(10.f)

VECFI_CONST(flt_min,0x00800000); // FLT_MIN
VECFI_CONST(flt_max,0x7f7fffff); // FLT_MAX
VECFI_CONST(sqrt_flt_max,0x5f7fffff); // sqrtf(FLT_MAX)

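// The VECDI_CONST/VECFI_CONST variants take the exact bit pattern of the
// constant, which pins down the binary representation independent of how a
// decimal literal would be parsed and rounded. A quick check (standalone
// sketch, assuming <cstdio>, <cstring>, <cstdint>):
//
//   uint64_t u = 0x3fd5555555555555ULL;
//   double third_chk;
//   memcpy(&third_chk, &u, sizeof(third_chk));
//   printf("%.17g\n", third_chk);   // prints 0.33333333333333331, i.e. 1./3.
//                                   // correctly rounded to double precision
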
template<class V>
inline void vecfun_partial(const sys_float x[], sys_float y[], long nlo, long nhi, long npack, V (*vecfun1)(V))
{
    vpack xx, yy;
    for( long i=0; i < npack; ++i )
        xx.f[i] = x[min(nlo+i,nhi-1)];
    yy.ps = vecfun1(xx.ps);
    for( long i=nlo; i < nhi; ++i )
        y[i] = yy.f[i-nlo];
}

template<class V>
inline void vecfun_partial(const double x[], double y[], long nlo, long nhi, long npack, V (*vecfun1)(V))
{
    vpack xx, yy;
    for( long i=0; i < npack; ++i )
        xx.d[i] = x[min(nlo+i,nhi-1)];
    yy.pd = vecfun1(xx.pd);
    for( long i=nlo; i < nhi; ++i )
        y[i] = yy.d[i-nlo];
}

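// Note on the clamping above: x[min(nlo+i,nhi-1)] pads a partial package by
// replicating the last valid element, so the packed routine never reads past
// the end of the input. E.g. for nlo=0, nhi=3, npack=4 the package loads
// {x[0], x[1], x[2], x[2]} and only y[0]..y[2] are stored back.
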
template<class V>
inline void vecfun2_partial(const sys_float x1[], const sys_float x2[], sys_float y[], long nlo, long nhi,
                            long npack, V (*vecfun1)(V, V))
{
    vpack xx1, xx2, yy;
    for( long i=0; i < npack; ++i )
    {
        xx1.f[i] = x1[min(nlo+i,nhi-1)];
        xx2.f[i] = x2[min(nlo+i,nhi-1)];
    }
    yy.ps = vecfun1(xx1.ps, xx2.ps);
    for( long i=nlo; i < nhi; ++i )
        y[i] = yy.f[i-nlo];
}

template<class V>
inline void vecfun2_partial(const double x1[], const double x2[], double y[], long nlo, long nhi,
                            long npack, V (*vecfun1)(V, V))
{
    vpack xx1, xx2, yy;
    for( long i=0; i < npack; ++i )
    {
        xx1.d[i] = x1[min(nlo+i,nhi-1)];
        xx2.d[i] = x2[min(nlo+i,nhi-1)];
    }
    yy.pd = vecfun1(xx1.pd, xx2.pd);
    for( long i=nlo; i < nhi; ++i )
        y[i] = yy.d[i-nlo];
}

template<class T, class V>
inline void vecfun(const T x[], T y[], long nlo, long nhi, T (*)(T), V (*vecfun1)(V))
{
    if( nhi <= nlo )
        return;

    // number of elements of type T that fit in one package of type V
    long npack = sizeof(V)/sizeof(T);
    if( nhi-nlo < npack )
    {
        vecfun_partial(x, y, nlo, nhi, npack, vecfun1);
        return;
    }

    long i, ilo = nlo;
    // offset of the first element relative to an aligned boundary
    long ox = (reinterpret_cast<long>(x)/sizeof(T)+nlo)%npack;
    long oy = (reinterpret_cast<long>(y)/sizeof(T)+nlo)%npack;
    bool lgNonAligned = ( ox != oy );
    T *yl, *ylocal = NULL;
    if( lgNonAligned )
    {
        // x and y are aligned differently ->
        // compute y in a local buffer that is aligned the same way as x
        ylocal = new T[nhi+npack-1];
        long ol = (reinterpret_cast<long>(ylocal)/sizeof(T)+nlo)%npack;
        if( ox >= ol )
            yl = &ylocal[ox-ol];
        else
            yl = &ylocal[ox+npack-ol];
    }
    else
    {
        yl = y;
    }
    // process the elements up to the first aligned boundary
    if( ox > 0 )
    {
        vecfun_partial(x, yl, ilo, min(ilo+npack-ox,nhi), npack, vecfun1);
        ilo = min(ilo+npack-ox,nhi);
    }
    // process the bulk of the array in full aligned packages
    for( i=ilo; i < nhi-npack+1; i += npack )
    {
        const V& xx = *reinterpret_cast<const V*>(&x[i]);
        V& yy = *reinterpret_cast<V*>(&yl[i]);
        yy = vecfun1(xx);
    }
    ilo = i;
    // process any remaining elements at the end
    if( ilo < nhi )
        vecfun_partial(x, yl, ilo, nhi, npack, vecfun1);
    if( lgNonAligned )
    {
        // copy the results back and clean up
        for( i=nlo; i < nhi; ++i )
            y[i] = yl[i];
        delete[] ylocal;
    }
}

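// Usage sketch (hypothetical names): given a scalar routine and a matching
// packed implementation, e.g.
//
//   double my_exp(double x);      // scalar version
//   v8df v1my_expd(v8df x);       // packed AVX-512 version
//
// a whole array is transformed with
//
//   vecfun( xarr, yarr, 0, n, my_exp, v1my_expd );
//
// In this AVX path the scalar pointer is unused (the parameter is anonymous);
// it exists so the identical call also compiles against the non-AVX fallback
// defined at the end of this header.
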
template<class T, class V>
inline void vecfun2(const T x1[], const T x2[], T y[], long nlo, long nhi, T (*)(T, T), V (*vecfun1)(V, V))
{
    if( nhi <= nlo )
        return;

    // number of elements of type T that fit in one package of type V
    long npack = sizeof(V)/sizeof(T);
    if( nhi-nlo < npack )
    {
        vecfun2_partial(x1, x2, y, nlo, nhi, npack, vecfun1);
        return;
    }

    long i, ilo = nlo;
    // offsets of the first elements relative to an aligned boundary
    long ox1 = (reinterpret_cast<long>(x1)/sizeof(T)+nlo)%npack;
    long ox2 = (reinterpret_cast<long>(x2)/sizeof(T)+nlo)%npack;
    long oy = (reinterpret_cast<long>(y)/sizeof(T)+nlo)%npack;
    bool lgNonAligned1 = ( ox1 != ox2 );
    bool lgNonAligned2 = ( ox1 != oy );
    const T *x2l;
    T *x2local = NULL;
    if( lgNonAligned1 )
    {
        // x2 is aligned differently from x1 ->
        // copy x2 into a local buffer that is aligned the same way as x1
        x2local = new T[nhi+npack-1];
        long ol = (reinterpret_cast<long>(x2local)/sizeof(T)+nlo)%npack;
        T *ptr;
        if( ox1 >= ol )
            ptr = &x2local[ox1-ol];
        else
            ptr = &x2local[ox1+npack-ol];
        memcpy(ptr+nlo, x2+nlo, size_t((nhi-nlo)*sizeof(T)));
        x2l = ptr;
    }
    else
    {
        x2l = x2;
    }
    T *yl, *ylocal = NULL;
    if( lgNonAligned2 )
    {
        // same for y: use a local buffer aligned the same way as x1
        ylocal = new T[nhi+npack-1];
        long ol = (reinterpret_cast<long>(ylocal)/sizeof(T)+nlo)%npack;
        if( ox1 >= ol )
            yl = &ylocal[ox1-ol];
        else
            yl = &ylocal[ox1+npack-ol];
    }
    else
    {
        yl = y;
    }
    // process the elements up to the first aligned boundary
    if( ox1 > 0 )
    {
        vecfun2_partial(x1, x2l, yl, ilo, min(ilo+npack-ox1,nhi), npack, vecfun1);
        ilo = min(ilo+npack-ox1,nhi);
    }
    // process the bulk of the arrays in full aligned packages
    for( i=ilo; i < nhi-npack+1; i += npack )
    {
        const V& xx1 = *reinterpret_cast<const V*>(&x1[i]);
        const V& xx2 = *reinterpret_cast<const V*>(&x2l[i]);
        V& yy = *reinterpret_cast<V*>(&yl[i]);
        yy = vecfun1(xx1, xx2);
    }
    ilo = i;
    // process any remaining elements at the end
    if( ilo < nhi )
        vecfun2_partial(x1, x2l, yl, ilo, nhi, npack, vecfun1);
    if( lgNonAligned1 )
        delete[] x2local;
    if( lgNonAligned2 )
    {
        // copy the results back and clean up
        for( i=nlo; i < nhi; ++i )
            y[i] = yl[i];
        delete[] ylocal;
    }
}

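// Usage sketch for the two-argument version (hypothetical names):
//
//   double my_pow(double x, double y);
//   v8df v1my_powd(v8df x, v8df y);
//
//   vecfun2( xarr, yarr, zarr, 0, n, my_pow, v1my_powd );
//
// When x2 is aligned differently from x1, it is first copied into a buffer
// aligned like x1, so the main loop only needs to track one offset (ox1).
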
#ifdef __AVX512F__

#define V1FUN_PD_4(FUN, V) \
    v8df xx = _mm512_set_pd( V, V, V, V, x3, x2, x1, x0 ); \
    v8df yy = v1##FUN##d(xx); \
    memcpy(y, &yy, 4*sizeof(double))

#else

#define V1FUN_PD_4(FUN, V) \
    v4df xx = _mm256_set_pd( x3, x2, x1, x0 ); \
    v4df yy = v1##FUN##d(xx); \
    memcpy(y, &yy, 4*sizeof(double))

#endif

#ifdef __AVX512F__

#define V1FUN_PD_8(FUN, V) \
    v8df xx = _mm512_set_pd( x7, x6, x5, x4, x3, x2, x1, x0 ); \
    v8df yy = v1##FUN##d(xx); \
    memcpy(y, &yy, 8*sizeof(double))

#else

#define V1FUN_PD_8(FUN, V) \
    v4df yy[2]; \
    v4df xx = _mm256_set_pd( x3, x2, x1, x0 ); \
    yy[0] = v1##FUN##d(xx); \
    xx = _mm256_set_pd( x7, x6, x5, x4 ); \
    yy[1] = v1##FUN##d(xx); \
    memcpy(y, &yy, 8*sizeof(double))

#endif

#ifdef __AVX512F__

#define V1FUN_PS_4(FUN, V) \
    v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, V, V, V, V, x3, x2, x1, x0 ); \
    v16sf yy = v1##FUN##f(xx); \
    memcpy(y, &yy, 4*sizeof(sys_float))

#else

#define V1FUN_PS_4(FUN, V) \
    v8sf xx = _mm256_set_ps( V, V, V, V, x3, x2, x1, x0 ); \
    v8sf yy = v1##FUN##f(xx); \
    memcpy(y, &yy, 4*sizeof(sys_float))

#endif

#ifdef __AVX512F__

#define V1FUN_PS_8(FUN, V) \
    v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, x7, x6, x5, x4, x3, x2, x1, x0 ); \
    v16sf yy = v1##FUN##f(xx); \
    memcpy(y, &yy, 8*sizeof(sys_float))

#else

#define V1FUN_PS_8(FUN, V) \
    v8sf xx = _mm256_set_ps( x7, x6, x5, x4, x3, x2, x1, x0 ); \
    v8sf yy = v1##FUN##f(xx); \
    memcpy(y, &yy, 8*sizeof(sys_float))

#endif

#ifdef __AVX512F__

#define V1FUN_PS_16(FUN, V) \
    v16sf xx = _mm512_set_ps( x15, x14, x13, x12, x11, x10, x9, x8, x7, x6, x5, x4, x3, x2, x1, x0 ); \
    v16sf yy = v1##FUN##f(xx); \
    memcpy(y, &yy, 16*sizeof(sys_float))

#else

#define V1FUN_PS_16(FUN, V) \
    v8sf yy[2]; \
    v8sf xx = _mm256_set_ps( x7, x6, x5, x4, x3, x2, x1, x0 ); \
    yy[0] = v1##FUN##f(xx); \
    xx = _mm256_set_ps( x15, x14, x13, x12, x11, x10, x9, x8 ); \
    yy[1] = v1##FUN##f(xx); \
    memcpy(y, &yy, 16*sizeof(sys_float))

#endif

#ifdef __AVX512F__

#define V1FUN2_PD_4(FUN, V) \
    v8df xx = _mm512_set_pd( V, V, V, V, x3, x2, x1, x0 ); \
    v8df yy = _mm512_set_pd( V, V, V, V, y3, y2, y1, y0 ); \
    v8df zz = v1##FUN##d(xx, yy); \
    memcpy(z, &zz, 4*sizeof(double))

#else

#define V1FUN2_PD_4(FUN, V) \
    v4df xx = _mm256_set_pd( x3, x2, x1, x0 ); \
    v4df yy = _mm256_set_pd( y3, y2, y1, y0 ); \
    v4df zz = v1##FUN##d(xx, yy); \
    memcpy(z, &zz, 4*sizeof(double))

#endif

#ifdef __AVX512F__

#define V1FUN2_PS_4(FUN, V) \
    v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, V, V, V, V, x3, x2, x1, x0 ); \
    v16sf yy = _mm512_set_ps( V, V, V, V, V, V, V, V, V, V, V, V, y3, y2, y1, y0 ); \
    v16sf zz = v1##FUN##f(xx, yy); \
    memcpy(z, &zz, 4*sizeof(sys_float))

#else

#define V1FUN2_PS_4(FUN, V) \
    v8sf xx = _mm256_set_ps( V, V, V, V, x3, x2, x1, x0 ); \
    v8sf yy = _mm256_set_ps( V, V, V, V, y3, y2, y1, y0 ); \
    v8sf zz = v1##FUN##f(xx, yy); \
    memcpy(z, &zz, 4*sizeof(sys_float))

#endif

#ifdef __AVX512F__

#define V1FUN2_PS_8(FUN, V) \
    v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, x7, x6, x5, x4, x3, x2, x1, x0 ); \
    v16sf yy = _mm512_set_ps( V, V, V, V, V, V, V, V, y7, y6, y5, y4, y3, y2, y1, y0 ); \
    v16sf zz = v1##FUN##f(xx, yy); \
    memcpy(z, &zz, 8*sizeof(sys_float))

#else

#define V1FUN2_PS_8(FUN, V) \
    v8sf xx = _mm256_set_ps( x7, x6, x5, x4, x3, x2, x1, x0 ); \
    v8sf yy = _mm256_set_ps( y7, y6, y5, y4, y3, y2, y1, y0 ); \
    v8sf zz = v1##FUN##f(xx, yy); \
    memcpy(z, &zz, 8*sizeof(sys_float))

#endif

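// Usage sketch: these macros supply the body of small fixed-count wrappers
// (hypothetical wrapper, assuming a packed routine v1expd exists):
//
//   inline void vexp( double* y, double x0, double x1, double x2, double x3 )
//   {
//       V1FUN_PD_4(exp, 0.);
//   }
//
// The V argument is a safe filler value for the unused upper lanes of the
// wider register, so those lanes cannot trigger spurious domain errors.
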
#else // __AVX__

// fallback without AVX: call the scalar routine on every element

template<class T, class V>
inline void vecfun(const T x[], T y[], long nlo, long nhi, T (*scalfun1)(T), V (*)(V))
{
    for( long i=nlo; i < nhi; ++i )
        y[i] = scalfun1(x[i]);
}

template<class T, class V>
inline void vecfun2(const T x1[], const T x2[], T y[], long nlo, long nhi, T (*scalfun1)(T, T), V (*)(V, V))
{
    for( long i=nlo; i < nhi; ++i )
        y[i] = scalfun1(x1[i], x2[i]);
}

#define V1FUN_4(FUN, V) \
    y[0] = FUN(x0); \
    y[1] = FUN(x1); \
    y[2] = FUN(x2); \
    y[3] = FUN(x3)

#define V1FUN_8(FUN, V) \
    y[0] = FUN(x0); \
    y[1] = FUN(x1); \
    y[2] = FUN(x2); \
    y[3] = FUN(x3); \
    y[4] = FUN(x4); \
    y[5] = FUN(x5); \
    y[6] = FUN(x6); \
    y[7] = FUN(x7)

#define V1FUN_16(FUN, V) \
    y[0] = FUN(x0); \
    y[1] = FUN(x1); \
    y[2] = FUN(x2); \
    y[3] = FUN(x3); \
    y[4] = FUN(x4); \
    y[5] = FUN(x5); \
    y[6] = FUN(x6); \
    y[7] = FUN(x7); \
    y[8] = FUN(x8); \
    y[9] = FUN(x9); \
    y[10] = FUN(x10); \
    y[11] = FUN(x11); \
    y[12] = FUN(x12); \
    y[13] = FUN(x13); \
    y[14] = FUN(x14); \
    y[15] = FUN(x15)

#define V1FUN_PD_4(FUN, V) \
    V1FUN_4(FUN, V)

#define V1FUN_PD_8(FUN, V) \
    V1FUN_8(FUN, V)

#define V1FUN_PS_4(FUN, V) \
    V1FUN_4(FUN, V)

#define V1FUN_PS_8(FUN, V) \
    V1FUN_8(FUN, V)

#define V1FUN_PS_16(FUN, V) \
    V1FUN_16(FUN, V)

#define V1FUN2_4(FUN, V) \
    z[0] = FUN(x0, y0); \
    z[1] = FUN(x1, y1); \
    z[2] = FUN(x2, y2); \
    z[3] = FUN(x3, y3)

#define V1FUN2_8(FUN, V) \
    z[0] = FUN(x0, y0); \
    z[1] = FUN(x1, y1); \
    z[2] = FUN(x2, y2); \
    z[3] = FUN(x3, y3); \
    z[4] = FUN(x4, y4); \
    z[5] = FUN(x5, y5); \
    z[6] = FUN(x6, y6); \
    z[7] = FUN(x7, y7)

#define V1FUN2_PD_4(FUN, V) \
    V1FUN2_4(FUN, V)

#define V1FUN2_PS_4(FUN, V) \
    V1FUN2_4(FUN, V)

#define V1FUN2_PS_8(FUN, V) \
    V1FUN2_8(FUN, V)

#endif // __AVX__

#endif // VECTORIZE_MATH_H