4 #ifndef VECTORIZE_MATH_H
5 #define VECTORIZE_MATH_H
25 typedef __m512i v16si;
29 #define VECD_CONST(name,x) \
30 ALIGNED(64) static const double __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
31 static const v8df& name = *reinterpret_cast<const v8df*>(__avx_##name)
33 #define VECDI_CONST(name,x) \
34 ALIGNED(64) static const uint64 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
35 static const v8df& name = *reinterpret_cast<const v8df*>(__avx_##name)
37 #define VECF_CONST(name,x) \
38 ALIGNED(64) static const sys_float __avx_##name[16] = {x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x}; \
39 static const v16sf& name = *reinterpret_cast<const v16sf*>(__avx_##name)
41 #define VECFI_CONST(name,x) \
42 ALIGNED(64) static const uint32 __avx_##name[16] = {x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x}; \
43 static const v16sf& name = *reinterpret_cast<const v16sf*>(__avx_##name)
45 #define VECI_CONST(name,x) \
46 ALIGNED(32) static const uint32 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
47 static const v8si& name = *reinterpret_cast<const v8si*>(__avx_##name)
49 #define VECII_CONST(name,x) \
50 ALIGNED(64) static const uint32 __avx_##name[16] = {x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x}; \
51 static const v16si& name = *reinterpret_cast<const v16si*>(__avx_##name)
53 #define VECL_CONST(name,x) \
54 ALIGNED(32) static const uint64 __avx_##name[4] = {x,x,x,x}; \
55 static const v4di& name = *reinterpret_cast<const v4di*>(__avx_##name)
57 #define VECLL_CONST(name,x) \
58 ALIGNED(64) static const uint64 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
59 static const v8di& name = *reinterpret_cast<const v8di*>(__avx_##name)
73 inline
bool getmaskbit(
unsigned int mask,
unsigned int n)
75 return ( ((mask>>n)&1) == 1 );
79 inline string print_pack(
const T x[],
int npack,
unsigned int mask,
const string& name)
82 for(
int i=0; i < npack; ++i )
84 if( getmaskbit(mask, i) )
88 oss << name <<
"[" << i <<
"] = " << x[i] <<
"\n";
93 inline string DEMsg(
const string& routine, v8df x, __mmask8 mask)
97 oss <<
"domain error in " << routine <<
".\n";
99 oss << print_pack(p.d, 8, mask,
"x");
103 inline string DEMsg(
const string& routine, v16sf x, __mmask16 mask)
107 oss <<
"domain error in " << routine <<
".\n";
109 oss << print_pack(p.f, 16, mask,
"x");
113 inline string DEMsg(
const string& routine, v8df x, __mmask8 mask1, v8df y, __mmask8 mask2)
117 oss <<
"domain error in " << routine <<
".\n";
119 oss << print_pack(p.d, 8, mask1,
"x");
121 oss << print_pack(p.d, 8, mask2,
"y");
125 inline string DEMsg(
const string& routine, v16sf x, __mmask16 mask1, v16sf y, __mmask16 mask2)
129 oss <<
"domain error in " << routine <<
".\n";
131 oss << print_pack(p.f, 16, mask1,
"x");
133 oss << print_pack(p.f, 16, mask2,
"y");
139 #define VECD_CONST(name,x) \
140 ALIGNED(32) static const double __avx_##name[4] = {x,x,x,x}; \
141 static const v4df& name = *reinterpret_cast<const v4df*>(__avx_##name)
143 #define VECDI_CONST(name,x) \
144 ALIGNED(32) static const uint64 __avx_##name[4] = {x,x,x,x}; \
145 static const v4df& name = *reinterpret_cast<const v4df*>(__avx_##name)
147 #define VECF_CONST(name,x) \
148 ALIGNED(32) static const sys_float __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
149 static const v8sf& name = *reinterpret_cast<const v8sf*>(__avx_##name)
151 #define VECFI_CONST(name,x) \
152 ALIGNED(32) static const uint32 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
153 static const v8sf& name = *reinterpret_cast<const v8sf*>(__avx_##name)
155 #define VECI_CONST(name,x) \
156 ALIGNED(16) static const uint32 __avx_##name[4] = {x,x,x,x}; \
157 static const v4si& name = *reinterpret_cast<const v4si*>(__avx_##name)
159 #define VECII_CONST(name,x) \
160 ALIGNED(32) static const uint32 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
161 static const v8si& name = *reinterpret_cast<const v8si*>(__avx_##name)
163 #define VECL_CONST(name,x) \
164 ALIGNED(16) static const uint64 __avx_##name[2] = {x,x}; \
165 static const v2di& name = *reinterpret_cast<const v2di*>(__avx_##name)
167 #define VECLL_CONST(name,x) \
168 ALIGNED(32) static const uint64 __avx_##name[4] = {x,x,x,x}; \
169 static const v4di& name = *reinterpret_cast<const v4di*>(__avx_##name)
183 template<class T, class U>
184 inline
string print_pack(const T x[], const U m[],
int npack, const
string& name)
187 for(
int i=0; i < npack; ++i )
193 oss << name <<
"[" << i <<
"] = " << x[i] <<
"\n";
198 inline string DEMsg(
const string& routine, v4df x, v4df mask)
202 oss <<
"domain error in " << routine <<
".\n";
205 oss << print_pack(p.d, m.l, 4,
"x");
209 inline string DEMsg(
const string& routine, v8sf x, v8sf mask)
213 oss <<
"domain error in " << routine <<
".\n";
216 oss << print_pack(p.f, m.i, 8,
"x");
220 inline string DEMsg(
const string& routine, v4df x, v4df mask1, v4df y, v4df mask2)
224 oss <<
"domain error in " << routine <<
".\n";
227 oss << print_pack(p.d, m.l, 4,
"x");
230 oss << print_pack(p.d, m.l, 4,
"y");
234 inline string DEMsg(
const string& routine, v8sf x, v8sf mask1, v8sf y, v8sf mask2)
238 oss <<
"domain error in " << routine <<
".\n";
241 oss << print_pack(p.f, m.i, 8,
"x");
244 oss << print_pack(p.f, m.i, 8,
"y");
248 #endif // __AVX512F__
250 VECD_CONST(mthree,-3.);
251 VECD_CONST(mone,-1.);
252 VECD_CONST(mhalf,-0.5);
254 VECDI_CONST(third,0x3fd5555555555555);
255 VECD_CONST(half,0.5);
257 VECD_CONST(c1p5,1.5);
259 VECD_CONST(three,3.);
262 VECD_CONST(c1023,1023.);
264 VECDI_CONST(ln_two,0x3fe62e42fefa39ef);
265 VECDI_CONST(log2e,0x3ff71547652b82fe);
266 VECDI_CONST(ln_ten,0x40026bb1bbb55516);
267 VECDI_CONST(log10e,0x3fdbcb7b1526e50e);
269 VECDI_CONST(dbl_min,0x0010000000000000);
270 VECDI_CONST(dbl_max,0x7fefffffffffffff);
271 VECDI_CONST(sqrt_dbl_max,0x5fefffffffffffff);
273 VECF_CONST(mthreef,-3.f);
274 VECF_CONST(monef,-1.f);
275 VECF_CONST(mhalff,-0.5f);
276 VECF_CONST(zerof,0.f);
277 VECFI_CONST(thirdf,0x3eaaaaab);
278 VECF_CONST(halff,0.5f);
279 VECF_CONST(onef,1.f);
280 VECF_CONST(twof,2.f);
281 VECF_CONST(c1p5f,1.5f);
282 VECF_CONST(threef,3.f);
283 VECF_CONST(sixf,6.f);
284 VECF_CONST(tenf,10.f);
285 VECF_CONST(c127,127.f);
287 VECFI_CONST(ln_twof,0x3f317218);
288 VECFI_CONST(log2ef,0x3fb8aa3b);
289 VECFI_CONST(ln_tenf,0x40135d8e);
290 VECFI_CONST(log10ef,0x3ede5bd9);
292 VECFI_CONST(flt_min,0x00800000);
293 VECFI_CONST(flt_max,0x7f7fffff);
294 VECFI_CONST(sqrt_flt_max,0x5f7fffff);
297 inline void vecfun_partial(
const sys_float x[],
sys_float y[],
long nlo,
long nhi,
long npack, V (*vecfun1)(V))
300 for(
long i=0; i < npack; ++i )
301 xx.f[i] = x[
min(nlo+i,nhi-1)];
302 yy.pf = vecfun1(xx.pf);
303 for(
long i=nlo; i < nhi; ++i )
308 inline void vecfun_partial(
const double x[],
double y[],
long nlo,
long nhi,
long npack, V (*vecfun1)(V))
311 for(
long i=0; i < npack; ++i )
312 xx.d[i] = x[
min(nlo+i,nhi-1)];
313 yy.pd = vecfun1(xx.pd);
314 for(
long i=nlo; i < nhi; ++i )
320 long npack, V (*vecfun1)(V, V))
323 for(
long i=0; i < npack; ++i )
325 xx1.f[i] = x1[
min(nlo+i,nhi-1)];
326 xx2.f[i] = x2[
min(nlo+i,nhi-1)];
328 yy.pf = vecfun1(xx1.pf, xx2.pf);
329 for(
long i=nlo; i < nhi; ++i )
334 inline void vecfun2_partial(
const double x1[],
const double x2[],
double y[],
long nlo,
long nhi,
long npack,
338 for(
long i=0; i < npack; ++i )
340 xx1.d[i] = x1[
min(nlo+i,nhi-1)];
341 xx2.d[i] = x2[
min(nlo+i,nhi-1)];
343 yy.pd = vecfun1(xx1.pd, xx2.pd);
344 for(
long i=nlo; i < nhi; ++i )
352 template<
class T,
class V>
353 inline void vecfun(
const T x[], T y[],
long nlo,
long nhi, T (*)(T), V (*vecfun1)(V))
359 long npack =
sizeof(V)/
sizeof(T);
360 if( nhi-nlo < npack )
362 vecfun_partial(x, y, nlo, nhi, npack, vecfun1);
366 long ox = (
reinterpret_cast<long>(x)/
sizeof(T)+nlo)%npack;
367 long oy = (
reinterpret_cast<long>(y)/
sizeof(T)+nlo)%npack;
368 bool lgNonAligned = ( ox != oy );
369 T *yl, *ylocal =
NULL;
372 ylocal =
new T[nhi+npack-1];
373 long ol = (
reinterpret_cast<long>(ylocal)/
sizeof(T)+nlo)%npack;
377 yl = &ylocal[ox+npack-ol];
387 vecfun_partial(x, yl, ilo,
min(ilo+npack-ox,nhi), npack, vecfun1);
388 ilo =
min(ilo+npack-ox,nhi);
391 for( i=ilo; i < nhi-npack+1; i += npack )
393 const V& xx = *
reinterpret_cast<const V*
>(&x[i]);
394 V& yy = *
reinterpret_cast<V*
>(&yl[i]);
400 vecfun_partial(x, yl, ilo, nhi, npack, vecfun1);
403 for( i=nlo; i < nhi; ++i )
409 template<
class T,
class V>
410 inline void vecfun2(
const T x1[],
const T x2[], T y[],
long nlo,
long nhi, T (*)(T, T), V (*vecfun1)(V, V))
416 long npack =
sizeof(V)/
sizeof(T);
417 if( nhi-nlo < npack )
419 vecfun2_partial(x1, x2, y, nlo, nhi, npack, vecfun1);
423 long ox1 = (
reinterpret_cast<long>(
x1)/
sizeof(T)+nlo)%npack;
424 long ox2 = (
reinterpret_cast<long>(
x2)/
sizeof(T)+nlo)%npack;
425 long oy = (
reinterpret_cast<long>(y)/
sizeof(T)+nlo)%npack;
426 bool lgNonAligned1 = ( ox1 != ox2 );
427 bool lgNonAligned2 = ( ox1 != oy );
432 x2local =
new T[nhi+npack-1];
433 long ol = (
reinterpret_cast<long>(x2local)/
sizeof(T)+nlo)%npack;
436 ptr = &x2local[ox1-ol];
438 ptr = &x2local[ox1+npack-ol];
439 memcpy(ptr+nlo, x2+nlo,
size_t((nhi-nlo)*
sizeof(T)));
446 T *yl, *ylocal =
NULL;
449 ylocal =
new T[nhi+npack-1];
450 long ol = (
reinterpret_cast<long>(ylocal)/
sizeof(T)+nlo)%npack;
452 yl = &ylocal[ox1-ol];
454 yl = &ylocal[ox1+npack-ol];
464 vecfun2_partial(x1, x2l, yl, ilo,
min(ilo+npack-ox1,nhi), npack, vecfun1);
465 ilo =
min(ilo+npack-ox1,nhi);
468 for( i=ilo; i < nhi-npack+1; i += npack )
470 const V& xx1 = *
reinterpret_cast<const V*
>(&x1[i]);
471 const V& xx2 = *
reinterpret_cast<const V*
>(&x2l[i]);
472 V& yy = *
reinterpret_cast<V*
>(&yl[i]);
473 yy = vecfun1(xx1, xx2);
478 vecfun2_partial(x1, x2l, yl, ilo, nhi, npack, vecfun1);
483 for( i=nlo; i < nhi; ++i )
490 #define V1FUN_PD_4(FUN, V) \
491 v8df xx = _mm512_set_pd( V, V, V, V, x3, x2, x1, x0 ); \
492 v8df yy = v1##FUN##d(xx); \
493 memcpy(y, &yy, 4*sizeof(double))
495 #define V1FUN_PD_4(FUN, V) \
496 v4df xx = _mm256_set_pd( x3, x2, x1, x0 ); \
497 v4df yy = v1##FUN##d(xx); \
498 memcpy(y, &yy, 4*sizeof(double))
502 #define V1FUN_PD_8(FUN, V) \
503 v8df xx = _mm512_set_pd( x7, x6, x5, x4, x3, x2, x1, x0 ); \
504 v8df yy = v1##FUN##d(xx); \
505 memcpy(y, &yy, 8*sizeof(double))
507 #define V1FUN_PD_8(FUN, V) \
509 v4df xx = _mm256_set_pd( x3, x2, x1, x0 ); \
510 yy[0] = v1##FUN##d(xx); \
511 xx = _mm256_set_pd( x7, x6, x5, x4 ); \
512 yy[1] = v1##FUN##d(xx); \
513 memcpy(y, &yy, 8*sizeof(double))
517 #define V1FUN_PS_4(FUN, V) \
518 v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, V, V, V, V, x3, x2, x1, x0 ); \
519 v16sf yy = v1##FUN##f(xx); \
520 memcpy(y, &yy, 4*sizeof(sys_float))
522 #define V1FUN_PS_4(FUN, V) \
523 v8sf xx = _mm256_set_ps( V, V, V, V, x3, x2, x1, x0 ); \
524 v8sf yy = v1##FUN##f(xx); \
525 memcpy(y, &yy, 4*sizeof(sys_float))
529 #define V1FUN_PS_8(FUN, V) \
530 v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, x7, x6, x5, x4, x3, x2, x1, x0 ); \
531 v16sf yy = v1##FUN##f(xx); \
532 memcpy(y, &yy, 8*sizeof(sys_float))
534 #define V1FUN_PS_8(FUN, V) \
535 v8sf xx = _mm256_set_ps( x7, x6, x5, x4, x3, x2, x1, x0 ); \
536 v8sf yy = v1##FUN##f(xx); \
537 memcpy(y, &yy, 8*sizeof(sys_float))
541 #define V1FUN_PS_16(FUN, V) \
542 v16sf xx = _mm512_set_ps( x15, x14, x13, x12, x11, x10, x9, x8, x7, x6, x5, x4, x3, x2, x1, x0 ); \
543 v16sf yy = v1##FUN##f(xx); \
544 memcpy(y, &yy, 16*sizeof(sys_float))
546 #define V1FUN_PS_16(FUN, V) \
548 v8sf xx = _mm256_set_ps( x7, x6, x5, x4, x3, x2, x1, x0 ); \
549 yy[0] = v1##FUN##f(xx); \
550 xx = _mm256_set_ps( x15, x14, x13, x12, x11, x10, x9, x8 ); \
551 yy[1] = v1##FUN##f(xx); \
552 memcpy(y, &yy, 16*sizeof(sys_float))
556 #define V1FUN2_PD_4(FUN, V) \
557 v8df xx = _mm512_set_pd( V, V, V, V, x3, x2, x1, x0 ); \
558 v8df yy = _mm512_set_pd( V, V, V, V, y3, y2, y1, y0 ); \
559 v8df zz = v1##FUN##d(xx, yy); \
560 memcpy(z, &zz, 4*sizeof(double))
562 #define V1FUN2_PD_4(FUN, V) \
563 v4df xx = _mm256_set_pd( x3, x2, x1, x0 ); \
564 v4df yy = _mm256_set_pd( y3, y2, y1, y0 ); \
565 v4df zz = v1##FUN##d(xx, yy); \
566 memcpy(z, &zz, 4*sizeof(double))
570 #define V1FUN2_PS_4(FUN, V) \
571 v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, V, V, V, V, x3, x2, x1, x0 ); \
572 v16sf yy = _mm512_set_ps( V, V, V, V, V, V, V, V, V, V, V, V, y3, y2, y1, y0 ); \
573 v16sf zz = v1##FUN##f(xx, yy); \
574 memcpy(z, &zz, 4*sizeof(sys_float))
576 #define V1FUN2_PS_4(FUN, V) \
577 v8sf xx = _mm256_set_ps( V, V, V, V, x3, x2, x1, x0 ); \
578 v8sf yy = _mm256_set_ps( V, V, V, V, y3, y2, y1, y0 ); \
579 v8sf zz = v1##FUN##f(xx, yy); \
580 memcpy(z, &zz, 4*sizeof(sys_float))
584 #define V1FUN2_PS_8(FUN, V) \
585 v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, x7, x6, x5, x4, x3, x2, x1, x0 ); \
586 v16sf yy = _mm512_set_ps( V, V, V, V, V, V, V, V, y7, y6, y5, y4, y3, y2, y1, y0 ); \
587 v16sf zz = v1##FUN##f(xx, yy); \
588 memcpy(z, &zz, 8*sizeof(sys_float))
590 #define V1FUN2_PS_8(FUN, V) \
591 v8sf xx = _mm256_set_ps( x7, x6, x5, x4, x3, x2, x1, x0 ); \
592 v8sf yy = _mm256_set_ps( y7, y6, y5, y4, y3, y2, y1, y0 ); \
593 v8sf zz = v1##FUN##f(xx, yy); \
594 memcpy(z, &zz, 8*sizeof(sys_float))
600 template<
class T,
class V>
601 inline void vecfun(
const T x[], T y[],
long nlo,
long nhi, T (*scalfun1)(T), V (*)(V))
603 for(
long i=nlo; i < nhi; ++i )
604 y[i] = scalfun1(x[i]);
607 template<
class T,
class V>
608 inline void vecfun2(
const T x1[],
const T x2[], T y[],
long nlo,
long nhi, T (*scalfun1)(T, T), V (*)(V, V))
610 for(
long i=nlo; i < nhi; ++i )
611 y[i] = scalfun1(x1[i], x2[i]);
614 #define V1FUN_4(FUN, V) \
620 #define V1FUN_8(FUN, V) \
630 #define V1FUN_16(FUN, V) \
648 #define V1FUN_PD_4(FUN, V) \
651 #define V1FUN_PD_8(FUN, V) \
654 #define V1FUN_PS_4(FUN, V) \
657 #define V1FUN_PS_8(FUN, V) \
660 #define V1FUN_PS_16(FUN, V) \
663 #define V1FUN2_4(FUN, V) \
664 z[0] = FUN(x0, y0); \
665 z[1] = FUN(x1, y1); \
666 z[2] = FUN(x2, y2); \
669 #define V1FUN2_8(FUN, V) \
670 z[0] = FUN(x0, y0); \
671 z[1] = FUN(x1, y1); \
672 z[2] = FUN(x2, y2); \
673 z[3] = FUN(x3, y3); \
674 z[4] = FUN(x4, y4); \
675 z[5] = FUN(x5, y5); \
676 z[6] = FUN(x6, y6); \
679 #define V1FUN2_PD_4(FUN, V) \
682 #define V1FUN2_PS_4(FUN, V) \
685 #define V1FUN2_PS_8(FUN, V) \
static double x2[63]
Definition: atmdat_3body.cpp:18
static double x1[83]
Definition: atmdat_3body.cpp:27
void vecfun2(const T x1[], const T x2[], T y[], long nlo, long nhi, T(*scalfun1)(T, T), V(*)(V, V))
Definition: vectorize_math.h:608
ALIGNED(CD_ALIGN) static const double qg32_w[numPoints]
void zero(void)
Definition: zero.cpp:30
void vecfun(const T x[], T y[], long nlo, long nhi, T(*scalfun1)(T), V(*)(V))
Definition: vectorize_math.h:601
float sys_float
Definition: cddefines.h:130
long min(int a, long b)
Definition: cddefines.h:766
#define NULL
Definition: cddefines.h:115