4#ifndef VECTORIZE_MATH_H
5#define VECTORIZE_MATH_H
29#define VECD_CONST(name,x) \
30ALIGNED(64) static const double __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
31static const v8df& name = *reinterpret_cast<const v8df*>(__avx_##name)
33#define VECDI_CONST(name,x) \
34ALIGNED(64) static const uint64 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
35static const v8df& name = *reinterpret_cast<const v8df*>(__avx_##name)
37#define VECF_CONST(name,x) \
38ALIGNED(64) static const sys_float __avx_##name[16] = {x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x}; \
39static const v16sf& name = *reinterpret_cast<const v16sf*>(__avx_##name)
41#define VECFI_CONST(name,x) \
42ALIGNED(64) static const uint32 __avx_##name[16] = {x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x}; \
43static const v16sf& name = *reinterpret_cast<const v16sf*>(__avx_##name)
45#define VECI_CONST(name,x) \
46ALIGNED(32) static const uint32 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
47static const v8si& name = *reinterpret_cast<const v8si*>(__avx_##name)
49#define VECII_CONST(name,x) \
50ALIGNED(64) static const uint32 __avx_##name[16] = {x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x}; \
51static const v16si& name = *reinterpret_cast<const v16si*>(__avx_##name)
53#define VECL_CONST(name,x) \
54ALIGNED(32) static const uint64 __avx_##name[4] = {x,x,x,x}; \
55static const v4di& name = *reinterpret_cast<const v4di*>(__avx_##name)
57#define VECLL_CONST(name,x) \
58ALIGNED(64) static const uint64 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
59static const v8di& name = *reinterpret_cast<const v8di*>(__avx_##name)
// Test bit n of a SIMD lane mask.
// Returns true if bit n of mask is set, i.e. if lane n is selected.
// NOTE(review): braces were missing in the garbled excerpt and have been
// restored; no other change.
inline
bool getmaskbit(
	unsigned int mask,
	unsigned int n)
{
	return ( ((mask>>n)&1) == 1 );
}
79inline string print_pack(
const T x[],
int npack,
unsigned int mask,
const string& name)
82 for(
int i=0; i < npack; ++i )
84 if( getmaskbit(mask, i) )
88 oss << name <<
"[" << i <<
"] = " << x[i] <<
"\n";
93inline string DEMsg(
const string& routine, v8df x, __mmask8 mask)
97 oss <<
"domain error in " << routine <<
".\n";
99 oss << print_pack(p.d, 8, mask,
"x");
103inline string DEMsg(
const string& routine, v16sf x, __mmask16 mask)
107 oss <<
"domain error in " << routine <<
".\n";
109 oss << print_pack(p.f, 16, mask,
"x");
113inline string DEMsg(
const string& routine, v8df x, __mmask8 mask1, v8df y, __mmask8 mask2)
117 oss <<
"domain error in " << routine <<
".\n";
119 oss << print_pack(p.d, 8, mask1,
"x");
121 oss << print_pack(p.d, 8, mask2,
"y");
125inline string DEMsg(
const string& routine, v16sf x, __mmask16 mask1, v16sf y, __mmask16 mask2)
129 oss <<
"domain error in " << routine <<
".\n";
131 oss << print_pack(p.f, 16, mask1,
"x");
133 oss << print_pack(p.f, 16, mask2,
"y");
// Macros for defining aligned SIMD constants, 256/128-bit (AVX/AVX2, SSE)
// fallback variants of the 512-bit macros defined above.
// NOTE(review): extraction-garbled excerpt (fused Doxygen line numbers
// removed). These redefinitions are presumably the #else branch of an
// #ifdef __AVX512F__ that was stripped by the extraction — confirm.

// v4df constant: 4 copies of a double value.
#define VECD_CONST(name,x) \
ALIGNED(32) static const double __avx_##name[4] = {x,x,x,x}; \
static const v4df& name = *reinterpret_cast<const v4df*>(__avx_##name)

// v4df constant from an exact 64-bit pattern (bit-precise double values).
#define VECDI_CONST(name,x) \
ALIGNED(32) static const uint64 __avx_##name[4] = {x,x,x,x}; \
static const v4df& name = *reinterpret_cast<const v4df*>(__avx_##name)

// v8sf constant: 8 copies of a float value.
#define VECF_CONST(name,x) \
ALIGNED(32) static const sys_float __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8sf& name = *reinterpret_cast<const v8sf*>(__avx_##name)

// v8sf constant from an exact 32-bit pattern (bit-precise float values).
#define VECFI_CONST(name,x) \
ALIGNED(32) static const uint32 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8sf& name = *reinterpret_cast<const v8sf*>(__avx_##name)

// v4si constant: 4 copies of a 32-bit integer.
#define VECI_CONST(name,x) \
ALIGNED(16) static const uint32 __avx_##name[4] = {x,x,x,x}; \
static const v4si& name = *reinterpret_cast<const v4si*>(__avx_##name)

// v8si constant: 8 copies of a 32-bit integer.
#define VECII_CONST(name,x) \
ALIGNED(32) static const uint32 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8si& name = *reinterpret_cast<const v8si*>(__avx_##name)

// v2di constant: 2 copies of a 64-bit integer.
#define VECL_CONST(name,x) \
ALIGNED(16) static const uint64 __avx_##name[2] = {x,x}; \
static const v2di& name = *reinterpret_cast<const v2di*>(__avx_##name)

// v4di constant: 4 copies of a 64-bit integer.
#define VECLL_CONST(name,x) \
ALIGNED(32) static const uint64 __avx_##name[4] = {x,x,x,x}; \
static const v4di& name = *reinterpret_cast<const v4di*>(__avx_##name)
183template<class T, class U>
184inline
string print_pack(const T x[], const U m[],
int npack, const
string& name)
187 for(
int i=0; i < npack; ++i )
193 oss << name <<
"[" << i <<
"] = " << x[i] <<
"\n";
198inline string DEMsg(
const string& routine, v4df x, v4df mask)
202 oss <<
"domain error in " << routine <<
".\n";
205 oss << print_pack(p.d, m.l, 4,
"x");
209inline string DEMsg(
const string& routine, v8sf x, v8sf mask)
213 oss <<
"domain error in " << routine <<
".\n";
216 oss << print_pack(p.f, m.i, 8,
"x");
220inline string DEMsg(
const string& routine, v4df x, v4df mask1, v4df y, v4df mask2)
224 oss <<
"domain error in " << routine <<
".\n";
227 oss << print_pack(p.d, m.l, 4,
"x");
230 oss << print_pack(p.d, m.l, 4,
"y");
234inline string DEMsg(
const string& routine, v8sf x, v8sf mask1, v8sf y, v8sf mask2)
238 oss <<
"domain error in " << routine <<
".\n";
241 oss << print_pack(p.f, m.i, 8,
"x");
244 oss << print_pack(p.f, m.i, 8,
"y");
250VECD_CONST(mthree,-3.);
252VECD_CONST(mhalf,-0.5);
254VECDI_CONST(third,0x3fd5555555555555);
262VECD_CONST(c1023,1023.);
264VECDI_CONST(ln_two,0x3fe62e42fefa39ef);
265VECDI_CONST(log2e,0x3ff71547652b82fe);
266VECDI_CONST(ln_ten,0x40026bb1bbb55516);
267VECDI_CONST(log10e,0x3fdbcb7b1526e50e);
269VECDI_CONST(dbl_min,0x0010000000000000);
270VECDI_CONST(dbl_max,0x7fefffffffffffff);
271VECDI_CONST(sqrt_dbl_max,0x5fefffffffffffff);
273VECF_CONST(mthreef,-3.f);
274VECF_CONST(monef,-1.f);
275VECF_CONST(mhalff,-0.5f);
276VECF_CONST(zerof,0.f);
277VECFI_CONST(thirdf,0x3eaaaaab);
278VECF_CONST(halff,0.5f);
281VECF_CONST(c1p5f,1.5f);
282VECF_CONST(threef,3.f);
284VECF_CONST(tenf,10.f);
285VECF_CONST(c127,127.f);
287VECFI_CONST(ln_twof,0x3f317218);
288VECFI_CONST(log2ef,0x3fb8aa3b);
289VECFI_CONST(ln_tenf,0x40135d8e);
290VECFI_CONST(log10ef,0x3ede5bd9);
292VECFI_CONST(flt_min,0x00800000);
293VECFI_CONST(flt_max,0x7f7fffff);
294VECFI_CONST(sqrt_flt_max,0x5f7fffff);
297inline void vecfun_partial(
const sys_float x[],
sys_float y[],
long nlo,
long nhi,
long npack, V (*vecfun1)(V))
300 for(
long i=0; i < npack; ++i )
301 xx.f[i] = x[
min(nlo+i,nhi-1)];
302 yy.pf = vecfun1(xx.pf);
303 for(
long i=nlo; i < nhi; ++i )
308inline void vecfun_partial(
const double x[],
double y[],
long nlo,
long nhi,
long npack, V (*vecfun1)(V))
311 for(
long i=0; i < npack; ++i )
312 xx.d[i] = x[
min(nlo+i,nhi-1)];
313 yy.pd = vecfun1(xx.pd);
314 for(
long i=nlo; i < nhi; ++i )
320 long npack, V (*vecfun1)(V, V))
323 for(
long i=0; i < npack; ++i )
325 xx1.f[i] =
x1[
min(nlo+i,nhi-1)];
326 xx2.f[i] =
x2[
min(nlo+i,nhi-1)];
328 yy.pf = vecfun1(xx1.pf, xx2.pf);
329 for(
long i=nlo; i < nhi; ++i )
334inline void vecfun2_partial(
const double x1[],
const double x2[],
double y[],
long nlo,
long nhi,
long npack,
338 for(
long i=0; i < npack; ++i )
340 xx1.d[i] =
x1[
min(nlo+i,nhi-1)];
341 xx2.d[i] =
x2[
min(nlo+i,nhi-1)];
343 yy.pd = vecfun1(xx1.pd, xx2.pd);
344 for(
long i=nlo; i < nhi; ++i )
// vecfun: apply the vectorized function vecfun1 to x[nlo..nhi-1], storing
// the results in y[]. Unaligned head and tail ranges are delegated to
// vecfun_partial(); when x and y have different alignment offsets, results
// are computed into a scratch buffer aligned like x and copied back.
// The ignored scalar-function parameter keeps the signature identical to
// the scalar fallback overload further below.
// NOTE(review): extraction-garbled fragment — the lgNonAligned branches,
// the declarations of ilo and i, the early return after the short-range
// case, the body of the main loop's store, the copy-back loop body and the
// delete[] of the scratch buffer, plus closing braces, are missing from
// this excerpt, and the stray leading numbers are Doxygen line-number
// residue. Restore from the original header before compiling.
352template<
class T,
class V>
353inline void vecfun(
const T x[], T y[],
long nlo,
long nhi, T (*)(T), V (*vecfun1)(V))
// number of scalar elements per SIMD pack
359 long npack =
sizeof(V)/
sizeof(T);
// ranges shorter than one pack are handled entirely by the partial helper
360 if( nhi-nlo < npack )
362 vecfun_partial(x, y, nlo, nhi, npack, vecfun1);
// alignment offsets (in elements, modulo the pack size) of x and y
366 long ox = (
reinterpret_cast<long>(x)/
sizeof(T)+nlo)%npack;
367 long oy = (
reinterpret_cast<long>(y)/
sizeof(T)+nlo)%npack;
368 bool lgNonAligned = ( ox != oy );
369 T *yl, *ylocal =
NULL;
// mutually misaligned: compute into a scratch buffer whose offset matches
// x, then copy back to y at the end (presumably inside if(lgNonAligned))
372 ylocal =
new T[nhi+npack-1];
373 long ol = (
reinterpret_cast<long>(ylocal)/
sizeof(T)+nlo)%npack;
377 yl = &ylocal[ox+npack-ol];
// unaligned head: process up to the first pack boundary of x
387 vecfun_partial(x, yl, ilo,
min(ilo+npack-ox,nhi), npack, vecfun1);
388 ilo =
min(ilo+npack-ox,nhi);
// main loop over whole, aligned packs
391 for( i=ilo; i < nhi-npack+1; i += npack )
393 const V& xx = *
reinterpret_cast<const V*
>(&x[i]);
394 V& yy = *
reinterpret_cast<V*
>(&yl[i]);
// remaining tail shorter than one pack
400 vecfun_partial(x, yl, ilo, nhi, npack, vecfun1);
// presumably the copy-back of scratch results into y[] — confirm
403 for( i=nlo; i < nhi; ++i )
// vecfun2: apply the vectorized two-argument function vecfun1 to
// x1[nlo..nhi-1] and x2[nlo..nhi-1], storing results in y[]. When x2 is
// misaligned relative to x1, x2 is memcpy'd into a scratch buffer with the
// same alignment offset as x1; when y is misaligned relative to x1, the
// results are computed into a second scratch buffer and copied back.
// Heads and tails are delegated to vecfun2_partial().
// NOTE(review): extraction-garbled fragment — the declarations of x2l,
// ptr, x2local, ilo and i, the branch conditions selecting between the
// "[ox1-ol]" and "[ox1+npack-ol]" offsets, the copy-back loop body, the
// delete[] calls and closing braces are missing from this excerpt, and the
// stray leading numbers are Doxygen line-number residue. Restore from the
// original header before compiling.
409template<
class T,
class V>
410inline void vecfun2(
const T
x1[],
const T
x2[], T y[],
long nlo,
long nhi, T (*)(T, T), V (*vecfun1)(V, V))
// number of scalar elements per SIMD pack
416 long npack =
sizeof(V)/
sizeof(T);
// short ranges handled entirely by the partial helper
417 if( nhi-nlo < npack )
419 vecfun2_partial(
x1,
x2, y, nlo, nhi, npack, vecfun1);
// alignment offsets (in elements, modulo the pack size) of x1, x2, and y
423 long ox1 = (
reinterpret_cast<long>(
x1)/
sizeof(T)+nlo)%npack;
424 long ox2 = (
reinterpret_cast<long>(
x2)/
sizeof(T)+nlo)%npack;
425 long oy = (
reinterpret_cast<long>(y)/
sizeof(T)+nlo)%npack;
426 bool lgNonAligned1 = ( ox1 != ox2 );
427 bool lgNonAligned2 = ( ox1 != oy );
// x2 misaligned w.r.t. x1: copy x2 into a scratch buffer whose alignment
// offset matches x1 (the two offset choices depend on ol vs ox1)
432 x2local =
new T[nhi+npack-1];
433 long ol = (
reinterpret_cast<long>(x2local)/
sizeof(T)+nlo)%npack;
436 ptr = &x2local[ox1-ol];
438 ptr = &x2local[ox1+npack-ol];
439 memcpy(ptr+nlo,
x2+nlo,
size_t((nhi-nlo)*
sizeof(T)));
// y misaligned w.r.t. x1: compute into an aligned scratch output buffer
446 T *yl, *ylocal =
NULL;
449 ylocal =
new T[nhi+npack-1];
450 long ol = (
reinterpret_cast<long>(ylocal)/
sizeof(T)+nlo)%npack;
452 yl = &ylocal[ox1-ol];
454 yl = &ylocal[ox1+npack-ol];
// unaligned head: process up to the first pack boundary of x1
464 vecfun2_partial(
x1, x2l, yl, ilo,
min(ilo+npack-ox1,nhi), npack, vecfun1);
465 ilo =
min(ilo+npack-ox1,nhi);
// main loop over whole, aligned packs
468 for( i=ilo; i < nhi-npack+1; i += npack )
470 const V& xx1 = *
reinterpret_cast<const V*
>(&
x1[i]);
471 const V& xx2 = *
reinterpret_cast<const V*
>(&x2l[i]);
472 V& yy = *
reinterpret_cast<V*
>(&yl[i]);
473 yy = vecfun1(xx1, xx2);
// remaining tail shorter than one pack
478 vecfun2_partial(
x1, x2l, yl, ilo, nhi, npack, vecfun1);
// presumably the copy-back of scratch results into y[] — confirm
483 for( i=nlo; i < nhi; ++i )
// Wrappers that load 4/8/16 explicit scalar arguments x0..xN into a SIMD
// pack, call the vectorized routine v1<FUN>d / v1<FUN>f, and store the
// results into y[] (or z[] for the two-argument V1FUN2 forms). Unused
// lanes of a wider pack are padded with the safe value V so they cannot
// trigger a domain error.
// NOTE(review): reconstructed from a garbled excerpt — each macro appeared
// twice (a 512-bit and a 256-bit form) with the preprocessor conditionals
// stripped by the extraction; the #ifdef __AVX512F__ / #else / #endif
// structure below is presumed from context, and the `yy[2]` declarations
// in the split 256-bit forms were missing. Confirm against the original.
#ifdef __AVX512F__

#define V1FUN_PD_4(FUN, V) \
	v8df xx = _mm512_set_pd( V, V, V, V, x3, x2, x1, x0 ); \
	v8df yy = v1##FUN##d(xx); \
	memcpy(y, &yy, 4*sizeof(double))

#define V1FUN_PD_8(FUN, V) \
	v8df xx = _mm512_set_pd( x7, x6, x5, x4, x3, x2, x1, x0 ); \
	v8df yy = v1##FUN##d(xx); \
	memcpy(y, &yy, 8*sizeof(double))

#define V1FUN_PS_4(FUN, V) \
	v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, V, V, V, V, x3, x2, x1, x0 ); \
	v16sf yy = v1##FUN##f(xx); \
	memcpy(y, &yy, 4*sizeof(sys_float))

#define V1FUN_PS_8(FUN, V) \
	v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, x7, x6, x5, x4, x3, x2, x1, x0 ); \
	v16sf yy = v1##FUN##f(xx); \
	memcpy(y, &yy, 8*sizeof(sys_float))

#define V1FUN_PS_16(FUN, V) \
	v16sf xx = _mm512_set_ps( x15, x14, x13, x12, x11, x10, x9, x8, x7, x6, x5, x4, x3, x2, x1, x0 ); \
	v16sf yy = v1##FUN##f(xx); \
	memcpy(y, &yy, 16*sizeof(sys_float))

#define V1FUN2_PD_4(FUN, V) \
	v8df xx = _mm512_set_pd( V, V, V, V, x3, x2, x1, x0 ); \
	v8df yy = _mm512_set_pd( V, V, V, V, y3, y2, y1, y0 ); \
	v8df zz = v1##FUN##d(xx, yy); \
	memcpy(z, &zz, 4*sizeof(double))

#define V1FUN2_PS_4(FUN, V) \
	v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, V, V, V, V, x3, x2, x1, x0 ); \
	v16sf yy = _mm512_set_ps( V, V, V, V, V, V, V, V, V, V, V, V, y3, y2, y1, y0 ); \
	v16sf zz = v1##FUN##f(xx, yy); \
	memcpy(z, &zz, 4*sizeof(sys_float))

#define V1FUN2_PS_8(FUN, V) \
	v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, x7, x6, x5, x4, x3, x2, x1, x0 ); \
	v16sf yy = _mm512_set_ps( V, V, V, V, V, V, V, V, y7, y6, y5, y4, y3, y2, y1, y0 ); \
	v16sf zz = v1##FUN##f(xx, yy); \
	memcpy(z, &zz, 8*sizeof(sys_float))

#else

#define V1FUN_PD_4(FUN, V) \
	v4df xx = _mm256_set_pd( x3, x2, x1, x0 ); \
	v4df yy = v1##FUN##d(xx); \
	memcpy(y, &yy, 4*sizeof(double))

#define V1FUN_PD_8(FUN, V) \
	v4df yy[2]; \
	v4df xx = _mm256_set_pd( x3, x2, x1, x0 ); \
	yy[0] = v1##FUN##d(xx); \
	xx = _mm256_set_pd( x7, x6, x5, x4 ); \
	yy[1] = v1##FUN##d(xx); \
	memcpy(y, &yy, 8*sizeof(double))

#define V1FUN_PS_4(FUN, V) \
	v8sf xx = _mm256_set_ps( V, V, V, V, x3, x2, x1, x0 ); \
	v8sf yy = v1##FUN##f(xx); \
	memcpy(y, &yy, 4*sizeof(sys_float))

#define V1FUN_PS_8(FUN, V) \
	v8sf xx = _mm256_set_ps( x7, x6, x5, x4, x3, x2, x1, x0 ); \
	v8sf yy = v1##FUN##f(xx); \
	memcpy(y, &yy, 8*sizeof(sys_float))

#define V1FUN_PS_16(FUN, V) \
	v8sf yy[2]; \
	v8sf xx = _mm256_set_ps( x7, x6, x5, x4, x3, x2, x1, x0 ); \
	yy[0] = v1##FUN##f(xx); \
	xx = _mm256_set_ps( x15, x14, x13, x12, x11, x10, x9, x8 ); \
	yy[1] = v1##FUN##f(xx); \
	memcpy(y, &yy, 16*sizeof(sys_float))

#define V1FUN2_PD_4(FUN, V) \
	v4df xx = _mm256_set_pd( x3, x2, x1, x0 ); \
	v4df yy = _mm256_set_pd( y3, y2, y1, y0 ); \
	v4df zz = v1##FUN##d(xx, yy); \
	memcpy(z, &zz, 4*sizeof(double))

#define V1FUN2_PS_4(FUN, V) \
	v8sf xx = _mm256_set_ps( V, V, V, V, x3, x2, x1, x0 ); \
	v8sf yy = _mm256_set_ps( V, V, V, V, y3, y2, y1, y0 ); \
	v8sf zz = v1##FUN##f(xx, yy); \
	memcpy(z, &zz, 4*sizeof(sys_float))

#define V1FUN2_PS_8(FUN, V) \
	v8sf xx = _mm256_set_ps( x7, x6, x5, x4, x3, x2, x1, x0 ); \
	v8sf yy = _mm256_set_ps( y7, y6, y5, y4, y3, y2, y1, y0 ); \
	v8sf zz = v1##FUN##f(xx, yy); \
	memcpy(z, &zz, 8*sizeof(sys_float))

#endif
// Scalar fallback of vecfun(): used when no SIMD support is compiled in.
// Applies the scalar function element by element over x[nlo..nhi-1].
// The (ignored) vector-function pointer keeps the call signature identical
// to the SIMD implementation so call sites need no changes.
// NOTE(review): braces were missing in the garbled excerpt and have been
// restored; no other change.
template<
	class T,
	class V>
inline void vecfun(
	const T x[], T y[],
	long nlo,
	long nhi, T (*scalfun1)(T), V (*)(V))
{
	for( long i=nlo; i < nhi; ++i )
		y[i] = scalfun1(x[i]);
}
// Scalar fallback of vecfun2(): used when no SIMD support is compiled in.
// Applies the scalar two-argument function element by element over
// x1[nlo..nhi-1] and x2[nlo..nhi-1]. The (ignored) vector-function pointer
// keeps the call signature identical to the SIMD implementation.
// NOTE(review): braces were missing in the garbled excerpt and have been
// restored; no other change.
template<
	class T,
	class V>
inline void vecfun2(
	const T
	x1[],
	const T
	x2[], T y[],
	long nlo,
	long nhi, T (*scalfun1)(T, T), V (*)(V, V))
{
	for( long i=nlo; i < nhi; ++i )
		y[i] = scalfun1(
			x1[i],
			x2[i]);
}
// Scalar fallbacks for the V1FUN wrappers: no SIMD support, so evaluate
// the scalar function FUN on each argument in turn. The pad value V is
// accepted (and unused) to keep the macro signatures identical to the
// SIMD versions.
// NOTE(review): reconstructed from a heavily garbled excerpt — the bodies
// of V1FUN_4/8/16 and of the delegating V1FUN*_P* macros, plus the last
// assignment of each V1FUN2 body, were missing and are presumed from the
// visible V1FUN2_4 pattern. Confirm against the original header.
#define V1FUN_4(FUN, V) \
	y[0] = FUN(x0); \
	y[1] = FUN(x1); \
	y[2] = FUN(x2); \
	y[3] = FUN(x3)

#define V1FUN_8(FUN, V) \
	y[0] = FUN(x0); \
	y[1] = FUN(x1); \
	y[2] = FUN(x2); \
	y[3] = FUN(x3); \
	y[4] = FUN(x4); \
	y[5] = FUN(x5); \
	y[6] = FUN(x6); \
	y[7] = FUN(x7)

#define V1FUN_16(FUN, V) \
	y[0] = FUN(x0); \
	y[1] = FUN(x1); \
	y[2] = FUN(x2); \
	y[3] = FUN(x3); \
	y[4] = FUN(x4); \
	y[5] = FUN(x5); \
	y[6] = FUN(x6); \
	y[7] = FUN(x7); \
	y[8] = FUN(x8); \
	y[9] = FUN(x9); \
	y[10] = FUN(x10); \
	y[11] = FUN(x11); \
	y[12] = FUN(x12); \
	y[13] = FUN(x13); \
	y[14] = FUN(x14); \
	y[15] = FUN(x15)

#define V1FUN_PD_4(FUN, V) \
	V1FUN_4(FUN, V)

#define V1FUN_PD_8(FUN, V) \
	V1FUN_8(FUN, V)

#define V1FUN_PS_4(FUN, V) \
	V1FUN_4(FUN, V)

#define V1FUN_PS_8(FUN, V) \
	V1FUN_8(FUN, V)

#define V1FUN_PS_16(FUN, V) \
	V1FUN_16(FUN, V)

#define V1FUN2_4(FUN, V) \
	z[0] = FUN(x0, y0); \
	z[1] = FUN(x1, y1); \
	z[2] = FUN(x2, y2); \
	z[3] = FUN(x3, y3)

#define V1FUN2_8(FUN, V) \
	z[0] = FUN(x0, y0); \
	z[1] = FUN(x1, y1); \
	z[2] = FUN(x2, y2); \
	z[3] = FUN(x3, y3); \
	z[4] = FUN(x4, y4); \
	z[5] = FUN(x5, y5); \
	z[6] = FUN(x6, y6); \
	z[7] = FUN(x7, y7)

#define V1FUN2_PD_4(FUN, V) \
	V1FUN2_4(FUN, V)

#define V1FUN2_PS_4(FUN, V) \
	V1FUN2_4(FUN, V)

#define V1FUN2_PS_8(FUN, V) \
	V1FUN2_8(FUN, V)
static double x2[63]
Definition atmdat_3body.cpp:20
static double x1[83]
Definition atmdat_3body.cpp:28
float sys_float
Definition cddefines.h:130
#define NULL
Definition cddefines.h:115
long min(int a, long b)
Definition cddefines.h:766
#define ALIGNED(X)
Definition cpu.h:475
void zero(void)
Definition zero.cpp:30
void vecfun(const T x[], T y[], long nlo, long nhi, T(*scalfun1)(T), V(*)(V))
Definition vectorize_math.h:601
void vecfun2(const T x1[], const T x2[], T y[], long nlo, long nhi, T(*scalfun1)(T, T), V(*)(V, V))
Definition vectorize_math.h:608