vectorize_math.h
/* This file is part of Cloudy and is copyright (C)1978-2025 by Gary J. Ferland and
 * others. For conditions of distribution and use see copyright notice in license.txt */

#ifndef VECTORIZE_MATH_H
#define VECTORIZE_MATH_H

#ifdef __AVX__

// use shorthand for packed types
typedef __m128d v2df;
typedef __m128 v4sf;
typedef __m128i v2di;
typedef __m128i v4si;

typedef __m256d v4df;
typedef __m256 v8sf;
typedef __m256i v4di;
typedef __m256i v8si;

#ifdef __AVX512F__

typedef __m512d v8df;
typedef __m512 v16sf;
typedef __m512i v8di;
typedef __m512i v16si;

// some macros for defining vector constants

#define VECD_CONST(name,x) \
ALIGNED(64) static const double __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8df& name = *reinterpret_cast<const v8df*>(__avx_##name)

#define VECDI_CONST(name,x) \
ALIGNED(64) static const uint64 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8df& name = *reinterpret_cast<const v8df*>(__avx_##name)

#define VECF_CONST(name,x) \
ALIGNED(64) static const sys_float __avx_##name[16] = {x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x}; \
static const v16sf& name = *reinterpret_cast<const v16sf*>(__avx_##name)

#define VECFI_CONST(name,x) \
ALIGNED(64) static const uint32 __avx_##name[16] = {x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x}; \
static const v16sf& name = *reinterpret_cast<const v16sf*>(__avx_##name)

#define VECI_CONST(name,x) \
ALIGNED(32) static const uint32 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8si& name = *reinterpret_cast<const v8si*>(__avx_##name)

#define VECII_CONST(name,x) \
ALIGNED(64) static const uint32 __avx_##name[16] = {x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x}; \
static const v16si& name = *reinterpret_cast<const v16si*>(__avx_##name)

#define VECL_CONST(name,x) \
ALIGNED(32) static const uint64 __avx_##name[4] = {x,x,x,x}; \
static const v4di& name = *reinterpret_cast<const v4di*>(__avx_##name)

#define VECLL_CONST(name,x) \
ALIGNED(64) static const uint64 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8di& name = *reinterpret_cast<const v8di*>(__avx_##name)
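
// For example, VECDI_CONST(third,0x3fd5555555555555) expands to
//
//     ALIGNED(64) static const uint64 __avx_third[8] =
//         {0x3fd5555555555555,0x3fd5555555555555, /* ...8 copies in total... */};
//     static const v8df& third = *reinterpret_cast<const v8df*>(__avx_third);
//
// i.e., eight copies of the correctly rounded double 1./3. viewed as a single
// v8df register. Spelling a constant as its IEEE bit pattern guarantees it is
// reproduced exactly, independent of the compiler's decimal conversion.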

union vpack
{
    ALIGNED(64) v8df pd;
    ALIGNED(64) v8di pl;
    ALIGNED(64) v16sf pf;
    ALIGNED(64) v16si pi;
    ALIGNED(64) double d[8];
    ALIGNED(64) int64 l[8];
    ALIGNED(64) sys_float f[16];
    ALIGNED(64) int32 i[16];
};

inline bool getmaskbit(unsigned int mask, unsigned int n)
{
    return ( ((mask>>n)&1) == 1 );
}
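
// e.g., for mask = 0x5 (binary 101) getmaskbit() returns true for n = 0 and
// n = 2, and false otherwise; the AVX512 compare intrinsics produce such
// bitmasks with bit n describing lane n of a packed register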

template<class T>
inline string print_pack(const T x[], int npack, unsigned int mask, const string& name)
{
    ostringstream oss;
    for( int i=0; i < npack; ++i )
    {
        if( getmaskbit(mask, i) )
            oss << " => ";
        else
            oss << "    ";
        oss << name << "[" << i << "] = " << x[i] << "\n";
    }
    return oss.str();
}

inline string DEMsg(const string& routine, v8df x, __mmask8 mask)
{
    vpack p;
    ostringstream oss;
    oss << "domain error in " << routine << ".\n";
    p.pd = x;
    oss << print_pack(p.d, 8, mask, "x");
    return oss.str();
}

inline string DEMsg(const string& routine, v16sf x, __mmask16 mask)
{
    vpack p;
    ostringstream oss;
    oss << "domain error in " << routine << ".\n";
    p.pf = x;
    oss << print_pack(p.f, 16, mask, "x");
    return oss.str();
}

inline string DEMsg(const string& routine, v8df x, __mmask8 mask1, v8df y, __mmask8 mask2)
{
    vpack p;
    ostringstream oss;
    oss << "domain error in " << routine << ".\n";
    p.pd = x;
    oss << print_pack(p.d, 8, mask1, "x");
    p.pd = y;
    oss << print_pack(p.d, 8, mask2, "y");
    return oss.str();
}

inline string DEMsg(const string& routine, v16sf x, __mmask16 mask1, v16sf y, __mmask16 mask2)
{
    vpack p;
    ostringstream oss;
    oss << "domain error in " << routine << ".\n";
    p.pf = x;
    oss << print_pack(p.f, 16, mask1, "x");
    p.pf = y;
    oss << print_pack(p.f, 16, mask2, "y");
    return oss.str();
}
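
// Typical use (illustrative sketch; v1sqrtd is a hypothetical kernel name and
// the exact error handling is up to the caller):
//
//     v8df x = ...;                                            // packed arguments
//     __mmask8 bad = _mm512_cmp_pd_mask(x, zero, _CMP_LT_OQ);  // lanes with x < 0
//     if( bad != 0 )
//         throw domain_error( DEMsg("v1sqrtd", x, bad) );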

#else // __AVX512F__

#define VECD_CONST(name,x) \
ALIGNED(32) static const double __avx_##name[4] = {x,x,x,x}; \
static const v4df& name = *reinterpret_cast<const v4df*>(__avx_##name)

#define VECDI_CONST(name,x) \
ALIGNED(32) static const uint64 __avx_##name[4] = {x,x,x,x}; \
static const v4df& name = *reinterpret_cast<const v4df*>(__avx_##name)

#define VECF_CONST(name,x) \
ALIGNED(32) static const sys_float __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8sf& name = *reinterpret_cast<const v8sf*>(__avx_##name)

#define VECFI_CONST(name,x) \
ALIGNED(32) static const uint32 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8sf& name = *reinterpret_cast<const v8sf*>(__avx_##name)

#define VECI_CONST(name,x) \
ALIGNED(16) static const uint32 __avx_##name[4] = {x,x,x,x}; \
static const v4si& name = *reinterpret_cast<const v4si*>(__avx_##name)

#define VECII_CONST(name,x) \
ALIGNED(32) static const uint32 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8si& name = *reinterpret_cast<const v8si*>(__avx_##name)

#define VECL_CONST(name,x) \
ALIGNED(16) static const uint64 __avx_##name[2] = {x,x}; \
static const v2di& name = *reinterpret_cast<const v2di*>(__avx_##name)

#define VECLL_CONST(name,x) \
ALIGNED(32) static const uint64 __avx_##name[4] = {x,x,x,x}; \
static const v4di& name = *reinterpret_cast<const v4di*>(__avx_##name)

union vpack
{
    ALIGNED(32) v4df pd;
    ALIGNED(32) v4di pl;
    ALIGNED(32) v8sf pf;
    ALIGNED(32) v8si pi;
    ALIGNED(32) double d[4];
    ALIGNED(32) int64 l[4];
    ALIGNED(32) sys_float f[8];
    ALIGNED(32) int32 i[8];
};

template<class T, class U>
inline string print_pack(const T x[], const U m[], int npack, const string& name)
{
    ostringstream oss;
    for( int i=0; i < npack; ++i )
    {
        if( m[i] < 0 )
            oss << " => ";
        else
            oss << "    ";
        oss << name << "[" << i << "] = " << x[i] << "\n";
    }
    return oss.str();
}

inline string DEMsg(const string& routine, v4df x, v4df mask)
{
    vpack p, m;
    ostringstream oss;
    oss << "domain error in " << routine << ".\n";
    p.pd = x;
    m.pd = mask;
    oss << print_pack(p.d, m.l, 4, "x");
    return oss.str();
}

inline string DEMsg(const string& routine, v8sf x, v8sf mask)
{
    vpack p, m;
    ostringstream oss;
    oss << "domain error in " << routine << ".\n";
    p.pf = x;
    m.pf = mask;
    oss << print_pack(p.f, m.i, 8, "x");
    return oss.str();
}

inline string DEMsg(const string& routine, v4df x, v4df mask1, v4df y, v4df mask2)
{
    vpack p, m;
    ostringstream oss;
    oss << "domain error in " << routine << ".\n";
    p.pd = x;
    m.pd = mask1;
    oss << print_pack(p.d, m.l, 4, "x");
    p.pd = y;
    m.pd = mask2;
    oss << print_pack(p.d, m.l, 4, "y");
    return oss.str();
}

inline string DEMsg(const string& routine, v8sf x, v8sf mask1, v8sf y, v8sf mask2)
{
    vpack p, m;
    ostringstream oss;
    oss << "domain error in " << routine << ".\n";
    p.pf = x;
    m.pf = mask1;
    oss << print_pack(p.f, m.i, 8, "x");
    p.pf = y;
    m.pf = mask2;
    oss << print_pack(p.f, m.i, 8, "y");
    return oss.str();
}
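
// Typical use (illustrative sketch; v1sqrtd is a hypothetical kernel name):
// without AVX512 the mask is itself a packed register in which a flagged lane
// holds all ones, so its sign bit is set and print_pack()'s test m[i] < 0 fires:
//
//     v4df x = ...;                                    // packed arguments
//     v4df bad = _mm256_cmp_pd(x, zero, _CMP_LT_OQ);   // all ones where x < 0
//     if( _mm256_movemask_pd(bad) != 0 )
//         throw domain_error( DEMsg("v1sqrtd", x, bad) );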

#endif // __AVX512F__

VECD_CONST(mthree,-3.);
VECD_CONST(mone,-1.);
VECD_CONST(mhalf,-0.5);
VECD_CONST(zero,0.);
VECDI_CONST(third,0x3fd5555555555555); // 0.33333333333333333333
VECD_CONST(half,0.5);
VECD_CONST(one,1.);
VECD_CONST(c1p5,1.5);
VECD_CONST(two,2.);
VECD_CONST(three,3.);
VECD_CONST(six,6.);
VECD_CONST(ten,10.);
VECD_CONST(c1023,1023.);

VECDI_CONST(ln_two,0x3fe62e42fefa39ef); // log(2.)
VECDI_CONST(log2e,0x3ff71547652b82fe); // log2(e) = 1./log(2.)
VECDI_CONST(ln_ten,0x40026bb1bbb55516); // log(10.)
VECDI_CONST(log10e,0x3fdbcb7b1526e50e); // log10(e) = 1./log(10.)

VECDI_CONST(dbl_min,0x0010000000000000); // DBL_MIN
VECDI_CONST(dbl_max,0x7fefffffffffffff); // DBL_MAX
VECDI_CONST(sqrt_dbl_max,0x5fefffffffffffff); // sqrt(DBL_MAX)

VECF_CONST(mthreef,-3.f);
VECF_CONST(monef,-1.f);
VECF_CONST(mhalff,-0.5f);
VECF_CONST(zerof,0.f);
VECFI_CONST(thirdf,0x3eaaaaab); // 0.3333333333f
VECF_CONST(halff,0.5f);
VECF_CONST(onef,1.f);
VECF_CONST(twof,2.f);
VECF_CONST(c1p5f,1.5f);
VECF_CONST(threef,3.f);
VECF_CONST(sixf,6.f);
VECF_CONST(tenf,10.f);
VECF_CONST(c127,127.f);

VECFI_CONST(ln_twof,0x3f317218); // logf(2.f)
VECFI_CONST(log2ef,0x3fb8aa3b); // log2f(e) = 1.f/logf(2.f)
VECFI_CONST(ln_tenf,0x40135d8e); // logf(10.f)
VECFI_CONST(log10ef,0x3ede5bd9); // log10f(e) = 1.f/logf(10.f)

VECFI_CONST(flt_min,0x00800000); // FLT_MIN
VECFI_CONST(flt_max,0x7f7fffff); // FLT_MAX
VECFI_CONST(sqrt_flt_max,0x5f7fffff); // sqrt(FLT_MAX)
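
// The hexadecimal patterns above are the IEEE-754 encodings of the quoted
// values; they can be checked with a bitwise copy (illustrative sketch):
//
//     double v;
//     uint64 bits = 0x3fe62e42fefa39ef;
//     memcpy(&v, &bits, sizeof(v));   // v now equals log(2.) to the last bit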

template<class V>
inline void vecfun_partial(const sys_float x[], sys_float y[], long nlo, long nhi, long npack, V (*vecfun1)(V))
{
    vpack xx, yy;
    // pad the register by replicating the last valid argument so that the
    // vector routine never operates on uninitialized lanes
    for( long i=0; i < npack; ++i )
        xx.f[i] = x[min(nlo+i,nhi-1)];
    yy.pf = vecfun1(xx.pf);
    for( long i=nlo; i < nhi; ++i )
        y[i] = yy.f[i-nlo];
}

template<class V>
inline void vecfun_partial(const double x[], double y[], long nlo, long nhi, long npack, V (*vecfun1)(V))
{
    vpack xx, yy;
    for( long i=0; i < npack; ++i )
        xx.d[i] = x[min(nlo+i,nhi-1)];
    yy.pd = vecfun1(xx.pd);
    for( long i=nlo; i < nhi; ++i )
        y[i] = yy.d[i-nlo];
}

template<class V>
inline void vecfun2_partial(const sys_float x1[], const sys_float x2[], sys_float y[], long nlo, long nhi,
                            long npack, V (*vecfun1)(V, V))
{
    vpack xx1, xx2, yy;
    for( long i=0; i < npack; ++i )
    {
        xx1.f[i] = x1[min(nlo+i,nhi-1)];
        xx2.f[i] = x2[min(nlo+i,nhi-1)];
    }
    yy.pf = vecfun1(xx1.pf, xx2.pf);
    for( long i=nlo; i < nhi; ++i )
        y[i] = yy.f[i-nlo];
}

template<class V>
inline void vecfun2_partial(const double x1[], const double x2[], double y[], long nlo, long nhi, long npack,
                            V (*vecfun1)(V, V))
{
    vpack xx1, xx2, yy;
    for( long i=0; i < npack; ++i )
    {
        xx1.d[i] = x1[min(nlo+i,nhi-1)];
        xx2.d[i] = x2[min(nlo+i,nhi-1)];
    }
    yy.pd = vecfun1(xx1.pd, xx2.pd);
    for( long i=nlo; i < nhi; ++i )
        y[i] = yy.d[i-nlo];
}

// this is the generic wrapper routine for all vectorized math functions
// it makes sure that the vectorized function gets arguments that are properly aligned
// scalfun1 is the normal scalar routine working on a single argument
// vecfun1 is the vectorized routine working on a packed register
template<class T, class V>
inline void vecfun(const T x[], T y[], long nlo, long nhi, T (*)(T), V (*vecfun1)(V))
{
    if( nhi <= nlo )
        return;

    // determine number of elements of type T in a packed register
    long npack = sizeof(V)/sizeof(T);
    if( nhi-nlo < npack )
    {
        vecfun_partial(x, y, nlo, nhi, npack, vecfun1);
        return;
    }
    long i, ilo = nlo;
    long ox = (reinterpret_cast<long>(x)/sizeof(T)+nlo)%npack;
    long oy = (reinterpret_cast<long>(y)/sizeof(T)+nlo)%npack;
    bool lgNonAligned = ( ox != oy );
    T *yl, *ylocal = NULL;
    if( lgNonAligned )
    {
        ylocal = new T[nhi+npack-1];
        long ol = (reinterpret_cast<long>(ylocal)/sizeof(T)+nlo)%npack;
        if( ox >= ol )
            yl = &ylocal[ox-ol];
        else
            yl = &ylocal[ox+npack-ol];
    }
    else
    {
        yl = y;
    }
    // the initial element is not aligned correctly ->
    // process a partial register until we are correctly aligned
    if( ox > 0 )
    {
        vecfun_partial(x, yl, ilo, min(ilo+npack-ox,nhi), npack, vecfun1);
        ilo = min(ilo+npack-ox,nhi);
    }
    // use vectorized routine as long as there are at least npack evaluations left
    for( i=ilo; i < nhi-npack+1; i += npack )
    {
        const V& xx = *reinterpret_cast<const V*>(&x[i]);
        V& yy = *reinterpret_cast<V*>(&yl[i]);
        yy = vecfun1(xx);
    }
    ilo = i;
    // use partial routine again for the remaining calls (if any)...
    if( ilo < nhi )
        vecfun_partial(x, yl, ilo, nhi, npack, vecfun1);
    if( lgNonAligned )
    {
        for( i=nlo; i < nhi; ++i )
            y[i] = yl[i];
        delete[] ylocal;
    }
}
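
// Typical use (illustrative sketch; v1expd stands for a packed exponential
// kernel declared elsewhere, the name is an assumption here):
//
//     vecfun( x, y, 0, n, exp, v1expd );   // y[i] = exp(x[i]) for i in [0,n)
//
// On AVX hardware the scalar pointer merely fixes the template types and is
// never called on this code path.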

template<class T, class V>
inline void vecfun2(const T x1[], const T x2[], T y[], long nlo, long nhi, T (*)(T, T), V (*vecfun1)(V, V))
{
    if( nhi <= nlo )
        return;

    // determine number of elements of type T in a packed register
    long npack = sizeof(V)/sizeof(T);
    if( nhi-nlo < npack )
    {
        vecfun2_partial(x1, x2, y, nlo, nhi, npack, vecfun1);
        return;
    }
    long i, ilo = nlo;
    long ox1 = (reinterpret_cast<long>(x1)/sizeof(T)+nlo)%npack;
    long ox2 = (reinterpret_cast<long>(x2)/sizeof(T)+nlo)%npack;
    long oy = (reinterpret_cast<long>(y)/sizeof(T)+nlo)%npack;
    bool lgNonAligned1 = ( ox1 != ox2 );
    bool lgNonAligned2 = ( ox1 != oy );
    const T *x2l;
    T *x2local = NULL;
    if( lgNonAligned1 )
    {
        x2local = new T[nhi+npack-1];
        long ol = (reinterpret_cast<long>(x2local)/sizeof(T)+nlo)%npack;
        T *ptr;
        if( ox1 >= ol )
            ptr = &x2local[ox1-ol];
        else
            ptr = &x2local[ox1+npack-ol];
        memcpy(ptr+nlo, x2+nlo, size_t((nhi-nlo)*sizeof(T)));
        x2l = ptr;
    }
    else
    {
        x2l = x2;
    }
    T *yl, *ylocal = NULL;
    if( lgNonAligned2 )
    {
        ylocal = new T[nhi+npack-1];
        long ol = (reinterpret_cast<long>(ylocal)/sizeof(T)+nlo)%npack;
        if( ox1 >= ol )
            yl = &ylocal[ox1-ol];
        else
            yl = &ylocal[ox1+npack-ol];
    }
    else
    {
        yl = y;
    }
    // the initial element is not aligned correctly ->
    // process a partial register until we are correctly aligned
    if( ox1 > 0 )
    {
        vecfun2_partial(x1, x2l, yl, ilo, min(ilo+npack-ox1,nhi), npack, vecfun1);
        ilo = min(ilo+npack-ox1,nhi);
    }
    // use vectorized routine as long as there are at least npack evaluations left
    for( i=ilo; i < nhi-npack+1; i += npack )
    {
        const V& xx1 = *reinterpret_cast<const V*>(&x1[i]);
        const V& xx2 = *reinterpret_cast<const V*>(&x2l[i]);
        V& yy = *reinterpret_cast<V*>(&yl[i]);
        yy = vecfun1(xx1, xx2);
    }
    ilo = i;
    // use partial routine again for the remaining calls (if any)...
    if( ilo < nhi )
        vecfun2_partial(x1, x2l, yl, ilo, nhi, npack, vecfun1);
    if( lgNonAligned1 )
        delete[] x2local;
    if( lgNonAligned2 )
    {
        for( i=nlo; i < nhi; ++i )
            y[i] = yl[i];
        delete[] ylocal;
    }
}
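
// The two-argument form follows the same pattern (illustrative sketch; v1powd
// is an assumed kernel name):
//
//     vecfun2( x1, x2, y, 0, n, pow, v1powd );   // y[i] = pow(x1[i], x2[i])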

#ifdef __AVX512F__
#define V1FUN_PD_4(FUN, V) \
    v8df xx = _mm512_set_pd( V, V, V, V, x3, x2, x1, x0 ); \
    v8df yy = v1##FUN##d(xx); \
    memcpy(y, &yy, 4*sizeof(double))
#else
#define V1FUN_PD_4(FUN, V) \
    v4df xx = _mm256_set_pd( x3, x2, x1, x0 ); \
    v4df yy = v1##FUN##d(xx); \
    memcpy(y, &yy, 4*sizeof(double))
#endif

#ifdef __AVX512F__
#define V1FUN_PD_8(FUN, V) \
    v8df xx = _mm512_set_pd( x7, x6, x5, x4, x3, x2, x1, x0 ); \
    v8df yy = v1##FUN##d(xx); \
    memcpy(y, &yy, 8*sizeof(double))
#else
#define V1FUN_PD_8(FUN, V) \
    v4df yy[2]; \
    v4df xx = _mm256_set_pd( x3, x2, x1, x0 ); \
    yy[0] = v1##FUN##d(xx); \
    xx = _mm256_set_pd( x7, x6, x5, x4 ); \
    yy[1] = v1##FUN##d(xx); \
    memcpy(y, &yy, 8*sizeof(double))
#endif

#ifdef __AVX512F__
#define V1FUN_PS_4(FUN, V) \
    v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, V, V, V, V, x3, x2, x1, x0 ); \
    v16sf yy = v1##FUN##f(xx); \
    memcpy(y, &yy, 4*sizeof(sys_float))
#else
#define V1FUN_PS_4(FUN, V) \
    v8sf xx = _mm256_set_ps( V, V, V, V, x3, x2, x1, x0 ); \
    v8sf yy = v1##FUN##f(xx); \
    memcpy(y, &yy, 4*sizeof(sys_float))
#endif

#ifdef __AVX512F__
#define V1FUN_PS_8(FUN, V) \
    v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, x7, x6, x5, x4, x3, x2, x1, x0 ); \
    v16sf yy = v1##FUN##f(xx); \
    memcpy(y, &yy, 8*sizeof(sys_float))
#else
#define V1FUN_PS_8(FUN, V) \
    v8sf xx = _mm256_set_ps( x7, x6, x5, x4, x3, x2, x1, x0 ); \
    v8sf yy = v1##FUN##f(xx); \
    memcpy(y, &yy, 8*sizeof(sys_float))
#endif

#ifdef __AVX512F__
#define V1FUN_PS_16(FUN, V) \
    v16sf xx = _mm512_set_ps( x15, x14, x13, x12, x11, x10, x9, x8, x7, x6, x5, x4, x3, x2, x1, x0 ); \
    v16sf yy = v1##FUN##f(xx); \
    memcpy(y, &yy, 16*sizeof(sys_float))
#else
#define V1FUN_PS_16(FUN, V) \
    v8sf yy[2]; \
    v8sf xx = _mm256_set_ps( x7, x6, x5, x4, x3, x2, x1, x0 ); \
    yy[0] = v1##FUN##f(xx); \
    xx = _mm256_set_ps( x15, x14, x13, x12, x11, x10, x9, x8 ); \
    yy[1] = v1##FUN##f(xx); \
    memcpy(y, &yy, 16*sizeof(sys_float))
#endif

#ifdef __AVX512F__
#define V1FUN2_PD_4(FUN, V) \
    v8df xx = _mm512_set_pd( V, V, V, V, x3, x2, x1, x0 ); \
    v8df yy = _mm512_set_pd( V, V, V, V, y3, y2, y1, y0 ); \
    v8df zz = v1##FUN##d(xx, yy); \
    memcpy(z, &zz, 4*sizeof(double))
#else
#define V1FUN2_PD_4(FUN, V) \
    v4df xx = _mm256_set_pd( x3, x2, x1, x0 ); \
    v4df yy = _mm256_set_pd( y3, y2, y1, y0 ); \
    v4df zz = v1##FUN##d(xx, yy); \
    memcpy(z, &zz, 4*sizeof(double))
#endif

#ifdef __AVX512F__
#define V1FUN2_PS_4(FUN, V) \
    v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, V, V, V, V, x3, x2, x1, x0 ); \
    v16sf yy = _mm512_set_ps( V, V, V, V, V, V, V, V, V, V, V, V, y3, y2, y1, y0 ); \
    v16sf zz = v1##FUN##f(xx, yy); \
    memcpy(z, &zz, 4*sizeof(sys_float))
#else
#define V1FUN2_PS_4(FUN, V) \
    v8sf xx = _mm256_set_ps( V, V, V, V, x3, x2, x1, x0 ); \
    v8sf yy = _mm256_set_ps( V, V, V, V, y3, y2, y1, y0 ); \
    v8sf zz = v1##FUN##f(xx, yy); \
    memcpy(z, &zz, 4*sizeof(sys_float))
#endif

#ifdef __AVX512F__
#define V1FUN2_PS_8(FUN, V) \
    v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, x7, x6, x5, x4, x3, x2, x1, x0 ); \
    v16sf yy = _mm512_set_ps( V, V, V, V, V, V, V, V, y7, y6, y5, y4, y3, y2, y1, y0 ); \
    v16sf zz = v1##FUN##f(xx, yy); \
    memcpy(z, &zz, 8*sizeof(sys_float))
#else
#define V1FUN2_PS_8(FUN, V) \
    v8sf xx = _mm256_set_ps( x7, x6, x5, x4, x3, x2, x1, x0 ); \
    v8sf yy = _mm256_set_ps( y7, y6, y5, y4, y3, y2, y1, y0 ); \
    v8sf zz = v1##FUN##f(xx, yy); \
    memcpy(z, &zz, 8*sizeof(sys_float))
#endif
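
// The V1FUN macros implement fixed-width convenience wrappers; the arguments
// x0... and the destination array are expected in the surrounding scope
// (illustrative sketch; the wrapper name and v1expd are assumptions):
//
//     inline void my_vexp4(double y[], double x0, double x1, double x2, double x3)
//     {
//         V1FUN_PD_4(exp, 0.);   // expands to one v1expd() call on a register
//     }
//
// On AVX512 the V argument pads the unused upper lanes with a safe value.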

#else // __AVX__

// fallback for non-AVX capable hardware
template<class T, class V>
inline void vecfun(const T x[], T y[], long nlo, long nhi, T (*scalfun1)(T), V (*)(V))
{
    for( long i=nlo; i < nhi; ++i )
        y[i] = scalfun1(x[i]);
}

template<class T, class V>
inline void vecfun2(const T x1[], const T x2[], T y[], long nlo, long nhi, T (*scalfun1)(T, T), V (*)(V, V))
{
    for( long i=nlo; i < nhi; ++i )
        y[i] = scalfun1(x1[i], x2[i]);
}

#define V1FUN_4(FUN, V) \
    y[0] = FUN(x0); \
    y[1] = FUN(x1); \
    y[2] = FUN(x2); \
    y[3] = FUN(x3)

#define V1FUN_8(FUN, V) \
    y[0] = FUN(x0); \
    y[1] = FUN(x1); \
    y[2] = FUN(x2); \
    y[3] = FUN(x3); \
    y[4] = FUN(x4); \
    y[5] = FUN(x5); \
    y[6] = FUN(x6); \
    y[7] = FUN(x7)

#define V1FUN_16(FUN, V) \
    y[0] = FUN(x0); \
    y[1] = FUN(x1); \
    y[2] = FUN(x2); \
    y[3] = FUN(x3); \
    y[4] = FUN(x4); \
    y[5] = FUN(x5); \
    y[6] = FUN(x6); \
    y[7] = FUN(x7); \
    y[8] = FUN(x8); \
    y[9] = FUN(x9); \
    y[10] = FUN(x10); \
    y[11] = FUN(x11); \
    y[12] = FUN(x12); \
    y[13] = FUN(x13); \
    y[14] = FUN(x14); \
    y[15] = FUN(x15)

#define V1FUN_PD_4(FUN, V) \
    V1FUN_4(FUN, V)

#define V1FUN_PD_8(FUN, V) \
    V1FUN_8(FUN, V)

#define V1FUN_PS_4(FUN, V) \
    V1FUN_4(FUN##f, V)

#define V1FUN_PS_8(FUN, V) \
    V1FUN_8(FUN##f, V)

#define V1FUN_PS_16(FUN, V) \
    V1FUN_16(FUN##f, V)

#define V1FUN2_4(FUN, V) \
    z[0] = FUN(x0, y0); \
    z[1] = FUN(x1, y1); \
    z[2] = FUN(x2, y2); \
    z[3] = FUN(x3, y3)

#define V1FUN2_8(FUN, V) \
    z[0] = FUN(x0, y0); \
    z[1] = FUN(x1, y1); \
    z[2] = FUN(x2, y2); \
    z[3] = FUN(x3, y3); \
    z[4] = FUN(x4, y4); \
    z[5] = FUN(x5, y5); \
    z[6] = FUN(x6, y6); \
    z[7] = FUN(x7, y7)

#define V1FUN2_PD_4(FUN, V) \
    V1FUN2_4(FUN, V)

#define V1FUN2_PS_4(FUN, V) \
    V1FUN2_4(FUN##f, V)

#define V1FUN2_PS_8(FUN, V) \
    V1FUN2_8(FUN##f, V)

#endif // __AVX__

#endif // VECTORIZE_MATH_H