vectorize_math.h
/* This file is part of Cloudy and is copyright (C)1978-2023 by Gary J. Ferland and
 * others. For conditions of distribution and use see copyright notice in license.txt */

#ifndef VECTORIZE_MATH_H
#define VECTORIZE_MATH_H

#ifdef __AVX__

// use shorthand for the packed SIMD types
typedef __m128d v2df;
typedef __m128 v4sf;
typedef __m128i v2di;
typedef __m128i v4si;

typedef __m256d v4df;
typedef __m256 v8sf;
typedef __m256i v4di;
typedef __m256i v8si;

#ifdef __AVX512F__

typedef __m512d v8df;
typedef __m512 v16sf;
typedef __m512i v8di;
typedef __m512i v16si;

// macros for defining aligned vector constants; the *I_CONST variants take
// the exact IEEE bit pattern of the constant as an unsigned integer

#define VECD_CONST(name,x) \
ALIGNED(64) static const double __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8df& name = *reinterpret_cast<const v8df*>(__avx_##name)

#define VECDI_CONST(name,x) \
ALIGNED(64) static const uint64 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8df& name = *reinterpret_cast<const v8df*>(__avx_##name)

#define VECF_CONST(name,x) \
ALIGNED(64) static const sys_float __avx_##name[16] = {x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x}; \
static const v16sf& name = *reinterpret_cast<const v16sf*>(__avx_##name)

#define VECFI_CONST(name,x) \
ALIGNED(64) static const uint32 __avx_##name[16] = {x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x}; \
static const v16sf& name = *reinterpret_cast<const v16sf*>(__avx_##name)

#define VECI_CONST(name,x) \
ALIGNED(32) static const uint32 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8si& name = *reinterpret_cast<const v8si*>(__avx_##name)

#define VECII_CONST(name,x) \
ALIGNED(64) static const uint32 __avx_##name[16] = {x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x}; \
static const v16si& name = *reinterpret_cast<const v16si*>(__avx_##name)

#define VECL_CONST(name,x) \
ALIGNED(32) static const uint64 __avx_##name[4] = {x,x,x,x}; \
static const v4di& name = *reinterpret_cast<const v4di*>(__avx_##name)

#define VECLL_CONST(name,x) \
ALIGNED(64) static const uint64 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8di& name = *reinterpret_cast<const v8di*>(__avx_##name)

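// For instance (an illustrative expansion, not part of the original source),
// VECD_CONST(half,0.5) creates a 64-byte aligned backing array and a packed
// reference "half" that replicates the value across all eight lanes:
//
//   ALIGNED(64) static const double __avx_half[8] = {0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5};
//   static const v8df& half = *reinterpret_cast<const v8df*>(__avx_half);
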
union vpack
{
    ALIGNED(64) v8df pd;
    ALIGNED(64) v8di pl;
    ALIGNED(64) v16sf pf;
    ALIGNED(64) v16si pi;
    ALIGNED(64) double d[8];
    ALIGNED(64) int64 l[8];
    ALIGNED(64) sys_float f[16];
    ALIGNED(64) int32 i[16];
};

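// A minimal sketch of how vpack is used throughout this header: assign a
// packed register to the matching member, then read individual lanes back
// through the overlapping array member.
//
//   vpack p;
//   p.pd = x;                            // x is a v8df
//   double lane0 = p.d[0], lane7 = p.d[7];
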
inline bool getmaskbit(unsigned int mask, unsigned int n)
{
    return ( ((mask>>n)&1) == 1 );
}

template<class T>
inline string print_pack(const T x[], int npack, unsigned int mask, const string& name)
{
    ostringstream oss;
    for( int i=0; i < npack; ++i )
    {
        // lanes flagged in the mask are marked with an arrow
        if( getmaskbit(mask, i) )
            oss << " => ";
        else
            oss << "    ";
        oss << name << "[" << i << "] = " << x[i] << "\n";
    }
    return oss.str();
}

inline string DEMsg(const string& routine, v8df x, __mmask8 mask)
{
    vpack p;
    ostringstream oss;
    oss << "domain error in " << routine << ".\n";
    p.pd = x;
    oss << print_pack(p.d, 8, mask, "x");
    return oss.str();
}

inline string DEMsg(const string& routine, v16sf x, __mmask16 mask)
{
    vpack p;
    ostringstream oss;
    oss << "domain error in " << routine << ".\n";
    p.pf = x;
    oss << print_pack(p.f, 16, mask, "x");
    return oss.str();
}

inline string DEMsg(const string& routine, v8df x, __mmask8 mask1, v8df y, __mmask8 mask2)
{
    vpack p;
    ostringstream oss;
    oss << "domain error in " << routine << ".\n";
    p.pd = x;
    oss << print_pack(p.d, 8, mask1, "x");
    p.pd = y;
    oss << print_pack(p.d, 8, mask2, "y");
    return oss.str();
}

inline string DEMsg(const string& routine, v16sf x, __mmask16 mask1, v16sf y, __mmask16 mask2)
{
    vpack p;
    ostringstream oss;
    oss << "domain error in " << routine << ".\n";
    p.pf = x;
    oss << print_pack(p.f, 16, mask1, "x");
    p.pf = y;
    oss << print_pack(p.f, 16, mask2, "y");
    return oss.str();
}

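// Sketch of the intended call pattern (illustrative only; the v1logd name and
// the exception type follow the conventions of the companion vectorize_*
// headers, not this file):
//
//   __mmask8 bad = _mm512_cmp_pd_mask(x, _mm512_setzero_pd(), _CMP_LE_OQ);
//   if( bad != 0 )
//       throw domain_error( DEMsg("v1logd", x, bad) );
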
#else // __AVX512F__

#define VECD_CONST(name,x) \
ALIGNED(32) static const double __avx_##name[4] = {x,x,x,x}; \
static const v4df& name = *reinterpret_cast<const v4df*>(__avx_##name)

#define VECDI_CONST(name,x) \
ALIGNED(32) static const uint64 __avx_##name[4] = {x,x,x,x}; \
static const v4df& name = *reinterpret_cast<const v4df*>(__avx_##name)

#define VECF_CONST(name,x) \
ALIGNED(32) static const sys_float __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8sf& name = *reinterpret_cast<const v8sf*>(__avx_##name)

#define VECFI_CONST(name,x) \
ALIGNED(32) static const uint32 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8sf& name = *reinterpret_cast<const v8sf*>(__avx_##name)

#define VECI_CONST(name,x) \
ALIGNED(16) static const uint32 __avx_##name[4] = {x,x,x,x}; \
static const v4si& name = *reinterpret_cast<const v4si*>(__avx_##name)

#define VECII_CONST(name,x) \
ALIGNED(32) static const uint32 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8si& name = *reinterpret_cast<const v8si*>(__avx_##name)

#define VECL_CONST(name,x) \
ALIGNED(16) static const uint64 __avx_##name[2] = {x,x}; \
static const v2di& name = *reinterpret_cast<const v2di*>(__avx_##name)

#define VECLL_CONST(name,x) \
ALIGNED(32) static const uint64 __avx_##name[4] = {x,x,x,x}; \
static const v4di& name = *reinterpret_cast<const v4di*>(__avx_##name)

union vpack
{
    ALIGNED(32) v4df pd;
    ALIGNED(32) v4di pl;
    ALIGNED(32) v8sf pf;
    ALIGNED(32) v8si pi;
    ALIGNED(32) double d[4];
    ALIGNED(32) int64 l[4];
    ALIGNED(32) sys_float f[8];
    ALIGNED(32) int32 i[8];
};

// without AVX512F there are no mask registers: a mask is an ordinary packed
// vector in which a flagged lane has all bits set, so testing the sign bit of
// the lane reinterpreted as a signed integer (m[i] < 0) is sufficient
template<class T, class U>
inline string print_pack(const T x[], const U m[], int npack, const string& name)
{
    ostringstream oss;
    for( int i=0; i < npack; ++i )
    {
        if( m[i] < 0 )
            oss << " => ";
        else
            oss << "    ";
        oss << name << "[" << i << "] = " << x[i] << "\n";
    }
    return oss.str();
}

inline string DEMsg(const string& routine, v4df x, v4df mask)
{
    vpack p, m;
    ostringstream oss;
    oss << "domain error in " << routine << ".\n";
    p.pd = x;
    m.pd = mask;
    oss << print_pack(p.d, m.l, 4, "x");
    return oss.str();
}

inline string DEMsg(const string& routine, v8sf x, v8sf mask)
{
    vpack p, m;
    ostringstream oss;
    oss << "domain error in " << routine << ".\n";
    p.pf = x;
    m.pf = mask;
    oss << print_pack(p.f, m.i, 8, "x");
    return oss.str();
}

inline string DEMsg(const string& routine, v4df x, v4df mask1, v4df y, v4df mask2)
{
    vpack p, m;
    ostringstream oss;
    oss << "domain error in " << routine << ".\n";
    p.pd = x;
    m.pd = mask1;
    oss << print_pack(p.d, m.l, 4, "x");
    p.pd = y;
    m.pd = mask2;
    oss << print_pack(p.d, m.l, 4, "y");
    return oss.str();
}

inline string DEMsg(const string& routine, v8sf x, v8sf mask1, v8sf y, v8sf mask2)
{
    vpack p, m;
    ostringstream oss;
    oss << "domain error in " << routine << ".\n";
    p.pf = x;
    m.pf = mask1;
    oss << print_pack(p.f, m.i, 8, "x");
    p.pf = y;
    m.pf = mask2;
    oss << print_pack(p.f, m.i, 8, "y");
    return oss.str();
}

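// Sketch of the intended call pattern in this branch (again illustrative):
// AVX/AVX2 comparison intrinsics return all-ones lanes for "true", which
// DEMsg accepts directly as the mask argument.
//
//   v4df bad = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_LE_OQ);
//   if( _mm256_movemask_pd(bad) != 0 )
//       throw domain_error( DEMsg("v1logd", x, bad) );
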
#endif // __AVX512F__

VECD_CONST(mthree,-3.);
VECD_CONST(mone,-1.);
VECD_CONST(mhalf,-0.5);
VECD_CONST(zero,0.);
VECDI_CONST(third,0x3fd5555555555555); // 0.33333333333333333333
VECD_CONST(half,0.5);
VECD_CONST(one,1.);
VECD_CONST(c1p5,1.5);
VECD_CONST(two,2.);
VECD_CONST(three,3.);
VECD_CONST(six,6.);
VECD_CONST(ten,10.);
VECD_CONST(c1023,1023.);

VECDI_CONST(ln_two,0x3fe62e42fefa39ef); // log(2.)
VECDI_CONST(log2e,0x3ff71547652b82fe); // log2(e) = 1./log(2.)
VECDI_CONST(ln_ten,0x40026bb1bbb55516); // log(10.)
VECDI_CONST(log10e,0x3fdbcb7b1526e50e); // log10(e) = 1./log(10.)

VECDI_CONST(dbl_min,0x0010000000000000); // DBL_MIN
VECDI_CONST(dbl_max,0x7fefffffffffffff); // DBL_MAX
VECDI_CONST(sqrt_dbl_max,0x5fefffffffffffff); // sqrt(DBL_MAX)

VECF_CONST(mthreef,-3.f);
VECF_CONST(monef,-1.f);
VECF_CONST(mhalff,-0.5f);
VECF_CONST(zerof,0.f);
VECFI_CONST(thirdf,0x3eaaaaab); // 0.3333333333f
VECF_CONST(halff,0.5f);
VECF_CONST(onef,1.f);
VECF_CONST(twof,2.f);
VECF_CONST(c1p5f,1.5f);
VECF_CONST(threef,3.f);
VECF_CONST(sixf,6.f);
VECF_CONST(tenf,10.f);
VECF_CONST(c127,127.f);

VECFI_CONST(ln_twof,0x3f317218); // logf(2.f)
VECFI_CONST(log2ef,0x3fb8aa3b); // log2f(e) = 1.f/logf(2.f)
VECFI_CONST(ln_tenf,0x40135d8e); // logf(10.f)
VECFI_CONST(log10ef,0x3ede5bd9); // log10f(e) = 1.f/logf(10.f)

VECFI_CONST(flt_min,0x00800000); // FLT_MIN
VECFI_CONST(flt_max,0x7f7fffff); // FLT_MAX
VECFI_CONST(sqrt_flt_max,0x5f7fffff); // sqrt(FLT_MAX)

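// The hex literals above encode exact IEEE bit patterns. A self-contained way
// to check one of them (a sketch, not part of the original header):
//
//   uint64 bits = 0x3fd5555555555555;   // the pattern behind "third"
//   double val;
//   memcpy(&val, &bits, sizeof(val));   // val == 0.33333333333333333...
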
// evaluate a partial pack: pad the unused lanes by replicating the last valid
// element, call the vectorized routine once, then copy back only [nlo,nhi)
template<class V>
inline void vecfun_partial(const sys_float x[], sys_float y[], long nlo, long nhi, long npack, V (*vecfun1)(V))
{
    vpack xx, yy;
    for( long i=0; i < npack; ++i )
        xx.f[i] = x[min(nlo+i,nhi-1)];
    yy.pf = vecfun1(xx.pf);
    for( long i=nlo; i < nhi; ++i )
        y[i] = yy.f[i-nlo];
}

template<class V>
inline void vecfun_partial(const double x[], double y[], long nlo, long nhi, long npack, V (*vecfun1)(V))
{
    vpack xx, yy;
    for( long i=0; i < npack; ++i )
        xx.d[i] = x[min(nlo+i,nhi-1)];
    yy.pd = vecfun1(xx.pd);
    for( long i=nlo; i < nhi; ++i )
        y[i] = yy.d[i-nlo];
}

// same padding scheme for functions of two arguments
template<class V>
inline void vecfun2_partial(const sys_float x1[], const sys_float x2[], sys_float y[], long nlo, long nhi,
                            long npack, V (*vecfun1)(V, V))
{
    vpack xx1, xx2, yy;
    for( long i=0; i < npack; ++i )
    {
        xx1.f[i] = x1[min(nlo+i,nhi-1)];
        xx2.f[i] = x2[min(nlo+i,nhi-1)];
    }
    yy.pf = vecfun1(xx1.pf, xx2.pf);
    for( long i=nlo; i < nhi; ++i )
        y[i] = yy.f[i-nlo];
}

template<class V>
inline void vecfun2_partial(const double x1[], const double x2[], double y[], long nlo, long nhi, long npack,
                            V (*vecfun1)(V, V))
{
    vpack xx1, xx2, yy;
    for( long i=0; i < npack; ++i )
    {
        xx1.d[i] = x1[min(nlo+i,nhi-1)];
        xx2.d[i] = x2[min(nlo+i,nhi-1)];
    }
    yy.pd = vecfun1(xx1.pd, xx2.pd);
    for( long i=nlo; i < nhi; ++i )
        y[i] = yy.d[i-nlo];
}

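// Worked sketch with hypothetical numbers: for npack = 4 (a 256-bit pack of
// doubles), nlo = 5, nhi = 7, the input pack is loaded as
// { x[5], x[6], x[6], x[6] } -- the clamped index min(nlo+i,nhi-1) fills the
// unused lanes with safe duplicates, and only y[5] and y[6] are written back.
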
// this is the generic wrapper routine for all vectorized math functions
// it makes sure that the vectorized function gets arguments that are properly aligned
// scalfun1 is the normal scalar routine working on a single argument
// (it is only called by the non-AVX fallback version further down)
// vecfun1 is the vectorized routine working on a packed register
template<class T, class V>
inline void vecfun(const T x[], T y[], long nlo, long nhi, T (*)(T), V (*vecfun1)(V))
{
    if( nhi <= nlo )
        return;

    // determine number of elements of type T in a packed register
    long npack = sizeof(V)/sizeof(T);
    if( nhi-nlo < npack )
    {
        vecfun_partial(x, y, nlo, nhi, npack, vecfun1);
        return;
    }
    long i, ilo = nlo;
    // offsets (in elements) of the first active element into a packed register
    long ox = (reinterpret_cast<long>(x)/sizeof(T)+nlo)%npack;
    long oy = (reinterpret_cast<long>(y)/sizeof(T)+nlo)%npack;
    bool lgNonAligned = ( ox != oy );
    T *yl, *ylocal = NULL;
    if( lgNonAligned )
    {
        // x and y have different alignment -> compute the result in a scratch
        // array whose alignment matches x, and copy it into y at the end
        ylocal = new T[nhi+npack-1];
        long ol = (reinterpret_cast<long>(ylocal)/sizeof(T)+nlo)%npack;
        if( ox >= ol )
            yl = &ylocal[ox-ol];
        else
            yl = &ylocal[ox+npack-ol];
    }
    else
    {
        yl = y;
    }
    // the initial element is not aligned correctly ->
    // process a partial pack until we are correctly aligned
    if( ox > 0 )
    {
        vecfun_partial(x, yl, ilo, min(ilo+npack-ox,nhi), npack, vecfun1);
        ilo = min(ilo+npack-ox,nhi);
    }
    // use the vectorized routine as long as there are at least npack evaluations left
    for( i=ilo; i < nhi-npack+1; i += npack )
    {
        const V& xx = *reinterpret_cast<const V*>(&x[i]);
        V& yy = *reinterpret_cast<V*>(&yl[i]);
        yy = vecfun1(xx);
    }
    ilo = i;
    // use the partial routine again for the remaining calls (if any)...
    if( ilo < nhi )
        vecfun_partial(x, yl, ilo, nhi, npack, vecfun1);
    if( lgNonAligned )
    {
        for( i=nlo; i < nhi; ++i )
            y[i] = yl[i];
        delete[] ylocal;
    }
}

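// Minimal usage sketch (my_sqr and v1sqrd are illustrative stand-ins; real
// callers pair a libm routine such as exp with its vectorized v1expd variant
// supplied by a companion header):
//
//   static double my_sqr(double x) { return x*x; }
//   static v4df v1sqrd(v4df x) { return _mm256_mul_pd(x, x); }   // AVX pack of 4
//   ...
//   vecfun(x, y, nlo, nhi, my_sqr, v1sqrd);   // y[i] = x[i]*x[i] on [nlo,nhi)
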
// two-argument counterpart of vecfun; x2 is copied into a scratch array if its
// alignment differs from x1, and the output alignment is keyed to x1 as well
template<class T, class V>
inline void vecfun2(const T x1[], const T x2[], T y[], long nlo, long nhi, T (*)(T, T), V (*vecfun1)(V, V))
{
    if( nhi <= nlo )
        return;

    // determine number of elements of type T in a packed register
    long npack = sizeof(V)/sizeof(T);
    if( nhi-nlo < npack )
    {
        vecfun2_partial(x1, x2, y, nlo, nhi, npack, vecfun1);
        return;
    }
    long i, ilo = nlo;
    long ox1 = (reinterpret_cast<long>(x1)/sizeof(T)+nlo)%npack;
    long ox2 = (reinterpret_cast<long>(x2)/sizeof(T)+nlo)%npack;
    long oy = (reinterpret_cast<long>(y)/sizeof(T)+nlo)%npack;
    bool lgNonAligned1 = ( ox1 != ox2 );
    bool lgNonAligned2 = ( ox1 != oy );
    const T *x2l;
    T *x2local = NULL;
    if( lgNonAligned1 )
    {
        x2local = new T[nhi+npack-1];
        long ol = (reinterpret_cast<long>(x2local)/sizeof(T)+nlo)%npack;
        T *ptr;
        if( ox1 >= ol )
            ptr = &x2local[ox1-ol];
        else
            ptr = &x2local[ox1+npack-ol];
        memcpy(ptr+nlo, x2+nlo, size_t((nhi-nlo)*sizeof(T)));
        x2l = ptr;
    }
    else
    {
        x2l = x2;
    }
    T *yl, *ylocal = NULL;
    if( lgNonAligned2 )
    {
        ylocal = new T[nhi+npack-1];
        long ol = (reinterpret_cast<long>(ylocal)/sizeof(T)+nlo)%npack;
        if( ox1 >= ol )
            yl = &ylocal[ox1-ol];
        else
            yl = &ylocal[ox1+npack-ol];
    }
    else
    {
        yl = y;
    }
    // the initial element is not aligned correctly ->
    // process a partial pack until we are correctly aligned
    if( ox1 > 0 )
    {
        vecfun2_partial(x1, x2l, yl, ilo, min(ilo+npack-ox1,nhi), npack, vecfun1);
        ilo = min(ilo+npack-ox1,nhi);
    }
    // use the vectorized routine as long as there are at least npack evaluations left
    for( i=ilo; i < nhi-npack+1; i += npack )
    {
        const V& xx1 = *reinterpret_cast<const V*>(&x1[i]);
        const V& xx2 = *reinterpret_cast<const V*>(&x2l[i]);
        V& yy = *reinterpret_cast<V*>(&yl[i]);
        yy = vecfun1(xx1, xx2);
    }
    ilo = i;
    // use the partial routine again for the remaining calls (if any)...
    if( ilo < nhi )
        vecfun2_partial(x1, x2l, yl, ilo, nhi, npack, vecfun1);
    if( lgNonAligned1 )
        delete[] x2local;
    if( lgNonAligned2 )
    {
        for( i=nlo; i < nhi; ++i )
            y[i] = yl[i];
        delete[] ylocal;
    }
}

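// The two-argument wrapper follows the same pattern (again an illustrative
// pairing, not a routine defined in this header):
//
//   static double my_mul(double x, double y) { return x*y; }
//   static v4df v1muld(v4df x, v4df y) { return _mm256_mul_pd(x, y); }
//   ...
//   vecfun2(x1, x2, y, nlo, nhi, my_mul, v1muld);   // y[i] = x1[i]*x2[i]
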
// convenience macros that evaluate a vectorized routine on a short, fixed list
// of scalar arguments x0, x1, ...; FUN is the base name of the routine (the
// expansion calls v1<FUN>d or v1<FUN>f) and V is a safe dummy value used to
// pad the lanes of the packed register that carry no argument
#ifdef __AVX512F__
#define V1FUN_PD_4(FUN, V) \
    v8df xx = _mm512_set_pd( V, V, V, V, x3, x2, x1, x0 ); \
    v8df yy = v1##FUN##d(xx); \
    memcpy(y, &yy, 4*sizeof(double))
#else
#define V1FUN_PD_4(FUN, V) \
    v4df xx = _mm256_set_pd( x3, x2, x1, x0 ); \
    v4df yy = v1##FUN##d(xx); \
    memcpy(y, &yy, 4*sizeof(double))
#endif

#ifdef __AVX512F__
#define V1FUN_PD_8(FUN, V) \
    v8df xx = _mm512_set_pd( x7, x6, x5, x4, x3, x2, x1, x0 ); \
    v8df yy = v1##FUN##d(xx); \
    memcpy(y, &yy, 8*sizeof(double))
#else
#define V1FUN_PD_8(FUN, V) \
    v4df yy[2]; \
    v4df xx = _mm256_set_pd( x3, x2, x1, x0 ); \
    yy[0] = v1##FUN##d(xx); \
    xx = _mm256_set_pd( x7, x6, x5, x4 ); \
    yy[1] = v1##FUN##d(xx); \
    memcpy(y, &yy, 8*sizeof(double))
#endif

#ifdef __AVX512F__
#define V1FUN_PS_4(FUN, V) \
    v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, V, V, V, V, x3, x2, x1, x0 ); \
    v16sf yy = v1##FUN##f(xx); \
    memcpy(y, &yy, 4*sizeof(sys_float))
#else
#define V1FUN_PS_4(FUN, V) \
    v8sf xx = _mm256_set_ps( V, V, V, V, x3, x2, x1, x0 ); \
    v8sf yy = v1##FUN##f(xx); \
    memcpy(y, &yy, 4*sizeof(sys_float))
#endif

#ifdef __AVX512F__
#define V1FUN_PS_8(FUN, V) \
    v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, x7, x6, x5, x4, x3, x2, x1, x0 ); \
    v16sf yy = v1##FUN##f(xx); \
    memcpy(y, &yy, 8*sizeof(sys_float))
#else
#define V1FUN_PS_8(FUN, V) \
    v8sf xx = _mm256_set_ps( x7, x6, x5, x4, x3, x2, x1, x0 ); \
    v8sf yy = v1##FUN##f(xx); \
    memcpy(y, &yy, 8*sizeof(sys_float))
#endif

#ifdef __AVX512F__
#define V1FUN_PS_16(FUN, V) \
    v16sf xx = _mm512_set_ps( x15, x14, x13, x12, x11, x10, x9, x8, x7, x6, x5, x4, x3, x2, x1, x0 ); \
    v16sf yy = v1##FUN##f(xx); \
    memcpy(y, &yy, 16*sizeof(sys_float))
#else
#define V1FUN_PS_16(FUN, V) \
    v8sf yy[2]; \
    v8sf xx = _mm256_set_ps( x7, x6, x5, x4, x3, x2, x1, x0 ); \
    yy[0] = v1##FUN##f(xx); \
    xx = _mm256_set_ps( x15, x14, x13, x12, x11, x10, x9, x8 ); \
    yy[1] = v1##FUN##f(xx); \
    memcpy(y, &yy, 16*sizeof(sys_float))
#endif

#ifdef __AVX512F__
#define V1FUN2_PD_4(FUN, V) \
    v8df xx = _mm512_set_pd( V, V, V, V, x3, x2, x1, x0 ); \
    v8df yy = _mm512_set_pd( V, V, V, V, y3, y2, y1, y0 ); \
    v8df zz = v1##FUN##d(xx, yy); \
    memcpy(z, &zz, 4*sizeof(double))
#else
#define V1FUN2_PD_4(FUN, V) \
    v4df xx = _mm256_set_pd( x3, x2, x1, x0 ); \
    v4df yy = _mm256_set_pd( y3, y2, y1, y0 ); \
    v4df zz = v1##FUN##d(xx, yy); \
    memcpy(z, &zz, 4*sizeof(double))
#endif

#ifdef __AVX512F__
#define V1FUN2_PS_4(FUN, V) \
    v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, V, V, V, V, x3, x2, x1, x0 ); \
    v16sf yy = _mm512_set_ps( V, V, V, V, V, V, V, V, V, V, V, V, y3, y2, y1, y0 ); \
    v16sf zz = v1##FUN##f(xx, yy); \
    memcpy(z, &zz, 4*sizeof(sys_float))
#else
#define V1FUN2_PS_4(FUN, V) \
    v8sf xx = _mm256_set_ps( V, V, V, V, x3, x2, x1, x0 ); \
    v8sf yy = _mm256_set_ps( V, V, V, V, y3, y2, y1, y0 ); \
    v8sf zz = v1##FUN##f(xx, yy); \
    memcpy(z, &zz, 4*sizeof(sys_float))
#endif

#ifdef __AVX512F__
#define V1FUN2_PS_8(FUN, V) \
    v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, x7, x6, x5, x4, x3, x2, x1, x0 ); \
    v16sf yy = _mm512_set_ps( V, V, V, V, V, V, V, V, y7, y6, y5, y4, y3, y2, y1, y0 ); \
    v16sf zz = v1##FUN##f(xx, yy); \
    memcpy(z, &zz, 8*sizeof(sys_float))
#else
#define V1FUN2_PS_8(FUN, V) \
    v8sf xx = _mm256_set_ps( x7, x6, x5, x4, x3, x2, x1, x0 ); \
    v8sf yy = _mm256_set_ps( y7, y6, y5, y4, y3, y2, y1, y0 ); \
    v8sf zz = v1##FUN##f(xx, yy); \
    memcpy(z, &zz, 8*sizeof(sys_float))
#endif

#else // __AVX__

// fallback for non-AVX capable hardware
template<class T, class V>
inline void vecfun(const T x[], T y[], long nlo, long nhi, T (*scalfun1)(T), V (*)(V))
{
    for( long i=nlo; i < nhi; ++i )
        y[i] = scalfun1(x[i]);
}

template<class T, class V>
inline void vecfun2(const T x1[], const T x2[], T y[], long nlo, long nhi, T (*scalfun1)(T, T), V (*)(V, V))
{
    for( long i=nlo; i < nhi; ++i )
        y[i] = scalfun1(x1[i], x2[i]);
}

#define V1FUN_4(FUN, V) \
    y[0] = FUN(x0); \
    y[1] = FUN(x1); \
    y[2] = FUN(x2); \
    y[3] = FUN(x3)

#define V1FUN_8(FUN, V) \
    y[0] = FUN(x0); \
    y[1] = FUN(x1); \
    y[2] = FUN(x2); \
    y[3] = FUN(x3); \
    y[4] = FUN(x4); \
    y[5] = FUN(x5); \
    y[6] = FUN(x6); \
    y[7] = FUN(x7)

#define V1FUN_16(FUN, V) \
    y[0] = FUN(x0); \
    y[1] = FUN(x1); \
    y[2] = FUN(x2); \
    y[3] = FUN(x3); \
    y[4] = FUN(x4); \
    y[5] = FUN(x5); \
    y[6] = FUN(x6); \
    y[7] = FUN(x7); \
    y[8] = FUN(x8); \
    y[9] = FUN(x9); \
    y[10] = FUN(x10); \
    y[11] = FUN(x11); \
    y[12] = FUN(x12); \
    y[13] = FUN(x13); \
    y[14] = FUN(x14); \
    y[15] = FUN(x15)

#define V1FUN_PD_4(FUN, V) \
    V1FUN_4(FUN, V)

#define V1FUN_PD_8(FUN, V) \
    V1FUN_8(FUN, V)

#define V1FUN_PS_4(FUN, V) \
    V1FUN_4(FUN##f, V)

#define V1FUN_PS_8(FUN, V) \
    V1FUN_8(FUN##f, V)

#define V1FUN_PS_16(FUN, V) \
    V1FUN_16(FUN##f, V)

#define V1FUN2_4(FUN, V) \
    z[0] = FUN(x0, y0); \
    z[1] = FUN(x1, y1); \
    z[2] = FUN(x2, y2); \
    z[3] = FUN(x3, y3)

#define V1FUN2_8(FUN, V) \
    z[0] = FUN(x0, y0); \
    z[1] = FUN(x1, y1); \
    z[2] = FUN(x2, y2); \
    z[3] = FUN(x3, y3); \
    z[4] = FUN(x4, y4); \
    z[5] = FUN(x5, y5); \
    z[6] = FUN(x6, y6); \
    z[7] = FUN(x7, y7)

#define V1FUN2_PD_4(FUN, V) \
    V1FUN2_4(FUN, V)

#define V1FUN2_PS_4(FUN, V) \
    V1FUN2_4(FUN##f, V)

#define V1FUN2_PS_8(FUN, V) \
    V1FUN2_8(FUN##f, V)

#endif // __AVX__

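// Typical expansion context for the V1FUN_* macros (an illustrative sketch;
// the concrete wrappers live in the companion vectorize_* headers). On AVX
// hardware the macro calls the packed v1expd routine; on other hardware the
// fallback definition above calls scalar exp() lane by lane:
//
//   inline void vexp4(double x0, double x1, double x2, double x3, double y[])
//   {
//       V1FUN_PD_4(exp, 0.);
//   }
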
#endif // VECTORIZE_MATH_H