vectorize_math.h
/* This file is part of Cloudy and is copyright (C)1978-2022 by Gary J. Ferland and
 * others. For conditions of distribution and use see copyright notice in license.txt */

#ifndef VECTORIZE_MATH_H
#define VECTORIZE_MATH_H

// uint64 is required for AVX support -- uint64 should be present on all modern systems
#ifndef HAVE_UINT64
#undef __AVX__
#endif

#ifdef __AVX__

// use shorthand for packed types
typedef __m128d v2df;
typedef __m128 v4sf;
typedef __m128i v2di;
typedef __m128i v4si;

typedef __m256d v4df;
typedef __m256 v8sf;
typedef __m256i v4di;
typedef __m256i v8si;

#ifdef __AVX512F__

typedef __m512d v8df;
typedef __m512 v16sf;
typedef __m512i v8di;
typedef __m512i v16si;

// some macros for defining vector constants

#define VECD_CONST(name,x) \
ALIGNED(64) static const double __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8df& name = *reinterpret_cast<const v8df*>(__avx_##name)

#define VECDI_CONST(name,x) \
ALIGNED(64) static const uint64 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8df& name = *reinterpret_cast<const v8df*>(__avx_##name)

#define VECF_CONST(name,x) \
ALIGNED(64) static const sys_float __avx_##name[16] = {x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x}; \
static const v16sf& name = *reinterpret_cast<const v16sf*>(__avx_##name)

#define VECFI_CONST(name,x) \
ALIGNED(64) static const uint32 __avx_##name[16] = {x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x}; \
static const v16sf& name = *reinterpret_cast<const v16sf*>(__avx_##name)

#define VECI_CONST(name,x) \
ALIGNED(32) static const uint32 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8si& name = *reinterpret_cast<const v8si*>(__avx_##name)

#define VECII_CONST(name,x) \
ALIGNED(64) static const uint32 __avx_##name[16] = {x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x}; \
static const v16si& name = *reinterpret_cast<const v16si*>(__avx_##name)

#define VECL_CONST(name,x) \
ALIGNED(32) static const uint64 __avx_##name[4] = {x,x,x,x}; \
static const v4di& name = *reinterpret_cast<const v4di*>(__avx_##name)

#define VECLL_CONST(name,x) \
ALIGNED(64) static const uint64 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8di& name = *reinterpret_cast<const v8di*>(__avx_##name)
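
// Illustrative note (not part of the original header): each macro defines an
// aligned constant array plus a packed-register reference bound to it. For
// example, VECD_CONST(half,0.5); expands in this AVX-512 branch to
//
//     ALIGNED(64) static const double __avx_half[8] = {0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5};
//     static const v8df& half = *reinterpret_cast<const v8df*>(__avx_half);
//
// The *I_CONST variants take the raw IEEE-754 bit pattern instead of a literal,
// which pins irrational constants such as 1/3 or log(2) to their exactly
// rounded representation.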

union vpack
{
    ALIGNED(64) v8df pd;
    ALIGNED(64) v16sf ps;
    ALIGNED(64) double d[8];
    ALIGNED(64) int64 l[8];
    ALIGNED(64) sys_float f[16];
    ALIGNED(64) int32 i[16];
};

inline bool getmaskbit(unsigned int mask, unsigned int n)
{
    return ( ((mask>>n)&1) == 1 );
}
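
// Example: for mask = 0x5 (0b101), getmaskbit(mask,0) and getmaskbit(mask,2)
// are true, while getmaskbit(mask,1) is false.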

template<class T>
inline string print_pack(const T x[], int npack, unsigned int mask, const string& name)
{
    ostringstream oss;
    for( int i=0; i < npack; ++i )
    {
        if( getmaskbit(mask, i) )
            oss << " => ";
        else
            oss << "    ";
        oss << name << "[" << i << "] = " << x[i] << "\n";
    }
    return oss.str();
}
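
// Schematic output for npack=2 with mask=0b10 (lane 1 flagged):
//
//         x[0] = 1.5
//      => x[1] = -1
//
// The "=>" marker points at the lanes that triggered the domain error.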

inline string DEMsg(const string& routine, v8df x, __mmask8 mask)
{
    vpack p;
    ostringstream oss;
    oss << "domain error in " << routine << ".\n";
    p.pd = x;
    oss << print_pack(p.d, 8, mask, "x");
    return oss.str();
}

inline string DEMsg(const string& routine, v16sf x, __mmask16 mask)
{
    vpack p;
    ostringstream oss;
    oss << "domain error in " << routine << ".\n";
    p.ps = x;
    oss << print_pack(p.f, 16, mask, "x");
    return oss.str();
}

inline string DEMsg(const string& routine, v8df x, __mmask8 mask1, v8df y, __mmask8 mask2)
{
    vpack p;
    ostringstream oss;
    oss << "domain error in " << routine << ".\n";
    p.pd = x;
    oss << print_pack(p.d, 8, mask1, "x");
    p.pd = y;
    oss << print_pack(p.d, 8, mask2, "y");
    return oss.str();
}

inline string DEMsg(const string& routine, v16sf x, __mmask16 mask1, v16sf y, __mmask16 mask2)
{
    vpack p;
    ostringstream oss;
    oss << "domain error in " << routine << ".\n";
    p.ps = x;
    oss << print_pack(p.f, 16, mask1, "x");
    p.ps = y;
    oss << print_pack(p.f, 16, mask2, "y");
    return oss.str();
}
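
// Hedged usage sketch (not part of this header): a vectorized routine can pass
// the AVX-512 comparison mask that detected the bad lanes straight to DEMsg.
// v1logd() is a hypothetical routine name; _mm512_cmp_pd_mask() is the
// standard AVX-512 comparison intrinsic, and the throw is illustrative only.
//
//     __mmask8 bad = _mm512_cmp_pd_mask(x, zero, _CMP_LE_OQ);
//     if( bad != 0 )
//         throw domain_error( DEMsg("v1logd", x, bad) );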

#else // __AVX512F__

#define VECD_CONST(name,x) \
ALIGNED(32) static const double __avx_##name[4] = {x,x,x,x}; \
static const v4df& name = *reinterpret_cast<const v4df*>(__avx_##name)

#define VECDI_CONST(name,x) \
ALIGNED(32) static const uint64 __avx_##name[4] = {x,x,x,x}; \
static const v4df& name = *reinterpret_cast<const v4df*>(__avx_##name)

#define VECF_CONST(name,x) \
ALIGNED(32) static const sys_float __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8sf& name = *reinterpret_cast<const v8sf*>(__avx_##name)

#define VECFI_CONST(name,x) \
ALIGNED(32) static const uint32 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8sf& name = *reinterpret_cast<const v8sf*>(__avx_##name)

#define VECI_CONST(name,x) \
ALIGNED(16) static const uint32 __avx_##name[4] = {x,x,x,x}; \
static const v4si& name = *reinterpret_cast<const v4si*>(__avx_##name)

#define VECII_CONST(name,x) \
ALIGNED(32) static const uint32 __avx_##name[8] = {x,x,x,x,x,x,x,x}; \
static const v8si& name = *reinterpret_cast<const v8si*>(__avx_##name)

#define VECL_CONST(name,x) \
ALIGNED(16) static const uint64 __avx_##name[2] = {x,x}; \
static const v2di& name = *reinterpret_cast<const v2di*>(__avx_##name)

#define VECLL_CONST(name,x) \
ALIGNED(32) static const uint64 __avx_##name[4] = {x,x,x,x}; \
static const v4di& name = *reinterpret_cast<const v4di*>(__avx_##name)

union vpack
{
    ALIGNED(32) v4df pd;
    ALIGNED(32) v8sf ps;
    ALIGNED(32) double d[4];
    ALIGNED(32) int64 l[4];
    ALIGNED(32) sys_float f[8];
    ALIGNED(32) int32 i[8];
};

template<class T, class U>
inline string print_pack(const T x[], const U m[], int npack, const string& name)
{
    ostringstream oss;
    for( int i=0; i < npack; ++i )
    {
        if( m[i] < 0 )
            oss << " => ";
        else
            oss << "    ";
        oss << name << "[" << i << "] = " << x[i] << "\n";
    }
    return oss.str();
}
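
// Note (illustrative): without AVX-512 mask registers, AVX comparison
// intrinsics such as _mm256_cmp_pd(x, zero, _CMP_LE_OQ) return a vector whose
// flagged lanes hold an all-ones bit pattern. Reinterpreted as signed
// integers those lanes are negative, which is what the m[i] < 0 test above
// detects.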

inline string DEMsg(const string& routine, v4df x, v4df mask)
{
    vpack p, m;
    ostringstream oss;
    oss << "domain error in " << routine << ".\n";
    p.pd = x;
    m.pd = mask;
    oss << print_pack(p.d, m.l, 4, "x");
    return oss.str();
}

inline string DEMsg(const string& routine, v8sf x, v8sf mask)
{
    vpack p, m;
    ostringstream oss;
    oss << "domain error in " << routine << ".\n";
    p.ps = x;
    m.ps = mask;
    oss << print_pack(p.f, m.i, 8, "x");
    return oss.str();
}

inline string DEMsg(const string& routine, v4df x, v4df mask1, v4df y, v4df mask2)
{
    vpack p, m;
    ostringstream oss;
    oss << "domain error in " << routine << ".\n";
    p.pd = x;
    m.pd = mask1;
    oss << print_pack(p.d, m.l, 4, "x");
    p.pd = y;
    m.pd = mask2;
    oss << print_pack(p.d, m.l, 4, "y");
    return oss.str();
}

inline string DEMsg(const string& routine, v8sf x, v8sf mask1, v8sf y, v8sf mask2)
{
    vpack p, m;
    ostringstream oss;
    oss << "domain error in " << routine << ".\n";
    p.ps = x;
    m.ps = mask1;
    oss << print_pack(p.f, m.i, 8, "x");
    p.ps = y;
    m.ps = mask2;
    oss << print_pack(p.f, m.i, 8, "y");
    return oss.str();
}

#endif // __AVX512F__

VECD_CONST(mthree,-3.);
VECD_CONST(mone,-1.);
VECD_CONST(mhalf,-0.5);
VECD_CONST(zero,0.);
VECDI_CONST(third,0x3fd5555555555555); // 0.33333333333333333333
VECD_CONST(half,0.5);
VECD_CONST(one,1.);
VECD_CONST(c1p5,1.5);
VECD_CONST(two,2.);
VECD_CONST(three,3.);
VECD_CONST(six,6.);
VECD_CONST(ten,10.);
VECD_CONST(c1023,1023.);

VECDI_CONST(ln_two,0x3fe62e42fefa39ef); // log(2.)
VECDI_CONST(log2e,0x3ff71547652b82fe); // log2(e) = 1./log(2.)
VECDI_CONST(ln_ten,0x40026bb1bbb55516); // log(10.)
VECDI_CONST(log10e,0x3fdbcb7b1526e50e); // log10(e) = 1./log(10.)

VECDI_CONST(dbl_min,0x0010000000000000); // DBL_MIN
VECDI_CONST(dbl_max,0x7fefffffffffffff); // DBL_MAX
VECDI_CONST(sqrt_dbl_max,0x5fefffffffffffff); // sqrt(DBL_MAX)

VECF_CONST(mthreef,-3.f);
VECF_CONST(monef,-1.f);
VECF_CONST(mhalff,-0.5f);
VECF_CONST(zerof,0.f);
VECFI_CONST(thirdf,0x3eaaaaab); // 0.3333333333f
VECF_CONST(halff,0.5f);
VECF_CONST(onef,1.f);
VECF_CONST(twof,2.f);
VECF_CONST(c1p5f,1.5f);
VECF_CONST(threef,3.f);
VECF_CONST(sixf,6.f);
VECF_CONST(tenf,10.f);
VECF_CONST(c127,127.f);

VECFI_CONST(ln_twof,0x3f317218); // logf(2.f)
VECFI_CONST(log2ef,0x3fb8aa3b); // log2f(e) = 1.f/logf(2.f)
VECFI_CONST(ln_tenf,0x40135d8e); // logf(10.f)
VECFI_CONST(log10ef,0x3ede5bd9); // log10f(e) = 1.f/logf(10.f)

VECFI_CONST(flt_min,0x00800000); // FLT_MIN
VECFI_CONST(flt_max,0x7f7fffff); // FLT_MAX
VECFI_CONST(sqrt_flt_max,0x5f7fffff); // sqrt(FLT_MAX)
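
// Illustrative check (not part of the header): a bit-pattern constant can be
// verified by punning the integer through memcpy, e.g. for ln_two above:
//
//     uint64 bits = 0x3fe62e42fefa39ef;
//     double val;
//     memcpy(&val, &bits, sizeof(val));  // val == 0.69314718055994531 == log(2.)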

template<class V>
inline void vecfun_partial(const sys_float x[], sys_float y[], long nlo, long nhi, long npack, V (*vecfun1)(V))
{
    vpack xx, yy;
    for( long i=0; i < npack; ++i )
        xx.f[i] = x[min(nlo+i,nhi-1)];
    yy.ps = vecfun1(xx.ps);
    for( long i=nlo; i < nhi; ++i )
        y[i] = yy.f[i-nlo];
}

template<class V>
inline void vecfun_partial(const double x[], double y[], long nlo, long nhi, long npack, V (*vecfun1)(V))
{
    vpack xx, yy;
    for( long i=0; i < npack; ++i )
        xx.d[i] = x[min(nlo+i,nhi-1)];
    yy.pd = vecfun1(xx.pd);
    for( long i=nlo; i < nhi; ++i )
        y[i] = yy.d[i-nlo];
}
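
// Note on the *_partial() routines above: when fewer than npack elements
// remain, the gather index min(nlo+i,nhi-1) replicates the last valid element
// into the unused lanes, so the vector routine always receives in-range
// arguments; the surplus results are simply never written back. E.g. with
// nlo=5, nhi=7, npack=4 the input register holds {x[5],x[6],x[6],x[6]} and
// only y[5] and y[6] are stored.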

template<class V>
inline void vecfun2_partial(const sys_float x1[], const sys_float x2[], sys_float y[], long nlo, long nhi,
                            long npack, V (*vecfun1)(V, V))
{
    vpack xx1, xx2, yy;
    for( long i=0; i < npack; ++i )
    {
        xx1.f[i] = x1[min(nlo+i,nhi-1)];
        xx2.f[i] = x2[min(nlo+i,nhi-1)];
    }
    yy.ps = vecfun1(xx1.ps, xx2.ps);
    for( long i=nlo; i < nhi; ++i )
        y[i] = yy.f[i-nlo];
}

template<class V>
inline void vecfun2_partial(const double x1[], const double x2[], double y[], long nlo, long nhi, long npack,
                            V (*vecfun1)(V, V))
{
    vpack xx1, xx2, yy;
    for( long i=0; i < npack; ++i )
    {
        xx1.d[i] = x1[min(nlo+i,nhi-1)];
        xx2.d[i] = x2[min(nlo+i,nhi-1)];
    }
    yy.pd = vecfun1(xx1.pd, xx2.pd);
    for( long i=nlo; i < nhi; ++i )
        y[i] = yy.d[i-nlo];
}

// this is the generic wrapper routine for all vectorized math functions
// it makes sure that the vectorized function gets arguments that are properly aligned
// scalfun1 is the normal scalar routine working on a single argument
// (it is only dispatched to in the non-AVX fallback at the end of this file)
// vecfun1 is the vectorized routine working on a packed register
template<class T, class V>
inline void vecfun(const T x[], T y[], long nlo, long nhi, T (*)(T), V (*vecfun1)(V))
{
    if( nhi <= nlo )
        return;

    // determine number of elements of type T in a packed register
    long npack = sizeof(V)/sizeof(T);
    if( nhi-nlo < npack )
    {
        vecfun_partial(x, y, nlo, nhi, npack, vecfun1);
        return;
    }
    long i, ilo = nlo;
    long ox = (reinterpret_cast<long>(x)/sizeof(T)+nlo)%npack;
    long oy = (reinterpret_cast<long>(y)/sizeof(T)+nlo)%npack;
    bool lgNonAligned = ( ox != oy );
    T *yl, *ylocal = NULL;
    if( lgNonAligned )
    {
        ylocal = new T[nhi+npack-1];
        long ol = (reinterpret_cast<long>(ylocal)/sizeof(T)+nlo)%npack;
        if( ox >= ol )
            yl = &ylocal[ox-ol];
        else
            yl = &ylocal[ox+npack-ol];
    }
    else
    {
        yl = y;
    }
    // the initial elements are not aligned correctly ->
    // use the partial routine until we are correctly aligned
    if( ox > 0 )
    {
        vecfun_partial(x, yl, ilo, min(ilo+npack-ox,nhi), npack, vecfun1);
        ilo = min(ilo+npack-ox,nhi);
    }
    // use the vectorized routine as long as there are at least npack evaluations left
    for( i=ilo; i < nhi-npack+1; i += npack )
    {
        const V& xx = *reinterpret_cast<const V*>(&x[i]);
        V& yy = *reinterpret_cast<V*>(&yl[i]);
        yy = vecfun1(xx);
    }
    ilo = i;
    // use the partial routine again for the remaining calls (if any)...
    if( ilo < nhi )
        vecfun_partial(x, yl, ilo, nhi, npack, vecfun1);
    if( lgNonAligned )
    {
        for( i=nlo; i < nhi; ++i )
            y[i] = yl[i];
        delete[] ylocal;
    }
}
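
// Hedged usage sketch (v1expd is a hypothetical vectorized exponential; the
// scalar pointer is only used by the non-AVX fallback further down):
//
//     double x[100], y[100];
//     // ... fill x ...
//     vecfun( x, y, 0, 100, exp, v1expd );  // y[i] = exp(x[i]) for 0 <= i < 100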

template<class T, class V>
inline void vecfun2(const T x1[], const T x2[], T y[], long nlo, long nhi, T (*)(T, T), V (*vecfun1)(V, V))
{
    if( nhi <= nlo )
        return;

    // determine number of elements of type T in a packed register
    long npack = sizeof(V)/sizeof(T);
    if( nhi-nlo < npack )
    {
        vecfun2_partial(x1, x2, y, nlo, nhi, npack, vecfun1);
        return;
    }
    long i, ilo = nlo;
    long ox1 = (reinterpret_cast<long>(x1)/sizeof(T)+nlo)%npack;
    long ox2 = (reinterpret_cast<long>(x2)/sizeof(T)+nlo)%npack;
    long oy = (reinterpret_cast<long>(y)/sizeof(T)+nlo)%npack;
    bool lgNonAligned1 = ( ox1 != ox2 );
    bool lgNonAligned2 = ( ox1 != oy );
    const T *x2l;
    T *x2local = NULL;
    if( lgNonAligned1 )
    {
        x2local = new T[nhi+npack-1];
        long ol = (reinterpret_cast<long>(x2local)/sizeof(T)+nlo)%npack;
        T *ptr;
        if( ox1 >= ol )
            ptr = &x2local[ox1-ol];
        else
            ptr = &x2local[ox1+npack-ol];
        memcpy(ptr+nlo, x2+nlo, size_t((nhi-nlo)*sizeof(T)));
        x2l = ptr;
    }
    else
    {
        x2l = x2;
    }
    T *yl, *ylocal = NULL;
    if( lgNonAligned2 )
    {
        ylocal = new T[nhi+npack-1];
        long ol = (reinterpret_cast<long>(ylocal)/sizeof(T)+nlo)%npack;
        if( ox1 >= ol )
            yl = &ylocal[ox1-ol];
        else
            yl = &ylocal[ox1+npack-ol];
    }
    else
    {
        yl = y;
    }
    // the initial elements are not aligned correctly ->
    // use the partial routine until we are correctly aligned
    if( ox1 > 0 )
    {
        vecfun2_partial(x1, x2l, yl, ilo, min(ilo+npack-ox1,nhi), npack, vecfun1);
        ilo = min(ilo+npack-ox1,nhi);
    }
    // use the vectorized routine as long as there are at least npack evaluations left
    for( i=ilo; i < nhi-npack+1; i += npack )
    {
        const V& xx1 = *reinterpret_cast<const V*>(&x1[i]);
        const V& xx2 = *reinterpret_cast<const V*>(&x2l[i]);
        V& yy = *reinterpret_cast<V*>(&yl[i]);
        yy = vecfun1(xx1, xx2);
    }
    ilo = i;
    // use the partial routine again for the remaining calls (if any)...
    if( ilo < nhi )
        vecfun2_partial(x1, x2l, yl, ilo, nhi, npack, vecfun1);
    if( lgNonAligned1 )
        delete[] x2local;
    if( lgNonAligned2 )
    {
        for( i=nlo; i < nhi; ++i )
            y[i] = yl[i];
        delete[] ylocal;
    }
}
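
// Hedged usage sketch for the two-argument wrapper (v1powd is a hypothetical
// vectorized power routine):
//
//     vecfun2( x1, x2, y, 0, n, pow, v1powd );  // y[i] = pow(x1[i], x2[i])
//
// Note the asymmetry in the code above: x1 sets the reference alignment, and
// x2 and/or y are copied into aligned scratch buffers only when their offsets
// differ from it.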

#ifdef __AVX512F__
#define V1FUN_PD_4(FUN, V) \
    v8df xx = _mm512_set_pd( V, V, V, V, x3, x2, x1, x0 ); \
    v8df yy = v1##FUN##d(xx); \
    memcpy(y, &yy, 4*sizeof(double))
#else
#define V1FUN_PD_4(FUN, V) \
    v4df xx = _mm256_set_pd( x3, x2, x1, x0 ); \
    v4df yy = v1##FUN##d(xx); \
    memcpy(y, &yy, 4*sizeof(double))
#endif

#ifdef __AVX512F__
#define V1FUN_PD_8(FUN, V) \
    v8df xx = _mm512_set_pd( x7, x6, x5, x4, x3, x2, x1, x0 ); \
    v8df yy = v1##FUN##d(xx); \
    memcpy(y, &yy, 8*sizeof(double))
#else
#define V1FUN_PD_8(FUN, V) \
    v4df yy[2]; \
    v4df xx = _mm256_set_pd( x3, x2, x1, x0 ); \
    yy[0] = v1##FUN##d(xx); \
    xx = _mm256_set_pd( x7, x6, x5, x4 ); \
    yy[1] = v1##FUN##d(xx); \
    memcpy(y, &yy, 8*sizeof(double))
#endif

#ifdef __AVX512F__
#define V1FUN_PS_4(FUN, V) \
    v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, V, V, V, V, x3, x2, x1, x0 ); \
    v16sf yy = v1##FUN##f(xx); \
    memcpy(y, &yy, 4*sizeof(sys_float))
#else
#define V1FUN_PS_4(FUN, V) \
    v8sf xx = _mm256_set_ps( V, V, V, V, x3, x2, x1, x0 ); \
    v8sf yy = v1##FUN##f(xx); \
    memcpy(y, &yy, 4*sizeof(sys_float))
#endif

#ifdef __AVX512F__
#define V1FUN_PS_8(FUN, V) \
    v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, x7, x6, x5, x4, x3, x2, x1, x0 ); \
    v16sf yy = v1##FUN##f(xx); \
    memcpy(y, &yy, 8*sizeof(sys_float))
#else
#define V1FUN_PS_8(FUN, V) \
    v8sf xx = _mm256_set_ps( x7, x6, x5, x4, x3, x2, x1, x0 ); \
    v8sf yy = v1##FUN##f(xx); \
    memcpy(y, &yy, 8*sizeof(sys_float))
#endif

#ifdef __AVX512F__
#define V1FUN_PS_16(FUN, V) \
    v16sf xx = _mm512_set_ps( x15, x14, x13, x12, x11, x10, x9, x8, x7, x6, x5, x4, x3, x2, x1, x0 ); \
    v16sf yy = v1##FUN##f(xx); \
    memcpy(y, &yy, 16*sizeof(sys_float))
#else
#define V1FUN_PS_16(FUN, V) \
    v8sf yy[2]; \
    v8sf xx = _mm256_set_ps( x7, x6, x5, x4, x3, x2, x1, x0 ); \
    yy[0] = v1##FUN##f(xx); \
    xx = _mm256_set_ps( x15, x14, x13, x12, x11, x10, x9, x8 ); \
    yy[1] = v1##FUN##f(xx); \
    memcpy(y, &yy, 16*sizeof(sys_float))
#endif

#ifdef __AVX512F__
#define V1FUN2_PD_4(FUN, V) \
    v8df xx = _mm512_set_pd( V, V, V, V, x3, x2, x1, x0 ); \
    v8df yy = _mm512_set_pd( V, V, V, V, y3, y2, y1, y0 ); \
    v8df zz = v1##FUN##d(xx, yy); \
    memcpy(z, &zz, 4*sizeof(double))
#else
#define V1FUN2_PD_4(FUN, V) \
    v4df xx = _mm256_set_pd( x3, x2, x1, x0 ); \
    v4df yy = _mm256_set_pd( y3, y2, y1, y0 ); \
    v4df zz = v1##FUN##d(xx, yy); \
    memcpy(z, &zz, 4*sizeof(double))
#endif

#ifdef __AVX512F__
#define V1FUN2_PS_4(FUN, V) \
    v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, V, V, V, V, x3, x2, x1, x0 ); \
    v16sf yy = _mm512_set_ps( V, V, V, V, V, V, V, V, V, V, V, V, y3, y2, y1, y0 ); \
    v16sf zz = v1##FUN##f(xx, yy); \
    memcpy(z, &zz, 4*sizeof(sys_float))
#else
#define V1FUN2_PS_4(FUN, V) \
    v8sf xx = _mm256_set_ps( V, V, V, V, x3, x2, x1, x0 ); \
    v8sf yy = _mm256_set_ps( V, V, V, V, y3, y2, y1, y0 ); \
    v8sf zz = v1##FUN##f(xx, yy); \
    memcpy(z, &zz, 4*sizeof(sys_float))
#endif

#ifdef __AVX512F__
#define V1FUN2_PS_8(FUN, V) \
    v16sf xx = _mm512_set_ps( V, V, V, V, V, V, V, V, x7, x6, x5, x4, x3, x2, x1, x0 ); \
    v16sf yy = _mm512_set_ps( V, V, V, V, V, V, V, V, y7, y6, y5, y4, y3, y2, y1, y0 ); \
    v16sf zz = v1##FUN##f(xx, yy); \
    memcpy(z, &zz, 8*sizeof(sys_float))
#else
#define V1FUN2_PS_8(FUN, V) \
    v8sf xx = _mm256_set_ps( x7, x6, x5, x4, x3, x2, x1, x0 ); \
    v8sf yy = _mm256_set_ps( y7, y6, y5, y4, y3, y2, y1, y0 ); \
    v8sf zz = v1##FUN##f(xx, yy); \
    memcpy(z, &zz, 8*sizeof(sys_float))
#endif
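
// Hedged usage sketch: the V1FUN* macros expect the scalar arguments x0..xN
// (and y0..yN for the two-argument forms) plus the output array to be in
// scope; V supplies a safe filler value for the unused upper lanes. A
// hypothetical 4-element wrapper might read
//
//     inline void my_exp4(double x0, double x1, double x2, double x3, double y[])
//     {
//         V1FUN_PD_4(exp, 0.);  // expands to a call of v1expd() (hypothetical name)
//     }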

#else // __AVX__

// fallback for non-AVX capable hardware
template<class T, class V>
inline void vecfun(const T x[], T y[], long nlo, long nhi, T (*scalfun1)(T), V (*)(V))
{
    for( long i=nlo; i < nhi; ++i )
        y[i] = scalfun1(x[i]);
}

template<class T, class V>
inline void vecfun2(const T x1[], const T x2[], T y[], long nlo, long nhi, T (*scalfun1)(T, T), V (*)(V, V))
{
    for( long i=nlo; i < nhi; ++i )
        y[i] = scalfun1(x1[i], x2[i]);
}

#define V1FUN_4(FUN, V) \
    y[0] = FUN(x0); \
    y[1] = FUN(x1); \
    y[2] = FUN(x2); \
    y[3] = FUN(x3)

#define V1FUN_8(FUN, V) \
    y[0] = FUN(x0); \
    y[1] = FUN(x1); \
    y[2] = FUN(x2); \
    y[3] = FUN(x3); \
    y[4] = FUN(x4); \
    y[5] = FUN(x5); \
    y[6] = FUN(x6); \
    y[7] = FUN(x7)

#define V1FUN_16(FUN, V) \
    y[0] = FUN(x0); \
    y[1] = FUN(x1); \
    y[2] = FUN(x2); \
    y[3] = FUN(x3); \
    y[4] = FUN(x4); \
    y[5] = FUN(x5); \
    y[6] = FUN(x6); \
    y[7] = FUN(x7); \
    y[8] = FUN(x8); \
    y[9] = FUN(x9); \
    y[10] = FUN(x10); \
    y[11] = FUN(x11); \
    y[12] = FUN(x12); \
    y[13] = FUN(x13); \
    y[14] = FUN(x14); \
    y[15] = FUN(x15)

#define V1FUN_PD_4(FUN, V) \
    V1FUN_4(FUN, V)

#define V1FUN_PD_8(FUN, V) \
    V1FUN_8(FUN, V)

#define V1FUN_PS_4(FUN, V) \
    V1FUN_4(FUN##f, V)

#define V1FUN_PS_8(FUN, V) \
    V1FUN_8(FUN##f, V)

#define V1FUN_PS_16(FUN, V) \
    V1FUN_16(FUN##f, V)

#define V1FUN2_4(FUN, V) \
    z[0] = FUN(x0, y0); \
    z[1] = FUN(x1, y1); \
    z[2] = FUN(x2, y2); \
    z[3] = FUN(x3, y3)

#define V1FUN2_8(FUN, V) \
    z[0] = FUN(x0, y0); \
    z[1] = FUN(x1, y1); \
    z[2] = FUN(x2, y2); \
    z[3] = FUN(x3, y3); \
    z[4] = FUN(x4, y4); \
    z[5] = FUN(x5, y5); \
    z[6] = FUN(x6, y6); \
    z[7] = FUN(x7, y7)

#define V1FUN2_PD_4(FUN, V) \
    V1FUN2_4(FUN, V)

#define V1FUN2_PS_4(FUN, V) \
    V1FUN2_4(FUN##f, V)

#define V1FUN2_PS_8(FUN, V) \
    V1FUN2_8(FUN##f, V)

#endif // __AVX__

#endif