vectorize_sqrt_core.h
/* This file is part of Cloudy and is copyright (C)1978-2022 by Gary J. Ferland and
 * others. For conditions of distribution and use see copyright notice in license.txt */

#ifndef VECTORIZE_SQRT_CORE_H
#define VECTORIZE_SQRT_CORE_H

#include "vectorize_math.h"

//
// Written by Peter A.M. van Hoof, Royal Observatory of Belgium, Brussels
//
// This file contains vectorized versions of the single and double precision variants of
// the sqrt() and hypot() functions. They are vectorized using AVX instructions, but also
// make use of AVX2, FMA, and AVX512 instructions when available. The basic algorithms for
// calculating the sqrt() functions were derived from the algorithm for calculating rsqrt()
// described here: http://en.wikipedia.org/wiki/Fast_inverse_square_root
//
// Alternatively one can also use the sqrt hardware instruction, but on some hardware the
// software implementation is faster... The hardware instruction is chosen as the default
// implementation below.
//

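// For reference, the block below is a minimal scalar sketch (not part of Cloudy, kept in
// an inactive #if 0 region) of the rsqrt-based method used in the #if 0 branches further
// down: an initial estimate of 1/sqrt(x) is taken from the "magic constant" bit trick,
// refined with Newton-Raphson iterations r <- r*(1.5 - 0.5*x*r*r), after which sqrt(x) is
// recovered as x*r. The names sqrtd_sketch and hypotd_sketch, and the use of memcpy() for
// the bit pattern, are illustrative choices only and do not appear in the vector code.
#if 0
#include <cstdint>
#include <cstring>
#include <cfloat>
#include <cmath>

inline double sqrtd_sketch(double x)
{
	// arguments below DBL_MIN are flushed to zero, as in the vector code
	if( x < DBL_MIN )
		return 0.;
	// initial estimate of 1/sqrt(x) from the magic constant
	uint64_t i;
	memcpy(&i, &x, sizeof(i));
	i = 0x5fe6eb50c7b537a9ULL - (i >> 1);
	double r;
	memcpy(&r, &i, sizeof(r));
	// four Newton-Raphson iterations refine r towards 1/sqrt(x)
	for( int k=0; k < 4; ++k )
		r *= 1.5 - 0.5*x*r*r;
	// sqrt(x) = x*(1/sqrt(x))
	return x*r;
}

inline double hypotd_sketch(double x, double y)
{
	// hypot(x,y) = m*sqrt(1 + (n/m)^2), with m = max(|x|,|y|) and n = min(|x|,|y|),
	// which avoids overflow in the intermediate squares
	double ax = fabs(x), ay = fabs(y);
	double m = ( ax > ay ) ? ax : ay;
	double n = ( ax > ay ) ? ay : ax;
	if( m == 0. )
		return 0.;
	double q = n/m;
	return m*sqrtd_sketch(1. + q*q);
}
#endif
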
#ifdef __AVX__

VECLL_CONST(sqrt_mask1,0x7fffffffffffffff);

#ifdef __AVX2__
VECLL_CONST(sqrt_magic,0x5fe6eb50c7b537a9);
#else
VECL_CONST(sqrt_magic,0x5fe6eb50c7b537a9);
#endif

#ifdef __AVX512F__

inline v8df v1sqrtd_core(v8df x)
{
#if 0
	// initial estimate of 1/sqrt(x) obtained from the magic constant
	v8di ir = _mm512_castpd_si512(x);
	ir = _mm512_srli_epi64(ir, 1);
	ir = _mm512_sub_epi64(sqrt_magic, ir);
	v8df r = _mm512_castsi512_pd(ir);
	// flush the result to zero for arguments below DBL_MIN
	__mmask8 zmask = _mm512_cmp_pd_mask(x, dbl_min, _CMP_LT_OQ);
	r = _mm512_mask_load_pd(r, zmask, &zero);
	// do not precompute x/2. to avoid underflow to denormalized numbers
	// this may be flushed to zero, but is also very slow....
	// four Newton-Raphson iterations: r <- r*(1.5 - 0.5*x*r*r)
	v8df rx = _mm512_mul_pd(r, x);
	v8df rx2 = _mm512_mul_pd(rx, mhalf);
	v8df y = _mm512_fmadd_pd(rx2, r, c1p5);
	r = _mm512_mul_pd(r, y);
	rx = _mm512_mul_pd(r, x);
	rx2 = _mm512_mul_pd(rx, mhalf);
	y = _mm512_fmadd_pd(rx2, r, c1p5);
	r = _mm512_mul_pd(r, y);
	rx = _mm512_mul_pd(r, x);
	rx2 = _mm512_mul_pd(rx, mhalf);
	y = _mm512_fmadd_pd(rx2, r, c1p5);
	r = _mm512_mul_pd(r, y);
	rx = _mm512_mul_pd(r, x);
	rx2 = _mm512_mul_pd(rx, mhalf);
	y = _mm512_fmadd_pd(rx2, r, c1p5);
	r = _mm512_mul_pd(r, y);
	// sqrt(x) = x*(1/sqrt(x))
	return _mm512_mul_pd(x, r);
#else
	// the hardware sqrt instruction is used by default
	return _mm512_sqrt_pd(x);
#endif
}

inline v8df v1hypotd_core(v8df x, v8df y)
{
	// compute hypot as xp*sqrt(1 + (yp/xp)^2), with xp = max(x,y) and yp = min(x,y)
	v8df xp = _mm512_max_pd(x, y);
	v8df yp = _mm512_min_pd(x, y);
	// avoid division by zero when the larger argument is zero
	__mmask8 zmask = _mm512_cmp_pd_mask(xp, zero, _CMP_NEQ_OQ);
	v8df arg = _mm512_mask_div_pd(zero, zmask, yp, xp);
	arg = _mm512_fmadd_pd(arg, arg, one);
	v8df s = v1sqrtd_core(arg);
	return _mm512_mul_pd(xp, s);
}

#else

inline v4df v1sqrtd_core(v4df x)
{
#if 0
	// initial estimate of 1/sqrt(x) obtained from the magic constant
#ifdef __AVX2__
	v4di ir = _mm256_castpd_si256(x);
	ir = _mm256_srli_epi64(ir, 1);
	ir = _mm256_sub_epi64(sqrt_magic, ir);
#else
	// without AVX2 the 64-bit integer arithmetic is done in two 128-bit halves
	v2df xl = _mm256_extractf128_pd(x, 0);
	v2df xh = _mm256_extractf128_pd(x, 1);
	v2di ixl = _mm_castpd_si128(xl);
	v2di ixh = _mm_castpd_si128(xh);
	ixl = _mm_srli_epi64(ixl, 1);
	ixh = _mm_srli_epi64(ixh, 1);
	ixl = _mm_sub_epi64(sqrt_magic, ixl);
	ixh = _mm_sub_epi64(sqrt_magic, ixh);
	v4di ir = _mm256_setzero_si256();
	ir = _mm256_insertf128_si256(ir, ixl, 0);
	ir = _mm256_insertf128_si256(ir, ixh, 1);
#endif
	v4df r = _mm256_castsi256_pd(ir);
	// flush the result to zero for arguments below DBL_MIN
	v4df zmask = _mm256_cmp_pd(x, dbl_min, _CMP_LT_OQ);
	r = _mm256_blendv_pd(r, zero, zmask);
	// do not precompute x/2. to avoid underflow to denormalized numbers
	// this may be flushed to zero, but is also very slow....
	// four Newton-Raphson iterations: r <- r*(1.5 - 0.5*x*r*r)
	v4df rx = _mm256_mul_pd(r, x);
	v4df rx2 = _mm256_mul_pd(rx, mhalf);
#ifdef __FMA__
	v4df y = _mm256_fmadd_pd(rx2, r, c1p5);
#else
	v4df y = _mm256_mul_pd(rx2, r);
	y = _mm256_add_pd(y, c1p5);
#endif
	r = _mm256_mul_pd(r, y);
	rx = _mm256_mul_pd(r, x);
	rx2 = _mm256_mul_pd(rx, mhalf);
#ifdef __FMA__
	y = _mm256_fmadd_pd(rx2, r, c1p5);
#else
	y = _mm256_mul_pd(rx2, r);
	y = _mm256_add_pd(y, c1p5);
#endif
	r = _mm256_mul_pd(r, y);
	rx = _mm256_mul_pd(r, x);
	rx2 = _mm256_mul_pd(rx, mhalf);
#ifdef __FMA__
	y = _mm256_fmadd_pd(rx2, r, c1p5);
#else
	y = _mm256_mul_pd(rx2, r);
	y = _mm256_add_pd(y, c1p5);
#endif
	r = _mm256_mul_pd(r, y);
	rx = _mm256_mul_pd(r, x);
	rx2 = _mm256_mul_pd(rx, mhalf);
#ifdef __FMA__
	y = _mm256_fmadd_pd(rx2, r, c1p5);
#else
	y = _mm256_mul_pd(rx2, r);
	y = _mm256_add_pd(y, c1p5);
#endif
	r = _mm256_mul_pd(r, y);
	// sqrt(x) = x*(1/sqrt(x))
	return _mm256_mul_pd(x, r);
#else
	// the hardware sqrt instruction is used by default
	return _mm256_sqrt_pd(x);
#endif
}

inline v4df v1hypotd_core(v4df x, v4df y)
{
	// compute hypot as xp*sqrt(1 + (yp/xp)^2), with xp = max(x,y) and yp = min(x,y)
	v4df xp = _mm256_max_pd(x, y);
	v4df yp = _mm256_min_pd(x, y);
	// substitute xp = 1 in the division, and xp = 0 in the result, when the larger argument is zero
	v4df zmask = _mm256_cmp_pd(xp, zero, _CMP_EQ_OQ);
	xp = _mm256_blendv_pd(xp, one, zmask);
	v4df arg = _mm256_div_pd(yp, xp);
	xp = _mm256_blendv_pd(xp, zero, zmask);
#ifdef __FMA__
	arg = _mm256_fmadd_pd(arg, arg, one);
#else
	arg = _mm256_mul_pd(arg, arg);
	arg = _mm256_add_pd(arg, one);
#endif
	v4df s = v1sqrtd_core(arg);
	return _mm256_mul_pd(xp, s);
}

#endif // __AVX512F__

VECII_CONST(sqrt_mask1f,0x7fffffff);

#ifdef __AVX2__
VECII_CONST(sqrt_magicf,0x5f375a86);
#else
VECI_CONST(sqrt_magicf,0x5f375a86);
#endif

#ifdef __AVX512F__

inline v16sf v1sqrtf_core(v16sf x)
{
#if 0
	// initial estimate of 1/sqrt(x) obtained from the magic constant
	v16si ir = _mm512_castps_si512(x);
	ir = _mm512_srli_epi32(ir, 1);
	ir = _mm512_sub_epi32(sqrt_magicf, ir);
	v16sf r = _mm512_castsi512_ps(ir);
	// flush the result to zero for arguments below FLT_MIN
	__mmask16 zmask = _mm512_cmp_ps_mask(x, flt_min, _CMP_LT_OS);
	r = _mm512_mask_load_ps(r, zmask, &zerof);
	// do not precompute x/2.f to avoid underflow to denormalized numbers
	// this may be flushed to zero, but is also very slow....
	// three Newton-Raphson iterations: r <- r*(1.5f - 0.5f*x*r*r)
	v16sf rx = _mm512_mul_ps(r, x);
	v16sf rx2 = _mm512_mul_ps(rx, mhalff);
	v16sf y = _mm512_fmadd_ps(rx2, r, c1p5f);
	r = _mm512_mul_ps(r, y);
	rx = _mm512_mul_ps(r, x);
	rx2 = _mm512_mul_ps(rx, mhalff);
	y = _mm512_fmadd_ps(rx2, r, c1p5f);
	r = _mm512_mul_ps(r, y);
	rx = _mm512_mul_ps(r, x);
	rx2 = _mm512_mul_ps(rx, mhalff);
	y = _mm512_fmadd_ps(rx2, r, c1p5f);
	r = _mm512_mul_ps(r, y);
	// sqrt(x) = x*(1/sqrt(x))
	return _mm512_mul_ps(x, r);
#else
	// the hardware sqrt instruction is used by default
	return _mm512_sqrt_ps(x);
#endif
}

inline v16sf v1hypotf_core(v16sf x, v16sf y)
{
	// compute hypot as xp*sqrt(1 + (yp/xp)^2), with xp = max(x,y) and yp = min(x,y)
	v16sf xp = _mm512_max_ps(x, y);
	v16sf yp = _mm512_min_ps(x, y);
	// avoid division by zero when the larger argument is zero
	__mmask16 zmask = _mm512_cmp_ps_mask(xp, zerof, _CMP_NEQ_OQ);
	v16sf arg = _mm512_mask_div_ps(zerof, zmask, yp, xp);
	arg = _mm512_fmadd_ps(arg, arg, onef);
	v16sf s = v1sqrtf_core(arg);
	return _mm512_mul_ps(xp, s);
}

#else

inline v8sf v1sqrtf_core(v8sf x)
{
#if 0
	// initial estimate of 1/sqrt(x) obtained from the magic constant
#ifdef __AVX2__
	v8si ir = _mm256_castps_si256(x);
	ir = _mm256_srli_epi32(ir, 1);
	ir = _mm256_sub_epi32(sqrt_magicf, ir);
#else
	// without AVX2 the 32-bit integer arithmetic is done in two 128-bit halves
	v4sf xl = _mm256_extractf128_ps(x, 0);
	v4sf xh = _mm256_extractf128_ps(x, 1);
	v4si ixl = _mm_castps_si128(xl);
	v4si ixh = _mm_castps_si128(xh);
	ixl = _mm_srli_epi32(ixl, 1);
	ixh = _mm_srli_epi32(ixh, 1);
	ixl = _mm_sub_epi32(sqrt_magicf, ixl);
	ixh = _mm_sub_epi32(sqrt_magicf, ixh);
	v8si ir = _mm256_setzero_si256();
	ir = _mm256_insertf128_si256(ir, ixl, 0);
	ir = _mm256_insertf128_si256(ir, ixh, 1);
#endif
	v8sf r = _mm256_castsi256_ps(ir);
	// flush the result to zero for arguments below FLT_MIN
	v8sf zmask = _mm256_cmp_ps(x, flt_min, _CMP_LT_OQ);
	r = _mm256_blendv_ps(r, zerof, zmask);
	// do not precompute x/2.f to avoid underflow to denormalized numbers
	// this may be flushed to zero, but is also very slow....
	// three Newton-Raphson iterations: r <- r*(1.5f - 0.5f*x*r*r)
	v8sf rx = _mm256_mul_ps(r, x);
	v8sf rx2 = _mm256_mul_ps(rx, mhalff);
#ifdef __FMA__
	v8sf y = _mm256_fmadd_ps(rx2, r, c1p5f);
#else
	v8sf y = _mm256_mul_ps(rx2, r);
	y = _mm256_add_ps(y, c1p5f);
#endif
	r = _mm256_mul_ps(r, y);
	rx = _mm256_mul_ps(r, x);
	rx2 = _mm256_mul_ps(rx, mhalff);
#ifdef __FMA__
	y = _mm256_fmadd_ps(rx2, r, c1p5f);
#else
	y = _mm256_mul_ps(rx2, r);
	y = _mm256_add_ps(y, c1p5f);
#endif
	r = _mm256_mul_ps(r, y);
	rx = _mm256_mul_ps(r, x);
	rx2 = _mm256_mul_ps(rx, mhalff);
#ifdef __FMA__
	y = _mm256_fmadd_ps(rx2, r, c1p5f);
#else
	y = _mm256_mul_ps(rx2, r);
	y = _mm256_add_ps(y, c1p5f);
#endif
	r = _mm256_mul_ps(r, y);
	// sqrt(x) = x*(1/sqrt(x))
	return _mm256_mul_ps(x, r);
#else
	// the hardware sqrt instruction is used by default
	return _mm256_sqrt_ps(x);
#endif
}

inline v8sf v1hypotf_core(v8sf x, v8sf y)
{
	// compute hypot as xp*sqrt(1 + (yp/xp)^2), with xp = max(x,y) and yp = min(x,y)
	v8sf xp = _mm256_max_ps(x, y);
	v8sf yp = _mm256_min_ps(x, y);
	// substitute xp = 1 in the division, and xp = 0 in the result, when the larger argument is zero
	v8sf zmask = _mm256_cmp_ps(xp, zerof, _CMP_EQ_OQ);
	xp = _mm256_blendv_ps(xp, onef, zmask);
	v8sf arg = _mm256_div_ps(yp, xp);
	xp = _mm256_blendv_ps(xp, zerof, zmask);
#ifdef __FMA__
	arg = _mm256_fmadd_ps(arg, arg, onef);
#else
	arg = _mm256_mul_ps(arg, arg);
	arg = _mm256_add_ps(arg, onef);
#endif
	v8sf s = v1sqrtf_core(arg);
	return _mm256_mul_ps(xp, s);
}

#endif // __AVX512F__

#endif // __AVX__

#endif