vectorize_sqrt_core.h
/* This file is part of Cloudy and is copyright (C)1978-2025 by Gary J. Ferland and
 * others. For conditions of distribution and use see copyright notice in license.txt */

#ifndef VECTORIZE_SQRT_CORE_H
#define VECTORIZE_SQRT_CORE_H

#include "vectorize_math.h"

//
// Written by Peter A.M. van Hoof, Royal Observatory of Belgium, Brussels
//
// This file contains vectorized versions of the single and double precision variants of
// the sqrt() and hypot() functions. They are vectorized using AVX instructions, but also
// make use of AVX2, FMA, and AVX512 instructions when available. The basic algorithms for
// calculating the sqrt() functions were derived from the algorithm for calculating rsqrt()
// described here: http://en.wikipedia.org/wiki/Fast_inverse_square_root
//
// Alternatively, one can use the sqrt hardware instruction, but on some hardware the
// software implementation is faster... The hardware instruction is chosen as the default
// implementation below.
//

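// For reference, a minimal scalar sketch of that algorithm (an illustration only, not part
// of the Cloudy API; the helper name scalar_sqrtd_sketch is hypothetical and the block is
// kept disabled, like the vectorized software branches below): seed 1/sqrt(x) with the
// bit-level estimate, refine it with Newton-Raphson steps, and return x*r = sqrt(x).
//
#if 0
#include <cstdint>
#include <cstring>
#include <cfloat>

inline double scalar_sqrtd_sketch(double x)
{
	// arguments below the normal range are flushed to zero, as in the vectorized code
	if( x < DBL_MIN )
		return 0.;
	std::uint64_t ix;
	std::memcpy(&ix, &x, sizeof(ix));
	// bit-level initial estimate of 1/sqrt(x); same magic constant as sqrt_magic below
	std::uint64_t ir = 0x5fe6eb50c7b537a9ULL - (ix >> 1);
	double r;
	std::memcpy(&r, &ir, sizeof(r));
	// each Newton-Raphson step computes r <- r*(1.5 - 0.5*x*r*r);
	// four steps match the double precision code below
	for( int i=0; i < 4; ++i )
		r *= 1.5 - 0.5*x*r*r;
	return x*r;
}
#endif
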
#ifdef __AVX__

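// bit mask that clears the sign bit of a double, and the magic constant used to seed the
// initial estimate of 1/sqrt(x) in the disabled software implementation below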
VECLL_CONST(sqrt_mask1,0x7fffffffffffffff);

#ifdef __AVX2__
VECLL_CONST(sqrt_magic,0x5fe6eb50c7b537a9);
#else
VECL_CONST(sqrt_magic,0x5fe6eb50c7b537a9);
#endif

#ifdef __AVX512F__

inline v8df v1sqrtd_core(v8df x)
{
#if 0
	v8di ir = _mm512_castpd_si512(x);
	ir = _mm512_srli_epi64(ir, 1);
	ir = _mm512_sub_epi64(sqrt_magic, ir);
	v8df r = _mm512_castsi512_pd(ir);
	__mmask8 zmask = _mm512_cmp_pd_mask(x, dbl_min, _CMP_LT_OQ);
	r = _mm512_mask_load_pd(r, zmask, &zero);
	// do not precompute x/2. to avoid underflow to denormalized numbers
	// this may be flushed to zero, but is also very slow....
	v8df rx = _mm512_mul_pd(r, x);
	v8df rx2 = _mm512_mul_pd(rx, mhalf);
	v8df y = _mm512_fmadd_pd(rx2, r, c1p5);
	r = _mm512_mul_pd(r, y);
	rx = _mm512_mul_pd(r, x);
	rx2 = _mm512_mul_pd(rx, mhalf);
	y = _mm512_fmadd_pd(rx2, r, c1p5);
	r = _mm512_mul_pd(r, y);
	rx = _mm512_mul_pd(r, x);
	rx2 = _mm512_mul_pd(rx, mhalf);
	y = _mm512_fmadd_pd(rx2, r, c1p5);
	r = _mm512_mul_pd(r, y);
	rx = _mm512_mul_pd(r, x);
	rx2 = _mm512_mul_pd(rx, mhalf);
	y = _mm512_fmadd_pd(rx2, r, c1p5);
	r = _mm512_mul_pd(r, y);
	return _mm512_mul_pd(x, r);
#else
	return _mm512_sqrt_pd(x);
#endif
}

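// hypot(x,y) is evaluated as xp*sqrt(1 + (yp/xp)^2), with xp the larger and yp the smaller
// of the two arguments, so that the intermediate square cannot overflow; the mask keeps the
// quotient at zero when xp is zero, avoiding a division by zero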
inline v8df v1hypotd_core(v8df x, v8df y)
{
	v8df xp = _mm512_max_pd(x, y);
	v8df yp = _mm512_min_pd(x, y);
	__mmask8 zmask = _mm512_cmp_pd_mask(xp, zero, _CMP_NEQ_OQ);
	v8df arg = _mm512_mask_div_pd(zero, zmask, yp, xp);
	arg = _mm512_fmadd_pd(arg, arg, one);
	v8df s = v1sqrtd_core(arg);
	return _mm512_mul_pd(xp, s);
}

#else

inline v4df v1sqrtd_core(v4df x)
{
#if 0
#ifdef __AVX2__
	v4di ir = _mm256_castpd_si256(x);
	ir = _mm256_srli_epi64(ir, 1);
	ir = _mm256_sub_epi64(sqrt_magic, ir);
#else
	v2df xl = _mm256_extractf128_pd(x, 0);
	v2df xh = _mm256_extractf128_pd(x, 1);
	v2di ixl = _mm_castpd_si128(xl);
	v2di ixh = _mm_castpd_si128(xh);
	ixl = _mm_srli_epi64(ixl, 1);
	ixh = _mm_srli_epi64(ixh, 1);
	ixl = _mm_sub_epi64(sqrt_magic, ixl);
	ixh = _mm_sub_epi64(sqrt_magic, ixh);
	v4di ir = _mm256_setzero_si256();
	ir = _mm256_insertf128_si256(ir, ixl, 0);
	ir = _mm256_insertf128_si256(ir, ixh, 1);
#endif
	v4df r = _mm256_castsi256_pd(ir);
	v4df zmask = _mm256_cmp_pd(x, dbl_min, _CMP_LT_OQ);
	r = _mm256_blendv_pd(r, zero, zmask);
	// do not precompute x/2. to avoid underflow to denormalized numbers
	// this may be flushed to zero, but is also very slow....
	v4df rx = _mm256_mul_pd(r, x);
	v4df rx2 = _mm256_mul_pd(rx, mhalf);
#ifdef __FMA__
	v4df y = _mm256_fmadd_pd(rx2, r, c1p5);
#else
	v4df y = _mm256_mul_pd(rx2, r);
	y = _mm256_add_pd(y, c1p5);
#endif
	r = _mm256_mul_pd(r, y);
	rx = _mm256_mul_pd(r, x);
	rx2 = _mm256_mul_pd(rx, mhalf);
#ifdef __FMA__
	y = _mm256_fmadd_pd(rx2, r, c1p5);
#else
	y = _mm256_mul_pd(rx2, r);
	y = _mm256_add_pd(y, c1p5);
#endif
	r = _mm256_mul_pd(r, y);
	rx = _mm256_mul_pd(r, x);
	rx2 = _mm256_mul_pd(rx, mhalf);
#ifdef __FMA__
	y = _mm256_fmadd_pd(rx2, r, c1p5);
#else
	y = _mm256_mul_pd(rx2, r);
	y = _mm256_add_pd(y, c1p5);
#endif
	r = _mm256_mul_pd(r, y);
	rx = _mm256_mul_pd(r, x);
	rx2 = _mm256_mul_pd(rx, mhalf);
#ifdef __FMA__
	y = _mm256_fmadd_pd(rx2, r, c1p5);
#else
	y = _mm256_mul_pd(rx2, r);
	y = _mm256_add_pd(y, c1p5);
#endif
	r = _mm256_mul_pd(r, y);
	return _mm256_mul_pd(x, r);
#else
	return _mm256_sqrt_pd(x);
#endif
}

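// same algorithm as the AVX512 variant of v1hypotd_core above, but without masked
// instructions: xp is temporarily replaced by one where it is zero so the division is
// safe, and is blended back to zero before the final multiplication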
inline v4df v1hypotd_core(v4df x, v4df y)
{
	v4df xp = _mm256_max_pd(x, y);
	v4df yp = _mm256_min_pd(x, y);
	v4df zmask = _mm256_cmp_pd(xp, zero, _CMP_EQ_OQ);
	xp = _mm256_blendv_pd(xp, one, zmask);
	v4df arg = _mm256_div_pd(yp, xp);
	xp = _mm256_blendv_pd(xp, zero, zmask);
#ifdef __FMA__
	arg = _mm256_fmadd_pd(arg, arg, one);
#else
	arg = _mm256_mul_pd(arg, arg);
	arg = _mm256_add_pd(arg, one);
#endif
	v4df s = v1sqrtd_core(arg);
	return _mm256_mul_pd(xp, s);
}

#endif // __AVX512F__

VECII_CONST(sqrt_mask1f,0x7fffffff);

#ifdef __AVX2__
VECII_CONST(sqrt_magicf,0x5f375a86);
#else
VECI_CONST(sqrt_magicf,0x5f375a86);
#endif

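// the single precision routines below follow the same pattern as the double precision
// routines above; in the disabled software branches three Newton-Raphson iterations
// suffice for single precision, compared to four for double precision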
#ifdef __AVX512F__

inline v16sf v1sqrtf_core(v16sf x)
{
#if 0
	v16si ir = _mm512_castps_si512(x);
	ir = _mm512_srli_epi32(ir, 1);
	ir = _mm512_sub_epi32(sqrt_magicf,ir);
	v16sf r = _mm512_castsi512_ps(ir);
	__mmask16 zmask = _mm512_cmp_ps_mask(x, flt_min, _CMP_LT_OS);
	r = _mm512_mask_load_ps(r, zmask, &zerof);
	// do not precompute x/2.f to avoid underflow to denormalized numbers
	// this may be flushed to zero, but is also very slow....
	v16sf rx = _mm512_mul_ps(r, x);
	v16sf rx2 = _mm512_mul_ps(rx, mhalff);
	v16sf y = _mm512_fmadd_ps(rx2, r, c1p5f);
	r = _mm512_mul_ps(r, y);
	rx = _mm512_mul_ps(r, x);
	rx2 = _mm512_mul_ps(rx, mhalff);
	y = _mm512_fmadd_ps(rx2, r, c1p5f);
	r = _mm512_mul_ps(r, y);
	rx = _mm512_mul_ps(r, x);
	rx2 = _mm512_mul_ps(rx, mhalff);
	y = _mm512_fmadd_ps(rx2, r, c1p5f);
	r = _mm512_mul_ps(r, y);
	return _mm512_mul_ps(x, r);
#else
	return _mm512_sqrt_ps(x);
#endif
}

inline v16sf v1hypotf_core(v16sf x, v16sf y)
{
	v16sf xp = _mm512_max_ps(x, y);
	v16sf yp = _mm512_min_ps(x, y);
	__mmask16 zmask = _mm512_cmp_ps_mask(xp, zerof, _CMP_NEQ_OQ);
	v16sf arg = _mm512_mask_div_ps(zerof, zmask, yp, xp);
	arg = _mm512_fmadd_ps(arg, arg, onef);
	v16sf s = v1sqrtf_core(arg);
	return _mm512_mul_ps(xp, s);
}

#else

inline v8sf v1sqrtf_core(v8sf x)
{
#if 0
#ifdef __AVX2__
	v8si ir = _mm256_castps_si256(x);
	ir = _mm256_srli_epi32(ir, 1);
	ir = _mm256_sub_epi32(sqrt_magicf,ir);
#else
	v4sf xl = _mm256_extractf128_ps(x, 0);
	v4sf xh = _mm256_extractf128_ps(x, 1);
	v4si ixl = _mm_castps_si128(xl);
	v4si ixh = _mm_castps_si128(xh);
	ixl = _mm_srli_epi32(ixl, 1);
	ixh = _mm_srli_epi32(ixh, 1);
	ixl = _mm_sub_epi32(sqrt_magicf,ixl);
	ixh = _mm_sub_epi32(sqrt_magicf,ixh);
	v8si ir = _mm256_setzero_si256();
	ir = _mm256_insertf128_si256(ir, ixl, 0);
	ir = _mm256_insertf128_si256(ir, ixh, 1);
#endif
	v8sf r = _mm256_castsi256_ps(ir);
	v8sf zmask = _mm256_cmp_ps(x, flt_min, _CMP_LT_OQ);
	r = _mm256_blendv_ps(r, zerof, zmask);
	// do not precompute x/2.f to avoid underflow to denormalized numbers
	// this may be flushed to zero, but is also very slow....
	v8sf rx = _mm256_mul_ps(r, x);
	v8sf rx2 = _mm256_mul_ps(rx, mhalff);
#ifdef __FMA__
	v8sf y = _mm256_fmadd_ps(rx2, r, c1p5f);
#else
	v8sf y = _mm256_mul_ps(rx2, r);
	y = _mm256_add_ps(y, c1p5f);
#endif
	r = _mm256_mul_ps(r, y);
	rx = _mm256_mul_ps(r, x);
	rx2 = _mm256_mul_ps(rx, mhalff);
#ifdef __FMA__
	y = _mm256_fmadd_ps(rx2, r, c1p5f);
#else
	y = _mm256_mul_ps(rx2, r);
	y = _mm256_add_ps(y, c1p5f);
#endif
	r = _mm256_mul_ps(r, y);
	rx = _mm256_mul_ps(r, x);
	rx2 = _mm256_mul_ps(rx, mhalff);
#ifdef __FMA__
	y = _mm256_fmadd_ps(rx2, r, c1p5f);
#else
	y = _mm256_mul_ps(rx2, r);
	y = _mm256_add_ps(y, c1p5f);
#endif
	r = _mm256_mul_ps(r, y);
	return _mm256_mul_ps(x, r);
#else
	return _mm256_sqrt_ps(x);
#endif
}

inline v8sf v1hypotf_core(v8sf x, v8sf y)
{
	v8sf xp = _mm256_max_ps(x, y);
	v8sf yp = _mm256_min_ps(x, y);
	v8sf zmask = _mm256_cmp_ps(xp, zerof, _CMP_EQ_OQ);
	xp = _mm256_blendv_ps(xp, onef, zmask);
	v8sf arg = _mm256_div_ps(yp, xp);
	xp = _mm256_blendv_ps(xp, zerof, zmask);
#ifdef __FMA__
	arg = _mm256_fmadd_ps(arg, arg, onef);
#else
	arg = _mm256_mul_ps(arg, arg);
	arg = _mm256_add_ps(arg, onef);
#endif
	v8sf s = v1sqrtf_core(arg);
	return _mm256_mul_ps(xp, s);
}

#endif // __AVX512F__

#endif // __AVX__

#endif