vectorize_log_core.h
/* This file is part of Cloudy and is copyright (C)1978-2022 by Gary J. Ferland and
 * others. For conditions of distribution and use see copyright notice in license.txt */

#ifndef VECTORIZE_LOG_CORE_H
#define VECTORIZE_LOG_CORE_H

#include "vectorize_math.h"

//
// Written by Peter A.M. van Hoof, Royal Observatory of Belgium, Brussels
//
// This file contains vectorized versions of the single and double variants of the log()
// function. They are vectorized using AVX instructions, but also make use of AVX2, FMA,
// and AVX512 instructions when available. The basic algorithms for calculating the log()
// functions were slightly simplified from the openlibm library versions available at
// http://openlibm.org/, which is subject to the following copyright:
//
// ====================================================
// Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
//
// Developed at SunSoft, a Sun Microsystems, Inc. business.
// Permission to use, copy, modify, and distribute this
// software is freely granted, provided that this notice
// is preserved.
// ====================================================
//
// This work was also inspired by similar routines written by Giovanni Garberoglio which
// are available at: http://software-lisc.fbk.eu/avx_mathfun/
//

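// For reference, the scalar algorithm that the vector routines below implement can be
// sketched as follows. This sketch is purely illustrative (it is not part of Cloudy and is
// not called anywhere); it assumes <cmath> is available for std::frexp, requires a finite,
// normalized, strictly positive argument, and omits the hi/lo split of k*ln(2) that
// openlibm uses to gain the last fraction of an ulp.

#include <cmath>

inline double v1logd_scalar_sketch(double x)
{
	// decompose x = y * 2^k and shift y into [sqrt(2)/2,sqrt(2))
	int k;
	double y = std::frexp(x, &k); // y in [0.5,1)
	if( y < 0.70710678118654752 )
	{
		y *= 2.;
		--k;
	}
	double f = y - 1.;
	double s = f/(2. + f);
	double z = s*s;
	// minimax polynomial R(z) = Lg1*z + Lg2*z^2 + ... + Lg7*z^7; the openlibm coefficients
	// Lg1 ... Lg7 are stored with their sign flipped in log_lg1 ... log_lg7 below
	double R = z*(6.666666666666735130e-01 + z*(3.999999999940941908e-01 +
	           z*(2.857142874366239149e-01 + z*(2.222219843214978396e-01 +
	           z*(1.818357216161805012e-01 + z*(1.531383769920937332e-01 +
	           z*1.479819860511658591e-01))))));
	// log(x) = k*ln(2) + log(1+f) with log(1+f) = f - s*(f - R(z))
	return double(k)*0.69314718055994530942 + f - s*(f - R);
}
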
#ifdef __AVX__

#if defined(__AVX512F__) || defined(__AVX2__)
VECLL_CONST(log_off,0x3ff);
VECLL_CONST(log_mask2,0x000fffffffffffff);
VECLL_CONST(log_sq2off,0x95f6400000000);
VECLL_CONST(log_mask3,0x0010000000000000);
VECLL_CONST(log_mask4,0x3ff0000000000000);
#else
VECI_CONST(log_off,0x3ff);
VECI_CONST(log_mask2,0x000fffff);
VECI_CONST(log_sq2off,0x95f64);
VECI_CONST(log_mask3,0x00100000);
VECI_CONST(log_mask4,0x3ff00000);
#endif

VECDI_CONST(log_lg1,0xbfe5555555555593); // -6.666666666666735130e-01
VECDI_CONST(log_lg2,0xbfd999999997fa04); // -3.999999999940941908e-01
VECDI_CONST(log_lg3,0xbfd2492494229359); // -2.857142874366239149e-01
VECDI_CONST(log_lg4,0xbfcc71c51d8e78af); // -2.222219843214978396e-01
VECDI_CONST(log_lg5,0xbfc7466496cb03de); // -1.818357216161805012e-01
VECDI_CONST(log_lg6,0xbfc39a09d078c69f); // -1.531383769920937332e-01
VECDI_CONST(log_lg7,0xbfc2f112df3e5244); // -1.479819860511658591e-01

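// The constants above drive the bit-level argument reduction in v1logd_core: log_off is the
// exponent bias of a double and log_mask2 extracts the mantissa bits (the #else branch holds
// 32-bit versions that act on the high word of each double). Adding log_sq2off generates a
// carry into the lowest exponent bit (isolated with log_mask3) approximately when the
// mantissa exceeds that of sqrt(2); xor-ing that bit with log_mask4 gives the exponent field
// of either 1.0 or 0.5, so or-ing it back onto the mantissa rebuilds a value y in
// [sqrt(2)/2,sqrt(2)), while the carry itself is added to the exponent k. The coefficients
// log_lg1 ... log_lg7 are the openlibm minimax coefficients with their sign flipped.
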
#ifdef __AVX512F__

inline v8df v1logd_core(v8df x)
{
	v8di ix = _mm512_castpd_si512(x);
	v8di k = _mm512_srli_epi64(ix, 52);
	k = _mm512_sub_epi64(k, log_off);
	v8di iy = _mm512_and_epi64(ix, log_mask2);
	v8di iz = _mm512_add_epi64(iy, log_sq2off);
	iz = _mm512_and_epi64(iz, log_mask3);
	v8di ie = _mm512_xor_epi64(iz, log_mask4);
	iy = _mm512_or_epi64(iy, ie);
	iz = _mm512_srli_epi64(iz, 52);
	k = _mm512_add_epi64(k, iz);
	v8df f = _mm512_castsi512_pd(iy);
	f = _mm512_sub_pd(f, one);
	v8df s = _mm512_add_pd(f, two);
	s = _mm512_div_pd(f, s);
#ifdef __AVX512DQ__
	v8df dk = _mm512_cvtepi64_pd(k);
#else
	v8si k2 = _mm512_cvtepi64_epi32(k);
	v8df dk = _mm512_cvtepi32_pd(k2);
#endif
	v8df z = _mm512_mul_pd(s, s);
	v8df R = _mm512_fmadd_pd(log_lg7, z, log_lg6);
	R = _mm512_fmadd_pd(R, z, log_lg5);
	R = _mm512_fmadd_pd(R, z, log_lg4);
	R = _mm512_fmadd_pd(R, z, log_lg3);
	R = _mm512_fmadd_pd(R, z, log_lg2);
	R = _mm512_fmadd_pd(R, z, log_lg1);
	R = _mm512_fmadd_pd(R, z, f);
	R = _mm512_fnmadd_pd(R, s, f);
	return _mm512_fmadd_pd(dk, ln_two, R);
}

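// v1log1pd_core evaluates log(1+x) as log(w) + c, where w is the rounded sum x+1 and
// c = (x - (w-1))/w is, to first order, the relative rounding error committed in forming w;
// adding c back compensates for that error. The single-precision v1log1pf_core below uses
// the same construction.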
inline v8df v1log1pd_core(v8df x)
{
	v8df xarg = x;
	x = _mm512_add_pd(x, one);
	v8df c = _mm512_sub_pd(x, one);
	c = _mm512_sub_pd(xarg, c);
	c = _mm512_div_pd(c, x);
	v8df y = v1logd_core(x);
	return _mm512_add_pd(y, c);
}

#else

inline v4df v1logd_core(v4df x)
{
#ifdef __AVX2__
	v4di ix = _mm256_castpd_si256(x);
	v4di k = _mm256_srli_epi64(ix, 52);
	k = _mm256_sub_epi64(k, log_off);
	v4di iy = _mm256_and_si256(ix, log_mask2);
	v4di iz = _mm256_add_epi64(iy, log_sq2off);
	iz = _mm256_and_si256(iz, log_mask3);
	v4di ie = _mm256_xor_si256(iz, log_mask4);
	iy = _mm256_or_si256(iy, ie);
	iz = _mm256_srli_epi64(iz, 52);
	k = _mm256_add_epi64(k, iz);
	v4di kp = _mm256_permutevar8x32_epi32(k, _mm256_set_epi32(7,5,3,1,6,4,2,0) );
	v4si kk = _mm256_extractf128_si256(kp, 0);
	x = _mm256_castsi256_pd(iy);
	v4df dk = _mm256_cvtepi32_pd(kk);
#else
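	// without AVX2, 256-bit integer instructions are not available; the exponent and
	// mantissa manipulations are therefore done with SSE2 on the high and low 32-bit
	// words of each double, which are first deinterleaved and afterwards recombined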
	v8sf xs = _mm256_castpd_ps(x);
	xs = _mm256_permute_ps(xs, _MM_SHUFFLE(3, 1, 2, 0));
	v4sf xls = _mm256_extractf128_ps(xs, 0);
	v4si xl = _mm_castps_si128(xls);
	v4sf xhs = _mm256_extractf128_ps(xs, 1);
	v4si xh = _mm_castps_si128(xhs);
	v4si hx = _mm_unpackhi_epi64(xl, xh);
	v4si lx = _mm_unpacklo_epi64(xl, xh);
	v4si k = _mm_srli_epi32(hx, 20);
	k = _mm_sub_epi32(k, log_off);
	hx = _mm_and_si128(hx, log_mask2);
	v4si i = _mm_add_epi32(hx, log_sq2off);
	i = _mm_and_si128(i, log_mask3);
	v4si ii = _mm_xor_si128(i, log_mask4);
	hx = _mm_or_si128(hx, ii);
	v8si xi = _mm256_setzero_si256();
	xl = _mm_unpacklo_epi32(lx,hx);
	xi = _mm256_insertf128_si256(xi, xl, 0);
	xh = _mm_unpackhi_epi32(lx,hx);
	xi = _mm256_insertf128_si256(xi, xh, 1);
	x = _mm256_castsi256_pd(xi);
	i = _mm_srli_epi32(i, 20);
	k = _mm_add_epi32(k, i);
	v4df dk = _mm256_cvtepi32_pd(k);
#endif
	// f and s have opposite sign compared to openlibm!
	v4df f = _mm256_sub_pd(one, x);
	v4df s = _mm256_sub_pd(two, f);
	s = _mm256_div_pd(f, s);
	v4df z = _mm256_mul_pd(s, s);
#ifdef __FMA__
	v4df R = _mm256_fmadd_pd(log_lg7, z, log_lg6);
	R = _mm256_fmadd_pd(R, z, log_lg5);
	R = _mm256_fmadd_pd(R, z, log_lg4);
	R = _mm256_fmadd_pd(R, z, log_lg3);
	R = _mm256_fmadd_pd(R, z, log_lg2);
	R = _mm256_fmadd_pd(R, z, log_lg1);
	R = _mm256_fmsub_pd(R, z, f);
	R = _mm256_fmsub_pd(R, s, f);
	v4df y = _mm256_fmadd_pd(dk, ln_two, R);
#else
	v4df R = _mm256_mul_pd(log_lg7, z);
	R = _mm256_add_pd(R, log_lg6);
	R = _mm256_mul_pd(R, z);
	R = _mm256_add_pd(R, log_lg5);
	R = _mm256_mul_pd(R, z);
	R = _mm256_add_pd(R, log_lg4);
	R = _mm256_mul_pd(R, z);
	R = _mm256_add_pd(R, log_lg3);
	R = _mm256_mul_pd(R, z);
	R = _mm256_add_pd(R, log_lg2);
	R = _mm256_mul_pd(R, z);
	R = _mm256_add_pd(R, log_lg1);
	R = _mm256_mul_pd(R, z);
	R = _mm256_sub_pd(R, f);
	R = _mm256_mul_pd(R, s);
	R = _mm256_sub_pd(R, f);
	v4df R2 = _mm256_mul_pd(dk, ln_two);
	v4df y = _mm256_add_pd(R, R2);
#endif
	return y;
}

inline v4df v1log1pd_core(v4df x)
{
	v4df xarg = x;
	x = _mm256_add_pd(x, one);
	v4df c = _mm256_sub_pd(x, one);
	c = _mm256_sub_pd(xarg, c);
	c = _mm256_div_pd(c, x);
	v4df y = v1logd_core(x);
	return _mm256_add_pd(y, c);
}

#endif // __AVX512F__

#if defined(__AVX512F__) || defined(__AVX2__)
VECII_CONST(log_offf,0x7f);
VECII_CONST(log_mask2f,0x007fffff);
VECII_CONST(log_sq2offf,(0x95f64<<3));
VECII_CONST(log_mask3f,0x00800000);
VECII_CONST(log_mask4f,0x3f800000);
#else
VECI_CONST(log_offf,0x7f);
VECI_CONST(log_mask2f,0x007fffff);
VECI_CONST(log_sq2offf,(0x95f64<<3));
VECI_CONST(log_mask3f,0x00800000);
VECI_CONST(log_mask4f,0x3f800000);
#endif

VECFI_CONST(log_lg1f,0xbf2aaaaa); // -0.66666662693f
VECFI_CONST(log_lg2f,0xbeccce13); // -0.40000972152f
VECFI_CONST(log_lg3f,0xbe91e9ee); // -0.28498786688f
VECFI_CONST(log_lg4f,0xbe789e26); // -0.24279078841f

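// The single-precision routines below apply the same argument reduction to the float bit
// pattern (exponent bias log_offf = 0x7f, 23 mantissa bits) and use a shorter minimax
// polynomial with coefficients log_lg1f ... log_lg4f, again with the sign of the openlibm
// coefficients flipped.
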
#ifdef __AVX512F__

inline v16sf v1logf_core(v16sf x)
{
	v16si ix = _mm512_castps_si512(x);
	v16si k = _mm512_srli_epi32(ix, 23);
	k = _mm512_sub_epi32(k, log_offf);
	v16si iy = _mm512_and_epi32(ix, log_mask2f);
	v16si iz = _mm512_add_epi32(iy, log_sq2offf);
	iz = _mm512_and_epi32(iz, log_mask3f);
	v16si ie = _mm512_xor_epi32(iz, log_mask4f);
	iy = _mm512_or_epi32(iy, ie);
	iz = _mm512_srli_epi32(iz, 23);
	k = _mm512_add_epi32(k, iz);
	v16sf f = _mm512_castsi512_ps(iy);
	f = _mm512_sub_ps(f, onef);
	v16sf s = _mm512_add_ps(f, twof);
	s = _mm512_div_ps(f, s);
	v16sf dk = _mm512_cvtepi32_ps(k);
	v16sf z = _mm512_mul_ps(s, s);
	v16sf R = _mm512_fmadd_ps(log_lg4f, z, log_lg3f);
	R = _mm512_fmadd_ps(R, z, log_lg2f);
	R = _mm512_fmadd_ps(R, z, log_lg1f);
	R = _mm512_fmadd_ps(R, z, f);
	R = _mm512_fnmadd_ps(R, s, f);
	return _mm512_fmadd_ps(dk, ln_twof, R);
}

inline v16sf v1log1pf_core(v16sf x)
{
	v16sf xarg = x;
	x = _mm512_add_ps(x, onef);
	v16sf c = _mm512_sub_ps(x, onef);
	c = _mm512_sub_ps(xarg, c);
	c = _mm512_div_ps(c, x);
	v16sf y = v1logf_core(x);
	return _mm512_add_ps(y, c);
}

#else

inline v8sf v1logf_core(v8sf x)
{
	v8si ix = _mm256_castps_si256(x);
#ifdef __AVX2__
	v8si k = _mm256_srli_epi32(ix, 23);
	k = _mm256_sub_epi32(k, log_offf);
	v8si iy = _mm256_and_si256(ix, log_mask2f);
	v8si iz = _mm256_add_epi32(iy, log_sq2offf);
	iz = _mm256_and_si256(iz, log_mask3f);
	v8si ie = _mm256_xor_si256(iz, log_mask4f);
	iy = _mm256_or_si256(iy, ie);
	iz = _mm256_srli_epi32(iz, 23);
	k = _mm256_add_epi32(k, iz);
#else
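	// without AVX2, the 256-bit integer operations are not available, so the reduction
	// is carried out separately on the two 128-bit halves with SSE2 instructions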
	v4si ixl = _mm256_extractf128_si256(ix, 0);
	v4si kl = _mm_srli_epi32(ixl, 23);
	kl = _mm_sub_epi32(kl, log_offf);
	v4si iyl = _mm_and_si128(ixl, log_mask2f);
	v4si iz = _mm_add_epi32(iyl, log_sq2offf);
	iz = _mm_and_si128(iz, log_mask3f);
	v4si ie = _mm_xor_si128(iz, log_mask4f);
	iyl = _mm_or_si128(iyl, ie);
	iz = _mm_srli_epi32(iz, 23);
	kl = _mm_add_epi32(kl, iz);
	v4si ixh = _mm256_extractf128_si256(ix, 1);
	v4si kh = _mm_srli_epi32(ixh, 23);
	kh = _mm_sub_epi32(kh, log_offf);
	v4si iyh = _mm_and_si128(ixh, log_mask2f);
	iz = _mm_add_epi32(iyh, log_sq2offf);
	iz = _mm_and_si128(iz, log_mask3f);
	ie = _mm_xor_si128(iz, log_mask4f);
	iyh = _mm_or_si128(iyh, ie);
	iz = _mm_srli_epi32(iz, 23);
	kh = _mm_add_epi32(kh, iz);
	v8si iy = _mm256_setzero_si256();
	iy = _mm256_insertf128_si256(iy, iyl, 0);
	iy = _mm256_insertf128_si256(iy, iyh, 1);
	v8si k = _mm256_setzero_si256();
	k = _mm256_insertf128_si256(k, kl, 0);
	k = _mm256_insertf128_si256(k, kh, 1);
#endif
	x = _mm256_castsi256_ps(iy);
	v8sf dk = _mm256_cvtepi32_ps(k);
	// f and s have opposite sign compared to openlibm!
	v8sf f = _mm256_sub_ps(onef, x);
	v8sf s = _mm256_sub_ps(twof, f);
	s = _mm256_div_ps(f, s);
	v8sf z = _mm256_mul_ps(s, s);
#ifdef __FMA__
	v8sf R = _mm256_fmadd_ps(log_lg4f, z, log_lg3f);
	R = _mm256_fmadd_ps(R, z, log_lg2f);
	R = _mm256_fmadd_ps(R, z, log_lg1f);
	R = _mm256_fmsub_ps(R, z, f);
	R = _mm256_fmsub_ps(R, s, f);
	v8sf y = _mm256_fmadd_ps(dk, ln_twof, R);
#else
	v8sf R = _mm256_mul_ps(log_lg4f, z);
	R = _mm256_add_ps(R, log_lg3f);
	R = _mm256_mul_ps(R, z);
	R = _mm256_add_ps(R, log_lg2f);
	R = _mm256_mul_ps(R, z);
	R = _mm256_add_ps(R, log_lg1f);
	R = _mm256_mul_ps(R, z);
	R = _mm256_sub_ps(R, f);
	R = _mm256_mul_ps(R, s);
	R = _mm256_sub_ps(R, f);
	v8sf R2 = _mm256_mul_ps(dk, ln_twof);
	v8sf y = _mm256_add_ps(R, R2);
#endif
	return y;
}

inline v8sf v1log1pf_core(v8sf x)
{
	v8sf xarg = x;
	x = _mm256_add_ps(x, onef);
	v8sf c = _mm256_sub_ps(x, onef);
	c = _mm256_sub_ps(xarg, c);
	c = _mm256_div_ps(c, x);
	v8sf y = v1logf_core(x);
	return _mm256_add_ps(y, c);
}

#endif // __AVX512F__

#endif // __AVX__

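// A minimal usage sketch (not part of Cloudy). It assumes a build without AVX512, so that
// v1logd_core operates on v4df, and that vectorize_math.h typedefs v4df as __m256d; the
// core routines do no argument checking, so the caller must supply finite, strictly
// positive values. The intended public interface is presumably the set of wrappers built
// on top of these core routines (see vectorize_log.h), not the core routines themselves.
//
//	double a[4] = { 0.5, 1., 2., 10. }, r[4];
//	v4df xv = _mm256_loadu_pd(a);   // load four strictly positive doubles
//	v4df yv = v1logd_core(xv);      // yv[i] approximates log(a[i])
//	_mm256_storeu_pd(r, yv);        // store the results back to memory
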
#endif