#ifndef VECTORIZE_LOG_CORE_H
#define VECTORIZE_LOG_CORE_H
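
// Vectorized cores for log() and log1p() in double (v8df/v4df) and single
// (v16sf/v8sf) precision, following the classic fdlibm/openlibm reduction:
// the argument is split into 2^k * m with m near 1, and log(m) is evaluated
// from a short polynomial in s*s, where s = (m-1)/(m+1).
// The vector types (v8df, v16sf, ...), the constant helper macros
// (VECLL_CONST, VECDI_CONST, ...) and the constants one, two, ln_two, onef,
// twof, ln_twof used below are assumed to be provided by the surrounding
// vectorization headers; they are not defined in this file.
// The integer constants that follow pick apart the IEEE-754 exponent and
// mantissa fields (0x3ff and 0x7f are the double and float exponent biases).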
#if defined(__AVX512F__) || defined(__AVX2__)

VECLL_CONST(log_off,0x3ff);
VECLL_CONST(log_mask2,0x000fffffffffffff);
VECLL_CONST(log_sq2off,0x95f6400000000);
VECLL_CONST(log_mask3,0x0010000000000000);
VECLL_CONST(log_mask4,0x3ff0000000000000);

#else

VECI_CONST(log_off,0x3ff);
VECI_CONST(log_mask2,0x000fffff);
VECI_CONST(log_sq2off,0x95f64);
VECI_CONST(log_mask3,0x00100000);
VECI_CONST(log_mask4,0x3ff00000);

#endif
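
// coefficients of the kernel polynomial for double precision log(); these are
// the openlibm/fdlibm Lg1..Lg7 minimax coefficients with the sign bit set,
// matching the sign convention of the evaluation below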
VECDI_CONST(log_lg1,0xbfe5555555555593);
VECDI_CONST(log_lg2,0xbfd999999997fa04);
VECDI_CONST(log_lg3,0xbfd2492494229359);
VECDI_CONST(log_lg4,0xbfcc71c51d8e78af);
VECDI_CONST(log_lg5,0xbfc7466496cb03de);
VECDI_CONST(log_lg6,0xbfc39a09d078c69f);
VECDI_CONST(log_lg7,0xbfc2f112df3e5244);
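
// v1logd_core() computes log(x) for strictly positive, finite, normalized x
// (the core never tests for special values; argument checking is assumed to
// happen in the caller). x is split into 2^k * m with the mantissa
// renormalized to m in roughly [sqrt(2)/2, sqrt(2)): the addition of
// log_sq2off carries into the exponent bit approximately when m would exceed
// sqrt(2), and k is bumped accordingly. Then with f = m - 1 and
// s = f/(2 + f) = (m - 1)/(m + 1), log(m) is approximated by f - s*(f - R(z))
// with R a polynomial in z = s*s, and log(x) = k*ln(2) + log(m).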
#ifdef __AVX512F__

inline v8df v1logd_core(v8df x)
{
    v8di ix = _mm512_castpd_si512(x);
    v8di k = _mm512_srli_epi64(ix, 52);
    k = _mm512_sub_epi64(k, log_off);
    v8di iy = _mm512_and_epi64(ix, log_mask2);
    v8di iz = _mm512_add_epi64(iy, log_sq2off);
    iz = _mm512_and_epi64(iz, log_mask3);
    v8di ie = _mm512_xor_epi64(iz, log_mask4);
    iy = _mm512_or_epi64(iy, ie);
    iz = _mm512_srli_epi64(iz, 52);
    k = _mm512_add_epi64(k, iz);
    v8df f = _mm512_castsi512_pd(iy);
    f = _mm512_sub_pd(f, one);
    v8df s = _mm512_add_pd(f, two);
    s = _mm512_div_pd(f, s);
#ifdef __AVX512DQ__
    // the direct signed 64-bit integer to double conversion requires AVX512DQ
    v8df dk = _mm512_cvtepi64_pd(k);
#else
    // otherwise narrow the (small) exponents to 32 bits first
    v8si k2 = _mm512_cvtepi64_epi32(k);
    v8df dk = _mm512_cvtepi32_pd(k2);
#endif
    v8df z = _mm512_mul_pd(s, s);
    v8df R = _mm512_fmadd_pd(log_lg7, z, log_lg6);
    R = _mm512_fmadd_pd(R, z, log_lg5);
    R = _mm512_fmadd_pd(R, z, log_lg4);
    R = _mm512_fmadd_pd(R, z, log_lg3);
    R = _mm512_fmadd_pd(R, z, log_lg2);
    R = _mm512_fmadd_pd(R, z, log_lg1);
    R = _mm512_fmadd_pd(R, z, f);
    R = _mm512_fnmadd_pd(R, s, f);
    return _mm512_fmadd_pd(dk, ln_two, R);
}
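
// compensated evaluation of log1p(x): form w = 1 + x, recover the rounding
// error c = (x - (w - 1))/w made when forming w, and return log(w) + c
// (a first order correction, since d/dw log(w) = 1/w)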
inline v8df v1log1pd_core(v8df x)
{
    v8df xarg = x;    // keep the original argument for the correction term
    x = _mm512_add_pd(x, one);
    v8df c = _mm512_sub_pd(x, one);
    c = _mm512_sub_pd(xarg, c);
    c = _mm512_div_pd(c, x);
    v8df y = v1logd_core(x);
    return _mm512_add_pd(y, c);
}

#else
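
// AVX/AVX2 double precision core: with AVX2 the bit manipulation is done with
// 256-bit 64-bit-integer instructions and the four exponents are compressed
// into one 128-bit lane for the int32 -> double conversion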
inline v4df v1logd_core(v4df x)
{
#ifdef __AVX2__
    v4di ix = _mm256_castpd_si256(x);
    v4di k = _mm256_srli_epi64(ix, 52);
    k = _mm256_sub_epi64(k, log_off);
    v4di iy = _mm256_and_si256(ix, log_mask2);
    v4di iz = _mm256_add_epi64(iy, log_sq2off);
    iz = _mm256_and_si256(iz, log_mask3);
    v4di ie = _mm256_xor_si256(iz, log_mask4);
    iy = _mm256_or_si256(iy, ie);
    iz = _mm256_srli_epi64(iz, 52);
    k = _mm256_add_epi64(k, iz);
    v4di kp = _mm256_permutevar8x32_epi32(k, _mm256_set_epi32(7,5,3,1,6,4,2,0) );
    v4si kk = _mm256_extractf128_si256(kp, 0);
    x = _mm256_castsi256_pd(iy);
    v4df dk = _mm256_cvtepi32_pd(kk);
#else
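    // without AVX2 there is no 256-bit integer arithmetic: split each double
    // into its high and low 32-bit words, manipulate the high words with SSE,
    // and interleave the words back into a 256-bit register afterwards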
    v8sf xs = _mm256_castpd_ps(x);
    xs = _mm256_permute_ps(xs, _MM_SHUFFLE(3, 1, 2, 0));
    v4sf xls = _mm256_extractf128_ps(xs, 0);
    v4si xl = _mm_castps_si128(xls);
    v4sf xhs = _mm256_extractf128_ps(xs, 1);
    v4si xh = _mm_castps_si128(xhs);
    v4si hx = _mm_unpackhi_epi64(xl, xh);
    v4si lx = _mm_unpacklo_epi64(xl, xh);
    v4si k = _mm_srli_epi32(hx, 20);
    k = _mm_sub_epi32(k, log_off);
    hx = _mm_and_si128(hx, log_mask2);
    v4si i = _mm_add_epi32(hx, log_sq2off);
    i = _mm_and_si128(i, log_mask3);
    v4si ii = _mm_xor_si128(i, log_mask4);
    hx = _mm_or_si128(hx, ii);
    v8si xi = _mm256_setzero_si256();
    xl = _mm_unpacklo_epi32(lx,hx);
    xi = _mm256_insertf128_si256(xi, xl, 0);
    xh = _mm_unpackhi_epi32(lx,hx);
    xi = _mm256_insertf128_si256(xi, xh, 1);
    x = _mm256_castsi256_pd(xi);
    i = _mm_srli_epi32(i, 20);
    k = _mm_add_epi32(k, i);
    v4df dk = _mm256_cvtepi32_pd(k);
#endif
    v4df f = _mm256_sub_pd(one, x);
    v4df s = _mm256_sub_pd(two, f);
    s = _mm256_div_pd(f, s);
    v4df z = _mm256_mul_pd(s, s);
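    // the polynomial and the final combination k*ln(2) + log(m) are written
    // twice: with fused multiply-adds when FMA is available, and with separate
    // multiplies and adds otherwise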
#ifdef __FMA__
    v4df R = _mm256_fmadd_pd(log_lg7, z, log_lg6);
    R = _mm256_fmadd_pd(R, z, log_lg5);
    R = _mm256_fmadd_pd(R, z, log_lg4);
    R = _mm256_fmadd_pd(R, z, log_lg3);
    R = _mm256_fmadd_pd(R, z, log_lg2);
    R = _mm256_fmadd_pd(R, z, log_lg1);
    R = _mm256_fmsub_pd(R, z, f);
    R = _mm256_fmsub_pd(R, s, f);
    v4df y = _mm256_fmadd_pd(dk, ln_two, R);
#else
    v4df R = _mm256_mul_pd(log_lg7, z);
    R = _mm256_add_pd(R, log_lg6);
    R = _mm256_mul_pd(R, z);
    R = _mm256_add_pd(R, log_lg5);
    R = _mm256_mul_pd(R, z);
    R = _mm256_add_pd(R, log_lg4);
    R = _mm256_mul_pd(R, z);
    R = _mm256_add_pd(R, log_lg3);
    R = _mm256_mul_pd(R, z);
    R = _mm256_add_pd(R, log_lg2);
    R = _mm256_mul_pd(R, z);
    R = _mm256_add_pd(R, log_lg1);
    R = _mm256_mul_pd(R, z);
    R = _mm256_sub_pd(R, f);
    R = _mm256_mul_pd(R, s);
    R = _mm256_sub_pd(R, f);
    v4df R2 = _mm256_mul_pd(dk, ln_two);
    v4df y = _mm256_add_pd(R, R2);
#endif
    return y;
}
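
// compensated log1p(), same construction as the AVX-512 variant above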
inline v4df v1log1pd_core(v4df x)
{
    v4df xarg = x;    // keep the original argument for the correction term
    x = _mm256_add_pd(x, one);
    v4df c = _mm256_sub_pd(x, one);
    c = _mm256_sub_pd(xarg, c);
    c = _mm256_div_pd(c, x);
    v4df y = v1logd_core(x);
    return _mm256_add_pd(y, c);
}

#endif // __AVX512F__

#if defined(__AVX512F__) || defined(__AVX2__)

VECII_CONST(log_offf,0x7f);
VECII_CONST(log_mask2f,0x007fffff);
VECII_CONST(log_sq2offf,(0x95f64<<3));
VECII_CONST(log_mask3f,0x00800000);
VECII_CONST(log_mask4f,0x3f800000);

#else

VECI_CONST(log_offf,0x7f);
VECI_CONST(log_mask2f,0x007fffff);
VECI_CONST(log_sq2offf,(0x95f64<<3));
VECI_CONST(log_mask3f,0x00800000);
VECI_CONST(log_mask4f,0x3f800000);

#endif
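
// coefficients of the kernel polynomial for single precision log(); these are
// the openlibm/fdlibm single precision Lg1..Lg4 coefficients with the sign bit
// set, matching the sign convention of the evaluation below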
VECFI_CONST(log_lg1f,0xbf2aaaaa);
VECFI_CONST(log_lg2f,0xbeccce13);
VECFI_CONST(log_lg3f,0xbe91e9ee);
VECFI_CONST(log_lg4f,0xbe789e26);
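
// single precision cores: the same argument reduction as above, but only a
// four-term polynomial in z = s*s is used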
#ifdef __AVX512F__

inline v16sf v1logf_core(v16sf x)
{
    v16si ix = _mm512_castps_si512(x);
    v16si k = _mm512_srli_epi32(ix, 23);
    k = _mm512_sub_epi32(k, log_offf);
    v16si iy = _mm512_and_epi32(ix, log_mask2f);
    v16si iz = _mm512_add_epi32(iy, log_sq2offf);
    iz = _mm512_and_epi32(iz, log_mask3f);
    v16si ie = _mm512_xor_epi32(iz, log_mask4f);
    iy = _mm512_or_epi32(iy, ie);
    iz = _mm512_srli_epi32(iz, 23);
    k = _mm512_add_epi32(k, iz);
    v16sf f = _mm512_castsi512_ps(iy);
    f = _mm512_sub_ps(f, onef);
    v16sf s = _mm512_add_ps(f, twof);
    s = _mm512_div_ps(f, s);
    v16sf dk = _mm512_cvtepi32_ps(k);
    v16sf z = _mm512_mul_ps(s, s);
    v16sf R = _mm512_fmadd_ps(log_lg4f, z, log_lg3f);
    R = _mm512_fmadd_ps(R, z, log_lg2f);
    R = _mm512_fmadd_ps(R, z, log_lg1f);
    R = _mm512_fmadd_ps(R, z, f);
    R = _mm512_fnmadd_ps(R, s, f);
    return _mm512_fmadd_ps(dk, ln_twof, R);
}

inline v16sf v1log1pf_core(v16sf x)
{
    v16sf xarg = x;    // keep the original argument for the correction term
    x = _mm512_add_ps(x, onef);
    v16sf c = _mm512_sub_ps(x, onef);
    c = _mm512_sub_ps(xarg, c);
    c = _mm512_div_ps(c, x);
    v16sf y = v1logf_core(x);
    return _mm512_add_ps(y, c);
}

#else
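
// AVX/AVX2 single precision core: with AVX2 the field extraction uses 256-bit
// integer instructions; otherwise each 128-bit half is processed with SSE and
// the two halves are reassembled afterwards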
inline v8sf v1logf_core(v8sf x)
{
    v8si ix = _mm256_castps_si256(x);
#ifdef __AVX2__
    v8si k = _mm256_srli_epi32(ix, 23);
    k = _mm256_sub_epi32(k, log_offf);
    v8si iy = _mm256_and_si256(ix, log_mask2f);
    v8si iz = _mm256_add_epi32(iy, log_sq2offf);
    iz = _mm256_and_si256(iz, log_mask3f);
    v8si ie = _mm256_xor_si256(iz, log_mask4f);
    iy = _mm256_or_si256(iy, ie);
    iz = _mm256_srli_epi32(iz, 23);
    k = _mm256_add_epi32(k, iz);
#else
    v4si ixl = _mm256_extractf128_si256(ix, 0);
    v4si kl = _mm_srli_epi32(ixl, 23);
    kl = _mm_sub_epi32(kl, log_offf);
    v4si iyl = _mm_and_si128(ixl, log_mask2f);
    v4si iz = _mm_add_epi32(iyl, log_sq2offf);
    iz = _mm_and_si128(iz, log_mask3f);
    v4si ie = _mm_xor_si128(iz, log_mask4f);
    iyl = _mm_or_si128(iyl, ie);
    iz = _mm_srli_epi32(iz, 23);
    kl = _mm_add_epi32(kl, iz);
    v4si ixh = _mm256_extractf128_si256(ix, 1);
    v4si kh = _mm_srli_epi32(ixh, 23);
    kh = _mm_sub_epi32(kh, log_offf);
    v4si iyh = _mm_and_si128(ixh, log_mask2f);
    iz = _mm_add_epi32(iyh, log_sq2offf);
    iz = _mm_and_si128(iz, log_mask3f);
    ie = _mm_xor_si128(iz, log_mask4f);
    iyh = _mm_or_si128(iyh, ie);
    iz = _mm_srli_epi32(iz, 23);
    kh = _mm_add_epi32(kh, iz);
    v8si iy = _mm256_setzero_si256();
    iy = _mm256_insertf128_si256(iy, iyl, 0);
    iy = _mm256_insertf128_si256(iy, iyh, 1);
    v8si k = _mm256_setzero_si256();
    k = _mm256_insertf128_si256(k, kl, 0);
    k = _mm256_insertf128_si256(k, kh, 1);
#endif
    x = _mm256_castsi256_ps(iy);
    v8sf dk = _mm256_cvtepi32_ps(k);

    v8sf f = _mm256_sub_ps(onef, x);
    v8sf s = _mm256_sub_ps(twof, f);
    s = _mm256_div_ps(f, s);
    v8sf z = _mm256_mul_ps(s, s);
#ifdef __FMA__
    v8sf R = _mm256_fmadd_ps(log_lg4f, z, log_lg3f);
    R = _mm256_fmadd_ps(R, z, log_lg2f);
    R = _mm256_fmadd_ps(R, z, log_lg1f);
    R = _mm256_fmsub_ps(R, z, f);
    R = _mm256_fmsub_ps(R, s, f);
    v8sf y = _mm256_fmadd_ps(dk, ln_twof, R);
#else
    v8sf R = _mm256_mul_ps(log_lg4f, z);
    R = _mm256_add_ps(R, log_lg3f);
    R = _mm256_mul_ps(R, z);
    R = _mm256_add_ps(R, log_lg2f);
    R = _mm256_mul_ps(R, z);
    R = _mm256_add_ps(R, log_lg1f);
    R = _mm256_mul_ps(R, z);
    R = _mm256_sub_ps(R, f);
    R = _mm256_mul_ps(R, s);
    R = _mm256_sub_ps(R, f);
    v8sf R2 = _mm256_mul_ps(dk, ln_twof);
    v8sf y = _mm256_add_ps(R, R2);
#endif
    return y;
}

inline v8sf v1log1pf_core(v8sf x)
{
    v8sf xarg = x;    // keep the original argument for the correction term
    x = _mm256_add_ps(x, onef);
    v8sf c = _mm256_sub_ps(x, onef);
    c = _mm256_sub_ps(xarg, c);
    c = _mm256_div_ps(c, x);
    v8sf y = v1logf_core(x);
    return _mm256_add_ps(y, c);
}

#endif // __AVX512F__
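
// A minimal usage sketch (not part of the original header), assuming the
// caller has already screened out non-positive, non-finite and denormal
// arguments, since the cores above never test for special values:
//
//   #ifdef __AVX512F__
//   v8df xv = _mm512_loadu_pd(p);   // p points at 8 doubles with p[i] > 0
//   v8df yv = v1logd_core(xv);      // yv[i] = log(p[i])
//   _mm512_storeu_pd(q, yv);
//   #endif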