cloudy  trunk
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
vectorize.h
Go to the documentation of this file.
1 /* This file is part of Cloudy and is copyright (C)1978-2022 by Gary J. Ferland and
2  * others. For conditions of distribution and use see copyright notice in license.txt */
3 
4 #ifndef VECTORIZE_H
5 #define VECTORIZE_H
6 
7 #include "vectorize_reduce.h"
8 #include "vectorize_exp.h"
9 #include "vectorize_log.h"
10 #include "vectorize_sqrt.h"
11 #include "vectorize_hyper.h"
12 
13 //
14 // The t_avx_pool class maintains a pool of scratch arrays that can be used by the vectorization
15 // routines. The memory is correctly aligned on an AVX boundary.
16 //
17 // To use the pool, declare a pointer variable as follows
18 //
19 // #include "vectorize.h"
20 // avx_ptr<realnum> a(n), b(nlo, nhi), c(nlo, nhi);
21 //
22 // You can index the resulting pointers as you would a normal array
23 //
24 // realnum c = a[0]*b[2];
25 //
26 // Indexing the arrays is valid for the following indices:
27 //
28 // a[i] : for 0 <= i < n
29 // b[i] : for nlo <= i < nhi
30 //
31 // In the case of the array b[], the element b[nlo] will be correctly aligned on an AVX boundary. To
32 // pass the scratch arrays to the vectorization routine, use the following:
33 //
34 // vexp( b.ptr0(), c.ptr0(), nlo, nhi );
35 //
36 // When the avx_ptr variable goes out of scope, the scratch array will be released automatically.
37 // An avx_ptr supports bounds checking when the BOUNDS_CHECK macro is defined at compile time
38 // (similar to multi_arr and flex_arr).
39 //
40 // The t_avx_pool class maintains a minimum pool of p_min_size arrays. The size of the pool can grow
41 // when needed. When a scratch array is released with an index number >= p_min_size, the memory will
42 // be freed immediately to conserve memory.
43 //
44 // The default scratch array can hold p_def_size doubles. In the default setup that should be larger
45 // than rfield.nflux_with_check for efficiency since an array of that size is plausible to be
46 // requested. If the frequency mesh is larger (e.g. due to the SET CONTINUUM RESOLUTION command) the
47 // following optimization will be used. The first time an array larger than p_def_size doubles is
48 // requested, it will have to allocated. When that array is subsequently released, it is swapped
49 // with an unused smaller array with index < p_min_size, and then the smaller array is freed. This
50 // gives near optimal performance even for large frequency grids, at the expense of a slight
51 // overhead when releasing the scratch array.
52 //
53 
54 #ifdef _MSC_VER
55 // posix_memalign not defined on windows
56 inline int posix_memalign(void **p, size_t a, size_t s)
57 {
58  *p = _aligned_malloc(s, a);
59  return ( *p == NULL ) ? errno : 0;
60 }
61 
62 inline void posix_memalign_free(void *p)
63 {
64  _aligned_free(p);
65 }
66 #else
67 inline void posix_memalign_free(void *p)
68 {
69  free(p);
70 }
71 #endif
72 
74 {
75  vector<void*> p_ptr;
76  vector<size_t> p_size;
77  vector<bool> p_used;
78 
79  // for efficiency this number should be larger than rfield.nflux_with_check
80  // in the default setup -- adjust this constant if that is not the case.
81  static const size_t p_def_size = 8500*sizeof(double);
82  static const size_t p_min_size = 30;
83 
84  void p_alloc(size_t sz, bool lgUsed)
85  {
86  void *p_ptr_alloc;
87  if( posix_memalign(&p_ptr_alloc, CD_ALIGN, sz) != 0 )
88  throw bad_alloc();
89  p_ptr.push_back(p_ptr_alloc);
90  p_size.push_back(sz);
91  p_used.push_back(lgUsed);
92  }
93 
94 public:
96  {
97  for( size_t i=0; i < p_min_size; ++i )
98  p_alloc(p_def_size,false);
99  }
101  {
102  for( size_t i=0; i < p_ptr.size(); ++i )
104  }
105  void* avx_alloc(size_t sz)
106  {
107  void* p_ptr_alloc = NULL;
108  for( size_t i=0; i < p_ptr.size(); i++ )
109  {
110  if( !p_used[i] && sz <= p_size[i] )
111  {
112  p_ptr_alloc = p_ptr[i];
113  p_used[i] = true;
114  break;
115  }
116  }
117  if( p_ptr_alloc == NULL )
118  {
119  p_alloc(sz,true);
120  p_ptr_alloc = p_ptr.back();
121  }
122  return p_ptr_alloc;
123  }
124  void avx_free(void* p_ptr_alloc)
125  {
126  size_t i;
127  for( i=0; i < p_ptr.size(); i++ )
128  {
129  if( p_ptr[i] == p_ptr_alloc )
130  {
131  if( i >= p_min_size )
132  {
133  //
134  size_t j = p_min_size;
135  for( j=0; j < p_min_size; ++j )
136  {
137  if( !p_used[j] && p_size[j] < p_size[i] )
138  break;
139  }
140  if( j < p_min_size )
141  {
143  p_ptr[j] = p_ptr[i];
144  p_size[j] = p_size[i];
145  }
146  else
147  {
149  }
150  p_ptr[i] = NULL;
151  p_size[i] = 0;
152  }
153  p_used[i] = false;
154  break;
155  }
156  }
157  // clean up unused entries at the back
158  while( p_ptr.back() == NULL )
159  {
160  p_ptr.pop_back();
161  p_size.pop_back();
162  p_used.pop_back();
163  }
164  }
165 };
166 
167 extern t_avx_pool avx_pool;
168 
169 template<class T, bool lgBC=lgBOUNDSCHECKVAL>
170 class avx_ptr
171 {
172  long p_begin;
173  long p_end;
175  T* p_ptr;
176 
177  // make default constructor private since it shouldn't be used
179  {
180  p_ptr = NULL;
181  p_ptr_alloc = NULL;
182  p_begin = 0;
183  p_end = 0;
184  }
185  void p_alloc(long begin, long end)
186  {
187  size_t sz = size_t(max(end-begin,0)*sizeof(T));
188  if( sz == 0 )
189  {
190  p_ptr_alloc = NULL;
191  p_ptr = NULL;
192  p_begin = 0;
193  p_end = 0;
194  }
195  else
196  {
197  p_ptr_alloc = static_cast<T*>(avx_pool.avx_alloc(sz));
198  p_ptr = p_ptr_alloc - begin;
199  p_begin = begin;
200  p_end = end;
201  }
202  }
203 public:
204  typedef T& reference;
205  typedef const T& const_reference;
206 
207  explicit avx_ptr(long size)
208  {
209  p_alloc(0, size);
210  }
211  avx_ptr(long begin, long end)
212  {
213  p_alloc(begin, end);
214  }
216  {
217  if( p_ptr_alloc != NULL )
218  avx_pool.avx_free(p_ptr_alloc);
219  }
221  {
222  if( lgBC && ( i < p_begin || i >= p_end ) )
223  OUT_OF_RANGE( "avx_ptr::operator[]" );
224  return *(p_ptr+i);
225  }
227  {
228  if( lgBC && ( i < p_begin || i >= p_end ) )
229  OUT_OF_RANGE( "avx_ptr::operator[]" );
230  return *(p_ptr+i);
231  }
232  T* data()
233  {
234  return p_ptr_alloc;
235  }
236  const T* data() const
237  {
238  return p_ptr_alloc;
239  }
240  T* ptr0()
241  {
242  return p_ptr;
243  }
244  const T* ptr0() const
245  {
246  return p_ptr;
247  }
248 };
249 
250 //
251 // The allocator_avx class can be used to get a vector with internal data aligned on an AVX boundary
252 //
253 // Use the class as follows:
254 //
255 // #include "vectorize.h"
256 // vector< realnum, allocator_avx<realnum> > arr;
257 //
258 // When C++11 is in effect, you can also use the following shorthand:
259 //
260 // vector_avx<realnum> arr;
261 //
262 
263 template<class T>
265 
266 template<>
267 class allocator_avx<void>
268 {
269 public:
270  typedef size_t size_type;
271  typedef ptrdiff_t difference_type;
272  typedef void* pointer;
273  typedef const void* const_pointer;
274  typedef void value_type;
275 
276  template<class U>
277  struct rebind { typedef allocator_avx<U> other; };
278 
279 #if __cplusplus >= 201103L
280  typedef true_type propagate_on_container_move_assignment;
281 #endif
282 };
283 
284 template<class T>
285 class allocator_avx : public allocator<T>
286 {
287 public:
288  typedef size_t size_type;
289  typedef ptrdiff_t difference_type;
290  typedef T* pointer;
291  typedef const T* const_pointer;
292  typedef T& reference;
293  typedef const T& const_reference;
294  typedef T value_type;
295 
296  template<class U>
297  struct rebind
298  {
300  };
301 
302 #if __cplusplus >= 201103L
303  typedef true_type propagate_on_container_move_assignment;
304 #endif
305 
306  allocator_avx() throw() {}
307 
308  allocator_avx(const allocator_avx& a) throw() : allocator<T>(a) {}
309 
310  template<class U>
311  allocator_avx(const allocator_avx<U>&) throw() {}
312 
313  ~allocator_avx() throw() {}
314 
316  {
317  void* p;
318  if( posix_memalign(&p, CD_ALIGN, n*sizeof(T)) != 0 )
319  throw bad_alloc();
320  return pointer(p);
321  }
322 
323  void deallocate(pointer p, size_type) throw()
324  {
326  }
327 };
328 
329 template<class T, class U>
330 inline bool operator== (const allocator_avx<T>&, const allocator_avx<U>&)
331 {
332  return true;
333 }
334 
335 template<class T>
336 inline bool operator== (const allocator_avx<T>&, const allocator_avx<T>&)
337 {
338  return true;
339 }
340 
341 template<class T, class U>
342 inline bool operator!= (const allocator_avx<T>&, const allocator_avx<U>&)
343 {
344  return false;
345 }
346 
347 template<class T>
348 inline bool operator!= (const allocator_avx<T>&, const allocator_avx<T>&)
349 {
350  return false;
351 }
352 
353 #if __cplusplus >= 201103L
354 
355 template<typename T>
356 using vector_avx = typename std::vector<T,allocator_avx<T>>;
357 
358 #endif
359 
360 #endif
void posix_memalign_free(void *p)
Definition: vectorize.h:67
T * p_ptr
Definition: vectorize.h:175
t_avx_pool()
Definition: vectorize.h:95
static const size_t p_min_size
Definition: vectorize.h:82
const T * const_pointer
Definition: vectorize.h:291
T * ptr0()
Definition: vectorize.h:240
void avx_free(void *p_ptr_alloc)
Definition: vectorize.h:124
ptrdiff_t difference_type
Definition: vectorize.h:289
allocator_avx< U > other
Definition: vectorize.h:277
allocator_avx(const allocator_avx< U > &)
Definition: vectorize.h:311
long p_end
Definition: vectorize.h:173
vector< size_t > p_size
Definition: vectorize.h:76
t_avx_pool avx_pool
Definition: vectorize.cpp:7
avx_ptr(long size)
Definition: vectorize.h:207
void p_alloc(size_t sz, bool lgUsed)
Definition: vectorize.h:84
ptrdiff_t difference_type
Definition: vectorize.h:271
const T * ptr0() const
Definition: vectorize.h:244
NORETURN void OUT_OF_RANGE(const char *str)
Definition: cddefines.h:643
bool operator!=(const count_ptr< T > &a, const count_ptr< T > &b)
Definition: count_ptr.h:113
void * avx_alloc(size_t sz)
Definition: vectorize.h:105
allocator_avx< U > other
Definition: vectorize.h:299
void deallocate(pointer p, size_type)
Definition: vectorize.h:323
avx_ptr()
Definition: vectorize.h:178
reference operator[](long i)
Definition: vectorize.h:220
T * data()
Definition: vectorize.h:232
long max(int a, long b)
Definition: cddefines.h:817
~avx_ptr()
Definition: vectorize.h:215
void p_alloc(long begin, long end)
Definition: vectorize.h:185
const T & const_reference
Definition: vectorize.h:205
bool operator==(const count_ptr< T > &a, const count_ptr< T > &b)
Definition: count_ptr.h:108
const void * const_pointer
Definition: vectorize.h:273
avx_ptr(long begin, long end)
Definition: vectorize.h:211
allocator_avx(const allocator_avx &a)
Definition: vectorize.h:308
const T & const_reference
Definition: vectorize.h:293
pointer allocate(size_type n, typename allocator_avx< void >::const_pointer=NULL)
Definition: vectorize.h:315
T & reference
Definition: vectorize.h:204
const T * data() const
Definition: vectorize.h:236
#define CD_ALIGN
Definition: cpu.h:156
vector< void * > p_ptr
Definition: vectorize.h:75
size_t size_type
Definition: vectorize.h:288
long p_begin
Definition: vectorize.h:172
vector< bool > p_used
Definition: vectorize.h:77
static const size_t p_def_size
Definition: vectorize.h:81
T * p_ptr_alloc
Definition: vectorize.h:174