Kokkos Core Kernels Package  Version of the Day
Kokkos_MemoryPool.hpp
1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 // Kokkos v. 3.0
6 // Copyright (2020) National Technology & Engineering
7 // Solutions of Sandia, LLC (NTESS).
8 //
9 // Under the terms of Contract DE-NA0003525 with NTESS,
10 // the U.S. Government retains certain rights in this software.
11 //
12 // Redistribution and use in source and binary forms, with or without
13 // modification, are permitted provided that the following conditions are
14 // met:
15 //
16 // 1. Redistributions of source code must retain the above copyright
17 // notice, this list of conditions and the following disclaimer.
18 //
19 // 2. Redistributions in binary form must reproduce the above copyright
20 // notice, this list of conditions and the following disclaimer in the
21 // documentation and/or other materials provided with the distribution.
22 //
23 // 3. Neither the name of the Corporation nor the names of the
24 // contributors may be used to endorse or promote products derived from
25 // this software without specific prior written permission.
26 //
27 // THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
28 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
31 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 //
39 // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
40 //
41 // ************************************************************************
42 //@HEADER
43 */
44 
45 #ifndef KOKKOS_MEMORYPOOL_HPP
46 #define KOKKOS_MEMORYPOOL_HPP
47 
48 #include <Kokkos_Core_fwd.hpp>
49 #include <Kokkos_Parallel.hpp>
50 #include <Kokkos_Atomic.hpp>
51 #include <impl/Kokkos_ConcurrentBitset.hpp>
52 #include <impl/Kokkos_Error.hpp>
53 #include <impl/Kokkos_SharedAlloc.hpp>
54 
55 #include <iostream>
56 
57 namespace Kokkos {
58 namespace Impl {
59 /* Report violation of size constraints:
60  * min_block_alloc_size <= max_block_alloc_size
61  * max_block_alloc_size <= min_superblock_size
62  * min_superblock_size <= max_superblock_size
63  * min_superblock_size <= min_total_alloc_size
64  * min_superblock_size <= min_block_alloc_size *
65  * max_block_per_superblock
66  */
67 void memory_pool_bounds_verification(size_t min_block_alloc_size,
68  size_t max_block_alloc_size,
69  size_t min_superblock_size,
70  size_t max_superblock_size,
71  size_t max_block_per_superblock,
72  size_t min_total_alloc_size);
73 } // namespace Impl
74 } // namespace Kokkos
75 
76 namespace Kokkos {
77 
78 namespace Impl {
79 
80 void _print_memory_pool_state(std::ostream &s, uint32_t const *sb_state_ptr,
81  int32_t sb_count, uint32_t sb_size_lg2,
82  uint32_t sb_state_size, uint32_t state_shift,
83  uint32_t state_used_mask);
84 
85 } // end namespace Impl
86 
87 template <typename DeviceType>
88 class MemoryPool {
89  private:
90  using CB = Kokkos::Impl::concurrent_bitset;
91 
92  enum : uint32_t { bits_per_int_lg2 = CB::bits_per_int_lg2 };
93  enum : uint32_t { state_shift = CB::state_shift };
94  enum : uint32_t { state_used_mask = CB::state_used_mask };
95  enum : uint32_t { state_header_mask = CB::state_header_mask };
96  enum : uint32_t { max_bit_count_lg2 = CB::max_bit_count_lg2 };
97  enum : uint32_t { max_bit_count = CB::max_bit_count };
98 
99  enum : uint32_t { HINT_PER_BLOCK_SIZE = 2 };
100 
101  /* Each superblock has a concurrent bitset state
102  * which is an array of uint32_t integers.
103  * [ { block_count_lg2 : state_shift bits
104  * , used_block_count : ( 32 - state_shift ) bits
105  * }
106  * , { block allocation bit set }* ]
107  *
108  * As superblocks are assigned (allocated) to a block size
109  * and released (deallocated) back to empty the superblock state
110  * is concurrently updated.
111  */
112 
113  /* Mapping between block_size <-> block_state
114  *
115  * block_state = ( m_sb_size_lg2 - block_size_lg2 ) << state_shift
116  * block_size = m_sb_size_lg2 - ( block_state >> state_shift )
117  *
118  * Thus A_block_size < B_block_size <=> A_block_state > B_block_state
119  */
120 
121  using base_memory_space = typename DeviceType::memory_space;
122 
123  enum {
125  base_memory_space>::accessible
126  };
127 
128  using Tracker = Kokkos::Impl::SharedAllocationTracker;
129  using Record = Kokkos::Impl::SharedAllocationRecord<base_memory_space>;
130 
131  Tracker m_tracker;
132  uint32_t *m_sb_state_array;
133  uint32_t m_sb_state_size;
134  uint32_t m_sb_size_lg2;
135  uint32_t m_max_block_size_lg2;
136  uint32_t m_min_block_size_lg2;
137  int32_t m_sb_count;
138  int32_t m_hint_offset; // Offset to K * #block_size array of hints
139  int32_t m_data_offset; // Offset to 0th superblock data
140  int32_t m_unused_padding;
141 
142  public:
143  using memory_space = typename DeviceType::memory_space;
144 
146  enum : uint32_t { max_superblock_size = 1LU << 31 /* 2 gigabytes */ };
147  enum : uint32_t { max_block_per_superblock = max_bit_count };
148 
149  //--------------------------------------------------------------------------
150 
151  KOKKOS_INLINE_FUNCTION
152  bool operator==(MemoryPool const &other) const {
153  return m_sb_state_array == other.m_sb_state_array;
154  }
155 
156  KOKKOS_INLINE_FUNCTION
157  size_t capacity() const noexcept {
158  return size_t(m_sb_count) << m_sb_size_lg2;
159  }
160 
161  KOKKOS_INLINE_FUNCTION
162  size_t min_block_size() const noexcept {
163  return (1LU << m_min_block_size_lg2);
164  }
165 
166  KOKKOS_INLINE_FUNCTION
167  size_t max_block_size() const noexcept {
168  return (1LU << m_max_block_size_lg2);
169  }
170 
171  struct usage_statistics {
172  size_t capacity_bytes;
173  size_t superblock_bytes;
174  size_t max_block_bytes;
175  size_t min_block_bytes;
176  size_t capacity_superblocks;
177  size_t consumed_superblocks;
178  size_t consumed_blocks;
179  size_t consumed_bytes;
180  size_t reserved_blocks;
181  size_t reserved_bytes;
182  };
183 
184  void get_usage_statistics(usage_statistics &stats) const {
185  Kokkos::HostSpace host;
186 
187  const size_t alloc_size = m_hint_offset * sizeof(uint32_t);
188 
189  uint32_t *const sb_state_array =
190  accessible ? m_sb_state_array : (uint32_t *)host.allocate(alloc_size);
191 
192  if (!accessible) {
193  Kokkos::Impl::DeepCopy<Kokkos::HostSpace, base_memory_space>(
194  sb_state_array, m_sb_state_array, alloc_size);
195  }
196 
197  stats.superblock_bytes = (1LU << m_sb_size_lg2);
198  stats.max_block_bytes = (1LU << m_max_block_size_lg2);
199  stats.min_block_bytes = (1LU << m_min_block_size_lg2);
200  stats.capacity_bytes = stats.superblock_bytes * m_sb_count;
201  stats.capacity_superblocks = m_sb_count;
202  stats.consumed_superblocks = 0;
203  stats.consumed_blocks = 0;
204  stats.consumed_bytes = 0;
205  stats.reserved_blocks = 0;
206  stats.reserved_bytes = 0;
207 
208  const uint32_t *sb_state_ptr = sb_state_array;
209 
210  for (int32_t i = 0; i < m_sb_count; ++i, sb_state_ptr += m_sb_state_size) {
211  const uint32_t block_count_lg2 = (*sb_state_ptr) >> state_shift;
212 
213  if (block_count_lg2) {
214  const uint32_t block_count = 1u << block_count_lg2;
215  const uint32_t block_size_lg2 = m_sb_size_lg2 - block_count_lg2;
216  const uint32_t block_size = 1u << block_size_lg2;
217  const uint32_t block_used = (*sb_state_ptr) & state_used_mask;
218 
219  stats.consumed_superblocks++;
220  stats.consumed_blocks += block_used;
221  stats.consumed_bytes += block_used * block_size;
222  stats.reserved_blocks += block_count - block_used;
223  stats.reserved_bytes += (block_count - block_used) * block_size;
224  }
225  }
226 
227  if (!accessible) {
228  host.deallocate(sb_state_array, alloc_size);
229  }
230  }
231 
232  void print_state(std::ostream &s) const {
233  Kokkos::HostSpace host;
234 
235  const size_t alloc_size = m_hint_offset * sizeof(uint32_t);
236 
237  uint32_t *const sb_state_array =
238  accessible ? m_sb_state_array : (uint32_t *)host.allocate(alloc_size);
239 
240  if (!accessible) {
241  Kokkos::Impl::DeepCopy<Kokkos::HostSpace, base_memory_space>(
242  sb_state_array, m_sb_state_array, alloc_size);
243  }
244 
245  Impl::_print_memory_pool_state(s, sb_state_array, m_sb_count, m_sb_size_lg2,
246  m_sb_state_size, state_shift,
247  state_used_mask);
248 
249  if (!accessible) {
250  host.deallocate(sb_state_array, alloc_size);
251  }
252  }
253 
254  //--------------------------------------------------------------------------
255 
256  KOKKOS_DEFAULTED_FUNCTION MemoryPool(MemoryPool &&) = default;
257  KOKKOS_DEFAULTED_FUNCTION MemoryPool(const MemoryPool &) = default;
258  KOKKOS_DEFAULTED_FUNCTION MemoryPool &operator=(MemoryPool &&) = default;
259  KOKKOS_DEFAULTED_FUNCTION MemoryPool &operator=(const MemoryPool &) = default;
260 
261  KOKKOS_INLINE_FUNCTION MemoryPool()
262  : m_tracker(),
263  m_sb_state_array(nullptr),
264  m_sb_state_size(0),
265  m_sb_size_lg2(0),
266  m_max_block_size_lg2(0),
267  m_min_block_size_lg2(0),
268  m_sb_count(0),
269  m_hint_offset(0),
270  m_data_offset(0),
271  m_unused_padding(0) {}
272 
287  MemoryPool(const base_memory_space &memspace,
288  const size_t min_total_alloc_size, size_t min_block_alloc_size = 0,
289  size_t max_block_alloc_size = 0, size_t min_superblock_size = 0)
290  : m_tracker(),
291  m_sb_state_array(nullptr),
292  m_sb_state_size(0),
293  m_sb_size_lg2(0),
294  m_max_block_size_lg2(0),
295  m_min_block_size_lg2(0),
296  m_sb_count(0),
297  m_hint_offset(0),
298  m_data_offset(0),
299  m_unused_padding(0) {
300  const uint32_t int_align_lg2 = 3; /* align as int[8] */
301  const uint32_t int_align_mask = (1u << int_align_lg2) - 1;
302  const uint32_t default_min_block_size = 1u << 6; /* 64 bytes */
303  const uint32_t default_max_block_size = 1u << 12; /* 4k bytes */
304  const uint32_t default_min_superblock_size = 1u << 20; /* 1M bytes */
305 
306  //--------------------------------------------------
307  // Default block and superblock sizes:
308 
309  if (0 == min_block_alloc_size) {
310  // Default all sizes:
311 
312  min_superblock_size =
313  std::min(size_t(default_min_superblock_size), min_total_alloc_size);
314 
315  min_block_alloc_size =
316  std::min(size_t(default_min_block_size), min_superblock_size);
317 
318  max_block_alloc_size =
319  std::min(size_t(default_max_block_size), min_superblock_size);
320  } else if (0 == min_superblock_size) {
321  // Choose superblock size as minimum of:
322  // max_block_per_superblock * min_block_size
323  // max_superblock_size
324  // min_total_alloc_size
325 
326  const size_t max_superblock =
327  min_block_alloc_size * max_block_per_superblock;
328 
329  min_superblock_size =
330  std::min(max_superblock,
331  std::min(size_t(max_superblock_size), min_total_alloc_size));
332  }
333 
334  if (0 == max_block_alloc_size) {
335  max_block_alloc_size = min_superblock_size;
336  }
337 
338  //--------------------------------------------------
339 
340  /* Enforce size constraints:
341  * min_block_alloc_size <= max_block_alloc_size
342  * max_block_alloc_size <= min_superblock_size
343  * min_superblock_size <= max_superblock_size
344  * min_superblock_size <= min_total_alloc_size
345  * min_superblock_size <= min_block_alloc_size *
346  * max_block_per_superblock
347  */
348 
349  Kokkos::Impl::memory_pool_bounds_verification(
350  min_block_alloc_size, max_block_alloc_size, min_superblock_size,
351  max_superblock_size, max_block_per_superblock, min_total_alloc_size);
352 
353  //--------------------------------------------------
354  // Block and superblock size is power of two:
355  // Maximum value is 'max_superblock_size'
356 
357  m_min_block_size_lg2 =
358  Kokkos::Impl::integral_power_of_two_that_contains(min_block_alloc_size);
359 
360  m_max_block_size_lg2 =
361  Kokkos::Impl::integral_power_of_two_that_contains(max_block_alloc_size);
362 
363  m_sb_size_lg2 =
364  Kokkos::Impl::integral_power_of_two_that_contains(min_superblock_size);
365 
366  {
367  // number of superblocks is multiple of superblock size that
368  // can hold min_total_alloc_size.
369 
370  const uint64_t sb_size_mask = (1LU << m_sb_size_lg2) - 1;
371 
372  m_sb_count = (min_total_alloc_size + sb_size_mask) >> m_sb_size_lg2;
373  }
374 
375  {
376  // Any superblock can be assigned to the smallest size block
377  // Size the block bitset to maximum number of blocks
378 
379  const uint32_t max_block_count_lg2 = m_sb_size_lg2 - m_min_block_size_lg2;
380 
381  m_sb_state_size =
382  (CB::buffer_bound_lg2(max_block_count_lg2) + int_align_mask) &
383  ~int_align_mask;
384  }
385 
386  // Array of all superblock states
387 
388  const size_t all_sb_state_size =
389  (m_sb_count * m_sb_state_size + int_align_mask) & ~int_align_mask;
390 
391  // Number of block sizes
392 
393  const int32_t number_block_sizes =
394  1 + m_max_block_size_lg2 - m_min_block_size_lg2;
395 
396  // Array length for possible block sizes
397  // Hint array is one uint32_t per block size
398 
399  const int32_t block_size_array_size =
400  (number_block_sizes + int_align_mask) & ~int_align_mask;
401 
402  m_hint_offset = all_sb_state_size;
403  m_data_offset = m_hint_offset + block_size_array_size * HINT_PER_BLOCK_SIZE;
404 
405  // Allocation:
406 
407  const size_t header_size = m_data_offset * sizeof(uint32_t);
408  const size_t alloc_size =
409  header_size + (size_t(m_sb_count) << m_sb_size_lg2);
410 
411  Record *rec = Record::allocate(memspace, "Kokkos::MemoryPool", alloc_size);
412 
413  m_tracker.assign_allocated_record_to_uninitialized(rec);
414 
415  m_sb_state_array = (uint32_t *)rec->data();
416 
417  Kokkos::HostSpace host;
418 
419  uint32_t *const sb_state_array =
420  accessible ? m_sb_state_array : (uint32_t *)host.allocate(header_size);
421 
422  for (int32_t i = 0; i < m_data_offset; ++i) sb_state_array[i] = 0;
423 
424  // Initial assignment of empty superblocks to block sizes:
425 
426  for (int32_t i = 0; i < number_block_sizes; ++i) {
427  const uint32_t block_size_lg2 = i + m_min_block_size_lg2;
428  const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2;
429  const uint32_t block_state = block_count_lg2 << state_shift;
430  const uint32_t hint_begin = m_hint_offset + i * HINT_PER_BLOCK_SIZE;
431 
432  // for block size index 'i':
433  // sb_id_hint = sb_state_array[ hint_begin ];
434  // sb_id_begin = sb_state_array[ hint_begin + 1 ];
435 
436  const int32_t jbeg = (i * m_sb_count) / number_block_sizes;
437  const int32_t jend = ((i + 1) * m_sb_count) / number_block_sizes;
438 
439  sb_state_array[hint_begin] = uint32_t(jbeg);
440  sb_state_array[hint_begin + 1] = uint32_t(jbeg);
441 
442  for (int32_t j = jbeg; j < jend; ++j) {
443  sb_state_array[j * m_sb_state_size] = block_state;
444  }
445  }
446 
447  // Write out initialized state:
448 
449  if (!accessible) {
450  Kokkos::Impl::DeepCopy<base_memory_space, Kokkos::HostSpace>(
451  m_sb_state_array, sb_state_array, header_size);
452 
453  host.deallocate(sb_state_array, header_size);
454  } else {
455  Kokkos::memory_fence();
456  }
457  }
458 
459  //--------------------------------------------------------------------------
460 
461  private:
462  /* Given a size 'n' get the block size in which it can be allocated.
463  * Restrict lower bound to minimum block size.
464  */
465  KOKKOS_FORCEINLINE_FUNCTION
466  uint32_t get_block_size_lg2(uint32_t n) const noexcept {
467  const unsigned i = Kokkos::Impl::integral_power_of_two_that_contains(n);
468 
469  return i < m_min_block_size_lg2 ? m_min_block_size_lg2 : i;
470  }
471 
472  public:
473  /* Return 0 for invalid block size */
474  KOKKOS_INLINE_FUNCTION
475  uint32_t allocate_block_size(uint64_t alloc_size) const noexcept {
476  return alloc_size <= (1UL << m_max_block_size_lg2)
477  ? (1UL << get_block_size_lg2(uint32_t(alloc_size)))
478  : 0;
479  }
480 
481  //--------------------------------------------------------------------------
491  KOKKOS_FUNCTION
492  void *allocate(size_t alloc_size, int32_t attempt_limit = 1) const noexcept {
493  if (size_t(1LU << m_max_block_size_lg2) < alloc_size) {
494  Kokkos::abort(
495  "Kokkos MemoryPool allocation request exceeded specified maximum "
496  "allocation size");
497  }
498 
499  if (0 == alloc_size) return nullptr;
500 
501  void *p = nullptr;
502 
503  const uint32_t block_size_lg2 = get_block_size_lg2(alloc_size);
504 
505  // Allocation will fit within a superblock
506  // that has block sizes ( 1 << block_size_lg2 )
507 
508  const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2;
509  const uint32_t block_state = block_count_lg2 << state_shift;
510  const uint32_t block_count = 1u << block_count_lg2;
511 
512  // Superblock hints for this block size:
513  // hint_sb_id_ptr[0] is the dynamically changing hint
514  // hint_sb_id_ptr[1] is the static start point
515 
516  volatile uint32_t *const hint_sb_id_ptr =
517  m_sb_state_array /* memory pool state array */
518  + m_hint_offset /* offset to hint portion of array */
519  + HINT_PER_BLOCK_SIZE /* number of hints per block size */
520  * (block_size_lg2 - m_min_block_size_lg2); /* block size id */
521 
522  const int32_t sb_id_begin = int32_t(hint_sb_id_ptr[1]);
523 
524  // Fast query clock register 'tic' to pseudo-randomize
525  // the guess for which block within a superblock should
526  // be claimed. If not available then a search occurs.
527 #if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GEN)
528  const uint32_t block_id_hint = alloc_size;
529 #else
530  const uint32_t block_id_hint =
531  (uint32_t)(Kokkos::Impl::clock_tic()
532 #if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA)
533  // Spread out potentially concurrent access
534  // by threads within a warp or thread block.
535  + (threadIdx.x + blockDim.x * threadIdx.y)
536 #endif
537  );
538 #endif
539 
540  // expected state of superblock for allocation
541  uint32_t sb_state = block_state;
542 
543  int32_t sb_id = -1;
544 
545  volatile uint32_t *sb_state_array = nullptr;
546 
547  while (attempt_limit) {
548  int32_t hint_sb_id = -1;
549 
550  if (sb_id < 0) {
551  // No superblock specified, try the hint for this block size
552 
553  sb_id = hint_sb_id = int32_t(*hint_sb_id_ptr);
554 
555  sb_state_array = m_sb_state_array + (sb_id * m_sb_state_size);
556  }
557 
558  // Require:
559  // 0 <= sb_id
560  // sb_state_array == m_sb_state_array + m_sb_state_size * sb_id
561 
562  if (sb_state == (state_header_mask & *sb_state_array)) {
563  // This superblock state is as expected, for the moment.
564  // Attempt to claim a bit. The attempt updates the state
565  // so have already made sure the state header is as expected.
566 
567  const uint32_t count_lg2 = sb_state >> state_shift;
568  const uint32_t mask = (1u << count_lg2) - 1;
569 
570  const Kokkos::pair<int, int> result = CB::acquire_bounded_lg2(
571  sb_state_array, count_lg2, block_id_hint & mask, sb_state);
572 
573  // If result.first < 0 then failed to acquire
574  // due to either full or buffer was wrong state.
575  // Could be wrong state if a deallocation raced the
576  // superblock to empty before the acquire could succeed.
577 
578  if (0 <= result.first) { // acquired a bit
579 
580  const uint32_t size_lg2 = m_sb_size_lg2 - count_lg2;
581 
582  // Set the allocated block pointer
583 
584  p = ((char *)(m_sb_state_array + m_data_offset)) +
585  (uint64_t(sb_id) << m_sb_size_lg2) // superblock memory
586  + (uint64_t(result.first) << size_lg2); // block memory
587 
588 #if 0
589  printf( " MemoryPool(0x%lx) pointer(0x%lx) allocate(%lu) sb_id(%d) sb_state(0x%x) block_size(%d) block_capacity(%d) block_id(%d) block_claimed(%d)\n"
590  , (uintptr_t)m_sb_state_array
591  , (uintptr_t)p
592  , alloc_size
593  , sb_id
594  , sb_state
595  , (1u << size_lg2)
596  , (1u << count_lg2)
597  , result.first
598  , result.second );
599 #endif
600 
601  break; // Success
602  }
603  }
604  //------------------------------------------------------------------
605  // Arrive here if failed to acquire a block.
606  // Must find a new superblock.
607 
608  // Start searching at designated index for this block size.
609  // Look for superblock that, in preferential order,
610  // 1) part-full superblock of this block size
611  // 2) empty superblock to claim for this block size
612  // 3) part-full superblock of the next larger block size
613 
614  sb_state = block_state; // Expect to find the desired state
615  sb_id = -1;
616 
617  bool update_hint = false;
618  int32_t sb_id_empty = -1;
619  int32_t sb_id_large = -1;
620  uint32_t sb_state_large = 0;
621 
622  sb_state_array = m_sb_state_array + sb_id_begin * m_sb_state_size;
623 
624  for (int32_t i = 0, id = sb_id_begin; i < m_sb_count; ++i) {
625  // Query state of the candidate superblock.
626  // Note that the state may change at any moment
627  // as concurrent allocations and deallocations occur.
628 
629  const uint32_t full_state = *sb_state_array;
630  const uint32_t used = full_state & state_used_mask;
631  const uint32_t state = full_state & state_header_mask;
632 
633  if (state == block_state) {
634  // Superblock is assigned to this block size
635 
636  if (used < block_count) {
637  // There is room to allocate one block
638 
639  sb_id = id;
640 
641  // Is there room to allocate more than one block?
642 
643  update_hint = used + 1 < block_count;
644 
645  break;
646  }
647  } else if (0 == used) {
648  // Superblock is empty
649 
650  if (-1 == sb_id_empty) {
651  // Superblock is not assigned to this block size
652  // and is the first empty superblock encountered.
653  // Save this id to use if a partfull superblock is not found.
654 
655  sb_id_empty = id;
656  }
657  } else if ((-1 == sb_id_empty /* have not found an empty */) &&
658  (-1 == sb_id_large /* have not found a larger */) &&
659  (state < block_state /* a larger block */) &&
660  // is not full:
661  (used < (1u << (state >> state_shift)))) {
662  // First superblock encountered that is
663  // larger than this block size and
664  // has room for an allocation.
665  // Save this id to use of partfull or empty superblock not found
666  sb_id_large = id;
667  sb_state_large = state;
668  }
669 
670  // Iterate around the superblock array:
671 
672  if (++id < m_sb_count) {
673  sb_state_array += m_sb_state_size;
674  } else {
675  id = 0;
676  sb_state_array = m_sb_state_array;
677  }
678  }
679 
680  // printf(" search m_sb_count(%d) sb_id(%d) sb_id_empty(%d)
681  // sb_id_large(%d)\n" , m_sb_count , sb_id , sb_id_empty , sb_id_large);
682 
683  if (sb_id < 0) {
684  // Did not find a partfull superblock for this block size.
685 
686  if (0 <= sb_id_empty) {
687  // Found first empty superblock following designated superblock
688  // Attempt to claim it for this block size.
689  // If the claim fails assume that another thread claimed it
690  // for this block size and try to use it anyway,
691  // but do not update hint.
692 
693  sb_id = sb_id_empty;
694 
695  sb_state_array = m_sb_state_array + (sb_id * m_sb_state_size);
696 
697  // If successfully changed assignment of empty superblock 'sb_id'
698  // to this block_size then update the hint.
699 
700  const uint32_t state_empty = state_header_mask & *sb_state_array;
701 
702  // If this thread claims the empty block then update the hint
703  update_hint =
704  state_empty == Kokkos::atomic_compare_exchange(
705  sb_state_array, state_empty, block_state);
706  } else if (0 <= sb_id_large) {
707  // Found a larger superblock with space available
708 
709  sb_id = sb_id_large;
710  sb_state = sb_state_large;
711 
712  sb_state_array = m_sb_state_array + (sb_id * m_sb_state_size);
713  } else {
714  // Did not find a potentially usable superblock
715  --attempt_limit;
716  }
717  }
718 
719  if (update_hint) {
720  Kokkos::atomic_compare_exchange(hint_sb_id_ptr, uint32_t(hint_sb_id),
721  uint32_t(sb_id));
722  }
723  } // end allocation attempt loop
724  //--------------------------------------------------------------------
725 
726  return p;
727  }
728  // end allocate
729  //--------------------------------------------------------------------------
730 
737  KOKKOS_INLINE_FUNCTION
738  void deallocate(void *p, size_t /* alloc_size */) const noexcept {
739  if (nullptr == p) return;
740 
741  // Determine which superblock and block
742  const ptrdiff_t d =
743  ((char *)p) - ((char *)(m_sb_state_array + m_data_offset));
744 
745  // Verify contained within the memory pool's superblocks:
746  const int ok_contains =
747  (0 <= d) && (size_t(d) < (size_t(m_sb_count) << m_sb_size_lg2));
748 
749  int ok_block_aligned = 0;
750  int ok_dealloc_once = 0;
751 
752  if (ok_contains) {
753  const int sb_id = d >> m_sb_size_lg2;
754 
755  // State array for the superblock.
756  volatile uint32_t *const sb_state_array =
757  m_sb_state_array + (sb_id * m_sb_state_size);
758 
759  const uint32_t block_state = (*sb_state_array) & state_header_mask;
760  const uint32_t block_size_lg2 =
761  m_sb_size_lg2 - (block_state >> state_shift);
762 
763  ok_block_aligned = 0 == (d & ((1UL << block_size_lg2) - 1));
764 
765  if (ok_block_aligned) {
766  // Map address to block's bit
767  // mask into superblock and then shift down for block index
768 
769  const uint32_t bit =
770  (d & (ptrdiff_t(1LU << m_sb_size_lg2) - 1)) >> block_size_lg2;
771 
772  const int result = CB::release(sb_state_array, bit, block_state);
773 
774  ok_dealloc_once = 0 <= result;
775 
776 #if 0
777  printf( " MemoryPool(0x%lx) pointer(0x%lx) deallocate sb_id(%d) block_size(%d) block_capacity(%d) block_id(%d) block_claimed(%d)\n"
778  , (uintptr_t)m_sb_state_array
779  , (uintptr_t)p
780  , sb_id
781  , (1u << block_size_lg2)
782  , (1u << (m_sb_size_lg2 - block_size_lg2))
783  , bit
784  , result );
785 #endif
786  }
787  }
788 
789  if (!ok_contains || !ok_block_aligned || !ok_dealloc_once) {
790 #if 0
791  printf( " MemoryPool(0x%lx) pointer(0x%lx) deallocate ok_contains(%d) ok_block_aligned(%d) ok_dealloc_once(%d)\n"
792  , (uintptr_t)m_sb_state_array
793  , (uintptr_t)p
794  , int(ok_contains)
795  , int(ok_block_aligned)
796  , int(ok_dealloc_once) );
797 #endif
798  Kokkos::abort("Kokkos MemoryPool::deallocate given erroneous pointer");
799  }
800  }
801  // end deallocate
802  //--------------------------------------------------------------------------
803 
804  KOKKOS_INLINE_FUNCTION
805  int number_of_superblocks() const noexcept { return m_sb_count; }
806 
807  KOKKOS_INLINE_FUNCTION
808  void superblock_state(int sb_id, int &block_size, int &block_count_capacity,
809  int &block_count_used) const noexcept {
810  block_size = 0;
811  block_count_capacity = 0;
812  block_count_used = 0;
813 
815  Kokkos::Impl::ActiveExecutionMemorySpace,
816  base_memory_space>::accessible) {
817  // Can access the state array
818 
819  const uint32_t state =
820  ((uint32_t volatile *)m_sb_state_array)[sb_id * m_sb_state_size];
821 
822  const uint32_t block_count_lg2 = state >> state_shift;
823  const uint32_t block_used = state & state_used_mask;
824 
825  block_size = 1LU << (m_sb_size_lg2 - block_count_lg2);
826  block_count_capacity = 1LU << block_count_lg2;
827  block_count_used = block_used;
828  }
829  }
830 };
831 
832 } // namespace Kokkos
833 
834 #endif /* #ifndef KOKKOS_MEMORYPOOL_HPP */
void * allocate(const size_t arg_alloc_size) const
Allocate untracked memory in the space.
Replacement for std::pair that works on CUDA devices.
Definition: Kokkos_Pair.hpp:65
first_type first
The first element of the pair.
Definition: Kokkos_Pair.hpp:72
Memory management for host memory.
Declaration of parallel operators.
Atomic functions.
void deallocate(void *const arg_alloc_ptr, const size_t arg_alloc_size) const
Deallocate untracked memory in the space.
Definition: dummy.cpp:3
second_type second
The second element of the pair.
Definition: Kokkos_Pair.hpp:74
Access relationship between DstMemorySpace and SrcMemorySpace.