/*
 * kmp_alloc.cpp -- private/shared dynamic memory allocation and management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_io.h"
#include "kmp_wrapper_malloc.h"

// Disable bget when it is not used
#if KMP_USE_BGET

/* Thread private buffer management code */

typedef int (*bget_compact_t)(size_t, int);
typedef void *(*bget_acquire_t)(size_t);
typedef void (*bget_release_t)(void *);

/* NOTE: bufsize must be a signed datatype */

#if KMP_OS_WINDOWS
#if KMP_ARCH_X86 || KMP_ARCH_ARM
typedef kmp_int32 bufsize;
#else
typedef kmp_int64 bufsize;
#endif
#else
typedef ssize_t bufsize;
#endif // KMP_OS_WINDOWS

/* The three modes of operation are, fifo search, lifo search, and best-fit */

typedef enum bget_mode {
  bget_mode_fifo = 0,
  bget_mode_lifo = 1,
  bget_mode_best = 2
} bget_mode_t;

static void bpool(kmp_info_t *th, void *buffer, bufsize len);
static void *bget(kmp_info_t *th, bufsize size);
static void *bgetz(kmp_info_t *th, bufsize size);
static void *bgetr(kmp_info_t *th, void *buffer, bufsize newsize);
static void brel(kmp_info_t *th, void *buf);
static void bectl(kmp_info_t *th, bget_compact_t compact,
                  bget_acquire_t acquire, bget_release_t release,
                  bufsize pool_incr);

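/* Illustrative sketch (editor's addition, not part of the library): how the
   BGET primitives declared above fit together for a single thread. The
   `example_thread` parameter and the 64 KiB pool size are placeholders. */
#if 0
static void __kmp_bget_usage_sketch(kmp_info_t *example_thread) {
  bufsize pool_len = (bufsize)(64 * 1024); // placeholder pool size
  void *pool = malloc((size_t)pool_len); // same source the library itself uses
  if (pool == NULL)
    return;
  bpool(example_thread, pool, pool_len); // seed this thread's free lists
  void *p = bget(example_thread, (bufsize)256); // carve out a 256-byte buffer
  if (p != NULL)
    brel(example_thread, p); // return it to the thread's free list
  // The pool block itself stays owned by the thread's allocator; it is only
  // handed back through the release function installed with bectl().
}
#endif
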
/* BGET CONFIGURATION */
/* Buffer allocation size quantum: all buffers allocated are a
   multiple of this size.  This MUST be a power of two. */

/* On IA-32 architecture with Linux* OS, malloc() does not
   ensure 16 byte alignment */

#if KMP_ARCH_X86 || !KMP_HAVE_QUAD

#define SizeQuant 8
#define AlignType double

#else

#define SizeQuant 16
#define AlignType _Quad

#endif

// Define this symbol to enable the bstats() function which calculates the
// total free space in the buffer pool, the largest available buffer, and the
// total space currently allocated.
#define BufStats 1

#ifdef KMP_DEBUG

// Define this symbol to enable the bpoold() function which dumps the buffers
// in a buffer pool.
#define BufDump 1

// Define this symbol to enable the bpoolv() function for validating a buffer
// pool.
#define BufValid 1

// Define this symbol to enable the bufdump() function which allows dumping the
// contents of an allocated or free buffer.
#define DumpData 1

#ifdef NOT_USED_NOW

// Wipe free buffers to a guaranteed pattern of garbage to trip up miscreants
// who attempt to use pointers into released buffers.
#define FreeWipe 1

// Use a best fit algorithm when searching for space for an allocation request.
// This uses memory more efficiently, but allocation will be much slower.
#define BestFit 1

#endif /* NOT_USED_NOW */
#endif /* KMP_DEBUG */

static bufsize bget_bin_size[] = {
    0,
    //    1 << 6,    /* .5 Cache line */
    1 << 7, /* 1 Cache line, new */
    1 << 8, /* 2 Cache lines */
    1 << 9, /* 4 Cache lines, new */
    1 << 10, /* 8 Cache lines */
    1 << 11, /* 16 Cache lines, new */
    1 << 12, 1 << 13, /* new */
    1 << 14, 1 << 15, /* new */
    1 << 16, 1 << 17, 1 << 18, 1 << 19, 1 << 20, /*  1MB */
    1 << 21, /*  2MB */
    1 << 22, /*  4MB */
    1 << 23, /*  8MB */
    1 << 24, /* 16MB */
    1 << 25, /* 32MB */
};

#define MAX_BGET_BINS (int)(sizeof(bget_bin_size) / sizeof(bufsize))

struct bfhead;

//  Declare the interface, including the requested buffer size type, bufsize.

/* Queue links */
typedef struct qlinks {
  struct bfhead *flink; /* Forward link */
  struct bfhead *blink; /* Backward link */
} qlinks_t;

/* Header in allocated and free buffers */
typedef struct bhead2 {
  kmp_info_t *bthr; /* The thread which owns the buffer pool */
  bufsize prevfree; /* Relative link back to previous free buffer in memory or
                       0 if previous buffer is allocated.  */
  bufsize bsize; /* Buffer size: positive if free, negative if allocated. */
} bhead2_t;

/* Make sure the bhead structure is a multiple of SizeQuant in size. */
typedef union bhead {
  KMP_ALIGN(SizeQuant)
  AlignType b_align;
  char b_pad[sizeof(bhead2_t) + (SizeQuant - (sizeof(bhead2_t) % SizeQuant))];
  bhead2_t bb;
} bhead_t;
#define BH(p) ((bhead_t *)(p))

/*  Header in directly allocated buffers (by acqfcn) */
typedef struct bdhead {
  bufsize tsize; /* Total size, including overhead */
  bhead_t bh; /* Common header */
} bdhead_t;
#define BDH(p) ((bdhead_t *)(p))

/* Header in free buffers */
typedef struct bfhead {
  bhead_t bh; /* Common allocated/free header */
  qlinks_t ql; /* Links on free list */
} bfhead_t;
#define BFH(p) ((bfhead_t *)(p))

typedef struct thr_data {
  bfhead_t freelist[MAX_BGET_BINS];
#if BufStats
  size_t totalloc; /* Total space currently allocated */
  long numget, numrel; /* Number of bget() and brel() calls */
  long numpblk; /* Number of pool blocks */
  long numpget, numprel; /* Number of block gets and rels */
  long numdget, numdrel; /* Number of direct gets and rels */
#endif /* BufStats */

  /* Automatic expansion block management functions */
  bget_compact_t compfcn;
  bget_acquire_t acqfcn;
  bget_release_t relfcn;

  bget_mode_t mode; /* what allocation mode to use? */

  bufsize exp_incr; /* Expansion block size */
  bufsize pool_len; /* 0: no bpool calls have been made
                       -1: not all pool blocks are the same size
                       >0: (common) block size for all bpool calls made so far
                    */
  bfhead_t *last_pool; /* Last pool owned by this thread (delayed deallocation) */
} thr_data_t;

/*  Minimum allocation quantum: */
#define QLSize (sizeof(qlinks_t))
#define SizeQ ((SizeQuant > QLSize) ? SizeQuant : QLSize)
#define MaxSize                                                                \
  (bufsize)(                                                                   \
      ~(((bufsize)(1) << (sizeof(bufsize) * CHAR_BIT - 1)) | (SizeQuant - 1)))
// Maximum for the requested size.

/* End sentinel: value placed in bsize field of dummy block delimiting
   end of pool block.  The most negative number which will  fit  in  a
   bufsize, defined in a way that the compiler will accept. */

#define ESent                                                                  \
  ((bufsize)(-(((((bufsize)1) << ((int)sizeof(bufsize) * 8 - 2)) - 1) * 2) - 2))

/* Thread Data management routines */
static int bget_get_bin(bufsize size) {
  // binary chop bins
  int lo = 0, hi = MAX_BGET_BINS - 1;

  KMP_DEBUG_ASSERT(size > 0);

  while ((hi - lo) > 1) {
    int mid = (lo + hi) >> 1;
    if (size < bget_bin_size[mid])
      hi = mid - 1;
    else
      lo = mid;
  }

  KMP_DEBUG_ASSERT((lo >= 0) && (lo < MAX_BGET_BINS));

  return lo;
}
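
/* Worked example (editor's note): with the bin table above (bin[1] == 128,
   bin[2] == 256, bin[3] == 512, ...), a request of 300 bytes converges to
   bin 2, whose boundary (256) does not exceed the request. The chop may land
   one bin below the tightest fit (a request of exactly 512 bytes also
   resolves to bin 2); this is harmless because allocation scans from the
   returned bin upward and free blocks are filed with the same function. */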

static void set_thr_data(kmp_info_t *th) {
  int i;
  thr_data_t *data;

  data = (thr_data_t *)((!th->th.th_local.bget_data)
                            ? __kmp_allocate(sizeof(*data))
                            : th->th.th_local.bget_data);

  memset(data, '\0', sizeof(*data));

  for (i = 0; i < MAX_BGET_BINS; ++i) {
    data->freelist[i].ql.flink = &data->freelist[i];
    data->freelist[i].ql.blink = &data->freelist[i];
  }

  th->th.th_local.bget_data = data;
  th->th.th_local.bget_list = 0;
#if !USE_CMP_XCHG_FOR_BGET
#ifdef USE_QUEUING_LOCK_FOR_BGET
  __kmp_init_lock(&th->th.th_local.bget_lock);
#else
  __kmp_init_bootstrap_lock(&th->th.th_local.bget_lock);
#endif /* USE_QUEUING_LOCK_FOR_BGET */
#endif /* ! USE_CMP_XCHG_FOR_BGET */
}

static thr_data_t *get_thr_data(kmp_info_t *th) {
  thr_data_t *data;

  data = (thr_data_t *)th->th.th_local.bget_data;

  KMP_DEBUG_ASSERT(data != 0);

  return data;
}

/* Walk the free list and release the enqueued buffers */
static void __kmp_bget_dequeue(kmp_info_t *th) {
  void *p = TCR_SYNC_PTR(th->th.th_local.bget_list);

  if (p != 0) {
#if USE_CMP_XCHG_FOR_BGET
    {
      volatile void *old_value = TCR_SYNC_PTR(th->th.th_local.bget_list);
      while (!KMP_COMPARE_AND_STORE_PTR(&th->th.th_local.bget_list,
                                        CCAST(void *, old_value), nullptr)) {
        KMP_CPU_PAUSE();
        old_value = TCR_SYNC_PTR(th->th.th_local.bget_list);
      }
      p = CCAST(void *, old_value);
    }
#else /* ! USE_CMP_XCHG_FOR_BGET */
#ifdef USE_QUEUING_LOCK_FOR_BGET
    __kmp_acquire_lock(&th->th.th_local.bget_lock, __kmp_gtid_from_thread(th));
#else
    __kmp_acquire_bootstrap_lock(&th->th.th_local.bget_lock);
#endif /* USE_QUEUING_LOCK_FOR_BGET */

    p = (void *)th->th.th_local.bget_list;
    th->th.th_local.bget_list = 0;

#ifdef USE_QUEUING_LOCK_FOR_BGET
    __kmp_release_lock(&th->th.th_local.bget_lock, __kmp_gtid_from_thread(th));
#else
    __kmp_release_bootstrap_lock(&th->th.th_local.bget_lock);
#endif
#endif /* USE_CMP_XCHG_FOR_BGET */

    /* Check again to make sure the list is not empty */
    while (p != 0) {
      void *buf = p;
      bfhead_t *b = BFH(((char *)p) - sizeof(bhead_t));

      KMP_DEBUG_ASSERT(b->bh.bb.bsize != 0);
      KMP_DEBUG_ASSERT(((kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) & ~1) ==
                       (kmp_uintptr_t)th); // clear possible mark
      KMP_DEBUG_ASSERT(b->ql.blink == 0);

      p = (void *)b->ql.flink;

      brel(th, buf);
    }
  }
}

/* Chain together the free buffers by using the thread owner field */
static void __kmp_bget_enqueue(kmp_info_t *th, void *buf
#ifdef USE_QUEUING_LOCK_FOR_BGET
                               ,
                               kmp_int32 rel_gtid
#endif
                               ) {
  bfhead_t *b = BFH(((char *)buf) - sizeof(bhead_t));

  KMP_DEBUG_ASSERT(b->bh.bb.bsize != 0);
  KMP_DEBUG_ASSERT(((kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) & ~1) ==
                   (kmp_uintptr_t)th); // clear possible mark

  b->ql.blink = 0;

  KC_TRACE(10, ("__kmp_bget_enqueue: moving buffer to T#%d list\n",
                __kmp_gtid_from_thread(th)));

#if USE_CMP_XCHG_FOR_BGET
  {
    volatile void *old_value = TCR_PTR(th->th.th_local.bget_list);
    /* the next pointer must be set before setting bget_list to buf to avoid
       exposing a broken list to other threads, even for an instant. */
    b->ql.flink = BFH(CCAST(void *, old_value));

    while (!KMP_COMPARE_AND_STORE_PTR(&th->th.th_local.bget_list,
                                      CCAST(void *, old_value), buf)) {
      KMP_CPU_PAUSE();
      old_value = TCR_PTR(th->th.th_local.bget_list);
      /* the next pointer must be set before setting bget_list to buf to avoid
         exposing a broken list to other threads, even for an instant. */
      b->ql.flink = BFH(CCAST(void *, old_value));
    }
  }
#else /* ! USE_CMP_XCHG_FOR_BGET */
#ifdef USE_QUEUING_LOCK_FOR_BGET
  __kmp_acquire_lock(&th->th.th_local.bget_lock, rel_gtid);
#else
  __kmp_acquire_bootstrap_lock(&th->th.th_local.bget_lock);
#endif

  b->ql.flink = BFH(th->th.th_local.bget_list);
  th->th.th_local.bget_list = (void *)buf;

#ifdef USE_QUEUING_LOCK_FOR_BGET
  __kmp_release_lock(&th->th.th_local.bget_lock, rel_gtid);
#else
  __kmp_release_bootstrap_lock(&th->th.th_local.bget_lock);
#endif
#endif /* USE_CMP_XCHG_FOR_BGET */
}
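
/* Illustrative sketch (editor's addition): the enqueue/dequeue pair above is
   a lock-free LIFO push plus a whole-list swap-out. A standalone analogue in
   C++11 atomics is shown below; std::atomic is not used by this file and
   merely stands in for KMP_COMPARE_AND_STORE_PTR. */
#if 0
#include <atomic>

struct node_t {
  node_t *next;
};

// Push one node; mirrors __kmp_bget_enqueue(): link first, then publish.
static void lifo_push(std::atomic<node_t *> &head, node_t *n) {
  node_t *old_head = head.load(std::memory_order_relaxed);
  do {
    n->next = old_head; // never expose a half-linked list
  } while (!head.compare_exchange_weak(old_head, n, std::memory_order_release,
                                       std::memory_order_relaxed));
}

// Detach the whole list at once; mirrors __kmp_bget_dequeue().
static node_t *lifo_drain(std::atomic<node_t *> &head) {
  return head.exchange(nullptr, std::memory_order_acquire);
}
#endif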

/* insert buffer back onto a new freelist */
static void __kmp_bget_insert_into_freelist(thr_data_t *thr, bfhead_t *b) {
  int bin;

  KMP_DEBUG_ASSERT(((size_t)b) % SizeQuant == 0);
  KMP_DEBUG_ASSERT(b->bh.bb.bsize % SizeQuant == 0);

  bin = bget_get_bin(b->bh.bb.bsize);

  KMP_DEBUG_ASSERT(thr->freelist[bin].ql.blink->ql.flink ==
                   &thr->freelist[bin]);
  KMP_DEBUG_ASSERT(thr->freelist[bin].ql.flink->ql.blink ==
                   &thr->freelist[bin]);

  b->ql.flink = &thr->freelist[bin];
  b->ql.blink = thr->freelist[bin].ql.blink;

  thr->freelist[bin].ql.blink = b;
  b->ql.blink->ql.flink = b;
}

/* unlink the buffer from the old freelist */
static void __kmp_bget_remove_from_freelist(bfhead_t *b) {
  KMP_DEBUG_ASSERT(b->ql.blink->ql.flink == b);
  KMP_DEBUG_ASSERT(b->ql.flink->ql.blink == b);

  b->ql.blink->ql.flink = b->ql.flink;
  b->ql.flink->ql.blink = b->ql.blink;
}
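
/* Editor's note: freelist[bin] is itself the sentinel node of a circular
   doubly-linked list; set_thr_data() points flink and blink back at the
   sentinel when a bin is empty. Insertion above appends just before the
   sentinel (at the tail), so a FIFO scan starts at freelist[bin].ql.flink
   and a LIFO scan starts at freelist[bin].ql.blink. */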

/*  GET STATS -- check info on free list */
static void bcheck(kmp_info_t *th, bufsize *max_free, bufsize *total_free) {
  thr_data_t *thr = get_thr_data(th);
  int bin;

  *total_free = *max_free = 0;

  for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
    bfhead_t *b, *best;

    best = &thr->freelist[bin];
    b = best->ql.flink;

    while (b != &thr->freelist[bin]) {
      *total_free += (b->bh.bb.bsize - sizeof(bhead_t));
      if ((best == &thr->freelist[bin]) || (b->bh.bb.bsize < best->bh.bb.bsize))
        best = b;

      /* Link to next buffer */
      b = b->ql.flink;
    }

    if (*max_free < best->bh.bb.bsize)
      *max_free = best->bh.bb.bsize;
  }

  if (*max_free > (bufsize)sizeof(bhead_t))
    *max_free -= sizeof(bhead_t);
}

/*  BGET  --  Allocate a buffer.  */
static void *bget(kmp_info_t *th, bufsize requested_size) {
  thr_data_t *thr = get_thr_data(th);
  bufsize size = requested_size;
  bfhead_t *b;
  void *buf;
  int compactseq = 0;
  int use_blink = 0;
  /* For BestFit */
  bfhead_t *best;

  if (size < 0 || size + sizeof(bhead_t) > MaxSize) {
    return NULL;
  }

  __kmp_bget_dequeue(th); /* Release any queued buffers */

  if (size < (bufsize)SizeQ) { // Need at least room for the queue links.
    size = SizeQ;
  }
#if defined(SizeQuant) && (SizeQuant > 1)
  size = (size + (SizeQuant - 1)) & (~(SizeQuant - 1));
#endif

  size += sizeof(bhead_t); // Add overhead in allocated buffer to size required.
  KMP_DEBUG_ASSERT(size >= 0);
  KMP_DEBUG_ASSERT(size % SizeQuant == 0);

  use_blink = (thr->mode == bget_mode_lifo);

  /* If a compact function was provided in the call to bectl(), wrap
     a loop around the allocation process  to  allow  compaction  to
     intervene in case we don't find a suitable buffer in the chain. */

  for (;;) {
    int bin;

    for (bin = bget_get_bin(size); bin < MAX_BGET_BINS; ++bin) {
      /* Link to next buffer */
      b = (use_blink ? thr->freelist[bin].ql.blink
                     : thr->freelist[bin].ql.flink);

      if (thr->mode == bget_mode_best) {
        best = &thr->freelist[bin];

        /* Scan the free list searching for the first buffer big enough
           to hold the requested size buffer. */
        while (b != &thr->freelist[bin]) {
          if (b->bh.bb.bsize >= (bufsize)size) {
            if ((best == &thr->freelist[bin]) ||
                (b->bh.bb.bsize < best->bh.bb.bsize)) {
              best = b;
            }
          }

          /* Link to next buffer */
          b = (use_blink ? b->ql.blink : b->ql.flink);
        }
        b = best;
      }

      while (b != &thr->freelist[bin]) {
        if ((bufsize)b->bh.bb.bsize >= (bufsize)size) {

          // Buffer is big enough to satisfy the request. Allocate it to the
          // caller. We must decide whether the buffer is large enough to split
          // into the part given to the caller and a free buffer that remains
          // on the free list, or whether the entire buffer should be removed
          // from the free list and given to the caller in its entirety. We
          // only split the buffer if enough room remains for a header plus the
          // minimum quantum of allocation.
          if ((b->bh.bb.bsize - (bufsize)size) >
              (bufsize)(SizeQ + (sizeof(bhead_t)))) {
            bhead_t *ba, *bn;

            ba = BH(((char *)b) + (b->bh.bb.bsize - (bufsize)size));
            bn = BH(((char *)ba) + size);

            KMP_DEBUG_ASSERT(bn->bb.prevfree == b->bh.bb.bsize);

            /* Subtract size from length of free block. */
            b->bh.bb.bsize -= (bufsize)size;

            /* Link allocated buffer to the previous free buffer. */
            ba->bb.prevfree = b->bh.bb.bsize;

            /* Plug negative size into user buffer. */
            ba->bb.bsize = -size;

            /* Mark this buffer as owned by this thread. */
            TCW_PTR(ba->bb.bthr,
                    th); // not an allocated address (do not mark it)
            /* Mark buffer after this one not preceded by free block. */
            bn->bb.prevfree = 0;

            // unlink buffer from old freelist, and reinsert into new freelist
            __kmp_bget_remove_from_freelist(b);
            __kmp_bget_insert_into_freelist(thr, b);
#if BufStats
            thr->totalloc += (size_t)size;
            thr->numget++; /* Increment number of bget() calls */
#endif
            buf = (void *)((((char *)ba) + sizeof(bhead_t)));
            KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);
            return buf;
          } else {
            bhead_t *ba;

            ba = BH(((char *)b) + b->bh.bb.bsize);

            KMP_DEBUG_ASSERT(ba->bb.prevfree == b->bh.bb.bsize);

            /* The buffer isn't big enough to split.  Give  the  whole
               shebang to the caller and remove it from the free list. */

            __kmp_bget_remove_from_freelist(b);
#if BufStats
            thr->totalloc += (size_t)b->bh.bb.bsize;
            thr->numget++; /* Increment number of bget() calls */
#endif
            /* Negate size to mark buffer allocated. */
            b->bh.bb.bsize = -(b->bh.bb.bsize);

            /* Mark this buffer as owned by this thread. */
            TCW_PTR(ba->bb.bthr, th); // not an allocated address (do not mark)
            /* Zero the back pointer in the next buffer in memory
               to indicate that this buffer is allocated. */
            ba->bb.prevfree = 0;

            /* Give user buffer starting at queue links. */
            buf = (void *)&(b->ql);
            KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);
            return buf;
          }
        }

        /* Link to next buffer */
        b = (use_blink ? b->ql.blink : b->ql.flink);
      }
    }

    /* We failed to find a buffer. If there's a compact function defined,
       notify it of the size requested. If it returns TRUE, try the allocation
       again. */

    if ((thr->compfcn == 0) || (!(*thr->compfcn)(size, ++compactseq))) {
      break;
    }
  }

  /* No buffer available with requested size free. */

  /* Don't give up yet -- look in the reserve supply. */
  if (thr->acqfcn != 0) {
    if (size > (bufsize)(thr->exp_incr - sizeof(bhead_t))) {
      /* Request is too large to fit in a single expansion block.
         Try to satisfy it by a direct buffer acquisition. */
      bdhead_t *bdh;

      size += sizeof(bdhead_t) - sizeof(bhead_t);

      KE_TRACE(10, ("%%%%%% MALLOC( %d )\n", (int)size));

      /* richryan */
      bdh = BDH((*thr->acqfcn)((bufsize)size));
      if (bdh != NULL) {

        // Mark the buffer special by setting size field of its header to zero.
        bdh->bh.bb.bsize = 0;

        /* Mark this buffer as owned by this thread. */
        TCW_PTR(bdh->bh.bb.bthr, th); // don't mark buffer as allocated,
        // because direct buffer never goes to free list
        bdh->bh.bb.prevfree = 0;
        bdh->tsize = size;
#if BufStats
        thr->totalloc += (size_t)size;
        thr->numget++; /* Increment number of bget() calls */
        thr->numdget++; /* Direct bget() call count */
#endif
        buf = (void *)(bdh + 1);
        KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);
        return buf;
      }

    } else {

      /*  Try to obtain a new expansion block */
      void *newpool;

      KE_TRACE(10, ("%%%%%% MALLOCB( %d )\n", (int)thr->exp_incr));

      /* richryan */
      newpool = (*thr->acqfcn)((bufsize)thr->exp_incr);
      KMP_DEBUG_ASSERT(((size_t)newpool) % SizeQuant == 0);
      if (newpool != NULL) {
        bpool(th, newpool, thr->exp_incr);
        buf = bget(
            th, requested_size); /* This can't, I say, can't get into a loop. */
        return buf;
      }
    }
  }

  /*  Still no buffer available */

  return NULL;
}

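/* Editor's note on the arithmetic in bget(): the request is first rounded up
   to the quantum, the common header is added, and a free block is split only
   when the remainder would exceed SizeQ + sizeof(bhead_t); otherwise the
   whole block is handed out, so callers may receive slightly more usable
   space than requested. A minimal sketch of the round-up step: */
#if 0
// Quantum round-up used above; e.g. with SizeQuant == 16: 100 -> 112,
// 112 -> 112, 113 -> 128.
static bufsize __kmp_round_to_quant_sketch(bufsize size) {
  return (size + (SizeQuant - 1)) & (~(SizeQuant - 1));
}
#endif
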
/*  BGETZ  --  Allocate a buffer and clear its contents to zero.  We clear
               the  entire  contents  of  the buffer to zero, not just the
               region requested by the caller. */

static void *bgetz(kmp_info_t *th, bufsize size) {
  char *buf = (char *)bget(th, size);

  if (buf != NULL) {
    bhead_t *b;
    bufsize rsize;

    b = BH(buf - sizeof(bhead_t));
    rsize = -(b->bb.bsize);
    if (rsize == 0) {
      bdhead_t *bd;

      bd = BDH(buf - sizeof(bdhead_t));
      rsize = bd->tsize - (bufsize)sizeof(bdhead_t);
    } else {
      rsize -= sizeof(bhead_t);
    }

    KMP_DEBUG_ASSERT(rsize >= size);

    (void)memset(buf, 0, (bufsize)rsize);
  }
  return ((void *)buf);
}

/*  BGETR  --  Reallocate a buffer.  This is a minimal implementation,
               simply in terms of brel()  and  bget().   It  could  be
               enhanced to allow the buffer to grow into adjacent free
               blocks and to avoid moving data unnecessarily.  */

static void *bgetr(kmp_info_t *th, void *buf, bufsize size) {
  void *nbuf;
  bufsize osize; /* Old size of buffer */
  bhead_t *b;

  nbuf = bget(th, size);
  if (nbuf == NULL) { /* Acquire new buffer */
    return NULL;
  }
  if (buf == NULL) {
    return nbuf;
  }
  b = BH(((char *)buf) - sizeof(bhead_t));
  osize = -b->bb.bsize;
  if (osize == 0) {
    /*  Buffer acquired directly through acqfcn. */
    bdhead_t *bd;

    bd = BDH(((char *)buf) - sizeof(bdhead_t));
    osize = bd->tsize - (bufsize)sizeof(bdhead_t);
  } else {
    osize -= sizeof(bhead_t);
  }

  KMP_DEBUG_ASSERT(osize > 0);

  (void)KMP_MEMCPY((char *)nbuf, (char *)buf, /* Copy the data */
                   (size_t)((size < osize) ? size : osize));
  brel(th, buf);

  return nbuf;
}

/*  BREL  --  Release a buffer.  */
static void brel(kmp_info_t *th, void *buf) {
  thr_data_t *thr = get_thr_data(th);
  bfhead_t *b, *bn;
  kmp_info_t *bth;

  KMP_DEBUG_ASSERT(buf != NULL);
  KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);

  b = BFH(((char *)buf) - sizeof(bhead_t));

  if (b->bh.bb.bsize == 0) { /* Directly-acquired buffer? */
    bdhead_t *bdh;

    bdh = BDH(((char *)buf) - sizeof(bdhead_t));
    KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);
#if BufStats
    thr->totalloc -= (size_t)bdh->tsize;
    thr->numdrel++; /* Number of direct releases */
    thr->numrel++; /* Increment number of brel() calls */
#endif /* BufStats */
#ifdef FreeWipe
    (void)memset((char *)buf, 0x55, (size_t)(bdh->tsize - sizeof(bdhead_t)));
#endif /* FreeWipe */

    KE_TRACE(10, ("%%%%%% FREE( %p )\n", (void *)bdh));

    KMP_DEBUG_ASSERT(thr->relfcn != 0);
    (*thr->relfcn)((void *)bdh); /* Release it directly. */
    return;
  }

  bth = (kmp_info_t *)((kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) &
                       ~1); // clear possible mark before comparison
  if (bth != th) {
    /* Add this buffer to be released by the owning thread later */
    __kmp_bget_enqueue(bth, buf
#ifdef USE_QUEUING_LOCK_FOR_BGET
                       ,
                       __kmp_gtid_from_thread(th)
#endif
                           );
    return;
  }

  /* Buffer size must be negative, indicating that the buffer is allocated. */
  if (b->bh.bb.bsize >= 0) {
    bn = NULL;
  }
  KMP_DEBUG_ASSERT(b->bh.bb.bsize < 0);

  /*  Back pointer in next buffer must be zero, indicating the same thing: */

  KMP_DEBUG_ASSERT(BH((char *)b - b->bh.bb.bsize)->bb.prevfree == 0);

#if BufStats
  thr->numrel++; /* Increment number of brel() calls */
  thr->totalloc += (size_t)b->bh.bb.bsize;
#endif

  /* If the back link is nonzero, the previous buffer is free.  */

  if (b->bh.bb.prevfree != 0) {
    /* The previous buffer is free. Consolidate this buffer with it by adding
       the length of this buffer to the previous free buffer. Note that we
       subtract the size in the buffer being released, since it's negative to
       indicate that the buffer is allocated. */
    bufsize size = b->bh.bb.bsize;

    /* Make the previous buffer the one we're working on. */
    KMP_DEBUG_ASSERT(BH((char *)b - b->bh.bb.prevfree)->bb.bsize ==
                     b->bh.bb.prevfree);
    b = BFH(((char *)b) - b->bh.bb.prevfree);
    b->bh.bb.bsize -= size;

    /* unlink the buffer from the old freelist */
    __kmp_bget_remove_from_freelist(b);
  } else {
    /* The previous buffer is allocated. Mark this buffer's size as positive
       (i.e. free) and fall through to place the buffer on the free list as an
       isolated free block. */
    b->bh.bb.bsize = -b->bh.bb.bsize;
  }

  /* insert buffer back onto a new freelist */
  __kmp_bget_insert_into_freelist(thr, b);

  /* Now we look at the next buffer in memory, located by advancing from
     the  start  of  this  buffer  by its size, to see if that buffer is
     free.  If it is, we combine  this  buffer  with  the  next  one  in
     memory, dechaining the second buffer from the free list. */
  bn = BFH(((char *)b) + b->bh.bb.bsize);
  if (bn->bh.bb.bsize > 0) {

    /* The buffer is free.  Remove it from the free list and add
       its size to that of our buffer. */
    KMP_DEBUG_ASSERT(BH((char *)bn + bn->bh.bb.bsize)->bb.prevfree ==
                     bn->bh.bb.bsize);

    __kmp_bget_remove_from_freelist(bn);

    b->bh.bb.bsize += bn->bh.bb.bsize;

    /* unlink the buffer from the old freelist, and reinsert it into the new
     * freelist */
    __kmp_bget_remove_from_freelist(b);
    __kmp_bget_insert_into_freelist(thr, b);

    /* Finally,  advance  to   the  buffer  that   follows  the  newly
       consolidated free block.  We must set its  backpointer  to  the
       head  of  the  consolidated free block.  We know the next block
       must be an allocated block because the process of recombination
       guarantees  that  two  free  blocks will never be contiguous in
       memory.  */
    bn = BFH(((char *)b) + b->bh.bb.bsize);
  }
#ifdef FreeWipe
  (void)memset(((char *)b) + sizeof(bfhead_t), 0x55,
               (size_t)(b->bh.bb.bsize - sizeof(bfhead_t)));
#endif
  KMP_DEBUG_ASSERT(bn->bh.bb.bsize < 0);

  /* The next buffer is allocated.  Set the backpointer in it  to  point
     to this buffer; the previous free buffer in memory. */

  bn->bh.bb.prevfree = b->bh.bb.bsize;

  /*  If  a  block-release function is defined, and this free buffer
      constitutes the entire block, release it.  Note that  pool_len
      is  defined  in  such a way that the test will fail unless all
      pool blocks are the same size.  */
  if (thr->relfcn != 0 &&
      b->bh.bb.bsize == (bufsize)(thr->pool_len - sizeof(bhead_t))) {
#if BufStats
    if (thr->numpblk !=
        1) { /* Do not release the last buffer until finalization time */
#endif

      KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);
      KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.bsize == ESent);
      KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.prevfree ==
                       b->bh.bb.bsize);

      /*  Unlink the buffer from the free list  */
      __kmp_bget_remove_from_freelist(b);

      KE_TRACE(10, ("%%%%%% FREE( %p )\n", (void *)b));

      (*thr->relfcn)(b);
#if BufStats
      thr->numprel++; /* Nr of expansion block releases */
      thr->numpblk--; /* Total number of blocks */
      KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);

      // avoid leaving stale last_pool pointer around if it is being dealloced
      if (thr->last_pool == b)
        thr->last_pool = 0;
    } else {
      thr->last_pool = b;
    }
#endif /* BufStats */
  }
}
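
/* Editor's note on the release path above: a freed buffer is first merged
   backward (via bb.prevfree) into an immediately preceding free block, then
   merged forward into an immediately following free block, so two free
   blocks are never left adjacent in memory. Only after coalescing is the
   block considered for hand-back through relfcn, and the last remaining pool
   block is kept until __kmp_finalize_bget(). */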

/*  BECTL  --  Establish automatic pool expansion control  */
static void bectl(kmp_info_t *th, bget_compact_t compact,
                  bget_acquire_t acquire, bget_release_t release,
                  bufsize pool_incr) {
  thr_data_t *thr = get_thr_data(th);

  thr->compfcn = compact;
  thr->acqfcn = acquire;
  thr->relfcn = release;
  thr->exp_incr = pool_incr;
}

/*  BPOOL  --  Add a region of memory to the buffer pool.  */
static void bpool(kmp_info_t *th, void *buf, bufsize len) {
  /*    int bin = 0; */
  thr_data_t *thr = get_thr_data(th);
  bfhead_t *b = BFH(buf);
  bhead_t *bn;

  __kmp_bget_dequeue(th); /* Release any queued buffers */

#ifdef SizeQuant
  len &= ~(SizeQuant - 1);
#endif
  if (thr->pool_len == 0) {
    thr->pool_len = len;
  } else if (len != thr->pool_len) {
    thr->pool_len = -1;
  }
#if BufStats
  thr->numpget++; /* Number of block acquisitions */
  thr->numpblk++; /* Number of blocks total */
  KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);
#endif /* BufStats */

  /* Since the block is initially occupied by a single free  buffer,
     it  had  better  not  be  (much) larger than the largest buffer
     whose size we can store in bhead.bb.bsize. */
  KMP_DEBUG_ASSERT(len - sizeof(bhead_t) <= -((bufsize)ESent + 1));

  /* Clear  the  backpointer at  the start of the block to indicate that
     there  is  no  free  block  prior  to  this   one.    That   blocks
     recombination when the first block in memory is released. */
  b->bh.bb.prevfree = 0;

  /* Create a dummy allocated buffer at the end of the pool.  This dummy
     buffer is seen when a buffer at the end of the pool is released and
     blocks  recombination  of  the last buffer with the dummy buffer at
     the end.  The length in the dummy buffer  is  set  to  the  largest
     negative  number  to  denote  the  end  of  the pool for diagnostic
     routines (this specific value is  not  counted  on  by  the  actual
     allocation and release functions). */
  len -= sizeof(bhead_t);
  b->bh.bb.bsize = (bufsize)len;
  /* Set the owner of this buffer */
  TCW_PTR(b->bh.bb.bthr,
          (kmp_info_t *)((kmp_uintptr_t)th |
                         1)); // mark the buffer as allocated address

  /* Chain the new block to the free list. */
  __kmp_bget_insert_into_freelist(thr, b);

#ifdef FreeWipe
  (void)memset(((char *)b) + sizeof(bfhead_t), 0x55,
               (size_t)(len - sizeof(bfhead_t)));
#endif
  bn = BH(((char *)b) + len);
  bn->bb.prevfree = (bufsize)len;
  /* Definition of ESent assumes two's complement! */
  KMP_DEBUG_ASSERT((~0) == -1 && (bn != 0));

  bn->bb.bsize = ESent;
}
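
/* Editor's note: after bpool() the region looks like this in memory:

     [ bhead_t | free space ................ ][ bhead_t sentinel ]
       bsize    = len - sizeof(bhead_t) (free)  bsize    = ESent
       prevfree = 0                             prevfree = len - sizeof(bhead_t)

   The trailing sentinel is a permanently "allocated" dummy header that stops
   brel() from coalescing past the end of the pool. */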

/*  BFREED  --  Dump the free lists for this thread. */
static void bfreed(kmp_info_t *th) {
  int bin = 0, count = 0;
  int gtid = __kmp_gtid_from_thread(th);
  thr_data_t *thr = get_thr_data(th);

#if BufStats
  __kmp_printf_no_lock("__kmp_printpool: T#%d total=%" KMP_UINT64_SPEC
                       " get=%" KMP_INT64_SPEC " rel=%" KMP_INT64_SPEC
                       " pblk=%" KMP_INT64_SPEC " pget=%" KMP_INT64_SPEC
                       " prel=%" KMP_INT64_SPEC " dget=%" KMP_INT64_SPEC
                       " drel=%" KMP_INT64_SPEC "\n",
                       gtid, (kmp_uint64)thr->totalloc, (kmp_int64)thr->numget,
                       (kmp_int64)thr->numrel, (kmp_int64)thr->numpblk,
                       (kmp_int64)thr->numpget, (kmp_int64)thr->numprel,
                       (kmp_int64)thr->numdget, (kmp_int64)thr->numdrel);
#endif

  for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
    bfhead_t *b;

    for (b = thr->freelist[bin].ql.flink; b != &thr->freelist[bin];
         b = b->ql.flink) {
      bufsize bs = b->bh.bb.bsize;

      KMP_DEBUG_ASSERT(b->ql.blink->ql.flink == b);
      KMP_DEBUG_ASSERT(b->ql.flink->ql.blink == b);
      KMP_DEBUG_ASSERT(bs > 0);

      count += 1;

      __kmp_printf_no_lock(
          "__kmp_printpool: T#%d Free block: 0x%p size %6ld bytes.\n", gtid, b,
          (long)bs);
#ifdef FreeWipe
      {
        char *lerr = ((char *)b) + sizeof(bfhead_t);
        if ((bs > sizeof(bfhead_t)) &&
            ((*lerr != 0x55) ||
             (memcmp(lerr, lerr + 1, (size_t)(bs - (sizeof(bfhead_t) + 1))) !=
              0))) {
          __kmp_printf_no_lock("__kmp_printpool: T#%d     (Contents of above "
                               "free block have been overstored.)\n",
                               gtid);
        }
      }
#endif
    }
  }

  if (count == 0)
    __kmp_printf_no_lock("__kmp_printpool: T#%d No free blocks\n", gtid);
}

void __kmp_initialize_bget(kmp_info_t *th) {
  KMP_DEBUG_ASSERT(SizeQuant >= sizeof(void *) && (th != 0));

  set_thr_data(th);

  bectl(th, (bget_compact_t)0, (bget_acquire_t)malloc, (bget_release_t)free,
        (bufsize)__kmp_malloc_pool_incr);
}

void __kmp_finalize_bget(kmp_info_t *th) {
  thr_data_t *thr;
  bfhead_t *b;

  KMP_DEBUG_ASSERT(th != 0);

#if BufStats
  thr = (thr_data_t *)th->th.th_local.bget_data;
  KMP_DEBUG_ASSERT(thr != NULL);
  b = thr->last_pool;

  /*  If a block-release function is defined, and this free buffer constitutes
      the entire block, release it. Note that pool_len is defined in such a way
      that the test will fail unless all pool blocks are the same size.  */

  // Deallocate the last pool if one exists because we no longer do it in brel()
  if (thr->relfcn != 0 && b != 0 && thr->numpblk != 0 &&
      b->bh.bb.bsize == (bufsize)(thr->pool_len - sizeof(bhead_t))) {
    KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);
    KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.bsize == ESent);
    KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.prevfree ==
                     b->bh.bb.bsize);

    /*  Unlink the buffer from the free list  */
    __kmp_bget_remove_from_freelist(b);

    KE_TRACE(10, ("%%%%%% FREE( %p )\n", (void *)b));

    (*thr->relfcn)(b);
    thr->numprel++; /* Nr of expansion block releases */
    thr->numpblk--; /* Total number of blocks */
    KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);
  }
#endif /* BufStats */

  /* Deallocate bget_data */
  if (th->th.th_local.bget_data != NULL) {
    __kmp_free(th->th.th_local.bget_data);
    th->th.th_local.bget_data = NULL;
  }
}

void kmpc_set_poolsize(size_t size) {
  bectl(__kmp_get_thread(), (bget_compact_t)0, (bget_acquire_t)malloc,
        (bget_release_t)free, (bufsize)size);
}

size_t kmpc_get_poolsize(void) {
  thr_data_t *p;

  p = get_thr_data(__kmp_get_thread());

  return p->exp_incr;
}

void kmpc_set_poolmode(int mode) {
  thr_data_t *p;

  if (mode == bget_mode_fifo || mode == bget_mode_lifo ||
      mode == bget_mode_best) {
    p = get_thr_data(__kmp_get_thread());
    p->mode = (bget_mode_t)mode;
  }
}

int kmpc_get_poolmode(void) {
  thr_data_t *p;

  p = get_thr_data(__kmp_get_thread());

  return p->mode;
}

void kmpc_get_poolstat(size_t *maxmem, size_t *allmem) {
  kmp_info_t *th = __kmp_get_thread();
  bufsize a, b;

  __kmp_bget_dequeue(th); /* Release any queued buffers */

  bcheck(th, &a, &b);

  *maxmem = a;
  *allmem = b;
}

void kmpc_poolprint(void) {
  kmp_info_t *th = __kmp_get_thread();

  __kmp_bget_dequeue(th); /* Release any queued buffers */

  bfreed(th);
}

#endif // #if KMP_USE_BGET

void *kmpc_malloc(size_t size) {
  void *ptr;
  ptr = bget(__kmp_entry_thread(), (bufsize)(size + sizeof(ptr)));
  if (ptr != NULL) {
    // save allocated pointer just before one returned to user
    *(void **)ptr = ptr;
    ptr = (void **)ptr + 1;
  }
  return ptr;
}
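
/* Editor's note: kmpc_malloc(), kmpc_calloc() and kmpc_realloc() over-allocate
   by sizeof(void *) and stash the pointer returned by bget()/bgetz() in the
   word just before the address handed to the user; kmpc_free() and
   kmpc_realloc() read it back with *((void **)ptr - 1) to recover the real
   start of the buffer. */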

#define IS_POWER_OF_TWO(n) (((n) & ((n)-1)) == 0)

void *kmpc_aligned_malloc(size_t size, size_t alignment) {
  void *ptr;
  void *ptr_allocated;
  KMP_DEBUG_ASSERT(alignment < 32 * 1024); // Alignment should not be too big
  if (!IS_POWER_OF_TWO(alignment)) {
    // AC: do we need to issue a warning here?
    errno = EINVAL;
    return NULL;
  }
  size = size + sizeof(void *) + alignment;
  ptr_allocated = bget(__kmp_entry_thread(), (bufsize)size);
  if (ptr_allocated != NULL) {
    // save allocated pointer just before one returned to user
    ptr = (void *)(((kmp_uintptr_t)ptr_allocated + sizeof(void *) + alignment) &
                   ~(alignment - 1));
    *((void **)ptr - 1) = ptr_allocated;
  } else {
    ptr = NULL;
  }
  return ptr;
}
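
/* Worked example (editor's note, assuming an LP64 target where
   sizeof(void *) == 8): with alignment == 64 and ptr_allocated == 0x1008,
   ((0x1008 + 8 + 64) & ~63) == 0x1040, which is 64-byte aligned and leaves at
   least sizeof(void *) bytes below it for the stashed ptr_allocated;
   over-allocating by sizeof(void *) + alignment guarantees the rounded-up
   address still lies inside the bget() buffer. */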

void *kmpc_calloc(size_t nelem, size_t elsize) {
  void *ptr;
  ptr = bgetz(__kmp_entry_thread(), (bufsize)(nelem * elsize + sizeof(ptr)));
  if (ptr != NULL) {
    // save allocated pointer just before one returned to user
    *(void **)ptr = ptr;
    ptr = (void **)ptr + 1;
  }
  return ptr;
}

void *kmpc_realloc(void *ptr, size_t size) {
  void *result = NULL;
  if (ptr == NULL) {
    // If pointer is NULL, realloc behaves like malloc.
    result = bget(__kmp_entry_thread(), (bufsize)(size + sizeof(ptr)));
    // save allocated pointer just before one returned to user
    if (result != NULL) {
      *(void **)result = result;
      result = (void **)result + 1;
    }
  } else if (size == 0) {
    // If size is 0, realloc behaves like free.
    // The thread must be registered by the call to kmpc_malloc() or
    // kmpc_calloc() before.
    // So it should be safe to call __kmp_get_thread(), not
    // __kmp_entry_thread().
    KMP_ASSERT(*((void **)ptr - 1));
    brel(__kmp_get_thread(), *((void **)ptr - 1));
  } else {
    result = bgetr(__kmp_entry_thread(), *((void **)ptr - 1),
                   (bufsize)(size + sizeof(ptr)));
    if (result != NULL) {
      *(void **)result = result;
      result = (void **)result + 1;
    }
  }
  return result;
}

// NOTE: the library must have already been initialized by a previous allocate
void kmpc_free(void *ptr) {
  if (!__kmp_init_serial) {
    return;
  }
  if (ptr != NULL) {
    kmp_info_t *th = __kmp_get_thread();
    __kmp_bget_dequeue(th); /* Release any queued buffers */
    // extract allocated pointer and free it
    KMP_ASSERT(*((void **)ptr - 1));
    brel(th, *((void **)ptr - 1));
  }
}

void *___kmp_thread_malloc(kmp_info_t *th, size_t size KMP_SRC_LOC_DECL) {
  void *ptr;
  KE_TRACE(30, ("-> __kmp_thread_malloc( %p, %d ) called from %s:%d\n", th,
                (int)size KMP_SRC_LOC_PARM));
  ptr = bget(th, (bufsize)size);
  KE_TRACE(30, ("<- __kmp_thread_malloc() returns %p\n", ptr));
  return ptr;
}

void *___kmp_thread_calloc(kmp_info_t *th, size_t nelem,
                           size_t elsize KMP_SRC_LOC_DECL) {
  void *ptr;
  KE_TRACE(30, ("-> __kmp_thread_calloc( %p, %d, %d ) called from %s:%d\n", th,
                (int)nelem, (int)elsize KMP_SRC_LOC_PARM));
  ptr = bgetz(th, (bufsize)(nelem * elsize));
  KE_TRACE(30, ("<- __kmp_thread_calloc() returns %p\n", ptr));
  return ptr;
}

void *___kmp_thread_realloc(kmp_info_t *th, void *ptr,
                            size_t size KMP_SRC_LOC_DECL) {
  KE_TRACE(30, ("-> __kmp_thread_realloc( %p, %p, %d ) called from %s:%d\n", th,
                ptr, (int)size KMP_SRC_LOC_PARM));
  ptr = bgetr(th, ptr, (bufsize)size);
  KE_TRACE(30, ("<- __kmp_thread_realloc() returns %p\n", ptr));
  return ptr;
}

void ___kmp_thread_free(kmp_info_t *th, void *ptr KMP_SRC_LOC_DECL) {
  KE_TRACE(30, ("-> __kmp_thread_free( %p, %p ) called from %s:%d\n", th,
                ptr KMP_SRC_LOC_PARM));
  if (ptr != NULL) {
    __kmp_bget_dequeue(th); /* Release any queued buffers */
    brel(th, ptr);
  }
  KE_TRACE(30, ("<- __kmp_thread_free()\n"));
}

/* OMP 5.0 Memory Management support */
static const char *kmp_mk_lib_name;
static void *h_memkind;
/* memkind experimental API: */
// memkind_alloc
static void *(*kmp_mk_alloc)(void *k, size_t sz);
// memkind_free
static void (*kmp_mk_free)(void *kind, void *ptr);
// memkind_check_available
static int (*kmp_mk_check)(void *kind);
// kinds we are going to use
static void **mk_default;
static void **mk_interleave;
static void **mk_hbw;
static void **mk_hbw_interleave;
static void **mk_hbw_preferred;
static void **mk_hugetlb;
static void **mk_hbw_hugetlb;
static void **mk_hbw_preferred_hugetlb;

#if KMP_OS_UNIX && KMP_DYNAMIC_LIB
static inline void chk_kind(void ***pkind) {
  KMP_DEBUG_ASSERT(pkind);
  if (*pkind) // symbol found
    if (kmp_mk_check(**pkind)) // kind not available or error
      *pkind = NULL;
}
#endif

void __kmp_init_memkind() {
// as of 2018-07-31 memkind does not support Windows*, exclude it for now
#if KMP_OS_UNIX && KMP_DYNAMIC_LIB
  // use of statically linked memkind is problematic, as it depends on libnuma
  kmp_mk_lib_name = "libmemkind.so";
  h_memkind = dlopen(kmp_mk_lib_name, RTLD_LAZY);
  if (h_memkind) {
    kmp_mk_check = (int (*)(void *))dlsym(h_memkind, "memkind_check_available");
    kmp_mk_alloc =
        (void *(*)(void *, size_t))dlsym(h_memkind, "memkind_malloc");
    kmp_mk_free = (void (*)(void *, void *))dlsym(h_memkind, "memkind_free");
    mk_default = (void **)dlsym(h_memkind, "MEMKIND_DEFAULT");
    if (kmp_mk_check && kmp_mk_alloc && kmp_mk_free && mk_default &&
        !kmp_mk_check(*mk_default)) {
      __kmp_memkind_available = 1;
      mk_interleave = (void **)dlsym(h_memkind, "MEMKIND_INTERLEAVE");
      chk_kind(&mk_interleave);
      mk_hbw = (void **)dlsym(h_memkind, "MEMKIND_HBW");
      chk_kind(&mk_hbw);
      mk_hbw_interleave = (void **)dlsym(h_memkind, "MEMKIND_HBW_INTERLEAVE");
      chk_kind(&mk_hbw_interleave);
      mk_hbw_preferred = (void **)dlsym(h_memkind, "MEMKIND_HBW_PREFERRED");
      chk_kind(&mk_hbw_preferred);
      mk_hugetlb = (void **)dlsym(h_memkind, "MEMKIND_HUGETLB");
      chk_kind(&mk_hugetlb);
      mk_hbw_hugetlb = (void **)dlsym(h_memkind, "MEMKIND_HBW_HUGETLB");
      chk_kind(&mk_hbw_hugetlb);
      mk_hbw_preferred_hugetlb =
          (void **)dlsym(h_memkind, "MEMKIND_HBW_PREFERRED_HUGETLB");
      chk_kind(&mk_hbw_preferred_hugetlb);
      KE_TRACE(25, ("__kmp_init_memkind: memkind library initialized\n"));
      return; // success
    }
    dlclose(h_memkind); // failure
    h_memkind = NULL;
  }
  kmp_mk_check = NULL;
  kmp_mk_alloc = NULL;
  kmp_mk_free = NULL;
  mk_default = NULL;
  mk_interleave = NULL;
  mk_hbw = NULL;
  mk_hbw_interleave = NULL;
  mk_hbw_preferred = NULL;
  mk_hugetlb = NULL;
  mk_hbw_hugetlb = NULL;
  mk_hbw_preferred_hugetlb = NULL;
#else
  kmp_mk_lib_name = "";
  h_memkind = NULL;
  kmp_mk_check = NULL;
  kmp_mk_alloc = NULL;
  kmp_mk_free = NULL;
  mk_default = NULL;
  mk_interleave = NULL;
  mk_hbw = NULL;
  mk_hbw_interleave = NULL;
  mk_hbw_preferred = NULL;
  mk_hugetlb = NULL;
  mk_hbw_hugetlb = NULL;
  mk_hbw_preferred_hugetlb = NULL;
#endif
}
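
/* Editor's note: the MEMKIND_* names resolved above are global variables of
   the memkind library (each holding a kind handle), not functions, which is
   why they are stored as void ** and dereferenced (e.g. *mk_default) when
   passed to kmp_mk_alloc/kmp_mk_free/kmp_mk_check. Binding them lazily with
   dlopen/dlsym keeps libmemkind.so an optional runtime dependency. */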

void __kmp_fini_memkind() {
#if KMP_OS_UNIX && KMP_DYNAMIC_LIB
  if (__kmp_memkind_available)
    KE_TRACE(25, ("__kmp_fini_memkind: finalize memkind library\n"));
  if (h_memkind) {
    dlclose(h_memkind);
    h_memkind = NULL;
  }
  kmp_mk_check = NULL;
  kmp_mk_alloc = NULL;
  kmp_mk_free = NULL;
  mk_default = NULL;
  mk_interleave = NULL;
  mk_hbw = NULL;
  mk_hbw_interleave = NULL;
  mk_hbw_preferred = NULL;
  mk_hugetlb = NULL;
  mk_hbw_hugetlb = NULL;
  mk_hbw_preferred_hugetlb = NULL;
#endif
}

omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms,
                                             int ntraits,
                                             omp_alloctrait_t traits[]) {
  // OpenMP 5.0 only allows predefined memspaces
  KMP_DEBUG_ASSERT(ms == omp_default_mem_space || ms == omp_low_lat_mem_space ||
                   ms == omp_large_cap_mem_space || ms == omp_const_mem_space ||
                   ms == omp_high_bw_mem_space);
  kmp_allocator_t *al;
  int i;
  al = (kmp_allocator_t *)__kmp_allocate(sizeof(kmp_allocator_t)); // zeroed
  al->memspace = ms; // not used currently
  for (i = 0; i < ntraits; ++i) {
    switch (traits[i].key) {
    case OMP_ATK_THREADMODEL:
    case OMP_ATK_ACCESS:
    case OMP_ATK_PINNED:
      break;
    case OMP_ATK_ALIGNMENT:
      al->alignment = traits[i].value;
      KMP_ASSERT(IS_POWER_OF_TWO(al->alignment));
      break;
    case OMP_ATK_POOL_SIZE:
      al->pool_size = traits[i].value;
      break;
    case OMP_ATK_FALLBACK:
      al->fb = (omp_alloctrait_value_t)traits[i].value;
      KMP_DEBUG_ASSERT(
          al->fb == OMP_ATV_DEFAULT_MEM_FB || al->fb == OMP_ATV_NULL_FB ||
          al->fb == OMP_ATV_ABORT_FB || al->fb == OMP_ATV_ALLOCATOR_FB);
      break;
    case OMP_ATK_FB_DATA:
      al->fb_data = RCAST(kmp_allocator_t *, traits[i].value);
      break;
    case OMP_ATK_PARTITION:
      al->memkind = RCAST(void **, traits[i].value);
      break;
    default:
      KMP_ASSERT2(0, "Unexpected allocator trait");
    }
  }
  if (al->fb == 0) {
    // set default allocator
    al->fb = OMP_ATV_DEFAULT_MEM_FB;
    al->fb_data = (kmp_allocator_t *)omp_default_mem_alloc;
  } else if (al->fb == OMP_ATV_ALLOCATOR_FB) {
    KMP_ASSERT(al->fb_data != NULL);
  } else if (al->fb == OMP_ATV_DEFAULT_MEM_FB) {
    al->fb_data = (kmp_allocator_t *)omp_default_mem_alloc;
  }
  if (__kmp_memkind_available) {
    // Let's use memkind library if available
    if (ms == omp_high_bw_mem_space) {
      if (al->memkind == (void *)OMP_ATV_INTERLEAVED && mk_hbw_interleave) {
        al->memkind = mk_hbw_interleave;
      } else if (mk_hbw_preferred) {
        // AC: do not try to use MEMKIND_HBW for now, because memkind library
        // cannot reliably detect exhaustion of HBW memory.
        // It could be possible using hbw_verify_memory_region() but memkind
        // manual says: "Using this function in production code may result in
        // serious performance penalty".
        al->memkind = mk_hbw_preferred;
      } else {
        // HBW is requested but not available --> return NULL allocator
        __kmp_free(al);
        return omp_null_allocator;
      }
    } else {
      if (al->memkind == (void *)OMP_ATV_INTERLEAVED && mk_interleave) {
        al->memkind = mk_interleave;
      } else {
        al->memkind = mk_default;
      }
    }
  } else {
    if (ms == omp_high_bw_mem_space) {
      // cannot detect HBW memory presence without memkind library
      __kmp_free(al);
      return omp_null_allocator;
    }
  }
  return (omp_allocator_handle_t)al;
}
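
/* Illustrative sketch (editor's addition): __kmpc_init_allocator() sits
   behind the user-facing OpenMP 5.0 allocator API. A typical user-side
   sequence, shown for orientation only, is: */
#if 0
#include <omp.h>

void user_side_allocator_example(void) {
  omp_alloctrait_t traits[2] = {{omp_atk_alignment, 64},
                                {omp_atk_fallback, omp_atv_null_fb}};
  omp_allocator_handle_t a =
      omp_init_allocator(omp_high_bw_mem_space, 2, traits);
  if (a != omp_null_allocator) {
    double *v = (double *)omp_alloc(1024 * sizeof(double), a);
    omp_free(v, a);
    omp_destroy_allocator(a);
  }
}
#endif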
1420
1421void __kmpc_destroy_allocator(int gtid, omp_allocator_handle_t allocator) {
1422  if (allocator > kmp_max_mem_alloc)
1423    __kmp_free(allocator);
1424}
1425
1426void __kmpc_set_default_allocator(int gtid, omp_allocator_handle_t allocator) {
1427  if (allocator == omp_null_allocator)
1428    allocator = omp_default_mem_alloc;
1429  __kmp_threads[gtid]->th.th_def_allocator = allocator;
1430}
1431
1432omp_allocator_handle_t __kmpc_get_default_allocator(int gtid) {
1433  return __kmp_threads[gtid]->th.th_def_allocator;
1434}
1435
1436typedef struct kmp_mem_desc { // Memory block descriptor
1437  void *ptr_alloc; // Pointer returned by allocator
1438  size_t size_a; // Size of allocated memory block (initial+descriptor+align)
1439  void *ptr_align; // Pointer to aligned memory, returned
1440  kmp_allocator_t *allocator; // allocator
1441} kmp_mem_desc_t;
1442static int alignment = sizeof(void *); // let's align to pointer size
1443
1444void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) {
1445  void *ptr = NULL;
1446  kmp_allocator_t *al;
1447  KMP_DEBUG_ASSERT(__kmp_init_serial);
1448  if (allocator == omp_null_allocator)
1449    allocator = __kmp_threads[gtid]->th.th_def_allocator;
1450
1451  KE_TRACE(25, ("__kmpc_alloc: T#%d (%d, %p)\n", gtid, (int)size, allocator));
1452  al = RCAST(kmp_allocator_t *, CCAST(omp_allocator_handle_t, allocator));
1453
1454  int sz_desc = sizeof(kmp_mem_desc_t);
1455  kmp_mem_desc_t desc;
1456  kmp_uintptr_t addr; // address returned by allocator
1457  kmp_uintptr_t addr_align; // address to return to caller
1458  kmp_uintptr_t addr_descr; // address of memory block descriptor
1459  int align = alignment; // default alignment
1460  if (allocator > kmp_max_mem_alloc && al->alignment > 0) {
1461    align = al->alignment; // alignment requested by user
1462  }
1463  desc.size_a = size + sz_desc + align;
1464
1465  if (__kmp_memkind_available) {
1466    if (allocator < kmp_max_mem_alloc) {
1467      // pre-defined allocator
1468      if (allocator == omp_high_bw_mem_alloc && mk_hbw_preferred) {
1469        ptr = kmp_mk_alloc(*mk_hbw_preferred, desc.size_a);
1470      } else {
1471        ptr = kmp_mk_alloc(*mk_default, desc.size_a);
1472      }
1473    } else if (al->pool_size > 0) {
1474      // custom allocator with pool size requested
1475      kmp_uint64 used =
1476          KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, desc.size_a);
1477      if (used + desc.size_a > al->pool_size) {
1478        // not enough space, need to go fallback path
1479        KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);
1480        if (al->fb == OMP_ATV_DEFAULT_MEM_FB) {
1481          al = (kmp_allocator_t *)omp_default_mem_alloc;
1482          ptr = kmp_mk_alloc(*mk_default, desc.size_a);
1483        } else if (al->fb == OMP_ATV_ABORT_FB) {
1484          KMP_ASSERT(0); // abort fallback requested
1485        } else if (al->fb == OMP_ATV_ALLOCATOR_FB) {
1486          KMP_ASSERT(al != al->fb_data);
1487          al = al->fb_data;
1488          return __kmpc_alloc(gtid, size, (omp_allocator_handle_t)al);
1489        } // else ptr == NULL;
1490      } else {
1491        // pool has enough space
1492        ptr = kmp_mk_alloc(*al->memkind, desc.size_a);
1493        if (ptr == NULL) {
1494          if (al->fb == OMP_ATV_DEFAULT_MEM_FB) {
1495            al = (kmp_allocator_t *)omp_default_mem_alloc;
1496            ptr = kmp_mk_alloc(*mk_default, desc.size_a);
1497          } else if (al->fb == OMP_ATV_ABORT_FB) {
1498            KMP_ASSERT(0); // abort fallback requested
1499          } else if (al->fb == OMP_ATV_ALLOCATOR_FB) {
1500            KMP_ASSERT(al != al->fb_data);
1501            al = al->fb_data;
1502            return __kmpc_alloc(gtid, size, (omp_allocator_handle_t)al);
1503          }
1504        }
1505      }
1506    } else {
1507      // custom allocator, pool size not requested
1508      ptr = kmp_mk_alloc(*al->memkind, desc.size_a);
1509      if (ptr == NULL) {
1510        if (al->fb == OMP_ATV_DEFAULT_MEM_FB) {
1511          al = (kmp_allocator_t *)omp_default_mem_alloc;
1512          ptr = kmp_mk_alloc(*mk_default, desc.size_a);
1513        } else if (al->fb == OMP_ATV_ABORT_FB) {
1514          KMP_ASSERT(0); // abort fallback requested
1515        } else if (al->fb == OMP_ATV_ALLOCATOR_FB) {
1516          KMP_ASSERT(al != al->fb_data);
1517          al = al->fb_data;
1518          return __kmpc_alloc(gtid, size, (omp_allocator_handle_t)al);
1519        }
1520      }
1521    }
1522  } else if (allocator < kmp_max_mem_alloc) {
1523    // pre-defined allocator
1524    if (allocator == omp_high_bw_mem_alloc) {
1525      // ptr = NULL;
1526    } else {
1527      ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a);
1528    }
1529  } else if (al->pool_size > 0) {
1530    // custom allocator with pool size requested
1531    kmp_uint64 used =
1532        KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, desc.size_a);
1533    if (used + desc.size_a > al->pool_size) {
1534      // not enough space, need to go fallback path
1535      KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);
1536      if (al->fb == OMP_ATV_DEFAULT_MEM_FB) {
1537        al = (kmp_allocator_t *)omp_default_mem_alloc;
1538        ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a);
1539      } else if (al->fb == OMP_ATV_ABORT_FB) {
1540        KMP_ASSERT(0); // abort fallback requested
1541      } else if (al->fb == OMP_ATV_ALLOCATOR_FB) {
1542        KMP_ASSERT(al != al->fb_data);
1543        al = al->fb_data;
1544        return __kmpc_alloc(gtid, size, (omp_allocator_handle_t)al);
1545      } // else ptr == NULL;
1546    } else {
1547      // pool has enough space
1548      ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a);
1549      if (ptr == NULL && al->fb == OMP_ATV_ABORT_FB) {
1550        KMP_ASSERT(0); // abort fallback requested
1551      } // no sense to look for another fallback because of same internal alloc
1552    }
1553  } else {
1554    // custom allocator, pool size not requested
1555    ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a);
1556    if (ptr == NULL && al->fb == OMP_ATV_ABORT_FB) {
1557      KMP_ASSERT(0); // abort fallback requested
1558    } // no sense to look for another fallback because of same internal alloc
1559  }
1560  KE_TRACE(10, ("__kmpc_alloc: T#%d %p=alloc(%d)\n", gtid, ptr, desc.size_a));
1561  if (ptr == NULL)
1562    return NULL;
1563
1564  addr = (kmp_uintptr_t)ptr;
1565  addr_align = (addr + sz_desc + align - 1) & ~(align - 1);
1566  addr_descr = addr_align - sz_desc;
1567
1568  desc.ptr_alloc = ptr;
1569  desc.ptr_align = (void *)addr_align;
1570  desc.allocator = al;
1571  *((kmp_mem_desc_t *)addr_descr) = desc; // save descriptor contents
1572  KMP_MB();
1573
1574  KE_TRACE(25, ("__kmpc_alloc returns %p, T#%d\n", desc.ptr_align, gtid));
1575  return desc.ptr_align;
1576}
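// A minimal user-level sketch (not part of this file) of how the fallback
// handling above is typically reached; the trait and routine names are from
// the standard OpenMP 5.0 allocator API, and the pool size / fallback values
// are illustrative only:
//
//   omp_alloctrait_t traits[] = {
//       {omp_atk_pool_size, 1024 * 1024},           // 1 MB pool
//       {omp_atk_fallback, omp_atv_default_mem_fb}  // fall back to default memory
//   };
//   omp_allocator_handle_t a =
//       omp_init_allocator(omp_default_mem_space, 2, traits);
//   void *p = omp_alloc(128, a); // serviced by the entry point above
//   omp_free(p, a);
//   omp_destroy_allocator(a);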
1577
1578void __kmpc_free(int gtid, void *ptr, const omp_allocator_handle_t allocator) {
1579  KE_TRACE(25, ("__kmpc_free: T#%d free(%p,%p)\n", gtid, ptr, allocator));
1580  if (ptr == NULL)
1581    return;
1582
1583  kmp_allocator_t *al;
1584  omp_allocator_handle_t oal;
1585  al = RCAST(kmp_allocator_t *, CCAST(omp_allocator_handle_t, allocator));
1586  kmp_mem_desc_t desc;
1587  kmp_uintptr_t addr_align; // address to return to caller
1588  kmp_uintptr_t addr_descr; // address of memory block descriptor
1589
1590  addr_align = (kmp_uintptr_t)ptr;
1591  addr_descr = addr_align - sizeof(kmp_mem_desc_t);
1592  desc = *((kmp_mem_desc_t *)addr_descr); // read descriptor
1593
1594  KMP_DEBUG_ASSERT(desc.ptr_align == ptr);
1595  if (allocator) {
1596    KMP_DEBUG_ASSERT(desc.allocator == al || desc.allocator == al->fb_data);
1597  }
1598  al = desc.allocator;
1599  oal = (omp_allocator_handle_t)al; // cast to void* for comparisons
1600  KMP_DEBUG_ASSERT(al);
1601
1602  if (__kmp_memkind_available) {
1603    if (oal < kmp_max_mem_alloc) {
1604      // pre-defined allocator
1605      if (oal == omp_high_bw_mem_alloc && mk_hbw_preferred) {
1606        kmp_mk_free(*mk_hbw_preferred, desc.ptr_alloc);
1607      } else {
1608        kmp_mk_free(*mk_default, desc.ptr_alloc);
1609      }
1610    } else {
1611      if (al->pool_size > 0) { // custom allocator with pool size requested
1612        kmp_uint64 used =
1613            KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);
1614        (void)used; // to suppress compiler warning
1615        KMP_DEBUG_ASSERT(used >= desc.size_a);
1616      }
1617      kmp_mk_free(*al->memkind, desc.ptr_alloc);
1618    }
1619  } else {
1620    if (oal > kmp_max_mem_alloc && al->pool_size > 0) {
1621      kmp_uint64 used =
1622          KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);
1623      (void)used; // to suppress compiler warning
1624      KMP_DEBUG_ASSERT(used >= desc.size_a);
1625    }
1626    __kmp_thread_free(__kmp_thread_from_gtid(gtid), desc.ptr_alloc);
1627  }
1628  KE_TRACE(10, ("__kmpc_free: T#%d freed %p (%p)\n", gtid, desc.ptr_alloc,
1629                allocator));
1630}
1631
1632/* If LEAK_MEMORY is defined, __kmp_free() will *not* free memory. It causes
1633   memory leaks, but it may be useful for debugging memory corruption, use of
1634   freed pointers, etc. */
1635/* #define LEAK_MEMORY */
1636struct kmp_mem_descr { // Memory block descriptor.
1637  void *ptr_allocated; // Pointer returned by malloc(), subject for free().
1638  size_t size_allocated; // Size of allocated memory block.
1639  void *ptr_aligned; // Pointer to aligned memory, to be used by client code.
1640  size_t size_aligned; // Size of aligned memory block.
1641};
1642typedef struct kmp_mem_descr kmp_mem_descr_t;
1643
1644/* Allocate memory on requested boundary, fill allocated memory with 0x00.
1645   NULL is NEVER returned, __kmp_abort() is called in case of memory allocation
1646   error. Must use __kmp_free when freeing memory allocated by this routine! */
1647static void *___kmp_allocate_align(size_t size,
1648                                   size_t alignment KMP_SRC_LOC_DECL) {
1649  /* __kmp_allocate() allocates (via malloc()) a bigger memory block than
1650     requested so that it can return a properly aligned pointer. The original
1651     pointer returned by malloc() and the size of the allocated block are saved
1652     in a descriptor just before the aligned pointer. __kmp_free() uses this
1653     information -- it has to pass the original pointer, not the aligned one, to free().
1654
1655          +---------+------------+-----------------------------------+---------+
1656          | padding | descriptor |           aligned block           | padding |
1657          +---------+------------+-----------------------------------+---------+
1658          ^                      ^
1659          |                      |
1660          |                      +- Aligned pointer returned to caller
1661          +- Pointer returned by malloc()
1662
1663      The aligned block is filled with zeros; the padding areas are filled with 0xEF. */
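  // A hedged worked example of the computation below (values are illustrative,
  // assuming a 64-bit build where sizeof(kmp_mem_descr_t) == 32): with
  // alignment == 64 and malloc() returning 0x1008,
  //   addr_aligned = (0x1008 + 32 + 64) & ~(64 - 1) = 0x1040
  //   addr_descr   = 0x1040 - 32                    = 0x1020
  // so the descriptor lands at 0x1020, the caller gets the 64-byte aligned
  // pointer 0x1040, and the bytes below 0x1020 form the leading padding.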
1664
1665  kmp_mem_descr_t descr;
1666  kmp_uintptr_t addr_allocated; // Address returned by malloc().
1667  kmp_uintptr_t addr_aligned; // Aligned address to return to caller.
1668  kmp_uintptr_t addr_descr; // Address of memory block descriptor.
1669
1670  KE_TRACE(25, ("-> ___kmp_allocate_align( %d, %d ) called from %s:%d\n",
1671                (int)size, (int)alignment KMP_SRC_LOC_PARM));
1672
1673  KMP_DEBUG_ASSERT(alignment < 32 * 1024); // Alignment should not be too big.
1674  KMP_DEBUG_ASSERT(sizeof(void *) <= sizeof(kmp_uintptr_t));
1675  // Make sure kmp_uintptr_t is enough to store addresses.
1676
1677  descr.size_aligned = size;
1678  descr.size_allocated =
1679      descr.size_aligned + sizeof(kmp_mem_descr_t) + alignment;
1680
1681#if KMP_DEBUG
1682  descr.ptr_allocated = _malloc_src_loc(descr.size_allocated, _file_, _line_);
1683#else
1684  descr.ptr_allocated = malloc_src_loc(descr.size_allocated KMP_SRC_LOC_PARM);
1685#endif
1686  KE_TRACE(10, ("   malloc( %d ) returned %p\n", (int)descr.size_allocated,
1687                descr.ptr_allocated));
1688  if (descr.ptr_allocated == NULL) {
1689    KMP_FATAL(OutOfHeapMemory);
1690  }
1691
1692  addr_allocated = (kmp_uintptr_t)descr.ptr_allocated;
1693  addr_aligned =
1694      (addr_allocated + sizeof(kmp_mem_descr_t) + alignment) & ~(alignment - 1);
1695  addr_descr = addr_aligned - sizeof(kmp_mem_descr_t);
1696
1697  descr.ptr_aligned = (void *)addr_aligned;
1698
1699  KE_TRACE(26, ("   ___kmp_allocate_align: "
1700                "ptr_allocated=%p, size_allocated=%d, "
1701                "ptr_aligned=%p, size_aligned=%d\n",
1702                descr.ptr_allocated, (int)descr.size_allocated,
1703                descr.ptr_aligned, (int)descr.size_aligned));
1704
1705  KMP_DEBUG_ASSERT(addr_allocated <= addr_descr);
1706  KMP_DEBUG_ASSERT(addr_descr + sizeof(kmp_mem_descr_t) == addr_aligned);
1707  KMP_DEBUG_ASSERT(addr_aligned + descr.size_aligned <=
1708                   addr_allocated + descr.size_allocated);
1709  KMP_DEBUG_ASSERT(addr_aligned % alignment == 0);
1710#ifdef KMP_DEBUG
1711  memset(descr.ptr_allocated, 0xEF, descr.size_allocated);
1712// Fill allocated memory block with 0xEF.
1713#endif
1714  memset(descr.ptr_aligned, 0x00, descr.size_aligned);
1715  // Fill the aligned memory block (which is intended for use by the caller)
1716  // with 0x00. Do not put this memset under a KMP_DEBUG condition! Many
1717  // callers expect zeroed memory.
1718  // (Padding bytes remain filled with 0xEF in the debugging library.)
1720  *((kmp_mem_descr_t *)addr_descr) = descr;
1721
1722  KMP_MB();
1723
1724  KE_TRACE(25, ("<- ___kmp_allocate_align() returns %p\n", descr.ptr_aligned));
1725  return descr.ptr_aligned;
1726} // func ___kmp_allocate_align
1727
1728/* Allocate memory on cache line boundary, fill allocated memory with 0x00.
1729   Do not call this func directly! Use __kmp_allocate macro instead.
1730   NULL is NEVER returned, __kmp_abort() is called in case of memory allocation
1731   error. Must use __kmp_free when freeing memory allocated by this routine! */
1732void *___kmp_allocate(size_t size KMP_SRC_LOC_DECL) {
1733  void *ptr;
1734  KE_TRACE(25, ("-> __kmp_allocate( %d ) called from %s:%d\n",
1735                (int)size KMP_SRC_LOC_PARM));
1736  ptr = ___kmp_allocate_align(size, __kmp_align_alloc KMP_SRC_LOC_PARM);
1737  KE_TRACE(25, ("<- __kmp_allocate() returns %p\n", ptr));
1738  return ptr;
1739} // func ___kmp_allocate
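// A small usage sketch (illustrative only; kmp_example_t is a hypothetical
// type). The __kmp_allocate/__kmp_free macros from kmp.h supply the
// KMP_SRC_LOC arguments, and the block returned here is zero-filled and
// aligned to __kmp_align_alloc:
//
//   kmp_example_t *p = (kmp_example_t *)__kmp_allocate(sizeof(kmp_example_t));
//   // ... use p; all fields start out zeroed ...
//   __kmp_free(p);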
1740
1741/* Allocate memory on page boundary, fill allocated memory with 0x00.
1742   Do not call this func directly! Use the __kmp_page_allocate macro instead.
1743   NULL is NEVER returned, __kmp_abort() is called in case of memory allocation
1744   error. Must use __kmp_free when freeing memory allocated by this routine! */
1745void *___kmp_page_allocate(size_t size KMP_SRC_LOC_DECL) {
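  // Note: the "page" size used for alignment below is hard-coded to 8 KB rather
  // than queried from the OS, so the result is 8 KB aligned regardless of the
  // actual system page size.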
1746  int page_size = 8 * 1024;
1747  void *ptr;
1748
1749  KE_TRACE(25, ("-> __kmp_page_allocate( %d ) called from %s:%d\n",
1750                (int)size KMP_SRC_LOC_PARM));
1751  ptr = ___kmp_allocate_align(size, page_size KMP_SRC_LOC_PARM);
1752  KE_TRACE(25, ("<- __kmp_page_allocate( %d ) returns %p\n", (int)size, ptr));
1753  return ptr;
1754} // ___kmp_page_allocate
1755
1756/* Free memory allocated by __kmp_allocate() and __kmp_page_allocate().
1757   In debug mode, fill the memory block with 0xEF before call to free(). */
1758void ___kmp_free(void *ptr KMP_SRC_LOC_DECL) {
1759  kmp_mem_descr_t descr;
1760  kmp_uintptr_t addr_allocated; // Address returned by malloc().
1761  kmp_uintptr_t addr_aligned; // Aligned address passed by caller.
1762
1763  KE_TRACE(25,
1764           ("-> __kmp_free( %p ) called from %s:%d\n", ptr KMP_SRC_LOC_PARM));
1765  KMP_ASSERT(ptr != NULL);
1766
1767  descr = *(kmp_mem_descr_t *)((kmp_uintptr_t)ptr - sizeof(kmp_mem_descr_t));
1768
1769  KE_TRACE(26, ("   __kmp_free:     "
1770                "ptr_allocated=%p, size_allocated=%d, "
1771                "ptr_aligned=%p, size_aligned=%d\n",
1772                descr.ptr_allocated, (int)descr.size_allocated,
1773                descr.ptr_aligned, (int)descr.size_aligned));
1774
1775  addr_allocated = (kmp_uintptr_t)descr.ptr_allocated;
1776  addr_aligned = (kmp_uintptr_t)descr.ptr_aligned;
1777
1778  KMP_DEBUG_ASSERT(addr_aligned % CACHE_LINE == 0);
1779  KMP_DEBUG_ASSERT(descr.ptr_aligned == ptr);
1780  KMP_DEBUG_ASSERT(addr_allocated + sizeof(kmp_mem_descr_t) <= addr_aligned);
1781  KMP_DEBUG_ASSERT(descr.size_aligned < descr.size_allocated);
1782  KMP_DEBUG_ASSERT(addr_aligned + descr.size_aligned <=
1783                   addr_allocated + descr.size_allocated);
1784
1785#ifdef KMP_DEBUG
1786  memset(descr.ptr_allocated, 0xEF, descr.size_allocated);
1787// Fill memory block with 0xEF, it helps catch using freed memory.
1788#endif
1789
1790#ifndef LEAK_MEMORY
1791  KE_TRACE(10, ("   free( %p )\n", descr.ptr_allocated));
1792#ifdef KMP_DEBUG
1793  _free_src_loc(descr.ptr_allocated, _file_, _line_);
1794#else
1795  free_src_loc(descr.ptr_allocated KMP_SRC_LOC_PARM);
1796#endif
1797#endif
1798  KMP_MB();
1799  KE_TRACE(25, ("<- __kmp_free() returns\n"));
1800} // func ___kmp_free
1801
1802#if USE_FAST_MEMORY == 3
1803// Allocate fast memory by first scanning the thread's free lists.
1804// If a chunk of the right size exists, grab it off the free list.
1805// Otherwise allocate a new block from the underlying BGET allocator.
1806
1807// AC: How to choose the limit? Just get 16 for now...
1808#define KMP_FREE_LIST_LIMIT 16
1809
1810// Always use 128 bytes for determining buckets for caching memory blocks
1811#define DCACHE_LINE 128
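// A reading aid for the code below (a summary, not normative): each thread
// keeps NUM_LISTS buckets of free lists with three heads per bucket --
//   th_free_list_self:  blocks freed by the owning thread; touched only by the
//                       owner, so no synchronization is needed;
//   th_free_list_sync:  blocks returned by other threads; pushed and popped
//                       with compare-and-swap;
//   th_free_list_other: a local staging queue of blocks that belong to some
//                       other thread; once its owner changes or its length
//                       exceeds KMP_FREE_LIST_LIMIT, the whole queue is pushed
//                       onto that owner's th_free_list_sync list.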
1812
1813void *___kmp_fast_allocate(kmp_info_t *this_thr, size_t size KMP_SRC_LOC_DECL) {
1814  void *ptr;
1815  int num_lines;
1816  int idx;
1817  int index;
1818  void *alloc_ptr;
1819  size_t alloc_size;
1820  kmp_mem_descr_t *descr;
1821
1822  KE_TRACE(25, ("-> __kmp_fast_allocate( T#%d, %d ) called from %s:%d\n",
1823                __kmp_gtid_from_thread(this_thr), (int)size KMP_SRC_LOC_PARM));
1824
1825  num_lines = (size + DCACHE_LINE - 1) / DCACHE_LINE;
1826  idx = num_lines - 1;
1827  KMP_DEBUG_ASSERT(idx >= 0);
1828  if (idx < 2) {
1829    index = 0; // idx is [ 0, 1 ], use first free list
1830    num_lines = 2; // 1, 2 cache lines or less than cache line
1831  } else if ((idx >>= 2) == 0) {
1832    index = 1; // idx is [ 2, 3 ], use second free list
1833    num_lines = 4; // 3, 4 cache lines
1834  } else if ((idx >>= 2) == 0) {
1835    index = 2; // idx is [ 4, 15 ], use third free list
1836    num_lines = 16; // 5, 6, ..., 16 cache lines
1837  } else if ((idx >>= 2) == 0) {
1838    index = 3; // idx is [ 16, 63 ], use fourth free list
1839    num_lines = 64; // 17, 18, ..., 64 cache lines
1840  } else {
1841    goto alloc_call; // 65 or more cache lines ( > 8KB ), don't use free lists
1842  }
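  // Summary of the bucket selection above (DCACHE_LINE == 128 bytes):
  //   request size           free list   block size handed out
  //   <=  2 cache lines      index 0       256 bytes
  //   <=  4 cache lines      index 1       512 bytes
  //   <= 16 cache lines      index 2      2048 bytes
  //   <= 64 cache lines      index 3      8192 bytes
  //   anything larger        not cached; handled directly by BGET below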
1843
1844  ptr = this_thr->th.th_free_lists[index].th_free_list_self;
1845  if (ptr != NULL) {
1846    // pop the head of no-sync free list
1847    this_thr->th.th_free_lists[index].th_free_list_self = *((void **)ptr);
1848    KMP_DEBUG_ASSERT(
1849        this_thr ==
1850        ((kmp_mem_descr_t *)((kmp_uintptr_t)ptr - sizeof(kmp_mem_descr_t)))
1851            ->ptr_aligned);
1852    goto end;
1853  }
1854  ptr = TCR_SYNC_PTR(this_thr->th.th_free_lists[index].th_free_list_sync);
1855  if (ptr != NULL) {
1856    // no-sync free list is empty, use sync free list (filled in by other
1857    // threads only)
1858    // pop the head of the sync free list, push NULL instead
1859    while (!KMP_COMPARE_AND_STORE_PTR(
1860        &this_thr->th.th_free_lists[index].th_free_list_sync, ptr, nullptr)) {
1861      KMP_CPU_PAUSE();
1862      ptr = TCR_SYNC_PTR(this_thr->th.th_free_lists[index].th_free_list_sync);
1863    }
1864    // push the rest of the chain onto the no-sync free list (it can be NULL
1865    // if that was the only block)
1866    this_thr->th.th_free_lists[index].th_free_list_self = *((void **)ptr);
1867    KMP_DEBUG_ASSERT(
1868        this_thr ==
1869        ((kmp_mem_descr_t *)((kmp_uintptr_t)ptr - sizeof(kmp_mem_descr_t)))
1870            ->ptr_aligned);
1871    goto end;
1872  }
1873
1874alloc_call:
1875  // haven't found block in the free lists, thus allocate it
1876  size = num_lines * DCACHE_LINE;
1877
1878  alloc_size = size + sizeof(kmp_mem_descr_t) + DCACHE_LINE;
1879  KE_TRACE(25, ("__kmp_fast_allocate: T#%d Calling __kmp_thread_malloc with "
1880                "alloc_size %d\n",
1881                __kmp_gtid_from_thread(this_thr), (int)alloc_size));
1882  alloc_ptr = bget(this_thr, (bufsize)alloc_size);
1883
1884  // align ptr to DCACHE_LINE
1885  ptr = (void *)((((kmp_uintptr_t)alloc_ptr) + sizeof(kmp_mem_descr_t) +
1886                  DCACHE_LINE) &
1887                 ~(DCACHE_LINE - 1));
1888  descr = (kmp_mem_descr_t *)(((kmp_uintptr_t)ptr) - sizeof(kmp_mem_descr_t));
1889
1890  descr->ptr_allocated = alloc_ptr; // remember allocated pointer
1891  // we don't need size_allocated
1892  descr->ptr_aligned = (void *)this_thr; // remember allocating thread
1893  // (it is already saved in bget buffer,
1894  // but we may want to use another allocator in future)
1895  descr->size_aligned = size;
1896
1897end:
1898  KE_TRACE(25, ("<- __kmp_fast_allocate( T#%d ) returns %p\n",
1899                __kmp_gtid_from_thread(this_thr), ptr));
1900  return ptr;
1901} // func __kmp_fast_allocate
1902
1903// Free fast memory and place it on the thread's free list if it is of
1904// the correct size.
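// (Orientation note for the logic below.) When the block belongs to a
// different thread, it is first parked on this thread's th_free_list_other
// queue; once that queue's owner changes or its length exceeds
// KMP_FREE_LIST_LIMIT, the whole queue is pushed onto the owning thread's
// th_free_list_sync list with a compare-and-swap.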
1905void ___kmp_fast_free(kmp_info_t *this_thr, void *ptr KMP_SRC_LOC_DECL) {
1906  kmp_mem_descr_t *descr;
1907  kmp_info_t *alloc_thr;
1908  size_t size;
1909  size_t idx;
1910  int index;
1911
1912  KE_TRACE(25, ("-> __kmp_fast_free( T#%d, %p ) called from %s:%d\n",
1913                __kmp_gtid_from_thread(this_thr), ptr KMP_SRC_LOC_PARM));
1914  KMP_ASSERT(ptr != NULL);
1915
1916  descr = (kmp_mem_descr_t *)(((kmp_uintptr_t)ptr) - sizeof(kmp_mem_descr_t));
1917
1918  KE_TRACE(26, ("   __kmp_fast_free:     size_aligned=%d\n",
1919                (int)descr->size_aligned));
1920
1921  size = descr->size_aligned; // 2, 4, 16, 64, 65, 66, ... cache lines
1922
1923  idx = DCACHE_LINE * 2; // 2 cache lines is the minimal block size
1924  if (idx == size) {
1925    index = 0; // 2 cache lines
1926  } else if ((idx <<= 1) == size) {
1927    index = 1; // 4 cache lines
1928  } else if ((idx <<= 2) == size) {
1929    index = 2; // 16 cache lines
1930  } else if ((idx <<= 2) == size) {
1931    index = 3; // 64 cache lines
1932  } else {
1933    KMP_DEBUG_ASSERT(size > DCACHE_LINE * 64);
1934    goto free_call; // 65 or more cache lines ( > 8KB )
1935  }
1936
1937  alloc_thr = (kmp_info_t *)descr->ptr_aligned; // get thread owning the block
1938  if (alloc_thr == this_thr) {
1939    // push block to self no-sync free list, linking previous head (LIFO)
1940    *((void **)ptr) = this_thr->th.th_free_lists[index].th_free_list_self;
1941    this_thr->th.th_free_lists[index].th_free_list_self = ptr;
1942  } else {
1943    void *head = this_thr->th.th_free_lists[index].th_free_list_other;
1944    if (head == NULL) {
1945      // Create new free list
1946      this_thr->th.th_free_lists[index].th_free_list_other = ptr;
1947      *((void **)ptr) = NULL; // mark the tail of the list
1948      descr->size_allocated = (size_t)1; // head of the list keeps its length
1949    } else {
1950      // need to check existed "other" list's owner thread and size of queue
1951      kmp_mem_descr_t *dsc =
1952          (kmp_mem_descr_t *)((char *)head - sizeof(kmp_mem_descr_t));
1953      // allocating thread, same for all queue nodes
1954      kmp_info_t *q_th = (kmp_info_t *)(dsc->ptr_aligned);
1955      size_t q_sz =
1956          dsc->size_allocated + 1; // new size in case we add current task
1957      if (q_th == alloc_thr && q_sz <= KMP_FREE_LIST_LIMIT) {
1958        // we can add current task to "other" list, no sync needed
1959        *((void **)ptr) = head;
1960        descr->size_allocated = q_sz;
1961        this_thr->th.th_free_lists[index].th_free_list_other = ptr;
1962      } else {
1963        // either the queue's owning thread is changing or the size limit was
1964        // exceeded; return the old queue to the allocating thread (q_th)
1965        // synchronously, and start a new list for alloc_thr's tasks
1966        void *old_ptr;
1967        void *tail = head;
1968        void *next = *((void **)head);
1969        while (next != NULL) {
1970          KMP_DEBUG_ASSERT(
1971              // queue size should decrease by 1 each step through the list
1972              ((kmp_mem_descr_t *)((char *)next - sizeof(kmp_mem_descr_t)))
1973                      ->size_allocated +
1974                  1 ==
1975              ((kmp_mem_descr_t *)((char *)tail - sizeof(kmp_mem_descr_t)))
1976                  ->size_allocated);
1977          tail = next; // remember tail node
1978          next = *((void **)next);
1979        }
1980        KMP_DEBUG_ASSERT(q_th != NULL);
1981        // push block to owner's sync free list
1982        old_ptr = TCR_PTR(q_th->th.th_free_lists[index].th_free_list_sync);
1983        /* the next pointer must be set before setting free_list to ptr to avoid
1984           exposing a broken list to other threads, even for an instant. */
1985        *((void **)tail) = old_ptr;
1986
1987        while (!KMP_COMPARE_AND_STORE_PTR(
1988            &q_th->th.th_free_lists[index].th_free_list_sync, old_ptr, head)) {
1989          KMP_CPU_PAUSE();
1990          old_ptr = TCR_PTR(q_th->th.th_free_lists[index].th_free_list_sync);
1991          *((void **)tail) = old_ptr;
1992        }
1993
1994        // start a new list of not-self tasks
1995        this_thr->th.th_free_lists[index].th_free_list_other = ptr;
1996        *((void **)ptr) = NULL;
1997        descr->size_allocated = (size_t)1; // head of queue keeps its length
1998      }
1999    }
2000  }
2001  goto end;
2002
2003free_call:
2004  KE_TRACE(25, ("__kmp_fast_free: T#%d Calling __kmp_thread_free for size %d\n",
2005                __kmp_gtid_from_thread(this_thr), (int)size));
2006  __kmp_bget_dequeue(this_thr); /* Release any queued buffers */
2007  brel(this_thr, descr->ptr_allocated);
2008
2009end:
2010  KE_TRACE(25, ("<- __kmp_fast_free() returns\n"));
2011
2012} // func __kmp_fast_free
2013
2014// Initialize the thread free lists related to fast memory
2015// Only do this when a thread is initially created.
2016void __kmp_initialize_fast_memory(kmp_info_t *this_thr) {
2017  KE_TRACE(10, ("__kmp_initialize_fast_memory: Called from th %p\n", this_thr));
2018
2019  memset(this_thr->th.th_free_lists, 0, NUM_LISTS * sizeof(kmp_free_list_t));
2020}
2021
2022// Free the memory in the thread free lists related to fast memory
2023// Only do this when a thread is being reaped (destroyed).
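// (Orientation note for the loop below.) Buffers whose bh.bb.bthr pointer has
// its low bit set are the ones BGET has marked as allocated; they are chained
// into a temporary singly-linked list, reusing the first word of each buffer
// as the link, and then released one by one through thr->relfcn.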
2024void __kmp_free_fast_memory(kmp_info_t *th) {
2025  // Assuming BGET is the underlying allocator, walk through its structures...
2026  int bin;
2027  thr_data_t *thr = get_thr_data(th);
2028  void **lst = NULL;
2029
2030  KE_TRACE(
2031      5, ("__kmp_free_fast_memory: Called T#%d\n", __kmp_gtid_from_thread(th)));
2032
2033  __kmp_bget_dequeue(th); // Release any queued buffers
2034
2035  // Dig through free lists and extract all allocated blocks
2036  for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
2037    bfhead_t *b = thr->freelist[bin].ql.flink;
2038    while (b != &thr->freelist[bin]) {
2039      if ((kmp_uintptr_t)b->bh.bb.bthr & 1) { // the buffer is marked allocated
2040        *((void **)b) =
2041            lst; // link the list (overwrite bthr, but keep flink for now)
2042        lst = (void **)b; // push b into lst
2043      }
2044      b = b->ql.flink; // get next buffer
2045    }
2046  }
2047  while (lst != NULL) {
2048    void *next = *lst;
2049    KE_TRACE(10, ("__kmp_free_fast_memory: freeing %p, next=%p th %p (%d)\n",
2050                  lst, next, th, __kmp_gtid_from_thread(th)));
2051    (*thr->relfcn)(lst);
2052#if BufStats
2053    // count blocks to prevent problems in __kmp_finalize_bget()
2054    thr->numprel++; /* Nr of expansion block releases */
2055    thr->numpblk--; /* Total number of blocks */
2056#endif
2057    lst = (void **)next;
2058  }
2059
2060  KE_TRACE(
2061      5, ("__kmp_free_fast_memory: Freed T#%d\n", __kmp_gtid_from_thread(th)));
2062}
2063
2064#endif // USE_FAST_MEMORY
2065