1/*
2 * kmp_alloc.cpp -- private/shared dynamic memory allocation and management
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#include "kmp.h"
14#include "kmp_io.h"
15#include "kmp_wrapper_malloc.h"
16
17// Disable bget when it is not used
18#if KMP_USE_BGET
19
20/* Thread private buffer management code */
21
22typedef int (*bget_compact_t)(size_t, int);
23typedef void *(*bget_acquire_t)(size_t);
24typedef void (*bget_release_t)(void *);
25
26/* NOTE: bufsize must be a signed datatype */
27
28#if KMP_OS_WINDOWS
29#if KMP_ARCH_X86 || KMP_ARCH_ARM
30typedef kmp_int32 bufsize;
31#else
32typedef kmp_int64 bufsize;
33#endif
34#else
35typedef ssize_t bufsize;
36#endif // KMP_OS_WINDOWS
37
38/* The three modes of operation are, fifo search, lifo search, and best-fit */
39
40typedef enum bget_mode {
41  bget_mode_fifo = 0,
42  bget_mode_lifo = 1,
43  bget_mode_best = 2
44} bget_mode_t;
45
46static void bpool(kmp_info_t *th, void *buffer, bufsize len);
47static void *bget(kmp_info_t *th, bufsize size);
48static void *bgetz(kmp_info_t *th, bufsize size);
49static void *bgetr(kmp_info_t *th, void *buffer, bufsize newsize);
50static void brel(kmp_info_t *th, void *buf);
51static void bectl(kmp_info_t *th, bget_compact_t compact,
52                  bget_acquire_t acquire, bget_release_t release,
53                  bufsize pool_incr);
54
55/* BGET CONFIGURATION */
56/* Buffer allocation size quantum: all buffers allocated are a
57   multiple of this size.  This MUST be a power of two. */
58
59/* On IA-32 architecture with  Linux* OS, malloc() does not
60   ensure 16 byte alignment */
61
62#if KMP_ARCH_X86 || !KMP_HAVE_QUAD
63
64#define SizeQuant 8
65#define AlignType double
66
67#else
68
69#define SizeQuant 16
70#define AlignType _Quad
71
72#endif
73
74// Define this symbol to enable the bstats() function which calculates the
75// total free space in the buffer pool, the largest available buffer, and the
76// total space currently allocated.
77#define BufStats 1
78
79#ifdef KMP_DEBUG
80
81// Define this symbol to enable the bpoold() function which dumps the buffers
82// in a buffer pool.
83#define BufDump 1
84
85// Define this symbol to enable the bpoolv() function for validating a buffer
86// pool.
87#define BufValid 1
88
89// Define this symbol to enable the bufdump() function which allows dumping the
90// contents of an allocated or free buffer.
91#define DumpData 1
92
93#ifdef NOT_USED_NOW
94
95// Wipe free buffers to a guaranteed pattern of garbage to trip up miscreants
96// who attempt to use pointers into released buffers.
97#define FreeWipe 1
98
99// Use a best fit algorithm when searching for space for an allocation request.
100// This uses memory more efficiently, but allocation will be much slower.
101#define BestFit 1
102
103#endif /* NOT_USED_NOW */
104#endif /* KMP_DEBUG */
105
106static bufsize bget_bin_size[] = {
107    0,
108    //    1 << 6,    /* .5 Cache line */
109    1 << 7, /* 1 Cache line, new */
110    1 << 8, /* 2 Cache lines */
111    1 << 9, /* 4 Cache lines, new */
112    1 << 10, /* 8 Cache lines */
113    1 << 11, /* 16 Cache lines, new */
114    1 << 12, 1 << 13, /* new */
115    1 << 14, 1 << 15, /* new */
116    1 << 16, 1 << 17, 1 << 18, 1 << 19, 1 << 20, /*  1MB */
117    1 << 21, /*  2MB */
118    1 << 22, /*  4MB */
119    1 << 23, /*  8MB */
120    1 << 24, /* 16MB */
121    1 << 25, /* 32MB */
122};
123
124#define MAX_BGET_BINS (int)(sizeof(bget_bin_size) / sizeof(bufsize))
125
126struct bfhead;
127
128//  Declare the interface, including the requested buffer size type, bufsize.
129
130/* Queue links */
131typedef struct qlinks {
132  struct bfhead *flink; /* Forward link */
133  struct bfhead *blink; /* Backward link */
134} qlinks_t;
135
136/* Header in allocated and free buffers */
137typedef struct bhead2 {
138  kmp_info_t *bthr; /* The thread which owns the buffer pool */
139  bufsize prevfree; /* Relative link back to previous free buffer in memory or
140                       0 if previous buffer is allocated.  */
141  bufsize bsize; /* Buffer size: positive if free, negative if allocated. */
142} bhead2_t;
143
144/* Make sure the bhead structure is a multiple of SizeQuant in size. */
145typedef union bhead {
146  KMP_ALIGN(SizeQuant)
147  AlignType b_align;
148  char b_pad[sizeof(bhead2_t) + (SizeQuant - (sizeof(bhead2_t) % SizeQuant))];
149  bhead2_t bb;
150} bhead_t;
151#define BH(p) ((bhead_t *)(p))
152
153/*  Header in directly allocated buffers (by acqfcn) */
154typedef struct bdhead {
155  bufsize tsize; /* Total size, including overhead */
156  bhead_t bh; /* Common header */
157} bdhead_t;
158#define BDH(p) ((bdhead_t *)(p))
159
160/* Header in free buffers */
161typedef struct bfhead {
162  bhead_t bh; /* Common allocated/free header */
163  qlinks_t ql; /* Links on free list */
164} bfhead_t;
165#define BFH(p) ((bfhead_t *)(p))
166
167typedef struct thr_data {
168  bfhead_t freelist[MAX_BGET_BINS];
169#if BufStats
170  size_t totalloc; /* Total space currently allocated */
171  long numget, numrel; /* Number of bget() and brel() calls */
172  long numpblk; /* Number of pool blocks */
173  long numpget, numprel; /* Number of block gets and rels */
174  long numdget, numdrel; /* Number of direct gets and rels */
175#endif /* BufStats */
176
177  /* Automatic expansion block management functions */
178  bget_compact_t compfcn;
179  bget_acquire_t acqfcn;
180  bget_release_t relfcn;
181
182  bget_mode_t mode; /* what allocation mode to use? */
183
184  bufsize exp_incr; /* Expansion block size */
185  bufsize pool_len; /* 0: no bpool calls have been made
186                       -1: not all pool blocks are the same size
187                       >0: (common) block size for all bpool calls made so far
188                    */
189  bfhead_t *last_pool; /* Last pool owned by this thread (delay deallocation) */
190} thr_data_t;
191
192/*  Minimum allocation quantum: */
193#define QLSize (sizeof(qlinks_t))
194#define SizeQ ((SizeQuant > QLSize) ? SizeQuant : QLSize)
195#define MaxSize                                                                \
196  (bufsize)(                                                                   \
197      ~(((bufsize)(1) << (sizeof(bufsize) * CHAR_BIT - 1)) | (SizeQuant - 1)))
198// Maximum for the requested size.
199
200/* End sentinel: value placed in bsize field of dummy block delimiting
201   end of pool block.  The most negative number which will  fit  in  a
202   bufsize, defined in a way that the compiler will accept. */
203
204#define ESent                                                                  \
205  ((bufsize)(-(((((bufsize)1) << ((int)sizeof(bufsize) * 8 - 2)) - 1) * 2) - 2))
206
207/* Thread Data management routines */
208static int bget_get_bin(bufsize size) {
209  // binary chop bins
210  int lo = 0, hi = MAX_BGET_BINS - 1;
211
212  KMP_DEBUG_ASSERT(size > 0);
213
214  while ((hi - lo) > 1) {
215    int mid = (lo + hi) >> 1;
216    if (size < bget_bin_size[mid])
217      hi = mid - 1;
218    else
219      lo = mid;
220  }
221
222  KMP_DEBUG_ASSERT((lo >= 0) && (lo < MAX_BGET_BINS));
223
224  return lo;
225}
226
227static void set_thr_data(kmp_info_t *th) {
228  int i;
229  thr_data_t *data;
230
231  data = (thr_data_t *)((!th->th.th_local.bget_data)
232                            ? __kmp_allocate(sizeof(*data))
233                            : th->th.th_local.bget_data);
234
235  memset(data, '\0', sizeof(*data));
236
237  for (i = 0; i < MAX_BGET_BINS; ++i) {
238    data->freelist[i].ql.flink = &data->freelist[i];
239    data->freelist[i].ql.blink = &data->freelist[i];
240  }
241
242  th->th.th_local.bget_data = data;
243  th->th.th_local.bget_list = 0;
244#if !USE_CMP_XCHG_FOR_BGET
245#ifdef USE_QUEUING_LOCK_FOR_BGET
246  __kmp_init_lock(&th->th.th_local.bget_lock);
247#else
248  __kmp_init_bootstrap_lock(&th->th.th_local.bget_lock);
249#endif /* USE_LOCK_FOR_BGET */
250#endif /* ! USE_CMP_XCHG_FOR_BGET */
251}
252
253static thr_data_t *get_thr_data(kmp_info_t *th) {
254  thr_data_t *data;
255
256  data = (thr_data_t *)th->th.th_local.bget_data;
257
258  KMP_DEBUG_ASSERT(data != 0);
259
260  return data;
261}
262
263/* Walk the free list and release the enqueued buffers */
264static void __kmp_bget_dequeue(kmp_info_t *th) {
265  void *p = TCR_SYNC_PTR(th->th.th_local.bget_list);
266
267  if (p != 0) {
268#if USE_CMP_XCHG_FOR_BGET
269    {
270      volatile void *old_value = TCR_SYNC_PTR(th->th.th_local.bget_list);
271      while (!KMP_COMPARE_AND_STORE_PTR(&th->th.th_local.bget_list,
272                                        CCAST(void *, old_value), nullptr)) {
273        KMP_CPU_PAUSE();
274        old_value = TCR_SYNC_PTR(th->th.th_local.bget_list);
275      }
276      p = CCAST(void *, old_value);
277    }
278#else /* ! USE_CMP_XCHG_FOR_BGET */
279#ifdef USE_QUEUING_LOCK_FOR_BGET
280    __kmp_acquire_lock(&th->th.th_local.bget_lock, __kmp_gtid_from_thread(th));
281#else
282    __kmp_acquire_bootstrap_lock(&th->th.th_local.bget_lock);
283#endif /* USE_QUEUING_LOCK_FOR_BGET */
284
285    p = (void *)th->th.th_local.bget_list;
286    th->th.th_local.bget_list = 0;
287
288#ifdef USE_QUEUING_LOCK_FOR_BGET
289    __kmp_release_lock(&th->th.th_local.bget_lock, __kmp_gtid_from_thread(th));
290#else
291    __kmp_release_bootstrap_lock(&th->th.th_local.bget_lock);
292#endif
293#endif /* USE_CMP_XCHG_FOR_BGET */
294
295    /* Check again to make sure the list is not empty */
296    while (p != 0) {
297      void *buf = p;
298      bfhead_t *b = BFH(((char *)p) - sizeof(bhead_t));
299
300      KMP_DEBUG_ASSERT(b->bh.bb.bsize != 0);
301      KMP_DEBUG_ASSERT(((kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) & ~1) ==
302                       (kmp_uintptr_t)th); // clear possible mark
303      KMP_DEBUG_ASSERT(b->ql.blink == 0);
304
305      p = (void *)b->ql.flink;
306
307      brel(th, buf);
308    }
309  }
310}
311
312/* Chain together the free buffers by using the thread owner field */
313static void __kmp_bget_enqueue(kmp_info_t *th, void *buf
314#ifdef USE_QUEUING_LOCK_FOR_BGET
315                               ,
316                               kmp_int32 rel_gtid
317#endif
318) {
319  bfhead_t *b = BFH(((char *)buf) - sizeof(bhead_t));
320
321  KMP_DEBUG_ASSERT(b->bh.bb.bsize != 0);
322  KMP_DEBUG_ASSERT(((kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) & ~1) ==
323                   (kmp_uintptr_t)th); // clear possible mark
324
325  b->ql.blink = 0;
326
327  KC_TRACE(10, ("__kmp_bget_enqueue: moving buffer to T#%d list\n",
328                __kmp_gtid_from_thread(th)));
329
330#if USE_CMP_XCHG_FOR_BGET
331  {
332    volatile void *old_value = TCR_PTR(th->th.th_local.bget_list);
333    /* the next pointer must be set before setting bget_list to buf to avoid
334       exposing a broken list to other threads, even for an instant. */
335    b->ql.flink = BFH(CCAST(void *, old_value));
336
337    while (!KMP_COMPARE_AND_STORE_PTR(&th->th.th_local.bget_list,
338                                      CCAST(void *, old_value), buf)) {
339      KMP_CPU_PAUSE();
340      old_value = TCR_PTR(th->th.th_local.bget_list);
341      /* the next pointer must be set before setting bget_list to buf to avoid
342         exposing a broken list to other threads, even for an instant. */
343      b->ql.flink = BFH(CCAST(void *, old_value));
344    }
345  }
346#else /* ! USE_CMP_XCHG_FOR_BGET */
347#ifdef USE_QUEUING_LOCK_FOR_BGET
348  __kmp_acquire_lock(&th->th.th_local.bget_lock, rel_gtid);
349#else
350  __kmp_acquire_bootstrap_lock(&th->th.th_local.bget_lock);
351#endif
352
353  b->ql.flink = BFH(th->th.th_local.bget_list);
354  th->th.th_local.bget_list = (void *)buf;
355
356#ifdef USE_QUEUING_LOCK_FOR_BGET
357  __kmp_release_lock(&th->th.th_local.bget_lock, rel_gtid);
358#else
359  __kmp_release_bootstrap_lock(&th->th.th_local.bget_lock);
360#endif
361#endif /* USE_CMP_XCHG_FOR_BGET */
362}
363
364/* insert buffer back onto a new freelist */
365static void __kmp_bget_insert_into_freelist(thr_data_t *thr, bfhead_t *b) {
366  int bin;
367
368  KMP_DEBUG_ASSERT(((size_t)b) % SizeQuant == 0);
369  KMP_DEBUG_ASSERT(b->bh.bb.bsize % SizeQuant == 0);
370
371  bin = bget_get_bin(b->bh.bb.bsize);
372
373  KMP_DEBUG_ASSERT(thr->freelist[bin].ql.blink->ql.flink ==
374                   &thr->freelist[bin]);
375  KMP_DEBUG_ASSERT(thr->freelist[bin].ql.flink->ql.blink ==
376                   &thr->freelist[bin]);
377
378  b->ql.flink = &thr->freelist[bin];
379  b->ql.blink = thr->freelist[bin].ql.blink;
380
381  thr->freelist[bin].ql.blink = b;
382  b->ql.blink->ql.flink = b;
383}
384
385/* unlink the buffer from the old freelist */
386static void __kmp_bget_remove_from_freelist(bfhead_t *b) {
387  KMP_DEBUG_ASSERT(b->ql.blink->ql.flink == b);
388  KMP_DEBUG_ASSERT(b->ql.flink->ql.blink == b);
389
390  b->ql.blink->ql.flink = b->ql.flink;
391  b->ql.flink->ql.blink = b->ql.blink;
392}
393
394/*  GET STATS -- check info on free list */
395static void bcheck(kmp_info_t *th, bufsize *max_free, bufsize *total_free) {
396  thr_data_t *thr = get_thr_data(th);
397  int bin;
398
399  *total_free = *max_free = 0;
400
401  for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
402    bfhead_t *b, *best;
403
404    best = &thr->freelist[bin];
405    b = best->ql.flink;
406
407    while (b != &thr->freelist[bin]) {
408      *total_free += (b->bh.bb.bsize - sizeof(bhead_t));
409      if ((best == &thr->freelist[bin]) || (b->bh.bb.bsize < best->bh.bb.bsize))
410        best = b;
411
412      /* Link to next buffer */
413      b = b->ql.flink;
414    }
415
416    if (*max_free < best->bh.bb.bsize)
417      *max_free = best->bh.bb.bsize;
418  }
419
420  if (*max_free > (bufsize)sizeof(bhead_t))
421    *max_free -= sizeof(bhead_t);
422}
423
424/*  BGET  --  Allocate a buffer.  */
425static void *bget(kmp_info_t *th, bufsize requested_size) {
426  thr_data_t *thr = get_thr_data(th);
427  bufsize size = requested_size;
428  bfhead_t *b;
429  void *buf;
430  int compactseq = 0;
431  int use_blink = 0;
432  /* For BestFit */
433  bfhead_t *best;
434
435  if (size < 0 || size + sizeof(bhead_t) > MaxSize) {
436    return NULL;
437  }
438
439  __kmp_bget_dequeue(th); /* Release any queued buffers */
440
441  if (size < (bufsize)SizeQ) { // Need at least room for the queue links.
442    size = SizeQ;
443  }
444#if defined(SizeQuant) && (SizeQuant > 1)
445  size = (size + (SizeQuant - 1)) & (~(SizeQuant - 1));
446#endif
447
448  size += sizeof(bhead_t); // Add overhead in allocated buffer to size required.
449  KMP_DEBUG_ASSERT(size >= 0);
450  KMP_DEBUG_ASSERT(size % SizeQuant == 0);
451
452  use_blink = (thr->mode == bget_mode_lifo);
453
454  /* If a compact function was provided in the call to bectl(), wrap
455     a loop around the allocation process  to  allow  compaction  to
456     intervene in case we don't find a suitable buffer in the chain. */
457
458  for (;;) {
459    int bin;
460
461    for (bin = bget_get_bin(size); bin < MAX_BGET_BINS; ++bin) {
462      /* Link to next buffer */
463      b = (use_blink ? thr->freelist[bin].ql.blink
464                     : thr->freelist[bin].ql.flink);
465
466      if (thr->mode == bget_mode_best) {
467        best = &thr->freelist[bin];
468
469        /* Scan the free list searching for the first buffer big enough
470           to hold the requested size buffer. */
471        while (b != &thr->freelist[bin]) {
472          if (b->bh.bb.bsize >= (bufsize)size) {
473            if ((best == &thr->freelist[bin]) ||
474                (b->bh.bb.bsize < best->bh.bb.bsize)) {
475              best = b;
476            }
477          }
478
479          /* Link to next buffer */
480          b = (use_blink ? b->ql.blink : b->ql.flink);
481        }
482        b = best;
483      }
484
485      while (b != &thr->freelist[bin]) {
486        if ((bufsize)b->bh.bb.bsize >= (bufsize)size) {
487
488          // Buffer is big enough to satisfy the request. Allocate it to the
489          // caller. We must decide whether the buffer is large enough to split
490          // into the part given to the caller and a free buffer that remains
491          // on the free list, or whether the entire buffer should be removed
492          // from the free list and given to the caller in its entirety. We
493          // only split the buffer if enough room remains for a header plus the
494          // minimum quantum of allocation.
495          if ((b->bh.bb.bsize - (bufsize)size) >
496              (bufsize)(SizeQ + (sizeof(bhead_t)))) {
497            bhead_t *ba, *bn;
498
499            ba = BH(((char *)b) + (b->bh.bb.bsize - (bufsize)size));
500            bn = BH(((char *)ba) + size);
501
502            KMP_DEBUG_ASSERT(bn->bb.prevfree == b->bh.bb.bsize);
503
504            /* Subtract size from length of free block. */
505            b->bh.bb.bsize -= (bufsize)size;
506
507            /* Link allocated buffer to the previous free buffer. */
508            ba->bb.prevfree = b->bh.bb.bsize;
509
510            /* Plug negative size into user buffer. */
511            ba->bb.bsize = -size;
512
513            /* Mark this buffer as owned by this thread. */
514            TCW_PTR(ba->bb.bthr,
515                    th); // not an allocated address (do not mark it)
516            /* Mark buffer after this one not preceded by free block. */
517            bn->bb.prevfree = 0;
518
519            // unlink buffer from old freelist, and reinsert into new freelist
520            __kmp_bget_remove_from_freelist(b);
521            __kmp_bget_insert_into_freelist(thr, b);
522#if BufStats
523            thr->totalloc += (size_t)size;
524            thr->numget++; /* Increment number of bget() calls */
525#endif
526            buf = (void *)((((char *)ba) + sizeof(bhead_t)));
527            KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);
528            return buf;
529          } else {
530            bhead_t *ba;
531
532            ba = BH(((char *)b) + b->bh.bb.bsize);
533
534            KMP_DEBUG_ASSERT(ba->bb.prevfree == b->bh.bb.bsize);
535
536            /* The buffer isn't big enough to split.  Give  the  whole
537               shebang to the caller and remove it from the free list. */
538
539            __kmp_bget_remove_from_freelist(b);
540#if BufStats
541            thr->totalloc += (size_t)b->bh.bb.bsize;
542            thr->numget++; /* Increment number of bget() calls */
543#endif
544            /* Negate size to mark buffer allocated. */
545            b->bh.bb.bsize = -(b->bh.bb.bsize);
546
547            /* Mark this buffer as owned by this thread. */
548            TCW_PTR(ba->bb.bthr, th); // not an allocated address (do not mark)
549            /* Zero the back pointer in the next buffer in memory
550               to indicate that this buffer is allocated. */
551            ba->bb.prevfree = 0;
552
553            /* Give user buffer starting at queue links. */
554            buf = (void *)&(b->ql);
555            KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);
556            return buf;
557          }
558        }
559
560        /* Link to next buffer */
561        b = (use_blink ? b->ql.blink : b->ql.flink);
562      }
563    }
564
565    /* We failed to find a buffer. If there's a compact function defined,
566       notify it of the size requested. If it returns TRUE, try the allocation
567       again. */
568
569    if ((thr->compfcn == 0) || (!(*thr->compfcn)(size, ++compactseq))) {
570      break;
571    }
572  }
573
574  /* No buffer available with requested size free. */
575
576  /* Don't give up yet -- look in the reserve supply. */
577  if (thr->acqfcn != 0) {
578    if (size > (bufsize)(thr->exp_incr - sizeof(bhead_t))) {
579      /* Request is too large to fit in a single expansion block.
580         Try to satisfy it by a direct buffer acquisition. */
581      bdhead_t *bdh;
582
583      size += sizeof(bdhead_t) - sizeof(bhead_t);
584
585      KE_TRACE(10, ("%%%%%% MALLOC( %d )\n", (int)size));
586
587      /* richryan */
588      bdh = BDH((*thr->acqfcn)((bufsize)size));
589      if (bdh != NULL) {
590
591        // Mark the buffer special by setting size field of its header to zero.
592        bdh->bh.bb.bsize = 0;
593
594        /* Mark this buffer as owned by this thread. */
595        TCW_PTR(bdh->bh.bb.bthr, th); // don't mark buffer as allocated,
596        // because direct buffer never goes to free list
597        bdh->bh.bb.prevfree = 0;
598        bdh->tsize = size;
599#if BufStats
600        thr->totalloc += (size_t)size;
601        thr->numget++; /* Increment number of bget() calls */
602        thr->numdget++; /* Direct bget() call count */
603#endif
604        buf = (void *)(bdh + 1);
605        KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);
606        return buf;
607      }
608
609    } else {
610
611      /*  Try to obtain a new expansion block */
612      void *newpool;
613
614      KE_TRACE(10, ("%%%%%% MALLOCB( %d )\n", (int)thr->exp_incr));
615
616      /* richryan */
617      newpool = (*thr->acqfcn)((bufsize)thr->exp_incr);
618      KMP_DEBUG_ASSERT(((size_t)newpool) % SizeQuant == 0);
619      if (newpool != NULL) {
620        bpool(th, newpool, thr->exp_incr);
621        buf = bget(
622            th, requested_size); /* This can't, I say, can't get into a loop. */
623        return buf;
624      }
625    }
626  }
627
628  /*  Still no buffer available */
629
630  return NULL;
631}
632
633/*  BGETZ  --  Allocate a buffer and clear its contents to zero.  We clear
634               the  entire  contents  of  the buffer to zero, not just the
635               region requested by the caller. */
636
637static void *bgetz(kmp_info_t *th, bufsize size) {
638  char *buf = (char *)bget(th, size);
639
640  if (buf != NULL) {
641    bhead_t *b;
642    bufsize rsize;
643
644    b = BH(buf - sizeof(bhead_t));
645    rsize = -(b->bb.bsize);
646    if (rsize == 0) {
647      bdhead_t *bd;
648
649      bd = BDH(buf - sizeof(bdhead_t));
650      rsize = bd->tsize - (bufsize)sizeof(bdhead_t);
651    } else {
652      rsize -= sizeof(bhead_t);
653    }
654
655    KMP_DEBUG_ASSERT(rsize >= size);
656
657    (void)memset(buf, 0, (bufsize)rsize);
658  }
659  return ((void *)buf);
660}
661
662/*  BGETR  --  Reallocate a buffer.  This is a minimal implementation,
663               simply in terms of brel()  and  bget().   It  could  be
664               enhanced to allow the buffer to grow into adjacent free
665               blocks and to avoid moving data unnecessarily.  */
666
667static void *bgetr(kmp_info_t *th, void *buf, bufsize size) {
668  void *nbuf;
669  bufsize osize; /* Old size of buffer */
670  bhead_t *b;
671
672  nbuf = bget(th, size);
673  if (nbuf == NULL) { /* Acquire new buffer */
674    return NULL;
675  }
676  if (buf == NULL) {
677    return nbuf;
678  }
679  b = BH(((char *)buf) - sizeof(bhead_t));
680  osize = -b->bb.bsize;
681  if (osize == 0) {
682    /*  Buffer acquired directly through acqfcn. */
683    bdhead_t *bd;
684
685    bd = BDH(((char *)buf) - sizeof(bdhead_t));
686    osize = bd->tsize - (bufsize)sizeof(bdhead_t);
687  } else {
688    osize -= sizeof(bhead_t);
689  }
690
691  KMP_DEBUG_ASSERT(osize > 0);
692
693  (void)KMP_MEMCPY((char *)nbuf, (char *)buf, /* Copy the data */
694                   (size_t)((size < osize) ? size : osize));
695  brel(th, buf);
696
697  return nbuf;
698}
699
700/*  BREL  --  Release a buffer.  */
701static void brel(kmp_info_t *th, void *buf) {
702  thr_data_t *thr = get_thr_data(th);
703  bfhead_t *b, *bn;
704  kmp_info_t *bth;
705
706  KMP_DEBUG_ASSERT(buf != NULL);
707  KMP_DEBUG_ASSERT(((size_t)buf) % SizeQuant == 0);
708
709  b = BFH(((char *)buf) - sizeof(bhead_t));
710
711  if (b->bh.bb.bsize == 0) { /* Directly-acquired buffer? */
712    bdhead_t *bdh;
713
714    bdh = BDH(((char *)buf) - sizeof(bdhead_t));
715    KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);
716#if BufStats
717    thr->totalloc -= (size_t)bdh->tsize;
718    thr->numdrel++; /* Number of direct releases */
719    thr->numrel++; /* Increment number of brel() calls */
720#endif /* BufStats */
721#ifdef FreeWipe
722    (void)memset((char *)buf, 0x55, (size_t)(bdh->tsize - sizeof(bdhead_t)));
723#endif /* FreeWipe */
724
725    KE_TRACE(10, ("%%%%%% FREE( %p )\n", (void *)bdh));
726
727    KMP_DEBUG_ASSERT(thr->relfcn != 0);
728    (*thr->relfcn)((void *)bdh); /* Release it directly. */
729    return;
730  }
731
732  bth = (kmp_info_t *)((kmp_uintptr_t)TCR_PTR(b->bh.bb.bthr) &
733                       ~1); // clear possible mark before comparison
734  if (bth != th) {
735    /* Add this buffer to be released by the owning thread later */
736    __kmp_bget_enqueue(bth, buf
737#ifdef USE_QUEUING_LOCK_FOR_BGET
738                       ,
739                       __kmp_gtid_from_thread(th)
740#endif
741    );
742    return;
743  }
744
745  /* Buffer size must be negative, indicating that the buffer is allocated. */
746  if (b->bh.bb.bsize >= 0) {
747    bn = NULL;
748  }
749  KMP_DEBUG_ASSERT(b->bh.bb.bsize < 0);
750
751  /*  Back pointer in next buffer must be zero, indicating the same thing: */
752
753  KMP_DEBUG_ASSERT(BH((char *)b - b->bh.bb.bsize)->bb.prevfree == 0);
754
755#if BufStats
756  thr->numrel++; /* Increment number of brel() calls */
757  thr->totalloc += (size_t)b->bh.bb.bsize;
758#endif
759
760  /* If the back link is nonzero, the previous buffer is free.  */
761
762  if (b->bh.bb.prevfree != 0) {
763    /* The previous buffer is free. Consolidate this buffer with it by adding
764       the length of this buffer to the previous free buffer. Note that we
765       subtract the size in the buffer being released, since it's negative to
766       indicate that the buffer is allocated. */
767    bufsize size = b->bh.bb.bsize;
768
769    /* Make the previous buffer the one we're working on. */
770    KMP_DEBUG_ASSERT(BH((char *)b - b->bh.bb.prevfree)->bb.bsize ==
771                     b->bh.bb.prevfree);
772    b = BFH(((char *)b) - b->bh.bb.prevfree);
773    b->bh.bb.bsize -= size;
774
775    /* unlink the buffer from the old freelist */
776    __kmp_bget_remove_from_freelist(b);
777  } else {
778    /* The previous buffer isn't allocated. Mark this buffer size as positive
779       (i.e. free) and fall through to place the buffer on the free list as an
780       isolated free block. */
781    b->bh.bb.bsize = -b->bh.bb.bsize;
782  }
783
784  /* insert buffer back onto a new freelist */
785  __kmp_bget_insert_into_freelist(thr, b);
786
787  /* Now we look at the next buffer in memory, located by advancing from
788     the  start  of  this  buffer  by its size, to see if that buffer is
789     free.  If it is, we combine  this  buffer  with  the  next  one  in
790     memory, dechaining the second buffer from the free list. */
791  bn = BFH(((char *)b) + b->bh.bb.bsize);
792  if (bn->bh.bb.bsize > 0) {
793
794    /* The buffer is free.  Remove it from the free list and add
795       its size to that of our buffer. */
796    KMP_DEBUG_ASSERT(BH((char *)bn + bn->bh.bb.bsize)->bb.prevfree ==
797                     bn->bh.bb.bsize);
798
799    __kmp_bget_remove_from_freelist(bn);
800
801    b->bh.bb.bsize += bn->bh.bb.bsize;
802
803    /* unlink the buffer from the old freelist, and reinsert it into the new
804     * freelist */
805    __kmp_bget_remove_from_freelist(b);
806    __kmp_bget_insert_into_freelist(thr, b);
807
808    /* Finally,  advance  to   the  buffer  that   follows  the  newly
809       consolidated free block.  We must set its  backpointer  to  the
810       head  of  the  consolidated free block.  We know the next block
811       must be an allocated block because the process of recombination
812       guarantees  that  two  free  blocks will never be contiguous in
813       memory.  */
814    bn = BFH(((char *)b) + b->bh.bb.bsize);
815  }
816#ifdef FreeWipe
817  (void)memset(((char *)b) + sizeof(bfhead_t), 0x55,
818               (size_t)(b->bh.bb.bsize - sizeof(bfhead_t)));
819#endif
820  KMP_DEBUG_ASSERT(bn->bh.bb.bsize < 0);
821
822  /* The next buffer is allocated.  Set the backpointer in it  to  point
823     to this buffer; the previous free buffer in memory. */
824
825  bn->bh.bb.prevfree = b->bh.bb.bsize;
826
827  /*  If  a  block-release function is defined, and this free buffer
828      constitutes the entire block, release it.  Note that  pool_len
829      is  defined  in  such a way that the test will fail unless all
830      pool blocks are the same size.  */
831  if (thr->relfcn != 0 &&
832      b->bh.bb.bsize == (bufsize)(thr->pool_len - sizeof(bhead_t))) {
833#if BufStats
834    if (thr->numpblk !=
835        1) { /* Do not release the last buffer until finalization time */
836#endif
837
838      KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);
839      KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.bsize == ESent);
840      KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.prevfree ==
841                       b->bh.bb.bsize);
842
843      /*  Unlink the buffer from the free list  */
844      __kmp_bget_remove_from_freelist(b);
845
846      KE_TRACE(10, ("%%%%%% FREE( %p )\n", (void *)b));
847
848      (*thr->relfcn)(b);
849#if BufStats
850      thr->numprel++; /* Nr of expansion block releases */
851      thr->numpblk--; /* Total number of blocks */
852      KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);
853
854      // avoid leaving stale last_pool pointer around if it is being dealloced
855      if (thr->last_pool == b)
856        thr->last_pool = 0;
857    } else {
858      thr->last_pool = b;
859    }
860#endif /* BufStats */
861  }
862}
863
864/*  BECTL  --  Establish automatic pool expansion control  */
865static void bectl(kmp_info_t *th, bget_compact_t compact,
866                  bget_acquire_t acquire, bget_release_t release,
867                  bufsize pool_incr) {
868  thr_data_t *thr = get_thr_data(th);
869
870  thr->compfcn = compact;
871  thr->acqfcn = acquire;
872  thr->relfcn = release;
873  thr->exp_incr = pool_incr;
874}
875
876/*  BPOOL  --  Add a region of memory to the buffer pool.  */
877static void bpool(kmp_info_t *th, void *buf, bufsize len) {
878  /*    int bin = 0; */
879  thr_data_t *thr = get_thr_data(th);
880  bfhead_t *b = BFH(buf);
881  bhead_t *bn;
882
883  __kmp_bget_dequeue(th); /* Release any queued buffers */
884
885#ifdef SizeQuant
886  len &= ~((bufsize)(SizeQuant - 1));
887#endif
888  if (thr->pool_len == 0) {
889    thr->pool_len = len;
890  } else if (len != thr->pool_len) {
891    thr->pool_len = -1;
892  }
893#if BufStats
894  thr->numpget++; /* Number of block acquisitions */
895  thr->numpblk++; /* Number of blocks total */
896  KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);
897#endif /* BufStats */
898
899  /* Since the block is initially occupied by a single free  buffer,
900     it  had  better  not  be  (much) larger than the largest buffer
901     whose size we can store in bhead.bb.bsize. */
902  KMP_DEBUG_ASSERT(len - sizeof(bhead_t) <= -((bufsize)ESent + 1));
903
904  /* Clear  the  backpointer at  the start of the block to indicate that
905     there  is  no  free  block  prior  to  this   one.    That   blocks
906     recombination when the first block in memory is released. */
907  b->bh.bb.prevfree = 0;
908
909  /* Create a dummy allocated buffer at the end of the pool.  This dummy
910     buffer is seen when a buffer at the end of the pool is released and
911     blocks  recombination  of  the last buffer with the dummy buffer at
912     the end.  The length in the dummy buffer  is  set  to  the  largest
913     negative  number  to  denote  the  end  of  the pool for diagnostic
914     routines (this specific value is  not  counted  on  by  the  actual
915     allocation and release functions). */
916  len -= sizeof(bhead_t);
917  b->bh.bb.bsize = (bufsize)len;
918  /* Set the owner of this buffer */
919  TCW_PTR(b->bh.bb.bthr,
920          (kmp_info_t *)((kmp_uintptr_t)th |
921                         1)); // mark the buffer as allocated address
922
923  /* Chain the new block to the free list. */
924  __kmp_bget_insert_into_freelist(thr, b);
925
926#ifdef FreeWipe
927  (void)memset(((char *)b) + sizeof(bfhead_t), 0x55,
928               (size_t)(len - sizeof(bfhead_t)));
929#endif
930  bn = BH(((char *)b) + len);
931  bn->bb.prevfree = (bufsize)len;
932  /* Definition of ESent assumes two's complement! */
933  KMP_DEBUG_ASSERT((~0) == -1 && (bn != 0));
934
935  bn->bb.bsize = ESent;
936}
937
938/*  BFREED  --  Dump the free lists for this thread. */
939static void bfreed(kmp_info_t *th) {
940  int bin = 0, count = 0;
941  int gtid = __kmp_gtid_from_thread(th);
942  thr_data_t *thr = get_thr_data(th);
943
944#if BufStats
945  __kmp_printf_no_lock("__kmp_printpool: T#%d total=%" KMP_UINT64_SPEC
946                       " get=%" KMP_INT64_SPEC " rel=%" KMP_INT64_SPEC
947                       " pblk=%" KMP_INT64_SPEC " pget=%" KMP_INT64_SPEC
948                       " prel=%" KMP_INT64_SPEC " dget=%" KMP_INT64_SPEC
949                       " drel=%" KMP_INT64_SPEC "\n",
950                       gtid, (kmp_uint64)thr->totalloc, (kmp_int64)thr->numget,
951                       (kmp_int64)thr->numrel, (kmp_int64)thr->numpblk,
952                       (kmp_int64)thr->numpget, (kmp_int64)thr->numprel,
953                       (kmp_int64)thr->numdget, (kmp_int64)thr->numdrel);
954#endif
955
956  for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
957    bfhead_t *b;
958
959    for (b = thr->freelist[bin].ql.flink; b != &thr->freelist[bin];
960         b = b->ql.flink) {
961      bufsize bs = b->bh.bb.bsize;
962
963      KMP_DEBUG_ASSERT(b->ql.blink->ql.flink == b);
964      KMP_DEBUG_ASSERT(b->ql.flink->ql.blink == b);
965      KMP_DEBUG_ASSERT(bs > 0);
966
967      count += 1;
968
969      __kmp_printf_no_lock(
970          "__kmp_printpool: T#%d Free block: 0x%p size %6ld bytes.\n", gtid, b,
971          (long)bs);
972#ifdef FreeWipe
973      {
974        char *lerr = ((char *)b) + sizeof(bfhead_t);
975        if ((bs > sizeof(bfhead_t)) &&
976            ((*lerr != 0x55) ||
977             (memcmp(lerr, lerr + 1, (size_t)(bs - (sizeof(bfhead_t) + 1))) !=
978              0))) {
979          __kmp_printf_no_lock("__kmp_printpool: T#%d     (Contents of above "
980                               "free block have been overstored.)\n",
981                               gtid);
982        }
983      }
984#endif
985    }
986  }
987
988  if (count == 0)
989    __kmp_printf_no_lock("__kmp_printpool: T#%d No free blocks\n", gtid);
990}
991
992void __kmp_initialize_bget(kmp_info_t *th) {
993  KMP_DEBUG_ASSERT(SizeQuant >= sizeof(void *) && (th != 0));
994
995  set_thr_data(th);
996
997  bectl(th, (bget_compact_t)0, (bget_acquire_t)malloc, (bget_release_t)free,
998        (bufsize)__kmp_malloc_pool_incr);
999}
1000
1001void __kmp_finalize_bget(kmp_info_t *th) {
1002  thr_data_t *thr;
1003  bfhead_t *b;
1004
1005  KMP_DEBUG_ASSERT(th != 0);
1006
1007#if BufStats
1008  thr = (thr_data_t *)th->th.th_local.bget_data;
1009  KMP_DEBUG_ASSERT(thr != NULL);
1010  b = thr->last_pool;
1011
1012  /*  If a block-release function is defined, and this free buffer constitutes
1013      the entire block, release it. Note that pool_len is defined in such a way
1014      that the test will fail unless all pool blocks are the same size.  */
1015
1016  // Deallocate the last pool if one exists because we no longer do it in brel()
1017  if (thr->relfcn != 0 && b != 0 && thr->numpblk != 0 &&
1018      b->bh.bb.bsize == (bufsize)(thr->pool_len - sizeof(bhead_t))) {
1019    KMP_DEBUG_ASSERT(b->bh.bb.prevfree == 0);
1020    KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.bsize == ESent);
1021    KMP_DEBUG_ASSERT(BH((char *)b + b->bh.bb.bsize)->bb.prevfree ==
1022                     b->bh.bb.bsize);
1023
1024    /*  Unlink the buffer from the free list  */
1025    __kmp_bget_remove_from_freelist(b);
1026
1027    KE_TRACE(10, ("%%%%%% FREE( %p )\n", (void *)b));
1028
1029    (*thr->relfcn)(b);
1030    thr->numprel++; /* Nr of expansion block releases */
1031    thr->numpblk--; /* Total number of blocks */
1032    KMP_DEBUG_ASSERT(thr->numpblk == thr->numpget - thr->numprel);
1033  }
1034#endif /* BufStats */
1035
1036  /* Deallocate bget_data */
1037  if (th->th.th_local.bget_data != NULL) {
1038    __kmp_free(th->th.th_local.bget_data);
1039    th->th.th_local.bget_data = NULL;
1040  }
1041}
1042
1043void kmpc_set_poolsize(size_t size) {
1044  bectl(__kmp_get_thread(), (bget_compact_t)0, (bget_acquire_t)malloc,
1045        (bget_release_t)free, (bufsize)size);
1046}
1047
1048size_t kmpc_get_poolsize(void) {
1049  thr_data_t *p;
1050
1051  p = get_thr_data(__kmp_get_thread());
1052
1053  return p->exp_incr;
1054}
1055
1056void kmpc_set_poolmode(int mode) {
1057  thr_data_t *p;
1058
1059  if (mode == bget_mode_fifo || mode == bget_mode_lifo ||
1060      mode == bget_mode_best) {
1061    p = get_thr_data(__kmp_get_thread());
1062    p->mode = (bget_mode_t)mode;
1063  }
1064}
1065
1066int kmpc_get_poolmode(void) {
1067  thr_data_t *p;
1068
1069  p = get_thr_data(__kmp_get_thread());
1070
1071  return p->mode;
1072}
1073
1074void kmpc_get_poolstat(size_t *maxmem, size_t *allmem) {
1075  kmp_info_t *th = __kmp_get_thread();
1076  bufsize a, b;
1077
1078  __kmp_bget_dequeue(th); /* Release any queued buffers */
1079
1080  bcheck(th, &a, &b);
1081
1082  *maxmem = a;
1083  *allmem = b;
1084}
1085
1086void kmpc_poolprint(void) {
1087  kmp_info_t *th = __kmp_get_thread();
1088
1089  __kmp_bget_dequeue(th); /* Release any queued buffers */
1090
1091  bfreed(th);
1092}
1093
1094#endif // #if KMP_USE_BGET
1095
1096void *kmpc_malloc(size_t size) {
1097  void *ptr;
1098  ptr = bget(__kmp_entry_thread(), (bufsize)(size + sizeof(ptr)));
1099  if (ptr != NULL) {
1100    // save allocated pointer just before one returned to user
1101    *(void **)ptr = ptr;
1102    ptr = (void **)ptr + 1;
1103  }
1104  return ptr;
1105}
1106
1107#define IS_POWER_OF_TWO(n) (((n) & ((n)-1)) == 0)
1108
1109void *kmpc_aligned_malloc(size_t size, size_t alignment) {
1110  void *ptr;
1111  void *ptr_allocated;
1112  KMP_DEBUG_ASSERT(alignment < 32 * 1024); // Alignment should not be too big
1113  if (!IS_POWER_OF_TWO(alignment)) {
1114    // AC: do we need to issue a warning here?
1115    errno = EINVAL;
1116    return NULL;
1117  }
1118  size = size + sizeof(void *) + alignment;
1119  ptr_allocated = bget(__kmp_entry_thread(), (bufsize)size);
1120  if (ptr_allocated != NULL) {
1121    // save allocated pointer just before one returned to user
1122    ptr = (void *)(((kmp_uintptr_t)ptr_allocated + sizeof(void *) + alignment) &
1123                   ~(alignment - 1));
1124    *((void **)ptr - 1) = ptr_allocated;
1125  } else {
1126    ptr = NULL;
1127  }
1128  return ptr;
1129}
1130
1131void *kmpc_calloc(size_t nelem, size_t elsize) {
1132  void *ptr;
1133  ptr = bgetz(__kmp_entry_thread(), (bufsize)(nelem * elsize + sizeof(ptr)));
1134  if (ptr != NULL) {
1135    // save allocated pointer just before one returned to user
1136    *(void **)ptr = ptr;
1137    ptr = (void **)ptr + 1;
1138  }
1139  return ptr;
1140}
1141
1142void *kmpc_realloc(void *ptr, size_t size) {
1143  void *result = NULL;
1144  if (ptr == NULL) {
1145    // If pointer is NULL, realloc behaves like malloc.
1146    result = bget(__kmp_entry_thread(), (bufsize)(size + sizeof(ptr)));
1147    // save allocated pointer just before one returned to user
1148    if (result != NULL) {
1149      *(void **)result = result;
1150      result = (void **)result + 1;
1151    }
1152  } else if (size == 0) {
1153    // If size is 0, realloc behaves like free.
1154    // The thread must be registered by the call to kmpc_malloc() or
1155    // kmpc_calloc() before.
1156    // So it should be safe to call __kmp_get_thread(), not
1157    // __kmp_entry_thread().
1158    KMP_ASSERT(*((void **)ptr - 1));
1159    brel(__kmp_get_thread(), *((void **)ptr - 1));
1160  } else {
1161    result = bgetr(__kmp_entry_thread(), *((void **)ptr - 1),
1162                   (bufsize)(size + sizeof(ptr)));
1163    if (result != NULL) {
1164      *(void **)result = result;
1165      result = (void **)result + 1;
1166    }
1167  }
1168  return result;
1169}
1170
1171// NOTE: the library must have already been initialized by a previous allocate
1172void kmpc_free(void *ptr) {
1173  if (!__kmp_init_serial) {
1174    return;
1175  }
1176  if (ptr != NULL) {
1177    kmp_info_t *th = __kmp_get_thread();
1178    __kmp_bget_dequeue(th); /* Release any queued buffers */
1179    // extract allocated pointer and free it
1180    KMP_ASSERT(*((void **)ptr - 1));
1181    brel(th, *((void **)ptr - 1));
1182  }
1183}
1184
1185void *___kmp_thread_malloc(kmp_info_t *th, size_t size KMP_SRC_LOC_DECL) {
1186  void *ptr;
1187  KE_TRACE(30, ("-> __kmp_thread_malloc( %p, %d ) called from %s:%d\n", th,
1188                (int)size KMP_SRC_LOC_PARM));
1189  ptr = bget(th, (bufsize)size);
1190  KE_TRACE(30, ("<- __kmp_thread_malloc() returns %p\n", ptr));
1191  return ptr;
1192}
1193
1194void *___kmp_thread_calloc(kmp_info_t *th, size_t nelem,
1195                           size_t elsize KMP_SRC_LOC_DECL) {
1196  void *ptr;
1197  KE_TRACE(30, ("-> __kmp_thread_calloc( %p, %d, %d ) called from %s:%d\n", th,
1198                (int)nelem, (int)elsize KMP_SRC_LOC_PARM));
1199  ptr = bgetz(th, (bufsize)(nelem * elsize));
1200  KE_TRACE(30, ("<- __kmp_thread_calloc() returns %p\n", ptr));
1201  return ptr;
1202}
1203
1204void *___kmp_thread_realloc(kmp_info_t *th, void *ptr,
1205                            size_t size KMP_SRC_LOC_DECL) {
1206  KE_TRACE(30, ("-> __kmp_thread_realloc( %p, %p, %d ) called from %s:%d\n", th,
1207                ptr, (int)size KMP_SRC_LOC_PARM));
1208  ptr = bgetr(th, ptr, (bufsize)size);
1209  KE_TRACE(30, ("<- __kmp_thread_realloc() returns %p\n", ptr));
1210  return ptr;
1211}
1212
1213void ___kmp_thread_free(kmp_info_t *th, void *ptr KMP_SRC_LOC_DECL) {
1214  KE_TRACE(30, ("-> __kmp_thread_free( %p, %p ) called from %s:%d\n", th,
1215                ptr KMP_SRC_LOC_PARM));
1216  if (ptr != NULL) {
1217    __kmp_bget_dequeue(th); /* Release any queued buffers */
1218    brel(th, ptr);
1219  }
1220  KE_TRACE(30, ("<- __kmp_thread_free()\n"));
1221}
1222
1223/* OMP 5.0 Memory Management support */
1224static const char *kmp_mk_lib_name;
1225static void *h_memkind;
1226/* memkind experimental API: */
1227// memkind_alloc
1228static void *(*kmp_mk_alloc)(void *k, size_t sz);
1229// memkind_free
1230static void (*kmp_mk_free)(void *kind, void *ptr);
1231// memkind_check_available
1232static int (*kmp_mk_check)(void *kind);
1233// kinds we are going to use
1234static void **mk_default;
1235static void **mk_interleave;
1236static void **mk_hbw;
1237static void **mk_hbw_interleave;
1238static void **mk_hbw_preferred;
1239static void **mk_hugetlb;
1240static void **mk_hbw_hugetlb;
1241static void **mk_hbw_preferred_hugetlb;
1242static void **mk_dax_kmem;
1243static void **mk_dax_kmem_all;
1244static void **mk_dax_kmem_preferred;
1245static void *(*kmp_target_alloc_host)(size_t size, int device);
1246static void *(*kmp_target_alloc_shared)(size_t size, int device);
1247static void *(*kmp_target_alloc_device)(size_t size, int device);
1248static void *(*kmp_target_lock_mem)(void *ptr, size_t size, int device);
1249static void *(*kmp_target_unlock_mem)(void *ptr, int device);
1250static void *(*kmp_target_free_host)(void *ptr, int device);
1251static void *(*kmp_target_free_shared)(void *ptr, int device);
1252static void *(*kmp_target_free_device)(void *ptr, int device);
1253static bool __kmp_target_mem_available;
1254#define KMP_IS_TARGET_MEM_SPACE(MS)                                            \
1255  (MS == llvm_omp_target_host_mem_space ||                                     \
1256   MS == llvm_omp_target_shared_mem_space ||                                   \
1257   MS == llvm_omp_target_device_mem_space)
1258#define KMP_IS_TARGET_MEM_ALLOC(MA)                                            \
1259  (MA == llvm_omp_target_host_mem_alloc ||                                     \
1260   MA == llvm_omp_target_shared_mem_alloc ||                                   \
1261   MA == llvm_omp_target_device_mem_alloc)
1262
1263#if KMP_OS_UNIX && KMP_DYNAMIC_LIB && !KMP_OS_DARWIN
1264static inline void chk_kind(void ***pkind) {
1265  KMP_DEBUG_ASSERT(pkind);
1266  if (*pkind) // symbol found
1267    if (kmp_mk_check(**pkind)) // kind not available or error
1268      *pkind = NULL;
1269}
1270#endif
1271
1272void __kmp_init_memkind() {
1273// as of 2018-07-31 memkind does not support Windows*, exclude it for now
1274#if KMP_OS_UNIX && KMP_DYNAMIC_LIB && !KMP_OS_DARWIN
1275  // use of statically linked memkind is problematic, as it depends on libnuma
1276  kmp_mk_lib_name = "libmemkind.so";
1277  h_memkind = dlopen(kmp_mk_lib_name, RTLD_LAZY);
1278  if (h_memkind) {
1279    kmp_mk_check = (int (*)(void *))dlsym(h_memkind, "memkind_check_available");
1280    kmp_mk_alloc =
1281        (void *(*)(void *, size_t))dlsym(h_memkind, "memkind_malloc");
1282    kmp_mk_free = (void (*)(void *, void *))dlsym(h_memkind, "memkind_free");
1283    mk_default = (void **)dlsym(h_memkind, "MEMKIND_DEFAULT");
1284    if (kmp_mk_check && kmp_mk_alloc && kmp_mk_free && mk_default &&
1285        !kmp_mk_check(*mk_default)) {
1286      __kmp_memkind_available = 1;
1287      mk_interleave = (void **)dlsym(h_memkind, "MEMKIND_INTERLEAVE");
1288      chk_kind(&mk_interleave);
1289      mk_hbw = (void **)dlsym(h_memkind, "MEMKIND_HBW");
1290      chk_kind(&mk_hbw);
1291      mk_hbw_interleave = (void **)dlsym(h_memkind, "MEMKIND_HBW_INTERLEAVE");
1292      chk_kind(&mk_hbw_interleave);
1293      mk_hbw_preferred = (void **)dlsym(h_memkind, "MEMKIND_HBW_PREFERRED");
1294      chk_kind(&mk_hbw_preferred);
1295      mk_hugetlb = (void **)dlsym(h_memkind, "MEMKIND_HUGETLB");
1296      chk_kind(&mk_hugetlb);
1297      mk_hbw_hugetlb = (void **)dlsym(h_memkind, "MEMKIND_HBW_HUGETLB");
1298      chk_kind(&mk_hbw_hugetlb);
1299      mk_hbw_preferred_hugetlb =
1300          (void **)dlsym(h_memkind, "MEMKIND_HBW_PREFERRED_HUGETLB");
1301      chk_kind(&mk_hbw_preferred_hugetlb);
1302      mk_dax_kmem = (void **)dlsym(h_memkind, "MEMKIND_DAX_KMEM");
1303      chk_kind(&mk_dax_kmem);
1304      mk_dax_kmem_all = (void **)dlsym(h_memkind, "MEMKIND_DAX_KMEM_ALL");
1305      chk_kind(&mk_dax_kmem_all);
1306      mk_dax_kmem_preferred =
1307          (void **)dlsym(h_memkind, "MEMKIND_DAX_KMEM_PREFERRED");
1308      chk_kind(&mk_dax_kmem_preferred);
1309      KE_TRACE(25, ("__kmp_init_memkind: memkind library initialized\n"));
1310      return; // success
1311    }
1312    dlclose(h_memkind); // failure
1313  }
1314#else // !(KMP_OS_UNIX && KMP_DYNAMIC_LIB)
1315  kmp_mk_lib_name = "";
1316#endif // !(KMP_OS_UNIX && KMP_DYNAMIC_LIB)
1317  h_memkind = NULL;
1318  kmp_mk_check = NULL;
1319  kmp_mk_alloc = NULL;
1320  kmp_mk_free = NULL;
1321  mk_default = NULL;
1322  mk_interleave = NULL;
1323  mk_hbw = NULL;
1324  mk_hbw_interleave = NULL;
1325  mk_hbw_preferred = NULL;
1326  mk_hugetlb = NULL;
1327  mk_hbw_hugetlb = NULL;
1328  mk_hbw_preferred_hugetlb = NULL;
1329  mk_dax_kmem = NULL;
1330  mk_dax_kmem_all = NULL;
1331  mk_dax_kmem_preferred = NULL;
1332}
1333
1334void __kmp_fini_memkind() {
1335#if KMP_OS_UNIX && KMP_DYNAMIC_LIB
1336  if (__kmp_memkind_available)
1337    KE_TRACE(25, ("__kmp_fini_memkind: finalize memkind library\n"));
1338  if (h_memkind) {
1339    dlclose(h_memkind);
1340    h_memkind = NULL;
1341  }
1342  kmp_mk_check = NULL;
1343  kmp_mk_alloc = NULL;
1344  kmp_mk_free = NULL;
1345  mk_default = NULL;
1346  mk_interleave = NULL;
1347  mk_hbw = NULL;
1348  mk_hbw_interleave = NULL;
1349  mk_hbw_preferred = NULL;
1350  mk_hugetlb = NULL;
1351  mk_hbw_hugetlb = NULL;
1352  mk_hbw_preferred_hugetlb = NULL;
1353  mk_dax_kmem = NULL;
1354  mk_dax_kmem_all = NULL;
1355  mk_dax_kmem_preferred = NULL;
1356#endif
1357}
1358
1359void __kmp_init_target_mem() {
1360  *(void **)(&kmp_target_alloc_host) = KMP_DLSYM("llvm_omp_target_alloc_host");
1361  *(void **)(&kmp_target_alloc_shared) =
1362      KMP_DLSYM("llvm_omp_target_alloc_shared");
1363  *(void **)(&kmp_target_alloc_device) =
1364      KMP_DLSYM("llvm_omp_target_alloc_device");
1365  *(void **)(&kmp_target_free_host) = KMP_DLSYM("llvm_omp_target_free_host");
1366  *(void **)(&kmp_target_free_shared) =
1367      KMP_DLSYM("llvm_omp_target_free_shared");
1368  *(void **)(&kmp_target_free_device) =
1369      KMP_DLSYM("llvm_omp_target_free_device");
1370  __kmp_target_mem_available =
1371      kmp_target_alloc_host && kmp_target_alloc_shared &&
1372      kmp_target_alloc_device && kmp_target_free_host &&
1373      kmp_target_free_shared && kmp_target_free_device;
1374  // lock/pin and unlock/unpin target calls
1375  *(void **)(&kmp_target_lock_mem) = KMP_DLSYM("llvm_omp_target_lock_mem");
1376  *(void **)(&kmp_target_unlock_mem) = KMP_DLSYM("llvm_omp_target_unlock_mem");
1377}
1378
1379omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms,
1380                                             int ntraits,
1381                                             omp_alloctrait_t traits[]) {
1382  // OpenMP 5.0 only allows predefined memspaces
1383  KMP_DEBUG_ASSERT(ms == omp_default_mem_space || ms == omp_low_lat_mem_space ||
1384                   ms == omp_large_cap_mem_space || ms == omp_const_mem_space ||
1385                   ms == omp_high_bw_mem_space || KMP_IS_TARGET_MEM_SPACE(ms));
1386  kmp_allocator_t *al;
1387  int i;
1388  al = (kmp_allocator_t *)__kmp_allocate(sizeof(kmp_allocator_t)); // zeroed
1389  al->memspace = ms; // not used currently
1390  for (i = 0; i < ntraits; ++i) {
1391    switch (traits[i].key) {
1392    case omp_atk_sync_hint:
1393    case omp_atk_access:
1394      break;
1395    case omp_atk_pinned:
1396      al->pinned = true;
1397      break;
1398    case omp_atk_alignment:
1399      __kmp_type_convert(traits[i].value, &(al->alignment));
1400      KMP_ASSERT(IS_POWER_OF_TWO(al->alignment));
1401      break;
1402    case omp_atk_pool_size:
1403      al->pool_size = traits[i].value;
1404      break;
1405    case omp_atk_fallback:
1406      al->fb = (omp_alloctrait_value_t)traits[i].value;
1407      KMP_DEBUG_ASSERT(
1408          al->fb == omp_atv_default_mem_fb || al->fb == omp_atv_null_fb ||
1409          al->fb == omp_atv_abort_fb || al->fb == omp_atv_allocator_fb);
1410      break;
1411    case omp_atk_fb_data:
1412      al->fb_data = RCAST(kmp_allocator_t *, traits[i].value);
1413      break;
1414    case omp_atk_partition:
1415      al->memkind = RCAST(void **, traits[i].value);
1416      break;
1417    default:
1418      KMP_ASSERT2(0, "Unexpected allocator trait");
1419    }
1420  }
1421  if (al->fb == 0) {
1422    // set default allocator
1423    al->fb = omp_atv_default_mem_fb;
1424    al->fb_data = (kmp_allocator_t *)omp_default_mem_alloc;
1425  } else if (al->fb == omp_atv_allocator_fb) {
1426    KMP_ASSERT(al->fb_data != NULL);
1427  } else if (al->fb == omp_atv_default_mem_fb) {
1428    al->fb_data = (kmp_allocator_t *)omp_default_mem_alloc;
1429  }
1430  if (__kmp_memkind_available) {
1431    // Let's use memkind library if available
1432    if (ms == omp_high_bw_mem_space) {
1433      if (al->memkind == (void *)omp_atv_interleaved && mk_hbw_interleave) {
1434        al->memkind = mk_hbw_interleave;
1435      } else if (mk_hbw_preferred) {
1436        // AC: do not try to use MEMKIND_HBW for now, because memkind library
1437        // cannot reliably detect exhaustion of HBW memory.
1438        // It could be possible using hbw_verify_memory_region() but memkind
1439        // manual says: "Using this function in production code may result in
1440        // serious performance penalty".
1441        al->memkind = mk_hbw_preferred;
1442      } else {
1443        // HBW is requested but not available --> return NULL allocator
1444        __kmp_free(al);
1445        return omp_null_allocator;
1446      }
1447    } else if (ms == omp_large_cap_mem_space) {
1448      if (mk_dax_kmem_all) {
1449        // All pmem nodes are visited
1450        al->memkind = mk_dax_kmem_all;
1451      } else if (mk_dax_kmem) {
1452        // Only closest pmem node is visited
1453        al->memkind = mk_dax_kmem;
1454      } else {
1455        __kmp_free(al);
1456        return omp_null_allocator;
1457      }
1458    } else {
1459      if (al->memkind == (void *)omp_atv_interleaved && mk_interleave) {
1460        al->memkind = mk_interleave;
1461      } else {
1462        al->memkind = mk_default;
1463      }
1464    }
1465  } else if (KMP_IS_TARGET_MEM_SPACE(ms) && !__kmp_target_mem_available) {
1466    __kmp_free(al);
1467    return omp_null_allocator;
1468  } else {
1469    if (ms == omp_high_bw_mem_space) {
1470      // cannot detect HBW memory presence without memkind library
1471      __kmp_free(al);
1472      return omp_null_allocator;
1473    }
1474  }
1475  return (omp_allocator_handle_t)al;
1476}
1477
1478void __kmpc_destroy_allocator(int gtid, omp_allocator_handle_t allocator) {
1479  if (allocator > kmp_max_mem_alloc)
1480    __kmp_free(allocator);
1481}
1482
1483void __kmpc_set_default_allocator(int gtid, omp_allocator_handle_t allocator) {
1484  if (allocator == omp_null_allocator)
1485    allocator = omp_default_mem_alloc;
1486  __kmp_threads[gtid]->th.th_def_allocator = allocator;
1487}
1488
1489omp_allocator_handle_t __kmpc_get_default_allocator(int gtid) {
1490  return __kmp_threads[gtid]->th.th_def_allocator;
1491}
1492
1493typedef struct kmp_mem_desc { // Memory block descriptor
1494  void *ptr_alloc; // Pointer returned by allocator
1495  size_t size_a; // Size of allocated memory block (initial+descriptor+align)
1496  size_t size_orig; // Original size requested
1497  void *ptr_align; // Pointer to aligned memory, returned
1498  kmp_allocator_t *allocator; // allocator
1499} kmp_mem_desc_t;
1500static int alignment = sizeof(void *); // align to pointer size by default

// external interfaces are wrappers over internal implementation
void *__kmpc_alloc(int gtid, size_t size, omp_allocator_handle_t allocator) {
  KE_TRACE(25, ("__kmpc_alloc: T#%d (%d, %p)\n", gtid, (int)size, allocator));
  void *ptr = __kmp_alloc(gtid, 0, size, allocator);
  KE_TRACE(25, ("__kmpc_alloc returns %p, T#%d\n", ptr, gtid));
  return ptr;
}

void *__kmpc_aligned_alloc(int gtid, size_t algn, size_t size,
                           omp_allocator_handle_t allocator) {
  KE_TRACE(25, ("__kmpc_aligned_alloc: T#%d (%d, %d, %p)\n", gtid, (int)algn,
                (int)size, allocator));
  void *ptr = __kmp_alloc(gtid, algn, size, allocator);
  KE_TRACE(25, ("__kmpc_aligned_alloc returns %p, T#%d\n", ptr, gtid));
  return ptr;
}

void *__kmpc_calloc(int gtid, size_t nmemb, size_t size,
                    omp_allocator_handle_t allocator) {
  KE_TRACE(25, ("__kmpc_calloc: T#%d (%d, %d, %p)\n", gtid, (int)nmemb,
                (int)size, allocator));
  void *ptr = __kmp_calloc(gtid, 0, nmemb, size, allocator);
  KE_TRACE(25, ("__kmpc_calloc returns %p, T#%d\n", ptr, gtid));
  return ptr;
}

void *__kmpc_realloc(int gtid, void *ptr, size_t size,
                     omp_allocator_handle_t allocator,
                     omp_allocator_handle_t free_allocator) {
  KE_TRACE(25, ("__kmpc_realloc: T#%d (%p, %d, %p, %p)\n", gtid, ptr, (int)size,
                allocator, free_allocator));
  void *nptr = __kmp_realloc(gtid, ptr, size, allocator, free_allocator);
  KE_TRACE(25, ("__kmpc_realloc returns %p, T#%d\n", nptr, gtid));
  return nptr;
}

void __kmpc_free(int gtid, void *ptr, omp_allocator_handle_t allocator) {
  KE_TRACE(25, ("__kmpc_free: T#%d free(%p,%p)\n", gtid, ptr, allocator));
  ___kmpc_free(gtid, ptr, allocator);
  KE_TRACE(10, ("__kmpc_free: T#%d freed %p (%p)\n", gtid, ptr, allocator));
  return;
}

// internal implementation, called from inside the library
void *__kmp_alloc(int gtid, size_t algn, size_t size,
                  omp_allocator_handle_t allocator) {
  void *ptr = NULL;
  kmp_allocator_t *al;
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  if (size == 0)
    return NULL;
  if (allocator == omp_null_allocator)
    allocator = __kmp_threads[gtid]->th.th_def_allocator;
  kmp_int32 default_device =
      __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;

  al = RCAST(kmp_allocator_t *, allocator);

  int sz_desc = sizeof(kmp_mem_desc_t);
  kmp_mem_desc_t desc;
  kmp_uintptr_t addr; // address returned by allocator
  kmp_uintptr_t addr_align; // address to return to caller
  kmp_uintptr_t addr_descr; // address of memory block descriptor
  size_t align = alignment; // default alignment
  if (allocator > kmp_max_mem_alloc && al->alignment > align)
    align = al->alignment; // alignment required by allocator trait
  if (align < algn)
    align = algn; // max of allocator trait, parameter and sizeof(void*)
  desc.size_orig = size;
  desc.size_a = size + sz_desc + align;
  bool is_pinned = false;
  if (allocator > kmp_max_mem_alloc)
    is_pinned = al->pinned;

  // Use default allocator if libmemkind is not available
  bool use_default_allocator = !__kmp_memkind_available;

  if (KMP_IS_TARGET_MEM_ALLOC(allocator)) {
    // Use size input directly as the memory may not be accessible on host.
    // Use default device for now.
    if (__kmp_target_mem_available) {
      kmp_int32 device =
          __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;
      if (allocator == llvm_omp_target_host_mem_alloc)
        ptr = kmp_target_alloc_host(size, device);
      else if (allocator == llvm_omp_target_shared_mem_alloc)
        ptr = kmp_target_alloc_shared(size, device);
      else // allocator == llvm_omp_target_device_mem_alloc
        ptr = kmp_target_alloc_device(size, device);
      return ptr;
    } else {
      KMP_INFORM(TargetMemNotAvailable);
    }
  }

  if (allocator >= kmp_max_mem_alloc && KMP_IS_TARGET_MEM_SPACE(al->memspace)) {
    if (__kmp_target_mem_available) {
      kmp_int32 device =
          __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;
      if (al->memspace == llvm_omp_target_host_mem_space)
        ptr = kmp_target_alloc_host(size, device);
      else if (al->memspace == llvm_omp_target_shared_mem_space)
        ptr = kmp_target_alloc_shared(size, device);
      else // al->memspace == llvm_omp_target_device_mem_space
        ptr = kmp_target_alloc_device(size, device);
      return ptr;
    } else {
      KMP_INFORM(TargetMemNotAvailable);
    }
  }

  if (__kmp_memkind_available) {
    if (allocator < kmp_max_mem_alloc) {
      // pre-defined allocator
      if (allocator == omp_high_bw_mem_alloc && mk_hbw_preferred) {
        ptr = kmp_mk_alloc(*mk_hbw_preferred, desc.size_a);
      } else if (allocator == omp_large_cap_mem_alloc && mk_dax_kmem_all) {
        ptr = kmp_mk_alloc(*mk_dax_kmem_all, desc.size_a);
      } else {
        ptr = kmp_mk_alloc(*mk_default, desc.size_a);
      }
    } else if (al->pool_size > 0) {
      // custom allocator with pool size requested
      kmp_uint64 used =
          KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, desc.size_a);
      if (used + desc.size_a > al->pool_size) {
        // not enough space, need to go fallback path
        KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);
        if (al->fb == omp_atv_default_mem_fb) {
          al = (kmp_allocator_t *)omp_default_mem_alloc;
          ptr = kmp_mk_alloc(*mk_default, desc.size_a);
        } else if (al->fb == omp_atv_abort_fb) {
          KMP_ASSERT(0); // abort fallback requested
        } else if (al->fb == omp_atv_allocator_fb) {
          KMP_ASSERT(al != al->fb_data);
          al = al->fb_data;
          ptr = __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);
          if (is_pinned && kmp_target_lock_mem)
            kmp_target_lock_mem(ptr, size, default_device);
          return ptr;
        } // else ptr == NULL;
      } else {
        // pool has enough space
        ptr = kmp_mk_alloc(*al->memkind, desc.size_a);
        if (ptr == NULL) {
          if (al->fb == omp_atv_default_mem_fb) {
            al = (kmp_allocator_t *)omp_default_mem_alloc;
            ptr = kmp_mk_alloc(*mk_default, desc.size_a);
          } else if (al->fb == omp_atv_abort_fb) {
            KMP_ASSERT(0); // abort fallback requested
          } else if (al->fb == omp_atv_allocator_fb) {
            KMP_ASSERT(al != al->fb_data);
            al = al->fb_data;
            ptr = __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);
            if (is_pinned && kmp_target_lock_mem)
              kmp_target_lock_mem(ptr, size, default_device);
            return ptr;
          }
        }
      }
    } else {
      // custom allocator, pool size not requested
      ptr = kmp_mk_alloc(*al->memkind, desc.size_a);
      if (ptr == NULL) {
        if (al->fb == omp_atv_default_mem_fb) {
          al = (kmp_allocator_t *)omp_default_mem_alloc;
          ptr = kmp_mk_alloc(*mk_default, desc.size_a);
        } else if (al->fb == omp_atv_abort_fb) {
          KMP_ASSERT(0); // abort fallback requested
        } else if (al->fb == omp_atv_allocator_fb) {
          KMP_ASSERT(al != al->fb_data);
          al = al->fb_data;
          ptr = __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);
          if (is_pinned && kmp_target_lock_mem)
            kmp_target_lock_mem(ptr, size, default_device);
          return ptr;
        }
      }
    }
  } else if (allocator < kmp_max_mem_alloc) {
    // pre-defined allocator
    if (allocator == omp_high_bw_mem_alloc) {
      KMP_WARNING(OmpNoAllocator, "omp_high_bw_mem_alloc");
    } else if (allocator == omp_large_cap_mem_alloc) {
      KMP_WARNING(OmpNoAllocator, "omp_large_cap_mem_alloc");
    } else if (allocator == omp_const_mem_alloc) {
      KMP_WARNING(OmpNoAllocator, "omp_const_mem_alloc");
    } else if (allocator == omp_low_lat_mem_alloc) {
      KMP_WARNING(OmpNoAllocator, "omp_low_lat_mem_alloc");
    } else if (allocator == omp_cgroup_mem_alloc) {
      KMP_WARNING(OmpNoAllocator, "omp_cgroup_mem_alloc");
    } else if (allocator == omp_pteam_mem_alloc) {
      KMP_WARNING(OmpNoAllocator, "omp_pteam_mem_alloc");
    } else if (allocator == omp_thread_mem_alloc) {
      KMP_WARNING(OmpNoAllocator, "omp_thread_mem_alloc");
    } else { // default allocator requested
      use_default_allocator = true;
    }
    if (use_default_allocator) {
      ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a);
      use_default_allocator = false;
    }
  } else if (al->pool_size > 0) {
    // custom allocator with pool size requested
    kmp_uint64 used =
        KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, desc.size_a);
    if (used + desc.size_a > al->pool_size) {
      // not enough space, need to go fallback path
      KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);
      if (al->fb == omp_atv_default_mem_fb) {
        al = (kmp_allocator_t *)omp_default_mem_alloc;
        ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a);
      } else if (al->fb == omp_atv_abort_fb) {
        KMP_ASSERT(0); // abort fallback requested
      } else if (al->fb == omp_atv_allocator_fb) {
        KMP_ASSERT(al != al->fb_data);
        al = al->fb_data;
        ptr = __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al);
        if (is_pinned && kmp_target_lock_mem)
          kmp_target_lock_mem(ptr, size, default_device);
        return ptr;
      } // else ptr == NULL;
    } else {
      // pool has enough space
      ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a);
      if (ptr == NULL && al->fb == omp_atv_abort_fb) {
        KMP_ASSERT(0); // abort fallback requested
      } // no sense to look for another fallback because of same internal alloc
    }
  } else {
    // custom allocator, pool size not requested
    ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a);
    if (ptr == NULL && al->fb == omp_atv_abort_fb) {
      KMP_ASSERT(0); // abort fallback requested
    } // no sense to look for another fallback because of same internal alloc
  }
  KE_TRACE(10,
           ("__kmp_alloc: T#%d %p=alloc(%d)\n", gtid, ptr, (int)desc.size_a));
  if (ptr == NULL)
    return NULL;

  if (is_pinned && kmp_target_lock_mem)
    kmp_target_lock_mem(ptr, desc.size_a, default_device);

  addr = (kmp_uintptr_t)ptr;
  addr_align = (addr + sz_desc + align - 1) & ~(align - 1);
  addr_descr = addr_align - sz_desc;

  desc.ptr_alloc = ptr;
  desc.ptr_align = (void *)addr_align;
  desc.allocator = al;
  *((kmp_mem_desc_t *)addr_descr) = desc; // save descriptor contents
  KMP_MB();

  return desc.ptr_align;
}
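
/* Worked example of the address arithmetic above (illustrative; assumes
   sizeof(kmp_mem_desc_t) == 40 on a 64-bit target). With align == 64 and the
   underlying allocator returning addr == 0x1008:

     addr_align = (0x1008 + 40 + 63) & ~63 == 0x1040
     addr_descr = 0x1040 - 40             == 0x1018

   The descriptor and the aligned block both fit inside the
   size + 40 + 64 bytes requested as desc.size_a. */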

void *__kmp_calloc(int gtid, size_t algn, size_t nmemb, size_t size,
                   omp_allocator_handle_t allocator) {
  void *ptr = NULL;
  kmp_allocator_t *al;
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  if (allocator == omp_null_allocator)
    allocator = __kmp_threads[gtid]->th.th_def_allocator;

  al = RCAST(kmp_allocator_t *, allocator);

  if (nmemb == 0 || size == 0)
    return ptr;

  if ((SIZE_MAX - sizeof(kmp_mem_desc_t)) / size < nmemb) {
    if (al->fb == omp_atv_abort_fb) {
      KMP_ASSERT(0);
    }
    return ptr;
  }

  ptr = __kmp_alloc(gtid, algn, nmemb * size, allocator);

  if (ptr) {
    memset(ptr, 0x00, nmemb * size);
  }
  return ptr;
}
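
/* Overflow-check example (illustrative): on a 64-bit target a request of
   nmemb == SIZE_MAX / 2 elements of size == 4 would make nmemb * size wrap
   around; since

     (SIZE_MAX - sizeof(kmp_mem_desc_t)) / 4  <  SIZE_MAX / 2

   the guard above rejects the request (or aborts under omp_atv_abort_fb)
   instead of zeroing a silently truncated buffer. */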

void *__kmp_realloc(int gtid, void *ptr, size_t size,
                    omp_allocator_handle_t allocator,
                    omp_allocator_handle_t free_allocator) {
  void *nptr = NULL;
  KMP_DEBUG_ASSERT(__kmp_init_serial);

  if (size == 0) {
    if (ptr != NULL)
      ___kmpc_free(gtid, ptr, free_allocator);
    return nptr;
  }

  nptr = __kmp_alloc(gtid, 0, size, allocator);

  if (nptr != NULL && ptr != NULL) {
    kmp_mem_desc_t desc;
    kmp_uintptr_t addr_align; // address to return to caller
    kmp_uintptr_t addr_descr; // address of memory block descriptor

    addr_align = (kmp_uintptr_t)ptr;
    addr_descr = addr_align - sizeof(kmp_mem_desc_t);
    desc = *((kmp_mem_desc_t *)addr_descr); // read descriptor

    KMP_DEBUG_ASSERT(desc.ptr_align == ptr);
    KMP_DEBUG_ASSERT(desc.size_orig > 0);
    KMP_DEBUG_ASSERT(desc.size_orig < desc.size_a);
    KMP_MEMCPY((char *)nptr, (char *)ptr,
               (size_t)((size < desc.size_orig) ? size : desc.size_orig));
  }

  if (nptr != NULL) {
    ___kmpc_free(gtid, ptr, free_allocator);
  }

  return nptr;
}
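
/* Usage sketch (illustrative): the semantics follow C realloc() with explicit
   allocators, assuming a1 and a2 are valid handles:

     void *p = __kmp_alloc(gtid, 0, 100, a1);
     p = __kmp_realloc(gtid, p, 200, a2, a1); // copies min(200, 100) bytes,
                                              // then frees the old block
     p = __kmp_realloc(gtid, p, 0, a2, a2);   // size == 0: frees p, NULL back

   Note that the old block is freed only if the new allocation succeeded. */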

void ___kmpc_free(int gtid, void *ptr, omp_allocator_handle_t allocator) {
  if (ptr == NULL)
    return;

  kmp_allocator_t *al;
  omp_allocator_handle_t oal;
  al = RCAST(kmp_allocator_t *, CCAST(omp_allocator_handle_t, allocator));
  kmp_mem_desc_t desc;
  kmp_uintptr_t addr_align; // address to return to caller
  kmp_uintptr_t addr_descr; // address of memory block descriptor
  if (__kmp_target_mem_available && (KMP_IS_TARGET_MEM_ALLOC(allocator) ||
                                     (allocator > kmp_max_mem_alloc &&
                                      KMP_IS_TARGET_MEM_SPACE(al->memspace)))) {
    kmp_int32 device =
        __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;
    if (allocator == llvm_omp_target_host_mem_alloc) {
      kmp_target_free_host(ptr, device);
    } else if (allocator == llvm_omp_target_shared_mem_alloc) {
      kmp_target_free_shared(ptr, device);
    } else if (allocator == llvm_omp_target_device_mem_alloc) {
      kmp_target_free_device(ptr, device);
    }
    return;
  }

  addr_align = (kmp_uintptr_t)ptr;
  addr_descr = addr_align - sizeof(kmp_mem_desc_t);
  desc = *((kmp_mem_desc_t *)addr_descr); // read descriptor

  KMP_DEBUG_ASSERT(desc.ptr_align == ptr);
  if (allocator) {
    KMP_DEBUG_ASSERT(desc.allocator == al || desc.allocator == al->fb_data);
  }
  al = desc.allocator;
  oal = (omp_allocator_handle_t)al; // cast to void* for comparisons
  KMP_DEBUG_ASSERT(al);

  if (allocator > kmp_max_mem_alloc && kmp_target_unlock_mem && al->pinned) {
    kmp_int32 device =
        __kmp_threads[gtid]->th.th_current_task->td_icvs.default_device;
    kmp_target_unlock_mem(desc.ptr_alloc, device);
  }

  if (__kmp_memkind_available) {
    if (oal < kmp_max_mem_alloc) {
      // pre-defined allocator
      if (oal == omp_high_bw_mem_alloc && mk_hbw_preferred) {
        kmp_mk_free(*mk_hbw_preferred, desc.ptr_alloc);
      } else if (oal == omp_large_cap_mem_alloc && mk_dax_kmem_all) {
        kmp_mk_free(*mk_dax_kmem_all, desc.ptr_alloc);
      } else {
        kmp_mk_free(*mk_default, desc.ptr_alloc);
      }
    } else {
      if (al->pool_size > 0) { // custom allocator with pool size requested
        kmp_uint64 used =
            KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);
        (void)used; // to suppress compiler warning
        KMP_DEBUG_ASSERT(used >= desc.size_a);
      }
      kmp_mk_free(*al->memkind, desc.ptr_alloc);
    }
  } else {
    if (oal > kmp_max_mem_alloc && al->pool_size > 0) {
      kmp_uint64 used =
          KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a);
      (void)used; // to suppress compiler warning
      KMP_DEBUG_ASSERT(used >= desc.size_a);
    }
    __kmp_thread_free(__kmp_thread_from_gtid(gtid), desc.ptr_alloc);
  }
}

/* If LEAK_MEMORY is defined, __kmp_free() will *not* free memory. It causes
   memory leaks, but it may be useful for debugging memory corruptions, use of
   freed pointers, etc. */
/* #define LEAK_MEMORY */
struct kmp_mem_descr { // Memory block descriptor.
  void *ptr_allocated; // Pointer returned by malloc(), subject for free().
  size_t size_allocated; // Size of allocated memory block.
  void *ptr_aligned; // Pointer to aligned memory, to be used by client code.
  size_t size_aligned; // Size of aligned memory block.
};
typedef struct kmp_mem_descr kmp_mem_descr_t;

/* Allocate memory on requested boundary, fill allocated memory with 0x00.
   NULL is NEVER returned, __kmp_abort() is called in case of memory allocation
   error. Must use __kmp_free when freeing memory allocated by this routine! */
static void *___kmp_allocate_align(size_t size,
                                   size_t alignment KMP_SRC_LOC_DECL) {
  /* __kmp_allocate() allocates (by a call to malloc()) a bigger memory block
     than requested in order to return a properly aligned pointer. The original
     pointer returned by malloc() and the size of the allocated block are saved
     in a descriptor just before the aligned pointer. This information is used
     by __kmp_free() -- it has to pass the original pointer to free(), not the
     aligned one.

          +---------+------------+-----------------------------------+---------+
          | padding | descriptor |           aligned block           | padding |
          +---------+------------+-----------------------------------+---------+
          ^                      ^
          |                      |
          |                      +- Aligned pointer returned to caller
          +- Pointer returned by malloc()

      Aligned block is filled with zeros, paddings are filled with 0xEF. */

  kmp_mem_descr_t descr;
  kmp_uintptr_t addr_allocated; // Address returned by malloc().
  kmp_uintptr_t addr_aligned; // Aligned address to return to caller.
  kmp_uintptr_t addr_descr; // Address of memory block descriptor.

  KE_TRACE(25, ("-> ___kmp_allocate_align( %d, %d ) called from %s:%d\n",
                (int)size, (int)alignment KMP_SRC_LOC_PARM));

  KMP_DEBUG_ASSERT(alignment < 32 * 1024); // Alignment should not be too big.
  KMP_DEBUG_ASSERT(sizeof(void *) <= sizeof(kmp_uintptr_t));
  // Make sure kmp_uintptr_t is enough to store addresses.

  descr.size_aligned = size;
  descr.size_allocated =
      descr.size_aligned + sizeof(kmp_mem_descr_t) + alignment;

#if KMP_DEBUG
  descr.ptr_allocated = _malloc_src_loc(descr.size_allocated, _file_, _line_);
#else
  descr.ptr_allocated = malloc_src_loc(descr.size_allocated KMP_SRC_LOC_PARM);
#endif
  KE_TRACE(10, ("   malloc( %d ) returned %p\n", (int)descr.size_allocated,
                descr.ptr_allocated));
  if (descr.ptr_allocated == NULL) {
    KMP_FATAL(OutOfHeapMemory);
  }

  addr_allocated = (kmp_uintptr_t)descr.ptr_allocated;
  addr_aligned =
      (addr_allocated + sizeof(kmp_mem_descr_t) + alignment) & ~(alignment - 1);
  addr_descr = addr_aligned - sizeof(kmp_mem_descr_t);

  descr.ptr_aligned = (void *)addr_aligned;

  KE_TRACE(26, ("   ___kmp_allocate_align: "
                "ptr_allocated=%p, size_allocated=%d, "
                "ptr_aligned=%p, size_aligned=%d\n",
                descr.ptr_allocated, (int)descr.size_allocated,
                descr.ptr_aligned, (int)descr.size_aligned));

  KMP_DEBUG_ASSERT(addr_allocated <= addr_descr);
  KMP_DEBUG_ASSERT(addr_descr + sizeof(kmp_mem_descr_t) == addr_aligned);
  KMP_DEBUG_ASSERT(addr_aligned + descr.size_aligned <=
                   addr_allocated + descr.size_allocated);
  KMP_DEBUG_ASSERT(addr_aligned % alignment == 0);
#ifdef KMP_DEBUG
  memset(descr.ptr_allocated, 0xEF, descr.size_allocated);
  // Fill allocated memory block with 0xEF.
#endif
  memset(descr.ptr_aligned, 0x00, descr.size_aligned);
  // Fill the aligned memory block (which is intended for use by the caller)
  // with 0x00. Do not put this filling under a KMP_DEBUG condition! Many
  // callers expect zeroed memory. (Padding bytes remain filled with 0xEF in
  // the debugging library.)
  *((kmp_mem_descr_t *)addr_descr) = descr;

  KMP_MB();

  KE_TRACE(25, ("<- ___kmp_allocate_align() returns %p\n", descr.ptr_aligned));
  return descr.ptr_aligned;
} // func ___kmp_allocate_align
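
/* Rounding note (illustrative; assumes sizeof(kmp_mem_descr_t) == 32 on a
   typical 64-bit target): unlike __kmp_alloc() above, the full `alignment`
   (rather than alignment - 1) is added before masking, so there is always at
   least one byte of leading padding. E.g. with alignment == 64 and
   addr_allocated == 0x2020:

     addr_aligned = (0x2020 + 32 + 64) & ~63 == 0x2080

   while minimal rounding would have produced 0x2040; the extra slack is
   already accounted for in size_allocated. */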

/* Allocate memory on cache line boundary, fill allocated memory with 0x00.
   Do not call this func directly! Use __kmp_allocate macro instead.
   NULL is NEVER returned, __kmp_abort() is called in case of memory allocation
   error. Must use __kmp_free when freeing memory allocated by this routine! */
void *___kmp_allocate(size_t size KMP_SRC_LOC_DECL) {
  void *ptr;
  KE_TRACE(25, ("-> __kmp_allocate( %d ) called from %s:%d\n",
                (int)size KMP_SRC_LOC_PARM));
  ptr = ___kmp_allocate_align(size, __kmp_align_alloc KMP_SRC_LOC_PARM);
  KE_TRACE(25, ("<- __kmp_allocate() returns %p\n", ptr));
  return ptr;
} // func ___kmp_allocate

/* Allocate memory on page boundary, fill allocated memory with 0x00.
   Do not call this func directly! Use __kmp_page_allocate macro instead.
   NULL is NEVER returned, __kmp_abort() is called in case of memory allocation
   error. Must use __kmp_free when freeing memory allocated by this routine! */
void *___kmp_page_allocate(size_t size KMP_SRC_LOC_DECL) {
  int page_size = 8 * 1024;
  void *ptr;

  KE_TRACE(25, ("-> __kmp_page_allocate( %d ) called from %s:%d\n",
                (int)size KMP_SRC_LOC_PARM));
  ptr = ___kmp_allocate_align(size, page_size KMP_SRC_LOC_PARM);
  KE_TRACE(25, ("<- __kmp_page_allocate( %d ) returns %p\n", (int)size, ptr));
  return ptr;
} // ___kmp_page_allocate

/* Free memory allocated by __kmp_allocate() and __kmp_page_allocate().
   In debug mode, fill the memory block with 0xEF before the call to free(). */
void ___kmp_free(void *ptr KMP_SRC_LOC_DECL) {
  kmp_mem_descr_t descr;
#if KMP_DEBUG
  kmp_uintptr_t addr_allocated; // Address returned by malloc().
  kmp_uintptr_t addr_aligned; // Aligned address passed by caller.
#endif
  KE_TRACE(25,
           ("-> __kmp_free( %p ) called from %s:%d\n", ptr KMP_SRC_LOC_PARM));
  KMP_ASSERT(ptr != NULL);

  descr = *(kmp_mem_descr_t *)((kmp_uintptr_t)ptr - sizeof(kmp_mem_descr_t));

  KE_TRACE(26, ("   __kmp_free:     "
                "ptr_allocated=%p, size_allocated=%d, "
                "ptr_aligned=%p, size_aligned=%d\n",
                descr.ptr_allocated, (int)descr.size_allocated,
                descr.ptr_aligned, (int)descr.size_aligned));
#if KMP_DEBUG
  addr_allocated = (kmp_uintptr_t)descr.ptr_allocated;
  addr_aligned = (kmp_uintptr_t)descr.ptr_aligned;
  KMP_DEBUG_ASSERT(addr_aligned % CACHE_LINE == 0);
  KMP_DEBUG_ASSERT(descr.ptr_aligned == ptr);
  KMP_DEBUG_ASSERT(addr_allocated + sizeof(kmp_mem_descr_t) <= addr_aligned);
  KMP_DEBUG_ASSERT(descr.size_aligned < descr.size_allocated);
  KMP_DEBUG_ASSERT(addr_aligned + descr.size_aligned <=
                   addr_allocated + descr.size_allocated);
  memset(descr.ptr_allocated, 0xEF, descr.size_allocated);
  // Fill the memory block with 0xEF to help catch use of freed memory.
#endif

#ifndef LEAK_MEMORY
  KE_TRACE(10, ("   free( %p )\n", descr.ptr_allocated));
#ifdef KMP_DEBUG
  _free_src_loc(descr.ptr_allocated, _file_, _line_);
#else
  free_src_loc(descr.ptr_allocated KMP_SRC_LOC_PARM);
#endif
#endif
  KMP_MB();
  KE_TRACE(25, ("<- __kmp_free() returns\n"));
} // func ___kmp_free

#if USE_FAST_MEMORY == 3
// Allocate fast memory by first scanning the thread's free lists.
// If a chunk of the right size exists, grab it off the free list.
// Otherwise allocate normally using kmp_thread_malloc.

// AC: How to choose the limit? Just get 16 for now...
#define KMP_FREE_LIST_LIMIT 16

// Always use 128 bytes for determining buckets for caching memory blocks
#define DCACHE_LINE 128
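
/* Resulting bucket map (derived from the selection logic in
   ___kmp_fast_allocate() below):

     requested size        index   cached block size
        1 ..  256 bytes      0      2 lines ( 256 B)
      257 ..  512 bytes      1      4 lines ( 512 B)
      513 .. 2048 bytes      2     16 lines (2048 B)
     2049 .. 8192 bytes      3     64 lines (8192 B)
         > 8192 bytes        -     not cached, bget() directly  */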

void *___kmp_fast_allocate(kmp_info_t *this_thr, size_t size KMP_SRC_LOC_DECL) {
  void *ptr;
  size_t num_lines, idx;
  int index;
  void *alloc_ptr;
  size_t alloc_size;
  kmp_mem_descr_t *descr;

  KE_TRACE(25, ("-> __kmp_fast_allocate( T#%d, %d ) called from %s:%d\n",
                __kmp_gtid_from_thread(this_thr), (int)size KMP_SRC_LOC_PARM));

  num_lines = (size + DCACHE_LINE - 1) / DCACHE_LINE;
  idx = num_lines - 1;
  KMP_DEBUG_ASSERT(idx >= 0);
  if (idx < 2) {
    index = 0; // idx is [ 0, 1 ], use first free list
    num_lines = 2; // 1, 2 cache lines or less than cache line
  } else if ((idx >>= 2) == 0) {
    index = 1; // idx is [ 2, 3 ], use second free list
    num_lines = 4; // 3, 4 cache lines
  } else if ((idx >>= 2) == 0) {
    index = 2; // idx is [ 4, 15 ], use third free list
    num_lines = 16; // 5, 6, ..., 16 cache lines
  } else if ((idx >>= 2) == 0) {
    index = 3; // idx is [ 16, 63 ], use fourth free list
    num_lines = 64; // 17, 18, ..., 64 cache lines
  } else {
    goto alloc_call; // 65 or more cache lines ( > 8KB ), don't use free lists
  }

  ptr = this_thr->th.th_free_lists[index].th_free_list_self;
  if (ptr != NULL) {
    // pop the head of the no-sync free list
    this_thr->th.th_free_lists[index].th_free_list_self = *((void **)ptr);
    KMP_DEBUG_ASSERT(this_thr == ((kmp_mem_descr_t *)((kmp_uintptr_t)ptr -
                                                      sizeof(kmp_mem_descr_t)))
                                     ->ptr_aligned);
    goto end;
  }
  ptr = TCR_SYNC_PTR(this_thr->th.th_free_lists[index].th_free_list_sync);
  if (ptr != NULL) {
    // no-sync free list is empty, use sync free list (filled in by other
    // threads only)
    // pop the head of the sync free list, push NULL instead
    while (!KMP_COMPARE_AND_STORE_PTR(
        &this_thr->th.th_free_lists[index].th_free_list_sync, ptr, nullptr)) {
      KMP_CPU_PAUSE();
      ptr = TCR_SYNC_PTR(this_thr->th.th_free_lists[index].th_free_list_sync);
    }
    // push the rest of the chain into the no-sync free list (can be NULL if
    // there was only one block)
    this_thr->th.th_free_lists[index].th_free_list_self = *((void **)ptr);
    KMP_DEBUG_ASSERT(this_thr == ((kmp_mem_descr_t *)((kmp_uintptr_t)ptr -
                                                      sizeof(kmp_mem_descr_t)))
                                     ->ptr_aligned);
    goto end;
  }

alloc_call:
  // haven't found a block in the free lists, so allocate it
  size = num_lines * DCACHE_LINE;

  alloc_size = size + sizeof(kmp_mem_descr_t) + DCACHE_LINE;
  KE_TRACE(25, ("__kmp_fast_allocate: T#%d Calling __kmp_thread_malloc with "
                "alloc_size %d\n",
                __kmp_gtid_from_thread(this_thr), (int)alloc_size));
  alloc_ptr = bget(this_thr, (bufsize)alloc_size);

  // align ptr to DCACHE_LINE
  ptr = (void *)((((kmp_uintptr_t)alloc_ptr) + sizeof(kmp_mem_descr_t) +
                  DCACHE_LINE) &
                 ~(DCACHE_LINE - 1));
  descr = (kmp_mem_descr_t *)(((kmp_uintptr_t)ptr) - sizeof(kmp_mem_descr_t));

  descr->ptr_allocated = alloc_ptr; // remember allocated pointer
  // we don't need size_allocated
  descr->ptr_aligned = (void *)this_thr; // remember allocating thread
  // (it is already saved in the bget buffer,
  // but we may want to use another allocator in the future)
  descr->size_aligned = size;

end:
  KE_TRACE(25, ("<- __kmp_fast_allocate( T#%d ) returns %p\n",
                __kmp_gtid_from_thread(this_thr), ptr));
  return ptr;
} // func __kmp_fast_allocate
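
/* Synchronization sketch for the free lists above: th_free_list_self is only
   ever touched by the owning thread, so pops need no atomics. The
   th_free_list_sync list behaves like a Treiber-style lock-free stack: other
   threads CAS chains onto its head, and the owner detaches the entire chain
   by CAS-ing the head to NULL, keeps the first block for itself, and moves
   the remainder onto th_free_list_self for later lock-free reuse. */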

// Free fast memory and place it on the thread's free list if it is of
// the correct size.
void ___kmp_fast_free(kmp_info_t *this_thr, void *ptr KMP_SRC_LOC_DECL) {
  kmp_mem_descr_t *descr;
  kmp_info_t *alloc_thr;
  size_t size;
  size_t idx;
  int index;

  KE_TRACE(25, ("-> __kmp_fast_free( T#%d, %p ) called from %s:%d\n",
                __kmp_gtid_from_thread(this_thr), ptr KMP_SRC_LOC_PARM));
  KMP_ASSERT(ptr != NULL);

  descr = (kmp_mem_descr_t *)(((kmp_uintptr_t)ptr) - sizeof(kmp_mem_descr_t));

  KE_TRACE(26, ("   __kmp_fast_free:     size_aligned=%d\n",
                (int)descr->size_aligned));

  size = descr->size_aligned; // 2, 4, 16, 64, 65, 66, ... cache lines

  idx = DCACHE_LINE * 2; // 2 cache lines is the minimal size of a block
  if (idx == size) {
    index = 0; // 2 cache lines
  } else if ((idx <<= 1) == size) {
    index = 1; // 4 cache lines
  } else if ((idx <<= 2) == size) {
    index = 2; // 16 cache lines
  } else if ((idx <<= 2) == size) {
    index = 3; // 64 cache lines
  } else {
    KMP_DEBUG_ASSERT(size > DCACHE_LINE * 64);
    goto free_call; // 65 or more cache lines ( > 8KB )
  }

  alloc_thr = (kmp_info_t *)descr->ptr_aligned; // get thread owning the block
  if (alloc_thr == this_thr) {
    // push block to self no-sync free list, linking previous head (LIFO)
    *((void **)ptr) = this_thr->th.th_free_lists[index].th_free_list_self;
    this_thr->th.th_free_lists[index].th_free_list_self = ptr;
  } else {
    void *head = this_thr->th.th_free_lists[index].th_free_list_other;
    if (head == NULL) {
      // Create new free list
      this_thr->th.th_free_lists[index].th_free_list_other = ptr;
      *((void **)ptr) = NULL; // mark the tail of the list
      descr->size_allocated = (size_t)1; // head of the list keeps its length
    } else {
      // need to check the existing "other" list's owner thread and queue size
      kmp_mem_descr_t *dsc =
          (kmp_mem_descr_t *)((char *)head - sizeof(kmp_mem_descr_t));
      // allocating thread, same for all queue nodes
      kmp_info_t *q_th = (kmp_info_t *)(dsc->ptr_aligned);
      size_t q_sz =
          dsc->size_allocated + 1; // new size in case we add the current block
      if (q_th == alloc_thr && q_sz <= KMP_FREE_LIST_LIMIT) {
        // we can add the current block to the "other" list, no sync needed
        *((void **)ptr) = head;
        descr->size_allocated = q_sz;
        this_thr->th.th_free_lists[index].th_free_list_other = ptr;
      } else {
        // either the queue blocks' owner is changing or the size limit was
        // exceeded; return the old queue to its allocating thread (q_th)
        // synchronously, and start a new list for alloc_thr's blocks
        void *old_ptr;
        void *tail = head;
        void *next = *((void **)head);
        while (next != NULL) {
          KMP_DEBUG_ASSERT(
              // queue size should decrease by 1 each step through the list
              ((kmp_mem_descr_t *)((char *)next - sizeof(kmp_mem_descr_t)))
                      ->size_allocated +
                  1 ==
              ((kmp_mem_descr_t *)((char *)tail - sizeof(kmp_mem_descr_t)))
                  ->size_allocated);
          tail = next; // remember tail node
          next = *((void **)next);
        }
        KMP_DEBUG_ASSERT(q_th != NULL);
        // push block to owner's sync free list
        old_ptr = TCR_PTR(q_th->th.th_free_lists[index].th_free_list_sync);
        /* the next pointer must be set before setting free_list to ptr to avoid
           exposing a broken list to other threads, even for an instant. */
        *((void **)tail) = old_ptr;

        while (!KMP_COMPARE_AND_STORE_PTR(
            &q_th->th.th_free_lists[index].th_free_list_sync, old_ptr, head)) {
          KMP_CPU_PAUSE();
          old_ptr = TCR_PTR(q_th->th.th_free_lists[index].th_free_list_sync);
          *((void **)tail) = old_ptr;
        }

        // start a new list of not-self blocks
        this_thr->th.th_free_lists[index].th_free_list_other = ptr;
        *((void **)ptr) = NULL;
        descr->size_allocated = (size_t)1; // head of queue keeps its length
      }
    }
  }
  goto end;

free_call:
  KE_TRACE(25, ("__kmp_fast_free: T#%d Calling __kmp_thread_free for size %d\n",
                __kmp_gtid_from_thread(this_thr), (int)size));
  __kmp_bget_dequeue(this_thr); /* Release any queued buffers */
  brel(this_thr, descr->ptr_allocated);

end:
  KE_TRACE(25, ("<- __kmp_fast_free() returns\n"));

} // func __kmp_fast_free
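
/* Cross-thread free flow (illustrative): suppose thread B frees a 256-byte
   block that was allocated by thread A. B cannot touch A's no-sync list, so
   it chains the block onto its own th_free_lists[0].th_free_list_other, whose
   head node counts the queue length in size_allocated. Once a block owned by
   a different thread arrives, or the length would exceed KMP_FREE_LIST_LIMIT
   (16), B CAS-pushes the whole chain onto A's th_free_list_sync in a single
   operation, amortizing the synchronization over up to 16 blocks. */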

// Initialize the thread free lists related to fast memory
// Only do this when a thread is initially created.
void __kmp_initialize_fast_memory(kmp_info_t *this_thr) {
  KE_TRACE(10, ("__kmp_initialize_fast_memory: Called from th %p\n", this_thr));

  memset(this_thr->th.th_free_lists, 0, NUM_LISTS * sizeof(kmp_free_list_t));
}

// Free the memory in the thread free lists related to fast memory
// Only do this when a thread is being reaped (destroyed).
void __kmp_free_fast_memory(kmp_info_t *th) {
  // Assuming BGET is the underlying allocator, walk through its structures...
  int bin;
  thr_data_t *thr = get_thr_data(th);
  void **lst = NULL;

  KE_TRACE(
      5, ("__kmp_free_fast_memory: Called T#%d\n", __kmp_gtid_from_thread(th)));

  __kmp_bget_dequeue(th); // Release any queued buffers

  // Dig through free lists and extract all allocated blocks
  for (bin = 0; bin < MAX_BGET_BINS; ++bin) {
    bfhead_t *b = thr->freelist[bin].ql.flink;
    while (b != &thr->freelist[bin]) {
      if ((kmp_uintptr_t)b->bh.bb.bthr & 1) { // the buffer is an allocated block
        *((void **)b) =
            lst; // link the list (overwrite bthr, but keep flink for now)
        lst = (void **)b; // push b into lst
      }
      b = b->ql.flink; // get next buffer
    }
  }
  while (lst != NULL) {
    void *next = *lst;
    KE_TRACE(10, ("__kmp_free_fast_memory: freeing %p, next=%p th %p (%d)\n",
                  lst, next, th, __kmp_gtid_from_thread(th)));
    (*thr->relfcn)(lst);
#if BufStats
    // count blocks to prevent problems in __kmp_finalize_bget()
    thr->numprel++; /* Nr of expansion block releases */
    thr->numpblk--; /* Total number of blocks */
#endif
    lst = (void **)next;
  }

  KE_TRACE(
      5, ("__kmp_free_fast_memory: Freed T#%d\n", __kmp_gtid_from_thread(th)));
}

#endif // USE_FAST_MEMORY
