1/*
2 * kmp_runtime.cpp -- KPTS runtime support library
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#include "kmp.h"
14#include "kmp_affinity.h"
15#include "kmp_atomic.h"
16#include "kmp_environment.h"
17#include "kmp_error.h"
18#include "kmp_i18n.h"
19#include "kmp_io.h"
20#include "kmp_itt.h"
21#include "kmp_settings.h"
22#include "kmp_stats.h"
23#include "kmp_str.h"
24#include "kmp_wait_release.h"
25#include "kmp_wrapper_getpid.h"
26#include "kmp_dispatch.h"
27#include "kmp_utils.h"
28#if KMP_USE_HIER_SCHED
29#include "kmp_dispatch_hier.h"
30#endif
31
32#if OMPT_SUPPORT
33#include "ompt-specific.h"
34#endif
35#if OMPD_SUPPORT
36#include "ompd-specific.h"
37#endif
38
39#if OMP_PROFILING_SUPPORT
40#include "llvm/Support/TimeProfiler.h"
41static char *ProfileTraceFile = nullptr;
42#endif
43
44/* these are temporary issues to be dealt with */
45#define KMP_USE_PRCTL 0
46
47#if KMP_OS_WINDOWS
48#include <process.h>
49#endif
50
51#ifndef KMP_USE_SHM
52// Windows and WASI do not need these include files as they don't use shared
53// memory.
54#else
55#include <sys/mman.h>
56#include <sys/stat.h>
57#include <fcntl.h>
58#define SHM_SIZE 1024
59#endif
60
61#if defined(KMP_GOMP_COMPAT)
62char const __kmp_version_alt_comp[] =
63    KMP_VERSION_PREFIX "alternative compiler support: yes";
64#endif /* defined(KMP_GOMP_COMPAT) */
65
66char const __kmp_version_omp_api[] =
67    KMP_VERSION_PREFIX "API version: 5.0 (201611)";
68
69#ifdef KMP_DEBUG
70char const __kmp_version_lock[] =
71    KMP_VERSION_PREFIX "lock type: run time selectable";
72#endif /* KMP_DEBUG */
73
74#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
75
76/* ------------------------------------------------------------------------ */
77
78#if KMP_USE_MONITOR
79kmp_info_t __kmp_monitor;
80#endif
81
82/* Forward declarations */
83
84void __kmp_cleanup(void);
85
86static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
87                                  int gtid);
88static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
89                                  kmp_internal_control_t *new_icvs,
90                                  ident_t *loc);
91#if KMP_AFFINITY_SUPPORTED
92static void __kmp_partition_places(kmp_team_t *team,
93                                   int update_master_only = 0);
94#endif
95static void __kmp_do_serial_initialize(void);
96void __kmp_fork_barrier(int gtid, int tid);
97void __kmp_join_barrier(int gtid);
98void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
99                          kmp_internal_control_t *new_icvs, ident_t *loc);
100
101#ifdef USE_LOAD_BALANCE
102static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
103#endif
104
105static int __kmp_expand_threads(int nNeed);
106#if KMP_OS_WINDOWS
107static int __kmp_unregister_root_other_thread(int gtid);
108#endif
109static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
110kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
111
112void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
113                               int new_nthreads);
114void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
115
116/* Calculate the identifier of the current thread */
117/* fast (and somewhat portable) way to get unique identifier of executing
118   thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
119int __kmp_get_global_thread_id() {
120  int i;
121  kmp_info_t **other_threads;
122  size_t stack_data;
123  char *stack_addr;
124  size_t stack_size;
125  char *stack_base;
126
127  KA_TRACE(
128      1000,
129      ("*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
130       __kmp_nth, __kmp_all_nth));
131
132  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
133     a parallel region, made it return KMP_GTID_DNE to force serial_initialize
134     by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
135     __kmp_init_gtid for this to work. */
136
137  if (!TCR_4(__kmp_init_gtid))
138    return KMP_GTID_DNE;
139
140#ifdef KMP_TDATA_GTID
141  if (TCR_4(__kmp_gtid_mode) >= 3) {
142    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
143    return __kmp_gtid;
144  }
145#endif
146  if (TCR_4(__kmp_gtid_mode) >= 2) {
147    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
148    return __kmp_gtid_get_specific();
149  }
150  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
151
152  stack_addr = (char *)&stack_data;
153  other_threads = __kmp_threads;
154
155  /* ATT: The code below is a source of potential bugs due to unsynchronized
156     access to __kmp_threads array. For example:
157     1. Current thread loads other_threads[i] to thr and checks it, it is
158        non-NULL.
159     2. Current thread is suspended by OS.
160     3. Another thread unregisters and finishes (debug versions of free()
161        may fill memory with something like 0xEF).
162     4. Current thread is resumed.
163     5. Current thread reads junk from *thr.
164     TODO: Fix it.  --ln  */
165
166  for (i = 0; i < __kmp_threads_capacity; i++) {
167
168    kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
169    if (!thr)
170      continue;
171
172    stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
173    stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
174
175    /* stack grows down -- search through all of the active threads */
176
177    if (stack_addr <= stack_base) {
178      size_t stack_diff = stack_base - stack_addr;
179
180      if (stack_diff <= stack_size) {
181        /* The only way we can be closer than the allocated */
182        /* stack size is if we are running on this thread. */
183        // __kmp_gtid_get_specific can return negative value because this
184        // function can be called by thread destructor. However, before the
185        // thread destructor is called, the value of the corresponding
186        // thread-specific data will be reset to NULL.
187        KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() < 0 ||
188                         __kmp_gtid_get_specific() == i);
189        return i;
190      }
191    }
192  }
193
194  /* get specific to try and determine our gtid */
195  KA_TRACE(1000,
196           ("*** __kmp_get_global_thread_id: internal alg. failed to find "
197            "thread, using TLS\n"));
198  i = __kmp_gtid_get_specific();
199
200  /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
201
202  /* if we havn't been assigned a gtid, then return code */
203  if (i < 0)
204    return i;
205
206  // other_threads[i] can be nullptr at this point because the corresponding
207  // thread could have already been destructed. It can happen when this function
208  // is called in end library routine.
209  if (!TCR_SYNC_PTR(other_threads[i]))
210    return i;
211
212  /* dynamically updated stack window for uber threads to avoid get_specific
213     call */
214  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
215    KMP_FATAL(StackOverflow, i);
216  }
217
218  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
219  if (stack_addr > stack_base) {
220    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
221    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
222            other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
223                stack_base);
224  } else {
225    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
226            stack_base - stack_addr);
227  }
228
229  /* Reprint stack bounds for ubermaster since they have been refined */
230  if (__kmp_storage_map) {
231    char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
232    char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
233    __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
234                                 other_threads[i]->th.th_info.ds.ds_stacksize,
235                                 "th_%d stack (refinement)", i);
236  }
237  return i;
238}
239
240int __kmp_get_global_thread_id_reg() {
241  int gtid;
242
243  if (!__kmp_init_serial) {
244    gtid = KMP_GTID_DNE;
245  } else
246#ifdef KMP_TDATA_GTID
247      if (TCR_4(__kmp_gtid_mode) >= 3) {
248    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
249    gtid = __kmp_gtid;
250  } else
251#endif
252      if (TCR_4(__kmp_gtid_mode) >= 2) {
253    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
254    gtid = __kmp_gtid_get_specific();
255  } else {
256    KA_TRACE(1000,
257             ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
258    gtid = __kmp_get_global_thread_id();
259  }
260
261  /* we must be a new uber master sibling thread */
262  if (gtid == KMP_GTID_DNE) {
263    KA_TRACE(10,
264             ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
265              "Registering a new gtid.\n"));
266    __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
267    if (!__kmp_init_serial) {
268      __kmp_do_serial_initialize();
269      gtid = __kmp_gtid_get_specific();
270    } else {
271      gtid = __kmp_register_root(FALSE);
272    }
273    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
274    /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
275  }
276
277  KMP_DEBUG_ASSERT(gtid >= 0);
278
279  return gtid;
280}
281
282/* caller must hold forkjoin_lock */
283void __kmp_check_stack_overlap(kmp_info_t *th) {
284  int f;
285  char *stack_beg = NULL;
286  char *stack_end = NULL;
287  int gtid;
288
289  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
290  if (__kmp_storage_map) {
291    stack_end = (char *)th->th.th_info.ds.ds_stackbase;
292    stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
293
294    gtid = __kmp_gtid_from_thread(th);
295
296    if (gtid == KMP_GTID_MONITOR) {
297      __kmp_print_storage_map_gtid(
298          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
299          "th_%s stack (%s)", "mon",
300          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
301    } else {
302      __kmp_print_storage_map_gtid(
303          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
304          "th_%d stack (%s)", gtid,
305          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
306    }
307  }
308
309  /* No point in checking ubermaster threads since they use refinement and
310   * cannot overlap */
311  gtid = __kmp_gtid_from_thread(th);
312  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
313    KA_TRACE(10,
314             ("__kmp_check_stack_overlap: performing extensive checking\n"));
315    if (stack_beg == NULL) {
316      stack_end = (char *)th->th.th_info.ds.ds_stackbase;
317      stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
318    }
319
320    for (f = 0; f < __kmp_threads_capacity; f++) {
321      kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
322
323      if (f_th && f_th != th) {
324        char *other_stack_end =
325            (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
326        char *other_stack_beg =
327            other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
328        if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
329            (stack_end > other_stack_beg && stack_end < other_stack_end)) {
330
331          /* Print the other stack values before the abort */
332          if (__kmp_storage_map)
333            __kmp_print_storage_map_gtid(
334                -1, other_stack_beg, other_stack_end,
335                (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
336                "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
337
338          __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
339                      __kmp_msg_null);
340        }
341      }
342    }
343  }
344  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
345}
346
347/* ------------------------------------------------------------------------ */
348
349void __kmp_infinite_loop(void) {
350  static int done = FALSE;
351
352  while (!done) {
353    KMP_YIELD(TRUE);
354  }
355}
356
357#define MAX_MESSAGE 512
358
359void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
360                                  char const *format, ...) {
361  char buffer[MAX_MESSAGE];
362  va_list ap;
363
364  va_start(ap, format);
365  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
366               p2, (unsigned long)size, format);
367  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
368  __kmp_vprintf(kmp_err, buffer, ap);
369#if KMP_PRINT_DATA_PLACEMENT
370  int node;
371  if (gtid >= 0) {
372    if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
373      if (__kmp_storage_map_verbose) {
374        node = __kmp_get_host_node(p1);
375        if (node < 0) /* doesn't work, so don't try this next time */
376          __kmp_storage_map_verbose = FALSE;
377        else {
378          char *last;
379          int lastNode;
380          int localProc = __kmp_get_cpu_from_gtid(gtid);
381
382          const int page_size = KMP_GET_PAGE_SIZE();
383
384          p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
385          p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
386          if (localProc >= 0)
387            __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
388                                 localProc >> 1);
389          else
390            __kmp_printf_no_lock("  GTID %d\n", gtid);
391#if KMP_USE_PRCTL
392          /* The more elaborate format is disabled for now because of the prctl
393           * hanging bug. */
394          do {
395            last = p1;
396            lastNode = node;
397            /* This loop collates adjacent pages with the same host node. */
398            do {
399              (char *)p1 += page_size;
400            } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
401            __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
402                                 lastNode);
403          } while (p1 <= p2);
404#else
405          __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
406                               (char *)p1 + (page_size - 1),
407                               __kmp_get_host_node(p1));
408          if (p1 < p2) {
409            __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
410                                 (char *)p2 + (page_size - 1),
411                                 __kmp_get_host_node(p2));
412          }
413#endif
414        }
415      }
416    } else
417      __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
418  }
419#endif /* KMP_PRINT_DATA_PLACEMENT */
420  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
421
422  va_end(ap);
423}
424
425void __kmp_warn(char const *format, ...) {
426  char buffer[MAX_MESSAGE];
427  va_list ap;
428
429  if (__kmp_generate_warnings == kmp_warnings_off) {
430    return;
431  }
432
433  va_start(ap, format);
434
435  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
436  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
437  __kmp_vprintf(kmp_err, buffer, ap);
438  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
439
440  va_end(ap);
441}
442
443void __kmp_abort_process() {
444  // Later threads may stall here, but that's ok because abort() will kill them.
445  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
446
447  if (__kmp_debug_buf) {
448    __kmp_dump_debug_buffer();
449  }
450
451#if KMP_OS_WINDOWS
452  // Let other threads know of abnormal termination and prevent deadlock
453  // if abort happened during library initialization or shutdown
454  __kmp_global.g.g_abort = SIGABRT;
455
456  /* On Windows* OS by default abort() causes pop-up error box, which stalls
457     nightly testing. Unfortunately, we cannot reliably suppress pop-up error
458     boxes. _set_abort_behavior() works well, but this function is not
459     available in VS7 (this is not problem for DLL, but it is a problem for
460     static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
461     help, at least in some versions of MS C RTL.
462
463     It seems following sequence is the only way to simulate abort() and
464     avoid pop-up error box. */
465  raise(SIGABRT);
466  _exit(3); // Just in case, if signal ignored, exit anyway.
467#else
468  __kmp_unregister_library();
469  abort();
470#endif
471
472  __kmp_infinite_loop();
473  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
474
475} // __kmp_abort_process
476
477void __kmp_abort_thread(void) {
478  // TODO: Eliminate g_abort global variable and this function.
479  // In case of abort just call abort(), it will kill all the threads.
480  __kmp_infinite_loop();
481} // __kmp_abort_thread
482
483/* Print out the storage map for the major kmp_info_t thread data structures
484   that are allocated together. */
485
486static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
487  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
488                               gtid);
489
490  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
491                               sizeof(kmp_desc_t), "th_%d.th_info", gtid);
492
493  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
494                               sizeof(kmp_local_t), "th_%d.th_local", gtid);
495
496  __kmp_print_storage_map_gtid(
497      gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
498      sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
499
500  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
501                               &thr->th.th_bar[bs_plain_barrier + 1],
502                               sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
503                               gtid);
504
505  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
506                               &thr->th.th_bar[bs_forkjoin_barrier + 1],
507                               sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
508                               gtid);
509
510#if KMP_FAST_REDUCTION_BARRIER
511  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
512                               &thr->th.th_bar[bs_reduction_barrier + 1],
513                               sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
514                               gtid);
515#endif // KMP_FAST_REDUCTION_BARRIER
516}
517
518/* Print out the storage map for the major kmp_team_t team data structures
519   that are allocated together. */
520
521static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
522                                         int team_id, int num_thr) {
523  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
524  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
525                               header, team_id);
526
527  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
528                               &team->t.t_bar[bs_last_barrier],
529                               sizeof(kmp_balign_team_t) * bs_last_barrier,
530                               "%s_%d.t_bar", header, team_id);
531
532  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
533                               &team->t.t_bar[bs_plain_barrier + 1],
534                               sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
535                               header, team_id);
536
537  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
538                               &team->t.t_bar[bs_forkjoin_barrier + 1],
539                               sizeof(kmp_balign_team_t),
540                               "%s_%d.t_bar[forkjoin]", header, team_id);
541
542#if KMP_FAST_REDUCTION_BARRIER
543  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
544                               &team->t.t_bar[bs_reduction_barrier + 1],
545                               sizeof(kmp_balign_team_t),
546                               "%s_%d.t_bar[reduction]", header, team_id);
547#endif // KMP_FAST_REDUCTION_BARRIER
548
549  __kmp_print_storage_map_gtid(
550      -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
551      sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
552
553  __kmp_print_storage_map_gtid(
554      -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
555      sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
556
557  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
558                               &team->t.t_disp_buffer[num_disp_buff],
559                               sizeof(dispatch_shared_info_t) * num_disp_buff,
560                               "%s_%d.t_disp_buffer", header, team_id);
561}
562
563static void __kmp_init_allocator() {
564  __kmp_init_memkind();
565  __kmp_init_target_mem();
566}
567static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
568
569/* ------------------------------------------------------------------------ */
570
571#if ENABLE_LIBOMPTARGET
572static void __kmp_init_omptarget() {
573  __kmp_init_target_task();
574}
575#endif
576
577/* ------------------------------------------------------------------------ */
578
579#if KMP_DYNAMIC_LIB
580#if KMP_OS_WINDOWS
581
582BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
583  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
584
585  switch (fdwReason) {
586
587  case DLL_PROCESS_ATTACH:
588    KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
589
590    return TRUE;
591
592  case DLL_PROCESS_DETACH:
593    KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
594
595    // According to Windows* documentation for DllMain entry point:
596    // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
597    //   lpReserved == NULL when FreeLibrary() is called,
598    //   lpReserved != NULL when the process is terminated.
599    // When FreeLibrary() is called, worker threads remain alive. So the
600    // runtime's state is consistent and executing proper shutdown is OK.
601    // When the process is terminated, worker threads have exited or been
602    // forcefully terminated by the OS and only the shutdown thread remains.
603    // This can leave the runtime in an inconsistent state.
604    // Hence, only attempt proper cleanup when FreeLibrary() is called.
605    // Otherwise, rely on OS to reclaim resources.
606    if (lpReserved == NULL)
607      __kmp_internal_end_library(__kmp_gtid_get_specific());
608
609    return TRUE;
610
611  case DLL_THREAD_ATTACH:
612    KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
613
614    /* if we want to register new siblings all the time here call
615     * __kmp_get_gtid(); */
616    return TRUE;
617
618  case DLL_THREAD_DETACH:
619    KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
620
621    __kmp_internal_end_thread(__kmp_gtid_get_specific());
622    return TRUE;
623  }
624
625  return TRUE;
626}
627
628#endif /* KMP_OS_WINDOWS */
629#endif /* KMP_DYNAMIC_LIB */
630
631/* __kmp_parallel_deo -- Wait until it's our turn. */
632void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
633  int gtid = *gtid_ref;
634#ifdef BUILD_PARALLEL_ORDERED
635  kmp_team_t *team = __kmp_team_from_gtid(gtid);
636#endif /* BUILD_PARALLEL_ORDERED */
637
638  if (__kmp_env_consistency_check) {
639    if (__kmp_threads[gtid]->th.th_root->r.r_active)
640#if KMP_USE_DYNAMIC_LOCK
641      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
642#else
643      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
644#endif
645  }
646#ifdef BUILD_PARALLEL_ORDERED
647  if (!team->t.t_serialized) {
648    KMP_MB();
649    KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
650             NULL);
651    KMP_MB();
652  }
653#endif /* BUILD_PARALLEL_ORDERED */
654}
655
656/* __kmp_parallel_dxo -- Signal the next task. */
657void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
658  int gtid = *gtid_ref;
659#ifdef BUILD_PARALLEL_ORDERED
660  int tid = __kmp_tid_from_gtid(gtid);
661  kmp_team_t *team = __kmp_team_from_gtid(gtid);
662#endif /* BUILD_PARALLEL_ORDERED */
663
664  if (__kmp_env_consistency_check) {
665    if (__kmp_threads[gtid]->th.th_root->r.r_active)
666      __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
667  }
668#ifdef BUILD_PARALLEL_ORDERED
669  if (!team->t.t_serialized) {
670    KMP_MB(); /* Flush all pending memory write invalidates.  */
671
672    /* use the tid of the next thread in this team */
673    /* TODO replace with general release procedure */
674    team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
675
676    KMP_MB(); /* Flush all pending memory write invalidates.  */
677  }
678#endif /* BUILD_PARALLEL_ORDERED */
679}
680
681/* ------------------------------------------------------------------------ */
682/* The BARRIER for a SINGLE process section is always explicit   */
683
684int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
685  int status;
686  kmp_info_t *th;
687  kmp_team_t *team;
688
689  if (!TCR_4(__kmp_init_parallel))
690    __kmp_parallel_initialize();
691  __kmp_resume_if_soft_paused();
692
693  th = __kmp_threads[gtid];
694  team = th->th.th_team;
695  status = 0;
696
697  th->th.th_ident = id_ref;
698
699  if (team->t.t_serialized) {
700    status = 1;
701  } else {
702    kmp_int32 old_this = th->th.th_local.this_construct;
703
704    ++th->th.th_local.this_construct;
705    /* try to set team count to thread count--success means thread got the
706       single block */
707    /* TODO: Should this be acquire or release? */
708    if (team->t.t_construct == old_this) {
709      status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
710                                              th->th.th_local.this_construct);
711    }
712#if USE_ITT_BUILD
713    if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
714        KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
715        team->t.t_active_level == 1) {
716      // Only report metadata by primary thread of active team at level 1
717      __kmp_itt_metadata_single(id_ref);
718    }
719#endif /* USE_ITT_BUILD */
720  }
721
722  if (__kmp_env_consistency_check) {
723    if (status && push_ws) {
724      __kmp_push_workshare(gtid, ct_psingle, id_ref);
725    } else {
726      __kmp_check_workshare(gtid, ct_psingle, id_ref);
727    }
728  }
729#if USE_ITT_BUILD
730  if (status) {
731    __kmp_itt_single_start(gtid);
732  }
733#endif /* USE_ITT_BUILD */
734  return status;
735}
736
737void __kmp_exit_single(int gtid) {
738#if USE_ITT_BUILD
739  __kmp_itt_single_end(gtid);
740#endif /* USE_ITT_BUILD */
741  if (__kmp_env_consistency_check)
742    __kmp_pop_workshare(gtid, ct_psingle, NULL);
743}
744
745/* determine if we can go parallel or must use a serialized parallel region and
746 * how many threads we can use
747 * set_nproc is the number of threads requested for the team
748 * returns 0 if we should serialize or only use one thread,
749 * otherwise the number of threads to use
750 * The forkjoin lock is held by the caller. */
751static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
752                                 int master_tid, int set_nthreads,
753                                 int enter_teams) {
754  int capacity;
755  int new_nthreads;
756  KMP_DEBUG_ASSERT(__kmp_init_serial);
757  KMP_DEBUG_ASSERT(root && parent_team);
758  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
759
760  // If dyn-var is set, dynamically adjust the number of desired threads,
761  // according to the method specified by dynamic_mode.
762  new_nthreads = set_nthreads;
763  if (!get__dynamic_2(parent_team, master_tid)) {
764    ;
765  }
766#ifdef USE_LOAD_BALANCE
767  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
768    new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
769    if (new_nthreads == 1) {
770      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
771                    "reservation to 1 thread\n",
772                    master_tid));
773      return 1;
774    }
775    if (new_nthreads < set_nthreads) {
776      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
777                    "reservation to %d threads\n",
778                    master_tid, new_nthreads));
779    }
780  }
781#endif /* USE_LOAD_BALANCE */
782  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
783    new_nthreads = __kmp_avail_proc - __kmp_nth +
784                   (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
785    if (new_nthreads <= 1) {
786      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
787                    "reservation to 1 thread\n",
788                    master_tid));
789      return 1;
790    }
791    if (new_nthreads < set_nthreads) {
792      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
793                    "reservation to %d threads\n",
794                    master_tid, new_nthreads));
795    } else {
796      new_nthreads = set_nthreads;
797    }
798  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
799    if (set_nthreads > 2) {
800      new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
801      new_nthreads = (new_nthreads % set_nthreads) + 1;
802      if (new_nthreads == 1) {
803        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
804                      "reservation to 1 thread\n",
805                      master_tid));
806        return 1;
807      }
808      if (new_nthreads < set_nthreads) {
809        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
810                      "reservation to %d threads\n",
811                      master_tid, new_nthreads));
812      }
813    }
814  } else {
815    KMP_ASSERT(0);
816  }
817
818  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
819  if (__kmp_nth + new_nthreads -
820          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
821      __kmp_max_nth) {
822    int tl_nthreads = __kmp_max_nth - __kmp_nth +
823                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
824    if (tl_nthreads <= 0) {
825      tl_nthreads = 1;
826    }
827
828    // If dyn-var is false, emit a 1-time warning.
829    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
830      __kmp_reserve_warn = 1;
831      __kmp_msg(kmp_ms_warning,
832                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
833                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
834    }
835    if (tl_nthreads == 1) {
836      KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
837                    "reduced reservation to 1 thread\n",
838                    master_tid));
839      return 1;
840    }
841    KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
842                  "reservation to %d threads\n",
843                  master_tid, tl_nthreads));
844    new_nthreads = tl_nthreads;
845  }
846
847  // Respect OMP_THREAD_LIMIT
848  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
849  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
850  if (cg_nthreads + new_nthreads -
851          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
852      max_cg_threads) {
853    int tl_nthreads = max_cg_threads - cg_nthreads +
854                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
855    if (tl_nthreads <= 0) {
856      tl_nthreads = 1;
857    }
858
859    // If dyn-var is false, emit a 1-time warning.
860    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
861      __kmp_reserve_warn = 1;
862      __kmp_msg(kmp_ms_warning,
863                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
864                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
865    }
866    if (tl_nthreads == 1) {
867      KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
868                    "reduced reservation to 1 thread\n",
869                    master_tid));
870      return 1;
871    }
872    KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
873                  "reservation to %d threads\n",
874                  master_tid, tl_nthreads));
875    new_nthreads = tl_nthreads;
876  }
877
878  // Check if the threads array is large enough, or needs expanding.
879  // See comment in __kmp_register_root() about the adjustment if
880  // __kmp_threads[0] == NULL.
881  capacity = __kmp_threads_capacity;
882  if (TCR_PTR(__kmp_threads[0]) == NULL) {
883    --capacity;
884  }
885  // If it is not for initializing the hidden helper team, we need to take
886  // __kmp_hidden_helper_threads_num out of the capacity because it is included
887  // in __kmp_threads_capacity.
888  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
889    capacity -= __kmp_hidden_helper_threads_num;
890  }
891  if (__kmp_nth + new_nthreads -
892          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
893      capacity) {
894    // Expand the threads array.
895    int slotsRequired = __kmp_nth + new_nthreads -
896                        (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
897                        capacity;
898    int slotsAdded = __kmp_expand_threads(slotsRequired);
899    if (slotsAdded < slotsRequired) {
900      // The threads array was not expanded enough.
901      new_nthreads -= (slotsRequired - slotsAdded);
902      KMP_ASSERT(new_nthreads >= 1);
903
904      // If dyn-var is false, emit a 1-time warning.
905      if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
906        __kmp_reserve_warn = 1;
907        if (__kmp_tp_cached) {
908          __kmp_msg(kmp_ms_warning,
909                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
910                    KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
911                    KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
912        } else {
913          __kmp_msg(kmp_ms_warning,
914                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
915                    KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
916        }
917      }
918    }
919  }
920
921#ifdef KMP_DEBUG
922  if (new_nthreads == 1) {
923    KC_TRACE(10,
924             ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
925              "dead roots and rechecking; requested %d threads\n",
926              __kmp_get_gtid(), set_nthreads));
927  } else {
928    KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
929                  " %d threads\n",
930                  __kmp_get_gtid(), new_nthreads, set_nthreads));
931  }
932#endif // KMP_DEBUG
933  return new_nthreads;
934}
935
936/* Allocate threads from the thread pool and assign them to the new team. We are
937   assured that there are enough threads available, because we checked on that
938   earlier within critical section forkjoin */
939static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
940                                    kmp_info_t *master_th, int master_gtid,
941                                    int fork_teams_workers) {
942  int i;
943  int use_hot_team;
944
945  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
946  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
947  KMP_MB();
948
949  /* first, let's setup the primary thread */
950  master_th->th.th_info.ds.ds_tid = 0;
951  master_th->th.th_team = team;
952  master_th->th.th_team_nproc = team->t.t_nproc;
953  master_th->th.th_team_master = master_th;
954  master_th->th.th_team_serialized = FALSE;
955  master_th->th.th_dispatch = &team->t.t_dispatch[0];
956
957/* make sure we are not the optimized hot team */
958#if KMP_NESTED_HOT_TEAMS
959  use_hot_team = 0;
960  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
961  if (hot_teams) { // hot teams array is not allocated if
962    // KMP_HOT_TEAMS_MAX_LEVEL=0
963    int level = team->t.t_active_level - 1; // index in array of hot teams
964    if (master_th->th.th_teams_microtask) { // are we inside the teams?
965      if (master_th->th.th_teams_size.nteams > 1) {
966        ++level; // level was not increased in teams construct for
967        // team_of_masters
968      }
969      if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
970          master_th->th.th_teams_level == team->t.t_level) {
971        ++level; // level was not increased in teams construct for
972        // team_of_workers before the parallel
973      } // team->t.t_level will be increased inside parallel
974    }
975    if (level < __kmp_hot_teams_max_level) {
976      if (hot_teams[level].hot_team) {
977        // hot team has already been allocated for given level
978        KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
979        use_hot_team = 1; // the team is ready to use
980      } else {
981        use_hot_team = 0; // AC: threads are not allocated yet
982        hot_teams[level].hot_team = team; // remember new hot team
983        hot_teams[level].hot_team_nth = team->t.t_nproc;
984      }
985    } else {
986      use_hot_team = 0;
987    }
988  }
989#else
990  use_hot_team = team == root->r.r_hot_team;
991#endif
992  if (!use_hot_team) {
993
994    /* install the primary thread */
995    team->t.t_threads[0] = master_th;
996    __kmp_initialize_info(master_th, team, 0, master_gtid);
997
998    /* now, install the worker threads */
999    for (i = 1; i < team->t.t_nproc; i++) {
1000
1001      /* fork or reallocate a new thread and install it in team */
1002      kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1003      team->t.t_threads[i] = thr;
1004      KMP_DEBUG_ASSERT(thr);
1005      KMP_DEBUG_ASSERT(thr->th.th_team == team);
1006      /* align team and thread arrived states */
1007      KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1008                    "T#%d(%d:%d) join =%llu, plain=%llu\n",
1009                    __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1010                    __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1011                    team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1012                    team->t.t_bar[bs_plain_barrier].b_arrived));
1013      thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1014      thr->th.th_teams_level = master_th->th.th_teams_level;
1015      thr->th.th_teams_size = master_th->th.th_teams_size;
1016      { // Initialize threads' barrier data.
1017        int b;
1018        kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1019        for (b = 0; b < bs_last_barrier; ++b) {
1020          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1021          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1022#if USE_DEBUGGER
1023          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1024#endif
1025        }
1026      }
1027    }
1028
1029#if KMP_AFFINITY_SUPPORTED
1030    // Do not partition the places list for teams construct workers who
1031    // haven't actually been forked to do real work yet. This partitioning
1032    // will take place in the parallel region nested within the teams construct.
1033    if (!fork_teams_workers) {
1034      __kmp_partition_places(team);
1035    }
1036#endif
1037
1038    if (team->t.t_nproc > 1 &&
1039        __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1040      team->t.b->update_num_threads(team->t.t_nproc);
1041      __kmp_add_threads_to_team(team, team->t.t_nproc);
1042    }
1043  }
1044
1045  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1046    for (i = 0; i < team->t.t_nproc; i++) {
1047      kmp_info_t *thr = team->t.t_threads[i];
1048      if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1049          thr->th.th_prev_level != team->t.t_level) {
1050        team->t.t_display_affinity = 1;
1051        break;
1052      }
1053    }
1054  }
1055
1056  KMP_MB();
1057}
1058
1059#if KMP_ARCH_X86 || KMP_ARCH_X86_64
1060// Propagate any changes to the floating point control registers out to the team
1061// We try to avoid unnecessary writes to the relevant cache line in the team
1062// structure, so we don't make changes unless they are needed.
1063inline static void propagateFPControl(kmp_team_t *team) {
1064  if (__kmp_inherit_fp_control) {
1065    kmp_int16 x87_fpu_control_word;
1066    kmp_uint32 mxcsr;
1067
1068    // Get primary thread's values of FPU control flags (both X87 and vector)
1069    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1070    __kmp_store_mxcsr(&mxcsr);
1071    mxcsr &= KMP_X86_MXCSR_MASK;
1072
1073    // There is no point looking at t_fp_control_saved here.
1074    // If it is TRUE, we still have to update the values if they are different
1075    // from those we now have. If it is FALSE we didn't save anything yet, but
1076    // our objective is the same. We have to ensure that the values in the team
1077    // are the same as those we have.
1078    // So, this code achieves what we need whether or not t_fp_control_saved is
1079    // true. By checking whether the value needs updating we avoid unnecessary
1080    // writes that would put the cache-line into a written state, causing all
1081    // threads in the team to have to read it again.
1082    KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1083    KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1084    // Although we don't use this value, other code in the runtime wants to know
1085    // whether it should restore them. So we must ensure it is correct.
1086    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1087  } else {
1088    // Similarly here. Don't write to this cache-line in the team structure
1089    // unless we have to.
1090    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1091  }
1092}
1093
1094// Do the opposite, setting the hardware registers to the updated values from
1095// the team.
1096inline static void updateHWFPControl(kmp_team_t *team) {
1097  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1098    // Only reset the fp control regs if they have been changed in the team.
1099    // the parallel region that we are exiting.
1100    kmp_int16 x87_fpu_control_word;
1101    kmp_uint32 mxcsr;
1102    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1103    __kmp_store_mxcsr(&mxcsr);
1104    mxcsr &= KMP_X86_MXCSR_MASK;
1105
1106    if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1107      __kmp_clear_x87_fpu_status_word();
1108      __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1109    }
1110
1111    if (team->t.t_mxcsr != mxcsr) {
1112      __kmp_load_mxcsr(&team->t.t_mxcsr);
1113    }
1114  }
1115}
1116#else
1117#define propagateFPControl(x) ((void)0)
1118#define updateHWFPControl(x) ((void)0)
1119#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1120
1121static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1122                                     int realloc); // forward declaration
1123
1124/* Run a parallel region that has been serialized, so runs only in a team of the
1125   single primary thread. */
1126void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1127  kmp_info_t *this_thr;
1128  kmp_team_t *serial_team;
1129
1130  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1131
1132  /* Skip all this code for autopar serialized loops since it results in
1133     unacceptable overhead */
1134  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1135    return;
1136
1137  if (!TCR_4(__kmp_init_parallel))
1138    __kmp_parallel_initialize();
1139  __kmp_resume_if_soft_paused();
1140
1141  this_thr = __kmp_threads[global_tid];
1142  serial_team = this_thr->th.th_serial_team;
1143
1144  /* utilize the serialized team held by this thread */
1145  KMP_DEBUG_ASSERT(serial_team);
1146  KMP_MB();
1147
1148  if (__kmp_tasking_mode != tskm_immediate_exec) {
1149    KMP_DEBUG_ASSERT(
1150        this_thr->th.th_task_team ==
1151        this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1152    KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1153                     NULL);
1154    KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1155                  "team %p, new task_team = NULL\n",
1156                  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1157    this_thr->th.th_task_team = NULL;
1158  }
1159
1160  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1161  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1162    proc_bind = proc_bind_false;
1163  } else if (proc_bind == proc_bind_default) {
1164    // No proc_bind clause was specified, so use the current value
1165    // of proc-bind-var for this parallel region.
1166    proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1167  }
1168  // Reset for next parallel region
1169  this_thr->th.th_set_proc_bind = proc_bind_default;
1170
1171  // Reset num_threads for next parallel region
1172  this_thr->th.th_set_nproc = 0;
1173
1174#if OMPT_SUPPORT
1175  ompt_data_t ompt_parallel_data = ompt_data_none;
1176  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1177  if (ompt_enabled.enabled &&
1178      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1179
1180    ompt_task_info_t *parent_task_info;
1181    parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1182
1183    parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1184    if (ompt_enabled.ompt_callback_parallel_begin) {
1185      int team_size = 1;
1186
1187      ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1188          &(parent_task_info->task_data), &(parent_task_info->frame),
1189          &ompt_parallel_data, team_size,
1190          ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1191    }
1192  }
1193#endif // OMPT_SUPPORT
1194
1195  if (this_thr->th.th_team != serial_team) {
1196    // Nested level will be an index in the nested nthreads array
1197    int level = this_thr->th.th_team->t.t_level;
1198
1199    if (serial_team->t.t_serialized) {
1200      /* this serial team was already used
1201         TODO increase performance by making this locks more specific */
1202      kmp_team_t *new_team;
1203
1204      __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1205
1206      new_team =
1207          __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1208#if OMPT_SUPPORT
1209                              ompt_parallel_data,
1210#endif
1211                              proc_bind, &this_thr->th.th_current_task->td_icvs,
1212                              0 USE_NESTED_HOT_ARG(NULL));
1213      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1214      KMP_ASSERT(new_team);
1215
1216      /* setup new serialized team and install it */
1217      new_team->t.t_threads[0] = this_thr;
1218      new_team->t.t_parent = this_thr->th.th_team;
1219      serial_team = new_team;
1220      this_thr->th.th_serial_team = serial_team;
1221
1222      KF_TRACE(
1223          10,
1224          ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1225           global_tid, serial_team));
1226
1227      /* TODO the above breaks the requirement that if we run out of resources,
1228         then we can still guarantee that serialized teams are ok, since we may
1229         need to allocate a new one */
1230    } else {
1231      KF_TRACE(
1232          10,
1233          ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1234           global_tid, serial_team));
1235    }
1236
1237    /* we have to initialize this serial team */
1238    KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1239    KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1240    KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1241    serial_team->t.t_ident = loc;
1242    serial_team->t.t_serialized = 1;
1243    serial_team->t.t_nproc = 1;
1244    serial_team->t.t_parent = this_thr->th.th_team;
1245    serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1246    this_thr->th.th_team = serial_team;
1247    serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1248
1249    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1250                  this_thr->th.th_current_task));
1251    KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1252    this_thr->th.th_current_task->td_flags.executing = 0;
1253
1254    __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1255
1256    /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1257       implicit task for each serialized task represented by
1258       team->t.t_serialized? */
1259    copy_icvs(&this_thr->th.th_current_task->td_icvs,
1260              &this_thr->th.th_current_task->td_parent->td_icvs);
1261
1262    // Thread value exists in the nested nthreads array for the next nested
1263    // level
1264    if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1265      this_thr->th.th_current_task->td_icvs.nproc =
1266          __kmp_nested_nth.nth[level + 1];
1267    }
1268
1269    if (__kmp_nested_proc_bind.used &&
1270        (level + 1 < __kmp_nested_proc_bind.used)) {
1271      this_thr->th.th_current_task->td_icvs.proc_bind =
1272          __kmp_nested_proc_bind.bind_types[level + 1];
1273    }
1274
1275#if USE_DEBUGGER
1276    serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1277#endif
1278    this_thr->th.th_info.ds.ds_tid = 0;
1279
1280    /* set thread cache values */
1281    this_thr->th.th_team_nproc = 1;
1282    this_thr->th.th_team_master = this_thr;
1283    this_thr->th.th_team_serialized = 1;
1284
1285    serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1286    serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1287    serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1288
1289    propagateFPControl(serial_team);
1290
1291    /* check if we need to allocate dispatch buffers stack */
1292    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1293    if (!serial_team->t.t_dispatch->th_disp_buffer) {
1294      serial_team->t.t_dispatch->th_disp_buffer =
1295          (dispatch_private_info_t *)__kmp_allocate(
1296              sizeof(dispatch_private_info_t));
1297    }
1298    this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1299
1300    KMP_MB();
1301
1302  } else {
1303    /* this serialized team is already being used,
1304     * that's fine, just add another nested level */
1305    KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1306    KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1307    KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1308    ++serial_team->t.t_serialized;
1309    this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1310
1311    // Nested level will be an index in the nested nthreads array
1312    int level = this_thr->th.th_team->t.t_level;
1313    // Thread value exists in the nested nthreads array for the next nested
1314    // level
1315    if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1316      this_thr->th.th_current_task->td_icvs.nproc =
1317          __kmp_nested_nth.nth[level + 1];
1318    }
1319    serial_team->t.t_level++;
1320    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1321                  "of serial team %p to %d\n",
1322                  global_tid, serial_team, serial_team->t.t_level));
1323
1324    /* allocate/push dispatch buffers stack */
1325    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1326    {
1327      dispatch_private_info_t *disp_buffer =
1328          (dispatch_private_info_t *)__kmp_allocate(
1329              sizeof(dispatch_private_info_t));
1330      disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1331      serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1332    }
1333    this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1334
1335    KMP_MB();
1336  }
1337  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1338
1339  // Perform the display affinity functionality for
1340  // serialized parallel regions
1341  if (__kmp_display_affinity) {
1342    if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1343        this_thr->th.th_prev_num_threads != 1) {
1344      // NULL means use the affinity-format-var ICV
1345      __kmp_aux_display_affinity(global_tid, NULL);
1346      this_thr->th.th_prev_level = serial_team->t.t_level;
1347      this_thr->th.th_prev_num_threads = 1;
1348    }
1349  }
1350
1351  if (__kmp_env_consistency_check)
1352    __kmp_push_parallel(global_tid, NULL);
1353#if OMPT_SUPPORT
1354  serial_team->t.ompt_team_info.master_return_address = codeptr;
1355  if (ompt_enabled.enabled &&
1356      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1357    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1358        OMPT_GET_FRAME_ADDRESS(0);
1359
1360    ompt_lw_taskteam_t lw_taskteam;
1361    __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1362                            &ompt_parallel_data, codeptr);
1363
1364    __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1365    // don't use lw_taskteam after linking. content was swaped
1366
1367    /* OMPT implicit task begin */
1368    if (ompt_enabled.ompt_callback_implicit_task) {
1369      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1370          ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1371          OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1372          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1373      OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1374          __kmp_tid_from_gtid(global_tid);
1375    }
1376
1377    /* OMPT state */
1378    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1379    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1380        OMPT_GET_FRAME_ADDRESS(0);
1381  }
1382#endif
1383}
1384
1385// Test if this fork is for a team closely nested in a teams construct
1386static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
1387                                          microtask_t microtask, int level,
1388                                          int teams_level, kmp_va_list ap) {
1389  return (master_th->th.th_teams_microtask && ap &&
1390          microtask != (microtask_t)__kmp_teams_master && level == teams_level);
1391}
1392
1393// Test if this fork is for the teams construct, i.e. to form the outer league
1394// of teams
1395static inline bool __kmp_is_entering_teams(int active_level, int level,
1396                                           int teams_level, kmp_va_list ap) {
1397  return ((ap == NULL && active_level == 0) ||
1398          (ap && teams_level > 0 && teams_level == level));
1399}
1400
1401// AC: This is start of parallel that is nested inside teams construct.
1402// The team is actual (hot), all workers are ready at the fork barrier.
1403// No lock needed to initialize the team a bit, then free workers.
1404static inline int
1405__kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
1406                    kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
1407                    enum fork_context_e call_context, microtask_t microtask,
1408                    launch_t invoker, int master_set_numthreads, int level,
1409#if OMPT_SUPPORT
1410                    ompt_data_t ompt_parallel_data, void *return_address,
1411#endif
1412                    kmp_va_list ap) {
1413  void **argv;
1414  int i;
1415
1416  parent_team->t.t_ident = loc;
1417  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1418  parent_team->t.t_argc = argc;
1419  argv = (void **)parent_team->t.t_argv;
1420  for (i = argc - 1; i >= 0; --i) {
1421    *argv++ = va_arg(kmp_va_deref(ap), void *);
1422  }
1423  // Increment our nested depth levels, but not increase the serialization
1424  if (parent_team == master_th->th.th_serial_team) {
1425    // AC: we are in serialized parallel
1426    __kmpc_serialized_parallel(loc, gtid);
1427    KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1428
1429    if (call_context == fork_context_gnu) {
1430      // AC: need to decrement t_serialized for enquiry functions to work
1431      // correctly, will restore at join time
1432      parent_team->t.t_serialized--;
1433      return TRUE;
1434    }
1435
1436#if OMPD_SUPPORT
1437    parent_team->t.t_pkfn = microtask;
1438#endif
1439
1440#if OMPT_SUPPORT
1441    void *dummy;
1442    void **exit_frame_p;
1443    ompt_data_t *implicit_task_data;
1444    ompt_lw_taskteam_t lw_taskteam;
1445
1446    if (ompt_enabled.enabled) {
1447      __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1448                              &ompt_parallel_data, return_address);
1449      exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1450
1451      __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1452      // Don't use lw_taskteam after linking. Content was swapped.
1453
1454      /* OMPT implicit task begin */
1455      implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1456      if (ompt_enabled.ompt_callback_implicit_task) {
1457        OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1458        ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1459            ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
1460            1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1461      }
1462
1463      /* OMPT state */
1464      master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1465    } else {
1466      exit_frame_p = &dummy;
1467    }
1468#endif
1469
1470    // AC: need to decrement t_serialized for enquiry functions to work
1471    // correctly, will restore at join time
1472    parent_team->t.t_serialized--;
1473
1474    {
1475      KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1476      KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1477      __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1478#if OMPT_SUPPORT
1479                             ,
1480                             exit_frame_p
1481#endif
1482                             );
1483    }
1484
1485#if OMPT_SUPPORT
1486    if (ompt_enabled.enabled) {
1487      *exit_frame_p = NULL;
1488      OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1489      if (ompt_enabled.ompt_callback_implicit_task) {
1490        ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1491            ompt_scope_end, NULL, implicit_task_data, 1,
1492            OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1493      }
1494      ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1495      __ompt_lw_taskteam_unlink(master_th);
1496      if (ompt_enabled.ompt_callback_parallel_end) {
1497        ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1498            &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1499            OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
1500      }
1501      master_th->th.ompt_thread_info.state = ompt_state_overhead;
1502    }
1503#endif
1504    return TRUE;
1505  }
1506
1507  parent_team->t.t_pkfn = microtask;
1508  parent_team->t.t_invoke = invoker;
1509  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1510  parent_team->t.t_active_level++;
1511  parent_team->t.t_level++;
1512  parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1513
1514  // If the threads allocated to the team are less than the thread limit, update
1515  // the thread limit here. th_teams_size.nth is specific to this team nested
1516  // in a teams construct, the team is fully created, and we're about to do
1517  // the actual fork. Best to do this here so that the subsequent uses below
1518  // and in the join have the correct value.
1519  master_th->th.th_teams_size.nth = parent_team->t.t_nproc;
1520
1521#if OMPT_SUPPORT
1522  if (ompt_enabled.enabled) {
1523    ompt_lw_taskteam_t lw_taskteam;
1524    __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data,
1525                            return_address);
1526    __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1527  }
1528#endif
1529
1530  /* Change number of threads in the team if requested */
1531  if (master_set_numthreads) { // The parallel has num_threads clause
1532    if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1533      // AC: only can reduce number of threads dynamically, can't increase
1534      kmp_info_t **other_threads = parent_team->t.t_threads;
1535      // NOTE: if using distributed barrier, we need to run this code block
1536      // even when the team size appears not to have changed from the max.
1537      int old_proc = master_th->th.th_teams_size.nth;
1538      if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1539        __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads);
1540        __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1541      }
1542      parent_team->t.t_nproc = master_set_numthreads;
1543      for (i = 0; i < master_set_numthreads; ++i) {
1544        other_threads[i]->th.th_team_nproc = master_set_numthreads;
1545      }
1546    }
1547    // Keep extra threads hot in the team for possible next parallels
1548    master_th->th.th_set_nproc = 0;
1549  }
1550
1551#if USE_DEBUGGER
1552  if (__kmp_debugging) { // Let debugger override number of threads.
1553    int nth = __kmp_omp_num_threads(loc);
1554    if (nth > 0) { // 0 means debugger doesn't want to change num threads
1555      master_set_numthreads = nth;
1556    }
1557  }
1558#endif
1559
1560  // Figure out the proc_bind policy for the nested parallel within teams
1561  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1562  // proc_bind_default means don't update
1563  kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1564  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1565    proc_bind = proc_bind_false;
1566  } else {
1567    // No proc_bind clause specified; use current proc-bind-var
1568    if (proc_bind == proc_bind_default) {
1569      proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1570    }
1571    /* else: The proc_bind policy was specified explicitly on parallel clause.
1572       This overrides proc-bind-var for this parallel region, but does not
1573       change proc-bind-var. */
1574    // Figure the value of proc-bind-var for the child threads.
1575    if ((level + 1 < __kmp_nested_proc_bind.used) &&
1576        (__kmp_nested_proc_bind.bind_types[level + 1] !=
1577         master_th->th.th_current_task->td_icvs.proc_bind)) {
1578      proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1579    }
1580  }
1581  KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1582  // Need to change the bind-var ICV to correct value for each implicit task
1583  if (proc_bind_icv != proc_bind_default &&
1584      master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1585    kmp_info_t **other_threads = parent_team->t.t_threads;
1586    for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1587      other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
1588    }
1589  }
1590  // Reset for next parallel region
1591  master_th->th.th_set_proc_bind = proc_bind_default;
1592
1593#if USE_ITT_BUILD && USE_ITT_NOTIFY
1594  if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1595       KMP_ITT_DEBUG) &&
1596      __kmp_forkjoin_frames_mode == 3 &&
1597      parent_team->t.t_active_level == 1 // only report frames at level 1
1598      && master_th->th.th_teams_size.nteams == 1) {
1599    kmp_uint64 tmp_time = __itt_get_timestamp();
1600    master_th->th.th_frame_time = tmp_time;
1601    parent_team->t.t_region_time = tmp_time;
1602  }
1603  if (__itt_stack_caller_create_ptr) {
1604    KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1605    // create new stack stitching id before entering fork barrier
1606    parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1607  }
1608#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1609#if KMP_AFFINITY_SUPPORTED
1610  __kmp_partition_places(parent_team);
1611#endif
1612
1613  KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
1614                "master_th=%p, gtid=%d\n",
1615                root, parent_team, master_th, gtid));
1616  __kmp_internal_fork(loc, gtid, parent_team);
1617  KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
1618                "master_th=%p, gtid=%d\n",
1619                root, parent_team, master_th, gtid));
1620
1621  if (call_context == fork_context_gnu)
1622    return TRUE;
1623
1624  /* Invoke microtask for PRIMARY thread */
1625  KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
1626                parent_team->t.t_id, parent_team->t.t_pkfn));
1627
1628  if (!parent_team->t.t_invoke(gtid)) {
1629    KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1630  }
1631  KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
1632                parent_team->t.t_id, parent_team->t.t_pkfn));
1633  KMP_MB(); /* Flush all pending memory write invalidates.  */
1634
1635  KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));
1636
1637  return TRUE;
1638}
1639
1640// Create a serialized parallel region
1641static inline int
1642__kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
1643                       kmp_int32 argc, microtask_t microtask, launch_t invoker,
1644                       kmp_info_t *master_th, kmp_team_t *parent_team,
1645#if OMPT_SUPPORT
1646                       ompt_data_t *ompt_parallel_data, void **return_address,
1647                       ompt_data_t **parent_task_data,
1648#endif
1649                       kmp_va_list ap) {
1650  kmp_team_t *team;
1651  int i;
1652  void **argv;
1653
1654/* josh todo: hypothetical question: what do we do for OS X*? */
1655#if KMP_OS_LINUX &&                                                            \
1656    (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1657  SimpleVLA<void *> args(argc);
1658#else
1659  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1660#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1661          KMP_ARCH_AARCH64) */
1662
1663  KA_TRACE(
1664      20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));
1665
1666  __kmpc_serialized_parallel(loc, gtid);
1667
1668#if OMPD_SUPPORT
1669  master_th->th.th_serial_team->t.t_pkfn = microtask;
1670#endif
1671
1672  if (call_context == fork_context_intel) {
1673    /* TODO this sucks, use the compiler itself to pass args! :) */
1674    master_th->th.th_serial_team->t.t_ident = loc;
1675    if (!ap) {
1676      // revert change made in __kmpc_serialized_parallel()
1677      master_th->th.th_serial_team->t.t_level--;
1678// Get args from parent team for teams construct
1679
1680#if OMPT_SUPPORT
1681      void *dummy;
1682      void **exit_frame_p;
1683      ompt_task_info_t *task_info;
1684      ompt_lw_taskteam_t lw_taskteam;
1685
1686      if (ompt_enabled.enabled) {
1687        __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1688                                ompt_parallel_data, *return_address);
1689
1690        __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1691        // don't use lw_taskteam after linking. content was swaped
1692        task_info = OMPT_CUR_TASK_INFO(master_th);
1693        exit_frame_p = &(task_info->frame.exit_frame.ptr);
1694        if (ompt_enabled.ompt_callback_implicit_task) {
1695          OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1696          ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1697              ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1698              &(task_info->task_data), 1,
1699              OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1700        }
1701
1702        /* OMPT state */
1703        master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1704      } else {
1705        exit_frame_p = &dummy;
1706      }
1707#endif
1708
1709      {
1710        KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1711        KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1712        __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1713#if OMPT_SUPPORT
1714                               ,
1715                               exit_frame_p
1716#endif
1717                               );
1718      }
1719
1720#if OMPT_SUPPORT
1721      if (ompt_enabled.enabled) {
1722        *exit_frame_p = NULL;
1723        if (ompt_enabled.ompt_callback_implicit_task) {
1724          ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1725              ompt_scope_end, NULL, &(task_info->task_data), 1,
1726              OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1727        }
1728        *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1729        __ompt_lw_taskteam_unlink(master_th);
1730        if (ompt_enabled.ompt_callback_parallel_end) {
1731          ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1732              ompt_parallel_data, *parent_task_data,
1733              OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1734        }
1735        master_th->th.ompt_thread_info.state = ompt_state_overhead;
1736      }
1737#endif
1738    } else if (microtask == (microtask_t)__kmp_teams_master) {
1739      KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
1740      team = master_th->th.th_team;
1741      // team->t.t_pkfn = microtask;
1742      team->t.t_invoke = invoker;
1743      __kmp_alloc_argv_entries(argc, team, TRUE);
1744      team->t.t_argc = argc;
1745      argv = (void **)team->t.t_argv;
1746      if (ap) {
1747        for (i = argc - 1; i >= 0; --i)
1748          *argv++ = va_arg(kmp_va_deref(ap), void *);
1749      } else {
1750        for (i = 0; i < argc; ++i)
1751          // Get args from parent team for teams construct
1752          argv[i] = parent_team->t.t_argv[i];
1753      }
1754      // AC: revert change made in __kmpc_serialized_parallel()
1755      //     because initial code in teams should have level=0
1756      team->t.t_level--;
1757      // AC: call special invoker for outer "parallel" of teams construct
1758      invoker(gtid);
1759#if OMPT_SUPPORT
1760      if (ompt_enabled.enabled) {
1761        ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1762        if (ompt_enabled.ompt_callback_implicit_task) {
1763          ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1764              ompt_scope_end, NULL, &(task_info->task_data), 0,
1765              OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1766        }
1767        if (ompt_enabled.ompt_callback_parallel_end) {
1768          ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1769              ompt_parallel_data, *parent_task_data,
1770              OMPT_INVOKER(call_context) | ompt_parallel_league,
1771              *return_address);
1772        }
1773        master_th->th.ompt_thread_info.state = ompt_state_overhead;
1774      }
1775#endif
1776    } else {
1777      argv = args;
1778      for (i = argc - 1; i >= 0; --i)
1779        *argv++ = va_arg(kmp_va_deref(ap), void *);
1780      KMP_MB();
1781
1782#if OMPT_SUPPORT
1783      void *dummy;
1784      void **exit_frame_p;
1785      ompt_task_info_t *task_info;
1786      ompt_lw_taskteam_t lw_taskteam;
1787      ompt_data_t *implicit_task_data;
1788
1789      if (ompt_enabled.enabled) {
1790        __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1791                                ompt_parallel_data, *return_address);
1792        __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1793        // don't use lw_taskteam after linking. content was swaped
1794        task_info = OMPT_CUR_TASK_INFO(master_th);
1795        exit_frame_p = &(task_info->frame.exit_frame.ptr);
1796
1797        /* OMPT implicit task begin */
1798        implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1799        if (ompt_enabled.ompt_callback_implicit_task) {
1800          ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1801              ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1802              implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1803              ompt_task_implicit);
1804          OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1805        }
1806
1807        /* OMPT state */
1808        master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1809      } else {
1810        exit_frame_p = &dummy;
1811      }
1812#endif
1813
1814      {
1815        KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1816        KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1817        __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1818#if OMPT_SUPPORT
1819                               ,
1820                               exit_frame_p
1821#endif
1822                               );
1823      }
1824
1825#if OMPT_SUPPORT
1826      if (ompt_enabled.enabled) {
1827        *exit_frame_p = NULL;
1828        if (ompt_enabled.ompt_callback_implicit_task) {
1829          ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1830              ompt_scope_end, NULL, &(task_info->task_data), 1,
1831              OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1832        }
1833
1834        *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1835        __ompt_lw_taskteam_unlink(master_th);
1836        if (ompt_enabled.ompt_callback_parallel_end) {
1837          ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1838              ompt_parallel_data, *parent_task_data,
1839              OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1840        }
1841        master_th->th.ompt_thread_info.state = ompt_state_overhead;
1842      }
1843#endif
1844    }
1845  } else if (call_context == fork_context_gnu) {
1846#if OMPT_SUPPORT
1847    if (ompt_enabled.enabled) {
1848      ompt_lw_taskteam_t lwt;
1849      __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data,
1850                              *return_address);
1851
1852      lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1853      __ompt_lw_taskteam_link(&lwt, master_th, 1);
1854    }
1855// don't use lw_taskteam after linking. content was swaped
1856#endif
1857
1858    // we were called from GNU native code
1859    KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1860    return FALSE;
1861  } else {
1862    KMP_ASSERT2(call_context < fork_context_last,
1863                "__kmp_serial_fork_call: unknown fork_context parameter");
1864  }
1865
1866  KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1867  KMP_MB();
1868  return FALSE;
1869}
1870
1871/* most of the work for a fork */
1872/* return true if we really went parallel, false if serialized */
1873int __kmp_fork_call(ident_t *loc, int gtid,
1874                    enum fork_context_e call_context, // Intel, GNU, ...
1875                    kmp_int32 argc, microtask_t microtask, launch_t invoker,
1876                    kmp_va_list ap) {
1877  void **argv;
1878  int i;
1879  int master_tid;
1880  int master_this_cons;
1881  kmp_team_t *team;
1882  kmp_team_t *parent_team;
1883  kmp_info_t *master_th;
1884  kmp_root_t *root;
1885  int nthreads;
1886  int master_active;
1887  int master_set_numthreads;
1888  int task_thread_limit = 0;
1889  int level;
1890  int active_level;
1891  int teams_level;
1892#if KMP_NESTED_HOT_TEAMS
1893  kmp_hot_team_ptr_t **p_hot_teams;
1894#endif
1895  { // KMP_TIME_BLOCK
1896    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1897    KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1898
1899    KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1900    if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1901      /* Some systems prefer the stack for the root thread(s) to start with */
1902      /* some gap from the parent stack to prevent false sharing. */
1903      void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1904      /* These 2 lines below are so this does not get optimized out */
1905      if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1906        __kmp_stkpadding += (short)((kmp_int64)dummy);
1907    }
1908
1909    /* initialize if needed */
1910    KMP_DEBUG_ASSERT(
1911        __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1912    if (!TCR_4(__kmp_init_parallel))
1913      __kmp_parallel_initialize();
1914    __kmp_resume_if_soft_paused();
1915
1916    /* setup current data */
1917    // AC: potentially unsafe, not in sync with library shutdown,
1918    // __kmp_threads can be freed
1919    master_th = __kmp_threads[gtid];
1920
1921    parent_team = master_th->th.th_team;
1922    master_tid = master_th->th.th_info.ds.ds_tid;
1923    master_this_cons = master_th->th.th_local.this_construct;
1924    root = master_th->th.th_root;
1925    master_active = root->r.r_active;
1926    master_set_numthreads = master_th->th.th_set_nproc;
1927    task_thread_limit =
1928        master_th->th.th_current_task->td_icvs.task_thread_limit;
1929
1930#if OMPT_SUPPORT
1931    ompt_data_t ompt_parallel_data = ompt_data_none;
1932    ompt_data_t *parent_task_data;
1933    ompt_frame_t *ompt_frame;
1934    void *return_address = NULL;
1935
1936    if (ompt_enabled.enabled) {
1937      __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1938                                    NULL, NULL);
1939      return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1940    }
1941#endif
1942
1943    // Assign affinity to root thread if it hasn't happened yet
1944    __kmp_assign_root_init_mask();
1945
1946    // Nested level will be an index in the nested nthreads array
1947    level = parent_team->t.t_level;
1948    // used to launch non-serial teams even if nested is not allowed
1949    active_level = parent_team->t.t_active_level;
1950    // needed to check nesting inside the teams
1951    teams_level = master_th->th.th_teams_level;
1952#if KMP_NESTED_HOT_TEAMS
1953    p_hot_teams = &master_th->th.th_hot_teams;
1954    if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1955      *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1956          sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1957      (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1958      // it is either actual or not needed (when active_level > 0)
1959      (*p_hot_teams)[0].hot_team_nth = 1;
1960    }
1961#endif
1962
1963#if OMPT_SUPPORT
1964    if (ompt_enabled.enabled) {
1965      if (ompt_enabled.ompt_callback_parallel_begin) {
1966        int team_size = master_set_numthreads
1967                            ? master_set_numthreads
1968                            : get__nproc_2(parent_team, master_tid);
1969        int flags = OMPT_INVOKER(call_context) |
1970                    ((microtask == (microtask_t)__kmp_teams_master)
1971                         ? ompt_parallel_league
1972                         : ompt_parallel_team);
1973        ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1974            parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1975            return_address);
1976      }
1977      master_th->th.ompt_thread_info.state = ompt_state_overhead;
1978    }
1979#endif
1980
1981    master_th->th.th_ident = loc;
1982
1983    // Parallel closely nested in teams construct:
1984    if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) {
1985      return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root,
1986                                 call_context, microtask, invoker,
1987                                 master_set_numthreads, level,
1988#if OMPT_SUPPORT
1989                                 ompt_parallel_data, return_address,
1990#endif
1991                                 ap);
1992    } // End parallel closely nested in teams construct
1993
1994#if KMP_DEBUG
1995    if (__kmp_tasking_mode != tskm_immediate_exec) {
1996      KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1997                       parent_team->t.t_task_team[master_th->th.th_task_state]);
1998    }
1999#endif
2000
2001    // Need this to happen before we determine the number of threads, not while
2002    // we are allocating the team
2003    //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
2004
2005    // Determine the number of threads
2006    int enter_teams =
2007        __kmp_is_entering_teams(active_level, level, teams_level, ap);
2008    if ((!enter_teams &&
2009         (parent_team->t.t_active_level >=
2010          master_th->th.th_current_task->td_icvs.max_active_levels)) ||
2011        (__kmp_library == library_serial)) {
2012      KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid));
2013      nthreads = 1;
2014    } else {
2015      nthreads = master_set_numthreads
2016                     ? master_set_numthreads
2017                     // TODO: get nproc directly from current task
2018                     : get__nproc_2(parent_team, master_tid);
2019      // Use the thread_limit set for the current target task if exists, else go
2020      // with the deduced nthreads
2021      nthreads = task_thread_limit > 0 && task_thread_limit < nthreads
2022                     ? task_thread_limit
2023                     : nthreads;
2024      // Check if we need to take forkjoin lock? (no need for serialized
2025      // parallel out of teams construct).
2026      if (nthreads > 1) {
2027        /* determine how many new threads we can use */
2028        __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2029        /* AC: If we execute teams from parallel region (on host), then teams
2030           should be created but each can only have 1 thread if nesting is
2031           disabled. If teams called from serial region, then teams and their
2032           threads should be created regardless of the nesting setting. */
2033        nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
2034                                         nthreads, enter_teams);
2035        if (nthreads == 1) {
2036          // Free lock for single thread execution here; for multi-thread
2037          // execution it will be freed later after team of threads created
2038          // and initialized
2039          __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2040        }
2041      }
2042    }
2043    KMP_DEBUG_ASSERT(nthreads > 0);
2044
2045    // If we temporarily changed the set number of threads then restore it now
2046    master_th->th.th_set_nproc = 0;
2047
2048    if (nthreads == 1) {
2049      return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask,
2050                                    invoker, master_th, parent_team,
2051#if OMPT_SUPPORT
2052                                    &ompt_parallel_data, &return_address,
2053                                    &parent_task_data,
2054#endif
2055                                    ap);
2056    } // if (nthreads == 1)
2057
2058    // GEH: only modify the executing flag in the case when not serialized
2059    //      serialized case is handled in kmpc_serialized_parallel
2060    KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
2061                  "curtask=%p, curtask_max_aclevel=%d\n",
2062                  parent_team->t.t_active_level, master_th,
2063                  master_th->th.th_current_task,
2064                  master_th->th.th_current_task->td_icvs.max_active_levels));
2065    // TODO: GEH - cannot do this assertion because root thread not set up as
2066    // executing
2067    // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2068    master_th->th.th_current_task->td_flags.executing = 0;
2069
2070    if (!master_th->th.th_teams_microtask || level > teams_level) {
2071      /* Increment our nested depth level */
2072      KMP_ATOMIC_INC(&root->r.r_in_parallel);
2073    }
2074
2075    // See if we need to make a copy of the ICVs.
2076    int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2077    if ((level + 1 < __kmp_nested_nth.used) &&
2078        (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
2079      nthreads_icv = __kmp_nested_nth.nth[level + 1];
2080    } else {
2081      nthreads_icv = 0; // don't update
2082    }
2083
2084    // Figure out the proc_bind_policy for the new team.
2085    kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2086    // proc_bind_default means don't update
2087    kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2088    if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2089      proc_bind = proc_bind_false;
2090    } else {
2091      // No proc_bind clause specified; use current proc-bind-var for this
2092      // parallel region
2093      if (proc_bind == proc_bind_default) {
2094        proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2095      }
2096      // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2097      if (master_th->th.th_teams_microtask &&
2098          microtask == (microtask_t)__kmp_teams_master) {
2099        proc_bind = __kmp_teams_proc_bind;
2100      }
2101      /* else: The proc_bind policy was specified explicitly on parallel clause.
2102         This overrides proc-bind-var for this parallel region, but does not
2103         change proc-bind-var. */
2104      // Figure the value of proc-bind-var for the child threads.
2105      if ((level + 1 < __kmp_nested_proc_bind.used) &&
2106          (__kmp_nested_proc_bind.bind_types[level + 1] !=
2107           master_th->th.th_current_task->td_icvs.proc_bind)) {
2108        // Do not modify the proc bind icv for the two teams construct forks
2109        // They just let the proc bind icv pass through
2110        if (!master_th->th.th_teams_microtask ||
2111            !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2112          proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2113      }
2114    }
2115
2116    // Reset for next parallel region
2117    master_th->th.th_set_proc_bind = proc_bind_default;
2118
2119    if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2120      kmp_internal_control_t new_icvs;
2121      copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2122      new_icvs.next = NULL;
2123      if (nthreads_icv > 0) {
2124        new_icvs.nproc = nthreads_icv;
2125      }
2126      if (proc_bind_icv != proc_bind_default) {
2127        new_icvs.proc_bind = proc_bind_icv;
2128      }
2129
2130      /* allocate a new parallel team */
2131      KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2132      team = __kmp_allocate_team(root, nthreads, nthreads,
2133#if OMPT_SUPPORT
2134                                 ompt_parallel_data,
2135#endif
2136                                 proc_bind, &new_icvs,
2137                                 argc USE_NESTED_HOT_ARG(master_th));
2138      if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2139        copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2140    } else {
2141      /* allocate a new parallel team */
2142      KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2143      team = __kmp_allocate_team(root, nthreads, nthreads,
2144#if OMPT_SUPPORT
2145                                 ompt_parallel_data,
2146#endif
2147                                 proc_bind,
2148                                 &master_th->th.th_current_task->td_icvs,
2149                                 argc USE_NESTED_HOT_ARG(master_th));
2150      if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2151        copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2152                  &master_th->th.th_current_task->td_icvs);
2153    }
2154    KF_TRACE(
2155        10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2156
2157    /* setup the new team */
2158    KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2159    KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2160    KMP_CHECK_UPDATE(team->t.t_ident, loc);
2161    KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2162    KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2163#if OMPT_SUPPORT
2164    KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2165                          return_address);
2166#endif
2167    KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2168    // TODO: parent_team->t.t_level == INT_MAX ???
2169    if (!master_th->th.th_teams_microtask || level > teams_level) {
2170      int new_level = parent_team->t.t_level + 1;
2171      KMP_CHECK_UPDATE(team->t.t_level, new_level);
2172      new_level = parent_team->t.t_active_level + 1;
2173      KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2174    } else {
2175      // AC: Do not increase parallel level at start of the teams construct
2176      int new_level = parent_team->t.t_level;
2177      KMP_CHECK_UPDATE(team->t.t_level, new_level);
2178      new_level = parent_team->t.t_active_level;
2179      KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2180    }
2181    kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2182    // set primary thread's schedule as new run-time schedule
2183    KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2184
2185    KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2186    KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2187
2188    // Update the floating point rounding in the team if required.
2189    propagateFPControl(team);
2190#if OMPD_SUPPORT
2191    if (ompd_state & OMPD_ENABLE_BP)
2192      ompd_bp_parallel_begin();
2193#endif
2194
2195    if (__kmp_tasking_mode != tskm_immediate_exec) {
2196      // Set primary thread's task team to team's task team. Unless this is hot
2197      // team, it should be NULL.
2198      KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2199                       parent_team->t.t_task_team[master_th->th.th_task_state]);
2200      KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2201                    "%p, new task_team %p / team %p\n",
2202                    __kmp_gtid_from_thread(master_th),
2203                    master_th->th.th_task_team, parent_team,
2204                    team->t.t_task_team[master_th->th.th_task_state], team));
2205
2206      if (active_level || master_th->th.th_task_team) {
2207        // Take a memo of primary thread's task_state
2208        KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2209        if (master_th->th.th_task_state_top >=
2210            master_th->th.th_task_state_stack_sz) { // increase size
2211          kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2212          kmp_uint8 *old_stack, *new_stack;
2213          kmp_uint32 i;
2214          new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2215          for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2216            new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2217          }
2218          for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2219               ++i) { // zero-init rest of stack
2220            new_stack[i] = 0;
2221          }
2222          old_stack = master_th->th.th_task_state_memo_stack;
2223          master_th->th.th_task_state_memo_stack = new_stack;
2224          master_th->th.th_task_state_stack_sz = new_size;
2225          __kmp_free(old_stack);
2226        }
2227        // Store primary thread's task_state on stack
2228        master_th->th
2229            .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2230            master_th->th.th_task_state;
2231        master_th->th.th_task_state_top++;
2232#if KMP_NESTED_HOT_TEAMS
2233        if (master_th->th.th_hot_teams &&
2234            active_level < __kmp_hot_teams_max_level &&
2235            team == master_th->th.th_hot_teams[active_level].hot_team) {
2236          // Restore primary thread's nested state if nested hot team
2237          master_th->th.th_task_state =
2238              master_th->th
2239                  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2240        } else {
2241#endif
2242          master_th->th.th_task_state = 0;
2243#if KMP_NESTED_HOT_TEAMS
2244        }
2245#endif
2246      }
2247#if !KMP_NESTED_HOT_TEAMS
2248      KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2249                       (team == root->r.r_hot_team));
2250#endif
2251    }
2252
2253    KA_TRACE(
2254        20,
2255        ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2256         gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2257         team->t.t_nproc));
2258    KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2259                     (team->t.t_master_tid == 0 &&
2260                      (team->t.t_parent == root->r.r_root_team ||
2261                       team->t.t_parent->t.t_serialized)));
2262    KMP_MB();
2263
2264    /* now, setup the arguments */
2265    argv = (void **)team->t.t_argv;
2266    if (ap) {
2267      for (i = argc - 1; i >= 0; --i) {
2268        void *new_argv = va_arg(kmp_va_deref(ap), void *);
2269        KMP_CHECK_UPDATE(*argv, new_argv);
2270        argv++;
2271      }
2272    } else {
2273      for (i = 0; i < argc; ++i) {
2274        // Get args from parent team for teams construct
2275        KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2276      }
2277    }
2278
2279    /* now actually fork the threads */
2280    KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2281    if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2282      root->r.r_active = TRUE;
2283
2284    __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2285    __kmp_setup_icv_copy(team, nthreads,
2286                         &master_th->th.th_current_task->td_icvs, loc);
2287
2288#if OMPT_SUPPORT
2289    master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2290#endif
2291
2292    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2293
2294#if USE_ITT_BUILD
2295    if (team->t.t_active_level == 1 // only report frames at level 1
2296        && !master_th->th.th_teams_microtask) { // not in teams construct
2297#if USE_ITT_NOTIFY
2298      if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2299          (__kmp_forkjoin_frames_mode == 3 ||
2300           __kmp_forkjoin_frames_mode == 1)) {
2301        kmp_uint64 tmp_time = 0;
2302        if (__itt_get_timestamp_ptr)
2303          tmp_time = __itt_get_timestamp();
2304        // Internal fork - report frame begin
2305        master_th->th.th_frame_time = tmp_time;
2306        if (__kmp_forkjoin_frames_mode == 3)
2307          team->t.t_region_time = tmp_time;
2308      } else
2309// only one notification scheme (either "submit" or "forking/joined", not both)
2310#endif /* USE_ITT_NOTIFY */
2311        if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2312            __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2313          // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2314          __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2315        }
2316    }
2317#endif /* USE_ITT_BUILD */
2318
2319    /* now go on and do the work */
2320    KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2321    KMP_MB();
2322    KF_TRACE(10,
2323             ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2324              root, team, master_th, gtid));
2325
2326#if USE_ITT_BUILD
2327    if (__itt_stack_caller_create_ptr) {
2328      // create new stack stitching id before entering fork barrier
2329      if (!enter_teams) {
2330        KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2331        team->t.t_stack_id = __kmp_itt_stack_caller_create();
2332      } else if (parent_team->t.t_serialized) {
2333        // keep stack stitching id in the serialized parent_team;
2334        // current team will be used for parallel inside the teams;
2335        // if parent_team is active, then it already keeps stack stitching id
2336        // for the league of teams
2337        KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2338        parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2339      }
2340    }
2341#endif /* USE_ITT_BUILD */
2342
2343    // AC: skip __kmp_internal_fork at teams construct, let only primary
2344    // threads execute
2345    if (ap) {
2346      __kmp_internal_fork(loc, gtid, team);
2347      KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2348                    "master_th=%p, gtid=%d\n",
2349                    root, team, master_th, gtid));
2350    }
2351
2352    if (call_context == fork_context_gnu) {
2353      KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2354      return TRUE;
2355    }
2356
2357    /* Invoke microtask for PRIMARY thread */
2358    KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2359                  team->t.t_id, team->t.t_pkfn));
2360  } // END of timer KMP_fork_call block
2361
2362#if KMP_STATS_ENABLED
2363  // If beginning a teams construct, then change thread state
2364  stats_state_e previous_state = KMP_GET_THREAD_STATE();
2365  if (!ap) {
2366    KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2367  }
2368#endif
2369
2370  if (!team->t.t_invoke(gtid)) {
2371    KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2372  }
2373
2374#if KMP_STATS_ENABLED
2375  // If was beginning of a teams construct, then reset thread state
2376  if (!ap) {
2377    KMP_SET_THREAD_STATE(previous_state);
2378  }
2379#endif
2380
2381  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2382                team->t.t_id, team->t.t_pkfn));
2383  KMP_MB(); /* Flush all pending memory write invalidates.  */
2384
2385  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2386#if OMPT_SUPPORT
2387  if (ompt_enabled.enabled) {
2388    master_th->th.ompt_thread_info.state = ompt_state_overhead;
2389  }
2390#endif
2391
2392  return TRUE;
2393}
2394
2395#if OMPT_SUPPORT
2396static inline void __kmp_join_restore_state(kmp_info_t *thread,
2397                                            kmp_team_t *team) {
2398  // restore state outside the region
2399  thread->th.ompt_thread_info.state =
2400      ((team->t.t_serialized) ? ompt_state_work_serial
2401                              : ompt_state_work_parallel);
2402}
2403
2404static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2405                                   kmp_team_t *team, ompt_data_t *parallel_data,
2406                                   int flags, void *codeptr) {
2407  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2408  if (ompt_enabled.ompt_callback_parallel_end) {
2409    ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2410        parallel_data, &(task_info->task_data), flags, codeptr);
2411  }
2412
2413  task_info->frame.enter_frame = ompt_data_none;
2414  __kmp_join_restore_state(thread, team);
2415}
2416#endif
2417
2418void __kmp_join_call(ident_t *loc, int gtid
2419#if OMPT_SUPPORT
2420                     ,
2421                     enum fork_context_e fork_context
2422#endif
2423                     ,
2424                     int exit_teams) {
2425  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2426  kmp_team_t *team;
2427  kmp_team_t *parent_team;
2428  kmp_info_t *master_th;
2429  kmp_root_t *root;
2430  int master_active;
2431
2432  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2433
2434  /* setup current data */
2435  master_th = __kmp_threads[gtid];
2436  root = master_th->th.th_root;
2437  team = master_th->th.th_team;
2438  parent_team = team->t.t_parent;
2439
2440  master_th->th.th_ident = loc;
2441
2442#if OMPT_SUPPORT
2443  void *team_microtask = (void *)team->t.t_pkfn;
2444  // For GOMP interface with serialized parallel, need the
2445  // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2446  // and end-parallel events.
2447  if (ompt_enabled.enabled &&
2448      !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2449    master_th->th.ompt_thread_info.state = ompt_state_overhead;
2450  }
2451#endif
2452
2453#if KMP_DEBUG
2454  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2455    KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2456                  "th_task_team = %p\n",
2457                  __kmp_gtid_from_thread(master_th), team,
2458                  team->t.t_task_team[master_th->th.th_task_state],
2459                  master_th->th.th_task_team));
2460    KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2461                     team->t.t_task_team[master_th->th.th_task_state]);
2462  }
2463#endif
2464
2465  if (team->t.t_serialized) {
2466    if (master_th->th.th_teams_microtask) {
2467      // We are in teams construct
2468      int level = team->t.t_level;
2469      int tlevel = master_th->th.th_teams_level;
2470      if (level == tlevel) {
2471        // AC: we haven't incremented it earlier at start of teams construct,
2472        //     so do it here - at the end of teams construct
2473        team->t.t_level++;
2474      } else if (level == tlevel + 1) {
2475        // AC: we are exiting parallel inside teams, need to increment
2476        // serialization in order to restore it in the next call to
2477        // __kmpc_end_serialized_parallel
2478        team->t.t_serialized++;
2479      }
2480    }
2481    __kmpc_end_serialized_parallel(loc, gtid);
2482
2483#if OMPT_SUPPORT
2484    if (ompt_enabled.enabled) {
2485      if (fork_context == fork_context_gnu) {
2486        __ompt_lw_taskteam_unlink(master_th);
2487      }
2488      __kmp_join_restore_state(master_th, parent_team);
2489    }
2490#endif
2491
2492    return;
2493  }
2494
2495  master_active = team->t.t_master_active;
2496
2497  if (!exit_teams) {
2498    // AC: No barrier for internal teams at exit from teams construct.
2499    //     But there is barrier for external team (league).
2500    __kmp_internal_join(loc, gtid, team);
2501#if USE_ITT_BUILD
2502    if (__itt_stack_caller_create_ptr) {
2503      KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2504      // destroy the stack stitching id after join barrier
2505      __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2506      team->t.t_stack_id = NULL;
2507    }
2508#endif
2509  } else {
2510    master_th->th.th_task_state =
2511        0; // AC: no tasking in teams (out of any parallel)
2512#if USE_ITT_BUILD
2513    if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2514      KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2515      // destroy the stack stitching id on exit from the teams construct
2516      // if parent_team is active, then the id will be destroyed later on
2517      // by master of the league of teams
2518      __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2519      parent_team->t.t_stack_id = NULL;
2520    }
2521#endif
2522  }
2523
2524  KMP_MB();
2525
2526#if OMPT_SUPPORT
2527  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2528  void *codeptr = team->t.ompt_team_info.master_return_address;
2529#endif
2530
2531#if USE_ITT_BUILD
2532  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2533  if (team->t.t_active_level == 1 &&
2534      (!master_th->th.th_teams_microtask || /* not in teams construct */
2535       master_th->th.th_teams_size.nteams == 1)) {
2536    master_th->th.th_ident = loc;
2537    // only one notification scheme (either "submit" or "forking/joined", not
2538    // both)
2539    if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2540        __kmp_forkjoin_frames_mode == 3)
2541      __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2542                             master_th->th.th_frame_time, 0, loc,
2543                             master_th->th.th_team_nproc, 1);
2544    else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2545             !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2546      __kmp_itt_region_joined(gtid);
2547  } // active_level == 1
2548#endif /* USE_ITT_BUILD */
2549
2550#if KMP_AFFINITY_SUPPORTED
2551  if (!exit_teams) {
2552    // Restore master thread's partition.
2553    master_th->th.th_first_place = team->t.t_first_place;
2554    master_th->th.th_last_place = team->t.t_last_place;
2555  }
2556#endif // KMP_AFFINITY_SUPPORTED
2557
2558  if (master_th->th.th_teams_microtask && !exit_teams &&
2559      team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2560      team->t.t_level == master_th->th.th_teams_level + 1) {
2561// AC: We need to leave the team structure intact at the end of parallel
2562// inside the teams construct, so that at the next parallel same (hot) team
2563// works, only adjust nesting levels
2564#if OMPT_SUPPORT
2565    ompt_data_t ompt_parallel_data = ompt_data_none;
2566    if (ompt_enabled.enabled) {
2567      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2568      if (ompt_enabled.ompt_callback_implicit_task) {
2569        int ompt_team_size = team->t.t_nproc;
2570        ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2571            ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2572            OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2573      }
2574      task_info->frame.exit_frame = ompt_data_none;
2575      task_info->task_data = ompt_data_none;
2576      ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2577      __ompt_lw_taskteam_unlink(master_th);
2578    }
2579#endif
2580    /* Decrement our nested depth level */
2581    team->t.t_level--;
2582    team->t.t_active_level--;
2583    KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2584
2585    // Restore number of threads in the team if needed. This code relies on
2586    // the proper adjustment of th_teams_size.nth after the fork in
2587    // __kmp_teams_master on each teams primary thread in the case that
2588    // __kmp_reserve_threads reduced it.
2589    if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2590      int old_num = master_th->th.th_team_nproc;
2591      int new_num = master_th->th.th_teams_size.nth;
2592      kmp_info_t **other_threads = team->t.t_threads;
2593      team->t.t_nproc = new_num;
2594      for (int i = 0; i < old_num; ++i) {
2595        other_threads[i]->th.th_team_nproc = new_num;
2596      }
2597      // Adjust states of non-used threads of the team
2598      for (int i = old_num; i < new_num; ++i) {
2599        // Re-initialize thread's barrier data.
2600        KMP_DEBUG_ASSERT(other_threads[i]);
2601        kmp_balign_t *balign = other_threads[i]->th.th_bar;
2602        for (int b = 0; b < bs_last_barrier; ++b) {
2603          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2604          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2605#if USE_DEBUGGER
2606          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2607#endif
2608        }
2609        if (__kmp_tasking_mode != tskm_immediate_exec) {
2610          // Synchronize thread's task state
2611          other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2612        }
2613      }
2614    }
2615
2616#if OMPT_SUPPORT
2617    if (ompt_enabled.enabled) {
2618      __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2619                      OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2620    }
2621#endif
2622
2623    return;
2624  }
2625
2626  /* do cleanup and restore the parent team */
2627  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2628  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2629
2630  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2631
2632  /* jc: The following lock has instructions with REL and ACQ semantics,
2633     separating the parallel user code called in this parallel region
2634     from the serial user code called after this function returns. */
2635  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2636
2637  if (!master_th->th.th_teams_microtask ||
2638      team->t.t_level > master_th->th.th_teams_level) {
2639    /* Decrement our nested depth level */
2640    KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2641  }
2642  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2643
2644#if OMPT_SUPPORT
2645  if (ompt_enabled.enabled) {
2646    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2647    if (ompt_enabled.ompt_callback_implicit_task) {
2648      int flags = (team_microtask == (void *)__kmp_teams_master)
2649                      ? ompt_task_initial
2650                      : ompt_task_implicit;
2651      int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2652      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2653          ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2654          OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2655    }
2656    task_info->frame.exit_frame = ompt_data_none;
2657    task_info->task_data = ompt_data_none;
2658  }
2659#endif
2660
2661  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2662                master_th, team));
2663  __kmp_pop_current_task_from_thread(master_th);
2664
2665  master_th->th.th_def_allocator = team->t.t_def_allocator;
2666
2667#if OMPD_SUPPORT
2668  if (ompd_state & OMPD_ENABLE_BP)
2669    ompd_bp_parallel_end();
2670#endif
2671  updateHWFPControl(team);
2672
2673  if (root->r.r_active != master_active)
2674    root->r.r_active = master_active;
2675
2676  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2677                            master_th)); // this will free worker threads
2678
2679  /* this race was fun to find. make sure the following is in the critical
2680     region otherwise assertions may fail occasionally since the old team may be
2681     reallocated and the hierarchy appears inconsistent. it is actually safe to
2682     run and won't cause any bugs, but will cause those assertion failures. it's
2683     only one deref&assign so might as well put this in the critical region */
2684  master_th->th.th_team = parent_team;
2685  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2686  master_th->th.th_team_master = parent_team->t.t_threads[0];
2687  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2688
2689  /* restore serialized team, if need be */
2690  if (parent_team->t.t_serialized &&
2691      parent_team != master_th->th.th_serial_team &&
2692      parent_team != root->r.r_root_team) {
2693    __kmp_free_team(root,
2694                    master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2695    master_th->th.th_serial_team = parent_team;
2696  }
2697
2698  if (__kmp_tasking_mode != tskm_immediate_exec) {
2699    if (master_th->th.th_task_state_top >
2700        0) { // Restore task state from memo stack
2701      KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2702      // Remember primary thread's state if we re-use this nested hot team
2703      master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2704          master_th->th.th_task_state;
2705      --master_th->th.th_task_state_top; // pop
2706      // Now restore state at this level
2707      master_th->th.th_task_state =
2708          master_th->th
2709              .th_task_state_memo_stack[master_th->th.th_task_state_top];
2710    } else if (team != root->r.r_hot_team) {
2711      // Reset the task state of primary thread if we are not hot team because
2712      // in this case all the worker threads will be free, and their task state
2713      // will be reset. If not reset the primary's, the task state will be
2714      // inconsistent.
2715      master_th->th.th_task_state = 0;
2716    }
2717    // Copy the task team from the parent team to the primary thread
2718    master_th->th.th_task_team =
2719        parent_team->t.t_task_team[master_th->th.th_task_state];
2720    KA_TRACE(20,
2721             ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2722              __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2723              parent_team));
2724  }
2725
2726  // TODO: GEH - cannot do this assertion because root thread not set up as
2727  // executing
2728  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2729  master_th->th.th_current_task->td_flags.executing = 1;
2730
2731  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2732
2733#if KMP_AFFINITY_SUPPORTED
2734  if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
2735    __kmp_reset_root_init_mask(gtid);
2736  }
2737#endif
2738#if OMPT_SUPPORT
2739  int flags =
2740      OMPT_INVOKER(fork_context) |
2741      ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2742                                                      : ompt_parallel_team);
2743  if (ompt_enabled.enabled) {
2744    __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2745                    codeptr);
2746  }
2747#endif
2748
2749  KMP_MB();
2750  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2751}
2752
2753/* Check whether we should push an internal control record onto the
2754   serial team stack.  If so, do it.  */
2755void __kmp_save_internal_controls(kmp_info_t *thread) {
2756
2757  if (thread->th.th_team != thread->th.th_serial_team) {
2758    return;
2759  }
2760  if (thread->th.th_team->t.t_serialized > 1) {
2761    int push = 0;
2762
2763    if (thread->th.th_team->t.t_control_stack_top == NULL) {
2764      push = 1;
2765    } else {
2766      if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2767          thread->th.th_team->t.t_serialized) {
2768        push = 1;
2769      }
2770    }
2771    if (push) { /* push a record on the serial team's stack */
2772      kmp_internal_control_t *control =
2773          (kmp_internal_control_t *)__kmp_allocate(
2774              sizeof(kmp_internal_control_t));
2775
2776      copy_icvs(control, &thread->th.th_current_task->td_icvs);
2777
2778      control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2779
2780      control->next = thread->th.th_team->t.t_control_stack_top;
2781      thread->th.th_team->t.t_control_stack_top = control;
2782    }
2783  }
2784}
2785
2786/* Changes set_nproc */
2787void __kmp_set_num_threads(int new_nth, int gtid) {
2788  kmp_info_t *thread;
2789  kmp_root_t *root;
2790
2791  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2792  KMP_DEBUG_ASSERT(__kmp_init_serial);
2793
2794  if (new_nth < 1)
2795    new_nth = 1;
2796  else if (new_nth > __kmp_max_nth)
2797    new_nth = __kmp_max_nth;
2798
2799  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2800  thread = __kmp_threads[gtid];
2801  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2802    return; // nothing to do
2803
2804  __kmp_save_internal_controls(thread);
2805
2806  set__nproc(thread, new_nth);
2807
2808  // If this omp_set_num_threads() call will cause the hot team size to be
2809  // reduced (in the absence of a num_threads clause), then reduce it now,
2810  // rather than waiting for the next parallel region.
2811  root = thread->th.th_root;
2812  if (__kmp_init_parallel && (!root->r.r_active) &&
2813      (root->r.r_hot_team->t.t_nproc > new_nth)
2814#if KMP_NESTED_HOT_TEAMS
2815      && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2816#endif
2817  ) {
2818    kmp_team_t *hot_team = root->r.r_hot_team;
2819    int f;
2820
2821    __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2822
2823    if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2824      __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2825    }
2826    // Release the extra threads we don't need any more.
2827    for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2828      KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2829      if (__kmp_tasking_mode != tskm_immediate_exec) {
2830        // When decreasing team size, threads no longer in the team should unref
2831        // task team.
2832        hot_team->t.t_threads[f]->th.th_task_team = NULL;
2833      }
2834      __kmp_free_thread(hot_team->t.t_threads[f]);
2835      hot_team->t.t_threads[f] = NULL;
2836    }
2837    hot_team->t.t_nproc = new_nth;
2838#if KMP_NESTED_HOT_TEAMS
2839    if (thread->th.th_hot_teams) {
2840      KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2841      thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2842    }
2843#endif
2844
2845    if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2846      hot_team->t.b->update_num_threads(new_nth);
2847      __kmp_add_threads_to_team(hot_team, new_nth);
2848    }
2849
2850    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2851
2852    // Update the t_nproc field in the threads that are still active.
2853    for (f = 0; f < new_nth; f++) {
2854      KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2855      hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2856    }
2857    // Special flag in case omp_set_num_threads() call
2858    hot_team->t.t_size_changed = -1;
2859  }
2860}
2861
2862/* Changes max_active_levels */
2863void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2864  kmp_info_t *thread;
2865
2866  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2867                "%d = (%d)\n",
2868                gtid, max_active_levels));
2869  KMP_DEBUG_ASSERT(__kmp_init_serial);
2870
2871  // validate max_active_levels
2872  if (max_active_levels < 0) {
2873    KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2874    // We ignore this call if the user has specified a negative value.
2875    // The current setting won't be changed. The last valid setting will be
2876    // used. A warning will be issued (if warnings are allowed as controlled by
2877    // the KMP_WARNINGS env var).
2878    KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2879                  "max_active_levels for thread %d = (%d)\n",
2880                  gtid, max_active_levels));
2881    return;
2882  }
2883  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2884    // it's OK, the max_active_levels is within the valid range: [ 0;
2885    // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2886    // We allow a zero value. (implementation defined behavior)
2887  } else {
2888    KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2889                KMP_MAX_ACTIVE_LEVELS_LIMIT);
2890    max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2891    // Current upper limit is MAX_INT. (implementation defined behavior)
2892    // If the input exceeds the upper limit, we correct the input to be the
2893    // upper limit. (implementation defined behavior)
2894    // Actually, the flow should never get here until we use MAX_INT limit.
2895  }
2896  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2897                "max_active_levels for thread %d = (%d)\n",
2898                gtid, max_active_levels));
2899
2900  thread = __kmp_threads[gtid];
2901
2902  __kmp_save_internal_controls(thread);
2903
2904  set__max_active_levels(thread, max_active_levels);
2905}
2906
2907/* Gets max_active_levels */
2908int __kmp_get_max_active_levels(int gtid) {
2909  kmp_info_t *thread;
2910
2911  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2912  KMP_DEBUG_ASSERT(__kmp_init_serial);
2913
2914  thread = __kmp_threads[gtid];
2915  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2916  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2917                "curtask_maxaclevel=%d\n",
2918                gtid, thread->th.th_current_task,
2919                thread->th.th_current_task->td_icvs.max_active_levels));
2920  return thread->th.th_current_task->td_icvs.max_active_levels;
2921}
2922
2923// nteams-var per-device ICV
2924void __kmp_set_num_teams(int num_teams) {
2925  if (num_teams > 0)
2926    __kmp_nteams = num_teams;
2927}
2928int __kmp_get_max_teams(void) { return __kmp_nteams; }
2929// teams-thread-limit-var per-device ICV
2930void __kmp_set_teams_thread_limit(int limit) {
2931  if (limit > 0)
2932    __kmp_teams_thread_limit = limit;
2933}
2934int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
2935
2936KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2937KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2938
2939/* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2940void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2941  kmp_info_t *thread;
2942  kmp_sched_t orig_kind;
2943  //    kmp_team_t *team;
2944
2945  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2946                gtid, (int)kind, chunk));
2947  KMP_DEBUG_ASSERT(__kmp_init_serial);
2948
2949  // Check if the kind parameter is valid, correct if needed.
2950  // Valid parameters should fit in one of two intervals - standard or extended:
2951  //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2952  // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2953  orig_kind = kind;
2954  kind = __kmp_sched_without_mods(kind);
2955
2956  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2957      (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2958    // TODO: Hint needs attention in case we change the default schedule.
2959    __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2960              KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2961              __kmp_msg_null);
2962    kind = kmp_sched_default;
2963    chunk = 0; // ignore chunk value in case of bad kind
2964  }
2965
2966  thread = __kmp_threads[gtid];
2967
2968  __kmp_save_internal_controls(thread);
2969
2970  if (kind < kmp_sched_upper_std) {
2971    if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2972      // differ static chunked vs. unchunked:  chunk should be invalid to
2973      // indicate unchunked schedule (which is the default)
2974      thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2975    } else {
2976      thread->th.th_current_task->td_icvs.sched.r_sched_type =
2977          __kmp_sch_map[kind - kmp_sched_lower - 1];
2978    }
2979  } else {
2980    //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2981    //    kmp_sched_lower - 2 ];
2982    thread->th.th_current_task->td_icvs.sched.r_sched_type =
2983        __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2984                      kmp_sched_lower - 2];
2985  }
2986  __kmp_sched_apply_mods_intkind(
2987      orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2988  if (kind == kmp_sched_auto || chunk < 1) {
2989    // ignore parameter chunk for schedule auto
2990    thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2991  } else {
2992    thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2993  }
2994}
2995
2996/* Gets def_sched_var ICV values */
2997void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2998  kmp_info_t *thread;
2999  enum sched_type th_type;
3000
3001  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
3002  KMP_DEBUG_ASSERT(__kmp_init_serial);
3003
3004  thread = __kmp_threads[gtid];
3005
3006  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
3007  switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
3008  case kmp_sch_static:
3009  case kmp_sch_static_greedy:
3010  case kmp_sch_static_balanced:
3011    *kind = kmp_sched_static;
3012    __kmp_sched_apply_mods_stdkind(kind, th_type);
3013    *chunk = 0; // chunk was not set, try to show this fact via zero value
3014    return;
3015  case kmp_sch_static_chunked:
3016    *kind = kmp_sched_static;
3017    break;
3018  case kmp_sch_dynamic_chunked:
3019    *kind = kmp_sched_dynamic;
3020    break;
3021  case kmp_sch_guided_chunked:
3022  case kmp_sch_guided_iterative_chunked:
3023  case kmp_sch_guided_analytical_chunked:
3024    *kind = kmp_sched_guided;
3025    break;
3026  case kmp_sch_auto:
3027    *kind = kmp_sched_auto;
3028    break;
3029  case kmp_sch_trapezoidal:
3030    *kind = kmp_sched_trapezoidal;
3031    break;
3032#if KMP_STATIC_STEAL_ENABLED
3033  case kmp_sch_static_steal:
3034    *kind = kmp_sched_static_steal;
3035    break;
3036#endif
3037  default:
3038    KMP_FATAL(UnknownSchedulingType, th_type);
3039  }
3040
3041  __kmp_sched_apply_mods_stdkind(kind, th_type);
3042  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
3043}
3044
3045int __kmp_get_ancestor_thread_num(int gtid, int level) {
3046
3047  int ii, dd;
3048  kmp_team_t *team;
3049  kmp_info_t *thr;
3050
3051  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
3052  KMP_DEBUG_ASSERT(__kmp_init_serial);
3053
3054  // validate level
3055  if (level == 0)
3056    return 0;
3057  if (level < 0)
3058    return -1;
3059  thr = __kmp_threads[gtid];
3060  team = thr->th.th_team;
3061  ii = team->t.t_level;
3062  if (level > ii)
3063    return -1;
3064
3065  if (thr->th.th_teams_microtask) {
3066    // AC: we are in teams region where multiple nested teams have same level
3067    int tlevel = thr->th.th_teams_level; // the level of the teams construct
3068    if (level <=
3069        tlevel) { // otherwise usual algorithm works (will not touch the teams)
3070      KMP_DEBUG_ASSERT(ii >= tlevel);
3071      // AC: As we need to pass by the teams league, we need to artificially
3072      // increase ii
3073      if (ii == tlevel) {
3074        ii += 2; // three teams have same level
3075      } else {
3076        ii++; // two teams have same level
3077      }
3078    }
3079  }
3080
3081  if (ii == level)
3082    return __kmp_tid_from_gtid(gtid);
3083
3084  dd = team->t.t_serialized;
3085  level++;
3086  while (ii > level) {
3087    for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3088    }
3089    if ((team->t.t_serialized) && (!dd)) {
3090      team = team->t.t_parent;
3091      continue;
3092    }
3093    if (ii > level) {
3094      team = team->t.t_parent;
3095      dd = team->t.t_serialized;
3096      ii--;
3097    }
3098  }
3099
3100  return (dd > 1) ? (0) : (team->t.t_master_tid);
3101}
3102
3103int __kmp_get_team_size(int gtid, int level) {
3104
3105  int ii, dd;
3106  kmp_team_t *team;
3107  kmp_info_t *thr;
3108
3109  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3110  KMP_DEBUG_ASSERT(__kmp_init_serial);
3111
3112  // validate level
3113  if (level == 0)
3114    return 1;
3115  if (level < 0)
3116    return -1;
3117  thr = __kmp_threads[gtid];
3118  team = thr->th.th_team;
3119  ii = team->t.t_level;
3120  if (level > ii)
3121    return -1;
3122
3123  if (thr->th.th_teams_microtask) {
3124    // AC: we are in teams region where multiple nested teams have same level
3125    int tlevel = thr->th.th_teams_level; // the level of the teams construct
3126    if (level <=
3127        tlevel) { // otherwise usual algorithm works (will not touch the teams)
3128      KMP_DEBUG_ASSERT(ii >= tlevel);
3129      // AC: As we need to pass by the teams league, we need to artificially
3130      // increase ii
3131      if (ii == tlevel) {
3132        ii += 2; // three teams have same level
3133      } else {
3134        ii++; // two teams have same level
3135      }
3136    }
3137  }
3138
3139  while (ii > level) {
3140    for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3141    }
3142    if (team->t.t_serialized && (!dd)) {
3143      team = team->t.t_parent;
3144      continue;
3145    }
3146    if (ii > level) {
3147      team = team->t.t_parent;
3148      ii--;
3149    }
3150  }
3151
3152  return team->t.t_nproc;
3153}
3154
3155kmp_r_sched_t __kmp_get_schedule_global() {
3156  // This routine created because pairs (__kmp_sched, __kmp_chunk) and
3157  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3158  // independently. So one can get the updated schedule here.
3159
3160  kmp_r_sched_t r_sched;
3161
3162  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3163  // __kmp_guided. __kmp_sched should keep original value, so that user can set
3164  // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3165  // different roots (even in OMP 2.5)
3166  enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3167  enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3168  if (s == kmp_sch_static) {
3169    // replace STATIC with more detailed schedule (balanced or greedy)
3170    r_sched.r_sched_type = __kmp_static;
3171  } else if (s == kmp_sch_guided_chunked) {
3172    // replace GUIDED with more detailed schedule (iterative or analytical)
3173    r_sched.r_sched_type = __kmp_guided;
3174  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3175    r_sched.r_sched_type = __kmp_sched;
3176  }
3177  SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3178
3179  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3180    // __kmp_chunk may be wrong here (if it was not ever set)
3181    r_sched.chunk = KMP_DEFAULT_CHUNK;
3182  } else {
3183    r_sched.chunk = __kmp_chunk;
3184  }
3185
3186  return r_sched;
3187}
3188
3189/* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE)
3190   at least argc number of *t_argv entries for the requested team. */
3191static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3192
3193  KMP_DEBUG_ASSERT(team);
3194  if (!realloc || argc > team->t.t_max_argc) {
3195
3196    KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3197                   "current entries=%d\n",
3198                   team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3199    /* if previously allocated heap space for args, free them */
3200    if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3201      __kmp_free((void *)team->t.t_argv);
3202
3203    if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3204      /* use unused space in the cache line for arguments */
3205      team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3206      KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3207                     "argv entries\n",
3208                     team->t.t_id, team->t.t_max_argc));
3209      team->t.t_argv = &team->t.t_inline_argv[0];
3210      if (__kmp_storage_map) {
3211        __kmp_print_storage_map_gtid(
3212            -1, &team->t.t_inline_argv[0],
3213            &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3214            (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3215            team->t.t_id);
3216      }
3217    } else {
3218      /* allocate space for arguments in the heap */
3219      team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3220                               ? KMP_MIN_MALLOC_ARGV_ENTRIES
3221                               : 2 * argc;
3222      KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3223                     "argv entries\n",
3224                     team->t.t_id, team->t.t_max_argc));
3225      team->t.t_argv =
3226          (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3227      if (__kmp_storage_map) {
3228        __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3229                                     &team->t.t_argv[team->t.t_max_argc],
3230                                     sizeof(void *) * team->t.t_max_argc,
3231                                     "team_%d.t_argv", team->t.t_id);
3232      }
3233    }
3234  }
3235}
3236
3237static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3238  int i;
3239  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3240  team->t.t_threads =
3241      (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3242  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3243      sizeof(dispatch_shared_info_t) * num_disp_buff);
3244  team->t.t_dispatch =
3245      (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3246  team->t.t_implicit_task_taskdata =
3247      (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3248  team->t.t_max_nproc = max_nth;
3249
3250  /* setup dispatch buffers */
3251  for (i = 0; i < num_disp_buff; ++i) {
3252    team->t.t_disp_buffer[i].buffer_index = i;
3253    team->t.t_disp_buffer[i].doacross_buf_idx = i;
3254  }
3255}
3256
3257static void __kmp_free_team_arrays(kmp_team_t *team) {
3258  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3259  int i;
3260  for (i = 0; i < team->t.t_max_nproc; ++i) {
3261    if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3262      __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3263      team->t.t_dispatch[i].th_disp_buffer = NULL;
3264    }
3265  }
3266#if KMP_USE_HIER_SCHED
3267  __kmp_dispatch_free_hierarchies(team);
3268#endif
3269  __kmp_free(team->t.t_threads);
3270  __kmp_free(team->t.t_disp_buffer);
3271  __kmp_free(team->t.t_dispatch);
3272  __kmp_free(team->t.t_implicit_task_taskdata);
3273  team->t.t_threads = NULL;
3274  team->t.t_disp_buffer = NULL;
3275  team->t.t_dispatch = NULL;
3276  team->t.t_implicit_task_taskdata = 0;
3277}
3278
3279static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3280  kmp_info_t **oldThreads = team->t.t_threads;
3281
3282  __kmp_free(team->t.t_disp_buffer);
3283  __kmp_free(team->t.t_dispatch);
3284  __kmp_free(team->t.t_implicit_task_taskdata);
3285  __kmp_allocate_team_arrays(team, max_nth);
3286
3287  KMP_MEMCPY(team->t.t_threads, oldThreads,
3288             team->t.t_nproc * sizeof(kmp_info_t *));
3289
3290  __kmp_free(oldThreads);
3291}
3292
3293static kmp_internal_control_t __kmp_get_global_icvs(void) {
3294
3295  kmp_r_sched_t r_sched =
3296      __kmp_get_schedule_global(); // get current state of scheduling globals
3297
3298  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3299
3300  kmp_internal_control_t g_icvs = {
3301    0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3302    (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3303    // adjustment of threads (per thread)
3304    (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3305    // whether blocktime is explicitly set
3306    __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3307#if KMP_USE_MONITOR
3308    __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3309// intervals
3310#endif
3311    __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3312    // next parallel region (per thread)
3313    // (use a max ub on value if __kmp_parallel_initialize not called yet)
3314    __kmp_cg_max_nth, // int thread_limit;
3315    __kmp_task_max_nth, // int task_thread_limit; // to set the thread_limit
3316    // on task. This is used in the case of target thread_limit
3317    __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3318    // for max_active_levels
3319    r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3320    // {sched,chunk} pair
3321    __kmp_nested_proc_bind.bind_types[0],
3322    __kmp_default_device,
3323    NULL // struct kmp_internal_control *next;
3324  };
3325
3326  return g_icvs;
3327}
3328
3329static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3330
3331  kmp_internal_control_t gx_icvs;
3332  gx_icvs.serial_nesting_level =
3333      0; // probably =team->t.t_serial like in save_inter_controls
3334  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3335  gx_icvs.next = NULL;
3336
3337  return gx_icvs;
3338}
3339
3340static void __kmp_initialize_root(kmp_root_t *root) {
3341  int f;
3342  kmp_team_t *root_team;
3343  kmp_team_t *hot_team;
3344  int hot_team_max_nth;
3345  kmp_r_sched_t r_sched =
3346      __kmp_get_schedule_global(); // get current state of scheduling globals
3347  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3348  KMP_DEBUG_ASSERT(root);
3349  KMP_ASSERT(!root->r.r_begin);
3350
3351  /* setup the root state structure */
3352  __kmp_init_lock(&root->r.r_begin_lock);
3353  root->r.r_begin = FALSE;
3354  root->r.r_active = FALSE;
3355  root->r.r_in_parallel = 0;
3356  root->r.r_blocktime = __kmp_dflt_blocktime;
3357#if KMP_AFFINITY_SUPPORTED
3358  root->r.r_affinity_assigned = FALSE;
3359#endif
3360
3361  /* setup the root team for this task */
3362  /* allocate the root team structure */
3363  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3364
3365  root_team =
3366      __kmp_allocate_team(root,
3367                          1, // new_nproc
3368                          1, // max_nproc
3369#if OMPT_SUPPORT
3370                          ompt_data_none, // root parallel id
3371#endif
3372                          __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3373                          0 // argc
3374                          USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3375      );
3376#if USE_DEBUGGER
3377  // Non-NULL value should be assigned to make the debugger display the root
3378  // team.
3379  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3380#endif
3381
3382  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3383
3384  root->r.r_root_team = root_team;
3385  root_team->t.t_control_stack_top = NULL;
3386
3387  /* initialize root team */
3388  root_team->t.t_threads[0] = NULL;
3389  root_team->t.t_nproc = 1;
3390  root_team->t.t_serialized = 1;
3391  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3392  root_team->t.t_sched.sched = r_sched.sched;
3393  KA_TRACE(
3394      20,
3395      ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3396       root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3397
3398  /* setup the  hot team for this task */
3399  /* allocate the hot team structure */
3400  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3401
3402  hot_team =
3403      __kmp_allocate_team(root,
3404                          1, // new_nproc
3405                          __kmp_dflt_team_nth_ub * 2, // max_nproc
3406#if OMPT_SUPPORT
3407                          ompt_data_none, // root parallel id
3408#endif
3409                          __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3410                          0 // argc
3411                          USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3412      );
3413  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3414
3415  root->r.r_hot_team = hot_team;
3416  root_team->t.t_control_stack_top = NULL;
3417
3418  /* first-time initialization */
3419  hot_team->t.t_parent = root_team;
3420
3421  /* initialize hot team */
3422  hot_team_max_nth = hot_team->t.t_max_nproc;
3423  for (f = 0; f < hot_team_max_nth; ++f) {
3424    hot_team->t.t_threads[f] = NULL;
3425  }
3426  hot_team->t.t_nproc = 1;
3427  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3428  hot_team->t.t_sched.sched = r_sched.sched;
3429  hot_team->t.t_size_changed = 0;
3430}
3431
3432#ifdef KMP_DEBUG
3433
3434typedef struct kmp_team_list_item {
3435  kmp_team_p const *entry;
3436  struct kmp_team_list_item *next;
3437} kmp_team_list_item_t;
3438typedef kmp_team_list_item_t *kmp_team_list_t;
3439
3440static void __kmp_print_structure_team_accum( // Add team to list of teams.
3441    kmp_team_list_t list, // List of teams.
3442    kmp_team_p const *team // Team to add.
3443) {
3444
3445  // List must terminate with item where both entry and next are NULL.
3446  // Team is added to the list only once.
3447  // List is sorted in ascending order by team id.
3448  // Team id is *not* a key.
3449
3450  kmp_team_list_t l;
3451
3452  KMP_DEBUG_ASSERT(list != NULL);
3453  if (team == NULL) {
3454    return;
3455  }
3456
3457  __kmp_print_structure_team_accum(list, team->t.t_parent);
3458  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3459
3460  // Search list for the team.
3461  l = list;
3462  while (l->next != NULL && l->entry != team) {
3463    l = l->next;
3464  }
3465  if (l->next != NULL) {
3466    return; // Team has been added before, exit.
3467  }
3468
3469  // Team is not found. Search list again for insertion point.
3470  l = list;
3471  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3472    l = l->next;
3473  }
3474
3475  // Insert team.
3476  {
3477    kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3478        sizeof(kmp_team_list_item_t));
3479    *item = *l;
3480    l->entry = team;
3481    l->next = item;
3482  }
3483}
3484
3485static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3486
3487) {
3488  __kmp_printf("%s", title);
3489  if (team != NULL) {
3490    __kmp_printf("%2x %p\n", team->t.t_id, team);
3491  } else {
3492    __kmp_printf(" - (nil)\n");
3493  }
3494}
3495
3496static void __kmp_print_structure_thread(char const *title,
3497                                         kmp_info_p const *thread) {
3498  __kmp_printf("%s", title);
3499  if (thread != NULL) {
3500    __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3501  } else {
3502    __kmp_printf(" - (nil)\n");
3503  }
3504}
3505
3506void __kmp_print_structure(void) {
3507
3508  kmp_team_list_t list;
3509
3510  // Initialize list of teams.
3511  list =
3512      (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3513  list->entry = NULL;
3514  list->next = NULL;
3515
3516  __kmp_printf("\n------------------------------\nGlobal Thread "
3517               "Table\n------------------------------\n");
3518  {
3519    int gtid;
3520    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3521      __kmp_printf("%2d", gtid);
3522      if (__kmp_threads != NULL) {
3523        __kmp_printf(" %p", __kmp_threads[gtid]);
3524      }
3525      if (__kmp_root != NULL) {
3526        __kmp_printf(" %p", __kmp_root[gtid]);
3527      }
3528      __kmp_printf("\n");
3529    }
3530  }
3531
3532  // Print out __kmp_threads array.
3533  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3534               "----------\n");
3535  if (__kmp_threads != NULL) {
3536    int gtid;
3537    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3538      kmp_info_t const *thread = __kmp_threads[gtid];
3539      if (thread != NULL) {
3540        __kmp_printf("GTID %2d %p:\n", gtid, thread);
3541        __kmp_printf("    Our Root:        %p\n", thread->th.th_root);
3542        __kmp_print_structure_team("    Our Team:     ", thread->th.th_team);
3543        __kmp_print_structure_team("    Serial Team:  ",
3544                                   thread->th.th_serial_team);
3545        __kmp_printf("    Threads:      %2d\n", thread->th.th_team_nproc);
3546        __kmp_print_structure_thread("    Primary:      ",
3547                                     thread->th.th_team_master);
3548        __kmp_printf("    Serialized?:  %2d\n", thread->th.th_team_serialized);
3549        __kmp_printf("    Set NProc:    %2d\n", thread->th.th_set_nproc);
3550        __kmp_printf("    Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3551        __kmp_print_structure_thread("    Next in pool: ",
3552                                     thread->th.th_next_pool);
3553        __kmp_printf("\n");
3554        __kmp_print_structure_team_accum(list, thread->th.th_team);
3555        __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3556      }
3557    }
3558  } else {
3559    __kmp_printf("Threads array is not allocated.\n");
3560  }
3561
3562  // Print out __kmp_root array.
3563  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3564               "--------\n");
3565  if (__kmp_root != NULL) {
3566    int gtid;
3567    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3568      kmp_root_t const *root = __kmp_root[gtid];
3569      if (root != NULL) {
3570        __kmp_printf("GTID %2d %p:\n", gtid, root);
3571        __kmp_print_structure_team("    Root Team:    ", root->r.r_root_team);
3572        __kmp_print_structure_team("    Hot Team:     ", root->r.r_hot_team);
3573        __kmp_print_structure_thread("    Uber Thread:  ",
3574                                     root->r.r_uber_thread);
3575        __kmp_printf("    Active?:      %2d\n", root->r.r_active);
3576        __kmp_printf("    In Parallel:  %2d\n",
3577                     KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3578        __kmp_printf("\n");
3579        __kmp_print_structure_team_accum(list, root->r.r_root_team);
3580        __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3581      }
3582    }
3583  } else {
3584    __kmp_printf("Ubers array is not allocated.\n");
3585  }
3586
3587  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3588               "--------\n");
3589  while (list->next != NULL) {
3590    kmp_team_p const *team = list->entry;
3591    int i;
3592    __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3593    __kmp_print_structure_team("    Parent Team:      ", team->t.t_parent);
3594    __kmp_printf("    Primary TID:      %2d\n", team->t.t_master_tid);
3595    __kmp_printf("    Max threads:      %2d\n", team->t.t_max_nproc);
3596    __kmp_printf("    Levels of serial: %2d\n", team->t.t_serialized);
3597    __kmp_printf("    Number threads:   %2d\n", team->t.t_nproc);
3598    for (i = 0; i < team->t.t_nproc; ++i) {
3599      __kmp_printf("    Thread %2d:      ", i);
3600      __kmp_print_structure_thread("", team->t.t_threads[i]);
3601    }
3602    __kmp_print_structure_team("    Next in pool:     ", team->t.t_next_pool);
3603    __kmp_printf("\n");
3604    list = list->next;
3605  }
3606
3607  // Print out __kmp_thread_pool and __kmp_team_pool.
3608  __kmp_printf("\n------------------------------\nPools\n----------------------"
3609               "--------\n");
3610  __kmp_print_structure_thread("Thread pool:          ",
3611                               CCAST(kmp_info_t *, __kmp_thread_pool));
3612  __kmp_print_structure_team("Team pool:            ",
3613                             CCAST(kmp_team_t *, __kmp_team_pool));
3614  __kmp_printf("\n");
3615
3616  // Free team list.
3617  while (list != NULL) {
3618    kmp_team_list_item_t *item = list;
3619    list = list->next;
3620    KMP_INTERNAL_FREE(item);
3621  }
3622}
3623
3624#endif
3625
3626//---------------------------------------------------------------------------
3627//  Stuff for per-thread fast random number generator
3628//  Table of primes
3629static const unsigned __kmp_primes[] = {
3630    0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3631    0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3632    0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3633    0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3634    0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3635    0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3636    0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3637    0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3638    0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3639    0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3640    0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3641
3642//---------------------------------------------------------------------------
3643//  __kmp_get_random: Get a random number using a linear congruential method.
3644unsigned short __kmp_get_random(kmp_info_t *thread) {
3645  unsigned x = thread->th.th_x;
3646  unsigned short r = (unsigned short)(x >> 16);
3647
3648  thread->th.th_x = x * thread->th.th_a + 1;
3649
3650  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3651                thread->th.th_info.ds.ds_tid, r));
3652
3653  return r;
3654}
3655//--------------------------------------------------------
3656// __kmp_init_random: Initialize a random number generator
3657void __kmp_init_random(kmp_info_t *thread) {
3658  unsigned seed = thread->th.th_info.ds.ds_tid;
3659
3660  thread->th.th_a =
3661      __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3662  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3663  KA_TRACE(30,
3664           ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3665}
3666
3667#if KMP_OS_WINDOWS
3668/* reclaim array entries for root threads that are already dead, returns number
3669 * reclaimed */
3670static int __kmp_reclaim_dead_roots(void) {
3671  int i, r = 0;
3672
3673  for (i = 0; i < __kmp_threads_capacity; ++i) {
3674    if (KMP_UBER_GTID(i) &&
3675        !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3676        !__kmp_root[i]
3677             ->r.r_active) { // AC: reclaim only roots died in non-active state
3678      r += __kmp_unregister_root_other_thread(i);
3679    }
3680  }
3681  return r;
3682}
3683#endif
3684
3685/* This function attempts to create free entries in __kmp_threads and
3686   __kmp_root, and returns the number of free entries generated.
3687
3688   For Windows* OS static library, the first mechanism used is to reclaim array
3689   entries for root threads that are already dead.
3690
3691   On all platforms, expansion is attempted on the arrays __kmp_threads_ and
3692   __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3693   capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3694   threadprivate cache array has been created. Synchronization with
3695   __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3696
3697   After any dead root reclamation, if the clipping value allows array expansion
3698   to result in the generation of a total of nNeed free slots, the function does
3699   that expansion. If not, nothing is done beyond the possible initial root
3700   thread reclamation.
3701
3702   If any argument is negative, the behavior is undefined. */
3703static int __kmp_expand_threads(int nNeed) {
3704  int added = 0;
3705  int minimumRequiredCapacity;
3706  int newCapacity;
3707  kmp_info_t **newThreads;
3708  kmp_root_t **newRoot;
3709
3710  // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3711  // resizing __kmp_threads does not need additional protection if foreign
3712  // threads are present
3713
3714#if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3715  /* only for Windows static library */
3716  /* reclaim array entries for root threads that are already dead */
3717  added = __kmp_reclaim_dead_roots();
3718
3719  if (nNeed) {
3720    nNeed -= added;
3721    if (nNeed < 0)
3722      nNeed = 0;
3723  }
3724#endif
3725  if (nNeed <= 0)
3726    return added;
3727
3728  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3729  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3730  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3731  // > __kmp_max_nth in one of two ways:
3732  //
3733  // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3734  //    may not be reused by another thread, so we may need to increase
3735  //    __kmp_threads_capacity to __kmp_max_nth + 1.
3736  //
3737  // 2) New foreign root(s) are encountered.  We always register new foreign
3738  //    roots. This may cause a smaller # of threads to be allocated at
3739  //    subsequent parallel regions, but the worker threads hang around (and
3740  //    eventually go to sleep) and need slots in the __kmp_threads[] array.
3741  //
3742  // Anyway, that is the reason for moving the check to see if
3743  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3744  // instead of having it performed here. -BB
3745
3746  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3747
3748  /* compute expansion headroom to check if we can expand */
3749  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3750    /* possible expansion too small -- give up */
3751    return added;
3752  }
3753  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3754
3755  newCapacity = __kmp_threads_capacity;
3756  do {
3757    newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3758                                                          : __kmp_sys_max_nth;
3759  } while (newCapacity < minimumRequiredCapacity);
3760  newThreads = (kmp_info_t **)__kmp_allocate(
3761      (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3762  newRoot =
3763      (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3764  KMP_MEMCPY(newThreads, __kmp_threads,
3765             __kmp_threads_capacity * sizeof(kmp_info_t *));
3766  KMP_MEMCPY(newRoot, __kmp_root,
3767             __kmp_threads_capacity * sizeof(kmp_root_t *));
3768  // Put old __kmp_threads array on a list. Any ongoing references to the old
3769  // list will be valid. This list is cleaned up at library shutdown.
3770  kmp_old_threads_list_t *node =
3771      (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
3772  node->threads = __kmp_threads;
3773  node->next = __kmp_old_threads_list;
3774  __kmp_old_threads_list = node;
3775
3776  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3777  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3778  added += newCapacity - __kmp_threads_capacity;
3779  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3780
3781  if (newCapacity > __kmp_tp_capacity) {
3782    __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3783    if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3784      __kmp_threadprivate_resize_cache(newCapacity);
3785    } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3786      *(volatile int *)&__kmp_tp_capacity = newCapacity;
3787    }
3788    __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3789  }
3790
3791  return added;
3792}
3793
3794/* Register the current thread as a root thread and obtain our gtid. We must
3795   have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3796   thread that calls from __kmp_do_serial_initialize() */
3797int __kmp_register_root(int initial_thread) {
3798  kmp_info_t *root_thread;
3799  kmp_root_t *root;
3800  int gtid;
3801  int capacity;
3802  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3803  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3804  KMP_MB();
3805
3806  /* 2007-03-02:
3807     If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3808     initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3809     work as expected -- it may return false (that means there is at least one
3810     empty slot in __kmp_threads array), but it is possible the only free slot
3811     is #0, which is reserved for initial thread and so cannot be used for this
3812     one. Following code workarounds this bug.
3813
3814     However, right solution seems to be not reserving slot #0 for initial
3815     thread because:
3816     (1) there is no magic in slot #0,
3817     (2) we cannot detect initial thread reliably (the first thread which does
3818        serial initialization may be not a real initial thread).
3819  */
3820  capacity = __kmp_threads_capacity;
3821  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3822    --capacity;
3823  }
3824
3825  // If it is not for initializing the hidden helper team, we need to take
3826  // __kmp_hidden_helper_threads_num out of the capacity because it is included
3827  // in __kmp_threads_capacity.
3828  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3829    capacity -= __kmp_hidden_helper_threads_num;
3830  }
3831
3832  /* see if there are too many threads */
3833  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3834    if (__kmp_tp_cached) {
3835      __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3836                  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3837                  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3838    } else {
3839      __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3840                  __kmp_msg_null);
3841    }
3842  }
3843
3844  // When hidden helper task is enabled, __kmp_threads is organized as follows:
3845  // 0: initial thread, also a regular OpenMP thread.
3846  // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3847  // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3848  // regular OpenMP threads.
3849  if (TCR_4(__kmp_init_hidden_helper_threads)) {
3850    // Find an available thread slot for hidden helper thread. Slots for hidden
3851    // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3852    for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3853                   gtid <= __kmp_hidden_helper_threads_num;
3854         gtid++)
3855      ;
3856    KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3857    KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3858                 "hidden helper thread: T#%d\n",
3859                 gtid));
3860  } else {
3861    /* find an available thread slot */
3862    // Don't reassign the zero slot since we need that to only be used by
3863    // initial thread. Slots for hidden helper threads should also be skipped.
3864    if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3865      gtid = 0;
3866    } else {
3867      for (gtid = __kmp_hidden_helper_threads_num + 1;
3868           TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3869        ;
3870    }
3871    KA_TRACE(
3872        1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3873    KMP_ASSERT(gtid < __kmp_threads_capacity);
3874  }
3875
3876  /* update global accounting */
3877  __kmp_all_nth++;
3878  TCW_4(__kmp_nth, __kmp_nth + 1);
3879
3880  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3881  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3882  if (__kmp_adjust_gtid_mode) {
3883    if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3884      if (TCR_4(__kmp_gtid_mode) != 2) {
3885        TCW_4(__kmp_gtid_mode, 2);
3886      }
3887    } else {
3888      if (TCR_4(__kmp_gtid_mode) != 1) {
3889        TCW_4(__kmp_gtid_mode, 1);
3890      }
3891    }
3892  }
3893
3894#ifdef KMP_ADJUST_BLOCKTIME
3895  /* Adjust blocktime to zero if necessary            */
3896  /* Middle initialization might not have occurred yet */
3897  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3898    if (__kmp_nth > __kmp_avail_proc) {
3899      __kmp_zero_bt = TRUE;
3900    }
3901  }
3902#endif /* KMP_ADJUST_BLOCKTIME */
3903
3904  /* setup this new hierarchy */
3905  if (!(root = __kmp_root[gtid])) {
3906    root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3907    KMP_DEBUG_ASSERT(!root->r.r_root_team);
3908  }
3909
3910#if KMP_STATS_ENABLED
3911  // Initialize stats as soon as possible (right after gtid assignment).
3912  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3913  __kmp_stats_thread_ptr->startLife();
3914  KMP_SET_THREAD_STATE(SERIAL_REGION);
3915  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3916#endif
3917  __kmp_initialize_root(root);
3918
3919  /* setup new root thread structure */
3920  if (root->r.r_uber_thread) {
3921    root_thread = root->r.r_uber_thread;
3922  } else {
3923    root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3924    if (__kmp_storage_map) {
3925      __kmp_print_thread_storage_map(root_thread, gtid);
3926    }
3927    root_thread->th.th_info.ds.ds_gtid = gtid;
3928#if OMPT_SUPPORT
3929    root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3930#endif
3931    root_thread->th.th_root = root;
3932    if (__kmp_env_consistency_check) {
3933      root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3934    }
3935#if USE_FAST_MEMORY
3936    __kmp_initialize_fast_memory(root_thread);
3937#endif /* USE_FAST_MEMORY */
3938
3939#if KMP_USE_BGET
3940    KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3941    __kmp_initialize_bget(root_thread);
3942#endif
3943    __kmp_init_random(root_thread); // Initialize random number generator
3944  }
3945
3946  /* setup the serial team held in reserve by the root thread */
3947  if (!root_thread->th.th_serial_team) {
3948    kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3949    KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3950    root_thread->th.th_serial_team = __kmp_allocate_team(
3951        root, 1, 1,
3952#if OMPT_SUPPORT
3953        ompt_data_none, // root parallel id
3954#endif
3955        proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3956  }
3957  KMP_ASSERT(root_thread->th.th_serial_team);
3958  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3959                root_thread->th.th_serial_team));
3960
3961  /* drop root_thread into place */
3962  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3963
3964  root->r.r_root_team->t.t_threads[0] = root_thread;
3965  root->r.r_hot_team->t.t_threads[0] = root_thread;
3966  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3967  // AC: the team created in reserve, not for execution (it is unused for now).
3968  root_thread->th.th_serial_team->t.t_serialized = 0;
3969  root->r.r_uber_thread = root_thread;
3970
3971  /* initialize the thread, get it ready to go */
3972  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3973  TCW_4(__kmp_init_gtid, TRUE);
3974
3975  /* prepare the primary thread for get_gtid() */
3976  __kmp_gtid_set_specific(gtid);
3977
3978#if USE_ITT_BUILD
3979  __kmp_itt_thread_name(gtid);
3980#endif /* USE_ITT_BUILD */
3981
3982#ifdef KMP_TDATA_GTID
3983  __kmp_gtid = gtid;
3984#endif
3985  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3986  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3987
3988  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3989                "plain=%u\n",
3990                gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3991                root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3992                KMP_INIT_BARRIER_STATE));
3993  { // Initialize barrier data.
3994    int b;
3995    for (b = 0; b < bs_last_barrier; ++b) {
3996      root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3997#if USE_DEBUGGER
3998      root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3999#endif
4000    }
4001  }
4002  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
4003                   KMP_INIT_BARRIER_STATE);
4004
4005#if KMP_AFFINITY_SUPPORTED
4006  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
4007  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
4008  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
4009  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
4010#endif /* KMP_AFFINITY_SUPPORTED */
4011  root_thread->th.th_def_allocator = __kmp_def_allocator;
4012  root_thread->th.th_prev_level = 0;
4013  root_thread->th.th_prev_num_threads = 1;
4014
4015  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
4016  tmp->cg_root = root_thread;
4017  tmp->cg_thread_limit = __kmp_cg_max_nth;
4018  tmp->cg_nthreads = 1;
4019  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
4020                 " cg_nthreads init to 1\n",
4021                 root_thread, tmp));
4022  tmp->up = NULL;
4023  root_thread->th.th_cg_roots = tmp;
4024
4025  __kmp_root_counter++;
4026
4027#if OMPT_SUPPORT
4028  if (!initial_thread && ompt_enabled.enabled) {
4029
4030    kmp_info_t *root_thread = ompt_get_thread();
4031
4032    ompt_set_thread_state(root_thread, ompt_state_overhead);
4033
4034    if (ompt_enabled.ompt_callback_thread_begin) {
4035      ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
4036          ompt_thread_initial, __ompt_get_thread_data_internal());
4037    }
4038    ompt_data_t *task_data;
4039    ompt_data_t *parallel_data;
4040    __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4041                                  NULL);
4042    if (ompt_enabled.ompt_callback_implicit_task) {
4043      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4044          ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
4045    }
4046
4047    ompt_set_thread_state(root_thread, ompt_state_work_serial);
4048  }
4049#endif
4050#if OMPD_SUPPORT
4051  if (ompd_state & OMPD_ENABLE_BP)
4052    ompd_bp_thread_begin();
4053#endif
4054
4055  KMP_MB();
4056  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4057
4058  return gtid;
4059}
4060
4061#if KMP_NESTED_HOT_TEAMS
4062static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
4063                                const int max_level) {
4064  int i, n, nth;
4065  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
4066  if (!hot_teams || !hot_teams[level].hot_team) {
4067    return 0;
4068  }
4069  KMP_DEBUG_ASSERT(level < max_level);
4070  kmp_team_t *team = hot_teams[level].hot_team;
4071  nth = hot_teams[level].hot_team_nth;
4072  n = nth - 1; // primary thread is not freed
4073  if (level < max_level - 1) {
4074    for (i = 0; i < nth; ++i) {
4075      kmp_info_t *th = team->t.t_threads[i];
4076      n += __kmp_free_hot_teams(root, th, level + 1, max_level);
4077      if (i > 0 && th->th.th_hot_teams) {
4078        __kmp_free(th->th.th_hot_teams);
4079        th->th.th_hot_teams = NULL;
4080      }
4081    }
4082  }
4083  __kmp_free_team(root, team, NULL);
4084  return n;
4085}
4086#endif
4087
4088// Resets a root thread and clear its root and hot teams.
4089// Returns the number of __kmp_threads entries directly and indirectly freed.
4090static int __kmp_reset_root(int gtid, kmp_root_t *root) {
4091  kmp_team_t *root_team = root->r.r_root_team;
4092  kmp_team_t *hot_team = root->r.r_hot_team;
4093  int n = hot_team->t.t_nproc;
4094  int i;
4095
4096  KMP_DEBUG_ASSERT(!root->r.r_active);
4097
4098  root->r.r_root_team = NULL;
4099  root->r.r_hot_team = NULL;
4100  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4101  // before call to __kmp_free_team().
4102  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
4103#if KMP_NESTED_HOT_TEAMS
4104  if (__kmp_hot_teams_max_level >
4105      0) { // need to free nested hot teams and their threads if any
4106    for (i = 0; i < hot_team->t.t_nproc; ++i) {
4107      kmp_info_t *th = hot_team->t.t_threads[i];
4108      if (__kmp_hot_teams_max_level > 1) {
4109        n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4110      }
4111      if (th->th.th_hot_teams) {
4112        __kmp_free(th->th.th_hot_teams);
4113        th->th.th_hot_teams = NULL;
4114      }
4115    }
4116  }
4117#endif
4118  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
4119
4120  // Before we can reap the thread, we need to make certain that all other
4121  // threads in the teams that had this root as ancestor have stopped trying to
4122  // steal tasks.
4123  if (__kmp_tasking_mode != tskm_immediate_exec) {
4124    __kmp_wait_to_unref_task_teams();
4125  }
4126
4127#if KMP_OS_WINDOWS
4128  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4129  KA_TRACE(
4130      10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4131           "\n",
4132           (LPVOID) & (root->r.r_uber_thread->th),
4133           root->r.r_uber_thread->th.th_info.ds.ds_thread));
4134  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4135#endif /* KMP_OS_WINDOWS */
4136
4137#if OMPD_SUPPORT
4138  if (ompd_state & OMPD_ENABLE_BP)
4139    ompd_bp_thread_end();
4140#endif
4141
4142#if OMPT_SUPPORT
4143  ompt_data_t *task_data;
4144  ompt_data_t *parallel_data;
4145  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4146                                NULL);
4147  if (ompt_enabled.ompt_callback_implicit_task) {
4148    ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4149        ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4150  }
4151  if (ompt_enabled.ompt_callback_thread_end) {
4152    ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4153        &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4154  }
4155#endif
4156
4157  TCW_4(__kmp_nth,
4158        __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4159  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4160  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4161                 " to %d\n",
4162                 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4163                 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4164  if (i == 1) {
4165    // need to free contention group structure
4166    KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4167                     root->r.r_uber_thread->th.th_cg_roots->cg_root);
4168    KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4169    __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4170    root->r.r_uber_thread->th.th_cg_roots = NULL;
4171  }
4172  __kmp_reap_thread(root->r.r_uber_thread, 1);
4173
4174  // We canot put root thread to __kmp_thread_pool, so we have to reap it
4175  // instead of freeing.
4176  root->r.r_uber_thread = NULL;
4177  /* mark root as no longer in use */
4178  root->r.r_begin = FALSE;
4179
4180  return n;
4181}
4182
4183void __kmp_unregister_root_current_thread(int gtid) {
4184  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4185  /* this lock should be ok, since unregister_root_current_thread is never
4186     called during an abort, only during a normal close. furthermore, if you
4187     have the forkjoin lock, you should never try to get the initz lock */
4188  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4189  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4190    KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4191                  "exiting T#%d\n",
4192                  gtid));
4193    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4194    return;
4195  }
4196  kmp_root_t *root = __kmp_root[gtid];
4197
4198  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4199  KMP_ASSERT(KMP_UBER_GTID(gtid));
4200  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4201  KMP_ASSERT(root->r.r_active == FALSE);
4202
4203  KMP_MB();
4204
4205  kmp_info_t *thread = __kmp_threads[gtid];
4206  kmp_team_t *team = thread->th.th_team;
4207  kmp_task_team_t *task_team = thread->th.th_task_team;
4208
4209  // we need to wait for the proxy tasks before finishing the thread
4210  if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4211                            task_team->tt.tt_hidden_helper_task_encountered)) {
4212#if OMPT_SUPPORT
4213    // the runtime is shutting down so we won't report any events
4214    thread->th.ompt_thread_info.state = ompt_state_undefined;
4215#endif
4216    __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4217  }
4218
4219  __kmp_reset_root(gtid, root);
4220
4221  KMP_MB();
4222  KC_TRACE(10,
4223           ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4224
4225  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4226}
4227
4228#if KMP_OS_WINDOWS
4229/* __kmp_forkjoin_lock must be already held
4230   Unregisters a root thread that is not the current thread.  Returns the number
4231   of __kmp_threads entries freed as a result. */
4232static int __kmp_unregister_root_other_thread(int gtid) {
4233  kmp_root_t *root = __kmp_root[gtid];
4234  int r;
4235
4236  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4237  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4238  KMP_ASSERT(KMP_UBER_GTID(gtid));
4239  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4240  KMP_ASSERT(root->r.r_active == FALSE);
4241
4242  r = __kmp_reset_root(gtid, root);
4243  KC_TRACE(10,
4244           ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4245  return r;
4246}
4247#endif
4248
4249#if KMP_DEBUG
4250void __kmp_task_info() {
4251
4252  kmp_int32 gtid = __kmp_entry_gtid();
4253  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4254  kmp_info_t *this_thr = __kmp_threads[gtid];
4255  kmp_team_t *steam = this_thr->th.th_serial_team;
4256  kmp_team_t *team = this_thr->th.th_team;
4257
4258  __kmp_printf(
4259      "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4260      "ptask=%p\n",
4261      gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4262      team->t.t_implicit_task_taskdata[tid].td_parent);
4263}
4264#endif // KMP_DEBUG
4265
4266/* TODO optimize with one big memclr, take out what isn't needed, split
4267   responsibility to workers as much as possible, and delay initialization of
4268   features as much as possible  */
4269static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4270                                  int tid, int gtid) {
4271  /* this_thr->th.th_info.ds.ds_gtid is setup in
4272     kmp_allocate_thread/create_worker.
4273     this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4274  KMP_DEBUG_ASSERT(this_thr != NULL);
4275  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4276  KMP_DEBUG_ASSERT(team);
4277  KMP_DEBUG_ASSERT(team->t.t_threads);
4278  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4279  kmp_info_t *master = team->t.t_threads[0];
4280  KMP_DEBUG_ASSERT(master);
4281  KMP_DEBUG_ASSERT(master->th.th_root);
4282
4283  KMP_MB();
4284
4285  TCW_SYNC_PTR(this_thr->th.th_team, team);
4286
4287  this_thr->th.th_info.ds.ds_tid = tid;
4288  this_thr->th.th_set_nproc = 0;
4289  if (__kmp_tasking_mode != tskm_immediate_exec)
4290    // When tasking is possible, threads are not safe to reap until they are
4291    // done tasking; this will be set when tasking code is exited in wait
4292    this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4293  else // no tasking --> always safe to reap
4294    this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4295  this_thr->th.th_set_proc_bind = proc_bind_default;
4296#if KMP_AFFINITY_SUPPORTED
4297  this_thr->th.th_new_place = this_thr->th.th_current_place;
4298#endif
4299  this_thr->th.th_root = master->th.th_root;
4300
4301  /* setup the thread's cache of the team structure */
4302  this_thr->th.th_team_nproc = team->t.t_nproc;
4303  this_thr->th.th_team_master = master;
4304  this_thr->th.th_team_serialized = team->t.t_serialized;
4305
4306  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4307
4308  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4309                tid, gtid, this_thr, this_thr->th.th_current_task));
4310
4311  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4312                           team, tid, TRUE);
4313
4314  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4315                tid, gtid, this_thr, this_thr->th.th_current_task));
4316  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4317  // __kmp_initialize_team()?
4318
4319  /* TODO no worksharing in speculative threads */
4320  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4321
4322  this_thr->th.th_local.this_construct = 0;
4323
4324  if (!this_thr->th.th_pri_common) {
4325    this_thr->th.th_pri_common =
4326        (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4327    if (__kmp_storage_map) {
4328      __kmp_print_storage_map_gtid(
4329          gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4330          sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4331    }
4332    this_thr->th.th_pri_head = NULL;
4333  }
4334
4335  if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4336      this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4337    // Make new thread's CG root same as primary thread's
4338    KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4339    kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4340    if (tmp) {
4341      // worker changes CG, need to check if old CG should be freed
4342      int i = tmp->cg_nthreads--;
4343      KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4344                     " on node %p of thread %p to %d\n",
4345                     this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4346      if (i == 1) {
4347        __kmp_free(tmp); // last thread left CG --> free it
4348      }
4349    }
4350    this_thr->th.th_cg_roots = master->th.th_cg_roots;
4351    // Increment new thread's CG root's counter to add the new thread
4352    this_thr->th.th_cg_roots->cg_nthreads++;
4353    KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4354                   " node %p of thread %p to %d\n",
4355                   this_thr, this_thr->th.th_cg_roots,
4356                   this_thr->th.th_cg_roots->cg_root,
4357                   this_thr->th.th_cg_roots->cg_nthreads));
4358    this_thr->th.th_current_task->td_icvs.thread_limit =
4359        this_thr->th.th_cg_roots->cg_thread_limit;
4360  }
4361
4362  /* Initialize dynamic dispatch */
4363  {
4364    volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4365    // Use team max_nproc since this will never change for the team.
4366    size_t disp_size =
4367        sizeof(dispatch_private_info_t) *
4368        (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4369    KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4370                  team->t.t_max_nproc));
4371    KMP_ASSERT(dispatch);
4372    KMP_DEBUG_ASSERT(team->t.t_dispatch);
4373    KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4374
4375    dispatch->th_disp_index = 0;
4376    dispatch->th_doacross_buf_idx = 0;
4377    if (!dispatch->th_disp_buffer) {
4378      dispatch->th_disp_buffer =
4379          (dispatch_private_info_t *)__kmp_allocate(disp_size);
4380
4381      if (__kmp_storage_map) {
4382        __kmp_print_storage_map_gtid(
4383            gtid, &dispatch->th_disp_buffer[0],
4384            &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4385                                          ? 1
4386                                          : __kmp_dispatch_num_buffers],
4387            disp_size,
4388            "th_%d.th_dispatch.th_disp_buffer "
4389            "(team_%d.t_dispatch[%d].th_disp_buffer)",
4390            gtid, team->t.t_id, gtid);
4391      }
4392    } else {
4393      memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4394    }
4395
4396    dispatch->th_dispatch_pr_current = 0;
4397    dispatch->th_dispatch_sh_current = 0;
4398
4399    dispatch->th_deo_fcn = 0; /* ORDERED     */
4400    dispatch->th_dxo_fcn = 0; /* END ORDERED */
4401  }
4402
4403  this_thr->th.th_next_pool = NULL;
4404
4405  if (!this_thr->th.th_task_state_memo_stack) {
4406    size_t i;
4407    this_thr->th.th_task_state_memo_stack =
4408        (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4409    this_thr->th.th_task_state_top = 0;
4410    this_thr->th.th_task_state_stack_sz = 4;
4411    for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4412         ++i) // zero init the stack
4413      this_thr->th.th_task_state_memo_stack[i] = 0;
4414  }
4415
4416  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4417  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4418
4419  KMP_MB();
4420}
4421
4422/* allocate a new thread for the requesting team. this is only called from
4423   within a forkjoin critical section. we will first try to get an available
4424   thread from the thread pool. if none is available, we will fork a new one
4425   assuming we are able to create a new one. this should be assured, as the
4426   caller should check on this first. */
4427kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4428                                  int new_tid) {
4429  kmp_team_t *serial_team;
4430  kmp_info_t *new_thr;
4431  int new_gtid;
4432
4433  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4434  KMP_DEBUG_ASSERT(root && team);
4435#if !KMP_NESTED_HOT_TEAMS
4436  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4437#endif
4438  KMP_MB();
4439
4440  /* first, try to get one from the thread pool */
4441  if (__kmp_thread_pool) {
4442    new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4443    __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4444    if (new_thr == __kmp_thread_pool_insert_pt) {
4445      __kmp_thread_pool_insert_pt = NULL;
4446    }
4447    TCW_4(new_thr->th.th_in_pool, FALSE);
4448    __kmp_suspend_initialize_thread(new_thr);
4449    __kmp_lock_suspend_mx(new_thr);
4450    if (new_thr->th.th_active_in_pool == TRUE) {
4451      KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4452      KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4453      new_thr->th.th_active_in_pool = FALSE;
4454    }
4455    __kmp_unlock_suspend_mx(new_thr);
4456
4457    KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4458                  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4459    KMP_ASSERT(!new_thr->th.th_team);
4460    KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4461
4462    /* setup the thread structure */
4463    __kmp_initialize_info(new_thr, team, new_tid,
4464                          new_thr->th.th_info.ds.ds_gtid);
4465    KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4466
4467    TCW_4(__kmp_nth, __kmp_nth + 1);
4468
4469    new_thr->th.th_task_state = 0;
4470    new_thr->th.th_task_state_top = 0;
4471    new_thr->th.th_task_state_stack_sz = 4;
4472
4473    if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4474      // Make sure pool thread has transitioned to waiting on own thread struct
4475      KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4476      // Thread activated in __kmp_allocate_team when increasing team size
4477    }
4478
4479#ifdef KMP_ADJUST_BLOCKTIME
4480    /* Adjust blocktime back to zero if necessary */
4481    /* Middle initialization might not have occurred yet */
4482    if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4483      if (__kmp_nth > __kmp_avail_proc) {
4484        __kmp_zero_bt = TRUE;
4485      }
4486    }
4487#endif /* KMP_ADJUST_BLOCKTIME */
4488
4489#if KMP_DEBUG
4490    // If thread entered pool via __kmp_free_thread, wait_flag should !=
4491    // KMP_BARRIER_PARENT_FLAG.
4492    int b;
4493    kmp_balign_t *balign = new_thr->th.th_bar;
4494    for (b = 0; b < bs_last_barrier; ++b)
4495      KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4496#endif
4497
4498    KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4499                  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4500
4501    KMP_MB();
4502    return new_thr;
4503  }
4504
4505  /* no, well fork a new one */
4506  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4507  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4508
4509#if KMP_USE_MONITOR
4510  // If this is the first worker thread the RTL is creating, then also
4511  // launch the monitor thread.  We try to do this as early as possible.
4512  if (!TCR_4(__kmp_init_monitor)) {
4513    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4514    if (!TCR_4(__kmp_init_monitor)) {
4515      KF_TRACE(10, ("before __kmp_create_monitor\n"));
4516      TCW_4(__kmp_init_monitor, 1);
4517      __kmp_create_monitor(&__kmp_monitor);
4518      KF_TRACE(10, ("after __kmp_create_monitor\n"));
4519#if KMP_OS_WINDOWS
4520      // AC: wait until monitor has started. This is a fix for CQ232808.
4521      // The reason is that if the library is loaded/unloaded in a loop with
4522      // small (parallel) work in between, then there is high probability that
4523      // monitor thread started after the library shutdown. At shutdown it is
4524      // too late to cope with the problem, because when the primary thread is
4525      // in DllMain (process detach) the monitor has no chances to start (it is
4526      // blocked), and primary thread has no means to inform the monitor that
4527      // the library has gone, because all the memory which the monitor can
4528      // access is going to be released/reset.
4529      while (TCR_4(__kmp_init_monitor) < 2) {
4530        KMP_YIELD(TRUE);
4531      }
4532      KF_TRACE(10, ("after monitor thread has started\n"));
4533#endif
4534    }
4535    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4536  }
4537#endif
4538
4539  KMP_MB();
4540
4541  {
4542    int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4543                             ? 1
4544                             : __kmp_hidden_helper_threads_num + 1;
4545
4546    for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4547         ++new_gtid) {
4548      KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4549    }
4550
4551    if (TCR_4(__kmp_init_hidden_helper_threads)) {
4552      KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4553    }
4554  }
4555
4556  /* allocate space for it. */
4557  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4558
4559  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4560
4561#if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4562  // suppress race conditions detection on synchronization flags in debug mode
4563  // this helps to analyze library internals eliminating false positives
4564  __itt_suppress_mark_range(
4565      __itt_suppress_range, __itt_suppress_threading_errors,
4566      &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4567  __itt_suppress_mark_range(
4568      __itt_suppress_range, __itt_suppress_threading_errors,
4569      &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4570#if KMP_OS_WINDOWS
4571  __itt_suppress_mark_range(
4572      __itt_suppress_range, __itt_suppress_threading_errors,
4573      &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4574#else
4575  __itt_suppress_mark_range(__itt_suppress_range,
4576                            __itt_suppress_threading_errors,
4577                            &new_thr->th.th_suspend_init_count,
4578                            sizeof(new_thr->th.th_suspend_init_count));
4579#endif
4580  // TODO: check if we need to also suppress b_arrived flags
4581  __itt_suppress_mark_range(__itt_suppress_range,
4582                            __itt_suppress_threading_errors,
4583                            CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4584                            sizeof(new_thr->th.th_bar[0].bb.b_go));
4585  __itt_suppress_mark_range(__itt_suppress_range,
4586                            __itt_suppress_threading_errors,
4587                            CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4588                            sizeof(new_thr->th.th_bar[1].bb.b_go));
4589  __itt_suppress_mark_range(__itt_suppress_range,
4590                            __itt_suppress_threading_errors,
4591                            CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4592                            sizeof(new_thr->th.th_bar[2].bb.b_go));
4593#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4594  if (__kmp_storage_map) {
4595    __kmp_print_thread_storage_map(new_thr, new_gtid);
4596  }
4597
4598  // add the reserve serialized team, initialized from the team's primary thread
4599  {
4600    kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4601    KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4602    new_thr->th.th_serial_team = serial_team =
4603        (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4604#if OMPT_SUPPORT
4605                                          ompt_data_none, // root parallel id
4606#endif
4607                                          proc_bind_default, &r_icvs,
4608                                          0 USE_NESTED_HOT_ARG(NULL));
4609  }
4610  KMP_ASSERT(serial_team);
4611  serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4612  // execution (it is unused for now).
4613  serial_team->t.t_threads[0] = new_thr;
4614  KF_TRACE(10,
4615           ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4616            new_thr));
4617
4618  /* setup the thread structures */
4619  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4620
4621#if USE_FAST_MEMORY
4622  __kmp_initialize_fast_memory(new_thr);
4623#endif /* USE_FAST_MEMORY */
4624
4625#if KMP_USE_BGET
4626  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4627  __kmp_initialize_bget(new_thr);
4628#endif
4629
4630  __kmp_init_random(new_thr); // Initialize random number generator
4631
4632  /* Initialize these only once when thread is grabbed for a team allocation */
4633  KA_TRACE(20,
4634           ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4635            __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4636
4637  int b;
4638  kmp_balign_t *balign = new_thr->th.th_bar;
4639  for (b = 0; b < bs_last_barrier; ++b) {
4640    balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4641    balign[b].bb.team = NULL;
4642    balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4643    balign[b].bb.use_oncore_barrier = 0;
4644  }
4645
4646  TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4647  new_thr->th.th_sleep_loc_type = flag_unset;
4648
4649  new_thr->th.th_spin_here = FALSE;
4650  new_thr->th.th_next_waiting = 0;
4651#if KMP_OS_UNIX
4652  new_thr->th.th_blocking = false;
4653#endif
4654
4655#if KMP_AFFINITY_SUPPORTED
4656  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4657  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4658  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4659  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4660#endif
4661  new_thr->th.th_def_allocator = __kmp_def_allocator;
4662  new_thr->th.th_prev_level = 0;
4663  new_thr->th.th_prev_num_threads = 1;
4664
4665  TCW_4(new_thr->th.th_in_pool, FALSE);
4666  new_thr->th.th_active_in_pool = FALSE;
4667  TCW_4(new_thr->th.th_active, TRUE);
4668
4669  /* adjust the global counters */
4670  __kmp_all_nth++;
4671  __kmp_nth++;
4672
4673  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4674  // numbers of procs, and method #2 (keyed API call) for higher numbers.
4675  if (__kmp_adjust_gtid_mode) {
4676    if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4677      if (TCR_4(__kmp_gtid_mode) != 2) {
4678        TCW_4(__kmp_gtid_mode, 2);
4679      }
4680    } else {
4681      if (TCR_4(__kmp_gtid_mode) != 1) {
4682        TCW_4(__kmp_gtid_mode, 1);
4683      }
4684    }
4685  }
4686
4687#ifdef KMP_ADJUST_BLOCKTIME
4688  /* Adjust blocktime back to zero if necessary       */
4689  /* Middle initialization might not have occurred yet */
4690  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4691    if (__kmp_nth > __kmp_avail_proc) {
4692      __kmp_zero_bt = TRUE;
4693    }
4694  }
4695#endif /* KMP_ADJUST_BLOCKTIME */
4696
4697#if KMP_AFFINITY_SUPPORTED
4698  // Set the affinity and topology information for new thread
4699  __kmp_affinity_set_init_mask(new_gtid, /*isa_root=*/FALSE);
4700#endif
4701
4702  /* actually fork it and create the new worker thread */
4703  KF_TRACE(
4704      10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4705  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4706  KF_TRACE(10,
4707           ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4708
4709  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4710                new_gtid));
4711  KMP_MB();
4712  return new_thr;
4713}
4714
4715/* Reinitialize team for reuse.
4716   The hot team code calls this case at every fork barrier, so EPCC barrier
4717   test are extremely sensitive to changes in it, esp. writes to the team
4718   struct, which cause a cache invalidation in all threads.
4719   IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4720static void __kmp_reinitialize_team(kmp_team_t *team,
4721                                    kmp_internal_control_t *new_icvs,
4722                                    ident_t *loc) {
4723  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4724                team->t.t_threads[0], team));
4725  KMP_DEBUG_ASSERT(team && new_icvs);
4726  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4727  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4728
4729  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4730  // Copy ICVs to the primary thread's implicit taskdata
4731  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4732  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4733
4734  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4735                team->t.t_threads[0], team));
4736}
4737
4738/* Initialize the team data structure.
4739   This assumes the t_threads and t_max_nproc are already set.
4740   Also, we don't touch the arguments */
4741static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4742                                  kmp_internal_control_t *new_icvs,
4743                                  ident_t *loc) {
4744  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4745
4746  /* verify */
4747  KMP_DEBUG_ASSERT(team);
4748  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4749  KMP_DEBUG_ASSERT(team->t.t_threads);
4750  KMP_MB();
4751
4752  team->t.t_master_tid = 0; /* not needed */
4753  /* team->t.t_master_bar;        not needed */
4754  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4755  team->t.t_nproc = new_nproc;
4756
4757  /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4758  team->t.t_next_pool = NULL;
4759  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4760   * up hot team */
4761
4762  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4763  team->t.t_invoke = NULL; /* not needed */
4764
4765  // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4766  team->t.t_sched.sched = new_icvs->sched.sched;
4767
4768#if KMP_ARCH_X86 || KMP_ARCH_X86_64
4769  team->t.t_fp_control_saved = FALSE; /* not needed */
4770  team->t.t_x87_fpu_control_word = 0; /* not needed */
4771  team->t.t_mxcsr = 0; /* not needed */
4772#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4773
4774  team->t.t_construct = 0;
4775
4776  team->t.t_ordered.dt.t_value = 0;
4777  team->t.t_master_active = FALSE;
4778
4779#ifdef KMP_DEBUG
4780  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4781#endif
4782#if KMP_OS_WINDOWS
4783  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4784#endif
4785
4786  team->t.t_control_stack_top = NULL;
4787
4788  __kmp_reinitialize_team(team, new_icvs, loc);
4789
4790  KMP_MB();
4791  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4792}
4793
4794#if KMP_AFFINITY_SUPPORTED
4795static inline void __kmp_set_thread_place(kmp_team_t *team, kmp_info_t *th,
4796                                          int first, int last, int newp) {
4797  th->th.th_first_place = first;
4798  th->th.th_last_place = last;
4799  th->th.th_new_place = newp;
4800  if (newp != th->th.th_current_place) {
4801    if (__kmp_display_affinity && team->t.t_display_affinity != 1)
4802      team->t.t_display_affinity = 1;
4803    // Copy topology information associated with the new place
4804    th->th.th_topology_ids = __kmp_affinity.ids[th->th.th_new_place];
4805    th->th.th_topology_attrs = __kmp_affinity.attrs[th->th.th_new_place];
4806  }
4807}
4808
4809// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4810// It calculates the worker + primary thread's partition based upon the parent
4811// thread's partition, and binds each worker to a thread in their partition.
4812// The primary thread's partition should already include its current binding.
4813static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4814  // Do not partition places for the hidden helper team
4815  if (KMP_HIDDEN_HELPER_TEAM(team))
4816    return;
4817  // Copy the primary thread's place partition to the team struct
4818  kmp_info_t *master_th = team->t.t_threads[0];
4819  KMP_DEBUG_ASSERT(master_th != NULL);
4820  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4821  int first_place = master_th->th.th_first_place;
4822  int last_place = master_th->th.th_last_place;
4823  int masters_place = master_th->th.th_current_place;
4824  int num_masks = __kmp_affinity.num_masks;
4825  team->t.t_first_place = first_place;
4826  team->t.t_last_place = last_place;
4827
4828  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4829                "bound to place %d partition = [%d,%d]\n",
4830                proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4831                team->t.t_id, masters_place, first_place, last_place));
4832
4833  switch (proc_bind) {
4834
4835  case proc_bind_default:
4836    // Serial teams might have the proc_bind policy set to proc_bind_default.
4837    // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4838    KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4839    break;
4840
4841  case proc_bind_primary: {
4842    int f;
4843    int n_th = team->t.t_nproc;
4844    for (f = 1; f < n_th; f++) {
4845      kmp_info_t *th = team->t.t_threads[f];
4846      KMP_DEBUG_ASSERT(th != NULL);
4847      __kmp_set_thread_place(team, th, first_place, last_place, masters_place);
4848
4849      KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4850                     "partition = [%d,%d]\n",
4851                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4852                     f, masters_place, first_place, last_place));
4853    }
4854  } break;
4855
4856  case proc_bind_close: {
4857    int f;
4858    int n_th = team->t.t_nproc;
4859    int n_places;
4860    if (first_place <= last_place) {
4861      n_places = last_place - first_place + 1;
4862    } else {
4863      n_places = num_masks - first_place + last_place + 1;
4864    }
4865    if (n_th <= n_places) {
4866      int place = masters_place;
4867      for (f = 1; f < n_th; f++) {
4868        kmp_info_t *th = team->t.t_threads[f];
4869        KMP_DEBUG_ASSERT(th != NULL);
4870
4871        if (place == last_place) {
4872          place = first_place;
4873        } else if (place == (num_masks - 1)) {
4874          place = 0;
4875        } else {
4876          place++;
4877        }
4878        __kmp_set_thread_place(team, th, first_place, last_place, place);
4879
4880        KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4881                       "partition = [%d,%d]\n",
4882                       __kmp_gtid_from_thread(team->t.t_threads[f]),
4883                       team->t.t_id, f, place, first_place, last_place));
4884      }
4885    } else {
4886      int S, rem, gap, s_count;
4887      S = n_th / n_places;
4888      s_count = 0;
4889      rem = n_th - (S * n_places);
4890      gap = rem > 0 ? n_places / rem : n_places;
4891      int place = masters_place;
4892      int gap_ct = gap;
4893      for (f = 0; f < n_th; f++) {
4894        kmp_info_t *th = team->t.t_threads[f];
4895        KMP_DEBUG_ASSERT(th != NULL);
4896
4897        __kmp_set_thread_place(team, th, first_place, last_place, place);
4898        s_count++;
4899
4900        if ((s_count == S) && rem && (gap_ct == gap)) {
4901          // do nothing, add an extra thread to place on next iteration
4902        } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4903          // we added an extra thread to this place; move to next place
4904          if (place == last_place) {
4905            place = first_place;
4906          } else if (place == (num_masks - 1)) {
4907            place = 0;
4908          } else {
4909            place++;
4910          }
4911          s_count = 0;
4912          gap_ct = 1;
4913          rem--;
4914        } else if (s_count == S) { // place full; don't add extra
4915          if (place == last_place) {
4916            place = first_place;
4917          } else if (place == (num_masks - 1)) {
4918            place = 0;
4919          } else {
4920            place++;
4921          }
4922          gap_ct++;
4923          s_count = 0;
4924        }
4925
4926        KA_TRACE(100,
4927                 ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4928                  "partition = [%d,%d]\n",
4929                  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4930                  th->th.th_new_place, first_place, last_place));
4931      }
4932      KMP_DEBUG_ASSERT(place == masters_place);
4933    }
4934  } break;
4935
4936  case proc_bind_spread: {
4937    int f;
4938    int n_th = team->t.t_nproc;
4939    int n_places;
4940    int thidx;
4941    if (first_place <= last_place) {
4942      n_places = last_place - first_place + 1;
4943    } else {
4944      n_places = num_masks - first_place + last_place + 1;
4945    }
4946    if (n_th <= n_places) {
4947      int place = -1;
4948
4949      if (n_places != num_masks) {
4950        int S = n_places / n_th;
4951        int s_count, rem, gap, gap_ct;
4952
4953        place = masters_place;
4954        rem = n_places - n_th * S;
4955        gap = rem ? n_th / rem : 1;
4956        gap_ct = gap;
4957        thidx = n_th;
4958        if (update_master_only == 1)
4959          thidx = 1;
4960        for (f = 0; f < thidx; f++) {
4961          kmp_info_t *th = team->t.t_threads[f];
4962          KMP_DEBUG_ASSERT(th != NULL);
4963
4964          int fplace = place, nplace = place;
4965          s_count = 1;
4966          while (s_count < S) {
4967            if (place == last_place) {
4968              place = first_place;
4969            } else if (place == (num_masks - 1)) {
4970              place = 0;
4971            } else {
4972              place++;
4973            }
4974            s_count++;
4975          }
4976          if (rem && (gap_ct == gap)) {
4977            if (place == last_place) {
4978              place = first_place;
4979            } else if (place == (num_masks - 1)) {
4980              place = 0;
4981            } else {
4982              place++;
4983            }
4984            rem--;
4985            gap_ct = 0;
4986          }
4987          __kmp_set_thread_place(team, th, fplace, place, nplace);
4988          gap_ct++;
4989
4990          if (place == last_place) {
4991            place = first_place;
4992          } else if (place == (num_masks - 1)) {
4993            place = 0;
4994          } else {
4995            place++;
4996          }
4997
4998          KA_TRACE(100,
4999                   ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5000                    "partition = [%d,%d], num_masks: %u\n",
5001                    __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
5002                    f, th->th.th_new_place, th->th.th_first_place,
5003                    th->th.th_last_place, num_masks));
5004        }
5005      } else {
5006        /* Having uniform space of available computation places I can create
5007           T partitions of round(P/T) size and put threads into the first
5008           place of each partition. */
5009        double current = static_cast<double>(masters_place);
5010        double spacing =
5011            (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
5012        int first, last;
5013        kmp_info_t *th;
5014
5015        thidx = n_th + 1;
5016        if (update_master_only == 1)
5017          thidx = 1;
5018        for (f = 0; f < thidx; f++) {
5019          first = static_cast<int>(current);
5020          last = static_cast<int>(current + spacing) - 1;
5021          KMP_DEBUG_ASSERT(last >= first);
5022          if (first >= n_places) {
5023            if (masters_place) {
5024              first -= n_places;
5025              last -= n_places;
5026              if (first == (masters_place + 1)) {
5027                KMP_DEBUG_ASSERT(f == n_th);
5028                first--;
5029              }
5030              if (last == masters_place) {
5031                KMP_DEBUG_ASSERT(f == (n_th - 1));
5032                last--;
5033              }
5034            } else {
5035              KMP_DEBUG_ASSERT(f == n_th);
5036              first = 0;
5037              last = 0;
5038            }
5039          }
5040          if (last >= n_places) {
5041            last = (n_places - 1);
5042          }
5043          place = first;
5044          current += spacing;
5045          if (f < n_th) {
5046            KMP_DEBUG_ASSERT(0 <= first);
5047            KMP_DEBUG_ASSERT(n_places > first);
5048            KMP_DEBUG_ASSERT(0 <= last);
5049            KMP_DEBUG_ASSERT(n_places > last);
5050            KMP_DEBUG_ASSERT(last_place >= first_place);
5051            th = team->t.t_threads[f];
5052            KMP_DEBUG_ASSERT(th);
5053            __kmp_set_thread_place(team, th, first, last, place);
5054            KA_TRACE(100,
5055                     ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5056                      "partition = [%d,%d], spacing = %.4f\n",
5057                      __kmp_gtid_from_thread(team->t.t_threads[f]),
5058                      team->t.t_id, f, th->th.th_new_place,
5059                      th->th.th_first_place, th->th.th_last_place, spacing));
5060          }
5061        }
5062      }
5063      KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5064    } else {
5065      int S, rem, gap, s_count;
5066      S = n_th / n_places;
5067      s_count = 0;
5068      rem = n_th - (S * n_places);
5069      gap = rem > 0 ? n_places / rem : n_places;
5070      int place = masters_place;
5071      int gap_ct = gap;
5072      thidx = n_th;
5073      if (update_master_only == 1)
5074        thidx = 1;
5075      for (f = 0; f < thidx; f++) {
5076        kmp_info_t *th = team->t.t_threads[f];
5077        KMP_DEBUG_ASSERT(th != NULL);
5078
5079        __kmp_set_thread_place(team, th, place, place, place);
5080        s_count++;
5081
5082        if ((s_count == S) && rem && (gap_ct == gap)) {
5083          // do nothing, add an extra thread to place on next iteration
5084        } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5085          // we added an extra thread to this place; move on to next place
5086          if (place == last_place) {
5087            place = first_place;
5088          } else if (place == (num_masks - 1)) {
5089            place = 0;
5090          } else {
5091            place++;
5092          }
5093          s_count = 0;
5094          gap_ct = 1;
5095          rem--;
5096        } else if (s_count == S) { // place is full; don't add extra thread
5097          if (place == last_place) {
5098            place = first_place;
5099          } else if (place == (num_masks - 1)) {
5100            place = 0;
5101          } else {
5102            place++;
5103          }
5104          gap_ct++;
5105          s_count = 0;
5106        }
5107
5108        KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5109                       "partition = [%d,%d]\n",
5110                       __kmp_gtid_from_thread(team->t.t_threads[f]),
5111                       team->t.t_id, f, th->th.th_new_place,
5112                       th->th.th_first_place, th->th.th_last_place));
5113      }
5114      KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5115    }
5116  } break;
5117
5118  default:
5119    break;
5120  }
5121
5122  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5123}
5124
5125#endif // KMP_AFFINITY_SUPPORTED
5126
5127/* allocate a new team data structure to use.  take one off of the free pool if
5128   available */
5129kmp_team_t *
5130__kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5131#if OMPT_SUPPORT
5132                    ompt_data_t ompt_parallel_data,
5133#endif
5134                    kmp_proc_bind_t new_proc_bind,
5135                    kmp_internal_control_t *new_icvs,
5136                    int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5137  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5138  int f;
5139  kmp_team_t *team;
5140  int use_hot_team = !root->r.r_active;
5141  int level = 0;
5142  int do_place_partition = 1;
5143
5144  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5145  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5146  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5147  KMP_MB();
5148
5149#if KMP_NESTED_HOT_TEAMS
5150  kmp_hot_team_ptr_t *hot_teams;
5151  if (master) {
5152    team = master->th.th_team;
5153    level = team->t.t_active_level;
5154    if (master->th.th_teams_microtask) { // in teams construct?
5155      if (master->th.th_teams_size.nteams > 1 &&
5156          ( // #teams > 1
5157              team->t.t_pkfn ==
5158                  (microtask_t)__kmp_teams_master || // inner fork of the teams
5159              master->th.th_teams_level <
5160                  team->t.t_level)) { // or nested parallel inside the teams
5161        ++level; // not increment if #teams==1, or for outer fork of the teams;
5162        // increment otherwise
5163      }
5164      // Do not perform the place partition if inner fork of the teams
5165      // Wait until nested parallel region encountered inside teams construct
5166      if ((master->th.th_teams_size.nteams == 1 &&
5167           master->th.th_teams_level >= team->t.t_level) ||
5168          (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5169        do_place_partition = 0;
5170    }
5171    hot_teams = master->th.th_hot_teams;
5172    if (level < __kmp_hot_teams_max_level && hot_teams &&
5173        hot_teams[level].hot_team) {
5174      // hot team has already been allocated for given level
5175      use_hot_team = 1;
5176    } else {
5177      use_hot_team = 0;
5178    }
5179  } else {
5180    // check we won't access uninitialized hot_teams, just in case
5181    KMP_DEBUG_ASSERT(new_nproc == 1);
5182  }
5183#endif
5184  // Optimization to use a "hot" team
5185  if (use_hot_team && new_nproc > 1) {
5186    KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5187#if KMP_NESTED_HOT_TEAMS
5188    team = hot_teams[level].hot_team;
5189#else
5190    team = root->r.r_hot_team;
5191#endif
5192#if KMP_DEBUG
5193    if (__kmp_tasking_mode != tskm_immediate_exec) {
5194      KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5195                    "task_team[1] = %p before reinit\n",
5196                    team->t.t_task_team[0], team->t.t_task_team[1]));
5197    }
5198#endif
5199
5200    if (team->t.t_nproc != new_nproc &&
5201        __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5202      // Distributed barrier may need a resize
5203      int old_nthr = team->t.t_nproc;
5204      __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5205    }
5206
5207    // If not doing the place partition, then reset the team's proc bind
5208    // to indicate that partitioning of all threads still needs to take place
5209    if (do_place_partition == 0)
5210      team->t.t_proc_bind = proc_bind_default;
5211    // Has the number of threads changed?
5212    /* Let's assume the most common case is that the number of threads is
5213       unchanged, and put that case first. */
5214    if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5215      KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5216      // This case can mean that omp_set_num_threads() was called and the hot
5217      // team size was already reduced, so we check the special flag
5218      if (team->t.t_size_changed == -1) {
5219        team->t.t_size_changed = 1;
5220      } else {
5221        KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5222      }
5223
5224      // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5225      kmp_r_sched_t new_sched = new_icvs->sched;
5226      // set primary thread's schedule as new run-time schedule
5227      KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5228
5229      __kmp_reinitialize_team(team, new_icvs,
5230                              root->r.r_uber_thread->th.th_ident);
5231
5232      KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5233                    team->t.t_threads[0], team));
5234      __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5235
5236#if KMP_AFFINITY_SUPPORTED
5237      if ((team->t.t_size_changed == 0) &&
5238          (team->t.t_proc_bind == new_proc_bind)) {
5239        if (new_proc_bind == proc_bind_spread) {
5240          if (do_place_partition) {
5241            // add flag to update only master for spread
5242            __kmp_partition_places(team, 1);
5243          }
5244        }
5245        KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5246                       "proc_bind = %d, partition = [%d,%d]\n",
5247                       team->t.t_id, new_proc_bind, team->t.t_first_place,
5248                       team->t.t_last_place));
5249      } else {
5250        if (do_place_partition) {
5251          KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5252          __kmp_partition_places(team);
5253        }
5254      }
5255#else
5256      KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5257#endif /* KMP_AFFINITY_SUPPORTED */
5258    } else if (team->t.t_nproc > new_nproc) {
5259      KA_TRACE(20,
5260               ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5261                new_nproc));
5262
5263      team->t.t_size_changed = 1;
5264      if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5265        // Barrier size already reduced earlier in this function
5266        // Activate team threads via th_used_in_team
5267        __kmp_add_threads_to_team(team, new_nproc);
5268      }
5269#if KMP_NESTED_HOT_TEAMS
5270      if (__kmp_hot_teams_mode == 0) {
5271        // AC: saved number of threads should correspond to team's value in this
5272        // mode, can be bigger in mode 1, when hot team has threads in reserve
5273        KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5274        hot_teams[level].hot_team_nth = new_nproc;
5275#endif // KMP_NESTED_HOT_TEAMS
5276        /* release the extra threads we don't need any more */
5277        for (f = new_nproc; f < team->t.t_nproc; f++) {
5278          KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5279          if (__kmp_tasking_mode != tskm_immediate_exec) {
5280            // When decreasing team size, threads no longer in the team should
5281            // unref task team.
5282            team->t.t_threads[f]->th.th_task_team = NULL;
5283          }
5284          __kmp_free_thread(team->t.t_threads[f]);
5285          team->t.t_threads[f] = NULL;
5286        }
5287#if KMP_NESTED_HOT_TEAMS
5288      } // (__kmp_hot_teams_mode == 0)
5289      else {
5290        // When keeping extra threads in team, switch threads to wait on own
5291        // b_go flag
5292        for (f = new_nproc; f < team->t.t_nproc; ++f) {
5293          KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5294          kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5295          for (int b = 0; b < bs_last_barrier; ++b) {
5296            if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5297              balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5298            }
5299            KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5300          }
5301        }
5302      }
5303#endif // KMP_NESTED_HOT_TEAMS
5304      team->t.t_nproc = new_nproc;
5305      // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5306      KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5307      __kmp_reinitialize_team(team, new_icvs,
5308                              root->r.r_uber_thread->th.th_ident);
5309
5310      // Update remaining threads
5311      for (f = 0; f < new_nproc; ++f) {
5312        team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5313      }
5314
5315      // restore the current task state of the primary thread: should be the
5316      // implicit task
5317      KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5318                    team->t.t_threads[0], team));
5319
5320      __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5321
5322#ifdef KMP_DEBUG
5323      for (f = 0; f < team->t.t_nproc; f++) {
5324        KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5325                         team->t.t_threads[f]->th.th_team_nproc ==
5326                             team->t.t_nproc);
5327      }
5328#endif
5329
5330      if (do_place_partition) {
5331        KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5332#if KMP_AFFINITY_SUPPORTED
5333        __kmp_partition_places(team);
5334#endif
5335      }
5336    } else { // team->t.t_nproc < new_nproc
5337
5338      KA_TRACE(20,
5339               ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5340                new_nproc));
5341      int old_nproc = team->t.t_nproc; // save old value and use to update only
5342      team->t.t_size_changed = 1;
5343
5344#if KMP_NESTED_HOT_TEAMS
5345      int avail_threads = hot_teams[level].hot_team_nth;
5346      if (new_nproc < avail_threads)
5347        avail_threads = new_nproc;
5348      kmp_info_t **other_threads = team->t.t_threads;
5349      for (f = team->t.t_nproc; f < avail_threads; ++f) {
5350        // Adjust barrier data of reserved threads (if any) of the team
5351        // Other data will be set in __kmp_initialize_info() below.
5352        int b;
5353        kmp_balign_t *balign = other_threads[f]->th.th_bar;
5354        for (b = 0; b < bs_last_barrier; ++b) {
5355          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5356          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5357#if USE_DEBUGGER
5358          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5359#endif
5360        }
5361      }
5362      if (hot_teams[level].hot_team_nth >= new_nproc) {
5363        // we have all needed threads in reserve, no need to allocate any
5364        // this only possible in mode 1, cannot have reserved threads in mode 0
5365        KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5366        team->t.t_nproc = new_nproc; // just get reserved threads involved
5367      } else {
5368        // We may have some threads in reserve, but not enough;
5369        // get reserved threads involved if any.
5370        team->t.t_nproc = hot_teams[level].hot_team_nth;
5371        hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5372#endif // KMP_NESTED_HOT_TEAMS
5373        if (team->t.t_max_nproc < new_nproc) {
5374          /* reallocate larger arrays */
5375          __kmp_reallocate_team_arrays(team, new_nproc);
5376          __kmp_reinitialize_team(team, new_icvs, NULL);
5377        }
5378
5379#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5380        /* Temporarily set full mask for primary thread before creation of
5381           workers. The reason is that workers inherit the affinity from the
5382           primary thread, so if a lot of workers are created on the single
5383           core quickly, they don't get a chance to set their own affinity for
5384           a long time. */
5385        kmp_affinity_raii_t new_temp_affinity{__kmp_affin_fullMask};
5386#endif
5387
5388        /* allocate new threads for the hot team */
5389        for (f = team->t.t_nproc; f < new_nproc; f++) {
5390          kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5391          KMP_DEBUG_ASSERT(new_worker);
5392          team->t.t_threads[f] = new_worker;
5393
5394          KA_TRACE(20,
5395                   ("__kmp_allocate_team: team %d init T#%d arrived: "
5396                    "join=%llu, plain=%llu\n",
5397                    team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5398                    team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5399                    team->t.t_bar[bs_plain_barrier].b_arrived));
5400
5401          { // Initialize barrier data for new threads.
5402            int b;
5403            kmp_balign_t *balign = new_worker->th.th_bar;
5404            for (b = 0; b < bs_last_barrier; ++b) {
5405              balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5406              KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5407                               KMP_BARRIER_PARENT_FLAG);
5408#if USE_DEBUGGER
5409              balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5410#endif
5411            }
5412          }
5413        }
5414
5415#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5416        /* Restore initial primary thread's affinity mask */
5417        new_temp_affinity.restore();
5418#endif
5419#if KMP_NESTED_HOT_TEAMS
5420      } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5421#endif // KMP_NESTED_HOT_TEAMS
5422      if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5423        // Barrier size already increased earlier in this function
5424        // Activate team threads via th_used_in_team
5425        __kmp_add_threads_to_team(team, new_nproc);
5426      }
5427      /* make sure everyone is syncronized */
5428      // new threads below
5429      __kmp_initialize_team(team, new_nproc, new_icvs,
5430                            root->r.r_uber_thread->th.th_ident);
5431
5432      /* reinitialize the threads */
5433      KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5434      for (f = 0; f < team->t.t_nproc; ++f)
5435        __kmp_initialize_info(team->t.t_threads[f], team, f,
5436                              __kmp_gtid_from_tid(f, team));
5437
5438      // set th_task_state for new threads in hot team with older thread's state
5439      kmp_uint8 old_state = team->t.t_threads[old_nproc - 1]->th.th_task_state;
5440      for (f = old_nproc; f < team->t.t_nproc; ++f)
5441        team->t.t_threads[f]->th.th_task_state = old_state;
5442
5443#ifdef KMP_DEBUG
5444      for (f = 0; f < team->t.t_nproc; ++f) {
5445        KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5446                         team->t.t_threads[f]->th.th_team_nproc ==
5447                             team->t.t_nproc);
5448      }
5449#endif
5450
5451      if (do_place_partition) {
5452        KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5453#if KMP_AFFINITY_SUPPORTED
5454        __kmp_partition_places(team);
5455#endif
5456      }
5457    } // Check changes in number of threads
5458
5459    kmp_info_t *master = team->t.t_threads[0];
5460    if (master->th.th_teams_microtask) {
5461      for (f = 1; f < new_nproc; ++f) {
5462        // propagate teams construct specific info to workers
5463        kmp_info_t *thr = team->t.t_threads[f];
5464        thr->th.th_teams_microtask = master->th.th_teams_microtask;
5465        thr->th.th_teams_level = master->th.th_teams_level;
5466        thr->th.th_teams_size = master->th.th_teams_size;
5467      }
5468    }
5469#if KMP_NESTED_HOT_TEAMS
5470    if (level) {
5471      // Sync barrier state for nested hot teams, not needed for outermost hot
5472      // team.
5473      for (f = 1; f < new_nproc; ++f) {
5474        kmp_info_t *thr = team->t.t_threads[f];
5475        int b;
5476        kmp_balign_t *balign = thr->th.th_bar;
5477        for (b = 0; b < bs_last_barrier; ++b) {
5478          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5479          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5480#if USE_DEBUGGER
5481          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5482#endif
5483        }
5484      }
5485    }
5486#endif // KMP_NESTED_HOT_TEAMS
5487
5488    /* reallocate space for arguments if necessary */
5489    __kmp_alloc_argv_entries(argc, team, TRUE);
5490    KMP_CHECK_UPDATE(team->t.t_argc, argc);
5491    // The hot team re-uses the previous task team,
5492    // if untouched during the previous release->gather phase.
5493
5494    KF_TRACE(10, (" hot_team = %p\n", team));
5495
5496#if KMP_DEBUG
5497    if (__kmp_tasking_mode != tskm_immediate_exec) {
5498      KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5499                    "task_team[1] = %p after reinit\n",
5500                    team->t.t_task_team[0], team->t.t_task_team[1]));
5501    }
5502#endif
5503
5504#if OMPT_SUPPORT
5505    __ompt_team_assign_id(team, ompt_parallel_data);
5506#endif
5507
5508    KMP_MB();
5509
5510    return team;
5511  }
5512
5513  /* next, let's try to take one from the team pool */
5514  KMP_MB();
5515  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5516    /* TODO: consider resizing undersized teams instead of reaping them, now
5517       that we have a resizing mechanism */
5518    if (team->t.t_max_nproc >= max_nproc) {
5519      /* take this team from the team pool */
5520      __kmp_team_pool = team->t.t_next_pool;
5521
5522      if (max_nproc > 1 &&
5523          __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5524        if (!team->t.b) { // Allocate barrier structure
5525          team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5526        }
5527      }
5528
5529      /* setup the team for fresh use */
5530      __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5531
5532      KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5533                    "task_team[1] %p to NULL\n",
5534                    &team->t.t_task_team[0], &team->t.t_task_team[1]));
5535      team->t.t_task_team[0] = NULL;
5536      team->t.t_task_team[1] = NULL;
5537
5538      /* reallocate space for arguments if necessary */
5539      __kmp_alloc_argv_entries(argc, team, TRUE);
5540      KMP_CHECK_UPDATE(team->t.t_argc, argc);
5541
5542      KA_TRACE(
5543          20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5544               team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5545      { // Initialize barrier data.
5546        int b;
5547        for (b = 0; b < bs_last_barrier; ++b) {
5548          team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5549#if USE_DEBUGGER
5550          team->t.t_bar[b].b_master_arrived = 0;
5551          team->t.t_bar[b].b_team_arrived = 0;
5552#endif
5553        }
5554      }
5555
5556      team->t.t_proc_bind = new_proc_bind;
5557
5558      KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5559                    team->t.t_id));
5560
5561#if OMPT_SUPPORT
5562      __ompt_team_assign_id(team, ompt_parallel_data);
5563#endif
5564
5565      KMP_MB();
5566
5567      return team;
5568    }
5569
5570    /* reap team if it is too small, then loop back and check the next one */
5571    // not sure if this is wise, but, will be redone during the hot-teams
5572    // rewrite.
5573    /* TODO: Use technique to find the right size hot-team, don't reap them */
5574    team = __kmp_reap_team(team);
5575    __kmp_team_pool = team;
5576  }
5577
5578  /* nothing available in the pool, no matter, make a new team! */
5579  KMP_MB();
5580  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5581
5582  /* and set it up */
5583  team->t.t_max_nproc = max_nproc;
5584  if (max_nproc > 1 &&
5585      __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5586    // Allocate barrier structure
5587    team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5588  }
5589
5590  /* NOTE well, for some reason allocating one big buffer and dividing it up
5591     seems to really hurt performance a lot on the P4, so, let's not use this */
5592  __kmp_allocate_team_arrays(team, max_nproc);
5593
5594  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5595  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5596
5597  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5598                "%p to NULL\n",
5599                &team->t.t_task_team[0], &team->t.t_task_team[1]));
5600  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5601  // memory, no need to duplicate
5602  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5603  // memory, no need to duplicate
5604
5605  if (__kmp_storage_map) {
5606    __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5607  }
5608
5609  /* allocate space for arguments */
5610  __kmp_alloc_argv_entries(argc, team, FALSE);
5611  team->t.t_argc = argc;
5612
5613  KA_TRACE(20,
5614           ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5615            team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5616  { // Initialize barrier data.
5617    int b;
5618    for (b = 0; b < bs_last_barrier; ++b) {
5619      team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5620#if USE_DEBUGGER
5621      team->t.t_bar[b].b_master_arrived = 0;
5622      team->t.t_bar[b].b_team_arrived = 0;
5623#endif
5624    }
5625  }
5626
5627  team->t.t_proc_bind = new_proc_bind;
5628
5629#if OMPT_SUPPORT
5630  __ompt_team_assign_id(team, ompt_parallel_data);
5631  team->t.ompt_serialized_team_info = NULL;
5632#endif
5633
5634  KMP_MB();
5635
5636  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5637                team->t.t_id));
5638
5639  return team;
5640}
5641
5642/* TODO implement hot-teams at all levels */
5643/* TODO implement lazy thread release on demand (disband request) */
5644
5645/* free the team.  return it to the team pool.  release all the threads
5646 * associated with it */
5647void __kmp_free_team(kmp_root_t *root,
5648                     kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5649  int f;
5650  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5651                team->t.t_id));
5652
5653  /* verify state */
5654  KMP_DEBUG_ASSERT(root);
5655  KMP_DEBUG_ASSERT(team);
5656  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5657  KMP_DEBUG_ASSERT(team->t.t_threads);
5658
5659  int use_hot_team = team == root->r.r_hot_team;
5660#if KMP_NESTED_HOT_TEAMS
5661  int level;
5662  if (master) {
5663    level = team->t.t_active_level - 1;
5664    if (master->th.th_teams_microtask) { // in teams construct?
5665      if (master->th.th_teams_size.nteams > 1) {
5666        ++level; // level was not increased in teams construct for
5667        // team_of_masters
5668      }
5669      if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5670          master->th.th_teams_level == team->t.t_level) {
5671        ++level; // level was not increased in teams construct for
5672        // team_of_workers before the parallel
5673      } // team->t.t_level will be increased inside parallel
5674    }
5675#if KMP_DEBUG
5676    kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5677#endif
5678    if (level < __kmp_hot_teams_max_level) {
5679      KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5680      use_hot_team = 1;
5681    }
5682  }
5683#endif // KMP_NESTED_HOT_TEAMS
5684
5685  /* team is done working */
5686  TCW_SYNC_PTR(team->t.t_pkfn,
5687               NULL); // Important for Debugging Support Library.
5688#if KMP_OS_WINDOWS
5689  team->t.t_copyin_counter = 0; // init counter for possible reuse
5690#endif
5691  // Do not reset pointer to parent team to NULL for hot teams.
5692
5693  /* if we are non-hot team, release our threads */
5694  if (!use_hot_team) {
5695    if (__kmp_tasking_mode != tskm_immediate_exec) {
5696      // Wait for threads to reach reapable state
5697      for (f = 1; f < team->t.t_nproc; ++f) {
5698        KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5699        kmp_info_t *th = team->t.t_threads[f];
5700        volatile kmp_uint32 *state = &th->th.th_reap_state;
5701        while (*state != KMP_SAFE_TO_REAP) {
5702#if KMP_OS_WINDOWS
5703          // On Windows a thread can be killed at any time, check this
5704          DWORD ecode;
5705          if (!__kmp_is_thread_alive(th, &ecode)) {
5706            *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5707            break;
5708          }
5709#endif
5710          // first check if thread is sleeping
5711          kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5712          if (fl.is_sleeping())
5713            fl.resume(__kmp_gtid_from_thread(th));
5714          KMP_CPU_PAUSE();
5715        }
5716      }
5717
5718      // Delete task teams
5719      int tt_idx;
5720      for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5721        kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5722        if (task_team != NULL) {
5723          for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5724            KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5725            team->t.t_threads[f]->th.th_task_team = NULL;
5726          }
5727          KA_TRACE(
5728              20,
5729              ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5730               __kmp_get_gtid(), task_team, team->t.t_id));
5731#if KMP_NESTED_HOT_TEAMS
5732          __kmp_free_task_team(master, task_team);
5733#endif
5734          team->t.t_task_team[tt_idx] = NULL;
5735        }
5736      }
5737    }
5738
5739    // Reset pointer to parent team only for non-hot teams.
5740    team->t.t_parent = NULL;
5741    team->t.t_level = 0;
5742    team->t.t_active_level = 0;
5743
5744    /* free the worker threads */
5745    for (f = 1; f < team->t.t_nproc; ++f) {
5746      KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5747      if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5748        KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5749                                    1, 2);
5750      }
5751      __kmp_free_thread(team->t.t_threads[f]);
5752    }
5753
5754    if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5755      if (team->t.b) {
5756        // wake up thread at old location
5757        team->t.b->go_release();
5758        if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5759          for (f = 1; f < team->t.t_nproc; ++f) {
5760            if (team->t.b->sleep[f].sleep) {
5761              __kmp_atomic_resume_64(
5762                  team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5763                  (kmp_atomic_flag_64<> *)NULL);
5764            }
5765          }
5766        }
5767        // Wait for threads to be removed from team
5768        for (int f = 1; f < team->t.t_nproc; ++f) {
5769          while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5770            KMP_CPU_PAUSE();
5771        }
5772      }
5773    }
5774
5775    for (f = 1; f < team->t.t_nproc; ++f) {
5776      team->t.t_threads[f] = NULL;
5777    }
5778
5779    if (team->t.t_max_nproc > 1 &&
5780        __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5781      distributedBarrier::deallocate(team->t.b);
5782      team->t.b = NULL;
5783    }
5784    /* put the team back in the team pool */
5785    /* TODO limit size of team pool, call reap_team if pool too large */
5786    team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5787    __kmp_team_pool = (volatile kmp_team_t *)team;
5788  } else { // Check if team was created for primary threads in teams construct
5789    // See if first worker is a CG root
5790    KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5791                     team->t.t_threads[1]->th.th_cg_roots);
5792    if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5793      // Clean up the CG root nodes on workers so that this team can be re-used
5794      for (f = 1; f < team->t.t_nproc; ++f) {
5795        kmp_info_t *thr = team->t.t_threads[f];
5796        KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5797                         thr->th.th_cg_roots->cg_root == thr);
5798        // Pop current CG root off list
5799        kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5800        thr->th.th_cg_roots = tmp->up;
5801        KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5802                       " up to node %p. cg_nthreads was %d\n",
5803                       thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5804        int i = tmp->cg_nthreads--;
5805        if (i == 1) {
5806          __kmp_free(tmp); // free CG if we are the last thread in it
5807        }
5808        // Restore current task's thread_limit from CG root
5809        if (thr->th.th_cg_roots)
5810          thr->th.th_current_task->td_icvs.thread_limit =
5811              thr->th.th_cg_roots->cg_thread_limit;
5812      }
5813    }
5814  }
5815
5816  KMP_MB();
5817}
5818
5819/* reap the team.  destroy it, reclaim all its resources and free its memory */
5820kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5821  kmp_team_t *next_pool = team->t.t_next_pool;
5822
5823  KMP_DEBUG_ASSERT(team);
5824  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5825  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5826  KMP_DEBUG_ASSERT(team->t.t_threads);
5827  KMP_DEBUG_ASSERT(team->t.t_argv);
5828
5829  /* TODO clean the threads that are a part of this? */
5830
5831  /* free stuff */
5832  __kmp_free_team_arrays(team);
5833  if (team->t.t_argv != &team->t.t_inline_argv[0])
5834    __kmp_free((void *)team->t.t_argv);
5835  __kmp_free(team);
5836
5837  KMP_MB();
5838  return next_pool;
5839}
5840
5841// Free the thread.  Don't reap it, just place it on the pool of available
5842// threads.
5843//
5844// Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5845// binding for the affinity mechanism to be useful.
5846//
5847// Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5848// However, we want to avoid a potential performance problem by always
5849// scanning through the list to find the correct point at which to insert
5850// the thread (potential N**2 behavior).  To do this we keep track of the
5851// last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5852// With single-level parallelism, threads will always be added to the tail
5853// of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5854// parallelism, all bets are off and we may need to scan through the entire
5855// free list.
5856//
5857// This change also has a potentially large performance benefit, for some
5858// applications.  Previously, as threads were freed from the hot team, they
5859// would be placed back on the free list in inverse order.  If the hot team
5860// grew back to it's original size, then the freed thread would be placed
5861// back on the hot team in reverse order.  This could cause bad cache
5862// locality problems on programs where the size of the hot team regularly
5863// grew and shrunk.
5864//
5865// Now, for single-level parallelism, the OMP tid is always == gtid.
5866void __kmp_free_thread(kmp_info_t *this_th) {
5867  int gtid;
5868  kmp_info_t **scan;
5869
5870  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5871                __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5872
5873  KMP_DEBUG_ASSERT(this_th);
5874
5875  // When moving thread to pool, switch thread to wait on own b_go flag, and
5876  // uninitialized (NULL team).
5877  int b;
5878  kmp_balign_t *balign = this_th->th.th_bar;
5879  for (b = 0; b < bs_last_barrier; ++b) {
5880    if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5881      balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5882    balign[b].bb.team = NULL;
5883    balign[b].bb.leaf_kids = 0;
5884  }
5885  this_th->th.th_task_state = 0;
5886  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5887
5888  /* put thread back on the free pool */
5889  TCW_PTR(this_th->th.th_team, NULL);
5890  TCW_PTR(this_th->th.th_root, NULL);
5891  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5892
5893  while (this_th->th.th_cg_roots) {
5894    this_th->th.th_cg_roots->cg_nthreads--;
5895    KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5896                   " %p of thread  %p to %d\n",
5897                   this_th, this_th->th.th_cg_roots,
5898                   this_th->th.th_cg_roots->cg_root,
5899                   this_th->th.th_cg_roots->cg_nthreads));
5900    kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5901    if (tmp->cg_root == this_th) { // Thread is a cg_root
5902      KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5903      KA_TRACE(
5904          5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5905      this_th->th.th_cg_roots = tmp->up;
5906      __kmp_free(tmp);
5907    } else { // Worker thread
5908      if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5909        __kmp_free(tmp);
5910      }
5911      this_th->th.th_cg_roots = NULL;
5912      break;
5913    }
5914  }
5915
5916  /* If the implicit task assigned to this thread can be used by other threads
5917   * -> multiple threads can share the data and try to free the task at
5918   * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5919   * with higher probability when hot team is disabled but can occurs even when
5920   * the hot team is enabled */
5921  __kmp_free_implicit_task(this_th);
5922  this_th->th.th_current_task = NULL;
5923
5924  // If the __kmp_thread_pool_insert_pt is already past the new insert
5925  // point, then we need to re-scan the entire list.
5926  gtid = this_th->th.th_info.ds.ds_gtid;
5927  if (__kmp_thread_pool_insert_pt != NULL) {
5928    KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5929    if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5930      __kmp_thread_pool_insert_pt = NULL;
5931    }
5932  }
5933
5934  // Scan down the list to find the place to insert the thread.
5935  // scan is the address of a link in the list, possibly the address of
5936  // __kmp_thread_pool itself.
5937  //
5938  // In the absence of nested parallelism, the for loop will have 0 iterations.
5939  if (__kmp_thread_pool_insert_pt != NULL) {
5940    scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5941  } else {
5942    scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5943  }
5944  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5945       scan = &((*scan)->th.th_next_pool))
5946    ;
5947
5948  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5949  // to its address.
5950  TCW_PTR(this_th->th.th_next_pool, *scan);
5951  __kmp_thread_pool_insert_pt = *scan = this_th;
5952  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5953                   (this_th->th.th_info.ds.ds_gtid <
5954                    this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5955  TCW_4(this_th->th.th_in_pool, TRUE);
5956  __kmp_suspend_initialize_thread(this_th);
5957  __kmp_lock_suspend_mx(this_th);
5958  if (this_th->th.th_active == TRUE) {
5959    KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5960    this_th->th.th_active_in_pool = TRUE;
5961  }
5962#if KMP_DEBUG
5963  else {
5964    KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5965  }
5966#endif
5967  __kmp_unlock_suspend_mx(this_th);
5968
5969  TCW_4(__kmp_nth, __kmp_nth - 1);
5970
5971#ifdef KMP_ADJUST_BLOCKTIME
5972  /* Adjust blocktime back to user setting or default if necessary */
5973  /* Middle initialization might never have occurred                */
5974  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5975    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5976    if (__kmp_nth <= __kmp_avail_proc) {
5977      __kmp_zero_bt = FALSE;
5978    }
5979  }
5980#endif /* KMP_ADJUST_BLOCKTIME */
5981
5982  KMP_MB();
5983}
5984
5985/* ------------------------------------------------------------------------ */
5986
5987void *__kmp_launch_thread(kmp_info_t *this_thr) {
5988#if OMP_PROFILING_SUPPORT
5989  ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5990  // TODO: add a configuration option for time granularity
5991  if (ProfileTraceFile)
5992    llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5993#endif
5994
5995  int gtid = this_thr->th.th_info.ds.ds_gtid;
5996  /*    void                 *stack_data;*/
5997  kmp_team_t **volatile pteam;
5998
5999  KMP_MB();
6000  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
6001
6002  if (__kmp_env_consistency_check) {
6003    this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
6004  }
6005
6006#if OMPD_SUPPORT
6007  if (ompd_state & OMPD_ENABLE_BP)
6008    ompd_bp_thread_begin();
6009#endif
6010
6011#if OMPT_SUPPORT
6012  ompt_data_t *thread_data = nullptr;
6013  if (ompt_enabled.enabled) {
6014    thread_data = &(this_thr->th.ompt_thread_info.thread_data);
6015    *thread_data = ompt_data_none;
6016
6017    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6018    this_thr->th.ompt_thread_info.wait_id = 0;
6019    this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
6020    this_thr->th.ompt_thread_info.parallel_flags = 0;
6021    if (ompt_enabled.ompt_callback_thread_begin) {
6022      ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
6023          ompt_thread_worker, thread_data);
6024    }
6025    this_thr->th.ompt_thread_info.state = ompt_state_idle;
6026  }
6027#endif
6028
6029  /* This is the place where threads wait for work */
6030  while (!TCR_4(__kmp_global.g.g_done)) {
6031    KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
6032    KMP_MB();
6033
6034    /* wait for work to do */
6035    KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
6036
6037    /* No tid yet since not part of a team */
6038    __kmp_fork_barrier(gtid, KMP_GTID_DNE);
6039
6040#if OMPT_SUPPORT
6041    if (ompt_enabled.enabled) {
6042      this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6043    }
6044#endif
6045
6046    pteam = &this_thr->th.th_team;
6047
6048    /* have we been allocated? */
6049    if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6050      /* we were just woken up, so run our new task */
6051      if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6052        int rc;
6053        KA_TRACE(20,
6054                 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6055                  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6056                  (*pteam)->t.t_pkfn));
6057
6058        updateHWFPControl(*pteam);
6059
6060#if OMPT_SUPPORT
6061        if (ompt_enabled.enabled) {
6062          this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6063        }
6064#endif
6065
6066        rc = (*pteam)->t.t_invoke(gtid);
6067        KMP_ASSERT(rc);
6068
6069        KMP_MB();
6070        KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6071                      gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6072                      (*pteam)->t.t_pkfn));
6073      }
6074#if OMPT_SUPPORT
6075      if (ompt_enabled.enabled) {
6076        /* no frame set while outside task */
6077        __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6078
6079        this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6080      }
6081#endif
6082      /* join barrier after parallel region */
6083      __kmp_join_barrier(gtid);
6084    }
6085  }
6086
6087#if OMPD_SUPPORT
6088  if (ompd_state & OMPD_ENABLE_BP)
6089    ompd_bp_thread_end();
6090#endif
6091
6092#if OMPT_SUPPORT
6093  if (ompt_enabled.ompt_callback_thread_end) {
6094    ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6095  }
6096#endif
6097
6098  this_thr->th.th_task_team = NULL;
6099  /* run the destructors for the threadprivate data for this thread */
6100  __kmp_common_destroy_gtid(gtid);
6101
6102  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6103  KMP_MB();
6104
6105#if OMP_PROFILING_SUPPORT
6106  llvm::timeTraceProfilerFinishThread();
6107#endif
6108  return this_thr;
6109}
6110
6111/* ------------------------------------------------------------------------ */
6112
6113void __kmp_internal_end_dest(void *specific_gtid) {
6114  // Make sure no significant bits are lost
6115  int gtid;
6116  __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6117
6118  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6119  /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage
6120   * this is because 0 is reserved for the nothing-stored case */
6121
6122  __kmp_internal_end_thread(gtid);
6123}
6124
6125#if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6126
6127__attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6128  __kmp_internal_end_atexit();
6129}
6130
6131#endif
6132
6133/* [Windows] josh: when the atexit handler is called, there may still be more
6134   than one thread alive */
6135void __kmp_internal_end_atexit(void) {
6136  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6137  /* [Windows]
6138     josh: ideally, we want to completely shutdown the library in this atexit
6139     handler, but stat code that depends on thread specific data for gtid fails
6140     because that data becomes unavailable at some point during the shutdown, so
6141     we call __kmp_internal_end_thread instead. We should eventually remove the
6142     dependency on __kmp_get_specific_gtid in the stat code and use
6143     __kmp_internal_end_library to cleanly shutdown the library.
6144
6145     // TODO: Can some of this comment about GVS be removed?
6146     I suspect that the offending stat code is executed when the calling thread
6147     tries to clean up a dead root thread's data structures, resulting in GVS
6148     code trying to close the GVS structures for that thread, but since the stat
6149     code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6150     the calling thread is cleaning up itself instead of another thread, it get
6151     confused. This happens because allowing a thread to unregister and cleanup
6152     another thread is a recent modification for addressing an issue.
6153     Based on the current design (20050722), a thread may end up
6154     trying to unregister another thread only if thread death does not trigger
6155     the calling of __kmp_internal_end_thread.  For Linux* OS, there is the
6156     thread specific data destructor function to detect thread death. For
6157     Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6158     is nothing.  Thus, the workaround is applicable only for Windows static
6159     stat library. */
6160  __kmp_internal_end_library(-1);
6161#if KMP_OS_WINDOWS
6162  __kmp_close_console();
6163#endif
6164}
6165
6166static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6167  // It is assumed __kmp_forkjoin_lock is acquired.
6168
6169  int gtid;
6170
6171  KMP_DEBUG_ASSERT(thread != NULL);
6172
6173  gtid = thread->th.th_info.ds.ds_gtid;
6174
6175  if (!is_root) {
6176    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6177      /* Assume the threads are at the fork barrier here */
6178      KA_TRACE(
6179          20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6180               gtid));
6181      if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6182        while (
6183            !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6184          KMP_CPU_PAUSE();
6185        __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6186      } else {
6187        /* Need release fence here to prevent seg faults for tree forkjoin
6188           barrier (GEH) */
6189        kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6190                           thread);
6191        __kmp_release_64(&flag);
6192      }
6193    }
6194
6195    // Terminate OS thread.
6196    __kmp_reap_worker(thread);
6197
6198    // The thread was killed asynchronously.  If it was actively
6199    // spinning in the thread pool, decrement the global count.
6200    //
6201    // There is a small timing hole here - if the worker thread was just waking
6202    // up after sleeping in the pool, had reset it's th_active_in_pool flag but
6203    // not decremented the global counter __kmp_thread_pool_active_nth yet, then
6204    // the global counter might not get updated.
6205    //
6206    // Currently, this can only happen as the library is unloaded,
6207    // so there are no harmful side effects.
6208    if (thread->th.th_active_in_pool) {
6209      thread->th.th_active_in_pool = FALSE;
6210      KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6211      KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6212    }
6213  }
6214
6215  __kmp_free_implicit_task(thread);
6216
6217// Free the fast memory for tasking
6218#if USE_FAST_MEMORY
6219  __kmp_free_fast_memory(thread);
6220#endif /* USE_FAST_MEMORY */
6221
6222  __kmp_suspend_uninitialize_thread(thread);
6223
6224  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6225  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6226
6227  --__kmp_all_nth;
6228  // __kmp_nth was decremented when thread is added to the pool.
6229
6230#ifdef KMP_ADJUST_BLOCKTIME
6231  /* Adjust blocktime back to user setting or default if necessary */
6232  /* Middle initialization might never have occurred                */
6233  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6234    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6235    if (__kmp_nth <= __kmp_avail_proc) {
6236      __kmp_zero_bt = FALSE;
6237    }
6238  }
6239#endif /* KMP_ADJUST_BLOCKTIME */
6240
6241  /* free the memory being used */
6242  if (__kmp_env_consistency_check) {
6243    if (thread->th.th_cons) {
6244      __kmp_free_cons_stack(thread->th.th_cons);
6245      thread->th.th_cons = NULL;
6246    }
6247  }
6248
6249  if (thread->th.th_pri_common != NULL) {
6250    __kmp_free(thread->th.th_pri_common);
6251    thread->th.th_pri_common = NULL;
6252  }
6253
6254  if (thread->th.th_task_state_memo_stack != NULL) {
6255    __kmp_free(thread->th.th_task_state_memo_stack);
6256    thread->th.th_task_state_memo_stack = NULL;
6257  }
6258
6259#if KMP_USE_BGET
6260  if (thread->th.th_local.bget_data != NULL) {
6261    __kmp_finalize_bget(thread);
6262  }
6263#endif
6264
6265#if KMP_AFFINITY_SUPPORTED
6266  if (thread->th.th_affin_mask != NULL) {
6267    KMP_CPU_FREE(thread->th.th_affin_mask);
6268    thread->th.th_affin_mask = NULL;
6269  }
6270#endif /* KMP_AFFINITY_SUPPORTED */
6271
6272#if KMP_USE_HIER_SCHED
6273  if (thread->th.th_hier_bar_data != NULL) {
6274    __kmp_free(thread->th.th_hier_bar_data);
6275    thread->th.th_hier_bar_data = NULL;
6276  }
6277#endif
6278
6279  __kmp_reap_team(thread->th.th_serial_team);
6280  thread->th.th_serial_team = NULL;
6281  __kmp_free(thread);
6282
6283  KMP_MB();
6284
6285} // __kmp_reap_thread
6286
6287static void __kmp_itthash_clean(kmp_info_t *th) {
6288#if USE_ITT_NOTIFY
6289  if (__kmp_itt_region_domains.count > 0) {
6290    for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6291      kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6292      while (bucket) {
6293        kmp_itthash_entry_t *next = bucket->next_in_bucket;
6294        __kmp_thread_free(th, bucket);
6295        bucket = next;
6296      }
6297    }
6298  }
6299  if (__kmp_itt_barrier_domains.count > 0) {
6300    for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6301      kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6302      while (bucket) {
6303        kmp_itthash_entry_t *next = bucket->next_in_bucket;
6304        __kmp_thread_free(th, bucket);
6305        bucket = next;
6306      }
6307    }
6308  }
6309#endif
6310}
6311
6312static void __kmp_internal_end(void) {
6313  int i;
6314
6315  /* First, unregister the library */
6316  __kmp_unregister_library();
6317
6318#if KMP_OS_WINDOWS
6319  /* In Win static library, we can't tell when a root actually dies, so we
6320     reclaim the data structures for any root threads that have died but not
6321     unregistered themselves, in order to shut down cleanly.
6322     In Win dynamic library we also can't tell when a thread dies.  */
6323  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6324// dead roots
6325#endif
6326
6327  for (i = 0; i < __kmp_threads_capacity; i++)
6328    if (__kmp_root[i])
6329      if (__kmp_root[i]->r.r_active)
6330        break;
6331  KMP_MB(); /* Flush all pending memory write invalidates.  */
6332  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6333
6334  if (i < __kmp_threads_capacity) {
6335#if KMP_USE_MONITOR
6336    // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6337    KMP_MB(); /* Flush all pending memory write invalidates.  */
6338
6339    // Need to check that monitor was initialized before reaping it. If we are
6340    // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6341    // __kmp_monitor will appear to contain valid data, but it is only valid in
6342    // the parent process, not the child.
6343    // New behavior (201008): instead of keying off of the flag
6344    // __kmp_init_parallel, the monitor thread creation is keyed off
6345    // of the new flag __kmp_init_monitor.
6346    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6347    if (TCR_4(__kmp_init_monitor)) {
6348      __kmp_reap_monitor(&__kmp_monitor);
6349      TCW_4(__kmp_init_monitor, 0);
6350    }
6351    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6352    KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6353#endif // KMP_USE_MONITOR
6354  } else {
6355/* TODO move this to cleanup code */
6356#ifdef KMP_DEBUG
6357    /* make sure that everything has properly ended */
6358    for (i = 0; i < __kmp_threads_capacity; i++) {
6359      if (__kmp_root[i]) {
6360        //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC:
6361        //                    there can be uber threads alive here
6362        KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6363      }
6364    }
6365#endif
6366
6367    KMP_MB();
6368
6369    // Reap the worker threads.
6370    // This is valid for now, but be careful if threads are reaped sooner.
6371    while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool.
6372      // Get the next thread from the pool.
6373      kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6374      __kmp_thread_pool = thread->th.th_next_pool;
6375      // Reap it.
6376      KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6377      thread->th.th_next_pool = NULL;
6378      thread->th.th_in_pool = FALSE;
6379      __kmp_reap_thread(thread, 0);
6380    }
6381    __kmp_thread_pool_insert_pt = NULL;
6382
6383    // Reap teams.
6384    while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6385      // Get the next team from the pool.
6386      kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6387      __kmp_team_pool = team->t.t_next_pool;
6388      // Reap it.
6389      team->t.t_next_pool = NULL;
6390      __kmp_reap_team(team);
6391    }
6392
6393    __kmp_reap_task_teams();
6394
6395#if KMP_OS_UNIX
6396    // Threads that are not reaped should not access any resources since they
6397    // are going to be deallocated soon, so the shutdown sequence should wait
6398    // until all threads either exit the final spin-waiting loop or begin
6399    // sleeping after the given blocktime.
6400    for (i = 0; i < __kmp_threads_capacity; i++) {
6401      kmp_info_t *thr = __kmp_threads[i];
6402      while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6403        KMP_CPU_PAUSE();
6404    }
6405#endif
6406
6407    for (i = 0; i < __kmp_threads_capacity; ++i) {
6408      // TBD: Add some checking...
6409      // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6410    }
6411
6412    /* Make sure all threadprivate destructors get run by joining with all
6413       worker threads before resetting this flag */
6414    TCW_SYNC_4(__kmp_init_common, FALSE);
6415
6416    KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6417    KMP_MB();
6418
6419#if KMP_USE_MONITOR
6420    // See note above: One of the possible fixes for CQ138434 / CQ140126
6421    //
6422    // FIXME: push both code fragments down and CSE them?
6423    // push them into __kmp_cleanup() ?
6424    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6425    if (TCR_4(__kmp_init_monitor)) {
6426      __kmp_reap_monitor(&__kmp_monitor);
6427      TCW_4(__kmp_init_monitor, 0);
6428    }
6429    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6430    KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6431#endif
6432  } /* else !__kmp_global.t_active */
6433  TCW_4(__kmp_init_gtid, FALSE);
6434  KMP_MB(); /* Flush all pending memory write invalidates.  */
6435
6436  __kmp_cleanup();
6437#if OMPT_SUPPORT
6438  ompt_fini();
6439#endif
6440}
6441
6442void __kmp_internal_end_library(int gtid_req) {
6443  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6444  /* this shouldn't be a race condition because __kmp_internal_end() is the
6445     only place to clear __kmp_serial_init */
6446  /* we'll check this later too, after we get the lock */
6447  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6448  // redundant, because the next check will work in any case.
6449  if (__kmp_global.g.g_abort) {
6450    KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6451    /* TODO abort? */
6452    return;
6453  }
6454  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6455    KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6456    return;
6457  }
6458
6459  // If hidden helper team has been initialized, we need to deinit it
6460  if (TCR_4(__kmp_init_hidden_helper) &&
6461      !TCR_4(__kmp_hidden_helper_team_done)) {
6462    TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6463    // First release the main thread to let it continue its work
6464    __kmp_hidden_helper_main_thread_release();
6465    // Wait until the hidden helper team has been destroyed
6466    __kmp_hidden_helper_threads_deinitz_wait();
6467  }
6468
6469  KMP_MB(); /* Flush all pending memory write invalidates.  */
6470  /* find out who we are and what we should do */
6471  {
6472    int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6473    KA_TRACE(
6474        10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req));
6475    if (gtid == KMP_GTID_SHUTDOWN) {
6476      KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6477                    "already shutdown\n"));
6478      return;
6479    } else if (gtid == KMP_GTID_MONITOR) {
6480      KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6481                    "registered, or system shutdown\n"));
6482      return;
6483    } else if (gtid == KMP_GTID_DNE) {
6484      KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6485                    "shutdown\n"));
6486      /* we don't know who we are, but we may still shutdown the library */
6487    } else if (KMP_UBER_GTID(gtid)) {
6488      /* unregister ourselves as an uber thread.  gtid is no longer valid */
6489      if (__kmp_root[gtid]->r.r_active) {
6490        __kmp_global.g.g_abort = -1;
6491        TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6492        __kmp_unregister_library();
6493        KA_TRACE(10,
6494                 ("__kmp_internal_end_library: root still active, abort T#%d\n",
6495                  gtid));
6496        return;
6497      } else {
6498        __kmp_itthash_clean(__kmp_threads[gtid]);
6499        KA_TRACE(
6500            10,
6501            ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6502        __kmp_unregister_root_current_thread(gtid);
6503      }
6504    } else {
6505/* worker threads may call this function through the atexit handler, if they
6506 * call exit() */
6507/* For now, skip the usual subsequent processing and just dump the debug buffer.
6508   TODO: do a thorough shutdown instead */
6509#ifdef DUMP_DEBUG_ON_EXIT
6510      if (__kmp_debug_buf)
6511        __kmp_dump_debug_buffer();
6512#endif
6513      // added unregister library call here when we switch to shm linux
6514      // if we don't, it will leave lots of files in /dev/shm
6515      // cleanup shared memory file before exiting.
6516      __kmp_unregister_library();
6517      return;
6518    }
6519  }
6520  /* synchronize the termination process */
6521  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6522
6523  /* have we already finished */
6524  if (__kmp_global.g.g_abort) {
6525    KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6526    /* TODO abort? */
6527    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6528    return;
6529  }
6530  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6531    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6532    return;
6533  }
6534
6535  /* We need this lock to enforce mutex between this reading of
6536     __kmp_threads_capacity and the writing by __kmp_register_root.
6537     Alternatively, we can use a counter of roots that is atomically updated by
6538     __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6539     __kmp_internal_end_*.  */
6540  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6541
6542  /* now we can safely conduct the actual termination */
6543  __kmp_internal_end();
6544
6545  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6546  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6547
6548  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6549
6550#ifdef DUMP_DEBUG_ON_EXIT
6551  if (__kmp_debug_buf)
6552    __kmp_dump_debug_buffer();
6553#endif
6554
6555#if KMP_OS_WINDOWS
6556  __kmp_close_console();
6557#endif
6558
6559  __kmp_fini_allocator();
6560
6561} // __kmp_internal_end_library
6562
6563void __kmp_internal_end_thread(int gtid_req) {
6564  int i;
6565
6566  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6567  /* this shouldn't be a race condition because __kmp_internal_end() is the
6568   * only place to clear __kmp_serial_init */
6569  /* we'll check this later too, after we get the lock */
6570  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6571  // redundant, because the next check will work in any case.
6572  if (__kmp_global.g.g_abort) {
6573    KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6574    /* TODO abort? */
6575    return;
6576  }
6577  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6578    KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6579    return;
6580  }
6581
6582  // If hidden helper team has been initialized, we need to deinit it
6583  if (TCR_4(__kmp_init_hidden_helper) &&
6584      !TCR_4(__kmp_hidden_helper_team_done)) {
6585    TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6586    // First release the main thread to let it continue its work
6587    __kmp_hidden_helper_main_thread_release();
6588    // Wait until the hidden helper team has been destroyed
6589    __kmp_hidden_helper_threads_deinitz_wait();
6590  }
6591
6592  KMP_MB(); /* Flush all pending memory write invalidates.  */
6593
6594  /* find out who we are and what we should do */
6595  {
6596    int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6597    KA_TRACE(10,
6598             ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req));
6599    if (gtid == KMP_GTID_SHUTDOWN) {
6600      KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6601                    "already shutdown\n"));
6602      return;
6603    } else if (gtid == KMP_GTID_MONITOR) {
6604      KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6605                    "registered, or system shutdown\n"));
6606      return;
6607    } else if (gtid == KMP_GTID_DNE) {
6608      KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6609                    "shutdown\n"));
6610      return;
6611      /* we don't know who we are */
6612    } else if (KMP_UBER_GTID(gtid)) {
6613      /* unregister ourselves as an uber thread.  gtid is no longer valid */
6614      if (__kmp_root[gtid]->r.r_active) {
6615        __kmp_global.g.g_abort = -1;
6616        TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6617        KA_TRACE(10,
6618                 ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6619                  gtid));
6620        return;
6621      } else {
6622        KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6623                      gtid));
6624        __kmp_unregister_root_current_thread(gtid);
6625      }
6626    } else {
6627      /* just a worker thread, let's leave */
6628      KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6629
6630      if (gtid >= 0) {
6631        __kmp_threads[gtid]->th.th_task_team = NULL;
6632      }
6633
6634      KA_TRACE(10,
6635               ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6636                gtid));
6637      return;
6638    }
6639  }
6640#if KMP_DYNAMIC_LIB
6641  if (__kmp_pause_status != kmp_hard_paused)
6642  // AC: lets not shutdown the dynamic library at the exit of uber thread,
6643  // because we will better shutdown later in the library destructor.
6644  {
6645    KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6646    return;
6647  }
6648#endif
6649  /* synchronize the termination process */
6650  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6651
6652  /* have we already finished */
6653  if (__kmp_global.g.g_abort) {
6654    KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6655    /* TODO abort? */
6656    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6657    return;
6658  }
6659  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6660    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6661    return;
6662  }
6663
6664  /* We need this lock to enforce mutex between this reading of
6665     __kmp_threads_capacity and the writing by __kmp_register_root.
6666     Alternatively, we can use a counter of roots that is atomically updated by
6667     __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6668     __kmp_internal_end_*.  */
6669
6670  /* should we finish the run-time?  are all siblings done? */
6671  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6672
6673  for (i = 0; i < __kmp_threads_capacity; ++i) {
6674    if (KMP_UBER_GTID(i)) {
6675      KA_TRACE(
6676          10,
6677          ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6678      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6679      __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6680      return;
6681    }
6682  }
6683
6684  /* now we can safely conduct the actual termination */
6685
6686  __kmp_internal_end();
6687
6688  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6689  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6690
6691  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6692
6693#ifdef DUMP_DEBUG_ON_EXIT
6694  if (__kmp_debug_buf)
6695    __kmp_dump_debug_buffer();
6696#endif
6697} // __kmp_internal_end_thread
6698
6699// -----------------------------------------------------------------------------
6700// Library registration stuff.
6701
6702static long __kmp_registration_flag = 0;
6703// Random value used to indicate library initialization.
6704static char *__kmp_registration_str = NULL;
6705// Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6706
6707static inline char *__kmp_reg_status_name() {
6708/* On RHEL 3u5 if linked statically, getpid() returns different values in
6709   each thread. If registration and unregistration go in different threads
6710   (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6711   env var can not be found, because the name will contain different pid. */
6712// macOS* complains about name being too long with additional getuid()
6713#if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6714  return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6715                          (int)getuid());
6716#else
6717  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6718#endif
6719} // __kmp_reg_status_get
6720
6721#if defined(KMP_USE_SHM)
6722bool __kmp_shm_available = false;
6723bool __kmp_tmp_available = false;
6724// If /dev/shm is not accessible, we will create a temporary file under /tmp.
6725char *temp_reg_status_file_name = nullptr;
6726#endif
6727
6728void __kmp_register_library_startup(void) {
6729
6730  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6731  int done = 0;
6732  union {
6733    double dtime;
6734    long ltime;
6735  } time;
6736#if KMP_ARCH_X86 || KMP_ARCH_X86_64
6737  __kmp_initialize_system_tick();
6738#endif
6739  __kmp_read_system_time(&time.dtime);
6740  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6741  __kmp_registration_str =
6742      __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6743                       __kmp_registration_flag, KMP_LIBRARY_FILE);
6744
6745  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6746                __kmp_registration_str));
6747
6748  while (!done) {
6749
6750    char *value = NULL; // Actual value of the environment variable.
6751
6752#if defined(KMP_USE_SHM)
6753    char *shm_name = nullptr;
6754    char *data1 = nullptr;
6755    __kmp_shm_available = __kmp_detect_shm();
6756    if (__kmp_shm_available) {
6757      int fd1 = -1;
6758      shm_name = __kmp_str_format("/%s", name);
6759      int shm_preexist = 0;
6760      fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6761      if ((fd1 == -1) && (errno == EEXIST)) {
6762        // file didn't open because it already exists.
6763        // try opening existing file
6764        fd1 = shm_open(shm_name, O_RDWR, 0666);
6765        if (fd1 == -1) { // file didn't open
6766          KMP_WARNING(FunctionError, "Can't open SHM");
6767          __kmp_shm_available = false;
6768        } else { // able to open existing file
6769          shm_preexist = 1;
6770        }
6771      }
6772      if (__kmp_shm_available && shm_preexist == 0) { // SHM created, set size
6773        if (ftruncate(fd1, SHM_SIZE) == -1) { // error occured setting size;
6774          KMP_WARNING(FunctionError, "Can't set size of SHM");
6775          __kmp_shm_available = false;
6776        }
6777      }
6778      if (__kmp_shm_available) { // SHM exists, now map it
6779        data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6780                             fd1, 0);
6781        if (data1 == MAP_FAILED) { // failed to map shared memory
6782          KMP_WARNING(FunctionError, "Can't map SHM");
6783          __kmp_shm_available = false;
6784        }
6785      }
6786      if (__kmp_shm_available) { // SHM mapped
6787        if (shm_preexist == 0) { // set data to SHM, set value
6788          KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6789        }
6790        // Read value from either what we just wrote or existing file.
6791        value = __kmp_str_format("%s", data1); // read value from SHM
6792        munmap(data1, SHM_SIZE);
6793      }
6794      if (fd1 != -1)
6795        close(fd1);
6796    }
6797    if (!__kmp_shm_available)
6798      __kmp_tmp_available = __kmp_detect_tmp();
6799    if (!__kmp_shm_available && __kmp_tmp_available) {
6800      // SHM failed to work due to an error other than that the file already
6801      // exists. Try to create a temp file under /tmp.
6802      // If /tmp isn't accessible, fall back to using environment variable.
6803      // TODO: /tmp might not always be the temporary directory. For now we will
6804      // not consider TMPDIR.
6805      int fd1 = -1;
6806      temp_reg_status_file_name = __kmp_str_format("/tmp/%s", name);
6807      int tmp_preexist = 0;
6808      fd1 = open(temp_reg_status_file_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6809      if ((fd1 == -1) && (errno == EEXIST)) {
6810        // file didn't open because it already exists.
6811        // try opening existing file
6812        fd1 = open(temp_reg_status_file_name, O_RDWR, 0666);
6813        if (fd1 == -1) { // file didn't open if (fd1 == -1) {
6814          KMP_WARNING(FunctionError, "Can't open TEMP");
6815          __kmp_tmp_available = false;
6816        } else {
6817          tmp_preexist = 1;
6818        }
6819      }
6820      if (__kmp_tmp_available && tmp_preexist == 0) {
6821        // we created /tmp file now set size
6822        if (ftruncate(fd1, SHM_SIZE) == -1) { // error occured setting size;
6823          KMP_WARNING(FunctionError, "Can't set size of /tmp file");
6824          __kmp_tmp_available = false;
6825        }
6826      }
6827      if (__kmp_tmp_available) {
6828        data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6829                             fd1, 0);
6830        if (data1 == MAP_FAILED) { // failed to map /tmp
6831          KMP_WARNING(FunctionError, "Can't map /tmp");
6832          __kmp_tmp_available = false;
6833        }
6834      }
6835      if (__kmp_tmp_available) {
6836        if (tmp_preexist == 0) { // set data to TMP, set value
6837          KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6838        }
6839        // Read value from either what we just wrote or existing file.
6840        value = __kmp_str_format("%s", data1); // read value from SHM
6841        munmap(data1, SHM_SIZE);
6842      }
6843      if (fd1 != -1)
6844        close(fd1);
6845    }
6846    if (!__kmp_shm_available && !__kmp_tmp_available) {
6847      // no /dev/shm and no /tmp -- fall back to environment variable
6848      // Set environment variable, but do not overwrite if it exists.
6849      __kmp_env_set(name, __kmp_registration_str, 0);
6850      // read value to see if it got set
6851      value = __kmp_env_get(name);
6852    }
6853#else // Windows and unix with static library
6854    // Set environment variable, but do not overwrite if it exists.
6855    __kmp_env_set(name, __kmp_registration_str, 0);
6856    // read value to see if it got set
6857    value = __kmp_env_get(name);
6858#endif
6859
6860    if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6861      done = 1; // Ok, environment variable set successfully, exit the loop.
6862    } else {
6863      // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6864      // Check whether it alive or dead.
6865      int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6866      char *tail = value;
6867      char *flag_addr_str = NULL;
6868      char *flag_val_str = NULL;
6869      char const *file_name = NULL;
6870      __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6871      __kmp_str_split(tail, '-', &flag_val_str, &tail);
6872      file_name = tail;
6873      if (tail != NULL) {
6874        unsigned long *flag_addr = 0;
6875        unsigned long flag_val = 0;
6876        KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6877        KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6878        if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6879          // First, check whether environment-encoded address is mapped into
6880          // addr space.
6881          // If so, dereference it to see if it still has the right value.
6882          if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6883            neighbor = 1;
6884          } else {
6885            // If not, then we know the other copy of the library is no longer
6886            // running.
6887            neighbor = 2;
6888          }
6889        }
6890      }
6891      switch (neighbor) {
6892      case 0: // Cannot parse environment variable -- neighbor status unknown.
6893        // Assume it is the incompatible format of future version of the
6894        // library. Assume the other library is alive.
6895        // WARN( ... ); // TODO: Issue a warning.
6896        file_name = "unknown library";
6897        KMP_FALLTHROUGH();
6898      // Attention! Falling to the next case. That's intentional.
6899      case 1: { // Neighbor is alive.
6900        // Check it is allowed.
6901        char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6902        if (!__kmp_str_match_true(duplicate_ok)) {
6903          // That's not allowed. Issue fatal error.
6904          __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6905                      KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6906        }
6907        KMP_INTERNAL_FREE(duplicate_ok);
6908        __kmp_duplicate_library_ok = 1;
6909        done = 1; // Exit the loop.
6910      } break;
6911      case 2: { // Neighbor is dead.
6912
6913#if defined(KMP_USE_SHM)
6914        if (__kmp_shm_available) { // close shared memory.
6915          shm_unlink(shm_name); // this removes file in /dev/shm
6916        } else if (__kmp_tmp_available) {
6917          unlink(temp_reg_status_file_name); // this removes the temp file
6918        } else {
6919          // Clear the variable and try to register library again.
6920          __kmp_env_unset(name);
6921        }
6922#else
6923        // Clear the variable and try to register library again.
6924        __kmp_env_unset(name);
6925#endif
6926      } break;
6927      default: {
6928        KMP_DEBUG_ASSERT(0);
6929      } break;
6930      }
6931    }
6932    KMP_INTERNAL_FREE((void *)value);
6933#if defined(KMP_USE_SHM)
6934    if (shm_name)
6935      KMP_INTERNAL_FREE((void *)shm_name);
6936#endif
6937  } // while
6938  KMP_INTERNAL_FREE((void *)name);
6939
6940} // func __kmp_register_library_startup
6941
6942void __kmp_unregister_library(void) {
6943
6944  char *name = __kmp_reg_status_name();
6945  char *value = NULL;
6946
6947#if defined(KMP_USE_SHM)
6948  char *shm_name = nullptr;
6949  int fd1;
6950  if (__kmp_shm_available) {
6951    shm_name = __kmp_str_format("/%s", name);
6952    fd1 = shm_open(shm_name, O_RDONLY, 0666);
6953    if (fd1 != -1) { // File opened successfully
6954      char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6955      if (data1 != MAP_FAILED) {
6956        value = __kmp_str_format("%s", data1); // read value from SHM
6957        munmap(data1, SHM_SIZE);
6958      }
6959      close(fd1);
6960    }
6961  } else if (__kmp_tmp_available) { // try /tmp
6962    fd1 = open(temp_reg_status_file_name, O_RDONLY);
6963    if (fd1 != -1) { // File opened successfully
6964      char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6965      if (data1 != MAP_FAILED) {
6966        value = __kmp_str_format("%s", data1); // read value from /tmp
6967        munmap(data1, SHM_SIZE);
6968      }
6969      close(fd1);
6970    }
6971  } else { // fall back to envirable
6972    value = __kmp_env_get(name);
6973  }
6974#else
6975  value = __kmp_env_get(name);
6976#endif
6977
6978  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6979  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6980  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6981//  Ok, this is our variable. Delete it.
6982#if defined(KMP_USE_SHM)
6983    if (__kmp_shm_available) {
6984      shm_unlink(shm_name); // this removes file in /dev/shm
6985    } else if (__kmp_tmp_available) {
6986      unlink(temp_reg_status_file_name); // this removes the temp file
6987    } else {
6988      __kmp_env_unset(name);
6989    }
6990#else
6991    __kmp_env_unset(name);
6992#endif
6993  }
6994
6995#if defined(KMP_USE_SHM)
6996  if (shm_name)
6997    KMP_INTERNAL_FREE(shm_name);
6998  if (temp_reg_status_file_name)
6999    KMP_INTERNAL_FREE(temp_reg_status_file_name);
7000#endif
7001
7002  KMP_INTERNAL_FREE(__kmp_registration_str);
7003  KMP_INTERNAL_FREE(value);
7004  KMP_INTERNAL_FREE(name);
7005
7006  __kmp_registration_flag = 0;
7007  __kmp_registration_str = NULL;
7008
7009} // __kmp_unregister_library
7010
7011// End of Library registration stuff.
7012// -----------------------------------------------------------------------------
7013
7014#if KMP_MIC_SUPPORTED
7015
7016static void __kmp_check_mic_type() {
7017  kmp_cpuid_t cpuid_state = {0};
7018  kmp_cpuid_t *cs_p = &cpuid_state;
7019  __kmp_x86_cpuid(1, 0, cs_p);
7020  // We don't support mic1 at the moment
7021  if ((cs_p->eax & 0xff0) == 0xB10) {
7022    __kmp_mic_type = mic2;
7023  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
7024    __kmp_mic_type = mic3;
7025  } else {
7026    __kmp_mic_type = non_mic;
7027  }
7028}
7029
7030#endif /* KMP_MIC_SUPPORTED */
7031
7032#if KMP_HAVE_UMWAIT
7033static void __kmp_user_level_mwait_init() {
7034  struct kmp_cpuid buf;
7035  __kmp_x86_cpuid(7, 0, &buf);
7036  __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
7037  __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
7038  __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
7039  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
7040                __kmp_umwait_enabled));
7041}
7042#elif KMP_HAVE_MWAIT
7043#ifndef AT_INTELPHIUSERMWAIT
7044// Spurious, non-existent value that should always fail to return anything.
7045// Will be replaced with the correct value when we know that.
7046#define AT_INTELPHIUSERMWAIT 10000
7047#endif
7048// getauxval() function is available in RHEL7 and SLES12. If a system with an
7049// earlier OS is used to build the RTL, we'll use the following internal
7050// function when the entry is not found.
7051unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
7052unsigned long getauxval(unsigned long) { return 0; }
7053
7054static void __kmp_user_level_mwait_init() {
7055  // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
7056  // use them to find if the user-level mwait is enabled. Otherwise, forcibly
7057  // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
7058  // KMP_USER_LEVEL_MWAIT was set to TRUE.
7059  if (__kmp_mic_type == mic3) {
7060    unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
7061    if ((res & 0x1) || __kmp_user_level_mwait) {
7062      __kmp_mwait_enabled = TRUE;
7063      if (__kmp_user_level_mwait) {
7064        KMP_INFORM(EnvMwaitWarn);
7065      }
7066    } else {
7067      __kmp_mwait_enabled = FALSE;
7068    }
7069  }
7070  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
7071                "__kmp_mwait_enabled = %d\n",
7072                __kmp_mic_type, __kmp_mwait_enabled));
7073}
7074#endif /* KMP_HAVE_UMWAIT */
7075
7076static void __kmp_do_serial_initialize(void) {
7077  int i, gtid;
7078  size_t size;
7079
7080  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
7081
7082  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
7083  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
7084  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
7085  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
7086  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
7087
7088#if OMPT_SUPPORT
7089  ompt_pre_init();
7090#endif
7091#if OMPD_SUPPORT
7092  __kmp_env_dump();
7093  ompd_init();
7094#endif
7095
7096  __kmp_validate_locks();
7097
7098#if ENABLE_LIBOMPTARGET
7099  /* Initialize functions from libomptarget */
7100  __kmp_init_omptarget();
7101#endif
7102
7103  /* Initialize internal memory allocator */
7104  __kmp_init_allocator();
7105
7106  /* Register the library startup via an environment variable or via mapped
7107     shared memory file and check to see whether another copy of the library is
7108     already registered. Since forked child process is often terminated, we
7109     postpone the registration till middle initialization in the child */
7110  if (__kmp_need_register_serial)
7111    __kmp_register_library_startup();
7112
7113  /* TODO reinitialization of library */
7114  if (TCR_4(__kmp_global.g.g_done)) {
7115    KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
7116  }
7117
7118  __kmp_global.g.g_abort = 0;
7119  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
7120
7121/* initialize the locks */
7122#if KMP_USE_ADAPTIVE_LOCKS
7123#if KMP_DEBUG_ADAPTIVE_LOCKS
7124  __kmp_init_speculative_stats();
7125#endif
7126#endif
7127#if KMP_STATS_ENABLED
7128  __kmp_stats_init();
7129#endif
7130  __kmp_init_lock(&__kmp_global_lock);
7131  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
7132  __kmp_init_lock(&__kmp_debug_lock);
7133  __kmp_init_atomic_lock(&__kmp_atomic_lock);
7134  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
7135  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
7136  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
7137  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
7138  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
7139  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
7140  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
7141  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
7142  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
7143  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
7144  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
7145  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
7146  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7147  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
7148#if KMP_USE_MONITOR
7149  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7150#endif
7151  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7152
7153  /* conduct initialization and initial setup of configuration */
7154
7155  __kmp_runtime_initialize();
7156
7157#if KMP_MIC_SUPPORTED
7158  __kmp_check_mic_type();
7159#endif
7160
7161// Some global variable initialization moved here from kmp_env_initialize()
7162#ifdef KMP_DEBUG
7163  kmp_diag = 0;
7164#endif
7165  __kmp_abort_delay = 0;
7166
7167  // From __kmp_init_dflt_team_nth()
7168  /* assume the entire machine will be used */
7169  __kmp_dflt_team_nth_ub = __kmp_xproc;
7170  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7171    __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7172  }
7173  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7174    __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7175  }
7176  __kmp_max_nth = __kmp_sys_max_nth;
7177  __kmp_cg_max_nth = __kmp_sys_max_nth;
7178  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7179  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7180    __kmp_teams_max_nth = __kmp_sys_max_nth;
7181  }
7182
7183  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7184  // part
7185  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7186#if KMP_USE_MONITOR
7187  __kmp_monitor_wakeups =
7188      KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7189  __kmp_bt_intervals =
7190      KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7191#endif
7192  // From "KMP_LIBRARY" part of __kmp_env_initialize()
7193  __kmp_library = library_throughput;
7194  // From KMP_SCHEDULE initialization
7195  __kmp_static = kmp_sch_static_balanced;
7196// AC: do not use analytical here, because it is non-monotonous
7197//__kmp_guided = kmp_sch_guided_iterative_chunked;
7198//__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7199// need to repeat assignment
7200// Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
7201// bit control and barrier method control parts
7202#if KMP_FAST_REDUCTION_BARRIER
7203#define kmp_reduction_barrier_gather_bb ((int)1)
7204#define kmp_reduction_barrier_release_bb ((int)1)
7205#define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7206#define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7207#endif // KMP_FAST_REDUCTION_BARRIER
7208  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7209    __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7210    __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7211    __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7212    __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7213#if KMP_FAST_REDUCTION_BARRIER
7214    if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
7215      // lin_64 ): hyper,1
7216      __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7217      __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7218      __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7219      __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7220    }
7221#endif // KMP_FAST_REDUCTION_BARRIER
7222  }
7223#if KMP_FAST_REDUCTION_BARRIER
7224#undef kmp_reduction_barrier_release_pat
7225#undef kmp_reduction_barrier_gather_pat
7226#undef kmp_reduction_barrier_release_bb
7227#undef kmp_reduction_barrier_gather_bb
7228#endif // KMP_FAST_REDUCTION_BARRIER
7229#if KMP_MIC_SUPPORTED
7230  if (__kmp_mic_type == mic2) { // KNC
7231    // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7232    __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7233    __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7234        1; // forkjoin release
7235    __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7236    __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7237  }
7238#if KMP_FAST_REDUCTION_BARRIER
7239  if (__kmp_mic_type == mic2) { // KNC
7240    __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7241    __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7242  }
7243#endif // KMP_FAST_REDUCTION_BARRIER
7244#endif // KMP_MIC_SUPPORTED
7245
7246// From KMP_CHECKS initialization
7247#ifdef KMP_DEBUG
7248  __kmp_env_checks = TRUE; /* development versions have the extra checks */
7249#else
7250  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7251#endif
7252
7253  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7254  __kmp_foreign_tp = TRUE;
7255
7256  __kmp_global.g.g_dynamic = FALSE;
7257  __kmp_global.g.g_dynamic_mode = dynamic_default;
7258
7259  __kmp_init_nesting_mode();
7260
7261  __kmp_env_initialize(NULL);
7262
7263#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7264  __kmp_user_level_mwait_init();
7265#endif
7266// Print all messages in message catalog for testing purposes.
7267#ifdef KMP_DEBUG
7268  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7269  if (__kmp_str_match_true(val)) {
7270    kmp_str_buf_t buffer;
7271    __kmp_str_buf_init(&buffer);
7272    __kmp_i18n_dump_catalog(&buffer);
7273    __kmp_printf("%s", buffer.str);
7274    __kmp_str_buf_free(&buffer);
7275  }
7276  __kmp_env_free(&val);
7277#endif
7278
7279  __kmp_threads_capacity =
7280      __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7281  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7282  __kmp_tp_capacity = __kmp_default_tp_capacity(
7283      __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7284
7285  // If the library is shut down properly, both pools must be NULL. Just in
7286  // case, set them to NULL -- some memory may leak, but subsequent code will
7287  // work even if pools are not freed.
7288  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7289  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7290  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7291  __kmp_thread_pool = NULL;
7292  __kmp_thread_pool_insert_pt = NULL;
7293  __kmp_team_pool = NULL;
7294
7295  /* Allocate all of the variable sized records */
7296  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7297   * expandable */
7298  /* Since allocation is cache-aligned, just add extra padding at the end */
7299  size =
7300      (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7301      CACHE_LINE;
7302  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7303  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7304                               sizeof(kmp_info_t *) * __kmp_threads_capacity);
7305
7306  /* init thread counts */
7307  KMP_DEBUG_ASSERT(__kmp_all_nth ==
7308                   0); // Asserts fail if the library is reinitializing and
7309  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7310  __kmp_all_nth = 0;
7311  __kmp_nth = 0;
7312
7313  /* setup the uber master thread and hierarchy */
7314  gtid = __kmp_register_root(TRUE);
7315  KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
7316  KMP_ASSERT(KMP_UBER_GTID(gtid));
7317  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7318
7319  KMP_MB(); /* Flush all pending memory write invalidates.  */
7320
7321  __kmp_common_initialize();
7322
7323#if KMP_OS_UNIX
7324  /* invoke the child fork handler */
7325  __kmp_register_atfork();
7326#endif
7327
7328#if !KMP_DYNAMIC_LIB ||                                                        \
7329    ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN)
7330  {
7331    /* Invoke the exit handler when the program finishes, only for static
7332       library and macOS* dynamic. For other dynamic libraries, we already
7333       have _fini and DllMain. */
7334    int rc = atexit(__kmp_internal_end_atexit);
7335    if (rc != 0) {
7336      __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7337                  __kmp_msg_null);
7338    }
7339  }
7340#endif
7341
7342#if KMP_HANDLE_SIGNALS
7343#if KMP_OS_UNIX
7344  /* NOTE: make sure that this is called before the user installs their own
7345     signal handlers so that the user handlers are called first. this way they
7346     can return false, not call our handler, avoid terminating the library, and
7347     continue execution where they left off. */
7348  __kmp_install_signals(FALSE);
7349#endif /* KMP_OS_UNIX */
7350#if KMP_OS_WINDOWS
7351  __kmp_install_signals(TRUE);
7352#endif /* KMP_OS_WINDOWS */
7353#endif
7354
7355  /* we have finished the serial initialization */
7356  __kmp_init_counter++;
7357
7358  __kmp_init_serial = TRUE;
7359
7360  if (__kmp_version) {
7361    __kmp_print_version_1();
7362  }
7363
7364  if (__kmp_settings) {
7365    __kmp_env_print();
7366  }
7367
7368  if (__kmp_display_env || __kmp_display_env_verbose) {
7369    __kmp_env_print_2();
7370  }
7371
7372#if OMPT_SUPPORT
7373  ompt_post_init();
7374#endif
7375
7376  KMP_MB();
7377
7378  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7379}
7380
7381void __kmp_serial_initialize(void) {
7382  if (__kmp_init_serial) {
7383    return;
7384  }
7385  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7386  if (__kmp_init_serial) {
7387    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7388    return;
7389  }
7390  __kmp_do_serial_initialize();
7391  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7392}
7393
7394static void __kmp_do_middle_initialize(void) {
7395  int i, j;
7396  int prev_dflt_team_nth;
7397
7398  if (!__kmp_init_serial) {
7399    __kmp_do_serial_initialize();
7400  }
7401
7402  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7403
7404  if (UNLIKELY(!__kmp_need_register_serial)) {
7405    // We are in a forked child process. The registration was skipped during
7406    // serial initialization in __kmp_atfork_child handler. Do it here.
7407    __kmp_register_library_startup();
7408  }
7409
7410  // Save the previous value for the __kmp_dflt_team_nth so that
7411  // we can avoid some reinitialization if it hasn't changed.
7412  prev_dflt_team_nth = __kmp_dflt_team_nth;
7413
7414#if KMP_AFFINITY_SUPPORTED
7415  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7416  // number of cores on the machine.
7417  __kmp_affinity_initialize(__kmp_affinity);
7418
7419#endif /* KMP_AFFINITY_SUPPORTED */
7420
7421  KMP_ASSERT(__kmp_xproc > 0);
7422  if (__kmp_avail_proc == 0) {
7423    __kmp_avail_proc = __kmp_xproc;
7424  }
7425
7426  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7427  // correct them now
7428  j = 0;
7429  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7430    __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7431        __kmp_avail_proc;
7432    j++;
7433  }
7434
7435  if (__kmp_dflt_team_nth == 0) {
7436#ifdef KMP_DFLT_NTH_CORES
7437    // Default #threads = #cores
7438    __kmp_dflt_team_nth = __kmp_ncores;
7439    KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7440                  "__kmp_ncores (%d)\n",
7441                  __kmp_dflt_team_nth));
7442#else
7443    // Default #threads = #available OS procs
7444    __kmp_dflt_team_nth = __kmp_avail_proc;
7445    KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7446                  "__kmp_avail_proc(%d)\n",
7447                  __kmp_dflt_team_nth));
7448#endif /* KMP_DFLT_NTH_CORES */
7449  }
7450
7451  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7452    __kmp_dflt_team_nth = KMP_MIN_NTH;
7453  }
7454  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7455    __kmp_dflt_team_nth = __kmp_sys_max_nth;
7456  }
7457
7458  if (__kmp_nesting_mode > 0)
7459    __kmp_set_nesting_mode_threads();
7460
7461  // There's no harm in continuing if the following check fails,
7462  // but it indicates an error in the previous logic.
7463  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7464
7465  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7466    // Run through the __kmp_threads array and set the num threads icv for each
7467    // root thread that is currently registered with the RTL (which has not
7468    // already explicitly set its nthreads-var with a call to
7469    // omp_set_num_threads()).
7470    for (i = 0; i < __kmp_threads_capacity; i++) {
7471      kmp_info_t *thread = __kmp_threads[i];
7472      if (thread == NULL)
7473        continue;
7474      if (thread->th.th_current_task->td_icvs.nproc != 0)
7475        continue;
7476
7477      set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7478    }
7479  }
7480  KA_TRACE(
7481      20,
7482      ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7483       __kmp_dflt_team_nth));
7484
7485#ifdef KMP_ADJUST_BLOCKTIME
7486  /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
7487  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7488    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7489    if (__kmp_nth > __kmp_avail_proc) {
7490      __kmp_zero_bt = TRUE;
7491    }
7492  }
7493#endif /* KMP_ADJUST_BLOCKTIME */
7494
7495  /* we have finished middle initialization */
7496  TCW_SYNC_4(__kmp_init_middle, TRUE);
7497
7498  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7499}
7500
7501void __kmp_middle_initialize(void) {
7502  if (__kmp_init_middle) {
7503    return;
7504  }
7505  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7506  if (__kmp_init_middle) {
7507    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7508    return;
7509  }
7510  __kmp_do_middle_initialize();
7511  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7512}
7513
7514void __kmp_parallel_initialize(void) {
7515  int gtid = __kmp_entry_gtid(); // this might be a new root
7516
7517  /* synchronize parallel initialization (for sibling) */
7518  if (TCR_4(__kmp_init_parallel))
7519    return;
7520  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7521  if (TCR_4(__kmp_init_parallel)) {
7522    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7523    return;
7524  }
7525
7526  /* TODO reinitialization after we have already shut down */
7527  if (TCR_4(__kmp_global.g.g_done)) {
7528    KA_TRACE(
7529        10,
7530        ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7531    __kmp_infinite_loop();
7532  }
7533
7534  /* jc: The lock __kmp_initz_lock is already held, so calling
7535     __kmp_serial_initialize would cause a deadlock.  So we call
7536     __kmp_do_serial_initialize directly. */
7537  if (!__kmp_init_middle) {
7538    __kmp_do_middle_initialize();
7539  }
7540  __kmp_assign_root_init_mask();
7541  __kmp_resume_if_hard_paused();
7542
7543  /* begin initialization */
7544  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7545  KMP_ASSERT(KMP_UBER_GTID(gtid));
7546
7547#if KMP_ARCH_X86 || KMP_ARCH_X86_64
7548  // Save the FP control regs.
7549  // Worker threads will set theirs to these values at thread startup.
7550  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7551  __kmp_store_mxcsr(&__kmp_init_mxcsr);
7552  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7553#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7554
7555#if KMP_OS_UNIX
7556#if KMP_HANDLE_SIGNALS
7557  /*  must be after __kmp_serial_initialize  */
7558  __kmp_install_signals(TRUE);
7559#endif
7560#endif
7561
7562  __kmp_suspend_initialize();
7563
7564#if defined(USE_LOAD_BALANCE)
7565  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7566    __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7567  }
7568#else
7569  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7570    __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7571  }
7572#endif
7573
7574  if (__kmp_version) {
7575    __kmp_print_version_2();
7576  }
7577
7578  /* we have finished parallel initialization */
7579  TCW_SYNC_4(__kmp_init_parallel, TRUE);
7580
7581  KMP_MB();
7582  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7583
7584  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7585}
7586
7587void __kmp_hidden_helper_initialize() {
7588  if (TCR_4(__kmp_init_hidden_helper))
7589    return;
7590
7591  // __kmp_parallel_initialize is required before we initialize hidden helper
7592  if (!TCR_4(__kmp_init_parallel))
7593    __kmp_parallel_initialize();
7594
7595  // Double check. Note that this double check should not be placed before
7596  // __kmp_parallel_initialize as it will cause dead lock.
7597  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7598  if (TCR_4(__kmp_init_hidden_helper)) {
7599    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7600    return;
7601  }
7602
7603#if KMP_AFFINITY_SUPPORTED
7604  // Initialize hidden helper affinity settings.
7605  // The above __kmp_parallel_initialize() will initialize
7606  // regular affinity (and topology) if not already done.
7607  if (!__kmp_hh_affinity.flags.initialized)
7608    __kmp_affinity_initialize(__kmp_hh_affinity);
7609#endif
7610
7611  // Set the count of hidden helper tasks to be executed to zero
7612  KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7613
7614  // Set the global variable indicating that we're initializing hidden helper
7615  // team/threads
7616  TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7617
7618  // Platform independent initialization
7619  __kmp_do_initialize_hidden_helper_threads();
7620
7621  // Wait here for the finish of initialization of hidden helper teams
7622  __kmp_hidden_helper_threads_initz_wait();
7623
7624  // We have finished hidden helper initialization
7625  TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7626
7627  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7628}
7629
7630/* ------------------------------------------------------------------------ */
7631
7632void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7633                                   kmp_team_t *team) {
7634  kmp_disp_t *dispatch;
7635
7636  KMP_MB();
7637
7638  /* none of the threads have encountered any constructs, yet. */
7639  this_thr->th.th_local.this_construct = 0;
7640#if KMP_CACHE_MANAGE
7641  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7642#endif /* KMP_CACHE_MANAGE */
7643  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7644  KMP_DEBUG_ASSERT(dispatch);
7645  KMP_DEBUG_ASSERT(team->t.t_dispatch);
7646  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7647  // this_thr->th.th_info.ds.ds_tid ] );
7648
7649  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7650  dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7651  if (__kmp_env_consistency_check)
7652    __kmp_push_parallel(gtid, team->t.t_ident);
7653
7654  KMP_MB(); /* Flush all pending memory write invalidates.  */
7655}
7656
7657void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7658                                  kmp_team_t *team) {
7659  if (__kmp_env_consistency_check)
7660    __kmp_pop_parallel(gtid, team->t.t_ident);
7661
7662  __kmp_finish_implicit_task(this_thr);
7663}
7664
7665int __kmp_invoke_task_func(int gtid) {
7666  int rc;
7667  int tid = __kmp_tid_from_gtid(gtid);
7668  kmp_info_t *this_thr = __kmp_threads[gtid];
7669  kmp_team_t *team = this_thr->th.th_team;
7670
7671  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7672#if USE_ITT_BUILD
7673  if (__itt_stack_caller_create_ptr) {
7674    // inform ittnotify about entering user's code
7675    if (team->t.t_stack_id != NULL) {
7676      __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7677    } else {
7678      KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7679      __kmp_itt_stack_callee_enter(
7680          (__itt_caller)team->t.t_parent->t.t_stack_id);
7681    }
7682  }
7683#endif /* USE_ITT_BUILD */
7684#if INCLUDE_SSC_MARKS
7685  SSC_MARK_INVOKING();
7686#endif
7687
7688#if OMPT_SUPPORT
7689  void *dummy;
7690  void **exit_frame_p;
7691  ompt_data_t *my_task_data;
7692  ompt_data_t *my_parallel_data;
7693  int ompt_team_size;
7694
7695  if (ompt_enabled.enabled) {
7696    exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7697                         .ompt_task_info.frame.exit_frame.ptr);
7698  } else {
7699    exit_frame_p = &dummy;
7700  }
7701
7702  my_task_data =
7703      &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7704  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7705  if (ompt_enabled.ompt_callback_implicit_task) {
7706    ompt_team_size = team->t.t_nproc;
7707    ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7708        ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7709        __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7710    OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7711  }
7712#endif
7713
7714#if KMP_STATS_ENABLED
7715  stats_state_e previous_state = KMP_GET_THREAD_STATE();
7716  if (previous_state == stats_state_e::TEAMS_REGION) {
7717    KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7718  } else {
7719    KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7720  }
7721  KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7722#endif
7723
7724  rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7725                              tid, (int)team->t.t_argc, (void **)team->t.t_argv
7726#if OMPT_SUPPORT
7727                              ,
7728                              exit_frame_p
7729#endif
7730  );
7731#if OMPT_SUPPORT
7732  *exit_frame_p = NULL;
7733  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7734#endif
7735
7736#if KMP_STATS_ENABLED
7737  if (previous_state == stats_state_e::TEAMS_REGION) {
7738    KMP_SET_THREAD_STATE(previous_state);
7739  }
7740  KMP_POP_PARTITIONED_TIMER();
7741#endif
7742
7743#if USE_ITT_BUILD
7744  if (__itt_stack_caller_create_ptr) {
7745    // inform ittnotify about leaving user's code
7746    if (team->t.t_stack_id != NULL) {
7747      __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7748    } else {
7749      KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7750      __kmp_itt_stack_callee_leave(
7751          (__itt_caller)team->t.t_parent->t.t_stack_id);
7752    }
7753  }
7754#endif /* USE_ITT_BUILD */
7755  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7756
7757  return rc;
7758}
7759
7760void __kmp_teams_master(int gtid) {
7761  // This routine is called by all primary threads in teams construct
7762  kmp_info_t *thr = __kmp_threads[gtid];
7763  kmp_team_t *team = thr->th.th_team;
7764  ident_t *loc = team->t.t_ident;
7765  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7766  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7767  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7768  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7769                __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7770
7771  // This thread is a new CG root.  Set up the proper variables.
7772  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7773  tmp->cg_root = thr; // Make thr the CG root
7774  // Init to thread limit stored when league primary threads were forked
7775  tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7776  tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7777  KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7778                 " cg_nthreads to 1\n",
7779                 thr, tmp));
7780  tmp->up = thr->th.th_cg_roots;
7781  thr->th.th_cg_roots = tmp;
7782
7783// Launch league of teams now, but not let workers execute
7784// (they hang on fork barrier until next parallel)
7785#if INCLUDE_SSC_MARKS
7786  SSC_MARK_FORKING();
7787#endif
7788  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7789                  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7790                  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7791#if INCLUDE_SSC_MARKS
7792  SSC_MARK_JOINING();
7793#endif
7794  // If the team size was reduced from the limit, set it to the new size
7795  if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7796    thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7797  // AC: last parameter "1" eliminates join barrier which won't work because
7798  // worker threads are in a fork barrier waiting for more parallel regions
7799  __kmp_join_call(loc, gtid
7800#if OMPT_SUPPORT
7801                  ,
7802                  fork_context_intel
7803#endif
7804                  ,
7805                  1);
7806}
7807
7808int __kmp_invoke_teams_master(int gtid) {
7809  kmp_info_t *this_thr = __kmp_threads[gtid];
7810  kmp_team_t *team = this_thr->th.th_team;
7811#if KMP_DEBUG
7812  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7813    KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7814                     (void *)__kmp_teams_master);
7815#endif
7816  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7817#if OMPT_SUPPORT
7818  int tid = __kmp_tid_from_gtid(gtid);
7819  ompt_data_t *task_data =
7820      &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7821  ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7822  if (ompt_enabled.ompt_callback_implicit_task) {
7823    ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7824        ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7825        ompt_task_initial);
7826    OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7827  }
7828#endif
7829  __kmp_teams_master(gtid);
7830#if OMPT_SUPPORT
7831  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7832#endif
7833  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7834  return 1;
7835}
7836
7837/* this sets the requested number of threads for the next parallel region
7838   encountered by this team. since this should be enclosed in the forkjoin
7839   critical section it should avoid race conditions with asymmetrical nested
7840   parallelism */
7841
7842void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7843  kmp_info_t *thr = __kmp_threads[gtid];
7844
7845  if (num_threads > 0)
7846    thr->th.th_set_nproc = num_threads;
7847}
7848
7849static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7850                                    int num_threads) {
7851  KMP_DEBUG_ASSERT(thr);
7852  // Remember the number of threads for inner parallel regions
7853  if (!TCR_4(__kmp_init_middle))
7854    __kmp_middle_initialize(); // get internal globals calculated
7855  __kmp_assign_root_init_mask();
7856  KMP_DEBUG_ASSERT(__kmp_avail_proc);
7857  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7858
7859  if (num_threads == 0) {
7860    if (__kmp_teams_thread_limit > 0) {
7861      num_threads = __kmp_teams_thread_limit;
7862    } else {
7863      num_threads = __kmp_avail_proc / num_teams;
7864    }
7865    // adjust num_threads w/o warning as it is not user setting
7866    // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7867    // no thread_limit clause specified -  do not change thread-limit-var ICV
7868    if (num_threads > __kmp_dflt_team_nth) {
7869      num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7870    }
7871    if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7872      num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7873    } // prevent team size to exceed thread-limit-var
7874    if (num_teams * num_threads > __kmp_teams_max_nth) {
7875      num_threads = __kmp_teams_max_nth / num_teams;
7876    }
7877    if (num_threads == 0) {
7878      num_threads = 1;
7879    }
7880  } else {
7881    if (num_threads < 0) {
7882      __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7883                __kmp_msg_null);
7884      num_threads = 1;
7885    }
7886    // This thread will be the primary thread of the league primary threads
7887    // Store new thread limit; old limit is saved in th_cg_roots list
7888    thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7889    // num_threads = min(num_threads, nthreads-var)
7890    if (num_threads > __kmp_dflt_team_nth) {
7891      num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7892    }
7893    if (num_teams * num_threads > __kmp_teams_max_nth) {
7894      int new_threads = __kmp_teams_max_nth / num_teams;
7895      if (new_threads == 0) {
7896        new_threads = 1;
7897      }
7898      if (new_threads != num_threads) {
7899        if (!__kmp_reserve_warn) { // user asked for too many threads
7900          __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7901          __kmp_msg(kmp_ms_warning,
7902                    KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7903                    KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7904        }
7905      }
7906      num_threads = new_threads;
7907    }
7908  }
7909  thr->th.th_teams_size.nth = num_threads;
7910}
7911
7912/* this sets the requested number of teams for the teams region and/or
7913   the number of threads for the next parallel region encountered  */
7914void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7915                          int num_threads) {
7916  kmp_info_t *thr = __kmp_threads[gtid];
7917  if (num_teams < 0) {
7918    // OpenMP specification requires requested values to be positive,
7919    // but people can send us any value, so we'd better check
7920    __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7921              __kmp_msg_null);
7922    num_teams = 1;
7923  }
7924  if (num_teams == 0) {
7925    if (__kmp_nteams > 0) {
7926      num_teams = __kmp_nteams;
7927    } else {
7928      num_teams = 1; // default number of teams is 1.
7929    }
7930  }
7931  if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7932    if (!__kmp_reserve_warn) {
7933      __kmp_reserve_warn = 1;
7934      __kmp_msg(kmp_ms_warning,
7935                KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7936                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7937    }
7938    num_teams = __kmp_teams_max_nth;
7939  }
7940  // Set number of teams (number of threads in the outer "parallel" of the
7941  // teams)
7942  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7943
7944  __kmp_push_thread_limit(thr, num_teams, num_threads);
7945}
7946
7947/* This sets the requested number of teams for the teams region and/or
7948   the number of threads for the next parallel region encountered  */
7949void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7950                             int num_teams_ub, int num_threads) {
7951  kmp_info_t *thr = __kmp_threads[gtid];
7952  KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7953  KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7954  KMP_DEBUG_ASSERT(num_threads >= 0);
7955
7956  if (num_teams_lb > num_teams_ub) {
7957    __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7958                KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7959  }
7960
7961  int num_teams = 1; // defalt number of teams is 1.
7962
7963  if (num_teams_lb == 0 && num_teams_ub > 0)
7964    num_teams_lb = num_teams_ub;
7965
7966  if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7967    num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7968    if (num_teams > __kmp_teams_max_nth) {
7969      if (!__kmp_reserve_warn) {
7970        __kmp_reserve_warn = 1;
7971        __kmp_msg(kmp_ms_warning,
7972                  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7973                  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7974      }
7975      num_teams = __kmp_teams_max_nth;
7976    }
7977  } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7978    num_teams = num_teams_ub;
7979  } else { // num_teams_lb <= num_teams <= num_teams_ub
7980    if (num_threads <= 0) {
7981      if (num_teams_ub > __kmp_teams_max_nth) {
7982        num_teams = num_teams_lb;
7983      } else {
7984        num_teams = num_teams_ub;
7985      }
7986    } else {
7987      num_teams = (num_threads > __kmp_teams_max_nth)
7988                      ? num_teams
7989                      : __kmp_teams_max_nth / num_threads;
7990      if (num_teams < num_teams_lb) {
7991        num_teams = num_teams_lb;
7992      } else if (num_teams > num_teams_ub) {
7993        num_teams = num_teams_ub;
7994      }
7995    }
7996  }
7997  // Set number of teams (number of threads in the outer "parallel" of the
7998  // teams)
7999  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
8000
8001  __kmp_push_thread_limit(thr, num_teams, num_threads);
8002}
8003
8004// Set the proc_bind var to use in the following parallel region.
8005void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
8006  kmp_info_t *thr = __kmp_threads[gtid];
8007  thr->th.th_set_proc_bind = proc_bind;
8008}
8009
8010/* Launch the worker threads into the microtask. */
8011
8012void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
8013  kmp_info_t *this_thr = __kmp_threads[gtid];
8014
8015#ifdef KMP_DEBUG
8016  int f;
8017#endif /* KMP_DEBUG */
8018
8019  KMP_DEBUG_ASSERT(team);
8020  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8021  KMP_ASSERT(KMP_MASTER_GTID(gtid));
8022  KMP_MB(); /* Flush all pending memory write invalidates.  */
8023
8024  team->t.t_construct = 0; /* no single directives seen yet */
8025  team->t.t_ordered.dt.t_value =
8026      0; /* thread 0 enters the ordered section first */
8027
8028  /* Reset the identifiers on the dispatch buffer */
8029  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
8030  if (team->t.t_max_nproc > 1) {
8031    int i;
8032    for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
8033      team->t.t_disp_buffer[i].buffer_index = i;
8034      team->t.t_disp_buffer[i].doacross_buf_idx = i;
8035    }
8036  } else {
8037    team->t.t_disp_buffer[0].buffer_index = 0;
8038    team->t.t_disp_buffer[0].doacross_buf_idx = 0;
8039  }
8040
8041  KMP_MB(); /* Flush all pending memory write invalidates.  */
8042  KMP_ASSERT(this_thr->th.th_team == team);
8043
8044#ifdef KMP_DEBUG
8045  for (f = 0; f < team->t.t_nproc; f++) {
8046    KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
8047                     team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
8048  }
8049#endif /* KMP_DEBUG */
8050
8051  /* release the worker threads so they may begin working */
8052  __kmp_fork_barrier(gtid, 0);
8053}
8054
8055void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
8056  kmp_info_t *this_thr = __kmp_threads[gtid];
8057
8058  KMP_DEBUG_ASSERT(team);
8059  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8060  KMP_ASSERT(KMP_MASTER_GTID(gtid));
8061  KMP_MB(); /* Flush all pending memory write invalidates.  */
8062
8063  /* Join barrier after fork */
8064
8065#ifdef KMP_DEBUG
8066  if (__kmp_threads[gtid] &&
8067      __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
8068    __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
8069                 __kmp_threads[gtid]);
8070    __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
8071                 "team->t.t_nproc=%d\n",
8072                 gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
8073                 team->t.t_nproc);
8074    __kmp_print_structure();
8075  }
8076  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
8077                   __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
8078#endif /* KMP_DEBUG */
8079
8080  __kmp_join_barrier(gtid); /* wait for everyone */
8081#if OMPT_SUPPORT
8082  if (ompt_enabled.enabled &&
8083      this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
8084    int ds_tid = this_thr->th.th_info.ds.ds_tid;
8085    ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
8086    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
8087#if OMPT_OPTIONAL
8088    void *codeptr = NULL;
8089    if (KMP_MASTER_TID(ds_tid) &&
8090        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
8091         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
8092      codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
8093
8094    if (ompt_enabled.ompt_callback_sync_region_wait) {
8095      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
8096          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
8097          codeptr);
8098    }
8099    if (ompt_enabled.ompt_callback_sync_region) {
8100      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
8101          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
8102          codeptr);
8103    }
8104#endif
8105    if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
8106      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
8107          ompt_scope_end, NULL, task_data, 0, ds_tid,
8108          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
8109    }
8110  }
8111#endif
8112
8113  KMP_MB(); /* Flush all pending memory write invalidates.  */
8114  KMP_ASSERT(this_thr->th.th_team == team);
8115}
8116
8117/* ------------------------------------------------------------------------ */
8118
8119#ifdef USE_LOAD_BALANCE
8120
8121// Return the worker threads actively spinning in the hot team, if we
8122// are at the outermost level of parallelism.  Otherwise, return 0.
8123static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
8124  int i;
8125  int retval;
8126  kmp_team_t *hot_team;
8127
8128  if (root->r.r_active) {
8129    return 0;
8130  }
8131  hot_team = root->r.r_hot_team;
8132  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
8133    return hot_team->t.t_nproc - 1; // Don't count primary thread
8134  }
8135
8136  // Skip the primary thread - it is accounted for elsewhere.
8137  retval = 0;
8138  for (i = 1; i < hot_team->t.t_nproc; i++) {
8139    if (hot_team->t.t_threads[i]->th.th_active) {
8140      retval++;
8141    }
8142  }
8143  return retval;
8144}
8145
8146// Perform an automatic adjustment to the number of
8147// threads used by the next parallel region.
8148static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
8149  int retval;
8150  int pool_active;
8151  int hot_team_active;
8152  int team_curr_active;
8153  int system_active;
8154
8155  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
8156                set_nproc));
8157  KMP_DEBUG_ASSERT(root);
8158  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
8159                       ->th.th_current_task->td_icvs.dynamic == TRUE);
8160  KMP_DEBUG_ASSERT(set_nproc > 1);
8161
8162  if (set_nproc == 1) {
8163    KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8164    return 1;
8165  }
8166
8167  // Threads that are active in the thread pool, active in the hot team for this
8168  // particular root (if we are at the outer par level), and the currently
8169  // executing thread (to become the primary thread) are available to add to the
8170  // new team, but are currently contributing to the system load, and must be
8171  // accounted for.
8172  pool_active = __kmp_thread_pool_active_nth;
8173  hot_team_active = __kmp_active_hot_team_nproc(root);
8174  team_curr_active = pool_active + hot_team_active + 1;
8175
8176  // Check the system load.
8177  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8178  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8179                "hot team active = %d\n",
8180                system_active, pool_active, hot_team_active));
8181
8182  if (system_active < 0) {
8183    // There was an error reading the necessary info from /proc, so use the
8184    // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8185    // = dynamic_thread_limit, we shouldn't wind up getting back here.
8186    __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8187    KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8188
8189    // Make this call behave like the thread limit algorithm.
8190    retval = __kmp_avail_proc - __kmp_nth +
8191             (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8192    if (retval > set_nproc) {
8193      retval = set_nproc;
8194    }
8195    if (retval < KMP_MIN_NTH) {
8196      retval = KMP_MIN_NTH;
8197    }
8198
8199    KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8200                  retval));
8201    return retval;
8202  }
8203
8204  // There is a slight delay in the load balance algorithm in detecting new
8205  // running procs. The real system load at this instant should be at least as
8206  // large as the #active omp thread that are available to add to the team.
8207  if (system_active < team_curr_active) {
8208    system_active = team_curr_active;
8209  }
8210  retval = __kmp_avail_proc - system_active + team_curr_active;
8211  if (retval > set_nproc) {
8212    retval = set_nproc;
8213  }
8214  if (retval < KMP_MIN_NTH) {
8215    retval = KMP_MIN_NTH;
8216  }
8217
8218  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8219  return retval;
8220} // __kmp_load_balance_nproc()
8221
8222#endif /* USE_LOAD_BALANCE */
8223
8224/* ------------------------------------------------------------------------ */
8225
8226/* NOTE: this is called with the __kmp_init_lock held */
8227void __kmp_cleanup(void) {
8228  int f;
8229
8230  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8231
8232  if (TCR_4(__kmp_init_parallel)) {
8233#if KMP_HANDLE_SIGNALS
8234    __kmp_remove_signals();
8235#endif
8236    TCW_4(__kmp_init_parallel, FALSE);
8237  }
8238
8239  if (TCR_4(__kmp_init_middle)) {
8240#if KMP_AFFINITY_SUPPORTED
8241    __kmp_affinity_uninitialize();
8242#endif /* KMP_AFFINITY_SUPPORTED */
8243    __kmp_cleanup_hierarchy();
8244    TCW_4(__kmp_init_middle, FALSE);
8245  }
8246
8247  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8248
8249  if (__kmp_init_serial) {
8250    __kmp_runtime_destroy();
8251    __kmp_init_serial = FALSE;
8252  }
8253
8254  __kmp_cleanup_threadprivate_caches();
8255
8256  for (f = 0; f < __kmp_threads_capacity; f++) {
8257    if (__kmp_root[f] != NULL) {
8258      __kmp_free(__kmp_root[f]);
8259      __kmp_root[f] = NULL;
8260    }
8261  }
8262  __kmp_free(__kmp_threads);
8263  // __kmp_threads and __kmp_root were allocated at once, as single block, so
8264  // there is no need in freeing __kmp_root.
8265  __kmp_threads = NULL;
8266  __kmp_root = NULL;
8267  __kmp_threads_capacity = 0;
8268
8269  // Free old __kmp_threads arrays if they exist.
8270  kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
8271  while (ptr) {
8272    kmp_old_threads_list_t *next = ptr->next;
8273    __kmp_free(ptr->threads);
8274    __kmp_free(ptr);
8275    ptr = next;
8276  }
8277
8278#if KMP_USE_DYNAMIC_LOCK
8279  __kmp_cleanup_indirect_user_locks();
8280#else
8281  __kmp_cleanup_user_locks();
8282#endif
8283#if OMPD_SUPPORT
8284  if (ompd_state) {
8285    __kmp_free(ompd_env_block);
8286    ompd_env_block = NULL;
8287    ompd_env_block_size = 0;
8288  }
8289#endif
8290
8291#if KMP_AFFINITY_SUPPORTED
8292  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8293  __kmp_cpuinfo_file = NULL;
8294#endif /* KMP_AFFINITY_SUPPORTED */
8295
8296#if KMP_USE_ADAPTIVE_LOCKS
8297#if KMP_DEBUG_ADAPTIVE_LOCKS
8298  __kmp_print_speculative_stats();
8299#endif
8300#endif
8301  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8302  __kmp_nested_nth.nth = NULL;
8303  __kmp_nested_nth.size = 0;
8304  __kmp_nested_nth.used = 0;
8305  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8306  __kmp_nested_proc_bind.bind_types = NULL;
8307  __kmp_nested_proc_bind.size = 0;
8308  __kmp_nested_proc_bind.used = 0;
8309  if (__kmp_affinity_format) {
8310    KMP_INTERNAL_FREE(__kmp_affinity_format);
8311    __kmp_affinity_format = NULL;
8312  }
8313
8314  __kmp_i18n_catclose();
8315
8316#if KMP_USE_HIER_SCHED
8317  __kmp_hier_scheds.deallocate();
8318#endif
8319
8320#if KMP_STATS_ENABLED
8321  __kmp_stats_fini();
8322#endif
8323
8324  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8325}
8326
8327/* ------------------------------------------------------------------------ */
8328
8329int __kmp_ignore_mppbeg(void) {
8330  char *env;
8331
8332  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8333    if (__kmp_str_match_false(env))
8334      return FALSE;
8335  }
8336  // By default __kmpc_begin() is no-op.
8337  return TRUE;
8338}
8339
8340int __kmp_ignore_mppend(void) {
8341  char *env;
8342
8343  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8344    if (__kmp_str_match_false(env))
8345      return FALSE;
8346  }
8347  // By default __kmpc_end() is no-op.
8348  return TRUE;
8349}
8350
8351void __kmp_internal_begin(void) {
8352  int gtid;
8353  kmp_root_t *root;
8354
8355  /* this is a very important step as it will register new sibling threads
8356     and assign these new uber threads a new gtid */
8357  gtid = __kmp_entry_gtid();
8358  root = __kmp_threads[gtid]->th.th_root;
8359  KMP_ASSERT(KMP_UBER_GTID(gtid));
8360
8361  if (root->r.r_begin)
8362    return;
8363  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8364  if (root->r.r_begin) {
8365    __kmp_release_lock(&root->r.r_begin_lock, gtid);
8366    return;
8367  }
8368
8369  root->r.r_begin = TRUE;
8370
8371  __kmp_release_lock(&root->r.r_begin_lock, gtid);
8372}
8373
8374/* ------------------------------------------------------------------------ */
8375
8376void __kmp_user_set_library(enum library_type arg) {
8377  int gtid;
8378  kmp_root_t *root;
8379  kmp_info_t *thread;
8380
8381  /* first, make sure we are initialized so we can get our gtid */
8382
8383  gtid = __kmp_entry_gtid();
8384  thread = __kmp_threads[gtid];
8385
8386  root = thread->th.th_root;
8387
8388  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8389                library_serial));
8390  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8391                                  thread */
8392    KMP_WARNING(SetLibraryIncorrectCall);
8393    return;
8394  }
8395
8396  switch (arg) {
8397  case library_serial:
8398    thread->th.th_set_nproc = 0;
8399    set__nproc(thread, 1);
8400    break;
8401  case library_turnaround:
8402    thread->th.th_set_nproc = 0;
8403    set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8404                                           : __kmp_dflt_team_nth_ub);
8405    break;
8406  case library_throughput:
8407    thread->th.th_set_nproc = 0;
8408    set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8409                                           : __kmp_dflt_team_nth_ub);
8410    break;
8411  default:
8412    KMP_FATAL(UnknownLibraryType, arg);
8413  }
8414
8415  __kmp_aux_set_library(arg);
8416}
8417
8418void __kmp_aux_set_stacksize(size_t arg) {
8419  if (!__kmp_init_serial)
8420    __kmp_serial_initialize();
8421
8422#if KMP_OS_DARWIN
8423  if (arg & (0x1000 - 1)) {
8424    arg &= ~(0x1000 - 1);
8425    if (arg + 0x1000) /* check for overflow if we round up */
8426      arg += 0x1000;
8427  }
8428#endif
8429  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8430
8431  /* only change the default stacksize before the first parallel region */
8432  if (!TCR_4(__kmp_init_parallel)) {
8433    size_t value = arg; /* argument is in bytes */
8434
8435    if (value < __kmp_sys_min_stksize)
8436      value = __kmp_sys_min_stksize;
8437    else if (value > KMP_MAX_STKSIZE)
8438      value = KMP_MAX_STKSIZE;
8439
8440    __kmp_stksize = value;
8441
8442    __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8443  }
8444
8445  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8446}
8447
8448/* set the behaviour of the runtime library */
8449/* TODO this can cause some odd behaviour with sibling parallelism... */
8450void __kmp_aux_set_library(enum library_type arg) {
8451  __kmp_library = arg;
8452
8453  switch (__kmp_library) {
8454  case library_serial: {
8455    KMP_INFORM(LibraryIsSerial);
8456  } break;
8457  case library_turnaround:
8458    if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8459      __kmp_use_yield = 2; // only yield when oversubscribed
8460    break;
8461  case library_throughput:
8462    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8463      __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8464    break;
8465  default:
8466    KMP_FATAL(UnknownLibraryType, arg);
8467  }
8468}
8469
8470/* Getting team information common for all team API */
8471// Returns NULL if not in teams construct
8472static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8473  kmp_info_t *thr = __kmp_entry_thread();
8474  teams_serialized = 0;
8475  if (thr->th.th_teams_microtask) {
8476    kmp_team_t *team = thr->th.th_team;
8477    int tlevel = thr->th.th_teams_level; // the level of the teams construct
8478    int ii = team->t.t_level;
8479    teams_serialized = team->t.t_serialized;
8480    int level = tlevel + 1;
8481    KMP_DEBUG_ASSERT(ii >= tlevel);
8482    while (ii > level) {
8483      for (teams_serialized = team->t.t_serialized;
8484           (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8485      }
8486      if (team->t.t_serialized && (!teams_serialized)) {
8487        team = team->t.t_parent;
8488        continue;
8489      }
8490      if (ii > level) {
8491        team = team->t.t_parent;
8492        ii--;
8493      }
8494    }
8495    return team;
8496  }
8497  return NULL;
8498}
8499
8500int __kmp_aux_get_team_num() {
8501  int serialized;
8502  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8503  if (team) {
8504    if (serialized > 1) {
8505      return 0; // teams region is serialized ( 1 team of 1 thread ).
8506    } else {
8507      return team->t.t_master_tid;
8508    }
8509  }
8510  return 0;
8511}
8512
8513int __kmp_aux_get_num_teams() {
8514  int serialized;
8515  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8516  if (team) {
8517    if (serialized > 1) {
8518      return 1;
8519    } else {
8520      return team->t.t_parent->t.t_nproc;
8521    }
8522  }
8523  return 1;
8524}
8525
8526/* ------------------------------------------------------------------------ */
8527
8528/*
8529 * Affinity Format Parser
8530 *
8531 * Field is in form of: %[[[0].]size]type
8532 * % and type are required (%% means print a literal '%')
8533 * type is either single char or long name surrounded by {},
8534 * e.g., N or {num_threads}
8535 * 0 => leading zeros
8536 * . => right justified when size is specified
8537 * by default output is left justified
8538 * size is the *minimum* field length
8539 * All other characters are printed as is
8540 *
8541 * Available field types:
8542 * L {thread_level}      - omp_get_level()
8543 * n {thread_num}        - omp_get_thread_num()
8544 * h {host}              - name of host machine
8545 * P {process_id}        - process id (integer)
8546 * T {thread_identifier} - native thread identifier (integer)
8547 * N {num_threads}       - omp_get_num_threads()
8548 * A {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
8549 * a {thread_affinity}   - comma separated list of integers or integer ranges
8550 *                         (values of affinity mask)
8551 *
8552 * Implementation-specific field types can be added
8553 * If a type is unknown, print "undefined"
8554 */
8555
8556// Structure holding the short name, long name, and corresponding data type
8557// for snprintf.  A table of these will represent the entire valid keyword
8558// field types.
8559typedef struct kmp_affinity_format_field_t {
8560  char short_name; // from spec e.g., L -> thread level
8561  const char *long_name; // from spec thread_level -> thread level
8562  char field_format; // data type for snprintf (typically 'd' or 's'
8563  // for integer or string)
8564} kmp_affinity_format_field_t;
8565
8566static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8567#if KMP_AFFINITY_SUPPORTED
8568    {'A', "thread_affinity", 's'},
8569#endif
8570    {'t', "team_num", 'd'},
8571    {'T', "num_teams", 'd'},
8572    {'L', "nesting_level", 'd'},
8573    {'n', "thread_num", 'd'},
8574    {'N', "num_threads", 'd'},
8575    {'a', "ancestor_tnum", 'd'},
8576    {'H', "host", 's'},
8577    {'P', "process_id", 'd'},
8578    {'i', "native_thread_id", 'd'}};
8579
8580// Return the number of characters it takes to hold field
8581static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8582                                            const char **ptr,
8583                                            kmp_str_buf_t *field_buffer) {
8584  int rc, format_index, field_value;
8585  const char *width_left, *width_right;
8586  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8587  static const int FORMAT_SIZE = 20;
8588  char format[FORMAT_SIZE] = {0};
8589  char absolute_short_name = 0;
8590
8591  KMP_DEBUG_ASSERT(gtid >= 0);
8592  KMP_DEBUG_ASSERT(th);
8593  KMP_DEBUG_ASSERT(**ptr == '%');
8594  KMP_DEBUG_ASSERT(field_buffer);
8595
8596  __kmp_str_buf_clear(field_buffer);
8597
8598  // Skip the initial %
8599  (*ptr)++;
8600
8601  // Check for %% first
8602  if (**ptr == '%') {
8603    __kmp_str_buf_cat(field_buffer, "%", 1);
8604    (*ptr)++; // skip over the second %
8605    return 1;
8606  }
8607
8608  // Parse field modifiers if they are present
8609  pad_zeros = false;
8610  if (**ptr == '0') {
8611    pad_zeros = true;
8612    (*ptr)++; // skip over 0
8613  }
8614  right_justify = false;
8615  if (**ptr == '.') {
8616    right_justify = true;
8617    (*ptr)++; // skip over .
8618  }
8619  // Parse width of field: [width_left, width_right)
8620  width_left = width_right = NULL;
8621  if (**ptr >= '0' && **ptr <= '9') {
8622    width_left = *ptr;
8623    SKIP_DIGITS(*ptr);
8624    width_right = *ptr;
8625  }
8626
8627  // Create the format for KMP_SNPRINTF based on flags parsed above
8628  format_index = 0;
8629  format[format_index++] = '%';
8630  if (!right_justify)
8631    format[format_index++] = '-';
8632  if (pad_zeros)
8633    format[format_index++] = '0';
8634  if (width_left && width_right) {
8635    int i = 0;
8636    // Only allow 8 digit number widths.
8637    // This also prevents overflowing format variable
8638    while (i < 8 && width_left < width_right) {
8639      format[format_index++] = *width_left;
8640      width_left++;
8641      i++;
8642    }
8643  }
8644
8645  // Parse a name (long or short)
8646  // Canonicalize the name into absolute_short_name
8647  found_valid_name = false;
8648  parse_long_name = (**ptr == '{');
8649  if (parse_long_name)
8650    (*ptr)++; // skip initial left brace
8651  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8652                             sizeof(__kmp_affinity_format_table[0]);
8653       ++i) {
8654    char short_name = __kmp_affinity_format_table[i].short_name;
8655    const char *long_name = __kmp_affinity_format_table[i].long_name;
8656    char field_format = __kmp_affinity_format_table[i].field_format;
8657    if (parse_long_name) {
8658      size_t length = KMP_STRLEN(long_name);
8659      if (strncmp(*ptr, long_name, length) == 0) {
8660        found_valid_name = true;
8661        (*ptr) += length; // skip the long name
8662      }
8663    } else if (**ptr == short_name) {
8664      found_valid_name = true;
8665      (*ptr)++; // skip the short name
8666    }
8667    if (found_valid_name) {
8668      format[format_index++] = field_format;
8669      format[format_index++] = '\0';
8670      absolute_short_name = short_name;
8671      break;
8672    }
8673  }
8674  if (parse_long_name) {
8675    if (**ptr != '}') {
8676      absolute_short_name = 0;
8677    } else {
8678      (*ptr)++; // skip over the right brace
8679    }
8680  }
8681
8682  // Attempt to fill the buffer with the requested
8683  // value using snprintf within __kmp_str_buf_print()
8684  switch (absolute_short_name) {
8685  case 't':
8686    rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8687    break;
8688  case 'T':
8689    rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8690    break;
8691  case 'L':
8692    rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8693    break;
8694  case 'n':
8695    rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8696    break;
8697  case 'H': {
8698    static const int BUFFER_SIZE = 256;
8699    char buf[BUFFER_SIZE];
8700    __kmp_expand_host_name(buf, BUFFER_SIZE);
8701    rc = __kmp_str_buf_print(field_buffer, format, buf);
8702  } break;
8703  case 'P':
8704    rc = __kmp_str_buf_print(field_buffer, format, getpid());
8705    break;
8706  case 'i':
8707    rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8708    break;
8709  case 'N':
8710    rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8711    break;
8712  case 'a':
8713    field_value =
8714        __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8715    rc = __kmp_str_buf_print(field_buffer, format, field_value);
8716    break;
8717#if KMP_AFFINITY_SUPPORTED
8718  case 'A': {
8719    kmp_str_buf_t buf;
8720    __kmp_str_buf_init(&buf);
8721    __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8722    rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8723    __kmp_str_buf_free(&buf);
8724  } break;
8725#endif
8726  default:
8727    // According to spec, If an implementation does not have info for field
8728    // type, then "undefined" is printed
8729    rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8730    // Skip the field
8731    if (parse_long_name) {
8732      SKIP_TOKEN(*ptr);
8733      if (**ptr == '}')
8734        (*ptr)++;
8735    } else {
8736      (*ptr)++;
8737    }
8738  }
8739
8740  KMP_ASSERT(format_index <= FORMAT_SIZE);
8741  return rc;
8742}
8743
8744/*
8745 * Return number of characters needed to hold the affinity string
8746 * (not including null byte character)
8747 * The resultant string is printed to buffer, which the caller can then
8748 * handle afterwards
8749 */
8750size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8751                                  kmp_str_buf_t *buffer) {
8752  const char *parse_ptr;
8753  size_t retval;
8754  const kmp_info_t *th;
8755  kmp_str_buf_t field;
8756
8757  KMP_DEBUG_ASSERT(buffer);
8758  KMP_DEBUG_ASSERT(gtid >= 0);
8759
8760  __kmp_str_buf_init(&field);
8761  __kmp_str_buf_clear(buffer);
8762
8763  th = __kmp_threads[gtid];
8764  retval = 0;
8765
8766  // If format is NULL or zero-length string, then we use
8767  // affinity-format-var ICV
8768  parse_ptr = format;
8769  if (parse_ptr == NULL || *parse_ptr == '\0') {
8770    parse_ptr = __kmp_affinity_format;
8771  }
8772  KMP_DEBUG_ASSERT(parse_ptr);
8773
8774  while (*parse_ptr != '\0') {
8775    // Parse a field
8776    if (*parse_ptr == '%') {
8777      // Put field in the buffer
8778      int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8779      __kmp_str_buf_catbuf(buffer, &field);
8780      retval += rc;
8781    } else {
8782      // Put literal character in buffer
8783      __kmp_str_buf_cat(buffer, parse_ptr, 1);
8784      retval++;
8785      parse_ptr++;
8786    }
8787  }
8788  __kmp_str_buf_free(&field);
8789  return retval;
8790}
8791
8792// Displays the affinity string to stdout
8793void __kmp_aux_display_affinity(int gtid, const char *format) {
8794  kmp_str_buf_t buf;
8795  __kmp_str_buf_init(&buf);
8796  __kmp_aux_capture_affinity(gtid, format, &buf);
8797  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8798  __kmp_str_buf_free(&buf);
8799}
8800
8801/* ------------------------------------------------------------------------ */
8802void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8803  int blocktime = arg; /* argument is in microseconds */
8804#if KMP_USE_MONITOR
8805  int bt_intervals;
8806#endif
8807  kmp_int8 bt_set;
8808
8809  __kmp_save_internal_controls(thread);
8810
8811  /* Normalize and set blocktime for the teams */
8812  if (blocktime < KMP_MIN_BLOCKTIME)
8813    blocktime = KMP_MIN_BLOCKTIME;
8814  else if (blocktime > KMP_MAX_BLOCKTIME)
8815    blocktime = KMP_MAX_BLOCKTIME;
8816
8817  set__blocktime_team(thread->th.th_team, tid, blocktime);
8818  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8819
8820#if KMP_USE_MONITOR
8821  /* Calculate and set blocktime intervals for the teams */
8822  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8823
8824  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8825  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8826#endif
8827
8828  /* Set whether blocktime has been set to "TRUE" */
8829  bt_set = TRUE;
8830
8831  set__bt_set_team(thread->th.th_team, tid, bt_set);
8832  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8833#if KMP_USE_MONITOR
8834  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8835                "bt_intervals=%d, monitor_updates=%d\n",
8836                __kmp_gtid_from_tid(tid, thread->th.th_team),
8837                thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8838                __kmp_monitor_wakeups));
8839#else
8840  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8841                __kmp_gtid_from_tid(tid, thread->th.th_team),
8842                thread->th.th_team->t.t_id, tid, blocktime));
8843#endif
8844}
8845
8846void __kmp_aux_set_defaults(char const *str, size_t len) {
8847  if (!__kmp_init_serial) {
8848    __kmp_serial_initialize();
8849  }
8850  __kmp_env_initialize(str);
8851
8852  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8853    __kmp_env_print();
8854  }
8855} // __kmp_aux_set_defaults
8856
8857/* ------------------------------------------------------------------------ */
8858/* internal fast reduction routines */
8859
8860PACKED_REDUCTION_METHOD_T
8861__kmp_determine_reduction_method(
8862    ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8863    void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8864    kmp_critical_name *lck) {
8865
8866  // Default reduction method: critical construct ( lck != NULL, like in current
8867  // PAROPT )
8868  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8869  // can be selected by RTL
8870  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8871  // can be selected by RTL
8872  // Finally, it's up to OpenMP RTL to make a decision on which method to select
8873  // among generated by PAROPT.
8874
8875  PACKED_REDUCTION_METHOD_T retval;
8876
8877  int team_size;
8878
8879  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8880
8881#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
8882  (loc &&                                                                      \
8883   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8884#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8885
8886  retval = critical_reduce_block;
8887
8888  // another choice of getting a team size (with 1 dynamic deference) is slower
8889  team_size = __kmp_get_team_num_threads(global_tid);
8890  if (team_size == 1) {
8891
8892    retval = empty_reduce_block;
8893
8894  } else {
8895
8896    int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8897
8898#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
8899    KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 ||             \
8900    KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_WASM
8901
8902#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8903    KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD ||        \
8904    KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8905
8906    int teamsize_cutoff = 4;
8907
8908#if KMP_MIC_SUPPORTED
8909    if (__kmp_mic_type != non_mic) {
8910      teamsize_cutoff = 8;
8911    }
8912#endif
8913    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8914    if (tree_available) {
8915      if (team_size <= teamsize_cutoff) {
8916        if (atomic_available) {
8917          retval = atomic_reduce_block;
8918        }
8919      } else {
8920        retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8921      }
8922    } else if (atomic_available) {
8923      retval = atomic_reduce_block;
8924    }
8925#else
8926#error "Unknown or unsupported OS"
8927#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8928       // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD ||
8929       // KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8930
8931#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS ||       \
8932    KMP_ARCH_WASM || KMP_ARCH_PPC
8933
8934#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
8935    KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_HURD || KMP_OS_SOLARIS ||       \
8936    KMP_OS_WASI || KMP_OS_AIX
8937
8938    // basic tuning
8939
8940    if (atomic_available) {
8941      if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8942        retval = atomic_reduce_block;
8943      }
8944    } // otherwise: use critical section
8945
8946#elif KMP_OS_DARWIN
8947
8948    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8949    if (atomic_available && (num_vars <= 3)) {
8950      retval = atomic_reduce_block;
8951    } else if (tree_available) {
8952      if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8953          (reduce_size < (2000 * sizeof(kmp_real64)))) {
8954        retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8955      }
8956    } // otherwise: use critical section
8957
8958#else
8959#error "Unknown or unsupported OS"
8960#endif
8961
8962#else
8963#error "Unknown or unsupported architecture"
8964#endif
8965  }
8966
8967  // KMP_FORCE_REDUCTION
8968
8969  // If the team is serialized (team_size == 1), ignore the forced reduction
8970  // method and stay with the unsynchronized method (empty_reduce_block)
8971  if (__kmp_force_reduction_method != reduction_method_not_defined &&
8972      team_size != 1) {
8973
8974    PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8975
8976    int atomic_available, tree_available;
8977
8978    switch ((forced_retval = __kmp_force_reduction_method)) {
8979    case critical_reduce_block:
8980      KMP_ASSERT(lck); // lck should be != 0
8981      break;
8982
8983    case atomic_reduce_block:
8984      atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8985      if (!atomic_available) {
8986        KMP_WARNING(RedMethodNotSupported, "atomic");
8987        forced_retval = critical_reduce_block;
8988      }
8989      break;
8990
8991    case tree_reduce_block:
8992      tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8993      if (!tree_available) {
8994        KMP_WARNING(RedMethodNotSupported, "tree");
8995        forced_retval = critical_reduce_block;
8996      } else {
8997#if KMP_FAST_REDUCTION_BARRIER
8998        forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8999#endif
9000      }
9001      break;
9002
9003    default:
9004      KMP_ASSERT(0); // "unsupported method specified"
9005    }
9006
9007    retval = forced_retval;
9008  }
9009
9010  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
9011
9012#undef FAST_REDUCTION_TREE_METHOD_GENERATED
9013#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
9014
9015  return (retval);
9016}
9017// this function is for testing set/get/determine reduce method
9018kmp_int32 __kmp_get_reduce_method(void) {
9019  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
9020}
9021
9022// Soft pause sets up threads to ignore blocktime and just go to sleep.
9023// Spin-wait code checks __kmp_pause_status and reacts accordingly.
9024void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
9025
9026// Hard pause shuts down the runtime completely.  Resume happens naturally when
9027// OpenMP is used subsequently.
9028void __kmp_hard_pause() {
9029  __kmp_pause_status = kmp_hard_paused;
9030  __kmp_internal_end_thread(-1);
9031}
9032
9033// Soft resume sets __kmp_pause_status, and wakes up all threads.
9034void __kmp_resume_if_soft_paused() {
9035  if (__kmp_pause_status == kmp_soft_paused) {
9036    __kmp_pause_status = kmp_not_paused;
9037
9038    for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
9039      kmp_info_t *thread = __kmp_threads[gtid];
9040      if (thread) { // Wake it if sleeping
9041        kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
9042                         thread);
9043        if (fl.is_sleeping())
9044          fl.resume(gtid);
9045        else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
9046          __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
9047        } else { // thread holds the lock and may sleep soon
9048          do { // until either the thread sleeps, or we can get the lock
9049            if (fl.is_sleeping()) {
9050              fl.resume(gtid);
9051              break;
9052            } else if (__kmp_try_suspend_mx(thread)) {
9053              __kmp_unlock_suspend_mx(thread);
9054              break;
9055            }
9056          } while (1);
9057        }
9058      }
9059    }
9060  }
9061}
9062
9063// This function is called via __kmpc_pause_resource. Returns 0 if successful.
9064// TODO: add warning messages
9065int __kmp_pause_resource(kmp_pause_status_t level) {
9066  if (level == kmp_not_paused) { // requesting resume
9067    if (__kmp_pause_status == kmp_not_paused) {
9068      // error message about runtime not being paused, so can't resume
9069      return 1;
9070    } else {
9071      KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
9072                       __kmp_pause_status == kmp_hard_paused);
9073      __kmp_pause_status = kmp_not_paused;
9074      return 0;
9075    }
9076  } else if (level == kmp_soft_paused) { // requesting soft pause
9077    if (__kmp_pause_status != kmp_not_paused) {
9078      // error message about already being paused
9079      return 1;
9080    } else {
9081      __kmp_soft_pause();
9082      return 0;
9083    }
9084  } else if (level == kmp_hard_paused) { // requesting hard pause
9085    if (__kmp_pause_status != kmp_not_paused) {
9086      // error message about already being paused
9087      return 1;
9088    } else {
9089      __kmp_hard_pause();
9090      return 0;
9091    }
9092  } else {
9093    // error message about invalid level
9094    return 1;
9095  }
9096}
9097
9098void __kmp_omp_display_env(int verbose) {
9099  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
9100  if (__kmp_init_serial == 0)
9101    __kmp_do_serial_initialize();
9102  __kmp_display_env_impl(!verbose, verbose);
9103  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
9104}
9105
9106// The team size is changing, so distributed barrier must be modified
9107void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
9108                               int new_nthreads) {
9109  KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
9110                   bp_dist_bar);
9111  kmp_info_t **other_threads = team->t.t_threads;
9112
9113  // We want all the workers to stop waiting on the barrier while we adjust the
9114  // size of the team.
9115  for (int f = 1; f < old_nthreads; ++f) {
9116    KMP_DEBUG_ASSERT(other_threads[f] != NULL);
9117    // Ignore threads that are already inactive or not present in the team
9118    if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
9119      // teams construct causes thread_limit to get passed in, and some of
9120      // those could be inactive; just ignore them
9121      continue;
9122    }
9123    // If thread is transitioning still to in_use state, wait for it
9124    if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
9125      while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
9126        KMP_CPU_PAUSE();
9127    }
9128    // The thread should be in_use now
9129    KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
9130    // Transition to unused state
9131    team->t.t_threads[f]->th.th_used_in_team.store(2);
9132    KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
9133  }
9134  // Release all the workers
9135  team->t.b->go_release();
9136
9137  KMP_MFENCE();
9138
9139  // Workers should see transition status 2 and move to 0; but may need to be
9140  // woken up first
9141  int count = old_nthreads - 1;
9142  while (count > 0) {
9143    count = old_nthreads - 1;
9144    for (int f = 1; f < old_nthreads; ++f) {
9145      if (other_threads[f]->th.th_used_in_team.load() != 0) {
9146        if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
9147          kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
9148              void *, other_threads[f]->th.th_sleep_loc);
9149          __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
9150        }
9151      } else {
9152        KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
9153        count--;
9154      }
9155    }
9156  }
9157  // Now update the barrier size
9158  team->t.b->update_num_threads(new_nthreads);
9159  team->t.b->go_reset();
9160}
9161
9162void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
9163  // Add the threads back to the team
9164  KMP_DEBUG_ASSERT(team);
9165  // Threads were paused and pointed at th_used_in_team temporarily during a
9166  // resize of the team. We're going to set th_used_in_team to 3 to indicate to
9167  // the thread that it should transition itself back into the team. Then, if
9168  // blocktime isn't infinite, the thread could be sleeping, so we send a resume
9169  // to wake it up.
9170  for (int f = 1; f < new_nthreads; ++f) {
9171    KMP_DEBUG_ASSERT(team->t.t_threads[f]);
9172    KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
9173                                3);
9174    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
9175      __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9176                      (kmp_flag_32<false, false> *)NULL);
9177    }
9178  }
9179  // The threads should be transitioning to the team; when they are done, they
9180  // should have set th_used_in_team to 1. This loop forces master to wait until
9181  // all threads have moved into the team and are waiting in the barrier.
9182  int count = new_nthreads - 1;
9183  while (count > 0) {
9184    count = new_nthreads - 1;
9185    for (int f = 1; f < new_nthreads; ++f) {
9186      if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9187        count--;
9188      }
9189    }
9190  }
9191}
9192
9193// Globals and functions for hidden helper task
9194kmp_info_t **__kmp_hidden_helper_threads;
9195kmp_info_t *__kmp_hidden_helper_main_thread;
9196std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9197#if KMP_OS_LINUX
9198kmp_int32 __kmp_hidden_helper_threads_num = 8;
9199kmp_int32 __kmp_enable_hidden_helper = TRUE;
9200#else
9201kmp_int32 __kmp_hidden_helper_threads_num = 0;
9202kmp_int32 __kmp_enable_hidden_helper = FALSE;
9203#endif
9204
9205namespace {
9206std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9207
9208void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9209  // This is an explicit synchronization on all hidden helper threads in case
9210  // that when a regular thread pushes a hidden helper task to one hidden
9211  // helper thread, the thread has not been awaken once since they're released
9212  // by the main thread after creating the team.
9213  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9214  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9215         __kmp_hidden_helper_threads_num)
9216    ;
9217
9218  // If main thread, then wait for signal
9219  if (__kmpc_master(nullptr, *gtid)) {
9220    // First, unset the initial state and release the initial thread
9221    TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9222    __kmp_hidden_helper_initz_release();
9223    __kmp_hidden_helper_main_thread_wait();
9224    // Now wake up all worker threads
9225    for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9226      __kmp_hidden_helper_worker_thread_signal();
9227    }
9228  }
9229}
9230} // namespace
9231
9232void __kmp_hidden_helper_threads_initz_routine() {
9233  // Create a new root for hidden helper team/threads
9234  const int gtid = __kmp_register_root(TRUE);
9235  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9236  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9237  __kmp_hidden_helper_main_thread->th.th_set_nproc =
9238      __kmp_hidden_helper_threads_num;
9239
9240  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9241
9242  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9243
9244  // Set the initialization flag to FALSE
9245  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9246
9247  __kmp_hidden_helper_threads_deinitz_release();
9248}
9249
9250/* Nesting Mode:
9251   Set via KMP_NESTING_MODE, which takes an integer.
9252   Note: we skip duplicate topology levels, and skip levels with only
9253      one entity.
9254   KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9255   KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9256      in the topology, and initializes the number of threads at each of those
9257      levels to the number of entities at each level, respectively, below the
9258      entity at the parent level.
9259   KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9260      but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
9261      the user to turn nesting on explicitly. This is an even more experimental
9262      option to this experimental feature, and may change or go away in the
9263      future.
9264*/
9265
9266// Allocate space to store nesting levels
9267void __kmp_init_nesting_mode() {
9268  int levels = KMP_HW_LAST;
9269  __kmp_nesting_mode_nlevels = levels;
9270  __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9271  for (int i = 0; i < levels; ++i)
9272    __kmp_nesting_nth_level[i] = 0;
9273  if (__kmp_nested_nth.size < levels) {
9274    __kmp_nested_nth.nth =
9275        (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9276    __kmp_nested_nth.size = levels;
9277  }
9278}
9279
9280// Set # threads for top levels of nesting; must be called after topology set
9281void __kmp_set_nesting_mode_threads() {
9282  kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9283
9284  if (__kmp_nesting_mode == 1)
9285    __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9286  else if (__kmp_nesting_mode > 1)
9287    __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9288
9289  if (__kmp_topology) { // use topology info
9290    int loc, hw_level;
9291    for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9292                                loc < __kmp_nesting_mode_nlevels;
9293         loc++, hw_level++) {
9294      __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9295      if (__kmp_nesting_nth_level[loc] == 1)
9296        loc--;
9297    }
9298    // Make sure all cores are used
9299    if (__kmp_nesting_mode > 1 && loc > 1) {
9300      int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9301      int num_cores = __kmp_topology->get_count(core_level);
9302      int upper_levels = 1;
9303      for (int level = 0; level < loc - 1; ++level)
9304        upper_levels *= __kmp_nesting_nth_level[level];
9305      if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9306        __kmp_nesting_nth_level[loc - 1] =
9307            num_cores / __kmp_nesting_nth_level[loc - 2];
9308    }
9309    __kmp_nesting_mode_nlevels = loc;
9310    __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9311  } else { // no topology info available; provide a reasonable guesstimation
9312    if (__kmp_avail_proc >= 4) {
9313      __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9314      __kmp_nesting_nth_level[1] = 2;
9315      __kmp_nesting_mode_nlevels = 2;
9316    } else {
9317      __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9318      __kmp_nesting_mode_nlevels = 1;
9319    }
9320    __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9321  }
9322  for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9323    __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9324  }
9325  set__nproc(thread, __kmp_nesting_nth_level[0]);
9326  if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9327    __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9328  if (get__max_active_levels(thread) > 1) {
9329    // if max levels was set, set nesting mode levels to same
9330    __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9331  }
9332  if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9333    set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9334}
9335
9336// Empty symbols to export (see exports_so.txt) when feature is disabled
9337extern "C" {
9338#if !KMP_STATS_ENABLED
9339void __kmp_reset_stats() {}
9340#endif
9341#if !USE_DEBUGGER
9342int __kmp_omp_debug_struct_info = FALSE;
9343int __kmp_debugging = FALSE;
9344#endif
9345#if !USE_ITT_BUILD || !USE_ITT_NOTIFY
9346void __kmp_itt_fini_ittlib() {}
9347void __kmp_itt_init_ittlib() {}
9348#endif
9349}
9350
9351// end of file
9352