1/*
2 * kmp_barrier.cpp
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#include "kmp_wait_release.h"
14#include "kmp_barrier.h"
15#include "kmp_itt.h"
16#include "kmp_os.h"
17#include "kmp_stats.h"
18#include "ompt-specific.h"
19// for distributed barrier
20#include "kmp_affinity.h"
21
22#if KMP_MIC
23#include <immintrin.h>
24#define USE_NGO_STORES 1
25#endif // KMP_MIC
26
27#if KMP_MIC && USE_NGO_STORES
28// ICV copying
29#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
30#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32#define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
33#else
34#define ngo_load(src) ((void)0)
35#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
36#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
37#define ngo_sync() ((void)0)
38#endif /* KMP_MIC && USE_NGO_STORES */
39
40void __kmp_print_structure(void); // Forward declaration
41
42// ---------------------------- Barrier Algorithms ----------------------------
43// Distributed barrier
44
45// Compute how many threads to have polling each cache-line.
46// We want to limit the number of writes to IDEAL_GO_RESOLUTION.
47void distributedBarrier::computeVarsForN(size_t n) {
48  int nsockets = 1;
49  if (__kmp_topology) {
50    int socket_level = __kmp_topology->get_level(KMP_HW_SOCKET);
51    int core_level = __kmp_topology->get_level(KMP_HW_CORE);
52    int ncores_per_socket =
53        __kmp_topology->calculate_ratio(core_level, socket_level);
54    nsockets = __kmp_topology->get_count(socket_level);
55
56    if (nsockets <= 0)
57      nsockets = 1;
58    if (ncores_per_socket <= 0)
59      ncores_per_socket = 1;
60
61    threads_per_go = ncores_per_socket >> 1;
62    if (!fix_threads_per_go) {
63      // Minimize num_gos
64      if (threads_per_go > 4) {
65        if (KMP_OPTIMIZE_FOR_REDUCTIONS) {
66          threads_per_go = threads_per_go >> 1;
67        }
68        if (threads_per_go > 4 && nsockets == 1)
69          threads_per_go = threads_per_go >> 1;
70      }
71    }
72    if (threads_per_go == 0)
73      threads_per_go = 1;
74    fix_threads_per_go = true;
75    num_gos = n / threads_per_go;
76    if (n % threads_per_go)
77      num_gos++;
78    if (nsockets == 1 || num_gos == 1)
79      num_groups = 1;
80    else {
81      num_groups = num_gos / nsockets;
82      if (num_gos % nsockets)
83        num_groups++;
84    }
85    if (num_groups <= 0)
86      num_groups = 1;
87    gos_per_group = num_gos / num_groups;
88    if (num_gos % num_groups)
89      gos_per_group++;
90    threads_per_group = threads_per_go * gos_per_group;
91  } else {
92    num_gos = n / threads_per_go;
93    if (n % threads_per_go)
94      num_gos++;
95    if (num_gos == 1)
96      num_groups = 1;
97    else {
98      num_groups = num_gos / 2;
99      if (num_gos % 2)
100        num_groups++;
101    }
102    gos_per_group = num_gos / num_groups;
103    if (num_gos % num_groups)
104      gos_per_group++;
105    threads_per_group = threads_per_go * gos_per_group;
106  }
107}
108
109void distributedBarrier::computeGo(size_t n) {
110  // Minimize num_gos
111  for (num_gos = 1;; num_gos++)
112    if (IDEAL_CONTENTION * num_gos >= n)
113      break;
114  threads_per_go = n / num_gos;
115  if (n % num_gos)
116    threads_per_go++;
117  while (num_gos > MAX_GOS) {
118    threads_per_go++;
119    num_gos = n / threads_per_go;
120    if (n % threads_per_go)
121      num_gos++;
122  }
123  computeVarsForN(n);
124}
125
126// This function is to resize the barrier arrays when the new number of threads
127// exceeds max_threads, which is the current size of all the arrays
128void distributedBarrier::resize(size_t nthr) {
129  KMP_DEBUG_ASSERT(nthr > max_threads);
130
131  // expand to requested size * 2
132  max_threads = nthr * 2;
133
134  // allocate arrays to new max threads
135  for (int i = 0; i < MAX_ITERS; ++i) {
136    if (flags[i])
137      flags[i] = (flags_s *)KMP_INTERNAL_REALLOC(flags[i],
138                                                 max_threads * sizeof(flags_s));
139    else
140      flags[i] = (flags_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(flags_s));
141  }
142
143  if (go)
144    go = (go_s *)KMP_INTERNAL_REALLOC(go, max_threads * sizeof(go_s));
145  else
146    go = (go_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(go_s));
147
148  if (iter)
149    iter = (iter_s *)KMP_INTERNAL_REALLOC(iter, max_threads * sizeof(iter_s));
150  else
151    iter = (iter_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(iter_s));
152
153  if (sleep)
154    sleep =
155        (sleep_s *)KMP_INTERNAL_REALLOC(sleep, max_threads * sizeof(sleep_s));
156  else
157    sleep = (sleep_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(sleep_s));
158}
159
160// This function is to set all the go flags that threads might be waiting
161// on, and when blocktime is not infinite, it should be followed by a wake-up
162// call to each thread
163kmp_uint64 distributedBarrier::go_release() {
164  kmp_uint64 next_go = iter[0].iter + distributedBarrier::MAX_ITERS;
165  for (size_t j = 0; j < num_gos; j++) {
166    go[j].go.store(next_go);
167  }
168  return next_go;
169}
170
171void distributedBarrier::go_reset() {
172  for (size_t j = 0; j < max_threads; ++j) {
173    for (size_t i = 0; i < distributedBarrier::MAX_ITERS; ++i) {
174      flags[i][j].stillNeed = 1;
175    }
176    go[j].go.store(0);
177    iter[j].iter = 0;
178  }
179}
180
181// This function inits/re-inits the distributed barrier for a particular number
182// of threads. If a resize of arrays is needed, it calls the resize function.
183void distributedBarrier::init(size_t nthr) {
184  size_t old_max = max_threads;
185  if (nthr > max_threads) { // need more space in arrays
186    resize(nthr);
187  }
188
189  for (size_t i = 0; i < max_threads; i++) {
190    for (size_t j = 0; j < distributedBarrier::MAX_ITERS; j++) {
191      flags[j][i].stillNeed = 1;
192    }
193    go[i].go.store(0);
194    iter[i].iter = 0;
195    if (i >= old_max)
196      sleep[i].sleep = false;
197  }
198
199  // Recalculate num_gos, etc. based on new nthr
200  computeVarsForN(nthr);
201
202  num_threads = nthr;
203
204  if (team_icvs == NULL)
205    team_icvs = __kmp_allocate(sizeof(kmp_internal_control_t));
206}
207
208// This function is used only when KMP_BLOCKTIME is not infinite.
209// static
210void __kmp_dist_barrier_wakeup(enum barrier_type bt, kmp_team_t *team,
211                               size_t start, size_t stop, size_t inc,
212                               size_t tid) {
213  KMP_DEBUG_ASSERT(__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME);
214  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
215    return;
216
217  kmp_info_t **other_threads = team->t.t_threads;
218  for (size_t thr = start; thr < stop; thr += inc) {
219    KMP_DEBUG_ASSERT(other_threads[thr]);
220    int gtid = other_threads[thr]->th.th_info.ds.ds_gtid;
221    // Wake up worker regardless of if it appears to be sleeping or not
222    __kmp_atomic_resume_64(gtid, (kmp_atomic_flag_64<> *)NULL);
223  }
224}
225
226static void __kmp_dist_barrier_gather(
227    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
228    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
229  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_gather);
230  kmp_team_t *team;
231  distributedBarrier *b;
232  kmp_info_t **other_threads;
233  kmp_uint64 my_current_iter, my_next_iter;
234  kmp_uint32 nproc;
235  bool group_leader;
236
237  team = this_thr->th.th_team;
238  nproc = this_thr->th.th_team_nproc;
239  other_threads = team->t.t_threads;
240  b = team->t.b;
241  my_current_iter = b->iter[tid].iter;
242  my_next_iter = (my_current_iter + 1) % distributedBarrier::MAX_ITERS;
243  group_leader = ((tid % b->threads_per_group) == 0);
244
245  KA_TRACE(20,
246           ("__kmp_dist_barrier_gather: T#%d(%d:%d) enter; barrier type %d\n",
247            gtid, team->t.t_id, tid, bt));
248
249#if USE_ITT_BUILD && USE_ITT_NOTIFY
250  // Barrier imbalance - save arrive time to the thread
251  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
252    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
253        __itt_get_timestamp();
254  }
255#endif
256
257  if (group_leader) {
258    // Start from the thread after the group leader
259    size_t group_start = tid + 1;
260    size_t group_end = tid + b->threads_per_group;
261    size_t threads_pending = 0;
262
263    if (group_end > nproc)
264      group_end = nproc;
265    do { // wait for threads in my group
266      threads_pending = 0;
267      // Check all the flags every time to avoid branch misspredict
268      for (size_t thr = group_start; thr < group_end; thr++) {
269        // Each thread uses a different cache line
270        threads_pending += b->flags[my_current_iter][thr].stillNeed;
271      }
272      // Execute tasks here
273      if (__kmp_tasking_mode != tskm_immediate_exec) {
274        kmp_task_team_t *task_team = this_thr->th.th_task_team;
275        if (task_team != NULL) {
276          if (TCR_SYNC_4(task_team->tt.tt_active)) {
277            if (KMP_TASKING_ENABLED(task_team)) {
278              int tasks_completed = FALSE;
279              __kmp_atomic_execute_tasks_64(
280                  this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
281                  &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
282            } else
283              this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
284          }
285        } else {
286          this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
287        } // if
288      }
289      if (TCR_4(__kmp_global.g.g_done)) {
290        if (__kmp_global.g.g_abort)
291          __kmp_abort_thread();
292        break;
293      } else if (__kmp_tasking_mode != tskm_immediate_exec &&
294                 this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
295        this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
296      }
297    } while (threads_pending > 0);
298
299    if (reduce) { // Perform reduction if needed
300      OMPT_REDUCTION_DECL(this_thr, gtid);
301      OMPT_REDUCTION_BEGIN;
302      // Group leader reduces all threads in group
303      for (size_t thr = group_start; thr < group_end; thr++) {
304        (*reduce)(this_thr->th.th_local.reduce_data,
305                  other_threads[thr]->th.th_local.reduce_data);
306      }
307      OMPT_REDUCTION_END;
308    }
309
310    // Set flag for next iteration
311    b->flags[my_next_iter][tid].stillNeed = 1;
312    // Each thread uses a different cache line; resets stillNeed to 0 to
313    // indicate it has reached the barrier
314    b->flags[my_current_iter][tid].stillNeed = 0;
315
316    do { // wait for all group leaders
317      threads_pending = 0;
318      for (size_t thr = 0; thr < nproc; thr += b->threads_per_group) {
319        threads_pending += b->flags[my_current_iter][thr].stillNeed;
320      }
321      // Execute tasks here
322      if (__kmp_tasking_mode != tskm_immediate_exec) {
323        kmp_task_team_t *task_team = this_thr->th.th_task_team;
324        if (task_team != NULL) {
325          if (TCR_SYNC_4(task_team->tt.tt_active)) {
326            if (KMP_TASKING_ENABLED(task_team)) {
327              int tasks_completed = FALSE;
328              __kmp_atomic_execute_tasks_64(
329                  this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
330                  &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
331            } else
332              this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
333          }
334        } else {
335          this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
336        } // if
337      }
338      if (TCR_4(__kmp_global.g.g_done)) {
339        if (__kmp_global.g.g_abort)
340          __kmp_abort_thread();
341        break;
342      } else if (__kmp_tasking_mode != tskm_immediate_exec &&
343                 this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
344        this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
345      }
346    } while (threads_pending > 0);
347
348    if (reduce) { // Perform reduction if needed
349      if (KMP_MASTER_TID(tid)) { // Master reduces over group leaders
350        OMPT_REDUCTION_DECL(this_thr, gtid);
351        OMPT_REDUCTION_BEGIN;
352        for (size_t thr = b->threads_per_group; thr < nproc;
353             thr += b->threads_per_group) {
354          (*reduce)(this_thr->th.th_local.reduce_data,
355                    other_threads[thr]->th.th_local.reduce_data);
356        }
357        OMPT_REDUCTION_END;
358      }
359    }
360  } else {
361    // Set flag for next iteration
362    b->flags[my_next_iter][tid].stillNeed = 1;
363    // Each thread uses a different cache line; resets stillNeed to 0 to
364    // indicate it has reached the barrier
365    b->flags[my_current_iter][tid].stillNeed = 0;
366  }
367
368  KMP_MFENCE();
369
370  KA_TRACE(20,
371           ("__kmp_dist_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
372            gtid, team->t.t_id, tid, bt));
373}
374
375static void __kmp_dist_barrier_release(
376    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
377    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
378  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_release);
379  kmp_team_t *team;
380  distributedBarrier *b;
381  kmp_bstate_t *thr_bar;
382  kmp_uint64 my_current_iter, next_go;
383  size_t my_go_index;
384  bool group_leader;
385
386  KA_TRACE(20, ("__kmp_dist_barrier_release: T#%d(%d) enter; barrier type %d\n",
387                gtid, tid, bt));
388
389  thr_bar = &this_thr->th.th_bar[bt].bb;
390
391  if (!KMP_MASTER_TID(tid)) {
392    // workers and non-master group leaders need to check their presence in team
393    do {
394      if (this_thr->th.th_used_in_team.load() != 1 &&
395          this_thr->th.th_used_in_team.load() != 3) {
396        // Thread is not in use in a team. Wait on location in tid's thread
397        // struct. The 0 value tells anyone looking that this thread is spinning
398        // or sleeping until this location becomes 3 again; 3 is the transition
399        // state to get to 1 which is waiting on go and being in the team
400        kmp_flag_32<false, false> my_flag(&(this_thr->th.th_used_in_team), 3);
401        if (KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 2,
402                                        0) ||
403            this_thr->th.th_used_in_team.load() == 0) {
404          my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
405        }
406#if USE_ITT_BUILD && USE_ITT_NOTIFY
407        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
408          // In fork barrier where we could not get the object reliably
409          itt_sync_obj =
410              __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
411          // Cancel wait on previous parallel region...
412          __kmp_itt_task_starting(itt_sync_obj);
413
414          if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
415            return;
416
417          itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
418          if (itt_sync_obj != NULL)
419            // Call prepare as early as possible for "new" barrier
420            __kmp_itt_task_finished(itt_sync_obj);
421        } else
422#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
423            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
424          return;
425      }
426      if (this_thr->th.th_used_in_team.load() != 1 &&
427          this_thr->th.th_used_in_team.load() != 3) // spurious wake-up?
428        continue;
429      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
430        return;
431
432      // At this point, the thread thinks it is in use in a team, or in
433      // transition to be used in a team, but it might have reached this barrier
434      // before it was marked unused by the team. Unused threads are awoken and
435      // shifted to wait on local thread struct elsewhere. It also might reach
436      // this point by being picked up for use by a different team. Either way,
437      // we need to update the tid.
438      tid = __kmp_tid_from_gtid(gtid);
439      team = this_thr->th.th_team;
440      KMP_DEBUG_ASSERT(tid >= 0);
441      KMP_DEBUG_ASSERT(team);
442      b = team->t.b;
443      my_current_iter = b->iter[tid].iter;
444      next_go = my_current_iter + distributedBarrier::MAX_ITERS;
445      my_go_index = tid / b->threads_per_go;
446      if (this_thr->th.th_used_in_team.load() == 3) {
447        KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 3, 1);
448      }
449      // Check if go flag is set
450      if (b->go[my_go_index].go.load() != next_go) {
451        // Wait on go flag on team
452        kmp_atomic_flag_64<false, true> my_flag(
453            &(b->go[my_go_index].go), next_go, &(b->sleep[tid].sleep));
454        my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
455        KMP_DEBUG_ASSERT(my_current_iter == b->iter[tid].iter ||
456                         b->iter[tid].iter == 0);
457        KMP_DEBUG_ASSERT(b->sleep[tid].sleep == false);
458      }
459
460      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
461        return;
462      // At this point, the thread's go location was set. This means the primary
463      // thread is safely in the barrier, and so this thread's data is
464      // up-to-date, but we should check again that this thread is really in
465      // use in the team, as it could have been woken up for the purpose of
466      // changing team size, or reaping threads at shutdown.
467      if (this_thr->th.th_used_in_team.load() == 1)
468        break;
469    } while (1);
470
471    if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
472      return;
473
474    group_leader = ((tid % b->threads_per_group) == 0);
475    if (group_leader) {
476      // Tell all the threads in my group they can go!
477      for (size_t go_idx = my_go_index + 1;
478           go_idx < my_go_index + b->gos_per_group; go_idx++) {
479        b->go[go_idx].go.store(next_go);
480      }
481      // Fence added so that workers can see changes to go. sfence inadequate.
482      KMP_MFENCE();
483    }
484
485#if KMP_BARRIER_ICV_PUSH
486    if (propagate_icvs) { // copy ICVs to final dest
487      __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
488                               tid, FALSE);
489      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
490                (kmp_internal_control_t *)team->t.b->team_icvs);
491      copy_icvs(&thr_bar->th_fixed_icvs,
492                &team->t.t_implicit_task_taskdata[tid].td_icvs);
493    }
494#endif
495    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && group_leader) {
496      // This thread is now awake and participating in the barrier;
497      // wake up the other threads in the group
498      size_t nproc = this_thr->th.th_team_nproc;
499      size_t group_end = tid + b->threads_per_group;
500      if (nproc < group_end)
501        group_end = nproc;
502      __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
503    }
504  } else { //  Primary thread
505    team = this_thr->th.th_team;
506    b = team->t.b;
507    my_current_iter = b->iter[tid].iter;
508    next_go = my_current_iter + distributedBarrier::MAX_ITERS;
509#if KMP_BARRIER_ICV_PUSH
510    if (propagate_icvs) {
511      // primary thread has ICVs in final destination; copy
512      copy_icvs(&thr_bar->th_fixed_icvs,
513                &team->t.t_implicit_task_taskdata[tid].td_icvs);
514    }
515#endif
516    // Tell all the group leaders they can go!
517    for (size_t go_idx = 0; go_idx < b->num_gos; go_idx += b->gos_per_group) {
518      b->go[go_idx].go.store(next_go);
519    }
520
521    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
522      // Wake-up the group leaders
523      size_t nproc = this_thr->th.th_team_nproc;
524      __kmp_dist_barrier_wakeup(bt, team, tid + b->threads_per_group, nproc,
525                                b->threads_per_group, tid);
526    }
527
528    // Tell all the threads in my group they can go!
529    for (size_t go_idx = 1; go_idx < b->gos_per_group; go_idx++) {
530      b->go[go_idx].go.store(next_go);
531    }
532
533    // Fence added so that workers can see changes to go. sfence inadequate.
534    KMP_MFENCE();
535
536    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
537      // Wake-up the other threads in my group
538      size_t nproc = this_thr->th.th_team_nproc;
539      size_t group_end = tid + b->threads_per_group;
540      if (nproc < group_end)
541        group_end = nproc;
542      __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
543    }
544  }
545  // Update to next iteration
546  KMP_ASSERT(my_current_iter == b->iter[tid].iter);
547  b->iter[tid].iter = (b->iter[tid].iter + 1) % distributedBarrier::MAX_ITERS;
548
549  KA_TRACE(
550      20, ("__kmp_dist_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
551           gtid, team->t.t_id, tid, bt));
552}
553
554// Linear Barrier
555template <bool cancellable = false>
556static bool __kmp_linear_barrier_gather_template(
557    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
558    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
559  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
560  kmp_team_t *team = this_thr->th.th_team;
561  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
562  kmp_info_t **other_threads = team->t.t_threads;
563
564  KA_TRACE(
565      20,
566      ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
567       gtid, team->t.t_id, tid, bt));
568  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
569
570#if USE_ITT_BUILD && USE_ITT_NOTIFY
571  // Barrier imbalance - save arrive time to the thread
572  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
573    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
574        __itt_get_timestamp();
575  }
576#endif
577  // We now perform a linear reduction to signal that all of the threads have
578  // arrived.
579  if (!KMP_MASTER_TID(tid)) {
580    KA_TRACE(20,
581             ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
582              "arrived(%p): %llu => %llu\n",
583              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
584              team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
585              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
586    // Mark arrival to primary thread
587    /* After performing this write, a worker thread may not assume that the team
588       is valid any more - it could be deallocated by the primary thread at any
589       time. */
590    kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
591    flag.release();
592  } else {
593    kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
594    int nproc = this_thr->th.th_team_nproc;
595    int i;
596    // Don't have to worry about sleep bit here or atomic since team setting
597    kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
598
599    // Collect all the worker team member threads.
600    for (i = 1; i < nproc; ++i) {
601#if KMP_CACHE_MANAGE
602      // Prefetch next thread's arrived count
603      if (i + 1 < nproc)
604        KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
605#endif /* KMP_CACHE_MANAGE */
606      KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
607                    "arrived(%p) == %llu\n",
608                    gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
609                    team->t.t_id, i,
610                    &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
611
612      // Wait for worker thread to arrive
613      if (cancellable) {
614        kmp_flag_64<true, false> flag(
615            &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
616        if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
617          return true;
618      } else {
619        kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
620                           new_state);
621        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
622      }
623#if USE_ITT_BUILD && USE_ITT_NOTIFY
624      // Barrier imbalance - write min of the thread time and the other thread
625      // time to the thread.
626      if (__kmp_forkjoin_frames_mode == 2) {
627        this_thr->th.th_bar_min_time = KMP_MIN(
628            this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
629      }
630#endif
631      if (reduce) {
632        KA_TRACE(100,
633                 ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
634                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
635                  team->t.t_id, i));
636        OMPT_REDUCTION_DECL(this_thr, gtid);
637        OMPT_REDUCTION_BEGIN;
638        (*reduce)(this_thr->th.th_local.reduce_data,
639                  other_threads[i]->th.th_local.reduce_data);
640        OMPT_REDUCTION_END;
641      }
642    }
643    // Don't have to worry about sleep bit here or atomic since team setting
644    team_bar->b_arrived = new_state;
645    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
646                  "arrived(%p) = %llu\n",
647                  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
648                  new_state));
649  }
650  KA_TRACE(
651      20,
652      ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
653       gtid, team->t.t_id, tid, bt));
654  return false;
655}
656
657template <bool cancellable = false>
658static bool __kmp_linear_barrier_release_template(
659    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
660    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
661  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
662  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
663  kmp_team_t *team;
664
665  if (KMP_MASTER_TID(tid)) {
666    unsigned int i;
667    kmp_uint32 nproc = this_thr->th.th_team_nproc;
668    kmp_info_t **other_threads;
669
670    team = __kmp_threads[gtid]->th.th_team;
671    KMP_DEBUG_ASSERT(team != NULL);
672    other_threads = team->t.t_threads;
673
674    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for "
675                  "barrier type %d\n",
676                  gtid, team->t.t_id, tid, bt));
677
678    if (nproc > 1) {
679#if KMP_BARRIER_ICV_PUSH
680      {
681        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
682        if (propagate_icvs) {
683          ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
684          for (i = 1; i < nproc; ++i) {
685            __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
686                                     team, i, FALSE);
687            ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
688                           &team->t.t_implicit_task_taskdata[0].td_icvs);
689          }
690          ngo_sync();
691        }
692      }
693#endif // KMP_BARRIER_ICV_PUSH
694
695      // Now, release all of the worker threads
696      for (i = 1; i < nproc; ++i) {
697#if KMP_CACHE_MANAGE
698        // Prefetch next thread's go flag
699        if (i + 1 < nproc)
700          KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
701#endif /* KMP_CACHE_MANAGE */
702        KA_TRACE(
703            20,
704            ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
705             "go(%p): %u => %u\n",
706             gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
707             team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
708             other_threads[i]->th.th_bar[bt].bb.b_go,
709             other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
710        kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
711                           other_threads[i]);
712        flag.release();
713      }
714    }
715  } else { // Wait for the PRIMARY thread to release us
716    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
717                  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
718    if (cancellable) {
719      kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
720      if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
721        return true;
722    } else {
723      kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
724      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
725    }
726#if USE_ITT_BUILD && USE_ITT_NOTIFY
727    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
728      // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
729      // disabled)
730      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
731      // Cancel wait on previous parallel region...
732      __kmp_itt_task_starting(itt_sync_obj);
733
734      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
735        return false;
736
737      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
738      if (itt_sync_obj != NULL)
739        // Call prepare as early as possible for "new" barrier
740        __kmp_itt_task_finished(itt_sync_obj);
741    } else
742#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
743        // Early exit for reaping threads releasing forkjoin barrier
744        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
745      return false;
746// The worker thread may now assume that the team is valid.
747#ifdef KMP_DEBUG
748    tid = __kmp_tid_from_gtid(gtid);
749    team = __kmp_threads[gtid]->th.th_team;
750#endif
751    KMP_DEBUG_ASSERT(team != NULL);
752    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
753    KA_TRACE(20,
754             ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
755              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
756    KMP_MB(); // Flush all pending memory write invalidates.
757  }
758  KA_TRACE(
759      20,
760      ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
761       gtid, team->t.t_id, tid, bt));
762  return false;
763}
764
765static void __kmp_linear_barrier_gather(
766    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
767    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
768  __kmp_linear_barrier_gather_template<false>(
769      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
770}
771
772static bool __kmp_linear_barrier_gather_cancellable(
773    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
774    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
775  return __kmp_linear_barrier_gather_template<true>(
776      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
777}
778
779static void __kmp_linear_barrier_release(
780    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
781    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
782  __kmp_linear_barrier_release_template<false>(
783      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
784}
785
786static bool __kmp_linear_barrier_release_cancellable(
787    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
788    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
789  return __kmp_linear_barrier_release_template<true>(
790      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
791}
792
793// Tree barrier
794static void __kmp_tree_barrier_gather(
795    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
796    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
797  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
798  kmp_team_t *team = this_thr->th.th_team;
799  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
800  kmp_info_t **other_threads = team->t.t_threads;
801  kmp_uint32 nproc = this_thr->th.th_team_nproc;
802  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
803  kmp_uint32 branch_factor = 1 << branch_bits;
804  kmp_uint32 child;
805  kmp_uint32 child_tid;
806  kmp_uint64 new_state = 0;
807
808  KA_TRACE(
809      20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
810           gtid, team->t.t_id, tid, bt));
811  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
812
813#if USE_ITT_BUILD && USE_ITT_NOTIFY
814  // Barrier imbalance - save arrive time to the thread
815  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
816    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
817        __itt_get_timestamp();
818  }
819#endif
820  // Perform tree gather to wait until all threads have arrived; reduce any
821  // required data as we go
822  child_tid = (tid << branch_bits) + 1;
823  if (child_tid < nproc) {
824    // Parent threads wait for all their children to arrive
825    new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
826    child = 1;
827    do {
828      kmp_info_t *child_thr = other_threads[child_tid];
829      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
830#if KMP_CACHE_MANAGE
831      // Prefetch next thread's arrived count
832      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
833        KMP_CACHE_PREFETCH(
834            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
835#endif /* KMP_CACHE_MANAGE */
836      KA_TRACE(20,
837               ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
838                "arrived(%p) == %llu\n",
839                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
840                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
841      // Wait for child to arrive
842      kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
843      flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
844#if USE_ITT_BUILD && USE_ITT_NOTIFY
845      // Barrier imbalance - write min of the thread time and a child time to
846      // the thread.
847      if (__kmp_forkjoin_frames_mode == 2) {
848        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
849                                               child_thr->th.th_bar_min_time);
850      }
851#endif
852      if (reduce) {
853        KA_TRACE(100,
854                 ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
855                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
856                  team->t.t_id, child_tid));
857        OMPT_REDUCTION_DECL(this_thr, gtid);
858        OMPT_REDUCTION_BEGIN;
859        (*reduce)(this_thr->th.th_local.reduce_data,
860                  child_thr->th.th_local.reduce_data);
861        OMPT_REDUCTION_END;
862      }
863      child++;
864      child_tid++;
865    } while (child <= branch_factor && child_tid < nproc);
866  }
867
868  if (!KMP_MASTER_TID(tid)) { // Worker threads
869    kmp_int32 parent_tid = (tid - 1) >> branch_bits;
870
871    KA_TRACE(20,
872             ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
873              "arrived(%p): %llu => %llu\n",
874              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
875              team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
876              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
877
878    // Mark arrival to parent thread
879    /* After performing this write, a worker thread may not assume that the team
880       is valid any more - it could be deallocated by the primary thread at any
881       time.  */
882    kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
883    flag.release();
884  } else {
885    // Need to update the team arrived pointer if we are the primary thread
886    if (nproc > 1) // New value was already computed above
887      team->t.t_bar[bt].b_arrived = new_state;
888    else
889      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
890    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
891                  "arrived(%p) = %llu\n",
892                  gtid, team->t.t_id, tid, team->t.t_id,
893                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
894  }
895  KA_TRACE(20,
896           ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
897            gtid, team->t.t_id, tid, bt));
898}
899
900static void __kmp_tree_barrier_release(
901    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
902    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
903  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
904  kmp_team_t *team;
905  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
906  kmp_uint32 nproc;
907  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
908  kmp_uint32 branch_factor = 1 << branch_bits;
909  kmp_uint32 child;
910  kmp_uint32 child_tid;
911
912  // Perform a tree release for all of the threads that have been gathered
913  if (!KMP_MASTER_TID(
914          tid)) { // Handle fork barrier workers who aren't part of a team yet
915    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
916                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
917    // Wait for parent thread to release us
918    kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
919    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
920#if USE_ITT_BUILD && USE_ITT_NOTIFY
921    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
922      // In fork barrier where we could not get the object reliably (or
923      // ITTNOTIFY is disabled)
924      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
925      // Cancel wait on previous parallel region...
926      __kmp_itt_task_starting(itt_sync_obj);
927
928      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
929        return;
930
931      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
932      if (itt_sync_obj != NULL)
933        // Call prepare as early as possible for "new" barrier
934        __kmp_itt_task_finished(itt_sync_obj);
935    } else
936#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
937        // Early exit for reaping threads releasing forkjoin barrier
938        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
939      return;
940
941    // The worker thread may now assume that the team is valid.
942    team = __kmp_threads[gtid]->th.th_team;
943    KMP_DEBUG_ASSERT(team != NULL);
944    tid = __kmp_tid_from_gtid(gtid);
945
946    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
947    KA_TRACE(20,
948             ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
949              team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
950    KMP_MB(); // Flush all pending memory write invalidates.
951  } else {
952    team = __kmp_threads[gtid]->th.th_team;
953    KMP_DEBUG_ASSERT(team != NULL);
954    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for "
955                  "barrier type %d\n",
956                  gtid, team->t.t_id, tid, bt));
957  }
958  nproc = this_thr->th.th_team_nproc;
959  child_tid = (tid << branch_bits) + 1;
960
961  if (child_tid < nproc) {
962    kmp_info_t **other_threads = team->t.t_threads;
963    child = 1;
964    // Parent threads release all their children
965    do {
966      kmp_info_t *child_thr = other_threads[child_tid];
967      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
968#if KMP_CACHE_MANAGE
969      // Prefetch next thread's go count
970      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
971        KMP_CACHE_PREFETCH(
972            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
973#endif /* KMP_CACHE_MANAGE */
974
975#if KMP_BARRIER_ICV_PUSH
976      {
977        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
978        if (propagate_icvs) {
979          __kmp_init_implicit_task(team->t.t_ident,
980                                   team->t.t_threads[child_tid], team,
981                                   child_tid, FALSE);
982          copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
983                    &team->t.t_implicit_task_taskdata[0].td_icvs);
984        }
985      }
986#endif // KMP_BARRIER_ICV_PUSH
987      KA_TRACE(20,
988               ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
989                "go(%p): %u => %u\n",
990                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
991                team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
992                child_bar->b_go + KMP_BARRIER_STATE_BUMP));
993      // Release child from barrier
994      kmp_flag_64<> flag(&child_bar->b_go, child_thr);
995      flag.release();
996      child++;
997      child_tid++;
998    } while (child <= branch_factor && child_tid < nproc);
999  }
1000  KA_TRACE(
1001      20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1002           gtid, team->t.t_id, tid, bt));
1003}
1004
1005// Hyper Barrier
1006static void __kmp_hyper_barrier_gather(
1007    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1008    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1009  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
1010  kmp_team_t *team = this_thr->th.th_team;
1011  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1012  kmp_info_t **other_threads = team->t.t_threads;
1013  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
1014  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
1015  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
1016  kmp_uint32 branch_factor = 1 << branch_bits;
1017  kmp_uint32 offset;
1018  kmp_uint32 level;
1019
1020  KA_TRACE(
1021      20,
1022      ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
1023       gtid, team->t.t_id, tid, bt));
1024  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
1025
1026#if USE_ITT_BUILD && USE_ITT_NOTIFY
1027  // Barrier imbalance - save arrive time to the thread
1028  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
1029    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
1030        __itt_get_timestamp();
1031  }
1032#endif
1033  /* Perform a hypercube-embedded tree gather to wait until all of the threads
1034     have arrived, and reduce any required data as we go.  */
1035  kmp_flag_64<> p_flag(&thr_bar->b_arrived);
1036  for (level = 0, offset = 1; offset < num_threads;
1037       level += branch_bits, offset <<= branch_bits) {
1038    kmp_uint32 child;
1039    kmp_uint32 child_tid;
1040
1041    if (((tid >> level) & (branch_factor - 1)) != 0) {
1042      kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
1043
1044      KMP_MB(); // Synchronize parent and child threads.
1045      KA_TRACE(20,
1046               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
1047                "arrived(%p): %llu => %llu\n",
1048                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
1049                team->t.t_id, parent_tid, &thr_bar->b_arrived,
1050                thr_bar->b_arrived,
1051                thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1052      // Mark arrival to parent thread
1053      /* After performing this write (in the last iteration of the enclosing for
1054         loop), a worker thread may not assume that the team is valid any more
1055         - it could be deallocated by the primary thread at any time.  */
1056      p_flag.set_waiter(other_threads[parent_tid]);
1057      p_flag.release();
1058      break;
1059    }
1060
1061    // Parent threads wait for children to arrive
1062    if (new_state == KMP_BARRIER_UNUSED_STATE)
1063      new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1064    for (child = 1, child_tid = tid + (1 << level);
1065         child < branch_factor && child_tid < num_threads;
1066         child++, child_tid += (1 << level)) {
1067      kmp_info_t *child_thr = other_threads[child_tid];
1068      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1069#if KMP_CACHE_MANAGE
1070      kmp_uint32 next_child_tid = child_tid + (1 << level);
1071      // Prefetch next thread's arrived count
1072      if (child + 1 < branch_factor && next_child_tid < num_threads)
1073        KMP_CACHE_PREFETCH(
1074            &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
1075#endif /* KMP_CACHE_MANAGE */
1076      KA_TRACE(20,
1077               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
1078                "arrived(%p) == %llu\n",
1079                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1080                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
1081      // Wait for child to arrive
1082      kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
1083      c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1084      KMP_MB(); // Synchronize parent and child threads.
1085#if USE_ITT_BUILD && USE_ITT_NOTIFY
1086      // Barrier imbalance - write min of the thread time and a child time to
1087      // the thread.
1088      if (__kmp_forkjoin_frames_mode == 2) {
1089        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
1090                                               child_thr->th.th_bar_min_time);
1091      }
1092#endif
1093      if (reduce) {
1094        KA_TRACE(100,
1095                 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
1096                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1097                  team->t.t_id, child_tid));
1098        OMPT_REDUCTION_DECL(this_thr, gtid);
1099        OMPT_REDUCTION_BEGIN;
1100        (*reduce)(this_thr->th.th_local.reduce_data,
1101                  child_thr->th.th_local.reduce_data);
1102        OMPT_REDUCTION_END;
1103      }
1104    }
1105  }
1106
1107  if (KMP_MASTER_TID(tid)) {
1108    // Need to update the team arrived pointer if we are the primary thread
1109    if (new_state == KMP_BARRIER_UNUSED_STATE)
1110      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
1111    else
1112      team->t.t_bar[bt].b_arrived = new_state;
1113    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
1114                  "arrived(%p) = %llu\n",
1115                  gtid, team->t.t_id, tid, team->t.t_id,
1116                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1117  }
1118  KA_TRACE(
1119      20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
1120           gtid, team->t.t_id, tid, bt));
1121}
1122
1123// The reverse versions seem to beat the forward versions overall
1124#define KMP_REVERSE_HYPER_BAR
1125static void __kmp_hyper_barrier_release(
1126    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1127    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1128  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
1129  kmp_team_t *team;
1130  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1131  kmp_info_t **other_threads;
1132  kmp_uint32 num_threads;
1133  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
1134  kmp_uint32 branch_factor = 1 << branch_bits;
1135  kmp_uint32 child;
1136  kmp_uint32 child_tid;
1137  kmp_uint32 offset;
1138  kmp_uint32 level;
1139
1140  /* Perform a hypercube-embedded tree release for all of the threads that have
1141     been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
1142     are released in the reverse order of the corresponding gather, otherwise
1143     threads are released in the same order. */
1144  if (KMP_MASTER_TID(tid)) { // primary thread
1145    team = __kmp_threads[gtid]->th.th_team;
1146    KMP_DEBUG_ASSERT(team != NULL);
1147    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for "
1148                  "barrier type %d\n",
1149                  gtid, team->t.t_id, tid, bt));
1150#if KMP_BARRIER_ICV_PUSH
1151    if (propagate_icvs) { // primary already has ICVs in final destination; copy
1152      copy_icvs(&thr_bar->th_fixed_icvs,
1153                &team->t.t_implicit_task_taskdata[tid].td_icvs);
1154    }
1155#endif
1156  } else { // Handle fork barrier workers who aren't part of a team yet
1157    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
1158                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
1159    // Wait for parent thread to release us
1160    kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1161    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1162#if USE_ITT_BUILD && USE_ITT_NOTIFY
1163    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
1164      // In fork barrier where we could not get the object reliably
1165      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
1166      // Cancel wait on previous parallel region...
1167      __kmp_itt_task_starting(itt_sync_obj);
1168
1169      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1170        return;
1171
1172      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1173      if (itt_sync_obj != NULL)
1174        // Call prepare as early as possible for "new" barrier
1175        __kmp_itt_task_finished(itt_sync_obj);
1176    } else
1177#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1178        // Early exit for reaping threads releasing forkjoin barrier
1179        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1180      return;
1181
1182    // The worker thread may now assume that the team is valid.
1183    team = __kmp_threads[gtid]->th.th_team;
1184    KMP_DEBUG_ASSERT(team != NULL);
1185    tid = __kmp_tid_from_gtid(gtid);
1186
1187    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
1188    KA_TRACE(20,
1189             ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1190              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1191    KMP_MB(); // Flush all pending memory write invalidates.
1192  }
1193  num_threads = this_thr->th.th_team_nproc;
1194  other_threads = team->t.t_threads;
1195
1196#ifdef KMP_REVERSE_HYPER_BAR
1197  // Count up to correct level for parent
1198  for (level = 0, offset = 1;
1199       offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
1200       level += branch_bits, offset <<= branch_bits)
1201    ;
1202
1203  // Now go down from there
1204  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
1205       level -= branch_bits, offset >>= branch_bits)
1206#else
1207  // Go down the tree, level by level
1208  for (level = 0, offset = 1; offset < num_threads;
1209       level += branch_bits, offset <<= branch_bits)
1210#endif // KMP_REVERSE_HYPER_BAR
1211  {
1212#ifdef KMP_REVERSE_HYPER_BAR
1213    /* Now go in reverse order through the children, highest to lowest.
1214       Initial setting of child is conservative here. */
1215    child = num_threads >> ((level == 0) ? level : level - 1);
1216    for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
1217        child_tid = tid + (child << level);
1218         child >= 1; child--, child_tid -= (1 << level))
1219#else
1220    if (((tid >> level) & (branch_factor - 1)) != 0)
1221      // No need to go lower than this, since this is the level parent would be
1222      // notified
1223      break;
1224    // Iterate through children on this level of the tree
1225    for (child = 1, child_tid = tid + (1 << level);
1226         child < branch_factor && child_tid < num_threads;
1227         child++, child_tid += (1 << level))
1228#endif // KMP_REVERSE_HYPER_BAR
1229    {
1230      if (child_tid >= num_threads)
1231        continue; // Child doesn't exist so keep going
1232      else {
1233        kmp_info_t *child_thr = other_threads[child_tid];
1234        kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1235#if KMP_CACHE_MANAGE
1236        kmp_uint32 next_child_tid = child_tid - (1 << level);
1237// Prefetch next thread's go count
1238#ifdef KMP_REVERSE_HYPER_BAR
1239        if (child - 1 >= 1 && next_child_tid < num_threads)
1240#else
1241        if (child + 1 < branch_factor && next_child_tid < num_threads)
1242#endif // KMP_REVERSE_HYPER_BAR
1243          KMP_CACHE_PREFETCH(
1244              &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
1245#endif /* KMP_CACHE_MANAGE */
1246
1247#if KMP_BARRIER_ICV_PUSH
1248        if (propagate_icvs) // push my fixed ICVs to my child
1249          copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1250#endif // KMP_BARRIER_ICV_PUSH
1251
1252        KA_TRACE(
1253            20,
1254            ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
1255             "go(%p): %u => %u\n",
1256             gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1257             team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1258             child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1259        // Release child from barrier
1260        kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1261        flag.release();
1262      }
1263    }
1264  }
1265#if KMP_BARRIER_ICV_PUSH
1266  if (propagate_icvs &&
1267      !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
1268    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1269                             FALSE);
1270    copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1271              &thr_bar->th_fixed_icvs);
1272  }
1273#endif
1274  KA_TRACE(
1275      20,
1276      ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1277       gtid, team->t.t_id, tid, bt));
1278}
1279
1280// Hierarchical Barrier
1281
1282// Initialize thread barrier data
1283/* Initializes/re-initializes the hierarchical barrier data stored on a thread.
1284   Performs the minimum amount of initialization required based on how the team
1285   has changed. Returns true if leaf children will require both on-core and
1286   traditional wake-up mechanisms. For example, if the team size increases,
1287   threads already in the team will respond to on-core wakeup on their parent
1288   thread, but threads newly added to the team will only be listening on the
1289   their local b_go. */
1290static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
1291                                                   kmp_bstate_t *thr_bar,
1292                                                   kmp_uint32 nproc, int gtid,
1293                                                   int tid, kmp_team_t *team) {
1294  // Checks to determine if (re-)initialization is needed
1295  bool uninitialized = thr_bar->team == NULL;
1296  bool team_changed = team != thr_bar->team;
1297  bool team_sz_changed = nproc != thr_bar->nproc;
1298  bool tid_changed = tid != thr_bar->old_tid;
1299  bool retval = false;
1300
1301  if (uninitialized || team_sz_changed) {
1302    __kmp_get_hierarchy(nproc, thr_bar);
1303  }
1304
1305  if (uninitialized || team_sz_changed || tid_changed) {
1306    thr_bar->my_level = thr_bar->depth - 1; // default for primary thread
1307    thr_bar->parent_tid = -1; // default for primary thread
1308    if (!KMP_MASTER_TID(tid)) {
1309      // if not primary thread, find parent thread in hierarchy
1310      kmp_uint32 d = 0;
1311      while (d < thr_bar->depth) { // find parent based on level of thread in
1312        // hierarchy, and note level
1313        kmp_uint32 rem;
1314        if (d == thr_bar->depth - 2) { // reached level right below the primary
1315          thr_bar->parent_tid = 0;
1316          thr_bar->my_level = d;
1317          break;
1318        } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) {
1319          // TODO: can we make the above op faster?
1320          // thread is not a subtree root at next level, so this is max
1321          thr_bar->parent_tid = tid - rem;
1322          thr_bar->my_level = d;
1323          break;
1324        }
1325        ++d;
1326      }
1327    }
1328    __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
1329                            (thr_bar->skip_per_level[thr_bar->my_level])),
1330                       &(thr_bar->offset));
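    // Illustrative reading of the offset math: a thread that is the k-th
    // child of its parent, k = (tid - parent_tid) / skip_per_level[my_level],
    // gets offset = 7 - k. For leaves, which use the oncore flag paths below,
    // skip_per_level[0] == 1, so leaf child k maps to byte offset + 1 = 8 - k
    // of the parent's 64-bit flag, i.e. bytes 7 down to 1.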
1331    thr_bar->old_tid = tid;
1332    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1333    thr_bar->team = team;
1334    thr_bar->parent_bar =
1335        &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
1336  }
1337  if (uninitialized || team_changed || tid_changed) {
1338    thr_bar->team = team;
1339    thr_bar->parent_bar =
1340        &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
1341    retval = true;
1342  }
1343  if (uninitialized || team_sz_changed || tid_changed) {
1344    thr_bar->nproc = nproc;
1345    thr_bar->leaf_kids = thr_bar->base_leaf_kids;
1346    if (thr_bar->my_level == 0)
1347      thr_bar->leaf_kids = 0;
1348    if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
1349      __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
1350    thr_bar->leaf_state = 0;
1351    for (int i = 0; i < thr_bar->leaf_kids; ++i)
1352      ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
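    // Illustrative value: with leaf_kids == 3, the loop above sets bytes 7, 6
    // and 5 of leaf_state to 1, which on a little-endian machine reads as
    // 0x0101010000000000ULL: one flag byte per leaf child of this thread.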
1353  }
1354  return retval;
1355}
1356
1357static void __kmp_hierarchical_barrier_gather(
1358    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1359    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1360  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
1361  kmp_team_t *team = this_thr->th.th_team;
1362  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1363  kmp_uint32 nproc = this_thr->th.th_team_nproc;
1364  kmp_info_t **other_threads = team->t.t_threads;
1365  kmp_uint64 new_state = 0;
1366
1367  int level = team->t.t_level;
1368  if (other_threads[0]
1369          ->th.th_teams_microtask) // are we inside the teams construct?
1370    if (this_thr->th.th_teams_size.nteams > 1)
1371      ++level; // level was not increased in teams construct for team_of_masters
1372  if (level == 1)
1373    thr_bar->use_oncore_barrier = 1;
1374  else
1375    thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1376
1377  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
1378                "barrier type %d\n",
1379                gtid, team->t.t_id, tid, bt));
1380  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
1381
1382#if USE_ITT_BUILD && USE_ITT_NOTIFY
1383  // Barrier imbalance - save arrive time to the thread
1384  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
1385    this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
1386  }
1387#endif
1388
1389  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
1390                                               team);
1391
1392  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
1393    kmp_int32 child_tid;
1394    new_state =
1395        (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1396    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1397        thr_bar->use_oncore_barrier) {
1398      if (thr_bar->leaf_kids) {
1399        // First, wait for leaf children to check-in on my b_arrived flag
1400        kmp_uint64 leaf_state =
1401            KMP_MASTER_TID(tid)
1402                ? thr_bar->b_arrived | thr_bar->leaf_state
1403                : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
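        // The wait target is this thread's own b_arrived value with every
        // leaf child's flag byte set, so the wait below completes only once
        // all leaf children have written their byte (see the oncore release
        // near the end of this function).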
1404        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
1405                      "for leaf kids\n",
1406                      gtid, team->t.t_id, tid));
1407        kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
1408        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1409        if (reduce) {
1410          OMPT_REDUCTION_DECL(this_thr, gtid);
1411          OMPT_REDUCTION_BEGIN;
1412          for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
1413               ++child_tid) {
1414            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1415                           "T#%d(%d:%d)\n",
1416                           gtid, team->t.t_id, tid,
1417                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1418                           child_tid));
1419            (*reduce)(this_thr->th.th_local.reduce_data,
1420                      other_threads[child_tid]->th.th_local.reduce_data);
1421          }
1422          OMPT_REDUCTION_END;
1423        }
1424        // clear leaf_state bits
1425        KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
1426      }
1427      // Next, wait for higher level children on each child's b_arrived flag
1428      for (kmp_uint32 d = 1; d < thr_bar->my_level;
1429           ++d) { // gather lowest level threads first, but skip 0
1430        kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
1431                   skip = thr_bar->skip_per_level[d];
1432        if (last > nproc)
1433          last = nproc;
1434        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1435          kmp_info_t *child_thr = other_threads[child_tid];
1436          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1437          KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1438                        "T#%d(%d:%d) "
1439                        "arrived(%p) == %llu\n",
1440                        gtid, team->t.t_id, tid,
1441                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1442                        child_tid, &child_bar->b_arrived, new_state));
1443          kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
1444          flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1445          if (reduce) {
1446            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1447                           "T#%d(%d:%d)\n",
1448                           gtid, team->t.t_id, tid,
1449                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1450                           child_tid));
1451            (*reduce)(this_thr->th.th_local.reduce_data,
1452                      child_thr->th.th_local.reduce_data);
1453          }
1454        }
1455      }
1456    } else { // Blocktime is not infinite
1457      for (kmp_uint32 d = 0; d < thr_bar->my_level;
1458           ++d) { // Gather lowest level threads first
1459        kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
1460                   skip = thr_bar->skip_per_level[d];
1461        if (last > nproc)
1462          last = nproc;
1463        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1464          kmp_info_t *child_thr = other_threads[child_tid];
1465          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1466          KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1467                        "T#%d(%d:%d) "
1468                        "arrived(%p) == %llu\n",
1469                        gtid, team->t.t_id, tid,
1470                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1471                        child_tid, &child_bar->b_arrived, new_state));
1472          kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
1473          flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1474          if (reduce) {
1475            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1476                           "T#%d(%d:%d)\n",
1477                           gtid, team->t.t_id, tid,
1478                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1479                           child_tid));
1480            (*reduce)(this_thr->th.th_local.reduce_data,
1481                      child_thr->th.th_local.reduce_data);
1482          }
1483        }
1484      }
1485    }
1486  }
1487  // All subordinates are gathered; now release parent if not primary thread
1488
1489  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
1490    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1491                  " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1492                  gtid, team->t.t_id, tid,
1493                  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1494                  thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1495                  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1496    /* Mark arrival to parent: After performing this write, a worker thread may
1497       not assume that the team is valid any more - it could be deallocated by
1498       the primary thread at any time. */
1499    if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1500        !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
1501      // flag; release it
1502      kmp_flag_64<> flag(&thr_bar->b_arrived,
1503                         other_threads[thr_bar->parent_tid]);
1504      flag.release();
1505    } else {
1506      // Leaf does special release on "offset" bits of parent's b_arrived flag
1507      thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1508      kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
1509                           thr_bar->offset + 1);
1510      flag.set_waiter(other_threads[thr_bar->parent_tid]);
1511      flag.release();
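      // Writing just byte offset + 1 of the parent's b_arrived lets the
      // parent collect all of its leaves with a single 64-bit compare against
      // the composite leaf_state target built earlier in this function.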
1512    }
1513  } else { // Primary thread needs to update the team's b_arrived value
1514    team->t.t_bar[bt].b_arrived = new_state;
1515    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1516                  "arrived(%p) = %llu\n",
1517                  gtid, team->t.t_id, tid, team->t.t_id,
1518                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1519  }
1520  // Is the team access below unsafe or just technically invalid?
1521  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1522                "barrier type %d\n",
1523                gtid, team->t.t_id, tid, bt));
1524}
1525
1526static void __kmp_hierarchical_barrier_release(
1527    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1528    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1529  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1530  kmp_team_t *team;
1531  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1532  kmp_uint32 nproc;
1533  bool team_change = false; // indicates on-core barrier shouldn't be used
1534
1535  if (KMP_MASTER_TID(tid)) {
1536    team = __kmp_threads[gtid]->th.th_team;
1537    KMP_DEBUG_ASSERT(team != NULL);
1538    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary "
1539                  "entered barrier type %d\n",
1540                  gtid, team->t.t_id, tid, bt));
1541  } else { // Worker threads
1542    // Wait for parent thread to release me
1543    if (!thr_bar->use_oncore_barrier ||
1544        __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1545        thr_bar->team == NULL) {
1546      // Use traditional method of waiting on my own b_go flag
1547      thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1548      kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1549      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1550      TCW_8(thr_bar->b_go,
1551            KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1552    } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1553      // infinite, not nested
1554      // Wait on my "offset" bits on parent's b_go flag
1555      thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1556      kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1557                           thr_bar->offset + 1, bt,
1558                           this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1559      flag.wait(this_thr, TRUE);
1560      if (thr_bar->wait_flag ==
1561          KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1562        TCW_8(thr_bar->b_go,
1563              KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1564      } else { // Reset my bits on parent's b_go flag
1565        (RCAST(volatile char *,
1566               &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0;
1567      }
1568    }
1569    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1570    // Early exit for reaping threads releasing forkjoin barrier
1571    if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1572      return;
1573    // The worker thread may now assume that the team is valid.
1574    team = __kmp_threads[gtid]->th.th_team;
1575    KMP_DEBUG_ASSERT(team != NULL);
1576    tid = __kmp_tid_from_gtid(gtid);
1577
1578    KA_TRACE(
1579        20,
1580        ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1581         gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1582    KMP_MB(); // Flush all pending memory write invalidates.
1583  }
1584
1585  nproc = this_thr->th.th_team_nproc;
1586  int level = team->t.t_level;
1587  if (team->t.t_threads[0]
1588          ->th.th_teams_microtask) { // are we inside the teams construct?
1589    if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1590        this_thr->th.th_teams_level == level)
1591      ++level; // level was not increased in teams construct for team_of_workers
1592    if (this_thr->th.th_teams_size.nteams > 1)
1593      ++level; // level was not increased in teams construct for team_of_masters
1594  }
1595  if (level == 1)
1596    thr_bar->use_oncore_barrier = 1;
1597  else
1598    thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1599
1600  // If the team size has increased, we still communicate with old leaves via
1601  // oncore barrier.
1602  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1603  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1604  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1605                                                       tid, team);
1606  // But if the entire team changes, we won't use oncore barrier at all
1607  if (team_change)
1608    old_leaf_kids = 0;
1609
1610#if KMP_BARRIER_ICV_PUSH
1611  if (propagate_icvs) {
1612    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1613                             FALSE);
    if (KMP_MASTER_TID(tid)) {
      // primary thread already has the ICVs in their final destination;
      // copy them to th_fixed_icvs so that its children can access them
1616      copy_icvs(&thr_bar->th_fixed_icvs,
1617                &team->t.t_implicit_task_taskdata[tid].td_icvs);
1618    } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1619               thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1620      if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1621        // leaves (on-core children) pull parent's fixed ICVs directly to local
1622        // ICV store
1623        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1624                  &thr_bar->parent_bar->th_fixed_icvs);
1625      // non-leaves will get ICVs piggybacked with b_go via NGO store
1626    } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1627      if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1628        // access
1629        copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1630      else // leaves copy parent's fixed ICVs directly to local ICV store
1631        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1632                  &thr_bar->parent_bar->th_fixed_icvs);
1633    }
1634  }
1635#endif // KMP_BARRIER_ICV_PUSH
1636
1637  // Now, release my children
1638  if (thr_bar->my_level) { // not a leaf
1639    kmp_int32 child_tid;
1640    kmp_uint32 last;
1641    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1642        thr_bar->use_oncore_barrier) {
1643      if (KMP_MASTER_TID(tid)) { // do a flat release
1644        // Set local b_go to bump children via NGO store of the cache line
        // containing ICVs and b_go.
1646        thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1647        // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1648        // the cache line
1649        ngo_load(&thr_bar->th_fixed_icvs);
1650        // This loops over all the threads skipping only the leaf nodes in the
1651        // hierarchy
1652        for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1653             child_tid += thr_bar->skip_per_level[1]) {
1654          kmp_bstate_t *child_bar =
1655              &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1656          KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1657                        "releasing T#%d(%d:%d)"
1658                        " go(%p): %u => %u\n",
1659                        gtid, team->t.t_id, tid,
1660                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1661                        child_tid, &child_bar->b_go, child_bar->b_go,
1662                        child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1663          // Use ngo store (if available) to both store ICVs and release child
1664          // via child's b_go
1665          ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1666        }
1667        ngo_sync();
1668      }
1669      TCW_8(thr_bar->b_go,
1670            KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1671      // Now, release leaf children
1672      if (thr_bar->leaf_kids) { // if there are any
1673        // We test team_change on the off-chance that the level 1 team changed.
1674        if (team_change ||
1675            old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1676          if (old_leaf_kids) { // release old leaf kids
1677            thr_bar->b_go |= old_leaf_state;
1678          }
1679          // Release new leaf kids
1680          last = tid + thr_bar->skip_per_level[1];
1681          if (last > nproc)
1682            last = nproc;
1683          for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1684               ++child_tid) { // skip_per_level[0]=1
1685            kmp_info_t *child_thr = team->t.t_threads[child_tid];
1686            kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1687            KA_TRACE(
1688                20,
1689                ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1690                 " T#%d(%d:%d) go(%p): %u => %u\n",
1691                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1692                 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1693                 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1694            // Release child using child's b_go flag
1695            kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1696            flag.release();
1697          }
1698        } else { // Release all children at once with leaf_state bits on my own
1699          // b_go flag
1700          thr_bar->b_go |= thr_bar->leaf_state;
1701        }
1702      }
1703    } else { // Blocktime is not infinite; do a simple hierarchical release
1704      for (int d = thr_bar->my_level - 1; d >= 0;
1705           --d) { // Release highest level threads first
1706        last = tid + thr_bar->skip_per_level[d + 1];
1707        kmp_uint32 skip = thr_bar->skip_per_level[d];
1708        if (last > nproc)
1709          last = nproc;
1710        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1711          kmp_info_t *child_thr = team->t.t_threads[child_tid];
1712          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1713          KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1714                        "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1715                        gtid, team->t.t_id, tid,
1716                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1717                        child_tid, &child_bar->b_go, child_bar->b_go,
1718                        child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1719          // Release child using child's b_go flag
1720          kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1721          flag.release();
1722        }
1723      }
1724    }
1725#if KMP_BARRIER_ICV_PUSH
1726    if (propagate_icvs && !KMP_MASTER_TID(tid))
1727      // non-leaves copy ICVs from fixed ICVs to local dest
1728      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1729                &thr_bar->th_fixed_icvs);
1730#endif // KMP_BARRIER_ICV_PUSH
1731  }
1732  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1733                "barrier type %d\n",
1734                gtid, team->t.t_id, tid, bt));
1735}
1736
1737// End of Barrier Algorithms
1738
1739// type traits for cancellable value
1740// if cancellable is true, then is_cancellable is a normal boolean variable
1741// if cancellable is false, then is_cancellable is a compile time constant
1742template <bool cancellable> struct is_cancellable {};
1743template <> struct is_cancellable<true> {
1744  bool value;
1745  is_cancellable() : value(false) {}
1746  is_cancellable(bool b) : value(b) {}
1747  is_cancellable &operator=(bool b) {
1748    value = b;
1749    return *this;
1750  }
1751  operator bool() const { return value; }
1752};
1753template <> struct is_cancellable<false> {
1754  is_cancellable &operator=(bool b) { return *this; }
1755  constexpr operator bool() const { return false; }
1756};
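// Illustrative compile-time check (added for exposition only; nothing below
// depends on it): the <false> specialization must fold to a constant false so
// that cancellation branches in the barrier template can be optimized away.
static_assert(!is_cancellable<false>(),
              "is_cancellable<false> must convert to false");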
1757
1758// Internal function to do a barrier.
/* If is_split is true, do a split barrier; otherwise, do a plain barrier.
   If reduce is non-NULL, do a split reduction barrier; otherwise, do a
   barrier without a reduction.
1762   When cancellable = false,
1763     Returns 0 if primary thread, 1 if worker thread.
1764   When cancellable = true
1765     Returns 0 if not cancelled, 1 if cancelled.  */
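/* Call shapes used later in this file, for orientation:
     __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
                              reduce)
       is the non-cancellable form wrapped by __kmp_barrier() below;
     __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL)
       is the cancellable form used by __kmp_barrier_gomp_cancel(). */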
1766template <bool cancellable = false>
1767static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1768                                  size_t reduce_size, void *reduce_data,
1769                                  void (*reduce)(void *, void *)) {
1770  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1771  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1772  int tid = __kmp_tid_from_gtid(gtid);
1773  kmp_info_t *this_thr = __kmp_threads[gtid];
1774  kmp_team_t *team = this_thr->th.th_team;
1775  int status = 0;
1776  is_cancellable<cancellable> cancelled;
1777#if OMPT_SUPPORT && OMPT_OPTIONAL
1778  ompt_data_t *my_task_data;
1779  ompt_data_t *my_parallel_data;
1780  void *return_address;
1781  ompt_sync_region_t barrier_kind;
1782#endif
1783
1784  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1785                __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1786
1787#if OMPT_SUPPORT
1788  if (ompt_enabled.enabled) {
1789#if OMPT_OPTIONAL
1790    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1791    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1792    return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1793    barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1794    if (ompt_enabled.ompt_callback_sync_region) {
1795      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1796          barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1797          return_address);
1798    }
1799    if (ompt_enabled.ompt_callback_sync_region_wait) {
1800      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1801          barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1802          return_address);
1803    }
1804#endif
1805    // It is OK to report the barrier state after the barrier begin callback.
1806    // According to the OMPT specification, a compliant implementation may
1807    // even delay reporting this state until the barrier begins to wait.
1808    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1809  }
1810#endif
1811
1812  if (!team->t.t_serialized) {
1813#if USE_ITT_BUILD
1814    // This value will be used in itt notify events below.
1815    void *itt_sync_obj = NULL;
1816#if USE_ITT_NOTIFY
1817    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1818      itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1819#endif
1820#endif /* USE_ITT_BUILD */
1821    if (__kmp_tasking_mode == tskm_extra_barrier) {
1822      __kmp_tasking_barrier(team, this_thr, gtid);
1823      KA_TRACE(15,
1824               ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1825                __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1826    }
1827
1828    /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1829       access it when the team struct is not guaranteed to exist. */
1830    // See note about the corresponding code in __kmp_join_barrier() being
1831    // performance-critical.
1832    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1833#if KMP_USE_MONITOR
1834      this_thr->th.th_team_bt_intervals =
1835          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1836      this_thr->th.th_team_bt_set =
1837          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1838#else
1839      this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1840#endif
1841    }
1842
1843#if USE_ITT_BUILD
1844    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1845      __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1846#endif /* USE_ITT_BUILD */
1847#if USE_DEBUGGER
    // Let the debugger know: the thread has arrived at the barrier and is
    // waiting.
1849    if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct
1850      team->t.t_bar[bt].b_master_arrived += 1;
1851    } else {
1852      this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1853    } // if
1854#endif /* USE_DEBUGGER */
1855    if (reduce != NULL) {
1856      // KMP_DEBUG_ASSERT( is_split == TRUE );  // #C69956
1857      this_thr->th.th_local.reduce_data = reduce_data;
1858    }
1859
1860    if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
      // pass 0: set up only the current task team, and only if nthreads > 1
1862      __kmp_task_team_setup(this_thr, team, 0);
1863
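    // Gather phase: the cancellable path always uses the linear gather (the
    // only gather with cancellation checks in this file); otherwise the
    // algorithm is selected per barrier type via
    // __kmp_barrier_gather_pattern[bt].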
1864    if (cancellable) {
1865      cancelled = __kmp_linear_barrier_gather_cancellable(
1866          bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1867    } else {
1868      switch (__kmp_barrier_gather_pattern[bt]) {
1869      case bp_dist_bar: {
1870        __kmp_dist_barrier_gather(bt, this_thr, gtid, tid,
1871                                  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1872        break;
1873      }
1874      case bp_hyper_bar: {
1875        // don't set branch bits to 0; use linear
1876        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1877        __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1878                                   reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1879        break;
1880      }
1881      case bp_hierarchical_bar: {
1882        __kmp_hierarchical_barrier_gather(
1883            bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1884        break;
1885      }
1886      case bp_tree_bar: {
1887        // don't set branch bits to 0; use linear
1888        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1889        __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1890                                  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1891        break;
1892      }
1893      default: {
1894        __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1895                                    reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1896      }
1897      }
1898    }
1899
1900    KMP_MB();
1901
1902    if (KMP_MASTER_TID(tid)) {
1903      status = 0;
1904      if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1905        __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1906      }
1907#if USE_DEBUGGER
      // Let the debugger know: all threads have arrived and are starting to
      // leave the barrier.
1910      team->t.t_bar[bt].b_team_arrived += 1;
1911#endif
1912
1913      if (__kmp_omp_cancellation) {
1914        kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1915        // Reset cancellation flag for worksharing constructs
1916        if (cancel_request == cancel_loop ||
1917            cancel_request == cancel_sections) {
1918          KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1919        }
1920      }
1921#if USE_ITT_BUILD
1922      /* TODO: In case of split reduction barrier, primary thread may send
1923         acquired event early, before the final summation into the shared
1924         variable is done (final summation can be a long operation for array
1925         reductions).  */
1926      if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1927        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1928#endif /* USE_ITT_BUILD */
1929#if USE_ITT_BUILD && USE_ITT_NOTIFY
1930      // Barrier - report frame end (only if active_level == 1)
1931      if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1932          __kmp_forkjoin_frames_mode &&
1933          (this_thr->th.th_teams_microtask == NULL || // either not in teams
1934           this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1935          team->t.t_active_level == 1) {
1936        ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1937        kmp_uint64 cur_time = __itt_get_timestamp();
1938        kmp_info_t **other_threads = team->t.t_threads;
1939        int nproc = this_thr->th.th_team_nproc;
1940        int i;
1941        switch (__kmp_forkjoin_frames_mode) {
1942        case 1:
1943          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1944                                 loc, nproc);
1945          this_thr->th.th_frame_time = cur_time;
1946          break;
1947        case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1948          // be fixed)
1949          __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1950                                 1, loc, nproc);
1951          break;
1952        case 3:
1953          if (__itt_metadata_add_ptr) {
1954            // Initialize with primary thread's wait time
1955            kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1956            // Set arrive time to zero to be able to check it in
1957            // __kmp_invoke_task(); the same is done inside the loop below
1958            this_thr->th.th_bar_arrive_time = 0;
1959            for (i = 1; i < nproc; ++i) {
1960              delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1961              other_threads[i]->th.th_bar_arrive_time = 0;
1962            }
1963            __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1964                                         cur_time, delta,
1965                                         (kmp_uint64)(reduce != NULL));
1966          }
1967          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1968                                 loc, nproc);
1969          this_thr->th.th_frame_time = cur_time;
1970          break;
1971        }
1972      }
1973#endif /* USE_ITT_BUILD */
1974    } else {
1975      status = 1;
1976#if USE_ITT_BUILD
1977      if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1978        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1979#endif /* USE_ITT_BUILD */
1980    }
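    // Release phase: workers (status == 1) always release; the primary thread
    // releases here only for non-split barriers and otherwise defers the
    // worker release to __kmp_end_split_barrier() below.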
1981    if ((status == 1 || !is_split) && !cancelled) {
1982      if (cancellable) {
1983        cancelled = __kmp_linear_barrier_release_cancellable(
1984            bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1985      } else {
1986        switch (__kmp_barrier_release_pattern[bt]) {
1987        case bp_dist_bar: {
1988          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1989          __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
1990                                     FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1991          break;
1992        }
1993        case bp_hyper_bar: {
1994          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1995          __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1996                                      FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1997          break;
1998        }
1999        case bp_hierarchical_bar: {
2000          __kmp_hierarchical_barrier_release(
2001              bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2002          break;
2003        }
2004        case bp_tree_bar: {
2005          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2006          __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
2007                                     FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2008          break;
2009        }
2010        default: {
2011          __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
2012                                       FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2013        }
2014        }
2015      }
2016      if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
2017        __kmp_task_team_sync(this_thr, team);
2018      }
2019    }
2020
2021#if USE_ITT_BUILD
2022    /* GEH: TODO: Move this under if-condition above and also include in
2023       __kmp_end_split_barrier(). This will more accurately represent the actual
2024       release time of the threads for split barriers.  */
2025    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2026      __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2027#endif /* USE_ITT_BUILD */
2028  } else { // Team is serialized.
2029    status = 0;
2030    if (__kmp_tasking_mode != tskm_immediate_exec) {
2031      if (this_thr->th.th_task_team != NULL) {
2032#if USE_ITT_NOTIFY
2033        void *itt_sync_obj = NULL;
2034        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2035          itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
2036          __kmp_itt_barrier_starting(gtid, itt_sync_obj);
2037        }
2038#endif
2039
2040        KMP_DEBUG_ASSERT(
2041            this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE ||
2042            this_thr->th.th_task_team->tt.tt_hidden_helper_task_encountered ==
2043                TRUE);
2044        __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
2045        __kmp_task_team_setup(this_thr, team, 0);
2046
2047#if USE_ITT_BUILD
2048        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2049          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2050#endif /* USE_ITT_BUILD */
2051      }
2052    }
2053  }
2054  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
2055                gtid, __kmp_team_from_gtid(gtid)->t.t_id,
2056                __kmp_tid_from_gtid(gtid), status));
2057
2058#if OMPT_SUPPORT
2059  if (ompt_enabled.enabled) {
2060#if OMPT_OPTIONAL
2061    if (ompt_enabled.ompt_callback_sync_region_wait) {
2062      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2063          barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
2064          return_address);
2065    }
2066    if (ompt_enabled.ompt_callback_sync_region) {
2067      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2068          barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
2069          return_address);
2070    }
2071#endif
2072    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
2073  }
2074#endif
2075
2076  if (cancellable)
2077    return (int)cancelled;
2078  return status;
2079}
2080
2081// Returns 0 if primary thread, 1 if worker thread.
2082int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
2083                  size_t reduce_size, void *reduce_data,
2084                  void (*reduce)(void *, void *)) {
2085  return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
2086                                  reduce);
2087}
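// A minimal usage sketch (hypothetical caller, for illustration only): a
// plain, non-split barrier with no reduction, where a zero return value
// identifies the primary thread.
//
//   if (__kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL) == 0) {
//     // primary-thread-only work after the barrier
//   }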
2088
2089#if defined(KMP_GOMP_COMPAT)
2090// Returns 1 if cancelled, 0 otherwise
2091int __kmp_barrier_gomp_cancel(int gtid) {
2092  if (__kmp_omp_cancellation) {
2093    int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
2094                                                 0, NULL, NULL);
2095    if (cancelled) {
2096      int tid = __kmp_tid_from_gtid(gtid);
2097      kmp_info_t *this_thr = __kmp_threads[gtid];
2098      if (KMP_MASTER_TID(tid)) {
2099        // Primary thread does not need to revert anything
2100      } else {
2101        // Workers need to revert their private b_arrived flag
2102        this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
2103            KMP_BARRIER_STATE_BUMP;
2104      }
2105    }
2106    return cancelled;
2107  }
2108  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2109  return FALSE;
2110}
2111#endif
2112
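/* Completes a split barrier: for a split barrier the primary thread skips the
   release phase inside __kmp_barrier() (see the status/is_split check there)
   and performs the worker release here, once the intervening work such as a
   reduction epilogue has finished. */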
2113void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
2114  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
2115  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
2116  KMP_DEBUG_ASSERT(bt < bs_last_barrier);
2117  int tid = __kmp_tid_from_gtid(gtid);
2118  kmp_info_t *this_thr = __kmp_threads[gtid];
2119  kmp_team_t *team = this_thr->th.th_team;
2120
2121  if (!team->t.t_serialized) {
2122    if (KMP_MASTER_GTID(gtid)) {
2123      switch (__kmp_barrier_release_pattern[bt]) {
2124      case bp_dist_bar: {
2125        __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
2126                                   FALSE USE_ITT_BUILD_ARG(NULL));
2127        break;
2128      }
2129      case bp_hyper_bar: {
2130        KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2131        __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
2132                                    FALSE USE_ITT_BUILD_ARG(NULL));
2133        break;
2134      }
2135      case bp_hierarchical_bar: {
2136        __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
2137                                           FALSE USE_ITT_BUILD_ARG(NULL));
2138        break;
2139      }
2140      case bp_tree_bar: {
2141        KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2142        __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
2143                                   FALSE USE_ITT_BUILD_ARG(NULL));
2144        break;
2145      }
2146      default: {
2147        __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
2148                                     FALSE USE_ITT_BUILD_ARG(NULL));
2149      }
2150      }
2151      if (__kmp_tasking_mode != tskm_immediate_exec) {
2152        __kmp_task_team_sync(this_thr, team);
2153      } // if
2154    }
2155  }
2156}
2157
2158void __kmp_join_barrier(int gtid) {
2159  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
2160  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
2161
2162  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
2163
2164  kmp_info_t *this_thr = __kmp_threads[gtid];
2165  kmp_team_t *team;
2166  int tid;
2167#ifdef KMP_DEBUG
2168  int team_id;
2169#endif /* KMP_DEBUG */
2170#if USE_ITT_BUILD
2171  void *itt_sync_obj = NULL;
2172#if USE_ITT_NOTIFY
2173  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
2174    // Get object created at fork_barrier
2175    itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2176#endif
2177#endif /* USE_ITT_BUILD */
2178#if ((USE_ITT_BUILD && USE_ITT_NOTIFY) || defined KMP_DEBUG)
2179  int nproc = this_thr->th.th_team_nproc;
2180#endif
2181  KMP_MB();
2182
2183  // Get current info
2184  team = this_thr->th.th_team;
2185  KMP_DEBUG_ASSERT(nproc == team->t.t_nproc);
2186  tid = __kmp_tid_from_gtid(gtid);
2187#ifdef KMP_DEBUG
2188  team_id = team->t.t_id;
2189  kmp_info_t *master_thread = this_thr->th.th_team_master;
2190  if (master_thread != team->t.t_threads[0]) {
2191    __kmp_print_structure();
2192  }
  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
#endif /* KMP_DEBUG */
2195  KMP_MB();
2196
2197  // Verify state
2198  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
2199  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
2200  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
2201  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
2202                gtid, team_id, tid));
2203
2204#if OMPT_SUPPORT
2205  if (ompt_enabled.enabled) {
2206#if OMPT_OPTIONAL
2207    ompt_data_t *my_task_data;
2208    ompt_data_t *my_parallel_data;
2209    void *codeptr = NULL;
2210    int ds_tid = this_thr->th.th_info.ds.ds_tid;
2211    if (KMP_MASTER_TID(ds_tid) &&
2212        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2213         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2214      codeptr = team->t.ompt_team_info.master_return_address;
2215    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
2216    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
2217    if (ompt_enabled.ompt_callback_sync_region) {
2218      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2219          ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
2220          my_task_data, codeptr);
2221    }
2222    if (ompt_enabled.ompt_callback_sync_region_wait) {
2223      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2224          ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
2225          my_task_data, codeptr);
2226    }
2227    if (!KMP_MASTER_TID(ds_tid))
2228      this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
2229#endif
2230    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
2231  }
2232#endif
2233
2234  if (__kmp_tasking_mode == tskm_extra_barrier) {
2235    __kmp_tasking_barrier(team, this_thr, gtid);
2236    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n",
2237                  gtid, team_id, tid));
2238  }
2239#ifdef KMP_DEBUG
2240  if (__kmp_tasking_mode != tskm_immediate_exec) {
2241    KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
2242                  "%p, th_task_team = %p\n",
2243                  __kmp_gtid_from_thread(this_thr), team_id,
2244                  team->t.t_task_team[this_thr->th.th_task_state],
2245                  this_thr->th.th_task_team));
2246    if (this_thr->th.th_task_team)
2247      KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
2248                       team->t.t_task_team[this_thr->th.th_task_state]);
2249  }
2250#endif /* KMP_DEBUG */
2251
2252  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
2253     access it when the team struct is not guaranteed to exist. Doing these
     loads causes a cache miss that slows down EPCC parallel by 2x. As a
     workaround, we do not perform the copy if blocktime=infinite, since the
     values are not used by __kmp_wait_template() in that case. */
2257  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
2258#if KMP_USE_MONITOR
2259    this_thr->th.th_team_bt_intervals =
2260        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
2261    this_thr->th.th_team_bt_set =
2262        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
2263#else
2264    this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
2265#endif
2266  }
2267
2268#if USE_ITT_BUILD
2269  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2270    __kmp_itt_barrier_starting(gtid, itt_sync_obj);
2271#endif /* USE_ITT_BUILD */
2272
2273  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
2274  case bp_dist_bar: {
2275    __kmp_dist_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2276                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2277    break;
2278  }
2279  case bp_hyper_bar: {
2280    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
2281    __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2282                               NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2283    break;
2284  }
2285  case bp_hierarchical_bar: {
2286    __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2287                                      NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2288    break;
2289  }
2290  case bp_tree_bar: {
2291    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
2292    __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2293                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2294    break;
2295  }
2296  default: {
2297    __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2298                                NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2299  }
2300  }
2301
2302  /* From this point on, the team data structure may be deallocated at any time
2303     by the primary thread - it is unsafe to reference it in any of the worker
2304     threads. Any per-team data items that need to be referenced before the
2305     end of the barrier should be moved to the kmp_task_team_t structs.  */
2306  if (KMP_MASTER_TID(tid)) {
2307    if (__kmp_tasking_mode != tskm_immediate_exec) {
2308      __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
2309    }
2310    if (__kmp_display_affinity) {
2311      KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
2312    }
2313#if KMP_STATS_ENABLED
    // Have the primary thread flag the workers to indicate they are now
    // waiting for the next parallel region. Also wake them up so they switch
    // their timers to idle.
2317    for (int i = 0; i < team->t.t_nproc; ++i) {
2318      kmp_info_t *team_thread = team->t.t_threads[i];
2319      if (team_thread == this_thr)
2320        continue;
2321      team_thread->th.th_stats->setIdleFlag();
2322      if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
2323          team_thread->th.th_sleep_loc != NULL)
2324        __kmp_null_resume_wrapper(team_thread);
2325    }
2326#endif
2327#if USE_ITT_BUILD
2328    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2329      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
2330#endif /* USE_ITT_BUILD */
2331
2332#if USE_ITT_BUILD && USE_ITT_NOTIFY
2333    // Join barrier - report frame end
2334    if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2335        __kmp_forkjoin_frames_mode &&
2336        (this_thr->th.th_teams_microtask == NULL || // either not in teams
2337         this_thr->th.th_teams_size.nteams == 1) && // or inside single team
2338        team->t.t_active_level == 1) {
2339      kmp_uint64 cur_time = __itt_get_timestamp();
2340      ident_t *loc = team->t.t_ident;
2341      kmp_info_t **other_threads = team->t.t_threads;
2342      switch (__kmp_forkjoin_frames_mode) {
2343      case 1:
2344        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
2345                               loc, nproc);
2346        break;
2347      case 2:
2348        __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
2349                               loc, nproc);
2350        break;
2351      case 3:
2352        if (__itt_metadata_add_ptr) {
2353          // Initialize with primary thread's wait time
2354          kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
2355          // Set arrive time to zero to be able to check it in
2356          // __kmp_invoke_task(); the same is done inside the loop below
2357          this_thr->th.th_bar_arrive_time = 0;
2358          for (int i = 1; i < nproc; ++i) {
2359            delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
2360            other_threads[i]->th.th_bar_arrive_time = 0;
2361          }
2362          __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
2363                                       cur_time, delta, 0);
2364        }
2365        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
2366                               loc, nproc);
2367        this_thr->th.th_frame_time = cur_time;
2368        break;
2369      }
2370    }
2371#endif /* USE_ITT_BUILD */
2372  }
2373#if USE_ITT_BUILD
2374  else {
2375    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2376      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
2377  }
2378#endif /* USE_ITT_BUILD */
2379
2380#if KMP_DEBUG
2381  if (KMP_MASTER_TID(tid)) {
2382    KA_TRACE(
2383        15,
2384        ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
2385         gtid, team_id, tid, nproc));
2386  }
2387#endif /* KMP_DEBUG */
2388
2389  // TODO now, mark worker threads as done so they may be disbanded
2390  KMP_MB(); // Flush all pending memory write invalidates.
2391  KA_TRACE(10,
2392           ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
2393
2394}
2395
2396// TODO release worker threads' fork barriers as we are ready instead of all at
2397// once
2398void __kmp_fork_barrier(int gtid, int tid) {
2399  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
2400  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
2401  kmp_info_t *this_thr = __kmp_threads[gtid];
2402  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
2403#if USE_ITT_BUILD
2404  void *itt_sync_obj = NULL;
2405#endif /* USE_ITT_BUILD */
2406#ifdef KMP_DEBUG
2407  if (team)
2408    KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
2409                  (team != NULL) ? team->t.t_id : -1, tid));
2410#endif
2411  // th_team pointer only valid for primary thread here
2412  if (KMP_MASTER_TID(tid)) {
2413#if USE_ITT_BUILD && USE_ITT_NOTIFY
2414    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2415      // Create itt barrier object
2416      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
2417      __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
2418    }
2419#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2420
2421#ifdef KMP_DEBUG
2422    KMP_DEBUG_ASSERT(team);
2423    kmp_info_t **other_threads = team->t.t_threads;
2424    int i;
2425
2426    // Verify state
2427    KMP_MB();
2428
2429    for (i = 1; i < team->t.t_nproc; ++i) {
2430      KA_TRACE(500,
2431               ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
2432                "== %u.\n",
2433                gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
2434                team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
2435                other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
2436      KMP_DEBUG_ASSERT(
2437          (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
2438           ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
2439      KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
2440    }
2441#endif
2442
2443    if (__kmp_tasking_mode != tskm_immediate_exec) {
      // pass 0: set up the current task team only if nthreads > 1
2445      __kmp_task_team_setup(this_thr, team, 0);
2446    }
2447
2448    /* The primary thread may have changed its blocktime between join barrier
2449       and fork barrier. Copy the blocktime info to the thread, where
2450       __kmp_wait_template() can access it when the team struct is not
2451       guaranteed to exist. */
2452    // See note about the corresponding code in __kmp_join_barrier() being
2453    // performance-critical
2454    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
2455#if KMP_USE_MONITOR
2456      this_thr->th.th_team_bt_intervals =
2457          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
2458      this_thr->th.th_team_bt_set =
2459          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
2460#else
2461      this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
2462#endif
2463    }
2464  } // primary thread
2465
2466  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
2467  case bp_dist_bar: {
2468    __kmp_dist_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2469                               TRUE USE_ITT_BUILD_ARG(NULL));
2470    break;
2471  }
2472  case bp_hyper_bar: {
2473    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
2474    __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2475                                TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2476    break;
2477  }
2478  case bp_hierarchical_bar: {
2479    __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2480                                       TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2481    break;
2482  }
2483  case bp_tree_bar: {
2484    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
2485    __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2486                               TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2487    break;
2488  }
2489  default: {
2490    __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2491                                 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2492  }
2493  }
2494
2495#if OMPT_SUPPORT
2496  if (ompt_enabled.enabled &&
2497      this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
2498    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    ompt_data_t *task_data = (team)
                                 ? OMPT_CUR_TASK_DATA(this_thr)
                                 : &(this_thr->th.ompt_thread_info.task_data);
    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
#if OMPT_OPTIONAL
    void *codeptr = NULL;
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = team ? team->t.ompt_team_info.master_return_address : NULL;
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
          codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
          codeptr);
    }
#endif
    if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, task_data, 0, ds_tid,
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
    }
  }
#endif

  // Early exit for reaping threads releasing forkjoin barrier
  if (TCR_4(__kmp_global.g.g_done)) {
    this_thr->th.th_task_team = NULL;

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
      if (!KMP_MASTER_TID(tid)) {
        itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
        if (itt_sync_obj)
          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
      }
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
    KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
    return;
  }

  /* We can now assume that a valid team structure has been allocated by the
     primary thread and propagated to all worker threads. The current thread,
     however, may not be part of the team, so we can't blindly assume that the
     team pointer is non-null.  */
  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
  KMP_DEBUG_ASSERT(team != NULL);
  tid = __kmp_tid_from_gtid(gtid);

#if KMP_BARRIER_ICV_PULL
  /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
     __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
     implicit task has this data before this function is called. We cannot
     modify __kmp_fork_call() to look at the fixed ICVs in the primary thread's
     thread struct, because it is not always the case that the threads arrays
     have been allocated when __kmp_fork_call() is executed. */
  {
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
    if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs
      // Copy the initial ICVs from the primary thread's thread struct to the
      // implicit task for this tid.
      KA_TRACE(10,
               ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
      __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
                               tid, FALSE);
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                &team->t.t_threads[0]
                     ->th.th_bar[bs_forkjoin_barrier]
                     .bb.th_fixed_icvs);
    }
  }
#endif // KMP_BARRIER_ICV_PULL

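  // Sync this thread's task team pointer with the one published on the team,
  // now that the release phase of the fork barrier is complete.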
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    __kmp_task_team_sync(this_thr, team);
  }

#if KMP_AFFINITY_SUPPORTED
  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
  if (proc_bind == proc_bind_intel) {
    // Call dynamic affinity settings
    if (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed) {
      __kmp_balanced_affinity(this_thr, team->t.t_nproc);
    }
  } else if (proc_bind != proc_bind_false) {
    if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
      KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
                     __kmp_gtid_from_thread(this_thr),
                     this_thr->th.th_current_place));
    } else {
      __kmp_affinity_bind_place(gtid);
    }
  }
#endif // KMP_AFFINITY_SUPPORTED
  // Perform the display affinity functionality
  if (__kmp_display_affinity) {
    if (team->t.t_display_affinity
#if KMP_AFFINITY_SUPPORTED
        || (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed)
#endif
    ) {
      // NULL means use the affinity-format-var ICV
      __kmp_aux_display_affinity(gtid, NULL);
      this_thr->th.th_prev_num_threads = team->t.t_nproc;
      this_thr->th.th_prev_level = team->t.t_level;
    }
  }
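  // Non-primary threads pick up the team's def-allocator ICV; the update is
  // only written if the value actually differs.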
  if (!KMP_MASTER_TID(tid))
    KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
    if (!KMP_MASTER_TID(tid)) {
      // Get correct barrier object
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
    } // (prepare called inside barrier_release)
  }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
                team->t.t_id, tid));
}

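/* __kmp_setup_icv_copy stages the new ICVs according to which ICV propagation
   scheme was compiled in:
   - KMP_BARRIER_ICV_PULL: publish the ICVs in the primary thread's
     th_fixed_icvs slot; each worker copies them during the fork barrier (see
     __kmp_fork_barrier above).
   - KMP_BARRIER_ICV_PUSH: the ICVs are propagated during the fork barrier
     itself, so nothing is staged here.
   - Otherwise: copy the ICVs into every worker's implicit task directly,
     which costs O(nthreads). */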
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);

  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);

/* Primary thread's copy of the ICVs was set up on the implicit taskdata in
   __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
   implicit task has this data before this function is called. */
#if KMP_BARRIER_ICV_PULL
  /* Copy ICVs to primary thread's thread structure into th_fixed_icvs (which
     remains untouched), where all of the worker threads can access them and
     make their own copies after the barrier. */
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  copy_icvs(
      &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
      new_icvs);
  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#elif KMP_BARRIER_ICV_PUSH
  // The ICVs will be propagated in the fork barrier, so nothing needs to be
  // done here.
  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#else
  // Copy the ICVs to each of the non-primary threads.  This takes O(nthreads)
  // time.
  ngo_load(new_icvs);
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread
    // TODO: GEH - pass in better source location info since usually NULL here
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
    __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
    ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
  }
  ngo_sync();
#endif // KMP_BARRIER_ICV_PULL
}
