kmp_barrier.cpp revision 360784
1/*
2 * kmp_barrier.cpp
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#include "kmp.h"
14#include "kmp_wait_release.h"
15#include "kmp_itt.h"
16#include "kmp_os.h"
17#include "kmp_stats.h"
18#include "ompt-specific.h"
19
20#if KMP_MIC
21#include <immintrin.h>
22#define USE_NGO_STORES 1
23#endif // KMP_MIC
24
25#include "tsan_annotations.h"
26
27#if KMP_MIC && USE_NGO_STORES
28// ICV copying
29#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
30#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32#define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
33#else
34#define ngo_load(src) ((void)0)
35#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
36#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
37#define ngo_sync() ((void)0)
38#endif /* KMP_MIC && USE_NGO_STORES */
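// Note: on KMP_MIC the ngo_* macros above use _mm512_storenrngo_pd, a
// non-globally-ordered streaming store of a full cache line (the ICVs plus
// the b_go flag), which is why ngo_sync() issues a locked add as a store
// fence.  On all other targets they degrade to ordinary copies (copy_icvs()
// and a CACHE_LINE-sized KMP_MEMCPY), so the code below can use them
// unconditionally.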
39
40void __kmp_print_structure(void); // Forward declaration
41
42// ---------------------------- Barrier Algorithms ----------------------------
43
44// Linear Barrier
45template <bool cancellable = false>
46static bool __kmp_linear_barrier_gather_template(
47    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
48    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
49  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
50  kmp_team_t *team = this_thr->th.th_team;
51  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
52  kmp_info_t **other_threads = team->t.t_threads;
53
54  KA_TRACE(
55      20,
56      ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
57       gtid, team->t.t_id, tid, bt));
58  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
59
60#if USE_ITT_BUILD && USE_ITT_NOTIFY
61  // Barrier imbalance - save arrive time to the thread
62  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
63    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
64        __itt_get_timestamp();
65  }
66#endif
67  // We now perform a linear reduction to signal that all of the threads have
68  // arrived.
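  // Sketch of the linear scheme: each worker just bumps its own b_arrived
  // flag to signal the master, while the master (tid 0) loops over every
  // worker in turn, waits on that worker's b_arrived, and folds in reduce()
  // data as it goes, so the gather cost grows linearly with the team size.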
69  if (!KMP_MASTER_TID(tid)) {
70    KA_TRACE(20,
71             ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
72              " arrived(%p): %llu => %llu\n",
73              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
74              team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
75              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
76    // Mark arrival to master thread
77    /* After performing this write, a worker thread may not assume that the team
78       is valid any more - it could be deallocated by the master thread at any
79       time. */
80    ANNOTATE_BARRIER_BEGIN(this_thr);
81    kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
82    flag.release();
83  } else {
84    kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
85    int nproc = this_thr->th.th_team_nproc;
86    int i;
87    // No sleep bit or atomics needed here; this is the team's barrier state
88    kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
89
90    // Collect all the worker team member threads.
91    for (i = 1; i < nproc; ++i) {
92#if KMP_CACHE_MANAGE
93      // Prefetch next thread's arrived count
94      if (i + 1 < nproc)
95        KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
96#endif /* KMP_CACHE_MANAGE */
97      KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
98                    "arrived(%p) == %llu\n",
99                    gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
100                    team->t.t_id, i,
101                    &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
102
103      // Wait for worker thread to arrive
104      kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
105                       new_state);
106      if (cancellable) {
107        bool cancelled = flag.wait_cancellable_nosleep(
108            this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
109        if (cancelled)
110          return true;
111      } else {
112        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
113      }
114      ANNOTATE_BARRIER_END(other_threads[i]);
115#if USE_ITT_BUILD && USE_ITT_NOTIFY
116      // Barrier imbalance - write min of the thread time and the other thread
117      // time to the thread.
118      if (__kmp_forkjoin_frames_mode == 2) {
119        this_thr->th.th_bar_min_time = KMP_MIN(
120            this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
121      }
122#endif
123      if (reduce) {
124        KA_TRACE(100,
125                 ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
126                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
127                  team->t.t_id, i));
128        ANNOTATE_REDUCE_AFTER(reduce);
129        OMPT_REDUCTION_DECL(this_thr, gtid);
130        OMPT_REDUCTION_BEGIN;
131        (*reduce)(this_thr->th.th_local.reduce_data,
132                  other_threads[i]->th.th_local.reduce_data);
133        OMPT_REDUCTION_END;
134        ANNOTATE_REDUCE_BEFORE(reduce);
135        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
136      }
137    }
138    // No sleep bit or atomics needed here; this is the team's barrier state
139    team_bar->b_arrived = new_state;
140    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
141                  "arrived(%p) = %llu\n",
142                  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
143                  new_state));
144  }
145  KA_TRACE(
146      20,
147      ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
148       gtid, team->t.t_id, tid, bt));
149  return false;
150}
151
152template <bool cancellable = false>
153static bool __kmp_linear_barrier_release_template(
154    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
155    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
156  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
157  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
158  kmp_team_t *team;
159
160  if (KMP_MASTER_TID(tid)) {
161    unsigned int i;
162    kmp_uint32 nproc = this_thr->th.th_team_nproc;
163    kmp_info_t **other_threads;
164
165    team = __kmp_threads[gtid]->th.th_team;
166    KMP_DEBUG_ASSERT(team != NULL);
167    other_threads = team->t.t_threads;
168
169    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
170                  "barrier type %d\n",
171                  gtid, team->t.t_id, tid, bt));
172
173    if (nproc > 1) {
174#if KMP_BARRIER_ICV_PUSH
175      {
176        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
177        if (propagate_icvs) {
178          ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
179          for (i = 1; i < nproc; ++i) {
180            __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
181                                     team, i, FALSE);
182            ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
183                           &team->t.t_implicit_task_taskdata[0].td_icvs);
184          }
185          ngo_sync();
186        }
187      }
188#endif // KMP_BARRIER_ICV_PUSH
189
190      // Now, release all of the worker threads
191      for (i = 1; i < nproc; ++i) {
192#if KMP_CACHE_MANAGE
193        // Prefetch next thread's go flag
194        if (i + 1 < nproc)
195          KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
196#endif /* KMP_CACHE_MANAGE */
197        KA_TRACE(
198            20,
199            ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
200             "go(%p): %u => %u\n",
201             gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
202             team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
203             other_threads[i]->th.th_bar[bt].bb.b_go,
204             other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
205        ANNOTATE_BARRIER_BEGIN(other_threads[i]);
206        kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
207                         other_threads[i]);
208        flag.release();
209      }
210    }
211  } else { // Wait for the MASTER thread to release us
212    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
213                  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
214    kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
215    if (cancellable) {
216      bool cancelled = flag.wait_cancellable_nosleep(
217          this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
218      if (cancelled) {
219        return true;
220      }
221    } else {
222      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
223    }
224    ANNOTATE_BARRIER_END(this_thr);
225#if USE_ITT_BUILD && USE_ITT_NOTIFY
226    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
227      // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
228      // disabled)
229      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
230      // Cancel wait on previous parallel region...
231      __kmp_itt_task_starting(itt_sync_obj);
232
233      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
234        return false;
235
236      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
237      if (itt_sync_obj != NULL)
238        // Call prepare as early as possible for "new" barrier
239        __kmp_itt_task_finished(itt_sync_obj);
240    } else
241#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
242        // Early exit for reaping threads releasing forkjoin barrier
243        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
244      return false;
245// The worker thread may now assume that the team is valid.
246#ifdef KMP_DEBUG
247    tid = __kmp_tid_from_gtid(gtid);
248    team = __kmp_threads[gtid]->th.th_team;
249#endif
250    KMP_DEBUG_ASSERT(team != NULL);
251    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
252    KA_TRACE(20,
253             ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
254              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
255    KMP_MB(); // Flush all pending memory write invalidates.
256  }
257  KA_TRACE(
258      20,
259      ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
260       gtid, team->t.t_id, tid, bt));
261  return false;
262}
263
264static void __kmp_linear_barrier_gather(
265    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
266    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
267  __kmp_linear_barrier_gather_template<false>(
268      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
269}
270
271static bool __kmp_linear_barrier_gather_cancellable(
272    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
273    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
274  return __kmp_linear_barrier_gather_template<true>(
275      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
276}
277
278static void __kmp_linear_barrier_release(
279    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
280    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
281  __kmp_linear_barrier_release_template<false>(
282      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
283}
284
285static bool __kmp_linear_barrier_release_cancellable(
286    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
287    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
288  return __kmp_linear_barrier_release_template<true>(
289      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
290}
291
292// Tree barrier
293static void
294__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
295                          int tid, void (*reduce)(void *, void *)
296                                       USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
297  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
298  kmp_team_t *team = this_thr->th.th_team;
299  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
300  kmp_info_t **other_threads = team->t.t_threads;
301  kmp_uint32 nproc = this_thr->th.th_team_nproc;
302  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
303  kmp_uint32 branch_factor = 1 << branch_bits;
304  kmp_uint32 child;
305  kmp_uint32 child_tid;
306  kmp_uint64 new_state;
307
308  KA_TRACE(
309      20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
310           gtid, team->t.t_id, tid, bt));
311  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
312
313#if USE_ITT_BUILD && USE_ITT_NOTIFY
314  // Barrier imbalance - save arrive time to the thread
315  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
316    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
317        __itt_get_timestamp();
318  }
319#endif
320  // Perform tree gather to wait until all threads have arrived; reduce any
321  // required data as we go
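  // In this tree, a thread's children are tids (tid << branch_bits) + 1 ..
  // (tid << branch_bits) + branch_factor and its parent is (tid - 1) >>
  // branch_bits.  For example, with branch_bits == 2 (branch_factor 4), tid 0
  // gathers tids 1-4, tid 1 gathers tids 5-8, and tid 5 reports to tid 1.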
322  child_tid = (tid << branch_bits) + 1;
323  if (child_tid < nproc) {
324    // Parent threads wait for all their children to arrive
325    new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
326    child = 1;
327    do {
328      kmp_info_t *child_thr = other_threads[child_tid];
329      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
330#if KMP_CACHE_MANAGE
331      // Prefetch next thread's arrived count
332      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
333        KMP_CACHE_PREFETCH(
334            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
335#endif /* KMP_CACHE_MANAGE */
336      KA_TRACE(20,
337               ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
338                "arrived(%p) == %llu\n",
339                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
340                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
341      // Wait for child to arrive
342      kmp_flag_64 flag(&child_bar->b_arrived, new_state);
343      flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
344      ANNOTATE_BARRIER_END(child_thr);
345#if USE_ITT_BUILD && USE_ITT_NOTIFY
346      // Barrier imbalance - write min of the thread time and a child time to
347      // the thread.
348      if (__kmp_forkjoin_frames_mode == 2) {
349        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
350                                               child_thr->th.th_bar_min_time);
351      }
352#endif
353      if (reduce) {
354        KA_TRACE(100,
355                 ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
356                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
357                  team->t.t_id, child_tid));
358        ANNOTATE_REDUCE_AFTER(reduce);
359        OMPT_REDUCTION_DECL(this_thr, gtid);
360        OMPT_REDUCTION_BEGIN;
361        (*reduce)(this_thr->th.th_local.reduce_data,
362                  child_thr->th.th_local.reduce_data);
363        OMPT_REDUCTION_END;
364        ANNOTATE_REDUCE_BEFORE(reduce);
365        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
366      }
367      child++;
368      child_tid++;
369    } while (child <= branch_factor && child_tid < nproc);
370  }
371
372  if (!KMP_MASTER_TID(tid)) { // Worker threads
373    kmp_int32 parent_tid = (tid - 1) >> branch_bits;
374
375    KA_TRACE(20,
376             ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
377              "arrived(%p): %llu => %llu\n",
378              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
379              team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
380              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
381
382    // Mark arrival to parent thread
383    /* After performing this write, a worker thread may not assume that the team
384       is valid any more - it could be deallocated by the master thread at any
385       time.  */
386    ANNOTATE_BARRIER_BEGIN(this_thr);
387    kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
388    flag.release();
389  } else {
390    // Need to update the team arrived pointer if we are the master thread
391    if (nproc > 1) // New value was already computed above
392      team->t.t_bar[bt].b_arrived = new_state;
393    else
394      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
395    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
396                  "arrived(%p) = %llu\n",
397                  gtid, team->t.t_id, tid, team->t.t_id,
398                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
399  }
400  KA_TRACE(20,
401           ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
402            gtid, team->t.t_id, tid, bt));
403}
404
405static void __kmp_tree_barrier_release(
406    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
407    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
408  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
409  kmp_team_t *team;
410  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
411  kmp_uint32 nproc;
412  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
413  kmp_uint32 branch_factor = 1 << branch_bits;
414  kmp_uint32 child;
415  kmp_uint32 child_tid;
416
417  // Perform a tree release for all of the threads that have been gathered
418  if (!KMP_MASTER_TID(
419          tid)) { // Handle fork barrier workers who aren't part of a team yet
420    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
421                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
422    // Wait for parent thread to release us
423    kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
424    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
425    ANNOTATE_BARRIER_END(this_thr);
426#if USE_ITT_BUILD && USE_ITT_NOTIFY
427    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
428      // In fork barrier where we could not get the object reliably (or
429      // ITTNOTIFY is disabled)
430      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
431      // Cancel wait on previous parallel region...
432      __kmp_itt_task_starting(itt_sync_obj);
433
434      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
435        return;
436
437      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
438      if (itt_sync_obj != NULL)
439        // Call prepare as early as possible for "new" barrier
440        __kmp_itt_task_finished(itt_sync_obj);
441    } else
442#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
443        // Early exit for reaping threads releasing forkjoin barrier
444        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
445      return;
446
447    // The worker thread may now assume that the team is valid.
448    team = __kmp_threads[gtid]->th.th_team;
449    KMP_DEBUG_ASSERT(team != NULL);
450    tid = __kmp_tid_from_gtid(gtid);
451
452    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
453    KA_TRACE(20,
454             ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
455              team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
456    KMP_MB(); // Flush all pending memory write invalidates.
457  } else {
458    team = __kmp_threads[gtid]->th.th_team;
459    KMP_DEBUG_ASSERT(team != NULL);
460    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
461                  "barrier type %d\n",
462                  gtid, team->t.t_id, tid, bt));
463  }
464  nproc = this_thr->th.th_team_nproc;
465  child_tid = (tid << branch_bits) + 1;
466
467  if (child_tid < nproc) {
468    kmp_info_t **other_threads = team->t.t_threads;
469    child = 1;
470    // Parent threads release all their children
471    do {
472      kmp_info_t *child_thr = other_threads[child_tid];
473      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
474#if KMP_CACHE_MANAGE
475      // Prefetch next thread's go count
476      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
477        KMP_CACHE_PREFETCH(
478            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
479#endif /* KMP_CACHE_MANAGE */
480
481#if KMP_BARRIER_ICV_PUSH
482      {
483        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
484        if (propagate_icvs) {
485          __kmp_init_implicit_task(team->t.t_ident,
486                                   team->t.t_threads[child_tid], team,
487                                   child_tid, FALSE);
488          copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
489                    &team->t.t_implicit_task_taskdata[0].td_icvs);
490        }
491      }
492#endif // KMP_BARRIER_ICV_PUSH
493      KA_TRACE(20,
494               ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
495                " go(%p): %u => %u\n",
496                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
497                team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
498                child_bar->b_go + KMP_BARRIER_STATE_BUMP));
499      // Release child from barrier
500      ANNOTATE_BARRIER_BEGIN(child_thr);
501      kmp_flag_64 flag(&child_bar->b_go, child_thr);
502      flag.release();
503      child++;
504      child_tid++;
505    } while (child <= branch_factor && child_tid < nproc);
506  }
507  KA_TRACE(
508      20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
509           gtid, team->t.t_id, tid, bt));
510}
511
512// Hyper Barrier
513static void
514__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
515                           int tid, void (*reduce)(void *, void *)
516                                        USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
517  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
518  kmp_team_t *team = this_thr->th.th_team;
519  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
520  kmp_info_t **other_threads = team->t.t_threads;
521  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
522  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
523  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
524  kmp_uint32 branch_factor = 1 << branch_bits;
525  kmp_uint32 offset;
526  kmp_uint32 level;
527
528  KA_TRACE(
529      20,
530      ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
531       gtid, team->t.t_id, tid, bt));
532  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
533
534#if USE_ITT_BUILD && USE_ITT_NOTIFY
535  // Barrier imbalance - save arrive time to the thread
536  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
537    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
538        __itt_get_timestamp();
539  }
540#endif
541  /* Perform a hypercube-embedded tree gather to wait until all of the threads
542     have arrived, and reduce any required data as we go.  */
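  /* For example, with branch_factor 2 and 8 threads: at level 0 the odd tids
     signal their arrival to tids 0, 2, 4 and 6 and drop out; at level 1, tids
     2 and 6 signal tids 0 and 4; at level 2, tid 4 signals tid 0.  Each
     surviving thread gathers its children at a level before moving up, so the
     whole gather takes ceil(log_2(nproc)) rounds. */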
543  kmp_flag_64 p_flag(&thr_bar->b_arrived);
544  for (level = 0, offset = 1; offset < num_threads;
545       level += branch_bits, offset <<= branch_bits) {
546    kmp_uint32 child;
547    kmp_uint32 child_tid;
548
549    if (((tid >> level) & (branch_factor - 1)) != 0) {
550      kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
551
552      KA_TRACE(20,
553               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
554                "arrived(%p): %llu => %llu\n",
555                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
556                team->t.t_id, parent_tid, &thr_bar->b_arrived,
557                thr_bar->b_arrived,
558                thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
559      // Mark arrival to parent thread
560      /* After performing this write (in the last iteration of the enclosing for
561         loop), a worker thread may not assume that the team is valid any more
562         - it could be deallocated by the master thread at any time.  */
563      ANNOTATE_BARRIER_BEGIN(this_thr);
564      p_flag.set_waiter(other_threads[parent_tid]);
565      p_flag.release();
566      break;
567    }
568
569    // Parent threads wait for children to arrive
570    if (new_state == KMP_BARRIER_UNUSED_STATE)
571      new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
572    for (child = 1, child_tid = tid + (1 << level);
573         child < branch_factor && child_tid < num_threads;
574         child++, child_tid += (1 << level)) {
575      kmp_info_t *child_thr = other_threads[child_tid];
576      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
577#if KMP_CACHE_MANAGE
578      kmp_uint32 next_child_tid = child_tid + (1 << level);
579      // Prefetch next thread's arrived count
580      if (child + 1 < branch_factor && next_child_tid < num_threads)
581        KMP_CACHE_PREFETCH(
582            &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
583#endif /* KMP_CACHE_MANAGE */
584      KA_TRACE(20,
585               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
586                "arrived(%p) == %llu\n",
587                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
588                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
589      // Wait for child to arrive
590      kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
591      c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
592      ANNOTATE_BARRIER_END(child_thr);
593#if USE_ITT_BUILD && USE_ITT_NOTIFY
594      // Barrier imbalance - write min of the thread time and a child time to
595      // the thread.
596      if (__kmp_forkjoin_frames_mode == 2) {
597        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
598                                               child_thr->th.th_bar_min_time);
599      }
600#endif
601      if (reduce) {
602        KA_TRACE(100,
603                 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
604                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
605                  team->t.t_id, child_tid));
606        ANNOTATE_REDUCE_AFTER(reduce);
607        OMPT_REDUCTION_DECL(this_thr, gtid);
608        OMPT_REDUCTION_BEGIN;
609        (*reduce)(this_thr->th.th_local.reduce_data,
610                  child_thr->th.th_local.reduce_data);
611        OMPT_REDUCTION_END;
612        ANNOTATE_REDUCE_BEFORE(reduce);
613        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
614      }
615    }
616  }
617
618  if (KMP_MASTER_TID(tid)) {
619    // Need to update the team arrived pointer if we are the master thread
620    if (new_state == KMP_BARRIER_UNUSED_STATE)
621      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
622    else
623      team->t.t_bar[bt].b_arrived = new_state;
624    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
625                  "arrived(%p) = %llu\n",
626                  gtid, team->t.t_id, tid, team->t.t_id,
627                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
628  }
629  KA_TRACE(
630      20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
631           gtid, team->t.t_id, tid, bt));
632}
633
634// The reverse versions seem to beat the forward versions overall
635#define KMP_REVERSE_HYPER_BAR
636static void __kmp_hyper_barrier_release(
637    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
638    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
639  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
640  kmp_team_t *team;
641  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
642  kmp_info_t **other_threads;
643  kmp_uint32 num_threads;
644  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
645  kmp_uint32 branch_factor = 1 << branch_bits;
646  kmp_uint32 child;
647  kmp_uint32 child_tid;
648  kmp_uint32 offset;
649  kmp_uint32 level;
650
651  /* Perform a hypercube-embedded tree release for all of the threads that have
652     been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
653     are released in the reverse order of the corresponding gather, otherwise
654     threads are released in the same order. */
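  /* For example, with branch_factor 2 and 8 threads under
     KMP_REVERSE_HYPER_BAR, the release runs the gather pattern backwards:
     tid 0 wakes tid 4 first, then tid 2, then tid 1; tid 4 in turn wakes
     tid 6 and then tid 5.  The last threads to check in are therefore the
     first to be released. */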
655  if (KMP_MASTER_TID(tid)) { // master
656    team = __kmp_threads[gtid]->th.th_team;
657    KMP_DEBUG_ASSERT(team != NULL);
658    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
659                  "barrier type %d\n",
660                  gtid, team->t.t_id, tid, bt));
661#if KMP_BARRIER_ICV_PUSH
662    if (propagate_icvs) { // master already has ICVs in final destination; copy
663      copy_icvs(&thr_bar->th_fixed_icvs,
664                &team->t.t_implicit_task_taskdata[tid].td_icvs);
665    }
666#endif
667  } else { // Handle fork barrier workers who aren't part of a team yet
668    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
669                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
670    // Wait for parent thread to release us
671    kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
672    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
673    ANNOTATE_BARRIER_END(this_thr);
674#if USE_ITT_BUILD && USE_ITT_NOTIFY
675    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
676      // In fork barrier where we could not get the object reliably
677      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
678      // Cancel wait on previous parallel region...
679      __kmp_itt_task_starting(itt_sync_obj);
680
681      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
682        return;
683
684      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
685      if (itt_sync_obj != NULL)
686        // Call prepare as early as possible for "new" barrier
687        __kmp_itt_task_finished(itt_sync_obj);
688    } else
689#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
690        // Early exit for reaping threads releasing forkjoin barrier
691        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
692      return;
693
694    // The worker thread may now assume that the team is valid.
695    team = __kmp_threads[gtid]->th.th_team;
696    KMP_DEBUG_ASSERT(team != NULL);
697    tid = __kmp_tid_from_gtid(gtid);
698
699    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
700    KA_TRACE(20,
701             ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
702              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
703    KMP_MB(); // Flush all pending memory write invalidates.
704  }
705  num_threads = this_thr->th.th_team_nproc;
706  other_threads = team->t.t_threads;
707
708#ifdef KMP_REVERSE_HYPER_BAR
709  // Count up to correct level for parent
710  for (level = 0, offset = 1;
711       offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
712       level += branch_bits, offset <<= branch_bits)
713    ;
714
715  // Now go down from there
716  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
717       level -= branch_bits, offset >>= branch_bits)
718#else
719  // Go down the tree, level by level
720  for (level = 0, offset = 1; offset < num_threads;
721       level += branch_bits, offset <<= branch_bits)
722#endif // KMP_REVERSE_HYPER_BAR
723  {
724#ifdef KMP_REVERSE_HYPER_BAR
725    /* Now go in reverse order through the children, highest to lowest.
726       Initial setting of child is conservative here. */
727    child = num_threads >> ((level == 0) ? level : level - 1);
728    for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
729        child_tid = tid + (child << level);
730         child >= 1; child--, child_tid -= (1 << level))
731#else
732    if (((tid >> level) & (branch_factor - 1)) != 0)
733      // No need to go lower than this, since this is the level parent would be
734      // notified
735      break;
736    // Iterate through children on this level of the tree
737    for (child = 1, child_tid = tid + (1 << level);
738         child < branch_factor && child_tid < num_threads;
739         child++, child_tid += (1 << level))
740#endif // KMP_REVERSE_HYPER_BAR
741    {
742      if (child_tid >= num_threads)
743        continue; // Child doesn't exist so keep going
744      else {
745        kmp_info_t *child_thr = other_threads[child_tid];
746        kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
747#if KMP_CACHE_MANAGE
748        kmp_uint32 next_child_tid = child_tid - (1 << level);
749// Prefetch next thread's go count
750#ifdef KMP_REVERSE_HYPER_BAR
751        if (child - 1 >= 1 && next_child_tid < num_threads)
752#else
753        if (child + 1 < branch_factor && next_child_tid < num_threads)
754#endif // KMP_REVERSE_HYPER_BAR
755          KMP_CACHE_PREFETCH(
756              &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
757#endif /* KMP_CACHE_MANAGE */
758
759#if KMP_BARRIER_ICV_PUSH
760        if (propagate_icvs) // push my fixed ICVs to my child
761          copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
762#endif // KMP_BARRIER_ICV_PUSH
763
764        KA_TRACE(
765            20,
766            ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
767             " go(%p): %u => %u\n",
768             gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
769             team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
770             child_bar->b_go + KMP_BARRIER_STATE_BUMP));
771        // Release child from barrier
772        ANNOTATE_BARRIER_BEGIN(child_thr);
773        kmp_flag_64 flag(&child_bar->b_go, child_thr);
774        flag.release();
775      }
776    }
777  }
778#if KMP_BARRIER_ICV_PUSH
779  if (propagate_icvs &&
780      !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
781    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
782                             FALSE);
783    copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
784              &thr_bar->th_fixed_icvs);
785  }
786#endif
787  KA_TRACE(
788      20,
789      ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
790       gtid, team->t.t_id, tid, bt));
791}
792
793// Hierarchical Barrier
794
795// Initialize thread barrier data
796/* Initializes/re-initializes the hierarchical barrier data stored on a thread.
797   Performs the minimum amount of initialization required based on how the team
798   has changed. Returns true if leaf children will require both on-core and
799   traditional wake-up mechanisms. For example, if the team size increases,
800   threads already in the team will respond to on-core wakeup on their parent
801   thread, but threads newly added to the team will only be listening on
802   their local b_go. */
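/* In this scheme skip_per_level[d] is the distance between consecutive
   subtree roots at depth d, so a thread's parent is its tid rounded down to a
   multiple of skip_per_level[my_level + 1].  Leaf children talk to their
   parent through individual bytes of the parent's 64-bit flag words:
   offset = 7 - (tid - parent_tid - 1) picks the byte, and leaf_state is the
   mask with one such byte set per leaf child, which the on-core
   (infinite-blocktime) fast path uses both to gather leaf arrivals on
   b_arrived and to release the leaves in bulk via b_go. */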
803static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
804                                                   kmp_bstate_t *thr_bar,
805                                                   kmp_uint32 nproc, int gtid,
806                                                   int tid, kmp_team_t *team) {
807  // Checks to determine if (re-)initialization is needed
808  bool uninitialized = thr_bar->team == NULL;
809  bool team_changed = team != thr_bar->team;
810  bool team_sz_changed = nproc != thr_bar->nproc;
811  bool tid_changed = tid != thr_bar->old_tid;
812  bool retval = false;
813
814  if (uninitialized || team_sz_changed) {
815    __kmp_get_hierarchy(nproc, thr_bar);
816  }
817
818  if (uninitialized || team_sz_changed || tid_changed) {
819    thr_bar->my_level = thr_bar->depth - 1; // default for master
820    thr_bar->parent_tid = -1; // default for master
821    if (!KMP_MASTER_TID(
822            tid)) { // if not master, find parent thread in hierarchy
823      kmp_uint32 d = 0;
824      while (d < thr_bar->depth) { // find parent based on level of thread in
825        // hierarchy, and note level
826        kmp_uint32 rem;
827        if (d == thr_bar->depth - 2) { // reached level right below the master
828          thr_bar->parent_tid = 0;
829          thr_bar->my_level = d;
830          break;
831        } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) !=
832                   0) { // TODO: can we make this op faster?
833          // thread is not a subtree root at the next level, so d is its max level
834          thr_bar->parent_tid = tid - rem;
835          thr_bar->my_level = d;
836          break;
837        }
838        ++d;
839      }
840    }
841    thr_bar->offset = 7 - (tid - thr_bar->parent_tid - 1);
842    thr_bar->old_tid = tid;
843    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
844    thr_bar->team = team;
845    thr_bar->parent_bar =
846        &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
847  }
848  if (uninitialized || team_changed || tid_changed) {
849    thr_bar->team = team;
850    thr_bar->parent_bar =
851        &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
852    retval = true;
853  }
854  if (uninitialized || team_sz_changed || tid_changed) {
855    thr_bar->nproc = nproc;
856    thr_bar->leaf_kids = thr_bar->base_leaf_kids;
857    if (thr_bar->my_level == 0)
858      thr_bar->leaf_kids = 0;
859    if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
860      thr_bar->leaf_kids = nproc - tid - 1;
861    thr_bar->leaf_state = 0;
862    for (int i = 0; i < thr_bar->leaf_kids; ++i)
863      ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
864  }
865  return retval;
866}
867
868static void __kmp_hierarchical_barrier_gather(
869    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
870    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
871  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
872  kmp_team_t *team = this_thr->th.th_team;
873  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
874  kmp_uint32 nproc = this_thr->th.th_team_nproc;
875  kmp_info_t **other_threads = team->t.t_threads;
876  kmp_uint64 new_state;
877
878  int level = team->t.t_level;
879  if (other_threads[0]
880          ->th.th_teams_microtask) // are we inside the teams construct?
881    if (this_thr->th.th_teams_size.nteams > 1)
882      ++level; // level was not increased in teams construct for team_of_masters
883  if (level == 1)
884    thr_bar->use_oncore_barrier = 1;
885  else
886    thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
887
888  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
889                "barrier type %d\n",
890                gtid, team->t.t_id, tid, bt));
891  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
892
893#if USE_ITT_BUILD && USE_ITT_NOTIFY
894  // Barrier imbalance - save arrive time to the thread
895  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
896    this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
897  }
898#endif
899
900  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
901                                               team);
902
903  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
904    kmp_int32 child_tid;
905    new_state =
906        (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
907    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
908        thr_bar->use_oncore_barrier) {
909      if (thr_bar->leaf_kids) {
910        // First, wait for leaf children to check-in on my b_arrived flag
911        kmp_uint64 leaf_state =
912            KMP_MASTER_TID(tid)
913                ? thr_bar->b_arrived | thr_bar->leaf_state
914                : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
915        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
916                      "for leaf kids\n",
917                      gtid, team->t.t_id, tid));
918        kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
919        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
920        if (reduce) {
921          ANNOTATE_REDUCE_AFTER(reduce);
922          OMPT_REDUCTION_DECL(this_thr, gtid);
923          OMPT_REDUCTION_BEGIN;
924          for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
925               ++child_tid) {
926            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
927                           "T#%d(%d:%d)\n",
928                           gtid, team->t.t_id, tid,
929                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
930                           child_tid));
931            ANNOTATE_BARRIER_END(other_threads[child_tid]);
932            (*reduce)(this_thr->th.th_local.reduce_data,
933                      other_threads[child_tid]->th.th_local.reduce_data);
934          }
935          OMPT_REDUCTION_END;
936          ANNOTATE_REDUCE_BEFORE(reduce);
937          ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
938        }
939        // clear leaf_state bits
940        KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
941      }
942      // Next, wait for higher level children on each child's b_arrived flag
943      for (kmp_uint32 d = 1; d < thr_bar->my_level;
944           ++d) { // gather lowest level threads first, but skip 0
945        kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
946                   skip = thr_bar->skip_per_level[d];
947        if (last > nproc)
948          last = nproc;
949        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
950          kmp_info_t *child_thr = other_threads[child_tid];
951          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
952          KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
953                        "T#%d(%d:%d) "
954                        "arrived(%p) == %llu\n",
955                        gtid, team->t.t_id, tid,
956                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
957                        child_tid, &child_bar->b_arrived, new_state));
958          kmp_flag_64 flag(&child_bar->b_arrived, new_state);
959          flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
960          ANNOTATE_BARRIER_END(child_thr);
961          if (reduce) {
962            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
963                           "T#%d(%d:%d)\n",
964                           gtid, team->t.t_id, tid,
965                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
966                           child_tid));
967            ANNOTATE_REDUCE_AFTER(reduce);
968            (*reduce)(this_thr->th.th_local.reduce_data,
969                      child_thr->th.th_local.reduce_data);
970            ANNOTATE_REDUCE_BEFORE(reduce);
971            ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
972          }
973        }
974      }
975    } else { // Blocktime is not infinite
976      for (kmp_uint32 d = 0; d < thr_bar->my_level;
977           ++d) { // Gather lowest level threads first
978        kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
979                   skip = thr_bar->skip_per_level[d];
980        if (last > nproc)
981          last = nproc;
982        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
983          kmp_info_t *child_thr = other_threads[child_tid];
984          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
985          KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
986                        "T#%d(%d:%d) "
987                        "arrived(%p) == %llu\n",
988                        gtid, team->t.t_id, tid,
989                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
990                        child_tid, &child_bar->b_arrived, new_state));
991          kmp_flag_64 flag(&child_bar->b_arrived, new_state);
992          flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
993          ANNOTATE_BARRIER_END(child_thr);
994          if (reduce) {
995            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
996                           "T#%d(%d:%d)\n",
997                           gtid, team->t.t_id, tid,
998                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
999                           child_tid));
1000            ANNOTATE_REDUCE_AFTER(reduce);
1001            (*reduce)(this_thr->th.th_local.reduce_data,
1002                      child_thr->th.th_local.reduce_data);
1003            ANNOTATE_REDUCE_BEFORE(reduce);
1004            ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
1005          }
1006        }
1007      }
1008    }
1009  }
1010  // All subordinates are gathered; now release parent if not master thread
1011
1012  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
1013    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1014                  " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1015                  gtid, team->t.t_id, tid,
1016                  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1017                  thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1018                  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1019    /* Mark arrival to parent: After performing this write, a worker thread may
1020       not assume that the team is valid any more - it could be deallocated by
1021       the master thread at any time. */
1022    if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1023        !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
1024      // flag; release it
1025      ANNOTATE_BARRIER_BEGIN(this_thr);
1026      kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
1027      flag.release();
1028    } else {
1029      // Leaf does special release on "offset" bits of parent's b_arrived flag
1030      thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1031      kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
1032      flag.set_waiter(other_threads[thr_bar->parent_tid]);
1033      flag.release();
1034    }
1035  } else { // Master thread needs to update the team's b_arrived value
1036    team->t.t_bar[bt].b_arrived = new_state;
1037    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1038                  "arrived(%p) = %llu\n",
1039                  gtid, team->t.t_id, tid, team->t.t_id,
1040                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1041  }
1042  // Is the team access below unsafe or just technically invalid?
1043  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1044                "barrier type %d\n",
1045                gtid, team->t.t_id, tid, bt));
1046}
1047
1048static void __kmp_hierarchical_barrier_release(
1049    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1050    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1051  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1052  kmp_team_t *team;
1053  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1054  kmp_uint32 nproc;
1055  bool team_change = false; // indicates on-core barrier shouldn't be used
1056
1057  if (KMP_MASTER_TID(tid)) {
1058    team = __kmp_threads[gtid]->th.th_team;
1059    KMP_DEBUG_ASSERT(team != NULL);
1060    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
1061                  "entered barrier type %d\n",
1062                  gtid, team->t.t_id, tid, bt));
1063  } else { // Worker threads
1064    // Wait for parent thread to release me
1065    if (!thr_bar->use_oncore_barrier ||
1066        __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1067        thr_bar->team == NULL) {
1068      // Use traditional method of waiting on my own b_go flag
1069      thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1070      kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1071      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1072      ANNOTATE_BARRIER_END(this_thr);
1073      TCW_8(thr_bar->b_go,
1074            KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1075    } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1076      // infinite, not nested
1077      // Wait on my "offset" bits on parent's b_go flag
1078      thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1079      kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1080                           thr_bar->offset, bt,
1081                           this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1082      flag.wait(this_thr, TRUE);
1083      if (thr_bar->wait_flag ==
1084          KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1085        TCW_8(thr_bar->b_go,
1086              KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1087      } else { // Reset my bits on parent's b_go flag
1088        (RCAST(volatile char *,
1089               &(thr_bar->parent_bar->b_go)))[thr_bar->offset] = 0;
1090      }
1091    }
1092    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1093    // Early exit for reaping threads releasing forkjoin barrier
1094    if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1095      return;
1096    // The worker thread may now assume that the team is valid.
1097    team = __kmp_threads[gtid]->th.th_team;
1098    KMP_DEBUG_ASSERT(team != NULL);
1099    tid = __kmp_tid_from_gtid(gtid);
1100
1101    KA_TRACE(
1102        20,
1103        ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1104         gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1105    KMP_MB(); // Flush all pending memory write invalidates.
1106  }
1107
1108  nproc = this_thr->th.th_team_nproc;
1109  int level = team->t.t_level;
1110  if (team->t.t_threads[0]
1111          ->th.th_teams_microtask) { // are we inside the teams construct?
1112    if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1113        this_thr->th.th_teams_level == level)
1114      ++level; // level was not increased in teams construct for team_of_workers
1115    if (this_thr->th.th_teams_size.nteams > 1)
1116      ++level; // level was not increased in teams construct for team_of_masters
1117  }
1118  if (level == 1)
1119    thr_bar->use_oncore_barrier = 1;
1120  else
1121    thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1122
1123  // If the team size has increased, we still communicate with old leaves via
1124  // oncore barrier.
1125  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1126  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1127  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1128                                                       tid, team);
1129  // But if the entire team changes, we won't use oncore barrier at all
1130  if (team_change)
1131    old_leaf_kids = 0;
1132
1133#if KMP_BARRIER_ICV_PUSH
1134  if (propagate_icvs) {
1135    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1136                             FALSE);
1137    if (KMP_MASTER_TID(
1138            tid)) { // master already has copy in final destination; copy
1139      copy_icvs(&thr_bar->th_fixed_icvs,
1140                &team->t.t_implicit_task_taskdata[tid].td_icvs);
1141    } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1142               thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1143      if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1144        // leaves (on-core children) pull parent's fixed ICVs directly to local
1145        // ICV store
1146        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1147                  &thr_bar->parent_bar->th_fixed_icvs);
1148      // non-leaves will get ICVs piggybacked with b_go via NGO store
1149    } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1150      if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1151        // access
1152        copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1153      else // leaves copy parent's fixed ICVs directly to local ICV store
1154        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1155                  &thr_bar->parent_bar->th_fixed_icvs);
1156    }
1157  }
1158#endif // KMP_BARRIER_ICV_PUSH
1159
1160  // Now, release my children
1161  if (thr_bar->my_level) { // not a leaf
1162    kmp_int32 child_tid;
1163    kmp_uint32 last;
1164    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1165        thr_bar->use_oncore_barrier) {
1166      if (KMP_MASTER_TID(tid)) { // do a flat release
1167        // Set local b_go to bump children via NGO store of the cache line
1168        // containing ICVs and b_go.
1169        thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1170        // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1171        // the cache line
1172        ngo_load(&thr_bar->th_fixed_icvs);
1173        // This loops over all the threads skipping only the leaf nodes in the
1174        // hierarchy
1175        for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1176             child_tid += thr_bar->skip_per_level[1]) {
1177          kmp_bstate_t *child_bar =
1178              &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1179          KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1180                        "releasing T#%d(%d:%d)"
1181                        " go(%p): %u => %u\n",
1182                        gtid, team->t.t_id, tid,
1183                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1184                        child_tid, &child_bar->b_go, child_bar->b_go,
1185                        child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1186          // Use ngo store (if available) to both store ICVs and release child
1187          // via child's b_go
1188          ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1189        }
1190        ngo_sync();
1191      }
1192      TCW_8(thr_bar->b_go,
1193            KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1194      // Now, release leaf children
1195      if (thr_bar->leaf_kids) { // if there are any
1196        // We test team_change on the off-chance that the level 1 team changed.
1197        if (team_change ||
1198            old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1199          if (old_leaf_kids) { // release old leaf kids
1200            thr_bar->b_go |= old_leaf_state;
1201          }
1202          // Release new leaf kids
1203          last = tid + thr_bar->skip_per_level[1];
1204          if (last > nproc)
1205            last = nproc;
1206          for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1207               ++child_tid) { // skip_per_level[0]=1
1208            kmp_info_t *child_thr = team->t.t_threads[child_tid];
1209            kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1210            KA_TRACE(
1211                20,
1212                ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1213                 " T#%d(%d:%d) go(%p): %u => %u\n",
1214                 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1215                 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1216                 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1217            // Release child using child's b_go flag
1218            ANNOTATE_BARRIER_BEGIN(child_thr);
1219            kmp_flag_64 flag(&child_bar->b_go, child_thr);
1220            flag.release();
1221          }
1222        } else { // Release all children at once with leaf_state bits on my own
1223          // b_go flag
1224          thr_bar->b_go |= thr_bar->leaf_state;
1225        }
1226      }
1227    } else { // Blocktime is not infinite; do a simple hierarchical release
1228      for (int d = thr_bar->my_level - 1; d >= 0;
1229           --d) { // Release highest level threads first
1230        last = tid + thr_bar->skip_per_level[d + 1];
1231        kmp_uint32 skip = thr_bar->skip_per_level[d];
1232        if (last > nproc)
1233          last = nproc;
1234        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1235          kmp_info_t *child_thr = team->t.t_threads[child_tid];
1236          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1237          KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1238                        "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1239                        gtid, team->t.t_id, tid,
1240                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1241                        child_tid, &child_bar->b_go, child_bar->b_go,
1242                        child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1243          // Release child using child's b_go flag
1244          ANNOTATE_BARRIER_BEGIN(child_thr);
1245          kmp_flag_64 flag(&child_bar->b_go, child_thr);
1246          flag.release();
1247        }
1248      }
1249    }
1250#if KMP_BARRIER_ICV_PUSH
1251    if (propagate_icvs && !KMP_MASTER_TID(tid))
1252      // non-leaves copy ICVs from fixed ICVs to local dest
1253      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1254                &thr_bar->th_fixed_icvs);
1255#endif // KMP_BARRIER_ICV_PUSH
1256  }
1257  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1258                "barrier type %d\n",
1259                gtid, team->t.t_id, tid, bt));
1260}
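
// A small stand-alone sketch (guarded out with #if 0, so it is not compiled)
// of the "blocktime is not infinite" release loops above. With an assumed
// skip_per_level of {1, 4, 16} and nproc == 16, the master (tid 0,
// my_level == 2) releases tids 4, 8 and 12 at level 1 and tids 1, 2 and 3 at
// level 0, while an inner node such as tid 4 (my_level == 1) releases tids
// 5, 6 and 7. The helper below is hypothetical and only mirrors the loop
// bounds used in this file.
#if 0
#include <algorithm>
#include <cstdio>

static void example_hierarchical_release_targets(int tid, int my_level,
                                                 const int *skip_per_level,
                                                 int nproc) {
  for (int d = my_level - 1; d >= 0; --d) { // release highest level first
    int last = std::min(tid + skip_per_level[d + 1], nproc);
    int skip = skip_per_level[d];
    for (int child_tid = tid + skip; child_tid < last; child_tid += skip)
      std::printf("level %d: tid %d releases tid %d\n", d, tid, child_tid);
  }
}
#endif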
1261
1262// End of Barrier Algorithms
1263
1264// Type trait for the cancellable value:
1265// if cancellable is true, then is_cancellable wraps a normal boolean
1266// variable; if cancellable is false, it is a compile-time constant false.
1267template <bool cancellable> struct is_cancellable {};
1268template <> struct is_cancellable<true> {
1269  bool value;
1270  is_cancellable() : value(false) {}
1271  is_cancellable(bool b) : value(b) {}
1272  is_cancellable &operator=(bool b) {
1273    value = b;
1274    return *this;
1275  }
1276  operator bool() const { return value; }
1277};
1278template <> struct is_cancellable<false> {
1279  is_cancellable &operator=(bool b) { return *this; }
1280  constexpr operator bool() const { return false; }
1281};
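
// Illustrative sketch (guarded out with #if 0) of how the two specializations
// above behave: with cancellable == false the conversion operator is a
// constexpr false, so cancellation checks can be folded away at compile time,
// while cancellable == true keeps an ordinary runtime flag. The function
// names below are hypothetical and exist only for this example.
#if 0
#include <cassert>

template <bool cancellable>
static int example_barrier_status(bool was_cancelled) {
  is_cancellable<cancellable> cancelled;
  cancelled = was_cancelled; // a no-op for the <false> specialization
  if (cancelled)             // constant false when cancellable == false
    return 1;
  return 0;
}

static void example_barrier_status_check() {
  assert(example_barrier_status<false>(true) == 0); // assignment was ignored
  assert(example_barrier_status<true>(true) == 1);  // runtime flag honored
  assert(example_barrier_status<true>(false) == 0);
}
#endif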
1282
1283// Internal function to do a barrier.
1284/* If is_split is true, do a split barrier; otherwise, do a plain barrier.
1285   If reduce is non-NULL, perform the reduction as part of the barrier;
1286   otherwise, do a barrier without a reduction.
1287   When cancellable = false,
1288     returns 0 if master thread, 1 if worker thread.
1289   When cancellable = true,
1290     returns 0 if not cancelled, 1 if cancelled.  */
1291template <bool cancellable = false>
1292static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1293                                  size_t reduce_size, void *reduce_data,
1294                                  void (*reduce)(void *, void *)) {
1295  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1296  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1297  int tid = __kmp_tid_from_gtid(gtid);
1298  kmp_info_t *this_thr = __kmp_threads[gtid];
1299  kmp_team_t *team = this_thr->th.th_team;
1300  int status = 0;
1301  is_cancellable<cancellable> cancelled;
1302#if OMPT_SUPPORT && OMPT_OPTIONAL
1303  ompt_data_t *my_task_data;
1304  ompt_data_t *my_parallel_data;
1305  void *return_address;
1306  ompt_sync_region_t barrier_kind;
1307#endif
1308
1309  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1310                __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1311
1312  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1313#if OMPT_SUPPORT
1314  if (ompt_enabled.enabled) {
1315#if OMPT_OPTIONAL
1316    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1317    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1318    return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1319    barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1320    if (ompt_enabled.ompt_callback_sync_region) {
1321      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1322          barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1323          return_address);
1324    }
1325    if (ompt_enabled.ompt_callback_sync_region_wait) {
1326      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1327          barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1328          return_address);
1329    }
1330#endif
1331    // It is OK to report the barrier state after the barrier begin callback.
1332    // According to the OMPT specification, a compliant implementation may
1333    // even delay reporting this state until the barrier begins to wait.
1334    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1335  }
1336#endif
1337
1338  if (!team->t.t_serialized) {
1339#if USE_ITT_BUILD
1340    // This value will be used in itt notify events below.
1341    void *itt_sync_obj = NULL;
1342#if USE_ITT_NOTIFY
1343    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1344      itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1345#endif
1346#endif /* USE_ITT_BUILD */
1347    if (__kmp_tasking_mode == tskm_extra_barrier) {
1348      __kmp_tasking_barrier(team, this_thr, gtid);
1349      KA_TRACE(15,
1350               ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1351                __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1352    }
1353
1354    /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1355       access it when the team struct is not guaranteed to exist. */
1356    // See note about the corresponding code in __kmp_join_barrier() being
1357    // performance-critical.
1358    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1359#if KMP_USE_MONITOR
1360      this_thr->th.th_team_bt_intervals =
1361          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1362      this_thr->th.th_team_bt_set =
1363          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1364#else
1365      this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1366#endif
1367    }
1368
1369#if USE_ITT_BUILD
1370    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1371      __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1372#endif /* USE_ITT_BUILD */
1373#if USE_DEBUGGER
1374    // Let the debugger know: the thread arrived at the barrier and is waiting.
1375    if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1376      team->t.t_bar[bt].b_master_arrived += 1;
1377    } else {
1378      this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1379    } // if
1380#endif /* USE_DEBUGGER */
1381    if (reduce != NULL) {
1382      // KMP_DEBUG_ASSERT( is_split == TRUE );  // #C69956
1383      this_thr->th.th_local.reduce_data = reduce_data;
1384    }
1385
1386    if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1387      // use 0 to set up only the current team if nthreads > 1
1388      __kmp_task_team_setup(this_thr, team, 0);
1389
1390    if (cancellable) {
1391      cancelled = __kmp_linear_barrier_gather_cancellable(
1392          bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1393    } else {
1394      switch (__kmp_barrier_gather_pattern[bt]) {
1395      case bp_hyper_bar: {
1396        // don't set branch bits to 0; use linear
1397        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1398        __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1399                                   reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1400        break;
1401      }
1402      case bp_hierarchical_bar: {
1403        __kmp_hierarchical_barrier_gather(
1404            bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1405        break;
1406      }
1407      case bp_tree_bar: {
1408        // don't set branch bits to 0; use linear
1409        KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1410        __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1411                                  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1412        break;
1413      }
1414      default: {
1415        __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1416                                    reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1417      }
1418      }
1419    }
1420
1421    KMP_MB();
1422
1423    if (KMP_MASTER_TID(tid)) {
1424      status = 0;
1425      if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1426        __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1427      }
1428#if USE_DEBUGGER
1429      // Let the debugger know: all threads have arrived and are starting to
1430      // leave the barrier.
1431      team->t.t_bar[bt].b_team_arrived += 1;
1432#endif
1433
1434      if (__kmp_omp_cancellation) {
1435        kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1436        // Reset cancellation flag for worksharing constructs
1437        if (cancel_request == cancel_loop ||
1438            cancel_request == cancel_sections) {
1439          KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1440        }
1441      }
1442#if USE_ITT_BUILD
1443      /* TODO: In case of a split reduction barrier, the master thread may send
1444         the acquired event early, before the final summation into the shared
1445         variable is done (the final summation can be a long operation for
1446         array reductions).  */
1447      if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1448        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1449#endif /* USE_ITT_BUILD */
1450#if USE_ITT_BUILD && USE_ITT_NOTIFY
1451      // Barrier - report frame end (only if active_level == 1)
1452      if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1453          __kmp_forkjoin_frames_mode &&
1454          this_thr->th.th_teams_microtask == NULL &&
1455          team->t.t_active_level == 1) {
1456        ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1457        kmp_uint64 cur_time = __itt_get_timestamp();
1458        kmp_info_t **other_threads = team->t.t_threads;
1459        int nproc = this_thr->th.th_team_nproc;
1460        int i;
1461        switch (__kmp_forkjoin_frames_mode) {
1462        case 1:
1463          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1464                                 loc, nproc);
1465          this_thr->th.th_frame_time = cur_time;
1466          break;
1467        case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1468          // be fixed)
1469          __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1470                                 1, loc, nproc);
1471          break;
1472        case 3:
1473          if (__itt_metadata_add_ptr) {
1474            // Initialize with master's wait time
1475            kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1476            // Set arrive time to zero to be able to check it in
1477            // __kmp_invoke_task(); the same is done inside the loop below
1478            this_thr->th.th_bar_arrive_time = 0;
1479            for (i = 1; i < nproc; ++i) {
1480              delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1481              other_threads[i]->th.th_bar_arrive_time = 0;
1482            }
1483            __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1484                                         cur_time, delta,
1485                                         (kmp_uint64)(reduce != NULL));
1486          }
1487          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1488                                 loc, nproc);
1489          this_thr->th.th_frame_time = cur_time;
1490          break;
1491        }
1492      }
1493#endif /* USE_ITT_BUILD */
1494    } else {
1495      status = 1;
1496#if USE_ITT_BUILD
1497      if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1498        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1499#endif /* USE_ITT_BUILD */
1500    }
1501    if ((status == 1 || !is_split) && !cancelled) {
1502      if (cancellable) {
1503        cancelled = __kmp_linear_barrier_release_cancellable(
1504            bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1505      } else {
1506        switch (__kmp_barrier_release_pattern[bt]) {
1507        case bp_hyper_bar: {
1508          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1509          __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1510                                      FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1511          break;
1512        }
1513        case bp_hierarchical_bar: {
1514          __kmp_hierarchical_barrier_release(
1515              bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1516          break;
1517        }
1518        case bp_tree_bar: {
1519          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1520          __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1521                                     FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1522          break;
1523        }
1524        default: {
1525          __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1526                                       FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1527        }
1528        }
1529      }
1530      if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1531        __kmp_task_team_sync(this_thr, team);
1532      }
1533    }
1534
1535#if USE_ITT_BUILD
1536    /* GEH: TODO: Move this under if-condition above and also include in
1537       __kmp_end_split_barrier(). This will more accurately represent the actual
1538       release time of the threads for split barriers.  */
1539    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1540      __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1541#endif /* USE_ITT_BUILD */
1542  } else { // Team is serialized.
1543    status = 0;
1544    if (__kmp_tasking_mode != tskm_immediate_exec) {
1545      if (this_thr->th.th_task_team != NULL) {
1546#if USE_ITT_NOTIFY
1547        void *itt_sync_obj = NULL;
1548        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1549          itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1550          __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1551        }
1552#endif
1553
1554        KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
1555                         TRUE);
1556        __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1557        __kmp_task_team_setup(this_thr, team, 0);
1558
1559#if USE_ITT_BUILD
1560        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1561          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1562#endif /* USE_ITT_BUILD */
1563      }
1564    }
1565  }
1566  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1567                gtid, __kmp_team_from_gtid(gtid)->t.t_id,
1568                __kmp_tid_from_gtid(gtid), status));
1569
1570#if OMPT_SUPPORT
1571  if (ompt_enabled.enabled) {
1572#if OMPT_OPTIONAL
1573    if (ompt_enabled.ompt_callback_sync_region_wait) {
1574      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1575          barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1576          return_address);
1577    }
1578    if (ompt_enabled.ompt_callback_sync_region) {
1579      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1580          barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1581          return_address);
1582    }
1583#endif
1584    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1585  }
1586#endif
1587  ANNOTATE_BARRIER_END(&team->t.t_bar);
1588
1589  if (cancellable)
1590    return (int)cancelled;
1591  return status;
1592}
1593
1594// Returns 0 if master thread, 1 if worker thread.
1595int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
1596                  size_t reduce_size, void *reduce_data,
1597                  void (*reduce)(void *, void *)) {
1598  return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
1599                                  reduce);
1600}
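
// For orientation, a sketch (guarded out, not compiled) of how a plain
// OpenMP barrier reaches __kmp_barrier() above: no split, no reduction, so
// is_split is FALSE and the reduction arguments are empty. The actual
// compiler-facing entry point lives elsewhere in the runtime; the helper
// name below is hypothetical.
#if 0
static void example_plain_barrier(int gtid) {
  // Returns 0 on the master thread and 1 on workers; a plain barrier
  // ignores the return value.
  (void)__kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
}
#endif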
1601
1602#if defined(KMP_GOMP_COMPAT)
1603// Returns 1 if cancelled, 0 otherwise
1604int __kmp_barrier_gomp_cancel(int gtid) {
1605  if (__kmp_omp_cancellation) {
1606    int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
1607                                                 0, NULL, NULL);
1608    if (cancelled) {
1609      int tid = __kmp_tid_from_gtid(gtid);
1610      kmp_info_t *this_thr = __kmp_threads[gtid];
1611      if (KMP_MASTER_TID(tid)) {
1612        // Master does not need to revert anything
1613      } else {
1614        // Workers need to revert their private b_arrived flag
1615        this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
1616            KMP_BARRIER_STATE_BUMP;
1617      }
1618    }
1619    return cancelled;
1620  }
1621  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
1622  return FALSE;
1623}
1624#endif
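
// Caller-side sketch (hypothetical, guarded out) of how the return value of
// __kmp_barrier_gomp_cancel() can be used when KMP_GOMP_COMPAT is defined: a
// non-zero result means the barrier was cancelled and the worker has already
// reverted its b_arrived bump, so the caller should branch to the end of the
// cancelled region.
#if 0
static void example_gomp_cancelled_barrier(int gtid) {
  if (__kmp_barrier_gomp_cancel(gtid)) {
    // Cancellation observed: skip the remaining work of the region.
    return;
  }
  // Not cancelled: continue past the barrier as usual.
}
#endif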
1625
1626void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
1627  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1628  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1629  int tid = __kmp_tid_from_gtid(gtid);
1630  kmp_info_t *this_thr = __kmp_threads[gtid];
1631  kmp_team_t *team = this_thr->th.th_team;
1632
1633  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1634  if (!team->t.t_serialized) {
1635    if (KMP_MASTER_GTID(gtid)) {
1636      switch (__kmp_barrier_release_pattern[bt]) {
1637      case bp_hyper_bar: {
1638        KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1639        __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1640                                    FALSE USE_ITT_BUILD_ARG(NULL));
1641        break;
1642      }
1643      case bp_hierarchical_bar: {
1644        __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
1645                                           FALSE USE_ITT_BUILD_ARG(NULL));
1646        break;
1647      }
1648      case bp_tree_bar: {
1649        KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1650        __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1651                                   FALSE USE_ITT_BUILD_ARG(NULL));
1652        break;
1653      }
1654      default: {
1655        __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1656                                     FALSE USE_ITT_BUILD_ARG(NULL));
1657      }
1658      }
1659      if (__kmp_tasking_mode != tskm_immediate_exec) {
1660        __kmp_task_team_sync(this_thr, team);
1661      } // if
1662    }
1663  }
1664  ANNOTATE_BARRIER_END(&team->t.t_bar);
1665}
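
// Sketch (guarded out) of how a split barrier pairs the two functions in this
// file: with is_split == TRUE the master returns from __kmp_barrier() after
// the gather phase while the workers stay parked in the release phase, and
// __kmp_end_split_barrier() later releases them. The driver below is
// hypothetical; the real reduction entry points live elsewhere in the runtime.
#if 0
static void example_split_barrier(enum barrier_type bt, int gtid) {
  if (__kmp_barrier(bt, gtid, TRUE /* is_split */, 0, NULL, NULL) == 0) {
    // Master: every worker has arrived, but none has been released yet.
    // ... finish the serial tail of the reduction here ...
    __kmp_end_split_barrier(bt, gtid); // now release the workers
  }
  // Workers return from __kmp_barrier() only after the release above.
}
#endif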
1666
1667void __kmp_join_barrier(int gtid) {
1668  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
1669  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1670  kmp_info_t *this_thr = __kmp_threads[gtid];
1671  kmp_team_t *team;
1672  kmp_uint nproc;
1673  kmp_info_t *master_thread;
1674  int tid;
1675#ifdef KMP_DEBUG
1676  int team_id;
1677#endif /* KMP_DEBUG */
1678#if USE_ITT_BUILD
1679  void *itt_sync_obj = NULL;
1680#if USE_ITT_NOTIFY
1681  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1682    // Get object created at fork_barrier
1683    itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1684#endif
1685#endif /* USE_ITT_BUILD */
1686  KMP_MB();
1687
1688  // Get current info
1689  team = this_thr->th.th_team;
1690  nproc = this_thr->th.th_team_nproc;
1691  KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1692  tid = __kmp_tid_from_gtid(gtid);
1693#ifdef KMP_DEBUG
1694  team_id = team->t.t_id;
1695#endif /* KMP_DEBUG */
1696  master_thread = this_thr->th.th_team_master;
1697#ifdef KMP_DEBUG
1698  if (master_thread != team->t.t_threads[0]) {
1699    __kmp_print_structure();
1700  }
1701#endif /* KMP_DEBUG */
1702  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1703  KMP_MB();
1704
1705  // Verify state
1706  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1707  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1708  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1709  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1710  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
1711                gtid, team_id, tid));
1712
1713  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1714#if OMPT_SUPPORT
1715  if (ompt_enabled.enabled) {
1716#if OMPT_OPTIONAL
1717    ompt_data_t *my_task_data;
1718    ompt_data_t *my_parallel_data;
1719    void *codeptr = NULL;
1720    int ds_tid = this_thr->th.th_info.ds.ds_tid;
1721    if (KMP_MASTER_TID(ds_tid) &&
1722        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1723         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1724      codeptr = team->t.ompt_team_info.master_return_address;
1725    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1726    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1727    if (ompt_enabled.ompt_callback_sync_region) {
1728      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1729          ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1730          my_task_data, codeptr);
1731    }
1732    if (ompt_enabled.ompt_callback_sync_region_wait) {
1733      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1734          ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1735          my_task_data, codeptr);
1736    }
1737    if (!KMP_MASTER_TID(ds_tid))
1738      this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
1739#endif
1740    this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
1741  }
1742#endif
1743
1744  if (__kmp_tasking_mode == tskm_extra_barrier) {
1745    __kmp_tasking_barrier(team, this_thr, gtid);
1746    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1747                  team_id, tid));
1748  }
1749#ifdef KMP_DEBUG
1750  if (__kmp_tasking_mode != tskm_immediate_exec) {
1751    KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
1752                  "%p, th_task_team = %p\n",
1753                  __kmp_gtid_from_thread(this_thr), team_id,
1754                  team->t.t_task_team[this_thr->th.th_task_state],
1755                  this_thr->th.th_task_team));
1756    KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
1757                     team->t.t_task_team[this_thr->th.th_task_state]);
1758  }
1759#endif /* KMP_DEBUG */
1760
1761  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1762     access it when the team struct is not guaranteed to exist. Doing these
1763     loads causes a cache miss that slows down EPCC parallel by 2x. As a
1764     workaround, we do not perform the copy if blocktime=infinite, since the
1765     values are not used by __kmp_wait_template() in that case. */
1766  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1767#if KMP_USE_MONITOR
1768    this_thr->th.th_team_bt_intervals =
1769        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1770    this_thr->th.th_team_bt_set =
1771        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1772#else
1773    this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1774#endif
1775  }
1776
1777#if USE_ITT_BUILD
1778  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1779    __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1780#endif /* USE_ITT_BUILD */
1781
1782  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1783  case bp_hyper_bar: {
1784    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1785    __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1786                               NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1787    break;
1788  }
1789  case bp_hierarchical_bar: {
1790    __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1791                                      NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1792    break;
1793  }
1794  case bp_tree_bar: {
1795    KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1796    __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1797                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1798    break;
1799  }
1800  default: {
1801    __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1802                                NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1803  }
1804  }
1805
1806  /* From this point on, the team data structure may be deallocated at any time
1807     by the master thread - it is unsafe to reference it in any of the worker
1808     threads. Any per-team data items that need to be referenced before the
1809     end of the barrier should be moved to the kmp_task_team_t structs.  */
1810  if (KMP_MASTER_TID(tid)) {
1811    if (__kmp_tasking_mode != tskm_immediate_exec) {
1812      __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1813    }
1814    if (__kmp_display_affinity) {
1815      KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
1816    }
1817#if KMP_STATS_ENABLED
1818    // Have the master thread flag the workers to indicate they are now
1819    // waiting for the next parallel region. Also wake them up so they switch
1820    // their timers to idle.
1821    for (int i = 0; i < team->t.t_nproc; ++i) {
1822      kmp_info_t *team_thread = team->t.t_threads[i];
1823      if (team_thread == this_thr)
1824        continue;
1825      team_thread->th.th_stats->setIdleFlag();
1826      if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1827          team_thread->th.th_sleep_loc != NULL)
1828        __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
1829                                  team_thread->th.th_sleep_loc);
1830    }
1831#endif
1832#if USE_ITT_BUILD
1833    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1834      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1835#endif /* USE_ITT_BUILD */
1836
1837#if USE_ITT_BUILD && USE_ITT_NOTIFY
1838    // Join barrier - report frame end
1839    if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1840        __kmp_forkjoin_frames_mode && this_thr->th.th_teams_microtask == NULL &&
1841        team->t.t_active_level == 1) {
1842      kmp_uint64 cur_time = __itt_get_timestamp();
1843      ident_t *loc = team->t.t_ident;
1844      kmp_info_t **other_threads = team->t.t_threads;
1845      int nproc = this_thr->th.th_team_nproc;
1846      int i;
1847      switch (__kmp_forkjoin_frames_mode) {
1848      case 1:
1849        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1850                               loc, nproc);
1851        break;
1852      case 2:
1853        __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
1854                               loc, nproc);
1855        break;
1856      case 3:
1857        if (__itt_metadata_add_ptr) {
1858          // Initialize with master's wait time
1859          kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1860          // Set arrive time to zero to be able to check it in
1861          // __kmp_invoke_task(); the same is done inside the loop below
1862          this_thr->th.th_bar_arrive_time = 0;
1863          for (i = 1; i < nproc; ++i) {
1864            delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1865            other_threads[i]->th.th_bar_arrive_time = 0;
1866          }
1867          __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1868                                       cur_time, delta, 0);
1869        }
1870        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1871                               loc, nproc);
1872        this_thr->th.th_frame_time = cur_time;
1873        break;
1874      }
1875    }
1876#endif /* USE_ITT_BUILD */
1877  }
1878#if USE_ITT_BUILD
1879  else {
1880    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1881      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1882  }
1883#endif /* USE_ITT_BUILD */
1884
1885#if KMP_DEBUG
1886  if (KMP_MASTER_TID(tid)) {
1887    KA_TRACE(
1888        15,
1889        ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1890         gtid, team_id, tid, nproc));
1891  }
1892#endif /* KMP_DEBUG */
1893
1894  // TODO now, mark worker threads as done so they may be disbanded
1895  KMP_MB(); // Flush all pending memory write invalidates.
1896  KA_TRACE(10,
1897           ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1898
1899  ANNOTATE_BARRIER_END(&team->t.t_bar);
1900}
1901
1902// TODO release worker threads' fork barriers as we are ready instead of all at
1903// once
1904void __kmp_fork_barrier(int gtid, int tid) {
1905  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
1906  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1907  kmp_info_t *this_thr = __kmp_threads[gtid];
1908  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1909#if USE_ITT_BUILD
1910  void *itt_sync_obj = NULL;
1911#endif /* USE_ITT_BUILD */
1912  if (team)
1913    ANNOTATE_BARRIER_END(&team->t.t_bar);
1914
1915  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
1916                (team != NULL) ? team->t.t_id : -1, tid));
1917
1918  // th_team pointer only valid for master thread here
1919  if (KMP_MASTER_TID(tid)) {
1920#if USE_ITT_BUILD && USE_ITT_NOTIFY
1921    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1922      // Create itt barrier object
1923      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1924      __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1925    }
1926#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1927
1928#ifdef KMP_DEBUG
1929    kmp_info_t **other_threads = team->t.t_threads;
1930    int i;
1931
1932    // Verify state
1933    KMP_MB();
1934
1935    for (i = 1; i < team->t.t_nproc; ++i) {
1936      KA_TRACE(500,
1937               ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
1938                "== %u.\n",
1939                gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1940                team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1941                other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1942      KMP_DEBUG_ASSERT(
1943          (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
1944           ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
1945      KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1946    }
1947#endif
1948
1949    if (__kmp_tasking_mode != tskm_immediate_exec) {
1950      // 0 indicates: set up the current task team if nthreads > 1
1951      __kmp_task_team_setup(this_thr, team, 0);
1952    }
1953
1954    /* The master thread may have changed its blocktime between the join barrier
1955       and the fork barrier. Copy the blocktime info to the thread, where
1956       __kmp_wait_template() can access it when the team struct is not
1957       guaranteed to exist. */
1958    // See note about the corresponding code in __kmp_join_barrier() being
1959    // performance-critical
1960    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1961#if KMP_USE_MONITOR
1962      this_thr->th.th_team_bt_intervals =
1963          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1964      this_thr->th.th_team_bt_set =
1965          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1966#else
1967      this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1968#endif
1969    }
1970  } // master
1971
1972  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1973  case bp_hyper_bar: {
1974    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1975    __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1976                                TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1977    break;
1978  }
1979  case bp_hierarchical_bar: {
1980    __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1981                                       TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1982    break;
1983  }
1984  case bp_tree_bar: {
1985    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1986    __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1987                               TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1988    break;
1989  }
1990  default: {
1991    __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1992                                 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1993  }
1994  }
1995
1996#if OMPT_SUPPORT
1997  if (ompt_enabled.enabled &&
1998      this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
1999    int ds_tid = this_thr->th.th_info.ds.ds_tid;
2000    ompt_data_t *task_data = (team)
2001                                 ? OMPT_CUR_TASK_DATA(this_thr)
2002                                 : &(this_thr->th.ompt_thread_info.task_data);
2003    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
2004#if OMPT_OPTIONAL
2005    void *codeptr = NULL;
2006    if (KMP_MASTER_TID(ds_tid) &&
2007        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2008         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2009      codeptr = team->t.ompt_team_info.master_return_address;
2010    if (ompt_enabled.ompt_callback_sync_region_wait) {
2011      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2012          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2013          codeptr);
2014    }
2015    if (ompt_enabled.ompt_callback_sync_region) {
2016      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2017          ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2018          codeptr);
2019    }
2020#endif
2021    if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2022      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2023          ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2024    }
2025  }
2026#endif
2027
2028  // Early exit for reaping threads releasing forkjoin barrier
2029  if (TCR_4(__kmp_global.g.g_done)) {
2030    this_thr->th.th_task_team = NULL;
2031
2032#if USE_ITT_BUILD && USE_ITT_NOTIFY
2033    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2034      if (!KMP_MASTER_TID(tid)) {
2035        itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2036        if (itt_sync_obj)
2037          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2038      }
2039    }
2040#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2041    KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2042    return;
2043  }
2044
2045  /* We can now assume that a valid team structure has been allocated by the
2046     master and propagated to all worker threads. The current thread, however,
2047     may not be part of the team, so we can't blindly assume that the team
2048     pointer is non-null.  */
2049  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2050  KMP_DEBUG_ASSERT(team != NULL);
2051  tid = __kmp_tid_from_gtid(gtid);
2052
2053#if KMP_BARRIER_ICV_PULL
2054  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2055     __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2056     implicit task has this data before this function is called. We cannot
2057     modify __kmp_fork_call() to look at the fixed ICVs in the master's thread
2058     struct, because it is not always the case that the threads arrays have
2059     been allocated when __kmp_fork_call() is executed. */
2060  {
2061    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2062    if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
2063      // Copy the initial ICVs from the master's thread struct to the implicit
2064      // task for this tid.
2065      KA_TRACE(10,
2066               ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2067      __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2068                               tid, FALSE);
2069      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2070                &team->t.t_threads[0]
2071                     ->th.th_bar[bs_forkjoin_barrier]
2072                     .bb.th_fixed_icvs);
2073    }
2074  }
2075#endif // KMP_BARRIER_ICV_PULL
2076
2077  if (__kmp_tasking_mode != tskm_immediate_exec) {
2078    __kmp_task_team_sync(this_thr, team);
2079  }
2080
2081#if KMP_AFFINITY_SUPPORTED
2082  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2083  if (proc_bind == proc_bind_intel) {
2084    // Call dynamic affinity settings
2085    if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
2086      __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2087    }
2088  } else if (proc_bind != proc_bind_false) {
2089    if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2090      KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
2091                     __kmp_gtid_from_thread(this_thr),
2092                     this_thr->th.th_current_place));
2093    } else {
2094      __kmp_affinity_set_place(gtid);
2095    }
2096  }
2097#endif // KMP_AFFINITY_SUPPORTED
2098  // Perform the display affinity functionality
2099  if (__kmp_display_affinity) {
2100    if (team->t.t_display_affinity
2101#if KMP_AFFINITY_SUPPORTED
2102        || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
2103#endif
2104            ) {
2105      // NULL means use the affinity-format-var ICV
2106      __kmp_aux_display_affinity(gtid, NULL);
2107      this_thr->th.th_prev_num_threads = team->t.t_nproc;
2108      this_thr->th.th_prev_level = team->t.t_level;
2109    }
2110  }
2111  if (!KMP_MASTER_TID(tid))
2112    KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2113
2114#if USE_ITT_BUILD && USE_ITT_NOTIFY
2115  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2116    if (!KMP_MASTER_TID(tid)) {
2117      // Get correct barrier object
2118      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2119      __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2120    } // (prepare called inside barrier_release)
2121  }
2122#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2123  ANNOTATE_BARRIER_END(&team->t.t_bar);
2124  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2125                team->t.t_id, tid));
2126}
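
// Lifecycle sketch (hypothetical, guarded out): the worker loop in the
// runtime alternates between the fork barrier (wait to be released into a
// parallel region) and the join barrier (signal completion), which is why
// __kmp_fork_barrier() above checks __kmp_global.g.g_done for an early exit.
// The real loop lives in the thread-launch code, not here.
#if 0
static void example_worker_lifecycle(int gtid, int tid) {
  for (;;) {
    __kmp_fork_barrier(gtid, tid); // wait for the next parallel region
    if (TCR_4(__kmp_global.g.g_done))
      break; // the runtime is shutting down
    // ... invoke the microtask for this parallel region ...
    __kmp_join_barrier(gtid); // signal arrival at the end of the region
  }
}
#endif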
2127
2128void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2129                          kmp_internal_control_t *new_icvs, ident_t *loc) {
2130  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2131
2132  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2133  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2134
2135/* Master thread's copy of the ICVs was set up on the implicit taskdata in
2136   __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2137   implicit task has this data before this function is called. */
2138#if KMP_BARRIER_ICV_PULL
2139  /* Copy the ICVs into th_fixed_icvs in the master thread's structure (which
2140     remains untouched), where all of the worker threads can access them and
2141     make their own copies after the barrier. */
2142  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2143  // allocated at this point
2144  copy_icvs(
2145      &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2146      new_icvs);
2147  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2148                team->t.t_threads[0], team));
2149#elif KMP_BARRIER_ICV_PUSH
2150  // The ICVs will be propagated in the fork barrier, so nothing needs to be
2151  // done here.
2152  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2153                team->t.t_threads[0], team));
2154#else
2155  // Copy the ICVs to each of the non-master threads.  This takes O(nthreads)
2156  // time.
2157  ngo_load(new_icvs);
2158  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2159  // allocated at this point
2160  for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
2161    // TODO: GEH - pass in better source location info since usually NULL here
2162    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2163                  f, team->t.t_threads[f], team));
2164    __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2165    ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2166    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2167                  f, team->t.t_threads[f], team));
2168  }
2169  ngo_sync();
2170#endif // KMP_BARRIER_ICV_PULL
2171}
2172