/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

/* Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
 *       it may change values between parallel regions.  __kmp_max_nth
 *       is the largest value __kmp_nth may take, 1 is the smallest.
 */

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_USE_X87CONTROL
#include <float.h>
#endif
#include "kmp_lock.h"
#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(gtid_ref);

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_root->r.r_active &&
        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }
}

void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
    }
  }
}

// Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
                                         bool use_hier = false) {
  // Pick up the nonmonotonic/monotonic bits from the scheduling type
  // Nonmonotonic as default for dynamic schedule when no modifier is specified
  int monotonicity = SCHEDULE_NONMONOTONIC;
  // Let the default be monotonic for executables compiled by
  // compilers supporting OpenMP 4.5 or earlier
  if (loc != NULL && loc->get_openmp_version() < 50)
    monotonicity = SCHEDULE_MONOTONIC;

  if (use_hier || __kmp_force_monotonic)
    monotonicity = SCHEDULE_MONOTONIC;
  else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
    monotonicity = SCHEDULE_NONMONOTONIC;
  else if (SCHEDULE_HAS_MONOTONIC(schedule))
    monotonicity = SCHEDULE_MONOTONIC;

  return monotonicity;
}
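// Illustrative summary of the logic above: schedule(monotonic:dynamic) yields
// SCHEDULE_MONOTONIC regardless of OpenMP version, while a plain
// schedule(dynamic) defaults to SCHEDULE_NONMONOTONIC for OpenMP 5.0+ code
// and to SCHEDULE_MONOTONIC for code compiled against 4.5 or earlier.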

#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
// Return a floating point number rounded to two decimal places
static inline float __kmp_round_2decimal_val(float num) {
  return (float)(static_cast<int>(num * 100 + 0.5)) / 100;
}
static inline int __kmp_get_round_val(float num) {
  return static_cast<int>(num < 0 ? num - 0.5 : num + 0.5);
}
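// E.g., __kmp_round_2decimal_val(3.14159f) == 3.14f, and
// __kmp_get_round_val(2.5f) == 3 while __kmp_get_round_val(-2.5f) == -3
// (round-half-away-from-zero).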
#endif

template <typename T>
inline void
__kmp_initialize_self_buffer(kmp_team_t *team, T id,
                             dispatch_private_info_template<T> *pr,
                             typename traits_t<T>::unsigned_t nchunks, T nproc,
                             typename traits_t<T>::unsigned_t &init,
                             T &small_chunk, T &extras, T &p_extra) {

#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
  if (pr->flags.use_hybrid) {
    kmp_info_t *th = __kmp_threads[__kmp_gtid_from_tid((int)id, team)];
    kmp_hw_core_type_t type =
        (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
    T pchunks = pr->u.p.pchunks;
    T echunks = nchunks - pchunks;
    T num_procs_with_pcore = pr->u.p.num_procs_with_pcore;
    T num_procs_with_ecore = nproc - num_procs_with_pcore;
    T first_thread_with_ecore = pr->u.p.first_thread_with_ecore;
    T big_chunk =
        pchunks / num_procs_with_pcore; // chunks per thread with p-core
    small_chunk =
        echunks / num_procs_with_ecore; // chunks per thread with e-core

    extras =
        (pchunks % num_procs_with_pcore) + (echunks % num_procs_with_ecore);

    p_extra = (big_chunk - small_chunk);

    if (type == KMP_HW_CORE_TYPE_CORE) {
      if (id < first_thread_with_ecore) {
        init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
      } else {
        init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
               (id < extras ? id : extras);
      }
    } else {
      if (id == first_thread_with_ecore) {
        init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
      } else {
        init = id * small_chunk + first_thread_with_ecore * p_extra +
               (id < extras ? id : extras);
      }
    }
    p_extra = (type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;
    return;
  }
#endif

  small_chunk = nchunks / nproc; // chunks per thread
  extras = nchunks % nproc;
  p_extra = 0;
  init = id * small_chunk + (id < extras ? id : extras);
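  // Worked example (illustrative): nchunks=10, nproc=4 gives small_chunk=2,
  // extras=2, so threads 0..3 start at chunk 0, 3, 6, 8 respectively; the
  // first two threads own one extra chunk each (3+3+2+2 == 10).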
}

#if KMP_STATIC_STEAL_ENABLED
enum { // values for steal_flag (possible states of private per-loop buffer)
  UNUSED = 0,
  CLAIMED = 1, // owner thread started initialization
  READY = 2, // available for stealing
  THIEF = 3 // finished by owner, or claimed by thief
  // possible state changes:
  // 0 -> 1 owner only, sync
  // 0 -> 3 thief only, sync
  // 1 -> 2 owner only, async
  // 2 -> 3 owner only, async
  // 3 -> 2 owner only, async
  // 3 -> 0 last thread finishing the loop, async
};
#endif

// Initialize a dispatch_private_info_template<T> buffer for a particular
// type of schedule and chunk.  The loop description is found in lb (lower
// bound), ub (upper bound), and st (stride).  nproc is the number of threads
// relevant to the scheduling (often the number of threads in a team, but not
// always if hierarchical scheduling is used).  tid is the id of the thread
// calling the function within the group of nproc threads.  It will have a
// value between 0 and nproc - 1.  This is often just the thread id within a
// team, but is not necessarily the case when using hierarchical scheduling.
// loc is the source file location of the corresponding loop
// gtid is the global thread id
template <typename T>
void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                   dispatch_private_info_template<T> *pr,
                                   enum sched_type schedule, T lb, T ub,
                                   typename traits_t<T>::signed_t st,
#if USE_ITT_BUILD
                                   kmp_uint64 *cur_chunk,
#endif
                                   typename traits_t<T>::signed_t chunk,
                                   T nproc, T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::floating_t DBL;

  int active;
  T tc;
  kmp_info_t *th;
  kmp_team_t *team;
  int monotonicity;
  bool use_hier;

#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
                            "pr:%%p lb:%%%s ub:%%%s st:%%%s "
                            "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<ST>::spec,
                            traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;

#if USE_ITT_BUILD
  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;
#endif

#if KMP_USE_HIER_SCHED
  use_hier = pr->flags.use_hier;
#else
  use_hier = false;
#endif

  /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
  monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

  /* Pick up the nomerge/ordered bits from the scheduling type */
  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
    pr->flags.nomerge = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
  } else {
    pr->flags.nomerge = FALSE;
  }
  pr->type_size = traits_t<T>::type_size; // remember the size of variables
  if (kmp_ord_lower & schedule) {
    pr->flags.ordered = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
  } else {
    pr->flags.ordered = FALSE;
  }
  // Ordered overrides nonmonotonic
  if (pr->flags.ordered) {
    monotonicity = SCHEDULE_MONOTONIC;
  }

  if (schedule == kmp_sch_static) {
    schedule = __kmp_static;
  } else {
    if (schedule == kmp_sch_runtime) {
      // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
      // not specified)
      schedule = team->t.t_sched.r_sched_type;
      monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
      if (pr->flags.ordered) // correct monotonicity for ordered loop if needed
        monotonicity = SCHEDULE_MONOTONIC;
      // Detail the schedule if needed (global controls are differentiated
      // appropriately)
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      } else if (schedule == kmp_sch_static) {
        schedule = __kmp_static;
      }
      // Use the chunk size specified by OMP_SCHEDULE (or default if not
      // specified)
      chunk = team->t.t_sched.chunk;
#if USE_ITT_BUILD
      if (cur_chunk)
        *cur_chunk = chunk;
#endif
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
                                "schedule:%%d chunk:%%%s\n",
                                traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    } else {
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      }
      if (chunk <= 0) {
        chunk = KMP_DEFAULT_CHUNK;
      }
    }

    if (schedule == kmp_sch_auto) {
      // mapping and differentiation: in the __kmp_do_serial_initialize()
      schedule = __kmp_auto;
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
            "schedule:%%d chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
#if KMP_STATIC_STEAL_ENABLED
    // map nonmonotonic:dynamic to static steal
    if (schedule == kmp_sch_dynamic_chunked) {
      if (monotonicity == SCHEDULE_NONMONOTONIC)
        schedule = kmp_sch_static_steal;
    }
#endif
    /* guided analytical not safe for too many threads */
    if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
      schedule = kmp_sch_guided_iterative_chunked;
      KMP_WARNING(DispatchManyThreads);
    }
    if (schedule == kmp_sch_runtime_simd) {
      // compiler provides simd_width in the chunk parameter
      schedule = team->t.t_sched.r_sched_type;
      monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
      // Detail the schedule if needed (global controls are differentiated
      // appropriately)
      if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
          schedule == __kmp_static) {
        schedule = kmp_sch_static_balanced_chunked;
      } else {
        if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
          schedule = kmp_sch_guided_simd;
        }
        chunk = team->t.t_sched.chunk * chunk;
      }
#if USE_ITT_BUILD
      if (cur_chunk)
        *cur_chunk = chunk;
#endif
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
            " chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
    pr->u.p.parm1 = chunk;
  }
  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
              "unknown scheduling type");

  pr->u.p.count = 0;

  if (__kmp_env_consistency_check) {
    if (st == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
                            (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
    }
  }
  // compute trip count
  if (st == 1) { // most common case
    if (ub >= lb) {
      tc = ub - lb + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  } else if (st < 0) {
    if (lb >= ub) {
      // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(lb - ub) / (-st) + 1;
    } else { // lb < ub
      tc = 0; // zero-trip
    }
  } else { // st > 0
    if (ub >= lb) {
      // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(ub - lb) / st + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  }
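  // E.g., lb=0, ub=9, st=3 executes iterations 0,3,6,9: tc = (9-0)/3 + 1 = 4.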

#if KMP_STATS_ENABLED
  if (KMP_MASTER_GTID(gtid)) {
    KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
  }
#endif

  pr->u.p.lb = lb;
  pr->u.p.ub = ub;
  pr->u.p.st = st;
  pr->u.p.tc = tc;

#if KMP_OS_WINDOWS
  pr->u.p.last_upper = ub + st;
#endif /* KMP_OS_WINDOWS */

  /* NOTE: only the active parallel region(s) have active ordered sections */

  if (active) {
    if (pr->flags.ordered) {
      pr->ordered_bumped = 0;
      pr->u.p.ordered_lower = 1;
      pr->u.p.ordered_upper = 0;
    }
  }

  switch (schedule) {
#if KMP_STATIC_STEAL_ENABLED
  case kmp_sch_static_steal: {
    T ntc, init = 0;

    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    ntc = (tc % chunk ? 1 : 0) + tc / chunk;
    if (nproc > 1 && ntc >= nproc) {
      KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
      T id = tid;
      T small_chunk, extras, p_extra = 0;
      kmp_uint32 old = UNUSED;
      int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);
      if (traits_t<T>::type_size > 4) {
        // AC: TODO: check if 16-byte CAS available and use it to
        // improve performance (probably wait for explicit request
        // before spending time on this).
        // For now use dynamically allocated per-private-buffer lock,
        // free memory in __kmp_dispatch_next when status==0.
        pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
        __kmp_init_lock(pr->u.p.steal_lock);
      }

#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
      // Iterations are divided in a 60/40 skewed distribution among CORE and
      // ATOM processors for hybrid systems
      bool use_hybrid = false;
      kmp_hw_core_type_t core_type = KMP_HW_CORE_TYPE_UNKNOWN;
      T first_thread_with_ecore = 0;
      T num_procs_with_pcore = 0;
      T num_procs_with_ecore = 0;
      T p_ntc = 0, e_ntc = 0;
      if (__kmp_is_hybrid_cpu() && __kmp_affinity.type != affinity_none &&
          __kmp_affinity.type != affinity_explicit) {
        use_hybrid = true;
        core_type = (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
        if (core_type != KMP_HW_CORE_TYPE_UNKNOWN &&
            __kmp_first_osid_with_ecore > -1) {
          for (int i = 0; i < team->t.t_nproc; ++i) {
            kmp_hw_core_type_t type = (kmp_hw_core_type_t)team->t.t_threads[i]
                                          ->th.th_topology_attrs.core_type;
            int id = team->t.t_threads[i]->th.th_topology_ids.os_id;
            if (id == __kmp_first_osid_with_ecore) {
              first_thread_with_ecore =
                  team->t.t_threads[i]->th.th_info.ds.ds_tid;
            }
            if (type == KMP_HW_CORE_TYPE_CORE) {
              num_procs_with_pcore++;
            } else if (type == KMP_HW_CORE_TYPE_ATOM) {
              num_procs_with_ecore++;
            } else {
              use_hybrid = false;
              break;
            }
          }
        }
        if (num_procs_with_pcore > 0 && num_procs_with_ecore > 0) {
          float multiplier = 60.0 / 40.0;
          float p_ratio = (float)num_procs_with_pcore / nproc;
          float e_ratio = (float)num_procs_with_ecore / nproc;
          float e_multiplier =
              (float)1 /
              (((multiplier * num_procs_with_pcore) / nproc) + e_ratio);
          float p_multiplier = multiplier * e_multiplier;
          p_ntc = __kmp_get_round_val(ntc * p_ratio * p_multiplier);
          if ((int)p_ntc > (int)(ntc * p_ratio * p_multiplier))
            e_ntc =
                (int)(__kmp_round_2decimal_val(ntc * e_ratio * e_multiplier));
          else
            e_ntc = __kmp_get_round_val(ntc * e_ratio * e_multiplier);
          KMP_DEBUG_ASSERT(ntc == p_ntc + e_ntc);
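          // Worked example (illustrative): ntc=100 chunks on 4 p-cores and
          // 4 e-cores: p_ratio=e_ratio=0.5, e_multiplier=1/(1.5*4/8+0.5)=0.8,
          // p_multiplier=1.2, so p_ntc=round(100*0.5*1.2)=60 and
          // e_ntc=round(100*0.5*0.8)=40 -- the intended 60/40 skew.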

          // Use regular static steal if not enough chunks for skewed
          // distribution
          use_hybrid = (use_hybrid && (p_ntc >= num_procs_with_pcore &&
                                       e_ntc >= num_procs_with_ecore)
                            ? true
                            : false);
        } else {
          use_hybrid = false;
        }
      }
      pr->flags.use_hybrid = use_hybrid;
      pr->u.p.pchunks = p_ntc;
      pr->u.p.num_procs_with_pcore = num_procs_with_pcore;
      pr->u.p.first_thread_with_ecore = first_thread_with_ecore;

      if (use_hybrid) {
        KMP_DEBUG_ASSERT(nproc == num_procs_with_pcore + num_procs_with_ecore);
        T big_chunk = p_ntc / num_procs_with_pcore;
        small_chunk = e_ntc / num_procs_with_ecore;

        extras =
            (p_ntc % num_procs_with_pcore) + (e_ntc % num_procs_with_ecore);

        p_extra = (big_chunk - small_chunk);

        if (core_type == KMP_HW_CORE_TYPE_CORE) {
          if (id < first_thread_with_ecore) {
            init =
                id * small_chunk + id * p_extra + (id < extras ? id : extras);
          } else {
            init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
                   (id < extras ? id : extras);
          }
        } else {
          if (id == first_thread_with_ecore) {
            init =
                id * small_chunk + id * p_extra + (id < extras ? id : extras);
          } else {
            init = id * small_chunk + first_thread_with_ecore * p_extra +
                   (id < extras ? id : extras);
          }
        }
        p_extra = (core_type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;
      } else
#endif
      {
        small_chunk = ntc / nproc;
        extras = ntc % nproc;
        init = id * small_chunk + (id < extras ? id : extras);
        p_extra = 0;
      }
      pr->u.p.count = init;
      if (claimed) { // did we succeed in claiming our own buffer?
        pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
        // Other threads will inspect steal_flag when searching for a victim.
        // READY means other threads may steal from this thread from now on.
        KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
      } else {
        // a thief has already stolen our whole range
        KMP_DEBUG_ASSERT(pr->steal_flag == THIEF);
        pr->u.p.ub = init; // mark that there are no iterations to work on
      }
      pr->u.p.parm2 = ntc; // save number of chunks
      // parm3 is the number of times to attempt stealing, which is
      // nproc (just a heuristic; could be optimized later on).
      pr->u.p.parm3 = nproc;
      pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
      break;
    } else {
      /* too few chunks: switching to kmp_sch_dynamic_chunked */
      schedule = kmp_sch_dynamic_chunked;
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
                     "kmp_sch_dynamic_chunked\n",
                     gtid));
      goto dynamic_init;
      break;
    } // if
  } // case
#endif
  case kmp_sch_static_balanced: {
    T init, limit;

    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));

    if (nproc > 1) {
      T id = tid;

      if (tc < nproc) {
        if (id < tc) {
          init = id;
          limit = id;
          pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
        } else {
          pr->u.p.count = 1; /* means no more chunks to execute */
          pr->u.p.parm1 = FALSE;
          break;
        }
      } else {
        T small_chunk = tc / nproc;
        T extras = tc % nproc;
        init = id * small_chunk + (id < extras ? id : extras);
        limit = init + small_chunk - (id < extras ? 0 : 1);
        pr->u.p.parm1 = (id == nproc - 1);
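        // E.g., tc=10, nproc=4: small_chunk=2, extras=2, so thread 1 covers
        // iterations 3..5 (init=3, limit=5) and thread 3 covers 8..9.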
      }
    } else {
      if (tc > 0) {
        init = 0;
        limit = tc - 1;
        pr->u.p.parm1 = TRUE;
      } else {
        // zero trip count
        pr->u.p.count = 1; /* means no more chunks to execute */
        pr->u.p.parm1 = FALSE;
        break;
      }
    }
#if USE_ITT_BUILD
    // Calculate chunk for metadata report
    if (itt_need_metadata_reporting)
      if (cur_chunk)
        *cur_chunk = limit - init + 1;
#endif
    if (st == 1) {
      pr->u.p.lb = lb + init;
      pr->u.p.ub = lb + limit;
    } else {
      // calculated upper bound, "ub" is user-defined upper bound
      T ub_tmp = lb + limit * st;
      pr->u.p.lb = lb + init * st;
      // adjust upper bound to "ub" if needed, so that MS lastprivate will match
      // it exactly
      if (st > 0) {
        pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
      } else {
        pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
      }
    }
    if (pr->flags.ordered) {
      pr->u.p.ordered_lower = init;
      pr->u.p.ordered_upper = limit;
    }
    break;
  } // case
  case kmp_sch_static_balanced_chunked: {
    // similar to balanced, but chunk adjusted to multiple of simd width
    T nth = nproc;
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
                   " -> falling-through to static_greedy\n",
                   gtid));
    schedule = kmp_sch_static_greedy;
    if (nth > 1)
      pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
    else
      pr->u.p.parm1 = tc;
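    // The mask trick rounds the per-thread share up to a multiple of chunk
    // (the simd width, a power of two); e.g., tc=100, nth=8, chunk=8:
    // ceil(100/8)=13, then (13+7) & ~7 = 16 iterations per thread.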
    break;
  } // case
  case kmp_sch_guided_simd:
  case kmp_sch_guided_iterative_chunked: {
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
         " case\n",
         gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
        goto dynamic_init;
      } else {
        // when remaining iters become less than parm2 - switch to dynamic
        pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
        *(double *)&pr->u.p.parm3 =
            guided_flt_param / (double)nproc; // may occupy parm3 and parm4
      }
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      KD_TRACE(
          100,
          ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
           gtid));
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
  case kmp_sch_guided_analytical_chunked: {
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
        goto dynamic_init;
      } else {
        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        DBL x;

#if KMP_USE_X87CONTROL
        /* Linux* OS already has 64-bit computation by default for long double,
           and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
           Windows* OS on IA-32 architecture, we need to set precision to 64-bit
           instead of the default 53-bit. Even though long double doesn't work
           on Windows* OS on Intel(R) 64, the resulting lack of precision is not
           expected to impact the correctness of the algorithm, but this has not
           been mathematically proven. */
        // save original FPCW and set precision to 64-bit, as
        // Windows* OS on IA-32 architecture defaults to 53-bit
        unsigned int oldFpcw = _control87(0, 0);
        _control87(_PC_64, _MCW_PC); // 0,0x30000
#endif
        /* value used for comparison in solver for cross-over point */
        KMP_ASSERT(tc > 0);
        long double target = ((long double)chunk * 2 + 1) * nproc / tc;

        /* crossover point--chunk indexes equal to or greater than
           this point switch to dynamic-style scheduling */
        UT cross;

        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        x = 1.0 - 0.5 / (double)nproc;

#ifdef KMP_DEBUG
        { // test natural alignment
          struct _test_a {
            char a;
            union {
              char b;
              DBL d;
            };
          } t;
          ptrdiff_t natural_alignment =
              (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
          //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
          // long)natural_alignment );
          KMP_DEBUG_ASSERT(
              (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
        }
#endif // KMP_DEBUG

        /* save the term in thread private dispatch structure */
        *(DBL *)&pr->u.p.parm3 = x;

        /* solve for the crossover point to the nearest integer i for which C_i
           <= chunk */
        {
          UT left, right, mid;
          long double p;

          /* estimate initial upper and lower bound */

          /* doesn't matter what value right is as long as it is positive, but
             it affects performance of the solver */
          right = 229;
          p = __kmp_pow<UT>(x, right);
          if (p > target) {
            do {
              p *= p;
              right <<= 1;
            } while (p > target && right < (1 << 27));
            /* lower bound is previous (failed) estimate of upper bound */
            left = right >> 1;
          } else {
            left = 0;
          }

          /* bisection root-finding method */
          while (left + 1 < right) {
            mid = (left + right) / 2;
            if (__kmp_pow<UT>(x, mid) > target) {
              left = mid;
            } else {
              right = mid;
            }
          } // while
          cross = right;
        }
        /* assert sanity of computed crossover point */
        KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
                   __kmp_pow<UT>(x, cross) <= target);
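        // Worked example (illustrative): nproc=8, chunk=4, tc=1000 gives
        // x = 1 - 0.5/8 = 0.9375 and target = 9*8/1000 = 0.072; the solver
        // finds cross = 41, since 0.9375^40 ~= 0.0757 > 0.072 while
        // 0.9375^41 ~= 0.0709 <= 0.072.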

        /* save the crossover point in thread private dispatch structure */
        pr->u.p.parm2 = cross;

// C75803
#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
#else
#define GUIDED_ANALYTICAL_WORKAROUND (x)
#endif
        /* dynamic-style scheduling offset */
        pr->u.p.count = tc -
                        __kmp_dispatch_guided_remaining(
                            tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
                        cross * chunk;
#if KMP_USE_X87CONTROL
        // restore FPCW
        _control87(oldFpcw, _MCW_PC);
#endif
      } // if
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
  case kmp_sch_static_greedy:
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
         gtid));
    pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
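    // parm1 is the greedy per-thread chunk, e.g., ceil(10/4) = 3 for tc=10,
    // nproc=4 (the last thread then gets the single remaining iteration).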
    break;
  case kmp_sch_static_chunked:
  case kmp_sch_dynamic_chunked:
  dynamic_init:
    if (tc == 0)
      break;
    if (pr->u.p.parm1 <= 0)
      pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
    else if (pr->u.p.parm1 > tc)
      pr->u.p.parm1 = tc;
    // Store the total number of chunks to prevent integer overflow during
    // bounds calculations in the get next chunk routine.
    pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? 1 : 0);
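    // e.g., tc=10, chunk=3 -> parm2 = 3 + 1 = 4 chunks (sizes 3, 3, 3, 1)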
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
                   gtid));
    break;
  case kmp_sch_trapezoidal: {
    /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */

    T parm1, parm2, parm3, parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    parm1 = chunk;

    /* F : size of the first cycle */
    parm2 = (tc / (2 * nproc));

    if (parm2 < 1) {
      parm2 = 1;
    }

    /* L : size of the last cycle.  Make sure the last cycle is not larger
       than the first cycle. */
    if (parm1 < 1) {
      parm1 = 1;
    } else if (parm1 > parm2) {
      parm1 = parm2;
    }

    /* N : number of cycles */
    parm3 = (parm2 + parm1);
    parm3 = (2 * tc + parm3 - 1) / parm3;

    if (parm3 < 2) {
      parm3 = 2;
    }

    /* sigma : decreasing incr of the trapezoid */
    parm4 = (parm3 - 1);
    parm4 = (parm2 - parm1) / parm4;

    // pointless check, because parm4 >= 0 always
    // if ( parm4 < 0 ) {
    //    parm4 = 0;
    //}

    pr->u.p.parm1 = parm1;
    pr->u.p.parm2 = parm2;
    pr->u.p.parm3 = parm3;
    pr->u.p.parm4 = parm4;
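    // Worked example (illustrative): tc=1000, nproc=4, chunk=10 gives
    // F=parm2=1000/8=125, L=parm1=10, N=parm3=(2000+134)/135=15 cycles, and
    // sigma=parm4=(125-10)/14=8, so chunk sizes decrease 125, 117, 109, ...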
  } // case
  break;

  default: {
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null // Variadic argument list terminator
    );
  } break;
  } // switch
  pr->schedule = schedule;
}

#if KMP_USE_HIER_SCHED
template <typename T>
inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
                                             typename traits_t<T>::signed_t st);
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
                                            kmp_int32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_int32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
                                             kmp_uint32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
                                            kmp_int64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_int64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
                                             kmp_uint64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}

// free all the hierarchy scheduling memory associated with the team
void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  for (int i = 0; i < num_disp_buff; ++i) {
    // type does not matter here so use kmp_int32
    auto sh =
        reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
            &team->t.t_disp_buffer[i]);
    if (sh->hier) {
      sh->hier->deallocate();
      __kmp_free(sh->hier);
    }
  }
}
#endif

// UT - unsigned flavor of T, ST - signed flavor of T,
// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
template <typename T>
static void
__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
                    T ub, typename traits_t<T>::signed_t st,
                    typename traits_t<T>::signed_t chunk, int push_ws) {
  typedef typename traits_t<T>::unsigned_t UT;

  int active;
  kmp_info_t *th;
  kmp_team_t *team;
  kmp_uint32 my_buffer_index;
  dispatch_private_info_template<T> *pr;
  dispatch_shared_info_template<T> volatile *sh;

  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
                   sizeof(dispatch_private_info));
  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
                   sizeof(dispatch_shared_info));
  __kmp_assert_valid_gtid(gtid);

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_INIT();
#endif
#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
                            "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
                            traits_t<ST>::spec, traits_t<T>::spec,
                            traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;

  // Any half-decent optimizer will remove this test when the blocks are empty
  // since the macros expand to nothing when statistics are disabled.
  if (schedule == __kmp_static) {
    KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
  } else {
    KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
  }

#if KMP_USE_HIER_SCHED
  // Initialize the scheduling hierarchy if requested in the OMP_SCHEDULE
  // environment variable. Hierarchical scheduling does not work with ordered,
  // so if ordered is detected, then revert back to threaded scheduling.
  bool ordered;
  enum sched_type my_sched = schedule;
  my_buffer_index = th->th.th_dispatch->th_disp_index;
  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
      &th->th.th_dispatch
           ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
  if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
    my_sched =
        (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
  ordered = (kmp_ord_lower & my_sched);
  if (pr->flags.use_hier) {
    if (ordered) {
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected.  "
                     "Disabling hierarchical scheduling.\n",
                     gtid));
      pr->flags.use_hier = FALSE;
    }
  }
  if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
    // Don't use hierarchical for ordered parallel loops and don't
    // use the runtime hierarchy if one was specified in the program
    if (!ordered && !pr->flags.use_hier)
      __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
  }
#endif // KMP_USE_HIER_SCHED

#if USE_ITT_BUILD
  kmp_uint64 cur_chunk = chunk;
  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;
#endif
  if (!active) {
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
  } else {
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    my_buffer_index = th->th.th_dispatch->th_disp_index++;

    /* What happens when number of threads changes, need to resize buffer? */
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        &th->th.th_dispatch
             ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
                  my_buffer_index));
    if (sh->buffer_index != my_buffer_index) { // too many loops in progress?
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d"
                     " sh->buffer_index:%d\n",
                     gtid, my_buffer_index, sh->buffer_index));
      __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
                             __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
      // Note: KMP_WAIT() cannot be used here: buffer index and
      // my_buffer_index are *always* 32-bit integers.
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
                     "sh->buffer_index:%d\n",
                     gtid, my_buffer_index, sh->buffer_index));
    }
1083227068Sambrisko  }
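  // The modulo indexing above rotates through __kmp_dispatch_num_buffers
  // (typically 7) per-loop buffers, so back-to-back nowait loops do not reuse
  // a buffer that slower threads are still draining. A rough sketch of the
  // idea, with hypothetical loop numbers:
  //
  //   loop 0 -> buffer 0, loop 1 -> buffer 1, ..., loop 7 -> buffer 0 again,
  //   but only after sh->buffer_index has caught up, i.e. after every thread
  //   in the team has finished loop 0 and released buffer 0.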

  __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
#if USE_ITT_BUILD
                                &cur_chunk,
#endif
                                chunk, (T)th->th.th_team_nproc,
                                (T)th->th.th_info.ds.ds_tid);
  if (active) {
    if (pr->flags.ordered == 0) {
      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
    } else {
      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
    }
    th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
    th->th.th_dispatch->th_dispatch_sh_current =
        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
#if USE_ITT_BUILD
    if (pr->flags.ordered) {
      __kmp_itt_ordered_init(gtid);
    }
    // Report loop metadata
    if (itt_need_metadata_reporting) {
      // Only the primary thread of an active team at level 1 reports metadata
      kmp_uint64 schedtype = 0;
      switch (schedule) {
      case kmp_sch_static_chunked:
      case kmp_sch_static_balanced: // Chunk is calculated in the switch above
        break;
      case kmp_sch_static_greedy:
        cur_chunk = pr->u.p.parm1;
        break;
      case kmp_sch_dynamic_chunked:
        schedtype = 1;
        break;
      case kmp_sch_guided_iterative_chunked:
      case kmp_sch_guided_analytical_chunked:
      case kmp_sch_guided_simd:
        schedtype = 2;
        break;
      default:
        // Should we put this case under "static"?
        // case kmp_sch_static_steal:
        schedtype = 3;
        break;
      }
      __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
    }
#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier) {
      pr->u.p.count = 0;
      pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
    }
#endif // KMP_USE_HIER_SCHED
#endif /* USE_ITT_BUILD */
  }

#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
        "lb:%%%s ub:%%%s"
        " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
        " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
        traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
        traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
        traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
        traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
                  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
                  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
                  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
    __kmp_str_free(&buff);
  }
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
}

/* For ordered loops, either __kmp_dispatch_finish() should be called after
 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
 * every chunk of iterations.  If the ordered section(s) were not executed
 * for this iteration (or for every iteration in this chunk), we need to set
 * the ordered iteration counters so that the next thread can proceed. */
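
/* As a rough, illustrative shape only (entry-point names from the public
 * __kmpc_* interface; the exact sequence varies by compiler), a
 * compiler-generated ordered dynamic loop drives this machinery like:
 *
 *   __kmpc_dispatch_init_4(...);                  // __kmp_dispatch_init
 *   while (__kmpc_dispatch_next_4(..., &lb, &ub, &st)) {
 *     for (i = lb; i <= ub; i += st) {
 *       __kmpc_ordered(...);       // waits via th_deo_fcn (__kmp_dispatch_deo)
 *       ...                        // ordered region body
 *       __kmpc_end_ordered(...);   // bumps ordered_bumped via th_dxo_fcn
 *       __kmpc_dispatch_fini_4(...); // __kmp_dispatch_finish, per iteration
 *     }
 *   }
 */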
template <typename UT>
static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {

    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    if (pr->ordered_bumped) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      UT lower = pr->u.p.ordered_lower;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
      KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
    } // if
  } // if
  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
}

#ifdef KMP_GOMP_COMPAT

template <typename UT>
static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {
    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    UT lower = pr->u.p.ordered_lower;
    UT upper = pr->u.p.ordered_upper;
    UT inc = upper - lower + 1;

    if (pr->ordered_bumped == inc) {
      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
                      "ordered_bumped to zero\n",
                      gtid));
      pr->ordered_bumped = 0;
    } else {
      inc -= pr->ordered_bumped;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d before wait: "
            "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));

      KMP_MB(); /* is this necessary? */
      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
                      "ordered_bumped to zero\n",
                      gtid));
      pr->ordered_bumped = 0;
      // TODO: check whether inc should be unsigned or signed
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d after wait: "
            "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
            traits_t<UT>::spec);
        KD_TRACE(1000,
                 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
    }
  }
  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
}
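
// A worked example with hypothetical numbers: for a chunk covering ordered
// iterations lower=10..upper=14, inc starts at 5. If this thread already
// executed two ordered regions in the chunk (ordered_bumped == 2), only
// inc = 3 synthetic bumps are added after the wait, so the shared counter
// sh->u.s.ordered_iteration still advances by exactly one per iteration of
// the chunk and the next thread's wait can complete.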

#endif /* KMP_GOMP_COMPAT */

template <typename T>
int __kmp_dispatch_next_algorithm(int gtid,
                                  dispatch_private_info_template<T> *pr,
                                  dispatch_shared_info_template<T> volatile *sh,
                                  kmp_int32 *p_last, T *p_lb, T *p_ub,
                                  typename traits_t<T>::signed_t *p_st, T nproc,
                                  T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::floating_t DBL;
  int status = 0;
  bool last = false;
  T start;
  ST incr;
  UT limit, trip, init;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
  KMP_DEBUG_ASSERT(pr);
  KMP_DEBUG_ASSERT(sh);
  KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff =
        __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
                         "sh:%%p nproc:%%%s tid:%%%s\n",
                         traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
    __kmp_str_free(&buff);
  }
#endif

  // zero trip count
  if (pr->u.p.tc == 0) {
    KD_TRACE(10,
             ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
              "zero status:%d\n",
              gtid, status));
    return 0;
  }

  switch (pr->schedule) {
#if KMP_STATIC_STEAL_ENABLED
  case kmp_sch_static_steal: {
    T chunk = pr->u.p.parm1;
    UT nchunks = pr->u.p.parm2;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    trip = pr->u.p.tc - 1;

    if (traits_t<T>::type_size > 4) {
      // use lock for 8-byte induction variable.
      // TODO (optional): check presence and use 16-byte CAS
      kmp_lock_t *lck = pr->u.p.steal_lock;
      KMP_DEBUG_ASSERT(lck != NULL);
      if (pr->u.p.count < (UT)pr->u.p.ub) {
        KMP_DEBUG_ASSERT(pr->steal_flag == READY);
        __kmp_acquire_lock(lck, gtid);
        // try to get own chunk of iterations
        init = (pr->u.p.count)++;
        status = (init < (UT)pr->u.p.ub);
        __kmp_release_lock(lck, gtid);
      } else {
        status = 0; // no own chunks
      }
      if (!status) { // try to steal
        kmp_lock_t *lckv; // victim buffer's lock
        T while_limit = pr->u.p.parm3;
        T while_index = 0;
        int idx = (th->th.th_dispatch->th_disp_index - 1) %
                  __kmp_dispatch_num_buffers; // current loop index
        // note: victim thread can potentially execute another loop
        KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
        while ((!status) && (while_limit != ++while_index)) {
          dispatch_private_info_template<T> *v;
          T remaining;
          T victimId = pr->u.p.parm4;
          T oldVictimId = victimId ? victimId - 1 : nproc - 1;
          v = reinterpret_cast<dispatch_private_info_template<T> *>(
              &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
          KMP_DEBUG_ASSERT(v);
          while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
                 oldVictimId != victimId) {
            victimId = (victimId + 1) % nproc;
            v = reinterpret_cast<dispatch_private_info_template<T> *>(
                &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
            KMP_DEBUG_ASSERT(v);
          }
          if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
            continue; // try once more (nproc attempts in total)
          }
          if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
            kmp_uint32 old = UNUSED;
            // try to steal whole range from inactive victim
            status = v->steal_flag.compare_exchange_strong(old, THIEF);
            if (status) {
              // initialize self buffer with victim's whole range of chunks
              T id = victimId;
              T small_chunk = 0, extras = 0, p_extra = 0;
              __kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,
                                              init, small_chunk, extras,
                                              p_extra);
              __kmp_acquire_lock(lck, gtid);
              pr->u.p.count = init + 1; // exclude the chunk we execute now
              pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
              __kmp_release_lock(lck, gtid);
              pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
              // no need to reinitialize other thread invariants: lb, st, etc.
#ifdef KMP_DEBUG
              {
                char *buff;
                // create format specifiers before the debug output
                buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
                                        "stolen chunks from T#%%d, "
                                        "count:%%%s ub:%%%s\n",
                                        traits_t<UT>::spec, traits_t<T>::spec);
                KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
                __kmp_str_free(&buff);
              }
#endif
              // activate non-empty buffer and let others steal from us
              if (pr->u.p.count < (UT)pr->u.p.ub)
                KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
              break;
            }
          }
          if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
              v->u.p.count >= (UT)v->u.p.ub) {
            pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
            continue; // no chunks to steal, try next victim
          }
          lckv = v->u.p.steal_lock;
          KMP_ASSERT(lckv != NULL);
          __kmp_acquire_lock(lckv, gtid);
          limit = v->u.p.ub; // keep initial ub
          if (v->u.p.count >= limit) {
            __kmp_release_lock(lckv, gtid);
            pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
            continue; // no chunks to steal, try next victim
          }

          // stealing succeeded, reduce victim's ub by 1/4 of undone chunks
          // TODO: is this heuristic good enough?
          remaining = limit - v->u.p.count;
          if (remaining > 7) {
            // steal 1/4 of remaining
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
            init = (v->u.p.ub -= (remaining >> 2));
          } else {
            // steal 1 chunk of 1..7 remaining
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
            init = (v->u.p.ub -= 1);
          }
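          // For example (hypothetical numbers): with remaining == 20 undone
          // chunks, the thief takes 20 >> 2 == 5 chunks from the tail, so the
          // victim's ub drops by 5 and the thief's chunks become [init, limit)
          // with limit - init == 5. With remaining <= 7 it takes exactly one
          // chunk, which keeps the tail from being over-subdivided.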
          __kmp_release_lock(lckv, gtid);
#ifdef KMP_DEBUG
          {
            char *buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format(
                "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
                "count:%%%s ub:%%%s\n",
                traits_t<UT>::spec, traits_t<UT>::spec);
            KD_TRACE(10, (buff, gtid, victimId, init, limit));
            __kmp_str_free(&buff);
          }
#endif
          KMP_DEBUG_ASSERT(init + 1 <= limit);
          pr->u.p.parm4 = victimId; // remember victim to steal from
          status = 1;
          // now update own count and ub with stolen range excluding init chunk
          __kmp_acquire_lock(lck, gtid);
          pr->u.p.count = init + 1;
          pr->u.p.ub = limit;
          __kmp_release_lock(lck, gtid);
          // activate non-empty buffer and let others steal from us
          if (init + 1 < limit)
            KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
        } // while (search for victim)
      } // if (try to find victim and steal)
    } else {
      // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
      // as all operations on pair (count, ub) must be done atomically
      typedef union {
        struct {
          UT count;
          T ub;
        } p;
        kmp_int64 b;
      } union_i4;
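      // The union lets the (count, ub) pair be read, compared, and swapped as
      // one 64-bit word, so a thief and the owner can race on the same
      // descriptor without a lock. A minimal standalone sketch of the same
      // idiom (illustrative only, using std::atomic rather than the runtime's
      // KMP_COMPARE_AND_STORE_REL64):
      //
      //   union pair64 { struct { uint32_t count; uint32_t ub; } p;
      //                  uint64_t b; };
      //   std::atomic<uint64_t> word;
      //   pair64 oldv, newv;
      //   do {
      //     oldv.b = word.load();
      //     newv = oldv;
      //     newv.p.count++; // or newv.p.ub -= k for a steal from the tail
      //   } while (!word.compare_exchange_weak(oldv.b, newv.b));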
      union_i4 vold, vnew;
      if (pr->u.p.count < (UT)pr->u.p.ub) {
        KMP_DEBUG_ASSERT(pr->steal_flag == READY);
        vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
        vnew.b = vold.b;
        vnew.p.count++; // get chunk from head of self range
        while (!KMP_COMPARE_AND_STORE_REL64(
            (volatile kmp_int64 *)&pr->u.p.count,
            *VOLATILE_CAST(kmp_int64 *) & vold.b,
            *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
          KMP_CPU_PAUSE();
          vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
          vnew.b = vold.b;
          vnew.p.count++;
        }
        init = vold.p.count;
        status = (init < (UT)vold.p.ub);
      } else {
        status = 0; // no own chunks
      }
      if (!status) { // try to steal
        T while_limit = pr->u.p.parm3;
        T while_index = 0;
        int idx = (th->th.th_dispatch->th_disp_index - 1) %
                  __kmp_dispatch_num_buffers; // current loop index
        // note: victim thread can potentially execute another loop
        KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
        while ((!status) && (while_limit != ++while_index)) {
          dispatch_private_info_template<T> *v;
          T remaining;
          T victimId = pr->u.p.parm4;
          T oldVictimId = victimId ? victimId - 1 : nproc - 1;
          v = reinterpret_cast<dispatch_private_info_template<T> *>(
              &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
          KMP_DEBUG_ASSERT(v);
          while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
                 oldVictimId != victimId) {
            victimId = (victimId + 1) % nproc;
            v = reinterpret_cast<dispatch_private_info_template<T> *>(
                &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
            KMP_DEBUG_ASSERT(v);
          }
          if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
            continue; // try once more (nproc attempts in total)
          }
          if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
            kmp_uint32 old = UNUSED;
            // try to steal whole range from inactive victim
            status = v->steal_flag.compare_exchange_strong(old, THIEF);
            if (status) {
              // initialize self buffer with victim's whole range of chunks
              T id = victimId;
              T small_chunk = 0, extras = 0, p_extra = 0;
              __kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,
                                              init, small_chunk, extras,
                                              p_extra);
              vnew.p.count = init + 1;
              vnew.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
              // write pair (count, ub) at once atomically
#if KMP_ARCH_X86
              KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b);
#else
              *(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b;
#endif
              pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
              // no need to initialize other thread invariants: lb, st, etc.
#ifdef KMP_DEBUG
              {
                char *buff;
                // create format specifiers before the debug output
                buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
                                        "stolen chunks from T#%%d, "
                                        "count:%%%s ub:%%%s\n",
                                        traits_t<UT>::spec, traits_t<T>::spec);
                KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
                __kmp_str_free(&buff);
              }
#endif
              // activate non-empty buffer and let others steal from us
              if (pr->u.p.count < (UT)pr->u.p.ub)
                KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
              break;
            }
          }
          while (1) { // CAS loop with check if victim still has enough chunks
            // many threads may be stealing concurrently from same victim
            vold.b = *(volatile kmp_int64 *)(&v->u.p.count);
            if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
                vold.p.count >= (UT)vold.p.ub) {
              pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim id
              break; // no chunks to steal, try next victim
            }
            vnew.b = vold.b;
            remaining = vold.p.ub - vold.p.count;
            // try to steal 1/4 of remaining
            // TODO: is this heuristic good enough?
            if (remaining > 7) {
              vnew.p.ub -= remaining >> 2; // steal from tail of victim's range
            } else {
              vnew.p.ub -= 1; // steal 1 chunk of 1..7 remaining
            }
            KMP_DEBUG_ASSERT(vnew.p.ub * (UT)chunk <= trip);
            if (KMP_COMPARE_AND_STORE_REL64(
                    (volatile kmp_int64 *)&v->u.p.count,
                    *VOLATILE_CAST(kmp_int64 *) & vold.b,
                    *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
              // stealing succeeded
#ifdef KMP_DEBUG
              {
                char *buff;
                // create format specifiers before the debug output
                buff = __kmp_str_format(
                    "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
                    "count:%%%s ub:%%%s\n",
                    traits_t<T>::spec, traits_t<T>::spec);
                KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub));
                __kmp_str_free(&buff);
              }
#endif
              KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
                                        vold.p.ub - vnew.p.ub);
              status = 1;
              pr->u.p.parm4 = victimId; // keep victim id
              // now update own count and ub
              init = vnew.p.ub;
              vold.p.count = init + 1;
#if KMP_ARCH_X86
              KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
#else
              *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
#endif
              // activate non-empty buffer and let others steal from us
              if (vold.p.count < (UT)vold.p.ub)
                KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
              break;
            } // if (check CAS result)
            KMP_CPU_PAUSE(); // CAS failed, repeat the attempt
          } // while (try to steal from particular victim)
        } // while (search for victim)
      } // if (try to find victim and steal)
    } // if (4-byte induction variable)
    if (!status) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.lb;
      init *= chunk;
      limit = chunk + init - 1;
      incr = pr->u.p.st;
      KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);

      KMP_DEBUG_ASSERT(init <= trip);
      // keep track of done chunks for possible early exit from stealing
      // TODO: count executed chunks locally with rare update of shared location
      // test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
      if ((last = (limit >= trip)) != 0)
        limit = trip;
      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }
    } // if
    break;
  } // case
#endif // KMP_STATIC_STEAL_ENABLED
  case kmp_sch_static_balanced: {
    KD_TRACE(
        10,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));
    /* check if thread has any iteration to do */
    if ((status = !pr->u.p.count) != 0) {
      pr->u.p.count = 1;
      *p_lb = pr->u.p.lb;
      *p_ub = pr->u.p.ub;
      last = (pr->u.p.parm1 != 0);
      if (p_st != NULL)
        *p_st = pr->u.p.st;
    } else { /* no iterations to do */
      pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
    }
  } // case
  break;
  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
                                 merged here */
  case kmp_sch_static_chunked: {
    T parm1;

    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_static_[affinity|chunked] case\n",
                   gtid));
    parm1 = pr->u.p.parm1;

    trip = pr->u.p.tc - 1;
    init = parm1 * (pr->u.p.count + tid);

    if ((status = (init <= trip)) != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      limit = parm1 + init - 1;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      pr->u.p.count += nproc;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;
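
  // Round-robin assignment in the case above, with hypothetical numbers:
  // nproc = 4, chunk size parm1 = 10, tid = 1. On the thread's k-th call
  // pr->u.p.count == 4k, so init = 10 * (4k + 1): relative to the loop's
  // lower bound (stride 1) it gets iterations 10..19, then 50..59, then
  // 90..99, and so on until init > trip.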

  case kmp_sch_dynamic_chunked: {
    UT chunk_number;
    UT chunk_size = pr->u.p.parm1;
    UT nchunks = pr->u.p.parm2;

    KD_TRACE(
        100,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
         gtid));

    chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
    status = (chunk_number < nchunks);
    if (!status) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      init = chunk_size * chunk_number;
      trip = pr->u.p.tc - 1;
      start = pr->u.p.lb;
      incr = pr->u.p.st;

      if ((last = (trip - init < (UT)chunk_size)))
        limit = trip;
      else
        limit = chunk_size + init - 1;

      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;
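
  // Dynamic dispatch above is a single fetch-and-increment on the shared
  // chunk counter. With hypothetical numbers tc = 100 and chunk_size = 8
  // (so nchunks = 13), a thread that draws chunk_number = 12 gets init = 96
  // and, since trip - init = 3 < 8, takes the final short chunk with
  // limit = trip = 99 and last set.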

  case kmp_sch_guided_iterative_chunked: {
    T chunkspec = pr->u.p.parm1;
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
                   "iterative case\n",
                   gtid));
    trip = pr->u.p.tc;
    // Start atomic part of calculations
    while (1) {
      ST remaining; // signed, because can be < 0
      init = sh->u.s.iteration; // shared value
      remaining = trip - init;
      if (remaining <= 0) { // AC: need to compare with 0 first
        // nothing to do, don't try atomic op
        status = 0;
        break;
      }
      if ((T)remaining <
          pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
        // use dynamic-style schedule
        // atomically increment iterations, get old value
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                 (ST)chunkspec);
        remaining = trip - init;
        if (remaining <= 0) {
          status = 0; // all iterations got by other threads
        } else {
          // got some iterations to work on
          status = 1;
          if ((T)remaining > chunkspec) {
            limit = init + chunkspec - 1;
          } else {
            last = true; // the last chunk
            limit = init + remaining - 1;
          } // if
        } // if
        break;
      } // if
      limit = init + (UT)((double)remaining *
                          *(double *)&pr->u.p.parm3); // divide by K*nproc
      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                               (ST)init, (ST)limit)) {
        // CAS was successful, chunk obtained
        status = 1;
        --limit;
        break;
      } // if
    } // while
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } // if
  } // case
  break;
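
  // Guided sizing above, with hypothetical numbers: nproc = 4 and
  // parm3 ~= 1/(K*nproc) = 0.125 for the default K = 2. With
  // remaining = 800 a thread claims floor(800 * 0.125) = 100 iterations;
  // the next claim sees remaining = 700 and takes 87, so chunks shrink
  // geometrically until remaining drops below parm2 and dispatch falls
  // back to fixed chunkspec-sized pieces.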

  case kmp_sch_guided_simd: {
    // same as iterative but curr-chunk adjusted to be multiple of given
    // chunk
    T chunk = pr->u.p.parm1;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
              gtid));
    trip = pr->u.p.tc;
    // Start atomic part of calculations
    while (1) {
      ST remaining; // signed, because can be < 0
      init = sh->u.s.iteration; // shared value
      remaining = trip - init;
      if (remaining <= 0) { // AC: need to compare with 0 first
        status = 0; // nothing to do, don't try atomic op
        break;
      }
      KMP_DEBUG_ASSERT(chunk && init % chunk == 0);
      // compare with K*nproc*(chunk+1), K=2 by default
      if ((T)remaining < pr->u.p.parm2) {
        // use dynamic-style schedule
        // atomically increment iterations, get old value
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                 (ST)chunk);
        remaining = trip - init;
        if (remaining <= 0) {
          status = 0; // all iterations got by other threads
        } else {
          // got some iterations to work on
          status = 1;
          if ((T)remaining > chunk) {
            limit = init + chunk - 1;
          } else {
            last = true; // the last chunk
            limit = init + remaining - 1;
          } // if
        } // if
        break;
      } // if
      // divide by K*nproc
      UT span;
      __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3),
                         &span);
      UT rem = span % chunk;
      if (rem) // adjust so that span%chunk == 0
        span += chunk - rem;
      limit = init + span;
      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                               (ST)init, (ST)limit)) {
        // CAS was successful, chunk obtained
        status = 1;
        --limit;
        break;
      } // if
    } // while
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } // if
  } // case
  break;
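
  // The rounding above keeps every claimed span a multiple of the SIMD chunk
  // so vector lanes stay fully populated. For example (hypothetical numbers):
  // chunk = 8 and a guided estimate of span = 27 gives rem = 3, so span is
  // rounded up to 32 and init stays 8-aligned for the next claimant.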

  case kmp_sch_guided_analytical_chunked: {
    T chunkspec = pr->u.p.parm1;
    UT chunkIdx;
#if KMP_USE_X87CONTROL
    /* for storing original FPCW value for Windows* OS on
       IA-32 architecture 8-byte version */
    unsigned int oldFpcw;
    unsigned int fpcwSet = 0;
#endif
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

    trip = pr->u.p.tc;

    KMP_DEBUG_ASSERT(nproc > 1);
    KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);

    while (1) { /* this while loop is a safeguard against unexpected zero
                   chunk sizes */
      chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
      if (chunkIdx >= (UT)pr->u.p.parm2) {
        --trip;
        /* use dynamic-style scheduling */
        init = chunkIdx * chunkspec + pr->u.p.count;
        /* need to verify init > 0 in case of overflow in the above
         * calculation */
        if ((status = (init > 0 && init <= trip)) != 0) {
          limit = init + chunkspec - 1;

          if ((last = (limit >= trip)) != 0)
            limit = trip;
        }
        break;
      } else {
/* use exponential-style scheduling */
/* The following check works around the lack of long double precision on
   Windows* OS, which can otherwise cause init != 0 for chunkIdx == 0. */
#if KMP_USE_X87CONTROL
        /* If we haven't already done so, save original
           FPCW and set precision to 64-bit, as Windows* OS
           on IA-32 architecture defaults to 53-bit */
        if (!fpcwSet) {
          oldFpcw = _control87(0, 0);
          _control87(_PC_64, _MCW_PC);
          fpcwSet = 0x30000;
        }
#endif
        if (chunkIdx) {
          init = __kmp_dispatch_guided_remaining<T>(
              trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
          KMP_DEBUG_ASSERT(init);
          init = trip - init;
        } else
          init = 0;
        limit = trip - __kmp_dispatch_guided_remaining<T>(
                           trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
        KMP_ASSERT(init <= limit);
        if (init < limit) {
          KMP_DEBUG_ASSERT(limit <= trip);
          --limit;
          status = 1;
          break;
        } // if
      } // if
    } // while (1)
#if KMP_USE_X87CONTROL
    /* restore FPCW if necessary
       AC: check fpcwSet flag first because oldFpcw can be uninitialized here
    */
    if (fpcwSet && (oldFpcw & fpcwSet))
      _control87(oldFpcw, _MCW_PC);
#endif
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      }
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    }
  } // case
  break;

  case kmp_sch_trapezoidal: {
    UT index;
    T parm2 = pr->u.p.parm2;
    T parm3 = pr->u.p.parm3;
    T parm4 = pr->u.p.parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);

    init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
    trip = pr->u.p.tc - 1;

    if ((status = ((T)index < parm3 && init <= trip)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.lb;
      limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
      incr = pr->u.p.st;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;
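
  // Trapezoidal chunks shrink linearly by parm4 per claim. A worked example
  // with hypothetical parameters: first chunk size parm2 = 10 and decrement
  // parm4 = 2 give successive chunk sizes 10, 8, 6, ... Chunk k (0-based)
  // starts at init = k*(2*parm2 - (k-1)*parm4)/2, so chunk 2 starts at
  // 2*(20 - 2)/2 = 18 and ends at 3*(20 - 4)/2 - 1 = 23, a 6-iteration chunk.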
  default: {
    status = 0; // to avoid complaints on uninitialized variable use
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null // Variadic argument list terminator
    );
  } break;
  } // switch
  if (p_last)
    *p_last = last;
#ifdef KMP_DEBUG
  if (pr->flags.ordered) {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
                            "ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t<UT>::spec, traits_t<UT>::spec);
    KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
    __kmp_str_free(&buff);
  }
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
        "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
    KMP_DEBUG_ASSERT(p_last);
    KMP_DEBUG_ASSERT(p_st);
    KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
    __kmp_str_free(&buff);
  }
#endif
  return status;
}

/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
   work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
   is not called. */
#if OMPT_SUPPORT && OMPT_OPTIONAL
#define OMPT_LOOP_END                                                          \
  if (status == 0) {                                                           \
    if (ompt_enabled.ompt_callback_work) {                                     \
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
      ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
          ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
          &(task_info->task_data), 0, codeptr);                                \
    }                                                                          \
  }
#define OMPT_LOOP_DISPATCH(lb, ub, st, status)                                 \
  if (ompt_enabled.ompt_callback_dispatch && status) {                         \
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);                \
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);              \
    ompt_dispatch_chunk_t chunk;                                               \
    ompt_data_t instance = ompt_data_none;                                     \
    OMPT_GET_DISPATCH_CHUNK(chunk, lb, ub, st);                                \
    instance.ptr = &chunk;                                                     \
    ompt_callbacks.ompt_callback(ompt_callback_dispatch)(                      \
        &(team_info->parallel_data), &(task_info->task_data),                  \
        ompt_dispatch_ws_loop_chunk, instance);                                \
  }
// TODO: implement count
#else
#define OMPT_LOOP_END // no-op
#define OMPT_LOOP_DISPATCH(lb, ub, st, status) // no-op
#endif

#if KMP_STATS_ENABLED
#define KMP_STATS_LOOP_END                                                     \
  {                                                                            \
    kmp_int64 u, l, t, i;                                                      \
    l = (kmp_int64)(*p_lb);                                                    \
    u = (kmp_int64)(*p_ub);                                                    \
    i = (kmp_int64)(pr->u.p.st);                                               \
    if (status == 0) {                                                         \
      t = 0;                                                                   \
      KMP_POP_PARTITIONED_TIMER();                                             \
    } else if (i == 1) {                                                       \
      if (u >= l)                                                              \
        t = u - l + 1;                                                         \
      else                                                                     \
        t = 0;                                                                 \
    } else if (i < 0) {                                                        \
      if (l >= u)                                                              \
        t = (l - u) / (-i) + 1;                                                \
      else                                                                     \
        t = 0;                                                                 \
    } else {                                                                   \
      if (u >= l)                                                              \
        t = (u - l) / i + 1;                                                   \
      else                                                                     \
        t = 0;                                                                 \
    }                                                                          \
    KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
  }
#else
#define KMP_STATS_LOOP_END /* Nothing */
#endif
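
// KMP_STATS_LOOP_END computes the chunk's trip count from the bounds and
// stride just handed back. For example (hypothetical values): *p_lb = 3,
// *p_ub = 15, stride i = 4 give t = (15 - 3) / 4 + 1 = 4 iterations
// (3, 7, 11, 15); a negative stride mirrors the formula as (l - u)/(-i) + 1.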

template <typename T>
static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
                               T *p_lb, T *p_ub,
                               typename traits_t<T>::signed_t *p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                               ,
                               void *codeptr
#endif
) {

  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  // This is potentially slightly misleading: schedule(runtime) will appear
  // here even if the actual runtime schedule is static. (Which points out a
  // disadvantage of schedule(runtime): even when static scheduling is used,
  // it costs more than a compile-time choice of static scheduling would.)
  KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);

  int status;
  dispatch_private_info_template<T> *pr;
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
  KD_TRACE(
      1000,
      ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
       gtid, p_lb, p_ub, p_st, p_last));

  if (team->t.t_serialized) {
    /* NOTE: serialize this dispatch because we are not at the active level */
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
    KMP_DEBUG_ASSERT(pr);

    if ((status = (pr->u.p.tc != 0)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      //            if ( p_last != NULL )
      //                *p_last = 0;
      if (p_st != NULL)
        *p_st = 0;
      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }
    } else if (pr->flags.nomerge) {
      kmp_int32 last;
      T start;
      UT limit, trip, init;
      ST incr;
      T chunk = pr->u.p.parm1;

      KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
                     gtid));

      init = chunk * pr->u.p.count++;
      trip = pr->u.p.tc - 1;

      if ((status = (init <= trip)) == 0) {
        *p_lb = 0;
        *p_ub = 0;
        //                if ( p_last != NULL )
        //                    *p_last = 0;
        if (p_st != NULL)
          *p_st = 0;
        if (__kmp_env_consistency_check) {
          if (pr->pushed_ws != ct_none) {
            pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
          }
        }
      } else {
        start = pr->u.p.lb;
        limit = chunk + init - 1;
        incr = pr->u.p.st;

        if ((last = (limit >= trip)) != 0) {
          limit = trip;
#if KMP_OS_WINDOWS
          pr->u.p.last_upper = pr->u.p.ub;
#endif /* KMP_OS_WINDOWS */
        }
        if (p_last != NULL)
          *p_last = last;
        if (p_st != NULL)
          *p_st = incr;
        if (incr == 1) {
          *p_lb = start + init;
          *p_ub = start + limit;
        } else {
          *p_lb = start + init * incr;
          *p_ub = start + limit * incr;
        }

        if (pr->flags.ordered) {
          pr->u.p.ordered_lower = init;
          pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
          {
            char *buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
                                    "ordered_lower:%%%s ordered_upper:%%%s\n",
                                    traits_t<UT>::spec, traits_t<UT>::spec);
            KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
                            pr->u.p.ordered_upper));
            __kmp_str_free(&buff);
          }
#endif
        } // if
      } // if
    } else {
      pr->u.p.tc = 0;
      *p_lb = pr->u.p.lb;
      *p_ub = pr->u.p.ub;
#if KMP_OS_WINDOWS
      pr->u.p.last_upper = *p_ub;
#endif /* KMP_OS_WINDOWS */
      if (p_last != NULL)
        *p_last = TRUE;
      if (p_st != NULL)
        *p_st = pr->u.p.st;
    } // if
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format(
          "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
          "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
          traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
      KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
                    (p_last ? *p_last : 0), status));
      __kmp_str_free(&buff);
    }
#endif
#if INCLUDE_SSC_MARKS
    SSC_MARK_DISPATCH_NEXT();
#endif
    OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);
    OMPT_LOOP_END;
    KMP_STATS_LOOP_END;
    return status;
  } else {
    kmp_int32 last = 0;
    dispatch_shared_info_template<T> volatile *sh;

    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    KMP_DEBUG_ASSERT(pr);
    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(sh);

#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier)
      status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
    else
#endif // KMP_USE_HIER_SCHED
      status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
                                                p_st, th->th.th_team_nproc,
                                                th->th.th_info.ds.ds_tid);
    // status == 0: no more iterations to execute
    if (status == 0) {
      ST num_done;
      num_done = test_then_inc<ST>(&sh->u.s.num_done);
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
        __kmp_str_free(&buff);
      }
#endif

#if KMP_USE_HIER_SCHED
      pr->flags.use_hier = FALSE;
#endif
      if (num_done == th->th.th_team_nproc - 1) {
#if KMP_STATIC_STEAL_ENABLED
        if (pr->schedule == kmp_sch_static_steal) {
          int i;
          int idx = (th->th.th_dispatch->th_disp_index - 1) %
                    __kmp_dispatch_num_buffers; // current loop index
          // loop complete, safe to destroy locks used for stealing
          for (i = 0; i < th->th.th_team_nproc; ++i) {
            dispatch_private_info_template<T> *buf =
                reinterpret_cast<dispatch_private_info_template<T> *>(
                    &team->t.t_dispatch[i].th_disp_buffer[idx]);
            KMP_ASSERT(buf->steal_flag == THIEF); // buffer must be inactive
            KMP_ATOMIC_ST_RLX(&buf->steal_flag, UNUSED);
            if (traits_t<T>::type_size > 4) {
              // destroy locks used for stealing
              kmp_lock_t *lck = buf->u.p.steal_lock;
              KMP_ASSERT(lck != NULL);
              __kmp_destroy_lock(lck);
              __kmp_free(lck);
              buf->u.p.steal_lock = NULL;
            }
          }
        }
#endif
        /* NOTE: release shared buffer to be reused */

        KMP_MB(); /* Flush all pending memory write invalidates.  */

        sh->u.s.num_done = 0;
        sh->u.s.iteration = 0;

        /* TODO replace with general release procedure? */
        if (pr->flags.ordered) {
          sh->u.s.ordered_iteration = 0;
        }

        sh->buffer_index += __kmp_dispatch_num_buffers;
        KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
                       gtid, sh->buffer_index));

        KMP_MB(); /* Flush all pending memory write invalidates.  */

      } // if
      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }

      th->th.th_dispatch->th_deo_fcn = NULL;
2414      th->th.th_dispatch->th_dxo_fcn = NULL;
2415      th->th.th_dispatch->th_dispatch_sh_current = NULL;
2416      th->th.th_dispatch->th_dispatch_pr_current = NULL;
2417    } // if (status == 0)
2418#if KMP_OS_WINDOWS
2419    else if (last) {
2420      pr->u.p.last_upper = pr->u.p.ub;
2421    }
2422#endif /* KMP_OS_WINDOWS */
2423    if (p_last != NULL && status != 0)
2424      *p_last = last;
2425  } // if
2426
2427#ifdef KMP_DEBUG
2428  {
2429    char *buff;
2430    // create format specifiers before the debug output
2431    buff = __kmp_str_format(
2432        "__kmp_dispatch_next: T#%%d normal case: "
2433        "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2434        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2435    KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2436                  (p_last ? *p_last : 0), status));
2437    __kmp_str_free(&buff);
2438  }
2439#endif
2440#if INCLUDE_SSC_MARKS
2441  SSC_MARK_DISPATCH_NEXT();
2442#endif
2443  OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);
2444  OMPT_LOOP_END;
2445  KMP_STATS_LOOP_END;
2446  return status;
2447}

/*!
@ingroup WORK_SHARING
@param loc  source location information
@param gtid  global thread number
@return Zero if the parallel region is not active (in which case this thread
will execute all sections), non-zero otherwise.

Beginning of the sections construct.
There are no implicit barriers in the "sections" calls; rather, the compiler
should introduce an explicit barrier if one is required.

This implementation is based on __kmp_dispatch_init, using the same constructs
for shared data (sections cannot be nested directly inside an omp for loop;
there must be a parallel region in between).
A sketch of the full call sequence appears after __kmpc_end_sections below.
*/
kmp_int32 __kmpc_sections_init(ident_t *loc, kmp_int32 gtid) {

  int active;
  kmp_info_t *th;
  kmp_team_t *team;
  kmp_uint32 my_buffer_index;
  dispatch_shared_info_template<kmp_int32> volatile *sh;

  KMP_DEBUG_ASSERT(__kmp_init_serial);

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;

  KMP_COUNT_BLOCK(OMP_SECTIONS);
  KD_TRACE(10, ("__kmpc_sections_init: called by T#%d\n", gtid));

  if (active) {
    // Setup sections in the same way as dynamic scheduled loops.
    // We need one piece of shared data: which section is to execute next.
    // (if the parallel region is not active, all sections will be executed
    // on the same thread)
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    my_buffer_index = th->th.th_dispatch->th_disp_index++;

    // reuse shared data structures from dynamic sched loops:
    sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    KD_TRACE(10, ("__kmpc_sections_init: T#%d my_buffer_index:%d\n", gtid,
                  my_buffer_index));

    th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
    th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;

    KD_TRACE(100, ("__kmpc_sections_init: T#%d before wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));
    __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
                           __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: KMP_WAIT() cannot be used here: buffer index and
    // my_buffer_index are *always* 32-bit integers.
    KMP_MB();
    KD_TRACE(100, ("__kmpc_sections_init: T#%d after wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));

    th->th.th_dispatch->th_dispatch_pr_current =
        nullptr; // sections construct doesn't need private data
    th->th.th_dispatch->th_dispatch_sh_current =
        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_sections, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
  KMP_PUSH_PARTITIONED_TIMER(OMP_sections);

  return active;
}

/*!
@ingroup WORK_SHARING
@param loc  source location information
@param gtid  global thread number
@param numberOfSections  number of sections in the 'sections' construct
@return The id of the section to execute next on this thread, in the range
[0, numberOfSections); any value outside that range (e.g. numberOfSections
itself) means there is nothing left to execute on this thread.
*/

kmp_int32 __kmpc_next_section(ident_t *loc, kmp_int32 gtid,
                              kmp_int32 numberOfSections) {

  KMP_TIME_PARTITIONED_BLOCK(OMP_sections_overhead);

  kmp_info_t *th = __kmp_threads[gtid];
#ifdef KMP_DEBUG
  kmp_team_t *team = th->th.th_team;
#endif

  KD_TRACE(1000, ("__kmpc_next_section: T#%d; number of sections:%d\n", gtid,
                  numberOfSections));

  // For the serialized case we should not call this function:
  KMP_DEBUG_ASSERT(!team->t.t_serialized);

  dispatch_shared_info_template<kmp_int32> volatile *sh;

  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

  KMP_DEBUG_ASSERT(!(th->th.th_dispatch->th_dispatch_pr_current));
  sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
      th->th.th_dispatch->th_dispatch_sh_current);
  KMP_DEBUG_ASSERT(sh);

  kmp_int32 sectionIndex = 0;
  bool moreSectionsToExecute = true;

  // Find section to execute:
  sectionIndex = test_then_inc<kmp_int32>((kmp_int32 *)&sh->u.s.iteration);
  if (sectionIndex >= numberOfSections) {
    moreSectionsToExecute = false;
  }

  // status == 0: no more sections to execute;
  // OMPTODO: __kmpc_end_sections could be bypassed?
  if (!moreSectionsToExecute) {
    kmp_int32 num_done;

    num_done = test_then_inc<kmp_int32>((kmp_int32 *)(&sh->u.s.num_done));

    if (num_done == th->th.th_team_nproc - 1) {
      /* NOTE: release this buffer to be reused */

      KMP_MB(); /* Flush all pending memory write invalidates.  */

      sh->u.s.num_done = 0;
      sh->u.s.iteration = 0;

      KMP_MB(); /* Flush all pending memory write invalidates.  */

      sh->buffer_index += __kmp_dispatch_num_buffers;
      KD_TRACE(100, ("__kmpc_next_section: T#%d change buffer_index:%d\n", gtid,
                     sh->buffer_index));

      KMP_MB(); /* Flush all pending memory write invalidates.  */

    } // if

    th->th.th_dispatch->th_deo_fcn = NULL;
    th->th.th_dispatch->th_dxo_fcn = NULL;
    th->th.th_dispatch->th_dispatch_sh_current = NULL;
    th->th.th_dispatch->th_dispatch_pr_current = NULL;

#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.ompt_callback_dispatch) {
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
      ompt_data_t instance = ompt_data_none;
      instance.ptr = OMPT_GET_RETURN_ADDRESS(0);
      ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
          &(team_info->parallel_data), &(task_info->task_data),
          ompt_dispatch_section, instance);
    }
#endif
  }

  return sectionIndex;
}

/*!
@ingroup WORK_SHARING
@param loc  source location information
@param gtid  global thread number

End of the "sections" construct.
No need to wait here: a barrier is added separately when needed.
*/
void __kmpc_end_sections(ident_t *loc, kmp_int32 gtid) {

  kmp_info_t *th = __kmp_threads[gtid];
  int active = !th->th.th_team->t.t_serialized;

  KD_TRACE(100, ("__kmpc_end_sections: T#%d called\n", gtid));

  if (!active) {
    // In the active case, finalization is done in __kmpc_next_section
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.ompt_callback_work) {
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
      ompt_callbacks.ompt_callback(ompt_callback_work)(
          ompt_work_sections, ompt_scope_end, &(team_info->parallel_data),
          &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
    }
#endif
  }

  KMP_POP_PARTITIONED_TIMER();
  KD_TRACE(100, ("__kmpc_end_sections: T#%d returned\n", gtid));
}
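
// A minimal sketch (illustrative only, not part of the runtime) of the call
// sequence a compiler might emit for '#pragma omp sections' with two
// sections, using the three entry points above; the exact lowering is
// compiler-specific:
//
//   if (__kmpc_sections_init(loc, gtid)) {
//     // parallel region is active: pull section ids until none remain
//     kmp_int32 id;
//     while ((id = __kmpc_next_section(loc, gtid, 2)) < 2) {
//       switch (id) {
//       case 0: /* body of section 0 */ break;
//       case 1: /* body of section 1 */ break;
//       }
//     }
//   } else {
//     // serialized: this thread executes every section itself
//   }
//   __kmpc_end_sections(loc, gtid);
//   // unless 'nowait' was specified, the compiler adds a barrier here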

template <typename T>
static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
                                  kmp_int32 *plastiter, T *plower, T *pupper,
                                  typename traits_t<T>::signed_t incr) {
  typedef typename traits_t<T>::unsigned_t UT;
  kmp_uint32 team_id;
  kmp_uint32 nteams;
  UT trip_count;
  kmp_team_t *team;
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
                            "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<T>::spec);
    KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
    __kmp_str_free(&buff);
  }
#endif

  if (__kmp_env_consistency_check) {
    if (incr == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    }
    if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
      // The loop is illegal.
      // Some zero-trip loops are maintained by the compiler, e.g.:
      //   for(i=10;i<0;++i) // lower >= upper - run-time check
      //   for(i=0;i>10;--i) // lower <= upper - run-time check
      //   for(i=0;i>10;++i) // incr > 0       - compile-time check
      //   for(i=10;i<0;--i) // incr < 0       - compile-time check
      // The compiler does not check the following illegal loops:
      //   for(i=0;i<10;i+=incr) // where incr<0
      //   for(i=10;i>0;i-=incr) // where incr<0
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
    }
  }
  __kmp_assert_valid_gtid(gtid);
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
  nteams = th->th.th_teams_size.nteams;
  team_id = team->t.t_master_tid;
  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);

  // compute global trip count
  if (incr == 1) {
    trip_count = *pupper - *plower + 1;
  } else if (incr == -1) {
    trip_count = *plower - *pupper + 1;
  } else if (incr > 0) {
    // upper-lower can exceed the limit of signed type
    trip_count = (UT)(*pupper - *plower) / incr + 1;
  } else {
    trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
  }
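
  // Example (illustrative): lb=0, ub=9, incr=2 gives
  // trip_count = (9 - 0) / 2 + 1 = 5, i.e. iterations 0,2,4,6,8. The
  // unsigned division keeps this correct even when upper-lower overflows
  // the signed type T.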

  if (trip_count <= nteams) {
    KMP_DEBUG_ASSERT(
        __kmp_static == kmp_sch_static_greedy ||
        __kmp_static ==
            kmp_sch_static_balanced); // Unknown static scheduling type.
    // only some teams get a single iteration, the others get nothing
    if (team_id < trip_count) {
      *pupper = *plower = *plower + team_id * incr;
    } else {
      *plower = *pupper + incr; // zero-trip loop
    }
    if (plastiter != NULL)
      *plastiter = (team_id == trip_count - 1);
  } else {
    if (__kmp_static == kmp_sch_static_balanced) {
      UT chunk = trip_count / nteams;
      UT extras = trip_count % nteams;
      *plower +=
          incr * (team_id * chunk + (team_id < extras ? team_id : extras));
      *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
      if (plastiter != NULL)
        *plastiter = (team_id == nteams - 1);
    } else {
      T chunk_inc_count =
          (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
      T upper = *pupper;
      KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
      // Unknown static scheduling type.
      *plower += team_id * chunk_inc_count;
      *pupper = *plower + chunk_inc_count - incr;
      // Check/correct bounds if needed
      if (incr > 0) {
        if (*pupper < *plower)
          *pupper = traits_t<T>::max_value;
        if (plastiter != NULL)
          *plastiter = *plower <= upper && *pupper > upper - incr;
        if (*pupper > upper)
          *pupper = upper; // tracker C73258
      } else {
        if (*pupper > *plower)
          *pupper = traits_t<T>::min_value;
        if (plastiter != NULL)
          *plastiter = *plower >= upper && *pupper < upper - incr;
        if (*pupper < upper)
          *pupper = upper; // tracker C73258
      }
    }
  }
}
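
// Worked example (illustrative only): trip_count=10, nteams=4 under
// kmp_sch_static_balanced gives chunk=2, extras=2, so teams 0..3 receive
// 3, 3, 2 and 2 iterations respectively; with lb=0 and incr=1 the per-team
// bounds come out as [0,2], [3,5], [6,7] and [8,9].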

//-----------------------------------------------------------------------------
// Dispatch routines
//    Transfer call to template< type T >
//    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
//                         T lb, T ub, ST st, ST chunk )
extern "C" {

/*!
@ingroup WORK_SHARING
@{
@param loc Source location
@param gtid Global thread id
@param schedule Schedule type
@param lb  Lower bound
@param ub  Upper bound
@param st  Step (or increment if you prefer)
@param chunk The chunk size to block with

This function prepares the runtime to start a dynamically scheduled for loop,
saving the loop arguments.
These functions are all identical apart from the types of the arguments.
*/

void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int32 lb,
                            kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint32 lb,
                             kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int64 lb,
                            kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint64 lb,
                             kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4

These differ from the __kmpc_dispatch_init set of functions in that they are
called for the composite 'distribute parallel for' construct, so before
dispatching the regular iterations the per-team iteration space must first be
computed. (An illustrative call sketch follows the last of these functions
below.)

These functions are all identical apart from the types of the arguments.
*/
void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
                                 kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
                                  kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
                                 kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
                                  kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
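
// A sketch (assumed lowering, for illustration only) of the composite
// 'distribute parallel for' entry: the dist variants first clip the global
// bounds to this team's portion via __kmp_dist_get_bounds, after which
// chunks are claimed with the usual __kmpc_dispatch_next_* loop (see the
// sketch after __kmpc_dispatch_next_4 below):
//
//   kmp_int32 last;
//   __kmpc_dist_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, &last,
//                               /*lb=*/0, /*ub=*/999, /*st=*/1, /*chunk=*/8);
//   // ... followed by the __kmpc_dispatch_next_4 loop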

/*!
@param loc Source code location
@param gtid Global thread id
@param p_last Pointer to a flag set to one if this is the last chunk or zero
otherwise
@param p_lb   Pointer to the lower bound for the next chunk of work
@param p_ub   Pointer to the upper bound for the next chunk of work
@param p_st   Pointer to the stride for the next chunk of work
@return one if there is work to be done, zero otherwise

Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then lb, ub and stride need not be modified.
(An illustrative call sequence follows this function.)
*/
int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        ,
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}
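
// A minimal usage sketch (assumed lowering, not emitted verbatim by any
// particular compiler) for a dynamically scheduled loop over 0..99 with
// chunk size 4, driven by the init/next pair above:
//
//   kmp_int32 lb, ub, st, last;
//   __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked,
//                          /*lb=*/0, /*ub=*/99, /*st=*/1, /*chunk=*/4);
//   while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
//     for (kmp_int32 i = lb; i <= ub; i += st) {
//       /* loop body for iteration i */
//     }
//   }
//   // Ordered loops additionally call __kmpc_dispatch_fini_4 per chunk.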

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint32 *p_lb, kmp_uint32 *p_ub,
                            kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         ,
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        ,
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint64 *p_lb, kmp_uint64 *p_ub,
                            kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         ,
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}

/*!
@param loc Source code location
@param gtid Global thread id

Mark the end of a dynamic loop.
*/
void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}
/*! @} */

//-----------------------------------------------------------------------------
// Non-template routines from kmp_dispatch.cpp used in other sources

kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value == checker;
}

kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value != checker;
}

kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
  return value < checker;
}

kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
  return value >= checker;
}

kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
  return value <= checker;
}

kmp_uint32
__kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
             kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
             void *obj // Higher-level synchronization object, or NULL.
) {
  // note: we may not belong to a team at this point
  volatile kmp_uint32 *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
  kmp_uint32 r;
  kmp_uint64 time;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
  KMP_INIT_YIELD(spins);
  KMP_INIT_BACKOFF(time);
  // main wait spin loop
  while (!f(r = TCR_4(*spin), check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* GEH - remove this since it was accidentally introduced when kmp_wait was
       split. It causes problems with infinite recursion because of exit lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
        __kmp_abort_thread(); */
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}
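
// Illustrative use (a sketch; not a verbatim caller from the runtime): spin
// until a shared 32-bit location becomes 1, reusing the __kmp_eq_4 predicate
// defined above. The return value is the observed value that satisfied the
// predicate:
//
//   volatile kmp_uint32 flag = 0; // set to 1 by another thread
//   kmp_uint32 seen = __kmp_wait_4(&flag, 1, __kmp_eq_4, NULL);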

void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
                      kmp_uint32 (*pred)(void *, kmp_uint32),
                      void *obj // Higher-level synchronization object, or NULL.
) {
  // note: we may not belong to a team at this point
  void *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(void *, kmp_uint32) = pred;
  kmp_uint64 time;

  KMP_FSYNC_SPIN_INIT(obj, spin);
  KMP_INIT_YIELD(spins);
  KMP_INIT_BACKOFF(time);
  // main wait spin loop
  while (!f(spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* if we have waited a bit, or are oversubscribed, yield */
    /* pause is in the following code */
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
}

} // extern "C"

#ifdef KMP_GOMP_COMPAT

void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int32 lb,
                               kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint32 lb,
                                kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int64 lb,
                               kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint64 lb,
                                kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

#endif /* KMP_GOMP_COMPAT */

/* ------------------------------------------------------------------------ */