1/*
2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13/* Dynamic scheduling initialization and dispatch.
14 *
15 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
16 *       it may change values between parallel regions.  __kmp_max_nth
17 *       is the largest value __kmp_nth may take, 1 is the smallest.
18 */
19
20#include "kmp.h"
21#include "kmp_error.h"
22#include "kmp_i18n.h"
23#include "kmp_itt.h"
24#include "kmp_stats.h"
25#include "kmp_str.h"
26#if KMP_USE_X87CONTROL
27#include <float.h>
28#endif
29#include "kmp_lock.h"
30#include "kmp_dispatch.h"
31#if KMP_USE_HIER_SCHED
32#include "kmp_dispatch_hier.h"
33#endif
34
35#if OMPT_SUPPORT
36#include "ompt-specific.h"
37#endif
38
39/* ------------------------------------------------------------------------ */
40/* ------------------------------------------------------------------------ */
41
42void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43  kmp_info_t *th;
44
45  KMP_DEBUG_ASSERT(gtid_ref);
46
47  if (__kmp_env_consistency_check) {
48    th = __kmp_threads[*gtid_ref];
49    if (th->th.th_root->r.r_active &&
50        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51#if KMP_USE_DYNAMIC_LOCK
52      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53#else
54      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55#endif
56    }
57  }
58}
59
60void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61  kmp_info_t *th;
62
63  if (__kmp_env_consistency_check) {
64    th = __kmp_threads[*gtid_ref];
65    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67    }
68  }
69}
70
71// Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
72static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
73                                         bool use_hier = false) {
74  // Pick up the nonmonotonic/monotonic bits from the scheduling type
75  // Nonmonotonic as default for dynamic schedule when no modifier is specified
76  int monotonicity = SCHEDULE_NONMONOTONIC;
77
78  // Let default be monotonic for executables
79  // compiled with OpenMP* 4.5 or less compilers
80  if (loc != NULL && loc->get_openmp_version() < 50)
81    monotonicity = SCHEDULE_MONOTONIC;
82
83  if (use_hier || __kmp_force_monotonic)
84    monotonicity = SCHEDULE_MONOTONIC;
85  else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
86    monotonicity = SCHEDULE_NONMONOTONIC;
87  else if (SCHEDULE_HAS_MONOTONIC(schedule))
88    monotonicity = SCHEDULE_MONOTONIC;
89
90  return monotonicity;
91}
92
93#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
94// Return floating point number rounded to two decimal points
95static inline float __kmp_round_2decimal_val(float num) {
96  return (float)(static_cast<int>(num * 100 + 0.5)) / 100;
97}
98static inline int __kmp_get_round_val(float num) {
99  return static_cast<int>(num < 0 ? num - 0.5 : num + 0.5);
100}
101#endif
102
103template <typename T>
104inline void
105__kmp_initialize_self_buffer(kmp_team_t *team, T id,
106                             dispatch_private_info_template<T> *pr,
107                             typename traits_t<T>::unsigned_t nchunks, T nproc,
108                             typename traits_t<T>::unsigned_t &init,
109                             T &small_chunk, T &extras, T &p_extra) {
110
111#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
112  if (pr->flags.use_hybrid) {
113    kmp_info_t *th = __kmp_threads[__kmp_gtid_from_tid((int)id, team)];
114    kmp_hw_core_type_t type =
115        (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
116    T pchunks = pr->u.p.pchunks;
117    T echunks = nchunks - pchunks;
118    T num_procs_with_pcore = pr->u.p.num_procs_with_pcore;
119    T num_procs_with_ecore = nproc - num_procs_with_pcore;
120    T first_thread_with_ecore = pr->u.p.first_thread_with_ecore;
121    T big_chunk =
122        pchunks / num_procs_with_pcore; // chunks per thread with p-core
123    small_chunk =
124        echunks / num_procs_with_ecore; // chunks per thread with e-core
125
126    extras =
127        (pchunks % num_procs_with_pcore) + (echunks % num_procs_with_ecore);
128
129    p_extra = (big_chunk - small_chunk);
130
131    if (type == KMP_HW_CORE_TYPE_CORE) {
132      if (id < first_thread_with_ecore) {
133        init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
134      } else {
135        init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
136               (id < extras ? id : extras);
137      }
138    } else {
139      if (id == first_thread_with_ecore) {
140        init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
141      } else {
142        init = id * small_chunk + first_thread_with_ecore * p_extra +
143               (id < extras ? id : extras);
144      }
145    }
146    p_extra = (type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;
147    return;
148  }
149#endif
150
151  small_chunk = nchunks / nproc; // chunks per thread
152  extras = nchunks % nproc;
153  p_extra = 0;
154  init = id * small_chunk + (id < extras ? id : extras);
155}
156
157#if KMP_STATIC_STEAL_ENABLED
158enum { // values for steal_flag (possible states of private per-loop buffer)
159  UNUSED = 0,
160  CLAIMED = 1, // owner thread started initialization
161  READY = 2, // available for stealing
162  THIEF = 3 // finished by owner, or claimed by thief
163  // possible state changes:
164  // 0 -> 1 owner only, sync
165  // 0 -> 3 thief only, sync
166  // 1 -> 2 owner only, async
167  // 2 -> 3 owner only, async
168  // 3 -> 2 owner only, async
169  // 3 -> 0 last thread finishing the loop, async
170};
171#endif
172
173// Initialize a dispatch_private_info_template<T> buffer for a particular
174// type of schedule,chunk.  The loop description is found in lb (lower bound),
175// ub (upper bound), and st (stride).  nproc is the number of threads relevant
176// to the scheduling (often the number of threads in a team, but not always if
177// hierarchical scheduling is used).  tid is the id of the thread calling
178// the function within the group of nproc threads.  It will have a value
179// between 0 and nproc - 1.  This is often just the thread id within a team, but
180// is not necessarily the case when using hierarchical scheduling.
181// loc is the source file location of the corresponding loop
182// gtid is the global thread id
183template <typename T>
184void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
185                                   dispatch_private_info_template<T> *pr,
186                                   enum sched_type schedule, T lb, T ub,
187                                   typename traits_t<T>::signed_t st,
188#if USE_ITT_BUILD
189                                   kmp_uint64 *cur_chunk,
190#endif
191                                   typename traits_t<T>::signed_t chunk,
192                                   T nproc, T tid) {
193  typedef typename traits_t<T>::unsigned_t UT;
194  typedef typename traits_t<T>::floating_t DBL;
195
196  int active;
197  T tc;
198  kmp_info_t *th;
199  kmp_team_t *team;
200  int monotonicity;
201  bool use_hier;
202
203#ifdef KMP_DEBUG
204  typedef typename traits_t<T>::signed_t ST;
205  {
206    char *buff;
207    // create format specifiers before the debug output
208    buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
209                            "pr:%%p lb:%%%s ub:%%%s st:%%%s "
210                            "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
211                            traits_t<T>::spec, traits_t<T>::spec,
212                            traits_t<ST>::spec, traits_t<ST>::spec,
213                            traits_t<T>::spec, traits_t<T>::spec);
214    KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
215    __kmp_str_free(&buff);
216  }
217#endif
218  /* setup data */
219  th = __kmp_threads[gtid];
220  team = th->th.th_team;
221  active = !team->t.t_serialized;
222
223#if USE_ITT_BUILD
224  int itt_need_metadata_reporting =
225      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
226      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
227      team->t.t_active_level == 1;
228#endif
229
230#if KMP_USE_HIER_SCHED
231  use_hier = pr->flags.use_hier;
232#else
233  use_hier = false;
234#endif
235
236  /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
237  monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
238  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
239
240  /* Pick up the nomerge/ordered bits from the scheduling type */
241  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
242    pr->flags.nomerge = TRUE;
243    schedule =
244        (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
245  } else {
246    pr->flags.nomerge = FALSE;
247  }
248  pr->type_size = traits_t<T>::type_size; // remember the size of variables
249  if (kmp_ord_lower & schedule) {
250    pr->flags.ordered = TRUE;
251    schedule =
252        (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
253  } else {
254    pr->flags.ordered = FALSE;
255  }
256  // Ordered overrides nonmonotonic
257  if (pr->flags.ordered) {
258    monotonicity = SCHEDULE_MONOTONIC;
259  }
260
261  if (schedule == kmp_sch_static) {
262    schedule = __kmp_static;
263  } else {
264    if (schedule == kmp_sch_runtime) {
265      // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
266      // not specified)
267      schedule = team->t.t_sched.r_sched_type;
268      monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
269      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
270      if (pr->flags.ordered) // correct monotonicity for ordered loop if needed
271        monotonicity = SCHEDULE_MONOTONIC;
272      // Detail the schedule if needed (global controls are differentiated
273      // appropriately)
274      if (schedule == kmp_sch_guided_chunked) {
275        schedule = __kmp_guided;
276      } else if (schedule == kmp_sch_static) {
277        schedule = __kmp_static;
278      }
279      // Use the chunk size specified by OMP_SCHEDULE (or default if not
280      // specified)
281      chunk = team->t.t_sched.chunk;
282#if USE_ITT_BUILD
283      if (cur_chunk)
284        *cur_chunk = chunk;
285#endif
286#ifdef KMP_DEBUG
287      {
288        char *buff;
289        // create format specifiers before the debug output
290        buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
291                                "schedule:%%d chunk:%%%s\n",
292                                traits_t<ST>::spec);
293        KD_TRACE(10, (buff, gtid, schedule, chunk));
294        __kmp_str_free(&buff);
295      }
296#endif
297    } else {
298      if (schedule == kmp_sch_guided_chunked) {
299        schedule = __kmp_guided;
300      }
301      if (chunk <= 0) {
302        chunk = KMP_DEFAULT_CHUNK;
303      }
304    }
305
306    if (schedule == kmp_sch_auto) {
307      // mapping and differentiation: in the __kmp_do_serial_initialize()
308      schedule = __kmp_auto;
309#ifdef KMP_DEBUG
310      {
311        char *buff;
312        // create format specifiers before the debug output
313        buff = __kmp_str_format(
314            "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
315            "schedule:%%d chunk:%%%s\n",
316            traits_t<ST>::spec);
317        KD_TRACE(10, (buff, gtid, schedule, chunk));
318        __kmp_str_free(&buff);
319      }
320#endif
321    }
322#if KMP_STATIC_STEAL_ENABLED
323    // map nonmonotonic:dynamic to static steal
324    if (schedule == kmp_sch_dynamic_chunked) {
325      if (monotonicity == SCHEDULE_NONMONOTONIC)
326        schedule = kmp_sch_static_steal;
327    }
328#endif
329    /* guided analytical not safe for too many threads */
330    if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
331      schedule = kmp_sch_guided_iterative_chunked;
332      KMP_WARNING(DispatchManyThreads);
333    }
334    if (schedule == kmp_sch_runtime_simd) {
335      // compiler provides simd_width in the chunk parameter
336      schedule = team->t.t_sched.r_sched_type;
337      monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
338      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
339      // Detail the schedule if needed (global controls are differentiated
340      // appropriately)
341      if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
342          schedule == __kmp_static) {
343        schedule = kmp_sch_static_balanced_chunked;
344      } else {
345        if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
346          schedule = kmp_sch_guided_simd;
347        }
348        chunk = team->t.t_sched.chunk * chunk;
349      }
350#if USE_ITT_BUILD
351      if (cur_chunk)
352        *cur_chunk = chunk;
353#endif
354#ifdef KMP_DEBUG
355      {
356        char *buff;
357        // create format specifiers before the debug output
358        buff = __kmp_str_format(
359            "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
360            " chunk:%%%s\n",
361            traits_t<ST>::spec);
362        KD_TRACE(10, (buff, gtid, schedule, chunk));
363        __kmp_str_free(&buff);
364      }
365#endif
366    }
367    pr->u.p.parm1 = chunk;
368  }
369  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
370              "unknown scheduling type");
371
372  pr->u.p.count = 0;
373
374  if (__kmp_env_consistency_check) {
375    if (st == 0) {
376      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
377                            (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
378    }
379  }
380  // compute trip count
381  if (st == 1) { // most common case
382    if (ub >= lb) {
383      tc = ub - lb + 1;
384    } else { // ub < lb
385      tc = 0; // zero-trip
386    }
387  } else if (st < 0) {
388    if (lb >= ub) {
389      // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
390      // where the division needs to be unsigned regardless of the result type
391      tc = (UT)(lb - ub) / (-st) + 1;
392    } else { // lb < ub
393      tc = 0; // zero-trip
394    }
395  } else { // st > 0
396    if (ub >= lb) {
397      // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
398      // where the division needs to be unsigned regardless of the result type
399      tc = (UT)(ub - lb) / st + 1;
400    } else { // ub < lb
401      tc = 0; // zero-trip
402    }
403  }
404
405#if KMP_STATS_ENABLED
406  if (KMP_MASTER_GTID(gtid)) {
407    KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
408  }
409#endif
410
411  pr->u.p.lb = lb;
412  pr->u.p.ub = ub;
413  pr->u.p.st = st;
414  pr->u.p.tc = tc;
415
416#if KMP_OS_WINDOWS
417  pr->u.p.last_upper = ub + st;
418#endif /* KMP_OS_WINDOWS */
419
420  /* NOTE: only the active parallel region(s) has active ordered sections */
421
422  if (active) {
423    if (pr->flags.ordered) {
424      pr->ordered_bumped = 0;
425      pr->u.p.ordered_lower = 1;
426      pr->u.p.ordered_upper = 0;
427    }
428  }
429
430  switch (schedule) {
431#if KMP_STATIC_STEAL_ENABLED
432  case kmp_sch_static_steal: {
433    T ntc, init = 0;
434
435    KD_TRACE(100,
436             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
437              gtid));
438
439    ntc = (tc % chunk ? 1 : 0) + tc / chunk;
440    if (nproc > 1 && ntc >= nproc) {
441      KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
442      T id = tid;
443      T small_chunk, extras, p_extra = 0;
444      kmp_uint32 old = UNUSED;
445      int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);
446      if (traits_t<T>::type_size > 4) {
447        // AC: TODO: check if 16-byte CAS available and use it to
448        // improve performance (probably wait for explicit request
449        // before spending time on this).
450        // For now use dynamically allocated per-private-buffer lock,
451        // free memory in __kmp_dispatch_next when status==0.
452        pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
453        __kmp_init_lock(pr->u.p.steal_lock);
454      }
455
456#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
457      // Iterations are divided in a 60/40 skewed distribution among CORE and
458      // ATOM processors for hybrid systems
459      bool use_hybrid = false;
460      kmp_hw_core_type_t core_type = KMP_HW_CORE_TYPE_UNKNOWN;
461      T first_thread_with_ecore = 0;
462      T num_procs_with_pcore = 0;
463      T num_procs_with_ecore = 0;
464      T p_ntc = 0, e_ntc = 0;
465      if (__kmp_is_hybrid_cpu() && __kmp_affinity.type != affinity_none &&
466          __kmp_affinity.type != affinity_explicit) {
467        use_hybrid = true;
468        core_type = (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
469        if (core_type != KMP_HW_CORE_TYPE_UNKNOWN &&
470            __kmp_first_osid_with_ecore > -1) {
471          for (int i = 0; i < team->t.t_nproc; ++i) {
472            kmp_hw_core_type_t type = (kmp_hw_core_type_t)team->t.t_threads[i]
473                                          ->th.th_topology_attrs.core_type;
474            int id = team->t.t_threads[i]->th.th_topology_ids.os_id;
475            if (id == __kmp_first_osid_with_ecore) {
476              first_thread_with_ecore =
477                  team->t.t_threads[i]->th.th_info.ds.ds_tid;
478            }
479            if (type == KMP_HW_CORE_TYPE_CORE) {
480              num_procs_with_pcore++;
481            } else if (type == KMP_HW_CORE_TYPE_ATOM) {
482              num_procs_with_ecore++;
483            } else {
484              use_hybrid = false;
485              break;
486            }
487          }
488        }
489        if (num_procs_with_pcore > 0 && num_procs_with_ecore > 0) {
490          float multiplier = 60.0 / 40.0;
491          float p_ratio = (float)num_procs_with_pcore / nproc;
492          float e_ratio = (float)num_procs_with_ecore / nproc;
493          float e_multiplier =
494              (float)1 /
495              (((multiplier * num_procs_with_pcore) / nproc) + e_ratio);
496          float p_multiplier = multiplier * e_multiplier;
497          p_ntc = __kmp_get_round_val(ntc * p_ratio * p_multiplier);
498          if ((int)p_ntc > (int)(ntc * p_ratio * p_multiplier))
499            e_ntc =
500                (int)(__kmp_round_2decimal_val(ntc * e_ratio * e_multiplier));
501          else
502            e_ntc = __kmp_get_round_val(ntc * e_ratio * e_multiplier);
503          KMP_DEBUG_ASSERT(ntc == p_ntc + e_ntc);
504
505          // Use regular static steal if not enough chunks for skewed
506          // distribution
507          use_hybrid = (use_hybrid && (p_ntc >= num_procs_with_pcore &&
508                                       e_ntc >= num_procs_with_ecore)
509                            ? true
510                            : false);
511        } else {
512          use_hybrid = false;
513        }
514      }
515      pr->flags.use_hybrid = use_hybrid;
516      pr->u.p.pchunks = p_ntc;
517      pr->u.p.num_procs_with_pcore = num_procs_with_pcore;
518      pr->u.p.first_thread_with_ecore = first_thread_with_ecore;
519
520      if (use_hybrid) {
521        KMP_DEBUG_ASSERT(nproc == num_procs_with_pcore + num_procs_with_ecore);
522        T big_chunk = p_ntc / num_procs_with_pcore;
523        small_chunk = e_ntc / num_procs_with_ecore;
524
525        extras =
526            (p_ntc % num_procs_with_pcore) + (e_ntc % num_procs_with_ecore);
527
528        p_extra = (big_chunk - small_chunk);
529
530        if (core_type == KMP_HW_CORE_TYPE_CORE) {
531          if (id < first_thread_with_ecore) {
532            init =
533                id * small_chunk + id * p_extra + (id < extras ? id : extras);
534          } else {
535            init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
536                   (id < extras ? id : extras);
537          }
538        } else {
539          if (id == first_thread_with_ecore) {
540            init =
541                id * small_chunk + id * p_extra + (id < extras ? id : extras);
542          } else {
543            init = id * small_chunk + first_thread_with_ecore * p_extra +
544                   (id < extras ? id : extras);
545          }
546        }
547        p_extra = (core_type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;
548      } else
549#endif
550      {
551        small_chunk = ntc / nproc;
552        extras = ntc % nproc;
553        init = id * small_chunk + (id < extras ? id : extras);
554        p_extra = 0;
555      }
556      pr->u.p.count = init;
557      if (claimed) { // are we succeeded in claiming own buffer?
558        pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
559        // Other threads will inspect steal_flag when searching for a victim.
560        // READY means other threads may steal from this thread from now on.
561        KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
562      } else {
563        // other thread has stolen whole our range
564        KMP_DEBUG_ASSERT(pr->steal_flag == THIEF);
565        pr->u.p.ub = init; // mark there is no iterations to work on
566      }
567      pr->u.p.parm2 = ntc; // save number of chunks
568      // parm3 is the number of times to attempt stealing which is
569      // nproc (just a heuristics, could be optimized later on).
570      pr->u.p.parm3 = nproc;
571      pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
572      break;
573    } else {
574      /* too few chunks: switching to kmp_sch_dynamic_chunked */
575      schedule = kmp_sch_dynamic_chunked;
576      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
577                     "kmp_sch_dynamic_chunked\n",
578                     gtid));
579      goto dynamic_init;
580      break;
581    } // if
582  } // case
583#endif
584  case kmp_sch_static_balanced: {
585    T init, limit;
586
587    KD_TRACE(
588        100,
589        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
590         gtid));
591
592    if (nproc > 1) {
593      T id = tid;
594
595      if (tc < nproc) {
596        if (id < tc) {
597          init = id;
598          limit = id;
599          pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
600        } else {
601          pr->u.p.count = 1; /* means no more chunks to execute */
602          pr->u.p.parm1 = FALSE;
603          break;
604        }
605      } else {
606        T small_chunk = tc / nproc;
607        T extras = tc % nproc;
608        init = id * small_chunk + (id < extras ? id : extras);
609        limit = init + small_chunk - (id < extras ? 0 : 1);
610        pr->u.p.parm1 = (id == nproc - 1);
611      }
612    } else {
613      if (tc > 0) {
614        init = 0;
615        limit = tc - 1;
616        pr->u.p.parm1 = TRUE;
617      } else {
618        // zero trip count
619        pr->u.p.count = 1; /* means no more chunks to execute */
620        pr->u.p.parm1 = FALSE;
621        break;
622      }
623    }
624#if USE_ITT_BUILD
625    // Calculate chunk for metadata report
626    if (itt_need_metadata_reporting)
627      if (cur_chunk)
628        *cur_chunk = limit - init + 1;
629#endif
630    if (st == 1) {
631      pr->u.p.lb = lb + init;
632      pr->u.p.ub = lb + limit;
633    } else {
634      // calculated upper bound, "ub" is user-defined upper bound
635      T ub_tmp = lb + limit * st;
636      pr->u.p.lb = lb + init * st;
637      // adjust upper bound to "ub" if needed, so that MS lastprivate will match
638      // it exactly
639      if (st > 0) {
640        pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
641      } else {
642        pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
643      }
644    }
645    if (pr->flags.ordered) {
646      pr->u.p.ordered_lower = init;
647      pr->u.p.ordered_upper = limit;
648    }
649    break;
650  } // case
651  case kmp_sch_static_balanced_chunked: {
652    // similar to balanced, but chunk adjusted to multiple of simd width
653    T nth = nproc;
654    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
655                   " -> falling-through to static_greedy\n",
656                   gtid));
657    schedule = kmp_sch_static_greedy;
658    if (nth > 1)
659      pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
660    else
661      pr->u.p.parm1 = tc;
662    break;
663  } // case
664  case kmp_sch_guided_simd:
665  case kmp_sch_guided_iterative_chunked: {
666    KD_TRACE(
667        100,
668        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
669         " case\n",
670         gtid));
671
672    if (nproc > 1) {
673      if ((2L * chunk + 1) * nproc >= tc) {
674        /* chunk size too large, switch to dynamic */
675        schedule = kmp_sch_dynamic_chunked;
676        goto dynamic_init;
677      } else {
678        // when remaining iters become less than parm2 - switch to dynamic
679        pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
680        *(double *)&pr->u.p.parm3 =
681            guided_flt_param / (double)nproc; // may occupy parm3 and parm4
682      }
683    } else {
684      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
685                     "kmp_sch_static_greedy\n",
686                     gtid));
687      schedule = kmp_sch_static_greedy;
688      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
689      KD_TRACE(
690          100,
691          ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
692           gtid));
693      pr->u.p.parm1 = tc;
694    } // if
695  } // case
696  break;
697  case kmp_sch_guided_analytical_chunked: {
698    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
699                   "kmp_sch_guided_analytical_chunked case\n",
700                   gtid));
701
702    if (nproc > 1) {
703      if ((2L * chunk + 1) * nproc >= tc) {
704        /* chunk size too large, switch to dynamic */
705        schedule = kmp_sch_dynamic_chunked;
706        goto dynamic_init;
707      } else {
708        /* commonly used term: (2 nproc - 1)/(2 nproc) */
709        DBL x;
710
711#if KMP_USE_X87CONTROL
712        /* Linux* OS already has 64-bit computation by default for long double,
713           and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
714           Windows* OS on IA-32 architecture, we need to set precision to 64-bit
715           instead of the default 53-bit. Even though long double doesn't work
716           on Windows* OS on Intel(R) 64, the resulting lack of precision is not
717           expected to impact the correctness of the algorithm, but this has not
718           been mathematically proven. */
719        // save original FPCW and set precision to 64-bit, as
720        // Windows* OS on IA-32 architecture defaults to 53-bit
721        unsigned int oldFpcw = _control87(0, 0);
722        _control87(_PC_64, _MCW_PC); // 0,0x30000
723#endif
724        /* value used for comparison in solver for cross-over point */
725        KMP_ASSERT(tc > 0);
726        long double target = ((long double)chunk * 2 + 1) * nproc / tc;
727
728        /* crossover point--chunk indexes equal to or greater than
729           this point switch to dynamic-style scheduling */
730        UT cross;
731
732        /* commonly used term: (2 nproc - 1)/(2 nproc) */
733        x = 1.0 - 0.5 / (double)nproc;
734
735#ifdef KMP_DEBUG
736        { // test natural alignment
737          struct _test_a {
738            char a;
739            union {
740              char b;
741              DBL d;
742            };
743          } t;
744          ptrdiff_t natural_alignment =
745              (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
746          //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
747          // long)natural_alignment );
748          KMP_DEBUG_ASSERT(
749              (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
750        }
751#endif // KMP_DEBUG
752
753        /* save the term in thread private dispatch structure */
754        *(DBL *)&pr->u.p.parm3 = x;
755
756        /* solve for the crossover point to the nearest integer i for which C_i
757           <= chunk */
758        {
759          UT left, right, mid;
760          long double p;
761
762          /* estimate initial upper and lower bound */
763
764          /* doesn't matter what value right is as long as it is positive, but
765             it affects performance of the solver */
766          right = 229;
767          p = __kmp_pow<UT>(x, right);
768          if (p > target) {
769            do {
770              p *= p;
771              right <<= 1;
772            } while (p > target && right < (1 << 27));
773            /* lower bound is previous (failed) estimate of upper bound */
774            left = right >> 1;
775          } else {
776            left = 0;
777          }
778
779          /* bisection root-finding method */
780          while (left + 1 < right) {
781            mid = (left + right) / 2;
782            if (__kmp_pow<UT>(x, mid) > target) {
783              left = mid;
784            } else {
785              right = mid;
786            }
787          } // while
788          cross = right;
789        }
790        /* assert sanity of computed crossover point */
791        KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
792                   __kmp_pow<UT>(x, cross) <= target);
793
794        /* save the crossover point in thread private dispatch structure */
795        pr->u.p.parm2 = cross;
796
797// C75803
798#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
799#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
800#else
801#define GUIDED_ANALYTICAL_WORKAROUND (x)
802#endif
803        /* dynamic-style scheduling offset */
804        pr->u.p.count = tc -
805                        __kmp_dispatch_guided_remaining(
806                            tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
807                        cross * chunk;
808#if KMP_USE_X87CONTROL
809        // restore FPCW
810        _control87(oldFpcw, _MCW_PC);
811#endif
812      } // if
813    } else {
814      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
815                     "kmp_sch_static_greedy\n",
816                     gtid));
817      schedule = kmp_sch_static_greedy;
818      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
819      pr->u.p.parm1 = tc;
820    } // if
821  } // case
822  break;
823  case kmp_sch_static_greedy:
824    KD_TRACE(
825        100,
826        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
827         gtid));
828    pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
829    break;
830  case kmp_sch_static_chunked:
831  case kmp_sch_dynamic_chunked:
832  dynamic_init:
833    if (tc == 0)
834      break;
835    if (pr->u.p.parm1 <= 0)
836      pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
837    else if (pr->u.p.parm1 > tc)
838      pr->u.p.parm1 = tc;
839    // Store the total number of chunks to prevent integer overflow during
840    // bounds calculations in the get next chunk routine.
841    pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? 1 : 0);
842    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
843                   "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
844                   gtid));
845    break;
846  case kmp_sch_trapezoidal: {
847    /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
848
849    T parm1, parm2, parm3, parm4;
850    KD_TRACE(100,
851             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
852              gtid));
853
854    parm1 = chunk;
855
856    /* F : size of the first cycle */
857    parm2 = (tc / (2 * nproc));
858
859    if (parm2 < 1) {
860      parm2 = 1;
861    }
862
863    /* L : size of the last cycle.  Make sure the last cycle is not larger
864       than the first cycle. */
865    if (parm1 < 1) {
866      parm1 = 1;
867    } else if (parm1 > parm2) {
868      parm1 = parm2;
869    }
870
871    /* N : number of cycles */
872    parm3 = (parm2 + parm1);
873    parm3 = (2 * tc + parm3 - 1) / parm3;
874
875    if (parm3 < 2) {
876      parm3 = 2;
877    }
878
879    /* sigma : decreasing incr of the trapezoid */
880    parm4 = (parm3 - 1);
881    parm4 = (parm2 - parm1) / parm4;
882
883    // pointless check, because parm4 >= 0 always
884    // if ( parm4 < 0 ) {
885    //    parm4 = 0;
886    //}
887
888    pr->u.p.parm1 = parm1;
889    pr->u.p.parm2 = parm2;
890    pr->u.p.parm3 = parm3;
891    pr->u.p.parm4 = parm4;
892  } // case
893  break;
894
895  default: {
896    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
897                KMP_HNT(GetNewerLibrary), // Hint
898                __kmp_msg_null // Variadic argument list terminator
899    );
900  } break;
901  } // switch
902  pr->schedule = schedule;
903}
904
905#if KMP_USE_HIER_SCHED
906template <typename T>
907inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
908                                             typename traits_t<T>::signed_t st);
909template <>
910inline void
911__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
912                                            kmp_int32 ub, kmp_int32 st) {
913  __kmp_dispatch_init_hierarchy<kmp_int32>(
914      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
915      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
916}
917template <>
918inline void
919__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
920                                             kmp_uint32 ub, kmp_int32 st) {
921  __kmp_dispatch_init_hierarchy<kmp_uint32>(
922      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
923      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
924}
925template <>
926inline void
927__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
928                                            kmp_int64 ub, kmp_int64 st) {
929  __kmp_dispatch_init_hierarchy<kmp_int64>(
930      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
931      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
932}
933template <>
934inline void
935__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
936                                             kmp_uint64 ub, kmp_int64 st) {
937  __kmp_dispatch_init_hierarchy<kmp_uint64>(
938      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
939      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
940}
941
942// free all the hierarchy scheduling memory associated with the team
943void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
944  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
945  for (int i = 0; i < num_disp_buff; ++i) {
946    // type does not matter here so use kmp_int32
947    auto sh =
948        reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
949            &team->t.t_disp_buffer[i]);
950    if (sh->hier) {
951      sh->hier->deallocate();
952      __kmp_free(sh->hier);
953    }
954  }
955}
956#endif
957
958// UT - unsigned flavor of T, ST - signed flavor of T,
959// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
960template <typename T>
961static void
962__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
963                    T ub, typename traits_t<T>::signed_t st,
964                    typename traits_t<T>::signed_t chunk, int push_ws) {
965  typedef typename traits_t<T>::unsigned_t UT;
966
967  int active;
968  kmp_info_t *th;
969  kmp_team_t *team;
970  kmp_uint32 my_buffer_index;
971  dispatch_private_info_template<T> *pr;
972  dispatch_shared_info_template<T> volatile *sh;
973
974  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
975                   sizeof(dispatch_private_info));
976  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
977                   sizeof(dispatch_shared_info));
978  __kmp_assert_valid_gtid(gtid);
979
980  if (!TCR_4(__kmp_init_parallel))
981    __kmp_parallel_initialize();
982
983  __kmp_resume_if_soft_paused();
984
985#if INCLUDE_SSC_MARKS
986  SSC_MARK_DISPATCH_INIT();
987#endif
988#ifdef KMP_DEBUG
989  typedef typename traits_t<T>::signed_t ST;
990  {
991    char *buff;
992    // create format specifiers before the debug output
993    buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
994                            "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
995                            traits_t<ST>::spec, traits_t<T>::spec,
996                            traits_t<T>::spec, traits_t<ST>::spec);
997    KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
998    __kmp_str_free(&buff);
999  }
1000#endif
1001  /* setup data */
1002  th = __kmp_threads[gtid];
1003  team = th->th.th_team;
1004  active = !team->t.t_serialized;
1005  th->th.th_ident = loc;
1006
1007  // Any half-decent optimizer will remove this test when the blocks are empty
1008  // since the macros expand to nothing
1009  // when statistics are disabled.
1010  if (schedule == __kmp_static) {
1011    KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
1012  } else {
1013    KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
1014  }
1015
1016#if KMP_USE_HIER_SCHED
1017  // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable
1018  // Hierarchical scheduling does not work with ordered, so if ordered is
1019  // detected, then revert back to threaded scheduling.
1020  bool ordered;
1021  enum sched_type my_sched = schedule;
1022  my_buffer_index = th->th.th_dispatch->th_disp_index;
1023  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1024      &th->th.th_dispatch
1025           ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
1026  my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
1027  if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
1028    my_sched =
1029        (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
1030  ordered = (kmp_ord_lower & my_sched);
1031  if (pr->flags.use_hier) {
1032    if (ordered) {
1033      KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected.  "
1034                     "Disabling hierarchical scheduling.\n",
1035                     gtid));
1036      pr->flags.use_hier = FALSE;
1037    }
1038  }
1039  if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
1040    // Don't use hierarchical for ordered parallel loops and don't
1041    // use the runtime hierarchy if one was specified in the program
1042    if (!ordered && !pr->flags.use_hier)
1043      __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
1044  }
1045#endif // KMP_USE_HIER_SCHED
1046
1047#if USE_ITT_BUILD
1048  kmp_uint64 cur_chunk = chunk;
1049  int itt_need_metadata_reporting =
1050      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
1051      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
1052      team->t.t_active_level == 1;
1053#endif
1054  if (!active) {
1055    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1056        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1057  } else {
1058    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1059                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1060
1061    my_buffer_index = th->th.th_dispatch->th_disp_index++;
1062
1063    /* What happens when number of threads changes, need to resize buffer? */
1064    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1065        &th->th.th_dispatch
1066             ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
1067    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
1068        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
1069    KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
1070                  my_buffer_index));
1071    if (sh->buffer_index != my_buffer_index) { // too many loops in progress?
1072      KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d"
1073                     " sh->buffer_index:%d\n",
1074                     gtid, my_buffer_index, sh->buffer_index));
1075      __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
1076                             __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
1077      // Note: KMP_WAIT() cannot be used there: buffer index and
1078      // my_buffer_index are *always* 32-bit integers.
1079      KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
1080                     "sh->buffer_index:%d\n",
1081                     gtid, my_buffer_index, sh->buffer_index));
1082    }
1083  }
1084
1085  __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
1086#if USE_ITT_BUILD
1087                                &cur_chunk,
1088#endif
1089                                chunk, (T)th->th.th_team_nproc,
1090                                (T)th->th.th_info.ds.ds_tid);
1091  if (active) {
1092    if (pr->flags.ordered == 0) {
1093      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
1094      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
1095    } else {
1096      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
1097      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
1098    }
1099    th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
1100    th->th.th_dispatch->th_dispatch_sh_current =
1101        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
1102#if USE_ITT_BUILD
1103    if (pr->flags.ordered) {
1104      __kmp_itt_ordered_init(gtid);
1105    }
1106    // Report loop metadata
1107    if (itt_need_metadata_reporting) {
1108      // Only report metadata by primary thread of active team at level 1
1109      kmp_uint64 schedtype = 0;
1110      switch (schedule) {
1111      case kmp_sch_static_chunked:
1112      case kmp_sch_static_balanced: // Chunk is calculated in the switch above
1113        break;
1114      case kmp_sch_static_greedy:
1115        cur_chunk = pr->u.p.parm1;
1116        break;
1117      case kmp_sch_dynamic_chunked:
1118        schedtype = 1;
1119        break;
1120      case kmp_sch_guided_iterative_chunked:
1121      case kmp_sch_guided_analytical_chunked:
1122      case kmp_sch_guided_simd:
1123        schedtype = 2;
1124        break;
1125      default:
1126        // Should we put this case under "static"?
1127        // case kmp_sch_static_steal:
1128        schedtype = 3;
1129        break;
1130      }
1131      __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
1132    }
1133#if KMP_USE_HIER_SCHED
1134    if (pr->flags.use_hier) {
1135      pr->u.p.count = 0;
1136      pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
1137    }
1138#endif // KMP_USER_HIER_SCHED
1139#endif /* USE_ITT_BUILD */
1140  }
1141
1142#ifdef KMP_DEBUG
1143  {
1144    char *buff;
1145    // create format specifiers before the debug output
1146    buff = __kmp_str_format(
1147        "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
1148        "lb:%%%s ub:%%%s"
1149        " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
1150        " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1151        traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
1152        traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1153        traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
1154        traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
1155    KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
1156                  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1157                  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1158                  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
1159    __kmp_str_free(&buff);
1160  }
1161#endif
1162#if OMPT_SUPPORT && OMPT_OPTIONAL
1163  if (ompt_enabled.ompt_callback_work) {
1164    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1165    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1166    ompt_callbacks.ompt_callback(ompt_callback_work)(
1167        ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
1168        &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
1169  }
1170#endif
1171  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
1172}
1173
1174/* For ordered loops, either __kmp_dispatch_finish() should be called after
1175 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1176 * every chunk of iterations.  If the ordered section(s) were not executed
1177 * for this iteration (or every iteration in this chunk), we need to set the
1178 * ordered iteration counters so that the next thread can proceed. */
1179template <typename UT>
1180static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1181  typedef typename traits_t<UT>::signed_t ST;
1182  __kmp_assert_valid_gtid(gtid);
1183  kmp_info_t *th = __kmp_threads[gtid];
1184
1185  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1186  if (!th->th.th_team->t.t_serialized) {
1187
1188    dispatch_private_info_template<UT> *pr =
1189        reinterpret_cast<dispatch_private_info_template<UT> *>(
1190            th->th.th_dispatch->th_dispatch_pr_current);
1191    dispatch_shared_info_template<UT> volatile *sh =
1192        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1193            th->th.th_dispatch->th_dispatch_sh_current);
1194    KMP_DEBUG_ASSERT(pr);
1195    KMP_DEBUG_ASSERT(sh);
1196    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1197                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1198
1199    if (pr->ordered_bumped) {
1200      KD_TRACE(
1201          1000,
1202          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1203           gtid));
1204      pr->ordered_bumped = 0;
1205    } else {
1206      UT lower = pr->u.p.ordered_lower;
1207
1208#ifdef KMP_DEBUG
1209      {
1210        char *buff;
1211        // create format specifiers before the debug output
1212        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1213                                "ordered_iteration:%%%s lower:%%%s\n",
1214                                traits_t<UT>::spec, traits_t<UT>::spec);
1215        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1216        __kmp_str_free(&buff);
1217      }
1218#endif
1219
1220      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1221                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1222      KMP_MB(); /* is this necessary? */
1223#ifdef KMP_DEBUG
1224      {
1225        char *buff;
1226        // create format specifiers before the debug output
1227        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1228                                "ordered_iteration:%%%s lower:%%%s\n",
1229                                traits_t<UT>::spec, traits_t<UT>::spec);
1230        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1231        __kmp_str_free(&buff);
1232      }
1233#endif
1234
1235      test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1236    } // if
1237  } // if
1238  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1239}
1240
1241#ifdef KMP_GOMP_COMPAT
1242
1243template <typename UT>
1244static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1245  typedef typename traits_t<UT>::signed_t ST;
1246  __kmp_assert_valid_gtid(gtid);
1247  kmp_info_t *th = __kmp_threads[gtid];
1248
1249  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1250  if (!th->th.th_team->t.t_serialized) {
1251    dispatch_private_info_template<UT> *pr =
1252        reinterpret_cast<dispatch_private_info_template<UT> *>(
1253            th->th.th_dispatch->th_dispatch_pr_current);
1254    dispatch_shared_info_template<UT> volatile *sh =
1255        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1256            th->th.th_dispatch->th_dispatch_sh_current);
1257    KMP_DEBUG_ASSERT(pr);
1258    KMP_DEBUG_ASSERT(sh);
1259    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1260                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1261
1262    UT lower = pr->u.p.ordered_lower;
1263    UT upper = pr->u.p.ordered_upper;
1264    UT inc = upper - lower + 1;
1265
1266    if (pr->ordered_bumped == inc) {
1267      KD_TRACE(
1268          1000,
1269          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1270           gtid));
1271      pr->ordered_bumped = 0;
1272    } else {
1273      inc -= pr->ordered_bumped;
1274
1275#ifdef KMP_DEBUG
1276      {
1277        char *buff;
1278        // create format specifiers before the debug output
1279        buff = __kmp_str_format(
1280            "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1281            "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1282            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1283        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1284        __kmp_str_free(&buff);
1285      }
1286#endif
1287
1288      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1289                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1290
1291      KMP_MB(); /* is this necessary? */
1292      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1293                      "ordered_bumped to zero\n",
1294                      gtid));
1295      pr->ordered_bumped = 0;
1296//!!!!! TODO check if the inc should be unsigned, or signed???
1297#ifdef KMP_DEBUG
1298      {
1299        char *buff;
1300        // create format specifiers before the debug output
1301        buff = __kmp_str_format(
1302            "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1303            "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1304            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1305            traits_t<UT>::spec);
1306        KD_TRACE(1000,
1307                 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1308        __kmp_str_free(&buff);
1309      }
1310#endif
1311
1312      test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1313    }
1314    //        }
1315  }
1316  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1317}
1318
1319#endif /* KMP_GOMP_COMPAT */
1320
1321template <typename T>
1322int __kmp_dispatch_next_algorithm(int gtid,
1323                                  dispatch_private_info_template<T> *pr,
1324                                  dispatch_shared_info_template<T> volatile *sh,
1325                                  kmp_int32 *p_last, T *p_lb, T *p_ub,
1326                                  typename traits_t<T>::signed_t *p_st, T nproc,
1327                                  T tid) {
1328  typedef typename traits_t<T>::unsigned_t UT;
1329  typedef typename traits_t<T>::signed_t ST;
1330  typedef typename traits_t<T>::floating_t DBL;
1331  int status = 0;
1332  bool last = false;
1333  T start;
1334  ST incr;
1335  UT limit, trip, init;
1336  kmp_info_t *th = __kmp_threads[gtid];
1337  kmp_team_t *team = th->th.th_team;
1338
1339  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1340                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1341  KMP_DEBUG_ASSERT(pr);
1342  KMP_DEBUG_ASSERT(sh);
1343  KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1344#ifdef KMP_DEBUG
1345  {
1346    char *buff;
1347    // create format specifiers before the debug output
1348    buff =
1349        __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1350                         "sh:%%p nproc:%%%s tid:%%%s\n",
1351                         traits_t<T>::spec, traits_t<T>::spec);
1352    KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1353    __kmp_str_free(&buff);
1354  }
1355#endif
1356
1357  // zero trip count
1358  if (pr->u.p.tc == 0) {
1359    KD_TRACE(10,
1360             ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1361              "zero status:%d\n",
1362              gtid, status));
1363    return 0;
1364  }
1365
1366  switch (pr->schedule) {
1367#if KMP_STATIC_STEAL_ENABLED
1368  case kmp_sch_static_steal: {
1369    T chunk = pr->u.p.parm1;
1370    UT nchunks = pr->u.p.parm2;
1371    KD_TRACE(100,
1372             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1373              gtid));
1374
1375    trip = pr->u.p.tc - 1;
1376
1377    if (traits_t<T>::type_size > 4) {
1378      // use lock for 8-byte induction variable.
1379      // TODO (optional): check presence and use 16-byte CAS
1380      kmp_lock_t *lck = pr->u.p.steal_lock;
1381      KMP_DEBUG_ASSERT(lck != NULL);
1382      if (pr->u.p.count < (UT)pr->u.p.ub) {
1383        KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1384        __kmp_acquire_lock(lck, gtid);
1385        // try to get own chunk of iterations
1386        init = (pr->u.p.count)++;
1387        status = (init < (UT)pr->u.p.ub);
1388        __kmp_release_lock(lck, gtid);
1389      } else {
1390        status = 0; // no own chunks
1391      }
1392      if (!status) { // try to steal
1393        kmp_lock_t *lckv; // victim buffer's lock
1394        T while_limit = pr->u.p.parm3;
1395        T while_index = 0;
1396        int idx = (th->th.th_dispatch->th_disp_index - 1) %
1397                  __kmp_dispatch_num_buffers; // current loop index
1398        // note: victim thread can potentially execute another loop
1399        KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1400        while ((!status) && (while_limit != ++while_index)) {
1401          dispatch_private_info_template<T> *v;
1402          T remaining;
1403          T victimId = pr->u.p.parm4;
1404          T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1405          v = reinterpret_cast<dispatch_private_info_template<T> *>(
1406              &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1407          KMP_DEBUG_ASSERT(v);
1408          while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1409                 oldVictimId != victimId) {
1410            victimId = (victimId + 1) % nproc;
1411            v = reinterpret_cast<dispatch_private_info_template<T> *>(
1412                &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1413            KMP_DEBUG_ASSERT(v);
1414          }
1415          if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1416            continue; // try once more (nproc attempts in total)
1417          }
1418          if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1419            kmp_uint32 old = UNUSED;
1420            // try to steal whole range from inactive victim
1421            status = v->steal_flag.compare_exchange_strong(old, THIEF);
1422            if (status) {
1423              // initialize self buffer with victim's whole range of chunks
1424              T id = victimId;
1425              T small_chunk = 0, extras = 0, p_extra = 0;
1426              __kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,
1427                                              init, small_chunk, extras,
1428                                              p_extra);
1429              __kmp_acquire_lock(lck, gtid);
1430              pr->u.p.count = init + 1; // exclude one we execute immediately
1431              pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
1432              __kmp_release_lock(lck, gtid);
1433              pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
1434              // no need to reinitialize other thread invariants: lb, st, etc.
1435#ifdef KMP_DEBUG
1436              {
1437                char *buff;
1438                // create format specifiers before the debug output
1439                buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1440                                        "stolen chunks from T#%%d, "
1441                                        "count:%%%s ub:%%%s\n",
1442                                        traits_t<UT>::spec, traits_t<T>::spec);
1443                KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1444                __kmp_str_free(&buff);
1445              }
1446#endif
1447              // activate non-empty buffer and let others steal from us
1448              if (pr->u.p.count < (UT)pr->u.p.ub)
1449                KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1450              break;
1451            }
1452          }
1453          if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
1454              v->u.p.count >= (UT)v->u.p.ub) {
1455            pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
1456            continue; // no chunks to steal, try next victim
1457          }
1458          lckv = v->u.p.steal_lock;
1459          KMP_ASSERT(lckv != NULL);
1460          __kmp_acquire_lock(lckv, gtid);
1461          limit = v->u.p.ub; // keep initial ub
1462          if (v->u.p.count >= limit) {
1463            __kmp_release_lock(lckv, gtid);
1464            pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
1465            continue; // no chunks to steal, try next victim
1466          }
1467
1468          // stealing succeded, reduce victim's ub by 1/4 of undone chunks
1469          // TODO: is this heuristics good enough??
1470          remaining = limit - v->u.p.count;
1471          if (remaining > 7) {
1472            // steal 1/4 of remaining
1473            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1474            init = (v->u.p.ub -= (remaining >> 2));
1475          } else {
1476            // steal 1 chunk of 1..7 remaining
1477            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1478            init = (v->u.p.ub -= 1);
1479          }
1480          __kmp_release_lock(lckv, gtid);
1481#ifdef KMP_DEBUG
1482          {
1483            char *buff;
1484            // create format specifiers before the debug output
1485            buff = __kmp_str_format(
1486                "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1487                "count:%%%s ub:%%%s\n",
1488                traits_t<UT>::spec, traits_t<UT>::spec);
1489            KD_TRACE(10, (buff, gtid, victimId, init, limit));
1490            __kmp_str_free(&buff);
1491          }
1492#endif
1493          KMP_DEBUG_ASSERT(init + 1 <= limit);
1494          pr->u.p.parm4 = victimId; // remember victim to steal from
1495          status = 1;
1496          // now update own count and ub with stolen range excluding init chunk
1497          __kmp_acquire_lock(lck, gtid);
1498          pr->u.p.count = init + 1;
1499          pr->u.p.ub = limit;
1500          __kmp_release_lock(lck, gtid);
1501          // activate non-empty buffer and let others steal from us
1502          if (init + 1 < limit)
1503            KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1504        } // while (search for victim)
1505      } // if (try to find victim and steal)
1506    } else {
1507      // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1508      // as all operations on pair (count, ub) must be done atomically
1509      typedef union {
1510        struct {
1511          UT count;
1512          T ub;
1513        } p;
1514        kmp_int64 b;
1515      } union_i4;
1516      union_i4 vold, vnew;
1517      if (pr->u.p.count < (UT)pr->u.p.ub) {
1518        KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1519        vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1520        vnew.b = vold.b;
1521        vnew.p.count++; // get chunk from head of self range
1522        while (!KMP_COMPARE_AND_STORE_REL64(
1523            (volatile kmp_int64 *)&pr->u.p.count,
1524            *VOLATILE_CAST(kmp_int64 *) & vold.b,
1525            *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1526          KMP_CPU_PAUSE();
1527          vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1528          vnew.b = vold.b;
1529          vnew.p.count++;
1530        }
1531        init = vold.p.count;
1532        status = (init < (UT)vold.p.ub);
1533      } else {
1534        status = 0; // no own chunks
1535      }
1536      if (!status) { // try to steal
1537        T while_limit = pr->u.p.parm3;
1538        T while_index = 0;
1539        int idx = (th->th.th_dispatch->th_disp_index - 1) %
1540                  __kmp_dispatch_num_buffers; // current loop index
1541        // note: victim thread can potentially execute another loop
1542        KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1543        while ((!status) && (while_limit != ++while_index)) {
1544          dispatch_private_info_template<T> *v;
1545          T remaining;
1546          T victimId = pr->u.p.parm4;
1547          T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1548          v = reinterpret_cast<dispatch_private_info_template<T> *>(
1549              &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1550          KMP_DEBUG_ASSERT(v);
1551          while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1552                 oldVictimId != victimId) {
1553            victimId = (victimId + 1) % nproc;
1554            v = reinterpret_cast<dispatch_private_info_template<T> *>(
1555                &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1556            KMP_DEBUG_ASSERT(v);
1557          }
1558          if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1559            continue; // try once more (nproc attempts in total)
1560          }
1561          if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1562            kmp_uint32 old = UNUSED;
1563            // try to steal whole range from inactive victim
1564            status = v->steal_flag.compare_exchange_strong(old, THIEF);
1565            if (status) {
1566              // initialize self buffer with victim's whole range of chunks
1567              T id = victimId;
1568              T small_chunk = 0, extras = 0, p_extra = 0;
1569              __kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,
1570                                              init, small_chunk, extras,
1571                                              p_extra);
1572              vnew.p.count = init + 1;
1573              vnew.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
1574              // write pair (count, ub) at once atomically
1575#if KMP_ARCH_X86
1576              KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b);
1577#else
1578              *(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b;
1579#endif
1580              pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
1581              // no need to initialize other thread invariants: lb, st, etc.
1582#ifdef KMP_DEBUG
1583              {
1584                char *buff;
1585                // create format specifiers before the debug output
1586                buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1587                                        "stolen chunks from T#%%d, "
1588                                        "count:%%%s ub:%%%s\n",
1589                                        traits_t<UT>::spec, traits_t<T>::spec);
1590                KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1591                __kmp_str_free(&buff);
1592              }
1593#endif
1594              // activate non-empty buffer and let others steal from us
1595              if (pr->u.p.count < (UT)pr->u.p.ub)
1596                KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1597              break;
1598            }
1599          }
1600          while (1) { // CAS loop with check if victim still has enough chunks
1601            // many threads may be stealing concurrently from same victim
1602            vold.b = *(volatile kmp_int64 *)(&v->u.p.count);
1603            if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
1604                vold.p.count >= (UT)vold.p.ub) {
1605              pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim id
1606              break; // no chunks to steal, try next victim
1607            }
1608            vnew.b = vold.b;
1609            remaining = vold.p.ub - vold.p.count;
1610            // try to steal 1/4 of remaining
1611            // TODO: is this heuristics good enough??
1612            if (remaining > 7) {
1613              vnew.p.ub -= remaining >> 2; // steal from tail of victim's range
1614            } else {
1615              vnew.p.ub -= 1; // steal 1 chunk of 1..7 remaining
1616            }
1617            KMP_DEBUG_ASSERT(vnew.p.ub * (UT)chunk <= trip);
1618            if (KMP_COMPARE_AND_STORE_REL64(
1619                    (volatile kmp_int64 *)&v->u.p.count,
1620                    *VOLATILE_CAST(kmp_int64 *) & vold.b,
1621                    *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1622              // stealing succedded
1623#ifdef KMP_DEBUG
1624              {
1625                char *buff;
1626                // create format specifiers before the debug output
1627                buff = __kmp_str_format(
1628                    "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1629                    "count:%%%s ub:%%%s\n",
1630                    traits_t<T>::spec, traits_t<T>::spec);
1631                KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub));
1632                __kmp_str_free(&buff);
1633              }
1634#endif
1635              KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1636                                        vold.p.ub - vnew.p.ub);
1637              status = 1;
1638              pr->u.p.parm4 = victimId; // keep victim id
1639              // now update own count and ub
1640              init = vnew.p.ub;
1641              vold.p.count = init + 1;
1642#if KMP_ARCH_X86
1643              KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1644#else
1645              *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1646#endif
1647              // activate non-empty buffer and let others steal from us
1648              if (vold.p.count < (UT)vold.p.ub)
1649                KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1650              break;
1651            } // if (check CAS result)
1652            KMP_CPU_PAUSE(); // CAS failed, repeatedly attempt
1653          } // while (try to steal from particular victim)
1654        } // while (search for victim)
1655      } // if (try to find victim and steal)
1656    } // if (4-byte induction variable)
1657    if (!status) {
1658      *p_lb = 0;
1659      *p_ub = 0;
1660      if (p_st != NULL)
1661        *p_st = 0;
1662    } else {
1663      start = pr->u.p.lb;
1664      init *= chunk;
1665      limit = chunk + init - 1;
1666      incr = pr->u.p.st;
1667      KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1668
1669      KMP_DEBUG_ASSERT(init <= trip);
1670      // keep track of done chunks for possible early exit from stealing
1671      // TODO: count executed chunks locally with rare update of shared location
1672      // test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1673      if ((last = (limit >= trip)) != 0)
1674        limit = trip;
1675      if (p_st != NULL)
1676        *p_st = incr;
1677
1678      if (incr == 1) {
1679        *p_lb = start + init;
1680        *p_ub = start + limit;
1681      } else {
1682        *p_lb = start + init * incr;
1683        *p_ub = start + limit * incr;
1684      }
1685    } // if
1686    break;
1687  } // case
1688#endif // KMP_STATIC_STEAL_ENABLED
1689  case kmp_sch_static_balanced: {
1690    KD_TRACE(
1691        10,
1692        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1693         gtid));
1694    /* check if thread has any iteration to do */
1695    if ((status = !pr->u.p.count) != 0) {
1696      pr->u.p.count = 1;
1697      *p_lb = pr->u.p.lb;
1698      *p_ub = pr->u.p.ub;
1699      last = (pr->u.p.parm1 != 0);
1700      if (p_st != NULL)
1701        *p_st = pr->u.p.st;
1702    } else { /* no iterations to do */
1703      pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1704    }
1705  } // case
1706  break;
1707  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1708                                 merged here */
1709  case kmp_sch_static_chunked: {
1710    T parm1;
1711
1712    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1713                   "kmp_sch_static_[affinity|chunked] case\n",
1714                   gtid));
1715    parm1 = pr->u.p.parm1;
1716
1717    trip = pr->u.p.tc - 1;
1718    init = parm1 * (pr->u.p.count + tid);
1719
1720    if ((status = (init <= trip)) != 0) {
1721      start = pr->u.p.lb;
1722      incr = pr->u.p.st;
1723      limit = parm1 + init - 1;
1724
1725      if ((last = (limit >= trip)) != 0)
1726        limit = trip;
1727
1728      if (p_st != NULL)
1729        *p_st = incr;
1730
1731      pr->u.p.count += nproc;
1732
1733      if (incr == 1) {
1734        *p_lb = start + init;
1735        *p_ub = start + limit;
1736      } else {
1737        *p_lb = start + init * incr;
1738        *p_ub = start + limit * incr;
1739      }
1740
1741      if (pr->flags.ordered) {
1742        pr->u.p.ordered_lower = init;
1743        pr->u.p.ordered_upper = limit;
1744      } // if
1745    } // if
1746  } // case
1747  break;
1748
1749  case kmp_sch_dynamic_chunked: {
1750    UT chunk_number;
1751    UT chunk_size = pr->u.p.parm1;
1752    UT nchunks = pr->u.p.parm2;
1753
1754    KD_TRACE(
1755        100,
1756        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1757         gtid));
1758
1759    chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1760    status = (chunk_number < nchunks);
1761    if (!status) {
1762      *p_lb = 0;
1763      *p_ub = 0;
1764      if (p_st != NULL)
1765        *p_st = 0;
1766    } else {
1767      init = chunk_size * chunk_number;
1768      trip = pr->u.p.tc - 1;
1769      start = pr->u.p.lb;
1770      incr = pr->u.p.st;
1771
1772      if ((last = (trip - init < (UT)chunk_size)))
1773        limit = trip;
1774      else
1775        limit = chunk_size + init - 1;
1776
1777      if (p_st != NULL)
1778        *p_st = incr;
1779
1780      if (incr == 1) {
1781        *p_lb = start + init;
1782        *p_ub = start + limit;
1783      } else {
1784        *p_lb = start + init * incr;
1785        *p_ub = start + limit * incr;
1786      }
1787
1788      if (pr->flags.ordered) {
1789        pr->u.p.ordered_lower = init;
1790        pr->u.p.ordered_upper = limit;
1791      } // if
1792    } // if
1793  } // case
1794  break;
1795
1796  case kmp_sch_guided_iterative_chunked: {
1797    T chunkspec = pr->u.p.parm1;
1798    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1799                   "iterative case\n",
1800                   gtid));
1801    trip = pr->u.p.tc;
1802    // Start atomic part of calculations
1803    while (1) {
1804      ST remaining; // signed, because can be < 0
1805      init = sh->u.s.iteration; // shared value
1806      remaining = trip - init;
1807      if (remaining <= 0) { // AC: need to compare with 0 first
1808        // nothing to do, don't try atomic op
1809        status = 0;
1810        break;
1811      }
1812      if ((T)remaining <
1813          pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1814        // use dynamic-style schedule
1815        // atomically increment iterations, get old value
1816        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1817                                 (ST)chunkspec);
1818        remaining = trip - init;
1819        if (remaining <= 0) {
1820          status = 0; // all iterations got by other threads
1821        } else {
1822          // got some iterations to work on
1823          status = 1;
1824          if ((T)remaining > chunkspec) {
1825            limit = init + chunkspec - 1;
1826          } else {
1827            last = true; // the last chunk
1828            limit = init + remaining - 1;
1829          } // if
1830        } // if
1831        break;
1832      } // if
1833      limit = init + (UT)((double)remaining *
1834                          *(double *)&pr->u.p.parm3); // divide by K*nproc
1835      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1836                               (ST)init, (ST)limit)) {
1837        // CAS was successful, chunk obtained
1838        status = 1;
1839        --limit;
1840        break;
1841      } // if
1842    } // while
1843    if (status != 0) {
1844      start = pr->u.p.lb;
1845      incr = pr->u.p.st;
1846      if (p_st != NULL)
1847        *p_st = incr;
1848      *p_lb = start + init * incr;
1849      *p_ub = start + limit * incr;
1850      if (pr->flags.ordered) {
1851        pr->u.p.ordered_lower = init;
1852        pr->u.p.ordered_upper = limit;
1853      } // if
1854    } else {
1855      *p_lb = 0;
1856      *p_ub = 0;
1857      if (p_st != NULL)
1858        *p_st = 0;
1859    } // if
1860  } // case
1861  break;
1862
1863  case kmp_sch_guided_simd: {
1864    // same as iterative but curr-chunk adjusted to be multiple of given
1865    // chunk
1866    T chunk = pr->u.p.parm1;
1867    KD_TRACE(100,
1868             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1869              gtid));
1870    trip = pr->u.p.tc;
1871    // Start atomic part of calculations
1872    while (1) {
1873      ST remaining; // signed, because can be < 0
1874      init = sh->u.s.iteration; // shared value
1875      remaining = trip - init;
1876      if (remaining <= 0) { // AC: need to compare with 0 first
1877        status = 0; // nothing to do, don't try atomic op
1878        break;
1879      }
1880      KMP_DEBUG_ASSERT(chunk && init % chunk == 0);
1881      // compare with K*nproc*(chunk+1), K=2 by default
1882      if ((T)remaining < pr->u.p.parm2) {
1883        // use dynamic-style schedule
1884        // atomically increment iterations, get old value
1885        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1886                                 (ST)chunk);
1887        remaining = trip - init;
1888        if (remaining <= 0) {
1889          status = 0; // all iterations got by other threads
1890        } else {
1891          // got some iterations to work on
1892          status = 1;
1893          if ((T)remaining > chunk) {
1894            limit = init + chunk - 1;
1895          } else {
1896            last = true; // the last chunk
1897            limit = init + remaining - 1;
1898          } // if
1899        } // if
1900        break;
1901      } // if
1902      // divide by K*nproc
1903      UT span;
1904      __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3),
1905                         &span);
1906      UT rem = span % chunk;
1907      if (rem) // adjust so that span%chunk == 0
1908        span += chunk - rem;
1909      limit = init + span;
1910      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1911                               (ST)init, (ST)limit)) {
1912        // CAS was successful, chunk obtained
1913        status = 1;
1914        --limit;
1915        break;
1916      } // if
1917    } // while
1918    if (status != 0) {
1919      start = pr->u.p.lb;
1920      incr = pr->u.p.st;
1921      if (p_st != NULL)
1922        *p_st = incr;
1923      *p_lb = start + init * incr;
1924      *p_ub = start + limit * incr;
1925      if (pr->flags.ordered) {
1926        pr->u.p.ordered_lower = init;
1927        pr->u.p.ordered_upper = limit;
1928      } // if
1929    } else {
1930      *p_lb = 0;
1931      *p_ub = 0;
1932      if (p_st != NULL)
1933        *p_st = 0;
1934    } // if
1935  } // case
1936  break;
1937
1938  case kmp_sch_guided_analytical_chunked: {
1939    T chunkspec = pr->u.p.parm1;
1940    UT chunkIdx;
1941#if KMP_USE_X87CONTROL
1942    /* for storing original FPCW value for Windows* OS on
1943       IA-32 architecture 8-byte version */
1944    unsigned int oldFpcw;
1945    unsigned int fpcwSet = 0;
1946#endif
1947    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1948                   "kmp_sch_guided_analytical_chunked case\n",
1949                   gtid));
1950
1951    trip = pr->u.p.tc;
1952
1953    KMP_DEBUG_ASSERT(nproc > 1);
1954    KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1955
1956    while (1) { /* this while loop is a safeguard against unexpected zero
1957                   chunk sizes */
1958      chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1959      if (chunkIdx >= (UT)pr->u.p.parm2) {
1960        --trip;
1961        /* use dynamic-style scheduling */
1962        init = chunkIdx * chunkspec + pr->u.p.count;
1963        /* need to verify init > 0 in case of overflow in the above
1964         * calculation */
1965        if ((status = (init > 0 && init <= trip)) != 0) {
1966          limit = init + chunkspec - 1;
1967
1968          if ((last = (limit >= trip)) != 0)
1969            limit = trip;
1970        }
1971        break;
1972      } else {
1973/* use exponential-style scheduling */
1974/* The following check is to workaround the lack of long double precision on
1975   Windows* OS.
1976   This check works around the possible effect that init != 0 for chunkIdx == 0.
1977 */
1978#if KMP_USE_X87CONTROL
1979        /* If we haven't already done so, save original
1980           FPCW and set precision to 64-bit, as Windows* OS
1981           on IA-32 architecture defaults to 53-bit */
1982        if (!fpcwSet) {
1983          oldFpcw = _control87(0, 0);
1984          _control87(_PC_64, _MCW_PC);
1985          fpcwSet = 0x30000;
1986        }
1987#endif
1988        if (chunkIdx) {
1989          init = __kmp_dispatch_guided_remaining<T>(
1990              trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1991          KMP_DEBUG_ASSERT(init);
1992          init = trip - init;
1993        } else
1994          init = 0;
1995        limit = trip - __kmp_dispatch_guided_remaining<T>(
1996                           trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1997        KMP_ASSERT(init <= limit);
1998        if (init < limit) {
1999          KMP_DEBUG_ASSERT(limit <= trip);
2000          --limit;
2001          status = 1;
2002          break;
2003        } // if
2004      } // if
2005    } // while (1)
2006#if KMP_USE_X87CONTROL
2007    /* restore FPCW if necessary
2008       AC: check fpcwSet flag first because oldFpcw can be uninitialized here
2009    */
2010    if (fpcwSet && (oldFpcw & fpcwSet))
2011      _control87(oldFpcw, _MCW_PC);
2012#endif
2013    if (status != 0) {
2014      start = pr->u.p.lb;
2015      incr = pr->u.p.st;
2016      if (p_st != NULL)
2017        *p_st = incr;
2018      *p_lb = start + init * incr;
2019      *p_ub = start + limit * incr;
2020      if (pr->flags.ordered) {
2021        pr->u.p.ordered_lower = init;
2022        pr->u.p.ordered_upper = limit;
2023      }
2024    } else {
2025      *p_lb = 0;
2026      *p_ub = 0;
2027      if (p_st != NULL)
2028        *p_st = 0;
2029    }
2030  } // case
2031  break;
2032
2033  case kmp_sch_trapezoidal: {
2034    UT index;
2035    T parm2 = pr->u.p.parm2;
2036    T parm3 = pr->u.p.parm3;
2037    T parm4 = pr->u.p.parm4;
2038    KD_TRACE(100,
2039             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
2040              gtid));
2041
2042    index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
2043
2044    init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
2045    trip = pr->u.p.tc - 1;
2046
2047    if ((status = ((T)index < parm3 && init <= trip)) == 0) {
2048      *p_lb = 0;
2049      *p_ub = 0;
2050      if (p_st != NULL)
2051        *p_st = 0;
2052    } else {
2053      start = pr->u.p.lb;
2054      limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
2055      incr = pr->u.p.st;
2056
2057      if ((last = (limit >= trip)) != 0)
2058        limit = trip;
2059
2060      if (p_st != NULL)
2061        *p_st = incr;
2062
2063      if (incr == 1) {
2064        *p_lb = start + init;
2065        *p_ub = start + limit;
2066      } else {
2067        *p_lb = start + init * incr;
2068        *p_ub = start + limit * incr;
2069      }
2070
2071      if (pr->flags.ordered) {
2072        pr->u.p.ordered_lower = init;
2073        pr->u.p.ordered_upper = limit;
2074      } // if
2075    } // if
2076  } // case
2077  break;
2078  default: {
2079    status = 0; // to avoid complaints on uninitialized variable use
2080    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
2081                KMP_HNT(GetNewerLibrary), // Hint
2082                __kmp_msg_null // Variadic argument list terminator
2083    );
2084  } break;
2085  } // switch
2086  if (p_last)
2087    *p_last = last;
2088#ifdef KMP_DEBUG
2089  if (pr->flags.ordered) {
2090    char *buff;
2091    // create format specifiers before the debug output
2092    buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
2093                            "ordered_lower:%%%s ordered_upper:%%%s\n",
2094                            traits_t<UT>::spec, traits_t<UT>::spec);
2095    KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
2096    __kmp_str_free(&buff);
2097  }
2098  {
2099    char *buff;
2100    // create format specifiers before the debug output
2101    buff = __kmp_str_format(
2102        "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
2103        "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
2104        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2105    KMP_DEBUG_ASSERT(p_last);
2106    KMP_DEBUG_ASSERT(p_st);
2107    KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
2108    __kmp_str_free(&buff);
2109  }
2110#endif
2111  return status;
2112}
2113
2114/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
2115   work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
2116   is not called. */
2117#if OMPT_SUPPORT && OMPT_OPTIONAL
2118#define OMPT_LOOP_END                                                          \
2119  if (status == 0) {                                                           \
2120    if (ompt_enabled.ompt_callback_work) {                                     \
2121      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
2122      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
2123      ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
2124          ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
2125          &(task_info->task_data), 0, codeptr);                                \
2126    }                                                                          \
2127  }
2128#define OMPT_LOOP_DISPATCH(lb, ub, st, status)                                 \
2129  if (ompt_enabled.ompt_callback_dispatch && status) {                         \
2130    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);                \
2131    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);              \
2132    ompt_dispatch_chunk_t chunk;                                               \
2133    ompt_data_t instance = ompt_data_none;                                     \
2134    OMPT_GET_DISPATCH_CHUNK(chunk, lb, ub, st);                                \
2135    instance.ptr = &chunk;                                                     \
2136    ompt_callbacks.ompt_callback(ompt_callback_dispatch)(                      \
2137        &(team_info->parallel_data), &(task_info->task_data),                  \
2138        ompt_dispatch_ws_loop_chunk, instance);                                \
2139  }
2140// TODO: implement count
2141#else
2142#define OMPT_LOOP_END // no-op
2143#define OMPT_LOOP_DISPATCH(lb, ub, st, status) // no-op
2144#endif
2145
2146#if KMP_STATS_ENABLED
2147#define KMP_STATS_LOOP_END                                                     \
2148  {                                                                            \
2149    kmp_int64 u, l, t, i;                                                      \
2150    l = (kmp_int64)(*p_lb);                                                    \
2151    u = (kmp_int64)(*p_ub);                                                    \
2152    i = (kmp_int64)(pr->u.p.st);                                               \
2153    if (status == 0) {                                                         \
2154      t = 0;                                                                   \
2155      KMP_POP_PARTITIONED_TIMER();                                             \
2156    } else if (i == 1) {                                                       \
2157      if (u >= l)                                                              \
2158        t = u - l + 1;                                                         \
2159      else                                                                     \
2160        t = 0;                                                                 \
2161    } else if (i < 0) {                                                        \
2162      if (l >= u)                                                              \
2163        t = (l - u) / (-i) + 1;                                                \
2164      else                                                                     \
2165        t = 0;                                                                 \
2166    } else {                                                                   \
2167      if (u >= l)                                                              \
2168        t = (u - l) / i + 1;                                                   \
2169      else                                                                     \
2170        t = 0;                                                                 \
2171    }                                                                          \
2172    KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
2173  }
2174#else
2175#define KMP_STATS_LOOP_END /* Nothing */
2176#endif
2177
2178template <typename T>
2179static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
2180                               T *p_lb, T *p_ub,
2181                               typename traits_t<T>::signed_t *p_st
2182#if OMPT_SUPPORT && OMPT_OPTIONAL
2183                               ,
2184                               void *codeptr
2185#endif
2186) {
2187
2188  typedef typename traits_t<T>::unsigned_t UT;
2189  typedef typename traits_t<T>::signed_t ST;
2190  // This is potentially slightly misleading, schedule(runtime) will appear here
2191  // even if the actual runtime schedule is static. (Which points out a
2192  // disadvantage of schedule(runtime): even when static scheduling is used it
2193  // costs more than a compile time choice to use static scheduling would.)
2194  KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
2195
2196  int status;
2197  dispatch_private_info_template<T> *pr;
2198  __kmp_assert_valid_gtid(gtid);
2199  kmp_info_t *th = __kmp_threads[gtid];
2200  kmp_team_t *team = th->th.th_team;
2201
2202  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
2203  KD_TRACE(
2204      1000,
2205      ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
2206       gtid, p_lb, p_ub, p_st, p_last));
2207
2208  if (team->t.t_serialized) {
2209    /* NOTE: serialize this dispatch because we are not at the active level */
2210    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2211        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
2212    KMP_DEBUG_ASSERT(pr);
2213
2214    if ((status = (pr->u.p.tc != 0)) == 0) {
2215      *p_lb = 0;
2216      *p_ub = 0;
2217      //            if ( p_last != NULL )
2218      //                *p_last = 0;
2219      if (p_st != NULL)
2220        *p_st = 0;
2221      if (__kmp_env_consistency_check) {
2222        if (pr->pushed_ws != ct_none) {
2223          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2224        }
2225      }
2226    } else if (pr->flags.nomerge) {
2227      kmp_int32 last;
2228      T start;
2229      UT limit, trip, init;
2230      ST incr;
2231      T chunk = pr->u.p.parm1;
2232
2233      KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
2234                     gtid));
2235
2236      init = chunk * pr->u.p.count++;
2237      trip = pr->u.p.tc - 1;
2238
2239      if ((status = (init <= trip)) == 0) {
2240        *p_lb = 0;
2241        *p_ub = 0;
2242        //                if ( p_last != NULL )
2243        //                    *p_last = 0;
2244        if (p_st != NULL)
2245          *p_st = 0;
2246        if (__kmp_env_consistency_check) {
2247          if (pr->pushed_ws != ct_none) {
2248            pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2249          }
2250        }
2251      } else {
2252        start = pr->u.p.lb;
2253        limit = chunk + init - 1;
2254        incr = pr->u.p.st;
2255
2256        if ((last = (limit >= trip)) != 0) {
2257          limit = trip;
2258#if KMP_OS_WINDOWS
2259          pr->u.p.last_upper = pr->u.p.ub;
2260#endif /* KMP_OS_WINDOWS */
2261        }
2262        if (p_last != NULL)
2263          *p_last = last;
2264        if (p_st != NULL)
2265          *p_st = incr;
2266        if (incr == 1) {
2267          *p_lb = start + init;
2268          *p_ub = start + limit;
2269        } else {
2270          *p_lb = start + init * incr;
2271          *p_ub = start + limit * incr;
2272        }
2273
2274        if (pr->flags.ordered) {
2275          pr->u.p.ordered_lower = init;
2276          pr->u.p.ordered_upper = limit;
2277#ifdef KMP_DEBUG
2278          {
2279            char *buff;
2280            // create format specifiers before the debug output
2281            buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2282                                    "ordered_lower:%%%s ordered_upper:%%%s\n",
2283                                    traits_t<UT>::spec, traits_t<UT>::spec);
2284            KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2285                            pr->u.p.ordered_upper));
2286            __kmp_str_free(&buff);
2287          }
2288#endif
2289        } // if
2290      } // if
2291    } else {
2292      pr->u.p.tc = 0;
2293      *p_lb = pr->u.p.lb;
2294      *p_ub = pr->u.p.ub;
2295#if KMP_OS_WINDOWS
2296      pr->u.p.last_upper = *p_ub;
2297#endif /* KMP_OS_WINDOWS */
2298      if (p_last != NULL)
2299        *p_last = TRUE;
2300      if (p_st != NULL)
2301        *p_st = pr->u.p.st;
2302    } // if
2303#ifdef KMP_DEBUG
2304    {
2305      char *buff;
2306      // create format specifiers before the debug output
2307      buff = __kmp_str_format(
2308          "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2309          "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
2310          traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2311      KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
2312                    (p_last ? *p_last : 0), status));
2313      __kmp_str_free(&buff);
2314    }
2315#endif
2316#if INCLUDE_SSC_MARKS
2317    SSC_MARK_DISPATCH_NEXT();
2318#endif
2319    OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);
2320    OMPT_LOOP_END;
2321    KMP_STATS_LOOP_END;
2322    return status;
2323  } else {
2324    kmp_int32 last = 0;
2325    dispatch_shared_info_template<T> volatile *sh;
2326
2327    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2328                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2329
2330    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2331        th->th.th_dispatch->th_dispatch_pr_current);
2332    KMP_DEBUG_ASSERT(pr);
2333    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2334        th->th.th_dispatch->th_dispatch_sh_current);
2335    KMP_DEBUG_ASSERT(sh);
2336
2337#if KMP_USE_HIER_SCHED
2338    if (pr->flags.use_hier)
2339      status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2340    else
2341#endif // KMP_USE_HIER_SCHED
2342      status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2343                                                p_st, th->th.th_team_nproc,
2344                                                th->th.th_info.ds.ds_tid);
2345    // status == 0: no more iterations to execute
2346    if (status == 0) {
2347      ST num_done;
2348      num_done = test_then_inc<ST>(&sh->u.s.num_done);
2349#ifdef KMP_DEBUG
2350      {
2351        char *buff;
2352        // create format specifiers before the debug output
2353        buff = __kmp_str_format(
2354            "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2355            traits_t<ST>::spec);
2356        KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2357        __kmp_str_free(&buff);
2358      }
2359#endif
2360
2361#if KMP_USE_HIER_SCHED
2362      pr->flags.use_hier = FALSE;
2363#endif
2364      if (num_done == th->th.th_team_nproc - 1) {
2365#if KMP_STATIC_STEAL_ENABLED
2366        if (pr->schedule == kmp_sch_static_steal) {
2367          int i;
2368          int idx = (th->th.th_dispatch->th_disp_index - 1) %
2369                    __kmp_dispatch_num_buffers; // current loop index
2370          // loop complete, safe to destroy locks used for stealing
2371          for (i = 0; i < th->th.th_team_nproc; ++i) {
2372            dispatch_private_info_template<T> *buf =
2373                reinterpret_cast<dispatch_private_info_template<T> *>(
2374                    &team->t.t_dispatch[i].th_disp_buffer[idx]);
2375            KMP_ASSERT(buf->steal_flag == THIEF); // buffer must be inactive
2376            KMP_ATOMIC_ST_RLX(&buf->steal_flag, UNUSED);
2377            if (traits_t<T>::type_size > 4) {
2378              // destroy locks used for stealing
2379              kmp_lock_t *lck = buf->u.p.steal_lock;
2380              KMP_ASSERT(lck != NULL);
2381              __kmp_destroy_lock(lck);
2382              __kmp_free(lck);
2383              buf->u.p.steal_lock = NULL;
2384            }
2385          }
2386        }
2387#endif
2388        /* NOTE: release shared buffer to be reused */
2389
2390        KMP_MB(); /* Flush all pending memory write invalidates.  */
2391
2392        sh->u.s.num_done = 0;
2393        sh->u.s.iteration = 0;
2394
2395        /* TODO replace with general release procedure? */
2396        if (pr->flags.ordered) {
2397          sh->u.s.ordered_iteration = 0;
2398        }
2399
2400        sh->buffer_index += __kmp_dispatch_num_buffers;
2401        KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2402                       gtid, sh->buffer_index));
2403
2404        KMP_MB(); /* Flush all pending memory write invalidates.  */
2405
2406      } // if
2407      if (__kmp_env_consistency_check) {
2408        if (pr->pushed_ws != ct_none) {
2409          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2410        }
2411      }
2412
2413      th->th.th_dispatch->th_deo_fcn = NULL;
2414      th->th.th_dispatch->th_dxo_fcn = NULL;
2415      th->th.th_dispatch->th_dispatch_sh_current = NULL;
2416      th->th.th_dispatch->th_dispatch_pr_current = NULL;
2417    } // if (status == 0)
2418#if KMP_OS_WINDOWS
2419    else if (last) {
2420      pr->u.p.last_upper = pr->u.p.ub;
2421    }
2422#endif /* KMP_OS_WINDOWS */
2423    if (p_last != NULL && status != 0)
2424      *p_last = last;
2425  } // if
2426
2427#ifdef KMP_DEBUG
2428  {
2429    char *buff;
2430    // create format specifiers before the debug output
2431    buff = __kmp_str_format(
2432        "__kmp_dispatch_next: T#%%d normal case: "
2433        "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2434        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2435    KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2436                  (p_last ? *p_last : 0), status));
2437    __kmp_str_free(&buff);
2438  }
2439#endif
2440#if INCLUDE_SSC_MARKS
2441  SSC_MARK_DISPATCH_NEXT();
2442#endif
2443  OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);
2444  OMPT_LOOP_END;
2445  KMP_STATS_LOOP_END;
2446  return status;
2447}
2448
2449/*!
2450@ingroup WORK_SHARING
2451@param loc  source location information
2452@param global_tid  global thread number
2453@return Zero if the parallel region is not active and this thread should execute
2454all sections, non-zero otherwise.
2455
2456Beginning of sections construct.
2457There are no implicit barriers in the "sections" calls, rather the compiler
2458should introduce an explicit barrier if it is required.
2459
2460This implementation is based on __kmp_dispatch_init, using same constructs for
2461shared data (we can't have sections nested directly in omp for loop, there
2462should be a parallel region in between)
2463*/
2464kmp_int32 __kmpc_sections_init(ident_t *loc, kmp_int32 gtid) {
2465
2466  int active;
2467  kmp_info_t *th;
2468  kmp_team_t *team;
2469  kmp_uint32 my_buffer_index;
2470  dispatch_shared_info_template<kmp_int32> volatile *sh;
2471
2472  KMP_DEBUG_ASSERT(__kmp_init_serial);
2473
2474  if (!TCR_4(__kmp_init_parallel))
2475    __kmp_parallel_initialize();
2476  __kmp_resume_if_soft_paused();
2477
2478  /* setup data */
2479  th = __kmp_threads[gtid];
2480  team = th->th.th_team;
2481  active = !team->t.t_serialized;
2482  th->th.th_ident = loc;
2483
2484  KMP_COUNT_BLOCK(OMP_SECTIONS);
2485  KD_TRACE(10, ("__kmpc_sections: called by T#%d\n", gtid));
2486
2487  if (active) {
2488    // Setup sections in the same way as dynamic scheduled loops.
2489    // We need one shared data: which section is to execute next.
2490    // (in case parallel is not active, all sections will be executed on the
2491    // same thread)
2492    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2493                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2494
2495    my_buffer_index = th->th.th_dispatch->th_disp_index++;
2496
2497    // reuse shared data structures from dynamic sched loops:
2498    sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
2499        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
2500    KD_TRACE(10, ("__kmpc_sections_init: T#%d my_buffer_index:%d\n", gtid,
2501                  my_buffer_index));
2502
2503    th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
2504    th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
2505
2506    KD_TRACE(100, ("__kmpc_sections_init: T#%d before wait: my_buffer_index:%d "
2507                   "sh->buffer_index:%d\n",
2508                   gtid, my_buffer_index, sh->buffer_index));
2509    __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
2510                           __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
2511    // Note: KMP_WAIT() cannot be used there: buffer index and
2512    // my_buffer_index are *always* 32-bit integers.
2513    KMP_MB();
2514    KD_TRACE(100, ("__kmpc_sections_init: T#%d after wait: my_buffer_index:%d "
2515                   "sh->buffer_index:%d\n",
2516                   gtid, my_buffer_index, sh->buffer_index));
2517
2518    th->th.th_dispatch->th_dispatch_pr_current =
2519        nullptr; // sections construct doesn't need private data
2520    th->th.th_dispatch->th_dispatch_sh_current =
2521        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
2522  }
2523
2524#if OMPT_SUPPORT && OMPT_OPTIONAL
2525  if (ompt_enabled.ompt_callback_work) {
2526    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
2527    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2528    ompt_callbacks.ompt_callback(ompt_callback_work)(
2529        ompt_work_sections, ompt_scope_begin, &(team_info->parallel_data),
2530        &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
2531  }
2532#endif
2533  KMP_PUSH_PARTITIONED_TIMER(OMP_sections);
2534
2535  return active;
2536}
2537
2538/*!
2539@ingroup WORK_SHARING
2540@param loc  source location information
2541@param global_tid  global thread number
2542@param numberOfSections  number of sections in the 'sections' construct
2543@return unsigned [from 0 to n) - number (id) of the section to execute next on
2544this thread. n (or any other number not in range) - nothing to execute on this
2545thread
2546*/
2547
2548kmp_int32 __kmpc_next_section(ident_t *loc, kmp_int32 gtid,
2549                              kmp_int32 numberOfSections) {
2550
2551  KMP_TIME_PARTITIONED_BLOCK(OMP_sections_overhead);
2552
2553  kmp_info_t *th = __kmp_threads[gtid];
2554#ifdef KMP_DEBUG
2555  kmp_team_t *team = th->th.th_team;
2556#endif
2557
2558  KD_TRACE(1000, ("__kmp_dispatch_next: T#%d; number of sections:%d\n", gtid,
2559                  numberOfSections));
2560
2561  // For serialized case we should not call this function:
2562  KMP_DEBUG_ASSERT(!team->t.t_serialized);
2563
2564  dispatch_shared_info_template<kmp_int32> volatile *sh;
2565
2566  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2567                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2568
2569  KMP_DEBUG_ASSERT(!(th->th.th_dispatch->th_dispatch_pr_current));
2570  sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
2571      th->th.th_dispatch->th_dispatch_sh_current);
2572  KMP_DEBUG_ASSERT(sh);
2573
2574  kmp_int32 sectionIndex = 0;
2575  bool moreSectionsToExecute = true;
2576
2577  // Find section to execute:
2578  sectionIndex = test_then_inc<kmp_int32>((kmp_int32 *)&sh->u.s.iteration);
2579  if (sectionIndex >= numberOfSections) {
2580    moreSectionsToExecute = false;
2581  }
2582
2583  // status == 0: no more sections to execute;
2584  // OMPTODO: __kmpc_end_sections could be bypassed?
2585  if (!moreSectionsToExecute) {
2586    kmp_int32 num_done;
2587
2588    num_done = test_then_inc<kmp_int32>((kmp_int32 *)(&sh->u.s.num_done));
2589
2590    if (num_done == th->th.th_team_nproc - 1) {
2591      /* NOTE: release this buffer to be reused */
2592
2593      KMP_MB(); /* Flush all pending memory write invalidates.  */
2594
2595      sh->u.s.num_done = 0;
2596      sh->u.s.iteration = 0;
2597
2598      KMP_MB(); /* Flush all pending memory write invalidates.  */
2599
2600      sh->buffer_index += __kmp_dispatch_num_buffers;
2601      KD_TRACE(100, ("__kmpc_next_section: T#%d change buffer_index:%d\n", gtid,
2602                     sh->buffer_index));
2603
2604      KMP_MB(); /* Flush all pending memory write invalidates.  */
2605
2606    } // if
2607
2608    th->th.th_dispatch->th_deo_fcn = NULL;
2609    th->th.th_dispatch->th_dxo_fcn = NULL;
2610    th->th.th_dispatch->th_dispatch_sh_current = NULL;
2611    th->th.th_dispatch->th_dispatch_pr_current = NULL;
2612
2613#if OMPT_SUPPORT && OMPT_OPTIONAL
2614    if (ompt_enabled.ompt_callback_dispatch) {
2615      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
2616      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2617      ompt_data_t instance = ompt_data_none;
2618      instance.ptr = OMPT_GET_RETURN_ADDRESS(0);
2619      ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
2620          &(team_info->parallel_data), &(task_info->task_data),
2621          ompt_dispatch_section, instance);
2622    }
2623#endif
2624  }
2625
2626  return sectionIndex;
2627}
2628
2629/*!
2630@ingroup WORK_SHARING
2631@param loc  source location information
2632@param global_tid  global thread number
2633
2634End of "sections" construct.
2635Don't need to wait here: barrier is added separately when needed.
2636*/
2637void __kmpc_end_sections(ident_t *loc, kmp_int32 gtid) {
2638
2639  kmp_info_t *th = __kmp_threads[gtid];
2640  int active = !th->th.th_team->t.t_serialized;
2641
2642  KD_TRACE(100, ("__kmpc_end_sections: T#%d called\n", gtid));
2643
2644  if (!active) {
2645    // In active case call finalization is done in __kmpc_next_section
2646#if OMPT_SUPPORT && OMPT_OPTIONAL
2647    if (ompt_enabled.ompt_callback_work) {
2648      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
2649      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2650      ompt_callbacks.ompt_callback(ompt_callback_work)(
2651          ompt_work_sections, ompt_scope_end, &(team_info->parallel_data),
2652          &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
2653    }
2654#endif
2655  }
2656
2657  KMP_POP_PARTITIONED_TIMER();
2658  KD_TRACE(100, ("__kmpc_end_sections: T#%d returned\n", gtid));
2659}
2660
2661template <typename T>
2662static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2663                                  kmp_int32 *plastiter, T *plower, T *pupper,
2664                                  typename traits_t<T>::signed_t incr) {
2665  typedef typename traits_t<T>::unsigned_t UT;
2666  kmp_uint32 team_id;
2667  kmp_uint32 nteams;
2668  UT trip_count;
2669  kmp_team_t *team;
2670  kmp_info_t *th;
2671
2672  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2673  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2674#ifdef KMP_DEBUG
2675  typedef typename traits_t<T>::signed_t ST;
2676  {
2677    char *buff;
2678    // create format specifiers before the debug output
2679    buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2680                            "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2681                            traits_t<T>::spec, traits_t<T>::spec,
2682                            traits_t<ST>::spec, traits_t<T>::spec);
2683    KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2684    __kmp_str_free(&buff);
2685  }
2686#endif
2687
2688  if (__kmp_env_consistency_check) {
2689    if (incr == 0) {
2690      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2691                            loc);
2692    }
2693    if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2694      // The loop is illegal.
2695      // Some zero-trip loops maintained by compiler, e.g.:
2696      //   for(i=10;i<0;++i) // lower >= upper - run-time check
2697      //   for(i=0;i>10;--i) // lower <= upper - run-time check
2698      //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2699      //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2700      // Compiler does not check the following illegal loops:
2701      //   for(i=0;i<10;i+=incr) // where incr<0
2702      //   for(i=10;i>0;i-=incr) // where incr<0
2703      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2704    }
2705  }
2706  __kmp_assert_valid_gtid(gtid);
2707  th = __kmp_threads[gtid];
2708  team = th->th.th_team;
2709  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2710  nteams = th->th.th_teams_size.nteams;
2711  team_id = team->t.t_master_tid;
2712  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2713
2714  // compute global trip count
2715  if (incr == 1) {
2716    trip_count = *pupper - *plower + 1;
2717  } else if (incr == -1) {
2718    trip_count = *plower - *pupper + 1;
2719  } else if (incr > 0) {
2720    // upper-lower can exceed the limit of signed type
2721    trip_count = (UT)(*pupper - *plower) / incr + 1;
2722  } else {
2723    trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2724  }
2725
2726  if (trip_count <= nteams) {
2727    KMP_DEBUG_ASSERT(
2728        __kmp_static == kmp_sch_static_greedy ||
2729        __kmp_static ==
2730            kmp_sch_static_balanced); // Unknown static scheduling type.
2731    // only some teams get single iteration, others get nothing
2732    if (team_id < trip_count) {
2733      *pupper = *plower = *plower + team_id * incr;
2734    } else {
2735      *plower = *pupper + incr; // zero-trip loop
2736    }
2737    if (plastiter != NULL)
2738      *plastiter = (team_id == trip_count - 1);
2739  } else {
2740    if (__kmp_static == kmp_sch_static_balanced) {
2741      UT chunk = trip_count / nteams;
2742      UT extras = trip_count % nteams;
2743      *plower +=
2744          incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2745      *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2746      if (plastiter != NULL)
2747        *plastiter = (team_id == nteams - 1);
2748    } else {
2749      T chunk_inc_count =
2750          (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2751      T upper = *pupper;
2752      KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2753      // Unknown static scheduling type.
2754      *plower += team_id * chunk_inc_count;
2755      *pupper = *plower + chunk_inc_count - incr;
2756      // Check/correct bounds if needed
2757      if (incr > 0) {
2758        if (*pupper < *plower)
2759          *pupper = traits_t<T>::max_value;
2760        if (plastiter != NULL)
2761          *plastiter = *plower <= upper && *pupper > upper - incr;
2762        if (*pupper > upper)
2763          *pupper = upper; // tracker C73258
2764      } else {
2765        if (*pupper > *plower)
2766          *pupper = traits_t<T>::min_value;
2767        if (plastiter != NULL)
2768          *plastiter = *plower >= upper && *pupper < upper - incr;
2769        if (*pupper < upper)
2770          *pupper = upper; // tracker C73258
2771      }
2772    }
2773  }
2774}
2775
2776//-----------------------------------------------------------------------------
2777// Dispatch routines
2778//    Transfer call to template< type T >
2779//    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2780//                         T lb, T ub, ST st, ST chunk )
2781extern "C" {
2782
2783/*!
2784@ingroup WORK_SHARING
2785@{
2786@param loc Source location
2787@param gtid Global thread id
2788@param schedule Schedule type
2789@param lb  Lower bound
2790@param ub  Upper bound
2791@param st  Step (or increment if you prefer)
2792@param chunk The chunk size to block with
2793
2794This function prepares the runtime to start a dynamically scheduled for loop,
2795saving the loop arguments.
2796These functions are all identical apart from the types of the arguments.
2797*/
2798
2799void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2800                            enum sched_type schedule, kmp_int32 lb,
2801                            kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2802  KMP_DEBUG_ASSERT(__kmp_init_serial);
2803#if OMPT_SUPPORT && OMPT_OPTIONAL
2804  OMPT_STORE_RETURN_ADDRESS(gtid);
2805#endif
2806  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2807}
2808/*!
2809See @ref __kmpc_dispatch_init_4
2810*/
2811void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2812                             enum sched_type schedule, kmp_uint32 lb,
2813                             kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2814  KMP_DEBUG_ASSERT(__kmp_init_serial);
2815#if OMPT_SUPPORT && OMPT_OPTIONAL
2816  OMPT_STORE_RETURN_ADDRESS(gtid);
2817#endif
2818  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2819}
2820
2821/*!
2822See @ref __kmpc_dispatch_init_4
2823*/
2824void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2825                            enum sched_type schedule, kmp_int64 lb,
2826                            kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2827  KMP_DEBUG_ASSERT(__kmp_init_serial);
2828#if OMPT_SUPPORT && OMPT_OPTIONAL
2829  OMPT_STORE_RETURN_ADDRESS(gtid);
2830#endif
2831  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2832}
2833
2834/*!
2835See @ref __kmpc_dispatch_init_4
2836*/
2837void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2838                             enum sched_type schedule, kmp_uint64 lb,
2839                             kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2840  KMP_DEBUG_ASSERT(__kmp_init_serial);
2841#if OMPT_SUPPORT && OMPT_OPTIONAL
2842  OMPT_STORE_RETURN_ADDRESS(gtid);
2843#endif
2844  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2845}
2846
2847/*!
2848See @ref __kmpc_dispatch_init_4
2849
2850Difference from __kmpc_dispatch_init set of functions is these functions
2851are called for composite distribute parallel for construct. Thus before
2852regular iterations dispatching we need to calc per-team iteration space.
2853
2854These functions are all identical apart from the types of the arguments.
2855*/
2856void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2857                                 enum sched_type schedule, kmp_int32 *p_last,
2858                                 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2859                                 kmp_int32 chunk) {
2860  KMP_DEBUG_ASSERT(__kmp_init_serial);
2861#if OMPT_SUPPORT && OMPT_OPTIONAL
2862  OMPT_STORE_RETURN_ADDRESS(gtid);
2863#endif
2864  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2865  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2866}
2867
2868void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2869                                  enum sched_type schedule, kmp_int32 *p_last,
2870                                  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2871                                  kmp_int32 chunk) {
2872  KMP_DEBUG_ASSERT(__kmp_init_serial);
2873#if OMPT_SUPPORT && OMPT_OPTIONAL
2874  OMPT_STORE_RETURN_ADDRESS(gtid);
2875#endif
2876  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2877  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2878}
2879
2880void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2881                                 enum sched_type schedule, kmp_int32 *p_last,
2882                                 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2883                                 kmp_int64 chunk) {
2884  KMP_DEBUG_ASSERT(__kmp_init_serial);
2885#if OMPT_SUPPORT && OMPT_OPTIONAL
2886  OMPT_STORE_RETURN_ADDRESS(gtid);
2887#endif
2888  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2889  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2890}
2891
2892void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2893                                  enum sched_type schedule, kmp_int32 *p_last,
2894                                  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2895                                  kmp_int64 chunk) {
2896  KMP_DEBUG_ASSERT(__kmp_init_serial);
2897#if OMPT_SUPPORT && OMPT_OPTIONAL
2898  OMPT_STORE_RETURN_ADDRESS(gtid);
2899#endif
2900  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2901  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2902}
2903
2904/*!
2905@param loc Source code location
2906@param gtid Global thread id
2907@param p_last Pointer to a flag set to one if this is the last chunk or zero
2908otherwise
2909@param p_lb   Pointer to the lower bound for the next chunk of work
2910@param p_ub   Pointer to the upper bound for the next chunk of work
2911@param p_st   Pointer to the stride for the next chunk of work
2912@return one if there is work to be done, zero otherwise
2913
2914Get the next dynamically allocated chunk of work for this thread.
2915If there is no more work, then the lb,ub and stride need not be modified.
2916*/
2917int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2918                           kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2919#if OMPT_SUPPORT && OMPT_OPTIONAL
2920  OMPT_STORE_RETURN_ADDRESS(gtid);
2921#endif
2922  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2923#if OMPT_SUPPORT && OMPT_OPTIONAL
2924                                        ,
2925                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
2926#endif
2927  );
2928}
2929
2930/*!
2931See @ref __kmpc_dispatch_next_4
2932*/
2933int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2934                            kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2935                            kmp_int32 *p_st) {
2936#if OMPT_SUPPORT && OMPT_OPTIONAL
2937  OMPT_STORE_RETURN_ADDRESS(gtid);
2938#endif
2939  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2940#if OMPT_SUPPORT && OMPT_OPTIONAL
2941                                         ,
2942                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2943#endif
2944  );
2945}
2946
2947/*!
2948See @ref __kmpc_dispatch_next_4
2949*/
2950int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2951                           kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2952#if OMPT_SUPPORT && OMPT_OPTIONAL
2953  OMPT_STORE_RETURN_ADDRESS(gtid);
2954#endif
2955  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2956#if OMPT_SUPPORT && OMPT_OPTIONAL
2957                                        ,
2958                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
2959#endif
2960  );
2961}
2962
2963/*!
2964See @ref __kmpc_dispatch_next_4
2965*/
2966int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2967                            kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2968                            kmp_int64 *p_st) {
2969#if OMPT_SUPPORT && OMPT_OPTIONAL
2970  OMPT_STORE_RETURN_ADDRESS(gtid);
2971#endif
2972  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2973#if OMPT_SUPPORT && OMPT_OPTIONAL
2974                                         ,
2975                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2976#endif
2977  );
2978}
2979
2980/*!
2981@param loc Source code location
2982@param gtid Global thread id
2983
2984Mark the end of a dynamic loop.
2985*/
2986void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2987  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2988}
2989
2990/*!
2991See @ref __kmpc_dispatch_fini_4
2992*/
2993void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2994  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2995}
2996
2997/*!
2998See @ref __kmpc_dispatch_fini_4
2999*/
3000void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
3001  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
3002}
3003
3004/*!
3005See @ref __kmpc_dispatch_fini_4
3006*/
3007void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
3008  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
3009}
3010/*! @} */
3011
3012//-----------------------------------------------------------------------------
3013// Non-template routines from kmp_dispatch.cpp used in other sources
3014
3015kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
3016  return value == checker;
3017}
3018
3019kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
3020  return value != checker;
3021}
3022
3023kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
3024  return value < checker;
3025}
3026
3027kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
3028  return value >= checker;
3029}
3030
3031kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
3032  return value <= checker;
3033}
3034
3035kmp_uint32
3036__kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
3037             kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
3038             void *obj // Higher-level synchronization object, or NULL.
3039) {
3040  // note: we may not belong to a team at this point
3041  volatile kmp_uint32 *spin = spinner;
3042  kmp_uint32 check = checker;
3043  kmp_uint32 spins;
3044  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
3045  kmp_uint32 r;
3046  kmp_uint64 time;
3047
3048  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
3049  KMP_INIT_YIELD(spins);
3050  KMP_INIT_BACKOFF(time);
3051  // main wait spin loop
3052  while (!f(r = TCR_4(*spin), check)) {
3053    KMP_FSYNC_SPIN_PREPARE(obj);
3054    /* GEH - remove this since it was accidentally introduced when kmp_wait was
3055       split. It causes problems with infinite recursion because of exit lock */
3056    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
3057        __kmp_abort_thread(); */
3058    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
3059  }
3060  KMP_FSYNC_SPIN_ACQUIRED(obj);
3061  return r;
3062}
3063
3064void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
3065                      kmp_uint32 (*pred)(void *, kmp_uint32),
3066                      void *obj // Higher-level synchronization object, or NULL.
3067) {
3068  // note: we may not belong to a team at this point
3069  void *spin = spinner;
3070  kmp_uint32 check = checker;
3071  kmp_uint32 spins;
3072  kmp_uint32 (*f)(void *, kmp_uint32) = pred;
3073  kmp_uint64 time;
3074
3075  KMP_FSYNC_SPIN_INIT(obj, spin);
3076  KMP_INIT_YIELD(spins);
3077  KMP_INIT_BACKOFF(time);
3078  // main wait spin loop
3079  while (!f(spin, check)) {
3080    KMP_FSYNC_SPIN_PREPARE(obj);
3081    /* if we have waited a bit, or are noversubscribed, yield */
3082    /* pause is in the following code */
3083    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
3084  }
3085  KMP_FSYNC_SPIN_ACQUIRED(obj);
3086}
3087
3088} // extern "C"
3089
3090#ifdef KMP_GOMP_COMPAT
3091
3092void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
3093                               enum sched_type schedule, kmp_int32 lb,
3094                               kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
3095                               int push_ws) {
3096  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
3097                                 push_ws);
3098}
3099
3100void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
3101                                enum sched_type schedule, kmp_uint32 lb,
3102                                kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
3103                                int push_ws) {
3104  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
3105                                  push_ws);
3106}
3107
3108void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
3109                               enum sched_type schedule, kmp_int64 lb,
3110                               kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
3111                               int push_ws) {
3112  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
3113                                 push_ws);
3114}
3115
3116void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
3117                                enum sched_type schedule, kmp_uint64 lb,
3118                                kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
3119                                int push_ws) {
3120  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
3121                                  push_ws);
3122}
3123
3124void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
3125  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
3126}
3127
3128void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
3129  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
3130}
3131
3132void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
3133  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
3134}
3135
3136void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
3137  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
3138}
3139
3140#endif /* KMP_GOMP_COMPAT */
3141
3142/* ------------------------------------------------------------------------ */
3143