/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

/* Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
 *       it may change values between parallel regions.  __kmp_max_nth
 *       is the largest value __kmp_nth may take, 1 is the smallest.
 */

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_USE_X87CONTROL
#include <float.h>
#endif
#include "kmp_lock.h"
#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(gtid_ref);

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_root->r.r_active &&
        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }
}

void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
    }
  }
}

// Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
                                         bool use_hier = false) {
  // Pick up the nonmonotonic/monotonic bits from the scheduling type
  // Nonmonotonic as default for dynamic schedule when no modifier is specified
  int monotonicity = SCHEDULE_NONMONOTONIC;
  // Let the default be monotonic for executables compiled by
  // compilers supporting OpenMP 4.5 or earlier
  if (loc != NULL && loc->get_openmp_version() < 50)
    monotonicity = SCHEDULE_MONOTONIC;

  if (use_hier || __kmp_force_monotonic)
    monotonicity = SCHEDULE_MONOTONIC;
  else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
    monotonicity = SCHEDULE_NONMONOTONIC;
  else if (SCHEDULE_HAS_MONOTONIC(schedule))
    monotonicity = SCHEDULE_MONOTONIC;

  return monotonicity;
}
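// Illustrative summary of the logic above: schedule(monotonic:dynamic) yields
// SCHEDULE_MONOTONIC regardless of OpenMP version, while a plain
// schedule(dynamic) defaults to SCHEDULE_NONMONOTONIC for OpenMP 5.0+ code
// and to SCHEDULE_MONOTONIC for code compiled against 4.5 or earlier.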

#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
// Return a floating point number rounded to two decimal places
static inline float __kmp_round_2decimal_val(float num) {
  return (float)(static_cast<int>(num * 100 + 0.5)) / 100;
}
static inline int __kmp_get_round_val(float num) {
  return static_cast<int>(num < 0 ? num - 0.5 : num + 0.5);
}
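// E.g., __kmp_round_2decimal_val(3.14159f) == 3.14f, and
// __kmp_get_round_val(2.5f) == 3 while __kmp_get_round_val(-2.5f) == -3
// (round-half-away-from-zero).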
#endif

template <typename T>
inline void
__kmp_initialize_self_buffer(kmp_team_t *team, T id,
                             dispatch_private_info_template<T> *pr,
                             typename traits_t<T>::unsigned_t nchunks, T nproc,
                             typename traits_t<T>::unsigned_t &init,
                             T &small_chunk, T &extras, T &p_extra) {

#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
  if (pr->flags.use_hybrid) {
    kmp_info_t *th = __kmp_threads[__kmp_gtid_from_tid((int)id, team)];
    kmp_hw_core_type_t type =
        (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
    T pchunks = pr->u.p.pchunks;
    T echunks = nchunks - pchunks;
    T num_procs_with_pcore = pr->u.p.num_procs_with_pcore;
    T num_procs_with_ecore = nproc - num_procs_with_pcore;
    T first_thread_with_ecore = pr->u.p.first_thread_with_ecore;
    T big_chunk =
        pchunks / num_procs_with_pcore; // chunks per thread with p-core
    small_chunk =
        echunks / num_procs_with_ecore; // chunks per thread with e-core

    extras =
        (pchunks % num_procs_with_pcore) + (echunks % num_procs_with_ecore);

    p_extra = (big_chunk - small_chunk);

    if (type == KMP_HW_CORE_TYPE_CORE) {
      if (id < first_thread_with_ecore) {
        init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
      } else {
        init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
               (id < extras ? id : extras);
      }
    } else {
      if (id == first_thread_with_ecore) {
        init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
      } else {
        init = id * small_chunk + first_thread_with_ecore * p_extra +
               (id < extras ? id : extras);
      }
    }
    p_extra = (type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;
    return;
  }
#endif

  small_chunk = nchunks / nproc; // chunks per thread
  extras = nchunks % nproc;
  p_extra = 0;
  init = id * small_chunk + (id < extras ? id : extras);
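  // Worked example (illustrative): nchunks=10, nproc=4 gives small_chunk=2,
  // extras=2, so threads 0..3 start at chunk 0, 3, 6, 8 respectively; the
  // first two threads own one extra chunk each (3+3+2+2 == 10).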
}

#if KMP_STATIC_STEAL_ENABLED
enum { // values for steal_flag (possible states of private per-loop buffer)
  UNUSED = 0,
  CLAIMED = 1, // owner thread started initialization
  READY = 2, // available for stealing
  THIEF = 3 // finished by owner, or claimed by thief
  // possible state changes:
  // 0 -> 1 owner only, sync
  // 0 -> 3 thief only, sync
  // 1 -> 2 owner only, async
  // 2 -> 3 owner only, async
  // 3 -> 2 owner only, async
  // 3 -> 0 last thread finishing the loop, async
};
#endif

// Initialize a dispatch_private_info_template<T> buffer for a particular
// type of schedule and chunk.  The loop description is found in lb (lower
// bound), ub (upper bound), and st (stride).  nproc is the number of threads
// relevant to the scheduling (often the number of threads in a team, but not
// always if hierarchical scheduling is used).  tid is the id of the thread
// calling the function within the group of nproc threads.  It will have a
// value between 0 and nproc - 1.  This is often just the thread id within a
// team, but is not necessarily the case when using hierarchical scheduling.
// loc is the source file location of the corresponding loop
// gtid is the global thread id
template <typename T>
void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                   dispatch_private_info_template<T> *pr,
                                   enum sched_type schedule, T lb, T ub,
                                   typename traits_t<T>::signed_t st,
#if USE_ITT_BUILD
                                   kmp_uint64 *cur_chunk,
#endif
                                   typename traits_t<T>::signed_t chunk,
                                   T nproc, T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::floating_t DBL;

  int active;
  T tc;
  kmp_info_t *th;
  kmp_team_t *team;
  int monotonicity;
  bool use_hier;

#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
                            "pr:%%p lb:%%%s ub:%%%s st:%%%s "
                            "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<ST>::spec,
                            traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;

#if USE_ITT_BUILD
  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;
#endif

#if KMP_USE_HIER_SCHED
  use_hier = pr->flags.use_hier;
#else
  use_hier = false;
#endif

  /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
  monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

  /* Pick up the nomerge/ordered bits from the scheduling type */
  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
    pr->flags.nomerge = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
  } else {
    pr->flags.nomerge = FALSE;
  }
  pr->type_size = traits_t<T>::type_size; // remember the size of variables
  if (kmp_ord_lower & schedule) {
    pr->flags.ordered = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
  } else {
    pr->flags.ordered = FALSE;
  }
  // Ordered overrides nonmonotonic
  if (pr->flags.ordered) {
    monotonicity = SCHEDULE_MONOTONIC;
  }

  if (schedule == kmp_sch_static) {
    schedule = __kmp_static;
  } else {
    if (schedule == kmp_sch_runtime) {
      // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
      // not specified)
      schedule = team->t.t_sched.r_sched_type;
      monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
      if (pr->flags.ordered) // correct monotonicity for ordered loop if needed
        monotonicity = SCHEDULE_MONOTONIC;
      // Detail the schedule if needed (global controls are differentiated
      // appropriately)
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      } else if (schedule == kmp_sch_static) {
        schedule = __kmp_static;
      }
      // Use the chunk size specified by OMP_SCHEDULE (or default if not
      // specified)
      chunk = team->t.t_sched.chunk;
#if USE_ITT_BUILD
      if (cur_chunk)
        *cur_chunk = chunk;
#endif
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
                                "schedule:%%d chunk:%%%s\n",
                                traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    } else {
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      }
      if (chunk <= 0) {
        chunk = KMP_DEFAULT_CHUNK;
      }
    }

    if (schedule == kmp_sch_auto) {
      // mapping and differentiation: in the __kmp_do_serial_initialize()
      schedule = __kmp_auto;
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
            "schedule:%%d chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
#if KMP_STATIC_STEAL_ENABLED
    // map nonmonotonic:dynamic to static steal
    if (schedule == kmp_sch_dynamic_chunked) {
      if (monotonicity == SCHEDULE_NONMONOTONIC)
        schedule = kmp_sch_static_steal;
    }
#endif
    /* guided analytical not safe for too many threads */
    if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
      schedule = kmp_sch_guided_iterative_chunked;
      KMP_WARNING(DispatchManyThreads);
    }
    if (schedule == kmp_sch_runtime_simd) {
      // compiler provides simd_width in the chunk parameter
      schedule = team->t.t_sched.r_sched_type;
      monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
      // Detail the schedule if needed (global controls are differentiated
      // appropriately)
      if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
          schedule == __kmp_static) {
        schedule = kmp_sch_static_balanced_chunked;
      } else {
        if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
          schedule = kmp_sch_guided_simd;
        }
        chunk = team->t.t_sched.chunk * chunk;
      }
#if USE_ITT_BUILD
      if (cur_chunk)
        *cur_chunk = chunk;
#endif
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
            " chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
    pr->u.p.parm1 = chunk;
  }
  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
              "unknown scheduling type");

  pr->u.p.count = 0;

  if (__kmp_env_consistency_check) {
    if (st == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
                            (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
    }
  }
  // compute trip count
  if (st == 1) { // most common case
    if (ub >= lb) {
      tc = ub - lb + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  } else if (st < 0) {
    if (lb >= ub) {
      // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(lb - ub) / (-st) + 1;
    } else { // lb < ub
      tc = 0; // zero-trip
    }
  } else { // st > 0
    if (ub >= lb) {
      // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(ub - lb) / st + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  }
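  // E.g., lb=0, ub=9, st=3 executes iterations 0,3,6,9: tc = (9-0)/3 + 1 = 4.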

#if KMP_STATS_ENABLED
  if (KMP_MASTER_GTID(gtid)) {
    KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
  }
#endif

  pr->u.p.lb = lb;
  pr->u.p.ub = ub;
  pr->u.p.st = st;
  pr->u.p.tc = tc;

#if KMP_OS_WINDOWS
  pr->u.p.last_upper = ub + st;
#endif /* KMP_OS_WINDOWS */

  /* NOTE: only the active parallel region(s) have active ordered sections */

  if (active) {
    if (pr->flags.ordered) {
      pr->ordered_bumped = 0;
      pr->u.p.ordered_lower = 1;
      pr->u.p.ordered_upper = 0;
    }
  }

  switch (schedule) {
#if KMP_STATIC_STEAL_ENABLED
  case kmp_sch_static_steal: {
    T ntc, init = 0;

    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    ntc = (tc % chunk ? 1 : 0) + tc / chunk;
    if (nproc > 1 && ntc >= nproc) {
      KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
      T id = tid;
      T small_chunk, extras, p_extra = 0;
      kmp_uint32 old = UNUSED;
      int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);
      if (traits_t<T>::type_size > 4) {
        // AC: TODO: check if 16-byte CAS available and use it to
        // improve performance (probably wait for explicit request
        // before spending time on this).
        // For now use dynamically allocated per-private-buffer lock,
        // free memory in __kmp_dispatch_next when status==0.
        pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
        __kmp_init_lock(pr->u.p.steal_lock);
      }

#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
      // Iterations are divided in a 60/40 skewed distribution among CORE and
      // ATOM processors for hybrid systems
      bool use_hybrid = false;
      kmp_hw_core_type_t core_type = KMP_HW_CORE_TYPE_UNKNOWN;
      T first_thread_with_ecore = 0;
      T num_procs_with_pcore = 0;
      T num_procs_with_ecore = 0;
      T p_ntc = 0, e_ntc = 0;
      if (__kmp_is_hybrid_cpu() && __kmp_affinity.type != affinity_none &&
          __kmp_affinity.type != affinity_explicit) {
        use_hybrid = true;
        core_type = (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
        if (core_type != KMP_HW_CORE_TYPE_UNKNOWN &&
            __kmp_first_osid_with_ecore > -1) {
          for (int i = 0; i < team->t.t_nproc; ++i) {
            kmp_hw_core_type_t type = (kmp_hw_core_type_t)team->t.t_threads[i]
                                          ->th.th_topology_attrs.core_type;
            int id = team->t.t_threads[i]->th.th_topology_ids.os_id;
            if (id == __kmp_first_osid_with_ecore) {
              first_thread_with_ecore =
                  team->t.t_threads[i]->th.th_info.ds.ds_tid;
            }
            if (type == KMP_HW_CORE_TYPE_CORE) {
              num_procs_with_pcore++;
            } else if (type == KMP_HW_CORE_TYPE_ATOM) {
              num_procs_with_ecore++;
            } else {
              use_hybrid = false;
              break;
            }
          }
        }
        if (num_procs_with_pcore > 0 && num_procs_with_ecore > 0) {
          float multiplier = 60.0 / 40.0;
          float p_ratio = (float)num_procs_with_pcore / nproc;
          float e_ratio = (float)num_procs_with_ecore / nproc;
          float e_multiplier =
              (float)1 /
              (((multiplier * num_procs_with_pcore) / nproc) + e_ratio);
          float p_multiplier = multiplier * e_multiplier;
          p_ntc = __kmp_get_round_val(ntc * p_ratio * p_multiplier);
          if ((int)p_ntc > (int)(ntc * p_ratio * p_multiplier))
            e_ntc =
                (int)(__kmp_round_2decimal_val(ntc * e_ratio * e_multiplier));
          else
            e_ntc = __kmp_get_round_val(ntc * e_ratio * e_multiplier);
          KMP_DEBUG_ASSERT(ntc == p_ntc + e_ntc);
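          // Worked example (illustrative): ntc=100 chunks on 4 p-cores and
          // 4 e-cores: p_ratio=e_ratio=0.5, e_multiplier=1/(1.5*4/8+0.5)=0.8,
          // p_multiplier=1.2, so p_ntc=round(100*0.5*1.2)=60 and
          // e_ntc=round(100*0.5*0.8)=40 -- the intended 60/40 skew.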

          // Use regular static steal if not enough chunks for skewed
          // distribution
          use_hybrid = (use_hybrid && (p_ntc >= num_procs_with_pcore &&
                                       e_ntc >= num_procs_with_ecore)
                            ? true
                            : false);
        } else {
          use_hybrid = false;
        }
      }
      pr->flags.use_hybrid = use_hybrid;
      pr->u.p.pchunks = p_ntc;
      pr->u.p.num_procs_with_pcore = num_procs_with_pcore;
      pr->u.p.first_thread_with_ecore = first_thread_with_ecore;

      if (use_hybrid) {
        KMP_DEBUG_ASSERT(nproc == num_procs_with_pcore + num_procs_with_ecore);
        T big_chunk = p_ntc / num_procs_with_pcore;
        small_chunk = e_ntc / num_procs_with_ecore;

        extras =
            (p_ntc % num_procs_with_pcore) + (e_ntc % num_procs_with_ecore);

        p_extra = (big_chunk - small_chunk);

        if (core_type == KMP_HW_CORE_TYPE_CORE) {
          if (id < first_thread_with_ecore) {
            init =
                id * small_chunk + id * p_extra + (id < extras ? id : extras);
          } else {
            init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
                   (id < extras ? id : extras);
          }
        } else {
          if (id == first_thread_with_ecore) {
            init =
                id * small_chunk + id * p_extra + (id < extras ? id : extras);
          } else {
            init = id * small_chunk + first_thread_with_ecore * p_extra +
                   (id < extras ? id : extras);
          }
        }
        p_extra = (core_type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;
      } else
#endif
      {
        small_chunk = ntc / nproc;
        extras = ntc % nproc;
        init = id * small_chunk + (id < extras ? id : extras);
        p_extra = 0;
      }
      pr->u.p.count = init;
      if (claimed) { // did we succeed in claiming our own buffer?
        pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
        // Other threads will inspect steal_flag when searching for a victim.
        // READY means other threads may steal from this thread from now on.
        KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
      } else {
        // a thief has already stolen our whole range
        KMP_DEBUG_ASSERT(pr->steal_flag == THIEF);
        pr->u.p.ub = init; // mark that there are no iterations to work on
      }
      pr->u.p.parm2 = ntc; // save number of chunks
      // parm3 is the number of times to attempt stealing, which is
      // nproc (just a heuristic; could be optimized later on).
      pr->u.p.parm3 = nproc;
      pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
      break;
    } else {
      /* too few chunks: switching to kmp_sch_dynamic_chunked */
      schedule = kmp_sch_dynamic_chunked;
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
                     "kmp_sch_dynamic_chunked\n",
                     gtid));
      goto dynamic_init;
      break;
    } // if
  } // case
#endif
  case kmp_sch_static_balanced: {
    T init, limit;

    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));

    if (nproc > 1) {
      T id = tid;

      if (tc < nproc) {
        if (id < tc) {
          init = id;
          limit = id;
          pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
        } else {
          pr->u.p.count = 1; /* means no more chunks to execute */
          pr->u.p.parm1 = FALSE;
          break;
        }
      } else {
        T small_chunk = tc / nproc;
        T extras = tc % nproc;
        init = id * small_chunk + (id < extras ? id : extras);
        limit = init + small_chunk - (id < extras ? 0 : 1);
        pr->u.p.parm1 = (id == nproc - 1);
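        // E.g., tc=10, nproc=4: small_chunk=2, extras=2, so thread 1 covers
        // iterations 3..5 (init=3, limit=5) and thread 3 covers 8..9.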
      }
    } else {
      if (tc > 0) {
        init = 0;
        limit = tc - 1;
        pr->u.p.parm1 = TRUE;
      } else {
        // zero trip count
        pr->u.p.count = 1; /* means no more chunks to execute */
        pr->u.p.parm1 = FALSE;
        break;
      }
    }
#if USE_ITT_BUILD
    // Calculate chunk for metadata report
    if (itt_need_metadata_reporting)
      if (cur_chunk)
        *cur_chunk = limit - init + 1;
#endif
    if (st == 1) {
      pr->u.p.lb = lb + init;
      pr->u.p.ub = lb + limit;
    } else {
      // calculated upper bound, "ub" is user-defined upper bound
      T ub_tmp = lb + limit * st;
      pr->u.p.lb = lb + init * st;
      // adjust upper bound to "ub" if needed, so that MS lastprivate will match
      // it exactly
      if (st > 0) {
        pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
      } else {
        pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
      }
    }
    if (pr->flags.ordered) {
      pr->u.p.ordered_lower = init;
      pr->u.p.ordered_upper = limit;
    }
    break;
  } // case
  case kmp_sch_static_balanced_chunked: {
    // similar to balanced, but chunk adjusted to multiple of simd width
    T nth = nproc;
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
                   " -> falling-through to static_greedy\n",
                   gtid));
    schedule = kmp_sch_static_greedy;
    if (nth > 1)
      pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
    else
      pr->u.p.parm1 = tc;
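    // The mask trick rounds the per-thread share up to a multiple of chunk
    // (the simd width, a power of two); e.g., tc=100, nth=8, chunk=8:
    // ceil(100/8)=13, then (13+7) & ~7 = 16 iterations per thread.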
    break;
  } // case
  case kmp_sch_guided_simd:
  case kmp_sch_guided_iterative_chunked: {
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
         " case\n",
         gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
        goto dynamic_init;
      } else {
        // when remaining iters become less than parm2 - switch to dynamic
        pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
        *(double *)&pr->u.p.parm3 =
            guided_flt_param / (double)nproc; // may occupy parm3 and parm4
      }
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      KD_TRACE(
          100,
          ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
           gtid));
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
  case kmp_sch_guided_analytical_chunked: {
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
        goto dynamic_init;
      } else {
        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        DBL x;

#if KMP_USE_X87CONTROL
        /* Linux* OS already has 64-bit computation by default for long double,
           and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
           Windows* OS on IA-32 architecture, we need to set precision to 64-bit
           instead of the default 53-bit. Even though long double doesn't work
           on Windows* OS on Intel(R) 64, the resulting lack of precision is not
           expected to impact the correctness of the algorithm, but this has not
           been mathematically proven. */
        // save original FPCW and set precision to 64-bit, as
        // Windows* OS on IA-32 architecture defaults to 53-bit
        unsigned int oldFpcw = _control87(0, 0);
        _control87(_PC_64, _MCW_PC); // 0,0x30000
#endif
        /* value used for comparison in solver for cross-over point */
        KMP_ASSERT(tc > 0);
        long double target = ((long double)chunk * 2 + 1) * nproc / tc;

        /* crossover point--chunk indexes equal to or greater than
           this point switch to dynamic-style scheduling */
        UT cross;

        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        x = 1.0 - 0.5 / (double)nproc;

#ifdef KMP_DEBUG
        { // test natural alignment
          struct _test_a {
            char a;
            union {
              char b;
              DBL d;
            };
          } t;
          ptrdiff_t natural_alignment =
              (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
          //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
          // long)natural_alignment );
          KMP_DEBUG_ASSERT(
              (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
        }
#endif // KMP_DEBUG

        /* save the term in thread private dispatch structure */
        *(DBL *)&pr->u.p.parm3 = x;

        /* solve for the crossover point to the nearest integer i for which C_i
           <= chunk */
        {
          UT left, right, mid;
          long double p;

          /* estimate initial upper and lower bound */

          /* doesn't matter what value right is as long as it is positive, but
             it affects performance of the solver */
          right = 229;
          p = __kmp_pow<UT>(x, right);
          if (p > target) {
            do {
              p *= p;
              right <<= 1;
            } while (p > target && right < (1 << 27));
            /* lower bound is previous (failed) estimate of upper bound */
            left = right >> 1;
          } else {
            left = 0;
          }

          /* bisection root-finding method */
          while (left + 1 < right) {
            mid = (left + right) / 2;
            if (__kmp_pow<UT>(x, mid) > target) {
              left = mid;
            } else {
              right = mid;
            }
          } // while
          cross = right;
        }
        /* assert sanity of computed crossover point */
        KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
                   __kmp_pow<UT>(x, cross) <= target);
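        // Worked example (illustrative): nproc=8, chunk=4, tc=1000 gives
        // x = 1 - 0.5/8 = 0.9375 and target = 9*8/1000 = 0.072; the solver
        // finds cross = 41, since 0.9375^40 ~= 0.0757 > 0.072 while
        // 0.9375^41 ~= 0.0709 <= 0.072.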

        /* save the crossover point in thread private dispatch structure */
        pr->u.p.parm2 = cross;

// C75803
#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
#else
#define GUIDED_ANALYTICAL_WORKAROUND (x)
#endif
        /* dynamic-style scheduling offset */
        pr->u.p.count = tc -
                        __kmp_dispatch_guided_remaining(
                            tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
                        cross * chunk;
#if KMP_USE_X87CONTROL
        // restore FPCW
        _control87(oldFpcw, _MCW_PC);
#endif
      } // if
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
  case kmp_sch_static_greedy:
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
         gtid));
    pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
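    // parm1 is the greedy per-thread chunk, e.g., ceil(10/4) = 3 for tc=10,
    // nproc=4 (the last thread then gets the single remaining iteration).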
    break;
  case kmp_sch_static_chunked:
  case kmp_sch_dynamic_chunked:
  dynamic_init:
    if (tc == 0)
      break;
    if (pr->u.p.parm1 <= 0)
      pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
    else if (pr->u.p.parm1 > tc)
      pr->u.p.parm1 = tc;
    // Store the total number of chunks to prevent integer overflow during
    // bounds calculations in the get next chunk routine.
    pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? 1 : 0);
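    // e.g., tc=10, chunk=3 -> parm2 = 3 + 1 = 4 chunks (sizes 3, 3, 3, 1)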
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
                   gtid));
    break;
  case kmp_sch_trapezoidal: {
    /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */

    T parm1, parm2, parm3, parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    parm1 = chunk;

    /* F : size of the first cycle */
    parm2 = (tc / (2 * nproc));

    if (parm2 < 1) {
      parm2 = 1;
    }

    /* L : size of the last cycle.  Make sure the last cycle is not larger
       than the first cycle. */
    if (parm1 < 1) {
      parm1 = 1;
    } else if (parm1 > parm2) {
      parm1 = parm2;
    }

    /* N : number of cycles */
    parm3 = (parm2 + parm1);
    parm3 = (2 * tc + parm3 - 1) / parm3;

    if (parm3 < 2) {
      parm3 = 2;
    }

    /* sigma : decreasing incr of the trapezoid */
    parm4 = (parm3 - 1);
    parm4 = (parm2 - parm1) / parm4;

    // pointless check, because parm4 >= 0 always
    // if ( parm4 < 0 ) {
    //    parm4 = 0;
    //}

    pr->u.p.parm1 = parm1;
    pr->u.p.parm2 = parm2;
    pr->u.p.parm3 = parm3;
    pr->u.p.parm4 = parm4;
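    // Worked example (illustrative): tc=1000, nproc=4, chunk=10 gives
    // F=parm2=1000/8=125, L=parm1=10, N=parm3=(2000+134)/135=15 cycles, and
    // sigma=parm4=(125-10)/14=8, so chunk sizes decrease 125, 117, 109, ...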
  } // case
  break;

  default: {
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null // Variadic argument list terminator
    );
  } break;
  } // switch
  pr->schedule = schedule;
}

#if KMP_USE_HIER_SCHED
template <typename T>
inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
                                             typename traits_t<T>::signed_t st);
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
                                            kmp_int32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_int32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
                                             kmp_uint32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
                                            kmp_int64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_int64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
                                             kmp_uint64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}

// free all the hierarchy scheduling memory associated with the team
void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  for (int i = 0; i < num_disp_buff; ++i) {
    // type does not matter here so use kmp_int32
    auto sh =
        reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
            &team->t.t_disp_buffer[i]);
    if (sh->hier) {
      sh->hier->deallocate();
      __kmp_free(sh->hier);
    }
  }
}
#endif

// UT - unsigned flavor of T, ST - signed flavor of T,
// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
template <typename T>
static void
__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
                    T ub, typename traits_t<T>::signed_t st,
                    typename traits_t<T>::signed_t chunk, int push_ws) {
  typedef typename traits_t<T>::unsigned_t UT;

  int active;
  kmp_info_t *th;
  kmp_team_t *team;
  kmp_uint32 my_buffer_index;
  dispatch_private_info_template<T> *pr;
  dispatch_shared_info_template<T> volatile *sh;

  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
                   sizeof(dispatch_private_info));
  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
                   sizeof(dispatch_shared_info));
  __kmp_assert_valid_gtid(gtid);

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_INIT();
#endif
#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
                            "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
                            traits_t<ST>::spec, traits_t<T>::spec,
                            traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;

  // Any half-decent optimizer will remove this test when the blocks are empty
  // since the macros expand to nothing when statistics are disabled.
  if (schedule == __kmp_static) {
    KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
  } else {
    KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
  }

#if KMP_USE_HIER_SCHED
  // Initialize the scheduling hierarchy if requested in the OMP_SCHEDULE
  // environment variable. Hierarchical scheduling does not work with ordered,
  // so if ordered is detected, then revert back to threaded scheduling.
  bool ordered;
  enum sched_type my_sched = schedule;
  my_buffer_index = th->th.th_dispatch->th_disp_index;
  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
      &th->th.th_dispatch
           ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
  if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
    my_sched =
        (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
  ordered = (kmp_ord_lower & my_sched);
  if (pr->flags.use_hier) {
    if (ordered) {
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected.  "
                     "Disabling hierarchical scheduling.\n",
                     gtid));
      pr->flags.use_hier = FALSE;
    }
  }
  if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
    // Don't use hierarchical for ordered parallel loops and don't
    // use the runtime hierarchy if one was specified in the program
    if (!ordered && !pr->flags.use_hier)
      __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
  }
#endif // KMP_USE_HIER_SCHED

#if USE_ITT_BUILD
  kmp_uint64 cur_chunk = chunk;
  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;
#endif
  if (!active) {
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
  } else {
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    my_buffer_index = th->th.th_dispatch->th_disp_index++;

    /* What happens when number of threads changes, need to resize buffer? */
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        &th->th.th_dispatch
             ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
                  my_buffer_index));
    if (sh->buffer_index != my_buffer_index) { // too many loops in progress?
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d"
                     " sh->buffer_index:%d\n",
                     gtid, my_buffer_index, sh->buffer_index));
      __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
                             __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
      // Note: KMP_WAIT() cannot be used here: buffer index and
      // my_buffer_index are *always* 32-bit integers.
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
                     "sh->buffer_index:%d\n",
                     gtid, my_buffer_index, sh->buffer_index));
    }
1083227068Sambrisko  }
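  // The modulo indexing above rotates through __kmp_dispatch_num_buffers
  // (typically 7) per-loop buffers, so back-to-back nowait loops do not reuse
  // a buffer that slower threads are still draining. A rough sketch of the
  // idea, with hypothetical loop numbers:
  //
  //   loop 0 -> buffer 0, loop 1 -> buffer 1, ..., loop 7 -> buffer 0 again,
  //   but only after sh->buffer_index has caught up, i.e. after every thread
  //   in the team has finished loop 0 and released buffer 0.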

  __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
#if USE_ITT_BUILD
                                &cur_chunk,
#endif
                                chunk, (T)th->th.th_team_nproc,
                                (T)th->th.th_info.ds.ds_tid);
  if (active) {
    if (pr->flags.ordered == 0) {
      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
    } else {
      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
    }
    th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
    th->th.th_dispatch->th_dispatch_sh_current =
        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
#if USE_ITT_BUILD
    if (pr->flags.ordered) {
      __kmp_itt_ordered_init(gtid);
    }
    // Report loop metadata
    if (itt_need_metadata_reporting) {
      // Only the primary thread of an active team at level 1 reports metadata
      kmp_uint64 schedtype = 0;
      switch (schedule) {
      case kmp_sch_static_chunked:
      case kmp_sch_static_balanced: // Chunk is calculated in the switch above
        break;
      case kmp_sch_static_greedy:
        cur_chunk = pr->u.p.parm1;
        break;
      case kmp_sch_dynamic_chunked:
        schedtype = 1;
        break;
      case kmp_sch_guided_iterative_chunked:
      case kmp_sch_guided_analytical_chunked:
      case kmp_sch_guided_simd:
        schedtype = 2;
        break;
      default:
        // Should we put this case under "static"?
        // case kmp_sch_static_steal:
        schedtype = 3;
        break;
      }
      __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
    }
#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier) {
      pr->u.p.count = 0;
      pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
    }
#endif // KMP_USE_HIER_SCHED
#endif /* USE_ITT_BUILD */
  }

#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
        "lb:%%%s ub:%%%s"
        " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
        " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
        traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
        traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
        traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
        traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
                  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
                  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
                  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
    __kmp_str_free(&buff);
  }
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
}

/* For ordered loops, either __kmp_dispatch_finish() should be called after
 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
 * every chunk of iterations.  If the ordered section(s) were not executed
 * for this iteration (or for every iteration in this chunk), we need to set
 * the ordered iteration counters so that the next thread can proceed. */
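
/* As a rough, illustrative shape only (entry-point names from the public
 * __kmpc_* interface; the exact sequence varies by compiler), a
 * compiler-generated ordered dynamic loop drives this machinery like:
 *
 *   __kmpc_dispatch_init_4(...);                  // __kmp_dispatch_init
 *   while (__kmpc_dispatch_next_4(..., &lb, &ub, &st)) {
 *     for (i = lb; i <= ub; i += st) {
 *       __kmpc_ordered(...);       // waits via th_deo_fcn (__kmp_dispatch_deo)
 *       ...                        // ordered region body
 *       __kmpc_end_ordered(...);   // bumps ordered_bumped via th_dxo_fcn
 *       __kmpc_dispatch_fini_4(...); // __kmp_dispatch_finish, per iteration
 *     }
 *   }
 */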
template <typename UT>
static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {

    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    if (pr->ordered_bumped) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      UT lower = pr->u.p.ordered_lower;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
      KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
    } // if
  } // if
  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
}

#ifdef KMP_GOMP_COMPAT

template <typename UT>
static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {
    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    UT lower = pr->u.p.ordered_lower;
    UT upper = pr->u.p.ordered_upper;
    UT inc = upper - lower + 1;

    if (pr->ordered_bumped == inc) {
      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
                      "ordered_bumped to zero\n",
                      gtid));
      pr->ordered_bumped = 0;
    } else {
      inc -= pr->ordered_bumped;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d before wait: "
            "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));

      KMP_MB(); /* is this necessary? */
      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
                      "ordered_bumped to zero\n",
                      gtid));
      pr->ordered_bumped = 0;
      // TODO: check whether inc should be unsigned or signed
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d after wait: "
            "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
            traits_t<UT>::spec);
        KD_TRACE(1000,
                 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
    }
  }
  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
}
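
// A worked example with hypothetical numbers: for a chunk covering ordered
// iterations lower=10..upper=14, inc starts at 5. If this thread already
// executed two ordered regions in the chunk (ordered_bumped == 2), only
// inc = 3 synthetic bumps are added after the wait, so the shared counter
// sh->u.s.ordered_iteration still advances by exactly one per iteration of
// the chunk and the next thread's wait can complete.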

#endif /* KMP_GOMP_COMPAT */

template <typename T>
int __kmp_dispatch_next_algorithm(int gtid,
                                  dispatch_private_info_template<T> *pr,
                                  dispatch_shared_info_template<T> volatile *sh,
                                  kmp_int32 *p_last, T *p_lb, T *p_ub,
                                  typename traits_t<T>::signed_t *p_st, T nproc,
                                  T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::floating_t DBL;
  int status = 0;
  bool last = false;
  T start;
  ST incr;
  UT limit, trip, init;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
  KMP_DEBUG_ASSERT(pr);
  KMP_DEBUG_ASSERT(sh);
  KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff =
        __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
                         "sh:%%p nproc:%%%s tid:%%%s\n",
                         traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
    __kmp_str_free(&buff);
  }
#endif

  // zero trip count
  if (pr->u.p.tc == 0) {
    KD_TRACE(10,
             ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
              "zero status:%d\n",
              gtid, status));
    return 0;
  }

  switch (pr->schedule) {
#if KMP_STATIC_STEAL_ENABLED
  case kmp_sch_static_steal: {
    T chunk = pr->u.p.parm1;
    UT nchunks = pr->u.p.parm2;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    trip = pr->u.p.tc - 1;

    if (traits_t<T>::type_size > 4) {
      // use lock for 8-byte induction variable.
      // TODO (optional): check presence and use 16-byte CAS
      kmp_lock_t *lck = pr->u.p.steal_lock;
      KMP_DEBUG_ASSERT(lck != NULL);
      if (pr->u.p.count < (UT)pr->u.p.ub) {
        KMP_DEBUG_ASSERT(pr->steal_flag == READY);
        __kmp_acquire_lock(lck, gtid);
        // try to get own chunk of iterations
        init = (pr->u.p.count)++;
        status = (init < (UT)pr->u.p.ub);
        __kmp_release_lock(lck, gtid);
      } else {
        status = 0; // no own chunks
      }
      if (!status) { // try to steal
        kmp_lock_t *lckv; // victim buffer's lock
        T while_limit = pr->u.p.parm3;
        T while_index = 0;
        int idx = (th->th.th_dispatch->th_disp_index - 1) %
                  __kmp_dispatch_num_buffers; // current loop index
        // note: victim thread can potentially execute another loop
        KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
        while ((!status) && (while_limit != ++while_index)) {
          dispatch_private_info_template<T> *v;
          T remaining;
          T victimId = pr->u.p.parm4;
          T oldVictimId = victimId ? victimId - 1 : nproc - 1;
          v = reinterpret_cast<dispatch_private_info_template<T> *>(
              &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
          KMP_DEBUG_ASSERT(v);
          while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
                 oldVictimId != victimId) {
            victimId = (victimId + 1) % nproc;
            v = reinterpret_cast<dispatch_private_info_template<T> *>(
                &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
            KMP_DEBUG_ASSERT(v);
          }
          if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
            continue; // try once more (nproc attempts in total)
          }
          if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
            kmp_uint32 old = UNUSED;
            // try to steal whole range from inactive victim
            status = v->steal_flag.compare_exchange_strong(old, THIEF);
            if (status) {
              // initialize self buffer with victim's whole range of chunks
              T id = victimId;
              T small_chunk = 0, extras = 0, p_extra = 0;
              __kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,
                                              init, small_chunk, extras,
                                              p_extra);
              __kmp_acquire_lock(lck, gtid);
              pr->u.p.count = init + 1; // exclude the chunk we execute now
              pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
              __kmp_release_lock(lck, gtid);
              pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
              // no need to reinitialize other thread invariants: lb, st, etc.
#ifdef KMP_DEBUG
              {
                char *buff;
                // create format specifiers before the debug output
                buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
                                        "stolen chunks from T#%%d, "
                                        "count:%%%s ub:%%%s\n",
                                        traits_t<UT>::spec, traits_t<T>::spec);
                KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
                __kmp_str_free(&buff);
              }
#endif
              // activate non-empty buffer and let others steal from us
              if (pr->u.p.count < (UT)pr->u.p.ub)
                KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
              break;
            }
          }
          if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
              v->u.p.count >= (UT)v->u.p.ub) {
            pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
            continue; // no chunks to steal, try next victim
          }
          lckv = v->u.p.steal_lock;
          KMP_ASSERT(lckv != NULL);
          __kmp_acquire_lock(lckv, gtid);
          limit = v->u.p.ub; // keep initial ub
          if (v->u.p.count >= limit) {
            __kmp_release_lock(lckv, gtid);
            pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
            continue; // no chunks to steal, try next victim
          }

          // stealing succeeded, reduce victim's ub by 1/4 of undone chunks
          // TODO: is this heuristic good enough?
          remaining = limit - v->u.p.count;
          if (remaining > 7) {
            // steal 1/4 of remaining
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
            init = (v->u.p.ub -= (remaining >> 2));
          } else {
            // steal 1 chunk of 1..7 remaining
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
            init = (v->u.p.ub -= 1);
          }
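          // For example (hypothetical numbers): with remaining == 20 undone
          // chunks, the thief takes 20 >> 2 == 5 chunks from the tail, so the
          // victim's ub drops by 5 and the thief's chunks become [init, limit)
          // with limit - init == 5. With remaining <= 7 it takes exactly one
          // chunk, which keeps the tail from being over-subdivided.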
          __kmp_release_lock(lckv, gtid);
#ifdef KMP_DEBUG
          {
            char *buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format(
                "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
                "count:%%%s ub:%%%s\n",
                traits_t<UT>::spec, traits_t<UT>::spec);
            KD_TRACE(10, (buff, gtid, victimId, init, limit));
            __kmp_str_free(&buff);
          }
#endif
          KMP_DEBUG_ASSERT(init + 1 <= limit);
          pr->u.p.parm4 = victimId; // remember victim to steal from
          status = 1;
          // now update own count and ub with stolen range excluding init chunk
          __kmp_acquire_lock(lck, gtid);
          pr->u.p.count = init + 1;
          pr->u.p.ub = limit;
          __kmp_release_lock(lck, gtid);
          // activate non-empty buffer and let others steal from us
          if (init + 1 < limit)
            KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
        } // while (search for victim)
      } // if (try to find victim and steal)
    } else {
      // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
      // as all operations on pair (count, ub) must be done atomically
      typedef union {
        struct {
          UT count;
          T ub;
        } p;
        kmp_int64 b;
      } union_i4;
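      // The union lets the (count, ub) pair be read, compared, and swapped as
      // one 64-bit word, so a thief and the owner can race on the same
      // descriptor without a lock. A minimal standalone sketch of the same
      // idiom (illustrative only, using std::atomic rather than the runtime's
      // KMP_COMPARE_AND_STORE_REL64):
      //
      //   union pair64 { struct { uint32_t count; uint32_t ub; } p;
      //                  uint64_t b; };
      //   std::atomic<uint64_t> word;
      //   pair64 oldv, newv;
      //   do {
      //     oldv.b = word.load();
      //     newv = oldv;
      //     newv.p.count++; // or newv.p.ub -= k for a steal from the tail
      //   } while (!word.compare_exchange_weak(oldv.b, newv.b));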
      union_i4 vold, vnew;
      if (pr->u.p.count < (UT)pr->u.p.ub) {
        KMP_DEBUG_ASSERT(pr->steal_flag == READY);
        vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
        vnew.b = vold.b;
        vnew.p.count++; // get chunk from head of self range
        while (!KMP_COMPARE_AND_STORE_REL64(
            (volatile kmp_int64 *)&pr->u.p.count,
            *VOLATILE_CAST(kmp_int64 *) & vold.b,
            *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
          KMP_CPU_PAUSE();
          vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
          vnew.b = vold.b;
          vnew.p.count++;
        }
        init = vold.p.count;
        status = (init < (UT)vold.p.ub);
      } else {
        status = 0; // no own chunks
      }
      if (!status) { // try to steal
        T while_limit = pr->u.p.parm3;
        T while_index = 0;
        int idx = (th->th.th_dispatch->th_disp_index - 1) %
                  __kmp_dispatch_num_buffers; // current loop index
        // note: victim thread can potentially execute another loop
        KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
        while ((!status) && (while_limit != ++while_index)) {
          dispatch_private_info_template<T> *v;
          T remaining;
          T victimId = pr->u.p.parm4;
          T oldVictimId = victimId ? victimId - 1 : nproc - 1;
          v = reinterpret_cast<dispatch_private_info_template<T> *>(
              &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
          KMP_DEBUG_ASSERT(v);
          while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
                 oldVictimId != victimId) {
            victimId = (victimId + 1) % nproc;
            v = reinterpret_cast<dispatch_private_info_template<T> *>(
                &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
            KMP_DEBUG_ASSERT(v);
          }
          if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
            continue; // try once more (nproc attempts in total)
          }
          if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
            kmp_uint32 old = UNUSED;
            // try to steal whole range from inactive victim
            status = v->steal_flag.compare_exchange_strong(old, THIEF);
            if (status) {
              // initialize self buffer with victim's whole range of chunks
              T id = victimId;
              T small_chunk = 0, extras = 0, p_extra = 0;
              __kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,
                                              init, small_chunk, extras,
                                              p_extra);
              vnew.p.count = init + 1;
              vnew.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
              // write pair (count, ub) at once atomically
#if KMP_ARCH_X86
              KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b);
#else
              *(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b;
#endif
              pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
              // no need to initialize other thread invariants: lb, st, etc.
#ifdef KMP_DEBUG
              {
                char *buff;
                // create format specifiers before the debug output
                buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
                                        "stolen chunks from T#%%d, "
                                        "count:%%%s ub:%%%s\n",
                                        traits_t<UT>::spec, traits_t<T>::spec);
                KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
                __kmp_str_free(&buff);
              }
#endif
              // activate non-empty buffer and let others steal from us
              if (pr->u.p.count < (UT)pr->u.p.ub)
                KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
              break;
            }
          }
          while (1) { // CAS loop with check if victim still has enough chunks
            // many threads may be stealing concurrently from same victim
            vold.b = *(volatile kmp_int64 *)(&v->u.p.count);
            if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
                vold.p.count >= (UT)vold.p.ub) {
              pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim id
              break; // no chunks to steal, try next victim
            }
            vnew.b = vold.b;
            remaining = vold.p.ub - vold.p.count;
            // try to steal 1/4 of remaining
            // TODO: is this heuristic good enough?
            if (remaining > 7) {
              vnew.p.ub -= remaining >> 2; // steal from tail of victim's range
            } else {
              vnew.p.ub -= 1; // steal 1 chunk of 1..7 remaining
            }
            KMP_DEBUG_ASSERT(vnew.p.ub * (UT)chunk <= trip);
            if (KMP_COMPARE_AND_STORE_REL64(
                    (volatile kmp_int64 *)&v->u.p.count,
                    *VOLATILE_CAST(kmp_int64 *) & vold.b,
                    *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
              // stealing succeeded
#ifdef KMP_DEBUG
              {
                char *buff;
                // create format specifiers before the debug output
                buff = __kmp_str_format(
                    "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
                    "count:%%%s ub:%%%s\n",
                    traits_t<T>::spec, traits_t<T>::spec);
                KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub));
                __kmp_str_free(&buff);
              }
#endif
              KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
                                        vold.p.ub - vnew.p.ub);
              status = 1;
              pr->u.p.parm4 = victimId; // keep victim id
              // now update own count and ub
              init = vnew.p.ub;
              vold.p.count = init + 1;
#if KMP_ARCH_X86
              KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
#else
              *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
#endif
              // activate non-empty buffer and let others steal from us
              if (vold.p.count < (UT)vold.p.ub)
                KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
              break;
            } // if (check CAS result)
            KMP_CPU_PAUSE(); // CAS failed, repeat the attempt
          } // while (try to steal from particular victim)
        } // while (search for victim)
      } // if (try to find victim and steal)
    } // if (4-byte induction variable)
    if (!status) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.lb;
      init *= chunk;
      limit = chunk + init - 1;
      incr = pr->u.p.st;
      KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);

      KMP_DEBUG_ASSERT(init <= trip);
      // keep track of done chunks for possible early exit from stealing
      // TODO: count executed chunks locally with rare update of shared location
      // test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
      if ((last = (limit >= trip)) != 0)
        limit = trip;
      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }
    } // if
    break;
  } // case
#endif // KMP_STATIC_STEAL_ENABLED
  case kmp_sch_static_balanced: {
    KD_TRACE(
        10,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));
    /* check if thread has any iteration to do */
    if ((status = !pr->u.p.count) != 0) {
      pr->u.p.count = 1;
      *p_lb = pr->u.p.lb;
      *p_ub = pr->u.p.ub;
      last = (pr->u.p.parm1 != 0);
      if (p_st != NULL)
        *p_st = pr->u.p.st;
    } else { /* no iterations to do */
      pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
    }
  } // case
  break;
  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
                                 merged here */
  case kmp_sch_static_chunked: {
    T parm1;

    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_static_[affinity|chunked] case\n",
                   gtid));
    parm1 = pr->u.p.parm1;

    trip = pr->u.p.tc - 1;
    init = parm1 * (pr->u.p.count + tid);

    if ((status = (init <= trip)) != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      limit = parm1 + init - 1;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      pr->u.p.count += nproc;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;
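
  // Round-robin assignment in the case above, with hypothetical numbers:
  // nproc = 4, chunk size parm1 = 10, tid = 1. On the thread's k-th call
  // pr->u.p.count == 4k, so init = 10 * (4k + 1): relative to the loop's
  // lower bound (stride 1) it gets iterations 10..19, then 50..59, then
  // 90..99, and so on until init > trip.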

  case kmp_sch_dynamic_chunked: {
    UT chunk_number;
    UT chunk_size = pr->u.p.parm1;
    UT nchunks = pr->u.p.parm2;

    KD_TRACE(
        100,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
         gtid));

    chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
    status = (chunk_number < nchunks);
    if (!status) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      init = chunk_size * chunk_number;
      trip = pr->u.p.tc - 1;
      start = pr->u.p.lb;
      incr = pr->u.p.st;

      if ((last = (trip - init < (UT)chunk_size)))
        limit = trip;
      else
        limit = chunk_size + init - 1;

      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;
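
  // Dynamic dispatch above is a single fetch-and-increment on the shared
  // chunk counter. With hypothetical numbers tc = 100 and chunk_size = 8
  // (so nchunks = 13), a thread that draws chunk_number = 12 gets init = 96
  // and, since trip - init = 3 < 8, takes the final short chunk with
  // limit = trip = 99 and last set.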

  case kmp_sch_guided_iterative_chunked: {
    T chunkspec = pr->u.p.parm1;
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
                   "iterative case\n",
                   gtid));
    trip = pr->u.p.tc;
    // Start atomic part of calculations
    while (1) {
      ST remaining; // signed, because can be < 0
      init = sh->u.s.iteration; // shared value
      remaining = trip - init;
      if (remaining <= 0) { // AC: need to compare with 0 first
        // nothing to do, don't try atomic op
        status = 0;
        break;
      }
      if ((T)remaining <
          pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
        // use dynamic-style schedule
        // atomically increment iterations, get old value
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                 (ST)chunkspec);
        remaining = trip - init;
        if (remaining <= 0) {
          status = 0; // all iterations got by other threads
        } else {
          // got some iterations to work on
          status = 1;
          if ((T)remaining > chunkspec) {
            limit = init + chunkspec - 1;
          } else {
            last = true; // the last chunk
            limit = init + remaining - 1;
          } // if
        } // if
        break;
      } // if
      limit = init + (UT)((double)remaining *
                          *(double *)&pr->u.p.parm3); // divide by K*nproc
      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                               (ST)init, (ST)limit)) {
        // CAS was successful, chunk obtained
        status = 1;
        --limit;
        break;
      } // if
    } // while
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } // if
  } // case
  break;
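
  // Guided sizing above, with hypothetical numbers: nproc = 4 and
  // parm3 ~= 1/(K*nproc) = 0.125 for the default K = 2. With
  // remaining = 800 a thread claims floor(800 * 0.125) = 100 iterations;
  // the next claim sees remaining = 700 and takes 87, so chunks shrink
  // geometrically until remaining drops below parm2 and dispatch falls
  // back to fixed chunkspec-sized pieces.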

  case kmp_sch_guided_simd: {
    // same as iterative but curr-chunk adjusted to be multiple of given
    // chunk
    T chunk = pr->u.p.parm1;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
              gtid));
    trip = pr->u.p.tc;
    // Start atomic part of calculations
    while (1) {
      ST remaining; // signed, because can be < 0
      init = sh->u.s.iteration; // shared value
      remaining = trip - init;
      if (remaining <= 0) { // AC: need to compare with 0 first
        status = 0; // nothing to do, don't try atomic op
        break;
      }
      KMP_DEBUG_ASSERT(chunk && init % chunk == 0);
      // compare with K*nproc*(chunk+1), K=2 by default
      if ((T)remaining < pr->u.p.parm2) {
        // use dynamic-style schedule
        // atomically increment iterations, get old value
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                 (ST)chunk);
        remaining = trip - init;
        if (remaining <= 0) {
          status = 0; // all iterations got by other threads
        } else {
          // got some iterations to work on
          status = 1;
          if ((T)remaining > chunk) {
            limit = init + chunk - 1;
          } else {
            last = true; // the last chunk
            limit = init + remaining - 1;
          } // if
        } // if
        break;
      } // if
      // divide by K*nproc
      UT span;
      __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3),
                         &span);
      UT rem = span % chunk;
      if (rem) // adjust so that span%chunk == 0
        span += chunk - rem;
      limit = init + span;
      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                               (ST)init, (ST)limit)) {
        // CAS was successful, chunk obtained
        status = 1;
        --limit;
        break;
      } // if
    } // while
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } // if
  } // case
  break;
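
  // The rounding above keeps every claimed span a multiple of the SIMD chunk
  // so vector lanes stay fully populated. For example (hypothetical numbers):
  // chunk = 8 and a guided estimate of span = 27 gives rem = 3, so span is
  // rounded up to 32 and init stays 8-aligned for the next claimant.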

  case kmp_sch_guided_analytical_chunked: {
    T chunkspec = pr->u.p.parm1;
    UT chunkIdx;
#if KMP_USE_X87CONTROL
    /* for storing original FPCW value for Windows* OS on
       IA-32 architecture 8-byte version */
    unsigned int oldFpcw;
    unsigned int fpcwSet = 0;
#endif
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

    trip = pr->u.p.tc;

    KMP_DEBUG_ASSERT(nproc > 1);
    KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);

    while (1) { /* this while loop is a safeguard against unexpected zero
                   chunk sizes */
      chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
      if (chunkIdx >= (UT)pr->u.p.parm2) {
        --trip;
        /* use dynamic-style scheduling */
        init = chunkIdx * chunkspec + pr->u.p.count;
        /* need to verify init > 0 in case of overflow in the above
         * calculation */
        if ((status = (init > 0 && init <= trip)) != 0) {
          limit = init + chunkspec - 1;

          if ((last = (limit >= trip)) != 0)
            limit = trip;
        }
        break;
      } else {
/* use exponential-style scheduling */
/* The following check works around the lack of long double precision on
   Windows* OS, which can otherwise cause init != 0 for chunkIdx == 0. */
#if KMP_USE_X87CONTROL
        /* If we haven't already done so, save original
           FPCW and set precision to 64-bit, as Windows* OS
           on IA-32 architecture defaults to 53-bit */
        if (!fpcwSet) {
          oldFpcw = _control87(0, 0);
          _control87(_PC_64, _MCW_PC);
          fpcwSet = 0x30000;
        }
#endif
        if (chunkIdx) {
          init = __kmp_dispatch_guided_remaining<T>(
              trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
          KMP_DEBUG_ASSERT(init);
          init = trip - init;
        } else
          init = 0;
        limit = trip - __kmp_dispatch_guided_remaining<T>(
                           trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
        KMP_ASSERT(init <= limit);
        if (init < limit) {
          KMP_DEBUG_ASSERT(limit <= trip);
          --limit;
          status = 1;
          break;
        } // if
      } // if
    } // while (1)
#if KMP_USE_X87CONTROL
    /* restore FPCW if necessary
       AC: check fpcwSet flag first because oldFpcw can be uninitialized here
    */
    if (fpcwSet && (oldFpcw & fpcwSet))
      _control87(oldFpcw, _MCW_PC);
#endif
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      }
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    }
  } // case
  break;

  case kmp_sch_trapezoidal: {
    UT index;
    T parm2 = pr->u.p.parm2;
    T parm3 = pr->u.p.parm3;
    T parm4 = pr->u.p.parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);

    init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
    trip = pr->u.p.tc - 1;

    if ((status = ((T)index < parm3 && init <= trip)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.lb;
      limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
      incr = pr->u.p.st;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;
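
  // Trapezoidal chunks shrink linearly by parm4 per claim. A worked example
  // with hypothetical parameters: first chunk size parm2 = 10 and decrement
  // parm4 = 2 give successive chunk sizes 10, 8, 6, ... Chunk k (0-based)
  // starts at init = k*(2*parm2 - (k-1)*parm4)/2, so chunk 2 starts at
  // 2*(20 - 2)/2 = 18 and ends at 3*(20 - 4)/2 - 1 = 23, a 6-iteration chunk.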
  default: {
    status = 0; // to avoid complaints on uninitialized variable use
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null // Variadic argument list terminator
    );
  } break;
  } // switch
  if (p_last)
    *p_last = last;
#ifdef KMP_DEBUG
  if (pr->flags.ordered) {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
                            "ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t<UT>::spec, traits_t<UT>::spec);
    KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
    __kmp_str_free(&buff);
  }
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
        "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
    KMP_DEBUG_ASSERT(p_last);
    KMP_DEBUG_ASSERT(p_st);
    KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
    __kmp_str_free(&buff);
  }
#endif
  return status;
}

/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
   work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
   is not called. */
#if OMPT_SUPPORT && OMPT_OPTIONAL
#define OMPT_LOOP_END                                                          \
  if (status == 0) {                                                           \
    if (ompt_enabled.ompt_callback_work) {                                     \
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
      ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
          ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
          &(task_info->task_data), 0, codeptr);                                \
    }                                                                          \
  }
#define OMPT_LOOP_DISPATCH(lb, ub, st, status)                                 \
  if (ompt_enabled.ompt_callback_dispatch && status) {                         \
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);                \
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);              \
    ompt_dispatch_chunk_t chunk;                                               \
    ompt_data_t instance = ompt_data_none;                                     \
    OMPT_GET_DISPATCH_CHUNK(chunk, lb, ub, st);                                \
    instance.ptr = &chunk;                                                     \
    ompt_callbacks.ompt_callback(ompt_callback_dispatch)(                      \
        &(team_info->parallel_data), &(task_info->task_data),                  \
        ompt_dispatch_ws_loop_chunk, instance);                                \
  }
// TODO: implement count
#else
#define OMPT_LOOP_END // no-op
#define OMPT_LOOP_DISPATCH(lb, ub, st, status) // no-op
#endif

#if KMP_STATS_ENABLED
#define KMP_STATS_LOOP_END                                                     \
  {                                                                            \
    kmp_int64 u, l, t, i;                                                      \
    l = (kmp_int64)(*p_lb);                                                    \
    u = (kmp_int64)(*p_ub);                                                    \
    i = (kmp_int64)(pr->u.p.st);                                               \
    if (status == 0) {                                                         \
      t = 0;                                                                   \
      KMP_POP_PARTITIONED_TIMER();                                             \
    } else if (i == 1) {                                                       \
      if (u >= l)                                                              \
        t = u - l + 1;                                                         \
      else                                                                     \
        t = 0;                                                                 \
    } else if (i < 0) {                                                        \
      if (l >= u)                                                              \
        t = (l - u) / (-i) + 1;                                                \
      else                                                                     \
        t = 0;                                                                 \
    } else {                                                                   \
      if (u >= l)                                                              \
        t = (u - l) / i + 1;                                                   \
      else                                                                     \
        t = 0;                                                                 \
    }                                                                          \
    KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
  }
#else
#define KMP_STATS_LOOP_END /* Nothing */
#endif
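
// KMP_STATS_LOOP_END computes the chunk's trip count from the bounds and
// stride just handed back. For example (hypothetical values): *p_lb = 3,
// *p_ub = 15, stride i = 4 give t = (15 - 3) / 4 + 1 = 4 iterations
// (3, 7, 11, 15); a negative stride mirrors the formula as (l - u)/(-i) + 1.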

template <typename T>
static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
                               T *p_lb, T *p_ub,
                               typename traits_t<T>::signed_t *p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                               ,
                               void *codeptr
#endif
) {

  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  // This is potentially slightly misleading: schedule(runtime) will appear
  // here even if the actual runtime schedule is static. (Which points out a
  // disadvantage of schedule(runtime): even when static scheduling is used,
  // it costs more than a compile-time choice of static scheduling would.)
  KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);

  int status;
  dispatch_private_info_template<T> *pr;
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
  KD_TRACE(
      1000,
      ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
       gtid, p_lb, p_ub, p_st, p_last));

  if (team->t.t_serialized) {
    /* NOTE: serialize this dispatch because we are not at the active level */
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
    KMP_DEBUG_ASSERT(pr);

    if ((status = (pr->u.p.tc != 0)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      //            if ( p_last != NULL )
      //                *p_last = 0;
      if (p_st != NULL)
        *p_st = 0;
      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }
    } else if (pr->flags.nomerge) {
      kmp_int32 last;
      T start;
      UT limit, trip, init;
      ST incr;
      T chunk = pr->u.p.parm1;

      KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
                     gtid));

      init = chunk * pr->u.p.count++;
      trip = pr->u.p.tc - 1;

      if ((status = (init <= trip)) == 0) {
        *p_lb = 0;
        *p_ub = 0;
        //                if ( p_last != NULL )
        //                    *p_last = 0;
        if (p_st != NULL)
          *p_st = 0;
        if (__kmp_env_consistency_check) {
          if (pr->pushed_ws != ct_none) {
            pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
          }
        }
      } else {
        start = pr->u.p.lb;
        limit = chunk + init - 1;
        incr = pr->u.p.st;

        if ((last = (limit >= trip)) != 0) {
          limit = trip;
#if KMP_OS_WINDOWS
          pr->u.p.last_upper = pr->u.p.ub;
#endif /* KMP_OS_WINDOWS */
        }
        if (p_last != NULL)
          *p_last = last;
        if (p_st != NULL)
          *p_st = incr;
        if (incr == 1) {
          *p_lb = start + init;
          *p_ub = start + limit;
        } else {
          *p_lb = start + init * incr;
          *p_ub = start + limit * incr;
        }

        if (pr->flags.ordered) {
          pr->u.p.ordered_lower = init;
          pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
          {
            char *buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
                                    "ordered_lower:%%%s ordered_upper:%%%s\n",
                                    traits_t<UT>::spec, traits_t<UT>::spec);
            KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
                            pr->u.p.ordered_upper));
            __kmp_str_free(&buff);
          }
#endif
        } // if
      } // if
    } else {
      pr->u.p.tc = 0;
      *p_lb = pr->u.p.lb;
      *p_ub = pr->u.p.ub;
#if KMP_OS_WINDOWS
      pr->u.p.last_upper = *p_ub;
#endif /* KMP_OS_WINDOWS */
      if (p_last != NULL)
        *p_last = TRUE;
      if (p_st != NULL)
        *p_st = pr->u.p.st;
    } // if
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format(
          "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
          "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
          traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
      KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
                    (p_last ? *p_last : 0), status));
      __kmp_str_free(&buff);
    }
#endif
#if INCLUDE_SSC_MARKS
    SSC_MARK_DISPATCH_NEXT();
#endif
    OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);
    OMPT_LOOP_END;
    KMP_STATS_LOOP_END;
    return status;
  } else {
    kmp_int32 last = 0;
    dispatch_shared_info_template<T> volatile *sh;

    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    KMP_DEBUG_ASSERT(pr);
    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(sh);

#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier)
      status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
    else
#endif // KMP_USE_HIER_SCHED
      status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
                                                p_st, th->th.th_team_nproc,
                                                th->th.th_info.ds.ds_tid);
    // status == 0: no more iterations to execute
    if (status == 0) {
      ST num_done;
      num_done = test_then_inc<ST>(&sh->u.s.num_done);
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
        __kmp_str_free(&buff);
      }
#endif

#if KMP_USE_HIER_SCHED
      pr->flags.use_hier = FALSE;
#endif
      if (num_done == th->th.th_team_nproc - 1) {
#if KMP_STATIC_STEAL_ENABLED
        if (pr->schedule == kmp_sch_static_steal) {
          int i;
          int idx = (th->th.th_dispatch->th_disp_index - 1) %
                    __kmp_dispatch_num_buffers; // current loop index
          // loop complete, safe to destroy locks used for stealing
          for (i = 0; i < th->th.th_team_nproc; ++i) {
            dispatch_private_info_template<T> *buf =
                reinterpret_cast<dispatch_private_info_template<T> *>(
                    &team->t.t_dispatch[i].th_disp_buffer[idx]);
            KMP_ASSERT(buf->steal_flag == THIEF); // buffer must be inactive
            KMP_ATOMIC_ST_RLX(&buf->steal_flag, UNUSED);
            if (traits_t<T>::type_size > 4) {
              // destroy locks used for stealing
              kmp_lock_t *lck = buf->u.p.steal_lock;
              KMP_ASSERT(lck != NULL);
              __kmp_destroy_lock(lck);
              __kmp_free(lck);
              buf->u.p.steal_lock = NULL;
            }
          }
        }
#endif
        /* NOTE: release shared buffer to be reused */

        KMP_MB(); /* Flush all pending memory write invalidates.  */

        sh->u.s.num_done = 0;
        sh->u.s.iteration = 0;

        /* TODO replace with general release procedure? */
        if (pr->flags.ordered) {
          sh->u.s.ordered_iteration = 0;
        }

        sh->buffer_index += __kmp_dispatch_num_buffers;
        KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
                       gtid, sh->buffer_index));

        KMP_MB(); /* Flush all pending memory write invalidates.  */

      } // if
      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }

      th->th.th_dispatch->th_deo_fcn = NULL;
2414      th->th.th_dispatch->th_dxo_fcn = NULL;
2415      th->th.th_dispatch->th_dispatch_sh_current = NULL;
2416      th->th.th_dispatch->th_dispatch_pr_current = NULL;
2417    } // if (status == 0)
2418#if KMP_OS_WINDOWS
2419    else if (last) {
2420      pr->u.p.last_upper = pr->u.p.ub;
2421    }
2422#endif /* KMP_OS_WINDOWS */
2423    if (p_last != NULL && status != 0)
2424      *p_last = last;
2425  } // if
2426
2427#ifdef KMP_DEBUG
2428  {
2429    char *buff;
2430    // create format specifiers before the debug output
2431    buff = __kmp_str_format(
2432        "__kmp_dispatch_next: T#%%d normal case: "
2433        "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2434        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2435    KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2436                  (p_last ? *p_last : 0), status));
2437    __kmp_str_free(&buff);
2438  }
2439#endif
2440#if INCLUDE_SSC_MARKS
2441  SSC_MARK_DISPATCH_NEXT();
2442#endif
2443  OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);
2444  OMPT_LOOP_END;
2445  KMP_STATS_LOOP_END;
2446  return status;
2447}

/*!
@ingroup WORK_SHARING
@param loc  source location information
@param gtid  global thread number
@return Zero if the parallel region is not active (in which case this thread
will execute all sections), non-zero otherwise.

Beginning of the sections construct.
There are no implicit barriers in the "sections" calls; rather, the compiler
should introduce an explicit barrier if one is required.

This implementation is based on __kmp_dispatch_init, using the same constructs
for shared data (sections cannot be nested directly inside an omp for loop;
there must be a parallel region in between).
A sketch of the full call sequence appears after __kmpc_end_sections below.
*/
kmp_int32 __kmpc_sections_init(ident_t *loc, kmp_int32 gtid) {

  int active;
  kmp_info_t *th;
  kmp_team_t *team;
  kmp_uint32 my_buffer_index;
  dispatch_shared_info_template<kmp_int32> volatile *sh;

  KMP_DEBUG_ASSERT(__kmp_init_serial);

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;

  KMP_COUNT_BLOCK(OMP_SECTIONS);
  KD_TRACE(10, ("__kmpc_sections_init: called by T#%d\n", gtid));

  if (active) {
    // Setup sections in the same way as dynamic scheduled loops.
    // We need one piece of shared data: which section is to execute next.
    // (if the parallel region is not active, all sections will be executed
    // on the same thread)
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    my_buffer_index = th->th.th_dispatch->th_disp_index++;

    // reuse shared data structures from dynamic sched loops:
    sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    KD_TRACE(10, ("__kmpc_sections_init: T#%d my_buffer_index:%d\n", gtid,
                  my_buffer_index));

    th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
    th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;

    KD_TRACE(100, ("__kmpc_sections_init: T#%d before wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));
    __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
                           __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: KMP_WAIT() cannot be used here: buffer index and
    // my_buffer_index are *always* 32-bit integers.
    KMP_MB();
    KD_TRACE(100, ("__kmpc_sections_init: T#%d after wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));

    th->th.th_dispatch->th_dispatch_pr_current =
        nullptr; // sections construct doesn't need private data
    th->th.th_dispatch->th_dispatch_sh_current =
        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_sections, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
  KMP_PUSH_PARTITIONED_TIMER(OMP_sections);

  return active;
}

/*!
@ingroup WORK_SHARING
@param loc  source location information
@param gtid  global thread number
@param numberOfSections  number of sections in the 'sections' construct
@return The id of the section to execute next on this thread, in the range
[0, numberOfSections); any value outside that range (e.g. numberOfSections
itself) means there is nothing left to execute on this thread.
*/

kmp_int32 __kmpc_next_section(ident_t *loc, kmp_int32 gtid,
                              kmp_int32 numberOfSections) {

  KMP_TIME_PARTITIONED_BLOCK(OMP_sections_overhead);

  kmp_info_t *th = __kmp_threads[gtid];
#ifdef KMP_DEBUG
  kmp_team_t *team = th->th.th_team;
#endif

  KD_TRACE(1000, ("__kmpc_next_section: T#%d; number of sections:%d\n", gtid,
                  numberOfSections));

  // For the serialized case we should not call this function:
  KMP_DEBUG_ASSERT(!team->t.t_serialized);

  dispatch_shared_info_template<kmp_int32> volatile *sh;

  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

  KMP_DEBUG_ASSERT(!(th->th.th_dispatch->th_dispatch_pr_current));
  sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
      th->th.th_dispatch->th_dispatch_sh_current);
  KMP_DEBUG_ASSERT(sh);

  kmp_int32 sectionIndex = 0;
  bool moreSectionsToExecute = true;

  // Find section to execute:
  sectionIndex = test_then_inc<kmp_int32>((kmp_int32 *)&sh->u.s.iteration);
  if (sectionIndex >= numberOfSections) {
    moreSectionsToExecute = false;
  }

  // status == 0: no more sections to execute;
  // OMPTODO: __kmpc_end_sections could be bypassed?
  if (!moreSectionsToExecute) {
    kmp_int32 num_done;

    num_done = test_then_inc<kmp_int32>((kmp_int32 *)(&sh->u.s.num_done));

    if (num_done == th->th.th_team_nproc - 1) {
      /* NOTE: release this buffer to be reused */

      KMP_MB(); /* Flush all pending memory write invalidates.  */

      sh->u.s.num_done = 0;
      sh->u.s.iteration = 0;

      KMP_MB(); /* Flush all pending memory write invalidates.  */

      sh->buffer_index += __kmp_dispatch_num_buffers;
      KD_TRACE(100, ("__kmpc_next_section: T#%d change buffer_index:%d\n", gtid,
                     sh->buffer_index));

      KMP_MB(); /* Flush all pending memory write invalidates.  */

    } // if

    th->th.th_dispatch->th_deo_fcn = NULL;
    th->th.th_dispatch->th_dxo_fcn = NULL;
    th->th.th_dispatch->th_dispatch_sh_current = NULL;
    th->th.th_dispatch->th_dispatch_pr_current = NULL;

#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.ompt_callback_dispatch) {
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
      ompt_data_t instance = ompt_data_none;
      instance.ptr = OMPT_GET_RETURN_ADDRESS(0);
      ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
          &(team_info->parallel_data), &(task_info->task_data),
          ompt_dispatch_section, instance);
    }
#endif
  }

  return sectionIndex;
}

/*!
@ingroup WORK_SHARING
@param loc  source location information
@param gtid  global thread number

End of the "sections" construct.
No need to wait here: a barrier is added separately when needed.
*/
void __kmpc_end_sections(ident_t *loc, kmp_int32 gtid) {

  kmp_info_t *th = __kmp_threads[gtid];
  int active = !th->th.th_team->t.t_serialized;

  KD_TRACE(100, ("__kmpc_end_sections: T#%d called\n", gtid));

  if (!active) {
    // In the active case, finalization is done in __kmpc_next_section
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.ompt_callback_work) {
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
      ompt_callbacks.ompt_callback(ompt_callback_work)(
          ompt_work_sections, ompt_scope_end, &(team_info->parallel_data),
          &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
    }
#endif
  }

  KMP_POP_PARTITIONED_TIMER();
  KD_TRACE(100, ("__kmpc_end_sections: T#%d returned\n", gtid));
}
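
// A minimal sketch (illustrative only, not part of the runtime) of the call
// sequence a compiler might emit for '#pragma omp sections' with two
// sections, using the three entry points above; the exact lowering is
// compiler-specific:
//
//   if (__kmpc_sections_init(loc, gtid)) {
//     // parallel region is active: pull section ids until none remain
//     kmp_int32 id;
//     while ((id = __kmpc_next_section(loc, gtid, 2)) < 2) {
//       switch (id) {
//       case 0: /* body of section 0 */ break;
//       case 1: /* body of section 1 */ break;
//       }
//     }
//   } else {
//     // serialized: this thread executes every section itself
//   }
//   __kmpc_end_sections(loc, gtid);
//   // unless 'nowait' was specified, the compiler adds a barrier here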

template <typename T>
static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
                                  kmp_int32 *plastiter, T *plower, T *pupper,
                                  typename traits_t<T>::signed_t incr) {
  typedef typename traits_t<T>::unsigned_t UT;
  kmp_uint32 team_id;
  kmp_uint32 nteams;
  UT trip_count;
  kmp_team_t *team;
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
                            "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<T>::spec);
    KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
    __kmp_str_free(&buff);
  }
#endif

  if (__kmp_env_consistency_check) {
    if (incr == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    }
    if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
      // The loop is illegal.
      // Some zero-trip loops are maintained by the compiler, e.g.:
      //   for(i=10;i<0;++i) // lower >= upper - run-time check
      //   for(i=0;i>10;--i) // lower <= upper - run-time check
      //   for(i=0;i>10;++i) // incr > 0       - compile-time check
      //   for(i=10;i<0;--i) // incr < 0       - compile-time check
      // The compiler does not check the following illegal loops:
      //   for(i=0;i<10;i+=incr) // where incr<0
      //   for(i=10;i>0;i-=incr) // where incr<0
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
    }
  }
  __kmp_assert_valid_gtid(gtid);
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
  nteams = th->th.th_teams_size.nteams;
  team_id = team->t.t_master_tid;
  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);

  // compute global trip count
  if (incr == 1) {
    trip_count = *pupper - *plower + 1;
  } else if (incr == -1) {
    trip_count = *plower - *pupper + 1;
  } else if (incr > 0) {
    // upper-lower can exceed the limit of signed type
    trip_count = (UT)(*pupper - *plower) / incr + 1;
  } else {
    trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
  }
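
  // Example (illustrative): lb=0, ub=9, incr=2 gives
  // trip_count = (9 - 0) / 2 + 1 = 5, i.e. iterations 0,2,4,6,8. The
  // unsigned division keeps this correct even when upper-lower overflows
  // the signed type T.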

  if (trip_count <= nteams) {
    KMP_DEBUG_ASSERT(
        __kmp_static == kmp_sch_static_greedy ||
        __kmp_static ==
            kmp_sch_static_balanced); // Unknown static scheduling type.
    // only some teams get a single iteration, the others get nothing
    if (team_id < trip_count) {
      *pupper = *plower = *plower + team_id * incr;
    } else {
      *plower = *pupper + incr; // zero-trip loop
    }
    if (plastiter != NULL)
      *plastiter = (team_id == trip_count - 1);
  } else {
    if (__kmp_static == kmp_sch_static_balanced) {
      UT chunk = trip_count / nteams;
      UT extras = trip_count % nteams;
      *plower +=
          incr * (team_id * chunk + (team_id < extras ? team_id : extras));
      *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
      if (plastiter != NULL)
        *plastiter = (team_id == nteams - 1);
    } else {
      T chunk_inc_count =
          (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
      T upper = *pupper;
      KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
      // Unknown static scheduling type.
      *plower += team_id * chunk_inc_count;
      *pupper = *plower + chunk_inc_count - incr;
      // Check/correct bounds if needed
      if (incr > 0) {
        if (*pupper < *plower)
          *pupper = traits_t<T>::max_value;
        if (plastiter != NULL)
          *plastiter = *plower <= upper && *pupper > upper - incr;
        if (*pupper > upper)
          *pupper = upper; // tracker C73258
      } else {
        if (*pupper > *plower)
          *pupper = traits_t<T>::min_value;
        if (plastiter != NULL)
          *plastiter = *plower >= upper && *pupper < upper - incr;
        if (*pupper < upper)
          *pupper = upper; // tracker C73258
      }
    }
  }
}
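
// Worked example (illustrative only): trip_count=10, nteams=4 under
// kmp_sch_static_balanced gives chunk=2, extras=2, so teams 0..3 receive
// 3, 3, 2 and 2 iterations respectively; with lb=0 and incr=1 the per-team
// bounds come out as [0,2], [3,5], [6,7] and [8,9].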

//-----------------------------------------------------------------------------
// Dispatch routines
//    Transfer call to template< type T >
//    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
//                         T lb, T ub, ST st, ST chunk )
extern "C" {

/*!
@ingroup WORK_SHARING
@{
@param loc Source location
@param gtid Global thread id
@param schedule Schedule type
@param lb  Lower bound
@param ub  Upper bound
@param st  Step (or increment if you prefer)
@param chunk The chunk size to block with

This function prepares the runtime to start a dynamically scheduled for loop,
saving the loop arguments.
These functions are all identical apart from the types of the arguments.
*/

void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int32 lb,
                            kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint32 lb,
                             kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int64 lb,
                            kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint64 lb,
                             kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4

These differ from the __kmpc_dispatch_init set of functions in that they are
called for the composite 'distribute parallel for' construct, so before
dispatching the regular iterations the per-team iteration space must first be
computed. (An illustrative call sketch follows the last of these functions
below.)

These functions are all identical apart from the types of the arguments.
*/
void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
                                 kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
                                  kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
                                 kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
                                  kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
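
// A sketch (assumed lowering, for illustration only) of the composite
// 'distribute parallel for' entry: the dist variants first clip the global
// bounds to this team's portion via __kmp_dist_get_bounds, after which
// chunks are claimed with the usual __kmpc_dispatch_next_* loop (see the
// sketch after __kmpc_dispatch_next_4 below):
//
//   kmp_int32 last;
//   __kmpc_dist_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, &last,
//                               /*lb=*/0, /*ub=*/999, /*st=*/1, /*chunk=*/8);
//   // ... followed by the __kmpc_dispatch_next_4 loop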

/*!
@param loc Source code location
@param gtid Global thread id
@param p_last Pointer to a flag set to one if this is the last chunk or zero
otherwise
@param p_lb   Pointer to the lower bound for the next chunk of work
@param p_ub   Pointer to the upper bound for the next chunk of work
@param p_st   Pointer to the stride for the next chunk of work
@return one if there is work to be done, zero otherwise

Get the next dynamically allocated chunk of work for this thread.
If there is no more work, then lb, ub and stride need not be modified.
(An illustrative call sequence follows this function.)
*/
int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        ,
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}
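
// A minimal usage sketch (assumed lowering, not emitted verbatim by any
// particular compiler) for a dynamically scheduled loop over 0..99 with
// chunk size 4, driven by the init/next pair above:
//
//   kmp_int32 lb, ub, st, last;
//   __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked,
//                          /*lb=*/0, /*ub=*/99, /*st=*/1, /*chunk=*/4);
//   while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
//     for (kmp_int32 i = lb; i <= ub; i += st) {
//       /* loop body for iteration i */
//     }
//   }
//   // Ordered loops additionally call __kmpc_dispatch_fini_4 per chunk.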

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint32 *p_lb, kmp_uint32 *p_ub,
                            kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         ,
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        ,
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint64 *p_lb, kmp_uint64 *p_ub,
                            kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         ,
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}

/*!
@param loc Source code location
@param gtid Global thread id

Mark the end of a dynamic loop.
*/
void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}
/*! @} */

//-----------------------------------------------------------------------------
// Non-template routines from kmp_dispatch.cpp used in other sources

kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value == checker;
}

kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value != checker;
}

kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
  return value < checker;
}

kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
  return value >= checker;
}

kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
  return value <= checker;
}

kmp_uint32
__kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
             kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
             void *obj // Higher-level synchronization object, or NULL.
) {
  // note: we may not belong to a team at this point
  volatile kmp_uint32 *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
  kmp_uint32 r;
  kmp_uint64 time;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
  KMP_INIT_YIELD(spins);
  KMP_INIT_BACKOFF(time);
  // main wait spin loop
  while (!f(r = TCR_4(*spin), check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* GEH - remove this since it was accidentally introduced when kmp_wait was
       split. It causes problems with infinite recursion because of exit lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
        __kmp_abort_thread(); */
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}
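
// Illustrative use (a sketch; not a verbatim caller from the runtime): spin
// until a shared 32-bit location becomes 1, reusing the __kmp_eq_4 predicate
// defined above. The return value is the observed value that satisfied the
// predicate:
//
//   volatile kmp_uint32 flag = 0; // set to 1 by another thread
//   kmp_uint32 seen = __kmp_wait_4(&flag, 1, __kmp_eq_4, NULL);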

void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
                      kmp_uint32 (*pred)(void *, kmp_uint32),
                      void *obj // Higher-level synchronization object, or NULL.
) {
  // note: we may not belong to a team at this point
  void *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(void *, kmp_uint32) = pred;
  kmp_uint64 time;

  KMP_FSYNC_SPIN_INIT(obj, spin);
  KMP_INIT_YIELD(spins);
  KMP_INIT_BACKOFF(time);
  // main wait spin loop
  while (!f(spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* if we have waited a bit, or are oversubscribed, yield */
    /* pause is in the following code */
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
}

} // extern "C"

#ifdef KMP_GOMP_COMPAT

void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int32 lb,
                               kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint32 lb,
                                kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int64 lb,
                               kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint64 lb,
                                kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

#endif /* KMP_GOMP_COMPAT */

/* ------------------------------------------------------------------------ */